[RFC PATCH v2 7/7] sched/fair: alternative way of accounting throttle time

Aaron Lu posted 7 patches 10 months ago
There is a newer version of this series
[RFC PATCH v2 7/7] sched/fair: alternative way of accounting throttle time
Posted by Aaron Lu 10 months ago
Implement an alternative way of accounting cfs_rq throttle time which:
- starts accounting when a throttled cfs_rq has no tasks enqueued and its
  throttled list is not empty;
- stops accounting when this cfs_rq gets unthrottled or a task gets
  enqueued.

This way, the accounted throttle time is when the cfs_rq has absolutely
no tasks enqueued and has tasks throttled.

Signed-off-by: Aaron Lu <ziqianlu@bytedance.com>
---
 kernel/sched/fair.c  | 112 ++++++++++++++++++++++++++++++++-----------
 kernel/sched/sched.h |   4 ++
 2 files changed, 89 insertions(+), 27 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 20471a3aa35e6..70f7de82d1d9d 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5300,6 +5300,7 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 
 static void check_enqueue_throttle(struct cfs_rq *cfs_rq);
 static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq);
+static void account_cfs_rq_throttle_self(struct cfs_rq *cfs_rq);
 
 static void
 requeue_delayed_entity(struct sched_entity *se);
@@ -5362,10 +5363,14 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 		if (throttled_hierarchy(cfs_rq)) {
 			struct rq *rq = rq_of(cfs_rq);
 
-			if (cfs_rq_throttled(cfs_rq) && !cfs_rq->throttled_clock)
-				cfs_rq->throttled_clock = rq_clock(rq);
-			if (!cfs_rq->throttled_clock_self)
-				cfs_rq->throttled_clock_self = rq_clock(rq);
+			if (cfs_rq->throttled_clock) {
+				cfs_rq->throttled_time +=
+					rq_clock(rq) - cfs_rq->throttled_clock;
+				cfs_rq->throttled_clock = 0;
+			}
+
+			if (cfs_rq->throttled_clock_self)
+				account_cfs_rq_throttle_self(cfs_rq);
 		}
 #endif
 	}
@@ -5453,7 +5458,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 		 * DELAY_DEQUEUE relies on spurious wakeups, special task
 		 * states must not suffer spurious wakeups, excempt them.
 		 */
-		if (flags & DEQUEUE_SPECIAL)
+		if (flags & (DEQUEUE_SPECIAL | DEQUEUE_THROTTLE))
 			delay = false;
 
 		WARN_ON_ONCE(delay && se->sched_delayed);
@@ -5513,8 +5518,24 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 
 	if (cfs_rq->nr_queued == 0) {
 		update_idle_cfs_rq_clock_pelt(cfs_rq);
-		if (throttled_hierarchy(cfs_rq))
+
+#ifdef CONFIG_CFS_BANDWIDTH
+		if (throttled_hierarchy(cfs_rq)) {
 			list_del_leaf_cfs_rq(cfs_rq);
+
+			if (cfs_rq->h_nr_throttled) {
+				struct rq *rq = rq_of(cfs_rq);
+
+				WARN_ON_ONCE(cfs_rq->throttled_clock_self);
+				cfs_rq->throttled_clock_self = rq_clock(rq);
+
+				if (cfs_rq_throttled(cfs_rq)) {
+					WARN_ON_ONCE(cfs_rq->throttled_clock);
+					cfs_rq->throttled_clock = rq_clock(rq);
+				}
+			}
+		}
+#endif
 	}
 
 	return true;
@@ -5809,6 +5830,18 @@ static inline bool task_is_throttled(struct task_struct *p)
 	return !list_empty(&p->throttle_node);
 }
 
+static inline void
+cfs_rq_inc_h_nr_throttled(struct cfs_rq *cfs_rq, unsigned int nr)
+{
+	cfs_rq->h_nr_throttled += nr;
+}
+
+static inline void
+cfs_rq_dec_h_nr_throttled(struct cfs_rq *cfs_rq, unsigned int nr)
+{
+	cfs_rq->h_nr_throttled -= nr;
+}
+
 static bool dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags);
 static void throttle_cfs_rq_work(struct callback_head *work)
 {
@@ -5845,7 +5878,7 @@ static void throttle_cfs_rq_work(struct callback_head *work)
 		rq = scope.rq;
 		update_rq_clock(rq);
 		WARN_ON_ONCE(!list_empty(&p->throttle_node));
-		dequeue_task_fair(rq, p, DEQUEUE_SLEEP | DEQUEUE_SPECIAL);
+		dequeue_task_fair(rq, p, DEQUEUE_SLEEP | DEQUEUE_THROTTLE);
 		list_add(&p->throttle_node, &cfs_rq->throttled_limbo_list);
 		resched_curr(rq);
 	}
@@ -5863,16 +5896,37 @@ void init_cfs_throttle_work(struct task_struct *p)
 
 static void dequeue_throttled_task(struct task_struct *p, int flags)
 {
+	struct sched_entity *se = &p->se;
+
 	/*
 	 * Task is throttled and someone wants to dequeue it again:
 	 * it must be sched/core when core needs to do things like
 	 * task affinity change, task group change, task sched class
 	 * change etc.
 	 */
-	WARN_ON_ONCE(p->se.on_rq);
-	WARN_ON_ONCE(flags & DEQUEUE_SLEEP);
+	WARN_ON_ONCE(se->on_rq);
+	WARN_ON_ONCE(flags & DEQUEUE_THROTTLE);
 
 	list_del_init(&p->throttle_node);
+
+	for_each_sched_entity(se) {
+		struct cfs_rq *cfs_rq = cfs_rq_of(se);
+
+		cfs_rq->h_nr_throttled--;
+	}
+}
+
+static void account_cfs_rq_throttle_self(struct cfs_rq *cfs_rq)
+{
+	/* account self time */
+	u64 delta = rq_clock(rq_of(cfs_rq)) - cfs_rq->throttled_clock_self;
+
+	cfs_rq->throttled_clock_self = 0;
+
+	if (WARN_ON_ONCE((s64)delta < 0))
+		delta = 0;
+
+	cfs_rq->throttled_clock_self_time += delta;
 }
 
 static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags);
@@ -5889,27 +5943,21 @@ static int tg_unthrottle_up(struct task_group *tg, void *data)
 	cfs_rq->throttled_clock_pelt_time += rq_clock_pelt(rq) -
 		cfs_rq->throttled_clock_pelt;
 
-	if (cfs_rq->throttled_clock_self) {
-		u64 delta = rq_clock(rq) - cfs_rq->throttled_clock_self;
-
-		cfs_rq->throttled_clock_self = 0;
-
-		if (WARN_ON_ONCE((s64)delta < 0))
-			delta = 0;
-
-		cfs_rq->throttled_clock_self_time += delta;
-	}
+	if (cfs_rq->throttled_clock_self)
+		account_cfs_rq_throttle_self(cfs_rq);
 
 	/* Re-enqueue the tasks that have been throttled at this level. */
 	list_for_each_entry_safe(p, tmp, &cfs_rq->throttled_limbo_list, throttle_node) {
 		list_del_init(&p->throttle_node);
-		enqueue_task_fair(rq_of(cfs_rq), p, ENQUEUE_WAKEUP);
+		enqueue_task_fair(rq_of(cfs_rq), p, ENQUEUE_WAKEUP | ENQUEUE_THROTTLE);
 	}
 
 	/* Add cfs_rq with load or one or more already running entities to the list */
 	if (!cfs_rq_is_decayed(cfs_rq))
 		list_add_leaf_cfs_rq(cfs_rq);
 
+	WARN_ON_ONCE(cfs_rq->h_nr_throttled);
+
 	return 0;
 }
 
@@ -5945,10 +5993,7 @@ static int tg_throttle_down(struct task_group *tg, void *data)
 	/* group is entering throttled state, stop time */
 	cfs_rq->throttled_clock_pelt = rq_clock_pelt(rq);
 
-	WARN_ON_ONCE(cfs_rq->throttled_clock_self);
-	if (cfs_rq->nr_queued)
-		cfs_rq->throttled_clock_self = rq_clock(rq);
-	else
+	if (!cfs_rq->nr_queued)
 		list_del_leaf_cfs_rq(cfs_rq);
 
 	WARN_ON_ONCE(!list_empty(&cfs_rq->throttled_limbo_list));
@@ -5992,9 +6037,6 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
 	 * throttled-list.  rq->lock protects completion.
 	 */
 	cfs_rq->throttled = 1;
-	WARN_ON_ONCE(cfs_rq->throttled_clock);
-	if (cfs_rq->nr_queued)
-		cfs_rq->throttled_clock = rq_clock(rq);
 	return;
 }
 
@@ -6026,6 +6068,10 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
 		cfs_b->throttled_time += rq_clock(rq) - cfs_rq->throttled_clock;
 		cfs_rq->throttled_clock = 0;
 	}
+	if (cfs_rq->throttled_time) {
+		cfs_b->throttled_time += cfs_rq->throttled_time;
+		cfs_rq->throttled_time = 0;
+	}
 	list_del_rcu(&cfs_rq->throttled_list);
 	raw_spin_unlock(&cfs_b->lock);
 
@@ -6710,6 +6756,8 @@ static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
 static void task_throttle_setup_work(struct task_struct *p) {}
 static bool task_is_throttled(struct task_struct *p) { return false; }
 static void dequeue_throttled_task(struct task_struct *p, int flags) {}
+static void cfs_rq_inc_h_nr_throttled(struct cfs_rq *cfs_rq, unsigned int nr) {}
+static void cfs_rq_dec_h_nr_throttled(struct cfs_rq *cfs_rq, unsigned int nr) {}
 
 static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
 {
@@ -6898,6 +6946,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 	struct sched_entity *se = &p->se;
 	int h_nr_idle = task_has_idle_policy(p);
 	int h_nr_runnable = 1;
+	int h_nr_throttled = (flags & ENQUEUE_THROTTLE) ? 1 : 0;
 	int task_new = !(flags & ENQUEUE_WAKEUP);
 	int rq_h_nr_queued = rq->cfs.h_nr_queued;
 	u64 slice = 0;
@@ -6951,6 +7000,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 		cfs_rq->h_nr_runnable += h_nr_runnable;
 		cfs_rq->h_nr_queued++;
 		cfs_rq->h_nr_idle += h_nr_idle;
+		cfs_rq_dec_h_nr_throttled(cfs_rq, h_nr_throttled);
 
 		if (cfs_rq_is_idle(cfs_rq))
 			h_nr_idle = 1;
@@ -6973,6 +7023,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 		cfs_rq->h_nr_runnable += h_nr_runnable;
 		cfs_rq->h_nr_queued++;
 		cfs_rq->h_nr_idle += h_nr_idle;
+		cfs_rq_dec_h_nr_throttled(cfs_rq, h_nr_throttled);
 
 		if (cfs_rq_is_idle(cfs_rq))
 			h_nr_idle = 1;
@@ -7027,10 +7078,12 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
 	int rq_h_nr_queued = rq->cfs.h_nr_queued;
 	bool task_sleep = flags & DEQUEUE_SLEEP;
 	bool task_delayed = flags & DEQUEUE_DELAYED;
+	bool task_throttle = flags & DEQUEUE_THROTTLE;
 	struct task_struct *p = NULL;
 	int h_nr_idle = 0;
 	int h_nr_queued = 0;
 	int h_nr_runnable = 0;
+	int h_nr_throttled = 0;
 	struct cfs_rq *cfs_rq;
 	u64 slice = 0;
 
@@ -7040,6 +7093,9 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
 		h_nr_idle = task_has_idle_policy(p);
 		if (task_sleep || task_delayed || !se->sched_delayed)
 			h_nr_runnable = 1;
+
+		if (task_throttle)
+			h_nr_throttled = 1;
 	} else {
 		cfs_rq = group_cfs_rq(se);
 		slice = cfs_rq_min_slice(cfs_rq);
@@ -7058,6 +7114,7 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
 		cfs_rq->h_nr_runnable -= h_nr_runnable;
 		cfs_rq->h_nr_queued -= h_nr_queued;
 		cfs_rq->h_nr_idle -= h_nr_idle;
+		cfs_rq_inc_h_nr_throttled(cfs_rq, h_nr_throttled);
 
 		if (cfs_rq_is_idle(cfs_rq))
 			h_nr_idle = h_nr_queued;
@@ -7095,6 +7152,7 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
 		cfs_rq->h_nr_runnable -= h_nr_runnable;
 		cfs_rq->h_nr_queued -= h_nr_queued;
 		cfs_rq->h_nr_idle -= h_nr_idle;
+		cfs_rq_inc_h_nr_throttled(cfs_rq, h_nr_throttled);
 
 		if (cfs_rq_is_idle(cfs_rq))
 			h_nr_idle = h_nr_queued;
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 97be6a6f53b9c..54cdec21aa5c2 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -721,6 +721,7 @@ struct cfs_rq {
 
 #ifdef CONFIG_CFS_BANDWIDTH
 	int			runtime_enabled;
+	unsigned int		h_nr_throttled;
 	s64			runtime_remaining;
 
 	u64			throttled_pelt_idle;
@@ -732,6 +733,7 @@ struct cfs_rq {
 	u64			throttled_clock_pelt_time;
 	u64			throttled_clock_self;
 	u64			throttled_clock_self_time;
+	u64			throttled_time;
 	int			throttled;
 	int			throttle_count;
 	struct list_head	throttled_list;
@@ -2360,6 +2362,7 @@ extern const u32		sched_prio_to_wmult[40];
 #define DEQUEUE_SPECIAL		0x10
 #define DEQUEUE_MIGRATING	0x100 /* Matches ENQUEUE_MIGRATING */
 #define DEQUEUE_DELAYED		0x200 /* Matches ENQUEUE_DELAYED */
+#define DEQUEUE_THROTTLE	0x800 /* Matches ENQUEUE_THROTTLE */
 
 #define ENQUEUE_WAKEUP		0x01
 #define ENQUEUE_RESTORE		0x02
@@ -2377,6 +2380,7 @@ extern const u32		sched_prio_to_wmult[40];
 #define ENQUEUE_MIGRATING	0x100
 #define ENQUEUE_DELAYED		0x200
 #define ENQUEUE_RQ_SELECTED	0x400
+#define ENQUEUE_THROTTLE	0x800
 
 #define RETRY_TASK		((void *)-1UL)
 
-- 
2.39.5
Re: [RFC PATCH v2 7/7] sched/fair: alternative way of accounting throttle time
Posted by Florian Bezdeka 9 months, 3 weeks ago
Hi Aaron,

On Wed, 2025-04-09 at 20:07 +0800, Aaron Lu wrote:
> @@ -5889,27 +5943,21 @@ static int tg_unthrottle_up(struct task_group *tg, void *data)
>  	cfs_rq->throttled_clock_pelt_time += rq_clock_pelt(rq) -
>  		cfs_rq->throttled_clock_pelt;
>  
> -	if (cfs_rq->throttled_clock_self) {
> -		u64 delta = rq_clock(rq) - cfs_rq->throttled_clock_self;
> -
> -		cfs_rq->throttled_clock_self = 0;
> -
> -		if (WARN_ON_ONCE((s64)delta < 0))
> -			delta = 0;
> -
> -		cfs_rq->throttled_clock_self_time += delta;
> -	}
> +	if (cfs_rq->throttled_clock_self)
> +		account_cfs_rq_throttle_self(cfs_rq);
>  
>  	/* Re-enqueue the tasks that have been throttled at this level. */
>  	list_for_each_entry_safe(p, tmp, &cfs_rq->throttled_limbo_list, throttle_node) {
>  		list_del_init(&p->throttle_node);
> -		enqueue_task_fair(rq_of(cfs_rq), p, ENQUEUE_WAKEUP);
> +		enqueue_task_fair(rq_of(cfs_rq), p, ENQUEUE_WAKEUP | ENQUEUE_THROTTLE);
>  	}
>  
>  	/* Add cfs_rq with load or one or more already running entities to the list */
>  	if (!cfs_rq_is_decayed(cfs_rq))
>  		list_add_leaf_cfs_rq(cfs_rq);
>  
> +	WARN_ON_ONCE(cfs_rq->h_nr_throttled);
> +
>  	return 0;
>  }
>  

I got this warning while testing in our virtual environment:

Any idea?

[   26.639641] ------------[ cut here ]------------
[   26.639644] WARNING: CPU: 5 PID: 0 at kernel/sched/fair.c:5967 tg_unthrottle_up+0x1a6/0x3d0
[   26.639653] Modules linked in: veth xt_nat nft_chain_nat xt_MASQUERADE nf_nat nf_conntrack_netlink xfrm_user xfrm_algo br_netfilter bridge stp llc xt_recent rfkill ip6t_REJECT nf_reject_ipv6 xt_hl ip6t_rt vsock_loopback vmw_vsock_virtio_transport_common ipt_REJECT nf_reject_ipv4 xt_LOG nf_log_syslog vmw_vsock_vmci_transport xt_comment vsock nft_limit xt_limit xt_addrtype xt_tcpudp xt_conntrack nf_conntrack nf_defrag_ipv6 nf_defrag_ipv4 nft_compat nf_tables intel_rapl_msr intel_rapl_common nfnetlink binfmt_misc intel_uncore_frequency_common isst_if_mbox_msr isst_if_common skx_edac_common nfit libnvdimm ghash_clmulni_intel sha512_ssse3 sha256_ssse3 sha1_ssse3 aesni_intel snd_pcm crypto_simd cryptd snd_timer rapl snd soundcore vmw_balloon vmwgfx pcspkr drm_ttm_helper ttm drm_client_lib button ac drm_kms_helper sg vmw_vmci evdev joydev serio_raw drm loop efi_pstore configfs efivarfs ip_tables x_tables autofs4 overlay nls_ascii nls_cp437 vfat fat ext4 crc16 mbcache jbd2 squashfs dm_verity dm_bufio reed_solomon dm_mod
[   26.639715]  sd_mod ata_generic mptspi mptscsih ata_piix mptbase libata scsi_transport_spi psmouse scsi_mod vmxnet3 i2c_piix4 i2c_smbus scsi_common
[   26.639726] CPU: 5 UID: 0 PID: 0 Comm: swapper/5 Not tainted 6.14.2-CFSfixes #1
[   26.639729] Hardware name: VMware, Inc. VMware7,1/440BX Desktop Reference Platform, BIOS VMW71.00V.24224532.B64.2408191458 08/19/2024
[   26.639731] RIP: 0010:tg_unthrottle_up+0x1a6/0x3d0
[   26.639735] Code: 00 00 48 39 ca 74 14 48 8b 52 10 49 8b 8e 58 01 00 00 48 39 8a 28 01 00 00 74 24 41 8b 86 68 01 00 00 85 c0 0f 84 8d fe ff ff <0f> 0b e9 86 fe ff ff 49 8b 9e 38 01 00 00 41 8b 86 40 01 00 00 48
[   26.639737] RSP: 0000:ffffa5df8029cec8 EFLAGS: 00010002
[   26.639739] RAX: 0000000000000001 RBX: ffff981c6fcb6a80 RCX: ffff981943752e40
[   26.639741] RDX: 0000000000000005 RSI: ffff981c6fcb6a80 RDI: ffff981943752d00
[   26.639742] RBP: ffff9819607dc708 R08: ffff981c6fcb6a80 R09: 0000000000000000
[   26.639744] R10: 0000000000000001 R11: ffff981969936a10 R12: ffff9819607dc708
[   26.639745] R13: ffff9819607dc9d8 R14: ffff9819607dc800 R15: ffffffffad913fb0
[   26.639747] FS:  0000000000000000(0000) GS:ffff981c6fc80000(0000) knlGS:0000000000000000
[   26.639749] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[   26.639750] CR2: 00007ff1292dc44c CR3: 000000015350e006 CR4: 00000000007706f0
[   26.639779] PKRU: 55555554
[   26.639781] Call Trace:
[   26.639783]  <IRQ>
[   26.639787]  ? __pfx_tg_unthrottle_up+0x10/0x10
[   26.639790]  ? __pfx_tg_nop+0x10/0x10
[   26.639793]  walk_tg_tree_from+0x58/0xb0
[   26.639797]  unthrottle_cfs_rq+0xf0/0x360
[   26.639800]  ? sched_clock_cpu+0xf/0x190
[   26.639808]  __cfsb_csd_unthrottle+0x11c/0x170
[   26.639812]  ? __pfx___cfsb_csd_unthrottle+0x10/0x10
[   26.639816]  __flush_smp_call_function_queue+0x103/0x410
[   26.639822]  __sysvec_call_function_single+0x1c/0xb0
[   26.639826]  sysvec_call_function_single+0x6c/0x90
[   26.639832]  </IRQ>
[   26.639833]  <TASK>
[   26.639834]  asm_sysvec_call_function_single+0x1a/0x20
[   26.639840] RIP: 0010:pv_native_safe_halt+0xf/0x20
[   26.639844] Code: 22 d7 c3 cc cc cc cc 0f 1f 40 00 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 f3 0f 1e fa 66 90 0f 00 2d 45 c1 13 00 fb f4 <c3> cc cc cc cc 66 66 2e 0f 1f 84 00 00 00 00 00 90 90 90 90 90 90
[   26.639846] RSP: 0000:ffffa5df80117ed8 EFLAGS: 00000242
[   26.639848] RAX: 0000000000000005 RBX: ffff981940804000 RCX: ffff9819a9df7000
[   26.639849] RDX: 0000000000000005 RSI: 0000000000000005 RDI: 000000000005c514
[   26.639851] RBP: 0000000000000005 R08: 0000000000000000 R09: 0000000000000001
[   26.639852] R10: 0000000000000001 R11: 0000000000000000 R12: 0000000000000000
[   26.639853] R13: 0000000000000000 R14: 0000000000000000 R15: 0000000000000000
[   26.639858]  default_idle+0x9/0x20
[   26.639861]  default_idle_call+0x30/0x100
[   26.639863]  do_idle+0x1fd/0x240
[   26.639869]  cpu_startup_entry+0x29/0x30
[   26.639872]  start_secondary+0x11e/0x140
[   26.639875]  common_startup_64+0x13e/0x141
[   26.639881]  </TASK>
[   26.639882] ---[ end trace 0000000000000000 ]---

Best regards,
Florian

-- 
Siemens AG, Foundational Technologies
Linux Expert Center
Re: [RFC PATCH v2 7/7] sched/fair: alternative way of accounting throttle time
Posted by Aaron Lu 9 months ago
Hi Florian,

On Thu, Apr 17, 2025 at 04:06:16PM +0200, Florian Bezdeka wrote:
> Hi Aaron,
> 
> On Wed, 2025-04-09 at 20:07 +0800, Aaron Lu wrote:
> > @@ -5889,27 +5943,21 @@ static int tg_unthrottle_up(struct task_group *tg, void *data)
> >  	cfs_rq->throttled_clock_pelt_time += rq_clock_pelt(rq) -
> >  		cfs_rq->throttled_clock_pelt;
> >  
> > -	if (cfs_rq->throttled_clock_self) {
> > -		u64 delta = rq_clock(rq) - cfs_rq->throttled_clock_self;
> > -
> > -		cfs_rq->throttled_clock_self = 0;
> > -
> > -		if (WARN_ON_ONCE((s64)delta < 0))
> > -			delta = 0;
> > -
> > -		cfs_rq->throttled_clock_self_time += delta;
> > -	}
> > +	if (cfs_rq->throttled_clock_self)
> > +		account_cfs_rq_throttle_self(cfs_rq);
> >  
> >  	/* Re-enqueue the tasks that have been throttled at this level. */
> >  	list_for_each_entry_safe(p, tmp, &cfs_rq->throttled_limbo_list, throttle_node) {
> >  		list_del_init(&p->throttle_node);
> > -		enqueue_task_fair(rq_of(cfs_rq), p, ENQUEUE_WAKEUP);
> > +		enqueue_task_fair(rq_of(cfs_rq), p, ENQUEUE_WAKEUP | ENQUEUE_THROTTLE);
> >  	}
> >  
> >  	/* Add cfs_rq with load or one or more already running entities to the list */
> >  	if (!cfs_rq_is_decayed(cfs_rq))
> >  		list_add_leaf_cfs_rq(cfs_rq);
> >  
> > +	WARN_ON_ONCE(cfs_rq->h_nr_throttled);
> > +
> >  	return 0;
> >  }
> >  
> 
> I got this warning while testing in our virtual environment:
> 
> Any idea?
>

I made a stupid mistake here: I thought when a cfs_rq gets unthrottled,
it should have no tasks in throttled state, hence I added that check in
tg_unthrottle_up():
        WARN_ON_ONCE(cfs_rq->h_nr_throttled);

But h_nr_throttled tracks hierarchical throttled task number, which
means if this cfs_rq has descendent cfs_rqs that are still in throttled
state, its h_nr_throttled can be > 0 when it gets unthrottled.

I just made a setup to emulate this scenario and can reproduce this
warning. I guess in your setup, there are multiple cpu.max settings in a
cgroup hierarchy.

It's just that the warn_on_once() itself is incorrect; I'll remove it in the
next version. Thanks for the report!

Best regards,
Aaron

> [   26.639641] ------------[ cut here ]------------
> [   26.639644] WARNING: CPU: 5 PID: 0 at kernel/sched/fair.c:5967 tg_unthrottle_up+0x1a6/0x3d0
> [   26.639653] Modules linked in: veth xt_nat nft_chain_nat xt_MASQUERADE nf_nat nf_conntrack_netlink xfrm_user xfrm_algo br_netfilter bridge stp llc xt_recent rfkill ip6t_REJECT nf_reject_ipv6 xt_hl ip6t_rt vsock_loopback vmw_vsock_virtio_transport_common ipt_REJECT nf_reject_ipv4 xt_LOG nf_log_syslog vmw_vsock_vmci_transport xt_comment vsock nft_limit xt_limit xt_addrtype xt_tcpudp xt_conntrack nf_conntrack nf_defrag_ipv6 nf_defrag_ipv4 nft_compat nf_tables intel_rapl_msr intel_rapl_common nfnetlink binfmt_misc intel_uncore_frequency_common isst_if_mbox_msr isst_if_common skx_edac_common nfit libnvdimm ghash_clmulni_intel sha512_ssse3 sha256_ssse3 sha1_ssse3 aesni_intel snd_pcm crypto_simd cryptd snd_timer rapl snd soundcore vmw_balloon vmwgfx pcspkr drm_ttm_helper ttm drm_client_lib button ac drm_kms_helper sg vmw_vmci evdev joydev serio_raw drm loop efi_pstore configfs efivarfs ip_tables x_tables autofs4 overlay nls_ascii nls_cp437 vfat fat ext4 crc16 mbcache jbd2 squashfs dm_verity dm_bufio reed_solomon dm_mod
> [   26.639715]  sd_mod ata_generic mptspi mptscsih ata_piix mptbase libata scsi_transport_spi psmouse scsi_mod vmxnet3 i2c_piix4 i2c_smbus scsi_common
> [   26.639726] CPU: 5 UID: 0 PID: 0 Comm: swapper/5 Not tainted 6.14.2-CFSfixes #1
> [   26.639729] Hardware name: VMware, Inc. VMware7,1/440BX Desktop Reference Platform, BIOS VMW71.00V.24224532.B64.2408191458 08/19/2024
> [   26.639731] RIP: 0010:tg_unthrottle_up+0x1a6/0x3d0
> [   26.639735] Code: 00 00 48 39 ca 74 14 48 8b 52 10 49 8b 8e 58 01 00 00 48 39 8a 28 01 00 00 74 24 41 8b 86 68 01 00 00 85 c0 0f 84 8d fe ff ff <0f> 0b e9 86 fe ff ff 49 8b 9e 38 01 00 00 41 8b 86 40 01 00 00 48
> [   26.639737] RSP: 0000:ffffa5df8029cec8 EFLAGS: 00010002
> [   26.639739] RAX: 0000000000000001 RBX: ffff981c6fcb6a80 RCX: ffff981943752e40
> [   26.639741] RDX: 0000000000000005 RSI: ffff981c6fcb6a80 RDI: ffff981943752d00
> [   26.639742] RBP: ffff9819607dc708 R08: ffff981c6fcb6a80 R09: 0000000000000000
> [   26.639744] R10: 0000000000000001 R11: ffff981969936a10 R12: ffff9819607dc708
> [   26.639745] R13: ffff9819607dc9d8 R14: ffff9819607dc800 R15: ffffffffad913fb0
> [   26.639747] FS:  0000000000000000(0000) GS:ffff981c6fc80000(0000) knlGS:0000000000000000
> [   26.639749] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> [   26.639750] CR2: 00007ff1292dc44c CR3: 000000015350e006 CR4: 00000000007706f0
> [   26.639779] PKRU: 55555554
> [   26.639781] Call Trace:
> [   26.639783]  <IRQ>
> [   26.639787]  ? __pfx_tg_unthrottle_up+0x10/0x10
> [   26.639790]  ? __pfx_tg_nop+0x10/0x10
> [   26.639793]  walk_tg_tree_from+0x58/0xb0
> [   26.639797]  unthrottle_cfs_rq+0xf0/0x360
> [   26.639800]  ? sched_clock_cpu+0xf/0x190
> [   26.639808]  __cfsb_csd_unthrottle+0x11c/0x170
> [   26.639812]  ? __pfx___cfsb_csd_unthrottle+0x10/0x10
> [   26.639816]  __flush_smp_call_function_queue+0x103/0x410
> [   26.639822]  __sysvec_call_function_single+0x1c/0xb0
> [   26.639826]  sysvec_call_function_single+0x6c/0x90
> [   26.639832]  </IRQ>
> [   26.639833]  <TASK>
> [   26.639834]  asm_sysvec_call_function_single+0x1a/0x20
> [   26.639840] RIP: 0010:pv_native_safe_halt+0xf/0x20
> [   26.639844] Code: 22 d7 c3 cc cc cc cc 0f 1f 40 00 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 f3 0f 1e fa 66 90 0f 00 2d 45 c1 13 00 fb f4 <c3> cc cc cc cc 66 66 2e 0f 1f 84 00 00 00 00 00 90 90 90 90 90 90
> [   26.639846] RSP: 0000:ffffa5df80117ed8 EFLAGS: 00000242
> [   26.639848] RAX: 0000000000000005 RBX: ffff981940804000 RCX: ffff9819a9df7000
> [   26.639849] RDX: 0000000000000005 RSI: 0000000000000005 RDI: 000000000005c514
> [   26.639851] RBP: 0000000000000005 R08: 0000000000000000 R09: 0000000000000001
> [   26.639852] R10: 0000000000000001 R11: 0000000000000000 R12: 0000000000000000
> [   26.639853] R13: 0000000000000000 R14: 0000000000000000 R15: 0000000000000000
> [   26.639858]  default_idle+0x9/0x20
> [   26.639861]  default_idle_call+0x30/0x100
> [   26.639863]  do_idle+0x1fd/0x240
> [   26.639869]  cpu_startup_entry+0x29/0x30
> [   26.639872]  start_secondary+0x11e/0x140
> [   26.639875]  common_startup_64+0x13e/0x141
> [   26.639881]  </TASK>
> [   26.639882] ---[ end trace 0000000000000000 ]---
Re: [RFC PATCH v2 7/7] sched/fair: alternative way of accounting throttle time
Posted by Florian Bezdeka 9 months ago
On Wed, 2025-05-07 at 17:09 +0800, Aaron Lu wrote:
> Hi Florian,
> 
> On Thu, Apr 17, 2025 at 04:06:16PM +0200, Florian Bezdeka wrote:
> > Hi Aaron,
> > 
> > On Wed, 2025-04-09 at 20:07 +0800, Aaron Lu wrote:
> > > @@ -5889,27 +5943,21 @@ static int tg_unthrottle_up(struct task_group *tg, void *data)
> > >  	cfs_rq->throttled_clock_pelt_time += rq_clock_pelt(rq) -
> > >  		cfs_rq->throttled_clock_pelt;
> > >  
> > > -	if (cfs_rq->throttled_clock_self) {
> > > -		u64 delta = rq_clock(rq) - cfs_rq->throttled_clock_self;
> > > -
> > > -		cfs_rq->throttled_clock_self = 0;
> > > -
> > > -		if (WARN_ON_ONCE((s64)delta < 0))
> > > -			delta = 0;
> > > -
> > > -		cfs_rq->throttled_clock_self_time += delta;
> > > -	}
> > > +	if (cfs_rq->throttled_clock_self)
> > > +		account_cfs_rq_throttle_self(cfs_rq);
> > >  
> > >  	/* Re-enqueue the tasks that have been throttled at this level. */
> > >  	list_for_each_entry_safe(p, tmp, &cfs_rq->throttled_limbo_list, throttle_node) {
> > >  		list_del_init(&p->throttle_node);
> > > -		enqueue_task_fair(rq_of(cfs_rq), p, ENQUEUE_WAKEUP);
> > > +		enqueue_task_fair(rq_of(cfs_rq), p, ENQUEUE_WAKEUP | ENQUEUE_THROTTLE);
> > >  	}
> > >  
> > >  	/* Add cfs_rq with load or one or more already running entities to the list */
> > >  	if (!cfs_rq_is_decayed(cfs_rq))
> > >  		list_add_leaf_cfs_rq(cfs_rq);
> > >  
> > > +	WARN_ON_ONCE(cfs_rq->h_nr_throttled);
> > > +
> > >  	return 0;
> > >  }
> > >  
> > 
> > I got this warning while testing in our virtual environment:
> > 
> > Any idea?
> > 
> 
> I made a stupid mistake here: I thought when a cfs_rq gets unthrottled,
> it should have no tasks in throttled state, hence I added that check in
> tg_unthrottle_up():
>         WARN_ON_ONCE(cfs_rq->h_nr_throttled);
> 
> But h_nr_throttled tracks hierarchical throttled task number, which
> means if this cfs_rq has descendent cfs_rqs that are still in throttled
> state, its h_nr_throttled can be > 0 when it gets unthrottled.
> 
> I just made a setup to emulate this scenario and can reproduce this
> warning. I guess in your setup, there are multiple cpu.max settings in a
> cgroup hierarchy.

I will have a look.

> 
> It's just the warn_on_once() itself is incorrect, I'll remove it in next
> version, thanks for the report!

You're welcome. IOW: I can ignore the warning. Great.

I meanwhile forward ported the 5.15 based series that you provided to
6.1 and applied massive testing in our lab. It looks very promising up
to now. Our freeze seems solved now.

Thanks for your help! Very much appreciated!

We updated one device in the field today - at customer site. It will
take another week until I can report success. Let's hope.

The tests based on 6.14 are also looking good.

To sum up: This series fixes (or seems to fix, let's wait for one more
week to be sure) a critical RT issue. Is there a chance that once we
made it into mainline that we see (official) backports? 6.12 or 6.1
would be nice.

I could paste my 6.1 and 6.12 series, if that would help. But as there
will be at least one more iteration that work needs a refresh as well.

Best regards,
Florian
> 
> > [   26.639641] ------------[ cut here ]------------
> > [   26.639644] WARNING: CPU: 5 PID: 0 at kernel/sched/fair.c:5967 tg_unthrottle_up+0x1a6/0x3d0
> > [   26.639653] Modules linked in: veth xt_nat nft_chain_nat xt_MASQUERADE nf_nat nf_conntrack_netlink xfrm_user xfrm_algo br_netfilter bridge stp llc xt_recent rfkill ip6t_REJECT nf_reject_ipv6 xt_hl ip6t_rt vsock_loopback vmw_vsock_virtio_transport_common ipt_REJECT nf_reject_ipv4 xt_LOG nf_log_syslog vmw_vsock_vmci_transport xt_comment vsock nft_limit xt_limit xt_addrtype xt_tcpudp xt_conntrack nf_conntrack nf_defrag_ipv6 nf_defrag_ipv4 nft_compat nf_tables intel_rapl_msr intel_rapl_common nfnetlink binfmt_misc intel_uncore_frequency_common isst_if_mbox_msr isst_if_common skx_edac_common nfit libnvdimm ghash_clmulni_intel sha512_ssse3 sha256_ssse3 sha1_ssse3 aesni_intel snd_pcm crypto_simd cryptd snd_timer rapl snd soundcore vmw_balloon vmwgfx pcspkr drm_ttm_helper ttm drm_client_lib button ac drm_kms_helper sg vmw_vmci evdev joydev serio_raw drm loop efi_pstore configfs efivarfs ip_tables x_tables autofs4 overlay nls_ascii nls_cp437 vfat fat ext4 crc16 mbcache jbd2 squashfs dm_verity dm_bufio reed_solomon dm_mod
> > [   26.639715]  sd_mod ata_generic mptspi mptscsih ata_piix mptbase libata scsi_transport_spi psmouse scsi_mod vmxnet3 i2c_piix4 i2c_smbus scsi_common
> > [   26.639726] CPU: 5 UID: 0 PID: 0 Comm: swapper/5 Not tainted 6.14.2-CFSfixes #1
> > [   26.639729] Hardware name: VMware, Inc. VMware7,1/440BX Desktop Reference Platform, BIOS VMW71.00V.24224532.B64.2408191458 08/19/2024
> > [   26.639731] RIP: 0010:tg_unthrottle_up+0x1a6/0x3d0
> > [   26.639735] Code: 00 00 48 39 ca 74 14 48 8b 52 10 49 8b 8e 58 01 00 00 48 39 8a 28 01 00 00 74 24 41 8b 86 68 01 00 00 85 c0 0f 84 8d fe ff ff <0f> 0b e9 86 fe ff ff 49 8b 9e 38 01 00 00 41 8b 86 40 01 00 00 48
> > [   26.639737] RSP: 0000:ffffa5df8029cec8 EFLAGS: 00010002
> > [   26.639739] RAX: 0000000000000001 RBX: ffff981c6fcb6a80 RCX: ffff981943752e40
> > [   26.639741] RDX: 0000000000000005 RSI: ffff981c6fcb6a80 RDI: ffff981943752d00
> > [   26.639742] RBP: ffff9819607dc708 R08: ffff981c6fcb6a80 R09: 0000000000000000
> > [   26.639744] R10: 0000000000000001 R11: ffff981969936a10 R12: ffff9819607dc708
> > [   26.639745] R13: ffff9819607dc9d8 R14: ffff9819607dc800 R15: ffffffffad913fb0
> > [   26.639747] FS:  0000000000000000(0000) GS:ffff981c6fc80000(0000) knlGS:0000000000000000
> > [   26.639749] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> > [   26.639750] CR2: 00007ff1292dc44c CR3: 000000015350e006 CR4: 00000000007706f0
> > [   26.639779] PKRU: 55555554
> > [   26.639781] Call Trace:
> > [   26.639783]  <IRQ>
> > [   26.639787]  ? __pfx_tg_unthrottle_up+0x10/0x10
> > [   26.639790]  ? __pfx_tg_nop+0x10/0x10
> > [   26.639793]  walk_tg_tree_from+0x58/0xb0
> > [   26.639797]  unthrottle_cfs_rq+0xf0/0x360
> > [   26.639800]  ? sched_clock_cpu+0xf/0x190
> > [   26.639808]  __cfsb_csd_unthrottle+0x11c/0x170
> > [   26.639812]  ? __pfx___cfsb_csd_unthrottle+0x10/0x10
> > [   26.639816]  __flush_smp_call_function_queue+0x103/0x410
> > [   26.639822]  __sysvec_call_function_single+0x1c/0xb0
> > [   26.639826]  sysvec_call_function_single+0x6c/0x90
> > [   26.639832]  </IRQ>
> > [   26.639833]  <TASK>
> > [   26.639834]  asm_sysvec_call_function_single+0x1a/0x20
> > [   26.639840] RIP: 0010:pv_native_safe_halt+0xf/0x20
> > [   26.639844] Code: 22 d7 c3 cc cc cc cc 0f 1f 40 00 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 f3 0f 1e fa 66 90 0f 00 2d 45 c1 13 00 fb f4 <c3> cc cc cc cc 66 66 2e 0f 1f 84 00 00 00 00 00 90 90 90 90 90 90
> > [   26.639846] RSP: 0000:ffffa5df80117ed8 EFLAGS: 00000242
> > [   26.639848] RAX: 0000000000000005 RBX: ffff981940804000 RCX: ffff9819a9df7000
> > [   26.639849] RDX: 0000000000000005 RSI: 0000000000000005 RDI: 000000000005c514
> > [   26.639851] RBP: 0000000000000005 R08: 0000000000000000 R09: 0000000000000001
> > [   26.639852] R10: 0000000000000001 R11: 0000000000000000 R12: 0000000000000000
> > [   26.639853] R13: 0000000000000000 R14: 0000000000000000 R15: 0000000000000000
> > [   26.639858]  default_idle+0x9/0x20
> > [   26.639861]  default_idle_call+0x30/0x100
> > [   26.639863]  do_idle+0x1fd/0x240
> > [   26.639869]  cpu_startup_entry+0x29/0x30
> > [   26.639872]  start_secondary+0x11e/0x140
> > [   26.639875]  common_startup_64+0x13e/0x141
> > [   26.639881]  </TASK>
> > [   26.639882] ---[ end trace 0000000000000000 ]---
Re: [RFC PATCH v2 7/7] sched/fair: alternative way of accounting throttle time
Posted by Aaron Lu 9 months ago
On Wed, May 07, 2025 at 11:33:42AM +0200, Florian Bezdeka wrote:
> On Wed, 2025-05-07 at 17:09 +0800, Aaron Lu wrote:
... ...
> > It's just the warn_on_once() itself is incorrect, I'll remove it in next
> > version, thanks for the report!
> 
> You're welcome. IOW: I can ignore the warning. Great.
>

Right :-)

> I meanwhile forward ported the 5.15 based series that you provided to
> 6.1 and applied massive testing in our lab. It looks very promising up
> to now. Our freeze seems solved now.
> 

Good to know this.

> Thanks for you're help! Very much appreciated!
> 

You are welcome.

> We updated one device in the field today - at customer site. It will
> take another week until I can report success. Let's hope.
> 
> The tests based on 6.14 are also looking good.
> 
> To sum up: This series fixes (or seems to fix, let's wait for one more
> week to be sure) a critical RT issue. Is there a chance that once we
> made it into mainline that we see (official) backports? 6.12 or 6.1
> would be nice.

I don't think there will be official backports if this series entered
mainline because stable kernels only take fixes while this series changed
throttle behavior dramatically. Of course, this is just my personal
view, and the maintainer will make the final decision.

Thanks,
Aaron
Re: [RFC PATCH v2 7/7] sched/fair: alternative way of accounting throttle time
Posted by Jan Kiszka 9 months ago
On 08.05.25 04:45, Aaron Lu wrote:
> On Wed, May 07, 2025 at 11:33:42AM +0200, Florian Bezdeka wrote:
>> To sum up: This series fixes (or seems to fix, let's wait for one more
>> week to be sure) a critical RT issue. Is there a chance that once we
>> made it into mainline that we see (official) backports? 6.12 or 6.1
>> would be nice.
> 
> I don't think there will be official backports if this series entered
> mainline because stable kernels only take fixes while this series changed
> throttle behavior dramatically. Of course, this is just my personal
> view, and the maintainer will make the final decision.

With 6.12 carrying RT in-tree and this patch series fixing a hard
lock-up of that configuration, a backport to 6.12-stable would be
required IMHO. Backports beyond that should be a topic for the
(separate) rt-stable trees.

Jan

-- 
Siemens AG, Foundational Technologies
Linux Expert Center
Re: [RFC PATCH v2 7/7] sched/fair: alternative way of accounting throttle time
Posted by Steven Rostedt 9 months ago
On Thu, 8 May 2025 08:13:39 +0200
Jan Kiszka <jan.kiszka@siemens.com> wrote:

> On 08.05.25 04:45, Aaron Lu wrote:
> > On Wed, May 07, 2025 at 11:33:42AM +0200, Florian Bezdeka wrote:  
> >> To sum up: This series fixes (or seems to fix, let's wait for one more
> >> week to be sure) a critical RT issue. Is there a chance that once we
> >> made it into mainline that we see (official) backports? 6.12 or 6.1
> >> would be nice.  
> > 
> > I don't think there will be official backports if this series entered
> > mainline because stable kernels only take fixes while this series changed
> > throttle behavior dramatically. Of course, this is just my personal
> > view, and the maintainer will make the final decision.  
> 
> With 6.12 carrying RT in-tree and this patch series fixing a hard
> lock-up of that configuration, a backport to 6.12-stable would be
> required IMHO. Backports beyond that should be a topic for the
> (separate) rt-stable trees.
>

Agreed, and I'm adding the stable RT maintainers as well in case this needs
to go earlier than 6.12.

Thanks,

-- Steve
Re: [RFC PATCH v2 7/7] sched/fair: alternative way of accounting throttle time
Posted by Aaron Lu 9 months, 3 weeks ago
Hi Florian,

On Thu, Apr 17, 2025 at 04:06:16PM +0200, Florian Bezdeka wrote:
> Hi Aaron,
> 
> On Wed, 2025-04-09 at 20:07 +0800, Aaron Lu wrote:
> > @@ -5889,27 +5943,21 @@ static int tg_unthrottle_up(struct task_group *tg, void *data)
> >  	cfs_rq->throttled_clock_pelt_time += rq_clock_pelt(rq) -
> >  		cfs_rq->throttled_clock_pelt;
> >  
> > -	if (cfs_rq->throttled_clock_self) {
> > -		u64 delta = rq_clock(rq) - cfs_rq->throttled_clock_self;
> > -
> > -		cfs_rq->throttled_clock_self = 0;
> > -
> > -		if (WARN_ON_ONCE((s64)delta < 0))
> > -			delta = 0;
> > -
> > -		cfs_rq->throttled_clock_self_time += delta;
> > -	}
> > +	if (cfs_rq->throttled_clock_self)
> > +		account_cfs_rq_throttle_self(cfs_rq);
> >  
> >  	/* Re-enqueue the tasks that have been throttled at this level. */
> >  	list_for_each_entry_safe(p, tmp, &cfs_rq->throttled_limbo_list, throttle_node) {
> >  		list_del_init(&p->throttle_node);
> > -		enqueue_task_fair(rq_of(cfs_rq), p, ENQUEUE_WAKEUP);
> > +		enqueue_task_fair(rq_of(cfs_rq), p, ENQUEUE_WAKEUP | ENQUEUE_THROTTLE);
> >  	}
> >  
> >  	/* Add cfs_rq with load or one or more already running entities to the list */
> >  	if (!cfs_rq_is_decayed(cfs_rq))
> >  		list_add_leaf_cfs_rq(cfs_rq);
> >  
> > +	WARN_ON_ONCE(cfs_rq->h_nr_throttled);
> > +
> >  	return 0;
> >  }
> >  
> 
> I got this warning while testing in our virtual environment:

Thanks for the report.

> 
> Any idea?
>

Most likely the accounting of h_nr_throttled is incorrect somewhere.

> [   26.639641] ------------[ cut here ]------------
> [   26.639644] WARNING: CPU: 5 PID: 0 at kernel/sched/fair.c:5967 tg_unthrottle_up+0x1a6/0x3d0

The line doesn't match the code though, the below warning should be at
line 5959:
WARN_ON_ONCE(cfs_rq->h_nr_throttled); 

> [   26.639653] Modules linked in: veth xt_nat nft_chain_nat xt_MASQUERADE nf_nat nf_conntrack_netlink xfrm_user xfrm_algo br_netfilter bridge stp llc xt_recent rfkill ip6t_REJECT nf_reject_ipv6 xt_hl ip6t_rt vsock_loopback vmw_vsock_virtio_transport_common ipt_REJECT nf_reject_ipv4 xt_LOG nf_log_syslog vmw_vsock_vmci_transport xt_comment vsock nft_limit xt_limit xt_addrtype xt_tcpudp xt_conntrack nf_conntrack nf_defrag_ipv6 nf_defrag_ipv4 nft_compat nf_tables intel_rapl_msr intel_rapl_common nfnetlink binfmt_misc intel_uncore_frequency_common isst_if_mbox_msr isst_if_common skx_edac_common nfit libnvdimm ghash_clmulni_intel sha512_ssse3 sha256_ssse3 sha1_ssse3 aesni_intel snd_pcm crypto_simd cryptd snd_timer rapl snd soundcore vmw_balloon vmwgfx pcspkr drm_ttm_helper ttm drm_client_lib button ac drm_kms_helper sg vmw_vmci evdev joydev serio_raw drm loop efi_pstore configfs efivarfs ip_tables x_tables autofs4 overlay nls_ascii nls_cp437 vfat fat ext4 crc16 mbcache jbd2 squashfs dm_verity dm_bufio reed_solomon dm_mod
> [   26.639715]  sd_mod ata_generic mptspi mptscsih ata_piix mptbase libata scsi_transport_spi psmouse scsi_mod vmxnet3 i2c_piix4 i2c_smbus scsi_common
> [   26.639726] CPU: 5 UID: 0 PID: 0 Comm: swapper/5 Not tainted 6.14.2-CFSfixes #1

6.14.2-CFSfixes seems to be a backported kernel?
Do you also see this warning when using this series on top of the said
base commit 6432e163ba1b("sched/isolation: Make use of more than one
housekeeping cpu")? Just want to make sure it's not a problem due to
backport.

Thanks,
Aaron

> [   26.639729] Hardware name: VMware, Inc. VMware7,1/440BX Desktop Reference Platform, BIOS VMW71.00V.24224532.B64.2408191458 08/19/2024
> [   26.639731] RIP: 0010:tg_unthrottle_up+0x1a6/0x3d0
> [   26.639735] Code: 00 00 48 39 ca 74 14 48 8b 52 10 49 8b 8e 58 01 00 00 48 39 8a 28 01 00 00 74 24 41 8b 86 68 01 00 00 85 c0 0f 84 8d fe ff ff <0f> 0b e9 86 fe ff ff 49 8b 9e 38 01 00 00 41 8b 86 40 01 00 00 48
> [   26.639737] RSP: 0000:ffffa5df8029cec8 EFLAGS: 00010002
> [   26.639739] RAX: 0000000000000001 RBX: ffff981c6fcb6a80 RCX: ffff981943752e40
> [   26.639741] RDX: 0000000000000005 RSI: ffff981c6fcb6a80 RDI: ffff981943752d00
> [   26.639742] RBP: ffff9819607dc708 R08: ffff981c6fcb6a80 R09: 0000000000000000
> [   26.639744] R10: 0000000000000001 R11: ffff981969936a10 R12: ffff9819607dc708
> [   26.639745] R13: ffff9819607dc9d8 R14: ffff9819607dc800 R15: ffffffffad913fb0
> [   26.639747] FS:  0000000000000000(0000) GS:ffff981c6fc80000(0000) knlGS:0000000000000000
> [   26.639749] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> [   26.639750] CR2: 00007ff1292dc44c CR3: 000000015350e006 CR4: 00000000007706f0
> [   26.639779] PKRU: 55555554
> [   26.639781] Call Trace:
> [   26.639783]  <IRQ>
> [   26.639787]  ? __pfx_tg_unthrottle_up+0x10/0x10
> [   26.639790]  ? __pfx_tg_nop+0x10/0x10
> [   26.639793]  walk_tg_tree_from+0x58/0xb0
> [   26.639797]  unthrottle_cfs_rq+0xf0/0x360
> [   26.639800]  ? sched_clock_cpu+0xf/0x190
> [   26.639808]  __cfsb_csd_unthrottle+0x11c/0x170
> [   26.639812]  ? __pfx___cfsb_csd_unthrottle+0x10/0x10
> [   26.639816]  __flush_smp_call_function_queue+0x103/0x410
> [   26.639822]  __sysvec_call_function_single+0x1c/0xb0
> [   26.639826]  sysvec_call_function_single+0x6c/0x90
> [   26.639832]  </IRQ>
> [   26.639833]  <TASK>
> [   26.639834]  asm_sysvec_call_function_single+0x1a/0x20
> [   26.639840] RIP: 0010:pv_native_safe_halt+0xf/0x20
> [   26.639844] Code: 22 d7 c3 cc cc cc cc 0f 1f 40 00 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 f3 0f 1e fa 66 90 0f 00 2d 45 c1 13 00 fb f4 <c3> cc cc cc cc 66 66 2e 0f 1f 84 00 00 00 00 00 90 90 90 90 90 90
> [   26.639846] RSP: 0000:ffffa5df80117ed8 EFLAGS: 00000242
> [   26.639848] RAX: 0000000000000005 RBX: ffff981940804000 RCX: ffff9819a9df7000
> [   26.639849] RDX: 0000000000000005 RSI: 0000000000000005 RDI: 000000000005c514
> [   26.639851] RBP: 0000000000000005 R08: 0000000000000000 R09: 0000000000000001
> [   26.639852] R10: 0000000000000001 R11: 0000000000000000 R12: 0000000000000000
> [   26.639853] R13: 0000000000000000 R14: 0000000000000000 R15: 0000000000000000
> [   26.639858]  default_idle+0x9/0x20
> [   26.639861]  default_idle_call+0x30/0x100
> [   26.639863]  do_idle+0x1fd/0x240
> [   26.639869]  cpu_startup_entry+0x29/0x30
> [   26.639872]  start_secondary+0x11e/0x140
> [   26.639875]  common_startup_64+0x13e/0x141
> [   26.639881]  </TASK>
> [   26.639882] ---[ end trace 0000000000000000 ]---
Re: [RFC PATCH v2 7/7] sched/fair: alternative way of accounting throttle time
Posted by Florian Bezdeka 9 months, 2 weeks ago
On Fri, 2025-04-18 at 11:15 +0800, Aaron Lu wrote:
> Hi Florian,
> 
> On Thu, Apr 17, 2025 at 04:06:16PM +0200, Florian Bezdeka wrote:
> > Hi Aaron,
> > 
> > On Wed, 2025-04-09 at 20:07 +0800, Aaron Lu wrote:
> > > @@ -5889,27 +5943,21 @@ static int tg_unthrottle_up(struct task_group *tg, void *data)
> > >  	cfs_rq->throttled_clock_pelt_time += rq_clock_pelt(rq) -
> > >  		cfs_rq->throttled_clock_pelt;
> > >  
> > > -	if (cfs_rq->throttled_clock_self) {
> > > -		u64 delta = rq_clock(rq) - cfs_rq->throttled_clock_self;
> > > -
> > > -		cfs_rq->throttled_clock_self = 0;
> > > -
> > > -		if (WARN_ON_ONCE((s64)delta < 0))
> > > -			delta = 0;
> > > -
> > > -		cfs_rq->throttled_clock_self_time += delta;
> > > -	}
> > > +	if (cfs_rq->throttled_clock_self)
> > > +		account_cfs_rq_throttle_self(cfs_rq);
> > >  
> > >  	/* Re-enqueue the tasks that have been throttled at this level. */
> > >  	list_for_each_entry_safe(p, tmp, &cfs_rq->throttled_limbo_list, throttle_node) {
> > >  		list_del_init(&p->throttle_node);
> > > -		enqueue_task_fair(rq_of(cfs_rq), p, ENQUEUE_WAKEUP);
> > > +		enqueue_task_fair(rq_of(cfs_rq), p, ENQUEUE_WAKEUP | ENQUEUE_THROTTLE);
> > >  	}
> > >  
> > >  	/* Add cfs_rq with load or one or more already running entities to the list */
> > >  	if (!cfs_rq_is_decayed(cfs_rq))
> > >  		list_add_leaf_cfs_rq(cfs_rq);
> > >  
> > > +	WARN_ON_ONCE(cfs_rq->h_nr_throttled);
> > > +
> > >  	return 0;
> > >  }
> > >  
> > 
> > I got this warning while testing in our virtual environment:
> 
> Thanks for the report.
> 
> > 
> > Any idea?
> > 
> 
> Most likely the accounting of h_nr_throttled is incorrect somewhere.
> 
> > [   26.639641] ------------[ cut here ]------------
> > [   26.639644] WARNING: CPU: 5 PID: 0 at kernel/sched/fair.c:5967 tg_unthrottle_up+0x1a6/0x3d0
> 
> The line doesn't match the code though, the below warning should be at
> line 5959:
> WARN_ON_ONCE(cfs_rq->h_nr_throttled);

See below.

> 
> > [   26.639653] Modules linked in: veth xt_nat nft_chain_nat xt_MASQUERADE nf_nat nf_conntrack_netlink xfrm_user xfrm_algo br_netfilter bridge stp llc xt_recent rfkill ip6t_REJECT nf_reject_ipv6 xt_hl ip6t_rt vsock_loopback vmw_vsock_virtio_transport_common ipt_REJECT nf_reject_ipv4 xt_LOG nf_log_syslog vmw_vsock_vmci_transport xt_comment vsock nft_limit xt_limit xt_addrtype xt_tcpudp xt_conntrack nf_conntrack nf_defrag_ipv6 nf_defrag_ipv4 nft_compat nf_tables intel_rapl_msr intel_rapl_common nfnetlink binfmt_misc intel_uncore_frequency_common isst_if_mbox_msr isst_if_common skx_edac_common nfit libnvdimm ghash_clmulni_intel sha512_ssse3 sha256_ssse3 sha1_ssse3 aesni_intel snd_pcm crypto_simd cryptd snd_timer rapl snd soundcore vmw_balloon vmwgfx pcspkr drm_ttm_helper ttm drm_client_lib button ac drm_kms_helper sg vmw_vmci evdev joydev serio_raw drm loop efi_pstore configfs efivarfs ip_tables x_tables autofs4 overlay nls_ascii nls_cp437 vfat fat ext4 crc16 mbcache jbd2 squashfs dm_verity dm_bufio reed_solomon dm_mod
> > [   26.639715]  sd_mod ata_generic mptspi mptscsih ata_piix mptbase libata scsi_transport_spi psmouse scsi_mod vmxnet3 i2c_piix4 i2c_smbus scsi_common
> > [   26.639726] CPU: 5 UID: 0 PID: 0 Comm: swapper/5 Not tainted 6.14.2-CFSfixes #1
> 
> 6.14.2-CFSfixes seems to be a backported kernel?
> Do you also see this warning when using this series on top of the said
> base commit 6432e163ba1b("sched/isolation: Make use of more than one
> housekeeping cpu")? Just want to make sure it's not a problem due to
> backport.

Right, I should have mentioned that crucial detail. Sorry.

I ported your series to 6.14.2 because we did/do not trust anything
newer yet for testing. The problematic workload was not available in
our lab at that time, so we had to be very careful about deployed
kernel versions.

I'm attaching the backported patches now, so you can compare / review
if you like. Spoiler: The only differences are line numbers ;-)


Best regards,
Florian

Re: [RFC PATCH v2 7/7] sched/fair: alternative way of accounting throttle time
Posted by Aaron Lu 9 months, 2 weeks ago
On Tue, Apr 22, 2025 at 05:03:19PM +0200, Florian Bezdeka wrote:
... ...

> Right, I should have mentioned that crucial detail. Sorry.
> 
> I ported your series to 6.14.2 because we did/do not trust anything
> newer yet for testing. The problematic workload was not available in
> our lab at that time, so we had to be very careful about deployed
> kernel versions.
> 
> I'm attaching the backported patches now, so you can compare / review
> if you like. Spoiler: The only differences are line numbers ;-)

I didn't notice any problem regarding backport after a quick look.

May I know what kind of workload triggered this warning? I haven't been
able to trigger it, I'll have to stare harder at the code.

Thanks,
Aaron
Re: [RFC PATCH v2 7/7] sched/fair: alternative way of accounting throttle time
Posted by Florian Bezdeka 9 months, 2 weeks ago
On Wed, 2025-04-23 at 19:26 +0800, Aaron Lu wrote:
> On Tue, Apr 22, 2025 at 05:03:19PM +0200, Florian Bezdeka wrote:
> ... ...
> 
> > Right, I should have mentioned that crucial detail. Sorry.
> > 
> > I ported your series to 6.14.2 because we did/do not trust anything
> > newer yet for testing. The problematic workload was not available in
> > our lab at that time, so we had to be very careful about deployed
> > kernel versions.
> > 
> > I'm attaching the backported patches now, so you can compare / review
> > if you like. Spoiler: The only differences are line numbers ;-)
> 
> I didn't notice any problem regarding backport after a quick look.
> 
> May I know what kind of workload triggered this warning? I haven't been
> able to trigger it, I'll have to stare harder at the code.

There are a couple of containers running. Nothing special as far as I
can tell. Network, IO, at least one container heavily using the epoll
interface.

The system is still operating fine though...

Once again: PREEMPT_RT enabled, so maybe handling an IRQ over the
accounting code could happen? Looking at the warning again it looks
like unthrottle_cfs_rq() is called from IRQ context. Is that expected?

Best regards,
Florian
Re: [RFC PATCH v2 7/7] sched/fair: alternative way of accounting throttle time
Posted by Aaron Lu 9 months, 2 weeks ago
On Wed, Apr 23, 2025 at 02:15:55PM +0200, Florian Bezdeka wrote:
> On Wed, 2025-04-23 at 19:26 +0800, Aaron Lu wrote:
> > On Tue, Apr 22, 2025 at 05:03:19PM +0200, Florian Bezdeka wrote:
> > ... ...
> > 
> > > Right, I should have mentioned that crucial detail. Sorry.
> > > 
> > > I ported your series to 6.14.2 because we did/do not trust anything
> > > newer yet for testing. The problematic workload was not available in
> > > our lab at that time, so we had to be very careful about deployed
> > > kernel versions.
> > > 
> > > I'm attaching the backported patches now, so you can compare / review
> > > if you like. Spoiler: The only differences are line numbers ;-)
> > 
> > I didn't notice any problem regarding backport after a quick look.
> > 
> > May I know what kind of workload triggered this warning? I haven't been
> > able to trigger it, I'll have to stare harder at the code.
> 
> There are a couple of containers running. Nothing special as far as I
> can tell. Network, IO, at least one container heavily using the epoll
> interface.

Thanks for the info, I'll run with PREEMPT_RT enabled and see if I can
find anything.

> 
> The system is still operating fine though...
>

So that means only the h_nr_throttled accounting is incorrect. The throttle
time accounting will be affected but looks like the functionality is OK.

> Once again: PREEMPT_RT enabled, so maybe handling an IRQ over the
> accounting code could happen? Looking at the warning again it looks
> like unthrottle_cfs_rq() is called from IRQ context. Is that expected?

Yes it is.

The period timer handler will distribute runtime to individual
cfs_rqs of this task_group and those cfs_rqs are per-cpu. The timer
handler did this asynchronously, i.e. it sends IPI to corresponding CPU
to let them deal with unthrottling their cfs_rq by their own, to reduce
the time this timer handler runs. See commit 8ad075c2eb1f("sched: Async
unthrottling for cfs bandwidth").

I think this creates an interesting result in PREEMPT_RT: the CPU that
runs the hrtimer handler unthrottles its cfs_rq in ktimerd context while
all others unthrottle their cfs_rqs in hardirq context. I don't see any
problem with this, it just seems inconsistent.

Thanks,
Aaron
Re: [RFC PATCH v2 7/7] sched/fair: alternative way of accounting throttle time
Posted by Aaron Lu 10 months ago
On Wed, Apr 09, 2025 at 08:07:46PM +0800, Aaron Lu wrote:
> Implement an alternative way of accounting cfs_rq throttle time which:
> - starts accounting when a throttled cfs_rq has no tasks enqueued and its
>   throttled list is not empty;
> - stops accounting when this cfs_rq gets unthrottled or a task gets
>   enqueued.
> 
> This way, the accounted throttle time is when the cfs_rq has absolutely
> no tasks enqueued and has tasks throttled.
> 
> Signed-off-by: Aaron Lu <ziqianlu@bytedance.com>
> ---
>  kernel/sched/fair.c  | 112 ++++++++++++++++++++++++++++++++-----------
>  kernel/sched/sched.h |   4 ++
>  2 files changed, 89 insertions(+), 27 deletions(-)
> 
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 20471a3aa35e6..70f7de82d1d9d 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -5300,6 +5300,7 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
>  
>  static void check_enqueue_throttle(struct cfs_rq *cfs_rq);
>  static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq);
> +static void account_cfs_rq_throttle_self(struct cfs_rq *cfs_rq);
>  
>  static void
>  requeue_delayed_entity(struct sched_entity *se);
> @@ -5362,10 +5363,14 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
>  		if (throttled_hierarchy(cfs_rq)) {
>  			struct rq *rq = rq_of(cfs_rq);
>  
> -			if (cfs_rq_throttled(cfs_rq) && !cfs_rq->throttled_clock)
> -				cfs_rq->throttled_clock = rq_clock(rq);
> -			if (!cfs_rq->throttled_clock_self)
> -				cfs_rq->throttled_clock_self = rq_clock(rq);
> +			if (cfs_rq->throttled_clock) {
> +				cfs_rq->throttled_time +=
> +					rq_clock(rq) - cfs_rq->throttled_clock;
> +				cfs_rq->throttled_clock = 0;
> +			}

This probably needs more explanation.

We can also take cfs_b->lock and directly accounts the time into
cfs_b->throttled_time, but considering enqueue can be frequent so to
avoid possible lock contention, I chose to account this time to the cpu
local cfs_rq and on unthrottle, add the local accounted time to
cfs_b->throttled_time.

This has a side effect though: when reading cpu.stat and cpu.stat.local
for a task group with quota setting, the throttled_usec in cpu.stat can
be slightly smaller than throttled_usec in cpu.stat.local since some
throttled time is not accounted to cfs_b yet...

> +
> +			if (cfs_rq->throttled_clock_self)
> +				account_cfs_rq_throttle_self(cfs_rq);
>  		}
>  #endif
>  	}