[PATCH v4 2/6] sched_ext: Implement scx_rq_clock_update/stale()

Posted by Changwoo Min 1 year ago
scx_rq_clock_update() and scx_rq_clock_stale() manage the validity of an
rq clock when sched_ext is enabled. scx_rq_clock_update() caches the rq
clock in memory and marks it valid. scx_rq_clock_stale() invalidates the
cached rq clock so that it will not be used until the next update.
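
As a rough usage sketch (illustrative only -- the actual call sites are
wired up later in this series, and the particular update/invalidation
points shown here are assumptions, not part of this patch):

	/*
	 * While holding the rq lock, cache a freshly computed clock
	 * and mark it valid ...
	 */
	scx_rq_clock_update(rq, sched_clock_cpu(cpu_of(rq)));

	/*
	 * ... and once the rq lock may be released, mark the cached
	 * clock stale so it is not trusted across the unlock.
	 */
	scx_rq_clock_stale(rq);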

Signed-off-by: Changwoo Min <changwoo@igalia.com>
---
 kernel/sched/sched.h | 24 +++++++++++++++++++++++-
 1 file changed, 23 insertions(+), 1 deletion(-)

diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 440ecedf871b..7e71d8685fcc 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -754,6 +754,7 @@ enum scx_rq_flags {
 	SCX_RQ_BAL_PENDING	= 1 << 2, /* balance hasn't run yet */
 	SCX_RQ_BAL_KEEP		= 1 << 3, /* balance decided to keep current */
 	SCX_RQ_BYPASSING	= 1 << 4,
+	SCX_RQ_CLK_VALID	= 1 << 5, /* RQ clock is fresh and valid */
 
 	SCX_RQ_IN_WAKEUP	= 1 << 16,
 	SCX_RQ_IN_BALANCE	= 1 << 17,
@@ -766,9 +767,11 @@ struct scx_rq {
 	unsigned long		ops_qseq;
 	u64			extra_enq_flags;	/* see move_task_to_local_dsq() */
 	u32			nr_running;
-	u32			flags;
 	u32			cpuperf_target;		/* [0, SCHED_CAPACITY_SCALE] */
 	bool			cpu_released;
+	u32			flags;
+	u64			clock;			/* current per-rq clock -- see scx_bpf_now_ns() */
+	u64			prev_clock;		/* previous per-rq clock -- see scx_bpf_now_ns() */
 	cpumask_var_t		cpus_to_kick;
 	cpumask_var_t		cpus_to_kick_if_idle;
 	cpumask_var_t		cpus_to_preempt;
@@ -1725,9 +1728,28 @@ DECLARE_STATIC_KEY_FALSE(__scx_switched_all);	/* all fair class tasks on SCX */
 
 #define scx_enabled()		static_branch_unlikely(&__scx_ops_enabled)
 #define scx_switched_all()	static_branch_unlikely(&__scx_switched_all)
+
+static inline void scx_rq_clock_update(struct rq *rq, u64 clock)
+{
+	if (scx_enabled()) {
+		rq->scx.prev_clock = rq->scx.clock;
+		rq->scx.clock = clock;
+		rq->scx.flags |= SCX_RQ_CLK_VALID;
+	}
+}
+
+static inline void scx_rq_clock_stale(struct rq *rq)
+{
+	if (scx_enabled())
+		rq->scx.flags &= ~SCX_RQ_CLK_VALID;
+}
+
 #else /* !CONFIG_SCHED_CLASS_EXT */
 #define scx_enabled()		false
 #define scx_switched_all()	false
+
+static inline void scx_rq_clock_update(struct rq *rq, u64 clock) {}
+static inline void scx_rq_clock_stale(struct rq *rq) {}
 #endif /* !CONFIG_SCHED_CLASS_EXT */
 
 /*
-- 
2.47.1
Re: [PATCH v4 2/6] sched_ext: Implement scx_rq_clock_update/stale()
Posted by Andrea Righi 1 year ago
On Mon, Dec 09, 2024 at 03:15:27PM +0900, Changwoo Min wrote:
> scx_rq_clock_update() and scx_rq_clock_stale() manage the validity of an
> rq clock when sched_ext is enabled. scx_rq_clock_update() caches the rq
> clock in memory and marks it valid. scx_rq_clock_stale() invalidates the
> cached rq clock so that it will not be used until the next update.
> 
> Signed-off-by: Changwoo Min <changwoo@igalia.com>
> ---
>  kernel/sched/sched.h | 24 +++++++++++++++++++++++-
>  1 file changed, 23 insertions(+), 1 deletion(-)
> 
> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> index 440ecedf871b..7e71d8685fcc 100644
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -754,6 +754,7 @@ enum scx_rq_flags {
>  	SCX_RQ_BAL_PENDING	= 1 << 2, /* balance hasn't run yet */
>  	SCX_RQ_BAL_KEEP		= 1 << 3, /* balance decided to keep current */
>  	SCX_RQ_BYPASSING	= 1 << 4,
> +	SCX_RQ_CLK_VALID	= 1 << 5, /* RQ clock is fresh and valid */
>  
>  	SCX_RQ_IN_WAKEUP	= 1 << 16,
>  	SCX_RQ_IN_BALANCE	= 1 << 17,
> @@ -766,9 +767,11 @@ struct scx_rq {
>  	unsigned long		ops_qseq;
>  	u64			extra_enq_flags;	/* see move_task_to_local_dsq() */
>  	u32			nr_running;
> -	u32			flags;
>  	u32			cpuperf_target;		/* [0, SCHED_CAPACITY_SCALE] */
>  	bool			cpu_released;
> +	u32			flags;
> +	u64			clock;			/* current per-rq clock -- see scx_bpf_now_ns() */
> +	u64			prev_clock;		/* previous per-rq clock -- see scx_bpf_now_ns() */

Since we're reordering this struct, we may want to move cpu_released all
the way to the bottom to get rid of the 3-byte hole (and still have
flags, clock, and prev_clock in the same cacheline).

>  	cpumask_var_t		cpus_to_kick;
>  	cpumask_var_t		cpus_to_kick_if_idle;
>  	cpumask_var_t		cpus_to_preempt;
> @@ -1725,9 +1728,28 @@ DECLARE_STATIC_KEY_FALSE(__scx_switched_all);	/* all fair class tasks on SCX */
>  
>  #define scx_enabled()		static_branch_unlikely(&__scx_ops_enabled)
>  #define scx_switched_all()	static_branch_unlikely(&__scx_switched_all)
> +
> +static inline void scx_rq_clock_update(struct rq *rq, u64 clock)
> +{
> +	if (scx_enabled()) {
> +		rq->scx.prev_clock = rq->scx.clock;
> +		rq->scx.clock = clock;
> +		rq->scx.flags |= SCX_RQ_CLK_VALID;
> +	}
> +}

Nit, this is just personal preference (feel free to ignore it):

	if (!scx_enabled())
		return;
	rq->scx.prev_clock = rq->scx.clock;
	rq->scx.clock = clock;
	rq->scx.flags |= SCX_RQ_CLK_VALID;

> +
> +static inline void scx_rq_clock_stale(struct rq *rq)
> +{
> +	if (scx_enabled())
> +		rq->scx.flags &= ~SCX_RQ_CLK_VALID;
> +}

I'm wondering if we need to invalidate the clock on all rqs when we call
scx_ops_enable() to prevent getting stale information from a previous
scx scheduler.

Probably it's not an issue, since scx_ops_disable_workfn() should make
sure that all tasks go through rq_unpin_lock() before the current
scheduler is unloaded, but maybe it would be helpful to add a comment
about this scenario in scx_bpf_now_ns() (PATCH 4/6)?

> +
>  #else /* !CONFIG_SCHED_CLASS_EXT */
>  #define scx_enabled()		false
>  #define scx_switched_all()	false
> +
> +static inline void scx_rq_clock_update(struct rq *rq, u64 clock) {}
> +static inline void scx_rq_clock_stale(struct rq *rq) {}
>  #endif /* !CONFIG_SCHED_CLASS_EXT */
>  
>  /*
> -- 
> 2.47.1
> 

Thanks,
-Andrea
Re: [PATCH v4 2/6] sched_ext: Implement scx_rq_clock_update/stale()
Posted by Changwoo Min 1 year ago
Hello Andrea,

Thank you for the review.

On 24. 12. 9. 18:40, Andrea Righi wrote:
>> @@ -766,9 +767,11 @@ struct scx_rq {
>>   	unsigned long		ops_qseq;
>>   	u64			extra_enq_flags;	/* see move_task_to_local_dsq() */
>>   	u32			nr_running;
>> -	u32			flags;
>>   	u32			cpuperf_target;		/* [0, SCHED_CAPACITY_SCALE] */
>>   	bool			cpu_released;
>> +	u32			flags;
>> +	u64			clock;			/* current per-rq clock -- see scx_bpf_now_ns() */
>> +	u64			prev_clock;		/* previous per-rq clock -- see scx_bpf_now_ns() */
> 
> Since we're reordering this struct, we may want to move cpu_released all
> the way to the bottom to get rid of the 3-byte hole (and still have
> flags, clock, and prev_clock in the same cacheline).

We'd better keep the layout as it is, because moving cpu_released to
the end of the struct creates a 4-byte hole between flags and clock and
7 bytes of padding at the end after cpu_released. I double-checked the
two layouts using pahole.
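
For reference, the relevant part of the two layouts looks roughly like
this (hand-annotated in pahole style; offsets assume a 64-bit build and
the annotations are illustrative, not actual pahole output):

	/* layout in this patch: */
	u32	nr_running;	/* 4 bytes			*/
	u32	cpuperf_target;	/* 4 bytes			*/
	bool	cpu_released;	/* 1 byte			*/
				/* 3-byte hole			*/
	u32	flags;		/* 4 bytes			*/
	u64	clock;		/* 8 bytes, naturally aligned	*/
	u64	prev_clock;	/* 8 bytes			*/

	/* with cpu_released moved to the bottom instead: */
	u32	nr_running;	/* 4 bytes			*/
	u32	cpuperf_target;	/* 4 bytes			*/
	u32	flags;		/* 4 bytes			*/
				/* 4-byte hole (u64 alignment)	*/
	u64	clock;		/* 8 bytes			*/
	u64	prev_clock;	/* 8 bytes			*/
	...
	bool	cpu_released;	/* 1 byte			*/
				/* 7 bytes of tail padding	*/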


> Nit, this is just personal preference (feel free to ignore it):
> 
> 	if (!scx_enabled())
> 		return;
> 	rq->scx.prev_clock = rq->scx.clock;
> 	rq->scx.clock = clock;
> 	rq->scx.flags |= SCX_RQ_CLK_VALID;
> 
That's prettier. I will change it as you suggested.


> I'm wondering if we need to invalidate the clock on all rqs when we call
> scx_ops_enable() to prevent getting stale information from a previous
> scx scheduler.
> 
> Probably it's not an issue, since scx_ops_disable_workfn() should make
> sure that all tasks go through rq_unpin_lock() before the current
> scheduler is unloaded, but maybe it would be helpful to add a comment
> about this scenario in scx_bpf_now_ns() (PATCH 4/6)?

That's a good catch. In theory, there is a possibility that an scx_rq
is not invalidated when sched_ext is unloaded. Since
scx_ops_disable_workfn() iterates over all the sched_ext tasks, an rq
would not be invalidated if there is no scx task on it. I will add code
that iterates over and invalidates all scx_rqs in
scx_ops_disable_workfn() in the next version.
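
Something along these lines, as a minimal sketch (the helper name and
its placement in scx_ops_disable_workfn() are hypothetical, and the
locking shown is an assumption rather than the final form):

	static void scx_clocks_invalidate_all(void)
	{
		int cpu;

		for_each_possible_cpu(cpu) {
			struct rq *rq = cpu_rq(cpu);
			struct rq_flags rf;

			/*
			 * Take the rq lock so the flag update cannot
			 * race with a concurrent clock update, then
			 * drop the VALID bit.
			 */
			rq_lock_irqsave(rq, &rf);
			scx_rq_clock_stale(rq);
			rq_unlock_irqrestore(rq, &rf);
		}
	}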

Thank you again!
Changwoo Min