[patch V2 08/12] rseq: Implement time slice extension enforcement timer

Thomas Gleixner posted 12 patches 3 months, 2 weeks ago
There is a newer version of this series
[patch V2 08/12] rseq: Implement time slice extension enforcement timer
Posted by Thomas Gleixner 3 months, 2 weeks ago
If a time slice extension is granted and the reschedule delayed, the kernel
has to ensure that user space cannot abuse the extension and exceed the
maximum granted time.

It was suggested to implement this via the existing hrtick() timer in the
scheduler, but that turned out to be problematic for several reasons:

   1) It creates a dependency on CONFIG_SCHED_HRTICK, which can be disabled
      independently of CONFIG_HIGHRES_TIMERS

   2) HRTICK usage in the scheduler can be runtime disabled or is only used
      for certain aspects of scheduling.

   3) The function calls into scheduler code, and that might have
      unexpected consequences when it is invoked due to a time slice
      enforcement expiry. Especially when the task managed to clear the
      grant via sched_yield().

It would be possible to address #2 and #3 by storing state in the
scheduler, but that is extra complexity and fragility for no value.

Implement a dedicated per CPU hrtimer instead, which is solely used for the
purpose of time slice enforcement.

The timer is armed when an extension has been granted, right before
actually returning to user mode in rseq_exit_to_user_mode_restart().

It is disarmed when the task relinquishes the CPU. This is expensive as
the timer is probably the first expiring timer on the CPU, which means it
has to reprogram the hardware. But that's less expensive than going through
a full hrtimer interrupt cycle for nothing.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: "Paul E. McKenney" <paulmck@kernel.org>
Cc: Boqun Feng <boqun.feng@gmail.com>
---
 include/linux/rseq_entry.h |   38 ++++++++++----
 include/linux/rseq_types.h |    2 
 kernel/rseq.c              |  119 ++++++++++++++++++++++++++++++++++++++++++++-
 3 files changed, 147 insertions(+), 12 deletions(-)

--- a/include/linux/rseq_entry.h
+++ b/include/linux/rseq_entry.h
@@ -86,8 +86,24 @@ static __always_inline bool rseq_slice_e
 {
 	return static_branch_likely(&rseq_slice_extension_key);
 }
+
+extern unsigned int rseq_slice_ext_nsecs;
+bool __rseq_arm_slice_extension_timer(void);
+
+static __always_inline bool rseq_arm_slice_extension_timer(void)
+{
+	if (!rseq_slice_extension_enabled())
+		return false;
+
+	if (likely(!current->rseq.slice.state.granted))
+		return false;
+
+	return __rseq_arm_slice_extension_timer();
+}
+
 #else /* CONFIG_RSEQ_SLICE_EXTENSION */
 static inline bool rseq_slice_extension_enabled(void) { return false; }
+static inline bool rseq_arm_slice_extension_timer(void) { return false; }
 #endif /* !CONFIG_RSEQ_SLICE_EXTENSION */
 
 bool rseq_debug_update_user_cs(struct task_struct *t, struct pt_regs *regs, unsigned long csaddr);
@@ -543,17 +559,19 @@ static __always_inline void clear_tif_rs
 static __always_inline bool
 rseq_exit_to_user_mode_restart(struct pt_regs *regs, unsigned long ti_work)
 {
-	if (likely(!test_tif_rseq(ti_work)))
-		return false;
-
-	if (unlikely(__rseq_exit_to_user_mode_restart(regs))) {
-		current->rseq.event.slowpath = true;
-		set_tsk_thread_flag(current, TIF_NOTIFY_RESUME);
-		return true;
+	if (unlikely(test_tif_rseq(ti_work))) {
+		if (unlikely(__rseq_exit_to_user_mode_restart(regs))) {
+			current->rseq.event.slowpath = true;
+			set_tsk_thread_flag(current, TIF_NOTIFY_RESUME);
+			return true;
+		}
+		clear_tif_rseq();
 	}
-
-	clear_tif_rseq();
-	return false;
+	/*
+	 * Arm the slice extension timer if nothing to do anymore and the
+	 * task really goes out to user space.
+	 */
+	return rseq_arm_slice_extension_timer();
 }
 
 #endif /* CONFIG_GENERIC_ENTRY */
--- a/include/linux/rseq_types.h
+++ b/include/linux/rseq_types.h
@@ -89,9 +89,11 @@ union rseq_slice_state {
 /**
  * struct rseq_slice - Status information for rseq time slice extension
  * @state:	Time slice extension state
+ * @expires:	The time when a grant expires
  */
 struct rseq_slice {
 	union rseq_slice_state	state;
+	u64			expires;
 };
 
 /**
--- a/kernel/rseq.c
+++ b/kernel/rseq.c
@@ -71,6 +71,8 @@
 #define RSEQ_BUILD_SLOW_PATH
 
 #include <linux/debugfs.h>
+#include <linux/hrtimer.h>
+#include <linux/percpu.h>
 #include <linux/prctl.h>
 #include <linux/ratelimit.h>
 #include <linux/rseq_entry.h>
@@ -500,8 +502,82 @@ SYSCALL_DEFINE4(rseq, struct rseq __user
 }
 
 #ifdef CONFIG_RSEQ_SLICE_EXTENSION
+struct slice_timer {
+	struct hrtimer	timer;
+	void		*cookie;
+};
+
+unsigned int rseq_slice_ext_nsecs __read_mostly = 30 * NSEC_PER_USEC;
+static DEFINE_PER_CPU(struct slice_timer, slice_timer);
 DEFINE_STATIC_KEY_TRUE(rseq_slice_extension_key);
 
+static enum hrtimer_restart rseq_slice_expired(struct hrtimer *tmr)
+{
+	struct slice_timer *st = container_of(tmr, struct slice_timer, timer);
+
+	if (st->cookie == current && current->rseq.slice.state.granted) {
+		rseq_stat_inc(rseq_stats.s_expired);
+		set_need_resched_current();
+	}
+	return HRTIMER_NORESTART;
+}
+
+bool __rseq_arm_slice_extension_timer(void)
+{
+	struct slice_timer *st = this_cpu_ptr(&slice_timer);
+	struct task_struct *curr = current;
+
+	lockdep_assert_irqs_disabled();
+
+	/*
+	 * This check prevents that a granted time slice extension exceeds
+	 * the maximum scheduling latency when the grant expired before
+	 * going out to user space. Don't bother to clear the grant here,
+	 * it will be cleaned up automatically before going out to user
+	 * space.
+	 */
+	if ((unlikely(curr->rseq.slice.expires < ktime_get_mono_fast_ns()))) {
+		set_need_resched_current();
+		return true;
+	}
+
+	/*
+	 * Store the task pointer as a cookie for comparison in the timer
+	 * function. This is safe as the timer is CPU local and cannot be
+	 * in the expiry function at this point.
+	 */
+	st->cookie = curr;
+	hrtimer_start(&st->timer, curr->rseq.slice.expires, HRTIMER_MODE_ABS_PINNED_HARD);
+	/* Arm the syscall entry work */
+	set_task_syscall_work(curr, SYSCALL_RSEQ_SLICE);
+	return false;
+}
+
+static void rseq_cancel_slice_extension_timer(void)
+{
+	struct slice_timer *st = this_cpu_ptr(&slice_timer);
+
+	/*
+	 * st->cookie can be safely read as preemption is disabled and the
+	 * timer is CPU local. The active check can obviously race with the
+	 * hrtimer interrupt, but that's better than disabling interrupts
+	 * unconditionally right away.
+	 *
+	 * As this is most probably the first expiring timer, the cancel is
+	 * expensive as it has to reprogram the hardware, but that's less
+	 * expensive than going through a full hrtimer_interrupt() cycle
+	 * for nothing.
+	 *
+	 * hrtimer_try_to_cancel() is sufficient here as with interrupts
+	 * disabled the timer callback cannot be running and the timer base
+	 * is well determined as the timer is pinned on the local CPU.
+	 */
+	if (st->cookie == current && hrtimer_active(&st->timer)) {
+		scoped_guard(irq)
+			hrtimer_try_to_cancel(&st->timer);
+	}
+}
+
 static inline void rseq_slice_set_need_resched(struct task_struct *curr)
 {
 	/*
@@ -559,10 +635,11 @@ void rseq_syscall_enter_work(long syscal
 	rseq_stat_inc(rseq_stats.s_yielded);
 
 	/*
-	 * Required to make set_tsk_need_resched() correct on PREEMPT[RT]
-	 * kernels.
+	 * Required to stabilize the per CPU timer pointer and to make
+	 * set_tsk_need_resched() correct on PREEMPT[RT] kernels.
 	 */
 	scoped_guard(preempt) {
+		rseq_cancel_slice_extension_timer();
 		/*
 		 * Now that preemption is disabled, quickly check whether
 		 * the task was already rescheduled before arriving here.
@@ -654,6 +731,31 @@ SYSCALL_DEFINE0(rseq_slice_yield)
 	return 0;
 }
 
+#ifdef CONFIG_SYSCTL
+static const unsigned int rseq_slice_ext_nsecs_min = 10 * NSEC_PER_USEC;
+static const unsigned int rseq_slice_ext_nsecs_max = 50 * NSEC_PER_USEC;
+
+static const struct ctl_table rseq_slice_ext_sysctl[] = {
+	{
+		.procname	= "rseq_slice_extension_nsec",
+		.data		= &rseq_slice_ext_nsecs,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= proc_douintvec_minmax,
+		.extra1		= (unsigned int *)&rseq_slice_ext_nsecs_min,
+		.extra2		= (unsigned int *)&rseq_slice_ext_nsecs_max,
+	},
+};
+
+static void rseq_slice_sysctl_init(void)
+{
+	if (rseq_slice_extension_enabled())
+		register_sysctl_init("kernel", rseq_slice_ext_sysctl);
+}
+#else /* CONFIG_SYSCTL */
+static inline void rseq_slice_sysctl_init(void) { }
+#endif  /* !CONFIG_SYSCTL */
+
 static int __init rseq_slice_cmdline(char *str)
 {
 	bool on;
@@ -666,4 +768,17 @@ static int __init rseq_slice_cmdline(cha
 	return 1;
 }
 __setup("rseq_slice_ext=", rseq_slice_cmdline);
+
+static int __init rseq_slice_init(void)
+{
+	unsigned int cpu;
+
+	for_each_possible_cpu(cpu) {
+		hrtimer_setup(per_cpu_ptr(&slice_timer.timer, cpu), rseq_slice_expired,
+			      CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED_HARD);
+	}
+	rseq_slice_sysctl_init();
+	return 0;
+}
+device_initcall(rseq_slice_init);
 #endif /* CONFIG_RSEQ_SLICE_EXTENSION */
Re: [patch V2 08/12] rseq: Implement time slice extension enforcement timer
Posted by Sebastian Andrzej Siewior 3 months, 2 weeks ago
On 2025-10-22 14:57:38 [+0200], Thomas Gleixner wrote:
> --- a/kernel/rseq.c
> +++ b/kernel/rseq.c
> @@ -500,8 +502,82 @@ SYSCALL_DEFINE4(rseq, struct rseq __user
>  }
>  
>  #ifdef CONFIG_RSEQ_SLICE_EXTENSION
> +struct slice_timer {
> +	struct hrtimer	timer;
> +	void		*cookie;
> +};
> +
> +unsigned int rseq_slice_ext_nsecs __read_mostly = 30 * NSEC_PER_USEC;
> +static DEFINE_PER_CPU(struct slice_timer, slice_timer);
>  DEFINE_STATIC_KEY_TRUE(rseq_slice_extension_key);
>  
> +static enum hrtimer_restart rseq_slice_expired(struct hrtimer *tmr)
> +{
> +	struct slice_timer *st = container_of(tmr, struct slice_timer, timer);
> +
> +	if (st->cookie == current && current->rseq.slice.state.granted) {
> +		rseq_stat_inc(rseq_stats.s_expired);
> +		set_need_resched_current();
> +	}

You arm the timer while leaving to userland. Once in userland the task
can be migrated to another CPU. Once migrated, this CPU can host another
task while the timer fires and does nothing.

> +	return HRTIMER_NORESTART;
> +}
> +
…
> +static void rseq_cancel_slice_extension_timer(void)
> +{
> +	struct slice_timer *st = this_cpu_ptr(&slice_timer);
> +
> +	/*
> +	 * st->cookie can be safely read as preemption is disabled and the
> +	 * timer is CPU local. The active check can obviously race with the
> +	 * hrtimer interrupt, but that's better than disabling interrupts
> +	 * unconditionally right away.
> +	 *
> +	 * As this is most probably the first expiring timer, the cancel is
> +	 * expensive as it has to reprogram the hardware, but that's less
> +	 * expensive than going through a full hrtimer_interrupt() cycle
> +	 * for nothing.
> +	 *
> +	 * hrtimer_try_to_cancel() is sufficient here as with interrupts
> +	 * disabled the timer callback cannot be running and the timer base
> +	 * is well determined as the timer is pinned on the local CPU.
> +	 */
> +	if (st->cookie == current && hrtimer_active(&st->timer)) {
> +		scoped_guard(irq)
> +			hrtimer_try_to_cancel(&st->timer);

I don't see why hrtimer_active() and IRQ-disable is a benefit here.
Unless you want to avoid a branch to hrtimer_try_to_cancel().

The function has its own hrtimer_active() check and disables interrupts
while accessing the hrtimer_base lock. Since preemption is disabled,
st->cookie remains stable.
It can fire right after the hrtimer_active() here. You could just

	if (st->cookie == current)
		hrtimer_try_to_cancel(&st->timer);

at the expense of a branch to hrtimer_try_to_cancel() if the timer
already expired (no interrupts off/on).

> +}
> +
>  static inline void rseq_slice_set_need_resched(struct task_struct *curr)
>  {
>  	/*
> @@ -654,6 +731,31 @@ SYSCALL_DEFINE0(rseq_slice_yield)
>  	return 0;
>  }
>  
> +#ifdef CONFIG_SYSCTL
> +static const unsigned int rseq_slice_ext_nsecs_min = 10 * NSEC_PER_USEC;
> +static const unsigned int rseq_slice_ext_nsecs_max = 50 * NSEC_PER_USEC;
> +
> +static const struct ctl_table rseq_slice_ext_sysctl[] = {
> +	{
> +		.procname	= "rseq_slice_extension_nsec",
> +		.data		= &rseq_slice_ext_nsecs,
> +		.maxlen		= sizeof(unsigned int),
> +		.mode		= 0644,
> +		.proc_handler	= proc_douintvec_minmax,
> +		.extra1		= (unsigned int *)&rseq_slice_ext_nsecs_min,
> +		.extra2		= (unsigned int *)&rseq_slice_ext_nsecs_max,
…

maybe +

diff --git a/Documentation/admin-guide/sysctl/kernel.rst b/Documentation/admin-guide/sysctl/kernel.rst
index f3ee807b5d8b3..ed34d21ed94e4 100644
--- a/Documentation/admin-guide/sysctl/kernel.rst
+++ b/Documentation/admin-guide/sysctl/kernel.rst
@@ -1228,6 +1228,12 @@ reboot-cmd (SPARC only)
 ROM/Flash boot loader. Maybe to tell it what to do after
 rebooting. ???
 
+rseq_slice_extension_nsec
+=========================
+
+A task may ask to delay its scheduling if it is in a critical section via the
+prctl(PR_RSEQ_SLICE_EXTENSION_SET) mechanism. This sets the maximum allowed
+extension in nanoseconds before a mandatory scheduling of the task is forced.
 
 sched_energy_aware
 ==================


Sebastian
Re: [patch V2 08/12] rseq: Implement time slice extension enforcement timer
Posted by Thomas Gleixner 3 months, 2 weeks ago
On Mon, Oct 27 2025 at 12:38, Sebastian Andrzej Siewior wrote:
> On 2025-10-22 14:57:38 [+0200], Thomas Gleixner wrote:
>> +static enum hrtimer_restart rseq_slice_expired(struct hrtimer *tmr)
>> +{
>> +	struct slice_timer *st = container_of(tmr, struct slice_timer, timer);
>> +
>> +	if (st->cookie == current && current->rseq.slice.state.granted) {
>> +		rseq_stat_inc(rseq_stats.s_expired);
>> +		set_need_resched_current();
>> +	}
>
> You arm the timer while leaving to userland. Once in userland the task
> can be migrated to another CPU. Once migrated, this CPU can host another
> task while the timer fires and does nothing.

That's inevitable. If the scheduler decides to do that then there is
nothing which can be done about it and that's why the cookie pointer
exists.

>> +	return HRTIMER_NORESTART;
>> +}
>> +
> …
>> +static void rseq_cancel_slice_extension_timer(void)
>> +{
>> +	struct slice_timer *st = this_cpu_ptr(&slice_timer);
>> +
>> +	/*
>> +	 * st->cookie can be safely read as preemption is disabled and the
>> +	 * timer is CPU local. The active check can obviously race with the
>> +	 * hrtimer interrupt, but that's better than disabling interrupts
>> +	 * unconditionally right away.
>> +	 *
>> +	 * As this is most probably the first expiring timer, the cancel is
>> +	 * expensive as it has to reprogram the hardware, but that's less
>> +	 * expensive than going through a full hrtimer_interrupt() cycle
>> +	 * for nothing.
>> +	 *
>> +	 * hrtimer_try_to_cancel() is sufficient here as with interrupts
>> +	 * disabled the timer callback cannot be running and the timer base
>> +	 * is well determined as the timer is pinned on the local CPU.
>> +	 */
>> +	if (st->cookie == current && hrtimer_active(&st->timer)) {
>> +		scoped_guard(irq)
>> +			hrtimer_try_to_cancel(&st->timer);
>
> I don't see why hrtimer_active() and IRQ-disable is a benefit here.
> Unless you want to avoid a branch to hrtimer_try_to_cancel().
>
> The function has its own hrtimer_active() check and disables interrupts
> while accessing the hrtimer_base lock. Since preemption is disabled,
> st->cookie remains stable.
> It can fire right after the hrtimer_active() here. You could just
>
> 	if (st->cookie == current)
> 		hrtimer_try_to_cancel(&st->timer);
>
> at the expense of a branch to hrtimer_try_to_cancel() if the timer
> already expired (no interrupts off/on).

That's not equivalent. As this is CPU local the interrupt disable
ensures that the timer is not running on this CPU. Otherwise you need
hrtimer_cancel(). Read the comment. :)

If it fired already, then the task is reaching this code too
late. Nothing to see there.

>> +		.extra1		= (unsigned int *)&rseq_slice_ext_nsecs_min,
>> +		.extra2		= (unsigned int *)&rseq_slice_ext_nsecs_max,
> …
>
> maybe +
>
> diff --git a/Documentation/admin-guide/sysctl/kernel.rst b/Documentation/admin-guide/sysctl/kernel.rst
> index f3ee807b5d8b3..ed34d21ed94e4 100644
> --- a/Documentation/admin-guide/sysctl/kernel.rst
> +++ b/Documentation/admin-guide/sysctl/kernel.rst
> @@ -1228,6 +1228,12 @@ reboot-cmd (SPARC only)
>  ROM/Flash boot loader. Maybe to tell it what to do after
>  rebooting. ???
>  
> +rseq_slice_extension_nsec
> +=========================
> +
> +A task may ask to delay its scheduling if it is in a critical section via the
> +prctl(PR_RSEQ_SLICE_EXTENSION_SET) mechanism. This sets the maximum allowed
> +extension in nanoseconds before a mandatory scheduling of the task is forced.

Yes. Forgot about it as I already documented it in the time slice
extension docs. Let me add that.

Thanks,

        tglx
Re: [patch V2 08/12] rseq: Implement time slice extension enforcement timer
Posted by Sebastian Andrzej Siewior 3 months, 1 week ago
On 2025-10-27 17:26:29 [+0100], Thomas Gleixner wrote:
> On Mon, Oct 27 2025 at 12:38, Sebastian Andrzej Siewior wrote:
> > On 2025-10-22 14:57:38 [+0200], Thomas Gleixner wrote:
> >> +static enum hrtimer_restart rseq_slice_expired(struct hrtimer *tmr)
> >> +{
> >> +	struct slice_timer *st = container_of(tmr, struct slice_timer, timer);
> >> +
> >> +	if (st->cookie == current && current->rseq.slice.state.granted) {
> >> +		rseq_stat_inc(rseq_stats.s_expired);
> >> +		set_need_resched_current();
> >> +	}
> >
> > You arm the timer while leaving to userland. Once in userland the task
> > can be migrated to another CPU. Once migrated, this CPU can host another
> > task while the timer fires and does nothing.
> 
> That's inevitable. If the scheduler decides to do that then there is
> nothing which can be done about it and that's why the cookie pointer
> exists.

Without an interrupt on the target CPU, there is nothing stopping the
task from overstepping its fair share.

> >> +	return HRTIMER_NORESTART;
> >> +}
> >> +
> > …
> >> +static void rseq_cancel_slice_extension_timer(void)
> >> +{
> >> +	struct slice_timer *st = this_cpu_ptr(&slice_timer);
> >> +
> >> +	/*
> >> +	 * st->cookie can be safely read as preemption is disabled and the
> >> +	 * timer is CPU local. The active check can obviously race with the
> >> +	 * hrtimer interrupt, but that's better than disabling interrupts
> >> +	 * unconditionally right away.
> >> +	 *
> >> +	 * As this is most probably the first expiring timer, the cancel is
> >> +	 * expensive as it has to reprogram the hardware, but that's less
> >> +	 * expensive than going through a full hrtimer_interrupt() cycle
> >> +	 * for nothing.
> >> +	 *
> >> +	 * hrtimer_try_to_cancel() is sufficient here as with interrupts
> >> +	 * disabled the timer callback cannot be running and the timer base
> >> +	 * is well determined as the timer is pinned on the local CPU.
> >> +	 */
> >> +	if (st->cookie == current && hrtimer_active(&st->timer)) {
> >> +		scoped_guard(irq)
> >> +			hrtimer_try_to_cancel(&st->timer);
> >
> > I don't see why hrtimer_active() and IRQ-disable is a benefit here.
> > Unless you want to avoid a branch to hrtimer_try_to_cancel().
> >
> > The function has its own hrtimer_active() check and disables interrupts
> > while accessing the hrtimer_base lock. Since preemption is disabled,
> > st->cookie remains stable.
> > It can fire right after the hrtimer_active() here. You could just
> >
> > 	if (st->cookie == current)
> > 		hrtimer_try_to_cancel(&st->timer);
> >
> > at the expense of a branch to hrtimer_try_to_cancel() if the timer
> > already expired (no interrupts off/on).
> 
> That's not equivalent. As this is CPU local the interrupt disable
> ensures that the timer is not running on this CPU. Otherwise you need
> hrtimer_cancel(). Read the comment. :)

Since it is a CPU local timer which is HRTIMER_MODE_HARD, from this CPUs
perspective it is either about to run or it did run. Therefore the
hrtimer_try_to_cancel() can't return -1 due to
hrtimer_callback_running() == true.
If you drop hrtimer_active() check and scoped_guard(irq),
hrtimer_try_to_cancel() will do the same hrtimer_active() check as you
have above followed by disable interrupts via lock_hrtimer_base() and
here hrtimer_callback_running() can't return true because interrupts are
disabled and the timer can't run on a remote CPU because it is a
CPU-local timer.

So you avoid a branch to hrtimer_try_to_cancel() if the timer already
fired.

> Thanks,
> 
>         tglx

Sebastian
Re: [patch V2 08/12] rseq: Implement time slice extension enforcement timer
Posted by Thomas Gleixner 3 months, 1 week ago
On Tue, Oct 28 2025 at 09:33, Sebastian Andrzej Siewior wrote:
> On 2025-10-27 17:26:29 [+0100], Thomas Gleixner wrote:
>> On Mon, Oct 27 2025 at 12:38, Sebastian Andrzej Siewior wrote:
>> > On 2025-10-22 14:57:38 [+0200], Thomas Gleixner wrote:
>> >> +static enum hrtimer_restart rseq_slice_expired(struct hrtimer *tmr)
>> >> +{
>> >> +	struct slice_timer *st = container_of(tmr, struct slice_timer, timer);
>> >> +
>> >> +	if (st->cookie == current && current->rseq.slice.state.granted) {
>> >> +		rseq_stat_inc(rseq_stats.s_expired);
>> >> +		set_need_resched_current();
>> >> +	}
>> >
>> > You arm the timer while leaving to userland. Once in userland the task
>> > can be migrated to another CPU. Once migrated, this CPU can host another
>> > task while the timer fires and does nothing.
>> 
>> That's inevitable. If the scheduler decides to do that then there is
>> nothing which can be done about it and that's why the cookie pointer
>> exists.
>
> Without an interrupt on the target CPU, there is nothing stopping the
> task from overstepping its fair share.

If a task gets migrated then it can't overstep the share because the
migration is bringing it back into the kernel, schedules it out and
schedules it in on the new CPU. So the whole accounting start over
freshly. That's the same as if the task gets the extension granted, goes
to user space and gets interrupted again. If that interrupt sets
NEED_RESCHED the grant is "revoked" and the timer fires for nothing.

> Since it is a CPU local timer which is HRTIMER_MODE_HARD, from this CPUs
> perspective it is either about to run or it did run. Therefore the
> hrtimer_try_to_cancel() can't return -1 due to
> hrtimer_callback_running() == true.
> If you drop hrtimer_active() check and scoped_guard(irq),
> hrtimer_try_to_cancel() will do the same hrtimer_active() check as you
> have above followed by disable interrupts via lock_hrtimer_base() and
> here hrtimer_callback_running() can't return true because interrupts are
> disabled and the timer can't run on a remote CPU because it is a
> CPU-local timer.
>
> So you avoid a branch to hrtimer_try_to_cancel() if the timer already
> fired.

Yes you are right. Seems I've suffered from brain congestion. Let me
remove it.

Thanks,

        tglx
Re: [patch V2 08/12] rseq: Implement time slice extension enforcement timer
Posted by K Prateek Nayak 3 months, 1 week ago
Hello Sebastian,

On 10/28/2025 2:03 PM, Sebastian Andrzej Siewior wrote:
> On 2025-10-27 17:26:29 [+0100], Thomas Gleixner wrote:
>> On Mon, Oct 27 2025 at 12:38, Sebastian Andrzej Siewior wrote:
>>> On 2025-10-22 14:57:38 [+0200], Thomas Gleixner wrote:
>>>> +static enum hrtimer_restart rseq_slice_expired(struct hrtimer *tmr)
>>>> +{
>>>> +	struct slice_timer *st = container_of(tmr, struct slice_timer, timer);
>>>> +
>>>> +	if (st->cookie == current && current->rseq.slice.state.granted) {
>>>> +		rseq_stat_inc(rseq_stats.s_expired);
>>>> +		set_need_resched_current();
>>>> +	}
>>>
>>> You arm the timer while leaving to userland. Once in userland the task
>>> can be migrated to another CPU. Once migrated, this CPU can host another
>>> task while the timer fires and does nothing.
>>
>> That's inevitable. If the scheduler decides to do that then there is
>> nothing which can be done about it and that's why the cookie pointer
>> exists.
> 
> Without an interrupt on the target CPU, there is nothing stopping the
> task from overstepping its fair share.

When the task moves CPU, the rseq_exit_user_update() would clear all
of the slice extension state before running the task again. The task
will start off again with "rseq->slice_ctrl.request" and
"rseq->slice_ctrl.granted" both at 0 signifying the task was
rescheduled.

As for overstepping the limits on the previous CPU, the EEVDF
algorithm (using the task's "vlag" - the vruntime deviation from the
"avg_vruntime") would penalize it accordingly when enqueued.

The previous CPU would just get a spurious interrupt and since the
timer cookie doesn't match with "current", the handler does
nothing and goes away.

-- 
Thanks and Regards,
Prateek
Re: [patch V2 08/12] rseq: Implement time slice extension enforcement timer
Posted by Sebastian Andrzej Siewior 3 months, 1 week ago
On 2025-10-28 14:21:24 [+0530], K Prateek Nayak wrote:
> Hello Sebastian,
> 
> On 10/28/2025 2:03 PM, Sebastian Andrzej Siewior wrote:
> > On 2025-10-27 17:26:29 [+0100], Thomas Gleixner wrote:
> >> On Mon, Oct 27 2025 at 12:38, Sebastian Andrzej Siewior wrote:
> >>> On 2025-10-22 14:57:38 [+0200], Thomas Gleixner wrote:
> >>>> +static enum hrtimer_restart rseq_slice_expired(struct hrtimer *tmr)
> >>>> +{
> >>>> +	struct slice_timer *st = container_of(tmr, struct slice_timer, timer);
> >>>> +
> >>>> +	if (st->cookie == current && current->rseq.slice.state.granted) {
> >>>> +		rseq_stat_inc(rseq_stats.s_expired);
> >>>> +		set_need_resched_current();
> >>>> +	}
> >>>
> >>> You arm the timer while leaving to userland. Once in userland the task
> >>> can be migrated to another CPU. Once migrated, this CPU can host another
> >>> task while the timer fires and does nothing.
> >>
> >> That's inevitable. If the scheduler decides to do that then there is
> >> nothing which can be done about it and that's why the cookie pointer
> >> exists.
> > 
> > Without an interrupt on the target CPU, there is nothing stopping the
> > task from overstepping its fair share.
> 
> When the task moves CPU, the rseq_exit_user_update() would clear all
> of the slice extension state before running the task again. The task
> will start off again with "rseq->slice_ctrl.request" and
> "rseq->slice_ctrl.granted" both at 0 signifying the task was
> rescheduled.

I wasn't aware this is done once the task is in userland and then
relocated to another CPU.

> As for overstepping the limits on the previous CPU, the EEVDF
> algorithm (using the task's "vlag" - the vruntime deviation from the
> "avg_vruntime") would penalize it accordingly when enqueued.

So it wouldn't be the initial delay which is enforced by the timer, but
the regular scheduler that would put an end to it. Somehow forgot that
we still have a scheduler…

> The previous CPU would just get a spurious interrupt and since the
> timer cookie doesn't match with "current", the handler does
> nothing and goes away.

Yeah, that is fine.

Sebastian
Re: [patch V2 08/12] rseq: Implement time slice extension enforcement timer
Posted by K Prateek Nayak 3 months, 1 week ago
On 10/28/2025 2:30 PM, Sebastian Andrzej Siewior wrote:
>>> Without an interrupt on the target CPU, there is nothing stopping the
>>> task from overstepping its fair share.
>>
>> When the task moves CPU, the rseq_exit_user_update() would clear all
>> of the slice extension state before running the task again. The task
>> will start off again with "rseq->slice_ctrl.request" and
>> "rseq->slice_ctrl.granted" both at 0 signifying the task was
>> rescheduled.
> 
> I wasn't aware this is done once the task is in userland and then
> relocated to another CPU.

The exact path based on my understanding is:

  /* Task migrates to another CPU; Has to resume from kernel. */
  __schedule()
    context_switch()
      rseq_sched_switch_event()
        t->rseq.event.sched_switch = true;
        set_tsk_thread_flag(t, TIF_RSEQ);

    ...
    exit_to_user_mode_loop()
      rseq_exit_to_user_mode_restart()
        __rseq_exit_to_user_mode_restart()
          /* Sees t->rseq.event.sched_switch to be true. */
          rseq_exit_user_update()
            if (rseq_slice_extension_enabled())
              unsafe_put_user(0U, &rseq->slice_ctrl.all, efault); /* Unconditionally clears all of "rseq_ctrl" */

-- 
Thanks and Regards,
Prateek
Re: [patch V2 08/12] rseq: Implement time slice extension enforcement timer
Posted by Sebastian Andrzej Siewior 3 months, 1 week ago
On 2025-10-28 14:52:09 [+0530], K Prateek Nayak wrote:
> On 10/28/2025 2:30 PM, Sebastian Andrzej Siewior wrote:
> >>> Without an interrupt on the target CPU, there is nothing stopping the
> >>> task from overstepping its fair share.
> >>
> >> When the task moves CPU, the rseq_exit_user_update() would clear all
> >> of the slice extension state before running the task again. The task
> >> will start off again with "rseq->slice_ctrl.request" and
> >> "rseq->slice_ctrl.granted" both at 0 signifying the task was
> >> rescheduled.
> > 
> > I wasn't aware this is done once the task is in userland and then
> > relocated to another CPU.
> 
> The exact path based on my understanding is:
> 
>   /* Task migrates to another CPU; Has to resume from kernel. */
>   __schedule()
>     context_switch()
>       rseq_sched_switch_event()
>         t->rseq.event.sched_switch = true;
>         set_tsk_thread_flag(t, TIF_RSEQ);
> 
>     ...
>     exit_to_user_mode_loop()
>       rseq_exit_to_user_mode_restart()
>         __rseq_exit_to_user_mode_restart()
>           /* Sees t->rseq.event.sched_switch to be true. */
>           rseq_exit_user_update()
>             if (rseq_slice_extension_enabled())
>               unsafe_put_user(0U, &rseq->slice_ctrl.all, efault); /* Unconditionally clears all of "rseq_ctrl" */

You are right. The migration thread preempts it on the old CPU and then
it gets scheduled in on the new CPU.

Sebastian