[patch V3 08/12] rseq: Implement time slice extension enforcement timer

Thomas Gleixner posted 12 patches 3 months, 1 week ago
There is a newer version of this series
[patch V3 08/12] rseq: Implement time slice extension enforcement timer
Posted by Thomas Gleixner 3 months, 1 week ago
If a time slice extension is granted and the reschedule delayed, the kernel
has to ensure that user space cannot abuse the extension and exceed the
maximum granted time.

It was suggested to implement this via the existing hrtick() timer in the
scheduler, but that turned out to be problematic for several reasons:

   1) It creates a dependency on CONFIG_SCHED_HRTICK, which can be disabled
      independently of CONFIG_HIGHRES_TIMERS

   2) HRTICK usage in the scheduler can be runtime disabled or is only used
      for certain aspects of scheduling.

   3) The function is calling into the scheduler code and that might have
      unexpected consequences when this is invoked due to a time slice
      enforcement expiry. Especially when the task managed to clear the
      grant via sched_yield(0).

It would be possible to address #2 and #3 by storing state in the
scheduler, but that is extra complexity and fragility for no value.

Implement a dedicated per CPU hrtimer instead, which is solely used for the
purpose of time slice enforcement.

The timer is armed when an extension was granted right before actually
returning to user mode in rseq_exit_to_user_mode_restart().

It is disarmed, when the task relinquishes the CPU. This is expensive as
the timer is probably the first expiring timer on the CPU, which means it
has to reprogram the hardware. But that's less expensive than going through
a full hrtimer interrupt cycle for nothing.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: "Paul E. McKenney" <paulmck@kernel.org>
Cc: Boqun Feng <boqun.feng@gmail.com>
---
V3: Add sysctl documentation, simplify timer cancelation - Sebastian
---
 Documentation/admin-guide/sysctl/kernel.rst |    6 +
 include/linux/rseq_entry.h                  |   38 ++++++---
 include/linux/rseq_types.h                  |    2 
 kernel/rseq.c                               |  115 +++++++++++++++++++++++++++-
 4 files changed, 149 insertions(+), 12 deletions(-)

--- a/Documentation/admin-guide/sysctl/kernel.rst
+++ b/Documentation/admin-guide/sysctl/kernel.rst
@@ -1228,6 +1228,12 @@ reboot-cmd (SPARC only)
 ROM/Flash boot loader. Maybe to tell it what to do after
 rebooting. ???
 
+rseq_slice_extension_nsec
+=========================
+
+A task can request to delay its scheduling if it is in a critical section
+via the prctl(PR_RSEQ_SLICE_EXTENSION_SET) mechanism. This sets the maximum
+allowed extension in nanoseconds before scheduling of the task is enforced.
 
 sched_energy_aware
 ==================
--- a/include/linux/rseq_entry.h
+++ b/include/linux/rseq_entry.h
@@ -86,8 +86,24 @@ static __always_inline bool rseq_slice_e
 {
 	return static_branch_likely(&rseq_slice_extension_key);
 }
+
+extern unsigned int rseq_slice_ext_nsecs;
+bool __rseq_arm_slice_extension_timer(void);
+
+static __always_inline bool rseq_arm_slice_extension_timer(void)
+{
+	if (!rseq_slice_extension_enabled())
+		return false;
+
+	if (likely(!current->rseq.slice.state.granted))
+		return false;
+
+	return __rseq_arm_slice_extension_timer();
+}
+
 #else /* CONFIG_RSEQ_SLICE_EXTENSION */
 static inline bool rseq_slice_extension_enabled(void) { return false; }
+static inline bool rseq_arm_slice_extension_timer(void) { return false; }
 #endif /* !CONFIG_RSEQ_SLICE_EXTENSION */
 
 bool rseq_debug_update_user_cs(struct task_struct *t, struct pt_regs *regs, unsigned long csaddr);
@@ -542,17 +558,19 @@ static __always_inline void clear_tif_rs
 static __always_inline bool
 rseq_exit_to_user_mode_restart(struct pt_regs *regs, unsigned long ti_work)
 {
-	if (likely(!test_tif_rseq(ti_work)))
-		return false;
-
-	if (unlikely(__rseq_exit_to_user_mode_restart(regs))) {
-		current->rseq.event.slowpath = true;
-		set_tsk_thread_flag(current, TIF_NOTIFY_RESUME);
-		return true;
+	if (unlikely(test_tif_rseq(ti_work))) {
+		if (unlikely(__rseq_exit_to_user_mode_restart(regs))) {
+			current->rseq.event.slowpath = true;
+			set_tsk_thread_flag(current, TIF_NOTIFY_RESUME);
+			return true;
+		}
+		clear_tif_rseq();
 	}
-
-	clear_tif_rseq();
-	return false;
+	/*
+	 * Arm the slice extension timer if nothing to do anymore and the
+	 * task really goes out to user space.
+	 */
+	return rseq_arm_slice_extension_timer();
 }
 
 #endif /* CONFIG_GENERIC_ENTRY */
--- a/include/linux/rseq_types.h
+++ b/include/linux/rseq_types.h
@@ -89,9 +89,11 @@ union rseq_slice_state {
 /**
  * struct rseq_slice - Status information for rseq time slice extension
  * @state:	Time slice extension state
+ * @expires:	The time when a grant expires
  */
 struct rseq_slice {
 	union rseq_slice_state	state;
+	u64			expires;
 };
 
 /**
--- a/kernel/rseq.c
+++ b/kernel/rseq.c
@@ -71,6 +71,8 @@
 #define RSEQ_BUILD_SLOW_PATH
 
 #include <linux/debugfs.h>
+#include <linux/hrtimer.h>
+#include <linux/percpu.h>
 #include <linux/prctl.h>
 #include <linux/ratelimit.h>
 #include <linux/rseq_entry.h>
@@ -499,8 +501,78 @@ SYSCALL_DEFINE4(rseq, struct rseq __user
 }
 
 #ifdef CONFIG_RSEQ_SLICE_EXTENSION
+struct slice_timer {
+	struct hrtimer	timer;
+	void		*cookie;
+};
+
+unsigned int rseq_slice_ext_nsecs __read_mostly = 30 * NSEC_PER_USEC;
+static DEFINE_PER_CPU(struct slice_timer, slice_timer);
 DEFINE_STATIC_KEY_TRUE(rseq_slice_extension_key);
 
+static enum hrtimer_restart rseq_slice_expired(struct hrtimer *tmr)
+{
+	struct slice_timer *st = container_of(tmr, struct slice_timer, timer);
+
+	if (st->cookie == current && current->rseq.slice.state.granted) {
+		rseq_stat_inc(rseq_stats.s_expired);
+		set_need_resched_current();
+	}
+	return HRTIMER_NORESTART;
+}
+
+bool __rseq_arm_slice_extension_timer(void)
+{
+	struct slice_timer *st = this_cpu_ptr(&slice_timer);
+	struct task_struct *curr = current;
+
+	lockdep_assert_irqs_disabled();
+
+	/*
+	 * This check prevents that a granted time slice extension exceeds
+	 * the maximum scheduling latency when the grant expired before
+	 * going out to user space. Don't bother to clear the grant here,
+	 * it will be cleaned up automatically before going out to user
+	 * space.
+	 */
+	if ((unlikely(curr->rseq.slice.expires < ktime_get_mono_fast_ns()))) {
+		set_need_resched_current();
+		return true;
+	}
+
+	/*
+	 * Store the task pointer as a cookie for comparison in the timer
+	 * function. This is safe as the timer is CPU local and cannot be
+	 * in the expiry function at this point.
+	 */
+	st->cookie = curr;
+	hrtimer_start(&st->timer, curr->rseq.slice.expires, HRTIMER_MODE_ABS_PINNED_HARD);
+	/* Arm the syscall entry work */
+	set_task_syscall_work(curr, SYSCALL_RSEQ_SLICE);
+	return false;
+}
+
+static void rseq_cancel_slice_extension_timer(void)
+{
+	struct slice_timer *st = this_cpu_ptr(&slice_timer);
+
+	/*
+	 * st->cookie can be safely read as preemption is disabled and the
+	 * timer is CPU local.
+	 *
+	 * As this is most probably the first expiring timer, the cancel is
+	 * expensive as it has to reprogram the hardware, but that's less
+	 * expensive than going through a full hrtimer_interrupt() cycle
+	 * for nothing.
+	 *
+	 * hrtimer_try_to_cancel() is sufficient here as the timer is CPU
+	 * local and once the hrtimer code disabled interrupts the timer
+	 * callback cannot be running.
+	 */
+	if (st->cookie == current)
+		hrtimer_try_to_cancel(&st->timer);
+}
+
 static inline void rseq_slice_set_need_resched(struct task_struct *curr)
 {
 	/*
@@ -558,10 +630,11 @@ void rseq_syscall_enter_work(long syscal
 	rseq_stat_inc(rseq_stats.s_yielded);
 
 	/*
-	 * Required to make set_tsk_need_resched() correct on PREEMPT[RT]
-	 * kernels.
+	 * Required to stabilize the per CPU timer pointer and to make
+	 * set_tsk_need_resched() correct on PREEMPT[RT] kernels.
 	 */
 	scoped_guard(preempt) {
+		rseq_cancel_slice_extension_timer();
 		/*
 		 * Now that preemption is disabled, quickly check whether
 		 * the task was already rescheduled before arriving here.
@@ -652,6 +725,31 @@ SYSCALL_DEFINE0(rseq_slice_yield)
 	return 0;
 }
 
+#ifdef CONFIG_SYSCTL
+static const unsigned int rseq_slice_ext_nsecs_min = 10 * NSEC_PER_USEC;
+static const unsigned int rseq_slice_ext_nsecs_max = 50 * NSEC_PER_USEC;
+
+static const struct ctl_table rseq_slice_ext_sysctl[] = {
+	{
+		.procname	= "rseq_slice_extension_nsec",
+		.data		= &rseq_slice_ext_nsecs,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= proc_douintvec_minmax,
+		.extra1		= (unsigned int *)&rseq_slice_ext_nsecs_min,
+		.extra2		= (unsigned int *)&rseq_slice_ext_nsecs_max,
+	},
+};
+
+static void rseq_slice_sysctl_init(void)
+{
+	if (rseq_slice_extension_enabled())
+		register_sysctl_init("kernel", rseq_slice_ext_sysctl);
+}
+#else /* CONFIG_SYSCTL */
+static inline void rseq_slice_sysctl_init(void) { }
+#endif  /* !CONFIG_SYSCTL */
+
 static int __init rseq_slice_cmdline(char *str)
 {
 	bool on;
@@ -664,4 +762,17 @@ static int __init rseq_slice_cmdline(cha
 	return 1;
 }
 __setup("rseq_slice_ext=", rseq_slice_cmdline);
+
+static int __init rseq_slice_init(void)
+{
+	unsigned int cpu;
+
+	for_each_possible_cpu(cpu) {
+		hrtimer_setup(per_cpu_ptr(&slice_timer.timer, cpu), rseq_slice_expired,
+			      CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED_HARD);
+	}
+	rseq_slice_sysctl_init();
+	return 0;
+}
+device_initcall(rseq_slice_init);
 #endif /* CONFIG_RSEQ_SLICE_EXTENSION */
Re: [patch V3 08/12] rseq: Implement time slice extension enforcement timer
Posted by Mathieu Desnoyers 3 months, 1 week ago
On 2025-10-29 09:22, Thomas Gleixner wrote:
> 
>     3) The function is calling into the scheduler code and that might have
>        unexpected consequences when this is invoked due to a time slice
>        enforcement expiry. Especially when the task managed to clear the
>        grant via sched_yield(0).

Do you mean sys_rseq_slice_yield here ?

> ---
> V3: Add sysctl documentation, simplify timer cancelation - Sebastian

^ cancellation

Other than those nits:

Reviewed-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>

-- 
Mathieu Desnoyers
EfficiOS Inc.
https://www.efficios.com
Re: [patch V3 08/12] rseq: Implement time slice extension enforcement timer
Posted by Steven Rostedt 3 months, 1 week ago
On Wed, 29 Oct 2025 14:22:26 +0100 (CET)
Thomas Gleixner <tglx@linutronix.de> wrote:

> --- a/include/linux/rseq_entry.h
> +++ b/include/linux/rseq_entry.h
> @@ -86,8 +86,24 @@ static __always_inline bool rseq_slice_e
>  {
>  	return static_branch_likely(&rseq_slice_extension_key);
>  }
> +
> +extern unsigned int rseq_slice_ext_nsecs;
> +bool __rseq_arm_slice_extension_timer(void);
> +
> +static __always_inline bool rseq_arm_slice_extension_timer(void)
> +{
> +	if (!rseq_slice_extension_enabled())
> +		return false;
> +
> +	if (likely(!current->rseq.slice.state.granted))
> +		return false;
> +
> +	return __rseq_arm_slice_extension_timer();
> +}
> +
>  #else /* CONFIG_RSEQ_SLICE_EXTENSION */
>  static inline bool rseq_slice_extension_enabled(void) { return false; }
> +static inline bool rseq_arm_slice_extension_timer(void) { return false; }
>  #endif /* !CONFIG_RSEQ_SLICE_EXTENSION */
>  
>  bool rseq_debug_update_user_cs(struct task_struct *t, struct pt_regs *regs, unsigned long csaddr);
> @@ -542,17 +558,19 @@ static __always_inline void clear_tif_rs
>  static __always_inline bool
>  rseq_exit_to_user_mode_restart(struct pt_regs *regs, unsigned long ti_work)
>  {
> -	if (likely(!test_tif_rseq(ti_work)))
> -		return false;
> -
> -	if (unlikely(__rseq_exit_to_user_mode_restart(regs))) {
> -		current->rseq.event.slowpath = true;
> -		set_tsk_thread_flag(current, TIF_NOTIFY_RESUME);
> -		return true;
> +	if (unlikely(test_tif_rseq(ti_work))) {
> +		if (unlikely(__rseq_exit_to_user_mode_restart(regs))) {
> +			current->rseq.event.slowpath = true;
> +			set_tsk_thread_flag(current, TIF_NOTIFY_RESUME);
> +			return true;

Just to make sure I understand this. By setting TIF_NOTIFY_RESUME and
returning true it can still comeback to set the timer?

I guess this also begs the question of if user space can use both the
restartable sequences at the same time as requesting an extended time slice?

> +		}
> +		clear_tif_rseq();
>  	}
> -
> -	clear_tif_rseq();
> -	return false;
> +	/*
> +	 * Arm the slice extension timer if nothing to do anymore and the
> +	 * task really goes out to user space.
> +	 */
> +	return rseq_arm_slice_extension_timer();
>  }
>  
>  #endif /* CONFIG_GENERIC_ENTRY */
> --- a/include/linux/rseq_types.h
> +++ b/include/linux/rseq_types.h
> @@ -89,9 +89,11 @@ union rseq_slice_state {
>  /**
>   * struct rseq_slice - Status information for rseq time slice extension
>   * @state:	Time slice extension state
> + * @expires:	The time when a grant expires
>   */
>  struct rseq_slice {
>  	union rseq_slice_state	state;
> +	u64			expires;
>  };
>  
>  /**
> --- a/kernel/rseq.c
> +++ b/kernel/rseq.c
> @@ -71,6 +71,8 @@
>  #define RSEQ_BUILD_SLOW_PATH
>  
>  #include <linux/debugfs.h>
> +#include <linux/hrtimer.h>
> +#include <linux/percpu.h>
>  #include <linux/prctl.h>
>  #include <linux/ratelimit.h>
>  #include <linux/rseq_entry.h>
> @@ -499,8 +501,78 @@ SYSCALL_DEFINE4(rseq, struct rseq __user
>  }
>  
>  #ifdef CONFIG_RSEQ_SLICE_EXTENSION
> +struct slice_timer {
> +	struct hrtimer	timer;
> +	void		*cookie;
> +};
> +
> +unsigned int rseq_slice_ext_nsecs __read_mostly = 30 * NSEC_PER_USEC;
> +static DEFINE_PER_CPU(struct slice_timer, slice_timer);
>  DEFINE_STATIC_KEY_TRUE(rseq_slice_extension_key);
>  
> +static enum hrtimer_restart rseq_slice_expired(struct hrtimer *tmr)
> +{
> +	struct slice_timer *st = container_of(tmr, struct slice_timer, timer);
> +
> +	if (st->cookie == current && current->rseq.slice.state.granted) {
> +		rseq_stat_inc(rseq_stats.s_expired);
> +		set_need_resched_current();
> +	}
> +	return HRTIMER_NORESTART;
> +}
> +
> +bool __rseq_arm_slice_extension_timer(void)
> +{
> +	struct slice_timer *st = this_cpu_ptr(&slice_timer);
> +	struct task_struct *curr = current;
> +
> +	lockdep_assert_irqs_disabled();
> +
> +	/*
> +	 * This check prevents that a granted time slice extension exceeds

           This check prevents a granted time slice ...

> +	 * the maximum scheduling latency when the grant expired before
> +	 * going out to user space. Don't bother to clear the grant here,
> +	 * it will be cleaned up automatically before going out to user
> +	 * space.
> +	 */
> +	if ((unlikely(curr->rseq.slice.expires < ktime_get_mono_fast_ns()))) {
> +		set_need_resched_current();
> +		return true;
> +	}
> +
> +	/*
> +	 * Store the task pointer as a cookie for comparison in the timer
> +	 * function. This is safe as the timer is CPU local and cannot be
> +	 * in the expiry function at this point.
> +	 */

I'm just curious in this scenario:

  1) Task A requests an extension and is granted.
      st->cookie = Task A
      hrtimer_start();

  2) Before getting back to user space, a RT kernel thread wakes up and
     preempts Task A. Does this clear the timer?

  3) RT kernel thread finishes but then schedules Task B within the expiry.

  4) Task B requests an extension (assuming it had a short time slice that
     allowed it to end before the expiry of the original timer).

I guess it doesn't matter that st->cookie = Task B, as Task A was already
scheduled out. But would calling hrtimer_start() on an existing timer cause
any issue?

I guess it doesn't matter as it looks like the code in hrtimer_start() does
indeed remove an existing timer.

> +	st->cookie = curr;
> +	hrtimer_start(&st->timer, curr->rseq.slice.expires, HRTIMER_MODE_ABS_PINNED_HARD);
> +	/* Arm the syscall entry work */
> +	set_task_syscall_work(curr, SYSCALL_RSEQ_SLICE);
> +	return false;
> +}
> +
> +static void rseq_cancel_slice_extension_timer(void)
> +{
> +	struct slice_timer *st = this_cpu_ptr(&slice_timer);
> +
> +	/*
> +	 * st->cookie can be safely read as preemption is disabled and the
> +	 * timer is CPU local.
> +	 *
> +	 * As this is most probably the first expiring timer, the cancel is

           As this is probably the first ...

> +	 * expensive as it has to reprogram the hardware, but that's less
> +	 * expensive than going through a full hrtimer_interrupt() cycle
> +	 * for nothing.
> +	 *
> +	 * hrtimer_try_to_cancel() is sufficient here as the timer is CPU
> +	 * local and once the hrtimer code disabled interrupts the timer
> +	 * callback cannot be running.
> +	 */
> +	if (st->cookie == current)
> +		hrtimer_try_to_cancel(&st->timer);

If the above scenario did happen, the timer will go off as
st->cookie == current would likely be false?

Hmm, if it does go off and the task did schedule back in, would it get its
need_resched set? This is a very unlikely scenario thus I guess it doesn't
really matter.

I'm just thinking about corner cases and how it could affect this code and
possibly cause noticeable issues.

-- Steve


> +}
> +
>  static inline void rseq_slice_set_need_resched(struct task_struct *curr)
>  {
>  	/*
> @@ -558,10 +630,11 @@ void rseq_syscall_enter_work(long syscal
>  	rseq_stat_inc(rseq_stats.s_yielded);
>  
>  	/*
> -	 * Required to make set_tsk_need_resched() correct on PREEMPT[RT]
> -	 * kernels.
> +	 * Required to stabilize the per CPU timer pointer and to make
> +	 * set_tsk_need_resched() correct on PREEMPT[RT] kernels.
>  	 */
>  	scoped_guard(preempt) {
> +		rseq_cancel_slice_extension_timer();
>  		/*
>  		 * Now that preemption is disabled, quickly check whether
>  		 * the task was already rescheduled before arriving here.
> @@ -652,6 +725,31 @@ SYSCALL_DEFINE0(rseq_slice_yield)
>  	return 0;
>  }
>  
> +#ifdef CONFIG_SYSCTL
> +static const unsigned int rseq_slice_ext_nsecs_min = 10 * NSEC_PER_USEC;
> +static const unsigned int rseq_slice_ext_nsecs_max = 50 * NSEC_PER_USEC;
> +
> +static const struct ctl_table rseq_slice_ext_sysctl[] = {
> +	{
> +		.procname	= "rseq_slice_extension_nsec",
> +		.data		= &rseq_slice_ext_nsecs,
> +		.maxlen		= sizeof(unsigned int),
> +		.mode		= 0644,
> +		.proc_handler	= proc_douintvec_minmax,
> +		.extra1		= (unsigned int *)&rseq_slice_ext_nsecs_min,
> +		.extra2		= (unsigned int *)&rseq_slice_ext_nsecs_max,
> +	},
> +};
> +
> +static void rseq_slice_sysctl_init(void)
> +{
> +	if (rseq_slice_extension_enabled())
> +		register_sysctl_init("kernel", rseq_slice_ext_sysctl);
> +}
> +#else /* CONFIG_SYSCTL */
> +static inline void rseq_slice_sysctl_init(void) { }
> +#endif  /* !CONFIG_SYSCTL */
> +
>  static int __init rseq_slice_cmdline(char *str)
>  {
>  	bool on;
> @@ -664,4 +762,17 @@ static int __init rseq_slice_cmdline(cha
>  	return 1;
>  }
>  __setup("rseq_slice_ext=", rseq_slice_cmdline);
> +
> +static int __init rseq_slice_init(void)
> +{
> +	unsigned int cpu;
> +
> +	for_each_possible_cpu(cpu) {
> +		hrtimer_setup(per_cpu_ptr(&slice_timer.timer, cpu), rseq_slice_expired,
> +			      CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED_HARD);
> +	}
> +	rseq_slice_sysctl_init();
> +	return 0;
> +}
> +device_initcall(rseq_slice_init);
>  #endif /* CONFIG_RSEQ_SLICE_EXTENSION */
Re: [patch V3 08/12] rseq: Implement time slice extension enforcement timer
Posted by Thomas Gleixner 3 months, 1 week ago
On Wed, Oct 29 2025 at 14:45, Steven Rostedt wrote:
> On Wed, 29 Oct 2025 14:22:26 +0100 (CET)
> Thomas Gleixner <tglx@linutronix.de> wrote:
>>  rseq_exit_to_user_mode_restart(struct pt_regs *regs, unsigned long ti_work)
>>  {
>> -	if (likely(!test_tif_rseq(ti_work)))
>> -		return false;
>> -
>> -	if (unlikely(__rseq_exit_to_user_mode_restart(regs))) {
>> -		current->rseq.event.slowpath = true;
>> -		set_tsk_thread_flag(current, TIF_NOTIFY_RESUME);
>> -		return true;
>> +	if (unlikely(test_tif_rseq(ti_work))) {
>> +		if (unlikely(__rseq_exit_to_user_mode_restart(regs))) {
>> +			current->rseq.event.slowpath = true;
>> +			set_tsk_thread_flag(current, TIF_NOTIFY_RESUME);
>> +			return true;
>
> Just to make sure I understand this. By setting TIF_NOTIFY_RESUME and
> returning true it can still comeback to set the timer?

No. NOTIFY_RESUME is only set when the access faults or when the user
space memory is corrupted and the grant is moot in that case.

But if TIF_RSEQ is set then a previously granted extensionn is anyway
revoked because that means:

        granted();
        ---> preemption (evtl. migration): Set's TIF_RSEQ
           schedule()
        rseq_exit_to_user_mode_restart()
           if (TIF_RSEQ is set)
              handle_rseq()
                 revoke_grant()
                 
> I guess this also begs the question of if user space can use both the
> restartable sequences at the same time as requesting an extended time slice?

It can and that actually makes sense.

       enter_cs()
         request_grant()
         set_cs()
         ...

interrupt
        set_need_resched()
        exit_to_user_mode()
           if (need_resched()
              grant_extention() // clears NEED_RESCHED
           ...
        rseq_exit_to_user_mode_restart()
           if (IF_RSEQ is set)  // Branch not taken
              ...
           arm_timer()
        return_to_user()

        leave_cs()
          if (granted)
             sys_rseq_sched_yield()

which means the extension grant prevented the critical section to be
aborted. If the extension is not granted or revoked then this behaves
like a regular RSEQ CS abort.

>> +	 * This check prevents that a granted time slice extension exceeds
>
>	   This check prevents a granted time slice ...
>
>> +	 * the maximum scheduling latency when the grant expired before

I'm not a native speaker, but your suggested edit is bogus. Let me
put it into the full sentence:

	   This check prevents a granted time slice extension exceeds
           the maximum ....

Can you spot the fail?

>> +	/*
>> +	 * Store the task pointer as a cookie for comparison in the timer
>> +	 * function. This is safe as the timer is CPU local and cannot be
>> +	 * in the expiry function at this point.
>> +	 */
>
> I'm just curious in this scenario:
>
>   1) Task A requests an extension and is granted.
>       st->cookie = Task A
>       hrtimer_start();
>
>   2) Before getting back to user space, a RT kernel thread wakes up and
>      preempts Task A. Does this clear the timer?

No.

>   3) RT kernel thread finishes but then schedules Task B within the expiry.
>
>   4) Task B requests an extension (assuming it had a short time slice that
>      allowed it to end before the expiry of the original timer).
>
> I guess it doesn't matter that st->cookie = Task B, as Task A was already
> scheduled out. But would calling hrtimer_start() on an existing timer cause
> any issue?

No. The timer is canceled and reprogrammed.

> I guess it doesn't matter as it looks like the code in hrtimer_start() does
> indeed remove an existing timer.

You guessed right :)

>> +	st->cookie = curr;
>> +	hrtimer_start(&st->timer, curr->rseq.slice.expires, HRTIMER_MODE_ABS_PINNED_HARD);
>> +	/* Arm the syscall entry work */
>> +	set_task_syscall_work(curr, SYSCALL_RSEQ_SLICE);
>> +	return false;
>> +}
>> +
>> +static void rseq_cancel_slice_extension_timer(void)
>> +{
>> +	struct slice_timer *st = this_cpu_ptr(&slice_timer);
>> +
>> +	/*
>> +	 * st->cookie can be safely read as preemption is disabled and the
>> +	 * timer is CPU local.
>> +	 *
>> +	 * As this is most probably the first expiring timer, the cancel is
>
>            As this is probably the first ...
>
>> +	 * expensive as it has to reprogram the hardware, but that's less
>> +	 * expensive than going through a full hrtimer_interrupt() cycle
>> +	 * for nothing.
>> +	 *
>> +	 * hrtimer_try_to_cancel() is sufficient here as the timer is CPU
>> +	 * local and once the hrtimer code disabled interrupts the timer
>> +	 * callback cannot be running.
>> +	 */
>> +	if (st->cookie == current)
>> +		hrtimer_try_to_cancel(&st->timer);
>
> If the above scenario did happen, the timer will go off as
> st->cookie == current would likely be false?
>
> Hmm, if it does go off and the task did schedule back in, would it get its
> need_resched set? This is a very unlikely scenario thus I guess it doesn't
> really matter.

Correct.

> I'm just thinking about corner cases and how it could affect this code and
> possibly cause noticeable issues.

Right. That corner case exists and there is not much to be done about it
unless you inflict the timer cancelation into schedule(), which is not
an option at all.

> -- Steve

/me trims 50+ lines of pointless quotation.

Thanks,

        tglx
Re: [patch V3 08/12] rseq: Implement time slice extension enforcement timer
Posted by Steven Rostedt 3 months, 1 week ago
On Wed, 29 Oct 2025 22:37:17 +0100
Thomas Gleixner <tglx@linutronix.de> wrote:

> >> +	 * This check prevents that a granted time slice extension exceeds  
> >
> >	   This check prevents a granted time slice ...
> >  
> >> +	 * the maximum scheduling latency when the grant expired before  
> 
> I'm not a native speaker, but your suggested edit is bogus. Let me
> put it into the full sentence:
> 
> 	   This check prevents a granted time slice extension exceeds
>            the maximum ....
> 
> Can you spot the fail?

Ah, I should have updated the entire sentence, as the original still sounds
funny to me, but you are correct, that update wasn't enough.

Perhaps:

   This check prevents a granted time slice extension from exceeding the
   maximum scheduling latency when the grant expires before going back out
   to user space.

-- Steve