Checkpoint/Restore in Userspace (CRIU) requires to reconstruct posix timers
with the same timer ID on restore. It uses sys_timer_create() and relies on
the monotonic increasing timer ID provided by this syscall. It creates and
deletes timers until the desired ID is reached. This can loop for a long
time, when the checkpointed process had a very sparse timer ID range.
It has been debated to implement a new syscall to allow the creation of
timers with a given timer ID, but that's tedious due to the 32/64bit compat
issues of sigevent_t and of dubious value.
The restore mechanism of CRIU creates the timers in a state where all
threads of the restored process are held on a barrier and cannot issue
syscalls. That means the restorer task has exclusive control.
This allows to address this issue with a prctl() so that the restorer
thread can do:
if (prctl(PR_TIMER_CREATE_RESTORE_IDS, PR_TIMER_CREATE_RESTORE_IDS_ON))
goto linear_mode;
create_timers_with_explicit_ids();
prctl(PR_TIMER_CREATE_RESTORE_IDS, PR_TIMER_CREATE_RESTORE_IDS_OFF);
This is backwards compatible because the prctl() fails on older kernels and
CRIU can fall back to the linear timer ID mechanism. CRIU versions which do
not know about the prctl() just work as before.
Implement the prctl() and modify timer_create() so that it copies the
requested timer ID from userspace by utilizing the existing timer_t
pointer, which is used to copy out the allocated timer ID on success.
If the prctl() is disabled, which it is by default, timer_create() works as
before and does not try to read from the userspace pointer.
There is no problem when a broken or rogue user space application enables
the prctl(). If the user space pointer does not contain a valid ID, then
timer_create() fails. If the data is not initialized, but contains a
random valid ID, timer_create() will create that random timer ID or fail if
the ID is already given out.
As CRIU must use the raw syscall to avoid manipulating the internal state
of the restored process, this has no library dependencies and can be
adopted by CRIU right away.
Recreating two timers with IDs 1000000 and 2000000 takes 1.5 seconds with
the create/delete method. With the prctl() it takes 3 microseconds.
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
V2: Move the ID counter ahead to avoid collisions after switching back to
normal mode.
---
include/linux/posix-timers.h | 2
include/linux/sched/signal.h | 1
include/uapi/linux/prctl.h | 10 ++++
kernel/sys.c | 5 ++
kernel/time/posix-timers.c | 97 +++++++++++++++++++++++++++++++------------
5 files changed, 89 insertions(+), 26 deletions(-)
--- a/include/linux/posix-timers.h
+++ b/include/linux/posix-timers.h
@@ -114,6 +114,7 @@ bool posixtimer_init_sigqueue(struct sig
void posixtimer_send_sigqueue(struct k_itimer *tmr);
bool posixtimer_deliver_signal(struct kernel_siginfo *info, struct sigqueue *timer_sigq);
void posixtimer_free_timer(struct k_itimer *timer);
+long posixtimer_create_prctl(unsigned long ctrl);
/* Init task static initializer */
#define INIT_CPU_TIMERBASE(b) { \
@@ -140,6 +141,7 @@ static inline void posixtimer_rearm_itim
static inline bool posixtimer_deliver_signal(struct kernel_siginfo *info,
struct sigqueue *timer_sigq) { return false; }
static inline void posixtimer_free_timer(struct k_itimer *timer) { }
+static inline long posixtimer_create_prctl(unsigned long ctrl) { return -EINVAL; }
#endif
#ifdef CONFIG_POSIX_CPU_TIMERS_TASK_WORK
--- a/include/linux/sched/signal.h
+++ b/include/linux/sched/signal.h
@@ -136,6 +136,7 @@ struct signal_struct {
#ifdef CONFIG_POSIX_TIMERS
/* POSIX.1b Interval Timers */
+ unsigned int timer_create_restore_ids:1;
atomic_t next_posix_timer_id;
struct hlist_head posix_timers;
struct hlist_head ignored_posix_timers;
--- a/include/uapi/linux/prctl.h
+++ b/include/uapi/linux/prctl.h
@@ -353,4 +353,14 @@ struct prctl_mm_map {
*/
#define PR_LOCK_SHADOW_STACK_STATUS 76
+/*
+ * Controls the mode of timer_create() for CRIU restore operations.
+ * Enabling this allows CRIU to restore timers with explicit IDs.
+ *
+ * Don't use for normal operations as the result might be undefined.
+ */
+#define PR_TIMER_CREATE_RESTORE_IDS 77
+# define PR_TIMER_CREATE_RESTORE_IDS_OFF 0
+# define PR_TIMER_CREATE_RESTORE_IDS_ON 1
+
#endif /* _LINUX_PRCTL_H */
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -2811,6 +2811,11 @@ SYSCALL_DEFINE5(prctl, int, option, unsi
return -EINVAL;
error = arch_lock_shadow_stack_status(me, arg2);
break;
+ case PR_TIMER_CREATE_RESTORE_IDS:
+ if (arg3 || arg4 || arg5)
+ return -EINVAL;
+ error = posixtimer_create_prctl(arg2);
+ break;
default:
trace_task_prctl_unknown(option, arg2, arg3, arg4, arg5);
error = -EINVAL;
--- a/kernel/time/posix-timers.c
+++ b/kernel/time/posix-timers.c
@@ -19,6 +19,7 @@
#include <linux/nospec.h>
#include <linux/posix-clock.h>
#include <linux/posix-timers.h>
+#include <linux/prctl.h>
#include <linux/sched/task.h>
#include <linux/slab.h>
#include <linux/syscalls.h>
@@ -57,6 +58,8 @@ static const struct k_clock * const posi
static const struct k_clock *clockid_to_kclock(const clockid_t id);
static const struct k_clock clock_realtime, clock_monotonic;
+#define TIMER_ANY_ID INT_MIN
+
/* SIGEV_THREAD_ID cannot share a bit with the other SIGEV values. */
#if SIGEV_THREAD_ID != (SIGEV_THREAD_ID & \
~(SIGEV_SIGNAL | SIGEV_NONE | SIGEV_THREAD))
@@ -128,38 +131,60 @@ static bool posix_timer_hashed(struct ti
return false;
}
-static int posix_timer_add(struct k_itimer *timer)
+static bool posix_timer_add_at(struct k_itimer *timer, struct signal_struct *sig, unsigned int id)
+{
+ struct timer_hash_bucket *bucket = hash_bucket(sig, id);
+
+ scoped_guard (spinlock, &bucket->lock) {
+ /*
+ * Validate under the lock as this could have raced against
+ * another thread ending up with the same ID, which is
+ * highly unlikely, but possible.
+ */
+ if (!posix_timer_hashed(bucket, sig, id)) {
+ /*
+ * Set the timer ID and the signal pointer to make
+ * it identifiable in the hash table. The signal
+ * pointer has bit 0 set to indicate that it is not
+ * yet fully initialized. posix_timer_hashed()
+ * masks this bit out, but the syscall lookup fails
+ * to match due to it being set. This guarantees
+ * that there can't be duplicate timer IDs handed
+ * out.
+ */
+ timer->it_id = (timer_t)id;
+ timer->it_signal = (struct signal_struct *)((unsigned long)sig | 1UL);
+ hlist_add_head_rcu(&timer->t_hash, &bucket->head);
+ return true;
+ }
+ }
+ return false;
+}
+
+static int posix_timer_add(struct k_itimer *timer, int req_id)
{
struct signal_struct *sig = current->signal;
+ if (unlikely(req_id != TIMER_ANY_ID)) {
+ if (!posix_timer_add_at(timer, sig, req_id))
+ return -EBUSY;
+
+ /*
+ * Move the ID counter past the requested ID, so that after
+ * switching back to normal mode the IDs are outside of the
+ * exact allocated region. That avoids ID collisions on the
+ * next regular timer_create() invocations.
+ */
+ atomic_set(&sig->next_posix_timer_id, req_id + 1);
+ return req_id;
+ }
+
for (unsigned int cnt = 0; cnt <= INT_MAX; cnt++) {
/* Get the next timer ID and clamp it to positive space */
unsigned int id = atomic_fetch_inc(&sig->next_posix_timer_id) & INT_MAX;
- struct timer_hash_bucket *bucket = hash_bucket(sig, id);
- scoped_guard (spinlock, &bucket->lock) {
- /*
- * Validate under the lock as this could have raced
- * against another thread ending up with the same
- * ID, which is highly unlikely, but possible.
- */
- if (!posix_timer_hashed(bucket, sig, id)) {
- /*
- * Set the timer ID and the signal pointer to make
- * it identifiable in the hash table. The signal
- * pointer has bit 0 set to indicate that it is not
- * yet fully initialized. posix_timer_hashed()
- * masks this bit out, but the syscall lookup fails
- * to match due to it being set. This guarantees
- * that there can't be duplicate timer IDs handed
- * out.
- */
- timer->it_id = (timer_t)id;
- timer->it_signal = (struct signal_struct *)((unsigned long)sig | 1UL);
- hlist_add_head_rcu(&timer->t_hash, &bucket->head);
- return id;
- }
- }
+ if (posix_timer_add_at(timer, sig, id))
+ return id;
cond_resched();
}
/* POSIX return code when no timer ID could be allocated */
@@ -364,6 +389,16 @@ static enum hrtimer_restart posix_timer_
return HRTIMER_NORESTART;
}
+long posixtimer_create_prctl(unsigned long ctrl)
+{
+ if (ctrl > PR_TIMER_CREATE_RESTORE_IDS_ON)
+ return -EINVAL;
+
+ guard(spinlock_irq)(&current->sighand->siglock);
+ current->signal->timer_create_restore_ids = ctrl == PR_TIMER_CREATE_RESTORE_IDS_ON;
+ return 0;
+}
+
static struct pid *good_sigevent(sigevent_t * event)
{
struct pid *pid = task_tgid(current);
@@ -435,6 +470,7 @@ static int do_timer_create(clockid_t whi
timer_t __user *created_timer_id)
{
const struct k_clock *kc = clockid_to_kclock(which_clock);
+ timer_t req_id = TIMER_ANY_ID;
struct k_itimer *new_timer;
int error, new_timer_id;
@@ -449,11 +485,20 @@ static int do_timer_create(clockid_t whi
spin_lock_init(&new_timer->it_lock);
+ /* Special case for CRIU to restore timers with a given timer ID. */
+ if (unlikely(current->signal->timer_create_restore_ids)) {
+ if (copy_from_user(&req_id, created_timer_id, sizeof(req_id)))
+ return -EFAULT;
+ /* Valid IDs are 0..INT_MAX */
+ if ((unsigned int)req_id > INT_MAX)
+ return -EINVAL;
+ }
+
/*
* Add the timer to the hash table. The timer is not yet valid
* after insertion, but has a unique ID allocated.
*/
- new_timer_id = posix_timer_add(new_timer);
+ new_timer_id = posix_timer_add(new_timer, req_id);
if (new_timer_id < 0) {
posixtimer_free_timer(new_timer);
return new_timer_id;
Le Sat, Mar 08, 2025 at 05:48:47PM +0100, Thomas Gleixner a écrit :
> @@ -364,6 +389,16 @@ static enum hrtimer_restart posix_timer_
> return HRTIMER_NORESTART;
> }
>
> +long posixtimer_create_prctl(unsigned long ctrl)
> +{
> + if (ctrl > PR_TIMER_CREATE_RESTORE_IDS_ON)
> + return -EINVAL;
> +
> + guard(spinlock_irq)(¤t->sighand->siglock);
> + current->signal->timer_create_restore_ids = ctrl == PR_TIMER_CREATE_RESTORE_IDS_ON;
Is the locking necessary here? It's not used on the read side.
It only makes sense if more flags are to be added later in struct signal and the
fields write can race.
Also do we want to carry this PR_TIMER_CREATE_RESTORE_IDS_ON across exec? Posix
timers are removed then anyway.
Thanks.
On Tue, Mar 11, 2025 at 10:35:46PM +0100, Frederic Weisbecker wrote:
> Le Sat, Mar 08, 2025 at 05:48:47PM +0100, Thomas Gleixner a écrit :
> > @@ -364,6 +389,16 @@ static enum hrtimer_restart posix_timer_
> > return HRTIMER_NORESTART;
> > }
> >
> > +long posixtimer_create_prctl(unsigned long ctrl)
> > +{
> > + if (ctrl > PR_TIMER_CREATE_RESTORE_IDS_ON)
> > + return -EINVAL;
> > +
> > + guard(spinlock_irq)(¤t->sighand->siglock);
> > + current->signal->timer_create_restore_ids = ctrl == PR_TIMER_CREATE_RESTORE_IDS_ON;
>
> Is the locking necessary here? It's not used on the read side.
> It only makes sense if more flags are to be added later in struct signal and the
> fields write can race.
Actually this is a very subtle moment. The @timer_create_restore_ids is a bit field and
updating them without a lock has already led to hard-to-catch bugs in the past, especially
when we have close bits members such as is_child_subreaper/has_child_subreaper near it.
I thought of fork(clone_vm) calls in multithreaded application where real_parent may
point into our task which is doing prctl but didn't find any problem so far (though
internal feeling says that this is not hot path call and better would be to keep Thomas'
original lock code :-). Anyway, seems to be safe without it.
Cyrill
On Tue, Mar 11 2025 at 22:35, Frederic Weisbecker wrote:
> Le Sat, Mar 08, 2025 at 05:48:47PM +0100, Thomas Gleixner a écrit :
>> @@ -364,6 +389,16 @@ static enum hrtimer_restart posix_timer_
>> return HRTIMER_NORESTART;
>> }
>>
>> +long posixtimer_create_prctl(unsigned long ctrl)
>> +{
>> + if (ctrl > PR_TIMER_CREATE_RESTORE_IDS_ON)
>> + return -EINVAL;
>> +
>> + guard(spinlock_irq)(¤t->sighand->siglock);
>> + current->signal->timer_create_restore_ids = ctrl == PR_TIMER_CREATE_RESTORE_IDS_ON;
>
> Is the locking necessary here? It's not used on the read side.
> It only makes sense if more flags are to be added later in struct signal and the
> fields write can race.
True.
> Also do we want to carry this PR_TIMER_CREATE_RESTORE_IDS_ON accross exec? Posix
> timers are removed then anyway.
Indeed, we should clear that.
Checkpoint/Restore in Userspace (CRIU) requires to reconstruct posix timers
with the same timer ID on restore. It uses sys_timer_create() and relies on
the monotonic increasing timer ID provided by this syscall. It creates and
deletes timers until the desired ID is reached. This can loop for a long
time, when the checkpointed process had a very sparse timer ID range.
It has been debated to implement a new syscall to allow the creation of
timers with a given timer ID, but that's tedious due to the 32/64bit compat
issues of sigevent_t and of dubious value.
The restore mechanism of CRIU creates the timers in a state where all
threads of the restored process are held on a barrier and cannot issue
syscalls. That means the restorer task has exclusive control.
This allows to address this issue with a prctl() so that the restorer
thread can do:
if (prctl(PR_TIMER_CREATE_RESTORE_IDS, PR_TIMER_CREATE_RESTORE_IDS_ON))
goto linear_mode;
create_timers_with_explicit_ids();
prctl(PR_TIMER_CREATE_RESTORE_IDS, PR_TIMER_CREATE_RESTORE_IDS_OFF);
This is backwards compatible because the prctl() fails on older kernels and
CRIU can fall back to the linear timer ID mechanism. CRIU versions which do
not know about the prctl() just work as before.
Implement the prctl() and modify timer_create() so that it copies the
requested timer ID from userspace by utilizing the existing timer_t
pointer, which is used to copy out the allocated timer ID on success.
If the prctl() is disabled, which it is by default, timer_create() works as
before and does not try to read from the userspace pointer.
There is no problem when a broken or rogue user space application enables
the prctl(). If the user space pointer does not contain a valid ID, then
timer_create() fails. If the data is not initialized, but contains a
random valid ID, timer_create() will create that random timer ID or fail if
the ID is already given out.
As CRIU must use the raw syscall to avoid manipulating the internal state
of the restored process, this has no library dependencies and can be
adopted by CRIU right away.
Recreating two timers with IDs 1000000 and 2000000 takes 1.5 seconds with
the create/delete method. With the prctl() it takes 3 microseconds.
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
V3a: Remove the locking in the prctl() and clear restore mode on exec()
- Frederic
V2: Move the ID counter ahead to avoid collisions after switching back to
normal mode.
---
include/linux/posix-timers.h | 2
include/linux/sched/signal.h | 1
include/uapi/linux/prctl.h | 10 ++++
kernel/sys.c | 5 ++
kernel/time/posix-timers.c | 99 +++++++++++++++++++++++++++++++------------
5 files changed, 91 insertions(+), 26 deletions(-)
--- a/include/linux/posix-timers.h
+++ b/include/linux/posix-timers.h
@@ -114,6 +114,7 @@ bool posixtimer_init_sigqueue(struct sig
void posixtimer_send_sigqueue(struct k_itimer *tmr);
bool posixtimer_deliver_signal(struct kernel_siginfo *info, struct sigqueue *timer_sigq);
void posixtimer_free_timer(struct k_itimer *timer);
+long posixtimer_create_prctl(unsigned long ctrl);
/* Init task static initializer */
#define INIT_CPU_TIMERBASE(b) { \
@@ -140,6 +141,7 @@ static inline void posixtimer_rearm_itim
static inline bool posixtimer_deliver_signal(struct kernel_siginfo *info,
struct sigqueue *timer_sigq) { return false; }
static inline void posixtimer_free_timer(struct k_itimer *timer) { }
+static inline long posixtimer_create_prctl(unsigned long ctrl) { return -EINVAL; }
#endif
#ifdef CONFIG_POSIX_CPU_TIMERS_TASK_WORK
--- a/include/linux/sched/signal.h
+++ b/include/linux/sched/signal.h
@@ -136,6 +136,7 @@ struct signal_struct {
#ifdef CONFIG_POSIX_TIMERS
/* POSIX.1b Interval Timers */
+ unsigned int timer_create_restore_ids:1;
atomic_t next_posix_timer_id;
struct hlist_head posix_timers;
struct hlist_head ignored_posix_timers;
--- a/include/uapi/linux/prctl.h
+++ b/include/uapi/linux/prctl.h
@@ -353,4 +353,14 @@ struct prctl_mm_map {
*/
#define PR_LOCK_SHADOW_STACK_STATUS 76
+/*
+ * Controls the mode of timer_create() for CRIU restore operations.
+ * Enabling this allows CRIU to restore timers with explicit IDs.
+ *
+ * Don't use for normal operations as the result might be undefined.
+ */
+#define PR_TIMER_CREATE_RESTORE_IDS 77
+# define PR_TIMER_CREATE_RESTORE_IDS_OFF 0
+# define PR_TIMER_CREATE_RESTORE_IDS_ON 1
+
#endif /* _LINUX_PRCTL_H */
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -2811,6 +2811,11 @@ SYSCALL_DEFINE5(prctl, int, option, unsi
return -EINVAL;
error = arch_lock_shadow_stack_status(me, arg2);
break;
+ case PR_TIMER_CREATE_RESTORE_IDS:
+ if (arg3 || arg4 || arg5)
+ return -EINVAL;
+ error = posixtimer_create_prctl(arg2);
+ break;
default:
trace_task_prctl_unknown(option, arg2, arg3, arg4, arg5);
error = -EINVAL;
--- a/kernel/time/posix-timers.c
+++ b/kernel/time/posix-timers.c
@@ -19,6 +19,7 @@
#include <linux/nospec.h>
#include <linux/posix-clock.h>
#include <linux/posix-timers.h>
+#include <linux/prctl.h>
#include <linux/sched/task.h>
#include <linux/slab.h>
#include <linux/syscalls.h>
@@ -57,6 +58,8 @@ static const struct k_clock * const posi
static const struct k_clock *clockid_to_kclock(const clockid_t id);
static const struct k_clock clock_realtime, clock_monotonic;
+#define TIMER_ANY_ID INT_MIN
+
/* SIGEV_THREAD_ID cannot share a bit with the other SIGEV values. */
#if SIGEV_THREAD_ID != (SIGEV_THREAD_ID & \
~(SIGEV_SIGNAL | SIGEV_NONE | SIGEV_THREAD))
@@ -128,38 +131,60 @@ static bool posix_timer_hashed(struct ti
return false;
}
-static int posix_timer_add(struct k_itimer *timer)
+static bool posix_timer_add_at(struct k_itimer *timer, struct signal_struct *sig, unsigned int id)
+{
+ struct timer_hash_bucket *bucket = hash_bucket(sig, id);
+
+ scoped_guard (spinlock, &bucket->lock) {
+ /*
+ * Validate under the lock as this could have raced against
+ * another thread ending up with the same ID, which is
+ * highly unlikely, but possible.
+ */
+ if (!posix_timer_hashed(bucket, sig, id)) {
+ /*
+ * Set the timer ID and the signal pointer to make
+ * it identifiable in the hash table. The signal
+ * pointer has bit 0 set to indicate that it is not
+ * yet fully initialized. posix_timer_hashed()
+ * masks this bit out, but the syscall lookup fails
+ * to match due to it being set. This guarantees
+ * that there can't be duplicate timer IDs handed
+ * out.
+ */
+ timer->it_id = (timer_t)id;
+ timer->it_signal = (struct signal_struct *)((unsigned long)sig | 1UL);
+ hlist_add_head_rcu(&timer->t_hash, &bucket->head);
+ return true;
+ }
+ }
+ return false;
+}
+
+static int posix_timer_add(struct k_itimer *timer, int req_id)
{
struct signal_struct *sig = current->signal;
+ if (unlikely(req_id != TIMER_ANY_ID)) {
+ if (!posix_timer_add_at(timer, sig, req_id))
+ return -EBUSY;
+
+ /*
+ * Move the ID counter past the requested ID, so that after
+ * switching back to normal mode the IDs are outside of the
+ * exact allocated region. That avoids ID collisions on the
+ * next regular timer_create() invocations.
+ */
+ atomic_set(&sig->next_posix_timer_id, req_id + 1);
+ return req_id;
+ }
+
for (unsigned int cnt = 0; cnt <= INT_MAX; cnt++) {
/* Get the next timer ID and clamp it to positive space */
unsigned int id = atomic_fetch_inc(&sig->next_posix_timer_id) & INT_MAX;
- struct timer_hash_bucket *bucket = hash_bucket(sig, id);
- scoped_guard (spinlock, &bucket->lock) {
- /*
- * Validate under the lock as this could have raced
- * against another thread ending up with the same
- * ID, which is highly unlikely, but possible.
- */
- if (!posix_timer_hashed(bucket, sig, id)) {
- /*
- * Set the timer ID and the signal pointer to make
- * it identifiable in the hash table. The signal
- * pointer has bit 0 set to indicate that it is not
- * yet fully initialized. posix_timer_hashed()
- * masks this bit out, but the syscall lookup fails
- * to match due to it being set. This guarantees
- * that there can't be duplicate timer IDs handed
- * out.
- */
- timer->it_id = (timer_t)id;
- timer->it_signal = (struct signal_struct *)((unsigned long)sig | 1UL);
- hlist_add_head_rcu(&timer->t_hash, &bucket->head);
- return id;
- }
- }
+ if (posix_timer_add_at(timer, sig, id))
+ return id;
cond_resched();
}
/* POSIX return code when no timer ID could be allocated */
@@ -364,6 +389,15 @@ static enum hrtimer_restart posix_timer_
return HRTIMER_NORESTART;
}
+long posixtimer_create_prctl(unsigned long ctrl)
+{
+ if (ctrl > PR_TIMER_CREATE_RESTORE_IDS_ON)
+ return -EINVAL;
+
+ current->signal->timer_create_restore_ids = ctrl == PR_TIMER_CREATE_RESTORE_IDS_ON;
+ return 0;
+}
+
static struct pid *good_sigevent(sigevent_t * event)
{
struct pid *pid = task_tgid(current);
@@ -435,6 +469,7 @@ static int do_timer_create(clockid_t whi
timer_t __user *created_timer_id)
{
const struct k_clock *kc = clockid_to_kclock(which_clock);
+ timer_t req_id = TIMER_ANY_ID;
struct k_itimer *new_timer;
int error, new_timer_id;
@@ -449,11 +484,20 @@ static int do_timer_create(clockid_t whi
spin_lock_init(&new_timer->it_lock);
+ /* Special case for CRIU to restore timers with a given timer ID. */
+ if (unlikely(current->signal->timer_create_restore_ids)) {
+ if (copy_from_user(&req_id, created_timer_id, sizeof(req_id)))
+ return -EFAULT;
+ /* Valid IDs are 0..INT_MAX */
+ if ((unsigned int)req_id > INT_MAX)
+ return -EINVAL;
+ }
+
/*
* Add the timer to the hash table. The timer is not yet valid
* after insertion, but has a unique ID allocated.
*/
- new_timer_id = posix_timer_add(new_timer);
+ new_timer_id = posix_timer_add(new_timer, req_id);
if (new_timer_id < 0) {
posixtimer_free_timer(new_timer);
return new_timer_id;
@@ -1041,6 +1085,9 @@ void exit_itimers(struct task_struct *ts
struct hlist_node *next;
struct k_itimer *timer;
+ /* Clear restore mode for exec() */
+ tsk->signal->timer_create_restore_ids = 0;
+
if (hlist_empty(&tsk->signal->posix_timers))
return;
Le Tue, Mar 11, 2025 at 11:07:44PM +0100, Thomas Gleixner a écrit : > Checkpoint/Restore in Userspace (CRIU) requires to reconstruct posix timers > with the same timer ID on restore. It uses sys_timer_create() and relies on > the monotonic increasing timer ID provided by this syscall. It creates and > deletes timers until the desired ID is reached. This is can loop for a long > time, when the checkpointed process had a very sparse timer ID range. > > It has been debated to implement a new syscall to allow the creation of > timers with a given timer ID, but that's tideous due to the 32/64bit compat > issues of sigevent_t and of dubious value. > > The restore mechanism of CRIU creates the timers in a state where all > threads of the restored process are held on a barrier and cannot issue > syscalls. That means the restorer task has exclusive control. > > This allows to address this issue with a prctl() so that the restorer > thread can do: > > if (prctl(PR_TIMER_CREATE_RESTORE_IDS, PR_TIMER_CREATE_RESTORE_IDS_ON)) > goto linear_mode; > create_timers_with_explicit_ids(); > prctl(PR_TIMER_CREATE_RESTORE_IDS, PR_TIMER_CREATE_RESTORE_IDS_OFF); > > This is backwards compatible because the prctl() fails on older kernels and > CRIU can fall back to the linear timer ID mechanism. CRIU versions which do > not know about the prctl() just work as before. > > Implement the prctl() and modify timer_create() so that it copies the > requested timer ID from userspace by utilizing the existing timer_t > pointer, which is used to copy out the allocated timer ID on success. > > If the prctl() is disabled, which it is by default, timer_create() works as > before and does not try to read from the userspace pointer. > > There is no problem when a broken or rogue user space application enables > the prctl(). If the user space pointer does not contain a valid ID, then > timer_create() fails. 
If the data is not initialized, but constains a > random valid ID, timer_create() will create that random timer ID or fail if > the ID is already given out. > > As CRIU must use the raw syscall to avoid manipulating the internal state > of the restored process, this has no library dependencies and can be > adopted by CRIU right away. > > Recreating two timers with IDs 1000000 and 2000000 takes 1.5 seconds with > the create/delete method. With the prctl() it takes 3 microseconds. > > Signed-off-by: Thomas Gleixner <tglx@linutronix.de> Reviewed-by: Frederic Weisbecker <frederic@kernel.org>
On Tue, Mar 11, 2025 at 11:32:58PM +0100, Frederic Weisbecker wrote: ... > > > > Recreating two timers with IDs 1000000 and 2000000 takes 1.5 seconds with > > the create/delete method. With the prctl() it takes 3 microseconds. > > > > Signed-off-by: Thomas Gleixner <tglx@linutronix.de> > > Reviewed-by: Frederic Weisbecker <frederic@kernel.org> One thing which just popped up in my head -- this interface may be used not only by criu but any application which wants to create timer with specified id (hell know why, but whatever). As far as I understand we don't provide an interface to _read_ this property, don't we? Thus criu will restore such application which already has this bit set incorrectly. Cyrill
On Wed, Mar 12 2025 at 10:56, Cyrill Gorcunov wrote:
> On Tue, Mar 11, 2025 at 11:32:58PM +0100, Frederic Weisbecker wrote:
> ...
>> >
>> > Recreating two timers with IDs 1000000 and 2000000 takes 1.5 seconds with
>> > the create/delete method. With the prctl() it takes 3 microseconds.
>> >
>> > Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
>>
>> Reviewed-by: Frederic Weisbecker <frederic@kernel.org>
>
> One thing which just popped up in my head -- this interface may be used not
> only by criu but any application which wants to create timer with specified
> id (hell know why, but whatever). As far as I understand we don't provide
Sure. Application developers are creative :)
> an interface to _read_ this property, don't we? Thus criu will restore such
> application which already has this bit set incorrectly.
Delta patch below.
Thanks,
tglx
---
--- a/include/uapi/linux/prctl.h
+++ b/include/uapi/linux/prctl.h
@@ -362,5 +362,6 @@ struct prctl_mm_map {
#define PR_TIMER_CREATE_RESTORE_IDS 77
# define PR_TIMER_CREATE_RESTORE_IDS_OFF 0
# define PR_TIMER_CREATE_RESTORE_IDS_ON 1
+# define PR_TIMER_CREATE_RESTORE_IDS_GET 2
#endif /* _LINUX_PRCTL_H */
--- a/kernel/time/posix-timers.c
+++ b/kernel/time/posix-timers.c
@@ -391,11 +391,17 @@ static enum hrtimer_restart posix_timer_
long posixtimer_create_prctl(unsigned long ctrl)
{
- if (ctrl > PR_TIMER_CREATE_RESTORE_IDS_ON)
- return -EINVAL;
-
- current->signal->timer_create_restore_ids = ctrl == PR_TIMER_CREATE_RESTORE_IDS_ON;
- return 0;
+ switch (ctrl) {
+ case PR_TIMER_CREATE_RESTORE_IDS_OFF:
+ current->signal->timer_create_restore_ids = 0;
+ return 0;
+ case PR_TIMER_CREATE_RESTORE_IDS_ON:
+ current->signal->timer_create_restore_ids = 0;
+ return 0;
+ case PR_TIMER_CREATE_RESTORE_IDS_GET:
+ return current->signal->timer_create_restore_ids;
+ }
+ return -EINVAL;
}
static struct pid *good_sigevent(sigevent_t * event)
On Wed, Mar 12, 2025 at 12:24:54PM +0100, Thomas Gleixner wrote:
> + switch (ctrl) {
> + case PR_TIMER_CREATE_RESTORE_IDS_OFF:
> + current->signal->timer_create_restore_ids = 0;
> + return 0;
> + case PR_TIMER_CREATE_RESTORE_IDS_ON:
> + current->signal->timer_create_restore_ids = 0;
Thanks a huge, Thomas! I suspect this might be a typo, you need "= 1;" here )
Cyrill
On Wed, Mar 12 2025 at 15:41, Cyrill Gorcunov wrote:
> On Wed, Mar 12, 2025 at 12:24:54PM +0100, Thomas Gleixner wrote:
>> + switch (ctrl) {
>> + case PR_TIMER_CREATE_RESTORE_IDS_OFF:
>> + current->signal->timer_create_restore_ids = 0;
>> + return 0;
>> + case PR_TIMER_CREATE_RESTORE_IDS_ON:
>> + current->signal->timer_create_restore_ids = 0;
>
> Thanks a huge, Thomas! I suspect this might be a typo, you need "= 1;" here )
Ooops.
On Wed, Mar 12 2025 at 12:24, Thomas Gleixner wrote:
> On Wed, Mar 12 2025 at 10:56, Cyrill Gorcunov wrote:
>> an interface to _read_ this property, don't we? Thus criu will restore such
>> application which already has this bit set incorrectly.
>
> Delta patch below.
That wants a fixup for the selftest too.
---
diff --git a/tools/testing/selftests/timers/posix_timers.c b/tools/testing/selftests/timers/posix_timers.c
index 158138211f51..f0eceb0faf34 100644
--- a/tools/testing/selftests/timers/posix_timers.c
+++ b/tools/testing/selftests/timers/posix_timers.c
@@ -616,6 +616,7 @@ static int do_timer_delete(int id)
# define PR_TIMER_CREATE_RESTORE_IDS 77
# define PR_TIMER_CREATE_RESTORE_IDS_OFF 0
# define PR_TIMER_CREATE_RESTORE_IDS_ON 1
+# define PR_TIMER_CREATE_RESTORE_IDS_GET 2
#endif
static void check_timer_create_exact(void)
@@ -633,6 +634,9 @@ static void check_timer_create_exact(void)
}
}
+ if (prctl(PR_TIMER_CREATE_RESTORE_IDS, PR_TIMER_CREATE_RESTORE_IDS_GET, 0, 0, 0) != 1)
+ fatal_error(NULL, "prctl(GET) failed\n");
+
id = 8;
if (do_timer_create(&id) < 0)
fatal_error(NULL, "timer_create()");
@@ -641,7 +645,10 @@ static void check_timer_create_exact(void)
fatal_error(NULL, "timer_delete()");
if (prctl(PR_TIMER_CREATE_RESTORE_IDS, PR_TIMER_CREATE_RESTORE_IDS_OFF, 0, 0, 0))
- fatal_error(NULL, "prctl()");
+ fatal_error(NULL, "prctl(OFF)");
+
+ if (prctl(PR_TIMER_CREATE_RESTORE_IDS, PR_TIMER_CREATE_RESTORE_IDS_GET, 0, 0, 0) != 0)
+ fatal_error(NULL, "prctl(GET) failed\n");
if (id != 8) {
ksft_test_result_fail("check timer create exact %d != 8\n", id);
The following commit has been merged into the timers/core branch of tip:
Commit-ID: ec2d0c04624b3c8a7eb1682e006717fa20cfbe24
Gitweb: https://git.kernel.org/tip/ec2d0c04624b3c8a7eb1682e006717fa20cfbe24
Author: Thomas Gleixner <tglx@linutronix.de>
AuthorDate: Tue, 11 Mar 2025 23:07:44 +01:00
Committer: Thomas Gleixner <tglx@linutronix.de>
CommitterDate: Thu, 13 Mar 2025 12:07:18 +01:00
posix-timers: Provide a mechanism to allocate a given timer ID
Checkpoint/Restore in Userspace (CRIU) requires to reconstruct posix timers
with the same timer ID on restore. It uses sys_timer_create() and relies on
the monotonic increasing timer ID provided by this syscall. It creates and
deletes timers until the desired ID is reached. This can loop for a long
time, when the checkpointed process had a very sparse timer ID range.
It has been debated to implement a new syscall to allow the creation of
timers with a given timer ID, but that's tedious due to the 32/64bit compat
issues of sigevent_t and of dubious value.
The restore mechanism of CRIU creates the timers in a state where all
threads of the restored process are held on a barrier and cannot issue
syscalls. That means the restorer task has exclusive control.
This allows to address this issue with a prctl() so that the restorer
thread can do:
if (prctl(PR_TIMER_CREATE_RESTORE_IDS, PR_TIMER_CREATE_RESTORE_IDS_ON))
goto linear_mode;
create_timers_with_explicit_ids();
prctl(PR_TIMER_CREATE_RESTORE_IDS, PR_TIMER_CREATE_RESTORE_IDS_OFF);
This is backwards compatible because the prctl() fails on older kernels and
CRIU can fall back to the linear timer ID mechanism. CRIU versions which do
not know about the prctl() just work as before.
Implement the prctl() and modify timer_create() so that it copies the
requested timer ID from userspace by utilizing the existing timer_t
pointer, which is used to copy out the allocated timer ID on success.
If the prctl() is disabled, which it is by default, timer_create() works as
before and does not try to read from the userspace pointer.
There is no problem when a broken or rogue user space application enables
the prctl(). If the user space pointer does not contain a valid ID, then
timer_create() fails. If the data is not initialized, but contains a
random valid ID, timer_create() will create that random timer ID or fail if
the ID is already given out.
As CRIU must use the raw syscall to avoid manipulating the internal state
of the restored process, this has no library dependencies and can be
adopted by CRIU right away.
Recreating two timers with IDs 1000000 and 2000000 takes 1.5 seconds with
the create/delete method. With the prctl() it takes 3 microseconds.
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Frederic Weisbecker <frederic@kernel.org>
Reviewed-by: Cyrill Gorcunov <gorcunov@gmail.com>
Tested-by: Cyrill Gorcunov <gorcunov@gmail.com>
Link: https://lore.kernel.org/all/87jz8vz0en.ffs@tglx
---
include/linux/posix-timers.h | 2 +-
include/linux/sched/signal.h | 1 +-
include/uapi/linux/prctl.h | 11 ++++-
kernel/sys.c | 5 ++-
kernel/time/posix-timers.c | 105 +++++++++++++++++++++++++---------
5 files changed, 98 insertions(+), 26 deletions(-)
diff --git a/include/linux/posix-timers.h b/include/linux/posix-timers.h
index 094ef57..dd48c64 100644
--- a/include/linux/posix-timers.h
+++ b/include/linux/posix-timers.h
@@ -114,6 +114,7 @@ bool posixtimer_init_sigqueue(struct sigqueue *q);
void posixtimer_send_sigqueue(struct k_itimer *tmr);
bool posixtimer_deliver_signal(struct kernel_siginfo *info, struct sigqueue *timer_sigq);
void posixtimer_free_timer(struct k_itimer *timer);
+long posixtimer_create_prctl(unsigned long ctrl);
/* Init task static initializer */
#define INIT_CPU_TIMERBASE(b) { \
@@ -140,6 +141,7 @@ static inline void posixtimer_rearm_itimer(struct task_struct *p) { }
static inline bool posixtimer_deliver_signal(struct kernel_siginfo *info,
struct sigqueue *timer_sigq) { return false; }
static inline void posixtimer_free_timer(struct k_itimer *timer) { }
+static inline long posixtimer_create_prctl(unsigned long ctrl) { return -EINVAL; }
#endif
#ifdef CONFIG_POSIX_CPU_TIMERS_TASK_WORK
diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h
index 72649d7..1ef1edb 100644
--- a/include/linux/sched/signal.h
+++ b/include/linux/sched/signal.h
@@ -136,6 +136,7 @@ struct signal_struct {
#ifdef CONFIG_POSIX_TIMERS
/* POSIX.1b Interval Timers */
+ unsigned int timer_create_restore_ids:1;
atomic_t next_posix_timer_id;
struct hlist_head posix_timers;
struct hlist_head ignored_posix_timers;
diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h
index 5c60806..15c18ef 100644
--- a/include/uapi/linux/prctl.h
+++ b/include/uapi/linux/prctl.h
@@ -353,4 +353,15 @@ struct prctl_mm_map {
*/
#define PR_LOCK_SHADOW_STACK_STATUS 76
+/*
+ * Controls the mode of timer_create() for CRIU restore operations.
+ * Enabling this allows CRIU to restore timers with explicit IDs.
+ *
+ * Don't use for normal operations as the result might be undefined.
+ */
+#define PR_TIMER_CREATE_RESTORE_IDS 77
+# define PR_TIMER_CREATE_RESTORE_IDS_OFF 0
+# define PR_TIMER_CREATE_RESTORE_IDS_ON 1
+# define PR_TIMER_CREATE_RESTORE_IDS_GET 2
+
#endif /* _LINUX_PRCTL_H */
diff --git a/kernel/sys.c b/kernel/sys.c
index cb366ff..982e1c4 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -2811,6 +2811,11 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
return -EINVAL;
error = arch_lock_shadow_stack_status(me, arg2);
break;
+ case PR_TIMER_CREATE_RESTORE_IDS:
+ if (arg3 || arg4 || arg5)
+ return -EINVAL;
+ error = posixtimer_create_prctl(arg2);
+ break;
default:
trace_task_prctl_unknown(option, arg2, arg3, arg4, arg5);
error = -EINVAL;
diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c
index b917a16..2ca1c55 100644
--- a/kernel/time/posix-timers.c
+++ b/kernel/time/posix-timers.c
@@ -19,6 +19,7 @@
#include <linux/nospec.h>
#include <linux/posix-clock.h>
#include <linux/posix-timers.h>
+#include <linux/prctl.h>
#include <linux/sched/task.h>
#include <linux/slab.h>
#include <linux/syscalls.h>
@@ -57,6 +58,8 @@ static const struct k_clock * const posix_clocks[];
static const struct k_clock *clockid_to_kclock(const clockid_t id);
static const struct k_clock clock_realtime, clock_monotonic;
+#define TIMER_ANY_ID INT_MIN
+
/* SIGEV_THREAD_ID cannot share a bit with the other SIGEV values. */
#if SIGEV_THREAD_ID != (SIGEV_THREAD_ID & \
~(SIGEV_SIGNAL | SIGEV_NONE | SIGEV_THREAD))
@@ -128,38 +131,60 @@ static bool posix_timer_hashed(struct timer_hash_bucket *bucket, struct signal_s
return false;
}
-static int posix_timer_add(struct k_itimer *timer)
+/*
+ * Try to insert @timer into the hash bucket selected by (@sig, @id).
+ *
+ * Returns true when the ID was free and the timer got hashed, false when
+ * posix_timer_hashed() reports the ID as already present for @sig.
+ */
+static bool posix_timer_add_at(struct k_itimer *timer, struct signal_struct *sig, unsigned int id)
+{
+	struct timer_hash_bucket *bucket = hash_bucket(sig, id);
+
+	/*
+	 * The check has to happen with the bucket lock held, because
+	 * another thread could race and end up with the same ID. Highly
+	 * unlikely, but possible.
+	 */
+	guard(spinlock)(&bucket->lock);
+
+	if (posix_timer_hashed(bucket, sig, id))
+		return false;
+
+	/*
+	 * Publish the timer ID together with the signal pointer so the
+	 * entry is identifiable in the hash table. Bit 0 of the signal
+	 * pointer marks the timer as not yet fully initialized:
+	 * posix_timer_hashed() masks that bit out, whereas the syscall
+	 * lookup fails to match because it is set. This guarantees that
+	 * no duplicate timer IDs are handed out.
+	 */
+	timer->it_id = (timer_t)id;
+	timer->it_signal = (struct signal_struct *)((unsigned long)sig | 1UL);
+	hlist_add_head_rcu(&timer->t_hash, &bucket->head);
+	return true;
+}
+
+static int posix_timer_add(struct k_itimer *timer, int req_id)
{
struct signal_struct *sig = current->signal;
+ if (unlikely(req_id != TIMER_ANY_ID)) {
+ if (!posix_timer_add_at(timer, sig, req_id))
+ return -EBUSY;
+
+ /*
+ * Move the ID counter past the requested ID, so that after
+ * switching back to normal mode the IDs are outside of the
+ * exact allocated region. That avoids ID collisions on the
+ * next regular timer_create() invocations.
+ */
+ atomic_set(&sig->next_posix_timer_id, req_id + 1);
+ return req_id;
+ }
+
for (unsigned int cnt = 0; cnt <= INT_MAX; cnt++) {
/* Get the next timer ID and clamp it to positive space */
unsigned int id = atomic_fetch_inc(&sig->next_posix_timer_id) & INT_MAX;
- struct timer_hash_bucket *bucket = hash_bucket(sig, id);
- scoped_guard (spinlock, &bucket->lock) {
- /*
- * Validate under the lock as this could have raced
- * against another thread ending up with the same
- * ID, which is highly unlikely, but possible.
- */
- if (!posix_timer_hashed(bucket, sig, id)) {
- /*
- * Set the timer ID and the signal pointer to make
- * it identifiable in the hash table. The signal
- * pointer has bit 0 set to indicate that it is not
- * yet fully initialized. posix_timer_hashed()
- * masks this bit out, but the syscall lookup fails
- * to match due to it being set. This guarantees
- * that there can't be duplicate timer IDs handed
- * out.
- */
- timer->it_id = (timer_t)id;
- timer->it_signal = (struct signal_struct *)((unsigned long)sig | 1UL);
- hlist_add_head_rcu(&timer->t_hash, &bucket->head);
- return id;
- }
- }
+ if (posix_timer_add_at(timer, sig, id))
+ return id;
cond_resched();
}
/* POSIX return code when no timer ID could be allocated */
@@ -364,6 +389,21 @@ static enum hrtimer_restart posix_timer_fn(struct hrtimer *timer)
return HRTIMER_NORESTART;
}
+/*
+ * Backend of prctl(PR_TIMER_CREATE_RESTORE_IDS).
+ *
+ * @ctrl selects the operation: _OFF and _ON write the
+ * timer_create_restore_ids bit in the signal struct of the calling task
+ * and return 0, _GET returns the current value of that bit. Any other
+ * value yields -EINVAL.
+ */
+long posixtimer_create_prctl(unsigned long ctrl)
+{
+	struct signal_struct *sig = current->signal;
+
+	if (ctrl == PR_TIMER_CREATE_RESTORE_IDS_GET)
+		return sig->timer_create_restore_ids;
+
+	if (ctrl != PR_TIMER_CREATE_RESTORE_IDS_OFF && ctrl != PR_TIMER_CREATE_RESTORE_IDS_ON)
+		return -EINVAL;
+
+	sig->timer_create_restore_ids = ctrl == PR_TIMER_CREATE_RESTORE_IDS_ON;
+	return 0;
+}
+
static struct pid *good_sigevent(sigevent_t * event)
{
struct pid *pid = task_tgid(current);
@@ -435,6 +475,7 @@ static int do_timer_create(clockid_t which_clock, struct sigevent *event,
timer_t __user *created_timer_id)
{
const struct k_clock *kc = clockid_to_kclock(which_clock);
+ timer_t req_id = TIMER_ANY_ID;
struct k_itimer *new_timer;
int error, new_timer_id;
@@ -449,11 +490,20 @@ static int do_timer_create(clockid_t which_clock, struct sigevent *event,
spin_lock_init(&new_timer->it_lock);
+	/*
+	 * Special case for CRIU to restore timers with a given timer ID.
+	 *
+	 * new_timer is already allocated at this point, so every failure
+	 * exit has to release it, exactly like the posix_timer_add()
+	 * failure path below does. Returning directly would leak it.
+	 */
+	if (unlikely(current->signal->timer_create_restore_ids)) {
+		if (copy_from_user(&req_id, created_timer_id, sizeof(req_id))) {
+			posixtimer_free_timer(new_timer);
+			return -EFAULT;
+		}
+		/* Valid IDs are 0..INT_MAX */
+		if ((unsigned int)req_id > INT_MAX) {
+			posixtimer_free_timer(new_timer);
+			return -EINVAL;
+		}
+	}
+
/*
* Add the timer to the hash table. The timer is not yet valid
* after insertion, but has a unique ID allocated.
*/
- new_timer_id = posix_timer_add(new_timer);
+ new_timer_id = posix_timer_add(new_timer, req_id);
if (new_timer_id < 0) {
posixtimer_free_timer(new_timer);
return new_timer_id;
@@ -1041,6 +1091,9 @@ void exit_itimers(struct task_struct *tsk)
struct hlist_node *next;
struct k_itimer *timer;
+ /* Clear restore mode for exec() */
+ tsk->signal->timer_create_restore_ids = 0;
+
if (hlist_empty(&tsk->signal->posix_timers))
return;
On Sat, Mar 08, 2025 at 05:48:47PM +0100, Thomas Gleixner wrote:
> Checkpoint/Restore in Userspace (CRIU) requires to reconstruct posix timers
> with the same timer ID on restore. It uses sys_timer_create() and relies on
> the monotonic increasing timer ID provided by this syscall. It creates and
> deletes timers until the desired ID is reached. This can loop for a long
> time, when the checkpointed process had a very sparse timer ID range.
...
(I've rerun the test with the new series)
Reviewed-by: Cyrill Gorcunov <gorcunov@gmail.com>
Tested-by: Cyrill Gorcunov <gorcunov@gmail.com>
© 2016 - 2025 Red Hat, Inc.