[PATCH v3] posix-timers: Prefer delivery of signals to the current thread

Dmitry Vyukov posted 1 patch 2 years, 7 months ago
There is a newer version of this series
kernel/signal.c                               | 19 ++++-
tools/testing/selftests/timers/posix_timers.c | 73 +++++++++++++++++++
2 files changed, 90 insertions(+), 2 deletions(-)
[PATCH v3] posix-timers: Prefer delivery of signals to the current thread
Posted by Dmitry Vyukov 2 years, 7 months ago
Prefer to deliver signals to the current thread if SIGEV_THREAD_ID
is not set. We used to prefer the main thread, but delivering
to the current thread is both faster, and allows sampling actual thread
activity for CLOCK_PROCESS_CPUTIME_ID timers, and does not change
the semantics (since we queue into shared_pending, all threads may
receive the signal in both cases).

Signed-off-by: Dmitry Vyukov <dvyukov@google.com>
Suggested-by: Oleg Nesterov <oleg@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Eric W. Biederman <ebiederm@xmission.com>
Cc: Frederic Weisbecker <frederic@kernel.org>
Cc: Marco Elver <elver@google.com>

---

Changes in v3:
 - switched to the completely different implementation (much simpler)
   based on the Oleg's idea

Changes in RFC v2:
 - added additional Cc as Thomas asked
---
 kernel/signal.c                               | 19 ++++-
 tools/testing/selftests/timers/posix_timers.c | 73 +++++++++++++++++++
 2 files changed, 90 insertions(+), 2 deletions(-)

diff --git a/kernel/signal.c b/kernel/signal.c
index ae26da61c4d9f..6ed71701f302c 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1003,8 +1003,7 @@ static void complete_signal(int sig, struct task_struct *p, enum pid_type type)
 	/*
 	 * Now find a thread we can wake up to take the signal off the queue.
 	 *
-	 * If the main thread wants the signal, it gets first crack.
-	 * Probably the least surprising to the average bear.
+	 * Try the suggested task first (may or may not be the main thread).
 	 */
 	if (wants_signal(sig, p))
 		t = p;
@@ -1970,7 +1969,23 @@ int send_sigqueue(struct sigqueue *q, struct pid *pid, enum pid_type type)
 
 	ret = -1;
 	rcu_read_lock();
+	/*
+	 * This is called from posix timers. SIGEV_THREAD_ID signals
+	 * (type == PIDTYPE_PID) must be delivered to the user-specified
+	 * thread, so we use that and queue into t->pending.
+	 * Non-SIGEV_THREAD_ID signals must be delivered to "the process",
+	 * so we queue into shared_pending, but prefer to deliver to the
+	 * current thread. We used to prefer the main thread, but delivering
+	 * to the current thread is both faster, and allows user-space to
+	 * sample actual thread activity for CLOCK_PROCESS_CPUTIME_ID timers,
+	 * and does not change the semantics (since we queue into
+	 * shared_pending, all thread may receive the signal in both cases).
+	 * Note: current may be from a completely unrelated process for
+	 * non-cpu-time-based timers, we must be careful to not kick it.
+	 */
 	t = pid_task(pid, type);
+	if (t && type != PIDTYPE_PID && same_thread_group(t, current))
+		t = current;
 	if (!t || !likely(lock_task_sighand(t, &flags)))
 		goto ret;
 
diff --git a/tools/testing/selftests/timers/posix_timers.c b/tools/testing/selftests/timers/posix_timers.c
index 0ba500056e635..fd3b933a191fe 100644
--- a/tools/testing/selftests/timers/posix_timers.c
+++ b/tools/testing/selftests/timers/posix_timers.c
@@ -188,6 +188,76 @@ static int check_timer_create(int which)
 	return 0;
 }
 
+int remain;
+__thread int got_signal;
+
+static void *distribution_thr(void *arg) {
+	while (__atomic_load_n(&remain, __ATOMIC_RELAXED));
+	return NULL;
+}
+
+static void distribution_handler(int nr)
+{
+	if (!__atomic_exchange_n(&got_signal, 1, __ATOMIC_RELAXED))
+		__atomic_fetch_sub(&remain, 1, __ATOMIC_RELAXED);
+}
+
+/* Test that all running threads receive CLOCK_PROCESS_CPUTIME_ID signals. */
+static int check_timer_distribution(void)
+{
+	int err, i;
+	timer_t id;
+	const int nthreads = 10;
+	pthread_t threads[nthreads];
+	struct itimerspec val = {
+		.it_value.tv_sec = 0,
+		.it_value.tv_nsec = 1000 * 1000,
+		.it_interval.tv_sec = 0,
+		.it_interval.tv_nsec = 1000 * 1000,
+	};
+
+	printf("Check timer_create() per process signal distribution... ");
+	fflush(stdout);
+
+	remain = nthreads + 1;  /* worker threads + this thread */
+	signal(SIGALRM, distribution_handler);
+	err = timer_create(CLOCK_PROCESS_CPUTIME_ID, NULL, &id);
+	if (err < 0) {
+		perror("Can't create timer\n");
+		return -1;
+	}
+	err = timer_settime(id, 0, &val, NULL);
+	if (err < 0) {
+		perror("Can't set timer\n");
+		return -1;
+	}
+
+	for (i = 0; i < nthreads; i++) {
+		if (pthread_create(&threads[i], NULL, distribution_thr, NULL)) {
+			perror("Can't create thread\n");
+			return -1;
+		}
+	}
+
+	/* Wait for all threads to receive the signal. */
+	while (__atomic_load_n(&remain, __ATOMIC_RELAXED));
+
+	for (i = 0; i < nthreads; i++) {
+		if (pthread_join(threads[i], NULL)) {
+			perror("Can't join thread\n");
+			return -1;
+		}
+	}
+
+	if (timer_delete(id)) {
+		perror("Can't delete timer\n");
+		return -1;
+	}
+
+	printf("[OK]\n");
+	return 0;
+}
+
 int main(int argc, char **argv)
 {
 	printf("Testing posix timers. False negative may happen on CPU execution \n");
@@ -217,5 +287,8 @@ int main(int argc, char **argv)
 	if (check_timer_create(CLOCK_PROCESS_CPUTIME_ID) < 0)
 		return ksft_exit_fail();
 
+	if (check_timer_distribution() < 0)
+		return ksft_exit_fail();
+
 	return ksft_exit_pass();
 }

base-commit: 7c46948a6e9cf47ed03b0d489fde894ad46f1437
-- 
2.39.1.456.gfc5497dd1b-goog
Re: [PATCH v3] posix-timers: Prefer delivery of signals to the current thread
Posted by Oleg Nesterov 2 years, 7 months ago
On 01/26, Dmitry Vyukov wrote:
>
>  	t = pid_task(pid, type);
> +	if (t && type != PIDTYPE_PID && same_thread_group(t, current))
> +		t = current;
>  	if (!t || !likely(lock_task_sighand(t, &flags)))
>  		goto ret;

Feel free to ignore, this is cosmetic/subjective, but

	t = pid_task(pid, type);
	if (!t)
		goto ret;
	if (type == PIDTYPE_TGID && same_thread_group(t, current))
		t = current;
	if (!likely(lock_task_sighand(t, &flags)))
		goto ret;

looks a bit more readable/clean to me.

LGTM. Let's wait for Thomas's verdict.

Reviewed-by: Oleg Nesterov <oleg@redhat.com>
[PATCH v4] posix-timers: Prefer delivery of signals to the current thread
Posted by Dmitry Vyukov 2 years, 7 months ago
Prefer to deliver signals to the current thread if SIGEV_THREAD_ID
is not set. We used to prefer the main thread, but delivering
to the current thread is both faster, and allows sampling actual thread
activity for CLOCK_PROCESS_CPUTIME_ID timers, and does not change
the semantics (since we queue into shared_pending, all threads may
receive the signal in both cases).

Signed-off-by: Dmitry Vyukov <dvyukov@google.com>
Suggested-by: Oleg Nesterov <oleg@redhat.com>
Reviewed-by: Oleg Nesterov <oleg@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Eric W. Biederman <ebiederm@xmission.com>
Cc: Frederic Weisbecker <frederic@kernel.org>
Cc: Marco Elver <elver@google.com>

---

Changes in v4:
 - restructured checks in send_sigqueue() as suggested

Changes in v3:
 - switched to the completely different implementation (much simpler)
   based on the Oleg's idea

Changes in RFC v2:
 - added additional Cc as Thomas asked
---
 kernel/signal.c                               | 23 +++++-
 tools/testing/selftests/timers/posix_timers.c | 73 +++++++++++++++++++
 2 files changed, 93 insertions(+), 3 deletions(-)

diff --git a/kernel/signal.c b/kernel/signal.c
index ae26da61c4d9f..706ad3a21933d 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1003,8 +1003,7 @@ static void complete_signal(int sig, struct task_struct *p, enum pid_type type)
 	/*
 	 * Now find a thread we can wake up to take the signal off the queue.
 	 *
-	 * If the main thread wants the signal, it gets first crack.
-	 * Probably the least surprising to the average bear.
+	 * Try the suggested task first (may or may not be the main thread).
 	 */
 	if (wants_signal(sig, p))
 		t = p;
@@ -1970,8 +1969,26 @@ int send_sigqueue(struct sigqueue *q, struct pid *pid, enum pid_type type)
 
 	ret = -1;
 	rcu_read_lock();
+	/*
+	 * This is called from posix timers. SIGEV_THREAD_ID signals
+	 * (type == PIDTYPE_PID) must be delivered to the user-specified
+	 * thread, so we use that and queue into t->pending.
+	 * Non-SIGEV_THREAD_ID signals must be delivered to "the process",
+	 * so we queue into shared_pending, but prefer to deliver to the
+	 * current thread. We used to prefer the main thread, but delivering
+	 * to the current thread is both faster, and allows user-space to
+	 * sample actual thread activity for CLOCK_PROCESS_CPUTIME_ID timers,
+	 * and does not change the semantics (since we queue into
+	 * shared_pending, all thread may receive the signal in both cases).
+	 * Note: current may be from a completely unrelated process for
+	 * non-cpu-time-based timers, we must be careful to not kick it.
+	 */
 	t = pid_task(pid, type);
-	if (!t || !likely(lock_task_sighand(t, &flags)))
+	if (!t)
+		goto ret;
+	if (type != PIDTYPE_PID && same_thread_group(t, current))
+		t = current;
+	if (!likely(lock_task_sighand(t, &flags)))
 		goto ret;
 
 	ret = 1; /* the signal is ignored */
diff --git a/tools/testing/selftests/timers/posix_timers.c b/tools/testing/selftests/timers/posix_timers.c
index 0ba500056e635..fd3b933a191fe 100644
--- a/tools/testing/selftests/timers/posix_timers.c
+++ b/tools/testing/selftests/timers/posix_timers.c
@@ -188,6 +188,76 @@ static int check_timer_create(int which)
 	return 0;
 }
 
+int remain;
+__thread int got_signal;
+
+static void *distribution_thr(void *arg) {
+	while (__atomic_load_n(&remain, __ATOMIC_RELAXED));
+	return NULL;
+}
+
+static void distribution_handler(int nr)
+{
+	if (!__atomic_exchange_n(&got_signal, 1, __ATOMIC_RELAXED))
+		__atomic_fetch_sub(&remain, 1, __ATOMIC_RELAXED);
+}
+
+/* Test that all running threads receive CLOCK_PROCESS_CPUTIME_ID signals. */
+static int check_timer_distribution(void)
+{
+	int err, i;
+	timer_t id;
+	const int nthreads = 10;
+	pthread_t threads[nthreads];
+	struct itimerspec val = {
+		.it_value.tv_sec = 0,
+		.it_value.tv_nsec = 1000 * 1000,
+		.it_interval.tv_sec = 0,
+		.it_interval.tv_nsec = 1000 * 1000,
+	};
+
+	printf("Check timer_create() per process signal distribution... ");
+	fflush(stdout);
+
+	remain = nthreads + 1;  /* worker threads + this thread */
+	signal(SIGALRM, distribution_handler);
+	err = timer_create(CLOCK_PROCESS_CPUTIME_ID, NULL, &id);
+	if (err < 0) {
+		perror("Can't create timer\n");
+		return -1;
+	}
+	err = timer_settime(id, 0, &val, NULL);
+	if (err < 0) {
+		perror("Can't set timer\n");
+		return -1;
+	}
+
+	for (i = 0; i < nthreads; i++) {
+		if (pthread_create(&threads[i], NULL, distribution_thr, NULL)) {
+			perror("Can't create thread\n");
+			return -1;
+		}
+	}
+
+	/* Wait for all threads to receive the signal. */
+	while (__atomic_load_n(&remain, __ATOMIC_RELAXED));
+
+	for (i = 0; i < nthreads; i++) {
+		if (pthread_join(threads[i], NULL)) {
+			perror("Can't join thread\n");
+			return -1;
+		}
+	}
+
+	if (timer_delete(id)) {
+		perror("Can't delete timer\n");
+		return -1;
+	}
+
+	printf("[OK]\n");
+	return 0;
+}
+
 int main(int argc, char **argv)
 {
 	printf("Testing posix timers. False negative may happen on CPU execution \n");
@@ -217,5 +287,8 @@ int main(int argc, char **argv)
 	if (check_timer_create(CLOCK_PROCESS_CPUTIME_ID) < 0)
 		return ksft_exit_fail();
 
+	if (check_timer_distribution() < 0)
+		return ksft_exit_fail();
+
 	return ksft_exit_pass();
 }

base-commit: 7c46948a6e9cf47ed03b0d489fde894ad46f1437
-- 
2.39.1.456.gfc5497dd1b-goog
Re: [PATCH v4] posix-timers: Prefer delivery of signals to the current thread
Posted by Marco Elver 2 years, 7 months ago
On Thu, 26 Jan 2023 at 16:41, Dmitry Vyukov <dvyukov@google.com> wrote:
>
> Prefer to deliver signals to the current thread if SIGEV_THREAD_ID
> is not set. We used to prefer the main thread, but delivering
> to the current thread is both faster, and allows to sample actual thread
> activity for CLOCK_PROCESS_CPUTIME_ID timers, and does not change
> the semantics (since we queue into shared_pending, all thread may
> receive the signal in both cases).
>
> Signed-off-by: Dmitry Vyukov <dvyukov@google.com>
> Suggested-by: Oleg Nesterov <oleg@redhat.com>
> Reviewed-by: Oleg Nesterov <oleg@redhat.com>
> Cc: Thomas Gleixner <tglx@linutronix.de>
> Cc: Eric W. Biederman <ebiederm@xmission.com>
> Cc: Frederic Weisbecker <frederic@kernel.org>
> Cc: Marco Elver <elver@google.com>
>
> ---
>
> Changes in v4:
>  - restructured checks in send_sigqueue() as suggested
>
> Changes in v3:
>  - switched to the completely different implementation (much simpler)
>    based on the Oleg's idea
>
> Changes in RFC v2:
>  - added additional Cc as Thomas asked
> ---
>  kernel/signal.c                               | 23 +++++-
>  tools/testing/selftests/timers/posix_timers.c | 73 +++++++++++++++++++
>  2 files changed, 93 insertions(+), 3 deletions(-)

Reviewed-by: Marco Elver <elver@google.com>

Nice - and given the test, hopefully this behaviour won't regress in future.


> diff --git a/kernel/signal.c b/kernel/signal.c
> index ae26da61c4d9f..706ad3a21933d 100644
> --- a/kernel/signal.c
> +++ b/kernel/signal.c
> @@ -1003,8 +1003,7 @@ static void complete_signal(int sig, struct task_struct *p, enum pid_type type)
>         /*
>          * Now find a thread we can wake up to take the signal off the queue.
>          *
> -        * If the main thread wants the signal, it gets first crack.
> -        * Probably the least surprising to the average bear.
> +        * Try the suggested task first (may or may not be the main thread).
>          */
>         if (wants_signal(sig, p))
>                 t = p;
> @@ -1970,8 +1969,26 @@ int send_sigqueue(struct sigqueue *q, struct pid *pid, enum pid_type type)
>
>         ret = -1;
>         rcu_read_lock();
> +       /*
> +        * This is called from posix timers. SIGEV_THREAD_ID signals
> +        * (type == PIDTYPE_PID) must be delivered to the user-specified
> +        * thread, so we use that and queue into t->pending.
> +        * Non-SIGEV_THREAD_ID signals must be delivered to "the process",
> +        * so we queue into shared_pending, but prefer to deliver to the
> +        * current thread. We used to prefer the main thread, but delivering
> +        * to the current thread is both faster, and allows user-space to
> +        * sample actual thread activity for CLOCK_PROCESS_CPUTIME_ID timers,
> +        * and does not change the semantics (since we queue into
> +        * shared_pending, all thread may receive the signal in both cases).
> +        * Note: current may be from a completely unrelated process for
> +        * non-cpu-time-based timers, we must be careful to not kick it.
> +        */
>         t = pid_task(pid, type);
> -       if (!t || !likely(lock_task_sighand(t, &flags)))
> +       if (!t)
> +               goto ret;
> +       if (type != PIDTYPE_PID && same_thread_group(t, current))
> +               t = current;
> +       if (!likely(lock_task_sighand(t, &flags)))
>                 goto ret;
>
>         ret = 1; /* the signal is ignored */
> diff --git a/tools/testing/selftests/timers/posix_timers.c b/tools/testing/selftests/timers/posix_timers.c
> index 0ba500056e635..fd3b933a191fe 100644
> --- a/tools/testing/selftests/timers/posix_timers.c
> +++ b/tools/testing/selftests/timers/posix_timers.c
> @@ -188,6 +188,76 @@ static int check_timer_create(int which)
>         return 0;
>  }
>
> +int remain;
> +__thread int got_signal;
> +
> +static void *distribution_thr(void *arg) {
> +       while (__atomic_load_n(&remain, __ATOMIC_RELAXED));
> +       return NULL;
> +}
> +
> +static void distribution_handler(int nr)
> +{
> +       if (!__atomic_exchange_n(&got_signal, 1, __ATOMIC_RELAXED))
> +               __atomic_fetch_sub(&remain, 1, __ATOMIC_RELAXED);
> +}
> +
> +/* Test that all running threads receive CLOCK_PROCESS_CPUTIME_ID signals. */
> +static int check_timer_distribution(void)
> +{
> +       int err, i;
> +       timer_t id;
> +       const int nthreads = 10;
> +       pthread_t threads[nthreads];
> +       struct itimerspec val = {
> +               .it_value.tv_sec = 0,
> +               .it_value.tv_nsec = 1000 * 1000,
> +               .it_interval.tv_sec = 0,
> +               .it_interval.tv_nsec = 1000 * 1000,
> +       };
> +
> +       printf("Check timer_create() per process signal distribution... ");
> +       fflush(stdout);
> +
> +       remain = nthreads + 1;  /* worker threads + this thread */
> +       signal(SIGALRM, distribution_handler);
> +       err = timer_create(CLOCK_PROCESS_CPUTIME_ID, NULL, &id);
> +       if (err < 0) {
> +               perror("Can't create timer\n");
> +               return -1;
> +       }
> +       err = timer_settime(id, 0, &val, NULL);
> +       if (err < 0) {
> +               perror("Can't set timer\n");
> +               return -1;
> +       }
> +
> +       for (i = 0; i < nthreads; i++) {
> +               if (pthread_create(&threads[i], NULL, distribution_thr, NULL)) {
> +                       perror("Can't create thread\n");
> +                       return -1;
> +               }
> +       }
> +
> +       /* Wait for all threads to receive the signal. */
> +       while (__atomic_load_n(&remain, __ATOMIC_RELAXED));
> +
> +       for (i = 0; i < nthreads; i++) {
> +               if (pthread_join(threads[i], NULL)) {
> +                       perror("Can't join thread\n");
> +                       return -1;
> +               }
> +       }
> +
> +       if (timer_delete(id)) {
> +               perror("Can't delete timer\n");
> +               return -1;
> +       }
> +
> +       printf("[OK]\n");
> +       return 0;
> +}
> +
>  int main(int argc, char **argv)
>  {
>         printf("Testing posix timers. False negative may happen on CPU execution \n");
> @@ -217,5 +287,8 @@ int main(int argc, char **argv)
>         if (check_timer_create(CLOCK_PROCESS_CPUTIME_ID) < 0)
>                 return ksft_exit_fail();
>
> +       if (check_timer_distribution() < 0)
> +               return ksft_exit_fail();
> +
>         return ksft_exit_pass();
>  }
>
> base-commit: 7c46948a6e9cf47ed03b0d489fde894ad46f1437
> --
> 2.39.1.456.gfc5497dd1b-goog
>
Re: [PATCH v4] posix-timers: Prefer delivery of signals to the current thread
Posted by Thomas Gleixner 2 years, 7 months ago
On Thu, Jan 26 2023 at 18:51, Marco Elver wrote:
> On Thu, 26 Jan 2023 at 16:41, Dmitry Vyukov <dvyukov@google.com> wrote:
>>
>> Prefer to deliver signals to the current thread if SIGEV_THREAD_ID
>> is not set. We used to prefer the main thread, but delivering
>> to the current thread is both faster, and allows to sample actual thread
>> activity for CLOCK_PROCESS_CPUTIME_ID timers, and does not change
>> the semantics (since we queue into shared_pending, all thread may
>> receive the signal in both cases).
>
> Reviewed-by: Marco Elver <elver@google.com>
>
> Nice - and and given the test, hopefully this behaviour won't regress in future.

The test does not tell much. It just waits until each thread got a
signal once, which can take quite a while. It does not tell about the
distribution of the signals, which can be completely randomly skewed
towards a few threads.

Also for real world use cases where you have multiple threads with
different periods and runtime per period I have a hard time to
understand how that signal measures anything useful.

The most time consuming thread might actually trigger rarely while other
short threads end up being the ones which cause the timer to fire.

What's the usefulness of this information?

Thanks,

        tglx
Re: [PATCH v4] posix-timers: Prefer delivery of signals to the current thread
Posted by Dmitry Vyukov 2 years, 7 months ago
On Thu, 26 Jan 2023 at 20:57, Thomas Gleixner <tglx@linutronix.de> wrote:
>
> On Thu, Jan 26 2023 at 18:51, Marco Elver wrote:
> > On Thu, 26 Jan 2023 at 16:41, Dmitry Vyukov <dvyukov@google.com> wrote:
> >>
> >> Prefer to deliver signals to the current thread if SIGEV_THREAD_ID
> >> is not set. We used to prefer the main thread, but delivering
> >> to the current thread is both faster, and allows to sample actual thread
> >> activity for CLOCK_PROCESS_CPUTIME_ID timers, and does not change
> >> the semantics (since we queue into shared_pending, all thread may
> >> receive the signal in both cases).
> >
> > Reviewed-by: Marco Elver <elver@google.com>
> >
> > Nice - and and given the test, hopefully this behaviour won't regress in future.
>
> The test does not tell much. It just waits until each thread got a
> signal once, which can take quite a while. It does not tell about the
> distribution of the signals, which can be completely randomly skewed
> towards a few threads.
>
> Also for real world use cases where you have multiple threads with
> different periods and runtime per period I have a hard time to
> understand how that signal measures anything useful.
>
> The most time consuming thread might actually trigger rarely while other
> short threads end up being the ones which cause the timer to fire.
>
> What's the usefulness of this information?
>
> Thanks,
>
>         tglx

Hi Thomas,

Our goal is to sample what threads are doing in production with low
frequency and low overhead. We did not find any reasonable existing
way of doing it on Linux today, as outlined in the RFC version of the
patch (other solutions are either much more complex and/or incur
higher memory and/or CPU overheads):
https://lore.kernel.org/all/20221216171807.760147-1-dvyukov@google.com/

This sampling does not need to be 100% precise as CPU profilers would
require, getting high precision generally requires more complexity and
overheads. The accent is on use in production and low overhead.
Consider we sample with O(seconds) interval, so some activities can
still be unsampled whatever we do here (if they take <second). But on
the other hand the intention is to use this over billions of CPU
hours. So on the global scale we still observe more-or-less
everything.

Currently all signals are practically delivered to the main thread and
the added test does not pass (at least I couldn't wait long enough).
After this change the test passes quickly (within a second for me).
Testing the actual distribution without flaky failures is very hard in
unit tests. After rounds of complaints and deflaking they usually
transform into roughly what this test is doing -- all threads are
getting at least something.
If we want to test ultimate fairness, we would need to start with the
scheduler itself. If threads don't get fair fractions, then signals
won't be evenly distributed as well. I am not sure if there are unit
tests for the scheduler that ensure this in all configurations (e.g.
uneven ratio of runnable threads to CPUs, running in VMs, etc).
I agree this test is not perfect, but as I said, it does not pass now.
So it is useful and will detect a future regression in this logic. It
ensures that running threads eventually get signals.

But regardless of our motivation, this change looks like an
improvement in general. Consider performance alone (why would we wake
another thread, maybe send an IPI, evict caches). Sending the signal
to the thread that overflowed the counter also looks reasonable. For
some programs it may actually give a good picture. Say thread A is
running for a prolonged time, then thread B is running. The program
will first get signals in thread A and then in thread B (instead of
getting them on an unrelated thread).
Re: [PATCH v4] posix-timers: Prefer delivery of signals to the current thread
Posted by Dmitry Vyukov 2 years, 7 months ago
On Fri, 27 Jan 2023 at 07:58, Dmitry Vyukov <dvyukov@google.com> wrote:
>
> On Thu, 26 Jan 2023 at 20:57, Thomas Gleixner <tglx@linutronix.de> wrote:
> >
> > On Thu, Jan 26 2023 at 18:51, Marco Elver wrote:
> > > On Thu, 26 Jan 2023 at 16:41, Dmitry Vyukov <dvyukov@google.com> wrote:
> > >>
> > >> Prefer to deliver signals to the current thread if SIGEV_THREAD_ID
> > >> is not set. We used to prefer the main thread, but delivering
> > >> to the current thread is both faster, and allows to sample actual thread
> > >> activity for CLOCK_PROCESS_CPUTIME_ID timers, and does not change
> > >> the semantics (since we queue into shared_pending, all thread may
> > >> receive the signal in both cases).
> > >
> > > Reviewed-by: Marco Elver <elver@google.com>
> > >
> > > Nice - and and given the test, hopefully this behaviour won't regress in future.
> >
> > The test does not tell much. It just waits until each thread got a
> > signal once, which can take quite a while. It does not tell about the
> > distribution of the signals, which can be completely randomly skewed
> > towards a few threads.
> >
> > Also for real world use cases where you have multiple threads with
> > different periods and runtime per period I have a hard time to
> > understand how that signal measures anything useful.
> >
> > The most time consuming thread might actually trigger rarely while other
> > short threads end up being the ones which cause the timer to fire.
> >
> > What's the usefulness of this information?
> >
> > Thanks,
> >
> >         tglx
>
> Hi Thomas,
>
> Our goal is to sample what threads are doing in production with low
> frequency and low overhead. We did not find any reasonable existing
> way of doing it on Linux today, as outlined in the RFC version of the
> patch (other solutions are either much more complex and/or incur
> higher memory and/or CPU overheads):
> https://lore.kernel.org/all/20221216171807.760147-1-dvyukov@google.com/
>
> This sampling does not need to be 100% precise as CPU profilers would
> require, getting high precision generally requires more complexity and
> overheads. The accent is on use in production and low overhead.
> Consider we sample with O(seconds) interval, so some activities can
> still be unsampled whatever we do here (if they take <second). But on
> the other hand the intention is to use this over billions of CPU
> hours. So on the global scale we still observe more-or-less
> everything.
>
> Currently all signals are practically delivered to the main thread and
> the added test does not pass (at least I couldn't wait long enough).
> After this change the test passes quickly (within a second for me).
> Testing the actual distribution without flaky failures is very hard in
> unit tests. After rounds of complaints and deflaking they usually
> transform into roughly what this test is doing -- all threads are
> getting at least something.
> If we want to test ultimate fairness, we would need to start with the
> scheduler itself. If threads don't get fair fractions, then signals
> won't be evenly distributed as well. I am not sure if there are unit
> tests for the scheduler that ensure this in all configurations (e.g.
> uneven ratio of runnable threads to CPUs, running in VMs, etc).
> I agree this test is not perfect, but as I said, it does not pass now.
> So it is useful and will detect a future regression in this logic. It
> ensures that running threads eventually get signals.
>
> But regardless of our motivation, this change looks like an
> improvement in general. Consider performance alone (why would we wake
> another thread, maybe send an IPI, evict caches). Sending the signal
> to the thread that overflowed the counter also looks reasonable. For
> some programs it may actually give a good picture. Say thread A is
> running for a prolonged time, then thread B is running. The program
> will first get signals in thread A and then in thread B (instead of
> getting them on an unrelated thread).



Hi Thomas,

Has this answered your question? Do you have any other concerns?
If not, please take this into some tree for upstreamming.

Thanks
Re: [PATCH v4] posix-timers: Prefer delivery of signals to the current thread
Posted by Oleg Nesterov 2 years, 7 months ago
Dmitry,

I agree with what you said, just one note...

On 01/27, Dmitry Vyukov wrote:
>
> After this change the test passes quickly (within a second for me).

yet perhaps it makes sense to slightly change it? It does

	+static void *distribution_thr(void *arg) {
	+	while (__atomic_load_n(&remain, __ATOMIC_RELAXED));
	+	return NULL;
	+}

so distribution_thr() eats CPU even after this thread gets a signal and thus
(in theory) it can "steal" cpu_timer_fire() from other threads unpredictably
long ? How about

	-	while (__atomic_load_n(&remain, __ATOMIC_RELAXED));
	+	while (__atomic_load_n(&got_signal, __ATOMIC_RELAXED));

?

Oleg.
Re: [PATCH v4] posix-timers: Prefer delivery of signals to the current thread
Posted by Dmitry Vyukov 2 years, 7 months ago
On Sat, 28 Jan 2023 at 20:56, Oleg Nesterov <oleg@redhat.com> wrote:
>
> Dmitry,
>
> I agree with what you said, just one note...
>
> On 01/27, Dmitry Vyukov wrote:
> >
> > After this change the test passes quickly (within a second for me).
>
> yet perhaps it makes sense to slightly change it? It does
>
>         +static void *distribution_thr(void *arg) {
>         +       while (__atomic_load_n(&remain, __ATOMIC_RELAXED));
>         +       return NULL;
>         +}
>
> so distribution_thr() eats CPU even after this thread gets a signal and thus
> (in theory) it can "steal" cpu_timer_fire() from other threads unpredictably
> long ? How about
>
>         -       while (__atomic_load_n(&remain, __ATOMIC_RELAXED));
>         +       while (__atomic_load_n(&got_signal, __ATOMIC_RELAXED));
> ?

But why?
IIUC this makes the test even "weaker". As Thomas notes it's already
somewhat "weak". And this would make it even "weaker". So if it passes
in the current version, I would keep it as is. It makes sense to relax
it only if it's known to fail sometimes. But it doesn't fail as far as
I know. And the intention is really that the current version must pass
-- all threads must get signals even if other threads are running.
Re: [PATCH v4] posix-timers: Prefer delivery of signals to the current thread
Posted by Oleg Nesterov 2 years, 7 months ago
On 01/30, Dmitry Vyukov wrote:
>
> On Sat, 28 Jan 2023 at 20:56, Oleg Nesterov <oleg@redhat.com> wrote:
> >
> > Dmitry,
> >
> > I agree with what you said, just one note...
> >
> > On 01/27, Dmitry Vyukov wrote:
> > >
> > > After this change the test passes quickly (within a second for me).
> >
> > yet perhaps it makes sense to slightly change it? It does
> >
> >         +static void *distribution_thr(void *arg) {
> >         +       while (__atomic_load_n(&remain, __ATOMIC_RELAXED));
> >         +       return NULL;
> >         +}
> >
> > so distribution_thr() eats CPU even after this thread gets a signal and thus
> > (in theory) it can "steal" cpu_timer_fire() from other threads unpredictably
> > long ? How about
> >
> >         -       while (__atomic_load_n(&remain, __ATOMIC_RELAXED));
> >         +       while (__atomic_load_n(&got_signal, __ATOMIC_RELAXED));
> > ?
>
> But why?
> IIUC this makes the test even "weaker". As Thomas notes it's already
> somewhat "weak". And this would make it even "weaker".

Not sure I understand why this change can make the test weaker...

IIUC, _in theory_ the test-case can "hang" forever, since all threads
are running nothing guarentees that every thread will have a chance to
call cpu_timer_fire() and get a signal.

With this change this is not possible, and the test-case will still
verify that all threads must get a signal.

Nevermind,

> So if it passes
> in the current version, I would keep it as is.

OK, I won't insist, please forget.

Oleg.
Re: [PATCH v4] posix-timers: Prefer delivery of signals to the current thread
Posted by Oleg Nesterov 2 years, 7 months ago
Forgot to mention ...

On 01/28, Oleg Nesterov wrote:
>
> Dmitry,
>
> I agree with what you said, just one note...
>
> On 01/27, Dmitry Vyukov wrote:
> >
> > After this change the test passes quickly (within a second for me).
>
> yet perhaps it makes sense to slightly change it? It does
>
> 	+static void *distribution_thr(void *arg) {
> 	+	while (__atomic_load_n(&remain, __ATOMIC_RELAXED));
> 	+	return NULL;
> 	+}
>
> so distribution_thr() eats CPU even after this thread gets a signal and thus
> (in theory) it can "steal" cpu_timer_fire() from other threads unpredictably
> long ? How about
>
> 	-	while (__atomic_load_n(&remain, __ATOMIC_RELAXED));
> 	+	while (__atomic_load_n(&got_signal, __ATOMIC_RELAXED));
>
> ?

Of course, in this case it also makes sense to change the main() function the
same way and add BUG_ON(remain) after the "for (...) pthread_join()" block.

Oleg.
[PATCH v5] posix-timers: Prefer delivery of signals to the current thread
Posted by Dmitry Vyukov 2 years, 6 months ago
Prefer to deliver signals to the current thread if SIGEV_THREAD_ID
is not set. We used to prefer the main thread, but delivering
to the current thread is both faster, and allows sampling actual thread
activity for CLOCK_PROCESS_CPUTIME_ID timers, and does not change
the semantics (since we queue into shared_pending, all threads may
receive the signal in both cases).

Signed-off-by: Dmitry Vyukov <dvyukov@google.com>
Suggested-by: Oleg Nesterov <oleg@redhat.com>
Reviewed-by: Oleg Nesterov <oleg@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Eric W. Biederman <ebiederm@xmission.com>
Cc: Frederic Weisbecker <frederic@kernel.org>
Cc: Marco Elver <elver@google.com>

---
Changes in v5:
 - rebased onto v6.2

Changes in v4:
 - restructured checks in send_sigqueue() as suggested

Changes in v3:
 - switched to the completely different implementation (much simpler)
   based on the Oleg's idea

Changes in RFC v2:
 - added additional Cc as Thomas asked
---
 kernel/signal.c                               | 23 +++++-
 tools/testing/selftests/timers/posix_timers.c | 73 +++++++++++++++++++
 2 files changed, 93 insertions(+), 3 deletions(-)

diff --git a/kernel/signal.c b/kernel/signal.c
index ae26da61c4d9f..706ad3a21933d 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1003,8 +1003,7 @@ static void complete_signal(int sig, struct task_struct *p, enum pid_type type)
 	/*
 	 * Now find a thread we can wake up to take the signal off the queue.
 	 *
-	 * If the main thread wants the signal, it gets first crack.
-	 * Probably the least surprising to the average bear.
+	 * Try the suggested task first (may or may not be the main thread).
 	 */
 	if (wants_signal(sig, p))
 		t = p;
@@ -1970,8 +1969,26 @@ int send_sigqueue(struct sigqueue *q, struct pid *pid, enum pid_type type)
 
 	ret = -1;
 	rcu_read_lock();
+	/*
+	 * This is called from posix timers. SIGEV_THREAD_ID signals
+	 * (type == PIDTYPE_PID) must be delivered to the user-specified
+	 * thread, so we use that and queue into t->pending.
+	 * Non-SIGEV_THREAD_ID signals must be delivered to "the process",
+	 * so we queue into shared_pending, but prefer to deliver to the
+	 * current thread. We used to prefer the main thread, but delivering
+	 * to the current thread is both faster, and allows user-space to
+	 * sample actual thread activity for CLOCK_PROCESS_CPUTIME_ID timers,
+	 * and does not change the semantics (since we queue into
+	 * shared_pending, all threads may receive the signal in both cases).
+	 * Note: current may be from a completely unrelated process for
+	 * non-cpu-time-based timers, we must be careful to not kick it.
+	 */
 	t = pid_task(pid, type);
-	if (!t || !likely(lock_task_sighand(t, &flags)))
+	if (!t)
+		goto ret;
+	if (type != PIDTYPE_PID && same_thread_group(t, current))
+		t = current;
+	if (!likely(lock_task_sighand(t, &flags)))
 		goto ret;
 
 	ret = 1; /* the signal is ignored */
diff --git a/tools/testing/selftests/timers/posix_timers.c b/tools/testing/selftests/timers/posix_timers.c
index 0ba500056e635..fd3b933a191fe 100644
--- a/tools/testing/selftests/timers/posix_timers.c
+++ b/tools/testing/selftests/timers/posix_timers.c
@@ -188,6 +188,76 @@ static int check_timer_create(int which)
 	return 0;
 }
 
+int remain;
+__thread int got_signal;
+
+static void *distribution_thr(void *arg) {
+	while (__atomic_load_n(&remain, __ATOMIC_RELAXED));
+	return NULL;
+}
+
+static void distribution_handler(int nr)
+{
+	if (!__atomic_exchange_n(&got_signal, 1, __ATOMIC_RELAXED))
+		__atomic_fetch_sub(&remain, 1, __ATOMIC_RELAXED);
+}
+
+/* Test that all running threads receive CLOCK_PROCESS_CPUTIME_ID signals. */
+static int check_timer_distribution(void)
+{
+	int err, i;
+	timer_t id;
+	const int nthreads = 10;
+	pthread_t threads[nthreads];
+	struct itimerspec val = {
+		.it_value.tv_sec = 0,
+		.it_value.tv_nsec = 1000 * 1000,
+		.it_interval.tv_sec = 0,
+		.it_interval.tv_nsec = 1000 * 1000,
+	};
+
+	printf("Check timer_create() per process signal distribution... ");
+	fflush(stdout);
+
+	remain = nthreads + 1;  /* worker threads + this thread */
+	signal(SIGALRM, distribution_handler);
+	err = timer_create(CLOCK_PROCESS_CPUTIME_ID, NULL, &id);
+	if (err < 0) {
+		perror("Can't create timer\n");
+		return -1;
+	}
+	err = timer_settime(id, 0, &val, NULL);
+	if (err < 0) {
+		perror("Can't set timer\n");
+		return -1;
+	}
+
+	for (i = 0; i < nthreads; i++) {
+		if (pthread_create(&threads[i], NULL, distribution_thr, NULL)) {
+			perror("Can't create thread\n");
+			return -1;
+		}
+	}
+
+	/* Wait for all threads to receive the signal. */
+	while (__atomic_load_n(&remain, __ATOMIC_RELAXED));
+
+	for (i = 0; i < nthreads; i++) {
+		if (pthread_join(threads[i], NULL)) {
+			perror("Can't join thread\n");
+			return -1;
+		}
+	}
+
+	if (timer_delete(id)) {
+		perror("Can't delete timer\n");
+		return -1;
+	}
+
+	printf("[OK]\n");
+	return 0;
+}
+
 int main(int argc, char **argv)
 {
 	printf("Testing posix timers. False negative may happen on CPU execution \n");
@@ -217,5 +287,8 @@ int main(int argc, char **argv)
 	if (check_timer_create(CLOCK_PROCESS_CPUTIME_ID) < 0)
 		return ksft_exit_fail();
 
+	if (check_timer_distribution() < 0)
+		return ksft_exit_fail();
+
 	return ksft_exit_pass();
 }

base-commit: c9c3395d5e3dcc6daee66c6908354d47bf98cb0c
-- 
2.39.2.637.g21b0678d19-goog
Re: [PATCH v5] posix-timers: Prefer delivery of signals to the current thread
Posted by Oleg Nesterov 2 years, 6 months ago
On 02/20, Dmitry Vyukov wrote:
>
>  kernel/signal.c                               | 23 +++++-
>  tools/testing/selftests/timers/posix_timers.c | 73 +++++++++++++++++++
>  2 files changed, 93 insertions(+), 3 deletions(-)

For the change in kernel/signal.c

Reviewed-by: Oleg Nesterov <oleg@redhat.com>
Re: [PATCH v5] posix-timers: Prefer delivery of signals to the current thread
Posted by Dmitry Vyukov 2 years, 6 months ago
On Wed, 22 Feb 2023 at 16:19, Oleg Nesterov <oleg@redhat.com> wrote:
>
> On 02/20, Dmitry Vyukov wrote:
> >
> >  kernel/signal.c                               | 23 +++++-
> >  tools/testing/selftests/timers/posix_timers.c | 73 +++++++++++++++++++
> >  2 files changed, 93 insertions(+), 3 deletions(-)
>
> For the change in kernel/signal.c
>
> Reviewed-by: Oleg Nesterov <oleg@redhat.com>

Hi Thomas,

Do you have any remaining concerns about this patch?
Should I drop the test to not confuse people? Then it will be just a
few lines in signal.c.