[PATCH] sched: Introduce task_struct::latency_sensi_flag.

fuyuanli posted 1 patch 2 weeks ago
include/linux/sched.h            |  2 ++
include/uapi/linux/sched.h       |  4 +++-
include/uapi/linux/sched/types.h |  3 +++
init/init_task.c                 |  1 +
kernel/sched/core.c              | 12 ++++++++++++
kernel/softirq.c                 | 20 ++++++++++++--------
6 files changed, 33 insertions(+), 9 deletions(-)
[PATCH] sched: Introduce task_struct::latency_sensi_flag.
Posted by fuyuanli 2 weeks ago
In the path local_bh_enable()->__local_bh_enable_ip(), the softirq
handlers will be executed in the context of current task. But for some
tasks sensitive to running latency, we expect that they will not spend
extra time executing softirq. So latency_sensi_flag is introduced in
task_struct, when it is set to 1, task only wakes up softirq daemon in
__local_bh_enable_ip().

A test has been made in two hosts named A and B. In A, several clients
sent udp packets to a single server in B concurrently as fast as
possible. In B, the IRQs of these flows were bound to CPU 0 by flow
director, so there was always a triggered net_rx softirq on CPU 0. Then
a test program was started in B, which was also bound to CPU 0, and
kept calling sendto() in a loop. Sampling with perf, results showed
that about 25% of running time of test program was spent executing
local_bh_enable() contained in syscall sendto(), but after setting
latency_sensi_flag to 1, this proportion had been reduced to 0.5%.

Signed-off-by: fuyuanli <fuyuanli@didiglobal.com>
---
 include/linux/sched.h            |  2 ++
 include/uapi/linux/sched.h       |  4 +++-
 include/uapi/linux/sched/types.h |  3 +++
 init/init_task.c                 |  1 +
 kernel/sched/core.c              | 12 ++++++++++++
 kernel/softirq.c                 | 20 ++++++++++++--------
 6 files changed, 33 insertions(+), 9 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 3c2abbc587b4..af39888079c0 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -925,6 +925,8 @@ struct task_struct {
 	/* Bit to tell TOMOYO we're in execve(): */
 	unsigned			in_execve:1;
 	unsigned			in_iowait:1;
+	/* Bit means if task is sensitive to latency */
+	unsigned			latency_sensi_flag:1;
 #ifndef TIF_RESTORE_SIGMASK
 	unsigned			restore_sigmask:1;
 #endif
diff --git a/include/uapi/linux/sched.h b/include/uapi/linux/sched.h
index 3bac0a8ceab2..07c7ec5bd5a6 100644
--- a/include/uapi/linux/sched.h
+++ b/include/uapi/linux/sched.h
@@ -132,6 +132,7 @@ struct clone_args {
 #define SCHED_FLAG_KEEP_PARAMS		0x10
 #define SCHED_FLAG_UTIL_CLAMP_MIN	0x20
 #define SCHED_FLAG_UTIL_CLAMP_MAX	0x40
+#define SCHED_FLAG_LATENCY_SENSITIVE	0x80
 
 #define SCHED_FLAG_KEEP_ALL	(SCHED_FLAG_KEEP_POLICY | \
 				 SCHED_FLAG_KEEP_PARAMS)
@@ -143,6 +144,7 @@ struct clone_args {
 			 SCHED_FLAG_RECLAIM		| \
 			 SCHED_FLAG_DL_OVERRUN		| \
 			 SCHED_FLAG_KEEP_ALL		| \
-			 SCHED_FLAG_UTIL_CLAMP)
+			 SCHED_FLAG_UTIL_CLAMP		| \
+			 SCHED_FLAG_LATENCY_SENSITIVE)
 
 #endif /* _UAPI_LINUX_SCHED_H */
diff --git a/include/uapi/linux/sched/types.h b/include/uapi/linux/sched/types.h
index 90662385689b..d435b75e6ac9 100644
--- a/include/uapi/linux/sched/types.h
+++ b/include/uapi/linux/sched/types.h
@@ -116,6 +116,9 @@ struct sched_attr {
 	__u32 sched_util_min;
 	__u32 sched_util_max;
 
+	/* Latency sensitive flag */
+	__u32 sched_latency_sensi_flag;
+
 };
 
 #endif /* _UAPI_LINUX_SCHED_TYPES_H */
diff --git a/init/init_task.c b/init/init_task.c
index 4daee6d761c8..f36237d06485 100644
--- a/init/init_task.c
+++ b/init/init_task.c
@@ -98,6 +98,7 @@ struct task_struct init_task __aligned(L1_CACHE_BYTES) = {
 #ifdef CONFIG_CGROUP_SCHED
 	.sched_task_group = &root_task_group,
 #endif
+	.latency_sensi_flag = 0,
 	.ptraced	= LIST_HEAD_INIT(init_task.ptraced),
 	.ptrace_entry	= LIST_HEAD_INIT(init_task.ptrace_entry),
 	.real_parent	= &init_task,
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 7019a40457a6..6dfc2db7ef88 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -7763,6 +7763,9 @@ static int __sched_setscheduler(struct task_struct *p,
 			return retval;
 	}
 
+	if (attr->sched_latency_sensi_flag > 1)
+		return -EINVAL;
+
 	/*
 	 * SCHED_DEADLINE bandwidth accounting relies on stable cpusets
 	 * information.
@@ -7804,6 +7807,8 @@ static int __sched_setscheduler(struct task_struct *p,
 		if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP)
 			goto change;
 
+		if (attr->sched_flags & SCHED_FLAG_LATENCY_SENSITIVE)
+			p->latency_sensi_flag = attr->sched_latency_sensi_flag;
 		p->sched_reset_on_fork = reset_on_fork;
 		retval = 0;
 		goto unlock;
@@ -7908,6 +7913,9 @@ static int __sched_setscheduler(struct task_struct *p,
 
 	check_class_changed(rq, p, prev_class, oldprio);
 
+	if (attr->sched_flags & SCHED_FLAG_LATENCY_SENSITIVE)
+		p->latency_sensi_flag = attr->sched_latency_sensi_flag;
+
 	/* Avoid rq from going away on us: */
 	preempt_disable();
 	head = splice_balance_callbacks(rq);
@@ -8314,6 +8322,10 @@ SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr,
 		get_params(p, &kattr);
 		kattr.sched_flags &= SCHED_FLAG_ALL;
 
+		kattr.sched_latency_sensi_flag = p->latency_sensi_flag;
+		if (kattr.sched_latency_sensi_flag)
+			kattr.sched_flags |= SCHED_FLAG_LATENCY_SENSITIVE;
+
 #ifdef CONFIG_UCLAMP_TASK
 		/*
 		 * This could race with another potential updater, but this is fine
diff --git a/kernel/softirq.c b/kernel/softirq.c
index b315b21fb28c..f4e7ce4cde81 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -226,10 +226,10 @@ void __local_bh_enable_ip(unsigned long ip, unsigned int cnt)
 		goto out;
 
 	/*
-	 * If this was called from non preemptible context, wake up the
-	 * softirq daemon.
+	 * If this was called from non preemptible context, or current task is
+	 * sensitive to running latency, wake up the softirq daemon.
 	 */
-	if (!preempt_on) {
+	if (!preempt_on || current->latency_sensi_flag) {
 		wakeup_softirqd();
 		goto out;
 	}
@@ -375,11 +375,15 @@ void __local_bh_enable_ip(unsigned long ip, unsigned int cnt)
 	__preempt_count_sub(cnt - 1);
 
 	if (unlikely(!in_interrupt() && local_softirq_pending())) {
-		/*
-		 * Run softirq if any pending. And do it in its own stack
-		 * as we may be calling this deep in a task call stack already.
-		 */
-		do_softirq();
+		/* If task is sensitive to running latency, only wake up the softirq daemon. */
+		if (current->latency_sensi_flag)
+			wakeup_softirqd();
+		else
+			/*
+			 * Run softirq if any pending. And do it in its own stack
+			 * as we may be calling this deep in a task call stack already.
+			 */
+			do_softirq();
 	}
 
 	preempt_count_dec();
-- 
2.17.1
Re: [PATCH] sched: Introduce task_struct::latency_sensi_flag.
Posted by Jakub Kicinski 1 week, 6 days ago
On Sun, 5 May 2024 11:06:15 +0800 fuyuanli wrote:
> A test has been made in two hosts named A and B. In A, several clients
> sent udp packets to a single server in B concurrently as fast as
> possible. In B, the IRQs of these flows were bound to CPU 0 by flow
> director, so there was always a triggered net_rx softirq on CPU 0. Then
> a test program was started in B, which was also bound to CPU 0, and
> keeped calling sendto() in a loop. Sampling with perf, results showed
> that about 25% of running time of test program was spent executing
> local_bh_enable() contained in syscall sendto(), but after setting
> latency_sensi_flag to 1, this proportion had been reduced to 0.5%.

Enable threaded NAPI, it will have the same effect.
Re: [PATCH] sched: Introduce task_struct::latency_sensi_flag.
Posted by 付元力 Jerry Fu 1 week, 4 days ago
Sure, my first thought was to provide a way to have a
low-latency task when the user does not want to enable
threaded NAPI. Once threaded NAPI is enabled, it affects
the whole system. I want to have a low-latency task while
minimizing the impact on the rest of the system.

Thanks
fuyuanli

在 2024/5/6 下午10:04,“Jakub Kicinski”<kuba@kernel.org <mailto:kuba@kernel.org>> 写入:


On Sun, 5 May 2024 11:06:15 +0800 fuyuanli wrote:
> A test has been made in two hosts named A and B. In A, several clients
> sent udp packets to a single server in B concurrently as fast as
> possible. In B, the IRQs of these flows were bound to CPU 0 by flow
> director, so there was always a triggered net_rx softirq on CPU 0. Then
> a test program was started in B, which was also bound to CPU 0, and
> keeped calling sendto() in a loop. Sampling with perf, results showed
> that about 25% of running time of test program was spent executing
> local_bh_enable() contained in syscall sendto(), but after setting
> latency_sensi_flag to 1, this proportion had been reduced to 0.5%.


Enable threaded NAPI, it will have the same effect.



Re: [PATCH] sched: Introduce task_struct::latency_sensi_flag.
Posted by Sebastian Andrzej Siewior 1 week, 6 days ago
On 2024-05-05 11:06:15 [+0800], fuyuanli wrote:
> In the path local_bh_enable()->__local_bh_enable_ip(), the softirq
> handlers will be executed in the context of current task. But for some
> tasks sensitive to running latency, we expect that they will not spend
> extra time executing softirq. So latency_sensi_flag is introduced in
> task_struct, when it is set to 1, task only wakes up softirq daemon in
> __local_bh_enable_ip().
> 
> A test has been made in two hosts named A and B. In A, several clients
> sent udp packets to a single server in B concurrently as fast as
> possible. In B, the IRQs of these flows were bound to CPU 0 by flow
> director, so there was always a triggered net_rx softirq on CPU 0. Then
> a test program was started in B, which was also bound to CPU 0, and
> keeped calling sendto() in a loop. Sampling with perf, results showed
> that about 25% of running time of test program was spent executing
> local_bh_enable() contained in syscall sendto(), but after setting
> latency_sensi_flag to 1, this proportion had been reduced to 0.5%.

Is this PREEMPT_RT related or not?
RT wise I worked hard to get rid of ksoftirqd usage because you lose
context, priority and everything once this happens. Plus an innocent
thread can be forced to do the work instead.
Non-RT wise your performance can go rapidly down the hill if the wrong
task/ user is outsourcing the work to ksoftirqd.

And this is what you are doing: You are outsourcing work to a different
context and have 25% improvement here and 25% work somewhere else which
you don't measure. Not to mention that _another_ context could do this
softirq work if it happens to end up in the section before ksoftirqd had
a chance to run.

So, this does not sound good. If you want to have a low-latency task
which can send packets and not do the needed softirq part I would
suggest to have another thread where this is outsourced and the thread
does the work.

> Signed-off-by: fuyuanli <fuyuanli@didiglobal.com>

Sebastian
Re: [PATCH] sched: Introduce task_struct::latency_sensi_flag.
Posted by 付元力 Jerry Fu 1 week, 4 days ago
>
>
>
> 在 2024/5/7 上午9:28,“Sebastian Andrzej Siewior”<bigeasy@linutronix.de <mailto:bigeasy@linutronix.de>> 写入:
>
>
> On 2024-05-05 11:06:15 [+0800], fuyuanli wrote:
> > In the path local_bh_enable()->__local_bh_enable_ip(), the softirq
> > handlers will be executed in the context of current task. But for some
> > tasks sensitive to running latency, we expect that they will not spend
> > extra time executing softirq. So latency_sensi_flag is introduced in
> > task_struct, when it is set to 1, task only wakes up softirq daemon in
> > __local_bh_enable_ip().
> >
> > A test has been made in two hosts named A and B. In A, several clients
> > sent udp packets to a single server in B concurrently as fast as
> > possible. In B, the IRQs of these flows were bound to CPU 0 by flow
> > director, so there was always a triggered net_rx softirq on CPU 0. Then
> > a test program was started in B, which was also bound to CPU 0, and
> > keeped calling sendto() in a loop. Sampling with perf, results showed
> > that about 25% of running time of test program was spent executing
> > local_bh_enable() contained in syscall sendto(), but after setting
> > latency_sensi_flag to 1, this proportion had been reduced to 0.5%.
>
>
> Is this PREEMPT_RT related or not?
The problem that I met occurred on an RT kernel: a task had high latency
due to spending much time doing softirq work. And I think some tasks on a
non-RT kernel may also want low latency, so this flag is introduced for
both RT and non-RT kernels.
> RT wise I worked hard to get rid of ksoftirqd usage because you use lose
My implementation refers to the current code in the RT kernel. In the
current version, __local_bh_enable_ip()
will outsource work to ksoftirqd when preemption is disabled.
> context, priority and everything once this happens. Plus an innocent
> thread can be forced to do the work instead.
Sorry, I don't understand, which thread you mean?
> Non-RT wise your performance can go rapidly down the hill if the wrong
> task/ user is outsourcing the work to ksoftirqd.
I agree, so the default value of the new flag is 0, which does not affect
the normal working of softirq.
Users need to evaluate both the performance and latency impacts, and
decide whether to set it to 1.
>
>
> And this is what you are doing: You are outsourcing work to a different
> context and have 25% improvement here and 25% work somewhere else which
Yeah, there must be 25% of the work somewhere else. I think the purpose of
outsourcing the work is to ensure that some special tasks have low latency,
not the whole system. And the decision lies with the users.
> you don't measure. Not to mention that _another_ context could do this
> softirq work if it happens to end up in the section before ksoftirqd had
> a chance to run.
>
>
> So, this does not sound good. If you want to have a low-latency task
> which can send packets and not do the needed softirq part I would
> suggest to have another thread where this is outsourced and the thread
> does the work.
Do you mean the NAPI thread? My thought is to provide a way to have a
low-latency task when the user does not want to enable threaded NAPI.
>
>
> > Signed-off-by: fuyuanli <fuyuanli@didiglobal.com <mailto:fuyuanli@didiglobal.com>>
>
>
> Sebastian
>
>
>
Thanks
fuyuanli

在 2024/5/7 上午9:28,“Sebastian Andrzej Siewior”<bigeasy@linutronix.de <mailto:bigeasy@linutronix.de>> 写入:


On 2024-05-05 11:06:15 [+0800], fuyuanli wrote:
> In the path local_bh_enable()->__local_bh_enable_ip(), the softirq
> handlers will be executed in the context of current task. But for some
> tasks sensitive to running latency, we expect that they will not spend
> extra time executing softirq. So latency_sensi_flag is introduced in
> task_struct, when it is set to 1, task only wakes up softirq daemon in
> __local_bh_enable_ip().
> 
> A test has been made in two hosts named A and B. In A, several clients
> sent udp packets to a single server in B concurrently as fast as
> possible. In B, the IRQs of these flows were bound to CPU 0 by flow
> director, so there was always a triggered net_rx softirq on CPU 0. Then
> a test program was started in B, which was also bound to CPU 0, and
> keeped calling sendto() in a loop. Sampling with perf, results showed
> that about 25% of running time of test program was spent executing
> local_bh_enable() contained in syscall sendto(), but after setting
> latency_sensi_flag to 1, this proportion had been reduced to 0.5%.


Is this PREEMPT_RT related or not?
RT wise I worked hard to get rid of ksoftirqd usage because you use lose
context, priority and everything once this happens. Plus an innocent
thread can be forced to do the work instead.
Non-RT wise your performance can go rapidly down the hill if the wrong
task/ user is outsourcing the work to ksoftirqd.


And this is what you are doing: You are outsourcing work to a different
context and have 25% improvement here and 25% work somewhere else which
you don't measure. Not to mention that _another_ context could do this
softirq work if it happens to end up in the section before ksoftirqd had
a chance to run.


So, this does not sound good. If you want to have a low-latency task
which can send packets and not do the needed softirq part I would
suggest to have another thread where this is outsourced and the thread
does the work.


> Signed-off-by: fuyuanli <fuyuanli@didiglobal.com <mailto:fuyuanli@didiglobal.com>>


Sebastian