need_resched() added in commit c10d73671ad3 ("softirq: reduce latencies")
does improve latency for real workloads (for example memcache).
Unfortunately it triggers quite often even for non-network-heavy apps
(~900 times a second on a loaded webserver), and in a small fraction of
cases whatever the scheduler decided to run will hold onto the CPU
for the entire time slice.
10ms+ stalls on a machine which is not actually under overload cause
erratic network behavior and spurious TCP retransmits. Typical end-to-end
latency in a datacenter is < 200us so it's common to set TCP timeout
to 10ms or less.
The intent of the need_resched() is to let a low latency application
respond quickly and yield (to ksoftirqd). Put a time limit on this dance.
Ignore the fact that ksoftirqd is RUNNING if we were trying to be nice
and the application did not yield quickly.
On a webserver loaded at 90% CPU this change reduces the number of 8ms+
stalls the network softirq processing sees by around 10x (2/sec -> 0.2/sec).
It also seems to reduce retransmissions by ~10% but the data is quite
noisy.
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
kernel/softirq.c | 21 ++++++++++++++++++---
1 file changed, 18 insertions(+), 3 deletions(-)
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 00b838d566c1..ad200d386ec1 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -59,6 +59,7 @@ EXPORT_PER_CPU_SYMBOL(irq_stat);
static struct softirq_action softirq_vec[NR_SOFTIRQS] __cacheline_aligned_in_smp;
DEFINE_PER_CPU(struct task_struct *, ksoftirqd);
+static DEFINE_PER_CPU(unsigned long, overload_limit);
const char * const softirq_to_name[NR_SOFTIRQS] = {
"HI", "TIMER", "NET_TX", "NET_RX", "BLOCK", "IRQ_POLL",
@@ -89,10 +90,15 @@ static void wakeup_softirqd(void)
static bool ksoftirqd_should_handle(unsigned long pending)
{
struct task_struct *tsk = __this_cpu_read(ksoftirqd);
+ unsigned long ov_limit;
if (pending & SOFTIRQ_NOW_MASK)
return false;
- return tsk && task_is_running(tsk) && !__kthread_should_park(tsk);
+ if (likely(!tsk || !task_is_running(tsk) || __kthread_should_park(tsk)))
+ return false;
+
+ ov_limit = __this_cpu_read(overload_limit);
+ return time_is_after_jiffies(ov_limit);
}
#ifdef CONFIG_TRACE_IRQFLAGS
@@ -492,6 +498,9 @@ asmlinkage __visible void do_softirq(void)
#define MAX_SOFTIRQ_TIME msecs_to_jiffies(2)
#define MAX_SOFTIRQ_RESTART 10
+#define SOFTIRQ_OVERLOAD_TIME msecs_to_jiffies(100)
+#define SOFTIRQ_DEFER_TIME msecs_to_jiffies(2)
+
#ifdef CONFIG_TRACE_IRQFLAGS
/*
* When we run softirqs from irq_exit() and thus on the hardirq stack we need
@@ -588,10 +597,16 @@ asmlinkage __visible void __softirq_entry __do_softirq(void)
pending = local_softirq_pending();
if (pending) {
- if (time_before(jiffies, end) && !need_resched() &&
- --max_restart)
+ unsigned long limit;
+
+ if (time_is_before_eq_jiffies(end) || !--max_restart)
+ limit = SOFTIRQ_OVERLOAD_TIME;
+ else if (need_resched())
+ limit = SOFTIRQ_DEFER_TIME;
+ else
goto restart;
+ __this_cpu_write(overload_limit, jiffies + limit);
wakeup_softirqd();
}
--
2.38.1
Jakub! On Thu, Dec 22 2022 at 14:12, Jakub Kicinski wrote: > DEFINE_PER_CPU(struct task_struct *, ksoftirqd); > +static DEFINE_PER_CPU(unsigned long, overload_limit); > > const char * const softirq_to_name[NR_SOFTIRQS] = { > "HI", "TIMER", "NET_TX", "NET_RX", "BLOCK", "IRQ_POLL", > @@ -89,10 +90,15 @@ static void wakeup_softirqd(void) > static bool ksoftirqd_should_handle(unsigned long pending) > { > struct task_struct *tsk = __this_cpu_read(ksoftirqd); > + unsigned long ov_limit; > > if (pending & SOFTIRQ_NOW_MASK) > return false; > - return tsk && task_is_running(tsk) && !__kthread_should_park(tsk); > + if (likely(!tsk || !task_is_running(tsk) || __kthread_should_park(tsk))) > + return false; > + > + ov_limit = __this_cpu_read(overload_limit); > + return time_is_after_jiffies(ov_limit); return time_is_after_jiffies(__this_cpu_read(overload_limit)); Plus a comment explaining the magic, please. > } > > #ifdef CONFIG_TRACE_IRQFLAGS > @@ -492,6 +498,9 @@ asmlinkage __visible void do_softirq(void) > #define MAX_SOFTIRQ_TIME msecs_to_jiffies(2) > #define MAX_SOFTIRQ_RESTART 10 > > +#define SOFTIRQ_OVERLOAD_TIME msecs_to_jiffies(100) > +#define SOFTIRQ_DEFER_TIME msecs_to_jiffies(2) > + > #ifdef CONFIG_TRACE_IRQFLAGS > /* > * When we run softirqs from irq_exit() and thus on the hardirq stack we need > @@ -588,10 +597,16 @@ asmlinkage __visible void __softirq_entry __do_softirq(void) > > pending = local_softirq_pending(); > if (pending) { > - if (time_before(jiffies, end) && !need_resched() && > - --max_restart) > + unsigned long limit; > + > + if (time_is_before_eq_jiffies(end) || !--max_restart) > + limit = SOFTIRQ_OVERLOAD_TIME; > + else if (need_resched()) > + limit = SOFTIRQ_DEFER_TIME; > + else > goto restart; > > + __this_cpu_write(overload_limit, jiffies + limit); The logic of all this is non-obvious and I had to reread it 5 times to conclude that it is matching the intent. Please add comments. 
While I'm not a big fan of heuristical duct tape, this looks harmless enough to not end up in an endless stream of tweaking. Famous last words... But without the sched_clock() changes the actual defer time depends on HZ and the point in time where limit is set. That means it ranges from 0 to 1/HZ, i.e. the 2ms defer time ends up with close to 10ms on HZ=100 in the worst case, which perhaps explains the 8ms+ stalls you are still observing. Can you test with that sched_clock change applied, i.e. the first two commits from git://git.kernel.org/pub/scm/linux/kernel/git/peterz/queue.git core/softirq 59be25c466d9 ("softirq: Use sched_clock() based timeout") bd5a5bd77009 ("softirq: Rewrite softirq processing loop") whether that makes a difference? Those two can be applied with some minor polishing. The rest of that series is broken by f10020c97f4c ("softirq: Allow early break"). There is another issue with this overload limit. Assume max_restart or timeout triggered and limit was set to now + 100ms. ksoftirqd runs and gets the issue resolved after 10ms. So for the remaining 90ms any invocation of raise_softirq() outside of (soft)interrupt context, which wakes ksoftirqd again, prevents processing on return from interrupt until ksoftirqd gets on the CPU and goes back to sleep, because task_is_running() == true and the stale limit is not after jiffies. Probably not a big issue, but someone will notice on some weird workload sooner than later and the tweaking will start nevertheless. :) So maybe we fix it right away. :) Thanks, tglx
On Fri, 03 Mar 2023 14:30:46 +0100 Thomas Gleixner wrote: > > - if (time_before(jiffies, end) && !need_resched() && > > - --max_restart) > > + unsigned long limit; > > + > > + if (time_is_before_eq_jiffies(end) || !--max_restart) > > + limit = SOFTIRQ_OVERLOAD_TIME; > > + else if (need_resched()) > > + limit = SOFTIRQ_DEFER_TIME; > > + else > > goto restart; > > > > + __this_cpu_write(overload_limit, jiffies + limit); > > The logic of all this is non-obvious and I had to reread it 5 times to > conclude that it is matching the intent. Please add comments. > > While I'm not a big fan of heuristical duct tape, this looks harmless > enough to not end up in an endless stream of tweaking. Famous last > words... Would it all be more readable if I named the "overload_limit" "overloaded_until" instead? Naming.. I'll add comments, too. > But without the sched_clock() changes the actual defer time depends on > HZ and the point in time where limit is set. That means it ranges from 0 > to 1/HZ, i.e. the 2ms defer time ends up with close to 10ms on HZ=100 in > the worst case, which perhaps explains the 8ms+ stalls you are still > observing. Can you test with that sched_clock change applied, i.e. the > first two commits from > > git://git.kernel.org/pub/scm/linux/kernel/git/peterz/queue.git core/softirq > > 59be25c466d9 ("softirq: Use sched_clock() based timeout") > bd5a5bd77009 ("softirq: Rewrite softirq processing loop") Those will help, but I spent some time digging into the jiffies related warts with kprobes - while annoying they weren't a major source of wake ups. (FWIW the jiffies noise on our workloads is due to cgroup stats disabling IRQs for multiple ms on the timekeeping CPU). Here are fresh stats on why we wake up ksoftirqd on our Web workload (collected over 100 sec): Time exceeded: 484 Loop max run out: 6525 need_resched(): 10219 (control: 17226 - number of times wakeup_process called for ksirqd) As you can see need_resched() dominates. 
Zooming into the time exceeded - we can count nanoseconds between __do_softirq starting and the check. This is the histogram of actual usecs as seen by BPF (AKA ktime_get_mono_fast_ns() / 1000): [256, 512) 1 | | [512, 1K) 0 | | [1K, 2K) 217 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ | [2K, 4K) 266 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@| So yes, we can probably save ourselves ~200 wakeup with a better clock but that's just 1.3% of the total wake ups :( Now - now about the max loop count. I ORed the pending softirqs every time we get to the end of the loop. Looks like vast majority of the loop counter wake ups are exclusively due to RCU: @looped[512]: 5516 Where 512 is the ORed pending mask over all iterations 512 == 1 << RCU_SOFTIRQ. And they usually take less than 100us to consume the 10 iterations. Histogram of usecs consumed when we run out of loop iterations: [16, 32) 3 | | [32, 64) 4786 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@| [64, 128) 871 |@@@@@@@@@ | [128, 256) 34 | | [256, 512) 9 | | [512, 1K) 262 |@@ | [1K, 2K) 35 | | [2K, 4K) 1 | | Paul, is this expected? Is RCU not trying too hard to be nice? # cat /sys/module/rcutree/parameters/blimit 10 Or should we perhaps just raise the loop limit? Breaking after less than 100usec seems excessive :( > whether that makes a difference? Those two can be applied with some > minor polishing. The rest of that series is broken by f10020c97f4c > ("softirq: Allow early break"). > > There is another issue with this overload limit. Assume max_restart or > timeout triggered and limit was set to now + 100ms. ksoftirqd runs and > gets the issue resolved after 10ms. > > So for the remaining 90ms any invocation of raise_softirq() outside of > (soft)interrupt context, which wakes ksoftirqd again, prevents > processing on return from interrupt until ksoftirqd gets on the CPU and > goes back to sleep, because task_is_running() == true and the stale > limit is not after jiffies. 
> > Probably not a big issue, but someone will notice on some weird workload > sooner than later and the tweaking will start nevertheless. :) So maybe > we fix it right away. :) Hm, Paolo raised this point as well, but the overload time is strictly to stop paying attention to the fact ksoftirqd is running. IOW current kernels behave as if they had overload_limit of infinity. The current code already prevents processing until ksoftirqd schedules in, after raise_softirq() from a funky context.
Jakub! On Fri, Mar 03 2023 at 13:31, Jakub Kicinski wrote: > On Fri, 03 Mar 2023 14:30:46 +0100 Thomas Gleixner wrote: >> > + __this_cpu_write(overload_limit, jiffies + limit); >> >> The logic of all this is non-obvious and I had to reread it 5 times to >> conclude that it is matching the intent. Please add comments. >> >> While I'm not a big fan of heuristical duct tape, this looks harmless >> enough to not end up in an endless stream of tweaking. Famous last >> words... > > Would it all be more readable if I named the "overload_limit" > "overloaded_until" instead? Naming.. While naming matters it wont change the 'heuristical duct tape' property of this, right? > I'll add comments, too. They are definitely appreciated, but I'd prefer to have code which is self explanatory and does at least have a notion of a halfways scientific approach to the overall issue of softirqs. The point is that softirqs are just the proliferation of an at least 50 years old OS design paradigm. Back then everyhting which run in an interrupt handler was "important" and more or less allowed to hog the CPU at will. That obviously caused problems because it prevented other interrupt handlers from being served. This was attempted to work around in hardware by providing interrupt priority levels. No general purpose OS utilized that ever because there is no way to get this right. Not even on UP, unless you build a designed for the purpose "OS". Soft interrupts are not any better. They avoid the problem of stalling interrupts by moving the problem one level down to the scheduler. Granted they are a cute hack, but at the very end they are still evading the resource control mechanisms of the OS by defining their own rules: - NET RX defaults to 2ms with the ability to override via /proc - RCU defaults to 3ms with the ability to override via /sysfs while the "overload detection" in the core defines a hardcoded limit of 2ms. 
Alone the above does not sum up to the core limit and most of the other soft interrupt handlers do not even have the notion of limits. That clearly does not even remotely allow to do proper coordinated resource management. Not to talk about the sillyness of the jiffy based timouts which result in a randomized granularity of 0...1/Hz as mentioned before. I'm well aware of the fact that consulting a high resolution hardware clock frequently can be slow and hurting performance, but there are well understood workarounds, aka batching, which mitigate that. There is another aspect to softirqs which makes them a horror show: While they are conceptually seperate, at the very end they are all lumped together and especially the network code has implicit assumptions about that. It's simply impossible to seperate the processing of the various soft interrupt incarnations. IOW, resource control by developer preference and coincidence of events. That truly makes an understandable and to be relied on OS. We had seperate softirq threads and per softirq serialization (except NET_RX/TX which shared) in the early days of preempt RT, which gave fine grained control. Back then the interaction between different softirqs was halfways understandable and the handful of interaction points which relied on the per CPU global BH disable were fixable with local serializations. That lasted a year or two until we stopped maintaining that because the interaction between softirqs was becoming a whack a mole game. So we gave up and enjoy the full glory of a per CPU global lock, because that's what local BH disable actually is. I completely understand that ***GB networking is a challenge, but ***GB networking does not work without applications wwich use it. Those applications are unfortunately^Wrightfully subject to the scheduler, aka. resource control. 
IMO evading resource control is the worst of all approaches and the amount of heuristics you can apply to mitigate that, is never going to cover even a subset of the overall application space. Just look at the memcache vs. webserver use case vs. need_resched() and then the requirements coming from the low latency audio folks. I know the usual approach to that is to add some more heuristics which are by nature supposed to fail or to add yet another 'knob'. We have already too many knobs which are not comprehensible on their own. But even if a particular knob is comprehensible there is close to zero documentation and I even claim close to zero understanding of the interaction of knobs. Just for the record. Some of our engineers are working on TSN based real-time networking which is all about latency anc accuracy. Guess how well that works with the current overall design. That's not an esoteric niche use case as low-latency TSN is not restricted to the automation space. There are quite some use cases which go there even in the high end networking space. >> But without the sched_clock() changes the actual defer time depends on >> HZ and the point in time where limit is set. That means it ranges from 0 >> to 1/HZ, i.e. the 2ms defer time ends up with close to 10ms on HZ=100 in >> the worst case, which perhaps explains the 8ms+ stalls you are still >> observing. Can you test with that sched_clock change applied, i.e. the >> first two commits from >> >> git://git.kernel.org/pub/scm/linux/kernel/git/peterz/queue.git core/softirq >> >> 59be25c466d9 ("softirq: Use sched_clock() based timeout") >> bd5a5bd77009 ("softirq: Rewrite softirq processing loop") > > Those will help, but I spent some time digging into the jiffies related > warts with kprobes - while annoying they weren't a major source of wake > ups. (FWIW the jiffies noise on our workloads is due to cgroup stats > disabling IRQs for multiple ms on the timekeeping CPU). What? 
That's completely insane and needs to be fixed. > Here are fresh stats on why we wake up ksoftirqd on our Web workload > (collected over 100 sec): > > Time exceeded: 484 > Loop max run out: 6525 > need_resched(): 10219 > (control: 17226 - number of times wakeup_process called for ksirqd) > > As you can see need_resched() dominates. > Zooming into the time exceeded - we can count nanoseconds between > __do_softirq starting and the check. This is the histogram of actual > usecs as seen by BPF (AKA ktime_get_mono_fast_ns() / 1000): > > [256, 512) 1 | | > [512, 1K) 0 | | > [1K, 2K) 217 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ | > [2K, 4K) 266 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@| > > So yes, we can probably save ourselves ~200 wakeup with a better clock > but that's just 1.3% of the total wake ups :( Fair enough. Though that does not make our time limit handling any more consistent and we need to fix that too to handle the other issues. > Now - now about the max loop count. I ORed the pending softirqs every > time we get to the end of the loop. Looks like vast majority of the > loop counter wake ups are exclusively due to RCU: > > @looped[512]: 5516 If the loop counter breaks without consuming the time budget that's silly. > Where 512 is the ORed pending mask over all iterations > 512 == 1 << RCU_SOFTIRQ. > > And they usually take less than 100us to consume the 10 iterations. > Histogram of usecs consumed when we run out of loop iterations: > > [16, 32) 3 | | > [32, 64) 4786 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@| > [64, 128) 871 |@@@@@@@@@ | > [128, 256) 34 | | > [256, 512) 9 | | > [512, 1K) 262 |@@ | > [1K, 2K) 35 | | > [2K, 4K) 1 | | > > Paul, is this expected? Is RCU not trying too hard to be nice? > > # cat /sys/module/rcutree/parameters/blimit > 10 > > Or should we perhaps just raise the loop limit? Breaking after less > than 100usec seems excessive :( No. 
Can we please stop twiddling a parameter here and there and go and fix this whole problem space properly. Increasing the loop count for RCU might work for your particular usecase and cause issues in other scenarios. Btw, RCU seems to be a perfect candidate to delegate batches from softirq into a seperate scheduler controllable entity. >> So for the remaining 90ms any invocation of raise_softirq() outside of >> (soft)interrupt context, which wakes ksoftirqd again, prevents >> processing on return from interrupt until ksoftirqd gets on the CPU and >> goes back to sleep, because task_is_running() == true and the stale >> limit is not after jiffies. >> >> Probably not a big issue, but someone will notice on some weird workload >> sooner than later and the tweaking will start nevertheless. :) So maybe >> we fix it right away. :) > > Hm, Paolo raised this point as well, but the overload time is strictly > to stop paying attention to the fact ksoftirqd is running. > IOW current kernels behave as if they had overload_limit of infinity. > > The current code already prevents processing until ksoftirqd schedules > in, after raise_softirq() from a funky context. Correct and it does so because we are just applying duct tape over and over. That said, I have no brilliant solution for that off the top of my head, but I'm not comfortable with applying more adhoc solutions which are contrary to the efforts of e.g. the audio folks. I have some vague ideas how to approach that, but I'm traveling all of next week, so I neither will be reading much email, nor will I have time to think deeply about softirqs. I'll resume when I'm back. Thanks, tglx
On Sun, 05 Mar 2023 21:43:23 +0100 Thomas Gleixner wrote: > > Would it all be more readable if I named the "overload_limit" > > "overloaded_until" instead? Naming.. > > While naming matters it wont change the 'heuristical duct tape' property > of this, right? I also hate heuristics, I hope we are on the same page there. The way I see it we allowed 2 heuristics already into the kernel: - ksoftirq running means overload - need_resched() means we should stop immediately (and wake ksoftirqd) Those two are clearly at odds with each other. And the latter is as weak / hacky as it gets :| See at the end for work in progress/"real solutions" but for this patch - can I replace the time limit with a simple per core "bool wa_for_yield" and change the overload check to: if (ksoftirqd_running() && !wa_for_yeild) ? That's not a heuristic, right? No magic values, predictable, repeatable behavior. > > I'll add comments, too. > > They are definitely appreciated, but I'd prefer to have code which is > self explanatory and does at least have a notion of a halfways > scientific approach to the overall issue of softirqs. > > The point is that softirqs are just the proliferation of an at least 50 > years old OS design paradigm. Back then everyhting which run in an > interrupt handler was "important" and more or less allowed to hog the > CPU at will. > > That obviously caused problems because it prevented other interrupt > handlers from being served. > > This was attempted to work around in hardware by providing interrupt > priority levels. No general purpose OS utilized that ever because there > is no way to get this right. Not even on UP, unless you build a designed > for the purpose "OS". > > Soft interrupts are not any better. They avoid the problem of stalling > interrupts by moving the problem one level down to the scheduler. 
> > Granted they are a cute hack, but at the very end they are still evading > the resource control mechanisms of the OS by defining their own rules: > > - NET RX defaults to 2ms with the ability to override via /proc > - RCU defaults to 3ms with the ability to override via /sysfs > > while the "overload detection" in the core defines a hardcoded limit of > 2ms. Alone the above does not sum up to the core limit and most of the > other soft interrupt handlers do not even have the notion of limits. > > That clearly does not even remotely allow to do proper coordinated > resource management. FWIW happy to delete all the procfs knobs we have in net. Anyone who feels like they need to tweak those should try to use/work on a real solution. > Not to talk about the sillyness of the jiffy based timouts which result > in a randomized granularity of 0...1/Hz as mentioned before. > > I'm well aware of the fact that consulting a high resolution hardware > clock frequently can be slow and hurting performance, but there are well > understood workarounds, aka batching, which mitigate that. > > There is another aspect to softirqs which makes them a horror show: > > While they are conceptually seperate, at the very end they are all > lumped together and especially the network code has implicit > assumptions about that. It's simply impossible to seperate the > processing of the various soft interrupt incarnations. Was that just about running in threads or making them preemptible? Running Rx in threads is "mostly solved", see at the end. > IOW, resource control by developer preference and coincidence of > events. That truly makes an understandable and to be relied on OS. > > We had seperate softirq threads and per softirq serialization (except > NET_RX/TX which shared) in the early days of preempt RT, which gave fine > grained control. 
Back then the interaction between different softirqs > was halfways understandable and the handful of interaction points which > relied on the per CPU global BH disable were fixable with local > serializations. That lasted a year or two until we stopped maintaining > that because the interaction between softirqs was becoming a whack a > mole game. So we gave up and enjoy the full glory of a per CPU global > lock, because that's what local BH disable actually is. > > I completely understand that ***GB networking is a challenge, but ***GB > networking does not work without applications wwich use it. Those > applications are unfortunately^Wrightfully subject to the scheduler, > aka. resource control. > > IMO evading resource control is the worst of all approaches and the > amount of heuristics you can apply to mitigate that, is never going to > cover even a subset of the overall application space. > > Just look at the memcache vs. webserver use case vs. need_resched() and > then the requirements coming from the low latency audio folks. Let me clarify that we only need the default to not be silly for applications which are _not_ doing a lot of networking. The webserver in my test is running a website (PHP?), not serving static content. It's doing maybe a few Gbps on a 25/50 Gbps NIC. > I know the usual approach to that is to add some more heuristics which > are by nature supposed to fail or to add yet another 'knob'. We have > already too many knobs which are not comprehensible on their own. But > even if a particular knob is comprehensible there is close to zero > documentation and I even claim close to zero understanding of the > interaction of knobs. > > Just for the record. Some of our engineers are working on TSN based > real-time networking which is all about latency anc accuracy. Guess how > well that works with the current overall design. That's not an esoteric > niche use case as low-latency TSN is not restricted to the automation > space. 
There are quite some use cases which go there even in the high > end networking space. TSN + ksoftirqd is definitely a bad idea :S > > Those will help, but I spent some time digging into the jiffies related > > warts with kprobes - while annoying they weren't a major source of wake > > ups. (FWIW the jiffies noise on our workloads is due to cgroup stats > > disabling IRQs for multiple ms on the timekeeping CPU). > > What? That's completely insane and needs to be fixed. Agreed, I made the right people aware.. > > Here are fresh stats on why we wake up ksoftirqd on our Web workload > > (collected over 100 sec): > > > > Time exceeded: 484 > > Loop max run out: 6525 > > need_resched(): 10219 > > (control: 17226 - number of times wakeup_process called for ksirqd) > > > > As you can see need_resched() dominates. > > > Zooming into the time exceeded - we can count nanoseconds between > > __do_softirq starting and the check. This is the histogram of actual > > usecs as seen by BPF (AKA ktime_get_mono_fast_ns() / 1000): > > > > [256, 512) 1 | | > > [512, 1K) 0 | | > > [1K, 2K) 217 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ | > > [2K, 4K) 266 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@| > > > > So yes, we can probably save ourselves ~200 wakeup with a better clock > > but that's just 1.3% of the total wake ups :( > > Fair enough. Though that does not make our time limit handling any more > consistent and we need to fix that too to handle the other issues. > > > Now - now about the max loop count. I ORed the pending softirqs every > > time we get to the end of the loop. Looks like vast majority of the > > loop counter wake ups are exclusively due to RCU: > > > > @looped[512]: 5516 > > If the loop counter breaks without consuming the time budget that's > silly. FWIW my initial reaction was to read the jiffies from the _local_ core, because if we're running softirq the can't have IRQs masked. So local clock will be ticking. 
But I'm insufficiently competent to code that up, and you'd presumably have done this already if it was a good idea. > > Where 512 is the ORed pending mask over all iterations > > 512 == 1 << RCU_SOFTIRQ. > > > > And they usually take less than 100us to consume the 10 iterations. > > Histogram of usecs consumed when we run out of loop iterations: > > > > [16, 32) 3 | | > > [32, 64) 4786 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@| > > [64, 128) 871 |@@@@@@@@@ | > > [128, 256) 34 | | > > [256, 512) 9 | | > > [512, 1K) 262 |@@ | > > [1K, 2K) 35 | | > > [2K, 4K) 1 | | > > > > Paul, is this expected? Is RCU not trying too hard to be nice? > > > > # cat /sys/module/rcutree/parameters/blimit > > 10 > > > > Or should we perhaps just raise the loop limit? Breaking after less > > than 100usec seems excessive :( > > No. Can we please stop twiddling a parameter here and there and go and > fix this whole problem space properly. Increasing the loop count for RCU > might work for your particular usecase and cause issues in other > scenarios. > > Btw, RCU seems to be a perfect candidate to delegate batches from softirq > into a seperate scheduler controllable entity. Indeed, it knows how many callbacks it has, I wish we knew how many packets had arrived :) > > Hm, Paolo raised this point as well, but the overload time is strictly > > to stop paying attention to the fact ksoftirqd is running. > > IOW current kernels behave as if they had overload_limit of infinity. > > > > The current code already prevents processing until ksoftirqd schedules > > in, after raise_softirq() from a funky context. > > Correct and it does so because we are just applying duct tape over and > over. > > That said, I have no brilliant solution for that off the top of my head, > but I'm not comfortable with applying more adhoc solutions which are > contrary to the efforts of e.g. the audio folks. We are trying: Threaded NAPI was added in Feb 2021. 
We can create a thread per NAPI instance, then all Rx runs in dedicated kernel threads. Unfortunately for workloads I tested it negatively impacts RPS and latency. If the workload doesn't run the CPUs too hot it works well, but scheduler gets in the way. I have a vague recollection that Google proposed patches to the scheduler at some point to support isosync processing, but they were rejected? I'm very likely misremembering. But I do feel like we'll need better scheduler support to move network processing to threads. Maybe the upcoming BPF scheduler patches can help us with that. We'll see. The other way to go is to let the application take charge. We added support for applications pledging that they will "busy poll" a given queue. This turns off IRQs and expects an application to periodically call into NAPI. I think that's the future for high RPS applications, but real life results are scarce (other than pure forwarding workloads, I guess, which are trivial). Various folks in netdev had also experimented with using workqueues and kthreads which are not mapped statically to NAPI. So we are trying, and will continue to try. There are unknowns, however, which make me think it's worth addressing the obvious silliness in ksoftirqd behavior, tho. One - I'm not sure whether we can get to a paradigm which is as fast and as easy to use for 100% of use cases as softirq. Two - the solution may need tuning and infrastructure leaving smaller users behind. And the ksoftirqd experience is getting worse, which is why I posted the patches (I'm guessing scheduler changes, I don't even want to know who changed their heuristics).
On Sun, Mar 05, 2023 at 09:43:23PM +0100, Thomas Gleixner wrote: > That said, I have no brilliant solution for that off the top of my head, > but I'm not comfortable with applying more adhoc solutions which are > contrary to the efforts of e.g. the audio folks. > > I have some vague ideas how to approach that, but I'm traveling all of > next week, so I neither will be reading much email, nor will I have time > to think deeply about softirqs. I'll resume when I'm back. IIUC: the problem is that some (rare?) softirq vector callbacks rely on the fact they can not be interrupted by other local vectors and they rely on that to protect against concurrent per-cpu state access, right? And there is no automatic way to detect those cases otherwise we would have fixed them all with spinlocks already. So I fear the only (in-)sane idea I could think of is to do it the same way we did with the BKL. Some sort of pushdown: vector callbacks known for having no such subtle interaction can re-enable softirqs. For example known safe timers (either because they have no such interactions or because they handle them correctly via spinlocks) can carry a TIMER_SOFTIRQ_SAFE flag to tell about that. And RCU callbacks something alike. Of course this is going to be a tremendous amount of work but it has the advantage of being iterative and it will pay in the long run. Also I'm confident that the hottest places will be handled quickly. And most of them are likely to be in core networking code. Because I fear no hack will ever fix that otherwise, and we have tried a lot. Thanks.
On Mon, Mar 06, 2023 at 12:57:11PM +0100, Frederic Weisbecker wrote: > On Sun, Mar 05, 2023 at 09:43:23PM +0100, Thomas Gleixner wrote: > > That said, I have no brilliant solution for that off the top of my head, > > but I'm not comfortable with applying more adhoc solutions which are > > contrary to the efforts of e.g. the audio folks. > > > > I have some vague ideas how to approach that, but I'm traveling all of > > next week, so I neither will be reading much email, nor will I have time > > to think deeply about softirqs. I'll resume when I'm back. > > IIUC: the problem is that some (rare?) softirq vector callbacks rely on the > fact they can not be interrupted by other local vectors and they rely on > that to protect against concurrent per-cpu state access, right? > > And there is no automatic way to detect those cases otherwise we would have > fixed them all with spinlocks already. > > So I fear the only (in-)sane idea I could think of is to do it the same way > we did with the BKL. Some sort of pushdown: vector callbacks known for having > no such subtle interaction can re-enable softirqs. > > For example known safe timers (either because they have no such interactions > or because they handle them correctly via spinlocks) can carry a > TIMER_SOFTIRQ_SAFE flag to tell about that. And RCU callbacks something alike. When a given RCU callback causes latency problems, the usual quick fix is to have it instead spawn a workqueue, either from the callback or via queue_rcu_work(). But yes, this is one of the reasons that jiffies are so popular. Eric batched something like 30 RCU callbacks per costly time check, and you would quite possibly need similar batching to attain efficiency for lightly loaded softirq vectors. But 30 long-running softirq handlers would be too many. One option is to check the expensive time when either a batch of (say) 30 completes or when jiffies says too much time has elapsed. 
> Of course this is going to be a tremendous amount of work but it has the > advantage of being iterative and it will pay in the long run. Also I'm confident > that the hottest places will be handled quickly. And most of them are likely to > be in core networking code. > > Because I fear no hack will ever fix that otherwise, and we have tried a lot. Indeed, if it was easy within current overall code structure, we would have already fixed it. Thanx, Paul
From: Thomas Gleixner > Sent: 05 March 2023 20:43 ... > The point is that softirqs are just the proliferation of an at least 50 > years old OS design paradigm. Back then everything which ran in an > interrupt handler was "important" and more or less allowed to hog the > CPU at will. > > That obviously caused problems because it prevented other interrupt > handlers from being served. > > This was attempted to work around in hardware by providing interrupt > priority levels. No general purpose OS utilized that ever because there > is no way to get this right. Not even on UP, unless you build a designed > for the purpose "OS". > > Soft interrupts are not any better. They avoid the problem of stalling > interrupts by moving the problem one level down to the scheduler. > > Granted they are a cute hack, but at the very end they are still evading > the resource control mechanisms of the OS by defining their own rules: From some measurements I've done, while softints seem like a good idea they are almost pointless. What usually happens is that a hardware interrupt arrives, does some of the required work, schedules a softint and returns. Immediately a softint happens (at the same instruction) and does all the rest of the work. The work has to be done, but you've added the cost of the extra scheduling and interrupt - so overall it is slower. The massive batching up of some operations (like ethernet transmit clearing and rx setup, and things being freed after rcu) doesn't help latency. Without the batching the softint would finish faster and cause less of a latency 'problem' to whatever was interrupted. Now softints do help interrupt latency, but that is only relevant if you have critical interrupts (like pulling data out of a hardware fifo). Most modern hardware doesn't have anything that critical. Now there is code that can decide to drop softint processing to a normal thread. If that ever happens you probably lose 'big time'. 
Normal softint processing is higher priority than any process code. But the kernel thread runs at the priority of a normal user thread. Pretty much the lowest of the low. So all this 'high priority' interrupt related processing that really does have to happen to keep the system running just doesn't get scheduled. I think it was Eric who had problems with ethernet packets being dropped and changed the logic (of dropping to a thread) to make it much less likely - but that got reverted (well more code added that effectively reverted it) not long after. Try (as I was) to run a test that requires you to receive ALL of the 500000 ethernet packets being sent to an interface every second while also doing enough processing on the packets to make the system (say) 90% busy (real time UDP audio processing) and you soon find the defaults are entirely hopeless. Even the interrupt 'mitigation' options on the ethernet controller don't actually work - packets get dropped at the low level. (That will fail on an otherwise idle system.) David - Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, UK Registration No: 1397386 (Wales)
On Sun, Mar 05, 2023 at 09:43:23PM +0100, Thomas Gleixner wrote: > On Fri, Mar 03 2023 at 13:31, Jakub Kicinski wrote: > > On Fri, 03 Mar 2023 14:30:46 +0100 Thomas Gleixner wrote: [ . . . ] > > Where 512 is the ORed pending mask over all iterations > > 512 == 1 << RCU_SOFTIRQ. > > > > And they usually take less than 100us to consume the 10 iterations. > > Histogram of usecs consumed when we run out of loop iterations: > > > > [16, 32) 3 | | > > [32, 64) 4786 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@| > > [64, 128) 871 |@@@@@@@@@ | > > [128, 256) 34 | | > > [256, 512) 9 | | > > [512, 1K) 262 |@@ | > > [1K, 2K) 35 | | > > [2K, 4K) 1 | | > > > > Paul, is this expected? Is RCU not trying too hard to be nice? > > > > # cat /sys/module/rcutree/parameters/blimit > > 10 > > > > Or should we perhaps just raise the loop limit? Breaking after less > > than 100usec seems excessive :( > > No. Can we please stop twiddling a parameter here and there and go and > fix this whole problem space properly. Increasing the loop count for RCU > might work for your particular usecase and cause issues in other > scenarios. > > Btw, RCU seems to be a perfect candidate to delegate batches from softirq > into a seperate scheduler controllable entity. Indeed, as you well know, CONFIG_RCU_NOCB_CPU=y in combination with the rcutree.use_softirq kernel boot parameter in combination with either the nohz_full or rcu_nocbs kernel boot parameter and then the callbacks are invoked within separate kthreads so that the scheduler has full control. In addition, this dispenses with all of the heuristics that are otherwise necessary to avoid invoking too many callbacks in one shot. Back in the day, I tried making this the default (with an eye towards making it the sole callback-execution scheme), but this resulted in some ugly performance regressions. 
This was in part due to the extra synchronization required to queue a callback and in part due to the higher average cost of a wakeup compared to a raise_softirq(). So I changed to the current non-default arrangement. And of course, you can do it halfway by booting kernel built with CONFIG_RCU_NOCB_CPU=n with the rcutree.use_softirq kernel boot parameter. But then the callback-invocation-limit heuristics are still used, but this time to prevent callback invocation from preventing the CPU from reporting quiescent states. But if this was the only case, simpler heuristics would suffice. In short, it is not hard to make RCU avoid using softirq, but doing so is not without side effects. ;-) Thanx, Paul
On Sun, Mar 05, 2023 at 02:42:11PM -0800, Paul E. McKenney wrote: > On Sun, Mar 05, 2023 at 09:43:23PM +0100, Thomas Gleixner wrote: > Indeed, as you well know, CONFIG_RCU_NOCB_CPU=y in combination with the > rcutree.use_softirq kernel boot parameter in combination with either the > nohz_full or rcu_nocbs kernel boot parameter and then the callbacks are > invoked within separate kthreads so that the scheduler has full control. > In addition, this dispenses with all of the heuristics that are otherwise > necessary to avoid invoking too many callbacks in one shot. > > Back in the day, I tried making this the default (with an eye towards > making it the sole callback-execution scheme), but this resulted in > some ugly performance regressions. This was in part due to the extra > synchronization required to queue a callback and in part due to the > higher average cost of a wakeup compared to a raise_softirq(). > > So I changed to the current non-default arrangement. > > And of course, you can do it halfway by booting kernel built with > CONFIG_RCU_NOCB_CPU=n with the rcutree.use_softirq kernel boot parameter. > But then the callback-invocation-limit heuristics are still used, but > this time to prevent callback invocation from preventing the CPU from > reporting quiescent states. But if this was the only case, simpler > heuristics would suffice. > > In short, it is not hard to make RCU avoid using softirq, but doing so > is not without side effects. ;-) Right, but note that, threaded or not, callback invocation happens within a local_bh_disable() section, preventing other softirqs from running. So this is still subject to the softirq per-CPU BKL.
On Mon, Mar 06, 2023 at 12:00:24AM +0100, Frederic Weisbecker wrote: > On Sun, Mar 05, 2023 at 02:42:11PM -0800, Paul E. McKenney wrote: > > On Sun, Mar 05, 2023 at 09:43:23PM +0100, Thomas Gleixner wrote: > > Indeed, as you well know, CONFIG_RCU_NOCB_CPU=y in combination with the > > rcutree.use_softirq kernel boot parameter in combination with either the > > nohz_full or rcu_nocbs kernel boot parameter and then the callbacks are > > invoked within separate kthreads so that the scheduler has full control. > > In addition, this dispenses with all of the heuristics that are otherwise > > necessary to avoid invoking too many callbacks in one shot. > > > > Back in the day, I tried making this the default (with an eye towards > > making it the sole callback-execution scheme), but this resulted in > > some ugly performance regressions. This was in part due to the extra > > synchronization required to queue a callback and in part due to the > > higher average cost of a wakeup compared to a raise_softirq(). > > > > So I changed to the current non-default arrangement. > > > > And of course, you can do it halfway by booting kernel built with > > CONFIG_RCU_NOCB_CPU=n with the rcutree.use_softirq kernel boot parameter. > > But then the callback-invocation-limit heuristics are still used, but > > this time to prevent callback invocation from preventing the CPU from > > reporting quiescent states. But if this was the only case, simpler > > heuristics would suffice. > > > > In short, it is not hard to make RCU avoid using softirq, but doing so > > is not without side effects. ;-) > > Right but note that, threaded or not, callbacks invocation happen > within a local_bh_disable() section, preventing other softirqs from running. > > So this is still subject to the softirq per-CPU BKL. True enough! But it momentarily enables BH after invoking each callback, so the other softirq vectors should be able to get a word in. Thanx, Paul
On Sun, Mar 05, 2023 at 08:30:33PM -0800, Paul E. McKenney wrote: > On Mon, Mar 06, 2023 at 12:00:24AM +0100, Frederic Weisbecker wrote: > > On Sun, Mar 05, 2023 at 02:42:11PM -0800, Paul E. McKenney wrote: > > > On Sun, Mar 05, 2023 at 09:43:23PM +0100, Thomas Gleixner wrote: > > > Indeed, as you well know, CONFIG_RCU_NOCB_CPU=y in combination with the > > > rcutree.use_softirq kernel boot parameter in combination with either the > > > nohz_full or rcu_nocbs kernel boot parameter and then the callbacks are > > > invoked within separate kthreads so that the scheduler has full control. > > > In addition, this dispenses with all of the heuristics that are otherwise > > > necessary to avoid invoking too many callbacks in one shot. > > > > > > Back in the day, I tried making this the default (with an eye towards > > > making it the sole callback-execution scheme), but this resulted in > > > some ugly performance regressions. This was in part due to the extra > > > synchronization required to queue a callback and in part due to the > > > higher average cost of a wakeup compared to a raise_softirq(). > > > > > > So I changed to the current non-default arrangement. > > > > > > And of course, you can do it halfway by booting kernel built with > > > CONFIG_RCU_NOCB_CPU=n with the rcutree.use_softirq kernel boot parameter. > > > But then the callback-invocation-limit heuristics are still used, but > > > this time to prevent callback invocation from preventing the CPU from > > > reporting quiescent states. But if this was the only case, simpler > > > heuristics would suffice. > > > > > > In short, it is not hard to make RCU avoid using softirq, but doing so > > > is not without side effects. ;-) > > > > Right but note that, threaded or not, callbacks invocation happen > > within a local_bh_disable() section, preventing other softirqs from running. > > > > So this is still subject to the softirq per-CPU BKL. > > True enough! 
But it momentarily enables BH after invoking each callback, > so the other softirq vectors should be able to get a word in. Indeed, it's still less bad than having it in softirqs.
On Fri, Mar 03, 2023 at 01:31:43PM -0800, Jakub Kicinski wrote: > On Fri, 03 Mar 2023 14:30:46 +0100 Thomas Gleixner wrote: > > > - if (time_before(jiffies, end) && !need_resched() && > > > - --max_restart) > > > + unsigned long limit; > > > + > > > + if (time_is_before_eq_jiffies(end) || !--max_restart) > > > + limit = SOFTIRQ_OVERLOAD_TIME; > > > + else if (need_resched()) > > > + limit = SOFTIRQ_DEFER_TIME; > > > + else > > > goto restart; > > > > > > + __this_cpu_write(overload_limit, jiffies + limit); > > > > The logic of all this is non-obvious and I had to reread it 5 times to > > conclude that it is matching the intent. Please add comments. > > > > While I'm not a big fan of heuristical duct tape, this looks harmless > > enough to not end up in an endless stream of tweaking. Famous last > > words... > > Would it all be more readable if I named the "overload_limit" > "overloaded_until" instead? Naming.. > I'll add comments, too. > > > But without the sched_clock() changes the actual defer time depends on > > HZ and the point in time where limit is set. That means it ranges from 0 > > to 1/HZ, i.e. the 2ms defer time ends up with close to 10ms on HZ=100 in > > the worst case, which perhaps explains the 8ms+ stalls you are still > > observing. Can you test with that sched_clock change applied, i.e. the > > first two commits from > > > > git://git.kernel.org/pub/scm/linux/kernel/git/peterz/queue.git core/softirq > > > > 59be25c466d9 ("softirq: Use sched_clock() based timeout") > > bd5a5bd77009 ("softirq: Rewrite softirq processing loop") > > Those will help, but I spent some time digging into the jiffies related > warts with kprobes - while annoying they weren't a major source of wake > ups. (FWIW the jiffies noise on our workloads is due to cgroup stats > disabling IRQs for multiple ms on the timekeeping CPU). 
> > Here are fresh stats on why we wake up ksoftirqd on our Web workload > (collected over 100 sec): > > Time exceeded: 484 > Loop max run out: 6525 > need_resched(): 10219 > (control: 17226 - number of times wakeup_process called for ksirqd) > > As you can see need_resched() dominates. > > Zooming into the time exceeded - we can count nanoseconds between > __do_softirq starting and the check. This is the histogram of actual > usecs as seen by BPF (AKA ktime_get_mono_fast_ns() / 1000): > > [256, 512) 1 | | > [512, 1K) 0 | | > [1K, 2K) 217 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ | > [2K, 4K) 266 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@| > > So yes, we can probably save ourselves ~200 wakeup with a better clock > but that's just 1.3% of the total wake ups :( > > > Now - now about the max loop count. I ORed the pending softirqs every > time we get to the end of the loop. Looks like vast majority of the > loop counter wake ups are exclusively due to RCU: > > @looped[512]: 5516 > > Where 512 is the ORed pending mask over all iterations > 512 == 1 << RCU_SOFTIRQ. > > And they usually take less than 100us to consume the 10 iterations. > Histogram of usecs consumed when we run out of loop iterations: > > [16, 32) 3 | | > [32, 64) 4786 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@| > [64, 128) 871 |@@@@@@@@@ | > [128, 256) 34 | | > [256, 512) 9 | | > [512, 1K) 262 |@@ | > [1K, 2K) 35 | | > [2K, 4K) 1 | | > > Paul, is this expected? Is RCU not trying too hard to be nice? This is from way back in the day, so it is quite possible that better tuning and/or better heuristics should be applied. On the other hand, 100 microseconds is a good long time from an CONFIG_PREEMPT_RT=y perspective! > # cat /sys/module/rcutree/parameters/blimit > 10 > > Or should we perhaps just raise the loop limit? Breaking after less > than 100usec seems excessive :( But note that RCU also has rcutree.rcu_divisor, which defaults to 7. 
And an rcutree.rcu_resched_ns, which defaults to three milliseconds (3,000,000 nanoseconds). This means that RCU will do: o All the callbacks if there are less than ten. o Ten callbacks or 1/128th of them, whichever is larger. o Unless the larger of them is more than 100 callbacks, in which case there is an additional limit of three milliseconds worth of them. Except that if a given CPU ends up with more than 10,000 callbacks (rcutree.qhimark), that CPU's blimit is set to 10,000. So there is much opportunity to tune the existing heuristics and also much opportunity to tweak the heuristics themselves. But let's see a good use case before tweaking, please. ;-) Thanx, Paul > > whether that makes a difference? Those two can be applied with some > > minor polishing. The rest of that series is broken by f10020c97f4c > > ("softirq: Allow early break"). > > > > There is another issue with this overload limit. Assume max_restart or > > timeout triggered and limit was set to now + 100ms. ksoftirqd runs and > > gets the issue resolved after 10ms. > > > > So for the remaining 90ms any invocation of raise_softirq() outside of > > (soft)interrupt context, which wakes ksoftirqd again, prevents > > processing on return from interrupt until ksoftirqd gets on the CPU and > > goes back to sleep, because task_is_running() == true and the stale > > limit is not after jiffies. > > > > Probably not a big issue, but someone will notice on some weird workload > > sooner than later and the tweaking will start nevertheless. :) So maybe > > we fix it right away. :) > > Hm, Paolo raised this point as well, but the overload time is strictly > to stop paying attention to the fact ksoftirqd is running. > IOW current kernels behave as if they had overload_limit of infinity. > > The current code already prevents processing until ksoftirqd schedules > in, after raise_softirq() from a funky context.
On Fri, Mar 03, 2023 at 02:37:39PM -0800, Paul E. McKenney wrote: > On Fri, Mar 03, 2023 at 01:31:43PM -0800, Jakub Kicinski wrote: > > On Fri, 03 Mar 2023 14:30:46 +0100 Thomas Gleixner wrote: > > > > - if (time_before(jiffies, end) && !need_resched() && > > > > - --max_restart) > > > > + unsigned long limit; > > > > + > > > > + if (time_is_before_eq_jiffies(end) || !--max_restart) > > > > + limit = SOFTIRQ_OVERLOAD_TIME; > > > > + else if (need_resched()) > > > > + limit = SOFTIRQ_DEFER_TIME; > > > > + else > > > > goto restart; > > > > > > > > + __this_cpu_write(overload_limit, jiffies + limit); > > > > > > The logic of all this is non-obvious and I had to reread it 5 times to > > > conclude that it is matching the intent. Please add comments. > > > > > > While I'm not a big fan of heuristical duct tape, this looks harmless > > > enough to not end up in an endless stream of tweaking. Famous last > > > words... > > > > Would it all be more readable if I named the "overload_limit" > > "overloaded_until" instead? Naming.. > > I'll add comments, too. > > > > > But without the sched_clock() changes the actual defer time depends on > > > HZ and the point in time where limit is set. That means it ranges from 0 > > > to 1/HZ, i.e. the 2ms defer time ends up with close to 10ms on HZ=100 in > > > the worst case, which perhaps explains the 8ms+ stalls you are still > > > observing. Can you test with that sched_clock change applied, i.e. the > > > first two commits from > > > > > > git://git.kernel.org/pub/scm/linux/kernel/git/peterz/queue.git core/softirq > > > > > > 59be25c466d9 ("softirq: Use sched_clock() based timeout") > > > bd5a5bd77009 ("softirq: Rewrite softirq processing loop") > > > > Those will help, but I spent some time digging into the jiffies related > > warts with kprobes - while annoying they weren't a major source of wake > > ups. (FWIW the jiffies noise on our workloads is due to cgroup stats > > disabling IRQs for multiple ms on the timekeeping CPU). 
> > > > Here are fresh stats on why we wake up ksoftirqd on our Web workload > > (collected over 100 sec): > > > > Time exceeded: 484 > > Loop max run out: 6525 > > need_resched(): 10219 > > (control: 17226 - number of times wakeup_process called for ksirqd) > > > > As you can see need_resched() dominates. > > > > Zooming into the time exceeded - we can count nanoseconds between > > __do_softirq starting and the check. This is the histogram of actual > > usecs as seen by BPF (AKA ktime_get_mono_fast_ns() / 1000): > > > > [256, 512) 1 | | > > [512, 1K) 0 | | > > [1K, 2K) 217 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ | > > [2K, 4K) 266 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@| > > > > So yes, we can probably save ourselves ~200 wakeup with a better clock > > but that's just 1.3% of the total wake ups :( > > > > > > Now - now about the max loop count. I ORed the pending softirqs every > > time we get to the end of the loop. Looks like vast majority of the > > loop counter wake ups are exclusively due to RCU: > > > > @looped[512]: 5516 > > > > Where 512 is the ORed pending mask over all iterations > > 512 == 1 << RCU_SOFTIRQ. > > > > And they usually take less than 100us to consume the 10 iterations. > > Histogram of usecs consumed when we run out of loop iterations: > > > > [16, 32) 3 | | > > [32, 64) 4786 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@| > > [64, 128) 871 |@@@@@@@@@ | > > [128, 256) 34 | | > > [256, 512) 9 | | > > [512, 1K) 262 |@@ | > > [1K, 2K) 35 | | > > [2K, 4K) 1 | | > > > > Paul, is this expected? Is RCU not trying too hard to be nice? > > This is from way back in the day, so it is quite possible that better > tuning and/or better heuristics should be applied. > > On the other hand, 100 microseconds is a good long time from an > CONFIG_PREEMPT_RT=y perspective! > > > # cat /sys/module/rcutree/parameters/blimit > > 10 > > > > Or should we perhaps just raise the loop limit? 
Breaking after less > > than 100usec seems excessive :( > > But note that RCU also has rcutree.rcu_divisor, which defaults to 7. > And an rcutree.rcu_resched_ns, which defaults to three milliseconds > (3,000,000 nanoseconds). This means that RCU will do: > > o All the callbacks if there are less than ten. > > o Ten callbacks or 1/128th of them, whichever is larger. > > o Unless the larger of them is more than 100 callbacks, in which > case there is an additional limit of three milliseconds worth > of them. > > Except that if a given CPU ends up with more than 10,000 callbacks > (rcutree.qhimark), that CPU's blimit is set to 10,000. Also, if in the context of a softirq handler (as opposed to ksoftirqd) that interrupted the idle task with no pending task, the count of callbacks is ignored and only the 3-millisecond limit counts. In the context of ksoftirq, the only limit is that which the scheduler chooses to impose. But it sure seems like the ksoftirqd case should also pay attention to that 3-millisecond limit. I will queue a patch to that effect, and maybe Eric Dumazet will show me the error of my ways. > So there is much opportunity to tune the existing heuristics and also > much opportunity to tweak the heuristics themselves. > > But let's see a good use case before tweaking, please. ;-) Thanx, Paul > > > whether that makes a difference? Those two can be applied with some > > > minor polishing. The rest of that series is broken by f10020c97f4c > > > ("softirq: Allow early break"). > > > > > > There is another issue with this overload limit. Assume max_restart or > > > timeout triggered and limit was set to now + 100ms. ksoftirqd runs and > > > gets the issue resolved after 10ms. 
> > > > > > So for the remaining 90ms any invocation of raise_softirq() outside of > > > (soft)interrupt context, which wakes ksoftirqd again, prevents > > > processing on return from interrupt until ksoftirqd gets on the CPU and > > > goes back to sleep, because task_is_running() == true and the stale > > > limit is not after jiffies. > > > > > > Probably not a big issue, but someone will notice on some weird workload > > > sooner than later and the tweaking will start nevertheless. :) So maybe > > > we fix it right away. :) > > > > Hm, Paolo raised this point as well, but the overload time is strictly > > to stop paying attention to the fact ksoftirqd is running. > > IOW current kernels behave as if they had overload_limit of infinity. > > > > The current code already prevents processing until ksoftirqd schedules > > in, after raise_softirq() from a funky context.
On Fri, 3 Mar 2023 15:36:27 -0800 Paul E. McKenney wrote: > On Fri, Mar 03, 2023 at 02:37:39PM -0800, Paul E. McKenney wrote: > > On Fri, Mar 03, 2023 at 01:31:43PM -0800, Jakub Kicinski wrote: > > > Now - now about the max loop count. I ORed the pending softirqs every > > > time we get to the end of the loop. Looks like vast majority of the > > > loop counter wake ups are exclusively due to RCU: > > > > > > @looped[512]: 5516 > > > > > > Where 512 is the ORed pending mask over all iterations > > > 512 == 1 << RCU_SOFTIRQ. > > > > > > And they usually take less than 100us to consume the 10 iterations. > > > Histogram of usecs consumed when we run out of loop iterations: > > > > > > [16, 32) 3 | | > > > [32, 64) 4786 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@| > > > [64, 128) 871 |@@@@@@@@@ | > > > [128, 256) 34 | | > > > [256, 512) 9 | | > > > [512, 1K) 262 |@@ | > > > [1K, 2K) 35 | | > > > [2K, 4K) 1 | | > > > > > > Paul, is this expected? Is RCU not trying too hard to be nice? > > > > This is from way back in the day, so it is quite possible that better > > tuning and/or better heuristics should be applied. > > > > On the other hand, 100 microseconds is a good long time from an > > CONFIG_PREEMPT_RT=y perspective! > > > > > # cat /sys/module/rcutree/parameters/blimit > > > 10 > > > > > > Or should we perhaps just raise the loop limit? Breaking after less > > > than 100usec seems excessive :( > > > > But note that RCU also has rcutree.rcu_divisor, which defaults to 7. > > And an rcutree.rcu_resched_ns, which defaults to three milliseconds > > (3,000,000 nanoseconds). This means that RCU will do: > > > > o All the callbacks if there are less than ten. > > > > o Ten callbacks or 1/128th of them, whichever is larger. > > > > o Unless the larger of them is more than 100 callbacks, in which > > case there is an additional limit of three milliseconds worth > > of them. 
> > > > Except that if a given CPU ends up with more than 10,000 callbacks > > (rcutree.qhimark), that CPU's blimit is set to 10,000. > > Also, if in the context of a softirq handler (as opposed to ksoftirqd) > that interrupted the idle task with no pending task, the count of > callbacks is ignored and only the 3-millisecond limit counts. In the > context of ksoftirq, the only limit is that which the scheduler chooses > to impose. > > But it sure seems like the ksoftirqd case should also pay attention to > that 3-millisecond limit. I will queue a patch to that effect, and maybe > Eric Dumazet will show me the error of my ways. Just to be sure - have you seen Peter's patches? git://git.kernel.org/pub/scm/linux/kernel/git/peterz/queue.git core/softirq I think it feeds the time limit to the callback from softirq, so the local 3ms is no more?
On Fri, Mar 03, 2023 at 03:44:13PM -0800, Jakub Kicinski wrote: > On Fri, 3 Mar 2023 15:36:27 -0800 Paul E. McKenney wrote: > > On Fri, Mar 03, 2023 at 02:37:39PM -0800, Paul E. McKenney wrote: > > > On Fri, Mar 03, 2023 at 01:31:43PM -0800, Jakub Kicinski wrote: > > > > Now - now about the max loop count. I ORed the pending softirqs every > > > > time we get to the end of the loop. Looks like vast majority of the > > > > loop counter wake ups are exclusively due to RCU: > > > > > > > > @looped[512]: 5516 > > > > > > > > Where 512 is the ORed pending mask over all iterations > > > > 512 == 1 << RCU_SOFTIRQ. > > > > > > > > And they usually take less than 100us to consume the 10 iterations. > > > > Histogram of usecs consumed when we run out of loop iterations: > > > > > > > > [16, 32) 3 | | > > > > [32, 64) 4786 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@| > > > > [64, 128) 871 |@@@@@@@@@ | > > > > [128, 256) 34 | | > > > > [256, 512) 9 | | > > > > [512, 1K) 262 |@@ | > > > > [1K, 2K) 35 | | > > > > [2K, 4K) 1 | | > > > > > > > > Paul, is this expected? Is RCU not trying too hard to be nice? > > > > > > This is from way back in the day, so it is quite possible that better > > > tuning and/or better heuristics should be applied. > > > > > > On the other hand, 100 microseconds is a good long time from an > > > CONFIG_PREEMPT_RT=y perspective! > > > > > > > # cat /sys/module/rcutree/parameters/blimit > > > > 10 > > > > > > > > Or should we perhaps just raise the loop limit? Breaking after less > > > > than 100usec seems excessive :( > > > > > > But note that RCU also has rcutree.rcu_divisor, which defaults to 7. > > > And an rcutree.rcu_resched_ns, which defaults to three milliseconds > > > (3,000,000 nanoseconds). This means that RCU will do: > > > > > > o All the callbacks if there are less than ten. > > > > > > o Ten callbacks or 1/128th of them, whichever is larger. 
> > > > > > o Unless the larger of them is more than 100 callbacks, in which > > > case there is an additional limit of three milliseconds worth > > > of them. > > > > > > Except that if a given CPU ends up with more than 10,000 callbacks > > > (rcutree.qhimark), that CPU's blimit is set to 10,000. > > > > Also, if in the context of a softirq handler (as opposed to ksoftirqd) > > that interrupted the idle task with no pending task, the count of > > callbacks is ignored and only the 3-millisecond limit counts. In the > > context of ksoftirq, the only limit is that which the scheduler chooses > > to impose. > > > > But it sure seems like the ksoftirqd case should also pay attention to > > that 3-millisecond limit. I will queue a patch to that effect, and maybe > > Eric Dumazet will show me the error of my ways. > > Just to be sure - have you seen Peter's patches? > > git://git.kernel.org/pub/scm/linux/kernel/git/peterz/queue.git core/softirq > > I think it feeds the time limit to the callback from softirq, > so the local 3ms is no more? I might or might not have back in September of 2020. ;-) But either way, the question remains: Should RCU_SOFTIRQ do time checking in ksoftirqd context? Seems like the answer should be "yes", independently of Peter's patches. Thanx, Paul
On Fri, 3 Mar 2023 17:25:35 -0800 Paul E. McKenney wrote: > > Just to be sure - have you seen Peter's patches? > > > > git://git.kernel.org/pub/scm/linux/kernel/git/peterz/queue.git core/softirq > > > > I think it feeds the time limit to the callback from softirq, > > so the local 3ms is no more? > > I might or might not have back in September of 2020. ;-) > > But either way, the question remains: Should RCU_SOFTIRQ do time checking > in ksoftirqd context? Seems like the answer should be "yes", independently > of Peter's patches. :-o I didn't notice; I thought that was from Dec 22; LWN was writing about Peter's rework at that point. I'm not sure what the story is :( or when / if any of these changes are coming downstream.
On Fri, Mar 03, 2023 at 05:39:21PM -0800, Jakub Kicinski wrote: > On Fri, 3 Mar 2023 17:25:35 -0800 Paul E. McKenney wrote: > > > Just to be sure - have you seen Peter's patches? > > > > > > git://git.kernel.org/pub/scm/linux/kernel/git/peterz/queue.git core/softirq > > > > > > I think it feeds the time limit to the callback from softirq, > > > so the local 3ms is no more? > > > > I might or might not have back in September of 2020. ;-) > > > > But either way, the question remains: Should RCU_SOFTIRQ do time checking > > in ksoftirqd context? Seems like the answer should be "yes", independently > > of Peter's patches. > > :-o I didn't notice, I thought that's from Dec 22, LWN was writing > about Peter's rework at that point. I'm not sure what the story is :( > And when / if any of these changes are coming downstream. Not a problem either way, as the compiler would complain bitterly about the resulting merge conflict and it is easy to fix. ;-) Thanx, Paul
On Fri, Mar 03, 2023 at 07:11:09PM -0800, Paul E. McKenney wrote: > On Fri, Mar 03, 2023 at 05:39:21PM -0800, Jakub Kicinski wrote: > > On Fri, 3 Mar 2023 17:25:35 -0800 Paul E. McKenney wrote: > > > > Just to be sure - have you seen Peter's patches? > > > > > > > > git://git.kernel.org/pub/scm/linux/kernel/git/peterz/queue.git core/softirq > > > > > > > > I think it feeds the time limit to the callback from softirq, > > > > so the local 3ms is no more? > > > > > > I might or might not have back in September of 2020. ;-) > > > > > > But either way, the question remains: Should RCU_SOFTIRQ do time checking > > > in ksoftirqd context? Seems like the answer should be "yes", independently > > > of Peter's patches. > > > > :-o I didn't notice, I thought that's from Dec 22, LWN was writing > > about Peter's rework at that point. I'm not sure what the story is :( > > And when / if any of these changes are coming downstream. > > Not a problem either way, as the compiler would complain bitterly about > the resulting merge conflict and it is easy to fix. ;-) And even more not a problem because in_serving_softirq() covers both the softirq environment as well as ksoftirqd. So that "else" clause is for rcuoc kthreads, which do not block other softirq vectors. So I am adding a comment instead... Thanx, Paul
On Fri, Mar 3, 2023 at 2:56 PM Paul E. McKenney <paulmck@kernel.org> wrote: > > On Fri, Mar 03, 2023 at 01:31:43PM -0800, Jakub Kicinski wrote: > > On Fri, 03 Mar 2023 14:30:46 +0100 Thomas Gleixner wrote: > > > > - if (time_before(jiffies, end) && !need_resched() && > > > > - --max_restart) > > > > + unsigned long limit; > > > > + > > > > + if (time_is_before_eq_jiffies(end) || !--max_restart) > > > > + limit = SOFTIRQ_OVERLOAD_TIME; > > > > + else if (need_resched()) > > > > + limit = SOFTIRQ_DEFER_TIME; > > > > + else > > > > goto restart; > > > > > > > > + __this_cpu_write(overload_limit, jiffies + limit); > > > > > > The logic of all this is non-obvious and I had to reread it 5 times to > > > conclude that it is matching the intent. Please add comments. > > > > > > While I'm not a big fan of heuristical duct tape, this looks harmless > > > enough to not end up in an endless stream of tweaking. Famous last > > > words... > > > > Would it all be more readable if I named the "overload_limit" > > "overloaded_until" instead? Naming.. > > I'll add comments, too. > > > > > But without the sched_clock() changes the actual defer time depends on > > > HZ and the point in time where limit is set. That means it ranges from 0 > > > to 1/HZ, i.e. the 2ms defer time ends up with close to 10ms on HZ=100 in > > > the worst case, which perhaps explains the 8ms+ stalls you are still > > > observing. Can you test with that sched_clock change applied, i.e. the > > > first two commits from > > > > > > git://git.kernel.org/pub/scm/linux/kernel/git/peterz/queue.git core/softirq > > > > > > 59be25c466d9 ("softirq: Use sched_clock() based timeout") > > > bd5a5bd77009 ("softirq: Rewrite softirq processing loop") > > > > Those will help, but I spent some time digging into the jiffies related > > warts with kprobes - while annoying they weren't a major source of wake > > ups. 
(FWIW the jiffies noise on our workloads is due to cgroup stats > > disabling IRQs for multiple ms on the timekeeping CPU). > > > > Here are fresh stats on why we wake up ksoftirqd on our Web workload > > (collected over 100 sec): > > > > Time exceeded: 484 > > Loop max run out: 6525 > > need_resched(): 10219 > > (control: 17226 - number of times wakeup_process called for ksirqd) > > > > As you can see need_resched() dominates. > > > > Zooming into the time exceeded - we can count nanoseconds between > > __do_softirq starting and the check. This is the histogram of actual > > usecs as seen by BPF (AKA ktime_get_mono_fast_ns() / 1000): > > > > [256, 512) 1 | | > > [512, 1K) 0 | | > > [1K, 2K) 217 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ | > > [2K, 4K) 266 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@| > > > > So yes, we can probably save ourselves ~200 wakeup with a better clock > > but that's just 1.3% of the total wake ups :( > > > > > > Now - now about the max loop count. I ORed the pending softirqs every > > time we get to the end of the loop. Looks like vast majority of the > > loop counter wake ups are exclusively due to RCU: > > > > @looped[512]: 5516 > > > > Where 512 is the ORed pending mask over all iterations > > 512 == 1 << RCU_SOFTIRQ. > > > > And they usually take less than 100us to consume the 10 iterations. > > Histogram of usecs consumed when we run out of loop iterations: > > > > [16, 32) 3 | | > > [32, 64) 4786 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@| > > [64, 128) 871 |@@@@@@@@@ | > > [128, 256) 34 | | > > [256, 512) 9 | | > > [512, 1K) 262 |@@ | > > [1K, 2K) 35 | | > > [2K, 4K) 1 | | > > > > Paul, is this expected? Is RCU not trying too hard to be nice? > > This is from way back in the day, so it is quite possible that better > tuning and/or better heuristics should be applied. > > On the other hand, 100 microseconds is a good long time from an > CONFIG_PREEMPT_RT=y perspective! 
All I have to add to this conversation is the observation that sampling things at the nyquist rate helps to observe problems like these. So if you care about sub 8ms response time, a sub 4ms sampling rate is needed. > > # cat /sys/module/rcutree/parameters/blimit > > 10 > > > > Or should we perhaps just raise the loop limit? Breaking after less > > than 100usec seems excessive :( > But note that RCU also has rcutree.rcu_divisor, which defaults to 7. > And an rcutree.rcu_resched_ns, which defaults to three milliseconds > (3,000,000 nanoseconds). This means that RCU will do: > > o All the callbacks if there are less than ten. > > o Ten callbacks or 1/128th of them, whichever is larger. > > o Unless the larger of them is more than 100 callbacks, in which > case there is an additional limit of three milliseconds worth > of them. > > Except that if a given CPU ends up with more than 10,000 callbacks > (rcutree.qhimark), that CPU's blimit is set to 10,000. > > So there is much opportunity to tune the existing heuristics and also > much opportunity to tweak the heuristics themselves. This I did not know, and to best observe rcu in action nyquist is 1.5ms... Something with less constants and more curves seems in order. > > But let's see a good use case before tweaking, please. ;-) > > Thanx, Paul > > > > whether that makes a difference? Those two can be applied with some > > > minor polishing. The rest of that series is broken by f10020c97f4c > > > ("softirq: Allow early break"). > > > > > > There is another issue with this overload limit. Assume max_restart or > > > timeout triggered and limit was set to now + 100ms. ksoftirqd runs and > > > gets the issue resolved after 10ms. 
> > > > > > So for the remaining 90ms any invocation of raise_softirq() outside of > > > (soft)interrupt context, which wakes ksoftirqd again, prevents > > > processing on return from interrupt until ksoftirqd gets on the CPU and > > > goes back to sleep, because task_is_running() == true and the stale > > > limit is not after jiffies. > > > > > > Probably not a big issue, but someone will notice on some weird workload > > > sooner than later and the tweaking will start nevertheless. :) So maybe > > > we fix it right away. :) > > > > Hm, Paolo raised this point as well, but the overload time is strictly > > to stop paying attention to the fact ksoftirqd is running. > > IOW current kernels behave as if they had overload_limit of infinity. > > > > The current code already prevents processing until ksoftirqd schedules > > in, after raise_softirq() from a funky context. -- A pithy note on VOQs vs SQM: https://blog.cerowrt.org/post/juniper/ Dave Täht CEO, TekLibre, LLC
On Fri, Mar 03, 2023 at 03:25:32PM -0800, Dave Taht wrote: > On Fri, Mar 3, 2023 at 2:56 PM Paul E. McKenney <paulmck@kernel.org> wrote: > > > > On Fri, Mar 03, 2023 at 01:31:43PM -0800, Jakub Kicinski wrote: > > > On Fri, 03 Mar 2023 14:30:46 +0100 Thomas Gleixner wrote: > > > > > - if (time_before(jiffies, end) && !need_resched() && > > > > > - --max_restart) > > > > > + unsigned long limit; > > > > > + > > > > > + if (time_is_before_eq_jiffies(end) || !--max_restart) > > > > > + limit = SOFTIRQ_OVERLOAD_TIME; > > > > > + else if (need_resched()) > > > > > + limit = SOFTIRQ_DEFER_TIME; > > > > > + else > > > > > goto restart; > > > > > > > > > > + __this_cpu_write(overload_limit, jiffies + limit); > > > > > > > > The logic of all this is non-obvious and I had to reread it 5 times to > > > > conclude that it is matching the intent. Please add comments. > > > > > > > > While I'm not a big fan of heuristical duct tape, this looks harmless > > > > enough to not end up in an endless stream of tweaking. Famous last > > > > words... > > > > > > Would it all be more readable if I named the "overload_limit" > > > "overloaded_until" instead? Naming.. > > > I'll add comments, too. > > > > > > > But without the sched_clock() changes the actual defer time depends on > > > > HZ and the point in time where limit is set. That means it ranges from 0 > > > > to 1/HZ, i.e. the 2ms defer time ends up with close to 10ms on HZ=100 in > > > > the worst case, which perhaps explains the 8ms+ stalls you are still > > > > observing. Can you test with that sched_clock change applied, i.e. 
the > > > > first two commits from > > > > > > > > git://git.kernel.org/pub/scm/linux/kernel/git/peterz/queue.git core/softirq > > > > > > > > 59be25c466d9 ("softirq: Use sched_clock() based timeout") > > > > bd5a5bd77009 ("softirq: Rewrite softirq processing loop") > > > > > > Those will help, but I spent some time digging into the jiffies related > > > warts with kprobes - while annoying they weren't a major source of wake > > > ups. (FWIW the jiffies noise on our workloads is due to cgroup stats > > > disabling IRQs for multiple ms on the timekeeping CPU). > > > > > > Here are fresh stats on why we wake up ksoftirqd on our Web workload > > > (collected over 100 sec): > > > > > > Time exceeded: 484 > > > Loop max run out: 6525 > > > need_resched(): 10219 > > > (control: 17226 - number of times wakeup_process called for ksirqd) > > > > > > As you can see need_resched() dominates. > > > > > > Zooming into the time exceeded - we can count nanoseconds between > > > __do_softirq starting and the check. This is the histogram of actual > > > usecs as seen by BPF (AKA ktime_get_mono_fast_ns() / 1000): > > > > > > [256, 512) 1 | | > > > [512, 1K) 0 | | > > > [1K, 2K) 217 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ | > > > [2K, 4K) 266 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@| > > > > > > So yes, we can probably save ourselves ~200 wakeup with a better clock > > > but that's just 1.3% of the total wake ups :( > > > > > > > > > Now - now about the max loop count. I ORed the pending softirqs every > > > time we get to the end of the loop. Looks like vast majority of the > > > loop counter wake ups are exclusively due to RCU: > > > > > > @looped[512]: 5516 > > > > > > Where 512 is the ORed pending mask over all iterations > > > 512 == 1 << RCU_SOFTIRQ. > > > > > > And they usually take less than 100us to consume the 10 iterations. 
> > > Histogram of usecs consumed when we run out of loop iterations: > > > > > > [16, 32) 3 | | > > > [32, 64) 4786 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@| > > > [64, 128) 871 |@@@@@@@@@ | > > > [128, 256) 34 | | > > > [256, 512) 9 | | > > > [512, 1K) 262 |@@ | > > > [1K, 2K) 35 | | > > > [2K, 4K) 1 | | > > > > > > Paul, is this expected? Is RCU not trying too hard to be nice? > > > > This is from way back in the day, so it is quite possible that better > > tuning and/or better heuristics should be applied. > > > > On the other hand, 100 microseconds is a good long time from an > > CONFIG_PREEMPT_RT=y perspective! > > All I have to add to this conversation is the observation that > sampling things at the > nyquist rate helps to observe problems like these. > > So if you care about sub 8ms response time, a sub 4ms sampling rate is needed. My guess is that Jakub is side-stepping Nyquist by sampling every call to and return from the rcu_do_batch() function. > > > # cat /sys/module/rcutree/parameters/blimit > > > 10 > > > > > > Or should we perhaps just raise the loop limit? Breaking after less > > > than 100usec seems excessive :( > > > > But note that RCU also has rcutree.rcu_divisor, which defaults to 7. > > And an rcutree.rcu_resched_ns, which defaults to three milliseconds > > (3,000,000 nanoseconds). This means that RCU will do: > > > > o All the callbacks if there are less than ten. > > > > o Ten callbacks or 1/128th of them, whichever is larger. > > > > o Unless the larger of them is more than 100 callbacks, in which > > case there is an additional limit of three milliseconds worth > > of them. > > > > Except that if a given CPU ends up with more than 10,000 callbacks > > (rcutree.qhimark), that CPU's blimit is set to 10,000. > > > > So there is much opportunity to tune the existing heuristics and also > > much opportunity to tweak the heuristics themselves. > > This I did not know, and to best observe rcu in action nyquist is 1.5ms... 
This is not an oscillator, and because this all happens within a single system, you cannot hang your hat on speed-of-light delays. In addition, an application can dump thousands of callbacks down RCU's throat in a very short time, which changes RCU's timing. Also, the time constants for expedited grace periods are typically in the tens of microseconds. Something about prioritizing survivability over measurability. ;-) But that is OK because ftrace and BPF can provide fine-grained measurements quite cheaply. > Something with less constants and more curves seems in order. In the immortal words of MS-DOS, are you sure? Thanx, Paul > > But let's see a good use case before tweaking, please. ;-) > > > > Thanx, Paul > > > > > > whether that makes a difference? Those two can be applied with some > > > > minor polishing. The rest of that series is broken by f10020c97f4c > > > > ("softirq: Allow early break"). > > > > > > > > There is another issue with this overload limit. Assume max_restart or > > > > timeout triggered and limit was set to now + 100ms. ksoftirqd runs and > > > > gets the issue resolved after 10ms. > > > > > > > > So for the remaining 90ms any invocation of raise_softirq() outside of > > > > (soft)interrupt context, which wakes ksoftirqd again, prevents > > > > processing on return from interrupt until ksoftirqd gets on the CPU and > > > > goes back to sleep, because task_is_running() == true and the stale > > > > limit is not after jiffies. > > > > > > > > Probably not a big issue, but someone will notice on some weird workload > > > > sooner than later and the tweaking will start nevertheless. :) So maybe > > > > we fix it right away. :) > > > > > > Hm, Paolo raised this point as well, but the overload time is strictly > > > to stop paying attention to the fact ksoftirqd is running. > > > IOW current kernels behave as if they had overload_limit of infinity. 
> > > > > > The current code already prevents processing until ksoftirqd schedules > > > in, after raise_softirq() from a funky context. > > > > -- > A pithy note on VOQs vs SQM: https://blog.cerowrt.org/post/juniper/ > Dave Täht CEO, TekLibre, LLC
Jakub! On Fri, Mar 03 2023 at 14:30, Thomas Gleixner wrote: > On Thu, Dec 22 2022 at 14:12, Jakub Kicinski wrote: > But without the sched_clock() changes the actual defer time depends on > HZ and the point in time where limit is set. That means it ranges from 0 > to 1/HZ, i.e. the 2ms defer time ends up with close to 10ms on HZ=100 in > the worst case, which perhaps explains the 8ms+ stalls you are still > observing. Can you test with that sched_clock change applied, i.e. the > first two commits from > > git://git.kernel.org/pub/scm/linux/kernel/git/peterz/queue.git core/softirq > > 59be25c466d9 ("softirq: Use sched_clock() based timeout") > bd5a5bd77009 ("softirq: Rewrite softirq processing loop") > > whether that makes a difference? Those two can be applied with some > minor polishing. The rest of that series is broken by f10020c97f4c > ("softirq: Allow early break"). While staring at this, I noticed that the current jiffies-based time limit handling has the exact same problem. For HZ=100 and HZ=250, MAX_SOFTIRQ_TIME resolves to 1 jiffy, so the effective window is anywhere between 0 and 1/HZ. Not really useful. Thanks, tglx
On Thu, 22 Dec 2022 14:12:43 -0800 Jakub Kicinski wrote: > need_resched() added in commit c10d73671ad3 ("softirq: reduce latencies") > does improve latency for real workloads (for example memcache). > Unfortunately it triggers quite often even for non-network-heavy apps > (~900 times a second on a loaded webserver), and in a small fraction of > cases whatever the scheduler decided to run will hold onto the CPU > for the entire time slice. > > 10ms+ stalls on a machine which is not actually under overload cause > erratic network behavior and spurious TCP retransmits. Typical end-to-end > latency in a datacenter is < 200us so it's common to set TCP timeout > to 10ms or less. > > The intent of the need_resched() is to let a low latency application > respond quickly and yield (to ksoftirqd). Put a time limit on this dance. > Ignore the fact that ksoftirqd is RUNNING if we were trying to be nice > and the application did not yield quickly. > > On a webserver loaded at 90% CPU this change reduces the number of 8ms+ > stalls the network softirq processing sees by around 10x (2/sec -> 0.2/sec). > It also seems to reduce retransmissions by ~10% but the data is quite > noisy. Peter, is there a chance you could fold this patch into your ongoing softirq rework? We can't both work on softirq in parallel, unfortunately, and this improvement is really key to counterbalance whatever heuristics CFS accumulated between 5.12 and 5.19 :( Not to use the "r-word". I can spin a version of this on top of your core/softirq branch; would that work?
© 2016 - 2025 Red Hat, Inc.