include/linux/livepatch.h | 2 ++ kernel/livepatch/transition.c | 2 +- kernel/sched/core.c | 3 +++ 3 files changed, 6 insertions(+), 1 deletion(-)
Busy kernel threads may block the transition of livepatch. Call
klp_try_switch_task from __cond_resched to make the transition easier.
Signed-off-by: Song Liu <song@kernel.org>
---
include/linux/livepatch.h | 2 ++
kernel/livepatch/transition.c | 2 +-
kernel/sched/core.c | 3 +++
3 files changed, 6 insertions(+), 1 deletion(-)
diff --git a/include/linux/livepatch.h b/include/linux/livepatch.h
index 2614247a9781..a9209f62550a 100644
--- a/include/linux/livepatch.h
+++ b/include/linux/livepatch.h
@@ -236,6 +236,7 @@ int klp_apply_section_relocs(struct module *pmod, Elf_Shdr *sechdrs,
unsigned int symindex, unsigned int secindex,
const char *objname);
+bool klp_try_switch_task(struct task_struct *task);
#else /* !CONFIG_LIVEPATCH */
static inline int klp_module_coming(struct module *mod) { return 0; }
@@ -253,6 +254,7 @@ int klp_apply_section_relocs(struct module *pmod, Elf_Shdr *sechdrs,
return 0;
}
+bool klp_try_switch_task(struct task_struct *task) { return true; }
#endif /* CONFIG_LIVEPATCH */
#endif /* _LINUX_LIVEPATCH_H_ */
diff --git a/kernel/livepatch/transition.c b/kernel/livepatch/transition.c
index f6310f848f34..4257a8eec64b 100644
--- a/kernel/livepatch/transition.c
+++ b/kernel/livepatch/transition.c
@@ -278,7 +278,7 @@ static int klp_check_stack(struct task_struct *task, char *err_buf)
* running, or it's sleeping on a to-be-patched or to-be-unpatched function, or
* if the stack is unreliable, return false.
*/
-static bool klp_try_switch_task(struct task_struct *task)
+bool klp_try_switch_task(struct task_struct *task)
{
static char err_buf[STACK_ERR_BUF_SIZE];
struct rq *rq;
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index a7302b7b65f2..41d1c7a86912 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -6990,6 +6990,9 @@ SYSCALL_DEFINE0(sched_yield)
#if !defined(CONFIG_PREEMPTION) || defined(CONFIG_PREEMPT_DYNAMIC)
int __sched __cond_resched(void)
{
+ if (unlikely(klp_patch_pending(current)))
+ klp_try_switch_task(current);
+
if (should_resched(0)) {
preempt_schedule_common();
return 1;
--
2.30.2
On Sat 2022-05-07 10:46:28, Song Liu wrote:
> Busy kernel threads may block the transition of livepatch. Call
> klp_try_switch_task from __cond_resched to make the transition easier.
Do you have some numbers how this speeds up the transition
and how it slows down the scheduler, please?
cond_resched() is typically called in cycles with many interactions
where the task might spend a lot of time. There are two possibilities.
cond_resched() is called in:
+ livepatched function
In this case, klp_try_switch_task(current) will always fail.
And it will non-necessarily slow down every iteration by
checking the very same stack.
+ non-livepatched function
In this case, the transition will succeed on the first attempt.
OK, but it would succeed also without that patch. The task would
most likely sleep in this cond_resched() so that it might
be successfully transitioned on the next occasion.
From my POV this patch this patch brings more harm than good.
Note that scheduling is a fast path. It is repeated zillion-times
on any system. But livepatch transition is a slow path. It does not
matter if it takes 1 second or 1 hour.
Best Regards,
Petr
> On May 9, 2022, at 8:07 AM, Petr Mladek <pmladek@suse.com> wrote: > > On Sat 2022-05-07 10:46:28, Song Liu wrote: >> Busy kernel threads may block the transition of livepatch. Call >> klp_try_switch_task from __cond_resched to make the transition easier. > > Do you have some numbers how this speeds up the transition > and how it slows down the scheduler, please? We don’t have number on how much this would slow down the scheduler. For the transition, we see cases where the transition cannot finish with in 60 seconds (how much "kpatch load" waits by default). > > cond_resched() is typically called in cycles with many interactions > where the task might spend a lot of time. There are two possibilities. > cond_resched() is called in: > > + livepatched function > > In this case, klp_try_switch_task(current) will always fail. > And it will non-necessarily slow down every iteration by > checking the very same stack. > > > + non-livepatched function > > In this case, the transition will succeed on the first attempt. > OK, but it would succeed also without that patch. The task would > most likely sleep in this cond_resched() so that it might > be successfully transitioned on the next occasion. We are in the non-live patched case. But the transition didn’t happen in time, because the kernel thread doesn’t go to sleep. While there is clearly something weird with this thread, we think live patch should work because the thread does call cond_resched from time to time. Thanks, Song > > > From my POV this patch this patch brings more harm than good. > > Note that scheduling is a fast path. It is repeated zillion-times > on any system. But livepatch transition is a slow path. It does not > matter if it takes 1 second or 1 hour. > > Best Regards, > Petr
On Mon 2022-05-09 16:22:11, Song Liu wrote:
>
>
> > On May 9, 2022, at 8:07 AM, Petr Mladek <pmladek@suse.com> wrote:
> >
> > On Sat 2022-05-07 10:46:28, Song Liu wrote:
> >> Busy kernel threads may block the transition of livepatch. Call
> >> klp_try_switch_task from __cond_resched to make the transition easier.
> >
> > Do you have some numbers how this speeds up the transition
> > and how it slows down the scheduler, please?
>
> We don’t have number on how much this would slow down the scheduler.
> For the transition, we see cases where the transition cannot finish
> with in 60 seconds (how much "kpatch load" waits by default).
60s might be too low limit, see below.
> > cond_resched() is typically called in cycles with many interactions
> > where the task might spend a lot of time. There are two possibilities.
> > cond_resched() is called in:
> >
> > + livepatched function
> >
> > In this case, klp_try_switch_task(current) will always fail.
> > And it will non-necessarily slow down every iteration by
> > checking the very same stack.
> >
> >
> > + non-livepatched function
> >
> > In this case, the transition will succeed on the first attempt.
> > OK, but it would succeed also without that patch. The task would
> > most likely sleep in this cond_resched() so that it might
> > be successfully transitioned on the next occasion.
>
> We are in the non-live patched case. But the transition didn’t happen
> in time, because the kernel thread doesn’t go to sleep. While there is
> clearly something weird with this thread, we think live patch should
> work because the thread does call cond_resched from time to time.
I guess that it goes to sleep. Otherwise it would trigger soft lockup
report if you have the watchdog enabled.
IMHO, the problem is that klp_transition_work_fn() tries the
transition "only" once per second, see
void klp_try_complete_transition(void)
{
[...]
schedule_delayed_work(&klp_transition_work,
round_jiffies_relative(HZ));
[...]
}
It means that there are "only" 60 attempts to migrate the busy process.
It fails when the process is in the running state or sleeping in a
livepatched function. There is a _non-zero_ chance of a bad luck.
It would be great to measure how long it will take to complete
the transition if you remove the limit 60s.
Anyway, the limit 60s looks like a bad idea to me. It is too low.
For example, we do not use any limit at all in SUSE products.
And the only report was that some thread from a 3rd party
module could not be migrated. It was stuck with a livepatched
function on the stack. The kthread had really problematic
design. I am afraid that even this patch would not help
in that case.
Now, back to the limit. There are basically two problems when
the transition takes too long:
+ It blocks another transition. But the frequency of new
livepatches it typically counted in days or even weeks.
+ It means that a process is running the buggy/vulnerable
code longer time. But few hours should be acceptable
given how long it takes to prepare the livepatch.
Also it looks better to have 99.9% of processes running
the fixed code that revert the fix just because a single
process needs longer time to get transitioned.
I could imagine that there really is a process that is almost
impossible to livepatch. It might get worse on NO_HZ system.
The question is it happens in the real life.
I would personally start with prolonging or removing the limit.
Best Regards,
Petr
On Tue, 2022-05-10 at 09:56 +0200, Petr Mladek wrote:
>
> IMHO, the problem is that klp_transition_work_fn() tries the
> transition "only" once per second, see
>
> void klp_try_complete_transition(void)
> {
> [...]
> schedule_delayed_work(&klp_transition_work,
> round_jiffies_relative(HZ));
> [...]
> }
>
> It means that there are "only" 60 attempts to migrate the busy
> process.
> It fails when the process is in the running state or sleeping in a
> livepatched function. There is a _non-zero_ chance of a bad luck.
>
We are definitely hitting that non-zero chance :)
> Anyway, the limit 60s looks like a bad idea to me. It is too low.
That has its own issues, though. System management software
tracks whether kpatch succeeds, and a run of the system
management software will not complete until all of the commands
it has run have completed.
One reason for this is that allowing system management software
to just fork more and more things that might potentially get
stuck is that you never want your system management software
to come even close to resembling a fork bomb :)
Rollout of the next config change to a system should not be
blocked on KLP completion.
I think the best approach for us might be to just track what
is causing the transition failures, and send in trivial patches
to make the outer loop in such kernel threads do the same KLP
transition the idle task already does.
On Tue 2022-05-10 13:33:13, Rik van Riel wrote:
> On Tue, 2022-05-10 at 09:56 +0200, Petr Mladek wrote:
> >
> > IMHO, the problem is that klp_transition_work_fn() tries the
> > transition "only" once per second, see
> >
> > void klp_try_complete_transition(void)
> > {
> > [...]
> > schedule_delayed_work(&klp_transition_work,
> > round_jiffies_relative(HZ));
> > [...]
> > }
> >
> > It means that there are "only" 60 attempts to migrate the busy
> > process.
> > It fails when the process is in the running state or sleeping in a
> > livepatched function. There is a _non-zero_ chance of a bad luck.
> >
>
> We are definitely hitting that non-zero chance :)
>
> > Anyway, the limit 60s looks like a bad idea to me. It is too low.
>
> That has its own issues, though. System management software
> tracks whether kpatch succeeds, and a run of the system
> management software will not complete until all of the commands
> it has run have completed.
>
> One reason for this is that allowing system management software
> to just fork more and more things that might potentially get
> stuck is that you never want your system management software
> to come even close to resembling a fork bomb :)
>
> Rollout of the next config change to a system should not be
> blocked on KLP completion.
Makes sense.
> I think the best approach for us might be to just track what
> is causing the transition failures, and send in trivial patches
> to make the outer loop in such kernel threads do the same KLP
> transition the idle task already does.
I am afraid that is a way to hell. We might end up in doing
really crazy things if we want to complete the transition
in one minute.
The great thing about the current approach is that it tries
to livepatch the system without too much disruption. The
more we try to speed up the transition the more we might
disrupt the system. Not to say about the code complexity
and potential bugs.
IMHO a better approach is to fix your management system.
The task is done when the livepatch module is loaded.
If you want to know that there is some problem. Then the
livepatch code might write some warning when the transition
has not finished within some reasonable time frame
(1 hour or so). It might be monitored the same way
as the messages from various watchdogs, ...
Best Regards,
Petr
On Tue, 2022-05-10 at 17:44 +0200, Petr Mladek wrote: > On Tue 2022-05-10 13:33:13, Rik van Riel wrote: > > On Tue, 2022-05-10 at 09:56 +0200, Petr Mladek wrote: > > > > > I think the best approach for us might be to just track what > > is causing the transition failures, and send in trivial patches > > to make the outer loop in such kernel threads do the same KLP > > transition the idle task already does. > > I am afraid that is a way to hell. We might end up in doing > really crazy things if we want to complete the transition > in one minute. > Now I wonder if we could just hook up a preempt notifier for kernel live patches. All the distro kernels already need the preempt notifier for KVM, anyway... Is it crazy? Maybe a little. Is it self contained, and something that could be done without inserting any extra code into a hot path while not in the middle of a KLP transition? Yes. I'd be happy to come up with a patch that does that, unless anybody has good reasons I should not :)
On Tue, May 10, 2022 at 04:07:42PM +0000, Rik van Riel wrote: > On Tue, 2022-05-10 at 17:44 +0200, Petr Mladek wrote: > > On Tue 2022-05-10 13:33:13, Rik van Riel wrote: > > > On Tue, 2022-05-10 at 09:56 +0200, Petr Mladek wrote: > > > > > > > > I think the best approach for us might be to just track what > > > is causing the transition failures, and send in trivial patches > > > to make the outer loop in such kernel threads do the same KLP > > > transition the idle task already does. > > > > I am afraid that is a way to hell. We might end up in doing > > really crazy things if we want to complete the transition > > in one minute. > > > Now I wonder if we could just hook up a preempt notifier > for kernel live patches. All the distro kernels already > need the preempt notifier for KVM, anyway... > > Is it crazy? Maybe a little. > > Is it self contained, and something that could be done > without inserting any extra code into a hot path while > not in the middle of a KLP transition? Yes. > > I'd be happy to come up with a patch that does that, > unless anybody has good reasons I should not :) I wouldn't be opposed to that, but how does it solve this problem? If as Peter said cond_resched() can be a NOP, then preemption would have to be from an interrupt, in which case frame pointers aren't reliable. -- Josh
On Tue, 2022-05-10 at 09:52 -0700, Josh Poimboeuf wrote: > On Tue, May 10, 2022 at 04:07:42PM +0000, Rik van Riel wrote: > > > > > Now I wonder if we could just hook up a preempt notifier > > for kernel live patches. All the distro kernels already > > need the preempt notifier for KVM, anyway... > > > > I wouldn't be opposed to that, but how does it solve this problem? > If > as Peter said cond_resched() can be a NOP, then preemption would have > to > be from an interrupt, in which case frame pointers aren't reliable. > The systems where we are seeing problems do not, as far as I know, throw softlockup errors, so the kworker threads that fail to transition to the new KLP version are sleeping and getting scheduled out at times. A KLP transition preempt notifier would help those kernel threads transition to the new KLP version at any time they reschedule. How much it will help is hard to predict, but I should be able to get results from a fairly large sample size of systems within a few weeks :)
On Tue, May 10, 2022 at 06:07:00PM +0000, Rik van Riel wrote: > On Tue, 2022-05-10 at 09:52 -0700, Josh Poimboeuf wrote: > > On Tue, May 10, 2022 at 04:07:42PM +0000, Rik van Riel wrote: > > > > > > > Now I wonder if we could just hook up a preempt notifier > > > for kernel live patches. All the distro kernels already > > > need the preempt notifier for KVM, anyway... > > > > > > > I wouldn't be opposed to that, but how does it solve this problem? > > If > > as Peter said cond_resched() can be a NOP, then preemption would have > > to > > be from an interrupt, in which case frame pointers aren't reliable. > > > The systems where we are seeing problems do not, as far > as I know, throw softlockup errors, so the kworker > threads that fail to transition to the new KLP version > are sleeping and getting scheduled out at times. Are they sleeping due to an explicit call to cond_resched()? > A KLP transition preempt notifier would help those > kernel threads transition to the new KLP version at > any time they reschedule. ... unless cond_resched() is a no-op due to CONFIG_PREEMPT? > How much it will help is hard to predict, but I should > be able to get results from a fairly large sample size > of systems within a few weeks :) As Peter said, keep in mind that we will need to fix other cases beyond Facebook, i.e., CONFIG_PREEMPT combined with non-x86 arches which don't have ORC so they can't reliably unwind from an IRQ. -- Josh
> On May 10, 2022, at 11:42 AM, Josh Poimboeuf <jpoimboe@kernel.org> wrote: > > On Tue, May 10, 2022 at 06:07:00PM +0000, Rik van Riel wrote: >> On Tue, 2022-05-10 at 09:52 -0700, Josh Poimboeuf wrote: >>> On Tue, May 10, 2022 at 04:07:42PM +0000, Rik van Riel wrote: >>>>> >>>> Now I wonder if we could just hook up a preempt notifier >>>> for kernel live patches. All the distro kernels already >>>> need the preempt notifier for KVM, anyway... >>>> >>> >>> I wouldn't be opposed to that, but how does it solve this problem? >>> If >>> as Peter said cond_resched() can be a NOP, then preemption would have >>> to >>> be from an interrupt, in which case frame pointers aren't reliable. >>> >> The systems where we are seeing problems do not, as far >> as I know, throw softlockup errors, so the kworker >> threads that fail to transition to the new KLP version >> are sleeping and getting scheduled out at times. > > Are they sleeping due to an explicit call to cond_resched()? In this case, yes. The thread calls cond_resched(). > >> A KLP transition preempt notifier would help those >> kernel threads transition to the new KLP version at >> any time they reschedule. > > ... unless cond_resched() is a no-op due to CONFIG_PREEMPT? Based on my understanding (and a few other folks we chatted with), a kernel thread can legally run for extended time, as long as it calls cond_resched() at a reasonable frequency. Therefore, I think we should be able to patch such thread easily, unless it calls cond_resched() with being-patched function in the stack, of course. OTOH, Petr's mindset of allowing many minutes for the patch transition is new to me. I need to think more about it. Josh, what’s you opinion on this? IIUC, kpatch is designed to only wait up to 60 seconds (no option to overwrite the time). > >> How much it will help is hard to predict, but I should >> be able to get results from a fairly large sample size >> of systems within a few weeks :) > > As Peter said, keep in mind that we will need to fix other cases beyond > Facebook, i.e., CONFIG_PREEMPT combined with non-x86 arches which don't > have ORC so they can't reliably unwind from an IRQ. I think livepatch transition may fail in different cases, and we don't need to address all of them in one shoot. Fixing some cases is an improvement as long as we don't slow down other cases. I understand that adding tiny overhead to __cond_resched() may end up as a visible regression. But maybe adding it to preempt_schedule_common() is light enough? Did I miss/misunderstand something? Thanks, Song
On Tue, May 10, 2022 at 07:45:49PM +0000, Song Liu wrote: > >> A KLP transition preempt notifier would help those > >> kernel threads transition to the new KLP version at > >> any time they reschedule. > > > > ... unless cond_resched() is a no-op due to CONFIG_PREEMPT? > > Based on my understanding (and a few other folks we chatted with), > a kernel thread can legally run for extended time, as long as it > calls cond_resched() at a reasonable frequency. Therefore, I > think we should be able to patch such thread easily, unless it > calls cond_resched() with being-patched function in the stack, > of course. But again, with CONFIG_PREEMPT, that doesn't work. > OTOH, Petr's mindset of allowing many minutes for the patch > transition is new to me. I need to think more about it. > Josh, what’s you opinion on this? IIUC, kpatch is designed to > only wait up to 60 seconds (no option to overwrite the time). I wouldn't be necessarily opposed to changing the kpatch timeout to something bigger, or eliminating it altogether in favor of a WARN() after x minutes. > >> How much it will help is hard to predict, but I should > >> be able to get results from a fairly large sample size > >> of systems within a few weeks :) > > > > As Peter said, keep in mind that we will need to fix other cases beyond > > Facebook, i.e., CONFIG_PREEMPT combined with non-x86 arches which don't > > have ORC so they can't reliably unwind from an IRQ. > > I think livepatch transition may fail in different cases, and we > don't need to address all of them in one shoot. Fixing some cases > is an improvement as long as we don't slow down other cases. I > understand that adding tiny overhead to __cond_resched() may end > up as a visible regression. But maybe adding it to > preempt_schedule_common() is light enough? > > Did I miss/misunderstand something? If it's a real bug, we should fix it everywhere, not just for Facebook. Otherwise CONFIG_PREEMPT and/or non-x86 arches become second-class citizens. -- Josh
> On May 10, 2022, at 4:04 PM, Josh Poimboeuf <jpoimboe@kernel.org> wrote:
>
> On Tue, May 10, 2022 at 07:45:49PM +0000, Song Liu wrote:
>>>> A KLP transition preempt notifier would help those
>>>> kernel threads transition to the new KLP version at
>>>> any time they reschedule.
>>>
>>> ... unless cond_resched() is a no-op due to CONFIG_PREEMPT?
>>
>> Based on my understanding (and a few other folks we chatted with),
>> a kernel thread can legally run for extended time, as long as it
>> calls cond_resched() at a reasonable frequency. Therefore, I
>> think we should be able to patch such thread easily, unless it
>> calls cond_resched() with being-patched function in the stack,
>> of course.
>
> But again, with CONFIG_PREEMPT, that doesn't work.
>
>> OTOH, Petr's mindset of allowing many minutes for the patch
>> transition is new to me. I need to think more about it.
>> Josh, what’s you opinion on this? IIUC, kpatch is designed to
>> only wait up to 60 seconds (no option to overwrite the time).
>
> I wouldn't be necessarily opposed to changing the kpatch timeout to
> something bigger, or eliminating it altogether in favor of a WARN()
> after x minutes.
>
>>>> How much it will help is hard to predict, but I should
>>>> be able to get results from a fairly large sample size
>>>> of systems within a few weeks :)
>>>
>>> As Peter said, keep in mind that we will need to fix other cases beyond
>>> Facebook, i.e., CONFIG_PREEMPT combined with non-x86 arches which don't
>>> have ORC so they can't reliably unwind from an IRQ.
>>
>> I think livepatch transition may fail in different cases, and we
>> don't need to address all of them in one shoot. Fixing some cases
>> is an improvement as long as we don't slow down other cases. I
>> understand that adding tiny overhead to __cond_resched() may end
>> up as a visible regression. But maybe adding it to
>> preempt_schedule_common() is light enough?
>>
>> Did I miss/misunderstand something?
>
> If it's a real bug, we should fix it everywhere, not just for Facebook.
> Otherwise CONFIG_PREEMPT and/or non-x86 arches become second-class
> citizens.
I think "is it a real bug?" is the top question for me. So maybe we
should take a step back.
The behavior we see is: A busy kernel thread blocks klp transition
for more than a minute. But the transition eventually succeeded after
< 10 retries on most systems. The kernel thread is well-behaved, as
it calls cond_resched() at a reasonable frequency, so this is not a
deadlock.
If I understand Petr correctly, this behavior is expected, and thus
is not a bug or issue for the livepatch subsystem. This is different
to our original expectation, but if this is what we agree on, we
will look into ways to incorporate long wait time for patch
transition in our automations.
OTOH, if we think this is a suboptimal behavior (not necessarily a
bug, IIUC), we should consider improve it. I personally don’t think
shipping an improvement to one configuration makes other configurations
second-class citizens. And, it is possible PREEMPT kernel does NOT
even have such suboptimal behavior, right? (I really don't know.)
So, if we come back to the same question: is this a bug (or a suboptimal
behavior that worth fixing)? If so, we are open to any solution that
would also help PREEMPT and/or non-x86 arches.
Lastly, maybe a really naive question: does the following also helps
PREEMPT=y configurations?
Thanks,
Song
diff --git c/kernel/sched/core.c w/kernel/sched/core.c
index a7302b7b65f2..cc9b1c9c02ba 100644
--- c/kernel/sched/core.c
+++ w/kernel/sched/core.c
@@ -5226,6 +5226,9 @@ void __sched schedule_preempt_disabled(void)
static void __sched notrace preempt_schedule_common(void)
{
+ if (unlikely(klp_patch_pending(current)))
+ klp_try_switch_task(current);
+
do {
/*
* Because the function tracer can trace preempt_count_sub()
On Tue, 2022-05-10 at 23:57 +0000, Song Liu wrote:
>
> So, if we come back to the same question: is this a bug (or a
> suboptimal
> behavior that worth fixing)? If so, we are open to any solution that
> would also help PREEMPT and/or non-x86 arches.
>
Using the preempt notifiers during KLP transition should
work equally well for PREEMPT and !PREEMPT. It also does
not insert any additional code into the scheduler while
there is no KLP transition going on.
> Lastly, maybe a really naive question: does the following also helps
> PREEMPT=y configurations?
>
> static void __sched notrace preempt_schedule_common(void)
> {
> + if (unlikely(klp_patch_pending(current)))
> + klp_try_switch_task(current);
> +
> do {
>
While this would almost certainly speed up KLP
transitions, it would also slow down schedule
all the time, even while there is no KLP transition
going on.
That does not seem like a worthwhile tradeoff.
On Wed, May 11, 2022 at 12:35:11AM +0000, Rik van Riel wrote: > On Tue, 2022-05-10 at 23:57 +0000, Song Liu wrote: > > > > So, if we come back to the same question: is this a bug (or a > > suboptimal > > behavior that worth fixing)? If so, we are open to any solution that > > would also help PREEMPT and/or non-x86 arches. > > > Using the preempt notifiers during KLP transition should > work equally well for PREEMPT and !PREEMPT. It also does > not insert any additional code into the scheduler while > there is no KLP transition going on. As I've been saying, this is not going to work for PREEMPT because, without ORC, we can't reliably unwind from an IRQ handler, so the kthread won't get patched. -- Josh
On Tue, 2022-05-10 at 17:37 -0700, Josh Poimboeuf wrote: > On Wed, May 11, 2022 at 12:35:11AM +0000, Rik van Riel wrote: > > On Tue, 2022-05-10 at 23:57 +0000, Song Liu wrote: > > > > > > So, if we come back to the same question: is this a bug (or a > > > suboptimal > > > behavior that worth fixing)? If so, we are open to any solution > > > that > > > would also help PREEMPT and/or non-x86 arches. > > > > > Using the preempt notifiers during KLP transition should > > work equally well for PREEMPT and !PREEMPT. It also does > > not insert any additional code into the scheduler while > > there is no KLP transition going on. > > As I've been saying, this is not going to work for PREEMPT because, > without ORC, we can't reliably unwind from an IRQ handler, so the > kthread won't get patched. > Isn't the sched_out preempt notifier always run in process context? What am I missing?
On Wed, May 11, 2022 at 12:46:32AM +0000, Rik van Riel wrote: > On Tue, 2022-05-10 at 17:37 -0700, Josh Poimboeuf wrote: > > On Wed, May 11, 2022 at 12:35:11AM +0000, Rik van Riel wrote: > > > On Tue, 2022-05-10 at 23:57 +0000, Song Liu wrote: > > > > > > > > So, if we come back to the same question: is this a bug (or a > > > > suboptimal > > > > behavior that worth fixing)? If so, we are open to any solution > > > > that > > > > would also help PREEMPT and/or non-x86 arches. > > > > > > > Using the preempt notifiers during KLP transition should > > > work equally well for PREEMPT and !PREEMPT. It also does > > > not insert any additional code into the scheduler while > > > there is no KLP transition going on. > > > > As I've been saying, this is not going to work for PREEMPT because, > > without ORC, we can't reliably unwind from an IRQ handler, so the > > kthread won't get patched. > > > Isn't the sched_out preempt notifier always run in > process context? > > What am I missing? Maybe it's technically process context at that point. But the important point is that the call to the scheduler via preempt_schedule_irq() originates from the "return from interrupt" path. So the state of the interrupted task's stack is unknown. For example it could have been interrupted before the frame pointer prologue. Or in a leaf function which doesn't use frame pointers. -- Josh
On Tue, 2022-05-10 at 18:12 -0700, Josh Poimboeuf wrote: > On Wed, May 11, 2022 at 12:46:32AM +0000, Rik van Riel wrote: > > On Tue, 2022-05-10 at 17:37 -0700, Josh Poimboeuf wrote: > > > On Wed, May 11, 2022 at 12:35:11AM +0000, Rik van Riel wrote: > > > > On Tue, 2022-05-10 at 23:57 +0000, Song Liu wrote: > > > > > > > > > > So, if we come back to the same question: is this a bug (or a > > > > > suboptimal > > > > > behavior that worth fixing)? If so, we are open to any > > > > > solution > > > > > that > > > > > would also help PREEMPT and/or non-x86 arches. > > > > > > > > > Using the preempt notifiers during KLP transition should > > > > work equally well for PREEMPT and !PREEMPT. It also does > > > > not insert any additional code into the scheduler while > > > > there is no KLP transition going on. > > > > > > As I've been saying, this is not going to work for PREEMPT > > > because, > > > without ORC, we can't reliably unwind from an IRQ handler, so the > > > kthread won't get patched. > > > > > Isn't the sched_out preempt notifier always run in > > process context? > > > > What am I missing? > > Maybe it's technically process context at that point. But the > important > point is that the call to the scheduler via preempt_schedule_irq() > originates from the "return from interrupt" path. Ahhhh, I think I understand. Does that mean if the scheduling of the kernel thread originated from an IRQ, the KLP transition will fail probably? However, if the call to schedule came from a voluntary preemption, for example through a cond_resched() or due to the thread going to sleep a little bit, the stack walk will be reliable, and the KLP transition may succeed?
On Wed, May 11, 2022 at 06:09:28PM +0000, Rik van Riel wrote: > On Tue, 2022-05-10 at 18:12 -0700, Josh Poimboeuf wrote: > > On Wed, May 11, 2022 at 12:46:32AM +0000, Rik van Riel wrote: > > > On Tue, 2022-05-10 at 17:37 -0700, Josh Poimboeuf wrote: > > > > On Wed, May 11, 2022 at 12:35:11AM +0000, Rik van Riel wrote: > > > > > On Tue, 2022-05-10 at 23:57 +0000, Song Liu wrote: > > > > > > > > > > > > So, if we come back to the same question: is this a bug (or a > > > > > > suboptimal > > > > > > behavior that worth fixing)? If so, we are open to any > > > > > > solution > > > > > > that > > > > > > would also help PREEMPT and/or non-x86 arches. > > > > > > > > > > > Using the preempt notifiers during KLP transition should > > > > > work equally well for PREEMPT and !PREEMPT. It also does > > > > > not insert any additional code into the scheduler while > > > > > there is no KLP transition going on. > > > > > > > > As I've been saying, this is not going to work for PREEMPT > > > > because, > > > > without ORC, we can't reliably unwind from an IRQ handler, so the > > > > kthread won't get patched. > > > > > > > Isn't the sched_out preempt notifier always run in > > > process context? > > > > > > What am I missing? > > > > Maybe it's technically process context at that point. But the > > important > > point is that the call to the scheduler via preempt_schedule_irq() > > originates from the "return from interrupt" path. > > Ahhhh, I think I understand. > > Does that mean if the scheduling of the kernel thread originated > from an IRQ, the KLP transition will fail probably? It will fail definitely, unless you have the ORC unwinder. > However, if the call to schedule came from a voluntary preemption, > for example through a cond_resched() or due to the thread going > to sleep a little bit, the stack walk will be reliable, and the > KLP transition may succeed? Right. -- Josh
On Tue, May 10, 2022 at 11:57:04PM +0000, Song Liu wrote: > > If it's a real bug, we should fix it everywhere, not just for Facebook. > > Otherwise CONFIG_PREEMPT and/or non-x86 arches become second-class > > citizens. > > I think "is it a real bug?" is the top question for me. So maybe we > should take a step back. > > The behavior we see is: A busy kernel thread blocks klp transition > for more than a minute. But the transition eventually succeeded after > < 10 retries on most systems. The kernel thread is well-behaved, as > it calls cond_resched() at a reasonable frequency, so this is not a > deadlock. > > If I understand Petr correctly, this behavior is expected, and thus > is not a bug or issue for the livepatch subsystem. This is different > to our original expectation, but if this is what we agree on, we > will look into ways to incorporate long wait time for patch > transition in our automations. That's how we've traditionally looked at it, though apparently Red Hat and SUSE have implemented different ideas of what a long wait time is. In practice, one minute has always been enough for all of kpatch's users -- AFAIK, everybody except SUSE -- up until now. That timeout is not set in stone, and can be extended if needed, either with "kpatch load" waiting longer, or with it returning immediately and instead reporting livepatch stalls via WARN(). The latter option is a bigger change in behavior, since any errors will be delayed and reported in a different way, but if other users are ok with that, it's fine with me. Though, these options might be considered workarounds, as it's theoretically possible for a kthread to be CPU-bound indefinitely, beyond any arbitrarily chosen timeout. But maybe that's not realistic beyond a certain timeout value of X and we don't care? I dunno. > OTOH, if we think this is a suboptimal behavior (not necessarily a > bug, IIUC), we should consider improve it. I personally don’t think > shipping an improvement to one configuration makes other configurations > second-class citizens. And, it is possible PREEMPT kernel does NOT > even have such suboptimal behavior, right? (I really don't know.) No, PREEMPT will have the same problem. Preempting a kthread doesn't patch it unless the task happens to still be blocked during the next invocation of klp_transition_work_fn(). > So, if we come back to the same question: is this a bug (or a suboptimal > behavior that worth fixing)? If so, we are open to any solution that > would also help PREEMPT and/or non-x86 arches. You're obviously trying to fix a real world problem. Whether you call that issue a bug, or "suboptimal behavior", it's still a problem. As you said, the kthread is acting within the allowed parameters of how a kthread should behave. But "kpatch load" will fail. That sounds broken to me. We need to figure out a way to fix that for all configs/arches. > Lastly, maybe a really naive question: does the following also helps > PREEMPT=y configurations? As I have been trying to say, that won't work for PREEMPT+!ORC, because, when the kthread gets preempted, the stack trace will be attempted from an IRQ and will be reported as unreliable. Ideally we'd have the ORC unwinder for all arches, that would make this much easier. But we're not there yet. -- Josh
On Tue 2022-05-10 17:33:31, Josh Poimboeuf wrote: > On Tue, May 10, 2022 at 11:57:04PM +0000, Song Liu wrote: > > > If it's a real bug, we should fix it everywhere, not just for Facebook. > > > Otherwise CONFIG_PREEMPT and/or non-x86 arches become second-class > > > citizens. > > > > I think "is it a real bug?" is the top question for me. So maybe we > > should take a step back. > > > > The behavior we see is: A busy kernel thread blocks klp transition > > for more than a minute. But the transition eventually succeeded after > > < 10 retries on most systems. The kernel thread is well-behaved, as > > it calls cond_resched() at a reasonable frequency, so this is not a > > deadlock. > > > > If I understand Petr correctly, this behavior is expected, and thus > > is not a bug or issue for the livepatch subsystem. This is different > > to our original expectation, but if this is what we agree on, we > > will look into ways to incorporate long wait time for patch > > transition in our automations. > > That's how we've traditionally looked at it, though apparently Red Hat > and SUSE have implemented different ideas of what a long wait time is. > > In practice, one minute has always been enough for all of kpatch's users > -- AFAIK, everybody except SUSE -- up until now. I am actually surprised that nobody met the problem yet. There are "only" 60 attempts to transition the pending tasks. Well, the problem is mainly with kthreads. User space processes are migrated also on the kernel boundary. And the fake signal is likely pretty effective here. And it probably is not that common that a kthread would occupy a single CPU all the time. > Though, these options might be considered workarounds, as it's > theoretically possible for a kthread to be CPU-bound indefinitely, > beyond any arbitrarily chosen timeout. But maybe that's not realistic > beyond a certain timeout value of X and we don't care? I dunno. I agree that it might happen theoretically. And it would be great to be prepared for this. My only concern is the complexity and risk. We should know that it is worth it. > As I have been trying to say, that won't work for PREEMPT+!ORC, because, > when the kthread gets preempted, the stack trace will be attempted from > an IRQ and will be reported as unreliable. This limits the range of possible solutions quite a lot. But it is how it is. > Ideally we'd have the ORC unwinder for all arches, that would make this > much easier. But we're not there yet. The alternative solution is that the process has to migrate itself on some safe location. One crazy idea. It still might be possible to find the called functions on the stack even when it is not reliable. Then it might be possible to add another ftrace handler on these found functions. This other ftrace handler might migrate the task when it calls this function again. It assumes that the task will call the same functions again and again. Also it might require that the tasks checks its own stack from the ftrace handler. I am not sure if this is possible. There might be other variants of this approach. Best Regards, Petr
> On May 11, 2022, at 2:24 AM, Petr Mladek <pmladek@suse.com> wrote: > > On Tue 2022-05-10 17:33:31, Josh Poimboeuf wrote: >> On Tue, May 10, 2022 at 11:57:04PM +0000, Song Liu wrote: >>>> If it's a real bug, we should fix it everywhere, not just for Facebook. >>>> Otherwise CONFIG_PREEMPT and/or non-x86 arches become second-class >>>> citizens. >>> >>> I think "is it a real bug?" is the top question for me. So maybe we >>> should take a step back. >>> >>> The behavior we see is: A busy kernel thread blocks klp transition >>> for more than a minute. But the transition eventually succeeded after >>> < 10 retries on most systems. The kernel thread is well-behaved, as >>> it calls cond_resched() at a reasonable frequency, so this is not a >>> deadlock. >>> >>> If I understand Petr correctly, this behavior is expected, and thus >>> is not a bug or issue for the livepatch subsystem. This is different >>> to our original expectation, but if this is what we agree on, we >>> will look into ways to incorporate long wait time for patch >>> transition in our automations. >> >> That's how we've traditionally looked at it, though apparently Red Hat >> and SUSE have implemented different ideas of what a long wait time is. >> >> In practice, one minute has always been enough for all of kpatch's users >> -- AFAIK, everybody except SUSE -- up until now. > > I am actually surprised that nobody met the problem yet. There are > "only" 60 attempts to transition the pending tasks. Maybe we should consider increase the frequency we try? Say to 10 times per second? I guess this will solve most of the failures we are seeing in current case. > > Well, the problem is mainly with kthreads. User space processes are > migrated also on the kernel boundary. And the fake signal is likely > pretty effective here. And it probably is not that common that > a kthread would occupy a single CPU all the time. > > >> Though, these options might be considered workarounds, as it's >> theoretically possible for a kthread to be CPU-bound indefinitely, >> beyond any arbitrarily chosen timeout. But maybe that's not realistic >> beyond a certain timeout value of X and we don't care? I dunno. > > I agree that it might happen theoretically. And it would be great > to be prepared for this. My only concern is the complexity and risk. > We should know that it is worth it. > > >> As I have been trying to say, that won't work for PREEMPT+!ORC, because, >> when the kthread gets preempted, the stack trace will be attempted from >> an IRQ and will be reported as unreliable. > > This limits the range of possible solutions quite a lot. But it is > how it is. > >> Ideally we'd have the ORC unwinder for all arches, that would make this >> much easier. But we're not there yet. > > The alternative solution is that the process has to migrate itself > on some safe location. > > One crazy idea. It still might be possible to find the called > functions on the stack even when it is not reliable. Then it > might be possible to add another ftrace handler on > these found functions. This other ftrace handler might migrate > the task when it calls this function again. > > It assumes that the task will call the same functions again > and again. Also it might require that the tasks checks its > own stack from the ftrace handler. I am not sure if this > is possible. > > There might be other variants of this approach. This might be the ultimate solution! As ftrace allows filtering based on pid (/sys/kernel/tracing/set_ftrace_pid), we can technically trigger klp_try_switch_task() on every function of the pending tasks. If this works, we should finish most of the transition in seconds. And the only failure there would be threads with being patched function at the very sottom of its stack. Am I too optimistic here? Thanks, Song
On Wed 2022-05-11 16:33:57, Song Liu wrote: > > > > On May 11, 2022, at 2:24 AM, Petr Mladek <pmladek@suse.com> wrote: > > > > On Tue 2022-05-10 17:33:31, Josh Poimboeuf wrote: > >> On Tue, May 10, 2022 at 11:57:04PM +0000, Song Liu wrote: > >>>> If it's a real bug, we should fix it everywhere, not just for Facebook. > >>>> Otherwise CONFIG_PREEMPT and/or non-x86 arches become second-class > >>>> citizens. > >>> > >>> I think "is it a real bug?" is the top question for me. So maybe we > >>> should take a step back. > >>> > >>> The behavior we see is: A busy kernel thread blocks klp transition > >>> for more than a minute. But the transition eventually succeeded after > >>> < 10 retries on most systems. The kernel thread is well-behaved, as > >>> it calls cond_resched() at a reasonable frequency, so this is not a > >>> deadlock. > >>> > >>> If I understand Petr correctly, this behavior is expected, and thus > >>> is not a bug or issue for the livepatch subsystem. This is different > >>> to our original expectation, but if this is what we agree on, we > >>> will look into ways to incorporate long wait time for patch > >>> transition in our automations. > >> > >> That's how we've traditionally looked at it, though apparently Red Hat > >> and SUSE have implemented different ideas of what a long wait time is. > >> > >> In practice, one minute has always been enough for all of kpatch's users > >> -- AFAIK, everybody except SUSE -- up until now. > > > > I am actually surprised that nobody met the problem yet. There are > > "only" 60 attempts to transition the pending tasks. > > Maybe we should consider increase the frequency we try? Say to 10 times > per second? I guess this will solve most of the failures we are seeing > in current case. My concern is that klp_try_complete_transition() checks all processes under read_lock(&tasklist_lock). It might create some contention on this lock. I am not sure if this lock is fair. It might slow down block writers (creating/deleting tasks). Best Regards, Petr
On Fri, May 13, 2022 at 02:33:34PM +0200, Petr Mladek wrote: > My concern is that klp_try_complete_transition() checks all processes > under read_lock(&tasklist_lock). It might create some contention > on this lock. I am not sure if this lock is fair. It might slow down > block writers (creating/deleting tasks). rwlock_t is sorta fair. Is it fair for process contect usage, but in-interrupt use will have a reader bias. That said; contention on tasklist_lock is a real scalability concern. Can't you use RCU here? With RCU traversal of the tasklist you can miss new entry (which should already run the new code) or miss exits (which will stop running code).
On Wed, May 11, 2022 at 04:33:57PM +0000, Song Liu wrote: > >> Ideally we'd have the ORC unwinder for all arches, that would make this > >> much easier. But we're not there yet. > > > > The alternative solution is that the process has to migrate itself > > on some safe location. > > > > One crazy idea. It still might be possible to find the called > > functions on the stack even when it is not reliable. Then it > > might be possible to add another ftrace handler on > > these found functions. This other ftrace handler might migrate > > the task when it calls this function again. > > > > It assumes that the task will call the same functions again > > and again. Also it might require that the tasks checks its > > own stack from the ftrace handler. I am not sure if this > > is possible. > > > > There might be other variants of this approach. > > This might be the ultimate solution! As ftrace allows filtering based > on pid (/sys/kernel/tracing/set_ftrace_pid), we can technically trigger > klp_try_switch_task() on every function of the pending tasks. If this > works, we should finish most of the transition in seconds. And the only > failure there would be threads with being patched function at the very > sottom of its stack. Am I too optimistic here? It's a crazy idea, but I kind of like it ;-) Especially this variant of tracing all functions for the task. We'd have to make sure unwinding from an ftrace handler works for all arches/unwinders. -- Josh
On Sat, May 07, 2022 at 10:46:28AM -0700, Song Liu wrote: > Busy kernel threads may block the transition of livepatch. Call > klp_try_switch_task from __cond_resched to make the transition easier. What will a PREEMPT=y kernel do? How is it not a problem there, and if it is, this will not help that. That is; I don't think this can be right.
> On May 9, 2022, at 12:04 AM, Peter Zijlstra <peterz@infradead.org> wrote: > > On Sat, May 07, 2022 at 10:46:28AM -0700, Song Liu wrote: >> Busy kernel threads may block the transition of livepatch. Call >> klp_try_switch_task from __cond_resched to make the transition easier. > > What will a PREEMPT=y kernel do? How is it not a problem there, and if > it is, this will not help that. > > That is; I don't think this can be right. I guess on PREEMPT=y kernel, we can simply preempt the long running kernel thread and check the transition? In this case (PREEMPT=n), we see a long running kernel thread could not finish the transition. It calls cond_resched() and gets rescheduled (moves among different cores). However, it never finishes the transition, because live patch doesn’t get a chance to check the stack. Does this answer the question? Thanks, Song
On Mon, May 09, 2022 at 08:06:22AM +0000, Song Liu wrote: > > > > On May 9, 2022, at 12:04 AM, Peter Zijlstra <peterz@infradead.org> wrote: > > > > On Sat, May 07, 2022 at 10:46:28AM -0700, Song Liu wrote: > >> Busy kernel threads may block the transition of livepatch. Call > >> klp_try_switch_task from __cond_resched to make the transition easier. > > > > What will a PREEMPT=y kernel do? How is it not a problem there, and if > > it is, this will not help that. > > > > That is; I don't think this can be right. > > I guess on PREEMPT=y kernel, we can simply preempt the long running > kernel thread and check the transition? This is not a guessing game. > In this case (PREEMPT=n), we see a long running kernel thread could not > finish the transition. It calls cond_resched() and gets rescheduled > (moves among different cores). However, it never finishes the transition, > because live patch doesn’t get a chance to check the stack. > > Does this answer the question? Not really. There is no difference between an explicit preemption point (cond_resched) or an involuntary preemption point (PREEMPT=y). So unless you can *exactly* say why it isn't a problem on PREEMPT=y, none of this makes any sense.
On Mon, 2022-05-09 at 11:38 +0200, Peter Zijlstra wrote: > On Mon, May 09, 2022 at 08:06:22AM +0000, Song Liu wrote: > > > > > > > On May 9, 2022, at 12:04 AM, Peter Zijlstra > > > <peterz@infradead.org> wrote: > > > > > > On Sat, May 07, 2022 at 10:46:28AM -0700, Song Liu wrote: > > > > Busy kernel threads may block the transition of livepatch. Call > > > > klp_try_switch_task from __cond_resched to make the transition > > > > easier. > > > > > > What will a PREEMPT=y kernel do? How is it not a problem there, > > > and if > > > it is, this will not help that. > > Not really. There is no difference between an explicit preemption > point > (cond_resched) or an involuntary preemption point (PREEMPT=y). > > So unless you can *exactly* say why it isn't a problem on PREEMPT=y, > none of this makes any sense. I suspect it is a problem on PREEMPT=y too, but is there some sort of fairly light weight (in terms of stuff we need to add to the kernel) solution that could solve both? Do we have some real time per-CPU kernel threads we could just issue a NOOP call to, which would preempt long-running kernel threads (like a kworker with oodles of work to do)? Could the stopper workqueue be a suitable tool for this job?
On Mon 2022-05-09 14:13:17, Rik van Riel wrote: > On Mon, 2022-05-09 at 11:38 +0200, Peter Zijlstra wrote: > > On Mon, May 09, 2022 at 08:06:22AM +0000, Song Liu wrote: > > > > > > > > > > On May 9, 2022, at 12:04 AM, Peter Zijlstra > > > > <peterz@infradead.org> wrote: > > > > > > > > On Sat, May 07, 2022 at 10:46:28AM -0700, Song Liu wrote: > > > > > Busy kernel threads may block the transition of livepatch. Call > > > > > klp_try_switch_task from __cond_resched to make the transition > > > > > easier. > > > > > > > > What will a PREEMPT=y kernel do? How is it not a problem there, > > > > and if > > > > it is, this will not help that. > > > > Not really. There is no difference between an explicit preemption > > point > > (cond_resched) or an involuntary preemption point (PREEMPT=y). > > > > So unless you can *exactly* say why it isn't a problem on PREEMPT=y, > > none of this makes any sense. > > I suspect it is a problem on PREEMPT=y too, but is there some sort > of fairly light weight (in terms of stuff we need to add to the kernel) > solution that could solve both? > > Do we have some real time per-CPU kernel threads we could just > issue a NOOP call to, which would preempt long-running kernel > threads (like a kworker with oodles of work to do)? > > Could the stopper workqueue be a suitable tool for this job? An interesting solution would be to queue irq_work in CPU that is occupied by the long-running kernel task. It might be queued from klp_try_complete_transition() that is called from the regular klp_transition_work_fn(). Then the task might try to migrate itself from the irq_work. But the problem is that stack_trace_save_tsk_reliable() probably will not be able to store a reliable backtrace for the interrupted task. So, we might really need to stop the task (CPU). But there still might be problem if stack_trace_save_tsk_reliable() will consider the stack as reliable. Best Regards, Petr
On Sat, 2022-05-07 at 10:46 -0700, Song Liu wrote:
> Busy kernel threads may block the transition of livepatch. Call
> klp_try_switch_task from __cond_resched to make the transition
> easier.
>
That seems like a useful idea given what we're seeing on
some systems, but I do have a nitpick with your patch :)
> +++ b/kernel/sched/core.c
> @@ -6990,6 +6990,9 @@ SYSCALL_DEFINE0(sched_yield)
> #if !defined(CONFIG_PREEMPTION) || defined(CONFIG_PREEMPT_DYNAMIC)
> int __sched __cond_resched(void)
> {
> + if (unlikely(klp_patch_pending(current)))
> + klp_try_switch_task(current);
> +
> if (should_resched(0)) {
> preempt_schedule_common();
> return 1;
While should_resched and klp_patch_pending check the same
cache line (task->flags), now there are two separate
conditionals on this.
Would it make sense to fold the tests for TIF_NEED_RESCHED
and TIF_PATCH_PENDING int should_resched(), and then re-do
the test for TIF_PATCH_PENDING only if should_resched()
returns true?
> On May 7, 2022, at 11:26 AM, Rik van Riel <riel@fb.com> wrote:
>
> On Sat, 2022-05-07 at 10:46 -0700, Song Liu wrote:
>> Busy kernel threads may block the transition of livepatch. Call
>> klp_try_switch_task from __cond_resched to make the transition
>> easier.
>>
> That seems like a useful idea given what we're seeing on
> some systems, but I do have a nitpick with your patch :)
>
>> +++ b/kernel/sched/core.c
>> @@ -6990,6 +6990,9 @@ SYSCALL_DEFINE0(sched_yield)
>> #if !defined(CONFIG_PREEMPTION) || defined(CONFIG_PREEMPT_DYNAMIC)
>> int __sched __cond_resched(void)
>> {
>> + if (unlikely(klp_patch_pending(current)))
>> + klp_try_switch_task(current);
>> +
>> if (should_resched(0)) {
>> preempt_schedule_common();
>> return 1;
>
> While should_resched and klp_patch_pending check the same
> cache line (task->flags), now there are two separate
> conditionals on this.
>
> Would it make sense to fold the tests for TIF_NEED_RESCHED
> and TIF_PATCH_PENDING int should_resched(), and then re-do
> the test for TIF_PATCH_PENDING only if should_resched()
> returns true?
x86 has a different version of should_resched(), so I am not
quite sure what’s the right way o modify shhould_resched().
OTOH, we can probably see should_resched() as-is and just
move klp_patch_pending, like
int __sched __cond_resched(void)
{
if (should_resched(0)) {
if (unlikely(klp_patch_pending(current)))
klp_try_switch_task(current);
preempt_schedule_common();
return 1;
}
#ifndef CONFIG_PREEMPT_RCU
rcu_all_qs();
#endif
return 0;
}
Given live patch user space usually waits for many seconds,
I guess this should work?
Thanks,
Song
On Sat, 2022-05-07 at 19:04 +0000, Song Liu wrote:
> > On May 7, 2022, at 11:26 AM, Rik van Riel <riel@fb.com> wrote:
> >
> > On Sat, 2022-05-07 at 10:46 -0700, Song Liu wrote:
> > > Busy kernel threads may block the transition of livepatch. Call
> > > klp_try_switch_task from __cond_resched to make the transition
> > > easier.
> > >
> > That seems like a useful idea given what we're seeing on
> > some systems, but I do have a nitpick with your patch :)
> >
> > > +++ b/kernel/sched/core.c
> > > @@ -6990,6 +6990,9 @@ SYSCALL_DEFINE0(sched_yield)
> > > #if !defined(CONFIG_PREEMPTION) ||
> > > defined(CONFIG_PREEMPT_DYNAMIC)
> > > int __sched __cond_resched(void)
> > > {
> > > + if (unlikely(klp_patch_pending(current)))
> > > + klp_try_switch_task(current);
> > > +
> > > if (should_resched(0)) {
> > > preempt_schedule_common();
> > > return 1;
> >
> > While should_resched and klp_patch_pending check the same
> > cache line (task->flags), now there are two separate
> > conditionals on this.
> >
> > Would it make sense to fold the tests for TIF_NEED_RESCHED
> > and TIF_PATCH_PENDING int should_resched(), and then re-do
> > the test for TIF_PATCH_PENDING only if should_resched()
> > returns true?
>
> x86 has a different version of should_resched(),
Huh, I just looked at that, and the x86 should_resched()
only seems to check that we _can_ resched, not whether
we should...
/*
* Returns true when we need to resched and can (barring IRQ state).
*/
static __always_inline bool should_resched(int preempt_offset)
{
return unlikely(raw_cpu_read_4(__preempt_count) ==
preempt_offset);
}
I wonder if that was intended, and why, or whether
the x86 should_resched should also be checking for
TIF_NEED_RESCHED?
If the latter, the check for TIF_PATCH_PENDING could
just be merged there, too. Probably in the same macro
called from both places.
> so I am not
> quite sure what’s the right way o modify shhould_resched().
> OTOH, we can probably see should_resched() as-is and just
> move klp_patch_pending, like
>
> int __sched __cond_resched(void)
> {
> if (should_resched(0)) {
> if (unlikely(klp_patch_pending(current)))
> klp_try_switch_task(current);
>
> preempt_schedule_common();
> return 1;
> }
> #ifndef CONFIG_PREEMPT_RCU
> rcu_all_qs();
> #endif
> return 0;
> }
>
> Given live patch user space usually waits for many seconds,
> I guess this should work?
That should certainly work on x86, where should_resched
seems to always return true when we can reschedule?
On Sat, May 07, 2022 at 07:18:51PM +0000, Rik van Riel wrote:
> Huh, I just looked at that, and the x86 should_resched()
> only seems to check that we _can_ resched, not whether
> we should...
>
>
> /*
> * Returns true when we need to resched and can (barring IRQ state).
> */
> static __always_inline bool should_resched(int preempt_offset)
> {
> return unlikely(raw_cpu_read_4(__preempt_count) ==
> preempt_offset);
> }
>
> I wonder if that was intended, and why, or whether
> the x86 should_resched should also be checking for
> TIF_NEED_RESCHED?
No, it does what you think it should do, you're just getting confused by
the inverted PREEMPT_NEED_RESCHED bit :-)
On Sun, 2022-05-08 at 22:41 +0200, Peter Zijlstra wrote: > > No, it does what you think it should do, you're just getting confused > by > the inverted PREEMPT_NEED_RESCHED bit :-) Fair enough, that makes sense! Thanks for pointing that out. That is a very clever optimization. I suppose that should_resched() check is also something that KLP could use by periodically poking tasks that get stuck in the KLP transition (and are running) by simply calling set_preempt_need_resched(), after which that klp_patch_pending() check can happen only in the __cond_resched() path where we already call preempt_schedule_common() anyway? Is that something that would work, and be low enough overhead in the general case to be acceptable?
After talking with Peter, this seems like it might be a potential approach
to fix the issue for kernels both with PREEMPT enabled and disabled.
If this looks like a reasonable approach to people, we can run experiments
with this patch on a few thousand systems, and compare it with the kernel
live patch transition latencies (and number of failures) on a kernel without
that patch.
Does this look like an approach that could work?
---8<---
sched,livepatch: call stop_one_cpu in klp_check_and_switch_task
If a running task fails to transition to the new kernel live patch after the
first attempt, use the stopper thread to preempt it during subsequent attempts
at switching to the new kernel live patch.
<INSERT EXPERIMENTAL RESULTS HERE>
Signed-off-by: Rik van Riel <riel@surriel.com>
diff --git a/kernel/livepatch/transition.c b/kernel/livepatch/transition.c
index 5d03a2ad1066..26e9e5f09822 100644
--- a/kernel/livepatch/transition.c
+++ b/kernel/livepatch/transition.c
@@ -9,6 +9,7 @@
#include <linux/cpu.h>
#include <linux/stacktrace.h>
+#include <linux/stop_machine.h>
#include "core.h"
#include "patch.h"
#include "transition.h"
@@ -281,6 +282,11 @@ static int klp_check_and_switch_task(struct task_struct *task, void *arg)
return 0;
}
+static int kpatch_dummy_fn(void *dummy)
+{
+ return 0;
+}
+
/*
* Try to safely switch a task to the target patch state. If it's currently
* running, or it's sleeping on a to-be-patched or to-be-unpatched function, or
@@ -315,6 +321,9 @@ static bool klp_try_switch_task(struct task_struct *task)
case -EBUSY: /* klp_check_and_switch_task() */
pr_debug("%s: %s:%d is running\n",
__func__, task->comm, task->pid);
+ /* Preempt the task from the second KLP switch attempt. */
+ if (klp_signals_cnt)
+ stop_one_cpu(task_cpu(task), kpatch_dummy_fn, NULL);
break;
case -EINVAL: /* klp_check_and_switch_task() */
pr_debug("%s: %s:%d has an unreliable stack\n",
On Mon, May 09, 2022 at 11:52:27AM -0400, Rik van Riel wrote:
> Does this look like an approach that could work?
>
> ---8<---
> sched,livepatch: call stop_one_cpu in klp_check_and_switch_task
>
> If a running task fails to transition to the new kernel live patch after the
> first attempt, use the stopper thread to preempt it during subsequent attempts
> at switching to the new kernel live patch.
>
> <INSERT EXPERIMENTAL RESULTS HERE>
It would be helpful to add more info about the original problem being
solved, and how this is supposed to fix it.
> +static int kpatch_dummy_fn(void *dummy)
s/kpatch/klp/
> +{
> + return 0;
> +}
> +
> /*
> * Try to safely switch a task to the target patch state. If it's currently
> * running, or it's sleeping on a to-be-patched or to-be-unpatched function, or
> @@ -315,6 +321,9 @@ static bool klp_try_switch_task(struct task_struct *task)
> case -EBUSY: /* klp_check_and_switch_task() */
> pr_debug("%s: %s:%d is running\n",
> __func__, task->comm, task->pid);
> + /* Preempt the task from the second KLP switch attempt. */
> + if (klp_signals_cnt)
> + stop_one_cpu(task_cpu(task), kpatch_dummy_fn, NULL);
I must be missing something, how is briefly preempting a kthread
supposed to actually transition it? Won't it likely go back to running
on the CPU before the next periodic klp_transition_work_fn() check?
--
Josh
On Mon, 2022-05-09 at 11:00 -0700, Josh Poimboeuf wrote:
> On Mon, May 09, 2022 at 11:52:27AM -0400, Rik van Riel wrote:
> > Does this look like an approach that could work?
> >
> > @@ -315,6 +321,9 @@ static bool klp_try_switch_task(struct
> > task_struct *task)
> > case -EBUSY: /* klp_check_and_switch_task() */
> > pr_debug("%s: %s:%d is running\n",
> > __func__, task->comm, task->pid);
> > + /* Preempt the task from the second KLP switch
> > attempt. */
> > + if (klp_signals_cnt)
> > + stop_one_cpu(task_cpu(task),
> > kpatch_dummy_fn, NULL);
>
> I must be missing something, how is briefly preempting a kthread
> supposed to actually transition it? Won't it likely go back to
> running
> on the CPU before the next periodic klp_transition_work_fn() check?
>
That's the kind of feedback I was hoping for ;)
I looked around the code a little bit, and it seems
that only the idle tasks can transition to another KLP
while they are running?
That makes me wonder how the kworker thread that runs
the klp switching code transitions itself...
Should kernel threads that can use a lot of CPU have
something in their outer loop to transition KLPs,
just like the idle task does?
--
All Rights Reversed.
On Mon, May 09, 2022 at 03:10:16PM -0400, Rik van Riel wrote:
> On Mon, 2022-05-09 at 11:00 -0700, Josh Poimboeuf wrote:
> > On Mon, May 09, 2022 at 11:52:27AM -0400, Rik van Riel wrote:
> > > Does this look like an approach that could work?
> > >
> > > @@ -315,6 +321,9 @@ static bool klp_try_switch_task(struct
> > > task_struct *task)
> > > case -EBUSY: /* klp_check_and_switch_task() */
> > > pr_debug("%s: %s:%d is running\n",
> > > __func__, task->comm, task->pid);
> > > + /* Preempt the task from the second KLP switch
> > > attempt. */
> > > + if (klp_signals_cnt)
> > > + stop_one_cpu(task_cpu(task),
> > > kpatch_dummy_fn, NULL);
> >
> > I must be missing something, how is briefly preempting a kthread
> > supposed to actually transition it? Won't it likely go back to
> > running
> > on the CPU before the next periodic klp_transition_work_fn() check?
> >
> That's the kind of feedback I was hoping for ;)
>
> I looked around the code a little bit, and it seems
> that only the idle tasks can transition to another KLP
> while they are running?
Yes.
> That makes me wonder how the kworker thread that runs
> the klp switching code transitions itself...
See klp_check_and_switch_task(), in addition to checking blocked tasks,
it also checks the current task.
> Should kernel threads that can use a lot of CPU have
> something in their outer loop to transition KLPs,
> just like the idle task does?
Maybe - I suppose this is the first time we've had an issue with
CPU-bound kthreads. I didn't know that was a thing ;-)
--
Josh
On Mon, 2022-05-09 at 12:17 -0700, Josh Poimboeuf wrote: > On Mon, May 09, 2022 at 03:10:16PM -0400, Rik van Riel wrote: > > > > Should kernel threads that can use a lot of CPU have > > something in their outer loop to transition KLPs, > > just like the idle task does? > > Maybe - I suppose this is the first time we've had an issue with > CPU-bound kthreads. I didn't know that was a thing ;-) > Kworkers have as much work as you want them to do, and with things like btrfs compression that can be quite a bit. -- All Rights Reversed.
On Mon, May 09, 2022 at 03:49:52PM -0400, Rik van Riel wrote: > On Mon, 2022-05-09 at 12:17 -0700, Josh Poimboeuf wrote: > > On Mon, May 09, 2022 at 03:10:16PM -0400, Rik van Riel wrote: > > > > > > > Should kernel threads that can use a lot of CPU have > > > something in their outer loop to transition KLPs, > > > just like the idle task does? > > > > Maybe - I suppose this is the first time we've had an issue with > > CPU-bound kthreads. I didn't know that was a thing ;-) > > > Kworkers have as much work as you want them to do, and with > things like btrfs compression that can be quite a bit. To prevent patching, it would need to be some kind of sustained CPU activity, rather than a burst. I guess we haven't seen that show up as a real-world problem until now. If you're able to identify which kthreads would be problematic, then yeah, defining a "transition point" in their outer loops could be an option. We could look also at a more general approach, like stack checking from an irq handler. But as Petr alluded to, that would be problematic for CONFIG_FRAME_POINTER. We could maybe deprecate frame pointers on x86 for live patching, but I think other arches would have a similar problem unless they were to do something like the ORC unwinder. -- Josh
On Mon, 2022-05-09 at 13:09 -0700, Josh Poimboeuf wrote: > To prevent patching, it would need to be some kind of sustained CPU > activity, rather than a burst. I guess we haven't seen that show up > as > a real-world problem until now. > It's amazing what you see when you have a few million very busy servers. The problems you think of as "one in a million" happen all the time :) > If you're able to identify which kthreads would be problematic, then > yeah, defining a "transition point" in their outer loops could be an > option. > I'm in the middle of creating some scripts to gather kpatch output from all the systems, and then sort and count the results so we can easily see what the main pain points are. We'll be back with patches once we have the data in hand :) -- All Rights Reversed.
> On May 9, 2022, at 1:09 PM, Josh Poimboeuf <jpoimboe@kernel.org> wrote: > > On Mon, May 09, 2022 at 03:49:52PM -0400, Rik van Riel wrote: >> On Mon, 2022-05-09 at 12:17 -0700, Josh Poimboeuf wrote: >>> On Mon, May 09, 2022 at 03:10:16PM -0400, Rik van Riel wrote: >>> >>> >>>> Should kernel threads that can use a lot of CPU have >>>> something in their outer loop to transition KLPs, >>>> just like the idle task does? >>> >>> Maybe - I suppose this is the first time we've had an issue with >>> CPU-bound kthreads. I didn't know that was a thing ;-) >>> >> Kworkers have as much work as you want them to do, and with >> things like btrfs compression that can be quite a bit. > > To prevent patching, it would need to be some kind of sustained CPU > activity, rather than a burst. I guess we haven't seen that show up as > a real-world problem until now. Yes, we see this issue with sustained CPU activity from a kernel thread, which might be a bug itself. OTOH, the kernel thread does call cond_sched(), so it is not a deadlock. > > If you're able to identify which kthreads would be problematic, then > yeah, defining a "transition point" in their outer loops could be an > option. cond_sched() feels like a natural “transition point” to me, and it should solve many different variations of such problem. I agree that adding something to cond_sched() might be too much overhead for the system. So we are open for other suggestions. Thanks, Song > > We could look also at a more general approach, like stack checking from > an irq handler. But as Petr alluded to, that would be problematic for > CONFIG_FRAME_POINTER. > > We could maybe deprecate frame pointers on x86 for live patching, but I > think other arches would have a similar problem unless they were to do > something like the ORC unwinder.
On Tue, May 10, 2022 at 12:32:02AM +0000, Song Liu wrote: > cond_sched() feels like a natural “transition point” to me, and it You have to think of cond_resched() as a NOP.
> On May 9, 2022, at 8:52 AM, Rik van Riel <riel@surriel.com> wrote:
>
> After talking with Peter, this seems like it might be a potential approach
> to fix the issue for kernels both with PREEMPT enabled and disabled.
>
> If this looks like a reasonable approach to people, we can run experiments
> with this patch on a few thousand systems, and compare it with the kernel
> live patch transition latencies (and number of failures) on a kernel without
> that patch.
>
> Does this look like an approach that could work?
Hi Petr,
IIUC, this is similar to you proposal. Could you please share your thoughts
on this?
Thanks,
Song
>
> ---8<---
> sched,livepatch: call stop_one_cpu in klp_check_and_switch_task
>
> If a running task fails to transition to the new kernel live patch after the
> first attempt, use the stopper thread to preempt it during subsequent attempts
> at switching to the new kernel live patch.
>
> <INSERT EXPERIMENTAL RESULTS HERE>
>
> Signed-off-by: Rik van Riel <riel@surriel.com>
>
> diff --git a/kernel/livepatch/transition.c b/kernel/livepatch/transition.c
> index 5d03a2ad1066..26e9e5f09822 100644
> --- a/kernel/livepatch/transition.c
> +++ b/kernel/livepatch/transition.c
> @@ -9,6 +9,7 @@
>
> #include <linux/cpu.h>
> #include <linux/stacktrace.h>
> +#include <linux/stop_machine.h>
> #include "core.h"
> #include "patch.h"
> #include "transition.h"
> @@ -281,6 +282,11 @@ static int klp_check_and_switch_task(struct task_struct *task, void *arg)
> return 0;
> }
>
> +static int kpatch_dummy_fn(void *dummy)
> +{
> + return 0;
> +}
> +
> /*
> * Try to safely switch a task to the target patch state. If it's currently
> * running, or it's sleeping on a to-be-patched or to-be-unpatched function, or
> @@ -315,6 +321,9 @@ static bool klp_try_switch_task(struct task_struct *task)
> case -EBUSY: /* klp_check_and_switch_task() */
> pr_debug("%s: %s:%d is running\n",
> __func__, task->comm, task->pid);
> + /* Preempt the task from the second KLP switch attempt. */
> + if (klp_signals_cnt)
> + stop_one_cpu(task_cpu(task), kpatch_dummy_fn, NULL);
> break;
> case -EINVAL: /* klp_check_and_switch_task() */
> pr_debug("%s: %s:%d has an unreliable stack\n",
>
© 2016 - 2026 Red Hat, Inc.