arch/x86/include/asm/mce.h | 2 +-
arch/x86/kernel/cpu/mce/core.c | 17 +++++++++++------
arch/x86/kernel/cpu/mce/intel.c | 8 ++++++--
arch/x86/kernel/cpu/mce/internal.h | 2 +-
4 files changed, 19 insertions(+), 10 deletions(-)
From: Li RongQing <lirongqing@baidu.com>
Since commit 011d82611172 ("RAS: Add a Corrected Errors Collector"),
mce_notify_irq() in should_enable_timer() always returns false even
when an MCE event is logged, because bit 0 of mce_need_notify is not
set in mce_log. This prevents the timer interval from being properly
adjusted.
Fix this by modifying machine_check_poll() to return a boolean indicating
whether an MCE was logged, and update mc_poll_banks() to propagate this
return value. The timer interval logic in mce_timer_fn() now uses this
return value directly instead of relying on mce_notify_irq().
Fixes: 011d82611172 ("RAS: Add a Corrected Errors Collector")
Signed-off-by: Li RongQing <lirongqing@baidu.com>
---
arch/x86/include/asm/mce.h | 2 +-
arch/x86/kernel/cpu/mce/core.c | 17 +++++++++++------
arch/x86/kernel/cpu/mce/intel.c | 8 ++++++--
arch/x86/kernel/cpu/mce/internal.h | 2 +-
4 files changed, 19 insertions(+), 10 deletions(-)
diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
index 2d98886..fb9eab4 100644
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -303,7 +303,7 @@ enum mcp_flags {
MCP_QUEUE_LOG = BIT(2), /* only queue to genpool */
};
-void machine_check_poll(enum mcp_flags flags, mce_banks_t *b);
+bool machine_check_poll(enum mcp_flags flags, mce_banks_t *b);
DECLARE_PER_CPU(struct mce, injectm);
diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c
index 3444002..8d42691 100644
--- a/arch/x86/kernel/cpu/mce/core.c
+++ b/arch/x86/kernel/cpu/mce/core.c
@@ -813,10 +813,11 @@ static void clear_bank(struct mce *m)
* is already totally * confused. In this case it's likely it will
* not fully execute the machine check handler either.
*/
-void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
+bool machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
{
struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array);
struct mce_hw_err err;
+ bool logged = false;
struct mce *m;
int i;
@@ -868,6 +869,7 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
else
mce_log(&err);
+ logged = true;
clear_it:
clear_bank(m);
}
@@ -878,6 +880,8 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
*/
sync_core();
+
+ return logged;
}
EXPORT_SYMBOL_GPL(machine_check_poll);
@@ -1776,12 +1780,12 @@ static void __start_timer(struct timer_list *t, unsigned long interval)
local_irq_restore(flags);
}
-static void mc_poll_banks_default(void)
+static bool mc_poll_banks_default(void)
{
- machine_check_poll(0, this_cpu_ptr(&mce_poll_banks));
+ return machine_check_poll(0, this_cpu_ptr(&mce_poll_banks));
}
-void (*mc_poll_banks)(void) = mc_poll_banks_default;
+bool (*mc_poll_banks)(void) = mc_poll_banks_default;
static bool should_enable_timer(unsigned long iv)
{
@@ -1792,19 +1796,20 @@ static void mce_timer_fn(struct timer_list *t)
{
struct timer_list *cpu_t = this_cpu_ptr(&mce_timer);
unsigned long iv;
+ bool logged = false;
WARN_ON(cpu_t != t);
iv = __this_cpu_read(mce_next_interval);
if (mce_available(this_cpu_ptr(&cpu_info)))
- mc_poll_banks();
+ logged = mc_poll_banks();
/*
* Alert userspace if needed. If we logged an MCE, reduce the polling
* interval, otherwise increase the polling interval.
*/
- if (mce_notify_irq())
+ if (logged)
iv = max(iv / 2, (unsigned long) HZ/100);
else
iv = min(iv * 2, round_jiffies_relative(check_interval * HZ));
diff --git a/arch/x86/kernel/cpu/mce/intel.c b/arch/x86/kernel/cpu/mce/intel.c
index 4655223..a3d2730 100644
--- a/arch/x86/kernel/cpu/mce/intel.c
+++ b/arch/x86/kernel/cpu/mce/intel.c
@@ -395,11 +395,15 @@ void cmci_disable_bank(int bank)
}
/* Bank polling function when CMCI is disabled. */
-static void cmci_mc_poll_banks(void)
+static bool cmci_mc_poll_banks(void)
{
+ bool logged;
+
spin_lock(&cmci_poll_lock);
- machine_check_poll(0, this_cpu_ptr(&mce_poll_banks));
+ logged = machine_check_poll(0, this_cpu_ptr(&mce_poll_banks));
spin_unlock(&cmci_poll_lock);
+
+ return logged;
}
void intel_init_cmci(void)
diff --git a/arch/x86/kernel/cpu/mce/internal.h b/arch/x86/kernel/cpu/mce/internal.h
index a31cf98..7bf2360 100644
--- a/arch/x86/kernel/cpu/mce/internal.h
+++ b/arch/x86/kernel/cpu/mce/internal.h
@@ -348,5 +348,5 @@ static __always_inline u32 mca_msr_reg(int bank, enum mca_msr reg)
return 0;
}
-extern void (*mc_poll_banks)(void);
+extern bool (*mc_poll_banks)(void);
#endif /* __X86_MCE_INTERNAL_H__ */
--
2.9.4
On 12.01.26 г. 10:27 ч., lirongqing wrote:
> From: Li RongQing <lirongqing@baidu.com>
>
> Since commit 011d82611172 ("RAS: Add a Corrected Errors Collector"),
> mce_notify_irq() in should_enable_timer() always returns false even
should_enable_timer() doesn't call mce_notify_irq(); mce_timer_fn() does.
> when an MCE event is logged, because bit 0 of mce_need_notify is not
> set in mce_log. This prevents the timer interval from being properly
> adjusted.
>
> Fix this by modifying machine_check_poll() to return a boolean indicating
> whether an MCE was logged, and update mc_poll_banks() to propagate this
> return value. The timer interval logic in mce_timer_fn() now uses this
> return value directly instead of relying on mce_notify_irq().
This warrants a bit more explanation why it's correct. Because
mce_notify_irq is really a misnomer, it will ideally be named
mce_notify_user(). That function is called from 2 places:
1. Early notifier block, but there it's guaranteed to do the right thing
as mce_need_notify is explicitly set.
2. From the timer function, where, as you have pointed out,
mce_need_notify is never set by the polling code, hence the function is
a no-op. But actually calling mce_log() processes all logged errors in
the gen pool, and that is done by calling the x86_mce_decoder_chain,
which will DTRT w.r.t. mce_notify_irq since the early notifier will be
called from there.
<snip>
>
> On 12.01.26 г. 10:27 ч., lirongqing wrote:
> > From: Li RongQing <lirongqing@baidu.com>
> >
> > Since commit 011d82611172 ("RAS: Add a Corrected Errors Collector"),
> > mce_notify_irq() in should_enable_timer() always returns false even
>
> should_enable_timer doesn't call mce_notify_irq
>
> > when an MCE event is logged, because bit 0 of mce_need_notify is not
> > set in mce_log. This prevents the timer interval from being properly
> > adjusted.
> >
> > Fix this by modifying machine_check_poll() to return a boolean
> > indicating whether an MCE was logged, and update mc_poll_banks() to
> > propagate this return value. The timer interval logic in
> > mce_timer_fn() now uses this return value directly instead of relying on
> mce_notify_irq().
>
> This warrants a bit more explanation why it's correct. Because mce_notify_irq is
> really a misnomer, it will ideally be named mce_notify_user(). That function is
> called from 2 places:
>
> 1. Early notifier block, but there it's guaranteed to do the right thing as
> mce_need_notify is explicitly set.
>
> 2. From the timer function, where as you have pointed out mce_need_notify is
> never set by the polling code, hence the function is a noop. But actually calling
> mce_log() processes all logged errors in the gen pool and that is processed by
> calling the x86_mce_decoder_chain which will DTRT w.r.t to mce_notify_irq
> since the early notifier will be called from there.
>
> <snip>
Ok, I will add more explanation, and rename mce_notify_irq() to mce_notify_user().
Thanks
-Li
On Mon, Jan 12, 2026 at 09:36:21AM +0000, Li,Rongqing wrote:
> Ok, I will add more explanation, and rename mce_notify_irq() as
> mce_notify_user();
No, first you should explain what you're fixing here and why.
--
Regards/Gruss,
Boris.
https://people.kernel.org/tglx/notes-about-netiquette
> On Mon, Jan 12, 2026 at 09:36:21AM +0000, Li,Rongqing wrote:
> > Ok, I will add more explanation, and rename mce_notify_irq() as
> > mce_notify_user();
>
> No, first you should explain what you're fixing here and why.
>
> --
> Regards/Gruss,
> Boris.
>
How about modifying the changelog as follows:
x86/mce: Fix timer interval adjustment after logging an MCE event
Since commit 011d82611172 ("RAS: Add a Corrected Errors Collector"),
mce_timer_fn() has incorrectly determined whether to adjust the
timer interval. The issue arises because mce_notify_irq() now always
returns false when called from the timer path, since the polling code
never sets bit 0 of mce_need_notify. This prevents proper adjustment of
the timer interval based on whether MCE events were logged.
mce_notify_irq() is called from two contexts:
1. Early notifier block - mce_need_notify is explicitly set there, so it works correctly
2. Timer function - the polling code never sets mce_need_notify, making the call a no-op
(though logged errors are still processed through mce_log() ->
x86_mce_decoder_chain -> early notifier).
Fix this by modifying machine_check_poll() to return a boolean indicating
whether any MCE was logged, and updating mc_poll_banks() and related
functions to propagate this return value. Then, mce_timer_fn() can use
this direct return value instead of relying on mce_notify_irq() for
timer interval decisions.
This ensures the timer interval is correctly reduced when MCE events are
logged and increased when no events occur.
Fixes: 011d82611172 ("RAS: Add a Corrected Errors Collector")
> https://people.kernel.org/tglx/notes-about-netiquette
On Mon, Jan 12, 2026 at 10:24:11AM +0000, Li,Rongqing wrote:
> Since commit 011d82611172 ("RAS: Add a Corrected Errors Collector"),
> mce_timer_fn() has incorrectly determined whether to adjust the
> timer interval. The issue arises because mce_notify_irq() now always
> returns false when called from the timer path, since the polling code
> never sets bit 0 of mce_need_notify. This prevents proper adjustment of
> the timer interval based on whether MCE events were logged.
That's because you missed the main point of the error collector:
"The error decoding is done with the decoding chain now and
mce_first_notifier() gets to see the error first and the CEC decides
whether to log it and then the rest of the chain doesn't hear about it -
^^^^^^^^^^^^^^^^^^^^^^
basically the main reason for the CE collector - or to continue running
the notifiers."
So lemme ask you again: what are you really fixing?!
What is the actual problem you're trying to fix?
And do not send me another revised commit message.
Thx.
--
Regards/Gruss,
Boris.
https://people.kernel.org/tglx/notes-about-netiquette
© 2016 - 2026 Red Hat, Inc.