[PATCH] x86/xen: Fix lazy mmu handling across context switch

Juergen Gross posted 1 patch 3 weeks, 1 day ago
Failed in applying to current master (apply log)
There is a newer version of this series
arch/x86/xen/enlighten_pv.c |  7 ++-----
include/linux/pgtable.h     | 33 ++++++++++++++++++++++++---------
2 files changed, 26 insertions(+), 14 deletions(-)
[PATCH] x86/xen: Fix lazy mmu handling across context switch
Posted by Juergen Gross 3 weeks, 1 day ago
The recent rework of lazy mmu mode has resulted in problems when
running as a Xen PV guest. Enabling lazy mmu mode for the new context
during a context switch is done from the arch_end_context_switch() hook,
but when this hook is called, current hasn't been changed yet, so the
lazy mmu mode state of the wrong task is modified.

Additionally it is much cleaner to use lazy_mmu_mode_pause() and
lazy_mmu_mode_resume() in the Xen context switch hooks, as it avoids
conditionals in those hooks.

To avoid having to add another hook that is called after switching
current, modify lazy_mmu_mode_resume() to use a new sub-function which
takes a task pointer as a parameter. This new sub-function can then be
used in the xen_end_context_switch() hook.

Fixes: 291b3abed657 ("x86/xen: use lazy_mmu_state when context-switching")
Signed-off-by: Juergen Gross <jgross@suse.com>
---
 arch/x86/xen/enlighten_pv.c |  7 ++-----
 include/linux/pgtable.h     | 33 ++++++++++++++++++++++++---------
 2 files changed, 26 insertions(+), 14 deletions(-)

diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c
index ed2d7a3756ce..67bb6bf6d240 100644
--- a/arch/x86/xen/enlighten_pv.c
+++ b/arch/x86/xen/enlighten_pv.c
@@ -424,9 +424,7 @@ static void xen_start_context_switch(struct task_struct *prev)
 {
 	BUG_ON(preemptible());
 
-	if (this_cpu_read(xen_lazy_mode) == XEN_LAZY_MMU) {
-		arch_leave_lazy_mmu_mode();
-	}
+	lazy_mmu_mode_pause();
 	enter_lazy(XEN_LAZY_CPU);
 }
 
@@ -436,8 +434,7 @@ static void xen_end_context_switch(struct task_struct *next)
 
 	xen_mc_flush();
 	leave_lazy(XEN_LAZY_CPU);
-	if (__task_lazy_mmu_mode_active(next))
-		arch_enter_lazy_mmu_mode();
+	lazy_mmu_mode_resume_task(next);
 }
 
 static unsigned long xen_store_tr(void)
diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
index cdd68ed3ae1a..83a099bf2038 100644
--- a/include/linux/pgtable.h
+++ b/include/linux/pgtable.h
@@ -326,6 +326,28 @@ static inline void lazy_mmu_mode_pause(void)
 		arch_leave_lazy_mmu_mode();
 }
 
+/**
+ * lazy_mmu_mode_resume_task() - Resume the lazy MMU mode for a specific task.
+ *
+ * Like lazy_mmu_mode_resume() below, but with a task specified.
+ * Must be called only by lazy_mmu_mode_resume() or during context switch.
+ * Must never be called in interrupt context.
+ *
+ * Must match a call to lazy_mmu_mode_pause().
+ *
+ * Has no effect if called:
+ * - While paused (inside another pause()/resume() pair)
+ */
+static inline void lazy_mmu_mode_resume_task(struct task_struct *task)
+{
+	struct lazy_mmu_state *state = &task->lazy_mmu_state;
+
+	VM_WARN_ON_ONCE(state->pause_count == 0);
+
+	if (--state->pause_count == 0 && state->enable_count > 0)
+		arch_enter_lazy_mmu_mode();
+}
+
 /**
  * lazy_mmu_mode_resume() - Resume the lazy MMU mode.
  *
@@ -341,15 +363,8 @@ static inline void lazy_mmu_mode_pause(void)
  */
 static inline void lazy_mmu_mode_resume(void)
 {
-	struct lazy_mmu_state *state = &current->lazy_mmu_state;
-
-	if (in_interrupt())
-		return;
-
-	VM_WARN_ON_ONCE(state->pause_count == 0);
-
-	if (--state->pause_count == 0 && state->enable_count > 0)
-		arch_enter_lazy_mmu_mode();
+	if (!in_interrupt())
+		lazy_mmu_mode_resume_task(current);
 }
 #else
 static inline void lazy_mmu_mode_enable(void) {}
-- 
2.54.0
Re: [PATCH] x86/xen: Fix lazy mmu handling across context switch
Posted by Jürgen Groß 3 weeks, 1 day ago
Please disregard this patch. It isn't fixing the real problem.

On 08.05.26 10:05, Juergen Gross wrote:
> The recent rework of mmu lazy mode has resulted in problems when
> running as a Xen PV guest. Enabling lazy mmu mode for the new context
> during context switch is done from the arch_end_context_switch() hook,
> but when calling this hook current hasn't been changed yet, so the
> lazy mmu mode state of the wrong task is modified.
> 
> Additionally it is much cleaner to use lazy_mmu_mode_pause() and
> lazy_mmu_mode_resume() in the Xen context switch hooks, as it avoids
> conditionals in those hooks.
> 
> In order not having to add another hook to be called after switching
> current, modify lazy_mmu_mode_resume() to use a new sub-function which
> takes a task pointer as parameter. This new sub-function can then be
> used in the xen_end_context_switch() hook.
> 
> Fixes: 291b3abed657 ("x86/xen: use lazy_mmu_state when context-switching")
> Signed-off-by: Juergen Gross <jgross@suse.com>
> ---
>   arch/x86/xen/enlighten_pv.c |  7 ++-----
>   include/linux/pgtable.h     | 33 ++++++++++++++++++++++++---------
>   2 files changed, 26 insertions(+), 14 deletions(-)
> 
> diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c
> index ed2d7a3756ce..67bb6bf6d240 100644
> --- a/arch/x86/xen/enlighten_pv.c
> +++ b/arch/x86/xen/enlighten_pv.c
> @@ -424,9 +424,7 @@ static void xen_start_context_switch(struct task_struct *prev)
>   {
>   	BUG_ON(preemptible());
>   
> -	if (this_cpu_read(xen_lazy_mode) == XEN_LAZY_MMU) {
> -		arch_leave_lazy_mmu_mode();
> -	}
> +	lazy_mmu_mode_pause();
>   	enter_lazy(XEN_LAZY_CPU);
>   }
>   
> @@ -436,8 +434,7 @@ static void xen_end_context_switch(struct task_struct *next)
>   
>   	xen_mc_flush();
>   	leave_lazy(XEN_LAZY_CPU);
> -	if (__task_lazy_mmu_mode_active(next))
> -		arch_enter_lazy_mmu_mode();
> +	lazy_mmu_mode_resume_task(next);
>   }
>   
>   static unsigned long xen_store_tr(void)
> diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
> index cdd68ed3ae1a..83a099bf2038 100644
> --- a/include/linux/pgtable.h
> +++ b/include/linux/pgtable.h
> @@ -326,6 +326,28 @@ static inline void lazy_mmu_mode_pause(void)
>   		arch_leave_lazy_mmu_mode();
>   }
>   
> +/**
> + * lazy_mmu_mode_resume_task() - Resume the lazy MMU mode for a specific task.
> + *
> + * Like lazy_mmu_mode_resume() below, but with a task specified.
> + * Must be called only by lazy_mmu_mode_resume() or during context switch.
> + * Must never be called in interrupt context.
> + *
> + * Must match a call to lazy_mmu_mode_pause().
> + *
> + * Has no effect if called:
> + * - While paused (inside another pause()/resume() pair)
> + */
> +static inline void lazy_mmu_mode_resume_task(struct task_struct *task)
> +{
> +	struct lazy_mmu_state *state = &task->lazy_mmu_state;
> +
> +	VM_WARN_ON_ONCE(state->pause_count == 0);
> +
> +	if (--state->pause_count == 0 && state->enable_count > 0)
> +		arch_enter_lazy_mmu_mode();
> +}
> +
>   /**
>    * lazy_mmu_mode_resume() - Resume the lazy MMU mode.
>    *
> @@ -341,15 +363,8 @@ static inline void lazy_mmu_mode_pause(void)
>    */
>   static inline void lazy_mmu_mode_resume(void)
>   {
> -	struct lazy_mmu_state *state = &current->lazy_mmu_state;
> -
> -	if (in_interrupt())
> -		return;
> -
> -	VM_WARN_ON_ONCE(state->pause_count == 0);
> -
> -	if (--state->pause_count == 0 && state->enable_count > 0)
> -		arch_enter_lazy_mmu_mode();
> +	if (!in_interrupt())
> +		lazy_mmu_mode_resume_task(current);
>   }
>   #else
>   static inline void lazy_mmu_mode_enable(void) {}

Re: [PATCH] x86/xen: Fix lazy mmu handling across context switch
Posted by Kevin Brodsky 3 weeks, 1 day ago
On 08/05/2026 10:33, Jürgen Groß wrote:
> Please disregard this patch. It isn't fixing the real problem.

That's what I would expect, see below.

>
> On 08.05.26 10:05, Juergen Gross wrote:
>> The recent rework of mmu lazy mode has resulted in problems when
>> running as a Xen PV guest. Enabling lazy mmu mode for the new context
>> during context switch is done from the arch_end_context_switch() hook,
>> but when calling this hook current hasn't been changed yet, so the
>> lazy mmu mode state of the wrong task is modified.

Currently xen_end_context_switch() checks if next has lazy MMU mode
enabled and if so calls arch_enter_lazy_mmu_mode(), i.e.
enter_lazy(XEN_LAZY_MMU). This does *not* modify any task state, rather
it writes to the xen_lazy_mode percpu variable.

I've thought about this from various angles when reworking lazy MMU, and
the conclusion I reached is that arch_{start,end}_context_switch() have no
reason to change any task state. On arm64, for instance, we do nothing
at all on context switching, since everything lazy MMU-related is
tracked in task_struct and therefore already switched.

Xen is trickier because it tracks lazy MMU/CPU state in a percpu
variable, so these hooks do need to do something about it. This is
entirely Xen-internal though, and there's no reason to be calling
generic functions like lazy_mmu_mode_pause() that modify task state.

The idea behind commit 291b3abed657 ("x86/xen: use lazy_mmu_state when
context-switching") is that TIF_LAZY_MMU_UPDATES now duplicates
lazy_mmu_state in task_struct and we can therefore replace the former
with the latter. More specifically, the assumption is that
TIF_LAZY_MMU_UPDATES is set if and only if the task has been scheduled
out and __task_lazy_mmu_mode_active(task) is true.

Clearly there is something wrong with this assumption, but I still can't
put my finger on it. For now I would suggest reverting this commit if
that solves the issue Marek reported; the intention was not to introduce
any functional change, but only a (minor) optimisation.

- Kevin

>>
>> Additionally it is much cleaner to use lazy_mmu_mode_pause() and
>> lazy_mmu_mode_resume() in the Xen context switch hooks, as it avoids
>> conditionals in those hooks.
>>
>> In order not having to add another hook to be called after switching
>> current, modify lazy_mmu_mode_resume() to use a new sub-function which
>> takes a task pointer as parameter. This new sub-function can then be
>> used in the xen_end_context_switch() hook.
>>
>> Fixes: 291b3abed657 ("x86/xen: use lazy_mmu_state when
>> context-switching")
>> Signed-off-by: Juergen Gross <jgross@suse.com>
>> ---
>>   arch/x86/xen/enlighten_pv.c |  7 ++-----
>>   include/linux/pgtable.h     | 33 ++++++++++++++++++++++++---------
>>   2 files changed, 26 insertions(+), 14 deletions(-)
>>
>> diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c
>> index ed2d7a3756ce..67bb6bf6d240 100644
>> --- a/arch/x86/xen/enlighten_pv.c
>> +++ b/arch/x86/xen/enlighten_pv.c
>> @@ -424,9 +424,7 @@ static void xen_start_context_switch(struct
>> task_struct *prev)
>>   {
>>       BUG_ON(preemptible());
>>   -    if (this_cpu_read(xen_lazy_mode) == XEN_LAZY_MMU) {
>> -        arch_leave_lazy_mmu_mode();
>> -    }
>> +    lazy_mmu_mode_pause();
>>       enter_lazy(XEN_LAZY_CPU);
>>   }
>>   @@ -436,8 +434,7 @@ static void xen_end_context_switch(struct
>> task_struct *next)
>>         xen_mc_flush();
>>       leave_lazy(XEN_LAZY_CPU);
>> -    if (__task_lazy_mmu_mode_active(next))
>> -        arch_enter_lazy_mmu_mode();
>> +    lazy_mmu_mode_resume_task(next);
>>   }
>>     static unsigned long xen_store_tr(void)
>> diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
>> index cdd68ed3ae1a..83a099bf2038 100644
>> --- a/include/linux/pgtable.h
>> +++ b/include/linux/pgtable.h
>> @@ -326,6 +326,28 @@ static inline void lazy_mmu_mode_pause(void)
>>           arch_leave_lazy_mmu_mode();
>>   }
>>   +/**
>> + * lazy_mmu_mode_resume_task() - Resume the lazy MMU mode for a
>> specific task.
>> + *
>> + * Like lazy_mmu_mode_resume() below, but with a task specified.
>> + * Must be called only by lazy_mmu_mode_resume() or during context
>> switch.
>> + * Must never be called in interrupt context.
>> + *
>> + * Must match a call to lazy_mmu_mode_pause().
>> + *
>> + * Has no effect if called:
>> + * - While paused (inside another pause()/resume() pair)
>> + */
>> +static inline void lazy_mmu_mode_resume_task(struct task_struct *task)
>> +{
>> +    struct lazy_mmu_state *state = &task->lazy_mmu_state;
>> +
>> +    VM_WARN_ON_ONCE(state->pause_count == 0);
>> +
>> +    if (--state->pause_count == 0 && state->enable_count > 0)
>> +        arch_enter_lazy_mmu_mode();
>> +}
>> +
>>   /**
>>    * lazy_mmu_mode_resume() - Resume the lazy MMU mode.
>>    *
>> @@ -341,15 +363,8 @@ static inline void lazy_mmu_mode_pause(void)
>>    */
>>   static inline void lazy_mmu_mode_resume(void)
>>   {
>> -    struct lazy_mmu_state *state = &current->lazy_mmu_state;
>> -
>> -    if (in_interrupt())
>> -        return;
>> -
>> -    VM_WARN_ON_ONCE(state->pause_count == 0);
>> -
>> -    if (--state->pause_count == 0 && state->enable_count > 0)
>> -        arch_enter_lazy_mmu_mode();
>> +    if (!in_interrupt())
>> +        lazy_mmu_mode_resume_task(current);
>>   }
>>   #else
>>   static inline void lazy_mmu_mode_enable(void) {}
>
Re: [PATCH] x86/xen: Fix lazy mmu handling across context switch
Posted by Kevin Brodsky 3 weeks, 1 day ago
+ Marek (the wrong address was in Cc)

On 08/05/2026 11:08, Kevin Brodsky wrote:
> On 08/05/2026 10:33, Jürgen Groß wrote:
>> Please disregard this patch. It isn't fixing the real problem.
> That's what I would expect, see below.
>
>> On 08.05.26 10:05, Juergen Gross wrote:
>>> The recent rework of mmu lazy mode has resulted in problems when
>>> running as a Xen PV guest. Enabling lazy mmu mode for the new context
>>> during context switch is done from the arch_end_context_switch() hook,
>>> but when calling this hook current hasn't been changed yet, so the
>>> lazy mmu mode state of the wrong task is modified.
> Currently xen_end_context_switch() checks if next has lazy MMU mode
> enabled and if so calls arch_enter_lazy_mmu_mode(), i.e.
> enter_lazy(XEN_LAZY_MMU). This does *not* modify any task state, rather
> it writes to the xen_lazy_mode percpu variable.
>
> I've thought about this from various angles when reworking lazy MMU, and
> the conclusion I made is that arch_{start,end}_context_switch() have no
> reason to change any task state. On arm64, for instance, we do nothing
> at all on context switching, since everything lazy MMU-related is
> tracked in task_struct and therefore already switched.
>
> Xen is trickier because it tracks lazy MMU/CPU state in a percpu
> variable, so these hooks do need to do something about it. This is
> entirely Xen-internal though, and there's no reason to be calling
> generic functions like lazy_mmu_mode_pause() that modify task state.
>
> The idea behind commit 291b3abed657 ("x86/xen: use lazy_mmu_state when
> context-switching") is that TIF_LAZY_MMU_UPDATES now duplicates
> lazy_mmu_state in task_struct and we can therefore replace the former
> with the latter. More specifically, the assumption is that
> TIF_LAZY_MMU_UPDATES is set if and only if the task has been scheduled
> out and __task_lazy_mmu_mode_active(task) is true.
>
> Clearly there is something wrong with this assumption, but I still can't
> put my finger on it. For now I would suggest reverting this commit if
> that solves the issue Marek reported; the intention was not to introduce
> any functional change, but only a (minor) optimisation.
>
> - Kevin
>
>>> Additionally it is much cleaner to use lazy_mmu_mode_pause() and
>>> lazy_mmu_mode_resume() in the Xen context switch hooks, as it avoids
>>> conditionals in those hooks.
>>>
>>> In order not having to add another hook to be called after switching
>>> current, modify lazy_mmu_mode_resume() to use a new sub-function which
>>> takes a task pointer as parameter. This new sub-function can then be
>>> used in the xen_end_context_switch() hook.
>>>
>>> Fixes: 291b3abed657 ("x86/xen: use lazy_mmu_state when
>>> context-switching")
>>> Signed-off-by: Juergen Gross <jgross@suse.com>
>>> ---
>>>   arch/x86/xen/enlighten_pv.c |  7 ++-----
>>>   include/linux/pgtable.h     | 33 ++++++++++++++++++++++++---------
>>>   2 files changed, 26 insertions(+), 14 deletions(-)
>>>
>>> diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c
>>> index ed2d7a3756ce..67bb6bf6d240 100644
>>> --- a/arch/x86/xen/enlighten_pv.c
>>> +++ b/arch/x86/xen/enlighten_pv.c
>>> @@ -424,9 +424,7 @@ static void xen_start_context_switch(struct
>>> task_struct *prev)
>>>   {
>>>       BUG_ON(preemptible());
>>>   -    if (this_cpu_read(xen_lazy_mode) == XEN_LAZY_MMU) {
>>> -        arch_leave_lazy_mmu_mode();
>>> -    }
>>> +    lazy_mmu_mode_pause();
>>>       enter_lazy(XEN_LAZY_CPU);
>>>   }
>>>   @@ -436,8 +434,7 @@ static void xen_end_context_switch(struct
>>> task_struct *next)
>>>         xen_mc_flush();
>>>       leave_lazy(XEN_LAZY_CPU);
>>> -    if (__task_lazy_mmu_mode_active(next))
>>> -        arch_enter_lazy_mmu_mode();
>>> +    lazy_mmu_mode_resume_task(next);
>>>   }
>>>     static unsigned long xen_store_tr(void)
>>> diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
>>> index cdd68ed3ae1a..83a099bf2038 100644
>>> --- a/include/linux/pgtable.h
>>> +++ b/include/linux/pgtable.h
>>> @@ -326,6 +326,28 @@ static inline void lazy_mmu_mode_pause(void)
>>>           arch_leave_lazy_mmu_mode();
>>>   }
>>>   +/**
>>> + * lazy_mmu_mode_resume_task() - Resume the lazy MMU mode for a
>>> specific task.
>>> + *
>>> + * Like lazy_mmu_mode_resume() below, but with a task specified.
>>> + * Must be called only by lazy_mmu_mode_resume() or during context
>>> switch.
>>> + * Must never be called in interrupt context.
>>> + *
>>> + * Must match a call to lazy_mmu_mode_pause().
>>> + *
>>> + * Has no effect if called:
>>> + * - While paused (inside another pause()/resume() pair)
>>> + */
>>> +static inline void lazy_mmu_mode_resume_task(struct task_struct *task)
>>> +{
>>> +    struct lazy_mmu_state *state = &task->lazy_mmu_state;
>>> +
>>> +    VM_WARN_ON_ONCE(state->pause_count == 0);
>>> +
>>> +    if (--state->pause_count == 0 && state->enable_count > 0)
>>> +        arch_enter_lazy_mmu_mode();
>>> +}
>>> +
>>>   /**
>>>    * lazy_mmu_mode_resume() - Resume the lazy MMU mode.
>>>    *
>>> @@ -341,15 +363,8 @@ static inline void lazy_mmu_mode_pause(void)
>>>    */
>>>   static inline void lazy_mmu_mode_resume(void)
>>>   {
>>> -    struct lazy_mmu_state *state = &current->lazy_mmu_state;
>>> -
>>> -    if (in_interrupt())
>>> -        return;
>>> -
>>> -    VM_WARN_ON_ONCE(state->pause_count == 0);
>>> -
>>> -    if (--state->pause_count == 0 && state->enable_count > 0)
>>> -        arch_enter_lazy_mmu_mode();
>>> +    if (!in_interrupt())
>>> +        lazy_mmu_mode_resume_task(current);
>>>   }
>>>   #else
>>>   static inline void lazy_mmu_mode_enable(void) {}