Commit 49147beb0ccb ("x86/xen: allow nesting of same lazy mode")
originally introduced support for nested lazy sections (LAZY_MMU and
LAZY_CPU). It was later reverted by commit c36549ff8d84, as its
implementation turned out not to be preemption-safe.
Now that the lazy_mmu API allows enter() to pass through a state to
the matching leave() call, we can support nesting again for the
LAZY_MMU mode in a preemption-safe manner. If xen_enter_lazy_mmu() is
called inside an active lazy_mmu section, xen_lazy_mode will already
be set to XEN_LAZY_MMU and we can then return LAZY_MMU_NESTED to
instruct the matching xen_leave_lazy_mmu() call to leave
xen_lazy_mode unchanged.
The only effect of this patch is to ensure that xen_lazy_mode
remains set to XEN_LAZY_MMU until the outermost lazy_mmu section
ends. xen_leave_lazy_mmu() still calls xen_mc_flush()
unconditionally.
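
For illustration, a nested section on Xen PV now behaves as follows
(usage sketch only, not part of the patch; LAZY_MMU_DEFAULT and
LAZY_MMU_NESTED come from the generic lazy_mmu API introduced earlier
in this series):

  lazy_mmu_state_t outer, inner;

  outer = arch_enter_lazy_mmu_mode(); /* enter_lazy(XEN_LAZY_MMU),
                                         returns LAZY_MMU_DEFAULT */
  ...
  inner = arch_enter_lazy_mmu_mode(); /* already XEN_LAZY_MMU,
                                         returns LAZY_MMU_NESTED */
  ...
  arch_leave_lazy_mmu_mode(inner);    /* xen_mc_flush(); xen_lazy_mode
                                         left unchanged */
  ...
  arch_leave_lazy_mmu_mode(outer);    /* xen_mc_flush(), then
                                         leave_lazy(XEN_LAZY_MMU) */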
Signed-off-by: Kevin Brodsky <kevin.brodsky@arm.com>
---
arch/x86/include/asm/paravirt.h | 6 ++----
arch/x86/include/asm/paravirt_types.h | 4 ++--
arch/x86/xen/mmu_pv.c | 11 ++++++++---
3 files changed, 12 insertions(+), 9 deletions(-)
diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index 65a0d394fba1..4ecd3a6b1dea 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -529,14 +529,12 @@ static inline void arch_end_context_switch(struct task_struct *next)
#define __HAVE_ARCH_ENTER_LAZY_MMU_MODE
static inline lazy_mmu_state_t arch_enter_lazy_mmu_mode(void)
{
- PVOP_VCALL0(mmu.lazy_mode.enter);
-
- return LAZY_MMU_DEFAULT;
+ return PVOP_CALL0(lazy_mmu_state_t, mmu.lazy_mode.enter);
}
static inline void arch_leave_lazy_mmu_mode(lazy_mmu_state_t state)
{
- PVOP_VCALL0(mmu.lazy_mode.leave);
+ PVOP_VCALL1(mmu.lazy_mode.leave, state);
}
static inline void arch_flush_lazy_mmu_mode(void)
diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h
index bc1af86868a3..b7c567ccbf32 100644
--- a/arch/x86/include/asm/paravirt_types.h
+++ b/arch/x86/include/asm/paravirt_types.h
@@ -45,8 +45,8 @@ typedef int lazy_mmu_state_t;
struct pv_lazy_ops {
/* Set deferred update mode, used for batching operations. */
- void (*enter)(void);
- void (*leave)(void);
+ lazy_mmu_state_t (*enter)(void);
+ void (*leave)(lazy_mmu_state_t);
void (*flush)(void);
} __no_randomize_layout;
#endif
diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c
index 2039d5132ca3..6e5390ff06a5 100644
--- a/arch/x86/xen/mmu_pv.c
+++ b/arch/x86/xen/mmu_pv.c
@@ -2130,9 +2130,13 @@ static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot)
#endif
}
-static void xen_enter_lazy_mmu(void)
+static lazy_mmu_state_t xen_enter_lazy_mmu(void)
{
+ if (this_cpu_read(xen_lazy_mode) == XEN_LAZY_MMU)
+ return LAZY_MMU_NESTED;
+
enter_lazy(XEN_LAZY_MMU);
+ return LAZY_MMU_DEFAULT;
}
static void xen_flush_lazy_mmu(void)
@@ -2167,11 +2171,12 @@ static void __init xen_post_allocator_init(void)
pv_ops.mmu.write_cr3 = &xen_write_cr3;
}
-static void xen_leave_lazy_mmu(void)
+static void xen_leave_lazy_mmu(lazy_mmu_state_t state)
{
preempt_disable();
xen_mc_flush();
- leave_lazy(XEN_LAZY_MMU);
+ if (state != LAZY_MMU_NESTED)
+ leave_lazy(XEN_LAZY_MMU);
preempt_enable();
}
--
2.47.0
On 08.09.25 09:39, Kevin Brodsky wrote:
> Commit 49147beb0ccb ("x86/xen: allow nesting of same lazy mode")
> [...]
Reviewed-by: Juergen Gross <jgross@suse.com>
Juergen
On 08.09.25 09:39, Kevin Brodsky wrote:
> [...]
>
> -static void xen_enter_lazy_mmu(void)
> +static lazy_mmu_state_t xen_enter_lazy_mmu(void)
> {
> + if (this_cpu_read(xen_lazy_mode) == XEN_LAZY_MMU)
> + return LAZY_MMU_NESTED;
> +
You mention "preemption-safe manner" above, so I am wondering: what if
we get preempted immediately after doing the this_cpu_read() and get
scheduled on another CPU?
--
Cheers
David / dhildenb
On 09.09.25 11:13, David Hildenbrand wrote:
> On 08.09.25 09:39, Kevin Brodsky wrote:
>> [...]
>> -static void xen_enter_lazy_mmu(void)
>> +static lazy_mmu_state_t xen_enter_lazy_mmu(void)
>> {
>> + if (this_cpu_read(xen_lazy_mode) == XEN_LAZY_MMU)
>> + return LAZY_MMU_NESTED;
>> +
>
> You mention "preemption-safe manner" above, so I am wondering: what if
> we get preempted immediately after doing the this_cpu_read() and get
> scheduled on another CPU?
>
This should still be correct: preemption requires a context switch, so
xen_start_context_switch() and xen_end_context_switch() are involved.
They deal with this problem by doing the right thing in both the old
and the new context.
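
For reference, those hooks look roughly like this (paraphrased from
arch/x86/xen/enlighten_pv.c as it stands before this series; treat it
as a sketch of the mechanism, not the exact source):

static void xen_start_context_switch(struct task_struct *prev)
{
	BUG_ON(preemptible());

	if (this_cpu_read(xen_lazy_mode) == XEN_LAZY_MMU) {
		/* Old task: flush and leave lazy MMU mode, remembering
		 * that it was active so it can be re-entered below. */
		arch_leave_lazy_mmu_mode();
		set_ti_thread_flag(task_thread_info(prev),
				   TIF_LAZY_MMU_UPDATES);
	}
	enter_lazy(XEN_LAZY_CPU);
}

static void xen_end_context_switch(struct task_struct *next)
{
	xen_mc_flush();
	leave_lazy(XEN_LAZY_CPU);
	if (test_and_clear_ti_thread_flag(task_thread_info(next),
					  TIF_LAZY_MMU_UPDATES))
		/* New task was in lazy MMU mode when scheduled out:
		 * re-enter it on the CPU it now runs on. */
		arch_enter_lazy_mmu_mode();
}

So a task preempted inside a lazy_mmu section leaves the mode on the
old CPU and re-enters it on whichever CPU it resumes on, which keeps
the per-CPU xen_lazy_mode consistent with the currently running task.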
Juergen
On 09.09.25 11:37, Jürgen Groß wrote:
> On 09.09.25 11:13, David Hildenbrand wrote:
>> On 08.09.25 09:39, Kevin Brodsky wrote:
>>> [...]
>>
>> You mention "preemption-safe manner" above, so I am wondering: what if
>> we get preempted immediately after doing the this_cpu_read() and get
>> scheduled on another CPU?
>>
>
> This should still be correct: preemption requires a context switch, so
> xen_start_context_switch() and xen_end_context_switch() are involved.
> They deal with this problem by doing the right thing in both the old
> and the new context.
Thanks, that makes sense. It would be valuable to add that detail to
the patch description.
--
Cheers
David / dhildenb
On 09/09/2025 11:56, David Hildenbrand wrote:
> On 09.09.25 11:37, Jürgen Groß wrote:
>> On 09.09.25 11:13, David Hildenbrand wrote:
>>> On 08.09.25 09:39, Kevin Brodsky wrote:
>>>> [...]
>>>
>>> You mention "preemption-safe manner" above, so I am wondering: what
>>> if we get preempted immediately after doing the this_cpu_read() and
>>> get scheduled on another CPU?
>>>
>>
>> This should still be correct: preemption requires a context switch, so
>> xen_start_context_switch() and xen_end_context_switch() are involved.
>> They deal with this problem by doing the right thing in both the old
>> and the new context.
>
> Thanks, that makes sense. It would be valuable to add that detail to
> the patch description.
That's a fair point; Alexander was also wondering about this in v1 (and
so was I when I worked on this patch). Will clarify in v3.
- Kevin