Commit 49147beb0ccb ("x86/xen: allow nesting of same lazy mode")
originally introduced support for nested lazy sections (LAZY_MMU and
LAZY_CPU). It was later reverted by commit c36549ff8d84, as its
implementation turned out not to be preemption-safe.

Now that the lazy_mmu API allows enter() to pass through a state to
the matching leave() call, we can support nesting again for the
LAZY_MMU mode in a preemption-safe manner. If xen_enter_lazy_mmu() is
called inside an active lazy_mmu section, xen_lazy_mode will already
be set to XEN_LAZY_MMU and we can then return LAZY_MMU_NESTED to
instruct the matching xen_leave_lazy_mmu() call to leave
xen_lazy_mode unchanged.
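
To illustrate, a nested section then behaves roughly as follows at the
arch hook level (a sketch only; callers normally go through the generic
lazy_mmu helpers rather than invoking these hooks directly):

  lazy_mmu_state_t outer, inner;

  outer = arch_enter_lazy_mmu_mode();   /* enter_lazy(XEN_LAZY_MMU),
                                         * returns LAZY_MMU_DEFAULT */
  inner = arch_enter_lazy_mmu_mode();   /* xen_lazy_mode already set,
                                         * returns LAZY_MMU_NESTED */
  arch_leave_lazy_mmu_mode(inner);      /* flushes, xen_lazy_mode unchanged */
  arch_leave_lazy_mmu_mode(outer);      /* flushes, leave_lazy(XEN_LAZY_MMU) */
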
The only effect of this patch is to ensure that xen_lazy_mode
remains set to XEN_LAZY_MMU until the outermost lazy_mmu section
ends. xen_leave_lazy_mmu() still calls xen_mc_flush()
unconditionally.

Signed-off-by: Kevin Brodsky <kevin.brodsky@arm.com>
---
arch/x86/include/asm/paravirt.h | 6 ++----
arch/x86/include/asm/paravirt_types.h | 4 ++--
arch/x86/xen/mmu_pv.c | 11 ++++++++---
3 files changed, 12 insertions(+), 9 deletions(-)
diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index 65a0d394fba1..4ecd3a6b1dea 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -529,14 +529,12 @@ static inline void arch_end_context_switch(struct task_struct *next)
#define __HAVE_ARCH_ENTER_LAZY_MMU_MODE
static inline lazy_mmu_state_t arch_enter_lazy_mmu_mode(void)
{
- PVOP_VCALL0(mmu.lazy_mode.enter);
-
- return LAZY_MMU_DEFAULT;
+ return PVOP_CALL0(lazy_mmu_state_t, mmu.lazy_mode.enter);
}
static inline void arch_leave_lazy_mmu_mode(lazy_mmu_state_t state)
{
- PVOP_VCALL0(mmu.lazy_mode.leave);
+ PVOP_VCALL1(mmu.lazy_mode.leave, state);
}
static inline void arch_flush_lazy_mmu_mode(void)
diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h
index bc1af86868a3..b7c567ccbf32 100644
--- a/arch/x86/include/asm/paravirt_types.h
+++ b/arch/x86/include/asm/paravirt_types.h
@@ -45,8 +45,8 @@ typedef int lazy_mmu_state_t;
struct pv_lazy_ops {
/* Set deferred update mode, used for batching operations. */
- void (*enter)(void);
- void (*leave)(void);
+ lazy_mmu_state_t (*enter)(void);
+ void (*leave)(lazy_mmu_state_t);
void (*flush)(void);
} __no_randomize_layout;
#endif
diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c
index 2039d5132ca3..6e5390ff06a5 100644
--- a/arch/x86/xen/mmu_pv.c
+++ b/arch/x86/xen/mmu_pv.c
@@ -2130,9 +2130,13 @@ static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot)
#endif
}
-static void xen_enter_lazy_mmu(void)
+static lazy_mmu_state_t xen_enter_lazy_mmu(void)
{
+ if (this_cpu_read(xen_lazy_mode) == XEN_LAZY_MMU)
+ return LAZY_MMU_NESTED;
+
enter_lazy(XEN_LAZY_MMU);
+ return LAZY_MMU_DEFAULT;
}
static void xen_flush_lazy_mmu(void)
@@ -2167,11 +2171,12 @@ static void __init xen_post_allocator_init(void)
pv_ops.mmu.write_cr3 = &xen_write_cr3;
}
-static void xen_leave_lazy_mmu(void)
+static void xen_leave_lazy_mmu(lazy_mmu_state_t state)
{
preempt_disable();
xen_mc_flush();
- leave_lazy(XEN_LAZY_MMU);
+ if (state != LAZY_MMU_NESTED)
+ leave_lazy(XEN_LAZY_MMU);
preempt_enable();
}
--
2.47.0

On 08.09.25 09:39, Kevin Brodsky wrote:
> Commit 49147beb0ccb ("x86/xen: allow nesting of same lazy mode")
> originally introduced support for nested lazy sections (LAZY_MMU and
> LAZY_CPU). [...]
>
> Signed-off-by: Kevin Brodsky <kevin.brodsky@arm.com>

Reviewed-by: Juergen Gross <jgross@suse.com>


Juergen

On 08.09.25 09:39, Kevin Brodsky wrote:
> [...]
>
> -static void xen_enter_lazy_mmu(void)
> +static lazy_mmu_state_t xen_enter_lazy_mmu(void)
> {
> +	if (this_cpu_read(xen_lazy_mode) == XEN_LAZY_MMU)
> +		return LAZY_MMU_NESTED;
> +

You mention a "preemption-safe manner" above, so I am wondering:
what if we get preempted immediately after doing the this_cpu_read() and
get scheduled on another CPU?

--
Cheers

David / dhildenb

On 09.09.25 11:13, David Hildenbrand wrote:
> On 08.09.25 09:39, Kevin Brodsky wrote:
>> [...]
>>
>> -static void xen_enter_lazy_mmu(void)
>> +static lazy_mmu_state_t xen_enter_lazy_mmu(void)
>> {
>> +	if (this_cpu_read(xen_lazy_mode) == XEN_LAZY_MMU)
>> +		return LAZY_MMU_NESTED;
>> +
>
> You mention a "preemption-safe manner" above, so I am wondering:
> what if we get preempted immediately after doing the this_cpu_read() and
> get scheduled on another CPU?

This should still be correct: preemption needs a context switch to happen,
so xen_start_context_switch() and xen_end_context_switch() are involved.
Those are dealing with this problem by doing the right thing in the old
and the new context.


Juergen
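
[ For reference, a rough sketch of what those context-switch hooks do with
  the lazy MMU state -- function names below are made up for illustration
  and the details may differ from the actual enlighten_pv.c code: ]

  static void sketch_start_context_switch(struct task_struct *prev)
  {
          /* Runs on the old CPU with preemption disabled. */
          if (this_cpu_read(xen_lazy_mode) == XEN_LAZY_MMU) {
                  xen_mc_flush();                 /* flush pending batched ops */
                  leave_lazy(XEN_LAZY_MMU);       /* per-CPU mode back to none */
                  /* remember that 'prev' has to re-enter lazy MMU mode */
                  set_ti_thread_flag(task_thread_info(prev),
                                     TIF_LAZY_MMU_UPDATES);
          }
  }

  static void sketch_end_context_switch(struct task_struct *next)
  {
          /* Runs on whichever CPU 'next' is resumed on. */
          if (test_and_clear_ti_thread_flag(task_thread_info(next),
                                            TIF_LAZY_MMU_UPDATES))
                  enter_lazy(XEN_LAZY_MMU);       /* restore lazy MMU mode */
  }
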

On 09.09.25 11:37, Jürgen Groß wrote:
> On 09.09.25 11:13, David Hildenbrand wrote:
>> On 08.09.25 09:39, Kevin Brodsky wrote:
>>> [...]
>>
>> You mention a "preemption-safe manner" above, so I am wondering:
>> what if we get preempted immediately after doing the this_cpu_read() and
>> get scheduled on another CPU?
>
> This should still be correct: preemption needs a context switch to happen,
> so xen_start_context_switch() and xen_end_context_switch() are involved.
> Those are dealing with this problem by doing the right thing in the old
> and the new context.

Thanks, that makes sense. Would be valuable to add that detail to the
patch description.

--
Cheers

David / dhildenb

On 09/09/2025 11:56, David Hildenbrand wrote:
> On 09.09.25 11:37, Jürgen Groß wrote:
>> On 09.09.25 11:13, David Hildenbrand wrote:
>>> [...]
>>
>> This should still be correct: preemption needs a context switch to
>> happen, so xen_start_context_switch() and xen_end_context_switch() are
>> involved. Those are dealing with this problem by doing the right thing
>> in the old and the new context.
>
> Thanks, that makes sense. Would be valuable to add that detail to the
> patch description.

That's a fair point, Alexander was also wondering in v1 (and so was I
when I worked on this patch). Will clarify in v3.

- Kevin