Encourage the compiler to inline batch PTE logic and resolve constant
branches by adding __always_inline strategically.
Signed-off-by: Pedro Falcato <pfalcato@suse.de>
---
mm/mprotect.c | 10 +++++-----
1 file changed, 5 insertions(+), 5 deletions(-)
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 9681f055b9fc..1bd0d4aa07c2 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -103,7 +103,7 @@ bool can_change_pte_writable(struct vm_area_struct *vma, unsigned long addr,
return can_change_shared_pte_writable(vma, pte);
}
-static int mprotect_folio_pte_batch(struct folio *folio, pte_t *ptep,
+static __always_inline int mprotect_folio_pte_batch(struct folio *folio, pte_t *ptep,
pte_t pte, int max_nr_ptes, fpb_t flags)
{
/* No underlying folio, so cannot batch */
@@ -117,9 +117,9 @@ static int mprotect_folio_pte_batch(struct folio *folio, pte_t *ptep,
}
/* Set nr_ptes number of ptes, starting from idx */
-static void prot_commit_flush_ptes(struct vm_area_struct *vma, unsigned long addr,
- pte_t *ptep, pte_t oldpte, pte_t ptent, int nr_ptes,
- int idx, bool set_write, struct mmu_gather *tlb)
+static __always_inline void prot_commit_flush_ptes(struct vm_area_struct *vma,
+ unsigned long addr, pte_t *ptep, pte_t oldpte, pte_t ptent,
+ int nr_ptes, int idx, bool set_write, struct mmu_gather *tlb)
{
/*
* Advance the position in the batch by idx; note that if idx > 0,
@@ -169,7 +169,7 @@ static int page_anon_exclusive_sub_batch(int start_idx, int max_len,
* pte of the batch. Therefore, we must individually check all pages and
* retrieve sub-batches.
*/
-static void commit_anon_folio_batch(struct vm_area_struct *vma,
+static __always_inline void commit_anon_folio_batch(struct vm_area_struct *vma,
struct folio *folio, struct page *first_page, unsigned long addr, pte_t *ptep,
pte_t oldpte, pte_t ptent, int nr_ptes, struct mmu_gather *tlb)
{
--
2.53.0
On 3/19/26 19:31, Pedro Falcato wrote:
> Encourage the compiler to inline batch PTE logic and resolve constant
> branches by adding __always_inline strategically.
>
> Signed-off-by: Pedro Falcato <pfalcato@suse.de>
> ---
> mm/mprotect.c | 10 +++++-----
> 1 file changed, 5 insertions(+), 5 deletions(-)
>
> diff --git a/mm/mprotect.c b/mm/mprotect.c
> index 9681f055b9fc..1bd0d4aa07c2 100644
> --- a/mm/mprotect.c
> +++ b/mm/mprotect.c
> @@ -103,7 +103,7 @@ bool can_change_pte_writable(struct vm_area_struct *vma, unsigned long addr,
> return can_change_shared_pte_writable(vma, pte);
> }
>
> -static int mprotect_folio_pte_batch(struct folio *folio, pte_t *ptep,
> +static __always_inline int mprotect_folio_pte_batch(struct folio *folio, pte_t *ptep,
> pte_t pte, int max_nr_ptes, fpb_t flags)
> {
> /* No underlying folio, so cannot batch */
> @@ -117,9 +117,9 @@ static int mprotect_folio_pte_batch(struct folio *folio, pte_t *ptep,
> }
>
> /* Set nr_ptes number of ptes, starting from idx */
> -static void prot_commit_flush_ptes(struct vm_area_struct *vma, unsigned long addr,
> - pte_t *ptep, pte_t oldpte, pte_t ptent, int nr_ptes,
> - int idx, bool set_write, struct mmu_gather *tlb)
> +static __always_inline void prot_commit_flush_ptes(struct vm_area_struct *vma,
> + unsigned long addr, pte_t *ptep, pte_t oldpte, pte_t ptent,
> + int nr_ptes, int idx, bool set_write, struct mmu_gather *tlb)
> {
> /*
> * Advance the position in the batch by idx; note that if idx > 0,
> @@ -169,7 +169,7 @@ static int page_anon_exclusive_sub_batch(int start_idx, int max_len,
> * pte of the batch. Therefore, we must individually check all pages and
> * retrieve sub-batches.
> */
> -static void commit_anon_folio_batch(struct vm_area_struct *vma,
> +static __always_inline void commit_anon_folio_batch(struct vm_area_struct *vma,
> struct folio *folio, struct page *first_page, unsigned long addr, pte_t *ptep,
> pte_t oldpte, pte_t ptent, int nr_ptes, struct mmu_gather *tlb)
> {
From my micro-optimization work on zapping and fork, I learned that
these batching functions are best optimized for the order-0 page case by
explicitly calling them from the code with "nr_ptes == 1" and then
force-inlining them. nr_ptes and all loops will essentially be optimized
out.
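A minimal standalone sketch of that pattern (illustrative names only, not the
actual kernel helpers): a force-inlined helper called with a literal nr == 1
lets the compiler drop the loop entirely at that call site.

/*
 * Standalone sketch, not kernel code: with nr known to be 1 at compile
 * time, an __always_inline helper's loop can fold away at that call site.
 */
#include <stdio.h>

#define __always_inline	inline __attribute__((__always_inline__))

static __always_inline void set_bits_batched(unsigned long *vals, int nr,
					     unsigned long bit)
{
	int i;

	for (i = 0; i < nr; i++)	/* can fold away when nr is a constant 1 */
		vals[i] |= bit;
}

int main(void)
{
	unsigned long vals[4] = { 1, 2, 3, 4 };

	set_bits_batched(vals, 1, 0x8);	/* constant nr == 1 */
	set_bits_batched(vals, 4, 0x8);	/* larger batch */
	printf("%lu %lu\n", vals[0], vals[3]);
	return 0;
}

Built with -O2, the first call should reduce to a single or-and-store, while
the second keeps (or unrolls) the loop.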
With no such explicit constants, is there a real benefit to be
had here?
--
Cheers,
David
On Thu, Mar 19, 2026 at 10:28:47PM +0100, David Hildenbrand (Arm) wrote:
> On 3/19/26 19:31, Pedro Falcato wrote:
> > Encourage the compiler to inline batch PTE logic and resolve constant
> > branches by adding __always_inline strategically.
> >
> > Signed-off-by: Pedro Falcato <pfalcato@suse.de>
> > ---
> > mm/mprotect.c | 10 +++++-----
> > 1 file changed, 5 insertions(+), 5 deletions(-)
> >
> > diff --git a/mm/mprotect.c b/mm/mprotect.c
> > index 9681f055b9fc..1bd0d4aa07c2 100644
> > --- a/mm/mprotect.c
> > +++ b/mm/mprotect.c
> > @@ -103,7 +103,7 @@ bool can_change_pte_writable(struct vm_area_struct *vma, unsigned long addr,
> > return can_change_shared_pte_writable(vma, pte);
> > }
> >
> > -static int mprotect_folio_pte_batch(struct folio *folio, pte_t *ptep,
> > +static __always_inline int mprotect_folio_pte_batch(struct folio *folio, pte_t *ptep,
> > pte_t pte, int max_nr_ptes, fpb_t flags)
> > {
> > /* No underlying folio, so cannot batch */
> > @@ -117,9 +117,9 @@ static int mprotect_folio_pte_batch(struct folio *folio, pte_t *ptep,
> > }
> >
> > /* Set nr_ptes number of ptes, starting from idx */
> > -static void prot_commit_flush_ptes(struct vm_area_struct *vma, unsigned long addr,
> > - pte_t *ptep, pte_t oldpte, pte_t ptent, int nr_ptes,
> > - int idx, bool set_write, struct mmu_gather *tlb)
> > +static __always_inline void prot_commit_flush_ptes(struct vm_area_struct *vma,
> > + unsigned long addr, pte_t *ptep, pte_t oldpte, pte_t ptent,
> > + int nr_ptes, int idx, bool set_write, struct mmu_gather *tlb)
> > {
> > /*
> > * Advance the position in the batch by idx; note that if idx > 0,
> > @@ -169,7 +169,7 @@ static int page_anon_exclusive_sub_batch(int start_idx, int max_len,
> > * pte of the batch. Therefore, we must individually check all pages and
> > * retrieve sub-batches.
> > */
> > -static void commit_anon_folio_batch(struct vm_area_struct *vma,
> > +static __always_inline void commit_anon_folio_batch(struct vm_area_struct *vma,
> > struct folio *folio, struct page *first_page, unsigned long addr, pte_t *ptep,
> > pte_t oldpte, pte_t ptent, int nr_ptes, struct mmu_gather *tlb)
> > {
>
> From my micro-optimization work on zapping and fork, I learned that
> these batching functions are best optimized for order-0 page by
> explicitly calling them from the code with "nr_ptes == 1" and then
> force-inlining them. nr_ptes and all loops will essentially be optimized
> out.
>
> With no such explicit constants, is there really a real benefit to be
> had here?
Per my measurements, there is a real speedup here. Of course, things
may heavily depend on the microarchitecture you use. I want to note that
these three functions are part of the hot loop and thus we definitely want
them inlined. Particularly if we start special-casing stuff. You can cut
down _a lot_ of code if you simply tell it "yeah don't bother you're looking
at 1 pte only".
Of course, a lot of this is just codegen feng shui, but I tried sticking to
good fundamentals: inlining things that matter, while noinlining
things that aren't frequent. As-is, the compiler seems to make poor
inlining decisions on its own (and basically every static function is
inlined, except e.g. prot_commit_flush_ptes, FOR SOME REASON).
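A standalone sketch of that annotation split (names made up, not the real
mm/mprotect.c helpers): force-inline the helper on the hot per-PTE path and
keep the rare path explicitly out of line.

/*
 * Standalone sketch, not kernel code: hot helper force-inlined, cold
 * path kept out of line so the hot loop stays small.
 */
#include <stdio.h>

#define __always_inline	inline __attribute__((__always_inline__))
#define noinline	__attribute__((__noinline__))

static __always_inline int batch_len(const unsigned long *ptes, int max_nr)
{
	int nr = 1;

	/* Hot path: count consecutive entries matching the first one. */
	while (nr < max_nr && ptes[nr] == ptes[0])
		nr++;
	return nr;
}

static noinline void report_full_batch(int nr)
{
	/* Rare path: fine to pay a real call here; keeps the caller compact. */
	fprintf(stderr, "whole range is one batch of %d\n", nr);
}

int main(void)
{
	unsigned long ptes[8] = { 3, 3, 3, 7, 7, 7, 7, 7 };
	int nr = batch_len(ptes, 8);

	if (nr == 8)
		report_full_batch(nr);
	return nr == 3 ? 0 : 1;
}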
--
Pedro
On 3/20/26 10:59, Pedro Falcato wrote:
> On Thu, Mar 19, 2026 at 10:28:47PM +0100, David Hildenbrand (Arm) wrote:
>> On 3/19/26 19:31, Pedro Falcato wrote:
>>> Encourage the compiler to inline batch PTE logic and resolve constant
>>> branches by adding __always_inline strategically.
>>>
>>> Signed-off-by: Pedro Falcato <pfalcato@suse.de>
>>> ---
>>> mm/mprotect.c | 10 +++++-----
>>> 1 file changed, 5 insertions(+), 5 deletions(-)
>>>
>>> diff --git a/mm/mprotect.c b/mm/mprotect.c
>>> index 9681f055b9fc..1bd0d4aa07c2 100644
>>> --- a/mm/mprotect.c
>>> +++ b/mm/mprotect.c
>>> @@ -103,7 +103,7 @@ bool can_change_pte_writable(struct vm_area_struct *vma, unsigned long addr,
>>> return can_change_shared_pte_writable(vma, pte);
>>> }
>>>
>>> -static int mprotect_folio_pte_batch(struct folio *folio, pte_t *ptep,
>>> +static __always_inline int mprotect_folio_pte_batch(struct folio *folio, pte_t *ptep,
>>> pte_t pte, int max_nr_ptes, fpb_t flags)
>>> {
>>> /* No underlying folio, so cannot batch */
>>> @@ -117,9 +117,9 @@ static int mprotect_folio_pte_batch(struct folio *folio, pte_t *ptep,
>>> }
>>>
>>> /* Set nr_ptes number of ptes, starting from idx */
>>> -static void prot_commit_flush_ptes(struct vm_area_struct *vma, unsigned long addr,
>>> - pte_t *ptep, pte_t oldpte, pte_t ptent, int nr_ptes,
>>> - int idx, bool set_write, struct mmu_gather *tlb)
>>> +static __always_inline void prot_commit_flush_ptes(struct vm_area_struct *vma,
>>> + unsigned long addr, pte_t *ptep, pte_t oldpte, pte_t ptent,
>>> + int nr_ptes, int idx, bool set_write, struct mmu_gather *tlb)
>>> {
>>> /*
>>> * Advance the position in the batch by idx; note that if idx > 0,
>>> @@ -169,7 +169,7 @@ static int page_anon_exclusive_sub_batch(int start_idx, int max_len,
>>> * pte of the batch. Therefore, we must individually check all pages and
>>> * retrieve sub-batches.
>>> */
>>> -static void commit_anon_folio_batch(struct vm_area_struct *vma,
>>> +static __always_inline void commit_anon_folio_batch(struct vm_area_struct *vma,
>>> struct folio *folio, struct page *first_page, unsigned long addr, pte_t *ptep,
>>> pte_t oldpte, pte_t ptent, int nr_ptes, struct mmu_gather *tlb)
>>> {
>>
>> From my micro-optimization work on zapping and fork, I learned that
>> these batching functions are best optimized for order-0 page by
>> explicitly calling them from the code with "nr_ptes == 1" and then
>> force-inlining them. nr_ptes and all loops will essentially be optimized
>> out.
>>
>> With no such explicit constants, is there really a real benefit to be
>> had here?
>
> Per my measurements, I could measure a real speedup here. Of course things
> may heavily depend on the microarchitecture you use. I want to note that
> these three functions are part of the hot loop and thus we definitely want
> them inlined. Particularly if we start special-casing stuff. You can cut
> down _a lot_ of code if you simply tell it "yeah don't bother you're looking
> at 1 pte only".
That's why I think this change is a lot more valuable when squashing
patch #4.
--
Cheers,
David
On Thu, Mar 19, 2026 at 06:31:05PM +0000, Pedro Falcato wrote:
> Encourage the compiler to inline batch PTE logic and resolve constant
> branches by adding __always_inline strategically.
>
> Signed-off-by: Pedro Falcato <pfalcato@suse.de>
Does this vary by compiler/arch that much?
I wonder about how much ends up on the stack here too, given the HUGE number of
arguments passed around, but I guess you'd be pushing and popping some even if
these weren't inlined.
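For a rough sense of scale, a standalone sketch with stand-in types rather
than the real kernel structures: on the x86-64 SysV ABI the first six
integer/pointer arguments travel in registers, so a nine-argument helper like
prot_commit_flush_ptes passes the remainder on the stack at any out-of-line
call, which inlining would avoid.

/*
 * Standalone sketch with stand-in types, not the real kernel code. On
 * x86-64 (SysV ABI) args 1..6 go in rdi, rsi, rdx, rcx, r8, r9; at an
 * out-of-line call the remaining args (idx, set_write, tlb) go via the
 * stack.
 */
#include <stdbool.h>
#include <stdio.h>

struct vm_area;			/* stand-in for struct vm_area_struct */
struct gather;			/* stand-in for struct mmu_gather */
typedef unsigned long pteval_t;

static __attribute__((noinline)) void nine_arg_helper(struct vm_area *vma,
		unsigned long addr, pteval_t *ptep, pteval_t oldpte,
		pteval_t ptent, int nr_ptes, int idx, bool set_write,
		struct gather *tlb)
{
	printf("%p %#lx %p %lu %lu %d %d %d %p\n", (void *)vma, addr,
	       (void *)ptep, oldpte, ptent, nr_ptes, idx, set_write,
	       (void *)tlb);
}

int main(void)
{
	pteval_t pte = 0;

	nine_arg_helper(NULL, 0x1000, &pte, 0, 0, 1, 0, false, NULL);
	return 0;
}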
I wonder if we wouldn't want to carefully check different arches for this
though!
> ---
> mm/mprotect.c | 10 +++++-----
> 1 file changed, 5 insertions(+), 5 deletions(-)
>
> diff --git a/mm/mprotect.c b/mm/mprotect.c
> index 9681f055b9fc..1bd0d4aa07c2 100644
> --- a/mm/mprotect.c
> +++ b/mm/mprotect.c
> @@ -103,7 +103,7 @@ bool can_change_pte_writable(struct vm_area_struct *vma, unsigned long addr,
> return can_change_shared_pte_writable(vma, pte);
> }
>
> -static int mprotect_folio_pte_batch(struct folio *folio, pte_t *ptep,
> +static __always_inline int mprotect_folio_pte_batch(struct folio *folio, pte_t *ptep,
> pte_t pte, int max_nr_ptes, fpb_t flags)
> {
> /* No underlying folio, so cannot batch */
> @@ -117,9 +117,9 @@ static int mprotect_folio_pte_batch(struct folio *folio, pte_t *ptep,
> }
>
> /* Set nr_ptes number of ptes, starting from idx */
> -static void prot_commit_flush_ptes(struct vm_area_struct *vma, unsigned long addr,
> - pte_t *ptep, pte_t oldpte, pte_t ptent, int nr_ptes,
> - int idx, bool set_write, struct mmu_gather *tlb)
> +static __always_inline void prot_commit_flush_ptes(struct vm_area_struct *vma,
> + unsigned long addr, pte_t *ptep, pte_t oldpte, pte_t ptent,
> + int nr_ptes, int idx, bool set_write, struct mmu_gather *tlb)
> {
> /*
> * Advance the position in the batch by idx; note that if idx > 0,
> @@ -169,7 +169,7 @@ static int page_anon_exclusive_sub_batch(int start_idx, int max_len,
> * pte of the batch. Therefore, we must individually check all pages and
> * retrieve sub-batches.
> */
> -static void commit_anon_folio_batch(struct vm_area_struct *vma,
> +static __always_inline void commit_anon_folio_batch(struct vm_area_struct *vma,
> struct folio *folio, struct page *first_page, unsigned long addr, pte_t *ptep,
> pte_t oldpte, pte_t ptent, int nr_ptes, struct mmu_gather *tlb)
> {
> --
> 2.53.0
>
On Thu, Mar 19, 2026 at 06:59:37PM +0000, Lorenzo Stoakes (Oracle) wrote:
> On Thu, Mar 19, 2026 at 06:31:05PM +0000, Pedro Falcato wrote:
> > Encourage the compiler to inline batch PTE logic and resolve constant
> > branches by adding __always_inline strategically.
> >
> > Signed-off-by: Pedro Falcato <pfalcato@suse.de>
>
> Does this vary by compiler/arch that much?
>
> I wonder about how much ends up on the stack here too, given the HUGE number of
> arguments passed around, but I guess you'd be pushing and popping some even if
> these weren't inlined.
>
> I wonder if we wouldn't want to carefully check different arches for this
> though!
BTW I have previously seen how compilers can be suuuuper picky as to what
inlines or not based on heuristics, so I can see why doing this would move the
needle if we were sure it'd universally help/at least not cause issues.
Cheers,
Lorenzo