Implement the required arch functions to enable use of contpte in the
vmap when VM_ALLOW_HUGE_VMAP is specified. This speeds up vmap
operations, since a DSB and ISB need only be issued per contpte block
instead of per pte. It also reduces TLB pressure, since only a single
TLB entry is needed for the whole contpte block.
Since vmap uses set_huge_pte_at() to set the contpte, that API is now
used for kernel mappings for the first time. Although in the vmap case
we never expect it to be called to modify a valid mapping (so
clear_flush() should never run), it's still wise to make it robust for
the kernel case; amend the TLB flush function to handle an mm that is
for kernel space.
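
For context, the generic pte-level mapping code in mm/vmalloc.c consumes
these hooks roughly as follows (simplified sketch of vmap_pte_range(); the
exact code differs between kernel versions):

static int vmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
                          phys_addr_t phys_addr, pgprot_t prot,
                          unsigned int max_page_shift, pgtbl_mod_mask *mask)
{
        unsigned long size = PAGE_SIZE;
        pte_t *pte;
        u64 pfn;

        pfn = phys_addr >> PAGE_SHIFT;
        pte = pte_alloc_kernel_track(pmd, addr, mask);
        if (!pte)
                return -ENOMEM;
        do {
                /* Ask the arch how much of [addr, end) can be mapped in one go. */
                size = arch_vmap_pte_range_map_size(addr, end, pfn, max_page_shift);
                if (size != PAGE_SIZE) {
                        /* Install one contpte block with a single huge-pte call. */
                        pte_t entry = pfn_pte(pfn, prot);

                        entry = arch_make_huge_pte(entry, ilog2(size), 0);
                        set_huge_pte_at(&init_mm, addr, pte, entry, size);
                        pfn += PFN_DOWN(size);
                        continue;
                }
                /* Fall back to a normal single-page mapping. */
                set_pte_at(&init_mm, addr, pte, pfn_pte(pfn, prot));
                pfn++;
        } while (pte += PFN_DOWN(size), addr += size, addr != end);
        *mask |= PGTBL_PTE_MODIFIED;
        return 0;
}

When the arch reports CONT_PTE_SIZE, the whole block is installed with a
single set_huge_pte_at() call against init_mm, which is what brings that API
into play for kernel mappings.
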
Tested with vmalloc performance selftests:
# kself/mm/test_vmalloc.sh \
run_test_mask=1
test_repeat_count=5
nr_pages=256
test_loop_count=100000
use_huge=1
Duration reduced from 1274243 usec to 1083553 usec on Apple M2, a 15%
reduction in time taken.
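
As an illustrative (hypothetical) caller, only mappings created with
VM_ALLOW_HUGE_VMAP take this path, e.g. via vmalloc_huge():

        /*
         * A 128K request: with 4K base pages this can now be covered by two
         * 64K contpte blocks (two TLB entries, two DSB/ISB pairs) rather
         * than 32 independently-mapped ptes, provided the virtual address
         * and backing pages end up suitably aligned.
         */
        void *buf = vmalloc_huge(SZ_128K, GFP_KERNEL);

        if (!buf)
                return -ENOMEM;
        /* ... use buf ... */
        vfree(buf);
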
Signed-off-by: Ryan Roberts <ryan.roberts@arm.com>
---
arch/arm64/include/asm/vmalloc.h | 40 ++++++++++++++++++++++++++++++++
arch/arm64/mm/hugetlbpage.c | 5 +++-
2 files changed, 44 insertions(+), 1 deletion(-)
diff --git a/arch/arm64/include/asm/vmalloc.h b/arch/arm64/include/asm/vmalloc.h
index 38fafffe699f..fbdeb40f3857 100644
--- a/arch/arm64/include/asm/vmalloc.h
+++ b/arch/arm64/include/asm/vmalloc.h
@@ -23,6 +23,46 @@ static inline bool arch_vmap_pmd_supported(pgprot_t prot)
return !IS_ENABLED(CONFIG_PTDUMP_DEBUGFS);
}
+#define arch_vmap_pte_range_map_size arch_vmap_pte_range_map_size
+static inline unsigned long arch_vmap_pte_range_map_size(unsigned long addr,
+ unsigned long end, u64 pfn,
+ unsigned int max_page_shift)
+{
+ if (max_page_shift < CONT_PTE_SHIFT)
+ return PAGE_SIZE;
+
+ if (end - addr < CONT_PTE_SIZE)
+ return PAGE_SIZE;
+
+ if (!IS_ALIGNED(addr, CONT_PTE_SIZE))
+ return PAGE_SIZE;
+
+ if (!IS_ALIGNED(PFN_PHYS(pfn), CONT_PTE_SIZE))
+ return PAGE_SIZE;
+
+ return CONT_PTE_SIZE;
+}
+
+#define arch_vmap_pte_range_unmap_size arch_vmap_pte_range_unmap_size
+static inline unsigned long arch_vmap_pte_range_unmap_size(unsigned long addr,
+ pte_t *ptep)
+{
+ /*
+ * The caller handles alignment so it's sufficient just to check
+ * PTE_CONT.
+ */
+ return pte_valid_cont(__ptep_get(ptep)) ? CONT_PTE_SIZE : PAGE_SIZE;
+}
+
+#define arch_vmap_pte_supported_shift arch_vmap_pte_supported_shift
+static inline int arch_vmap_pte_supported_shift(unsigned long size)
+{
+ if (size >= CONT_PTE_SIZE)
+ return CONT_PTE_SHIFT;
+
+ return PAGE_SHIFT;
+}
+
#endif
#define arch_vmap_pgprot_tagged arch_vmap_pgprot_tagged
diff --git a/arch/arm64/mm/hugetlbpage.c b/arch/arm64/mm/hugetlbpage.c
index 02afee31444e..a74e43101dad 100644
--- a/arch/arm64/mm/hugetlbpage.c
+++ b/arch/arm64/mm/hugetlbpage.c
@@ -217,7 +217,10 @@ static void clear_flush(struct mm_struct *mm,
for (i = 0; i < ncontig; i++, addr += pgsize, ptep++)
___ptep_get_and_clear(mm, ptep, pgsize);
- __flush_hugetlb_tlb_range(&vma, saddr, addr, pgsize, true);
+ if (mm == &init_mm)
+ flush_tlb_kernel_range(saddr, addr);
+ else
+ __flush_hugetlb_tlb_range(&vma, saddr, addr, pgsize, true);
}
void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
--
2.43.0
On 2/5/25 20:39, Ryan Roberts wrote:
> Implement the required arch functions to enable use of contpte in the
> vmap when VM_ALLOW_HUGE_VMAP is specified. This speeds up vmap
> operations, since a DSB and ISB need only be issued per contpte block
> instead of per pte. It also reduces TLB pressure, since only a single
> TLB entry is needed for the whole contpte block.
Right.
>
> Since vmap uses set_huge_pte_at() to set the contpte, that API is now
> used for kernel mappings for the first time. Although in the vmap case
> we never expect it to be called to modify a valid mapping (so
> clear_flush() should never run), it's still wise to make it robust for
> the kernel case; amend the TLB flush function to handle an mm that is
> for kernel space.
Makes sense.
>
> Tested with vmalloc performance selftests:
>
> # kself/mm/test_vmalloc.sh \
> run_test_mask=1
> test_repeat_count=5
> nr_pages=256
> test_loop_count=100000
> use_huge=1
>
> Duration reduced from 1274243 usec to 1083553 usec on Apple M2, a 15%
> reduction in time taken.
>
> Signed-off-by: Ryan Roberts <ryan.roberts@arm.com>
> ---
> arch/arm64/include/asm/vmalloc.h | 40 ++++++++++++++++++++++++++++++++
> arch/arm64/mm/hugetlbpage.c | 5 +++-
> 2 files changed, 44 insertions(+), 1 deletion(-)
>
> diff --git a/arch/arm64/include/asm/vmalloc.h b/arch/arm64/include/asm/vmalloc.h
> index 38fafffe699f..fbdeb40f3857 100644
> --- a/arch/arm64/include/asm/vmalloc.h
> +++ b/arch/arm64/include/asm/vmalloc.h
> @@ -23,6 +23,46 @@ static inline bool arch_vmap_pmd_supported(pgprot_t prot)
> return !IS_ENABLED(CONFIG_PTDUMP_DEBUGFS);
> }
>
> +#define arch_vmap_pte_range_map_size arch_vmap_pte_range_map_size
> +static inline unsigned long arch_vmap_pte_range_map_size(unsigned long addr,
> + unsigned long end, u64 pfn,
> + unsigned int max_page_shift)
> +{
> + if (max_page_shift < CONT_PTE_SHIFT)
> + return PAGE_SIZE;
> +
> + if (end - addr < CONT_PTE_SIZE)
> + return PAGE_SIZE;
> +
> + if (!IS_ALIGNED(addr, CONT_PTE_SIZE))
> + return PAGE_SIZE;
> +
> + if (!IS_ALIGNED(PFN_PHYS(pfn), CONT_PTE_SIZE))
> + return PAGE_SIZE;
> +
> + return CONT_PTE_SIZE;
A small nit:
Should the rationale behind picking CONT_PTE_SIZE be added here as an in-code
comment or something in the function - just to make things a bit clearer.
> +}
> +
> +#define arch_vmap_pte_range_unmap_size arch_vmap_pte_range_unmap_size
> +static inline unsigned long arch_vmap_pte_range_unmap_size(unsigned long addr,
> + pte_t *ptep)
> +{
> + /*
> + * The caller handles alignment so it's sufficient just to check
> + * PTE_CONT.
> + */
> + return pte_valid_cont(__ptep_get(ptep)) ? CONT_PTE_SIZE : PAGE_SIZE;
I guess it is safe to query the CONT_PTE from the mapped entry itself.
> +}
> +
> +#define arch_vmap_pte_supported_shift arch_vmap_pte_supported_shift
> +static inline int arch_vmap_pte_supported_shift(unsigned long size)
> +{
> + if (size >= CONT_PTE_SIZE)
> + return CONT_PTE_SHIFT;
> +
> + return PAGE_SHIFT;
> +}
> +
> #endif
>
> #define arch_vmap_pgprot_tagged arch_vmap_pgprot_tagged
> diff --git a/arch/arm64/mm/hugetlbpage.c b/arch/arm64/mm/hugetlbpage.c
> index 02afee31444e..a74e43101dad 100644
> --- a/arch/arm64/mm/hugetlbpage.c
> +++ b/arch/arm64/mm/hugetlbpage.c
> @@ -217,7 +217,10 @@ static void clear_flush(struct mm_struct *mm,
> for (i = 0; i < ncontig; i++, addr += pgsize, ptep++)
> ___ptep_get_and_clear(mm, ptep, pgsize);
>
> - __flush_hugetlb_tlb_range(&vma, saddr, addr, pgsize, true);
> + if (mm == &init_mm)
> + flush_tlb_kernel_range(saddr, addr);
> + else
> + __flush_hugetlb_tlb_range(&vma, saddr, addr, pgsize, true);
> }
>
> void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
Otherwise LGTM.
Reviewed-by: Anshuman Khandual <anshuman.khandual@arm.com>
On 07/02/2025 10:04, Anshuman Khandual wrote:
>
>
> On 2/5/25 20:39, Ryan Roberts wrote:
>> Implement the required arch functions to enable use of contpte in the
>> vmap when VM_ALLOW_HUGE_VMAP is specified. This speeds up vmap
>> operations, since a DSB and ISB need only be issued per contpte block
>> instead of per pte. It also reduces TLB pressure, since only a single
>> TLB entry is needed for the whole contpte block.
>
> Right.
>
>>
>> Since vmap uses set_huge_pte_at() to set the contpte, that API is now
>> used for kernel mappings for the first time. Although in the vmap case
>> we never expect it to be called to modify a valid mapping (so
>> clear_flush() should never run), it's still wise to make it robust for
>> the kernel case; amend the TLB flush function to handle an mm that is
>> for kernel space.
>
> Makes sense.
>
>>
>> Tested with vmalloc performance selftests:
>>
>> # kself/mm/test_vmalloc.sh \
>> run_test_mask=1
>> test_repeat_count=5
>> nr_pages=256
>> test_loop_count=100000
>> use_huge=1
>>
>> Duration reduced from 1274243 usec to 1083553 usec on Apple M2, a 15%
>> reduction in time taken.
>>
>> Signed-off-by: Ryan Roberts <ryan.roberts@arm.com>
>> ---
>> arch/arm64/include/asm/vmalloc.h | 40 ++++++++++++++++++++++++++++++++
>> arch/arm64/mm/hugetlbpage.c | 5 +++-
>> 2 files changed, 44 insertions(+), 1 deletion(-)
>>
>> diff --git a/arch/arm64/include/asm/vmalloc.h b/arch/arm64/include/asm/vmalloc.h
>> index 38fafffe699f..fbdeb40f3857 100644
>> --- a/arch/arm64/include/asm/vmalloc.h
>> +++ b/arch/arm64/include/asm/vmalloc.h
>> @@ -23,6 +23,46 @@ static inline bool arch_vmap_pmd_supported(pgprot_t prot)
>> return !IS_ENABLED(CONFIG_PTDUMP_DEBUGFS);
>> }
>>
>> +#define arch_vmap_pte_range_map_size arch_vmap_pte_range_map_size
>> +static inline unsigned long arch_vmap_pte_range_map_size(unsigned long addr,
>> + unsigned long end, u64 pfn,
>> + unsigned int max_page_shift)
>> +{
>> + if (max_page_shift < CONT_PTE_SHIFT)
>> + return PAGE_SIZE;
>> +
>> + if (end - addr < CONT_PTE_SIZE)
>> + return PAGE_SIZE;
>> +
>> + if (!IS_ALIGNED(addr, CONT_PTE_SIZE))
>> + return PAGE_SIZE;
>> +
>> + if (!IS_ALIGNED(PFN_PHYS(pfn), CONT_PTE_SIZE))
>> + return PAGE_SIZE;
>> +
>> + return CONT_PTE_SIZE;
>
> A small nit:
>
> Should the rationale behind picking CONT_PTE_SIZE be added here as an in-code
> comment or something in the function - just to make things a bit clearer.
I'm not sure what other size we would pick?
>
>> +}
>> +
>> +#define arch_vmap_pte_range_unmap_size arch_vmap_pte_range_unmap_size
>> +static inline unsigned long arch_vmap_pte_range_unmap_size(unsigned long addr,
>> + pte_t *ptep)
>> +{
>> + /*
>> + * The caller handles alignment so it's sufficient just to check
>> + * PTE_CONT.
>> + */
>> + return pte_valid_cont(__ptep_get(ptep)) ? CONT_PTE_SIZE : PAGE_SIZE;
>
> I guess it is safe to query the CONT_PTE from the mapped entry itself.
Yes I don't see why not. Is there some specific aspect you're concerned about?
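
For reference, pte_valid_cont() just checks that the entry is valid and has
the contiguous hint set, roughly (see arch/arm64/include/asm/pgtable.h for
the authoritative definitions):

        #define pte_cont(pte)           (!!(pte_val(pte) & PTE_CONT))
        #define pte_valid_cont(pte)     (pte_valid(pte) && pte_cont(pte))

so arch_vmap_pte_range_unmap_size() only ever reports CONT_PTE_SIZE for an
entry that was installed as part of a contpte block.
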
>
>> +}
>> +
>> +#define arch_vmap_pte_supported_shift arch_vmap_pte_supported_shift
>> +static inline int arch_vmap_pte_supported_shift(unsigned long size)
>> +{
>> + if (size >= CONT_PTE_SIZE)
>> + return CONT_PTE_SHIFT;
>> +
>> + return PAGE_SHIFT;
>> +}
>> +
>> #endif
>>
>> #define arch_vmap_pgprot_tagged arch_vmap_pgprot_tagged
>> diff --git a/arch/arm64/mm/hugetlbpage.c b/arch/arm64/mm/hugetlbpage.c
>> index 02afee31444e..a74e43101dad 100644
>> --- a/arch/arm64/mm/hugetlbpage.c
>> +++ b/arch/arm64/mm/hugetlbpage.c
>> @@ -217,7 +217,10 @@ static void clear_flush(struct mm_struct *mm,
>> for (i = 0; i < ncontig; i++, addr += pgsize, ptep++)
>> ___ptep_get_and_clear(mm, ptep, pgsize);
>>
>> - __flush_hugetlb_tlb_range(&vma, saddr, addr, pgsize, true);
>> + if (mm == &init_mm)
>> + flush_tlb_kernel_range(saddr, addr);
>> + else
>> + __flush_hugetlb_tlb_range(&vma, saddr, addr, pgsize, true);
>> }
>>
>> void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
>
> Otherwise LGTM.
>
> Reviewed-by: Anshuman Khandual <anshuman.khandual@arm.com>
Thanks!
On 2/7/25 16:50, Ryan Roberts wrote:
> On 07/02/2025 10:04, Anshuman Khandual wrote:
>>
>>
>> On 2/5/25 20:39, Ryan Roberts wrote:
>>> Implement the required arch functions to enable use of contpte in the
>>> vmap when VM_ALLOW_HUGE_VMAP is specified. This speeds up vmap
>>> operations, since a DSB and ISB need only be issued per contpte block
>>> instead of per pte. It also reduces TLB pressure, since only a single
>>> TLB entry is needed for the whole contpte block.
>>
>> Right.
>>
>>>
>>> Since vmap uses set_huge_pte_at() to set the contpte, that API is now
>>> used for kernel mappings for the first time. Although in the vmap case
>>> we never expect it to be called to modify a valid mapping (so
>>> clear_flush() should never run), it's still wise to make it robust for
>>> the kernel case; amend the TLB flush function to handle an mm that is
>>> for kernel space.
>>
>> Makes sense.
>>
>>>
>>> Tested with vmalloc performance selftests:
>>>
>>> # kself/mm/test_vmalloc.sh \
>>> run_test_mask=1
>>> test_repeat_count=5
>>> nr_pages=256
>>> test_loop_count=100000
>>> use_huge=1
>>>
>>> Duration reduced from 1274243 usec to 1083553 usec on Apple M2, a 15%
>>> reduction in time taken.
>>>
>>> Signed-off-by: Ryan Roberts <ryan.roberts@arm.com>
>>> ---
>>> arch/arm64/include/asm/vmalloc.h | 40 ++++++++++++++++++++++++++++++++
>>> arch/arm64/mm/hugetlbpage.c | 5 +++-
>>> 2 files changed, 44 insertions(+), 1 deletion(-)
>>>
>>> diff --git a/arch/arm64/include/asm/vmalloc.h b/arch/arm64/include/asm/vmalloc.h
>>> index 38fafffe699f..fbdeb40f3857 100644
>>> --- a/arch/arm64/include/asm/vmalloc.h
>>> +++ b/arch/arm64/include/asm/vmalloc.h
>>> @@ -23,6 +23,46 @@ static inline bool arch_vmap_pmd_supported(pgprot_t prot)
>>> return !IS_ENABLED(CONFIG_PTDUMP_DEBUGFS);
>>> }
>>>
>>> +#define arch_vmap_pte_range_map_size arch_vmap_pte_range_map_size
>>> +static inline unsigned long arch_vmap_pte_range_map_size(unsigned long addr,
>>> + unsigned long end, u64 pfn,
>>> + unsigned int max_page_shift)
>>> +{
>>> + if (max_page_shift < CONT_PTE_SHIFT)
>>> + return PAGE_SIZE;
>>> +
>>> + if (end - addr < CONT_PTE_SIZE)
>>> + return PAGE_SIZE;
>>> +
>>> + if (!IS_ALIGNED(addr, CONT_PTE_SIZE))
>>> + return PAGE_SIZE;
>>> +
>>> + if (!IS_ALIGNED(PFN_PHYS(pfn), CONT_PTE_SIZE))
>>> + return PAGE_SIZE;
>>> +
>>> + return CONT_PTE_SIZE;
>>
>> A small nit:
>>
>> Should the rationale behind picking CONT_PTE_SIZE be added here as an in-code
>> comment or something in the function - just to make things a bit clearer.
>
> I'm not sure what other size we would pick?
The suggestion was to add a small comment in the above helper function explaining
the rationale for the various conditions there when returning either PAGE_SIZE or
CONT_PTE_SIZE, to improve readability.
>
>>
>>> +}
>>> +
>>> +#define arch_vmap_pte_range_unmap_size arch_vmap_pte_range_unmap_size
>>> +static inline unsigned long arch_vmap_pte_range_unmap_size(unsigned long addr,
>>> + pte_t *ptep)
>>> +{
>>> + /*
>>> + * The caller handles alignment so it's sufficient just to check
>>> + * PTE_CONT.
>>> + */
>>> + return pte_valid_cont(__ptep_get(ptep)) ? CONT_PTE_SIZE : PAGE_SIZE;
>>
>> I guess it is safe to query the CONT_PTE from the mapped entry itself.
>
> Yes I don't see why not. Is there some specific aspect you're concerned about?
Nothing came up while following this code; it was more of a general observation.
>
>>
>>> +}
>>> +
>>> +#define arch_vmap_pte_supported_shift arch_vmap_pte_supported_shift
>>> +static inline int arch_vmap_pte_supported_shift(unsigned long size)
>>> +{
>>> + if (size >= CONT_PTE_SIZE)
>>> + return CONT_PTE_SHIFT;
>>> +
>>> + return PAGE_SHIFT;
>>> +}
>>> +
>>> #endif
>>>
>>> #define arch_vmap_pgprot_tagged arch_vmap_pgprot_tagged
>>> diff --git a/arch/arm64/mm/hugetlbpage.c b/arch/arm64/mm/hugetlbpage.c
>>> index 02afee31444e..a74e43101dad 100644
>>> --- a/arch/arm64/mm/hugetlbpage.c
>>> +++ b/arch/arm64/mm/hugetlbpage.c
>>> @@ -217,7 +217,10 @@ static void clear_flush(struct mm_struct *mm,
>>> for (i = 0; i < ncontig; i++, addr += pgsize, ptep++)
>>> ___ptep_get_and_clear(mm, ptep, pgsize);
>>>
>>> - __flush_hugetlb_tlb_range(&vma, saddr, addr, pgsize, true);
>>> + if (mm == &init_mm)
>>> + flush_tlb_kernel_range(saddr, addr);
>>> + else
>>> + __flush_hugetlb_tlb_range(&vma, saddr, addr, pgsize, true);
>>> }
>>>
>>> void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
>>
>> Otherwise LGTM.
>>
>> Reviewed-by: Anshuman Khandual <anshuman.khandual@arm.com>
>
> Thanks!
>
>
>>>> +#define arch_vmap_pte_range_map_size arch_vmap_pte_range_map_size
>>>> +static inline unsigned long arch_vmap_pte_range_map_size(unsigned long addr,
>>>> + unsigned long end, u64 pfn,
>>>> + unsigned int max_page_shift)
>>>> +{
>>>> + if (max_page_shift < CONT_PTE_SHIFT)
>>>> + return PAGE_SIZE;
>>>> +
>>>> + if (end - addr < CONT_PTE_SIZE)
>>>> + return PAGE_SIZE;
>>>> +
>>>> + if (!IS_ALIGNED(addr, CONT_PTE_SIZE))
>>>> + return PAGE_SIZE;
>>>> +
>>>> + if (!IS_ALIGNED(PFN_PHYS(pfn), CONT_PTE_SIZE))
>>>> + return PAGE_SIZE;
>>>> +
>>>> + return CONT_PTE_SIZE;
>>>
>>> A small nit:
>>>
>>> Should the rationale behind picking CONT_PTE_SIZE be added here as an in-code
>>> comment or something in the function - just to make things a bit clearer.
>>
>> I'm not sure what other size we would pick?
>
> The suggestion was to add a small comment in the above helper function explaining
> the rationale for the various conditions there when returning either PAGE_SIZE or
> CONT_PTE_SIZE, to improve readability.
OK I've added the following:
/*
* If the block is at least CONT_PTE_SIZE in size, and is naturally
* aligned in both virtual and physical space, then we can pte-map the
* block using the PTE_CONT bit for more efficient use of the TLB.
*/
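
For clarity, the helper would then look something like this (illustrative
only, showing where the comment sits in the next version):

static inline unsigned long arch_vmap_pte_range_map_size(unsigned long addr,
                                        unsigned long end, u64 pfn,
                                        unsigned int max_page_shift)
{
        /*
         * If the block is at least CONT_PTE_SIZE in size, and is naturally
         * aligned in both virtual and physical space, then we can pte-map the
         * block using the PTE_CONT bit for more efficient use of the TLB.
         */
        if (max_page_shift < CONT_PTE_SHIFT)
                return PAGE_SIZE;

        if (end - addr < CONT_PTE_SIZE)
                return PAGE_SIZE;

        if (!IS_ALIGNED(addr, CONT_PTE_SIZE))
                return PAGE_SIZE;

        if (!IS_ALIGNED(PFN_PHYS(pfn), CONT_PTE_SIZE))
                return PAGE_SIZE;

        return CONT_PTE_SIZE;
}
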
Thanks,
Ryan
On 2/13/25 14:39, Ryan Roberts wrote:
>
>>>>> +#define arch_vmap_pte_range_map_size arch_vmap_pte_range_map_size
>>>>> +static inline unsigned long arch_vmap_pte_range_map_size(unsigned long addr,
>>>>> + unsigned long end, u64 pfn,
>>>>> + unsigned int max_page_shift)
>>>>> +{
>>>>> + if (max_page_shift < CONT_PTE_SHIFT)
>>>>> + return PAGE_SIZE;
>>>>> +
>>>>> + if (end - addr < CONT_PTE_SIZE)
>>>>> + return PAGE_SIZE;
>>>>> +
>>>>> + if (!IS_ALIGNED(addr, CONT_PTE_SIZE))
>>>>> + return PAGE_SIZE;
>>>>> +
>>>>> + if (!IS_ALIGNED(PFN_PHYS(pfn), CONT_PTE_SIZE))
>>>>> + return PAGE_SIZE;
>>>>> +
>>>>> + return CONT_PTE_SIZE;
>>>>
>>>> A small nit:
>>>>
>>>> Should the rationale behind picking CONT_PTE_SIZE be added here as an in-code
>>>> comment or something in the function - just to make things a bit clearer.
>>>
>>> I'm not sure what other size we would pick?
>>
>> The suggestion was to add a small comment in the above helper function explaining
>> the rationale for various conditions in there while returning either PAGE_SIZE or
>> CONT_PTE_SIZE to improve readability etc.
>
> OK I've added the following:
>
> /*
> * If the block is at least CONT_PTE_SIZE in size, and is naturally
> * aligned in both virtual and physical space, then we can pte-map the
> * block using the PTE_CONT bit for more efficient use of the TLB.
> */
Sounds good, thanks!