[PATCH 1/2] arm64/mm: Enable batched TLB flush in unmap_hotplug_range()

Anshuman Khandual posted 2 patches 5 days, 1 hour ago
There is a newer version of this series
Posted by Anshuman Khandual 5 days, 1 hour ago
During a memory hot remove operation, both the linear and vmemmap mappings for
the memory range being removed get unmapped via unmap_hotplug_range(), but the
mapped pages get freed only for the vmemmap mapping. This is just a sequential
operation where each table entry gets cleared, followed by a leaf specific TLB
flush, and then followed by a memory free operation when applicable.

This approach was simple and uniform for both vmemmap and linear mappings.
But the linear mapping might contain CONT marked block memory, where it is
necessary to first clear out all entries in the range before doing a TLB
flush, as per the architecture requirement. Hence batch the TLB flushes
during the table tear down walk and finally do a single flush in
unmap_hotplug_range().

Besides, it helps in improving performance via the TLBI range operation along
with reduced synchronization instructions. The time spent executing
unmap_hotplug_range() improved by 97%, measured over a 2GB memory hot removal
in a KVM guest.

This scheme is not applicable during vmemmap mapping tear down, where memory
needs to be freed and hence a TLB flush is required right after clearing out
each page table entry.
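
For illustration only, here is a rough userspace model of what the batching
changes (not kernel code - the page size, the 2GB range and all function
names below are made up for the sketch): the old scheme issues one TLB flush
per cleared entry, while the new scheme clears every entry first and then
issues a single range flush, which is also what allows CONT blocks to be
fully cleared before any invalidation.

#include <stdio.h>

#define PAGE_SIZE	4096UL
#define RANGE_SIZE	(2UL << 30)	/* 2GB, as in the measurement above */

static unsigned long nr_tlb_flushes;

/* Stand-in for flush_tlb_kernel_range(); only counts invocations. */
static void model_flush_tlb_range(unsigned long start, unsigned long end)
{
	(void)start;
	(void)end;
	nr_tlb_flushes++;
}

/* Old scheme: clear one entry, then flush just that page, repeat. */
static void unmap_per_entry_flush(unsigned long start, unsigned long end)
{
	for (unsigned long addr = start; addr < end; addr += PAGE_SIZE) {
		/* the page table entry for addr gets cleared here ... */
		model_flush_tlb_range(addr, addr + PAGE_SIZE);
	}
}

/* New scheme: clear every entry first, then one range flush at the end. */
static void unmap_batched_flush(unsigned long start, unsigned long end)
{
	for (unsigned long addr = start; addr < end; addr += PAGE_SIZE) {
		/* only clear the entry here; CONT blocks can thus be
		 * fully cleared before any invalidation happens */
	}
	model_flush_tlb_range(start, end);
}

int main(void)
{
	nr_tlb_flushes = 0;
	unmap_per_entry_flush(0, RANGE_SIZE);
	printf("per-entry flushes for 2GB: %lu\n", nr_tlb_flushes);

	nr_tlb_flushes = 0;
	unmap_batched_flush(0, RANGE_SIZE);
	printf("batched flushes for 2GB  : %lu\n", nr_tlb_flushes);
	return 0;
}

Compiled with any C compiler, this reports 524288 per-entry flushes versus a
single batched range flush for the 2GB case.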

Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Will Deacon <will@kernel.org>
Cc: linux-arm-kernel@lists.infradead.org
Cc: linux-kernel@vger.kernel.org
Closes: https://lore.kernel.org/all/aWZYXhrT6D2M-7-N@willie-the-truck/
Fixes: bbd6ec605c0f ("arm64/mm: Enable memory hot remove")
Cc: stable@vger.kernel.org
Signed-off-by: Ryan Roberts <ryan.roberts@arm.com>
Signed-off-by: Anshuman Khandual <anshuman.khandual@arm.com>
---
 arch/arm64/mm/mmu.c | 81 +++++++++++++++++++++++++++++++++++++--------
 1 file changed, 67 insertions(+), 14 deletions(-)

diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
index 8e1d80a7033e..8ec8a287aaa1 100644
--- a/arch/arm64/mm/mmu.c
+++ b/arch/arm64/mm/mmu.c
@@ -1458,10 +1458,32 @@ static void unmap_hotplug_pte_range(pmd_t *pmdp, unsigned long addr,
 
 		WARN_ON(!pte_present(pte));
 		__pte_clear(&init_mm, addr, ptep);
-		flush_tlb_kernel_range(addr, addr + PAGE_SIZE);
-		if (free_mapped)
+		if (free_mapped) {
+			/*
+			 * If page is part of an existing contiguous
+			 * memory block, individual TLB invalidation
+			 * here would not be appropriate. Instead it
+			 * will require clearing all entries for the
+			 * memory block and subsequently a TLB flush
+			 * for the entire range.
+			 */
+			WARN_ON(pte_cont(pte));
+
+			/*
+			 * TLB flush is essential for freeing memory.
+			 */
+			flush_tlb_kernel_range(addr, addr + PAGE_SIZE);
 			free_hotplug_page_range(pte_page(pte),
 						PAGE_SIZE, altmap);
+		}
+
+		/*
+		 * TLB flush is batched in unmap_hotplug_range()
+		 * for the entire range, when memory need not be
+		 * freed. Besides linear mapping might have CONT
+		 * blocks where TLB flush needs to be done after
+		 * clearing all relevant entries.
+		 */
 	} while (addr += PAGE_SIZE, addr < end);
 }
 
@@ -1482,15 +1504,32 @@ static void unmap_hotplug_pmd_range(pud_t *pudp, unsigned long addr,
 		WARN_ON(!pmd_present(pmd));
 		if (pmd_sect(pmd)) {
 			pmd_clear(pmdp);
+			if (free_mapped) {
+				/*
+				 * If page is part of an existing contiguous
+				 * memory block, individual TLB invalidation
+				 * here would not be appropriate. Instead it
+				 * will require clearing all entries for the
+				 * memory block and subsequently a TLB flush
+				 * for the entire range.
+				 */
+				WARN_ON(pmd_cont(pmd));
+
+				/*
+				 * TLB flush is essential for freeing memory.
+				 */
+				flush_tlb_kernel_range(addr, addr + PMD_SIZE);
+				free_hotplug_page_range(pmd_page(pmd),
+							PMD_SIZE, altmap);
+			}
 
 			/*
-			 * One TLBI should be sufficient here as the PMD_SIZE
-			 * range is mapped with a single block entry.
+			 * TLB flush is batched in unmap_hotplug_range()
+			 * for the entire range, when memory need not be
+			 * freed. Besides linear mapping might have CONT
+			 * blocks where TLB flush needs to be done after
+			 * clearing all relevant entries.
 			 */
-			flush_tlb_kernel_range(addr, addr + PAGE_SIZE);
-			if (free_mapped)
-				free_hotplug_page_range(pmd_page(pmd),
-							PMD_SIZE, altmap);
 			continue;
 		}
 		WARN_ON(!pmd_table(pmd));
@@ -1515,15 +1554,20 @@ static void unmap_hotplug_pud_range(p4d_t *p4dp, unsigned long addr,
 		WARN_ON(!pud_present(pud));
 		if (pud_sect(pud)) {
 			pud_clear(pudp);
+			if (free_mapped) {
+				/*
+				 * TLB flush is essential for freeing memory.
+				 */
+				flush_tlb_kernel_range(addr, addr + PUD_SIZE);
+				free_hotplug_page_range(pud_page(pud),
+							PUD_SIZE, altmap);
+			}
 
 			/*
-			 * One TLBI should be sufficient here as the PUD_SIZE
-			 * range is mapped with a single block entry.
+			 * TLB flush is batched in unmap_hotplug_range()
+			 * for the entire range, when memory need not be
+			 * freed.
 			 */
-			flush_tlb_kernel_range(addr, addr + PAGE_SIZE);
-			if (free_mapped)
-				free_hotplug_page_range(pud_page(pud),
-							PUD_SIZE, altmap);
 			continue;
 		}
 		WARN_ON(!pud_table(pud));
@@ -1553,6 +1597,7 @@ static void unmap_hotplug_p4d_range(pgd_t *pgdp, unsigned long addr,
 static void unmap_hotplug_range(unsigned long addr, unsigned long end,
 				bool free_mapped, struct vmem_altmap *altmap)
 {
+	unsigned long start = addr;
 	unsigned long next;
 	pgd_t *pgdp, pgd;
 
@@ -1574,6 +1619,14 @@ static void unmap_hotplug_range(unsigned long addr, unsigned long end,
 		WARN_ON(!pgd_present(pgd));
 		unmap_hotplug_p4d_range(pgdp, addr, next, free_mapped, altmap);
 	} while (addr = next, addr < end);
+
+	/*
+	 * Batched TLB flush only for linear mapping which
+	 * might contain CONT blocks, and does not require
+	 * freeing up memory as well.
+	 */
+	if (!free_mapped)
+		flush_tlb_kernel_range(start, end);
 }
 
 static void free_empty_pte_table(pmd_t *pmdp, unsigned long addr,
-- 
2.30.2
Re: [PATCH 1/2] arm64/mm: Enable batched TLB flush in unmap_hotplug_range()
Posted by Ryan Roberts 4 days, 20 hours ago
On 02/02/2026 04:26, Anshuman Khandual wrote:
> During a memory hot remove operartion both linear and vmemmap mappings for
> the memory range being removed, get unmapped via unmap_hotplug_range() but
> mapped pages get freed only for vmemmap mapping. This is just a sequential
> operation where each table entry gets cleared, followed by a leaf specific
> TLB flush, and then followed by memory free operation when applicable.
> 
> This approach was simple and uniform both for vmemmap and linear mappings.
> But linear mapping might contain CONT marked block memory where it becomes
> necessary to first clear out all entire in the range before a TLB flush.
> This is as per the architecture requirement. Hence batch all TLB flushes
> during the table tear down walk and finally do it in unmap_hotplug_range().

It might be worth mentioning the impact of not being architecture compliant here?

Something like:

  Prior to this fix, it was hypothetically possible for a speculative access to
  a higher address in the contiguous block to fill the TLB with shattered
  entries for the entire contiguous range after a lower address had already been
  cleared and invalidated. Due to the entries being shattered, the subsequent
  tlbi for the higher address would not then clear the TLB entries for the lower
  address, meaning stale TLB entries could persist.
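
To make that concrete, here is a purely illustrative userspace model (the
16-page block, the array names and the speculative fill rule are assumptions
made for the sketch, not a statement of real architecture or kernel
behaviour) where a speculative walk of one still-present CONT entry is
allowed to install shattered TLB entries for the whole block:

#include <stdbool.h>
#include <stdio.h>

#define CONT_PTES	16

static bool pte_present[CONT_PTES];	/* software page table */
static bool pte_cont[CONT_PTES];	/* CONT bit on each entry */
static bool tlb_entry[CONT_PTES];	/* shattered TLB entries, one per page */

static void map_cont_block(void)
{
	for (int i = 0; i < CONT_PTES; i++) {
		pte_present[i] = true;
		pte_cont[i] = true;
		tlb_entry[i] = false;
	}
}

/*
 * Speculative access: the walker reads only pte[page]; if that entry is
 * present and marked CONT, the model lets it cache shattered entries for
 * the whole contiguous span, including pages whose own PTE was already
 * cleared.
 */
static void speculative_fill(int page)
{
	if (pte_present[page] && pte_cont[page])
		for (int i = 0; i < CONT_PTES; i++)
			tlb_entry[i] = true;
}

static void tlbi_page(int page)		/* per-VA invalidate */
{
	tlb_entry[page] = false;
}

static void tlbi_range(void)		/* range invalidate over the block */
{
	for (int i = 0; i < CONT_PTES; i++)
		tlb_entry[i] = false;
}

static int count_stale(void)
{
	int stale = 0;

	for (int i = 0; i < CONT_PTES; i++)
		if (tlb_entry[i] && !pte_present[i])
			stale++;
	return stale;
}

int main(void)
{
	/* Old scheme: clear + per-page tlbi, one entry at a time. */
	map_cont_block();
	for (int i = 0; i < CONT_PTES; i++) {
		pte_present[i] = false;
		tlbi_page(i);
		if (i == 0)
			speculative_fill(8);	/* higher address, still CONT */
	}
	printf("old scheme, stale TLB entries: %d\n", count_stale());

	/* New scheme: clear every entry first, then one range flush. */
	map_cont_block();
	for (int i = 0; i < CONT_PTES; i++) {
		pte_present[i] = false;
		if (i == 0)
			speculative_fill(8);
	}
	tlbi_range();
	printf("new scheme, stale TLB entries: %d\n", count_stale());
	return 0;
}

With the old per-page clear + invalidate order the model ends with one stale
entry left for the lowest page; clearing everything first and then doing a
single range flush ends with none.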

> 
> Besides it is helps in improving the performance via TLBI range operation

nit:         ^^ (remove)

> along with reduced synchronization instructions. The time spent executing
> unmap_hotplug_range() improved 97% measured over a 2GB memory hot removal
> in KVM guest.

That's a great improvement :)

> 
> This scheme is not applicable during vmemmap mapping tear down where memory
> needs to be freed and hence a TLB flush is required after clearing out page
> table entry.
> 
> Cc: Catalin Marinas <catalin.marinas@arm.com>
> Cc: Will Deacon <will@kernel.org>
> Cc: linux-arm-kernel@lists.infradead.org
> Cc: linux-kernel@vger.kernel.org
> Closes: https://lore.kernel.org/all/aWZYXhrT6D2M-7-N@willie-the-truck/
> Fixes: bbd6ec605c0f ("arm64/mm: Enable memory hot remove")
> Cc: stable@vger.kernel.org
> Signed-off-by: Ryan Roberts <ryan.roberts@arm.com>
> Signed-off-by: Anshuman Khandual <anshuman.khandual@arm.com>

I suggested the original shape of this and I see you have added my SOB. Final
patch looks good to me - I'm not sure if it's correct for me to add Rb, but here
it is regardless:

Reviewed-by: Ryan Roberts <ryan.roberts@arm.com>


> ---
>  arch/arm64/mm/mmu.c | 81 +++++++++++++++++++++++++++++++++++++--------
>  1 file changed, 67 insertions(+), 14 deletions(-)
> 
> diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
> index 8e1d80a7033e..8ec8a287aaa1 100644
> --- a/arch/arm64/mm/mmu.c
> +++ b/arch/arm64/mm/mmu.c
> @@ -1458,10 +1458,32 @@ static void unmap_hotplug_pte_range(pmd_t *pmdp, unsigned long addr,
>  
>  		WARN_ON(!pte_present(pte));
>  		__pte_clear(&init_mm, addr, ptep);
> -		flush_tlb_kernel_range(addr, addr + PAGE_SIZE);
> -		if (free_mapped)
> +		if (free_mapped) {
> +			/*
> +			 * If page is part of an existing contiguous
> +			 * memory block, individual TLB invalidation
> +			 * here would not be appropriate. Instead it
> +			 * will require clearing all entries for the
> +			 * memory block and subsequently a TLB flush
> +			 * for the entire range.
> +			 */
> +			WARN_ON(pte_cont(pte));
> +
> +			/*
> +			 * TLB flush is essential for freeing memory.
> +			 */
> +			flush_tlb_kernel_range(addr, addr + PAGE_SIZE);
>  			free_hotplug_page_range(pte_page(pte),
>  						PAGE_SIZE, altmap);
> +		}
> +
> +		/*
> +		 * TLB flush is batched in unmap_hotplug_range()
> +		 * for the entire range, when memory need not be
> +		 * freed. Besides linear mapping might have CONT
> +		 * blocks where TLB flush needs to be done after
> +		 * clearing all relevant entries.
> +		 */
>  	} while (addr += PAGE_SIZE, addr < end);
>  }
>  
> @@ -1482,15 +1504,32 @@ static void unmap_hotplug_pmd_range(pud_t *pudp, unsigned long addr,
>  		WARN_ON(!pmd_present(pmd));
>  		if (pmd_sect(pmd)) {
>  			pmd_clear(pmdp);
> +			if (free_mapped) {
> +				/*
> +				 * If page is part of an existing contiguous
> +				 * memory block, individual TLB invalidation
> +				 * here would not be appropriate. Instead it
> +				 * will require clearing all entries for the
> +				 * memory block and subsequently a TLB flush
> +				 * for the entire range.
> +				 */
> +				WARN_ON(pmd_cont(pmd));
> +
> +				/*
> +				 * TLB flush is essential for freeing memory.
> +				 */
> +				flush_tlb_kernel_range(addr, addr + PMD_SIZE);
> +				free_hotplug_page_range(pmd_page(pmd),
> +							PMD_SIZE, altmap);
> +			}
>  
>  			/*
> -			 * One TLBI should be sufficient here as the PMD_SIZE
> -			 * range is mapped with a single block entry.
> +			 * TLB flush is batched in unmap_hotplug_range()
> +			 * for the entire range, when memory need not be
> +			 * freed. Besides linear mapping might have CONT
> +			 * blocks where TLB flush needs to be done after
> +			 * clearing all relevant entries.
>  			 */
> -			flush_tlb_kernel_range(addr, addr + PAGE_SIZE);
> -			if (free_mapped)
> -				free_hotplug_page_range(pmd_page(pmd),
> -							PMD_SIZE, altmap);
>  			continue;
>  		}
>  		WARN_ON(!pmd_table(pmd));
> @@ -1515,15 +1554,20 @@ static void unmap_hotplug_pud_range(p4d_t *p4dp, unsigned long addr,
>  		WARN_ON(!pud_present(pud));
>  		if (pud_sect(pud)) {
>  			pud_clear(pudp);
> +			if (free_mapped) {
> +				/*
> +				 * TLB flush is essential for freeing memory.
> +				 */
> +				flush_tlb_kernel_range(addr, addr + PUD_SIZE);
> +				free_hotplug_page_range(pud_page(pud),
> +							PUD_SIZE, altmap);
> +			}
>  
>  			/*
> -			 * One TLBI should be sufficient here as the PUD_SIZE
> -			 * range is mapped with a single block entry.
> +			 * TLB flush is batched in unmap_hotplug_range()
> +			 * for the entire range, when memory need not be
> +			 * freed.
>  			 */
> -			flush_tlb_kernel_range(addr, addr + PAGE_SIZE);
> -			if (free_mapped)
> -				free_hotplug_page_range(pud_page(pud),
> -							PUD_SIZE, altmap);
>  			continue;
>  		}
>  		WARN_ON(!pud_table(pud));
> @@ -1553,6 +1597,7 @@ static void unmap_hotplug_p4d_range(pgd_t *pgdp, unsigned long addr,
>  static void unmap_hotplug_range(unsigned long addr, unsigned long end,
>  				bool free_mapped, struct vmem_altmap *altmap)
>  {
> +	unsigned long start = addr;
>  	unsigned long next;
>  	pgd_t *pgdp, pgd;
>  
> @@ -1574,6 +1619,14 @@ static void unmap_hotplug_range(unsigned long addr, unsigned long end,
>  		WARN_ON(!pgd_present(pgd));
>  		unmap_hotplug_p4d_range(pgdp, addr, next, free_mapped, altmap);
>  	} while (addr = next, addr < end);
> +
> +	/*
> +	 * Batched TLB flush only for linear mapping which
> +	 * might contain CONT blocks, and does not require
> +	 * freeing up memory as well.
> +	 */
> +	if (!free_mapped)
> +		flush_tlb_kernel_range(start, end);
>  }
>  
>  static void free_empty_pte_table(pmd_t *pmdp, unsigned long addr,
Re: [PATCH 1/2] arm64/mm: Enable batched TLB flush in unmap_hotplug_range()
Posted by Anshuman Khandual 4 days, 19 hours ago

On 02/02/26 2:48 PM, Ryan Roberts wrote:
> On 02/02/2026 04:26, Anshuman Khandual wrote:
>> During a memory hot remove operartion both linear and vmemmap mappings for
>> the memory range being removed, get unmapped via unmap_hotplug_range() but
>> mapped pages get freed only for vmemmap mapping. This is just a sequential
>> operation where each table entry gets cleared, followed by a leaf specific
>> TLB flush, and then followed by memory free operation when applicable.
>>
>> This approach was simple and uniform both for vmemmap and linear mappings.
>> But linear mapping might contain CONT marked block memory where it becomes
>> necessary to first clear out all entire in the range before a TLB flush.
>> This is as per the architecture requirement. Hence batch all TLB flushes
>> during the table tear down walk and finally do it in unmap_hotplug_range().
> 
> I might be worth mentioning the impact of not bein architecture compliant here?
> 
> Something like:
> 
>   Prior to this fix, it was hypothetically possible for a speculative access to
>   a higher address in the contiguous block to fill the TLB with shattered
>   entries for the entire contiguous range after a lower address had already been
>   cleared and invalidated. Due to the entries being shattered, the subsequent
>   tlbi for the higher address would not then clear the TLB entries for the lower
>   address, meaning stale TLB entries could persist.

Sounds good - will add in the commit message.

> 
>>
>> Besides it is helps in improving the performance via TLBI range operation
> 
> nit:         ^^ (remove)

Will fix that.

> 
>> along with reduced synchronization instructions. The time spent executing
>> unmap_hotplug_range() improved 97% measured over a 2GB memory hot removal
>> in KVM guest.
> 
> That's a great improvement :)
> 
>>
>> This scheme is not applicable during vmemmap mapping tear down where memory
>> needs to be freed and hence a TLB flush is required after clearing out page
>> table entry.
>>
>> Cc: Catalin Marinas <catalin.marinas@arm.com>
>> Cc: Will Deacon <will@kernel.org>
>> Cc: linux-arm-kernel@lists.infradead.org
>> Cc: linux-kernel@vger.kernel.org
>> Closes: https://lore.kernel.org/all/aWZYXhrT6D2M-7-N@willie-the-truck/
>> Fixes: bbd6ec605c0f ("arm64/mm: Enable memory hot remove")
>> Cc: stable@vger.kernel.org
>> Signed-off-by: Ryan Roberts <ryan.roberts@arm.com>
>> Signed-off-by: Anshuman Khandual <anshuman.khandual@arm.com>
> 
> I suggested the original shape of this and I see you have added my SOB. Final
> patch looks good to me - I'm not sure if it's correct for me to add Rb, but here
> it is regardless:
> 
> Reviewed-by: Ryan Roberts <ryan.roberts@arm.com>

Thanks!

> 
> 
>> ---
>>  arch/arm64/mm/mmu.c | 81 +++++++++++++++++++++++++++++++++++++--------
>>  1 file changed, 67 insertions(+), 14 deletions(-)
>>
>> diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
>> index 8e1d80a7033e..8ec8a287aaa1 100644
>> --- a/arch/arm64/mm/mmu.c
>> +++ b/arch/arm64/mm/mmu.c
>> @@ -1458,10 +1458,32 @@ static void unmap_hotplug_pte_range(pmd_t *pmdp, unsigned long addr,
>>  
>>  		WARN_ON(!pte_present(pte));
>>  		__pte_clear(&init_mm, addr, ptep);
>> -		flush_tlb_kernel_range(addr, addr + PAGE_SIZE);
>> -		if (free_mapped)
>> +		if (free_mapped) {
>> +			/*
>> +			 * If page is part of an existing contiguous
>> +			 * memory block, individual TLB invalidation
>> +			 * here would not be appropriate. Instead it
>> +			 * will require clearing all entries for the
>> +			 * memory block and subsequently a TLB flush
>> +			 * for the entire range.
>> +			 */
>> +			WARN_ON(pte_cont(pte));
>> +
>> +			/*
>> +			 * TLB flush is essential for freeing memory.
>> +			 */
>> +			flush_tlb_kernel_range(addr, addr + PAGE_SIZE);
>>  			free_hotplug_page_range(pte_page(pte),
>>  						PAGE_SIZE, altmap);
>> +		}
>> +
>> +		/*
>> +		 * TLB flush is batched in unmap_hotplug_range()
>> +		 * for the entire range, when memory need not be
>> +		 * freed. Besides linear mapping might have CONT
>> +		 * blocks where TLB flush needs to be done after
>> +		 * clearing all relevant entries.
>> +		 */
>>  	} while (addr += PAGE_SIZE, addr < end);
>>  }
>>  
>> @@ -1482,15 +1504,32 @@ static void unmap_hotplug_pmd_range(pud_t *pudp, unsigned long addr,
>>  		WARN_ON(!pmd_present(pmd));
>>  		if (pmd_sect(pmd)) {
>>  			pmd_clear(pmdp);
>> +			if (free_mapped) {
>> +				/*
>> +				 * If page is part of an existing contiguous
>> +				 * memory block, individual TLB invalidation
>> +				 * here would not be appropriate. Instead it
>> +				 * will require clearing all entries for the
>> +				 * memory block and subsequently a TLB flush
>> +				 * for the entire range.
>> +				 */
>> +				WARN_ON(pmd_cont(pmd));
>> +
>> +				/*
>> +				 * TLB flush is essential for freeing memory.
>> +				 */
>> +				flush_tlb_kernel_range(addr, addr + PMD_SIZE);
>> +				free_hotplug_page_range(pmd_page(pmd),
>> +							PMD_SIZE, altmap);
>> +			}
>>  
>>  			/*
>> -			 * One TLBI should be sufficient here as the PMD_SIZE
>> -			 * range is mapped with a single block entry.
>> +			 * TLB flush is batched in unmap_hotplug_range()
>> +			 * for the entire range, when memory need not be
>> +			 * freed. Besides linear mapping might have CONT
>> +			 * blocks where TLB flush needs to be done after
>> +			 * clearing all relevant entries.
>>  			 */
>> -			flush_tlb_kernel_range(addr, addr + PAGE_SIZE);
>> -			if (free_mapped)
>> -				free_hotplug_page_range(pmd_page(pmd),
>> -							PMD_SIZE, altmap);
>>  			continue;
>>  		}
>>  		WARN_ON(!pmd_table(pmd));
>> @@ -1515,15 +1554,20 @@ static void unmap_hotplug_pud_range(p4d_t *p4dp, unsigned long addr,
>>  		WARN_ON(!pud_present(pud));
>>  		if (pud_sect(pud)) {
>>  			pud_clear(pudp);
>> +			if (free_mapped) {
>> +				/*
>> +				 * TLB flush is essential for freeing memory.
>> +				 */
>> +				flush_tlb_kernel_range(addr, addr + PUD_SIZE);
>> +				free_hotplug_page_range(pud_page(pud),
>> +							PUD_SIZE, altmap);
>> +			}
>>  
>>  			/*
>> -			 * One TLBI should be sufficient here as the PUD_SIZE
>> -			 * range is mapped with a single block entry.
>> +			 * TLB flush is batched in unmap_hotplug_range()
>> +			 * for the entire range, when memory need not be
>> +			 * freed.
>>  			 */
>> -			flush_tlb_kernel_range(addr, addr + PAGE_SIZE);
>> -			if (free_mapped)
>> -				free_hotplug_page_range(pud_page(pud),
>> -							PUD_SIZE, altmap);
>>  			continue;
>>  		}
>>  		WARN_ON(!pud_table(pud));
>> @@ -1553,6 +1597,7 @@ static void unmap_hotplug_p4d_range(pgd_t *pgdp, unsigned long addr,
>>  static void unmap_hotplug_range(unsigned long addr, unsigned long end,
>>  				bool free_mapped, struct vmem_altmap *altmap)
>>  {
>> +	unsigned long start = addr;
>>  	unsigned long next;
>>  	pgd_t *pgdp, pgd;
>>  
>> @@ -1574,6 +1619,14 @@ static void unmap_hotplug_range(unsigned long addr, unsigned long end,
>>  		WARN_ON(!pgd_present(pgd));
>>  		unmap_hotplug_p4d_range(pgdp, addr, next, free_mapped, altmap);
>>  	} while (addr = next, addr < end);
>> +
>> +	/*
>> +	 * Batched TLB flush only for linear mapping which
>> +	 * might contain CONT blocks, and does not require
>> +	 * freeing up memory as well.
>> +	 */
>> +	if (!free_mapped)
>> +		flush_tlb_kernel_range(start, end);
>>  }
>>  
>>  static void free_empty_pte_table(pmd_t *pmdp, unsigned long addr,
>