When allocating a new memory area where the mapping address range is
known, it is observed that the vmap_area lock is acquired twice.

The first acquisition occurs in the alloc_vmap_area() function, when
inserting the vm area into the vm mapping red-black tree. The second
acquisition occurs in the setup_vmalloc_vm() function, when updating
properties of the vm such as flags and address.

Combine these two operations in alloc_vmap_area(), which improves
scalability when the vmap_area lock is contended. By doing so, the need
to acquire the lock twice is also eliminated.

With the above change, tested on an intel icelake platform (160 vcpu,
kernel v6.7), a 6% performance improvement and a 7% reduction in the
overall spinlock hotspot are gained on stress-ng/pthread
(https://github.com/ColinIanKing/stress-ng), which stress-tests thread
creation.
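In other words, the locking pattern before and after this change looks
roughly like the following simplified sketch (not the literal diff, just
the shape of the flow):

  /* Before: vmap_area_lock is taken twice per allocation. */
  spin_lock(&vmap_area_lock);
  insert_vmap_area(va, &vmap_area_root, &vmap_area_list);  /* alloc_vmap_area() */
  spin_unlock(&vmap_area_lock);
  ...
  spin_lock(&vmap_area_lock);
  setup_vmalloc_vm_locked(vm, va, flags, caller);          /* setup_vmalloc_vm() */
  spin_unlock(&vmap_area_lock);

  /* After: vm is bound to va before the insertion, so only one
   * acquisition of vmap_area_lock remains.
   */
  va->vm = vm;
  vm->addr = (void *)addr;
  spin_lock(&vmap_area_lock);
  insert_vmap_area(va, &vmap_area_root, &vmap_area_list);
  spin_unlock(&vmap_area_lock);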
Reviewed-by: "Chen, Tim C" <tim.c.chen@intel.com>
Reviewed-by: "King, Colin" <colin.king@intel.com>
Signed-off-by: rulinhuang <rulin.huang@intel.com>
---
mm/vmalloc.c | 31 +++++++++++++++----------------
1 file changed, 15 insertions(+), 16 deletions(-)
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index d12a17fc0c171..3b1f616e8ecf0 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -1577,13 +1577,13 @@ preload_this_cpu_lock(spinlock_t *lock, gfp_t gfp_mask, int node)
/*
* Allocate a region of KVA of the specified size and alignment, within the
- * vstart and vend.
+ * vstart and vend. If vm is passed in, the two will also be bound.
*/
static struct vmap_area *alloc_vmap_area(unsigned long size,
unsigned long align,
unsigned long vstart, unsigned long vend,
int node, gfp_t gfp_mask,
- unsigned long va_flags)
+ unsigned long va_flags, struct vm_struct *vm)
{
struct vmap_area *va;
unsigned long freed;
@@ -1627,9 +1627,12 @@ static struct vmap_area *alloc_vmap_area(unsigned long size,
va->va_start = addr;
va->va_end = addr + size;
- va->vm = NULL;
+ va->vm = vm;
va->flags = va_flags;
+ if (vm != NULL)
+ vm->addr = (void *)addr;
+
spin_lock(&vmap_area_lock);
insert_vmap_area(va, &vmap_area_root, &vmap_area_list);
spin_unlock(&vmap_area_lock);
@@ -2039,7 +2042,8 @@ static void *new_vmap_block(unsigned int order, gfp_t gfp_mask)
va = alloc_vmap_area(VMAP_BLOCK_SIZE, VMAP_BLOCK_SIZE,
VMALLOC_START, VMALLOC_END,
node, gfp_mask,
- VMAP_RAM|VMAP_BLOCK);
+ VMAP_RAM|VMAP_BLOCK,
+ NULL);
if (IS_ERR(va)) {
kfree(vb);
return ERR_CAST(va);
@@ -2394,7 +2398,8 @@ void *vm_map_ram(struct page **pages, unsigned int count, int node)
struct vmap_area *va;
va = alloc_vmap_area(size, PAGE_SIZE,
VMALLOC_START, VMALLOC_END,
- node, GFP_KERNEL, VMAP_RAM);
+ node, GFP_KERNEL, VMAP_RAM,
+ NULL);
if (IS_ERR(va))
return NULL;
@@ -2548,14 +2553,6 @@ static inline void setup_vmalloc_vm_locked(struct vm_struct *vm,
va->vm = vm;
}
-static void setup_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va,
- unsigned long flags, const void *caller)
-{
- spin_lock(&vmap_area_lock);
- setup_vmalloc_vm_locked(vm, va, flags, caller);
- spin_unlock(&vmap_area_lock);
-}
-
static void clear_vm_uninitialized_flag(struct vm_struct *vm)
{
/*
@@ -2592,14 +2589,16 @@ static struct vm_struct *__get_vm_area_node(unsigned long size,
if (!(flags & VM_NO_GUARD))
size += PAGE_SIZE;
- va = alloc_vmap_area(size, align, start, end, node, gfp_mask, 0);
+ area->flags = flags;
+ area->caller = caller;
+ area->size = size;
+
+ va = alloc_vmap_area(size, align, start, end, node, gfp_mask, 0, area);
if (IS_ERR(va)) {
kfree(area);
return NULL;
}
- setup_vmalloc_vm(area, va, flags, caller);
-
/*
* Mark pages for non-VM_ALLOC mappings as accessible. Do it now as a
* best-effort approach, as they can be mapped outside of vmalloc code.
base-commit: de927f6c0b07d9e698416c5b287c521b07694cac
--
2.43.0
On Tue, Feb 06, 2024 at 10:30:59PM -0500, rulinhuang wrote:
> When allocating a new memory area where the mapping address range is
> known, it is observed that the vmap_area lock is acquired twice.
> The first acquisition occurs in the alloc_vmap_area() function when
> inserting the vm area into the vm mapping red-black tree. The second
> acquisition occurs in the setup_vmalloc_vm() function when updating the
> properties of the vm, such as flags and address, etc.
> Combine these two operations together in alloc_vmap_area(), which
> improves scalability when the vmap_area lock is contended. By doing so,
> the need to acquire the lock twice can also be eliminated.
> With the above change, tested on intel icelake platform(160 vcpu, kernel
> v6.7), a 6% performance improvement and a 7% reduction in overall
> spinlock hotspot are gained on
> stress-ng/pthread(https://github.com/ColinIanKing/stress-ng), which is
> the stress test of thread creations.
>
> Reviewed-by: "Chen, Tim C" <tim.c.chen@intel.com>
> Reviewed-by: "King, Colin" <colin.king@intel.com>
> Signed-off-by: rulinhuang <rulin.huang@intel.com>
> ---
> mm/vmalloc.c | 31 +++++++++++++++----------------
> 1 file changed, 15 insertions(+), 16 deletions(-)
>
> diff --git a/mm/vmalloc.c b/mm/vmalloc.c
> index d12a17fc0c171..3b1f616e8ecf0 100644
> --- a/mm/vmalloc.c
> +++ b/mm/vmalloc.c
> @@ -1577,13 +1577,13 @@ preload_this_cpu_lock(spinlock_t *lock, gfp_t gfp_mask, int node)
>
> /*
> * Allocate a region of KVA of the specified size and alignment, within the
> - * vstart and vend.
> + * vstart and vend. If vm is passed in, the two will also be bound.
> */
> static struct vmap_area *alloc_vmap_area(unsigned long size,
> unsigned long align,
> unsigned long vstart, unsigned long vend,
> int node, gfp_t gfp_mask,
> - unsigned long va_flags)
> + unsigned long va_flags, struct vm_struct *vm)
> {
> struct vmap_area *va;
> unsigned long freed;
> @@ -1627,9 +1627,12 @@ static struct vmap_area *alloc_vmap_area(unsigned long size,
>
> va->va_start = addr;
> va->va_end = addr + size;
> - va->vm = NULL;
> + va->vm = vm;
> va->flags = va_flags;
>
> + if (vm != NULL)
> + vm->addr = (void *)addr;
> +
> spin_lock(&vmap_area_lock);
> insert_vmap_area(va, &vmap_area_root, &vmap_area_list);
> spin_unlock(&vmap_area_lock);
> @@ -2039,7 +2042,8 @@ static void *new_vmap_block(unsigned int order, gfp_t gfp_mask)
> va = alloc_vmap_area(VMAP_BLOCK_SIZE, VMAP_BLOCK_SIZE,
> VMALLOC_START, VMALLOC_END,
> node, gfp_mask,
> - VMAP_RAM|VMAP_BLOCK);
> + VMAP_RAM|VMAP_BLOCK,
> + NULL);
> if (IS_ERR(va)) {
> kfree(vb);
> return ERR_CAST(va);
> @@ -2394,7 +2398,8 @@ void *vm_map_ram(struct page **pages, unsigned int count, int node)
> struct vmap_area *va;
> va = alloc_vmap_area(size, PAGE_SIZE,
> VMALLOC_START, VMALLOC_END,
> - node, GFP_KERNEL, VMAP_RAM);
> + node, GFP_KERNEL, VMAP_RAM,
> + NULL);
> if (IS_ERR(va))
> return NULL;
>
> @@ -2548,14 +2553,6 @@ static inline void setup_vmalloc_vm_locked(struct vm_struct *vm,
> va->vm = vm;
> }
>
> -static void setup_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va,
> - unsigned long flags, const void *caller)
> -{
> - spin_lock(&vmap_area_lock);
> - setup_vmalloc_vm_locked(vm, va, flags, caller);
> - spin_unlock(&vmap_area_lock);
> -}
> -
> static void clear_vm_uninitialized_flag(struct vm_struct *vm)
> {
> /*
> @@ -2592,14 +2589,16 @@ static struct vm_struct *__get_vm_area_node(unsigned long size,
> if (!(flags & VM_NO_GUARD))
> size += PAGE_SIZE;
>
> - va = alloc_vmap_area(size, align, start, end, node, gfp_mask, 0);
> + area->flags = flags;
> + area->caller = caller;
> + area->size = size;
> +
> + va = alloc_vmap_area(size, align, start, end, node, gfp_mask, 0, area);
> if (IS_ERR(va)) {
> kfree(area);
> return NULL;
> }
>
> - setup_vmalloc_vm(area, va, flags, caller);
> -
> /*
>
Thank you for improving this! That has been on my radar for quite a long
time :)

Some comments, though. I think that such a "partial" way of initializing
the "vm_struct", or "vm", is not optimal because the initialization
becomes spread between two places.

Also, I think we should not remove the setup_vmalloc_vm() function;
instead we can move it to a place where the VA has not yet been inserted
into the tree, i.e. initialize before insert. The "locked" variant can
then be renamed to setup_vmalloc_vm().
Also, could you please post how you tested this patch and the stress-ng
figures? That is really useful for other people to know.
Thanks!
--
Uladzislau Rezki
Hi Rezki, we have submitted patch v2 to avoid the partial initialization
issue of vm's members and separated insert_vmap_area() from
alloc_vmap_area(), so that setup_vmalloc_vm() no longer needs lock
protection. We have also verified an improvement of 6.3% at 160 vcpu on
the intel icelake platform with this patch. Thank you for your patience.
Hello, Rulinhuang!

> Hi Rezki, we have submitted patch v2 to avoid the partial
> initialization issue of vm's members and separated insert_vmap_area()
> from alloc_vmap_area() so that setup_vmalloc_vm() has no need to
> require lock protection. We have also verified the improvement of
> 6.3% at 160 vcpu on intel icelake platform with this patch.
> Thank you for your patience.
>
Please work on the mm-unstable and try to measure it on the latest tip.

--
Uladzislau Rezki
Hi Rezki, thanks so much for your review. Exactly, your suggestions could
effectively enhance the maintainability and readability of this code
change as well as the whole vmalloc implementation.

To avoid the partial initialization issue, we are trying to refine this
patch by separating insert_vmap_area(), the insertion of the VA into the
tree, from alloc_vmap_area(), so that setup_vmalloc_vm() can be invoked
between them. However, our initial trial ran into a boot-time error,
which we are still debugging, and it may take a little longer than
expected as the coming week is the public holiday of Lunar New Year in
China. We will share the latest version of the patch with you once it is
ready for review.

In the performance test, we first build stress-ng by following the
instructions from https://github.com/ColinIanKing/stress-ng, and then
launch the pthread stressor (--pthread) for 30 seconds (-t 30) via the
command below:

  ./stress-ng -t 30 --metrics-brief --pthread -1 --no-rand-seed

The aggregated count of spawned threads per second (Bogo ops/s) is taken
as the score of this workload. We evaluated the performance impact of
this patch on the Ice Lake server with 40, 80, 120 and 160 online cores
respectively. As shown in the table below, with the expansion of online
cores this patch relieves the increasingly severe lock contention and
achieves a quite significant performance improvement of around 5.5% at
160 cores.

  vcpu number       40      80      120     160
  patched/original  100.5%  100.8%  105.2%  105.5%

Thanks again for your help and please let us know if more details are
needed.
When allocating a new memory area where the mapping address range is
known, it is observed that the vmap_area lock is acquired twice.

The first acquisition occurs in the alloc_vmap_area() function, when
inserting the vm area into the vm mapping red-black tree. The second
acquisition occurs in the setup_vmalloc_vm() function, when updating
properties of the vm such as flags and address.

Combine these two operations in alloc_vmap_area(), which improves
scalability when the vmap_area lock is contended. By doing so, the need
to acquire the lock twice is also eliminated.

With the above change, tested on an intel icelake platform (160 vcpu,
kernel v6.7), a 6% performance improvement and a 7% reduction in the
overall spinlock hotspot are gained on stress-ng/pthread
(https://github.com/ColinIanKing/stress-ng), which stress-tests thread
creation.
Reviewed-by: Chen Tim C <tim.c.chen@intel.com>
Reviewed-by: King Colin <colin.king@intel.com>
Signed-off-by: rulinhuang <rulin.huang@intel.com>
---
V1 -> V2: Avoided the partial initialization issue of vm and
separated insert_vmap_area() from alloc_vmap_area()
---
mm/vmalloc.c | 36 +++++++++++++++++++++---------------
1 file changed, 21 insertions(+), 15 deletions(-)
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index d12a17fc0c171..768e45f2ed946 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -1630,17 +1630,18 @@ static struct vmap_area *alloc_vmap_area(unsigned long size,
va->vm = NULL;
va->flags = va_flags;
- spin_lock(&vmap_area_lock);
- insert_vmap_area(va, &vmap_area_root, &vmap_area_list);
- spin_unlock(&vmap_area_lock);
-
BUG_ON(!IS_ALIGNED(va->va_start, align));
BUG_ON(va->va_start < vstart);
BUG_ON(va->va_end > vend);
ret = kasan_populate_vmalloc(addr, size);
if (ret) {
- free_vmap_area(va);
+ /*
+ * Insert/Merge it back to the free tree/list.
+ */
+ spin_lock(&free_vmap_area_lock);
+ merge_or_add_vmap_area_augment(va, &free_vmap_area_root, &free_vmap_area_list);
+ spin_unlock(&free_vmap_area_lock);
return ERR_PTR(ret);
}
@@ -1669,6 +1670,13 @@ static struct vmap_area *alloc_vmap_area(unsigned long size,
return ERR_PTR(-EBUSY);
}
+static inline void insert_vmap_area_with_lock(struct vmap_area *va)
+{
+ spin_lock(&vmap_area_lock);
+ insert_vmap_area(va, &vmap_area_root, &vmap_area_list);
+ spin_unlock(&vmap_area_lock);
+}
+
int register_vmap_purge_notifier(struct notifier_block *nb)
{
return blocking_notifier_chain_register(&vmap_notify_list, nb);
@@ -2045,6 +2053,8 @@ static void *new_vmap_block(unsigned int order, gfp_t gfp_mask)
return ERR_CAST(va);
}
+ insert_vmap_area_with_lock(va);
+
vaddr = vmap_block_vaddr(va->va_start, 0);
spin_lock_init(&vb->lock);
vb->va = va;
@@ -2398,6 +2408,8 @@ void *vm_map_ram(struct page **pages, unsigned int count, int node)
if (IS_ERR(va))
return NULL;
+ insert_vmap_area_with_lock(va);
+
addr = va->va_start;
mem = (void *)addr;
}
@@ -2538,7 +2550,7 @@ static void vmap_init_free_space(void)
}
}
-static inline void setup_vmalloc_vm_locked(struct vm_struct *vm,
+static inline void setup_vmalloc_vm(struct vm_struct *vm,
struct vmap_area *va, unsigned long flags, const void *caller)
{
vm->flags = flags;
@@ -2548,14 +2560,6 @@ static inline void setup_vmalloc_vm_locked(struct vm_struct *vm,
va->vm = vm;
}
-static void setup_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va,
- unsigned long flags, const void *caller)
-{
- spin_lock(&vmap_area_lock);
- setup_vmalloc_vm_locked(vm, va, flags, caller);
- spin_unlock(&vmap_area_lock);
-}
-
static void clear_vm_uninitialized_flag(struct vm_struct *vm)
{
/*
@@ -2600,6 +2604,8 @@ static struct vm_struct *__get_vm_area_node(unsigned long size,
setup_vmalloc_vm(area, va, flags, caller);
+ insert_vmap_area_with_lock(va);
+
/*
* Mark pages for non-VM_ALLOC mappings as accessible. Do it now as a
* best-effort approach, as they can be mapped outside of vmalloc code.
@@ -4166,7 +4172,7 @@ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets,
for (area = 0; area < nr_vms; area++) {
insert_vmap_area(vas[area], &vmap_area_root, &vmap_area_list);
- setup_vmalloc_vm_locked(vms[area], vas[area], VM_ALLOC,
+ setup_vmalloc_vm(vms[area], vas[area], VM_ALLOC,
pcpu_get_vm_areas);
}
spin_unlock(&vmap_area_lock);
base-commit: de927f6c0b07d9e698416c5b287c521b07694cac
--
2.43.0
On Tue, 20 Feb 2024 04:05:21 -0500 rulinhuang <rulin.huang@intel.com> wrote:

> When allocating a new memory area where the mapping address range is
> known, it is observed that the vmap_area lock is acquired twice.
> The first acquisition occurs in the alloc_vmap_area() function when
> inserting the vm area into the vm mapping red-black tree. The second
> acquisition occurs in the setup_vmalloc_vm() function when updating the
> properties of the vm, such as flags and address, etc.
> Combine these two operations together in alloc_vmap_area(), which
> improves scalability when the vmap_area lock is contended. By doing so,
> the need to acquire the lock twice can also be eliminated.
> With the above change, tested on intel icelake platform(160 vcpu, kernel
> v6.7), a 6% performance improvement and a 7% reduction in overall
> spinlock hotspot are gained on
> stress-ng/pthread(https://github.com/ColinIanKing/stress-ng), which is
> the stress test of thread creations.

vmalloc.c has changed a lot lately. Please can you work against
linux-next or against the mm-unstable branch of
//git.kernel.org/pub/scm/linux/kernel/git/akpm/mm

Thanks.
Hi Andrew, we have rebased it on 6.8-rc5. Thank you for your review.
When allocating a new memory area where the mapping address range is
known, it is observed that the vmap_area lock is acquired twice.

The first acquisition occurs in the alloc_vmap_area() function, when
inserting the vm area into the vm mapping red-black tree. The second
acquisition occurs in the setup_vmalloc_vm() function, when updating
properties of the vm such as flags and address.

Combine these two operations in alloc_vmap_area(), which improves
scalability when the vmap_area lock is contended. By doing so, the need
to acquire the lock twice is also eliminated.

With the above change, tested on an intel icelake platform (160 vcpu,
kernel v6.7), a 6% performance improvement and a 7% reduction in the
overall spinlock hotspot are gained on stress-ng/pthread
(https://github.com/ColinIanKing/stress-ng), which stress-tests thread
creation.
Reviewed-by: Chen Tim C <tim.c.chen@intel.com>
Reviewed-by: King Colin <colin.king@intel.com>
Signed-off-by: rulinhuang <rulin.huang@intel.com>
---
V1 -> V2: Avoided the partial initialization issue of vm and
separated insert_vmap_area() from alloc_vmap_area()
V2 -> V3: Rebased on 6.8-rc5
V3 -> V4: Rebased on mm-unstable branch
V4 -> V5: cancel the split of alloc_vmap_area()
and keep insert_vmap_area()
---
mm/vmalloc.c | 48 ++++++++++++++++++++++--------------------------
1 file changed, 22 insertions(+), 26 deletions(-)
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 25a8df497255..6baaf08737f8 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -1841,15 +1841,26 @@ node_alloc(unsigned long size, unsigned long align,
return va;
}
+static inline void setup_vmalloc_vm(struct vm_struct *vm,
+ struct vmap_area *va, unsigned long flags, const void *caller)
+{
+ vm->flags = flags;
+ vm->addr = (void *)va->va_start;
+ vm->size = va->va_end - va->va_start;
+ vm->caller = caller;
+ va->vm = vm;
+}
+
/*
* Allocate a region of KVA of the specified size and alignment, within the
- * vstart and vend.
+ * vstart and vend. If vm is passed in, the two will also be bound.
*/
static struct vmap_area *alloc_vmap_area(unsigned long size,
unsigned long align,
unsigned long vstart, unsigned long vend,
int node, gfp_t gfp_mask,
- unsigned long va_flags)
+ unsigned long va_flags, struct vm_struct *vm,
+ unsigned long flags, const void *caller)
{
struct vmap_node *vn;
struct vmap_area *va;
@@ -1912,6 +1923,9 @@ static struct vmap_area *alloc_vmap_area(unsigned long size,
va->vm = NULL;
va->flags = (va_flags | vn_id);
+ if (vm)
+ setup_vmalloc_vm(vm, va, flags, caller);
+
vn = addr_to_node(va->va_start);
spin_lock(&vn->busy.lock);
@@ -2486,7 +2500,8 @@ static void *new_vmap_block(unsigned int order, gfp_t gfp_mask)
va = alloc_vmap_area(VMAP_BLOCK_SIZE, VMAP_BLOCK_SIZE,
VMALLOC_START, VMALLOC_END,
node, gfp_mask,
- VMAP_RAM|VMAP_BLOCK);
+ VMAP_RAM|VMAP_BLOCK, NULL,
+ 0, NULL);
if (IS_ERR(va)) {
kfree(vb);
return ERR_CAST(va);
@@ -2843,7 +2858,8 @@ void *vm_map_ram(struct page **pages, unsigned int count, int node)
struct vmap_area *va;
va = alloc_vmap_area(size, PAGE_SIZE,
VMALLOC_START, VMALLOC_END,
- node, GFP_KERNEL, VMAP_RAM);
+ node, GFP_KERNEL, VMAP_RAM,
+ NULL, 0, NULL);
if (IS_ERR(va))
return NULL;
@@ -2946,26 +2962,6 @@ void __init vm_area_register_early(struct vm_struct *vm, size_t align)
kasan_populate_early_vm_area_shadow(vm->addr, vm->size);
}
-static inline void setup_vmalloc_vm_locked(struct vm_struct *vm,
- struct vmap_area *va, unsigned long flags, const void *caller)
-{
- vm->flags = flags;
- vm->addr = (void *)va->va_start;
- vm->size = va->va_end - va->va_start;
- vm->caller = caller;
- va->vm = vm;
-}
-
-static void setup_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va,
- unsigned long flags, const void *caller)
-{
- struct vmap_node *vn = addr_to_node(va->va_start);
-
- spin_lock(&vn->busy.lock);
- setup_vmalloc_vm_locked(vm, va, flags, caller);
- spin_unlock(&vn->busy.lock);
-}
-
static void clear_vm_uninitialized_flag(struct vm_struct *vm)
{
/*
@@ -3002,7 +2998,7 @@ static struct vm_struct *__get_vm_area_node(unsigned long size,
if (!(flags & VM_NO_GUARD))
size += PAGE_SIZE;
- va = alloc_vmap_area(size, align, start, end, node, gfp_mask, 0);
+ va = alloc_vmap_area(size, align, start, end, node, gfp_mask, 0, area, flags, caller);
if (IS_ERR(va)) {
kfree(area);
return NULL;
@@ -4584,7 +4580,7 @@ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets,
spin_lock(&vn->busy.lock);
insert_vmap_area(vas[area], &vn->busy.root, &vn->busy.head);
- setup_vmalloc_vm_locked(vms[area], vas[area], VM_ALLOC,
+ setup_vmalloc_vm(vms[area], vas[area], VM_ALLOC,
pcpu_get_vm_areas);
spin_unlock(&vn->busy.lock);
}
base-commit: c09a8e005eff6c064e2e9f11549966c36a724fbf
--
2.43.0
Hi Rulin,
On 02/23/24 at 08:03am, rulinhuang wrote:
> When allocating a new memory area where the mapping address range is
> known, it is observed that the vmap_area lock is acquired twice.
> The first acquisition occurs in the alloc_vmap_area() function when
> inserting the vm area into the vm mapping red-black tree. The second
> acquisition occurs in the setup_vmalloc_vm() function when updating the
> properties of the vm, such as flags and address, etc.
> Combine these two operations together in alloc_vmap_area(), which
> improves scalability when the vmap_area lock is contended. By doing so,
> the need to acquire the lock twice can also be eliminated.
> With the above change, tested on intel icelake platform(160 vcpu, kernel
> v6.7), a 6% performance improvement and a 7% reduction in overall
> spinlock hotspot are gained on
> stress-ng/pthread(https://github.com/ColinIanKing/stress-ng), which is
> the stress test of thread creations.
This log can be rearranged into several paragraphs for easier reading.
>
> Reviewed-by: Chen Tim C <tim.c.chen@intel.com>
> Reviewed-by: King Colin <colin.king@intel.com>
Since you have taken a different way to fix this, these Reviewed-by tags
from the old solution and patch should be removed. Carrying them over is
not appropriate.
Yeah, this v5 is what I suggested in the draft. It looks good to me.
There's one concern in the code, please see the inline comment.
> Signed-off-by: rulinhuang <rulin.huang@intel.com>
> ---
> V1 -> V2: Avoided the partial initialization issue of vm and
> separated insert_vmap_area() from alloc_vmap_area()
> V2 -> V3: Rebased on 6.8-rc5
> V3 -> V4: Rebased on mm-unstable branch
> V4 -> V5: cancel the split of alloc_vmap_area()
> and keep insert_vmap_area()
> ---
> mm/vmalloc.c | 48 ++++++++++++++++++++++--------------------------
> 1 file changed, 22 insertions(+), 26 deletions(-)
>
> diff --git a/mm/vmalloc.c b/mm/vmalloc.c
> index 25a8df497255..6baaf08737f8 100644
> --- a/mm/vmalloc.c
> +++ b/mm/vmalloc.c
> @@ -1841,15 +1841,26 @@ node_alloc(unsigned long size, unsigned long align,
> return va;
> }
>
> +static inline void setup_vmalloc_vm(struct vm_struct *vm,
> + struct vmap_area *va, unsigned long flags, const void *caller)
> +{
> + vm->flags = flags;
> + vm->addr = (void *)va->va_start;
> + vm->size = va->va_end - va->va_start;
> + vm->caller = caller;
> + va->vm = vm;
> +}
> +
> /*
> * Allocate a region of KVA of the specified size and alignment, within the
> - * vstart and vend.
> + * vstart and vend. If vm is passed in, the two will also be bound.
> */
> static struct vmap_area *alloc_vmap_area(unsigned long size,
> unsigned long align,
> unsigned long vstart, unsigned long vend,
> int node, gfp_t gfp_mask,
> - unsigned long va_flags)
> + unsigned long va_flags, struct vm_struct *vm,
> + unsigned long flags, const void *caller)
> {
> struct vmap_node *vn;
> struct vmap_area *va;
> @@ -1912,6 +1923,9 @@ static struct vmap_area *alloc_vmap_area(unsigned long size,
> va->vm = NULL;
> va->flags = (va_flags | vn_id);
>
We may need to add a BUG_ON() or WARN_ON() checking whether
(va_flags & VMAP_RAM) is true while vm is not NULL. Not sure if this is
overthinking.
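For illustration, such a sanity check could be as small as the following
(just a sketch, not part of the posted patch):

	/* VMAP_RAM users are expected to pass vm == NULL. */
	WARN_ON((va_flags & VMAP_RAM) && vm);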
By the way, can you post it separately if you decide to post a v6 to
polish the log and remove the ack tags? Sometimes adding all posts in one
thread looks confusing.
Thanks
Baoquan
> + if (vm)
> + setup_vmalloc_vm(vm, va, flags, caller);
> +
> vn = addr_to_node(va->va_start);
>
> spin_lock(&vn->busy.lock);
> @@ -2486,7 +2500,8 @@ static void *new_vmap_block(unsigned int order, gfp_t gfp_mask)
> va = alloc_vmap_area(VMAP_BLOCK_SIZE, VMAP_BLOCK_SIZE,
> VMALLOC_START, VMALLOC_END,
> node, gfp_mask,
> - VMAP_RAM|VMAP_BLOCK);
> + VMAP_RAM|VMAP_BLOCK, NULL,
> + 0, NULL);
> if (IS_ERR(va)) {
> kfree(vb);
> return ERR_CAST(va);
> @@ -2843,7 +2858,8 @@ void *vm_map_ram(struct page **pages, unsigned int count, int node)
> struct vmap_area *va;
> va = alloc_vmap_area(size, PAGE_SIZE,
> VMALLOC_START, VMALLOC_END,
> - node, GFP_KERNEL, VMAP_RAM);
> + node, GFP_KERNEL, VMAP_RAM,
> + NULL, 0, NULL);
> if (IS_ERR(va))
> return NULL;
>
> @@ -2946,26 +2962,6 @@ void __init vm_area_register_early(struct vm_struct *vm, size_t align)
> kasan_populate_early_vm_area_shadow(vm->addr, vm->size);
> }
>
> -static inline void setup_vmalloc_vm_locked(struct vm_struct *vm,
> - struct vmap_area *va, unsigned long flags, const void *caller)
> -{
> - vm->flags = flags;
> - vm->addr = (void *)va->va_start;
> - vm->size = va->va_end - va->va_start;
> - vm->caller = caller;
> - va->vm = vm;
> -}
> -
> -static void setup_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va,
> - unsigned long flags, const void *caller)
> -{
> - struct vmap_node *vn = addr_to_node(va->va_start);
> -
> - spin_lock(&vn->busy.lock);
> - setup_vmalloc_vm_locked(vm, va, flags, caller);
> - spin_unlock(&vn->busy.lock);
> -}
> -
> static void clear_vm_uninitialized_flag(struct vm_struct *vm)
> {
> /*
> @@ -3002,7 +2998,7 @@ static struct vm_struct *__get_vm_area_node(unsigned long size,
> if (!(flags & VM_NO_GUARD))
> size += PAGE_SIZE;
>
> - va = alloc_vmap_area(size, align, start, end, node, gfp_mask, 0);
> + va = alloc_vmap_area(size, align, start, end, node, gfp_mask, 0, area, flags, caller);
> if (IS_ERR(va)) {
> kfree(area);
> return NULL;
> @@ -4584,7 +4580,7 @@ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets,
>
> spin_lock(&vn->busy.lock);
> insert_vmap_area(vas[area], &vn->busy.root, &vn->busy.head);
> - setup_vmalloc_vm_locked(vms[area], vas[area], VM_ALLOC,
> + setup_vmalloc_vm(vms[area], vas[area], VM_ALLOC,
> pcpu_get_vm_areas);
> spin_unlock(&vn->busy.lock);
> }
>
> base-commit: c09a8e005eff6c064e2e9f11549966c36a724fbf
> --
> 2.43.0
>
When allocating a new memory area where the mapping address range is
known, it is observed that the vmap_area lock is acquired twice.

The first acquisition occurs in the alloc_vmap_area() function, when
inserting the vm area into the vm mapping red-black tree. The second
acquisition occurs in the setup_vmalloc_vm() function, when updating
properties of the vm such as flags and address.

Combine these two operations in alloc_vmap_area(), which improves
scalability when the vmap_area lock is contended. By doing so, the need
to acquire the lock twice is also eliminated.

With the above change, tested on an intel icelake platform (160 vcpu,
kernel v6.7), a 6% performance improvement and a 7% reduction in the
overall spinlock hotspot are gained on stress-ng/pthread
(https://github.com/ColinIanKing/stress-ng), which stress-tests thread
creation.
Reviewed-by: Chen Tim C <tim.c.chen@intel.com>
Reviewed-by: King Colin <colin.king@intel.com>
Signed-off-by: rulinhuang <rulin.huang@intel.com>
---
V1 -> V2: Avoided the partial initialization issue of vm and
separated insert_vmap_area() from alloc_vmap_area()
V2 -> V3: Rebased on 6.8-rc5
---
mm/vmalloc.c | 36 +++++++++++++++++++++---------------
1 file changed, 21 insertions(+), 15 deletions(-)
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index d12a17fc0c17..768e45f2ed94 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -1630,17 +1630,18 @@ static struct vmap_area *alloc_vmap_area(unsigned long size,
va->vm = NULL;
va->flags = va_flags;
- spin_lock(&vmap_area_lock);
- insert_vmap_area(va, &vmap_area_root, &vmap_area_list);
- spin_unlock(&vmap_area_lock);
-
BUG_ON(!IS_ALIGNED(va->va_start, align));
BUG_ON(va->va_start < vstart);
BUG_ON(va->va_end > vend);
ret = kasan_populate_vmalloc(addr, size);
if (ret) {
- free_vmap_area(va);
+ /*
+ * Insert/Merge it back to the free tree/list.
+ */
+ spin_lock(&free_vmap_area_lock);
+ merge_or_add_vmap_area_augment(va, &free_vmap_area_root, &free_vmap_area_list);
+ spin_unlock(&free_vmap_area_lock);
return ERR_PTR(ret);
}
@@ -1669,6 +1670,13 @@ static struct vmap_area *alloc_vmap_area(unsigned long size,
return ERR_PTR(-EBUSY);
}
+static inline void insert_vmap_area_with_lock(struct vmap_area *va)
+{
+ spin_lock(&vmap_area_lock);
+ insert_vmap_area(va, &vmap_area_root, &vmap_area_list);
+ spin_unlock(&vmap_area_lock);
+}
+
int register_vmap_purge_notifier(struct notifier_block *nb)
{
return blocking_notifier_chain_register(&vmap_notify_list, nb);
@@ -2045,6 +2053,8 @@ static void *new_vmap_block(unsigned int order, gfp_t gfp_mask)
return ERR_CAST(va);
}
+ insert_vmap_area_with_lock(va);
+
vaddr = vmap_block_vaddr(va->va_start, 0);
spin_lock_init(&vb->lock);
vb->va = va;
@@ -2398,6 +2408,8 @@ void *vm_map_ram(struct page **pages, unsigned int count, int node)
if (IS_ERR(va))
return NULL;
+ insert_vmap_area_with_lock(va);
+
addr = va->va_start;
mem = (void *)addr;
}
@@ -2538,7 +2550,7 @@ static void vmap_init_free_space(void)
}
}
-static inline void setup_vmalloc_vm_locked(struct vm_struct *vm,
+static inline void setup_vmalloc_vm(struct vm_struct *vm,
struct vmap_area *va, unsigned long flags, const void *caller)
{
vm->flags = flags;
@@ -2548,14 +2560,6 @@ static inline void setup_vmalloc_vm_locked(struct vm_struct *vm,
va->vm = vm;
}
-static void setup_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va,
- unsigned long flags, const void *caller)
-{
- spin_lock(&vmap_area_lock);
- setup_vmalloc_vm_locked(vm, va, flags, caller);
- spin_unlock(&vmap_area_lock);
-}
-
static void clear_vm_uninitialized_flag(struct vm_struct *vm)
{
/*
@@ -2600,6 +2604,8 @@ static struct vm_struct *__get_vm_area_node(unsigned long size,
setup_vmalloc_vm(area, va, flags, caller);
+ insert_vmap_area_with_lock(va);
+
/*
* Mark pages for non-VM_ALLOC mappings as accessible. Do it now as a
* best-effort approach, as they can be mapped outside of vmalloc code.
@@ -4166,7 +4172,7 @@ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets,
for (area = 0; area < nr_vms; area++) {
insert_vmap_area(vas[area], &vmap_area_root, &vmap_area_list);
- setup_vmalloc_vm_locked(vms[area], vas[area], VM_ALLOC,
+ setup_vmalloc_vm(vms[area], vas[area], VM_ALLOC,
pcpu_get_vm_areas);
}
spin_unlock(&vmap_area_lock);
base-commit: b401b621758e46812da61fa58a67c3fd8d91de0d
--
2.43.0
On Tue, Feb 20, 2024 at 10:29:05PM -0500, rulinhuang wrote:
> When allocating a new memory area where the mapping address range is
> known, it is observed that the vmap_area lock is acquired twice.
> The first acquisition occurs in the alloc_vmap_area() function when
> inserting the vm area into the vm mapping red-black tree. The second
> acquisition occurs in the setup_vmalloc_vm() function when updating the
> properties of the vm, such as flags and address, etc.
> Combine these two operations together in alloc_vmap_area(), which
> improves scalability when the vmap_area lock is contended. By doing so,
> the need to acquire the lock twice can also be eliminated.
> With the above change, tested on intel icelake platform(160 vcpu, kernel
> v6.7), a 6% performance improvement and a 7% reduction in overall
> spinlock hotspot are gained on
> stress-ng/pthread(https://github.com/ColinIanKing/stress-ng), which is
> the stress test of thread creations.
>
> Reviewed-by: Chen Tim C <tim.c.chen@intel.com>
> Reviewed-by: King Colin <colin.king@intel.com>
> Signed-off-by: rulinhuang <rulin.huang@intel.com>
> ---
> V1 -> V2: Avoided the partial initialization issue of vm and
> separated insert_vmap_area() from alloc_vmap_area()
> V2 -> V3: Rebased on 6.8-rc5
> ---
> mm/vmalloc.c | 36 +++++++++++++++++++++---------------
> 1 file changed, 21 insertions(+), 15 deletions(-)
>
> diff --git a/mm/vmalloc.c b/mm/vmalloc.c
> index d12a17fc0c17..768e45f2ed94 100644
> --- a/mm/vmalloc.c
> +++ b/mm/vmalloc.c
> @@ -1630,17 +1630,18 @@ static struct vmap_area *alloc_vmap_area(unsigned long size,
> va->vm = NULL;
> va->flags = va_flags;
>
> - spin_lock(&vmap_area_lock);
> - insert_vmap_area(va, &vmap_area_root, &vmap_area_list);
> - spin_unlock(&vmap_area_lock);
> -
> BUG_ON(!IS_ALIGNED(va->va_start, align));
> BUG_ON(va->va_start < vstart);
> BUG_ON(va->va_end > vend);
>
> ret = kasan_populate_vmalloc(addr, size);
> if (ret) {
> - free_vmap_area(va);
> + /*
> + * Insert/Merge it back to the free tree/list.
> + */
> + spin_lock(&free_vmap_area_lock);
> + merge_or_add_vmap_area_augment(va, &free_vmap_area_root, &free_vmap_area_list);
> + spin_unlock(&free_vmap_area_lock);
> return ERR_PTR(ret);
> }
>
> @@ -1669,6 +1670,13 @@ static struct vmap_area *alloc_vmap_area(unsigned long size,
> return ERR_PTR(-EBUSY);
> }
>
> +static inline void insert_vmap_area_with_lock(struct vmap_area *va)
> +{
> + spin_lock(&vmap_area_lock);
> + insert_vmap_area(va, &vmap_area_root, &vmap_area_list);
> + spin_unlock(&vmap_area_lock);
> +}
> +
> int register_vmap_purge_notifier(struct notifier_block *nb)
> {
> return blocking_notifier_chain_register(&vmap_notify_list, nb);
> @@ -2045,6 +2053,8 @@ static void *new_vmap_block(unsigned int order, gfp_t gfp_mask)
> return ERR_CAST(va);
> }
>
> + insert_vmap_area_with_lock(va);
> +
> vaddr = vmap_block_vaddr(va->va_start, 0);
> spin_lock_init(&vb->lock);
> vb->va = va;
> @@ -2398,6 +2408,8 @@ void *vm_map_ram(struct page **pages, unsigned int count, int node)
> if (IS_ERR(va))
> return NULL;
>
> + insert_vmap_area_with_lock(va);
> +
> addr = va->va_start;
> mem = (void *)addr;
> }
> @@ -2538,7 +2550,7 @@ static void vmap_init_free_space(void)
> }
> }
>
> -static inline void setup_vmalloc_vm_locked(struct vm_struct *vm,
> +static inline void setup_vmalloc_vm(struct vm_struct *vm,
> struct vmap_area *va, unsigned long flags, const void *caller)
> {
> vm->flags = flags;
> @@ -2548,14 +2560,6 @@ static inline void setup_vmalloc_vm_locked(struct vm_struct *vm,
> va->vm = vm;
> }
>
> -static void setup_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va,
> - unsigned long flags, const void *caller)
> -{
> - spin_lock(&vmap_area_lock);
> - setup_vmalloc_vm_locked(vm, va, flags, caller);
> - spin_unlock(&vmap_area_lock);
> -}
> -
> static void clear_vm_uninitialized_flag(struct vm_struct *vm)
> {
> /*
> @@ -2600,6 +2604,8 @@ static struct vm_struct *__get_vm_area_node(unsigned long size,
>
> setup_vmalloc_vm(area, va, flags, caller);
>
> + insert_vmap_area_with_lock(va);
> +
>
> /*
> * Mark pages for non-VM_ALLOC mappings as accessible. Do it now as a
> * best-effort approach, as they can be mapped outside of vmalloc code.
> @@ -4166,7 +4172,7 @@ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets,
> for (area = 0; area < nr_vms; area++) {
> insert_vmap_area(vas[area], &vmap_area_root, &vmap_area_list);
>
> - setup_vmalloc_vm_locked(vms[area], vas[area], VM_ALLOC,
> + setup_vmalloc_vm(vms[area], vas[area], VM_ALLOC,
> pcpu_get_vm_areas);
> }
> spin_unlock(&vmap_area_lock);
>
> base-commit: b401b621758e46812da61fa58a67c3fd8d91de0d
> --
> 2.43.0
>
Spreading insert_vmap_area_with_lock() across several callers, like
__get_vm_area_node(), new_vmap_block(), vm_map_ram(), etc., is not a good
approach, simply because it changes the behaviour and people might miss
this point.

Could you please re-spin it on mm-unstable, because the vmalloc code has
changed a lot? From my side I can check and help you figure out how to
fix it in a better way, because the v3 should be improved anyway.

Apparently I have not seen your messages for some reason; I do not
understand why. I started to get emails with the below subject:
"Bounce probe for linux-kernel@vger.kernel.org (no action required)"
--
Uladzislau Rezki
Hi Uladzislau and Andrew, we have rebased it (patch v4) on branch
mm-unstable and remeasured it. Could you kindly help confirm if this is
the right base to work on?

Compared to the previous result at kernel v6.7 with a 5% performance gain
on intel icelake (160 vcpu), we only had a 0.6% with this commit base.
But we think our modification still has some significance. On the one
hand, this does reduce a critical section. On the other hand, we have a
4% performance gain on intel sapphire rapids (224 vcpu), which suggests
more performance improvement would likely be achieved when the core count
of processors increases to hundreds or even thousands.

Thank you again for your comments.
Hello, Rulinhuang!

> Hi Uladzislau and Andrew, we have rebased it (patch v4) on branch
> mm-unstable and remeasured it. Could you kindly help confirm if
> this is the right base to work on?
> Compared to the previous result at kernel v6.7 with a 5% performance
> gain on intel icelake (160 vcpu), we only had a 0.6% with this commit
> base. But we think our modification still has some significance. On
> the one hand, this does reduce a critical section. On the other hand,
> we have a 4% performance gain on intel sapphire rapids (224 vcpu),
> which suggests more performance improvement would likely be achieved
> when the core count of processors increases to hundreds or
> even thousands.
> Thank you again for your comments.
>
According to the patch, that was a correct rebase. Right, the small delta
on your 160 CPUs is because of removing a contention point. For bigger
systems the impact is larger, as you point out with your 224 vcpu
results, where you see a 4% perf improvement.

So we should fix it. But the way it is fixed is not optimal from my point
of view, because the patch in question now spreads the internals of
alloc_vmap_area(), like inserting the busy area, across many parts.

--
Uladzislau Rezki
>
> Hello, Rulinhuang!
>
> > Hi Uladzislau and Andrew, we have rebased it (patch v4) on branch
> > mm-unstable and remeasured it. Could you kindly help confirm if this
> > is the right base to work on?
> > Compared to the previous result at kernel v6.7 with a 5% performance
> > gain on intel icelake (160 vcpu), we only had a 0.6% with this commit
> > base. But we think our modification still has some significance. On
> > the one hand, this does reduce a critical section. On the other hand,
> > we have a 4% performance gain on intel sapphire rapids (224 vcpu),
> > which suggests more performance improvement would likely be achieved
> > when the core count of processors increases to hundreds or even
> > thousands.
> > Thank you again for your comments.
> >
> According to the patch that was a correct rebase. Right a small delta
> on your 160 CPUs is because of removing a contention. As for bigger
> systems it is bigger impact, like you point here on your 224 vcpu
> results where you see %4 perf improvement.
>
> So we should fix it. But the way how it is fixed is not optimal from
> my point of view, because the patch that is in question spreads the
> internals from alloc_vmap_area(), like inserting busy area, across
> many parts now.
>
> --
> Uladzislau Rezki

Our modifications in patch v5 not only achieve the original effect but
also cancel the split of alloc_vmap_area(); setup_vmalloc_vm() is placed
outside the lock, so the critical section is not lengthened. Without
splitting alloc_vmap_area(), putting setup_vmalloc_vm() directly into it
is all we can think of.

Regarding Baoquan's changes, we think that:
- We prefer that setup_vmalloc_vm() not be placed inside the critical
  section; there is no need to lengthen the critical section.
- We prefer judging (vm_data) rather than (!(va_flags & VMAP_RAM) && vm);
  it is enough to determine the condition for the assignment, and the
  extra check seems to wander into the judgment of va_flags.

Hi Uladzislau, could you please let us know if you have any better
suggestions on the modification scheme? Thank you for your advice!
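For reference, the two placements under discussion look roughly like
this, condensed from the v5 patch earlier in the thread and from
Baoquan's draft later in the thread:

  /* v5: initialize before taking the lock, guarded only by vm. */
  if (vm)
	  setup_vmalloc_vm(vm, va, flags, caller);

  vn = addr_to_node(va->va_start);

  spin_lock(&vn->busy.lock);
  insert_vmap_area(va, &vn->busy.root, &vn->busy.head);
  spin_unlock(&vn->busy.lock);

  /* Baoquan's draft: initialize under the lock, with an extra
   * VMAP_RAM check.
   */
  spin_lock(&vn->busy.lock);
  insert_vmap_area(va, &vn->busy.root, &vn->busy.head);
  if (!(va_flags & VMAP_RAM) && vm)
	  setup_vmalloc_vm(vm, va, vm_flags, caller);
  spin_unlock(&vn->busy.lock);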
On 02/22/24 at 01:52pm, Uladzislau Rezki wrote:
> Hello, Rulinhuang!
>
> > Hi Uladzislau and Andrew, we have rebased it(Patch v4) on branch
> > mm-unstable and remeasured it. Could you kindly help confirm if
> > this is the right base to work on?
> > Compared to the previous result at kernel v6.7 with a 5% performance
> > gain on intel icelake(160 vcpu), we only had a 0.6% with this commit
> > base. But we think our modification still has some significance. On
> > the one hand, this does reduce a critical section. On the other hand,
> > we have a 4% performance gain on intel sapphire rapids(224 vcpu),
> > which suggests more performance improvement would likely be achieved
> > when the core count of processors increases to hundreds or
> > even thousands.
> > Thank you again for your comments.
> >
> According to the patch that was a correct rebase. Right a small delta
> on your 160 CPUs is because of removing a contention. As for bigger
> systems it is bigger impact, like you point here on your 224 vcpu
> results where you see %4 perf improvement.
>
> So we should fix it. But the way how it is fixed is not optimal from
> my point of view, because the patch that is in question spreads the
> internals from alloc_vmap_area(), like inserting busy area, across
> many parts now.
I happened to walk into this thread and came up with a draft patch.
Please help check if it's OK.
From 0112e39b3a8454a288e1bcece220c4599bac5326 Mon Sep 17 00:00:00 2001
From: Baoquan He <bhe@redhat.com>
Date: Thu, 22 Feb 2024 23:26:59 +0800
Subject: [PATCH] mm/vmalloc.c: avoid repeatedly requiring lock unnecessarily
Content-type: text/plain
By moving setup_vmalloc_vm() into alloc_vmap_area(), we can avoid
acquiring the lock a second time within a short period.
Signed-off-by: Baoquan He <bhe@redhat.com>
---
mm/vmalloc.c | 23 ++++++++---------------
1 file changed, 8 insertions(+), 15 deletions(-)
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index aeee71349157..6bda3c06b484 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -1848,7 +1848,10 @@ static struct vmap_area *alloc_vmap_area(unsigned long size,
unsigned long align,
unsigned long vstart, unsigned long vend,
int node, gfp_t gfp_mask,
- unsigned long va_flags)
+ unsigned long va_flags,
+ struct vm_struct *vm,
+ unsigned long vm_flags,
+ const void *caller)
{
struct vmap_node *vn;
struct vmap_area *va;
@@ -1915,6 +1918,8 @@ static struct vmap_area *alloc_vmap_area(unsigned long size,
spin_lock(&vn->busy.lock);
insert_vmap_area(va, &vn->busy.root, &vn->busy.head);
+ if (!(va_flags & VMAP_RAM) && vm)
+ setup_vmalloc_vm(vm, va, vm_flags, caller);
spin_unlock(&vn->busy.lock);
BUG_ON(!IS_ALIGNED(va->va_start, align));
@@ -2947,7 +2952,7 @@ void __init vm_area_register_early(struct vm_struct *vm, size_t align)
kasan_populate_early_vm_area_shadow(vm->addr, vm->size);
}
-static inline void setup_vmalloc_vm_locked(struct vm_struct *vm,
+static inline void setup_vmalloc_vm(struct vm_struct *vm,
struct vmap_area *va, unsigned long flags, const void *caller)
{
vm->flags = flags;
@@ -2957,16 +2962,6 @@ static inline void setup_vmalloc_vm_locked(struct vm_struct *vm,
va->vm = vm;
}
-static void setup_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va,
- unsigned long flags, const void *caller)
-{
- struct vmap_node *vn = addr_to_node(va->va_start);
-
- spin_lock(&vn->busy.lock);
- setup_vmalloc_vm_locked(vm, va, flags, caller);
- spin_unlock(&vn->busy.lock);
-}
-
static void clear_vm_uninitialized_flag(struct vm_struct *vm)
{
/*
@@ -3009,8 +3004,6 @@ static struct vm_struct *__get_vm_area_node(unsigned long size,
return NULL;
}
- setup_vmalloc_vm(area, va, flags, caller);
-
/*
* Mark pages for non-VM_ALLOC mappings as accessible. Do it now as a
* best-effort approach, as they can be mapped outside of vmalloc code.
@@ -4586,7 +4579,7 @@ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets,
spin_lock(&vn->busy.lock);
insert_vmap_area(vas[area], &vn->busy.root, &vn->busy.head);
- setup_vmalloc_vm_locked(vms[area], vas[area], VM_ALLOC,
+ setup_vmalloc_vm(vms[area], vas[area], VM_ALLOC,
pcpu_get_vm_areas);
spin_unlock(&vn->busy.lock);
}
--
2.41.0
When allocating a new memory area where the mapping address range is
known, it is observed that the vmap_area lock is acquired twice.

The first acquisition occurs in the alloc_vmap_area() function, when
inserting the vm area into the vm mapping red-black tree. The second
acquisition occurs in the setup_vmalloc_vm() function, when updating
properties of the vm such as flags and address.

Combine these two operations in alloc_vmap_area(), which improves
scalability when the vmap_area lock is contended. By doing so, the need
to acquire the lock twice is also eliminated.

With the above change, tested on an intel icelake platform (160 vcpu,
kernel v6.7), a 6% performance improvement and a 7% reduction in the
overall spinlock hotspot are gained on stress-ng/pthread
(https://github.com/ColinIanKing/stress-ng), which stress-tests thread
creation.
Reviewed-by: Chen Tim C <tim.c.chen@intel.com>
Reviewed-by: King Colin <colin.king@intel.com>
Signed-off-by: rulinhuang <rulin.huang@intel.com>
---
V1 -> V2: Avoided the partial initialization issue of vm and
separated insert_vmap_area() from alloc_vmap_area()
V2 -> V3: Rebased on 6.8-rc5
V3 -> V4: Rebased on mm-unstable branch
---
mm/vmalloc.c | 43 +++++++++++++++++++++++--------------------
1 file changed, 23 insertions(+), 20 deletions(-)
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 25a8df497255..ce126e7bc3d8 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -1851,7 +1851,6 @@ static struct vmap_area *alloc_vmap_area(unsigned long size,
int node, gfp_t gfp_mask,
unsigned long va_flags)
{
- struct vmap_node *vn;
struct vmap_area *va;
unsigned long freed;
unsigned long addr;
@@ -1912,19 +1911,18 @@ static struct vmap_area *alloc_vmap_area(unsigned long size,
va->vm = NULL;
va->flags = (va_flags | vn_id);
- vn = addr_to_node(va->va_start);
-
- spin_lock(&vn->busy.lock);
- insert_vmap_area(va, &vn->busy.root, &vn->busy.head);
- spin_unlock(&vn->busy.lock);
-
BUG_ON(!IS_ALIGNED(va->va_start, align));
BUG_ON(va->va_start < vstart);
BUG_ON(va->va_end > vend);
ret = kasan_populate_vmalloc(addr, size);
if (ret) {
- free_vmap_area(va);
+ /*
+ * Insert/Merge it back to the free tree/list.
+ */
+ spin_lock(&free_vmap_area_lock);
+ merge_or_add_vmap_area_augment(va, &free_vmap_area_root, &free_vmap_area_list);
+ spin_unlock(&free_vmap_area_lock);
return ERR_PTR(ret);
}
@@ -1953,6 +1951,15 @@ static struct vmap_area *alloc_vmap_area(unsigned long size,
return ERR_PTR(-EBUSY);
}
+static inline void insert_vmap_area_locked(struct vmap_area *va)
+{
+ struct vmap_node *vn = addr_to_node(va->va_start);
+
+ spin_lock(&vn->busy.lock);
+ insert_vmap_area(va, &vn->busy.root, &vn->busy.head);
+ spin_unlock(&vn->busy.lock);
+}
+
int register_vmap_purge_notifier(struct notifier_block *nb)
{
return blocking_notifier_chain_register(&vmap_notify_list, nb);
@@ -2492,6 +2499,8 @@ static void *new_vmap_block(unsigned int order, gfp_t gfp_mask)
return ERR_CAST(va);
}
+ insert_vmap_area_locked(va);
+
vaddr = vmap_block_vaddr(va->va_start, 0);
spin_lock_init(&vb->lock);
vb->va = va;
@@ -2847,6 +2856,8 @@ void *vm_map_ram(struct page **pages, unsigned int count, int node)
if (IS_ERR(va))
return NULL;
+ insert_vmap_area_locked(va);
+
addr = va->va_start;
mem = (void *)addr;
}
@@ -2946,7 +2957,7 @@ void __init vm_area_register_early(struct vm_struct *vm, size_t align)
kasan_populate_early_vm_area_shadow(vm->addr, vm->size);
}
-static inline void setup_vmalloc_vm_locked(struct vm_struct *vm,
+static inline void setup_vmalloc_vm(struct vm_struct *vm,
struct vmap_area *va, unsigned long flags, const void *caller)
{
vm->flags = flags;
@@ -2956,16 +2967,6 @@ static inline void setup_vmalloc_vm_locked(struct vm_struct *vm,
va->vm = vm;
}
-static void setup_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va,
- unsigned long flags, const void *caller)
-{
- struct vmap_node *vn = addr_to_node(va->va_start);
-
- spin_lock(&vn->busy.lock);
- setup_vmalloc_vm_locked(vm, va, flags, caller);
- spin_unlock(&vn->busy.lock);
-}
-
static void clear_vm_uninitialized_flag(struct vm_struct *vm)
{
/*
@@ -3010,6 +3011,8 @@ static struct vm_struct *__get_vm_area_node(unsigned long size,
setup_vmalloc_vm(area, va, flags, caller);
+ insert_vmap_area_locked(va);
+
/*
* Mark pages for non-VM_ALLOC mappings as accessible. Do it now as a
* best-effort approach, as they can be mapped outside of vmalloc code.
@@ -4584,7 +4587,7 @@ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets,
spin_lock(&vn->busy.lock);
insert_vmap_area(vas[area], &vn->busy.root, &vn->busy.head);
- setup_vmalloc_vm_locked(vms[area], vas[area], VM_ALLOC,
+ setup_vmalloc_vm(vms[area], vas[area], VM_ALLOC,
pcpu_get_vm_areas);
spin_unlock(&vn->busy.lock);
}
base-commit: 9d193b36872d153e02e80c26203de4ee15127b58
--
2.39.3