When allocating a new memory area where the mapping address range is
known, it is observed that the vmap_area lock is acquired twice.

The first acquisition occurs in the alloc_vmap_area() function, when
inserting the vm area into the vm mapping red-black tree. The second
acquisition occurs in the setup_vmalloc_vm() function, when updating
properties of the vm such as flags and address.

Combine these two operations in alloc_vmap_area(), which improves
scalability when the vmap_area lock is contended. By doing so, the need
to acquire the lock twice is also eliminated.

With the above change, tested on an intel icelake platform (160 vcpu,
kernel v6.7), a 6% performance improvement and a 7% reduction in the
overall spinlock hotspot are gained on stress-ng/pthread
(https://github.com/ColinIanKing/stress-ng), which stress-tests thread
creation.
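In other words, the locking pattern before and after this change looks
roughly like the following simplified sketch (not the literal diff, just
the shape of the flow):

  /* Before: vmap_area_lock is taken twice per allocation. */
  spin_lock(&vmap_area_lock);
  insert_vmap_area(va, &vmap_area_root, &vmap_area_list);  /* alloc_vmap_area() */
  spin_unlock(&vmap_area_lock);
  ...
  spin_lock(&vmap_area_lock);
  setup_vmalloc_vm_locked(vm, va, flags, caller);          /* setup_vmalloc_vm() */
  spin_unlock(&vmap_area_lock);

  /* After: vm is bound to va before the insertion, so only one
   * acquisition of vmap_area_lock remains.
   */
  va->vm = vm;
  vm->addr = (void *)addr;
  spin_lock(&vmap_area_lock);
  insert_vmap_area(va, &vmap_area_root, &vmap_area_list);
  spin_unlock(&vmap_area_lock);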
Reviewed-by: "Chen, Tim C" <tim.c.chen@intel.com>
Reviewed-by: "King, Colin" <colin.king@intel.com>
Signed-off-by: rulinhuang <rulin.huang@intel.com>
---
mm/vmalloc.c | 31 +++++++++++++++----------------
1 file changed, 15 insertions(+), 16 deletions(-)
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index d12a17fc0c171..3b1f616e8ecf0 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -1577,13 +1577,13 @@ preload_this_cpu_lock(spinlock_t *lock, gfp_t gfp_mask, int node)
/*
* Allocate a region of KVA of the specified size and alignment, within the
- * vstart and vend.
+ * vstart and vend. If vm is passed in, the two will also be bound.
*/
static struct vmap_area *alloc_vmap_area(unsigned long size,
unsigned long align,
unsigned long vstart, unsigned long vend,
int node, gfp_t gfp_mask,
- unsigned long va_flags)
+ unsigned long va_flags, struct vm_struct *vm)
{
struct vmap_area *va;
unsigned long freed;
@@ -1627,9 +1627,12 @@ static struct vmap_area *alloc_vmap_area(unsigned long size,
va->va_start = addr;
va->va_end = addr + size;
- va->vm = NULL;
+ va->vm = vm;
va->flags = va_flags;
+ if (vm != NULL)
+ vm->addr = (void *)addr;
+
spin_lock(&vmap_area_lock);
insert_vmap_area(va, &vmap_area_root, &vmap_area_list);
spin_unlock(&vmap_area_lock);
@@ -2039,7 +2042,8 @@ static void *new_vmap_block(unsigned int order, gfp_t gfp_mask)
va = alloc_vmap_area(VMAP_BLOCK_SIZE, VMAP_BLOCK_SIZE,
VMALLOC_START, VMALLOC_END,
node, gfp_mask,
- VMAP_RAM|VMAP_BLOCK);
+ VMAP_RAM|VMAP_BLOCK,
+ NULL);
if (IS_ERR(va)) {
kfree(vb);
return ERR_CAST(va);
@@ -2394,7 +2398,8 @@ void *vm_map_ram(struct page **pages, unsigned int count, int node)
struct vmap_area *va;
va = alloc_vmap_area(size, PAGE_SIZE,
VMALLOC_START, VMALLOC_END,
- node, GFP_KERNEL, VMAP_RAM);
+ node, GFP_KERNEL, VMAP_RAM,
+ NULL);
if (IS_ERR(va))
return NULL;
@@ -2548,14 +2553,6 @@ static inline void setup_vmalloc_vm_locked(struct vm_struct *vm,
va->vm = vm;
}
-static void setup_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va,
- unsigned long flags, const void *caller)
-{
- spin_lock(&vmap_area_lock);
- setup_vmalloc_vm_locked(vm, va, flags, caller);
- spin_unlock(&vmap_area_lock);
-}
-
static void clear_vm_uninitialized_flag(struct vm_struct *vm)
{
/*
@@ -2592,14 +2589,16 @@ static struct vm_struct *__get_vm_area_node(unsigned long size,
if (!(flags & VM_NO_GUARD))
size += PAGE_SIZE;
- va = alloc_vmap_area(size, align, start, end, node, gfp_mask, 0);
+ area->flags = flags;
+ area->caller = caller;
+ area->size = size;
+
+ va = alloc_vmap_area(size, align, start, end, node, gfp_mask, 0, area);
if (IS_ERR(va)) {
kfree(area);
return NULL;
}
- setup_vmalloc_vm(area, va, flags, caller);
-
/*
* Mark pages for non-VM_ALLOC mappings as accessible. Do it now as a
* best-effort approach, as they can be mapped outside of vmalloc code.
base-commit: de927f6c0b07d9e698416c5b287c521b07694cac
--
2.43.0
On Tue, Feb 06, 2024 at 10:30:59PM -0500, rulinhuang wrote:
> When allocating a new memory area where the mapping address range is
> known, it is observed that the vmap_area lock is acquired twice.
> The first acquisition occurs in the alloc_vmap_area() function when
> inserting the vm area into the vm mapping red-black tree. The second
> acquisition occurs in the setup_vmalloc_vm() function when updating the
> properties of the vm, such as flags and address, etc.
> Combine these two operations together in alloc_vmap_area(), which
> improves scalability when the vmap_area lock is contended. By doing so,
> the need to acquire the lock twice can also be eliminated.
> With the above change, tested on intel icelake platform(160 vcpu, kernel
> v6.7), a 6% performance improvement and a 7% reduction in overall
> spinlock hotspot are gained on
> stress-ng/pthread(https://github.com/ColinIanKing/stress-ng), which is
> the stress test of thread creations.
>
> Reviewed-by: "Chen, Tim C" <tim.c.chen@intel.com>
> Reviewed-by: "King, Colin" <colin.king@intel.com>
> Signed-off-by: rulinhuang <rulin.huang@intel.com>
> ---
> mm/vmalloc.c | 31 +++++++++++++++----------------
> 1 file changed, 15 insertions(+), 16 deletions(-)
>
> diff --git a/mm/vmalloc.c b/mm/vmalloc.c
> index d12a17fc0c171..3b1f616e8ecf0 100644
> --- a/mm/vmalloc.c
> +++ b/mm/vmalloc.c
> @@ -1577,13 +1577,13 @@ preload_this_cpu_lock(spinlock_t *lock, gfp_t gfp_mask, int node)
>
> /*
> * Allocate a region of KVA of the specified size and alignment, within the
> - * vstart and vend.
> + * vstart and vend. If vm is passed in, the two will also be bound.
> */
> static struct vmap_area *alloc_vmap_area(unsigned long size,
> unsigned long align,
> unsigned long vstart, unsigned long vend,
> int node, gfp_t gfp_mask,
> - unsigned long va_flags)
> + unsigned long va_flags, struct vm_struct *vm)
> {
> struct vmap_area *va;
> unsigned long freed;
> @@ -1627,9 +1627,12 @@ static struct vmap_area *alloc_vmap_area(unsigned long size,
>
> va->va_start = addr;
> va->va_end = addr + size;
> - va->vm = NULL;
> + va->vm = vm;
> va->flags = va_flags;
>
> + if (vm != NULL)
> + vm->addr = (void *)addr;
> +
> spin_lock(&vmap_area_lock);
> insert_vmap_area(va, &vmap_area_root, &vmap_area_list);
> spin_unlock(&vmap_area_lock);
> @@ -2039,7 +2042,8 @@ static void *new_vmap_block(unsigned int order, gfp_t gfp_mask)
> va = alloc_vmap_area(VMAP_BLOCK_SIZE, VMAP_BLOCK_SIZE,
> VMALLOC_START, VMALLOC_END,
> node, gfp_mask,
> - VMAP_RAM|VMAP_BLOCK);
> + VMAP_RAM|VMAP_BLOCK,
> + NULL);
> if (IS_ERR(va)) {
> kfree(vb);
> return ERR_CAST(va);
> @@ -2394,7 +2398,8 @@ void *vm_map_ram(struct page **pages, unsigned int count, int node)
> struct vmap_area *va;
> va = alloc_vmap_area(size, PAGE_SIZE,
> VMALLOC_START, VMALLOC_END,
> - node, GFP_KERNEL, VMAP_RAM);
> + node, GFP_KERNEL, VMAP_RAM,
> + NULL);
> if (IS_ERR(va))
> return NULL;
>
> @@ -2548,14 +2553,6 @@ static inline void setup_vmalloc_vm_locked(struct vm_struct *vm,
> va->vm = vm;
> }
>
> -static void setup_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va,
> - unsigned long flags, const void *caller)
> -{
> - spin_lock(&vmap_area_lock);
> - setup_vmalloc_vm_locked(vm, va, flags, caller);
> - spin_unlock(&vmap_area_lock);
> -}
> -
> static void clear_vm_uninitialized_flag(struct vm_struct *vm)
> {
> /*
> @@ -2592,14 +2589,16 @@ static struct vm_struct *__get_vm_area_node(unsigned long size,
> if (!(flags & VM_NO_GUARD))
> size += PAGE_SIZE;
>
> - va = alloc_vmap_area(size, align, start, end, node, gfp_mask, 0);
> + area->flags = flags;
> + area->caller = caller;
> + area->size = size;
> +
> + va = alloc_vmap_area(size, align, start, end, node, gfp_mask, 0, area);
> if (IS_ERR(va)) {
> kfree(area);
> return NULL;
> }
>
> - setup_vmalloc_vm(area, va, flags, caller);
> -
> /*
>
Thank you for improving this! That has been on my radar for quite a long
time :)

Some comments, though. I think that such a "partial" way of initializing
the "vm_struct", or "vm", is not optimal because the initialization
becomes spread between two places.

Also, I think we should not remove the setup_vmalloc_vm() function;
instead we can move it to a place where the VA has not yet been inserted
into the tree, i.e. initialize before insert. The "locked" variant can
then be renamed to setup_vmalloc_vm().
Also, could you please post how you tested this patch and the stress-ng
figures? That is really useful for other people to know.
Thanks!
--
Uladzislau Rezki
Hi Rezki, we have submitted patch v2 to avoid the partial initialization
issue of vm's members and separated insert_vmap_area() from
alloc_vmap_area(), so that setup_vmalloc_vm() no longer needs lock
protection. We have also verified an improvement of 6.3% at 160 vcpu on
the intel icelake platform with this patch. Thank you for your patience.
Hello, Rulinhuang!

> Hi Rezki, we have submitted patch v2 to avoid the partial
> initialization issue of vm's members and separated insert_vmap_area()
> from alloc_vmap_area() so that setup_vmalloc_vm() has no need to
> require lock protection. We have also verified the improvement of
> 6.3% at 160 vcpu on intel icelake platform with this patch.
> Thank you for your patience.
>
Please work on the mm-unstable and try to measure it on the latest tip.

--
Uladzislau Rezki
Hi Rezki, thanks so much for your review. Exactly, your suggestions could
effectively enhance the maintainability and readability of this code
change as well as the whole vmalloc implementation.

To avoid the partial initialization issue, we are trying to refine this
patch by separating insert_vmap_area(), the insertion of the VA into the
tree, from alloc_vmap_area(), so that setup_vmalloc_vm() can be invoked
between them. However, our initial trial ran into a boot-time error,
which we are still debugging, and it may take a little longer than
expected as the coming week is the public holiday of Lunar New Year in
China. We will share the latest version of the patch with you once it is
ready for review.

In the performance test, we first build stress-ng by following the
instructions from https://github.com/ColinIanKing/stress-ng, and then
launch the pthread stressor (--pthread) for 30 seconds (-t 30) via the
command below:

  ./stress-ng -t 30 --metrics-brief --pthread -1 --no-rand-seed

The aggregated count of spawned threads per second (Bogo ops/s) is taken
as the score of this workload. We evaluated the performance impact of
this patch on the Ice Lake server with 40, 80, 120 and 160 online cores
respectively. As shown in the table below, with the expansion of online
cores this patch relieves the increasingly severe lock contention and
achieves a quite significant performance improvement of around 5.5% at
160 cores.

  vcpu number       40      80      120     160
  patched/original  100.5%  100.8%  105.2%  105.5%

Thanks again for your help and please let us know if more details are
needed.
When allocating a new memory area where the mapping address range is
known, it is observed that the vmap_area lock is acquired twice.

The first acquisition occurs in the alloc_vmap_area() function, when
inserting the vm area into the vm mapping red-black tree. The second
acquisition occurs in the setup_vmalloc_vm() function, when updating
properties of the vm such as flags and address.

Combine these two operations in alloc_vmap_area(), which improves
scalability when the vmap_area lock is contended. By doing so, the need
to acquire the lock twice is also eliminated.

With the above change, tested on an intel icelake platform (160 vcpu,
kernel v6.7), a 6% performance improvement and a 7% reduction in the
overall spinlock hotspot are gained on stress-ng/pthread
(https://github.com/ColinIanKing/stress-ng), which stress-tests thread
creation.
Reviewed-by: Chen Tim C <tim.c.chen@intel.com>
Reviewed-by: King Colin <colin.king@intel.com>
Signed-off-by: rulinhuang <rulin.huang@intel.com>
---
V1 -> V2: Avoided the partial initialization issue of vm and
separated insert_vmap_area() from alloc_vmap_area()
---
mm/vmalloc.c | 36 +++++++++++++++++++++---------------
1 file changed, 21 insertions(+), 15 deletions(-)
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index d12a17fc0c171..768e45f2ed946 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -1630,17 +1630,18 @@ static struct vmap_area *alloc_vmap_area(unsigned long size,
va->vm = NULL;
va->flags = va_flags;
- spin_lock(&vmap_area_lock);
- insert_vmap_area(va, &vmap_area_root, &vmap_area_list);
- spin_unlock(&vmap_area_lock);
-
BUG_ON(!IS_ALIGNED(va->va_start, align));
BUG_ON(va->va_start < vstart);
BUG_ON(va->va_end > vend);
ret = kasan_populate_vmalloc(addr, size);
if (ret) {
- free_vmap_area(va);
+ /*
+ * Insert/Merge it back to the free tree/list.
+ */
+ spin_lock(&free_vmap_area_lock);
+ merge_or_add_vmap_area_augment(va, &free_vmap_area_root, &free_vmap_area_list);
+ spin_unlock(&free_vmap_area_lock);
return ERR_PTR(ret);
}
@@ -1669,6 +1670,13 @@ static struct vmap_area *alloc_vmap_area(unsigned long size,
return ERR_PTR(-EBUSY);
}
+static inline void insert_vmap_area_with_lock(struct vmap_area *va)
+{
+ spin_lock(&vmap_area_lock);
+ insert_vmap_area(va, &vmap_area_root, &vmap_area_list);
+ spin_unlock(&vmap_area_lock);
+}
+
int register_vmap_purge_notifier(struct notifier_block *nb)
{
return blocking_notifier_chain_register(&vmap_notify_list, nb);
@@ -2045,6 +2053,8 @@ static void *new_vmap_block(unsigned int order, gfp_t gfp_mask)
return ERR_CAST(va);
}
+ insert_vmap_area_with_lock(va);
+
vaddr = vmap_block_vaddr(va->va_start, 0);
spin_lock_init(&vb->lock);
vb->va = va;
@@ -2398,6 +2408,8 @@ void *vm_map_ram(struct page **pages, unsigned int count, int node)
if (IS_ERR(va))
return NULL;
+ insert_vmap_area_with_lock(va);
+
addr = va->va_start;
mem = (void *)addr;
}
@@ -2538,7 +2550,7 @@ static void vmap_init_free_space(void)
}
}
-static inline void setup_vmalloc_vm_locked(struct vm_struct *vm,
+static inline void setup_vmalloc_vm(struct vm_struct *vm,
struct vmap_area *va, unsigned long flags, const void *caller)
{
vm->flags = flags;
@@ -2548,14 +2560,6 @@ static inline void setup_vmalloc_vm_locked(struct vm_struct *vm,
va->vm = vm;
}
-static void setup_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va,
- unsigned long flags, const void *caller)
-{
- spin_lock(&vmap_area_lock);
- setup_vmalloc_vm_locked(vm, va, flags, caller);
- spin_unlock(&vmap_area_lock);
-}
-
static void clear_vm_uninitialized_flag(struct vm_struct *vm)
{
/*
@@ -2600,6 +2604,8 @@ static struct vm_struct *__get_vm_area_node(unsigned long size,
setup_vmalloc_vm(area, va, flags, caller);
+ insert_vmap_area_with_lock(va);
+
/*
* Mark pages for non-VM_ALLOC mappings as accessible. Do it now as a
* best-effort approach, as they can be mapped outside of vmalloc code.
@@ -4166,7 +4172,7 @@ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets,
for (area = 0; area < nr_vms; area++) {
insert_vmap_area(vas[area], &vmap_area_root, &vmap_area_list);
- setup_vmalloc_vm_locked(vms[area], vas[area], VM_ALLOC,
+ setup_vmalloc_vm(vms[area], vas[area], VM_ALLOC,
pcpu_get_vm_areas);
}
spin_unlock(&vmap_area_lock);
base-commit: de927f6c0b07d9e698416c5b287c521b07694cac
--
2.43.0
On Tue, 20 Feb 2024 04:05:21 -0500 rulinhuang <rulin.huang@intel.com> wrote:

> When allocating a new memory area where the mapping address range is
> known, it is observed that the vmap_area lock is acquired twice.
> The first acquisition occurs in the alloc_vmap_area() function when
> inserting the vm area into the vm mapping red-black tree. The second
> acquisition occurs in the setup_vmalloc_vm() function when updating the
> properties of the vm, such as flags and address, etc.
> Combine these two operations together in alloc_vmap_area(), which
> improves scalability when the vmap_area lock is contended. By doing so,
> the need to acquire the lock twice can also be eliminated.
> With the above change, tested on intel icelake platform(160 vcpu, kernel
> v6.7), a 6% performance improvement and a 7% reduction in overall
> spinlock hotspot are gained on
> stress-ng/pthread(https://github.com/ColinIanKing/stress-ng), which is
> the stress test of thread creations.

vmalloc.c has changed a lot lately. Please can you work against
linux-next or against the mm-unstable branch of
//git.kernel.org/pub/scm/linux/kernel/git/akpm/mm

Thanks.
Hi Andrew, we have rebased it on 6.8-rc5. Thank you for your review.
When allocating a new memory area where the mapping address range is
known, it is observed that the vmap_area lock is acquired twice.

The first acquisition occurs in the alloc_vmap_area() function, when
inserting the vm area into the vm mapping red-black tree. The second
acquisition occurs in the setup_vmalloc_vm() function, when updating
properties of the vm such as flags and address.

Combine these two operations in alloc_vmap_area(), which improves
scalability when the vmap_area lock is contended. By doing so, the need
to acquire the lock twice is also eliminated.

With the above change, tested on an intel icelake platform (160 vcpu,
kernel v6.7), a 6% performance improvement and a 7% reduction in the
overall spinlock hotspot are gained on stress-ng/pthread
(https://github.com/ColinIanKing/stress-ng), which stress-tests thread
creation.
Reviewed-by: Chen Tim C <tim.c.chen@intel.com>
Reviewed-by: King Colin <colin.king@intel.com>
Signed-off-by: rulinhuang <rulin.huang@intel.com>
---
V1 -> V2: Avoided the partial initialization issue of vm and
separated insert_vmap_area() from alloc_vmap_area()
V2 -> V3: Rebased on 6.8-rc5
V3 -> V4: Rebased on mm-unstable branch
V4 -> V5: cancel the split of alloc_vmap_area()
and keep insert_vmap_area()
---
mm/vmalloc.c | 48 ++++++++++++++++++++++--------------------------
1 file changed, 22 insertions(+), 26 deletions(-)
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 25a8df497255..6baaf08737f8 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -1841,15 +1841,26 @@ node_alloc(unsigned long size, unsigned long align,
return va;
}
+static inline void setup_vmalloc_vm(struct vm_struct *vm,
+ struct vmap_area *va, unsigned long flags, const void *caller)
+{
+ vm->flags = flags;
+ vm->addr = (void *)va->va_start;
+ vm->size = va->va_end - va->va_start;
+ vm->caller = caller;
+ va->vm = vm;
+}
+
/*
* Allocate a region of KVA of the specified size and alignment, within the
- * vstart and vend.
+ * vstart and vend. If vm is passed in, the two will also be bound.
*/
static struct vmap_area *alloc_vmap_area(unsigned long size,
unsigned long align,
unsigned long vstart, unsigned long vend,
int node, gfp_t gfp_mask,
- unsigned long va_flags)
+ unsigned long va_flags, struct vm_struct *vm,
+ unsigned long flags, const void *caller)
{
struct vmap_node *vn;
struct vmap_area *va;
@@ -1912,6 +1923,9 @@ static struct vmap_area *alloc_vmap_area(unsigned long size,
va->vm = NULL;
va->flags = (va_flags | vn_id);
+ if (vm)
+ setup_vmalloc_vm(vm, va, flags, caller);
+
vn = addr_to_node(va->va_start);
spin_lock(&vn->busy.lock);
@@ -2486,7 +2500,8 @@ static void *new_vmap_block(unsigned int order, gfp_t gfp_mask)
va = alloc_vmap_area(VMAP_BLOCK_SIZE, VMAP_BLOCK_SIZE,
VMALLOC_START, VMALLOC_END,
node, gfp_mask,
- VMAP_RAM|VMAP_BLOCK);
+ VMAP_RAM|VMAP_BLOCK, NULL,
+ 0, NULL);
if (IS_ERR(va)) {
kfree(vb);
return ERR_CAST(va);
@@ -2843,7 +2858,8 @@ void *vm_map_ram(struct page **pages, unsigned int count, int node)
struct vmap_area *va;
va = alloc_vmap_area(size, PAGE_SIZE,
VMALLOC_START, VMALLOC_END,
- node, GFP_KERNEL, VMAP_RAM);
+ node, GFP_KERNEL, VMAP_RAM,
+ NULL, 0, NULL);
if (IS_ERR(va))
return NULL;
@@ -2946,26 +2962,6 @@ void __init vm_area_register_early(struct vm_struct *vm, size_t align)
kasan_populate_early_vm_area_shadow(vm->addr, vm->size);
}
-static inline void setup_vmalloc_vm_locked(struct vm_struct *vm,
- struct vmap_area *va, unsigned long flags, const void *caller)
-{
- vm->flags = flags;
- vm->addr = (void *)va->va_start;
- vm->size = va->va_end - va->va_start;
- vm->caller = caller;
- va->vm = vm;
-}
-
-static void setup_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va,
- unsigned long flags, const void *caller)
-{
- struct vmap_node *vn = addr_to_node(va->va_start);
-
- spin_lock(&vn->busy.lock);
- setup_vmalloc_vm_locked(vm, va, flags, caller);
- spin_unlock(&vn->busy.lock);
-}
-
static void clear_vm_uninitialized_flag(struct vm_struct *vm)
{
/*
@@ -3002,7 +2998,7 @@ static struct vm_struct *__get_vm_area_node(unsigned long size,
if (!(flags & VM_NO_GUARD))
size += PAGE_SIZE;
- va = alloc_vmap_area(size, align, start, end, node, gfp_mask, 0);
+ va = alloc_vmap_area(size, align, start, end, node, gfp_mask, 0, area, flags, caller);
if (IS_ERR(va)) {
kfree(area);
return NULL;
@@ -4584,7 +4580,7 @@ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets,
spin_lock(&vn->busy.lock);
insert_vmap_area(vas[area], &vn->busy.root, &vn->busy.head);
- setup_vmalloc_vm_locked(vms[area], vas[area], VM_ALLOC,
+ setup_vmalloc_vm(vms[area], vas[area], VM_ALLOC,
pcpu_get_vm_areas);
spin_unlock(&vn->busy.lock);
}
base-commit: c09a8e005eff6c064e2e9f11549966c36a724fbf
--
2.43.0
Hi Rulin,
On 02/23/24 at 08:03am, rulinhuang wrote:
> When allocating a new memory area where the mapping address range is
> known, it is observed that the vmap_area lock is acquired twice.
> The first acquisition occurs in the alloc_vmap_area() function when
> inserting the vm area into the vm mapping red-black tree. The second
> acquisition occurs in the setup_vmalloc_vm() function when updating the
> properties of the vm, such as flags and address, etc.
> Combine these two operations together in alloc_vmap_area(), which
> improves scalability when the vmap_area lock is contended. By doing so,
> the need to acquire the lock twice can also be eliminated.
> With the above change, tested on intel icelake platform(160 vcpu, kernel
> v6.7), a 6% performance improvement and a 7% reduction in overall
> spinlock hotspot are gained on
> stress-ng/pthread(https://github.com/ColinIanKing/stress-ng), which is
> the stress test of thread creations.
This log can be rearranged into several paragraphs for easier reading.
>
> Reviewed-by: Chen Tim C <tim.c.chen@intel.com>
> Reviewed-by: King Colin <colin.king@intel.com>
Since you have taken a different way to fix this, these Reviewed-by tags
from the old solution and patch should be removed. Carrying them over is
not appropriate.
Yeah, this v5 is what I suggested in the draft. It looks good to me.
There's one concern in the code, please see the inline comment.
> Signed-off-by: rulinhuang <rulin.huang@intel.com>
> ---
> V1 -> V2: Avoided the partial initialization issue of vm and
> separated insert_vmap_area() from alloc_vmap_area()
> V2 -> V3: Rebased on 6.8-rc5
> V3 -> V4: Rebased on mm-unstable branch
> V4 -> V5: cancel the split of alloc_vmap_area()
> and keep insert_vmap_area()
> ---
> mm/vmalloc.c | 48 ++++++++++++++++++++++--------------------------
> 1 file changed, 22 insertions(+), 26 deletions(-)
>
> diff --git a/mm/vmalloc.c b/mm/vmalloc.c
> index 25a8df497255..6baaf08737f8 100644
> --- a/mm/vmalloc.c
> +++ b/mm/vmalloc.c
> @@ -1841,15 +1841,26 @@ node_alloc(unsigned long size, unsigned long align,
> return va;
> }
>
> +static inline void setup_vmalloc_vm(struct vm_struct *vm,
> + struct vmap_area *va, unsigned long flags, const void *caller)
> +{
> + vm->flags = flags;
> + vm->addr = (void *)va->va_start;
> + vm->size = va->va_end - va->va_start;
> + vm->caller = caller;
> + va->vm = vm;
> +}
> +
> /*
> * Allocate a region of KVA of the specified size and alignment, within the
> - * vstart and vend.
> + * vstart and vend. If vm is passed in, the two will also be bound.
> */
> static struct vmap_area *alloc_vmap_area(unsigned long size,
> unsigned long align,
> unsigned long vstart, unsigned long vend,
> int node, gfp_t gfp_mask,
> - unsigned long va_flags)
> + unsigned long va_flags, struct vm_struct *vm,
> + unsigned long flags, const void *caller)
> {
> struct vmap_node *vn;
> struct vmap_area *va;
> @@ -1912,6 +1923,9 @@ static struct vmap_area *alloc_vmap_area(unsigned long size,
> va->vm = NULL;
> va->flags = (va_flags | vn_id);
>
We may need to add a BUG_ON() or WARN_ON() checking whether
(va_flags & VMAP_RAM) is true while vm is not NULL. Not sure if this is
overthinking.
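For illustration, such a sanity check could be as small as the following
(just a sketch, not part of the posted patch):

	/* VMAP_RAM users are expected to pass vm == NULL. */
	WARN_ON((va_flags & VMAP_RAM) && vm);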
By the way, can you post it separately if you decide to post a v6 to
polish the log and remove the ack tags? Sometimes adding all posts in one
thread looks confusing.
Thanks
Baoquan
> + if (vm)
> + setup_vmalloc_vm(vm, va, flags, caller);
> +
> vn = addr_to_node(va->va_start);
>
> spin_lock(&vn->busy.lock);
> @@ -2486,7 +2500,8 @@ static void *new_vmap_block(unsigned int order, gfp_t gfp_mask)
> va = alloc_vmap_area(VMAP_BLOCK_SIZE, VMAP_BLOCK_SIZE,
> VMALLOC_START, VMALLOC_END,
> node, gfp_mask,
> - VMAP_RAM|VMAP_BLOCK);
> + VMAP_RAM|VMAP_BLOCK, NULL,
> + 0, NULL);
> if (IS_ERR(va)) {
> kfree(vb);
> return ERR_CAST(va);
> @@ -2843,7 +2858,8 @@ void *vm_map_ram(struct page **pages, unsigned int count, int node)
> struct vmap_area *va;
> va = alloc_vmap_area(size, PAGE_SIZE,
> VMALLOC_START, VMALLOC_END,
> - node, GFP_KERNEL, VMAP_RAM);
> + node, GFP_KERNEL, VMAP_RAM,
> + NULL, 0, NULL);
> if (IS_ERR(va))
> return NULL;
>
> @@ -2946,26 +2962,6 @@ void __init vm_area_register_early(struct vm_struct *vm, size_t align)
> kasan_populate_early_vm_area_shadow(vm->addr, vm->size);
> }
>
> -static inline void setup_vmalloc_vm_locked(struct vm_struct *vm,
> - struct vmap_area *va, unsigned long flags, const void *caller)
> -{
> - vm->flags = flags;
> - vm->addr = (void *)va->va_start;
> - vm->size = va->va_end - va->va_start;
> - vm->caller = caller;
> - va->vm = vm;
> -}
> -
> -static void setup_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va,
> - unsigned long flags, const void *caller)
> -{
> - struct vmap_node *vn = addr_to_node(va->va_start);
> -
> - spin_lock(&vn->busy.lock);
> - setup_vmalloc_vm_locked(vm, va, flags, caller);
> - spin_unlock(&vn->busy.lock);
> -}
> -
> static void clear_vm_uninitialized_flag(struct vm_struct *vm)
> {
> /*
> @@ -3002,7 +2998,7 @@ static struct vm_struct *__get_vm_area_node(unsigned long size,
> if (!(flags & VM_NO_GUARD))
> size += PAGE_SIZE;
>
> - va = alloc_vmap_area(size, align, start, end, node, gfp_mask, 0);
> + va = alloc_vmap_area(size, align, start, end, node, gfp_mask, 0, area, flags, caller);
> if (IS_ERR(va)) {
> kfree(area);
> return NULL;
> @@ -4584,7 +4580,7 @@ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets,
>
> spin_lock(&vn->busy.lock);
> insert_vmap_area(vas[area], &vn->busy.root, &vn->busy.head);
> - setup_vmalloc_vm_locked(vms[area], vas[area], VM_ALLOC,
> + setup_vmalloc_vm(vms[area], vas[area], VM_ALLOC,
> pcpu_get_vm_areas);
> spin_unlock(&vn->busy.lock);
> }
>
> base-commit: c09a8e005eff6c064e2e9f11549966c36a724fbf
> --
> 2.43.0
>
When allocating a new memory area where the mapping address range is
known, it is observed that the vmap_area lock is acquired twice.

The first acquisition occurs in the alloc_vmap_area() function, when
inserting the vm area into the vm mapping red-black tree. The second
acquisition occurs in the setup_vmalloc_vm() function, when updating
properties of the vm such as flags and address.

Combine these two operations in alloc_vmap_area(), which improves
scalability when the vmap_area lock is contended. By doing so, the need
to acquire the lock twice is also eliminated.

With the above change, tested on an intel icelake platform (160 vcpu,
kernel v6.7), a 6% performance improvement and a 7% reduction in the
overall spinlock hotspot are gained on stress-ng/pthread
(https://github.com/ColinIanKing/stress-ng), which stress-tests thread
creation.
Reviewed-by: Chen Tim C <tim.c.chen@intel.com>
Reviewed-by: King Colin <colin.king@intel.com>
Signed-off-by: rulinhuang <rulin.huang@intel.com>
---
V1 -> V2: Avoided the partial initialization issue of vm and
separated insert_vmap_area() from alloc_vmap_area()
V2 -> V3: Rebased on 6.8-rc5
---
mm/vmalloc.c | 36 +++++++++++++++++++++---------------
1 file changed, 21 insertions(+), 15 deletions(-)
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index d12a17fc0c17..768e45f2ed94 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -1630,17 +1630,18 @@ static struct vmap_area *alloc_vmap_area(unsigned long size,
va->vm = NULL;
va->flags = va_flags;
- spin_lock(&vmap_area_lock);
- insert_vmap_area(va, &vmap_area_root, &vmap_area_list);
- spin_unlock(&vmap_area_lock);
-
BUG_ON(!IS_ALIGNED(va->va_start, align));
BUG_ON(va->va_start < vstart);
BUG_ON(va->va_end > vend);
ret = kasan_populate_vmalloc(addr, size);
if (ret) {
- free_vmap_area(va);
+ /*
+ * Insert/Merge it back to the free tree/list.
+ */
+ spin_lock(&free_vmap_area_lock);
+ merge_or_add_vmap_area_augment(va, &free_vmap_area_root, &free_vmap_area_list);
+ spin_unlock(&free_vmap_area_lock);
return ERR_PTR(ret);
}
@@ -1669,6 +1670,13 @@ static struct vmap_area *alloc_vmap_area(unsigned long size,
return ERR_PTR(-EBUSY);
}
+static inline void insert_vmap_area_with_lock(struct vmap_area *va)
+{
+ spin_lock(&vmap_area_lock);
+ insert_vmap_area(va, &vmap_area_root, &vmap_area_list);
+ spin_unlock(&vmap_area_lock);
+}
+
int register_vmap_purge_notifier(struct notifier_block *nb)
{
return blocking_notifier_chain_register(&vmap_notify_list, nb);
@@ -2045,6 +2053,8 @@ static void *new_vmap_block(unsigned int order, gfp_t gfp_mask)
return ERR_CAST(va);
}
+ insert_vmap_area_with_lock(va);
+
vaddr = vmap_block_vaddr(va->va_start, 0);
spin_lock_init(&vb->lock);
vb->va = va;
@@ -2398,6 +2408,8 @@ void *vm_map_ram(struct page **pages, unsigned int count, int node)
if (IS_ERR(va))
return NULL;
+ insert_vmap_area_with_lock(va);
+
addr = va->va_start;
mem = (void *)addr;
}
@@ -2538,7 +2550,7 @@ static void vmap_init_free_space(void)
}
}
-static inline void setup_vmalloc_vm_locked(struct vm_struct *vm,
+static inline void setup_vmalloc_vm(struct vm_struct *vm,
struct vmap_area *va, unsigned long flags, const void *caller)
{
vm->flags = flags;
@@ -2548,14 +2560,6 @@ static inline void setup_vmalloc_vm_locked(struct vm_struct *vm,
va->vm = vm;
}
-static void setup_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va,
- unsigned long flags, const void *caller)
-{
- spin_lock(&vmap_area_lock);
- setup_vmalloc_vm_locked(vm, va, flags, caller);
- spin_unlock(&vmap_area_lock);
-}
-
static void clear_vm_uninitialized_flag(struct vm_struct *vm)
{
/*
@@ -2600,6 +2604,8 @@ static struct vm_struct *__get_vm_area_node(unsigned long size,
setup_vmalloc_vm(area, va, flags, caller);
+ insert_vmap_area_with_lock(va);
+
/*
* Mark pages for non-VM_ALLOC mappings as accessible. Do it now as a
* best-effort approach, as they can be mapped outside of vmalloc code.
@@ -4166,7 +4172,7 @@ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets,
for (area = 0; area < nr_vms; area++) {
insert_vmap_area(vas[area], &vmap_area_root, &vmap_area_list);
- setup_vmalloc_vm_locked(vms[area], vas[area], VM_ALLOC,
+ setup_vmalloc_vm(vms[area], vas[area], VM_ALLOC,
pcpu_get_vm_areas);
}
spin_unlock(&vmap_area_lock);
base-commit: b401b621758e46812da61fa58a67c3fd8d91de0d
--
2.43.0
On Tue, Feb 20, 2024 at 10:29:05PM -0500, rulinhuang wrote:
> When allocating a new memory area where the mapping address range is
> known, it is observed that the vmap_area lock is acquired twice.
> The first acquisition occurs in the alloc_vmap_area() function when
> inserting the vm area into the vm mapping red-black tree. The second
> acquisition occurs in the setup_vmalloc_vm() function when updating the
> properties of the vm, such as flags and address, etc.
> Combine these two operations together in alloc_vmap_area(), which
> improves scalability when the vmap_area lock is contended. By doing so,
> the need to acquire the lock twice can also be eliminated.
> With the above change, tested on intel icelake platform(160 vcpu, kernel
> v6.7), a 6% performance improvement and a 7% reduction in overall
> spinlock hotspot are gained on
> stress-ng/pthread(https://github.com/ColinIanKing/stress-ng), which is
> the stress test of thread creations.
>
> Reviewed-by: Chen Tim C <tim.c.chen@intel.com>
> Reviewed-by: King Colin <colin.king@intel.com>
> Signed-off-by: rulinhuang <rulin.huang@intel.com>
> ---
> V1 -> V2: Avoided the partial initialization issue of vm and
> separated insert_vmap_area() from alloc_vmap_area()
> V2 -> V3: Rebased on 6.8-rc5
> ---
> mm/vmalloc.c | 36 +++++++++++++++++++++---------------
> 1 file changed, 21 insertions(+), 15 deletions(-)
>
> diff --git a/mm/vmalloc.c b/mm/vmalloc.c
> index d12a17fc0c17..768e45f2ed94 100644
> --- a/mm/vmalloc.c
> +++ b/mm/vmalloc.c
> @@ -1630,17 +1630,18 @@ static struct vmap_area *alloc_vmap_area(unsigned long size,
> va->vm = NULL;
> va->flags = va_flags;
>
> - spin_lock(&vmap_area_lock);
> - insert_vmap_area(va, &vmap_area_root, &vmap_area_list);
> - spin_unlock(&vmap_area_lock);
> -
> BUG_ON(!IS_ALIGNED(va->va_start, align));
> BUG_ON(va->va_start < vstart);
> BUG_ON(va->va_end > vend);
>
> ret = kasan_populate_vmalloc(addr, size);
> if (ret) {
> - free_vmap_area(va);
> + /*
> + * Insert/Merge it back to the free tree/list.
> + */
> + spin_lock(&free_vmap_area_lock);
> + merge_or_add_vmap_area_augment(va, &free_vmap_area_root, &free_vmap_area_list);
> + spin_unlock(&free_vmap_area_lock);
> return ERR_PTR(ret);
> }
>
> @@ -1669,6 +1670,13 @@ static struct vmap_area *alloc_vmap_area(unsigned long size,
> return ERR_PTR(-EBUSY);
> }
>
> +static inline void insert_vmap_area_with_lock(struct vmap_area *va)
> +{
> + spin_lock(&vmap_area_lock);
> + insert_vmap_area(va, &vmap_area_root, &vmap_area_list);
> + spin_unlock(&vmap_area_lock);
> +}
> +
> int register_vmap_purge_notifier(struct notifier_block *nb)
> {
> return blocking_notifier_chain_register(&vmap_notify_list, nb);
> @@ -2045,6 +2053,8 @@ static void *new_vmap_block(unsigned int order, gfp_t gfp_mask)
> return ERR_CAST(va);
> }
>
> + insert_vmap_area_with_lock(va);
> +
> vaddr = vmap_block_vaddr(va->va_start, 0);
> spin_lock_init(&vb->lock);
> vb->va = va;
> @@ -2398,6 +2408,8 @@ void *vm_map_ram(struct page **pages, unsigned int count, int node)
> if (IS_ERR(va))
> return NULL;
>
> + insert_vmap_area_with_lock(va);
> +
> addr = va->va_start;
> mem = (void *)addr;
> }
> @@ -2538,7 +2550,7 @@ static void vmap_init_free_space(void)
> }
> }
>
> -static inline void setup_vmalloc_vm_locked(struct vm_struct *vm,
> +static inline void setup_vmalloc_vm(struct vm_struct *vm,
> struct vmap_area *va, unsigned long flags, const void *caller)
> {
> vm->flags = flags;
> @@ -2548,14 +2560,6 @@ static inline void setup_vmalloc_vm_locked(struct vm_struct *vm,
> va->vm = vm;
> }
>
> -static void setup_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va,
> - unsigned long flags, const void *caller)
> -{
> - spin_lock(&vmap_area_lock);
> - setup_vmalloc_vm_locked(vm, va, flags, caller);
> - spin_unlock(&vmap_area_lock);
> -}
> -
> static void clear_vm_uninitialized_flag(struct vm_struct *vm)
> {
> /*
> @@ -2600,6 +2604,8 @@ static struct vm_struct *__get_vm_area_node(unsigned long size,
>
> setup_vmalloc_vm(area, va, flags, caller);
>
> + insert_vmap_area_with_lock(va);
> +
>
> /*
> * Mark pages for non-VM_ALLOC mappings as accessible. Do it now as a
> * best-effort approach, as they can be mapped outside of vmalloc code.
> @@ -4166,7 +4172,7 @@ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets,
> for (area = 0; area < nr_vms; area++) {
> insert_vmap_area(vas[area], &vmap_area_root, &vmap_area_list);
>
> - setup_vmalloc_vm_locked(vms[area], vas[area], VM_ALLOC,
> + setup_vmalloc_vm(vms[area], vas[area], VM_ALLOC,
> pcpu_get_vm_areas);
> }
> spin_unlock(&vmap_area_lock);
>
> base-commit: b401b621758e46812da61fa58a67c3fd8d91de0d
> --
> 2.43.0
>
Spreading insert_vmap_area_with_lock() across several callers, like
__get_vm_area_node(), new_vmap_block(), vm_map_ram(), etc., is not a good
approach, simply because it changes the behaviour and people might miss
this point.

Could you please re-spin it on mm-unstable, because the vmalloc code has
changed a lot? From my side I can check and help you figure out how to
fix it in a better way, because the v3 should be improved anyway.

Apparently I have not seen your messages for some reason; I do not
understand why. I started to get emails with the below subject:
"Bounce probe for linux-kernel@vger.kernel.org (no action required)"
--
Uladzislau Rezki
Hi Uladzislau and Andrew, we have rebased it (patch v4) on branch
mm-unstable and remeasured it. Could you kindly help confirm if this is
the right base to work on?

Compared to the previous result at kernel v6.7 with a 5% performance gain
on intel icelake (160 vcpu), we only had a 0.6% with this commit base.
But we think our modification still has some significance. On the one
hand, this does reduce a critical section. On the other hand, we have a
4% performance gain on intel sapphire rapids (224 vcpu), which suggests
more performance improvement would likely be achieved when the core count
of processors increases to hundreds or even thousands.

Thank you again for your comments.
Hello, Rulinhuang!

> Hi Uladzislau and Andrew, we have rebased it (patch v4) on branch
> mm-unstable and remeasured it. Could you kindly help confirm if
> this is the right base to work on?
> Compared to the previous result at kernel v6.7 with a 5% performance
> gain on intel icelake (160 vcpu), we only had a 0.6% with this commit
> base. But we think our modification still has some significance. On
> the one hand, this does reduce a critical section. On the other hand,
> we have a 4% performance gain on intel sapphire rapids (224 vcpu),
> which suggests more performance improvement would likely be achieved
> when the core count of processors increases to hundreds or
> even thousands.
> Thank you again for your comments.
>
According to the patch, that was a correct rebase. Right, the small delta
on your 160 CPUs is because of removing a contention point. For bigger
systems the impact is larger, as you point out with your 224 vcpu
results, where you see a 4% perf improvement.

So we should fix it. But the way it is fixed is not optimal from my point
of view, because the patch in question now spreads the internals of
alloc_vmap_area(), like inserting the busy area, across many parts.

--
Uladzislau Rezki
>
> Hello, Rulinhuang!
>
> > Hi Uladzislau and Andrew, we have rebased it (patch v4) on branch
> > mm-unstable and remeasured it. Could you kindly help confirm if this
> > is the right base to work on?
> > Compared to the previous result at kernel v6.7 with a 5% performance
> > gain on intel icelake (160 vcpu), we only had a 0.6% with this commit
> > base. But we think our modification still has some significance. On
> > the one hand, this does reduce a critical section. On the other hand,
> > we have a 4% performance gain on intel sapphire rapids (224 vcpu),
> > which suggests more performance improvement would likely be achieved
> > when the core count of processors increases to hundreds or even
> > thousands.
> > Thank you again for your comments.
> >
> According to the patch that was a correct rebase. Right a small delta
> on your 160 CPUs is because of removing a contention. As for bigger
> systems it is bigger impact, like you point here on your 224 vcpu
> results where you see %4 perf improvement.
>
> So we should fix it. But the way how it is fixed is not optimal from
> my point of view, because the patch that is in question spreads the
> internals from alloc_vmap_area(), like inserting busy area, across
> many parts now.
>
> --
> Uladzislau Rezki

Our modifications in patch v5 not only achieve the original effect but
also cancel the split of alloc_vmap_area(); setup_vmalloc_vm() is placed
outside the lock, so the critical section is not lengthened. Without
splitting alloc_vmap_area(), putting setup_vmalloc_vm() directly into it
is all we can think of.

Regarding Baoquan's changes, we think that:
- We prefer that setup_vmalloc_vm() not be placed inside the critical
  section; there is no need to lengthen the critical section.
- We prefer judging (vm_data) rather than (!(va_flags & VMAP_RAM) && vm);
  it is enough to determine the condition for the assignment, and the
  extra check seems to wander into the judgment of va_flags.

Hi Uladzislau, could you please let us know if you have any better
suggestions on the modification scheme? Thank you for your advice!
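For reference, the two placements under discussion look roughly like
this, condensed from the v5 patch earlier in the thread and from
Baoquan's draft later in the thread:

  /* v5: initialize before taking the lock, guarded only by vm. */
  if (vm)
	  setup_vmalloc_vm(vm, va, flags, caller);

  vn = addr_to_node(va->va_start);

  spin_lock(&vn->busy.lock);
  insert_vmap_area(va, &vn->busy.root, &vn->busy.head);
  spin_unlock(&vn->busy.lock);

  /* Baoquan's draft: initialize under the lock, with an extra
   * VMAP_RAM check.
   */
  spin_lock(&vn->busy.lock);
  insert_vmap_area(va, &vn->busy.root, &vn->busy.head);
  if (!(va_flags & VMAP_RAM) && vm)
	  setup_vmalloc_vm(vm, va, vm_flags, caller);
  spin_unlock(&vn->busy.lock);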
On 02/22/24 at 01:52pm, Uladzislau Rezki wrote:
> Hello, Rulinhuang!
>
> > Hi Uladzislau and Andrew, we have rebased it(Patch v4) on branch
> > mm-unstable and remeasured it. Could you kindly help confirm if
> > this is the right base to work on?
> > Compared to the previous result at kernel v6.7 with a 5% performance
> > gain on intel icelake(160 vcpu), we only had a 0.6% with this commit
> > base. But we think our modification still has some significance. On
> > the one hand, this does reduce a critical section. On the other hand,
> > we have a 4% performance gain on intel sapphire rapids(224 vcpu),
> > which suggests more performance improvement would likely be achieved
> > when the core count of processors increases to hundreds or
> > even thousands.
> > Thank you again for your comments.
> >
> According to the patch that was a correct rebase. Right a small delta
> on your 160 CPUs is because of removing a contention. As for bigger
> systems it is bigger impact, like you point here on your 224 vcpu
> results where you see %4 perf improvement.
>
> So we should fix it. But the way how it is fixed is not optimal from
> my point of view, because the patch that is in question spreads the
> internals from alloc_vmap_area(), like inserting busy area, across
> many parts now.
I happened to walk into this thread and came up with a draft patch.
Please help check if it's OK.
From 0112e39b3a8454a288e1bcece220c4599bac5326 Mon Sep 17 00:00:00 2001
From: Baoquan He <bhe@redhat.com>
Date: Thu, 22 Feb 2024 23:26:59 +0800
Subject: [PATCH] mm/vmalloc.c: avoid repeatedly requiring lock unnecessarily
Content-type: text/plain
By moving setup_vmalloc_vm() into alloc_vmap_area(), we can avoid
acquiring the lock a second time within a short period.
Signed-off-by: Baoquan He <bhe@redhat.com>
---
mm/vmalloc.c | 23 ++++++++---------------
1 file changed, 8 insertions(+), 15 deletions(-)
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index aeee71349157..6bda3c06b484 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -1848,7 +1848,10 @@ static struct vmap_area *alloc_vmap_area(unsigned long size,
unsigned long align,
unsigned long vstart, unsigned long vend,
int node, gfp_t gfp_mask,
- unsigned long va_flags)
+ unsigned long va_flags,
+ struct vm_struct *vm,
+ unsigned long vm_flags,
+ const void *caller)
{
struct vmap_node *vn;
struct vmap_area *va;
@@ -1915,6 +1918,8 @@ static struct vmap_area *alloc_vmap_area(unsigned long size,
spin_lock(&vn->busy.lock);
insert_vmap_area(va, &vn->busy.root, &vn->busy.head);
+ if (!(va_flags & VMAP_RAM) && vm)
+ setup_vmalloc_vm(vm, va, vm_flags, caller);
spin_unlock(&vn->busy.lock);
BUG_ON(!IS_ALIGNED(va->va_start, align));
@@ -2947,7 +2952,7 @@ void __init vm_area_register_early(struct vm_struct *vm, size_t align)
kasan_populate_early_vm_area_shadow(vm->addr, vm->size);
}
-static inline void setup_vmalloc_vm_locked(struct vm_struct *vm,
+static inline void setup_vmalloc_vm(struct vm_struct *vm,
struct vmap_area *va, unsigned long flags, const void *caller)
{
vm->flags = flags;
@@ -2957,16 +2962,6 @@ static inline void setup_vmalloc_vm_locked(struct vm_struct *vm,
va->vm = vm;
}
-static void setup_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va,
- unsigned long flags, const void *caller)
-{
- struct vmap_node *vn = addr_to_node(va->va_start);
-
- spin_lock(&vn->busy.lock);
- setup_vmalloc_vm_locked(vm, va, flags, caller);
- spin_unlock(&vn->busy.lock);
-}
-
static void clear_vm_uninitialized_flag(struct vm_struct *vm)
{
/*
@@ -3009,8 +3004,6 @@ static struct vm_struct *__get_vm_area_node(unsigned long size,
return NULL;
}
- setup_vmalloc_vm(area, va, flags, caller);
-
/*
* Mark pages for non-VM_ALLOC mappings as accessible. Do it now as a
* best-effort approach, as they can be mapped outside of vmalloc code.
@@ -4586,7 +4579,7 @@ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets,
spin_lock(&vn->busy.lock);
insert_vmap_area(vas[area], &vn->busy.root, &vn->busy.head);
- setup_vmalloc_vm_locked(vms[area], vas[area], VM_ALLOC,
+ setup_vmalloc_vm(vms[area], vas[area], VM_ALLOC,
pcpu_get_vm_areas);
spin_unlock(&vn->busy.lock);
}
--
2.41.0
When allocating a new memory area where the mapping address range is
known, it is observed that the vmap_area lock is acquired twice.

The first acquisition occurs in the alloc_vmap_area() function, when
inserting the vm area into the vm mapping red-black tree. The second
acquisition occurs in the setup_vmalloc_vm() function, when updating
properties of the vm such as flags and address.

Combine these two operations in alloc_vmap_area(), which improves
scalability when the vmap_area lock is contended. By doing so, the need
to acquire the lock twice is also eliminated.

With the above change, tested on an intel icelake platform (160 vcpu,
kernel v6.7), a 6% performance improvement and a 7% reduction in the
overall spinlock hotspot are gained on stress-ng/pthread
(https://github.com/ColinIanKing/stress-ng), which stress-tests thread
creation.
Reviewed-by: Chen Tim C <tim.c.chen@intel.com>
Reviewed-by: King Colin <colin.king@intel.com>
Signed-off-by: rulinhuang <rulin.huang@intel.com>
---
V1 -> V2: Avoided the partial initialization issue of vm and
separated insert_vmap_area() from alloc_vmap_area()
V2 -> V3: Rebased on 6.8-rc5
V3 -> V4: Rebased on mm-unstable branch
---
mm/vmalloc.c | 43 +++++++++++++++++++++++--------------------
1 file changed, 23 insertions(+), 20 deletions(-)
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 25a8df497255..ce126e7bc3d8 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -1851,7 +1851,6 @@ static struct vmap_area *alloc_vmap_area(unsigned long size,
int node, gfp_t gfp_mask,
unsigned long va_flags)
{
- struct vmap_node *vn;
struct vmap_area *va;
unsigned long freed;
unsigned long addr;
@@ -1912,19 +1911,18 @@ static struct vmap_area *alloc_vmap_area(unsigned long size,
va->vm = NULL;
va->flags = (va_flags | vn_id);
- vn = addr_to_node(va->va_start);
-
- spin_lock(&vn->busy.lock);
- insert_vmap_area(va, &vn->busy.root, &vn->busy.head);
- spin_unlock(&vn->busy.lock);
-
BUG_ON(!IS_ALIGNED(va->va_start, align));
BUG_ON(va->va_start < vstart);
BUG_ON(va->va_end > vend);
ret = kasan_populate_vmalloc(addr, size);
if (ret) {
- free_vmap_area(va);
+ /*
+ * Insert/Merge it back to the free tree/list.
+ */
+ spin_lock(&free_vmap_area_lock);
+ merge_or_add_vmap_area_augment(va, &free_vmap_area_root, &free_vmap_area_list);
+ spin_unlock(&free_vmap_area_lock);
return ERR_PTR(ret);
}
@@ -1953,6 +1951,15 @@ static struct vmap_area *alloc_vmap_area(unsigned long size,
return ERR_PTR(-EBUSY);
}
+static inline void insert_vmap_area_locked(struct vmap_area *va)
+{
+ struct vmap_node *vn = addr_to_node(va->va_start);
+
+ spin_lock(&vn->busy.lock);
+ insert_vmap_area(va, &vn->busy.root, &vn->busy.head);
+ spin_unlock(&vn->busy.lock);
+}
+
int register_vmap_purge_notifier(struct notifier_block *nb)
{
return blocking_notifier_chain_register(&vmap_notify_list, nb);
@@ -2492,6 +2499,8 @@ static void *new_vmap_block(unsigned int order, gfp_t gfp_mask)
return ERR_CAST(va);
}
+ insert_vmap_area_locked(va);
+
vaddr = vmap_block_vaddr(va->va_start, 0);
spin_lock_init(&vb->lock);
vb->va = va;
@@ -2847,6 +2856,8 @@ void *vm_map_ram(struct page **pages, unsigned int count, int node)
if (IS_ERR(va))
return NULL;
+ insert_vmap_area_locked(va);
+
addr = va->va_start;
mem = (void *)addr;
}
@@ -2946,7 +2957,7 @@ void __init vm_area_register_early(struct vm_struct *vm, size_t align)
kasan_populate_early_vm_area_shadow(vm->addr, vm->size);
}
-static inline void setup_vmalloc_vm_locked(struct vm_struct *vm,
+static inline void setup_vmalloc_vm(struct vm_struct *vm,
struct vmap_area *va, unsigned long flags, const void *caller)
{
vm->flags = flags;
@@ -2956,16 +2967,6 @@ static inline void setup_vmalloc_vm_locked(struct vm_struct *vm,
va->vm = vm;
}
-static void setup_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va,
- unsigned long flags, const void *caller)
-{
- struct vmap_node *vn = addr_to_node(va->va_start);
-
- spin_lock(&vn->busy.lock);
- setup_vmalloc_vm_locked(vm, va, flags, caller);
- spin_unlock(&vn->busy.lock);
-}
-
static void clear_vm_uninitialized_flag(struct vm_struct *vm)
{
/*
@@ -3010,6 +3011,8 @@ static struct vm_struct *__get_vm_area_node(unsigned long size,
setup_vmalloc_vm(area, va, flags, caller);
+ insert_vmap_area_locked(va);
+
/*
* Mark pages for non-VM_ALLOC mappings as accessible. Do it now as a
* best-effort approach, as they can be mapped outside of vmalloc code.
@@ -4584,7 +4587,7 @@ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets,
spin_lock(&vn->busy.lock);
insert_vmap_area(vas[area], &vn->busy.root, &vn->busy.head);
- setup_vmalloc_vm_locked(vms[area], vas[area], VM_ALLOC,
+ setup_vmalloc_vm(vms[area], vas[area], VM_ALLOC,
pcpu_get_vm_areas);
spin_unlock(&vn->busy.lock);
}
base-commit: 9d193b36872d153e02e80c26203de4ee15127b58
--
2.39.3