Like global slab shrink, this commit also uses SRCU to make
memcg slab shrink lockless.
We can reproduce the down_read_trylock() hotspot through the
following script:
```
DIR="/root/shrinker/memcg/mnt"
do_create()
{
mkdir -p /sys/fs/cgroup/memory/test
mkdir -p /sys/fs/cgroup/perf_event/test
echo 4G > /sys/fs/cgroup/memory/test/memory.limit_in_bytes
for i in `seq 0 $1`;
do
mkdir -p /sys/fs/cgroup/memory/test/$i;
echo $$ > /sys/fs/cgroup/memory/test/$i/cgroup.procs;
echo $$ > /sys/fs/cgroup/perf_event/test/cgroup.procs;
mkdir -p $DIR/$i;
done
}
do_mount()
{
for i in `seq $1 $2`;
do
mount -t tmpfs $i $DIR/$i;
done
}
do_touch()
{
for i in `seq $1 $2`;
do
echo $$ > /sys/fs/cgroup/memory/test/$i/cgroup.procs;
echo $$ > /sys/fs/cgroup/perf_event/test/cgroup.procs;
dd if=/dev/zero of=$DIR/$i/file$i bs=1M count=1 &
done
}
case "$1" in
touch)
do_touch $2 $3
;;
test)
do_create 4000
do_mount 0 4000
do_touch 0 3000
;;
*)
exit 1
;;
esac
```
Save the above script, then run test and touch commands.
Then we can use the following perf command to view hotspots:
perf top -U -F 999
1) Before applying this patchset:
32.31% [kernel] [k] down_read_trylock
19.40% [kernel] [k] pv_native_safe_halt
16.24% [kernel] [k] up_read
15.70% [kernel] [k] shrink_slab
4.69% [kernel] [k] _find_next_bit
2.62% [kernel] [k] shrink_node
1.78% [kernel] [k] shrink_lruvec
0.76% [kernel] [k] do_shrink_slab
2) After applying this patchset:
27.83% [kernel] [k] _find_next_bit
16.97% [kernel] [k] shrink_slab
15.82% [kernel] [k] pv_native_safe_halt
9.58% [kernel] [k] shrink_node
8.31% [kernel] [k] shrink_lruvec
5.64% [kernel] [k] do_shrink_slab
3.88% [kernel] [k] mem_cgroup_iter
At the same time, we use the following perf command to capture
IPC information:
perf stat -e cycles,instructions -G test -a --repeat 5 -- sleep 10
1) Before applying this patchset:
Performance counter stats for 'system wide' (5 runs):
454187219766 cycles test ( +- 1.84% )
78896433101 instructions test # 0.17 insn per cycle ( +- 0.44% )
10.0020430 +- 0.0000366 seconds time elapsed ( +- 0.00% )
2) After applying this patchset:
Performance counter stats for 'system wide' (5 runs):
841954709443 cycles test ( +- 15.80% ) (98.69%)
527258677936 instructions test # 0.63 insn per cycle ( +- 15.11% ) (98.68%)
10.01064 +- 0.00831 seconds time elapsed ( +- 0.08% )
We can see that IPC drops very seriously when calling
down_read_trylock() at high frequency. After using SRCU,
the IPC is at a normal level.
Signed-off-by: Qi Zheng <zhengqi.arch@bytedance.com>
---
mm/vmscan.c | 46 +++++++++++++++++++++++++++-------------------
1 file changed, 27 insertions(+), 19 deletions(-)
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 8515ac40bcaf..1de9bc3e5aa2 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -57,6 +57,7 @@
#include <linux/khugepaged.h>
#include <linux/rculist_nulls.h>
#include <linux/random.h>
+#include <linux/srcu.h>
#include <asm/tlbflush.h>
#include <asm/div64.h>
@@ -221,8 +222,21 @@ static inline int shrinker_defer_size(int nr_items)
static struct shrinker_info *shrinker_info_protected(struct mem_cgroup *memcg,
int nid)
{
- return rcu_dereference_protected(memcg->nodeinfo[nid]->shrinker_info,
- lockdep_is_held(&shrinker_rwsem));
+ return srcu_dereference_check(memcg->nodeinfo[nid]->shrinker_info,
+ &shrinker_srcu,
+ lockdep_is_held(&shrinker_rwsem));
+}
+
+static struct shrinker_info *shrinker_info_srcu(struct mem_cgroup *memcg,
+ int nid)
+{
+ return srcu_dereference(memcg->nodeinfo[nid]->shrinker_info,
+ &shrinker_srcu);
+}
+
+static void free_shrinker_info_rcu(struct rcu_head *head)
+{
+ kvfree(container_of(head, struct shrinker_info, rcu));
}
static inline bool need_expand(int new_nr_max, int old_nr_max)
@@ -269,7 +283,7 @@ static int expand_one_shrinker_info(struct mem_cgroup *memcg,
defer_size - old_defer_size);
rcu_assign_pointer(pn->shrinker_info, new);
- kvfree_rcu(old, rcu);
+ call_srcu(&shrinker_srcu, &old->rcu, free_shrinker_info_rcu);
}
return 0;
@@ -355,15 +369,16 @@ void set_shrinker_bit(struct mem_cgroup *memcg, int nid, int shrinker_id)
{
if (shrinker_id >= 0 && memcg && !mem_cgroup_is_root(memcg)) {
struct shrinker_info *info;
+ int srcu_idx;
- rcu_read_lock();
- info = rcu_dereference(memcg->nodeinfo[nid]->shrinker_info);
+ srcu_idx = srcu_read_lock(&shrinker_srcu);
+ info = shrinker_info_srcu(memcg, nid);
if (!WARN_ON_ONCE(shrinker_id >= info->map_nr_max)) {
/* Pairs with smp mb in shrink_slab() */
smp_mb__before_atomic();
set_bit(shrinker_id, info->map);
}
- rcu_read_unlock();
+ srcu_read_unlock(&shrinker_srcu, srcu_idx);
}
}
@@ -377,7 +392,6 @@ static int prealloc_memcg_shrinker(struct shrinker *shrinker)
return -ENOSYS;
down_write(&shrinker_rwsem);
- /* This may call shrinker, so it must use down_read_trylock() */
id = idr_alloc(&shrinker_idr, shrinker, 0, 0, GFP_KERNEL);
if (id < 0)
goto unlock;
@@ -411,7 +425,7 @@ static long xchg_nr_deferred_memcg(int nid, struct shrinker *shrinker,
{
struct shrinker_info *info;
- info = shrinker_info_protected(memcg, nid);
+ info = shrinker_info_srcu(memcg, nid);
return atomic_long_xchg(&info->nr_deferred[shrinker->id], 0);
}
@@ -420,7 +434,7 @@ static long add_nr_deferred_memcg(long nr, int nid, struct shrinker *shrinker,
{
struct shrinker_info *info;
- info = shrinker_info_protected(memcg, nid);
+ info = shrinker_info_srcu(memcg, nid);
return atomic_long_add_return(nr, &info->nr_deferred[shrinker->id]);
}
@@ -898,15 +912,14 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
{
struct shrinker_info *info;
unsigned long ret, freed = 0;
+ int srcu_idx;
int i;
if (!mem_cgroup_online(memcg))
return 0;
- if (!down_read_trylock(&shrinker_rwsem))
- return 0;
-
- info = shrinker_info_protected(memcg, nid);
+ srcu_idx = srcu_read_lock(&shrinker_srcu);
+ info = shrinker_info_srcu(memcg, nid);
if (unlikely(!info))
goto unlock;
@@ -956,14 +969,9 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
set_shrinker_bit(memcg, nid, i);
}
freed += ret;
-
- if (rwsem_is_contended(&shrinker_rwsem)) {
- freed = freed ? : 1;
- break;
- }
}
unlock:
- up_read(&shrinker_rwsem);
+ srcu_read_unlock(&shrinker_srcu, srcu_idx);
return freed;
}
#else /* CONFIG_MEMCG */
--
2.20.1
On 3/7/23 07:56, Qi Zheng wrote:
> Like global slab shrink, this commit also uses SRCU to make
> memcg slab shrink lockless.
>
> We can reproduce the down_read_trylock() hotspot through the
> following script:
>
> ```
>
> DIR="/root/shrinker/memcg/mnt"
>
> do_create()
> {
> mkdir -p /sys/fs/cgroup/memory/test
> mkdir -p /sys/fs/cgroup/perf_event/test
> echo 4G > /sys/fs/cgroup/memory/test/memory.limit_in_bytes
> for i in `seq 0 $1`;
> do
> mkdir -p /sys/fs/cgroup/memory/test/$i;
> echo $$ > /sys/fs/cgroup/memory/test/$i/cgroup.procs;
> echo $$ > /sys/fs/cgroup/perf_event/test/cgroup.procs;
> mkdir -p $DIR/$i;
> done
> }
>
> do_mount()
> {
> for i in `seq $1 $2`;
> do
> mount -t tmpfs $i $DIR/$i;
> done
> }
>
> do_touch()
> {
> for i in `seq $1 $2`;
> do
> echo $$ > /sys/fs/cgroup/memory/test/$i/cgroup.procs;
> echo $$ > /sys/fs/cgroup/perf_event/test/cgroup.procs;
> dd if=/dev/zero of=$DIR/$i/file$i bs=1M count=1 &
> done
> }
>
> case "$1" in
> touch)
> do_touch $2 $3
> ;;
> test)
> do_create 4000
> do_mount 0 4000
> do_touch 0 3000
> ;;
> *)
> exit 1
> ;;
> esac
> ```
>
> Save the above script, then run test and touch commands.
> Then we can use the following perf command to view hotspots:
>
> perf top -U -F 999
>
> 1) Before applying this patchset:
>
> 32.31% [kernel] [k] down_read_trylock
> 19.40% [kernel] [k] pv_native_safe_halt
> 16.24% [kernel] [k] up_read
> 15.70% [kernel] [k] shrink_slab
> 4.69% [kernel] [k] _find_next_bit
> 2.62% [kernel] [k] shrink_node
> 1.78% [kernel] [k] shrink_lruvec
> 0.76% [kernel] [k] do_shrink_slab
>
> 2) After applying this patchset:
>
> 27.83% [kernel] [k] _find_next_bit
> 16.97% [kernel] [k] shrink_slab
> 15.82% [kernel] [k] pv_native_safe_halt
> 9.58% [kernel] [k] shrink_node
> 8.31% [kernel] [k] shrink_lruvec
> 5.64% [kernel] [k] do_shrink_slab
> 3.88% [kernel] [k] mem_cgroup_iter
>
> At the same time, we use the following perf command to capture
> IPC information:
>
> perf stat -e cycles,instructions -G test -a --repeat 5 -- sleep 10
>
> 1) Before applying this patchset:
>
> Performance counter stats for 'system wide' (5 runs):
>
> 454187219766 cycles test ( +- 1.84% )
> 78896433101 instructions test # 0.17 insn per cycle ( +- 0.44% )
>
> 10.0020430 +- 0.0000366 seconds time elapsed ( +- 0.00% )
>
> 2) After applying this patchset:
>
> Performance counter stats for 'system wide' (5 runs):
>
> 841954709443 cycles test ( +- 15.80% ) (98.69%)
> 527258677936 instructions test # 0.63 insn per cycle ( +- 15.11% ) (98.68%)
>
> 10.01064 +- 0.00831 seconds time elapsed ( +- 0.08% )
>
> We can see that IPC drops very seriously when calling
> down_read_trylock() at high frequency. After using SRCU,
> the IPC is at a normal level.
The interpretation looks somewhat weird to me. I'd say the workload is
stalled a lot as it fails the trylock (there might be some optimistic
spinning perhaps) and then goes to sleep. See how "pv_native_safe_halt" is
also more prominent in before. And because of that sleeping, there's less
instructions executed in the same amount of cycles (as it's a system wide
collection, otherwise it wouldn't be collecting the sleeping processes).
>
> Signed-off-by: Qi Zheng <zhengqi.arch@bytedance.com>
Other than that:
Acked-by: Vlastimil Babka <Vbabka@suse.cz>
A small thing below:
> ---
> mm/vmscan.c | 46 +++++++++++++++++++++++++++-------------------
> 1 file changed, 27 insertions(+), 19 deletions(-)
>
> diff --git a/mm/vmscan.c b/mm/vmscan.c
> index 8515ac40bcaf..1de9bc3e5aa2 100644
> --- a/mm/vmscan.c
> +++ b/mm/vmscan.c
> @@ -57,6 +57,7 @@
> #include <linux/khugepaged.h>
> #include <linux/rculist_nulls.h>
> #include <linux/random.h>
> +#include <linux/srcu.h>
I guess this should have been in patch 2/8 already? It may work accidentaly
because some other header pulls it transitively...
Hi Vlastimil,
On 2023/3/9 06:46, Vlastimil Babka wrote:
> On 3/7/23 07:56, Qi Zheng wrote:
>> Like global slab shrink, this commit also uses SRCU to make
>> memcg slab shrink lockless.
>>
>> We can reproduce the down_read_trylock() hotspot through the
>> following script:
>>
>> ```
>>
>> DIR="/root/shrinker/memcg/mnt"
>>
>> do_create()
>> {
>> mkdir -p /sys/fs/cgroup/memory/test
>> mkdir -p /sys/fs/cgroup/perf_event/test
>> echo 4G > /sys/fs/cgroup/memory/test/memory.limit_in_bytes
>> for i in `seq 0 $1`;
>> do
>> mkdir -p /sys/fs/cgroup/memory/test/$i;
>> echo $$ > /sys/fs/cgroup/memory/test/$i/cgroup.procs;
>> echo $$ > /sys/fs/cgroup/perf_event/test/cgroup.procs;
>> mkdir -p $DIR/$i;
>> done
>> }
>>
>> do_mount()
>> {
>> for i in `seq $1 $2`;
>> do
>> mount -t tmpfs $i $DIR/$i;
>> done
>> }
>>
>> do_touch()
>> {
>> for i in `seq $1 $2`;
>> do
>> echo $$ > /sys/fs/cgroup/memory/test/$i/cgroup.procs;
>> echo $$ > /sys/fs/cgroup/perf_event/test/cgroup.procs;
>> dd if=/dev/zero of=$DIR/$i/file$i bs=1M count=1 &
>> done
>> }
>>
>> case "$1" in
>> touch)
>> do_touch $2 $3
>> ;;
>> test)
>> do_create 4000
>> do_mount 0 4000
>> do_touch 0 3000
>> ;;
>> *)
>> exit 1
>> ;;
>> esac
>> ```
>>
>> Save the above script, then run test and touch commands.
>> Then we can use the following perf command to view hotspots:
>>
>> perf top -U -F 999
>>
>> 1) Before applying this patchset:
>>
>> 32.31% [kernel] [k] down_read_trylock
>> 19.40% [kernel] [k] pv_native_safe_halt
>> 16.24% [kernel] [k] up_read
>> 15.70% [kernel] [k] shrink_slab
>> 4.69% [kernel] [k] _find_next_bit
>> 2.62% [kernel] [k] shrink_node
>> 1.78% [kernel] [k] shrink_lruvec
>> 0.76% [kernel] [k] do_shrink_slab
>>
>> 2) After applying this patchset:
>>
>> 27.83% [kernel] [k] _find_next_bit
>> 16.97% [kernel] [k] shrink_slab
>> 15.82% [kernel] [k] pv_native_safe_halt
>> 9.58% [kernel] [k] shrink_node
>> 8.31% [kernel] [k] shrink_lruvec
>> 5.64% [kernel] [k] do_shrink_slab
>> 3.88% [kernel] [k] mem_cgroup_iter
>>
>> At the same time, we use the following perf command to capture
>> IPC information:
>>
>> perf stat -e cycles,instructions -G test -a --repeat 5 -- sleep 10
>>
>> 1) Before applying this patchset:
>>
>> Performance counter stats for 'system wide' (5 runs):
>>
>> 454187219766 cycles test ( +- 1.84% )
>> 78896433101 instructions test # 0.17 insn per cycle ( +- 0.44% )
>>
>> 10.0020430 +- 0.0000366 seconds time elapsed ( +- 0.00% )
>>
>> 2) After applying this patchset:
>>
>> Performance counter stats for 'system wide' (5 runs):
>>
>> 841954709443 cycles test ( +- 15.80% ) (98.69%)
>> 527258677936 instructions test # 0.63 insn per cycle ( +- 15.11% ) (98.68%)
>>
>> 10.01064 +- 0.00831 seconds time elapsed ( +- 0.08% )
>>
>> We can see that IPC drops very seriously when calling
>> down_read_trylock() at high frequency. After using SRCU,
>> the IPC is at a normal level.
>
> The interpretation looks somewhat weird to me. I'd say the workload is
> stalled a lot as it fails the trylock (there might be some optimistic
> spinning perhaps) and then goes to sleep. See how "pv_native_safe_halt" is
> also more prominent in before. And because of that sleeping, there's less
> instructions executed in the same amount of cycles (as it's a system wide
> collection, otherwise it wouldn't be collecting the sleeping processes).
But in my tests, the trylock basically did not fail, so I think it is
caused by high-frequency atomic operation.
bpftrace -e 'kr:down_read_trylock {@[kstack, retval]=count();}
interval:s:1 {exit();}'
Attaching 2 probes...
<...>
@[
shrink_slab+288
shrink_node+640
do_try_to_free_pages+203
try_to_free_mem_cgroup_pages+266
try_charge_memcg+412
charge_memcg+51
__mem_cgroup_charge+44
__handle_mm_fault+2119
handle_mm_fault+272
do_user_addr_fault+712
exc_page_fault+124
asm_exc_page_fault+38
clear_user_erms+14
read_zero+86
vfs_read+173
ksys_read+93
do_syscall_64+56
entry_SYSCALL_64_after_hwframe+99
, 1]: 617019
@[
shrink_slab+288
shrink_node+640
do_try_to_free_pages+203
try_to_free_mem_cgroup_pages+266
try_charge_memcg+412
charge_memcg+51
__mem_cgroup_charge+44
shmem_add_to_page_cache+545
shmem_get_folio_gfp+621
shmem_write_begin+95
generic_perform_write+257
__generic_file_write_iter+202
generic_file_write_iter+97
vfs_write+704
ksys_write+93
do_syscall_64+56
entry_SYSCALL_64_after_hwframe+99
, 1]: 617065
>
>>
>> Signed-off-by: Qi Zheng <zhengqi.arch@bytedance.com>
>
> Other than that:
>
> Acked-by: Vlastimil Babka <Vbabka@suse.cz>
Thanks.
>
> A small thing below:
>
>> ---
>> mm/vmscan.c | 46 +++++++++++++++++++++++++++-------------------
>> 1 file changed, 27 insertions(+), 19 deletions(-)
>>
>> diff --git a/mm/vmscan.c b/mm/vmscan.c
>> index 8515ac40bcaf..1de9bc3e5aa2 100644
>> --- a/mm/vmscan.c
>> +++ b/mm/vmscan.c
>> @@ -57,6 +57,7 @@
>> #include <linux/khugepaged.h>
>> #include <linux/rculist_nulls.h>
>> #include <linux/random.h>
>> +#include <linux/srcu.h>
>
> I guess this should have been in patch 2/8 already? It may work accidentaly
> because some other header pulls it transitively...
Yeah, in fact, patch 3/8 also can compile successfully without srcu.h,
but maybe it is better to explicitly include this header file, I will
add it in patch 2/8.
Thanks,
Qi
>
>
On 07.03.2023 09:56, Qi Zheng wrote:
> Like global slab shrink, this commit also uses SRCU to make
> memcg slab shrink lockless.
>
> We can reproduce the down_read_trylock() hotspot through the
> following script:
>
> ```
>
> DIR="/root/shrinker/memcg/mnt"
>
> do_create()
> {
> mkdir -p /sys/fs/cgroup/memory/test
> mkdir -p /sys/fs/cgroup/perf_event/test
> echo 4G > /sys/fs/cgroup/memory/test/memory.limit_in_bytes
> for i in `seq 0 $1`;
> do
> mkdir -p /sys/fs/cgroup/memory/test/$i;
> echo $$ > /sys/fs/cgroup/memory/test/$i/cgroup.procs;
> echo $$ > /sys/fs/cgroup/perf_event/test/cgroup.procs;
> mkdir -p $DIR/$i;
> done
> }
>
> do_mount()
> {
> for i in `seq $1 $2`;
> do
> mount -t tmpfs $i $DIR/$i;
> done
> }
>
> do_touch()
> {
> for i in `seq $1 $2`;
> do
> echo $$ > /sys/fs/cgroup/memory/test/$i/cgroup.procs;
> echo $$ > /sys/fs/cgroup/perf_event/test/cgroup.procs;
> dd if=/dev/zero of=$DIR/$i/file$i bs=1M count=1 &
> done
> }
>
> case "$1" in
> touch)
> do_touch $2 $3
> ;;
> test)
> do_create 4000
> do_mount 0 4000
> do_touch 0 3000
> ;;
> *)
> exit 1
> ;;
> esac
> ```
>
> Save the above script, then run test and touch commands.
> Then we can use the following perf command to view hotspots:
>
> perf top -U -F 999
>
> 1) Before applying this patchset:
>
> 32.31% [kernel] [k] down_read_trylock
> 19.40% [kernel] [k] pv_native_safe_halt
> 16.24% [kernel] [k] up_read
> 15.70% [kernel] [k] shrink_slab
> 4.69% [kernel] [k] _find_next_bit
> 2.62% [kernel] [k] shrink_node
> 1.78% [kernel] [k] shrink_lruvec
> 0.76% [kernel] [k] do_shrink_slab
>
> 2) After applying this patchset:
>
> 27.83% [kernel] [k] _find_next_bit
> 16.97% [kernel] [k] shrink_slab
> 15.82% [kernel] [k] pv_native_safe_halt
> 9.58% [kernel] [k] shrink_node
> 8.31% [kernel] [k] shrink_lruvec
> 5.64% [kernel] [k] do_shrink_slab
> 3.88% [kernel] [k] mem_cgroup_iter
>
> At the same time, we use the following perf command to capture
> IPC information:
>
> perf stat -e cycles,instructions -G test -a --repeat 5 -- sleep 10
>
> 1) Before applying this patchset:
>
> Performance counter stats for 'system wide' (5 runs):
>
> 454187219766 cycles test ( +- 1.84% )
> 78896433101 instructions test # 0.17 insn per cycle ( +- 0.44% )
>
> 10.0020430 +- 0.0000366 seconds time elapsed ( +- 0.00% )
>
> 2) After applying this patchset:
>
> Performance counter stats for 'system wide' (5 runs):
>
> 841954709443 cycles test ( +- 15.80% ) (98.69%)
> 527258677936 instructions test # 0.63 insn per cycle ( +- 15.11% ) (98.68%)
>
> 10.01064 +- 0.00831 seconds time elapsed ( +- 0.08% )
>
> We can see that IPC drops very seriously when calling
> down_read_trylock() at high frequency. After using SRCU,
> the IPC is at a normal level.
>
> Signed-off-by: Qi Zheng <zhengqi.arch@bytedance.com>
Acked-by: Kirill Tkhai <tkhai@ya.ru>
> ---
> mm/vmscan.c | 46 +++++++++++++++++++++++++++-------------------
> 1 file changed, 27 insertions(+), 19 deletions(-)
>
> diff --git a/mm/vmscan.c b/mm/vmscan.c
> index 8515ac40bcaf..1de9bc3e5aa2 100644
> --- a/mm/vmscan.c
> +++ b/mm/vmscan.c
> @@ -57,6 +57,7 @@
> #include <linux/khugepaged.h>
> #include <linux/rculist_nulls.h>
> #include <linux/random.h>
> +#include <linux/srcu.h>
>
> #include <asm/tlbflush.h>
> #include <asm/div64.h>
> @@ -221,8 +222,21 @@ static inline int shrinker_defer_size(int nr_items)
> static struct shrinker_info *shrinker_info_protected(struct mem_cgroup *memcg,
> int nid)
> {
> - return rcu_dereference_protected(memcg->nodeinfo[nid]->shrinker_info,
> - lockdep_is_held(&shrinker_rwsem));
> + return srcu_dereference_check(memcg->nodeinfo[nid]->shrinker_info,
> + &shrinker_srcu,
> + lockdep_is_held(&shrinker_rwsem));
> +}
> +
> +static struct shrinker_info *shrinker_info_srcu(struct mem_cgroup *memcg,
> + int nid)
> +{
> + return srcu_dereference(memcg->nodeinfo[nid]->shrinker_info,
> + &shrinker_srcu);
> +}
> +
> +static void free_shrinker_info_rcu(struct rcu_head *head)
> +{
> + kvfree(container_of(head, struct shrinker_info, rcu));
> }
>
> static inline bool need_expand(int new_nr_max, int old_nr_max)
> @@ -269,7 +283,7 @@ static int expand_one_shrinker_info(struct mem_cgroup *memcg,
> defer_size - old_defer_size);
>
> rcu_assign_pointer(pn->shrinker_info, new);
> - kvfree_rcu(old, rcu);
> + call_srcu(&shrinker_srcu, &old->rcu, free_shrinker_info_rcu);
> }
>
> return 0;
> @@ -355,15 +369,16 @@ void set_shrinker_bit(struct mem_cgroup *memcg, int nid, int shrinker_id)
> {
> if (shrinker_id >= 0 && memcg && !mem_cgroup_is_root(memcg)) {
> struct shrinker_info *info;
> + int srcu_idx;
>
> - rcu_read_lock();
> - info = rcu_dereference(memcg->nodeinfo[nid]->shrinker_info);
> + srcu_idx = srcu_read_lock(&shrinker_srcu);
> + info = shrinker_info_srcu(memcg, nid);
> if (!WARN_ON_ONCE(shrinker_id >= info->map_nr_max)) {
> /* Pairs with smp mb in shrink_slab() */
> smp_mb__before_atomic();
> set_bit(shrinker_id, info->map);
> }
> - rcu_read_unlock();
> + srcu_read_unlock(&shrinker_srcu, srcu_idx);
> }
> }
>
> @@ -377,7 +392,6 @@ static int prealloc_memcg_shrinker(struct shrinker *shrinker)
> return -ENOSYS;
>
> down_write(&shrinker_rwsem);
> - /* This may call shrinker, so it must use down_read_trylock() */
> id = idr_alloc(&shrinker_idr, shrinker, 0, 0, GFP_KERNEL);
> if (id < 0)
> goto unlock;
> @@ -411,7 +425,7 @@ static long xchg_nr_deferred_memcg(int nid, struct shrinker *shrinker,
> {
> struct shrinker_info *info;
>
> - info = shrinker_info_protected(memcg, nid);
> + info = shrinker_info_srcu(memcg, nid);
> return atomic_long_xchg(&info->nr_deferred[shrinker->id], 0);
> }
>
> @@ -420,7 +434,7 @@ static long add_nr_deferred_memcg(long nr, int nid, struct shrinker *shrinker,
> {
> struct shrinker_info *info;
>
> - info = shrinker_info_protected(memcg, nid);
> + info = shrinker_info_srcu(memcg, nid);
> return atomic_long_add_return(nr, &info->nr_deferred[shrinker->id]);
> }
>
> @@ -898,15 +912,14 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
> {
> struct shrinker_info *info;
> unsigned long ret, freed = 0;
> + int srcu_idx;
> int i;
>
> if (!mem_cgroup_online(memcg))
> return 0;
>
> - if (!down_read_trylock(&shrinker_rwsem))
> - return 0;
> -
> - info = shrinker_info_protected(memcg, nid);
> + srcu_idx = srcu_read_lock(&shrinker_srcu);
> + info = shrinker_info_srcu(memcg, nid);
> if (unlikely(!info))
> goto unlock;
>
> @@ -956,14 +969,9 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
> set_shrinker_bit(memcg, nid, i);
> }
> freed += ret;
> -
> - if (rwsem_is_contended(&shrinker_rwsem)) {
> - freed = freed ? : 1;
> - break;
> - }
> }
> unlock:
> - up_read(&shrinker_rwsem);
> + srcu_read_unlock(&shrinker_srcu, srcu_idx);
> return freed;
> }
> #else /* CONFIG_MEMCG */
© 2016 - 2026 Red Hat, Inc.