[v4] make slab shrink lockless

[PATCH v4 3/8] mm: vmscan: make memcg slab shrink lockless

Posted by Qi Zheng 3 years, 1 month ago

Like global slab shrink, this commit also uses SRCU to make
memcg slab shrink lockless.

We can reproduce the down_read_trylock() hotspot through the
following script:

```

DIR="/root/shrinker/memcg/mnt"

do_create()
{
    mkdir -p /sys/fs/cgroup/memory/test
    mkdir -p /sys/fs/cgroup/perf_event/test
    echo 4G > /sys/fs/cgroup/memory/test/memory.limit_in_bytes
    for i in `seq 0 $1`;
    do
        mkdir -p /sys/fs/cgroup/memory/test/$i;
        echo $$ > /sys/fs/cgroup/memory/test/$i/cgroup.procs;
        echo $$ > /sys/fs/cgroup/perf_event/test/cgroup.procs;
        mkdir -p $DIR/$i;
    done
}

do_mount()
{
    for i in `seq $1 $2`;
    do
        mount -t tmpfs $i $DIR/$i;
    done
}

do_touch()
{
    for i in `seq $1 $2`;
    do
        echo $$ > /sys/fs/cgroup/memory/test/$i/cgroup.procs;
        echo $$ > /sys/fs/cgroup/perf_event/test/cgroup.procs;
            dd if=/dev/zero of=$DIR/$i/file$i bs=1M count=1 &
    done
}

case "$1" in
  touch)
    do_touch $2 $3
    ;;
  test)
      do_create 4000
    do_mount 0 4000
    do_touch 0 3000
    ;;
  *)
    exit 1
    ;;
esac
```

Save the above script, then run test and touch commands.
Then we can use the following perf command to view hotspots:

perf top -U -F 999

1) Before applying this patchset:

  32.31%  [kernel]           [k] down_read_trylock
  19.40%  [kernel]           [k] pv_native_safe_halt
  16.24%  [kernel]           [k] up_read
  15.70%  [kernel]           [k] shrink_slab
   4.69%  [kernel]           [k] _find_next_bit
   2.62%  [kernel]           [k] shrink_node
   1.78%  [kernel]           [k] shrink_lruvec
   0.76%  [kernel]           [k] do_shrink_slab

2) After applying this patchset:

  27.83%  [kernel]           [k] _find_next_bit
  16.97%  [kernel]           [k] shrink_slab
  15.82%  [kernel]           [k] pv_native_safe_halt
   9.58%  [kernel]           [k] shrink_node
   8.31%  [kernel]           [k] shrink_lruvec
   5.64%  [kernel]           [k] do_shrink_slab
   3.88%  [kernel]           [k] mem_cgroup_iter

At the same time, we use the following perf command to capture
IPC information:

perf stat -e cycles,instructions -G test -a --repeat 5 -- sleep 10

1) Before applying this patchset:

 Performance counter stats for 'system wide' (5 runs):

      454187219766      cycles                    test                    ( +-  1.84% )
       78896433101      instructions              test #    0.17  insn per cycle           ( +-  0.44% )

        10.0020430 +- 0.0000366 seconds time elapsed  ( +-  0.00% )

2) After applying this patchset:

 Performance counter stats for 'system wide' (5 runs):

      841954709443      cycles                    test                    ( +- 15.80% )  (98.69%)
      527258677936      instructions              test #    0.63  insn per cycle           ( +- 15.11% )  (98.68%)

          10.01064 +- 0.00831 seconds time elapsed  ( +-  0.08% )

We can see that IPC drops very seriously when calling
down_read_trylock() at high frequency. After using SRCU,
the IPC is at a normal level.

Signed-off-by: Qi Zheng <zhengqi.arch@bytedance.com>
---
 mm/vmscan.c | 46 +++++++++++++++++++++++++++-------------------
 1 file changed, 27 insertions(+), 19 deletions(-)

diff --git a/mm/vmscan.c b/mm/vmscan.c
index 8515ac40bcaf..1de9bc3e5aa2 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -57,6 +57,7 @@
 #include <linux/khugepaged.h>
 #include <linux/rculist_nulls.h>
 #include <linux/random.h>
+#include <linux/srcu.h>
 
 #include <asm/tlbflush.h>
 #include <asm/div64.h>
@@ -221,8 +222,21 @@ static inline int shrinker_defer_size(int nr_items)
 static struct shrinker_info *shrinker_info_protected(struct mem_cgroup *memcg,
 						     int nid)
 {
-	return rcu_dereference_protected(memcg->nodeinfo[nid]->shrinker_info,
-					 lockdep_is_held(&shrinker_rwsem));
+	return srcu_dereference_check(memcg->nodeinfo[nid]->shrinker_info,
+				      &shrinker_srcu,
+				      lockdep_is_held(&shrinker_rwsem));
+}
+
+static struct shrinker_info *shrinker_info_srcu(struct mem_cgroup *memcg,
+						     int nid)
+{
+	return srcu_dereference(memcg->nodeinfo[nid]->shrinker_info,
+				&shrinker_srcu);
+}
+
+static void free_shrinker_info_rcu(struct rcu_head *head)
+{
+	kvfree(container_of(head, struct shrinker_info, rcu));
 }
 
 static inline bool need_expand(int new_nr_max, int old_nr_max)
@@ -269,7 +283,7 @@ static int expand_one_shrinker_info(struct mem_cgroup *memcg,
 		       defer_size - old_defer_size);
 
 		rcu_assign_pointer(pn->shrinker_info, new);
-		kvfree_rcu(old, rcu);
+		call_srcu(&shrinker_srcu, &old->rcu, free_shrinker_info_rcu);
 	}
 
 	return 0;
@@ -355,15 +369,16 @@ void set_shrinker_bit(struct mem_cgroup *memcg, int nid, int shrinker_id)
 {
 	if (shrinker_id >= 0 && memcg && !mem_cgroup_is_root(memcg)) {
 		struct shrinker_info *info;
+		int srcu_idx;
 
-		rcu_read_lock();
-		info = rcu_dereference(memcg->nodeinfo[nid]->shrinker_info);
+		srcu_idx = srcu_read_lock(&shrinker_srcu);
+		info = shrinker_info_srcu(memcg, nid);
 		if (!WARN_ON_ONCE(shrinker_id >= info->map_nr_max)) {
 			/* Pairs with smp mb in shrink_slab() */
 			smp_mb__before_atomic();
 			set_bit(shrinker_id, info->map);
 		}
-		rcu_read_unlock();
+		srcu_read_unlock(&shrinker_srcu, srcu_idx);
 	}
 }
 
@@ -377,7 +392,6 @@ static int prealloc_memcg_shrinker(struct shrinker *shrinker)
 		return -ENOSYS;
 
 	down_write(&shrinker_rwsem);
-	/* This may call shrinker, so it must use down_read_trylock() */
 	id = idr_alloc(&shrinker_idr, shrinker, 0, 0, GFP_KERNEL);
 	if (id < 0)
 		goto unlock;
@@ -411,7 +425,7 @@ static long xchg_nr_deferred_memcg(int nid, struct shrinker *shrinker,
 {
 	struct shrinker_info *info;
 
-	info = shrinker_info_protected(memcg, nid);
+	info = shrinker_info_srcu(memcg, nid);
 	return atomic_long_xchg(&info->nr_deferred[shrinker->id], 0);
 }
 
@@ -420,7 +434,7 @@ static long add_nr_deferred_memcg(long nr, int nid, struct shrinker *shrinker,
 {
 	struct shrinker_info *info;
 
-	info = shrinker_info_protected(memcg, nid);
+	info = shrinker_info_srcu(memcg, nid);
 	return atomic_long_add_return(nr, &info->nr_deferred[shrinker->id]);
 }
 
@@ -898,15 +912,14 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
 {
 	struct shrinker_info *info;
 	unsigned long ret, freed = 0;
+	int srcu_idx;
 	int i;
 
 	if (!mem_cgroup_online(memcg))
 		return 0;
 
-	if (!down_read_trylock(&shrinker_rwsem))
-		return 0;
-
-	info = shrinker_info_protected(memcg, nid);
+	srcu_idx = srcu_read_lock(&shrinker_srcu);
+	info = shrinker_info_srcu(memcg, nid);
 	if (unlikely(!info))
 		goto unlock;
 
@@ -956,14 +969,9 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
 				set_shrinker_bit(memcg, nid, i);
 		}
 		freed += ret;
-
-		if (rwsem_is_contended(&shrinker_rwsem)) {
-			freed = freed ? : 1;
-			break;
-		}
 	}
 unlock:
-	up_read(&shrinker_rwsem);
+	srcu_read_unlock(&shrinker_srcu, srcu_idx);
 	return freed;
 }
 #else /* CONFIG_MEMCG */
-- 
2.20.1

Re: [PATCH v4 3/8] mm: vmscan: make memcg slab shrink lockless

Posted by Vlastimil Babka 3 years, 1 month ago

On 3/7/23 07:56, Qi Zheng wrote:
> Like global slab shrink, this commit also uses SRCU to make
> memcg slab shrink lockless.
> 
> We can reproduce the down_read_trylock() hotspot through the
> following script:
> 
> ```
> 
> DIR="/root/shrinker/memcg/mnt"
> 
> do_create()
> {
>     mkdir -p /sys/fs/cgroup/memory/test
>     mkdir -p /sys/fs/cgroup/perf_event/test
>     echo 4G > /sys/fs/cgroup/memory/test/memory.limit_in_bytes
>     for i in `seq 0 $1`;
>     do
>         mkdir -p /sys/fs/cgroup/memory/test/$i;
>         echo $$ > /sys/fs/cgroup/memory/test/$i/cgroup.procs;
>         echo $$ > /sys/fs/cgroup/perf_event/test/cgroup.procs;
>         mkdir -p $DIR/$i;
>     done
> }
> 
> do_mount()
> {
>     for i in `seq $1 $2`;
>     do
>         mount -t tmpfs $i $DIR/$i;
>     done
> }
> 
> do_touch()
> {
>     for i in `seq $1 $2`;
>     do
>         echo $$ > /sys/fs/cgroup/memory/test/$i/cgroup.procs;
>         echo $$ > /sys/fs/cgroup/perf_event/test/cgroup.procs;
>             dd if=/dev/zero of=$DIR/$i/file$i bs=1M count=1 &
>     done
> }
> 
> case "$1" in
>   touch)
>     do_touch $2 $3
>     ;;
>   test)
>       do_create 4000
>     do_mount 0 4000
>     do_touch 0 3000
>     ;;
>   *)
>     exit 1
>     ;;
> esac
> ```
> 
> Save the above script, then run test and touch commands.
> Then we can use the following perf command to view hotspots:
> 
> perf top -U -F 999
> 
> 1) Before applying this patchset:
> 
>   32.31%  [kernel]           [k] down_read_trylock
>   19.40%  [kernel]           [k] pv_native_safe_halt
>   16.24%  [kernel]           [k] up_read
>   15.70%  [kernel]           [k] shrink_slab
>    4.69%  [kernel]           [k] _find_next_bit
>    2.62%  [kernel]           [k] shrink_node
>    1.78%  [kernel]           [k] shrink_lruvec
>    0.76%  [kernel]           [k] do_shrink_slab
> 
> 2) After applying this patchset:
> 
>   27.83%  [kernel]           [k] _find_next_bit
>   16.97%  [kernel]           [k] shrink_slab
>   15.82%  [kernel]           [k] pv_native_safe_halt
>    9.58%  [kernel]           [k] shrink_node
>    8.31%  [kernel]           [k] shrink_lruvec
>    5.64%  [kernel]           [k] do_shrink_slab
>    3.88%  [kernel]           [k] mem_cgroup_iter
> 
> At the same time, we use the following perf command to capture
> IPC information:
> 
> perf stat -e cycles,instructions -G test -a --repeat 5 -- sleep 10
> 
> 1) Before applying this patchset:
> 
>  Performance counter stats for 'system wide' (5 runs):
> 
>       454187219766      cycles                    test                    ( +-  1.84% )
>        78896433101      instructions              test #    0.17  insn per cycle           ( +-  0.44% )
> 
>         10.0020430 +- 0.0000366 seconds time elapsed  ( +-  0.00% )
> 
> 2) After applying this patchset:
> 
>  Performance counter stats for 'system wide' (5 runs):
> 
>       841954709443      cycles                    test                    ( +- 15.80% )  (98.69%)
>       527258677936      instructions              test #    0.63  insn per cycle           ( +- 15.11% )  (98.68%)
> 
>           10.01064 +- 0.00831 seconds time elapsed  ( +-  0.08% )
> 
> We can see that IPC drops very seriously when calling
> down_read_trylock() at high frequency. After using SRCU,
> the IPC is at a normal level.

The interpretation looks somewhat weird to me. I'd say the workload is
stalled a lot as it fails the trylock (there might be some optimistic
spinning perhaps) and then goes to sleep. See how "pv_native_safe_halt" is
also more prominent in before. And because of that sleeping, there's less
instructions executed in the same amount of cycles (as it's a system wide
collection, otherwise it wouldn't be collecting the sleeping processes).

> 
> Signed-off-by: Qi Zheng <zhengqi.arch@bytedance.com>

Other than that:

Acked-by: Vlastimil Babka <Vbabka@suse.cz>

A small thing below:

> ---
>  mm/vmscan.c | 46 +++++++++++++++++++++++++++-------------------
>  1 file changed, 27 insertions(+), 19 deletions(-)
> 
> diff --git a/mm/vmscan.c b/mm/vmscan.c
> index 8515ac40bcaf..1de9bc3e5aa2 100644
> --- a/mm/vmscan.c
> +++ b/mm/vmscan.c
> @@ -57,6 +57,7 @@
>  #include <linux/khugepaged.h>
>  #include <linux/rculist_nulls.h>
>  #include <linux/random.h>
> +#include <linux/srcu.h>

I guess this should have been in patch 2/8 already? It may work accidentaly
because some other header pulls it transitively...

Re: [PATCH v4 3/8] mm: vmscan: make memcg slab shrink lockless

Posted by Qi Zheng 3 years, 1 month ago

Hi Vlastimil,

On 2023/3/9 06:46, Vlastimil Babka wrote:
> On 3/7/23 07:56, Qi Zheng wrote:
>> Like global slab shrink, this commit also uses SRCU to make
>> memcg slab shrink lockless.
>>
>> We can reproduce the down_read_trylock() hotspot through the
>> following script:
>>
>> ```
>>
>> DIR="/root/shrinker/memcg/mnt"
>>
>> do_create()
>> {
>>      mkdir -p /sys/fs/cgroup/memory/test
>>      mkdir -p /sys/fs/cgroup/perf_event/test
>>      echo 4G > /sys/fs/cgroup/memory/test/memory.limit_in_bytes
>>      for i in `seq 0 $1`;
>>      do
>>          mkdir -p /sys/fs/cgroup/memory/test/$i;
>>          echo $$ > /sys/fs/cgroup/memory/test/$i/cgroup.procs;
>>          echo $$ > /sys/fs/cgroup/perf_event/test/cgroup.procs;
>>          mkdir -p $DIR/$i;
>>      done
>> }
>>
>> do_mount()
>> {
>>      for i in `seq $1 $2`;
>>      do
>>          mount -t tmpfs $i $DIR/$i;
>>      done
>> }
>>
>> do_touch()
>> {
>>      for i in `seq $1 $2`;
>>      do
>>          echo $$ > /sys/fs/cgroup/memory/test/$i/cgroup.procs;
>>          echo $$ > /sys/fs/cgroup/perf_event/test/cgroup.procs;
>>              dd if=/dev/zero of=$DIR/$i/file$i bs=1M count=1 &
>>      done
>> }
>>
>> case "$1" in
>>    touch)
>>      do_touch $2 $3
>>      ;;
>>    test)
>>        do_create 4000
>>      do_mount 0 4000
>>      do_touch 0 3000
>>      ;;
>>    *)
>>      exit 1
>>      ;;
>> esac
>> ```
>>
>> Save the above script, then run test and touch commands.
>> Then we can use the following perf command to view hotspots:
>>
>> perf top -U -F 999
>>
>> 1) Before applying this patchset:
>>
>>    32.31%  [kernel]           [k] down_read_trylock
>>    19.40%  [kernel]           [k] pv_native_safe_halt
>>    16.24%  [kernel]           [k] up_read
>>    15.70%  [kernel]           [k] shrink_slab
>>     4.69%  [kernel]           [k] _find_next_bit
>>     2.62%  [kernel]           [k] shrink_node
>>     1.78%  [kernel]           [k] shrink_lruvec
>>     0.76%  [kernel]           [k] do_shrink_slab
>>
>> 2) After applying this patchset:
>>
>>    27.83%  [kernel]           [k] _find_next_bit
>>    16.97%  [kernel]           [k] shrink_slab
>>    15.82%  [kernel]           [k] pv_native_safe_halt
>>     9.58%  [kernel]           [k] shrink_node
>>     8.31%  [kernel]           [k] shrink_lruvec
>>     5.64%  [kernel]           [k] do_shrink_slab
>>     3.88%  [kernel]           [k] mem_cgroup_iter
>>
>> At the same time, we use the following perf command to capture
>> IPC information:
>>
>> perf stat -e cycles,instructions -G test -a --repeat 5 -- sleep 10
>>
>> 1) Before applying this patchset:
>>
>>   Performance counter stats for 'system wide' (5 runs):
>>
>>        454187219766      cycles                    test                    ( +-  1.84% )
>>         78896433101      instructions              test #    0.17  insn per cycle           ( +-  0.44% )
>>
>>          10.0020430 +- 0.0000366 seconds time elapsed  ( +-  0.00% )
>>
>> 2) After applying this patchset:
>>
>>   Performance counter stats for 'system wide' (5 runs):
>>
>>        841954709443      cycles                    test                    ( +- 15.80% )  (98.69%)
>>        527258677936      instructions              test #    0.63  insn per cycle           ( +- 15.11% )  (98.68%)
>>
>>            10.01064 +- 0.00831 seconds time elapsed  ( +-  0.08% )
>>
>> We can see that IPC drops very seriously when calling
>> down_read_trylock() at high frequency. After using SRCU,
>> the IPC is at a normal level.
> 
> The interpretation looks somewhat weird to me. I'd say the workload is
> stalled a lot as it fails the trylock (there might be some optimistic
> spinning perhaps) and then goes to sleep. See how "pv_native_safe_halt" is
> also more prominent in before. And because of that sleeping, there's less
> instructions executed in the same amount of cycles (as it's a system wide
> collection, otherwise it wouldn't be collecting the sleeping processes).

But in my tests, the trylock basically did not fail, so I think it is
caused by high-frequency atomic operation.

bpftrace -e 'kr:down_read_trylock {@[kstack, retval]=count();} 
interval:s:1 {exit();}'

Attaching 2 probes...

<...>
@[
     shrink_slab+288
     shrink_node+640
     do_try_to_free_pages+203
     try_to_free_mem_cgroup_pages+266
     try_charge_memcg+412
     charge_memcg+51
     __mem_cgroup_charge+44
     __handle_mm_fault+2119
     handle_mm_fault+272
     do_user_addr_fault+712
     exc_page_fault+124
     asm_exc_page_fault+38
     clear_user_erms+14
     read_zero+86
     vfs_read+173
     ksys_read+93
     do_syscall_64+56
     entry_SYSCALL_64_after_hwframe+99
, 1]: 617019
@[
     shrink_slab+288
     shrink_node+640
     do_try_to_free_pages+203
     try_to_free_mem_cgroup_pages+266
     try_charge_memcg+412
     charge_memcg+51
     __mem_cgroup_charge+44
     shmem_add_to_page_cache+545
     shmem_get_folio_gfp+621
     shmem_write_begin+95
     generic_perform_write+257
     __generic_file_write_iter+202
     generic_file_write_iter+97
     vfs_write+704
     ksys_write+93
     do_syscall_64+56
     entry_SYSCALL_64_after_hwframe+99
, 1]: 617065

> 
>>
>> Signed-off-by: Qi Zheng <zhengqi.arch@bytedance.com>
> 
> Other than that:
> 
> Acked-by: Vlastimil Babka <Vbabka@suse.cz>

Thanks.

> 
> A small thing below:
> 
>> ---
>>   mm/vmscan.c | 46 +++++++++++++++++++++++++++-------------------
>>   1 file changed, 27 insertions(+), 19 deletions(-)
>>
>> diff --git a/mm/vmscan.c b/mm/vmscan.c
>> index 8515ac40bcaf..1de9bc3e5aa2 100644
>> --- a/mm/vmscan.c
>> +++ b/mm/vmscan.c
>> @@ -57,6 +57,7 @@
>>   #include <linux/khugepaged.h>
>>   #include <linux/rculist_nulls.h>
>>   #include <linux/random.h>
>> +#include <linux/srcu.h>
> 
> I guess this should have been in patch 2/8 already? It may work accidentaly
> because some other header pulls it transitively...

Yeah, in fact, patch 3/8 also can compile successfully without srcu.h,
but maybe it is better to explicitly include this header file, I will
add it in patch 2/8.

Thanks,
Qi

> 
>

Re: [PATCH v4 3/8] mm: vmscan: make memcg slab shrink lockless

Posted by Kirill Tkhai 3 years, 1 month ago

On 07.03.2023 09:56, Qi Zheng wrote:
> Like global slab shrink, this commit also uses SRCU to make
> memcg slab shrink lockless.
> 
> We can reproduce the down_read_trylock() hotspot through the
> following script:
> 
> ```
> 
> DIR="/root/shrinker/memcg/mnt"
> 
> do_create()
> {
>     mkdir -p /sys/fs/cgroup/memory/test
>     mkdir -p /sys/fs/cgroup/perf_event/test
>     echo 4G > /sys/fs/cgroup/memory/test/memory.limit_in_bytes
>     for i in `seq 0 $1`;
>     do
>         mkdir -p /sys/fs/cgroup/memory/test/$i;
>         echo $$ > /sys/fs/cgroup/memory/test/$i/cgroup.procs;
>         echo $$ > /sys/fs/cgroup/perf_event/test/cgroup.procs;
>         mkdir -p $DIR/$i;
>     done
> }
> 
> do_mount()
> {
>     for i in `seq $1 $2`;
>     do
>         mount -t tmpfs $i $DIR/$i;
>     done
> }
> 
> do_touch()
> {
>     for i in `seq $1 $2`;
>     do
>         echo $$ > /sys/fs/cgroup/memory/test/$i/cgroup.procs;
>         echo $$ > /sys/fs/cgroup/perf_event/test/cgroup.procs;
>             dd if=/dev/zero of=$DIR/$i/file$i bs=1M count=1 &
>     done
> }
> 
> case "$1" in
>   touch)
>     do_touch $2 $3
>     ;;
>   test)
>       do_create 4000
>     do_mount 0 4000
>     do_touch 0 3000
>     ;;
>   *)
>     exit 1
>     ;;
> esac
> ```
> 
> Save the above script, then run test and touch commands.
> Then we can use the following perf command to view hotspots:
> 
> perf top -U -F 999
> 
> 1) Before applying this patchset:
> 
>   32.31%  [kernel]           [k] down_read_trylock
>   19.40%  [kernel]           [k] pv_native_safe_halt
>   16.24%  [kernel]           [k] up_read
>   15.70%  [kernel]           [k] shrink_slab
>    4.69%  [kernel]           [k] _find_next_bit
>    2.62%  [kernel]           [k] shrink_node
>    1.78%  [kernel]           [k] shrink_lruvec
>    0.76%  [kernel]           [k] do_shrink_slab
> 
> 2) After applying this patchset:
> 
>   27.83%  [kernel]           [k] _find_next_bit
>   16.97%  [kernel]           [k] shrink_slab
>   15.82%  [kernel]           [k] pv_native_safe_halt
>    9.58%  [kernel]           [k] shrink_node
>    8.31%  [kernel]           [k] shrink_lruvec
>    5.64%  [kernel]           [k] do_shrink_slab
>    3.88%  [kernel]           [k] mem_cgroup_iter
> 
> At the same time, we use the following perf command to capture
> IPC information:
> 
> perf stat -e cycles,instructions -G test -a --repeat 5 -- sleep 10
> 
> 1) Before applying this patchset:
> 
>  Performance counter stats for 'system wide' (5 runs):
> 
>       454187219766      cycles                    test                    ( +-  1.84% )
>        78896433101      instructions              test #    0.17  insn per cycle           ( +-  0.44% )
> 
>         10.0020430 +- 0.0000366 seconds time elapsed  ( +-  0.00% )
> 
> 2) After applying this patchset:
> 
>  Performance counter stats for 'system wide' (5 runs):
> 
>       841954709443      cycles                    test                    ( +- 15.80% )  (98.69%)
>       527258677936      instructions              test #    0.63  insn per cycle           ( +- 15.11% )  (98.68%)
> 
>           10.01064 +- 0.00831 seconds time elapsed  ( +-  0.08% )
> 
> We can see that IPC drops very seriously when calling
> down_read_trylock() at high frequency. After using SRCU,
> the IPC is at a normal level.
> 
> Signed-off-by: Qi Zheng <zhengqi.arch@bytedance.com>

Acked-by: Kirill Tkhai <tkhai@ya.ru>

> ---
>  mm/vmscan.c | 46 +++++++++++++++++++++++++++-------------------
>  1 file changed, 27 insertions(+), 19 deletions(-)
> 
> diff --git a/mm/vmscan.c b/mm/vmscan.c
> index 8515ac40bcaf..1de9bc3e5aa2 100644
> --- a/mm/vmscan.c
> +++ b/mm/vmscan.c
> @@ -57,6 +57,7 @@
>  #include <linux/khugepaged.h>
>  #include <linux/rculist_nulls.h>
>  #include <linux/random.h>
> +#include <linux/srcu.h>
>  
>  #include <asm/tlbflush.h>
>  #include <asm/div64.h>
> @@ -221,8 +222,21 @@ static inline int shrinker_defer_size(int nr_items)
>  static struct shrinker_info *shrinker_info_protected(struct mem_cgroup *memcg,
>  						     int nid)
>  {
> -	return rcu_dereference_protected(memcg->nodeinfo[nid]->shrinker_info,
> -					 lockdep_is_held(&shrinker_rwsem));
> +	return srcu_dereference_check(memcg->nodeinfo[nid]->shrinker_info,
> +				      &shrinker_srcu,
> +				      lockdep_is_held(&shrinker_rwsem));
> +}
> +
> +static struct shrinker_info *shrinker_info_srcu(struct mem_cgroup *memcg,
> +						     int nid)
> +{
> +	return srcu_dereference(memcg->nodeinfo[nid]->shrinker_info,
> +				&shrinker_srcu);
> +}
> +
> +static void free_shrinker_info_rcu(struct rcu_head *head)
> +{
> +	kvfree(container_of(head, struct shrinker_info, rcu));
>  }
>  
>  static inline bool need_expand(int new_nr_max, int old_nr_max)
> @@ -269,7 +283,7 @@ static int expand_one_shrinker_info(struct mem_cgroup *memcg,
>  		       defer_size - old_defer_size);
>  
>  		rcu_assign_pointer(pn->shrinker_info, new);
> -		kvfree_rcu(old, rcu);
> +		call_srcu(&shrinker_srcu, &old->rcu, free_shrinker_info_rcu);
>  	}
>  
>  	return 0;
> @@ -355,15 +369,16 @@ void set_shrinker_bit(struct mem_cgroup *memcg, int nid, int shrinker_id)
>  {
>  	if (shrinker_id >= 0 && memcg && !mem_cgroup_is_root(memcg)) {
>  		struct shrinker_info *info;
> +		int srcu_idx;
>  
> -		rcu_read_lock();
> -		info = rcu_dereference(memcg->nodeinfo[nid]->shrinker_info);
> +		srcu_idx = srcu_read_lock(&shrinker_srcu);
> +		info = shrinker_info_srcu(memcg, nid);
>  		if (!WARN_ON_ONCE(shrinker_id >= info->map_nr_max)) {
>  			/* Pairs with smp mb in shrink_slab() */
>  			smp_mb__before_atomic();
>  			set_bit(shrinker_id, info->map);
>  		}
> -		rcu_read_unlock();
> +		srcu_read_unlock(&shrinker_srcu, srcu_idx);
>  	}
>  }
>  
> @@ -377,7 +392,6 @@ static int prealloc_memcg_shrinker(struct shrinker *shrinker)
>  		return -ENOSYS;
>  
>  	down_write(&shrinker_rwsem);
> -	/* This may call shrinker, so it must use down_read_trylock() */
>  	id = idr_alloc(&shrinker_idr, shrinker, 0, 0, GFP_KERNEL);
>  	if (id < 0)
>  		goto unlock;
> @@ -411,7 +425,7 @@ static long xchg_nr_deferred_memcg(int nid, struct shrinker *shrinker,
>  {
>  	struct shrinker_info *info;
>  
> -	info = shrinker_info_protected(memcg, nid);
> +	info = shrinker_info_srcu(memcg, nid);
>  	return atomic_long_xchg(&info->nr_deferred[shrinker->id], 0);
>  }
>  
> @@ -420,7 +434,7 @@ static long add_nr_deferred_memcg(long nr, int nid, struct shrinker *shrinker,
>  {
>  	struct shrinker_info *info;
>  
> -	info = shrinker_info_protected(memcg, nid);
> +	info = shrinker_info_srcu(memcg, nid);
>  	return atomic_long_add_return(nr, &info->nr_deferred[shrinker->id]);
>  }
>  
> @@ -898,15 +912,14 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
>  {
>  	struct shrinker_info *info;
>  	unsigned long ret, freed = 0;
> +	int srcu_idx;
>  	int i;
>  
>  	if (!mem_cgroup_online(memcg))
>  		return 0;
>  
> -	if (!down_read_trylock(&shrinker_rwsem))
> -		return 0;
> -
> -	info = shrinker_info_protected(memcg, nid);
> +	srcu_idx = srcu_read_lock(&shrinker_srcu);
> +	info = shrinker_info_srcu(memcg, nid);
>  	if (unlikely(!info))
>  		goto unlock;
>  
> @@ -956,14 +969,9 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
>  				set_shrinker_bit(memcg, nid, i);
>  		}
>  		freed += ret;
> -
> -		if (rwsem_is_contended(&shrinker_rwsem)) {
> -			freed = freed ? : 1;
> -			break;
> -		}
>  	}
>  unlock:
> -	up_read(&shrinker_rwsem);
> +	srcu_read_unlock(&shrinker_srcu, srcu_idx);
>  	return freed;
>  }
>  #else /* CONFIG_MEMCG */