[v1] mm, lru_gen: batch update pages when aging

[PATCH 1/3] mm, lru_gen: batch update counters on aging

Posted by Kairui Song 2 years, 1 month ago

From: Kairui Song <kasong@tencent.com>

When lru_gen is aging, it updates the mm counters page by page,
which causes higher overhead if aging happens frequently or if
a lot of pages in one generation are being moved.
Optimize this by updating the counters in batches.

Although most of the __mod_*_state helpers have their own caches,
the overhead is still observable.

Tested in a 4G memcg on an EPYC 7K62 with:

  memcached -u nobody -m 16384 -s /tmp/memcached.socket \
    -a 0766 -t 16 -B binary &

  memtier_benchmark -S /tmp/memcached.socket \
    -P memcache_binary -n allkeys \
    --key-minimum=1 --key-maximum=16000000 -d 1024 \
    --ratio=1:0 --key-pattern=P:P -c 2 -t 16 --pipeline 8 -x 6

Average result of 18 test runs:

Before: 44017.78 Ops/sec
After:  44687.08 Ops/sec (+1.5%)

Signed-off-by: Kairui Song <kasong@tencent.com>
---
 mm/vmscan.c | 64 +++++++++++++++++++++++++++++++++++++++++++++--------
 1 file changed, 55 insertions(+), 9 deletions(-)

diff --git a/mm/vmscan.c b/mm/vmscan.c
index b4ca3563bcf4..e3b4797b9729 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -3095,9 +3095,47 @@ static int folio_update_gen(struct folio *folio, int gen)
 	return ((old_flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1;
 }
 
+/*
+ * Update LRU gen in batch for each lru_gen LRU list. The batch is limited to
+ * each gen / type / zone level LRU. Batch is applied after finished or aborted
+ * scanning one LRU list.
+ */
+struct gen_update_batch {
+	int delta[MAX_NR_GENS];
+};
+
+static void lru_gen_update_batch(struct lruvec *lruvec, bool type, int zone,
+				 struct gen_update_batch *batch)
+{
+	int gen;
+	int promoted = 0;
+	struct lru_gen_folio *lrugen = &lruvec->lrugen;
+	enum lru_list lru = type ? LRU_INACTIVE_FILE : LRU_INACTIVE_ANON;
+
+	for (gen = 0; gen < MAX_NR_GENS; gen++) {
+		int delta = batch->delta[gen];
+
+		if (!delta)
+			continue;
+
+		WRITE_ONCE(lrugen->nr_pages[gen][type][zone],
+			   lrugen->nr_pages[gen][type][zone] + delta);
+
+		if (lru_gen_is_active(lruvec, gen))
+			promoted += delta;
+	}
+
+	if (promoted) {
+		__update_lru_size(lruvec, lru, zone, -promoted);
+		__update_lru_size(lruvec, lru + LRU_ACTIVE, zone, promoted);
+	}
+}
+
 /* protect pages accessed multiple times through file descriptors */
-static int folio_inc_gen(struct lruvec *lruvec, struct folio *folio, bool reclaiming)
+static int folio_inc_gen(struct lruvec *lruvec, struct folio *folio,
+			 bool reclaiming, struct gen_update_batch *batch)
 {
+	int delta = folio_nr_pages(folio);
 	int type = folio_is_file_lru(folio);
 	struct lru_gen_folio *lrugen = &lruvec->lrugen;
 	int new_gen, old_gen = lru_gen_from_seq(lrugen->min_seq[type]);
@@ -3120,7 +3158,8 @@ static int folio_inc_gen(struct lruvec *lruvec, struct folio *folio, bool reclai
 			new_flags |= BIT(PG_reclaim);
 	} while (!try_cmpxchg(&folio->flags, &old_flags, new_flags));
 
-	lru_gen_update_size(lruvec, folio, old_gen, new_gen);
+	batch->delta[old_gen] -= delta;
+	batch->delta[new_gen] += delta;
 
 	return new_gen;
 }
@@ -3663,6 +3702,7 @@ static bool inc_min_seq(struct lruvec *lruvec, int type, bool can_swap)
 {
 	int zone;
 	int remaining = MAX_LRU_BATCH;
+	struct gen_update_batch batch = { };
 	struct lru_gen_folio *lrugen = &lruvec->lrugen;
 	int new_gen, old_gen = lru_gen_from_seq(lrugen->min_seq[type]);
 
@@ -3681,12 +3721,15 @@ static bool inc_min_seq(struct lruvec *lruvec, int type, bool can_swap)
 			VM_WARN_ON_ONCE_FOLIO(folio_is_file_lru(folio) != type, folio);
 			VM_WARN_ON_ONCE_FOLIO(folio_zonenum(folio) != zone, folio);
 
-			new_gen = folio_inc_gen(lruvec, folio, false);
+			new_gen = folio_inc_gen(lruvec, folio, false, &batch);
 			list_move_tail(&folio->lru, &lrugen->folios[new_gen][type][zone]);
 
-			if (!--remaining)
+			if (!--remaining) {
+				lru_gen_update_batch(lruvec, type, zone, &batch);
 				return false;
+			}
 		}
+		lru_gen_update_batch(lruvec, type, zone, &batch);
 	}
 done:
 	reset_ctrl_pos(lruvec, type, true);
@@ -4197,7 +4240,7 @@ static int lru_gen_memcg_seg(struct lruvec *lruvec)
  ******************************************************************************/
 
 static bool sort_folio(struct lruvec *lruvec, struct folio *folio, struct scan_control *sc,
-		       int tier_idx)
+		       int tier_idx, struct gen_update_batch *batch)
 {
 	bool success;
 	int gen = folio_lru_gen(folio);
@@ -4239,7 +4282,7 @@ static bool sort_folio(struct lruvec *lruvec, struct folio *folio, struct scan_c
 	if (tier > tier_idx || refs == BIT(LRU_REFS_WIDTH)) {
 		int hist = lru_hist_from_seq(lrugen->min_seq[type]);
 
-		gen = folio_inc_gen(lruvec, folio, false);
+		gen = folio_inc_gen(lruvec, folio, false, batch);
 		list_move_tail(&folio->lru, &lrugen->folios[gen][type][zone]);
 
 		WRITE_ONCE(lrugen->protected[hist][type][tier - 1],
@@ -4249,7 +4292,7 @@ static bool sort_folio(struct lruvec *lruvec, struct folio *folio, struct scan_c
 
 	/* ineligible */
 	if (zone > sc->reclaim_idx || skip_cma(folio, sc)) {
-		gen = folio_inc_gen(lruvec, folio, false);
+		gen = folio_inc_gen(lruvec, folio, false, batch);
 		list_move_tail(&folio->lru, &lrugen->folios[gen][type][zone]);
 		return true;
 	}
@@ -4257,7 +4300,7 @@ static bool sort_folio(struct lruvec *lruvec, struct folio *folio, struct scan_c
 	/* waiting for writeback */
 	if (folio_test_locked(folio) || folio_test_writeback(folio) ||
 	    (type == LRU_GEN_FILE && folio_test_dirty(folio))) {
-		gen = folio_inc_gen(lruvec, folio, true);
+		gen = folio_inc_gen(lruvec, folio, true, batch);
 		list_move(&folio->lru, &lrugen->folios[gen][type][zone]);
 		return true;
 	}
@@ -4323,6 +4366,7 @@ static int scan_folios(struct lruvec *lruvec, struct scan_control *sc,
 	for (i = MAX_NR_ZONES; i > 0; i--) {
 		LIST_HEAD(moved);
 		int skipped_zone = 0;
+		struct gen_update_batch batch = { };
 		int zone = (sc->reclaim_idx + i) % MAX_NR_ZONES;
 		struct list_head *head = &lrugen->folios[gen][type][zone];
 
@@ -4337,7 +4381,7 @@ static int scan_folios(struct lruvec *lruvec, struct scan_control *sc,
 
 			scanned += delta;
 
-			if (sort_folio(lruvec, folio, sc, tier))
+			if (sort_folio(lruvec, folio, sc, tier, &batch))
 				sorted += delta;
 			else if (isolate_folio(lruvec, folio, sc)) {
 				list_add(&folio->lru, list);
@@ -4357,6 +4401,8 @@ static int scan_folios(struct lruvec *lruvec, struct scan_control *sc,
 			skipped += skipped_zone;
 		}
 
+		lru_gen_update_batch(lruvec, type, zone, &batch);
+
 		if (!remaining || isolated >= MIN_LRU_BATCH)
 			break;
 	}
-- 
2.43.0

Re: [PATCH 1/3] mm, lru_gen: batch update counters on aging

Posted by Chris Li 2 years, 1 month ago

Hi Kairui,

Some early feedback on your patch. I am still working my way through
your patches, so I might have more questions.

On Fri, Dec 22, 2023 at 2:24 AM Kairui Song <ryncsn@gmail.com> wrote:
>
> From: Kairui Song <kasong@tencent.com>
>
> When lru_gen is aging, it will update mm counters page by page,
> which causes a higher overhead if age happens frequently or there
> are a lot of pages in one generation getting moved.
> Optimize this by doing the counter update in batch.
>
> Although most __mod_*_state has its own caches the overhead
> is still observable.
>
> Tested in a 4G memcg on a EPYC 7K62 with:
>
>   memcached -u nobody -m 16384 -s /tmp/memcached.socket \
>     -a 0766 -t 16 -B binary &
>
>   memtier_benchmark -S /tmp/memcached.socket \
>     -P memcache_binary -n allkeys \
>     --key-minimum=1 --key-maximum=16000000 -d 1024 \
>     --ratio=1:0 --key-pattern=P:P -c 2 -t 16 --pipeline 8 -x 6
>
> Average result of 18 test runs:
>
> Before: 44017.78 Ops/sec
> After:  44687.08 Ops/sec (+1.5%)
>
> Signed-off-by: Kairui Song <kasong@tencent.com>
> ---
>  mm/vmscan.c | 64 +++++++++++++++++++++++++++++++++++++++++++++--------
>  1 file changed, 55 insertions(+), 9 deletions(-)
>
> diff --git a/mm/vmscan.c b/mm/vmscan.c
> index b4ca3563bcf4..e3b4797b9729 100644
> --- a/mm/vmscan.c
> +++ b/mm/vmscan.c
> @@ -3095,9 +3095,47 @@ static int folio_update_gen(struct folio *folio, int gen)
>         return ((old_flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1;
>  }
>
> +/*
> + * Update LRU gen in batch for each lru_gen LRU list. The batch is limited to
> + * each gen / type / zone level LRU. Batch is applied after finished or aborted
> + * scanning one LRU list.
> + */
> +struct gen_update_batch {
> +       int delta[MAX_NR_GENS];
> +};
> +
> +static void lru_gen_update_batch(struct lruvec *lruvec, bool type, int zone,
"type" need to be int, it is either  LRU_GEN_FILE or LRU_GEN_ANON.

Ideally, the type would be an enum that defines LRU_GEN_FILE and
LRU_GEN_ANON. bool is not the right C type for "type" here. The rest
of the code uses "int" for type as well.

I saw that you use "bool type" in other patches as well. They all need
to be changed to "int type".

Chris

> +                                struct gen_update_batch *batch)
> +{
> +       int gen;
> +       int promoted = 0;
> +       struct lru_gen_folio *lrugen = &lruvec->lrugen;
> +       enum lru_list lru = type ? LRU_INACTIVE_FILE : LRU_INACTIVE_ANON;
> +
> +       for (gen = 0; gen < MAX_NR_GENS; gen++) {
> +               int delta = batch->delta[gen];
> +
> +               if (!delta)
> +                       continue;
> +
> +               WRITE_ONCE(lrugen->nr_pages[gen][type][zone],
> +                          lrugen->nr_pages[gen][type][zone] + delta);
> +
> +               if (lru_gen_is_active(lruvec, gen))
> +                       promoted += delta;
> +       }
> +
> +       if (promoted) {
> +               __update_lru_size(lruvec, lru, zone, -promoted);
> +               __update_lru_size(lruvec, lru + LRU_ACTIVE, zone, promoted);
> +       }
> +}
> +
>  /* protect pages accessed multiple times through file descriptors */
> -static int folio_inc_gen(struct lruvec *lruvec, struct folio *folio, bool reclaiming)
> +static int folio_inc_gen(struct lruvec *lruvec, struct folio *folio,
> +                        bool reclaiming, struct gen_update_batch *batch)
>  {
> +       int delta = folio_nr_pages(folio);
>         int type = folio_is_file_lru(folio);
>         struct lru_gen_folio *lrugen = &lruvec->lrugen;
>         int new_gen, old_gen = lru_gen_from_seq(lrugen->min_seq[type]);
> @@ -3120,7 +3158,8 @@ static int folio_inc_gen(struct lruvec *lruvec, struct folio *folio, bool reclai
>                         new_flags |= BIT(PG_reclaim);
>         } while (!try_cmpxchg(&folio->flags, &old_flags, new_flags));
>
> -       lru_gen_update_size(lruvec, folio, old_gen, new_gen);
> +       batch->delta[old_gen] -= delta;
> +       batch->delta[new_gen] += delta;
>
>         return new_gen;
>  }
> @@ -3663,6 +3702,7 @@ static bool inc_min_seq(struct lruvec *lruvec, int type, bool can_swap)
>  {
>         int zone;
>         int remaining = MAX_LRU_BATCH;
> +       struct gen_update_batch batch = { };
>         struct lru_gen_folio *lrugen = &lruvec->lrugen;
>         int new_gen, old_gen = lru_gen_from_seq(lrugen->min_seq[type]);
>
> @@ -3681,12 +3721,15 @@ static bool inc_min_seq(struct lruvec *lruvec, int type, bool can_swap)
>                         VM_WARN_ON_ONCE_FOLIO(folio_is_file_lru(folio) != type, folio);
>                         VM_WARN_ON_ONCE_FOLIO(folio_zonenum(folio) != zone, folio);
>
> -                       new_gen = folio_inc_gen(lruvec, folio, false);
> +                       new_gen = folio_inc_gen(lruvec, folio, false, &batch);
>                         list_move_tail(&folio->lru, &lrugen->folios[new_gen][type][zone]);
>
> -                       if (!--remaining)
> +                       if (!--remaining) {
> +                               lru_gen_update_batch(lruvec, type, zone, &batch);
>                                 return false;
> +                       }
>                 }
> +               lru_gen_update_batch(lruvec, type, zone, &batch);
>         }
>  done:
>         reset_ctrl_pos(lruvec, type, true);
> @@ -4197,7 +4240,7 @@ static int lru_gen_memcg_seg(struct lruvec *lruvec)
>   ******************************************************************************/
>
>  static bool sort_folio(struct lruvec *lruvec, struct folio *folio, struct scan_control *sc,
> -                      int tier_idx)
> +                      int tier_idx, struct gen_update_batch *batch)
>  {
>         bool success;
>         int gen = folio_lru_gen(folio);
> @@ -4239,7 +4282,7 @@ static bool sort_folio(struct lruvec *lruvec, struct folio *folio, struct scan_c
>         if (tier > tier_idx || refs == BIT(LRU_REFS_WIDTH)) {
>                 int hist = lru_hist_from_seq(lrugen->min_seq[type]);
>
> -               gen = folio_inc_gen(lruvec, folio, false);
> +               gen = folio_inc_gen(lruvec, folio, false, batch);
>                 list_move_tail(&folio->lru, &lrugen->folios[gen][type][zone]);
>
>                 WRITE_ONCE(lrugen->protected[hist][type][tier - 1],
> @@ -4249,7 +4292,7 @@ static bool sort_folio(struct lruvec *lruvec, struct folio *folio, struct scan_c
>
>         /* ineligible */
>         if (zone > sc->reclaim_idx || skip_cma(folio, sc)) {
> -               gen = folio_inc_gen(lruvec, folio, false);
> +               gen = folio_inc_gen(lruvec, folio, false, batch);
>                 list_move_tail(&folio->lru, &lrugen->folios[gen][type][zone]);
>                 return true;
>         }
> @@ -4257,7 +4300,7 @@ static bool sort_folio(struct lruvec *lruvec, struct folio *folio, struct scan_c
>         /* waiting for writeback */
>         if (folio_test_locked(folio) || folio_test_writeback(folio) ||
>             (type == LRU_GEN_FILE && folio_test_dirty(folio))) {
> -               gen = folio_inc_gen(lruvec, folio, true);
> +               gen = folio_inc_gen(lruvec, folio, true, batch);
>                 list_move(&folio->lru, &lrugen->folios[gen][type][zone]);
>                 return true;
>         }
> @@ -4323,6 +4366,7 @@ static int scan_folios(struct lruvec *lruvec, struct scan_control *sc,
>         for (i = MAX_NR_ZONES; i > 0; i--) {
>                 LIST_HEAD(moved);
>                 int skipped_zone = 0;
> +               struct gen_update_batch batch = { };
>                 int zone = (sc->reclaim_idx + i) % MAX_NR_ZONES;
>                 struct list_head *head = &lrugen->folios[gen][type][zone];
>
> @@ -4337,7 +4381,7 @@ static int scan_folios(struct lruvec *lruvec, struct scan_control *sc,
>
>                         scanned += delta;
>
> -                       if (sort_folio(lruvec, folio, sc, tier))
> +                       if (sort_folio(lruvec, folio, sc, tier, &batch))
>                                 sorted += delta;
>                         else if (isolate_folio(lruvec, folio, sc)) {
>                                 list_add(&folio->lru, list);
> @@ -4357,6 +4401,8 @@ static int scan_folios(struct lruvec *lruvec, struct scan_control *sc,
>                         skipped += skipped_zone;
>                 }
>
> +               lru_gen_update_batch(lruvec, type, zone, &batch);
> +
>                 if (!remaining || isolated >= MIN_LRU_BATCH)
>                         break;
>         }
> --
> 2.43.0
>
>

Re: [PATCH 1/3] mm, lru_gen: batch update counters on aging

Posted by Kairui Song 2 years, 1 month ago

Chris Li <chrisl@kernel.org> 于2023年12月27日周三 07:43写道：
>
> Hi Kairui,
>
> Some early feedback on your patch. I am still working  my way through
> your patches.
> Might have more questions.

Hi Chris,

Thanks for the review.

> On Fri, Dec 22, 2023 at 2:24 AM Kairui Song <ryncsn@gmail.com> wrote:
> >
> > From: Kairui Song <kasong@tencent.com>
> >
> > When lru_gen is aging, it will update mm counters page by page,
> > which causes a higher overhead if age happens frequently or there
> > are a lot of pages in one generation getting moved.
> > Optimize this by doing the counter update in batch.
> >
> > Although most __mod_*_state has its own caches the overhead
> > is still observable.
> >
> > Tested in a 4G memcg on a EPYC 7K62 with:
> >
> >   memcached -u nobody -m 16384 -s /tmp/memcached.socket \
> >     -a 0766 -t 16 -B binary &
> >
> >   memtier_benchmark -S /tmp/memcached.socket \
> >     -P memcache_binary -n allkeys \
> >     --key-minimum=1 --key-maximum=16000000 -d 1024 \
> >     --ratio=1:0 --key-pattern=P:P -c 2 -t 16 --pipeline 8 -x 6
> >
> > Average result of 18 test runs:
> >
> > Before: 44017.78 Ops/sec
> > After:  44687.08 Ops/sec (+1.5%)
> >
> > Signed-off-by: Kairui Song <kasong@tencent.com>
> > ---
> >  mm/vmscan.c | 64 +++++++++++++++++++++++++++++++++++++++++++++--------
> >  1 file changed, 55 insertions(+), 9 deletions(-)
> >
> > diff --git a/mm/vmscan.c b/mm/vmscan.c
> > index b4ca3563bcf4..e3b4797b9729 100644
> > --- a/mm/vmscan.c
> > +++ b/mm/vmscan.c
> > @@ -3095,9 +3095,47 @@ static int folio_update_gen(struct folio *folio, int gen)
> >         return ((old_flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1;
> >  }
> >
> > +/*
> > + * Update LRU gen in batch for each lru_gen LRU list. The batch is limited to
> > + * each gen / type / zone level LRU. Batch is applied after finished or aborted
> > + * scanning one LRU list.
> > + */
> > +struct gen_update_batch {
> > +       int delta[MAX_NR_GENS];
> > +};
> > +
> > +static void lru_gen_update_batch(struct lruvec *lruvec, bool type, int zone,
> "type" need to be int, it is either  LRU_GEN_FILE or LRU_GEN_ANON.

Yes, I'll update it with some more test results later.

Re: [PATCH 1/3] mm, lru_gen: batch update counters on aging

Posted by Yu Zhao 2 years, 1 month ago

On Fri, Dec 22, 2023 at 3:24 AM Kairui Song <ryncsn@gmail.com> wrote:
>
> From: Kairui Song <kasong@tencent.com>
>
> When lru_gen is aging, it will update mm counters page by page,
> which causes a higher overhead if age happens frequently or there
> are a lot of pages in one generation getting moved.
> Optimize this by doing the counter update in batch.
>
> Although most __mod_*_state has its own caches the overhead
> is still observable.
>
> Tested in a 4G memcg on a EPYC 7K62 with:
>
>   memcached -u nobody -m 16384 -s /tmp/memcached.socket \
>     -a 0766 -t 16 -B binary &
>
>   memtier_benchmark -S /tmp/memcached.socket \
>     -P memcache_binary -n allkeys \
>     --key-minimum=1 --key-maximum=16000000 -d 1024 \
>     --ratio=1:0 --key-pattern=P:P -c 2 -t 16 --pipeline 8 -x 6
>
> Average result of 18 test runs:
>
> Before: 44017.78 Ops/sec
> After:  44687.08 Ops/sec (+1.5%)
>
> Signed-off-by: Kairui Song <kasong@tencent.com>
> ---
>  mm/vmscan.c | 64 +++++++++++++++++++++++++++++++++++++++++++++--------
>  1 file changed, 55 insertions(+), 9 deletions(-)

Usually most reclaim activity happens in kswapd, e.g., from the
MongoDB benchmark (--duration=900):
pgscan_kswapd 11294317
pgscan_direct 128
And kswapd always has current->reclaim_state->mm_walk. So the
following should bring the vast majority of the improvement (assuming
it's not noise) with far less code change:

diff --git a/mm/vmscan.c b/mm/vmscan.c
index 9dd8977de5a2..c06e00635d2b 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -3095,6 +3095,8 @@ static int folio_update_gen(struct folio *folio, int gen)
 static int folio_inc_gen(struct lruvec *lruvec, struct folio *folio, bool reclaiming)
 {
        int type = folio_is_file_lru(folio);
+       struct lru_gen_mm_walk *walk = current->reclaim_state->mm_walk;
+
        struct lru_gen_folio *lrugen = &lruvec->lrugen;
        int new_gen, old_gen = lru_gen_from_seq(lrugen->min_seq[type]);
        unsigned long new_flags, old_flags = READ_ONCE(folio->flags);
@@ -3116,7 +3118,10 @@ static int folio_inc_gen(struct lruvec *lruvec, struct folio *folio, bool reclai
                        new_flags |= BIT(PG_reclaim);
        } while (!try_cmpxchg(&folio->flags, &old_flags, new_flags));

-       lru_gen_update_size(lruvec, folio, old_gen, new_gen);
+       if (walk)
+               update_batch_size(walk, folio, old_gen, new_gen);
+       else
+               lru_gen_update_size(lruvec, folio, old_gen, new_gen);

        return new_gen;
 }
@@ -3739,6 +3744,8 @@ static void inc_max_seq(struct lruvec *lruvec, bool can_swap, bool force_scan)
        int prev, next;
        int type, zone;
        struct lru_gen_folio *lrugen = &lruvec->lrugen;
+       struct lru_gen_mm_walk *walk = current->reclaim_state->mm_walk;
+
 restart:
        spin_lock_irq(&lruvec->lru_lock);

@@ -3758,6 +3765,9 @@ static void inc_max_seq(struct lruvec *lruvec, bool can_swap, bool force_scan)
                goto restart;
        }

+       if (walk && walk->batched)
+               reset_batch_size(lruvec, walk);
+
        /*
         * Update the active/inactive LRU sizes for compatibility. Both sides of
         * the current max_seq need to be covered, since max_seq+1 can overlap

Re: [PATCH 1/3] mm, lru_gen: batch update counters on aging

Posted by Kairui Song 2 years, 1 month ago

Yu Zhao <yuzhao@google.com> 于2023年12月25日周一 15:29写道：
>
> On Fri, Dec 22, 2023 at 3:24 AM Kairui Song <ryncsn@gmail.com> wrote:
> >
> > From: Kairui Song <kasong@tencent.com>
> >
> > When lru_gen is aging, it will update mm counters page by page,
> > which causes a higher overhead if age happens frequently or there
> > are a lot of pages in one generation getting moved.
> > Optimize this by doing the counter update in batch.
> >
> > Although most __mod_*_state has its own caches the overhead
> > is still observable.
> >
> > Tested in a 4G memcg on a EPYC 7K62 with:
> >
> >   memcached -u nobody -m 16384 -s /tmp/memcached.socket \
> >     -a 0766 -t 16 -B binary &
> >
> >   memtier_benchmark -S /tmp/memcached.socket \
> >     -P memcache_binary -n allkeys \
> >     --key-minimum=1 --key-maximum=16000000 -d 1024 \
> >     --ratio=1:0 --key-pattern=P:P -c 2 -t 16 --pipeline 8 -x 6
> >
> > Average result of 18 test runs:
> >
> > Before: 44017.78 Ops/sec
> > After:  44687.08 Ops/sec (+1.5%)
> >
> > Signed-off-by: Kairui Song <kasong@tencent.com>
> > ---
> >  mm/vmscan.c | 64 +++++++++++++++++++++++++++++++++++++++++++++--------
> >  1 file changed, 55 insertions(+), 9 deletions(-)
>
> Usually most reclaim activity happens in kswapd, e.g., from the
> MongoDB benchmark (--duration=900):
> pgscan_kswapd 11294317
> pgscan_direct 128
> And kswapd always has current->reclaim_state->mm_walk. So the
> following should bring the vast majority of the improvement (assuming
> it's not noise) with far less code change:

Hi Yu,

This won't work for the fault path (e.g., the memtier test):
Samples: 30K of event 'cycles', Event count (approx.): 69411674954
  Children      Self  Command          Shared Object               Symbol
-   85.95%     0.69%  memcached        [kernel.vmlinux]            [k] asm_exc_page_fault
   - 85.25% asm_exc_page_fault
      - 85.00% exc_page_fault
         - 84.81% do_user_addr_fault
            - 84.01% handle_mm_fault
               - 83.70% __handle_mm_fault
                  - 82.57% do_swap_page
                     - 61.66% mem_cgroup_swapin_charge_folio
                        - 61.11% charge_memcg
                           - 60.76% try_charge_memcg
                              - 60.68% try_to_free_mem_cgroup_pages
                                   do_try_to_free_pages
                                 - shrink_node
                                    - 60.51% shrink_lruvec
                                       - 60.45% try_to_shrink_lruvec
                                          + 60.42% evict_folios
                     + 10.00% __swap_entry_free
                     + 3.81% swap_read_folio_bdev_sync
                     + 1.49% __pte_offset_map_lock
                     + 0.92% swap_cache_get_folio
                     + 0.80% folio_add_lru
                     + 0.75% vma_alloc_folio
                     + 0.60% swap_read_folio
                  + 0.73% do_anonymous_page
              0.54% lock_vma_under_rcu

And:
sudo cat /sys/kernel/debug/lru_gen_full | grep -A 25 benchmark
memcg    72 /benchmark
 node     0
        218       3283          1x          0x
                     0          0           0           0           0           0           0
                     1          0           0           0           0           0           0
                     2          0           0           0           0           0           0
                     3          0           0           0           0           0           0
                                0           0           0           0           0           0
        219       2472       2756           0
                     0      14775r     303395e          0p          2r          2e          0p
                     1          0r          0e          0p          0r          0e          0p
                     2          0r          0e          0p          0r          0e          0p
                     3          0r          0e      15262p          0r          0e          0p
                                0           0           0           0           0           0
        220       1652     456032          22
                     0          0           0           0           0           0           0
                     1          0           0           0           0           0           0
                     2          0           0           0           0           0           0
                     3          0           0           0           0           0           0
                                0           0           0           0           0           0
        221        808     578570          13
                     0      15665R     309071T          0           0R          1T          0
                     1          0R          0T          0           0R          0T          0
                     2          0R          0T          0           0R          0T          0
                     3          0R      15364T          0           0R          0T          0
                          9191594L    3532525O    2425411Y      94393N      18515F      10578A

It ages fast.

It's hard to share the code with mm_walk, because the next patch
moves the pages in bulk, and there is no such logic for mm_walk.

Indeed, it's not very effective with this benchmark; I'll update with
some other tests.

[PATCH 1/3] mm, lru_gen: batch update counters on aging
[PATCH 2/3] mm, lru_gen: move pages in bulk when aging
[PATCH 3/3] mm, lru_gen: try to prefetch next page when scanning LRU