[PATCH 2/3] mm: zswap: fix global shrinker error handling logic

Posted by Takero Funaki 1 year, 8 months ago
This patch fixes the zswap global shrinker, which did not shrink the
zpool as expected.

The issue it addresses is that `shrink_worker()` did not distinguish
between unexpected errors and expected error codes that should be
skipped, such as when there is no stored page in a memcg. This led to
the shrinking process being aborted as soon as one of these expected
error codes was returned.

The shrinker should ignore these cases and skip to the next memcg.
However, if every memcg is skipped this way, nothing inside the walk
ever counts as a failure and the worker could retry indefinitely. To
address this, this patch tracks progress while walking the memcg tree
and counts a failure only when a complete tree walk ends without any
progress.

To handle the empty memcg case, the helper function `shrink_memcg()` is
modified to check whether the memcg has any stored pages and to return
-ENOENT if it does not.
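
The intended control flow can be sketched as a minimal userspace model
(illustration only, not the mm/zswap.c code itself; outcome[],
shrink_one(), pages and thr are made-up stand-ins for the memcg tree
walk, zswap_total_pages() and zswap_accept_thr_pages()):

/* Toy model of the shrink_worker() error handling in this patch. */
#include <errno.h>
#include <stdio.h>

#define MAX_RECLAIM_RETRIES 5	/* toy value for the example */

static int pages = 8;		/* stands in for zswap_total_pages() */
static const int thr = 2;	/* stands in for zswap_accept_thr_pages() */

/* simulated per-memcg outcomes: 0 = wrote back a page, others skip/fail */
static const int outcome[] = { -ENOENT, -EINVAL, 0, -EAGAIN, 0 };
#define NR_MEMCGS (int)(sizeof(outcome) / sizeof(outcome[0]))

static int shrink_one(int idx)
{
	if (outcome[idx] == 0)
		pages--;	/* successful writeback shrinks the pool */
	return outcome[idx];
}

int main(void)
{
	int failures = 0, progress = 1;	/* first walk may start mid-tree */
	int idx = 0;			/* cursor into the simulated tree */

	do {
		if (idx == NR_MEMCGS) {		/* walk wrapped: memcg == NULL */
			idx = 0;
			/* tree walk completed but no progress */
			if (!progress && ++failures == MAX_RECLAIM_RETRIES)
				break;
			progress = 0;
			continue;
		}

		int ret = shrink_one(idx++);

		/* not a writeback candidate memcg: skip, not a failure */
		if (ret == -EINVAL || ret == -ENOENT)
			continue;

		if (ret && ++failures == MAX_RECLAIM_RETRIES)
			break;

		++progress;
	} while (pages > thr);

	printf("pages=%d failures=%d\n", pages, failures);
	return 0;
}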

Fixes: a65b0e7607cc ("zswap: make shrinking memcg-aware")
Signed-off-by: Takero Funaki <flintglass@gmail.com>
---
 mm/zswap.c | 31 ++++++++++++++++++++++++++-----
 1 file changed, 26 insertions(+), 5 deletions(-)

diff --git a/mm/zswap.c b/mm/zswap.c
index 0b1052cee36c..08a6f5a6bf62 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -1304,7 +1304,7 @@ static struct shrinker *zswap_alloc_shrinker(void)
 
 static int shrink_memcg(struct mem_cgroup *memcg)
 {
-	int nid, shrunk = 0;
+	int nid, shrunk = 0, stored = 0;
 
 	if (!mem_cgroup_zswap_writeback_enabled(memcg))
 		return -EINVAL;
@@ -1319,9 +1319,16 @@ static int shrink_memcg(struct mem_cgroup *memcg)
 	for_each_node_state(nid, N_NORMAL_MEMORY) {
 		unsigned long nr_to_walk = 1;
 
+		if (!list_lru_count_one(&zswap_list_lru, nid, memcg))
+			continue;
+		++stored;
 		shrunk += list_lru_walk_one(&zswap_list_lru, nid, memcg,
 					    &shrink_memcg_cb, NULL, &nr_to_walk);
 	}
+
+	if (!stored)
+		return -ENOENT;
+
 	return shrunk ? 0 : -EAGAIN;
 }
 
@@ -1329,12 +1336,18 @@ static void shrink_worker(struct work_struct *w)
 {
 	struct mem_cgroup *memcg = NULL;
 	struct mem_cgroup *next_memcg;
-	int ret, failures = 0;
+	int ret, failures = 0, progress;
 	unsigned long thr;
 
 	/* Reclaim down to the accept threshold */
 	thr = zswap_accept_thr_pages();
 
+	/*
+	 * We might start from the last memcg.
+	 * That is not a failure.
+	 */
+	progress = 1;
+
 	/* global reclaim will select cgroup in a round-robin fashion.
 	 *
 	 * We save iteration cursor memcg into zswap_next_shrink,
@@ -1366,9 +1379,12 @@ static void shrink_worker(struct work_struct *w)
 		 */
 		if (!memcg) {
 			spin_unlock(&zswap_shrink_lock);
-			if (++failures == MAX_RECLAIM_RETRIES)
+
+			/* tree walk completed but no progress */
+			if (!progress && ++failures == MAX_RECLAIM_RETRIES)
 				break;
 
+			progress = 0;
 			goto resched;
 		}
 
@@ -1391,10 +1407,15 @@ static void shrink_worker(struct work_struct *w)
 		/* drop the extra reference */
 		mem_cgroup_put(memcg);
 
-		if (ret == -EINVAL)
-			break;
+		/* not a writeback candidate memcg */
+		if (ret == -EINVAL || ret == -ENOENT)
+			continue;
+
 		if (ret && ++failures == MAX_RECLAIM_RETRIES)
 			break;
+
+		++progress;
+		/* reschedule as we performed some IO */
 resched:
 		cond_resched();
 	} while (zswap_total_pages() > thr);
-- 
2.43.0
Re: [PATCH 2/3] mm: zswap: fix global shrinker error handling logic
Posted by Nhat Pham 1 year, 8 months ago
On Mon, May 27, 2024 at 9:34 PM Takero Funaki <flintglass@gmail.com> wrote:
>
> This patch fixes zswap global shrinker that did not shrink zpool as
> expected.
>
> The issue it addresses is that `shrink_worker()` did not distinguish
> between unexpected errors and expected error codes that should be
> skipped, such as when there is no stored page in a memcg. This led to
> the shrinking process being aborted on the expected error codes.
>
> The shrinker should ignore these cases and skip to the next memcg.
> However,  skipping all memcgs presents another problem. To address this,
> this patch tracks progress while walking the memcg tree and checks for
> progress once the tree walk is completed.
>
> To handle the empty memcg case, the helper function `shrink_memcg()` is
> modified to check if the memcg is empty and then return -ENOENT.
>
> Fixes: a65b0e7607cc ("zswap: make shrinking memcg-aware")
> Signed-off-by: Takero Funaki <flintglass@gmail.com>
> ---
>  mm/zswap.c | 31 ++++++++++++++++++++++++++-----
>  1 file changed, 26 insertions(+), 5 deletions(-)
>
> diff --git a/mm/zswap.c b/mm/zswap.c
> index 0b1052cee36c..08a6f5a6bf62 100644
> --- a/mm/zswap.c
> +++ b/mm/zswap.c
> @@ -1304,7 +1304,7 @@ static struct shrinker *zswap_alloc_shrinker(void)
>
>  static int shrink_memcg(struct mem_cgroup *memcg)
>  {
> -       int nid, shrunk = 0;
> +       int nid, shrunk = 0, stored = 0;
>
>         if (!mem_cgroup_zswap_writeback_enabled(memcg))
>                 return -EINVAL;
> @@ -1319,9 +1319,16 @@ static int shrink_memcg(struct mem_cgroup *memcg)
>         for_each_node_state(nid, N_NORMAL_MEMORY) {
>                 unsigned long nr_to_walk = 1;
>
> +               if (!list_lru_count_one(&zswap_list_lru, nid, memcg))
> +                       continue;
> +               ++stored;
>                 shrunk += list_lru_walk_one(&zswap_list_lru, nid, memcg,
>                                             &shrink_memcg_cb, NULL, &nr_to_walk);
>         }
> +
> +       if (!stored)
> +               return -ENOENT;
> +
>         return shrunk ? 0 : -EAGAIN;
>  }
>
> @@ -1329,12 +1336,18 @@ static void shrink_worker(struct work_struct *w)
>  {
>         struct mem_cgroup *memcg = NULL;
>         struct mem_cgroup *next_memcg;
> -       int ret, failures = 0;
> +       int ret, failures = 0, progress;
>         unsigned long thr;
>
>         /* Reclaim down to the accept threshold */
>         thr = zswap_accept_thr_pages();
>
> +       /*
> +        * We might start from the last memcg.
> +        * That is not a failure.
> +        */
> +       progress = 1;
> +
>         /* global reclaim will select cgroup in a round-robin fashion.
>          *
>          * We save iteration cursor memcg into zswap_next_shrink,
> @@ -1366,9 +1379,12 @@ static void shrink_worker(struct work_struct *w)
>                  */
>                 if (!memcg) {
>                         spin_unlock(&zswap_shrink_lock);
> -                       if (++failures == MAX_RECLAIM_RETRIES)
> +
> +                       /* tree walk completed but no progress */
> +                       if (!progress && ++failures == MAX_RECLAIM_RETRIES)
>                                 break;
>
> +                       progress = 0;
>                         goto resched;
>                 }
>
> @@ -1391,10 +1407,15 @@ static void shrink_worker(struct work_struct *w)
>                 /* drop the extra reference */
>                 mem_cgroup_put(memcg);
>
> -               if (ret == -EINVAL)
> -                       break;
> +               /* not a writeback candidate memcg */
> +               if (ret == -EINVAL || ret == -ENOENT)
> +                       continue;
> +

Can we get into an infinite loop or a really long-running loop here
if all memcgs have their writeback disabled?

>                 if (ret && ++failures == MAX_RECLAIM_RETRIES)
>                         break;
> +
> +               ++progress;
> +               /* reschedule as we performed some IO */
>  resched:
>                 cond_resched();
>         } while (zswap_total_pages() > thr);
> --
> 2.43.0
>
Re: [PATCH 2/3] mm: zswap: fix global shrinker error handling logic
Posted by Takero Funaki 1 year, 8 months ago
On Wed, May 29, 2024 at 0:11 Nhat Pham <nphamcs@gmail.com> wrote:
>
> On Mon, May 27, 2024 at 9:34 PM Takero Funaki <flintglass@gmail.com> wrote:
> >
> > This patch fixes zswap global shrinker that did not shrink zpool as
> > expected.
> >
> > The issue it addresses is that `shrink_worker()` did not distinguish
> > between unexpected errors and expected error codes that should be
> > skipped, such as when there is no stored page in a memcg. This led to
> > the shrinking process being aborted on the expected error codes.
> >
> > The shrinker should ignore these cases and skip to the next memcg.
> > However,  skipping all memcgs presents another problem. To address this,
> > this patch tracks progress while walking the memcg tree and checks for
> > progress once the tree walk is completed.
> >
> > To handle the empty memcg case, the helper function `shrink_memcg()` is
> > modified to check if the memcg is empty and then return -ENOENT.
> >
> > Fixes: a65b0e7607cc ("zswap: make shrinking memcg-aware")
> > Signed-off-by: Takero Funaki <flintglass@gmail.com>
> > ---
> >  mm/zswap.c | 31 ++++++++++++++++++++++++++-----
> >  1 file changed, 26 insertions(+), 5 deletions(-)
> >
> > diff --git a/mm/zswap.c b/mm/zswap.c
> > index 0b1052cee36c..08a6f5a6bf62 100644
> > --- a/mm/zswap.c
> > +++ b/mm/zswap.c
> > @@ -1304,7 +1304,7 @@ static struct shrinker *zswap_alloc_shrinker(void)
> >
> >  static int shrink_memcg(struct mem_cgroup *memcg)
> >  {
> > -       int nid, shrunk = 0;
> > +       int nid, shrunk = 0, stored = 0;
> >
> >         if (!mem_cgroup_zswap_writeback_enabled(memcg))
> >                 return -EINVAL;
> > @@ -1319,9 +1319,16 @@ static int shrink_memcg(struct mem_cgroup *memcg)
> >         for_each_node_state(nid, N_NORMAL_MEMORY) {
> >                 unsigned long nr_to_walk = 1;
> >
> > +               if (!list_lru_count_one(&zswap_list_lru, nid, memcg))
> > +                       continue;
> > +               ++stored;
> >                 shrunk += list_lru_walk_one(&zswap_list_lru, nid, memcg,
> >                                             &shrink_memcg_cb, NULL, &nr_to_walk);
> >         }
> > +
> > +       if (!stored)
> > +               return -ENOENT;
> > +
> >         return shrunk ? 0 : -EAGAIN;
> >  }
> >
> > @@ -1329,12 +1336,18 @@ static void shrink_worker(struct work_struct *w)
> >  {
> >         struct mem_cgroup *memcg = NULL;
> >         struct mem_cgroup *next_memcg;
> > -       int ret, failures = 0;
> > +       int ret, failures = 0, progress;
> >         unsigned long thr;
> >
> >         /* Reclaim down to the accept threshold */
> >         thr = zswap_accept_thr_pages();
> >
> > +       /*
> > +        * We might start from the last memcg.
> > +        * That is not a failure.
> > +        */
> > +       progress = 1;
> > +
> >         /* global reclaim will select cgroup in a round-robin fashion.
> >          *
> >          * We save iteration cursor memcg into zswap_next_shrink,
> > @@ -1366,9 +1379,12 @@ static void shrink_worker(struct work_struct *w)
> >                  */
> >                 if (!memcg) {
> >                         spin_unlock(&zswap_shrink_lock);
> > -                       if (++failures == MAX_RECLAIM_RETRIES)
> > +
> > +                       /* tree walk completed but no progress */
> > +                       if (!progress && ++failures == MAX_RECLAIM_RETRIES)
> >                                 break;
> >
> > +                       progress = 0;
> >                         goto resched;
> >                 }

Here, the `progress` counter tracks how many memcgs successfully
evicted a page during one walk of the memcg tree (not per iteration of
the while loop); it is reset to 0 each time the walk wraps around to
the tree root. progress > 0 means the walk that just completed made
some progress. If we reach the tree root (NULL) without any progress,
that walk counts as a failure.

Before the loop starts, the progress counter is initialized to 1
because the first tree walk might not iterate over all the memcgs,
e.g. the previous worker may have been terminated at the very last
memcg.
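
To make the per-walk accounting concrete, here is a tiny userspace
model (illustration only; evictions[] and the for loop are made-up
stand-ins, the real code does this accounting inside the memcg
iteration in shrink_worker()):

#include <stdio.h>

#define MAX_RECLAIM_RETRIES 5	/* toy value for the example */

int main(void)
{
	/* pages evicted per completed tree walk; walk 0 is the partial
	 * first walk that starts from the saved cursor */
	const int evictions[] = { 0, 4, 2, 0, 0, 0, 0, 0 };
	const int nr_walks = sizeof(evictions) / sizeof(evictions[0]);
	int failures = 0;
	int progress = 1;	/* walk 0 may be partial: not a failure */

	for (int walk = 0; walk < nr_walks; walk++) {
		progress += evictions[walk];	/* evicted during this walk */

		/* memcg == NULL: the walk wrapped around to the tree root */
		if (!progress && ++failures == MAX_RECLAIM_RETRIES) {
			printf("giving up after walk %d\n", walk);
			break;
		}
		progress = 0;	/* the next walk is judged on its own */
	}
	printf("failures = %d\n", failures);
	return 0;
}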


> >
> > @@ -1391,10 +1407,15 @@ static void shrink_worker(struct work_struct *w)
> >                 /* drop the extra reference */
> >                 mem_cgroup_put(memcg);
> >
> > -               if (ret == -EINVAL)
> > -                       break;
> > +               /* not a writeback candidate memcg */
> > +               if (ret == -EINVAL || ret == -ENOENT)
> > +                       continue;
> > +
>
> Can we get into an infinite loop or a really long running loops here
> if all memcgs have their writeback disabled?
>
> >                 if (ret && ++failures == MAX_RECLAIM_RETRIES)
> >                         break;
> > +
> > +               ++progress;
> > +               /* reschedule as we performed some IO */
> >  resched:
> >                 cond_resched();
> >         } while (zswap_total_pages() > thr);
> > --
> > 2.43.0
> >



-- 

<flintglass@gmail.com>