[v1] Revert "mm: skip CMA pages when they are not available"

[PATCH] Revert "mm: skip CMA pages when they are not available"

Posted by liuhailong@oppo.com 1 year, 11 months ago

From: "Hailong.Liu" <liuhailong@oppo.com>

This reverts commit 5da226dbfce3a2f44978c2c7cf88166e69a6788b.

That commit may cause the system to become unresponsive. If there are many
CMA pages on the LRU list and the system is low on memory, many tasks enter
direct reclaim and waste CPU time isolating pages and returning. This was
tested on an android-5.15 device; the task call stacks are shown below.

Task name: UsbFfs-worker [affinity: 0xff] pid: 3374 cpu: 7 prio: 120 start: ffffff8897a35c80
state: 0x0[R] exit_state: 0x0 stack base: 0xffffffc01eaa0000
Last_enqueued_ts:       0.000000000 Last_sleep_ts:       0.000000000
Stack:
[<ffffffd32ee7d910>] __switch_to+0x180
[<ffffffd3302022fc>] __schedule+0x4dc
[<ffffffd330201e08>] preempt_schedule+0x5c
[<ffffffd33020a4d0>] _raw_spin_unlock_irq+0x54
[<ffffffd32f14906c>] shrink_inactive_list+0x1d0
[<ffffffd32f143998>] shrink_lruvec+0x1bc
[<ffffffd32f147c0c>] shrink_node_memcgs+0x184
[<ffffffd32f147414>] shrink_node+0x2d0
[<ffffffd32f146d38>] shrink_zones+0x14c
[<ffffffd32f142e84>] do_try_to_free_pages+0xe8
[<ffffffd32f142b08>] try_to_free_pages+0x2e0
[<ffffffd32f1a8e44>] __alloc_pages_direct_reclaim+0x84
[<ffffffd32f1a2d58>] __alloc_pages_slowpath+0x4d0
[<ffffffd32f1a23bc>] __alloc_pages_nodemask[jt]+0x124
[<ffffffd32f19a220>] __vmalloc_area_node+0x188
[<ffffffd32f19a540>] __vmalloc_node+0x148
[<ffffffd32f19a60c>] vmalloc+0x4c
[<ffffffd32f910218>] ffs_epfile_io+0x258
[<ffffffd330033780>] kretprobe_trampoline[jt]+0x0
[<ffffffd330033780>] kretprobe_trampoline[jt]+0x0
[<ffffffd32f28129c>] __io_submit_one+0x1c0
[<ffffffd32f280e38>] io_submit_one+0x88
[<ffffffd32f280c88>] __do_sys_io_submit+0x178
[<ffffffd32f27eac0>] __arm64_sys_io_submit+0x20
[<ffffffd32eeabb74>] el0_svc_common.llvm.9961749221945255377+0xd0
[<ffffffd32eeaba34>] do_el0_svc+0x28
[<ffffffd32ff21be8>] el0_svc+0x14
[<ffffffd32ff21b70>] el0_sync_handler+0x88
[<ffffffd32ee128b8>] el0_sync+0x1b8

Task name: kthreadd [affinity: 0xff] pid: 2 cpu: 7 prio: 120 start: ffffff87808c0000
state: 0x0[R] exit_state: 0x0 stack base: 0xffffffc008078000
Last_enqueued_ts:       0.000000000 Last_sleep_ts:       0.000000000
Stack:
[<ffffffd32ee7d910>] __switch_to+0x180
[<ffffffd3302022fc>] __schedule+0x4dc
[<ffffffd330201e08>] preempt_schedule+0x5c
[<ffffffd33020a4d0>] _raw_spin_unlock_irq+0x54
[<ffffffd32f149168>] shrink_inactive_list+0x2cc
[<ffffffd32f143998>] shrink_lruvec+0x1bc
[<ffffffd32f147c0c>] shrink_node_memcgs+0x184
[<ffffffd32f147414>] shrink_node+0x2d0
[<ffffffd32f146d38>] shrink_zones+0x14c
[<ffffffd32f142e84>] do_try_to_free_pages+0xe8
[<ffffffd32f142b08>] try_to_free_pages+0x2e0
[<ffffffd32f1a8e44>] __alloc_pages_direct_reclaim+0x84
[<ffffffd32f1a2d58>] __alloc_pages_slowpath+0x4d0
[<ffffffd32f1a23bc>] __alloc_pages_nodemask[jt]+0x124
[<ffffffd32f19a220>] __vmalloc_area_node+0x188
[<ffffffd32f19a044>] __vmalloc_node_range+0x88
[<ffffffd32f0fb430>] scs_alloc+0x1b8
[<ffffffd32f0fb62c>] scs_prepare+0x20
[<ffffffd32ef2ce04>] dup_task_struct+0xd4
[<ffffffd32ef2a77c>] copy_process+0x144
[<ffffffd32ef2bae4>] kernel_clone+0xb4
[<ffffffd32ef2c040>] kernel_thread+0x5c
[<ffffffd32ef618d0>] kthreadd+0x184

Without the skip_cma commit, the tasks reclaim CMA pages and either wake up
the oom-killer or at least do not spin on the CPUs.

Signed-off-by: Hailong.Liu <liuhailong@oppo.com>
---
 mm/vmscan.c | 22 +---------------------
 1 file changed, 1 insertion(+), 21 deletions(-)

diff --git a/mm/vmscan.c b/mm/vmscan.c
index 2fe4a11d63f4..197ddf62019f 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2261,25 +2261,6 @@ static __always_inline void update_lru_sizes(struct lruvec *lruvec,
 
 }
 
-#ifdef CONFIG_CMA
-/*
- * It is waste of effort to scan and reclaim CMA pages if it is not available
- * for current allocation context. Kswapd can not be enrolled as it can not
- * distinguish this scenario by using sc->gfp_mask = GFP_KERNEL
- */
-static bool skip_cma(struct folio *folio, struct scan_control *sc)
-{
-	return !current_is_kswapd() &&
-			gfp_migratetype(sc->gfp_mask) != MIGRATE_MOVABLE &&
-			get_pageblock_migratetype(&folio->page) == MIGRATE_CMA;
-}
-#else
-static bool skip_cma(struct folio *folio, struct scan_control *sc)
-{
-	return false;
-}
-#endif
-
 /*
  * Isolating page from the lruvec to fill in @dst list by nr_to_scan times.
  *
@@ -2326,8 +2307,7 @@ static unsigned long isolate_lru_folios(unsigned long nr_to_scan,
 		nr_pages = folio_nr_pages(folio);
 		total_scan += nr_pages;
 
-		if (folio_zonenum(folio) > sc->reclaim_idx ||
-				skip_cma(folio, sc)) {
+		if (folio_zonenum(folio) > sc->reclaim_idx) {
 			nr_skipped[folio_zonenum(folio)] += nr_pages;
 			move_to = &folios_skipped;
 			goto move;
-- 
2.34.1

Re: [PATCH] Revert "mm: skip CMA pages when they are not available"

Posted by Yu Zhao 1 year, 11 months ago

On Thu, Mar 14, 2024 at 10:15 AM <liuhailong@oppo.com> wrote:
>
> From: "Hailong.Liu" <liuhailong@oppo.com>
>
> This reverts commit 5da226dbfce3a2f44978c2c7cf88166e69a6788b.
>
> patch may cause system not responding. if cma pages is large in lru_list
> and system is in lowmemory, many tasks would enter direct reclaim and waste
> cpu time to isolate and return. Test this patch on android-5.15 device
> and tasks call stack as below.
>
> Task name: UsbFfs-worker [affinity: 0xff] pid: 3374 cpu: 7 prio: 120 start: ffffff8897a35c80
> state: 0x0[R] exit_state: 0x0 stack base: 0xffffffc01eaa0000
> Last_enqueued_ts:       0.000000000 Last_sleep_ts:       0.000000000
> Stack:
> [<ffffffd32ee7d910>] __switch_to+0x180
> [<ffffffd3302022fc>] __schedule+0x4dc
> [<ffffffd330201e08>] preempt_schedule+0x5c
> [<ffffffd33020a4d0>] _raw_spin_unlock_irq+0x54
> [<ffffffd32f14906c>] shrink_inactive_list+0x1d0
> [<ffffffd32f143998>] shrink_lruvec+0x1bc
> [<ffffffd32f147c0c>] shrink_node_memcgs+0x184
> [<ffffffd32f147414>] shrink_node+0x2d0
> [<ffffffd32f146d38>] shrink_zones+0x14c
> [<ffffffd32f142e84>] do_try_to_free_pages+0xe8
> [<ffffffd32f142b08>] try_to_free_pages+0x2e0
> [<ffffffd32f1a8e44>] __alloc_pages_direct_reclaim+0x84
> [<ffffffd32f1a2d58>] __alloc_pages_slowpath+0x4d0
> [<ffffffd32f1a23bc>] __alloc_pages_nodemask[jt]+0x124
> [<ffffffd32f19a220>] __vmalloc_area_node+0x188
> [<ffffffd32f19a540>] __vmalloc_node+0x148
> [<ffffffd32f19a60c>] vmalloc+0x4c
> [<ffffffd32f910218>] ffs_epfile_io+0x258
> [<ffffffd330033780>] kretprobe_trampoline[jt]+0x0
> [<ffffffd330033780>] kretprobe_trampoline[jt]+0x0
> [<ffffffd32f28129c>] __io_submit_one+0x1c0
> [<ffffffd32f280e38>] io_submit_one+0x88
> [<ffffffd32f280c88>] __do_sys_io_submit+0x178
> [<ffffffd32f27eac0>] __arm64_sys_io_submit+0x20
> [<ffffffd32eeabb74>] el0_svc_common.llvm.9961749221945255377+0xd0
> [<ffffffd32eeaba34>] do_el0_svc+0x28
> [<ffffffd32ff21be8>] el0_svc+0x14
> [<ffffffd32ff21b70>] el0_sync_handler+0x88
> [<ffffffd32ee128b8>] el0_sync+0x1b8
>
> Task name: kthreadd [affinity: 0xff] pid: 2 cpu: 7 prio: 120 start: ffffff87808c0000
> state: 0x0[R] exit_state: 0x0 stack base: 0xffffffc008078000
> Last_enqueued_ts:       0.000000000 Last_sleep_ts:       0.000000000
> Stack:
> [<ffffffd32ee7d910>] __switch_to+0x180
> [<ffffffd3302022fc>] __schedule+0x4dc
> [<ffffffd330201e08>] preempt_schedule+0x5c
> [<ffffffd33020a4d0>] _raw_spin_unlock_irq+0x54
> [<ffffffd32f149168>] shrink_inactive_list+0x2cc
> [<ffffffd32f143998>] shrink_lruvec+0x1bc
> [<ffffffd32f147c0c>] shrink_node_memcgs+0x184
> [<ffffffd32f147414>] shrink_node+0x2d0
> [<ffffffd32f146d38>] shrink_zones+0x14c
> [<ffffffd32f142e84>] do_try_to_free_pages+0xe8
> [<ffffffd32f142b08>] try_to_free_pages+0x2e0
> [<ffffffd32f1a8e44>] __alloc_pages_direct_reclaim+0x84
> [<ffffffd32f1a2d58>] __alloc_pages_slowpath+0x4d0
> [<ffffffd32f1a23bc>] __alloc_pages_nodemask[jt]+0x124
> [<ffffffd32f19a220>] __vmalloc_area_node+0x188
> [<ffffffd32f19a044>] __vmalloc_node_range+0x88
> [<ffffffd32f0fb430>] scs_alloc+0x1b8
> [<ffffffd32f0fb62c>] scs_prepare+0x20
> [<ffffffd32ef2ce04>] dup_task_struct+0xd4
> [<ffffffd32ef2a77c>] copy_process+0x144
> [<ffffffd32ef2bae4>] kernel_clone+0xb4
> [<ffffffd32ef2c040>] kernel_thread+0x5c
> [<ffffffd32ef618d0>] kthreadd+0x184
>
> without this patch, the tasks will reclaim cma pages and wakeup
> oom-killer or not spin on cpus.
>
> Signed-off-by: Hailong.Liu <liuhailong@oppo.com>
> ---
>  mm/vmscan.c | 22 +---------------------
>  1 file changed, 1 insertion(+), 21 deletions(-)
>
> diff --git a/mm/vmscan.c b/mm/vmscan.c
> index 2fe4a11d63f4..197ddf62019f 100644
> --- a/mm/vmscan.c
> +++ b/mm/vmscan.c
> @@ -2261,25 +2261,6 @@ static __always_inline void update_lru_sizes(struct lruvec *lruvec,
>
>  }
>
> -#ifdef CONFIG_CMA
> -/*
> - * It is waste of effort to scan and reclaim CMA pages if it is not available
> - * for current allocation context. Kswapd can not be enrolled as it can not
> - * distinguish this scenario by using sc->gfp_mask = GFP_KERNEL
> - */
> -static bool skip_cma(struct folio *folio, struct scan_control *sc)
> -{
> -       return !current_is_kswapd() &&
> -                       gfp_migratetype(sc->gfp_mask) != MIGRATE_MOVABLE &&
> -                       get_pageblock_migratetype(&folio->page) == MIGRATE_CMA;
> -}
> -#else
> -static bool skip_cma(struct folio *folio, struct scan_control *sc)
> -{
> -       return false;
> -}
> -#endif
> -

NAK.

+Charan Teja Kalla -- This can cause build errors when CONFIG_LRU_GEN=y.

If you plan to post a v2, please include a reproducer. Thanks.



>  /*
>   * Isolating page from the lruvec to fill in @dst list by nr_to_scan times.
>   *
> @@ -2326,8 +2307,7 @@ static unsigned long isolate_lru_folios(unsigned long nr_to_scan,
>                 nr_pages = folio_nr_pages(folio);
>                 total_scan += nr_pages;
>
> -               if (folio_zonenum(folio) > sc->reclaim_idx ||
> -                               skip_cma(folio, sc)) {
> +               if (folio_zonenum(folio) > sc->reclaim_idx) {
>                         nr_skipped[folio_zonenum(folio)] += nr_pages;
>                         move_to = &folios_skipped;
>                         goto move;
> --
> 2.34.1
>
>

reply: [PATCH] Revert "mm: skip CMA pages when they are not available"

Posted by 黄朝阳 (Zhaoyang Huang) 1 year, 10 months ago



On Thu, Mar 14, 2024 at 10:15 AM <liuhailong@oppo.com> wrote:
>
> From: "Hailong.Liu" <liuhailong@oppo.com>
>
> This reverts commit 5da226dbfce3a2f44978c2c7cf88166e69a6788b.
>
> patch may cause system not responding. if cma pages is large in lru_list
> and system is in lowmemory, many tasks would enter direct reclaim and waste
> cpu time to isolate and return. Test this patch on android-5.15 device
> and tasks call stack as below.
>
> Task name: UsbFfs-worker [affinity: 0xff] pid: 3374 cpu: 7 prio: 120 start: ffffff8897a35c80
> state: 0x0[R] exit_state: 0x0 stack base: 0xffffffc01eaa0000
> Last_enqueued_ts:       0.000000000 Last_sleep_ts:       0.000000000
> Stack:
> [<ffffffd32ee7d910>] __switch_to+0x180
> [<ffffffd3302022fc>] __schedule+0x4dc
> [<ffffffd330201e08>] preempt_schedule+0x5c
> [<ffffffd33020a4d0>] _raw_spin_unlock_irq+0x54
> [<ffffffd32f14906c>] shrink_inactive_list+0x1d0
> [<ffffffd32f143998>] shrink_lruvec+0x1bc
> [<ffffffd32f147c0c>] shrink_node_memcgs+0x184
> [<ffffffd32f147414>] shrink_node+0x2d0
> [<ffffffd32f146d38>] shrink_zones+0x14c
> [<ffffffd32f142e84>] do_try_to_free_pages+0xe8
> [<ffffffd32f142b08>] try_to_free_pages+0x2e0
> [<ffffffd32f1a8e44>] __alloc_pages_direct_reclaim+0x84
> [<ffffffd32f1a2d58>] __alloc_pages_slowpath+0x4d0
> [<ffffffd32f1a23bc>] __alloc_pages_nodemask[jt]+0x124
> [<ffffffd32f19a220>] __vmalloc_area_node+0x188
> [<ffffffd32f19a540>] __vmalloc_node+0x148
> [<ffffffd32f19a60c>] vmalloc+0x4c
> [<ffffffd32f910218>] ffs_epfile_io+0x258
> [<ffffffd330033780>] kretprobe_trampoline[jt]+0x0
> [<ffffffd330033780>] kretprobe_trampoline[jt]+0x0
> [<ffffffd32f28129c>] __io_submit_one+0x1c0
> [<ffffffd32f280e38>] io_submit_one+0x88
> [<ffffffd32f280c88>] __do_sys_io_submit+0x178
> [<ffffffd32f27eac0>] __arm64_sys_io_submit+0x20
> [<ffffffd32eeabb74>] el0_svc_common.llvm.9961749221945255377+0xd0
> [<ffffffd32eeaba34>] do_el0_svc+0x28
> [<ffffffd32ff21be8>] el0_svc+0x14
> [<ffffffd32ff21b70>] el0_sync_handler+0x88
> [<ffffffd32ee128b8>] el0_sync+0x1b8
>
> Task name: kthreadd [affinity: 0xff] pid: 2 cpu: 7 prio: 120 start: ffffff87808c0000
> state: 0x0[R] exit_state: 0x0 stack base: 0xffffffc008078000
> Last_enqueued_ts:       0.000000000 Last_sleep_ts:       0.000000000
> Stack:
> [<ffffffd32ee7d910>] __switch_to+0x180
> [<ffffffd3302022fc>] __schedule+0x4dc
> [<ffffffd330201e08>] preempt_schedule+0x5c
> [<ffffffd33020a4d0>] _raw_spin_unlock_irq+0x54
> [<ffffffd32f149168>] shrink_inactive_list+0x2cc
> [<ffffffd32f143998>] shrink_lruvec+0x1bc
> [<ffffffd32f147c0c>] shrink_node_memcgs+0x184
> [<ffffffd32f147414>] shrink_node+0x2d0
> [<ffffffd32f146d38>] shrink_zones+0x14c
> [<ffffffd32f142e84>] do_try_to_free_pages+0xe8
> [<ffffffd32f142b08>] try_to_free_pages+0x2e0
> [<ffffffd32f1a8e44>] __alloc_pages_direct_reclaim+0x84
> [<ffffffd32f1a2d58>] __alloc_pages_slowpath+0x4d0
> [<ffffffd32f1a23bc>] __alloc_pages_nodemask[jt]+0x124
> [<ffffffd32f19a220>] __vmalloc_area_node+0x188
> [<ffffffd32f19a044>] __vmalloc_node_range+0x88
> [<ffffffd32f0fb430>] scs_alloc+0x1b8
> [<ffffffd32f0fb62c>] scs_prepare+0x20
> [<ffffffd32ef2ce04>] dup_task_struct+0xd4
> [<ffffffd32ef2a77c>] copy_process+0x144
> [<ffffffd32ef2bae4>] kernel_clone+0xb4
> [<ffffffd32ef2c040>] kernel_thread+0x5c
> [<ffffffd32ef618d0>] kthreadd+0x184
>
> without this patch, the tasks will reclaim cma pages and wakeup
> oom-killer or not spin on cpus.
>
> Signed-off-by: Hailong.Liu <liuhailong@oppo.com>
> ---
>  mm/vmscan.c | 22 +---------------------
>  1 file changed, 1 insertion(+), 21 deletions(-)
>
> diff --git a/mm/vmscan.c b/mm/vmscan.c
> index 2fe4a11d63f4..197ddf62019f 100644
> --- a/mm/vmscan.c
> +++ b/mm/vmscan.c
> @@ -2261,25 +2261,6 @@ static __always_inline void update_lru_sizes(struct lruvec *lruvec,
>
>  }
>
> -#ifdef CONFIG_CMA
> -/*
> - * It is waste of effort to scan and reclaim CMA pages if it is not available
> - * for current allocation context. Kswapd can not be enrolled as it can not
> - * distinguish this scenario by using sc->gfp_mask = GFP_KERNEL
> - */
> -static bool skip_cma(struct folio *folio, struct scan_control *sc)
> -{
> -       return !current_is_kswapd() &&
> -                       gfp_migratetype(sc->gfp_mask) != MIGRATE_MOVABLE &&
> -                       get_pageblock_migratetype(&folio->page) == MIGRATE_CMA;
> -}
> -#else
> -static bool skip_cma(struct folio *folio, struct scan_control *sc)
> -{
> -       return false;
> -}
> -#endif
> -

>NAK.

>+Charan Teja Kalla -- This can cause build errors when CONFIG_LRU_GEN=y.

>If you plan to post a v2, please include a reproducer. Thanks.

Could you please retest the case with the patch below, which has not been merged into AOSP yet?

From: Zhaoyang Huang <zhaoyang.huang@unisoc.com>

According to the current CMA utilization policy, an alloc_pages(GFP_USER)
could 'steal' UNMOVABLE & RECLAIMABLE page blocks with the help of
CMA (pass zone_watermark_ok by counting CMA in, but use U&R in rmqueue),
which could cause a subsequent alloc_pages(GFP_KERNEL) to fail.
Solve this by introducing a second watermark check for GFP_MOVABLE,
so that the allocation uses CMA when appropriate.

-- Free_pages(30MB)
|
|
-- WMARK_LOW(25MB)
|
-- Free_CMA(12MB)
|
|
--

Signed-off-by: Zhaoyang Huang <zhaoyang.huang@unisoc.com>
---
v6: update comments
---
---
 mm/page_alloc.c | 44 ++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 40 insertions(+), 4 deletions(-)

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 452459836b71..5a146aa7c0aa 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2078,6 +2078,43 @@ __rmqueue_fallback(struct zone *zone, int order, int start_migratetype,

 }

+#ifdef CONFIG_CMA
+/*
+ * GFP_MOVABLE allocation could drain UNMOVABLE & RECLAIMABLE page blocks via
+ * the help of CMA which makes GFP_KERNEL failed. Checking if zone_watermark_ok
+ * again without ALLOC_CMA to see if to use CMA first.
+ */
+static bool use_cma_first(struct zone *zone, unsigned int order, unsigned int alloc_flags)
+{
+       unsigned long watermark;
+       bool cma_first = false;
+
+       watermark = wmark_pages(zone, alloc_flags & ALLOC_WMARK_MASK);
+       /* check if GFP_MOVABLE pass previous zone_watermark_ok via the help of CMA */
+       if (zone_watermark_ok(zone, order, watermark, 0, alloc_flags & (~ALLOC_CMA))) {
+               /*
+                * Balance movable allocations between regular and CMA areas by
+                * allocating from CMA when over half of the zone's free memory
+                * is in the CMA area.
+                */
+               cma_first = (zone_page_state(zone, NR_FREE_CMA_PAGES) >
+                               zone_page_state(zone, NR_FREE_PAGES) / 2);
+       } else {
+               /*
+                * watermark failed means UNMOVABLE & RECLAIMBLE is not enough
+                * now, we should use cma first to keep them stay around the
+                * corresponding watermark
+                */
+               cma_first = true;
+       }
+       return cma_first;
+}
+#else
+static bool use_cma_first(struct zone *zone, unsigned int order, unsigned int alloc_flags)
+{
+       return false;
+}
+#endif
 /*
  * Do the hard work of removing an element from the buddy allocator.
  * Call me with the zone->lock already held.
@@ -2091,12 +2128,11 @@ __rmqueue(struct zone *zone, unsigned int order, int migratetype,
        if (IS_ENABLED(CONFIG_CMA)) {
                /*
                 * Balance movable allocations between regular and CMA areas by
-                * allocating from CMA when over half of the zone's free memory
-                * is in the CMA area.
+                * allocating from CMA base on judging zone_watermark_ok again
+                * to see if the latest check got pass via the help of CMA
                 */
                if (alloc_flags & ALLOC_CMA &&
-                   zone_page_state(zone, NR_FREE_CMA_PAGES) >
-                   zone_page_state(zone, NR_FREE_PAGES) / 2) {
+                       use_cma_first(zone, order, alloc_flags)) {
                        page = __rmqueue_cma_fallback(zone, order);
                        if (page)
                                return page;
--


>  /*
>   * Isolating page from the lruvec to fill in @dst list by nr_to_scan times.
>   *
> @@ -2326,8 +2307,7 @@ static unsigned long isolate_lru_folios(unsigned long nr_to_scan,
>                 nr_pages = folio_nr_pages(folio);
>                 total_scan += nr_pages;
>
> -               if (folio_zonenum(folio) > sc->reclaim_idx ||
> -                               skip_cma(folio, sc)) {
> +               if (folio_zonenum(folio) > sc->reclaim_idx) {
>                         nr_skipped[folio_zonenum(folio)] += nr_pages;
>                         move_to = &folios_skipped;
>                         goto move;
> --
> 2.34.1
>
>

Re: reply: [PATCH] Revert "mm: skip CMA pages when they are not available"

Posted by 刘海龙(LaoLiu) 1 year, 10 months ago

On 2024/3/15 15:41, 黄朝阳 (Zhaoyang Huang) wrote:
> 
> 
> On Thu, Mar 14, 2024 at 10:15 AM <liuhailong@oppo.com> wrote:
>>
>> From: "Hailong.Liu" <liuhailong@oppo.com>
>>
>> This reverts commit 5da226dbfce3a2f44978c2c7cf88166e69a6788b.
>>
>> patch may cause system not responding. if cma pages is large in lru_list
>> and system is in lowmemory, many tasks would enter direct reclaim and waste
>> cpu time to isolate and return. Test this patch on android-5.15 device
>> and tasks call stack as below.
>>
>> Task name: UsbFfs-worker [affinity: 0xff] pid: 3374 cpu: 7 prio: 120 start: ffffff8897a35c80
>> state: 0x0[R] exit_state: 0x0 stack base: 0xffffffc01eaa0000
>> Last_enqueued_ts:       0.000000000 Last_sleep_ts:       0.000000000
>> Stack:
>> [<ffffffd32ee7d910>] __switch_to+0x180
>> [<ffffffd3302022fc>] __schedule+0x4dc
>> [<ffffffd330201e08>] preempt_schedule+0x5c
>> [<ffffffd33020a4d0>] _raw_spin_unlock_irq+0x54
>> [<ffffffd32f14906c>] shrink_inactive_list+0x1d0
>> [<ffffffd32f143998>] shrink_lruvec+0x1bc
>> [<ffffffd32f147c0c>] shrink_node_memcgs+0x184
>> [<ffffffd32f147414>] shrink_node+0x2d0
>> [<ffffffd32f146d38>] shrink_zones+0x14c
>> [<ffffffd32f142e84>] do_try_to_free_pages+0xe8
>> [<ffffffd32f142b08>] try_to_free_pages+0x2e0
>> [<ffffffd32f1a8e44>] __alloc_pages_direct_reclaim+0x84
>> [<ffffffd32f1a2d58>] __alloc_pages_slowpath+0x4d0
>> [<ffffffd32f1a23bc>] __alloc_pages_nodemask[jt]+0x124
>> [<ffffffd32f19a220>] __vmalloc_area_node+0x188
>> [<ffffffd32f19a540>] __vmalloc_node+0x148
>> [<ffffffd32f19a60c>] vmalloc+0x4c
>> [<ffffffd32f910218>] ffs_epfile_io+0x258
>> [<ffffffd330033780>] kretprobe_trampoline[jt]+0x0
>> [<ffffffd330033780>] kretprobe_trampoline[jt]+0x0
>> [<ffffffd32f28129c>] __io_submit_one+0x1c0
>> [<ffffffd32f280e38>] io_submit_one+0x88
>> [<ffffffd32f280c88>] __do_sys_io_submit+0x178
>> [<ffffffd32f27eac0>] __arm64_sys_io_submit+0x20
>> [<ffffffd32eeabb74>] el0_svc_common.llvm.9961749221945255377+0xd0
>> [<ffffffd32eeaba34>] do_el0_svc+0x28
>> [<ffffffd32ff21be8>] el0_svc+0x14
>> [<ffffffd32ff21b70>] el0_sync_handler+0x88
>> [<ffffffd32ee128b8>] el0_sync+0x1b8
>>
>> Task name: kthreadd [affinity: 0xff] pid: 2 cpu: 7 prio: 120 start: ffffff87808c0000
>> state: 0x0[R] exit_state: 0x0 stack base: 0xffffffc008078000
>> Last_enqueued_ts:       0.000000000 Last_sleep_ts:       0.000000000
>> Stack:
>> [<ffffffd32ee7d910>] __switch_to+0x180
>> [<ffffffd3302022fc>] __schedule+0x4dc
>> [<ffffffd330201e08>] preempt_schedule+0x5c
>> [<ffffffd33020a4d0>] _raw_spin_unlock_irq+0x54
>> [<ffffffd32f149168>] shrink_inactive_list+0x2cc
>> [<ffffffd32f143998>] shrink_lruvec+0x1bc
>> [<ffffffd32f147c0c>] shrink_node_memcgs+0x184
>> [<ffffffd32f147414>] shrink_node+0x2d0
>> [<ffffffd32f146d38>] shrink_zones+0x14c
>> [<ffffffd32f142e84>] do_try_to_free_pages+0xe8
>> [<ffffffd32f142b08>] try_to_free_pages+0x2e0
>> [<ffffffd32f1a8e44>] __alloc_pages_direct_reclaim+0x84
>> [<ffffffd32f1a2d58>] __alloc_pages_slowpath+0x4d0
>> [<ffffffd32f1a23bc>] __alloc_pages_nodemask[jt]+0x124
>> [<ffffffd32f19a220>] __vmalloc_area_node+0x188
>> [<ffffffd32f19a044>] __vmalloc_node_range+0x88
>> [<ffffffd32f0fb430>] scs_alloc+0x1b8
>> [<ffffffd32f0fb62c>] scs_prepare+0x20
>> [<ffffffd32ef2ce04>] dup_task_struct+0xd4
>> [<ffffffd32ef2a77c>] copy_process+0x144
>> [<ffffffd32ef2bae4>] kernel_clone+0xb4
>> [<ffffffd32ef2c040>] kernel_thread+0x5c
>> [<ffffffd32ef618d0>] kthreadd+0x184
>>
>> without this patch, the tasks will reclaim cma pages and wakeup
>> oom-killer or not spin on cpus.
>>
>> Signed-off-by: Hailong.Liu <liuhailong@oppo.com>
>> ---
>>  mm/vmscan.c | 22 +---------------------
>>  1 file changed, 1 insertion(+), 21 deletions(-)
>>
>> diff --git a/mm/vmscan.c b/mm/vmscan.c
>> index 2fe4a11d63f4..197ddf62019f 100644
>> --- a/mm/vmscan.c
>> +++ b/mm/vmscan.c
>> @@ -2261,25 +2261,6 @@ static __always_inline void update_lru_sizes(struct lruvec *lruvec,
>>
>>  }
>>
>> -#ifdef CONFIG_CMA
>> -/*
>> - * It is waste of effort to scan and reclaim CMA pages if it is not available
>> - * for current allocation context. Kswapd can not be enrolled as it can not
>> - * distinguish this scenario by using sc->gfp_mask = GFP_KERNEL
>> - */
>> -static bool skip_cma(struct folio *folio, struct scan_control *sc)
>> -{
>> -       return !current_is_kswapd() &&
>> -                       gfp_migratetype(sc->gfp_mask) != MIGRATE_MOVABLE &&
>> -                       get_pageblock_migratetype(&folio->page) == MIGRATE_CMA;
>> -}
>> -#else
>> -static bool skip_cma(struct folio *folio, struct scan_control *sc)
>> -{
>> -       return false;
>> -}
>> -#endif
>> -
> 
>> NAK.
> 
>> +Charan Teja Kalla -- This can cause build errors when CONFIG_LRU_GEN=y.
> 
>> If you plan to post a v2, please include a reproducer. Thanks.
> 
> Could you please retest the case with bellow patch, which has not been in the aosp yet.
> 
> From: Zhaoyang Huang <zhaoyang.huang@unisoc.com>
> 
> According to current CMA utilization policy, an alloc_pages(GFP_USER)
> could 'steal' UNMOVABLE & RECLAIMABLE page blocks via the help of
> CMA(pass zone_watermark_ok by counting CMA in but use U&R in rmqueue),
> which could lead to following alloc_pages(GFP_KERNEL) fail.
> Solving this by introducing second watermark checking for GFP_MOVABLE,
> which could have the allocation use CMA when proper.
> 
> -- Free_pages(30MB)
> |
> |
> -- WMARK_LOW(25MB)
> |
> -- Free_CMA(12MB)
> |
> |
> --
> 
> Signed-off-by: Zhaoyang Huang <zhaoyang.huang@unisoc.com>
> ---
> v6: update comments
> ---
> ---
>  mm/page_alloc.c | 44 ++++++++++++++++++++++++++++++++++++++++----
>  1 file changed, 40 insertions(+), 4 deletions(-)
> 
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> index 452459836b71..5a146aa7c0aa 100644
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -2078,6 +2078,43 @@ __rmqueue_fallback(struct zone *zone, int order, int start_migratetype,
> 
>  }
> 
> +#ifdef CONFIG_CMA
> +/*
> + * GFP_MOVABLE allocation could drain UNMOVABLE & RECLAIMABLE page blocks via
> + * the help of CMA which makes GFP_KERNEL failed. Checking if zone_watermark_ok
> + * again without ALLOC_CMA to see if to use CMA first.
> + */
> +static bool use_cma_first(struct zone *zone, unsigned int order, unsigned int alloc_flags)
> +{
> +       unsigned long watermark;
> +       bool cma_first = false;
> +
> +       watermark = wmark_pages(zone, alloc_flags & ALLOC_WMARK_MASK);
> +       /* check if GFP_MOVABLE pass previous zone_watermark_ok via the help of CMA */
> +       if (zone_watermark_ok(zone, order, watermark, 0, alloc_flags & (~ALLOC_CMA))) {
> +               /*
> +                * Balance movable allocations between regular and CMA areas by
> +                * allocating from CMA when over half of the zone's free memory
> +                * is in the CMA area.
> +                */
> +               cma_first = (zone_page_state(zone, NR_FREE_CMA_PAGES) >
> +                               zone_page_state(zone, NR_FREE_PAGES) / 2);
> +       } else {
> +               /*
> +                * watermark failed means UNMOVABLE & RECLAIMBLE is not enough
> +                * now, we should use cma first to keep them stay around the
> +                * corresponding watermark
> +                */
> +               cma_first = true;
> +       }
> +       return cma_first;
> +}
> +#else
> +static bool use_cma_first(struct zone *zone, unsigned int order, unsigned int alloc_flags)
> +{
> +       return false;
> +}
> +#endif
>  /*
>   * Do the hard work of removing an element from the buddy allocator.
>   * Call me with the zone->lock already held.
> @@ -2091,12 +2128,11 @@ __rmqueue(struct zone *zone, unsigned int order, int migratetype,
>         if (IS_ENABLED(CONFIG_CMA)) {
>                 /*
>                  * Balance movable allocations between regular and CMA areas by
> -                * allocating from CMA when over half of the zone's free memory
> -                * is in the CMA area.
> +                * allocating from CMA base on judging zone_watermark_ok again
> +                * to see if the latest check got pass via the help of CMA
>                  */
>                 if (alloc_flags & ALLOC_CMA &&
> -                   zone_page_state(zone, NR_FREE_CMA_PAGES) >
> -                   zone_page_state(zone, NR_FREE_PAGES) / 2) {
> +                       use_cma_first(zone, order, alloc_flags)) {
>                         page = __rmqueue_cma_fallback(zone, order);
>                         if (page)
>                                 return page;
> --
> 
Hi Zhaoyang:

I wrote a reproducer in the v2 patch. Your patch may not solve this case,
because if the system is low on memory and the LRU list is all CMA pages,
direct reclaim wastes time scanning and skipping them. For now we cannot know
how many CMA pages are on the LRU, and adding some heuristic would be odd.

Brs,
Hailong.

[PATCH v2] Revert "mm: skip CMA pages when they are not available"

Posted by liuhailong@oppo.com 1 year, 10 months ago

From: "Hailong.Liu" <liuhailong@oppo.com>

This reverts
commit b7108d66318a ("Multi-gen LRU: skip CMA pages when they are not eligible")
commit 5da226dbfce3 ("mm: skip CMA pages when they are not available")

skip_cma may cause the system to become unresponsive. If there are many CMA
pages on the LRU list and the system is low on memory, many tasks enter
direct reclaim and waste CPU time in isolate_lru_pages and return.

Test this patch on android-5.15 8G device
reproducer:
- cma_declare_contiguous 3G pages
- set /proc/sys/vm/swappiness to 0 so that direct reclaim reclaims file
  pages only.
- run a memleak process in userspace

Tracing trace_mm_vmscan_lru_isolate gives:
[ 1178.059160] 2825  3405 I UsbFfs-worker   : isolate_lru_pages:1895 total_scan: 24385 skipped: 24382
[ 1178.059699] 2825  3405 I UsbFfs-worker   : isolate_lru_pages:1895 total_scan: 24412 skipped: 24401
[ 1178.061747] 2825  3405 I UsbFfs-worker   : isolate_lru_pages:1895 total_scan: 24412 skipped: 24401
From the log, tasks are wasting time scanning the LRU list; their call stacks are below.

Task name: UsbFfs-worker [affinity: 0xff] pid: 3374 cpu: 7 prio: 120 start: ffffff8897a35c80
state: 0x0[R] exit_state: 0x0 stack base: 0xffffffc01eaa0000
Last_enqueued_ts:       0.000000000 Last_sleep_ts:       0.000000000
Stack:
[<ffffffd32ee7d910>] __switch_to+0x180
[<ffffffd3302022fc>] __schedule+0x4dc
[<ffffffd330201e08>] preempt_schedule+0x5c
[<ffffffd33020a4d0>] _raw_spin_unlock_irq+0x54
[<ffffffd32f14906c>] shrink_inactive_list+0x1d0
[<ffffffd32f143998>] shrink_lruvec+0x1bc
[<ffffffd32f147c0c>] shrink_node_memcgs+0x184
[<ffffffd32f147414>] shrink_node+0x2d0
[<ffffffd32f146d38>] shrink_zones+0x14c
[<ffffffd32f142e84>] do_try_to_free_pages+0xe8
[<ffffffd32f142b08>] try_to_free_pages+0x2e0
[<ffffffd32f1a8e44>] __alloc_pages_direct_reclaim+0x84
[<ffffffd32f1a2d58>] __alloc_pages_slowpath+0x4d0
[<ffffffd32f1a23bc>] __alloc_pages_nodemask[jt]+0x124
[<ffffffd32f19a220>] __vmalloc_area_node+0x188
[<ffffffd32f19a540>] __vmalloc_node+0x148
[<ffffffd32f19a60c>] vmalloc+0x4c
[<ffffffd32f910218>] ffs_epfile_io+0x258
[<ffffffd330033780>] kretprobe_trampoline[jt]+0x0
[<ffffffd330033780>] kretprobe_trampoline[jt]+0x0
[<ffffffd32f28129c>] __io_submit_one+0x1c0
[<ffffffd32f280e38>] io_submit_one+0x88
[<ffffffd32f280c88>] __do_sys_io_submit+0x178
[<ffffffd32f27eac0>] __arm64_sys_io_submit+0x20
[<ffffffd32eeabb74>] el0_svc_common.llvm.9961749221945255377+0xd0
[<ffffffd32eeaba34>] do_el0_svc+0x28
[<ffffffd32ff21be8>] el0_svc+0x14
[<ffffffd32ff21b70>] el0_sync_handler+0x88
[<ffffffd32ee128b8>] el0_sync+0x1b8

Task name: kthreadd [affinity: 0xff] pid: 2 cpu: 7 prio: 120 start: ffffff87808c0000
state: 0x0[R] exit_state: 0x0 stack base: 0xffffffc008078000
Last_enqueued_ts:       0.000000000 Last_sleep_ts:       0.000000000
Stack:
[<ffffffd32ee7d910>] __switch_to+0x180
[<ffffffd3302022fc>] __schedule+0x4dc
[<ffffffd330201e08>] preempt_schedule+0x5c
[<ffffffd33020a4d0>] _raw_spin_unlock_irq+0x54
[<ffffffd32f149168>] shrink_inactive_list+0x2cc
[<ffffffd32f143998>] shrink_lruvec+0x1bc
[<ffffffd32f147c0c>] shrink_node_memcgs+0x184
[<ffffffd32f147414>] shrink_node+0x2d0
[<ffffffd32f146d38>] shrink_zones+0x14c
[<ffffffd32f142e84>] do_try_to_free_pages+0xe8
[<ffffffd32f142b08>] try_to_free_pages+0x2e0
[<ffffffd32f1a8e44>] __alloc_pages_direct_reclaim+0x84
[<ffffffd32f1a2d58>] __alloc_pages_slowpath+0x4d0
[<ffffffd32f1a23bc>] __alloc_pages_nodemask[jt]+0x124
[<ffffffd32f19a220>] __vmalloc_area_node+0x188
[<ffffffd32f19a044>] __vmalloc_node_range+0x88
[<ffffffd32f0fb430>] scs_alloc+0x1b8
[<ffffffd32f0fb62c>] scs_prepare+0x20
[<ffffffd32ef2ce04>] dup_task_struct+0xd4
[<ffffffd32ef2a77c>] copy_process+0x144
[<ffffffd32ef2bae4>] kernel_clone+0xb4
[<ffffffd32ef2c040>] kernel_thread+0x5c
[<ffffffd32ef618d0>] kthreadd+0x184

Signed-off-by: Hailong.Liu <liuhailong@oppo.com>
---
 mm/vmscan.c | 24 ++----------------------
 1 file changed, 2 insertions(+), 22 deletions(-)

diff --git a/mm/vmscan.c b/mm/vmscan.c
index 6f13394b112e..29306c29309f 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2261,25 +2261,6 @@ static __always_inline void update_lru_sizes(struct lruvec *lruvec,

 }

-#ifdef CONFIG_CMA
-/*
- * It is waste of effort to scan and reclaim CMA pages if it is not available
- * for current allocation context. Kswapd can not be enrolled as it can not
- * distinguish this scenario by using sc->gfp_mask = GFP_KERNEL
- */
-static bool skip_cma(struct folio *folio, struct scan_control *sc)
-{
-	return !current_is_kswapd() &&
-			gfp_migratetype(sc->gfp_mask) != MIGRATE_MOVABLE &&
-			get_pageblock_migratetype(&folio->page) == MIGRATE_CMA;
-}
-#else
-static bool skip_cma(struct folio *folio, struct scan_control *sc)
-{
-	return false;
-}
-#endif
-
 /*
  * Isolating page from the lruvec to fill in @dst list by nr_to_scan times.
  *
@@ -2326,8 +2307,7 @@ static unsigned long isolate_lru_folios(unsigned long nr_to_scan,
 		nr_pages = folio_nr_pages(folio);
 		total_scan += nr_pages;

-		if (folio_zonenum(folio) > sc->reclaim_idx ||
-				skip_cma(folio, sc)) {
+		if (folio_zonenum(folio) > sc->reclaim_idx) {
 			nr_skipped[folio_zonenum(folio)] += nr_pages;
 			move_to = &folios_skipped;
 			goto move;
@@ -4945,7 +4925,7 @@ static bool sort_folio(struct lruvec *lruvec, struct folio *folio, struct scan_c
 	}

 	/* ineligible */
-	if (zone > sc->reclaim_idx || skip_cma(folio, sc)) {
+	if (zone > sc->reclaim_idx) {
 		gen = folio_inc_gen(lruvec, folio, false);
 		list_move_tail(&folio->lru, &lrugen->folios[gen][type][zone]);
 		return true;
--
Changes in v2:
- add reproducer
- fix build error on v6.6-rc7