[PATCH v3 3/3] mm/memory-failure: refactor page_handle_poison()

Posted by Jiaqi Yan 4 weeks ago
Now that HWPoison page(s) within HugeTLB page will be rejected by
buddy allocator during dissolve_free_hugetlb_folio(), there is no
need to drain_all_pages() and take_page_off_buddy() anymore. In fact,
calling take_page_off_buddy() after dissolve_free_hugetlb_folio()
succeeded returns false, making caller think page_handl_poion() failed.

On the other hand, for hardware corrupted pages in buddy allocator,
take_page_off_buddy() is still a must-have.

Given hugepage and free buddy page should be treated differently,
refactor page_handle_poison() and __page_handle_poison():

- __page_handle_poison() is unwound into page_handle_poison().

- Callers of page_handle_poison() also need to explicitly tell if
  page is HugeTLB hugepage or free buddy page.

- Add helper hugepage_handle_poison() for several existing HugeTLB
  specific callsites.

Signed-off-by: Jiaqi Yan <jiaqiyan@google.com>
---
 mm/memory-failure.c | 84 ++++++++++++++++++++++-----------------------
 1 file changed, 41 insertions(+), 43 deletions(-)

diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index d204de6c9792a..1fdaee1e48bb8 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -162,54 +162,48 @@ static struct rb_root_cached pfn_space_itree = RB_ROOT_CACHED;
 
 static DEFINE_MUTEX(pfn_space_lock);
 
-/*
- * Return values:
- *   1:   the page is dissolved (if needed) and taken off from buddy,
- *   0:   the page is dissolved (if needed) and not taken off from buddy,
- *   < 0: failed to dissolve.
+/**
+ * Handle the HugeTLB hugepage that @page belongs to. Return values:
+ *   = 0: the hugepage is free hugepage and is dissolved.
+ *   < 0: hugepage is in-use or failed to dissolve.
  */
-static int __page_handle_poison(struct page *page)
+static int hugepage_handle_poison(struct page *page)
 {
-	int ret;
+	return dissolve_free_hugetlb_folio(page_folio(page));
+}
+
+/**
+ * Helper at the end of handling @page having hardware errors.
+ * @huge: @page is part of a HugeTLB hugepage.
+ * @free: @page is free buddy page.
+ * @release: memory-failure module should release a pending refcount.
+ */
+static bool page_handle_poison(struct page *page, bool huge, bool free,
+			       bool release)
+{
+	int ret = 0;
 
 	/*
-	 * zone_pcp_disable() can't be used here. It will
-	 * hold pcp_batch_high_lock and dissolve_free_hugetlb_folio() might hold
-	 * cpu_hotplug_lock via static_key_slow_dec() when hugetlb vmemmap
-	 * optimization is enabled. This will break current lock dependency
-	 * chain and leads to deadlock.
-	 * Disabling pcp before dissolving the page was a deterministic
-	 * approach because we made sure that those pages cannot end up in any
-	 * PCP list. Draining PCP lists expels those pages to the buddy system,
-	 * but nothing guarantees that those pages do not get back to a PCP
-	 * queue if we need to refill those.
+	 * Buddy allocator will exclude the HWPoison page after hugepage
+	 * is successfully dissolved.
 	 */
-	ret = dissolve_free_hugetlb_folio(page_folio(page));
-	if (!ret) {
+	if (huge)
+		ret = hugepage_handle_poison(page);
+
+	if (free) {
 		drain_all_pages(page_zone(page));
-		ret = take_page_off_buddy(page);
+		ret = take_page_off_buddy(page) ? 0 : -1;
 	}
 
-	return ret;
-}
-
-static bool page_handle_poison(struct page *page, bool hugepage_or_freepage, bool release)
-{
-	if (hugepage_or_freepage) {
+	if ((huge || free) && ret < 0)
 		/*
-		 * Doing this check for free pages is also fine since
-		 * dissolve_free_hugetlb_folio() returns 0 for non-hugetlb folios as well.
+		 * We could fail to take off the target page from buddy
+		 * for example due to racy page allocation, but that's
+		 * acceptable because soft-offlined page is not broken
+		 * and if someone really want to use it, they should
+		 * take it.
 		 */
-		if (__page_handle_poison(page) <= 0)
-			/*
-			 * We could fail to take off the target page from buddy
-			 * for example due to racy page allocation, but that's
-			 * acceptable because soft-offlined page is not broken
-			 * and if someone really want to use it, they should
-			 * take it.
-			 */
-			return false;
-	}
+		return false;
 
 	SetPageHWPoison(page);
 	if (release)
@@ -1174,7 +1168,7 @@ static int me_huge_page(struct page_state *ps, struct page *p)
 		 * subpages.
 		 */
 		folio_put(folio);
-		if (__page_handle_poison(p) > 0) {
+		if (!hugepage_handle_poison(p)) {
 			page_ref_inc(p);
 			res = MF_RECOVERED;
 		} else {
@@ -2067,7 +2061,7 @@ static int try_memory_failure_hugetlb(unsigned long pfn, int flags, int *hugetlb
 	 */
 	if (res == 0) {
 		folio_unlock(folio);
-		if (__page_handle_poison(p) > 0) {
+		if (!hugepage_handle_poison(p)) {
 			page_ref_inc(p);
 			res = MF_RECOVERED;
 		} else {
@@ -2815,7 +2809,7 @@ static int soft_offline_in_use_page(struct page *page)
 
 	if (ret) {
 		pr_info("%#lx: invalidated\n", pfn);
-		page_handle_poison(page, false, true);
+		page_handle_poison(page, false, false, true);
 		return 0;
 	}
 
@@ -2836,7 +2830,7 @@ static int soft_offline_in_use_page(struct page *page)
 		if (!ret) {
 			bool release = !huge;
 
-			if (!page_handle_poison(page, huge, release))
+			if (!page_handle_poison(page, huge, false, release))
 				ret = -EBUSY;
 		} else {
 			if (!list_empty(&pagelist))
@@ -2884,6 +2878,8 @@ int soft_offline_page(unsigned long pfn, int flags)
 {
 	int ret;
 	bool try_again = true;
+	bool huge;
+	bool free;
 	struct page *page;
 
 	if (!pfn_valid(pfn)) {
@@ -2929,7 +2925,9 @@ int soft_offline_page(unsigned long pfn, int flags)
 	if (ret > 0) {
 		ret = soft_offline_in_use_page(page);
 	} else if (ret == 0) {
-		if (!page_handle_poison(page, true, false)) {
+		huge = folio_test_hugetlb(page_folio(page));
+		free = is_free_buddy_page(page);
+		if (!page_handle_poison(page, huge, free, false)) {
 			if (try_again) {
 				try_again = false;
 				flags &= ~MF_COUNT_INCREASED;
-- 
2.52.0.457.g6b5491de43-goog
Re: [PATCH v3 3/3] mm/memory-failure: refactor page_handle_poison()
Posted by Miaohe Lin 3 weeks, 4 days ago
On 2026/1/12 8:49, Jiaqi Yan wrote:
> Now that HWPoison page(s) within HugeTLB page will be rejected by
> buddy allocator during dissolve_free_hugetlb_folio(), there is no
> need to drain_all_pages() and take_page_off_buddy() anymore. In fact,
> calling take_page_off_buddy() after dissolve_free_hugetlb_folio()
> succeeded returns false, making caller think page_handl_poion() failed.

s/page_handl_poion/page_handle_poison/

> 
> On the other hand, for hardware corrupted pages in buddy allocator,
> take_page_off_buddy() is still a must-have.
> 
> Given hugepage and free buddy page should be treated differently,
> refactor page_handle_poison() and __page_handle_poison():
> 
> - __page_handle_poison() is unwound into page_handle_poison().
> 
> - Callers of page_handle_poison() also need to explicitly tell if
>   page is HugeTLB hugepage or free buddy page.
> 
> - Add helper hugepage_handle_poison() for several existing HugeTLB
>   specific callsites.
> 
> Signed-off-by: Jiaqi Yan <jiaqiyan@google.com>
> ---
>  mm/memory-failure.c | 84 ++++++++++++++++++++++-----------------------
>  1 file changed, 41 insertions(+), 43 deletions(-)
> 
> diff --git a/mm/memory-failure.c b/mm/memory-failure.c
> index d204de6c9792a..1fdaee1e48bb8 100644
> --- a/mm/memory-failure.c
> +++ b/mm/memory-failure.c
> @@ -162,54 +162,48 @@ static struct rb_root_cached pfn_space_itree = RB_ROOT_CACHED;
>  
>  static DEFINE_MUTEX(pfn_space_lock);
>  
> -/*
> - * Return values:
> - *   1:   the page is dissolved (if needed) and taken off from buddy,
> - *   0:   the page is dissolved (if needed) and not taken off from buddy,
> - *   < 0: failed to dissolve.
> +/**
> + * Handle the HugeTLB hugepage that @page belongs to. Return values:
> + *   = 0: the hugepage is free hugepage and is dissolved.

In the soft offline scenario, dissolve_free_hugetlb_folio would return 0 when the page has become
a normal page due to a race.

> + *   < 0: hugepage is in-use or failed to dissolve.
>   */
> -static int __page_handle_poison(struct page *page)
> +static int hugepage_handle_poison(struct page *page)
>  {
> -	int ret;
> +	return dissolve_free_hugetlb_folio(page_folio(page));
> +}
> +
> +/**
> + * Helper at the end of handling @page having hardware errors.
> + * @huge: @page is part of a HugeTLB hugepage.
> + * @free: @page is free buddy page.
> + * @release: memory-failure module should release a pending refcount.
> + */
> +static bool page_handle_poison(struct page *page, bool huge, bool free,
> +			       bool release)
> +{
> +	int ret = 0;
>  
>  	/*
> -	 * zone_pcp_disable() can't be used here. It will
> -	 * hold pcp_batch_high_lock and dissolve_free_hugetlb_folio() might hold
> -	 * cpu_hotplug_lock via static_key_slow_dec() when hugetlb vmemmap
> -	 * optimization is enabled. This will break current lock dependency
> -	 * chain and leads to deadlock.
> -	 * Disabling pcp before dissolving the page was a deterministic
> -	 * approach because we made sure that those pages cannot end up in any
> -	 * PCP list. Draining PCP lists expels those pages to the buddy system,
> -	 * but nothing guarantees that those pages do not get back to a PCP
> -	 * queue if we need to refill those.
> +	 * Buddy allocator will exclude the HWPoison page after hugepage
> +	 * is successfully dissolved.
>  	 */
> -	ret = dissolve_free_hugetlb_folio(page_folio(page));
> -	if (!ret) {
> +	if (huge)
> +		ret = hugepage_handle_poison(page);
> +
> +	if (free) {

Nit: huge and free won't both be true. So we could write it as:
	if (huge) {
		...
	} else if (free) {

>  		drain_all_pages(page_zone(page));
> -		ret = take_page_off_buddy(page);
> +		ret = take_page_off_buddy(page) ? 0 : -1;
>  	}
>  
> -	return ret;
> -}
> -
> -static bool page_handle_poison(struct page *page, bool hugepage_or_freepage, bool release)
> -{
> -	if (hugepage_or_freepage) {
> +	if ((huge || free) && ret < 0)

Nit: ret won't be <0 if both huge and free are false. So I think we might simplify it as:

	if (ret < 0)

>  		/*
> -		 * Doing this check for free pages is also fine since
> -		 * dissolve_free_hugetlb_folio() returns 0 for non-hugetlb folios as well.
> +		 * We could fail to take off the target page from buddy
> +		 * for example due to racy page allocation, but that's
> +		 * acceptable because soft-offlined page is not broken
> +		 * and if someone really want to use it, they should
> +		 * take it.
>  		 */
> -		if (__page_handle_poison(page) <= 0)
> -			/*
> -			 * We could fail to take off the target page from buddy
> -			 * for example due to racy page allocation, but that's
> -			 * acceptable because soft-offlined page is not broken
> -			 * and if someone really want to use it, they should
> -			 * take it.
> -			 */
> -			return false;
> -	}
> +		return false;
>  
>  	SetPageHWPoison(page);
>  	if (release)
> @@ -1174,7 +1168,7 @@ static int me_huge_page(struct page_state *ps, struct page *p)
>  		 * subpages.
>  		 */
>  		folio_put(folio);
> -		if (__page_handle_poison(p) > 0) {
> +		if (!hugepage_handle_poison(p)) {
>  			page_ref_inc(p);
>  			res = MF_RECOVERED;
>  		} else {
> @@ -2067,7 +2061,7 @@ static int try_memory_failure_hugetlb(unsigned long pfn, int flags, int *hugetlb
>  	 */
>  	if (res == 0) {
>  		folio_unlock(folio);
> -		if (__page_handle_poison(p) > 0) {
> +		if (!hugepage_handle_poison(p)) {
>  			page_ref_inc(p);
>  			res = MF_RECOVERED;
>  		} else {
> @@ -2815,7 +2809,7 @@ static int soft_offline_in_use_page(struct page *page)
>  
>  	if (ret) {
>  		pr_info("%#lx: invalidated\n", pfn);
> -		page_handle_poison(page, false, true);
> +		page_handle_poison(page, false, false, true);
>  		return 0;
>  	}
>  
> @@ -2836,7 +2830,7 @@ static int soft_offline_in_use_page(struct page *page)
>  		if (!ret) {
>  			bool release = !huge;
>  
> -			if (!page_handle_poison(page, huge, release))
> +			if (!page_handle_poison(page, huge, false, release))

This might not work for soft offline. PageHWPoison is not yet set, so folio_clear_hugetlb_hwpoison
won't be called when dissolving hugetlb hugepages...

>  				ret = -EBUSY;
>  		} else {
>  			if (!list_empty(&pagelist))
> @@ -2884,6 +2878,8 @@ int soft_offline_page(unsigned long pfn, int flags)
>  {
>  	int ret;
>  	bool try_again = true;
> +	bool huge;
> +	bool free;
>  	struct page *page;
>  
>  	if (!pfn_valid(pfn)) {
> @@ -2929,7 +2925,9 @@ int soft_offline_page(unsigned long pfn, int flags)
>  	if (ret > 0) {
>  		ret = soft_offline_in_use_page(page);
>  	} else if (ret == 0) {
> -		if (!page_handle_poison(page, true, false)) {
> +		huge = folio_test_hugetlb(page_folio(page));

The folio_test_hugetlb check is racy because there's no guarantee that the hugetlb hugepage won't
be dissolved before calling page_handle_poison. That will lead to problems...

soft_offline_page
  folio_test_hugetlb -- true now
  page_handle_poison
  /* Hugepage is dissolved somewhere. */
    hugepage_handle_poison -- return 0 because page is normal page or free buddy page.
    SetPageHWPoison(page);
    page_ref_inc(page); -- refcnt is increased while page might be on buddy...

> +		free = is_free_buddy_page(page);
> +		if (!page_handle_poison(page, huge, free, false)) {

We assume free is always true because ret is 0. So we can write it as:
	if (!page_handle_poison(page, huge, true, false)) {

>  			if (try_again) {
>  				try_again = false;
>  				flags &= ~MF_COUNT_INCREASED;
> 

Thanks.
.
Re: [PATCH v3 3/3] mm/memory-failure: refactor page_handle_poison()
Posted by Jiaqi Yan 1 week, 4 days ago
On Wed, Jan 14, 2026 at 7:41 PM Miaohe Lin <linmiaohe@huawei.com> wrote:
>
> On 2026/1/12 8:49, Jiaqi Yan wrote:
> > Now that HWPoison page(s) within HugeTLB page will be rejected by
> > buddy allocator during dissolve_free_hugetlb_folio(), there is no
> > need to drain_all_pages() and take_page_off_buddy() anymore. In fact,
> > calling take_page_off_buddy() after dissolve_free_hugetlb_folio()
> > succeeded returns false, making caller think page_handl_poion() failed.
>
> s/page_handl_poion/page_handle_poison/
>
> >
> > On the other hand, for hardware corrupted pages in buddy allocator,
> > take_page_off_buddy() is still a must-have.
> >
> > Given hugepage and free buddy page should be treated differently,
> > refactor page_handle_poison() and __page_handle_poison():
> >
> > - __page_handle_poison() is unwound into page_handle_poison().
> >
> > - Callers of page_handle_poison() also need to explicitly tell if
> >   page is HugeTLB hugepage or free buddy page.
> >
> > - Add helper hugepage_handle_poison() for several existing HugeTLB
> >   specific callsites.
> >
> > Signed-off-by: Jiaqi Yan <jiaqiyan@google.com>
> > ---
> >  mm/memory-failure.c | 84 ++++++++++++++++++++++-----------------------
> >  1 file changed, 41 insertions(+), 43 deletions(-)
> >
> > diff --git a/mm/memory-failure.c b/mm/memory-failure.c
> > index d204de6c9792a..1fdaee1e48bb8 100644
> > --- a/mm/memory-failure.c
> > +++ b/mm/memory-failure.c
> > @@ -162,54 +162,48 @@ static struct rb_root_cached pfn_space_itree = RB_ROOT_CACHED;
> >
> >  static DEFINE_MUTEX(pfn_space_lock);
> >
> > -/*
> > - * Return values:
> > - *   1:   the page is dissolved (if needed) and taken off from buddy,
> > - *   0:   the page is dissolved (if needed) and not taken off from buddy,
> > - *   < 0: failed to dissolve.
> > +/**
> > + * Handle the HugeTLB hugepage that @page belongs to. Return values:
> > + *   = 0: the hugepage is free hugepage and is dissolved.
>
> In the soft offline scenario, dissolve_free_hugetlb_folio would return 0 when the page has become
> a normal page due to a race.
>
> > + *   < 0: hugepage is in-use or failed to dissolve.
> >   */
> > -static int __page_handle_poison(struct page *page)
> > +static int hugepage_handle_poison(struct page *page)
> >  {
> > -     int ret;
> > +     return dissolve_free_hugetlb_folio(page_folio(page));
> > +}
> > +
> > +/**
> > + * Helper at the end of handling @page having hardware errors.
> > + * @huge: @page is part of a HugeTLB hugepage.
> > + * @free: @page is free buddy page.
> > + * @release: memory-failure module should release a pending refcount.
> > + */
> > +static bool page_handle_poison(struct page *page, bool huge, bool free,
> > +                            bool release)
> > +{
> > +     int ret = 0;
> >
> >       /*
> > -      * zone_pcp_disable() can't be used here. It will
> > -      * hold pcp_batch_high_lock and dissolve_free_hugetlb_folio() might hold
> > -      * cpu_hotplug_lock via static_key_slow_dec() when hugetlb vmemmap
> > -      * optimization is enabled. This will break current lock dependency
> > -      * chain and leads to deadlock.
> > -      * Disabling pcp before dissolving the page was a deterministic
> > -      * approach because we made sure that those pages cannot end up in any
> > -      * PCP list. Draining PCP lists expels those pages to the buddy system,
> > -      * but nothing guarantees that those pages do not get back to a PCP
> > -      * queue if we need to refill those.
> > +      * Buddy allocator will exclude the HWPoison page after hugepage
> > +      * is successfully dissolved.
> >        */
> > -     ret = dissolve_free_hugetlb_folio(page_folio(page));
> > -     if (!ret) {
> > +     if (huge)
> > +             ret = hugepage_handle_poison(page);
> > +
> > +     if (free) {
>
> Nit: huge and free won't both be true. So we could write it as:
>         if (huge) {
>                 ...
>         } else if (free) {
>
> >               drain_all_pages(page_zone(page));
> > -             ret = take_page_off_buddy(page);
> > +             ret = take_page_off_buddy(page) ? 0 : -1;
> >       }
> >
> > -     return ret;
> > -}
> > -
> > -static bool page_handle_poison(struct page *page, bool hugepage_or_freepage, bool release)
> > -{
> > -     if (hugepage_or_freepage) {
> > +     if ((huge || free) && ret < 0)
>
> Nit: ret won't be <0 if both huge and free are false. So I think we might simplify it as:
>
>         if (ret < 0)
>
> >               /*
> > -              * Doing this check for free pages is also fine since
> > -              * dissolve_free_hugetlb_folio() returns 0 for non-hugetlb folios as well.
> > +              * We could fail to take off the target page from buddy
> > +              * for example due to racy page allocation, but that's
> > +              * acceptable because soft-offlined page is not broken
> > +              * and if someone really want to use it, they should
> > +              * take it.
> >                */
> > -             if (__page_handle_poison(page) <= 0)
> > -                     /*
> > -                      * We could fail to take off the target page from buddy
> > -                      * for example due to racy page allocation, but that's
> > -                      * acceptable because soft-offlined page is not broken
> > -                      * and if someone really want to use it, they should
> > -                      * take it.
> > -                      */
> > -                     return false;
> > -     }
> > +             return false;
> >
> >       SetPageHWPoison(page);
> >       if (release)
> > @@ -1174,7 +1168,7 @@ static int me_huge_page(struct page_state *ps, struct page *p)
> >                * subpages.
> >                */
> >               folio_put(folio);
> > -             if (__page_handle_poison(p) > 0) {
> > +             if (!hugepage_handle_poison(p)) {
> >                       page_ref_inc(p);
> >                       res = MF_RECOVERED;
> >               } else {
> > @@ -2067,7 +2061,7 @@ static int try_memory_failure_hugetlb(unsigned long pfn, int flags, int *hugetlb
> >        */
> >       if (res == 0) {
> >               folio_unlock(folio);
> > -             if (__page_handle_poison(p) > 0) {
> > +             if (!hugepage_handle_poison(p)) {
> >                       page_ref_inc(p);
> >                       res = MF_RECOVERED;
> >               } else {
> > @@ -2815,7 +2809,7 @@ static int soft_offline_in_use_page(struct page *page)
> >
> >       if (ret) {
> >               pr_info("%#lx: invalidated\n", pfn);
> > -             page_handle_poison(page, false, true);
> > +             page_handle_poison(page, false, false, true);
> >               return 0;
> >       }
> >
> > @@ -2836,7 +2830,7 @@ static int soft_offline_in_use_page(struct page *page)
> >               if (!ret) {
> >                       bool release = !huge;
> >
> > -                     if (!page_handle_poison(page, huge, release))
> > +                     if (!page_handle_poison(page, huge, false, release))
>
> This might not work for soft offline. PageHWPoison is not yet set, so folio_clear_hugetlb_hwpoison
> won't be called when dissolving hugetlb hugepages...

Thanks for pointing this problem (and the later one) out, Miaohe!
You are right, and I think the root cause of both problems is that
soft offline is a totally different case from memory_failure(): there
is no PG_HWPoison until the end of page_handle_poison(). So
free_has_hwpoisoned() can't help dissolve_free_hugetlb_folio()
exclude the page that triggered soft_offline_page().
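
In other words, the problematic ordering for soft offline is roughly
(my reading of the sequence, not a verified trace):

soft_offline_page(pfn)
  page_handle_poison(page, huge=true, free=false, ...)
    hugepage_handle_poison(page)
      dissolve_free_hugetlb_folio(folio)
        -- PG_HWPoison is not set yet, so there is nothing for
        -- free_has_hwpoisoned() to reject: every subpage, including
        -- the bad one, goes straight back to buddy.
    SetPageHWPoison(page) -- only now, after the dissolve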

For free_has_hwpoisoned(), I should only change the call sites in the
memory_failure() path, and leave
soft_offline_page()/page_handle_poison()/__page_handle_poison()
alone. Looking at the current code, HWPoison hugetlb pages happen to
be handled by __page_handle_poison() in either me_huge_page() or
try_memory_failure_hugetlb(). So I think I can replace those call
sites with a new function that doesn't do take_page_off_buddy(),
something like:

 /*
+ * Only for a HugeTLB page being handled by memory_failure(). The key
+ * difference to soft_offline() is that, no HWPoison subpage will make
+ * into buddy allocator after a successful dissolve_free_hugetlb_folio(),
+ * so take_page_off_buddy() is unnecessary.
+ */
+static int __hugepage_handle_poison(struct page *page)
+{
+       struct folio *folio = page_folio(page);
+
+       VM_WARN_ON_FOLIO(!folio_test_hwpoison(folio), folio);
+
+       /*
+        * Can't use dissolve_free_hugetlb_folio() without a reliable
+        * raw_hwp_list telling which subpage is HWPoison.
+        */
+       if (folio_test_hugetlb_raw_hwp_unreliable(folio))
+               /* raw_hwp_list becomes unreliable when kmalloc() fails. */
+               return -ENOMEM;
+
+       return dissolve_free_hugetlb_folio(folio);
+}
+

On the other hand, just leave __page_handle_poison() and
page_handle_poison() as they are to do take_page_off_buddy() for the
soft offline case.
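
For example, the two call sites in me_huge_page() and
try_memory_failure_hugetlb() would become something like (just a
sketch, untested):

-		if (__page_handle_poison(p) > 0) {
+		if (!__hugepage_handle_poison(p)) {
 			page_ref_inc(p);
 			res = MF_RECOVERED;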

>
> >                               ret = -EBUSY;
> >               } else {
> >                       if (!list_empty(&pagelist))
> > @@ -2884,6 +2878,8 @@ int soft_offline_page(unsigned long pfn, int flags)
> >  {
> >       int ret;
> >       bool try_again = true;
> > +     bool huge;
> > +     bool free;
> >       struct page *page;
> >
> >       if (!pfn_valid(pfn)) {
> > @@ -2929,7 +2925,9 @@ int soft_offline_page(unsigned long pfn, int flags)
> >       if (ret > 0) {
> >               ret = soft_offline_in_use_page(page);
> >       } else if (ret == 0) {
> > -             if (!page_handle_poison(page, true, false)) {
> > +             huge = folio_test_hugetlb(page_folio(page));
>
> The folio_test_hugetlb check is racy because there's no guarantee that the hugetlb hugepage won't
> be dissolved before calling page_handle_poison. That will lead to problems...
>
> soft_offline_page
>   folio_test_hugetlb -- true now
>   page_handle_poison
>   /* Hugepage is dissolved somewhere. */
>     hugepage_handle_poison -- return 0 because page is normal page or free buddy page.
>     SetPageHWPoison(page);
>     page_ref_inc(page); -- refcnt is increased while page might be on buddy...
>
> > +             free = is_free_buddy_page(page);
> > +             if (!page_handle_poison(page, huge, free, false)) {
>
> We assume free is always true because ret is 0. So we can write it as:
>         if (!page_handle_poison(page, huge, true, false)) {
>
> >                       if (try_again) {
> >                               try_again = false;
> >                               flags &= ~MF_COUNT_INCREASED;
> >
>
> Thanks.
> .
>