[PATCH resend] mm/memory-failure: fix hugetlb_lock AA deadlock in get_huge_page_for_hwpoison

Wupeng Ma posted 1 patch 2 days, 17 hours ago
There is a newer version of this series
include/linux/hugetlb.h |  8 --------
include/linux/mm.h      |  8 --------
mm/hugetlb.c            | 11 -----------
mm/memory-failure.c     |  8 ++++----
4 files changed, 4 insertions(+), 31 deletions(-)
[PATCH resend] mm/memory-failure: fix hugetlb_lock AA deadlock in get_huge_page_for_hwpoison
Posted by Wupeng Ma 2 days, 17 hours ago
Two concurrent madvise(MADV_HWPOISON) calls on the same hugetlb page
can trigger a recursive spinlock self-deadlock (AA deadlock) on
hugetlb_lock when racing with a concurrent unmap:

  thread#0                              thread#1
  --------                              --------
  madvise(folio, MADV_HWPOISON)
    -> poisons the folio successfully
  madvise(folio, MADV_HWPOISON)         unmap(folio)
    try_memory_failure_hugetlb
      get_huge_page_for_hwpoison
        spin_lock_irq(&hugetlb_lock)    <- held
        __get_huge_page_for_hwpoison
          hugetlb_update_hwpoison()
            -> MF_HUGETLB_FOLIO_PRE_POISONED
          goto out:
            folio_put()
              refcount: 1 -> 0
              free_huge_folio()
                spin_lock_irqsave(&hugetlb_lock)
                  -> AA DEADLOCK!

The out: path in __get_huge_page_for_hwpoison() calls folio_put() to
drop the GUP reference while hugetlb_lock is still held by the
hugetlb.c wrapper get_huge_page_for_hwpoison().  If a concurrent unmap
has already released the page table mapping reference, that folio_put()
drops the folio refcount to zero and triggers free_huge_folio(), which
attempts to re-acquire the non-recursive hugetlb_lock.

Fix this by removing the hugetlb.c wrapper and turning
__get_huge_page_for_hwpoison() into a static
get_huge_page_for_hwpoison() in mm/memory-failure.c that acquires
hugetlb_lock itself.  Place spin_unlock_irq() before the folio_put()
at the out: label so the folio is always released outside the lock.

Fixes: 405ce051236c ("mm/hwpoison: fix race between hugetlb free/demotion and memory_failure_hugetlb()")
Signed-off-by: Wupeng Ma <mawupeng1@huawei.com>
---
Changelog since v3[1]:
 - update the commit message to describe the current issue

[1]: https://lore.kernel.org/linux-mm/20260520020128.3506168-1-mawupeng1@huawei.com/ 
---
 include/linux/hugetlb.h |  8 --------
 include/linux/mm.h      |  8 --------
 mm/hugetlb.c            | 11 -----------
 mm/memory-failure.c     |  8 ++++----
 4 files changed, 4 insertions(+), 31 deletions(-)

diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 93418625d3c5..059749ed519f 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -153,8 +153,6 @@ long hugetlb_unreserve_pages(struct inode *inode, long start, long end,
 						long freed);
 bool folio_isolate_hugetlb(struct folio *folio, struct list_head *list);
 int get_hwpoison_hugetlb_folio(struct folio *folio, bool *hugetlb, bool unpoison);
-int get_huge_page_for_hwpoison(unsigned long pfn, int flags,
-				bool *migratable_cleared);
 void folio_putback_hugetlb(struct folio *folio);
 void move_hugetlb_state(struct folio *old_folio, struct folio *new_folio, int reason);
 void hugetlb_fix_reserve_counts(struct inode *inode);
@@ -422,12 +420,6 @@ static inline int get_hwpoison_hugetlb_folio(struct folio *folio, bool *hugetlb,
 	return 0;
 }

-static inline int get_huge_page_for_hwpoison(unsigned long pfn, int flags,
-					bool *migratable_cleared)
-{
-	return 0;
-}
-
 static inline void folio_putback_hugetlb(struct folio *folio)
 {
 }
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 0b776907152e..4c4d1a61a6a7 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -4975,8 +4975,6 @@ extern int soft_offline_page(unsigned long pfn, int flags);
  */
 extern const struct attribute_group memory_failure_attr_group;
 extern void memory_failure_queue(unsigned long pfn, int flags);
-extern int __get_huge_page_for_hwpoison(unsigned long pfn, int flags,
-					bool *migratable_cleared);
 void num_poisoned_pages_inc(unsigned long pfn);
 void num_poisoned_pages_sub(unsigned long pfn, long i);
 #else
@@ -4984,12 +4982,6 @@ static inline void memory_failure_queue(unsigned long pfn, int flags)
 {
 }

-static inline int __get_huge_page_for_hwpoison(unsigned long pfn, int flags,
-					bool *migratable_cleared)
-{
-	return 0;
-}
-
 static inline void num_poisoned_pages_inc(unsigned long pfn)
 {
 }
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index f24bf49be047..67243923fa24 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -7154,17 +7154,6 @@ int get_hwpoison_hugetlb_folio(struct folio *folio, bool *hugetlb, bool unpoison
 	return ret;
 }

-int get_huge_page_for_hwpoison(unsigned long pfn, int flags,
-				bool *migratable_cleared)
-{
-	int ret;
-
-	spin_lock_irq(&hugetlb_lock);
-	ret = __get_huge_page_for_hwpoison(pfn, flags, migratable_cleared);
-	spin_unlock_irq(&hugetlb_lock);
-	return ret;
-}
-
 /**
  * folio_putback_hugetlb - unisolate a hugetlb folio
  * @folio: the isolated hugetlb folio
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index ee42d4361309..28522180cf7f 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -1966,10 +1966,7 @@ void folio_clear_hugetlb_hwpoison(struct folio *folio)
 	folio_free_raw_hwp(folio, true);
 }

-/*
- * Called from hugetlb code with hugetlb_lock held.
- */
-int __get_huge_page_for_hwpoison(unsigned long pfn, int flags,
+static int get_huge_page_for_hwpoison(unsigned long pfn, int flags,
 				 bool *migratable_cleared)
 {
 	struct page *page = pfn_to_page(pfn);
@@ -1977,6 +1974,7 @@ int __get_huge_page_for_hwpoison(unsigned long pfn, int flags,
 	bool count_increased = false;
 	int ret, rc;

+	spin_lock_irq(&hugetlb_lock);
 	if (!folio_test_hugetlb(folio)) {
 		ret = MF_HUGETLB_NON_HUGEPAGE;
 		goto out;
@@ -2013,8 +2011,10 @@ int __get_huge_page_for_hwpoison(unsigned long pfn, int flags,
 		*migratable_cleared = true;
 	}

+	spin_unlock_irq(&hugetlb_lock);
 	return ret;
 out:
+	spin_unlock_irq(&hugetlb_lock);
 	if (count_increased)
 		folio_put(folio);
 	return ret;
--
2.43.0
Re: [PATCH resend] mm/memory-failure: fix hugetlb_lock AA deadlock in get_huge_page_for_hwpoison
Posted by Andrew Morton 1 day, 14 hours ago
On Fri, 22 May 2026 09:03:05 +0800 Wupeng Ma <mawupeng1@huawei.com> wrote:

> Two concurrent madvise(MADV_HWPOISON) calls on the same hugetlb page
> can trigger a recursive spinlock self-deadlock (AA deadlock) on
> hugetlb_lock when racing with a concurrent unmap:

Well we don't want that.

> Fixes: 405ce051236c ("mm/hwpoison: fix race between hugetlb free/demotion and memory_failure_hugetlb()")

So I'll add cc:stable here.

AI review didn't like the unlocked page_folio():

	https://sashiko.dev/#/patchset/20260522010305.4099834-1-mawupeng1@huawei.com

So I'll add a followup patch which addresses that (and which addresses
Miaohe's naming nit).

Please let's check this - perhaps the locking alteration isn't needed.


From: Andrew Morton <akpm@linux-foundation.org>
Subject: mm-memory-failure-fix-hugetlb_lock-aa-deadlock-in-get_huge_page_for_hwpoison-fix
Date: Fri May 22 08:44:25 PM PDT 2026

- do the page_folio() lookup under hugetlb_lock, addressing the
  possible race identified by Sashiko

- s/out/out_unlock/, per Miaohe

Link: https://sashiko.dev/#/patchset/20260522010305.4099834-1-mawupeng1@huawei.com
Link: https://lore.kernel.org/f39f405e-4b4b-8f79-70fe-a2b5b62114eb@huawei.com
Cc: David Hildenbrand <david@kernel.org>
Cc: Kefeng Wang <wangkefeng.wang@huawei.com>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Lorenzo Stoakes <ljs@kernel.org>
Cc: Miaohe Lin <linmiaohe@huawei.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Naoya Horiguchi <nao.horiguchi@gmail.com>
Cc: Oscar Salvador (SUSE) <osalvador@kernel.org>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Vlastimil Babka <vbabka@kernel.org>
Cc: Wupeng Ma <mawupeng1@huawei.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---

 mm/memory-failure.c |   11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

--- a/mm/memory-failure.c~mm-memory-failure-fix-hugetlb_lock-aa-deadlock-in-get_huge_page_for_hwpoison-fix
+++ a/mm/memory-failure.c
@@ -1970,14 +1970,15 @@ static int get_huge_page_for_hwpoison(un
 				 bool *migratable_cleared)
 {
 	struct page *page = pfn_to_page(pfn);
-	struct folio *folio = page_folio(page);
+	struct folio *folio;
 	bool count_increased = false;
 	int ret, rc;
 
 	spin_lock_irq(&hugetlb_lock);
+	folio = page_folio(page);
 	if (!folio_test_hugetlb(folio)) {
 		ret = MF_HUGETLB_NON_HUGEPAGE;
-		goto out;
+		goto out_unlock;
 	} else if (flags & MF_COUNT_INCREASED) {
 		ret = MF_HUGETLB_IN_USED;
 		count_increased = true;
@@ -1993,13 +1994,13 @@ static int get_huge_page_for_hwpoison(un
 	} else {
 		ret = MF_HUGETLB_RETRY;
 		if (!(flags & MF_NO_RETRY))
-			goto out;
+			goto out_unlock;
 	}
 
 	rc = hugetlb_update_hwpoison(folio, page);
 	if (rc >= MF_HUGETLB_FOLIO_PRE_POISONED) {
 		ret = rc;
-		goto out;
+		goto out_unlock;
 	}
 
 	/*
@@ -2013,7 +2014,7 @@ static int get_huge_page_for_hwpoison(un
 
 	spin_unlock_irq(&hugetlb_lock);
 	return ret;
-out:
+out_unlock:
 	spin_unlock_irq(&hugetlb_lock);
 	if (count_increased)
 		folio_put(folio);
_
Re: [PATCH resend] mm/memory-failure: fix hugetlb_lock AA deadlock in get_huge_page_for_hwpoison
Posted by Miaohe Lin 2 days, 9 hours ago
On 2026/5/22 9:03, Wupeng Ma wrote:
> Two concurrent madvise(MADV_HWPOISON) calls on the same hugetlb page
> can trigger a recursive spinlock self-deadlock (AA deadlock) on
> hugetlb_lock when racing with a concurrent unmap:
> 
>   thread#0                              thread#1
>   --------                              --------
>   madvise(folio, MADV_HWPOISON)
>     -> poisons the folio successfully
>   madvise(folio, MADV_HWPOISON)         unmap(folio)
>     try_memory_failure_hugetlb
>       get_huge_page_for_hwpoison
>         spin_lock_irq(&hugetlb_lock)    <- held
>         __get_huge_page_for_hwpoison
>           hugetlb_update_hwpoison()
>             -> MF_HUGETLB_FOLIO_PRE_POISONED
>           goto out:
>             folio_put()
>               refcount: 1 -> 0
>               free_huge_folio()
>                 spin_lock_irqsave(&hugetlb_lock)
>                   -> AA DEADLOCK!
> 
> The out: path in __get_huge_page_for_hwpoison() calls folio_put() to
> drop the GUP reference while the hugetlb_lock is still held by the
> hugetlb.c wrapper get_huge_page_for_hwpoison().  If concurrent unmap
> has released the page table mapping reference, folio_put() drops the
> folio refcount to zero, triggering free_huge_folio() which attempts
> to re-acquire the non-recursive hugetlb_lock.
> 
> Fix this by moving hugetlb_lock acquisition from the hugetlb.c wrapper
> into get_huge_page_for_hwpoison().  Place spin_unlock_irq() before the
> folio_put() at the out: label so the folio is always released outside
> the lock.
> 
> Fixes: 405ce051236c ("mm/hwpoison: fix race between hugetlb free/demotion and memory_failure_hugetlb()")
> Signed-off-by: Wupeng Ma <mawupeng1@huawei.com>

Thanks for your patch.

> ---
> Changelog since v3[1]:
>  - update commit message to fit current issue  
> 
> [1]: https://lore.kernel.org/linux-mm/20260520020128.3506168-1-mawupeng1@huawei.com/ 
> ---
>  include/linux/hugetlb.h |  8 --------
>  include/linux/mm.h      |  8 --------
>  mm/hugetlb.c            | 11 -----------
>  mm/memory-failure.c     |  8 ++++----
>  4 files changed, 4 insertions(+), 31 deletions(-)
> 
> diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
> index 93418625d3c5..059749ed519f 100644
> --- a/include/linux/hugetlb.h
> +++ b/include/linux/hugetlb.h
> @@ -153,8 +153,6 @@ long hugetlb_unreserve_pages(struct inode *inode, long start, long end,
>  						long freed);
>  bool folio_isolate_hugetlb(struct folio *folio, struct list_head *list);
>  int get_hwpoison_hugetlb_folio(struct folio *folio, bool *hugetlb, bool unpoison);
> -int get_huge_page_for_hwpoison(unsigned long pfn, int flags,
> -				bool *migratable_cleared);
>  void folio_putback_hugetlb(struct folio *folio);
>  void move_hugetlb_state(struct folio *old_folio, struct folio *new_folio, int reason);
>  void hugetlb_fix_reserve_counts(struct inode *inode);
> @@ -422,12 +420,6 @@ static inline int get_hwpoison_hugetlb_folio(struct folio *folio, bool *hugetlb,
>  	return 0;
>  }
> 
> -static inline int get_huge_page_for_hwpoison(unsigned long pfn, int flags,
> -					bool *migratable_cleared)
> -{
> -	return 0;
> -}
> -
>  static inline void folio_putback_hugetlb(struct folio *folio)
>  {
>  }
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index 0b776907152e..4c4d1a61a6a7 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -4975,8 +4975,6 @@ extern int soft_offline_page(unsigned long pfn, int flags);
>   */
>  extern const struct attribute_group memory_failure_attr_group;
>  extern void memory_failure_queue(unsigned long pfn, int flags);
> -extern int __get_huge_page_for_hwpoison(unsigned long pfn, int flags,
> -					bool *migratable_cleared);
>  void num_poisoned_pages_inc(unsigned long pfn);
>  void num_poisoned_pages_sub(unsigned long pfn, long i);
>  #else
> @@ -4984,12 +4982,6 @@ static inline void memory_failure_queue(unsigned long pfn, int flags)
>  {
>  }
> 
> -static inline int __get_huge_page_for_hwpoison(unsigned long pfn, int flags,
> -					bool *migratable_cleared)
> -{
> -	return 0;
> -}
> -
>  static inline void num_poisoned_pages_inc(unsigned long pfn)
>  {
>  }
> diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> index f24bf49be047..67243923fa24 100644
> --- a/mm/hugetlb.c
> +++ b/mm/hugetlb.c
> @@ -7154,17 +7154,6 @@ int get_hwpoison_hugetlb_folio(struct folio *folio, bool *hugetlb, bool unpoison
>  	return ret;
>  }
> 
> -int get_huge_page_for_hwpoison(unsigned long pfn, int flags,
> -				bool *migratable_cleared)
> -{
> -	int ret;
> -
> -	spin_lock_irq(&hugetlb_lock);
> -	ret = __get_huge_page_for_hwpoison(pfn, flags, migratable_cleared);
> -	spin_unlock_irq(&hugetlb_lock);
> -	return ret;
> -}
> -
>  /**
>   * folio_putback_hugetlb - unisolate a hugetlb folio
>   * @folio: the isolated hugetlb folio
> diff --git a/mm/memory-failure.c b/mm/memory-failure.c
> index ee42d4361309..28522180cf7f 100644
> --- a/mm/memory-failure.c
> +++ b/mm/memory-failure.c
> @@ -1966,10 +1966,7 @@ void folio_clear_hugetlb_hwpoison(struct folio *folio)
>  	folio_free_raw_hwp(folio, true);
>  }
> 
> -/*
> - * Called from hugetlb code with hugetlb_lock held.
> - */
> -int __get_huge_page_for_hwpoison(unsigned long pfn, int flags,
> +static int get_huge_page_for_hwpoison(unsigned long pfn, int flags,
>  				 bool *migratable_cleared)
>  {
>  	struct page *page = pfn_to_page(pfn);
> @@ -1977,6 +1974,7 @@ int __get_huge_page_for_hwpoison(unsigned long pfn, int flags,
>  	bool count_increased = false;
>  	int ret, rc;
> 
> +	spin_lock_irq(&hugetlb_lock);
>  	if (!folio_test_hugetlb(folio)) {
>  		ret = MF_HUGETLB_NON_HUGEPAGE;
>  		goto out;
> @@ -2013,8 +2011,10 @@ int __get_huge_page_for_hwpoison(unsigned long pfn, int flags,
>  		*migratable_cleared = true;
>  	}
> 
> +	spin_unlock_irq(&hugetlb_lock);
>  	return ret;
>  out:

It might be better to rename out: as out_unlock. But that's trivial.

Acked-by: Miaohe Lin <linmiaohe@huawei.com>

Thanks.
.
Re: [PATCH resend] mm/memory-failure: fix hugetlb_lock AA deadlock in get_huge_page_for_hwpoison
Posted by Kefeng Wang 2 days, 9 hours ago

On 5/22/2026 9:03 AM, Wupeng Ma wrote:
> Two concurrent madvise(MADV_HWPOISON) calls on the same hugetlb page
> can trigger a recursive spinlock self-deadlock (AA deadlock) on
> hugetlb_lock when racing with a concurrent unmap:
> 
>    thread#0                              thread#1
>    --------                              --------
>    madvise(folio, MADV_HWPOISON)
>      -> poisons the folio successfully
>    madvise(folio, MADV_HWPOISON)         unmap(folio)
>      try_memory_failure_hugetlb
>        get_huge_page_for_hwpoison
>          spin_lock_irq(&hugetlb_lock)    <- held
>          __get_huge_page_for_hwpoison
>            hugetlb_update_hwpoison()
>              -> MF_HUGETLB_FOLIO_PRE_POISONED
>            goto out:
>              folio_put()
>                refcount: 1 -> 0
>                free_huge_folio()
>                  spin_lock_irqsave(&hugetlb_lock)
>                    -> AA DEADLOCK!
> 
> The out: path in __get_huge_page_for_hwpoison() calls folio_put() to
> drop the GUP reference while the hugetlb_lock is still held by the
> hugetlb.c wrapper get_huge_page_for_hwpoison().  If concurrent unmap
> has released the page table mapping reference, folio_put() drops the
> folio refcount to zero, triggering free_huge_folio() which attempts
> to re-acquire the non-recursive hugetlb_lock.
> 
> Fix this by moving hugetlb_lock acquisition from the hugetlb.c wrapper
> into get_huge_page_for_hwpoison().  Place spin_unlock_irq() before the
> folio_put() at the out: label so the folio is always released outside
> the lock.
> 
> Fixes: 405ce051236c ("mm/hwpoison: fix race between hugetlb free/demotion and memory_failure_hugetlb()")
> Signed-off-by: Wupeng Ma <mawupeng1@huawei.com>

Reviewed-by: Kefeng Wang <wangkefeng.wang@huawei.com>
Re: [PATCH resend] mm/memory-failure: fix hugetlb_lock AA deadlock in get_huge_page_for_hwpoison
Posted by Muchun Song 2 days, 9 hours ago

> On May 22, 2026, at 09:03, Wupeng Ma <mawupeng1@huawei.com> wrote:
> 
> Two concurrent madvise(MADV_HWPOISON) calls on the same hugetlb page
> can trigger a recursive spinlock self-deadlock (AA deadlock) on
> hugetlb_lock when racing with a concurrent unmap:
> 
>  thread#0                              thread#1
>  --------                              --------
>  madvise(folio, MADV_HWPOISON)
>    -> poisons the folio successfully
>  madvise(folio, MADV_HWPOISON)         unmap(folio)
>    try_memory_failure_hugetlb
>      get_huge_page_for_hwpoison
>        spin_lock_irq(&hugetlb_lock)    <- held
>        __get_huge_page_for_hwpoison
>          hugetlb_update_hwpoison()
>            -> MF_HUGETLB_FOLIO_PRE_POISONED
>          goto out:
>            folio_put()
>              refcount: 1 -> 0
>              free_huge_folio()
>                spin_lock_irqsave(&hugetlb_lock)
>                  -> AA DEADLOCK!
> 
> The out: path in __get_huge_page_for_hwpoison() calls folio_put() to
> drop the GUP reference while the hugetlb_lock is still held by the
> hugetlb.c wrapper get_huge_page_for_hwpoison().  If concurrent unmap
> has released the page table mapping reference, folio_put() drops the
> folio refcount to zero, triggering free_huge_folio() which attempts
> to re-acquire the non-recursive hugetlb_lock.
> 
> Fix this by moving hugetlb_lock acquisition from the hugetlb.c wrapper
> into get_huge_page_for_hwpoison().  Place spin_unlock_irq() before the
> folio_put() at the out: label so the folio is always released outside
> the lock.
> 
> Fixes: 405ce051236c ("mm/hwpoison: fix race between hugetlb free/demotion and memory_failure_hugetlb()")
> Signed-off-by: Wupeng Ma <mawupeng1@huawei.com>

Acked-by: Muchun Song <muchun.song@linux.dev>

Thanks.
Re: [PATCH resend] mm/memory-failure: fix hugetlb_lock AA deadlock in get_huge_page_for_hwpoison
Posted by Oscar Salvador (SUSE) 2 days, 10 hours ago
On Fri, May 22, 2026 at 09:03:05AM +0800, Wupeng Ma wrote:
> Two concurrent madvise(MADV_HWPOISON) calls on the same hugetlb page
> can trigger a recursive spinlock self-deadlock (AA deadlock) on
> hugetlb_lock when racing with a concurrent unmap:
> 
>   thread#0                              thread#1
>   --------                              --------
>   madvise(folio, MADV_HWPOISON)
>     -> poisons the folio successfully
>   madvise(folio, MADV_HWPOISON)         unmap(folio)
>     try_memory_failure_hugetlb
>       get_huge_page_for_hwpoison
>         spin_lock_irq(&hugetlb_lock)    <- held
>         __get_huge_page_for_hwpoison
>           hugetlb_update_hwpoison()
>             -> MF_HUGETLB_FOLIO_PRE_POISONED
>           goto out:
>             folio_put()
>               refcount: 1 -> 0
>               free_huge_folio()
>                 spin_lock_irqsave(&hugetlb_lock)
>                   -> AA DEADLOCK!
> 
> The out: path in __get_huge_page_for_hwpoison() calls folio_put() to
> drop the GUP reference while the hugetlb_lock is still held by the
> hugetlb.c wrapper get_huge_page_for_hwpoison().  If concurrent unmap
> has released the page table mapping reference, folio_put() drops the
> folio refcount to zero, triggering free_huge_folio() which attempts
> to re-acquire the non-recursive hugetlb_lock.
> 
> Fix this by moving hugetlb_lock acquisition from the hugetlb.c wrapper
> into get_huge_page_for_hwpoison().  Place spin_unlock_irq() before the
> folio_put() at the out: label so the folio is always released outside
> the lock.
> 
> Fixes: 405ce051236c ("mm/hwpoison: fix race between hugetlb free/demotion and memory_failure_hugetlb()")
> Signed-off-by: Wupeng Ma <mawupeng1@huawei.com>

I was also able to hit this with two threads and adding some delays.

Acked-by: Oscar Salvador (SUSE) <osalvador@kernel.org>



-- 
Oscar Salvador
SUSE Labs