[v3] mm/memory-failure: fix hugetlb_lock AA deadlock in get_huge_page_for_hwpoison

[PATCH v3] mm/memory-failure: fix hugetlb_lock AA deadlock in get_huge_page_for_hwpoison

Posted by Wupeng Ma 4 days, 16 hours ago

madvise(MADV_HWPOISON) can trigger a recursive spinlock self-deadlock
(AA deadlock) on hugetlb_lock due to a race with concurrent folio
unmapping.  The race scenario:

  Thread 1 (madvise MADV_HWPOISON)          Thread 2 (unmap)
  -------------------------------          -----------------
  madvise_inject_error()
    get_user_pages_fast()  <- refcount++
    memory_failure(MF_COUNT_INCREASED)
      get_huge_page_for_hwpoison()
        spin_lock_irq(&hugetlb_lock)
        // refcount == 2 (gup + map)
        // MF_COUNT_INCREASED path:
        count_increased = true
                                           zap_pte_range()
                                             page_remove_rmap()
                                             put_page()  <- drops map ref
                                             // refcount: 2 -> 1
        hugetlb_update_hwpoison()
          -> MF_HUGETLB_FOLIO_PRE_POISONED
          -> goto out
        out:
          folio_put(folio)  <- drops gup ref
            // refcount: 1 -> 0
            free_huge_folio()
              spin_lock_irq(&hugetlb_lock)  <- AA DEADLOCK

When Thread 2's put_page() drops the mapping reference while Thread 1
holds hugetlb_lock, the folio refcount drops to 1.  The subsequent
folio_put() at the out: label frees the folio, and free_huge_folio()
attempts to re-acquire the non-recursive hugetlb_lock on the same CPU,
resulting in an AA self-deadlock.

The same deadlock can also occur on the folio_try_get() path: when a
migratable folio is found and folio_try_get() succeeds (refcount rises
to refcount+1), a concurrent unmap and a hugetlb_update_hwpoison()
returning pre-poisoned status will land at out: where folio_put() again
may free the folio under hugetlb_lock.

Fix this by removing the hugetlb_lock wrapper from hugetlb.c and
moving the lock acquisition directly into get_huge_page_for_hwpoison()
(formerly __get_huge_page_for_hwpoison) in memory-failure.c.  Place
spin_unlock_irq() before folio_put() at the out: label so that the
folio is always released outside the lock, preventing any recursive
lock acquisition.  Remove the now-incorrect "Called from hugetlb code
with hugetlb_lock held" comment, and update the stale
__get_huge_page_for_hwpoison declarations in include/linux/mm.h.

Fixes: 405ce051236c ("mm/hwpoison: fix race between hugetlb free/demotion and memory_failure_hugetlb()")
Signed-off-by: Wupeng Ma <mawupeng1@huawei.com>
---
 include/linux/hugetlb.h |  8 --------
 include/linux/mm.h      |  8 --------
 mm/hugetlb.c            | 11 -----------
 mm/memory-failure.c     |  8 ++++----
 4 files changed, 4 insertions(+), 31 deletions(-)

diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 65910437be1ca..aa3eb42e0a01a 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -153,8 +153,6 @@ long hugetlb_unreserve_pages(struct inode *inode, long start, long end,
 						long freed);
 bool folio_isolate_hugetlb(struct folio *folio, struct list_head *list);
 int get_hwpoison_hugetlb_folio(struct folio *folio, bool *hugetlb, bool unpoison);
-int get_huge_page_for_hwpoison(unsigned long pfn, int flags,
-				bool *migratable_cleared);
 void folio_putback_hugetlb(struct folio *folio);
 void move_hugetlb_state(struct folio *old_folio, struct folio *new_folio, int reason);
 void hugetlb_fix_reserve_counts(struct inode *inode);
@@ -422,12 +420,6 @@ static inline int get_hwpoison_hugetlb_folio(struct folio *folio, bool *hugetlb,
 	return 0;
 }
 
-static inline int get_huge_page_for_hwpoison(unsigned long pfn, int flags,
-					bool *migratable_cleared)
-{
-	return 0;
-}
-
 static inline void folio_putback_hugetlb(struct folio *folio)
 {
 }
diff --git a/include/linux/mm.h b/include/linux/mm.h
index abb4963c1f064..46e5936dabaa8 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -4602,8 +4602,6 @@ extern int soft_offline_page(unsigned long pfn, int flags);
  */
 extern const struct attribute_group memory_failure_attr_group;
 extern void memory_failure_queue(unsigned long pfn, int flags);
-extern int __get_huge_page_for_hwpoison(unsigned long pfn, int flags,
-					bool *migratable_cleared);
 void num_poisoned_pages_inc(unsigned long pfn);
 void num_poisoned_pages_sub(unsigned long pfn, long i);
 #else
@@ -4611,12 +4609,6 @@ static inline void memory_failure_queue(unsigned long pfn, int flags)
 {
 }
 
-static inline int __get_huge_page_for_hwpoison(unsigned long pfn, int flags,
-					bool *migratable_cleared)
-{
-	return 0;
-}
-
 static inline void num_poisoned_pages_inc(unsigned long pfn)
 {
 }
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 327eaa4074d39..4c99bb868ad08 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -7170,17 +7170,6 @@ int get_hwpoison_hugetlb_folio(struct folio *folio, bool *hugetlb, bool unpoison
 	return ret;
 }
 
-int get_huge_page_for_hwpoison(unsigned long pfn, int flags,
-				bool *migratable_cleared)
-{
-	int ret;
-
-	spin_lock_irq(&hugetlb_lock);
-	ret = __get_huge_page_for_hwpoison(pfn, flags, migratable_cleared);
-	spin_unlock_irq(&hugetlb_lock);
-	return ret;
-}
-
 /**
  * folio_putback_hugetlb - unisolate a hugetlb folio
  * @folio: the isolated hugetlb folio
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index ee42d43613097..28522180cf7f8 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -1966,10 +1966,7 @@ void folio_clear_hugetlb_hwpoison(struct folio *folio)
 	folio_free_raw_hwp(folio, true);
 }
 
-/*
- * Called from hugetlb code with hugetlb_lock held.
- */
-int __get_huge_page_for_hwpoison(unsigned long pfn, int flags,
+static int get_huge_page_for_hwpoison(unsigned long pfn, int flags,
 				 bool *migratable_cleared)
 {
 	struct page *page = pfn_to_page(pfn);
@@ -1977,6 +1974,7 @@ int __get_huge_page_for_hwpoison(unsigned long pfn, int flags,
 	bool count_increased = false;
 	int ret, rc;
 
+	spin_lock_irq(&hugetlb_lock);
 	if (!folio_test_hugetlb(folio)) {
 		ret = MF_HUGETLB_NON_HUGEPAGE;
 		goto out;
@@ -2013,8 +2011,10 @@ int __get_huge_page_for_hwpoison(unsigned long pfn, int flags,
 		*migratable_cleared = true;
 	}
 
+	spin_unlock_irq(&hugetlb_lock);
 	return ret;
 out:
+	spin_unlock_irq(&hugetlb_lock);
 	if (count_increased)
 		folio_put(folio);
 	return ret;
-- 
2.43.0

Re: [PATCH v3] mm/memory-failure: fix hugetlb_lock AA deadlock in get_huge_page_for_hwpoison

Posted by Oscar Salvador (SUSE) 4 days, 10 hours ago

On Wed, May 20, 2026 at 10:01:28AM +0800, Wupeng Ma wrote:
> madvise(MADV_HWPOISON) can trigger a recursive spinlock self-deadlock
> (AA deadlock) on hugetlb_lock due to a race with concurrent folio
> unmapping.  The race scenario:
> 
>   Thread 1 (madvise MADV_HWPOISON)          Thread 2 (unmap)
>   -------------------------------          -----------------
>   madvise_inject_error()
>     get_user_pages_fast()  <- refcount++
>     memory_failure(MF_COUNT_INCREASED)
>       get_huge_page_for_hwpoison()
>         spin_lock_irq(&hugetlb_lock)
>         // refcount == 2 (gup + map)
>         // MF_COUNT_INCREASED path:
>         count_increased = true
>                                            zap_pte_range()
>                                              page_remove_rmap()
>                                              put_page()  <- drops map ref
>                                              // refcount: 2 -> 1

Ok, bear with me.
I am not saying the change itself is wrong (maybe it is not), but how we ended
up in zap_pte_range() for a hugetlb folio?
The stacktrace does not seem to have much sense?


 

-- 
Oscar Salvador
SUSE Labs

Re: [PATCH v3] mm/memory-failure: fix hugetlb_lock AA deadlock in get_huge_page_for_hwpoison

Posted by mawupeng 4 days, 7 hours ago


On 周三 2026-5-20 16:13, Oscar Salvador (SUSE) wrote:
> On Wed, May 20, 2026 at 10:01:28AM +0800, Wupeng Ma wrote:
>> madvise(MADV_HWPOISON) can trigger a recursive spinlock self-deadlock
>> (AA deadlock) on hugetlb_lock due to a race with concurrent folio
>> unmapping.  The race scenario:
>>
>>   Thread 1 (madvise MADV_HWPOISON)          Thread 2 (unmap)
>>   -------------------------------          -----------------
>>   madvise_inject_error()
>>     get_user_pages_fast()  <- refcount++
>>     memory_failure(MF_COUNT_INCREASED)
>>       get_huge_page_for_hwpoison()
>>         spin_lock_irq(&hugetlb_lock)
>>         // refcount == 2 (gup + map)
>>         // MF_COUNT_INCREASED path:
>>         count_increased = true
>>                                            zap_pte_range()
>>                                              page_remove_rmap()
>>                                              put_page()  <- drops map ref
>>                                              // refcount: 2 -> 1
> 
> Ok, bear with me.
> I am not saying the change itself is wrong (maybe it is not), but how we ended
> up in zap_pte_range() for a hugetlb folio?
> The stacktrace does not seem to have much sense?

You are correct. The refcount dropping logic in the `unmap` path was indeed flawed. 
This issue was originally uncovered by fuzzing. Based on the initial stack trace, 
we diagnosed it as a recursive locking (AA) deadlock on `hugetlb_lock`.

We initially suspected that `unmap` had prematurely released the folio reference 
count, triggering the free path. However, after a thorough analysis of the refcount 
state machine and the actual execution context, we confirmed that this hypothesis 
is impossible. The root cause lies elsewhere in the locking hierarchy, and we are 
currently tracing the exact call path that leads to the nested `hugetlb_lock` 
acquisition.

The deadlock can be triggered by injecting hardware poison errors on a hugetlb
page while concurrent unmapping activity occurs. The following minimal userspace
test case demonstrates the race condition by spawning multiple processes to
widen the timing window for the lock contention.

/*
 * Repro: hugetlb_lock AA deadlock via consecutive MADV_HWPOISON
 */
main():
    // 1. Map hugetlb page
    mmap(0x20000000, 0xa000, PROT_READ|PROT_EXEC,
         MAP_LOCKED|MAP_HUGETLB|MAP_FIXED, -1, 0)

    // 2. Inject hwpoison twice on same page
    madvise(0x20000000, 0x4000, MADV_HWPOISON)  
    madvise(0x20000000, 0x4000, MADV_HWPOISON)  

    // 3. Repeat in a fork loop to widen race window

Here is the detailed AA deadlock stack with lockdep enabled

  ============================================
  WARNING: possible recursive locking detected
  7.0.0-g0eba4942ce53-dirty #335 Not tainted
  --------------------------------------------
  repro/742 is trying to acquire lock:
  ffff800083702958 (hugetlb_lock){....}-{3:3}, at: free_huge_folio+0x194/0x3c0
  
  but task is already holding lock:
  ffff800083702958 (hugetlb_lock){....}-{3:3}, at: get_huge_page_for_hwpoison+0x38/  
  
  other info that might help us debug this:
   Possible unsafe locking scenario:
  
         CPU0
         ----
    lock(hugetlb_lock);
    lock(hugetlb_lock);
  
   *** DEADLOCK ***
  
   May be due to missing lock nesting notation
  
  2 locks held by repro/742:
   #0: ffff800086f942b0 (mf_mutex){+.+.}-{4:4}, at: memory_failure+0x6c/0xdd8
   #1: ffff800083702958 (hugetlb_lock){....}-{3:3}, at: get_huge_page_for_hwpoison  
  
  stack backtrace:
  CPU: 2 UID: 0 PID: 742 Comm: repro Not tainted 7.0.0-g0eba4942ce53-dirty #335   
  Hardware name: QEMU QEMU Virtual Machine, BIOS 0.0.0 02/06/2015
  Call trace:
   __lock_acquire+0xe7c/0x1e38
   lock_acquire+0x2b8/0x3f8
   _raw_spin_lock_irqsave+0x74/0xd8
   free_huge_folio+0x194/0x3c0
   __folio_put+0x124/0x130
   __get_huge_page_for_hwpoison+0x138/0x358
   get_huge_page_for_hwpoison+0x48/0x78
   memory_failure+0xb4/0xdd8
   madvise_do_behavior+0x39c/0x660
   do_madvise+0xe4/0x158
   __arm64_sys_madvise+0x2c/0x48


> 
> 
>  
>

Re: [PATCH v3] mm/memory-failure: fix hugetlb_lock AA deadlock in get_huge_page_for_hwpoison

Posted by Oscar Salvador (SUSE) 3 days, 9 hours ago

On Wed, May 20, 2026 at 07:24:28PM +0800, mawupeng wrote:
 
> You are correct. The refcount dropping logic in the `unmap` path was indeed flawed. 
> This issue was originally uncovered by fuzzing. Based on the initial stack trace, 
> we diagnosed it as a recursive locking (AA) deadlock on `hugetlb_lock`.
> 
> We initially suspected that `unmap` had prematurely released the folio reference 
> count, triggering the free path. However, after a thorough analysis of the refcount 
> state machine and the actual execution context, we confirmed that this hypothesis 
> is impossible. The root cause lies elsewhere in the locking hierarchy, and we are 
> currently tracing the exact call path that leads to the nested `hugetlb_lock` 
> acquisition.
> 
> The deadlock can be triggered by injecting hardware poison errors on a hugetlb
> page while concurrent unmapping activity occurs. The following minimal userspace
> test case demonstrates the race condition by spawning multiple processes to
> widen the timing window for the lock contention.


After staring at it, it is obvious the code is wrong.
We __should__ not be calling folio_put under the lock, as recursion will
happen if we are the last user holding a reference.
Thinking about it, I cannot think of a way we would need nesting here.

Anyway, this is a genuine bug, so thanks for that, but it all got very
confusing because of the traces pointing to wwrong places.
The thing is quite simple:

- We start with the assumption that a hugetlb folio is mapped to
  userspace and that madvise 

 thread#0                                     thread#1
  madvise(folio, MADV_HWPOISON) (we poisoned the page)
  madvise(folio, MADV_HWPOISON) (second call)
                                              unmap(folio)
   try_memory_failure_hugetlb
    get_huge_page_for_hwpoison (takes lock)
     __get_huge_page_for_hwpoison
       hugetlb_update_hwpoison
        - we get MF_HUGETLB_FOLIO_PRE_POISONED
	  we jump to out which does
	  folio_put
	   free_huge_page      (takes lock.. yaiks)


So yes, the fix is to have the folio_put happening not within the lock.

Please, send the patch with the right changelog (and no version) and I will ack it.



-- 
Oscar Salvador
SUSE Labs

Re: [PATCH v3] mm/memory-failure: fix hugetlb_lock AA deadlock in get_huge_page_for_hwpoison

Posted by mawupeng 3 days, 9 hours ago


On 周四 2026-5-21 16:48, Oscar Salvador (SUSE) wrote:
> On Wed, May 20, 2026 at 07:24:28PM +0800, mawupeng wrote:
>  
>> You are correct. The refcount dropping logic in the `unmap` path was indeed flawed. 
>> This issue was originally uncovered by fuzzing. Based on the initial stack trace, 
>> we diagnosed it as a recursive locking (AA) deadlock on `hugetlb_lock`.
>>
>> We initially suspected that `unmap` had prematurely released the folio reference 
>> count, triggering the free path. However, after a thorough analysis of the refcount 
>> state machine and the actual execution context, we confirmed that this hypothesis 
>> is impossible. The root cause lies elsewhere in the locking hierarchy, and we are 
>> currently tracing the exact call path that leads to the nested `hugetlb_lock` 
>> acquisition.
>>
>> The deadlock can be triggered by injecting hardware poison errors on a hugetlb
>> page while concurrent unmapping activity occurs. The following minimal userspace
>> test case demonstrates the race condition by spawning multiple processes to
>> widen the timing window for the lock contention.
> 
> 
> After staring at it, it is obvious the code is wrong.
> We __should__ not be calling folio_put under the lock, as recursion will
> happen if we are the last user holding a reference.
> Thinking about it, I cannot think of a way we would need nesting here.
> 
> Anyway, this is a genuine bug, so thanks for that, but it all got very
> confusing because of the traces pointing to wwrong places.
> The thing is quite simple:
> 
> - We start with the assumption that a hugetlb folio is mapped to
>   userspace and that madvise 
> 
>  thread#0                                     thread#1
>   madvise(folio, MADV_HWPOISON) (we poisoned the page)
>   madvise(folio, MADV_HWPOISON) (second call)
>                                               unmap(folio)
>    try_memory_failure_hugetlb
>     get_huge_page_for_hwpoison (takes lock)
>      __get_huge_page_for_hwpoison
>        hugetlb_update_hwpoison
>         - we get MF_HUGETLB_FOLIO_PRE_POISONED
> 	  we jump to out which does
> 	  folio_put
> 	   free_huge_page      (takes lock.. yaiks)
> 
> 
> So yes, the fix is to have the folio_put happening not within the lock.
> 
> Please, send the patch with the right changelog (and no version) and I will ack it.

Thanks for reviewing.

I will send new version with right changelog.

> 
> 
>

Re: [PATCH v3] mm/memory-failure: fix hugetlb_lock AA deadlock in get_huge_page_for_hwpoison

Posted by David Hildenbrand (Arm) 4 days, 8 hours ago

On 5/20/26 10:13, Oscar Salvador (SUSE) wrote:
> On Wed, May 20, 2026 at 10:01:28AM +0800, Wupeng Ma wrote:
>> madvise(MADV_HWPOISON) can trigger a recursive spinlock self-deadlock
>> (AA deadlock) on hugetlb_lock due to a race with concurrent folio
>> unmapping.  The race scenario:
>>
>>   Thread 1 (madvise MADV_HWPOISON)          Thread 2 (unmap)
>>   -------------------------------          -----------------
>>   madvise_inject_error()
>>     get_user_pages_fast()  <- refcount++
>>     memory_failure(MF_COUNT_INCREASED)
>>       get_huge_page_for_hwpoison()
>>         spin_lock_irq(&hugetlb_lock)
>>         // refcount == 2 (gup + map)
>>         // MF_COUNT_INCREASED path:
>>         count_increased = true
>>                                            zap_pte_range()
>>                                              page_remove_rmap()
>>                                              put_page()  <- drops map ref
>>                                              // refcount: 2 -> 1
> 
> Ok, bear with me.
> I am not saying the change itself is wrong (maybe it is not), but how we ended
> up in zap_pte_range() for a hugetlb folio?
> The stacktrace does not seem to have much sense?

Right, that does not make sense. Not even page_remove_rmap() makes sense,
because that function is long gone.

Undisclosed usage of shitty AI?

-- 
Cheers,

David

Re: [PATCH v3] mm/memory-failure: fix hugetlb_lock AA deadlock in get_huge_page_for_hwpoison

Posted by Kefeng Wang 4 days, 16 hours ago

You should remove the v3 since this is the first version for mainline.

On 5/20/2026 10:01 AM, Wupeng Ma wrote:
> madvise(MADV_HWPOISON) can trigger a recursive spinlock self-deadlock
> (AA deadlock) on hugetlb_lock due to a race with concurrent folio
> unmapping.  The race scenario:
> 
>    Thread 1 (madvise MADV_HWPOISON)          Thread 2 (unmap)
>    -------------------------------          -----------------
>    madvise_inject_error()
>      get_user_pages_fast()  <- refcount++
>      memory_failure(MF_COUNT_INCREASED)
>        get_huge_page_for_hwpoison()
>          spin_lock_irq(&hugetlb_lock)
>          // refcount == 2 (gup + map)
>          // MF_COUNT_INCREASED path:
>          count_increased = true
>                                             zap_pte_range()
>                                               page_remove_rmap()
>                                               put_page()  <- drops map ref
>                                               // refcount: 2 -> 1
>          hugetlb_update_hwpoison()
>            -> MF_HUGETLB_FOLIO_PRE_POISONED
>            -> goto out
>          out:
>            folio_put(folio)  <- drops gup ref
>              // refcount: 1 -> 0
>              free_huge_folio()
>                spin_lock_irq(&hugetlb_lock)  <- AA DEADLOCK
> 
> When Thread 2's put_page() drops the mapping reference while Thread 1
> holds hugetlb_lock, the folio refcount drops to 1.  The subsequent
> folio_put() at the out: label frees the folio, and free_huge_folio()
> attempts to re-acquire the non-recursive hugetlb_lock on the same CPU,
> resulting in an AA self-deadlock.
> 
> The same deadlock can also occur on the folio_try_get() path: when a
> migratable folio is found and folio_try_get() succeeds (refcount rises
> to refcount+1), a concurrent unmap and a hugetlb_update_hwpoison()
> returning pre-poisoned status will land at out: where folio_put() again
> may free the folio under hugetlb_lock.
> 
> Fix this by removing the hugetlb_lock wrapper from hugetlb.c and
> moving the lock acquisition directly into get_huge_page_for_hwpoison()
> (formerly __get_huge_page_for_hwpoison) in memory-failure.c.  Place
> spin_unlock_irq() before folio_put() at the out: label so that the
> folio is always released outside the lock, preventing any recursive
> lock acquisition.  


The following comment is not useful and correct for this version.

Remove the now-incorrect "Called from hugetlb code
> with hugetlb_lock held" comment, and update the stale
> __get_huge_page_for_hwpoison declarations in include/linux/mm.h.
> 

The change is lgtm,  Reviewed-by: Kefeng Wang <wangkefeng.wang@huawei.com>


> Fixes: 405ce051236c ("mm/hwpoison: fix race between hugetlb free/demotion and memory_failure_hugetlb()")
> Signed-off-by: Wupeng Ma <mawupeng1@huawei.com>
> ---
>   include/linux/hugetlb.h |  8 --------
>   include/linux/mm.h      |  8 --------
>   mm/hugetlb.c            | 11 -----------
>   mm/memory-failure.c     |  8 ++++----
>   4 files changed, 4 insertions(+), 31 deletions(-)
> 
> diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
> index 65910437be1ca..aa3eb42e0a01a 100644
> --- a/include/linux/hugetlb.h
> +++ b/include/linux/hugetlb.h
> @@ -153,8 +153,6 @@ long hugetlb_unreserve_pages(struct inode *inode, long start, long end,
>   						long freed);
>   bool folio_isolate_hugetlb(struct folio *folio, struct list_head *list);
>   int get_hwpoison_hugetlb_folio(struct folio *folio, bool *hugetlb, bool unpoison);
> -int get_huge_page_for_hwpoison(unsigned long pfn, int flags,
> -				bool *migratable_cleared);
>   void folio_putback_hugetlb(struct folio *folio);
>   void move_hugetlb_state(struct folio *old_folio, struct folio *new_folio, int reason);
>   void hugetlb_fix_reserve_counts(struct inode *inode);
> @@ -422,12 +420,6 @@ static inline int get_hwpoison_hugetlb_folio(struct folio *folio, bool *hugetlb,
>   	return 0;
>   }
>   
> -static inline int get_huge_page_for_hwpoison(unsigned long pfn, int flags,
> -					bool *migratable_cleared)
> -{
> -	return 0;
> -}
> -
>   static inline void folio_putback_hugetlb(struct folio *folio)
>   {
>   }
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index abb4963c1f064..46e5936dabaa8 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -4602,8 +4602,6 @@ extern int soft_offline_page(unsigned long pfn, int flags);
>    */
>   extern const struct attribute_group memory_failure_attr_group;
>   extern void memory_failure_queue(unsigned long pfn, int flags);
> -extern int __get_huge_page_for_hwpoison(unsigned long pfn, int flags,
> -					bool *migratable_cleared);
>   void num_poisoned_pages_inc(unsigned long pfn);
>   void num_poisoned_pages_sub(unsigned long pfn, long i);
>   #else
> @@ -4611,12 +4609,6 @@ static inline void memory_failure_queue(unsigned long pfn, int flags)
>   {
>   }
>   
> -static inline int __get_huge_page_for_hwpoison(unsigned long pfn, int flags,
> -					bool *migratable_cleared)
> -{
> -	return 0;
> -}
> -
>   static inline void num_poisoned_pages_inc(unsigned long pfn)
>   {
>   }
> diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> index 327eaa4074d39..4c99bb868ad08 100644
> --- a/mm/hugetlb.c
> +++ b/mm/hugetlb.c
> @@ -7170,17 +7170,6 @@ int get_hwpoison_hugetlb_folio(struct folio *folio, bool *hugetlb, bool unpoison
>   	return ret;
>   }
>   
> -int get_huge_page_for_hwpoison(unsigned long pfn, int flags,
> -				bool *migratable_cleared)
> -{
> -	int ret;
> -
> -	spin_lock_irq(&hugetlb_lock);
> -	ret = __get_huge_page_for_hwpoison(pfn, flags, migratable_cleared);
> -	spin_unlock_irq(&hugetlb_lock);
> -	return ret;
> -}
> -
>   /**
>    * folio_putback_hugetlb - unisolate a hugetlb folio
>    * @folio: the isolated hugetlb folio
> diff --git a/mm/memory-failure.c b/mm/memory-failure.c
> index ee42d43613097..28522180cf7f8 100644
> --- a/mm/memory-failure.c
> +++ b/mm/memory-failure.c
> @@ -1966,10 +1966,7 @@ void folio_clear_hugetlb_hwpoison(struct folio *folio)
>   	folio_free_raw_hwp(folio, true);
>   }
>   
> -/*
> - * Called from hugetlb code with hugetlb_lock held.
> - */
> -int __get_huge_page_for_hwpoison(unsigned long pfn, int flags,
> +static int get_huge_page_for_hwpoison(unsigned long pfn, int flags,
>   				 bool *migratable_cleared)
>   {
>   	struct page *page = pfn_to_page(pfn);
> @@ -1977,6 +1974,7 @@ int __get_huge_page_for_hwpoison(unsigned long pfn, int flags,
>   	bool count_increased = false;
>   	int ret, rc;
>   
> +	spin_lock_irq(&hugetlb_lock);
>   	if (!folio_test_hugetlb(folio)) {
>   		ret = MF_HUGETLB_NON_HUGEPAGE;
>   		goto out;
> @@ -2013,8 +2011,10 @@ int __get_huge_page_for_hwpoison(unsigned long pfn, int flags,
>   		*migratable_cleared = true;
>   	}
>   
> +	spin_unlock_irq(&hugetlb_lock);
>   	return ret;
>   out:
> +	spin_unlock_irq(&hugetlb_lock);
>   	if (count_increased)
>   		folio_put(folio);
>   	return ret;

Re: [PATCH v3] mm/memory-failure: fix hugetlb_lock AA deadlock in get_huge_page_for_hwpoison

Posted by mawupeng 4 days, 15 hours ago


On 周三 2026-5-20 10:40, Kefeng Wang wrote:
> You should remove the v3 since this is the first version for mainline.

Thanks for reewing.

Sorry for my mistack, it will be removed next time.

> 
> On 5/20/2026 10:01 AM, Wupeng Ma wrote:
>> madvise(MADV_HWPOISON) can trigger a recursive spinlock self-deadlock
>> (AA deadlock) on hugetlb_lock due to a race with concurrent folio
>> unmapping.  The race scenario:
>>
>>    Thread 1 (madvise MADV_HWPOISON)          Thread 2 (unmap)
>>    -------------------------------          -----------------
>>    madvise_inject_error()
>>      get_user_pages_fast()  <- refcount++
>>      memory_failure(MF_COUNT_INCREASED)
>>        get_huge_page_for_hwpoison()
>>          spin_lock_irq(&hugetlb_lock)
>>          // refcount == 2 (gup + map)
>>          // MF_COUNT_INCREASED path:
>>          count_increased = true
>>                                             zap_pte_range()
>>                                               page_remove_rmap()
>>                                               put_page()  <- drops map ref
>>                                               // refcount: 2 -> 1
>>          hugetlb_update_hwpoison()
>>            -> MF_HUGETLB_FOLIO_PRE_POISONED
>>            -> goto out
>>          out:
>>            folio_put(folio)  <- drops gup ref
>>              // refcount: 1 -> 0
>>              free_huge_folio()
>>                spin_lock_irq(&hugetlb_lock)  <- AA DEADLOCK
>>
>> When Thread 2's put_page() drops the mapping reference while Thread 1
>> holds hugetlb_lock, the folio refcount drops to 1.  The subsequent
>> folio_put() at the out: label frees the folio, and free_huge_folio()
>> attempts to re-acquire the non-recursive hugetlb_lock on the same CPU,
>> resulting in an AA self-deadlock.
>>
>> The same deadlock can also occur on the folio_try_get() path: when a
>> migratable folio is found and folio_try_get() succeeds (refcount rises
>> to refcount+1), a concurrent unmap and a hugetlb_update_hwpoison()
>> returning pre-poisoned status will land at out: where folio_put() again
>> may free the folio under hugetlb_lock.
>>
>> Fix this by removing the hugetlb_lock wrapper from hugetlb.c and
>> moving the lock acquisition directly into get_huge_page_for_hwpoison()
>> (formerly __get_huge_page_for_hwpoison) in memory-failure.c.  Place
>> spin_unlock_irq() before folio_put() at the out: label so that the
>> folio is always released outside the lock, preventing any recursive
>> lock acquisition.  
> 
> 
> The following comment is not useful and correct for this version.

Thanks for reviewing. 

Will be removed in later version.

> 
> Remove the now-incorrect "Called from hugetlb code
>> with hugetlb_lock held" comment, and update the stale
>> __get_huge_page_for_hwpoison declarations in include/linux/mm.h.
>>
> 
> The change is lgtm,  Reviewed-by: Kefeng Wang <wangkefeng.wang@huawei.com>
> 
> 
>> Fixes: 405ce051236c ("mm/hwpoison: fix race between hugetlb free/demotion and memory_failure_hugetlb()")
>> Signed-off-by: Wupeng Ma <mawupeng1@huawei.com>
>> ---
>>   include/linux/hugetlb.h |  8 --------
>>   include/linux/mm.h      |  8 --------
>>   mm/hugetlb.c            | 11 -----------
>>   mm/memory-failure.c     |  8 ++++----
>>   4 files changed, 4 insertions(+), 31 deletions(-)
>>
>> diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
>> index 65910437be1ca..aa3eb42e0a01a 100644
>> --- a/include/linux/hugetlb.h
>> +++ b/include/linux/hugetlb.h
>> @@ -153,8 +153,6 @@ long hugetlb_unreserve_pages(struct inode *inode, long start, long end,
>>                           long freed);
>>   bool folio_isolate_hugetlb(struct folio *folio, struct list_head *list);
>>   int get_hwpoison_hugetlb_folio(struct folio *folio, bool *hugetlb, bool unpoison);
>> -int get_huge_page_for_hwpoison(unsigned long pfn, int flags,
>> -                bool *migratable_cleared);
>>   void folio_putback_hugetlb(struct folio *folio);
>>   void move_hugetlb_state(struct folio *old_folio, struct folio *new_folio, int reason);
>>   void hugetlb_fix_reserve_counts(struct inode *inode);
>> @@ -422,12 +420,6 @@ static inline int get_hwpoison_hugetlb_folio(struct folio *folio, bool *hugetlb,
>>       return 0;
>>   }
>>   -static inline int get_huge_page_for_hwpoison(unsigned long pfn, int flags,
>> -                    bool *migratable_cleared)
>> -{
>> -    return 0;
>> -}
>> -
>>   static inline void folio_putback_hugetlb(struct folio *folio)
>>   {
>>   }
>> diff --git a/include/linux/mm.h b/include/linux/mm.h
>> index abb4963c1f064..46e5936dabaa8 100644
>> --- a/include/linux/mm.h
>> +++ b/include/linux/mm.h
>> @@ -4602,8 +4602,6 @@ extern int soft_offline_page(unsigned long pfn, int flags);
>>    */
>>   extern const struct attribute_group memory_failure_attr_group;
>>   extern void memory_failure_queue(unsigned long pfn, int flags);
>> -extern int __get_huge_page_for_hwpoison(unsigned long pfn, int flags,
>> -                    bool *migratable_cleared);
>>   void num_poisoned_pages_inc(unsigned long pfn);
>>   void num_poisoned_pages_sub(unsigned long pfn, long i);
>>   #else
>> @@ -4611,12 +4609,6 @@ static inline void memory_failure_queue(unsigned long pfn, int flags)
>>   {
>>   }
>>   -static inline int __get_huge_page_for_hwpoison(unsigned long pfn, int flags,
>> -                    bool *migratable_cleared)
>> -{
>> -    return 0;
>> -}
>> -
>>   static inline void num_poisoned_pages_inc(unsigned long pfn)
>>   {
>>   }
>> diff --git a/mm/hugetlb.c b/mm/hugetlb.c
>> index 327eaa4074d39..4c99bb868ad08 100644
>> --- a/mm/hugetlb.c
>> +++ b/mm/hugetlb.c
>> @@ -7170,17 +7170,6 @@ int get_hwpoison_hugetlb_folio(struct folio *folio, bool *hugetlb, bool unpoison
>>       return ret;
>>   }
>>   -int get_huge_page_for_hwpoison(unsigned long pfn, int flags,
>> -                bool *migratable_cleared)
>> -{
>> -    int ret;
>> -
>> -    spin_lock_irq(&hugetlb_lock);
>> -    ret = __get_huge_page_for_hwpoison(pfn, flags, migratable_cleared);
>> -    spin_unlock_irq(&hugetlb_lock);
>> -    return ret;
>> -}
>> -
>>   /**
>>    * folio_putback_hugetlb - unisolate a hugetlb folio
>>    * @folio: the isolated hugetlb folio
>> diff --git a/mm/memory-failure.c b/mm/memory-failure.c
>> index ee42d43613097..28522180cf7f8 100644
>> --- a/mm/memory-failure.c
>> +++ b/mm/memory-failure.c
>> @@ -1966,10 +1966,7 @@ void folio_clear_hugetlb_hwpoison(struct folio *folio)
>>       folio_free_raw_hwp(folio, true);
>>   }
>>   -/*
>> - * Called from hugetlb code with hugetlb_lock held.
>> - */
>> -int __get_huge_page_for_hwpoison(unsigned long pfn, int flags,
>> +static int get_huge_page_for_hwpoison(unsigned long pfn, int flags,
>>                    bool *migratable_cleared)
>>   {
>>       struct page *page = pfn_to_page(pfn);
>> @@ -1977,6 +1974,7 @@ int __get_huge_page_for_hwpoison(unsigned long pfn, int flags,
>>       bool count_increased = false;
>>       int ret, rc;
>>   +    spin_lock_irq(&hugetlb_lock);
>>       if (!folio_test_hugetlb(folio)) {
>>           ret = MF_HUGETLB_NON_HUGEPAGE;
>>           goto out;
>> @@ -2013,8 +2011,10 @@ int __get_huge_page_for_hwpoison(unsigned long pfn, int flags,
>>           *migratable_cleared = true;
>>       }
>>   +    spin_unlock_irq(&hugetlb_lock);
>>       return ret;
>>   out:
>> +    spin_unlock_irq(&hugetlb_lock);
>>       if (count_increased)
>>           folio_put(folio);
>>       return ret;
>