include/linux/hugetlb.h | 8 -------- include/linux/mm.h | 8 -------- mm/hugetlb.c | 11 ----------- mm/memory-failure.c | 8 ++++---- 4 files changed, 4 insertions(+), 31 deletions(-)
madvise(MADV_HWPOISON) can trigger a recursive spinlock self-deadlock
(AA deadlock) on hugetlb_lock due to a race with concurrent folio
unmapping. The race scenario:
Thread 1 (madvise MADV_HWPOISON) Thread 2 (unmap)
------------------------------- -----------------
madvise_inject_error()
get_user_pages_fast() <- refcount++
memory_failure(MF_COUNT_INCREASED)
get_huge_page_for_hwpoison()
spin_lock_irq(&hugetlb_lock)
// refcount == 2 (gup + map)
// MF_COUNT_INCREASED path:
count_increased = true
zap_pte_range()
page_remove_rmap()
put_page() <- drops map ref
// refcount: 2 -> 1
hugetlb_update_hwpoison()
-> MF_HUGETLB_FOLIO_PRE_POISONED
-> goto out
out:
folio_put(folio) <- drops gup ref
// refcount: 1 -> 0
free_huge_folio()
spin_lock_irq(&hugetlb_lock) <- AA DEADLOCK
When Thread 2's put_page() drops the mapping reference while Thread 1
holds hugetlb_lock, the folio refcount drops to 1. The subsequent
folio_put() at the out: label frees the folio, and free_huge_folio()
attempts to re-acquire the non-recursive hugetlb_lock on the same CPU,
resulting in an AA self-deadlock.
The same deadlock can also occur on the folio_try_get() path: when a
migratable folio is found and folio_try_get() succeeds (refcount rises
to refcount+1), a concurrent unmap and a hugetlb_update_hwpoison()
returning pre-poisoned status will land at out: where folio_put() again
may free the folio under hugetlb_lock.
Fix this by removing the hugetlb_lock wrapper from hugetlb.c and
moving the lock acquisition directly into get_huge_page_for_hwpoison()
(formerly __get_huge_page_for_hwpoison) in memory-failure.c. Place
spin_unlock_irq() before folio_put() at the out: label so that the
folio is always released outside the lock, preventing any recursive
lock acquisition. Remove the now-incorrect "Called from hugetlb code
with hugetlb_lock held" comment, and update the stale
__get_huge_page_for_hwpoison declarations in include/linux/mm.h.
Fixes: 405ce051236c ("mm/hwpoison: fix race between hugetlb free/demotion and memory_failure_hugetlb()")
Signed-off-by: Wupeng Ma <mawupeng1@huawei.com>
---
include/linux/hugetlb.h | 8 --------
include/linux/mm.h | 8 --------
mm/hugetlb.c | 11 -----------
mm/memory-failure.c | 8 ++++----
4 files changed, 4 insertions(+), 31 deletions(-)
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 65910437be1ca..aa3eb42e0a01a 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -153,8 +153,6 @@ long hugetlb_unreserve_pages(struct inode *inode, long start, long end,
long freed);
bool folio_isolate_hugetlb(struct folio *folio, struct list_head *list);
int get_hwpoison_hugetlb_folio(struct folio *folio, bool *hugetlb, bool unpoison);
-int get_huge_page_for_hwpoison(unsigned long pfn, int flags,
- bool *migratable_cleared);
void folio_putback_hugetlb(struct folio *folio);
void move_hugetlb_state(struct folio *old_folio, struct folio *new_folio, int reason);
void hugetlb_fix_reserve_counts(struct inode *inode);
@@ -422,12 +420,6 @@ static inline int get_hwpoison_hugetlb_folio(struct folio *folio, bool *hugetlb,
return 0;
}
-static inline int get_huge_page_for_hwpoison(unsigned long pfn, int flags,
- bool *migratable_cleared)
-{
- return 0;
-}
-
static inline void folio_putback_hugetlb(struct folio *folio)
{
}
diff --git a/include/linux/mm.h b/include/linux/mm.h
index abb4963c1f064..46e5936dabaa8 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -4602,8 +4602,6 @@ extern int soft_offline_page(unsigned long pfn, int flags);
*/
extern const struct attribute_group memory_failure_attr_group;
extern void memory_failure_queue(unsigned long pfn, int flags);
-extern int __get_huge_page_for_hwpoison(unsigned long pfn, int flags,
- bool *migratable_cleared);
void num_poisoned_pages_inc(unsigned long pfn);
void num_poisoned_pages_sub(unsigned long pfn, long i);
#else
@@ -4611,12 +4609,6 @@ static inline void memory_failure_queue(unsigned long pfn, int flags)
{
}
-static inline int __get_huge_page_for_hwpoison(unsigned long pfn, int flags,
- bool *migratable_cleared)
-{
- return 0;
-}
-
static inline void num_poisoned_pages_inc(unsigned long pfn)
{
}
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 327eaa4074d39..4c99bb868ad08 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -7170,17 +7170,6 @@ int get_hwpoison_hugetlb_folio(struct folio *folio, bool *hugetlb, bool unpoison
return ret;
}
-int get_huge_page_for_hwpoison(unsigned long pfn, int flags,
- bool *migratable_cleared)
-{
- int ret;
-
- spin_lock_irq(&hugetlb_lock);
- ret = __get_huge_page_for_hwpoison(pfn, flags, migratable_cleared);
- spin_unlock_irq(&hugetlb_lock);
- return ret;
-}
-
/**
* folio_putback_hugetlb - unisolate a hugetlb folio
* @folio: the isolated hugetlb folio
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index ee42d43613097..28522180cf7f8 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -1966,10 +1966,7 @@ void folio_clear_hugetlb_hwpoison(struct folio *folio)
folio_free_raw_hwp(folio, true);
}
-/*
- * Called from hugetlb code with hugetlb_lock held.
- */
-int __get_huge_page_for_hwpoison(unsigned long pfn, int flags,
+static int get_huge_page_for_hwpoison(unsigned long pfn, int flags,
bool *migratable_cleared)
{
struct page *page = pfn_to_page(pfn);
@@ -1977,6 +1974,7 @@ int __get_huge_page_for_hwpoison(unsigned long pfn, int flags,
bool count_increased = false;
int ret, rc;
+ spin_lock_irq(&hugetlb_lock);
if (!folio_test_hugetlb(folio)) {
ret = MF_HUGETLB_NON_HUGEPAGE;
goto out;
@@ -2013,8 +2011,10 @@ int __get_huge_page_for_hwpoison(unsigned long pfn, int flags,
*migratable_cleared = true;
}
+ spin_unlock_irq(&hugetlb_lock);
return ret;
out:
+ spin_unlock_irq(&hugetlb_lock);
if (count_increased)
folio_put(folio);
return ret;
--
2.43.0
On Wed, May 20, 2026 at 10:01:28AM +0800, Wupeng Ma wrote: > madvise(MADV_HWPOISON) can trigger a recursive spinlock self-deadlock > (AA deadlock) on hugetlb_lock due to a race with concurrent folio > unmapping. The race scenario: > > Thread 1 (madvise MADV_HWPOISON) Thread 2 (unmap) > ------------------------------- ----------------- > madvise_inject_error() > get_user_pages_fast() <- refcount++ > memory_failure(MF_COUNT_INCREASED) > get_huge_page_for_hwpoison() > spin_lock_irq(&hugetlb_lock) > // refcount == 2 (gup + map) > // MF_COUNT_INCREASED path: > count_increased = true > zap_pte_range() > page_remove_rmap() > put_page() <- drops map ref > // refcount: 2 -> 1 Ok, bear with me. I am not saying the change itself is wrong (maybe it is not), but how we ended up in zap_pte_range() for a hugetlb folio? The stacktrace does not seem to have much sense? -- Oscar Salvador SUSE Labs
On 周三 2026-5-20 16:13, Oscar Salvador (SUSE) wrote:
> On Wed, May 20, 2026 at 10:01:28AM +0800, Wupeng Ma wrote:
>> madvise(MADV_HWPOISON) can trigger a recursive spinlock self-deadlock
>> (AA deadlock) on hugetlb_lock due to a race with concurrent folio
>> unmapping. The race scenario:
>>
>> Thread 1 (madvise MADV_HWPOISON) Thread 2 (unmap)
>> ------------------------------- -----------------
>> madvise_inject_error()
>> get_user_pages_fast() <- refcount++
>> memory_failure(MF_COUNT_INCREASED)
>> get_huge_page_for_hwpoison()
>> spin_lock_irq(&hugetlb_lock)
>> // refcount == 2 (gup + map)
>> // MF_COUNT_INCREASED path:
>> count_increased = true
>> zap_pte_range()
>> page_remove_rmap()
>> put_page() <- drops map ref
>> // refcount: 2 -> 1
>
> Ok, bear with me.
> I am not saying the change itself is wrong (maybe it is not), but how we ended
> up in zap_pte_range() for a hugetlb folio?
> The stacktrace does not seem to have much sense?
You are correct. The refcount dropping logic in the `unmap` path was indeed flawed.
This issue was originally uncovered by fuzzing. Based on the initial stack trace,
we diagnosed it as a recursive locking (AA) deadlock on `hugetlb_lock`.
We initially suspected that `unmap` had prematurely released the folio reference
count, triggering the free path. However, after a thorough analysis of the refcount
state machine and the actual execution context, we confirmed that this hypothesis
is impossible. The root cause lies elsewhere in the locking hierarchy, and we are
currently tracing the exact call path that leads to the nested `hugetlb_lock`
acquisition.
The deadlock can be triggered by injecting hardware poison errors on a hugetlb
page while concurrent unmapping activity occurs. The following minimal userspace
test case demonstrates the race condition by spawning multiple processes to
widen the timing window for the lock contention.
/*
* Repro: hugetlb_lock AA deadlock via consecutive MADV_HWPOISON
*/
main():
// 1. Map hugetlb page
mmap(0x20000000, 0xa000, PROT_READ|PROT_EXEC,
MAP_LOCKED|MAP_HUGETLB|MAP_FIXED, -1, 0)
// 2. Inject hwpoison twice on same page
madvise(0x20000000, 0x4000, MADV_HWPOISON)
madvise(0x20000000, 0x4000, MADV_HWPOISON)
// 3. Repeat in a fork loop to widen race window
Here is the detailed AA deadlock stack with lockdep enabled
============================================
WARNING: possible recursive locking detected
7.0.0-g0eba4942ce53-dirty #335 Not tainted
--------------------------------------------
repro/742 is trying to acquire lock:
ffff800083702958 (hugetlb_lock){....}-{3:3}, at: free_huge_folio+0x194/0x3c0
but task is already holding lock:
ffff800083702958 (hugetlb_lock){....}-{3:3}, at: get_huge_page_for_hwpoison+0x38/
other info that might help us debug this:
Possible unsafe locking scenario:
CPU0
----
lock(hugetlb_lock);
lock(hugetlb_lock);
*** DEADLOCK ***
May be due to missing lock nesting notation
2 locks held by repro/742:
#0: ffff800086f942b0 (mf_mutex){+.+.}-{4:4}, at: memory_failure+0x6c/0xdd8
#1: ffff800083702958 (hugetlb_lock){....}-{3:3}, at: get_huge_page_for_hwpoison
stack backtrace:
CPU: 2 UID: 0 PID: 742 Comm: repro Not tainted 7.0.0-g0eba4942ce53-dirty #335
Hardware name: QEMU QEMU Virtual Machine, BIOS 0.0.0 02/06/2015
Call trace:
__lock_acquire+0xe7c/0x1e38
lock_acquire+0x2b8/0x3f8
_raw_spin_lock_irqsave+0x74/0xd8
free_huge_folio+0x194/0x3c0
__folio_put+0x124/0x130
__get_huge_page_for_hwpoison+0x138/0x358
get_huge_page_for_hwpoison+0x48/0x78
memory_failure+0xb4/0xdd8
madvise_do_behavior+0x39c/0x660
do_madvise+0xe4/0x158
__arm64_sys_madvise+0x2c/0x48
>
>
>
>
On Wed, May 20, 2026 at 07:24:28PM +0800, mawupeng wrote:
> You are correct. The refcount dropping logic in the `unmap` path was indeed flawed.
> This issue was originally uncovered by fuzzing. Based on the initial stack trace,
> we diagnosed it as a recursive locking (AA) deadlock on `hugetlb_lock`.
>
> We initially suspected that `unmap` had prematurely released the folio reference
> count, triggering the free path. However, after a thorough analysis of the refcount
> state machine and the actual execution context, we confirmed that this hypothesis
> is impossible. The root cause lies elsewhere in the locking hierarchy, and we are
> currently tracing the exact call path that leads to the nested `hugetlb_lock`
> acquisition.
>
> The deadlock can be triggered by injecting hardware poison errors on a hugetlb
> page while concurrent unmapping activity occurs. The following minimal userspace
> test case demonstrates the race condition by spawning multiple processes to
> widen the timing window for the lock contention.
After staring at it, it is obvious the code is wrong.
We __should__ not be calling folio_put under the lock, as recursion will
happen if we are the last user holding a reference.
Thinking about it, I cannot think of a way we would need nesting here.
Anyway, this is a genuine bug, so thanks for that, but it all got very
confusing because of the traces pointing to wwrong places.
The thing is quite simple:
- We start with the assumption that a hugetlb folio is mapped to
userspace and that madvise
thread#0 thread#1
madvise(folio, MADV_HWPOISON) (we poisoned the page)
madvise(folio, MADV_HWPOISON) (second call)
unmap(folio)
try_memory_failure_hugetlb
get_huge_page_for_hwpoison (takes lock)
__get_huge_page_for_hwpoison
hugetlb_update_hwpoison
- we get MF_HUGETLB_FOLIO_PRE_POISONED
we jump to out which does
folio_put
free_huge_page (takes lock.. yaiks)
So yes, the fix is to have the folio_put happening not within the lock.
Please, send the patch with the right changelog (and no version) and I will ack it.
--
Oscar Salvador
SUSE Labs
On 周四 2026-5-21 16:48, Oscar Salvador (SUSE) wrote: > On Wed, May 20, 2026 at 07:24:28PM +0800, mawupeng wrote: > >> You are correct. The refcount dropping logic in the `unmap` path was indeed flawed. >> This issue was originally uncovered by fuzzing. Based on the initial stack trace, >> we diagnosed it as a recursive locking (AA) deadlock on `hugetlb_lock`. >> >> We initially suspected that `unmap` had prematurely released the folio reference >> count, triggering the free path. However, after a thorough analysis of the refcount >> state machine and the actual execution context, we confirmed that this hypothesis >> is impossible. The root cause lies elsewhere in the locking hierarchy, and we are >> currently tracing the exact call path that leads to the nested `hugetlb_lock` >> acquisition. >> >> The deadlock can be triggered by injecting hardware poison errors on a hugetlb >> page while concurrent unmapping activity occurs. The following minimal userspace >> test case demonstrates the race condition by spawning multiple processes to >> widen the timing window for the lock contention. > > > After staring at it, it is obvious the code is wrong. > We __should__ not be calling folio_put under the lock, as recursion will > happen if we are the last user holding a reference. > Thinking about it, I cannot think of a way we would need nesting here. > > Anyway, this is a genuine bug, so thanks for that, but it all got very > confusing because of the traces pointing to wwrong places. > The thing is quite simple: > > - We start with the assumption that a hugetlb folio is mapped to > userspace and that madvise > > thread#0 thread#1 > madvise(folio, MADV_HWPOISON) (we poisoned the page) > madvise(folio, MADV_HWPOISON) (second call) > unmap(folio) > try_memory_failure_hugetlb > get_huge_page_for_hwpoison (takes lock) > __get_huge_page_for_hwpoison > hugetlb_update_hwpoison > - we get MF_HUGETLB_FOLIO_PRE_POISONED > we jump to out which does > folio_put > free_huge_page (takes lock.. yaiks) > > > So yes, the fix is to have the folio_put happening not within the lock. > > Please, send the patch with the right changelog (and no version) and I will ack it. Thanks for reviewing. I will send new version with right changelog. > > >
On 5/20/26 10:13, Oscar Salvador (SUSE) wrote: > On Wed, May 20, 2026 at 10:01:28AM +0800, Wupeng Ma wrote: >> madvise(MADV_HWPOISON) can trigger a recursive spinlock self-deadlock >> (AA deadlock) on hugetlb_lock due to a race with concurrent folio >> unmapping. The race scenario: >> >> Thread 1 (madvise MADV_HWPOISON) Thread 2 (unmap) >> ------------------------------- ----------------- >> madvise_inject_error() >> get_user_pages_fast() <- refcount++ >> memory_failure(MF_COUNT_INCREASED) >> get_huge_page_for_hwpoison() >> spin_lock_irq(&hugetlb_lock) >> // refcount == 2 (gup + map) >> // MF_COUNT_INCREASED path: >> count_increased = true >> zap_pte_range() >> page_remove_rmap() >> put_page() <- drops map ref >> // refcount: 2 -> 1 > > Ok, bear with me. > I am not saying the change itself is wrong (maybe it is not), but how we ended > up in zap_pte_range() for a hugetlb folio? > The stacktrace does not seem to have much sense? Right, that does not make sense. Not even page_remove_rmap() makes sense, because that function is long gone. Undisclosed usage of shitty AI? -- Cheers, David
You should remove the v3 since this is the first version for mainline.
On 5/20/2026 10:01 AM, Wupeng Ma wrote:
> madvise(MADV_HWPOISON) can trigger a recursive spinlock self-deadlock
> (AA deadlock) on hugetlb_lock due to a race with concurrent folio
> unmapping. The race scenario:
>
> Thread 1 (madvise MADV_HWPOISON) Thread 2 (unmap)
> ------------------------------- -----------------
> madvise_inject_error()
> get_user_pages_fast() <- refcount++
> memory_failure(MF_COUNT_INCREASED)
> get_huge_page_for_hwpoison()
> spin_lock_irq(&hugetlb_lock)
> // refcount == 2 (gup + map)
> // MF_COUNT_INCREASED path:
> count_increased = true
> zap_pte_range()
> page_remove_rmap()
> put_page() <- drops map ref
> // refcount: 2 -> 1
> hugetlb_update_hwpoison()
> -> MF_HUGETLB_FOLIO_PRE_POISONED
> -> goto out
> out:
> folio_put(folio) <- drops gup ref
> // refcount: 1 -> 0
> free_huge_folio()
> spin_lock_irq(&hugetlb_lock) <- AA DEADLOCK
>
> When Thread 2's put_page() drops the mapping reference while Thread 1
> holds hugetlb_lock, the folio refcount drops to 1. The subsequent
> folio_put() at the out: label frees the folio, and free_huge_folio()
> attempts to re-acquire the non-recursive hugetlb_lock on the same CPU,
> resulting in an AA self-deadlock.
>
> The same deadlock can also occur on the folio_try_get() path: when a
> migratable folio is found and folio_try_get() succeeds (refcount rises
> to refcount+1), a concurrent unmap and a hugetlb_update_hwpoison()
> returning pre-poisoned status will land at out: where folio_put() again
> may free the folio under hugetlb_lock.
>
> Fix this by removing the hugetlb_lock wrapper from hugetlb.c and
> moving the lock acquisition directly into get_huge_page_for_hwpoison()
> (formerly __get_huge_page_for_hwpoison) in memory-failure.c. Place
> spin_unlock_irq() before folio_put() at the out: label so that the
> folio is always released outside the lock, preventing any recursive
> lock acquisition.
The following comment is not useful and correct for this version.
Remove the now-incorrect "Called from hugetlb code
> with hugetlb_lock held" comment, and update the stale
> __get_huge_page_for_hwpoison declarations in include/linux/mm.h.
>
The change is lgtm, Reviewed-by: Kefeng Wang <wangkefeng.wang@huawei.com>
> Fixes: 405ce051236c ("mm/hwpoison: fix race between hugetlb free/demotion and memory_failure_hugetlb()")
> Signed-off-by: Wupeng Ma <mawupeng1@huawei.com>
> ---
> include/linux/hugetlb.h | 8 --------
> include/linux/mm.h | 8 --------
> mm/hugetlb.c | 11 -----------
> mm/memory-failure.c | 8 ++++----
> 4 files changed, 4 insertions(+), 31 deletions(-)
>
> diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
> index 65910437be1ca..aa3eb42e0a01a 100644
> --- a/include/linux/hugetlb.h
> +++ b/include/linux/hugetlb.h
> @@ -153,8 +153,6 @@ long hugetlb_unreserve_pages(struct inode *inode, long start, long end,
> long freed);
> bool folio_isolate_hugetlb(struct folio *folio, struct list_head *list);
> int get_hwpoison_hugetlb_folio(struct folio *folio, bool *hugetlb, bool unpoison);
> -int get_huge_page_for_hwpoison(unsigned long pfn, int flags,
> - bool *migratable_cleared);
> void folio_putback_hugetlb(struct folio *folio);
> void move_hugetlb_state(struct folio *old_folio, struct folio *new_folio, int reason);
> void hugetlb_fix_reserve_counts(struct inode *inode);
> @@ -422,12 +420,6 @@ static inline int get_hwpoison_hugetlb_folio(struct folio *folio, bool *hugetlb,
> return 0;
> }
>
> -static inline int get_huge_page_for_hwpoison(unsigned long pfn, int flags,
> - bool *migratable_cleared)
> -{
> - return 0;
> -}
> -
> static inline void folio_putback_hugetlb(struct folio *folio)
> {
> }
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index abb4963c1f064..46e5936dabaa8 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -4602,8 +4602,6 @@ extern int soft_offline_page(unsigned long pfn, int flags);
> */
> extern const struct attribute_group memory_failure_attr_group;
> extern void memory_failure_queue(unsigned long pfn, int flags);
> -extern int __get_huge_page_for_hwpoison(unsigned long pfn, int flags,
> - bool *migratable_cleared);
> void num_poisoned_pages_inc(unsigned long pfn);
> void num_poisoned_pages_sub(unsigned long pfn, long i);
> #else
> @@ -4611,12 +4609,6 @@ static inline void memory_failure_queue(unsigned long pfn, int flags)
> {
> }
>
> -static inline int __get_huge_page_for_hwpoison(unsigned long pfn, int flags,
> - bool *migratable_cleared)
> -{
> - return 0;
> -}
> -
> static inline void num_poisoned_pages_inc(unsigned long pfn)
> {
> }
> diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> index 327eaa4074d39..4c99bb868ad08 100644
> --- a/mm/hugetlb.c
> +++ b/mm/hugetlb.c
> @@ -7170,17 +7170,6 @@ int get_hwpoison_hugetlb_folio(struct folio *folio, bool *hugetlb, bool unpoison
> return ret;
> }
>
> -int get_huge_page_for_hwpoison(unsigned long pfn, int flags,
> - bool *migratable_cleared)
> -{
> - int ret;
> -
> - spin_lock_irq(&hugetlb_lock);
> - ret = __get_huge_page_for_hwpoison(pfn, flags, migratable_cleared);
> - spin_unlock_irq(&hugetlb_lock);
> - return ret;
> -}
> -
> /**
> * folio_putback_hugetlb - unisolate a hugetlb folio
> * @folio: the isolated hugetlb folio
> diff --git a/mm/memory-failure.c b/mm/memory-failure.c
> index ee42d43613097..28522180cf7f8 100644
> --- a/mm/memory-failure.c
> +++ b/mm/memory-failure.c
> @@ -1966,10 +1966,7 @@ void folio_clear_hugetlb_hwpoison(struct folio *folio)
> folio_free_raw_hwp(folio, true);
> }
>
> -/*
> - * Called from hugetlb code with hugetlb_lock held.
> - */
> -int __get_huge_page_for_hwpoison(unsigned long pfn, int flags,
> +static int get_huge_page_for_hwpoison(unsigned long pfn, int flags,
> bool *migratable_cleared)
> {
> struct page *page = pfn_to_page(pfn);
> @@ -1977,6 +1974,7 @@ int __get_huge_page_for_hwpoison(unsigned long pfn, int flags,
> bool count_increased = false;
> int ret, rc;
>
> + spin_lock_irq(&hugetlb_lock);
> if (!folio_test_hugetlb(folio)) {
> ret = MF_HUGETLB_NON_HUGEPAGE;
> goto out;
> @@ -2013,8 +2011,10 @@ int __get_huge_page_for_hwpoison(unsigned long pfn, int flags,
> *migratable_cleared = true;
> }
>
> + spin_unlock_irq(&hugetlb_lock);
> return ret;
> out:
> + spin_unlock_irq(&hugetlb_lock);
> if (count_increased)
> folio_put(folio);
> return ret;
On 周三 2026-5-20 10:40, Kefeng Wang wrote:
> You should remove the v3 since this is the first version for mainline.
Thanks for reewing.
Sorry for my mistack, it will be removed next time.
>
> On 5/20/2026 10:01 AM, Wupeng Ma wrote:
>> madvise(MADV_HWPOISON) can trigger a recursive spinlock self-deadlock
>> (AA deadlock) on hugetlb_lock due to a race with concurrent folio
>> unmapping. The race scenario:
>>
>> Thread 1 (madvise MADV_HWPOISON) Thread 2 (unmap)
>> ------------------------------- -----------------
>> madvise_inject_error()
>> get_user_pages_fast() <- refcount++
>> memory_failure(MF_COUNT_INCREASED)
>> get_huge_page_for_hwpoison()
>> spin_lock_irq(&hugetlb_lock)
>> // refcount == 2 (gup + map)
>> // MF_COUNT_INCREASED path:
>> count_increased = true
>> zap_pte_range()
>> page_remove_rmap()
>> put_page() <- drops map ref
>> // refcount: 2 -> 1
>> hugetlb_update_hwpoison()
>> -> MF_HUGETLB_FOLIO_PRE_POISONED
>> -> goto out
>> out:
>> folio_put(folio) <- drops gup ref
>> // refcount: 1 -> 0
>> free_huge_folio()
>> spin_lock_irq(&hugetlb_lock) <- AA DEADLOCK
>>
>> When Thread 2's put_page() drops the mapping reference while Thread 1
>> holds hugetlb_lock, the folio refcount drops to 1. The subsequent
>> folio_put() at the out: label frees the folio, and free_huge_folio()
>> attempts to re-acquire the non-recursive hugetlb_lock on the same CPU,
>> resulting in an AA self-deadlock.
>>
>> The same deadlock can also occur on the folio_try_get() path: when a
>> migratable folio is found and folio_try_get() succeeds (refcount rises
>> to refcount+1), a concurrent unmap and a hugetlb_update_hwpoison()
>> returning pre-poisoned status will land at out: where folio_put() again
>> may free the folio under hugetlb_lock.
>>
>> Fix this by removing the hugetlb_lock wrapper from hugetlb.c and
>> moving the lock acquisition directly into get_huge_page_for_hwpoison()
>> (formerly __get_huge_page_for_hwpoison) in memory-failure.c. Place
>> spin_unlock_irq() before folio_put() at the out: label so that the
>> folio is always released outside the lock, preventing any recursive
>> lock acquisition.
>
>
> The following comment is not useful and correct for this version.
Thanks for reviewing.
Will be removed in later version.
>
> Remove the now-incorrect "Called from hugetlb code
>> with hugetlb_lock held" comment, and update the stale
>> __get_huge_page_for_hwpoison declarations in include/linux/mm.h.
>>
>
> The change is lgtm, Reviewed-by: Kefeng Wang <wangkefeng.wang@huawei.com>
>
>
>> Fixes: 405ce051236c ("mm/hwpoison: fix race between hugetlb free/demotion and memory_failure_hugetlb()")
>> Signed-off-by: Wupeng Ma <mawupeng1@huawei.com>
>> ---
>> include/linux/hugetlb.h | 8 --------
>> include/linux/mm.h | 8 --------
>> mm/hugetlb.c | 11 -----------
>> mm/memory-failure.c | 8 ++++----
>> 4 files changed, 4 insertions(+), 31 deletions(-)
>>
>> diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
>> index 65910437be1ca..aa3eb42e0a01a 100644
>> --- a/include/linux/hugetlb.h
>> +++ b/include/linux/hugetlb.h
>> @@ -153,8 +153,6 @@ long hugetlb_unreserve_pages(struct inode *inode, long start, long end,
>> long freed);
>> bool folio_isolate_hugetlb(struct folio *folio, struct list_head *list);
>> int get_hwpoison_hugetlb_folio(struct folio *folio, bool *hugetlb, bool unpoison);
>> -int get_huge_page_for_hwpoison(unsigned long pfn, int flags,
>> - bool *migratable_cleared);
>> void folio_putback_hugetlb(struct folio *folio);
>> void move_hugetlb_state(struct folio *old_folio, struct folio *new_folio, int reason);
>> void hugetlb_fix_reserve_counts(struct inode *inode);
>> @@ -422,12 +420,6 @@ static inline int get_hwpoison_hugetlb_folio(struct folio *folio, bool *hugetlb,
>> return 0;
>> }
>> -static inline int get_huge_page_for_hwpoison(unsigned long pfn, int flags,
>> - bool *migratable_cleared)
>> -{
>> - return 0;
>> -}
>> -
>> static inline void folio_putback_hugetlb(struct folio *folio)
>> {
>> }
>> diff --git a/include/linux/mm.h b/include/linux/mm.h
>> index abb4963c1f064..46e5936dabaa8 100644
>> --- a/include/linux/mm.h
>> +++ b/include/linux/mm.h
>> @@ -4602,8 +4602,6 @@ extern int soft_offline_page(unsigned long pfn, int flags);
>> */
>> extern const struct attribute_group memory_failure_attr_group;
>> extern void memory_failure_queue(unsigned long pfn, int flags);
>> -extern int __get_huge_page_for_hwpoison(unsigned long pfn, int flags,
>> - bool *migratable_cleared);
>> void num_poisoned_pages_inc(unsigned long pfn);
>> void num_poisoned_pages_sub(unsigned long pfn, long i);
>> #else
>> @@ -4611,12 +4609,6 @@ static inline void memory_failure_queue(unsigned long pfn, int flags)
>> {
>> }
>> -static inline int __get_huge_page_for_hwpoison(unsigned long pfn, int flags,
>> - bool *migratable_cleared)
>> -{
>> - return 0;
>> -}
>> -
>> static inline void num_poisoned_pages_inc(unsigned long pfn)
>> {
>> }
>> diff --git a/mm/hugetlb.c b/mm/hugetlb.c
>> index 327eaa4074d39..4c99bb868ad08 100644
>> --- a/mm/hugetlb.c
>> +++ b/mm/hugetlb.c
>> @@ -7170,17 +7170,6 @@ int get_hwpoison_hugetlb_folio(struct folio *folio, bool *hugetlb, bool unpoison
>> return ret;
>> }
>> -int get_huge_page_for_hwpoison(unsigned long pfn, int flags,
>> - bool *migratable_cleared)
>> -{
>> - int ret;
>> -
>> - spin_lock_irq(&hugetlb_lock);
>> - ret = __get_huge_page_for_hwpoison(pfn, flags, migratable_cleared);
>> - spin_unlock_irq(&hugetlb_lock);
>> - return ret;
>> -}
>> -
>> /**
>> * folio_putback_hugetlb - unisolate a hugetlb folio
>> * @folio: the isolated hugetlb folio
>> diff --git a/mm/memory-failure.c b/mm/memory-failure.c
>> index ee42d43613097..28522180cf7f8 100644
>> --- a/mm/memory-failure.c
>> +++ b/mm/memory-failure.c
>> @@ -1966,10 +1966,7 @@ void folio_clear_hugetlb_hwpoison(struct folio *folio)
>> folio_free_raw_hwp(folio, true);
>> }
>> -/*
>> - * Called from hugetlb code with hugetlb_lock held.
>> - */
>> -int __get_huge_page_for_hwpoison(unsigned long pfn, int flags,
>> +static int get_huge_page_for_hwpoison(unsigned long pfn, int flags,
>> bool *migratable_cleared)
>> {
>> struct page *page = pfn_to_page(pfn);
>> @@ -1977,6 +1974,7 @@ int __get_huge_page_for_hwpoison(unsigned long pfn, int flags,
>> bool count_increased = false;
>> int ret, rc;
>> + spin_lock_irq(&hugetlb_lock);
>> if (!folio_test_hugetlb(folio)) {
>> ret = MF_HUGETLB_NON_HUGEPAGE;
>> goto out;
>> @@ -2013,8 +2011,10 @@ int __get_huge_page_for_hwpoison(unsigned long pfn, int flags,
>> *migratable_cleared = true;
>> }
>> + spin_unlock_irq(&hugetlb_lock);
>> return ret;
>> out:
>> + spin_unlock_irq(&hugetlb_lock);
>> if (count_increased)
>> folio_put(folio);
>> return ret;
>
© 2016 - 2026 Red Hat, Inc.