[PATCH] mm/memory-failure: fix missing ->mf_stats count in hugetlb poison

Jane Chu posted 1 patch 1 day, 15 hours ago
include/linux/hugetlb.h |  4 ++--
include/linux/mm.h      |  4 ++--
mm/hugetlb.c            |  4 ++--
mm/memory-failure.c     | 22 +++++++++++++---------
4 files changed, 19 insertions(+), 15 deletions(-)
[PATCH] mm/memory-failure: fix missing ->mf_stats count in hugetlb poison
Posted by Jane Chu 1 day, 15 hours ago
When a newly poisoned subpage ends up in an already poisoned hugetlb
folio, 'num_poisoned_pages' is incremented, but the per node ->mf_stats
is not. Fix the inconsistency by designating action_result() to update
them both.

Fixes: 18f41fa616ee4 ("mm: memory-failure: bump memory failure stats to pglist_data")
Cc: <stable@vger.kernel.org>
Signed-off-by: Jane Chu <jane.chu@oracle.com>
---
 include/linux/hugetlb.h |  4 ++--
 include/linux/mm.h      |  4 ++--
 mm/hugetlb.c            |  4 ++--
 mm/memory-failure.c     | 22 +++++++++++++---------
 4 files changed, 19 insertions(+), 15 deletions(-)

diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 8e63e46b8e1f..2e6690c9df96 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -157,7 +157,7 @@ long hugetlb_unreserve_pages(struct inode *inode, long start, long end,
 bool folio_isolate_hugetlb(struct folio *folio, struct list_head *list);
 int get_hwpoison_hugetlb_folio(struct folio *folio, bool *hugetlb, bool unpoison);
 int get_huge_page_for_hwpoison(unsigned long pfn, int flags,
-				bool *migratable_cleared);
+				bool *migratable_cleared, bool *samepg);
 void folio_putback_hugetlb(struct folio *folio);
 void move_hugetlb_state(struct folio *old_folio, struct folio *new_folio, int reason);
 void hugetlb_fix_reserve_counts(struct inode *inode);
@@ -420,7 +420,7 @@ static inline int get_hwpoison_hugetlb_folio(struct folio *folio, bool *hugetlb,
 }
 
 static inline int get_huge_page_for_hwpoison(unsigned long pfn, int flags,
-					bool *migratable_cleared)
+					bool *migratable_cleared, bool *samepg)
 {
 	return 0;
 }
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 7c79b3369b82..68b1812e9c0a 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -4036,7 +4036,7 @@ extern int soft_offline_page(unsigned long pfn, int flags);
 extern const struct attribute_group memory_failure_attr_group;
 extern void memory_failure_queue(unsigned long pfn, int flags);
 extern int __get_huge_page_for_hwpoison(unsigned long pfn, int flags,
-					bool *migratable_cleared);
+					bool *migratable_cleared, bool *samepg);
 void num_poisoned_pages_inc(unsigned long pfn);
 void num_poisoned_pages_sub(unsigned long pfn, long i);
 #else
@@ -4045,7 +4045,7 @@ static inline void memory_failure_queue(unsigned long pfn, int flags)
 }
 
 static inline int __get_huge_page_for_hwpoison(unsigned long pfn, int flags,
-					bool *migratable_cleared)
+					bool *migratable_cleared, bool *samepg)
 {
 	return 0;
 }
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 0455119716ec..f78562a578e5 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -7818,12 +7818,12 @@ int get_hwpoison_hugetlb_folio(struct folio *folio, bool *hugetlb, bool unpoison
 }
 
 int get_huge_page_for_hwpoison(unsigned long pfn, int flags,
-				bool *migratable_cleared)
+				bool *migratable_cleared, bool *samepg)
 {
 	int ret;
 
 	spin_lock_irq(&hugetlb_lock);
-	ret = __get_huge_page_for_hwpoison(pfn, flags, migratable_cleared);
+	ret = __get_huge_page_for_hwpoison(pfn, flags, migratable_cleared, samepg);
 	spin_unlock_irq(&hugetlb_lock);
 	return ret;
 }
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 3edebb0cda30..070f43bb110a 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -1873,7 +1873,8 @@ static unsigned long __folio_free_raw_hwp(struct folio *folio, bool move_flag)
 	return count;
 }
 
-static int folio_set_hugetlb_hwpoison(struct folio *folio, struct page *page)
+static int folio_set_hugetlb_hwpoison(struct folio *folio, struct page *page,
+					bool *samepg)
 {
 	struct llist_head *head;
 	struct raw_hwp_page *raw_hwp;
@@ -1889,17 +1890,16 @@ static int folio_set_hugetlb_hwpoison(struct folio *folio, struct page *page)
 		return -EHWPOISON;
 	head = raw_hwp_list_head(folio);
 	llist_for_each_entry(p, head->first, node) {
-		if (p->page == page)
+		if (p->page == page) {
+			*samepg = true;
 			return -EHWPOISON;
+		}
 	}
 
 	raw_hwp = kmalloc(sizeof(struct raw_hwp_page), GFP_ATOMIC);
 	if (raw_hwp) {
 		raw_hwp->page = page;
 		llist_add(&raw_hwp->node, head);
-		/* the first error event will be counted in action_result(). */
-		if (ret)
-			num_poisoned_pages_inc(page_to_pfn(page));
 	} else {
 		/*
 		 * Failed to save raw error info.  We no longer trace all
@@ -1956,7 +1956,7 @@ void folio_clear_hugetlb_hwpoison(struct folio *folio)
  *   -EHWPOISON    - the hugepage is already hwpoisoned
  */
 int __get_huge_page_for_hwpoison(unsigned long pfn, int flags,
-				 bool *migratable_cleared)
+				 bool *migratable_cleared, bool *samepg)
 {
 	struct page *page = pfn_to_page(pfn);
 	struct folio *folio = page_folio(page);
@@ -1981,7 +1981,7 @@ int __get_huge_page_for_hwpoison(unsigned long pfn, int flags,
 			goto out;
 	}
 
-	if (folio_set_hugetlb_hwpoison(folio, page)) {
+	if (folio_set_hugetlb_hwpoison(folio, page, samepg)) {
 		ret = -EHWPOISON;
 		goto out;
 	}
@@ -2014,11 +2014,12 @@ static int try_memory_failure_hugetlb(unsigned long pfn, int flags, int *hugetlb
 	struct page *p = pfn_to_page(pfn);
 	struct folio *folio;
 	unsigned long page_flags;
+	bool samepg = false;
 	bool migratable_cleared = false;
 
 	*hugetlb = 1;
 retry:
-	res = get_huge_page_for_hwpoison(pfn, flags, &migratable_cleared);
+	res = get_huge_page_for_hwpoison(pfn, flags, &migratable_cleared, &samepg);
 	if (res == 2) { /* fallback to normal page handling */
 		*hugetlb = 0;
 		return 0;
@@ -2027,7 +2028,10 @@ static int try_memory_failure_hugetlb(unsigned long pfn, int flags, int *hugetlb
 			folio = page_folio(p);
 			res = kill_accessing_process(current, folio_pfn(folio), flags);
 		}
-		action_result(pfn, MF_MSG_ALREADY_POISONED, MF_FAILED);
+		if (samepg)
+			action_result(pfn, MF_MSG_ALREADY_POISONED, MF_FAILED);
+		else
+			action_result(pfn, MF_MSG_HUGE, MF_FAILED);
 		return res;
 	} else if (res == -EBUSY) {
 		if (!(flags & MF_NO_RETRY)) {
-- 
2.43.5
Re: [PATCH] mm/memory-failure: fix missing ->mf_stats count in hugetlb poison
Posted by David Hildenbrand (Red Hat) 5 hours ago
On 12/16/25 22:56, Jane Chu wrote:
> When a newly poisoned subpage ends up in an already poisoned hugetlb

The concept of subpages does not exist. It's a page of a hugetlb folio.

> folio, 'num_poisoned_pages' is incremented, but the per node ->mf_stats
> is not. Fix the inconsistency by designating action_result() to update
> them both.

What is the user-visible result of that?

> 
> Fixes: 18f41fa616ee4 ("mm: memory-failure: bump memory failure stats to pglist_data")
> Cc: <stable@vger.kernel.org>
> Signed-off-by: Jane Chu <jane.chu@oracle.com>
> ---
>   include/linux/hugetlb.h |  4 ++--
>   include/linux/mm.h      |  4 ++--
>   mm/hugetlb.c            |  4 ++--
>   mm/memory-failure.c     | 22 +++++++++++++---------
>   4 files changed, 19 insertions(+), 15 deletions(-)
> 
> diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
> index 8e63e46b8e1f..2e6690c9df96 100644
> --- a/include/linux/hugetlb.h
> +++ b/include/linux/hugetlb.h
> @@ -157,7 +157,7 @@ long hugetlb_unreserve_pages(struct inode *inode, long start, long end,
>   bool folio_isolate_hugetlb(struct folio *folio, struct list_head *list);
>   int get_hwpoison_hugetlb_folio(struct folio *folio, bool *hugetlb, bool unpoison);
>   int get_huge_page_for_hwpoison(unsigned long pfn, int flags,
> -				bool *migratable_cleared);
> +				bool *migratable_cleared, bool *samepg);
>   void folio_putback_hugetlb(struct folio *folio);
>   void move_hugetlb_state(struct folio *old_folio, struct folio *new_folio, int reason);
>   void hugetlb_fix_reserve_counts(struct inode *inode);
> @@ -420,7 +420,7 @@ static inline int get_hwpoison_hugetlb_folio(struct folio *folio, bool *hugetlb,
>   }
>   
>   static inline int get_huge_page_for_hwpoison(unsigned long pfn, int flags,
> -					bool *migratable_cleared)
> +					bool *migratable_cleared, bool *samepg)
>   {
>   	return 0;
>   }
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index 7c79b3369b82..68b1812e9c0a 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -4036,7 +4036,7 @@ extern int soft_offline_page(unsigned long pfn, int flags);
>   extern const struct attribute_group memory_failure_attr_group;
>   extern void memory_failure_queue(unsigned long pfn, int flags);
>   extern int __get_huge_page_for_hwpoison(unsigned long pfn, int flags,
> -					bool *migratable_cleared);
> +					bool *migratable_cleared, bool *samepg);
>   void num_poisoned_pages_inc(unsigned long pfn);
>   void num_poisoned_pages_sub(unsigned long pfn, long i);
>   #else
> @@ -4045,7 +4045,7 @@ static inline void memory_failure_queue(unsigned long pfn, int flags)
>   }
>   
>   static inline int __get_huge_page_for_hwpoison(unsigned long pfn, int flags,
> -					bool *migratable_cleared)
> +					bool *migratable_cleared, bool *samepg)
>   {
>   	return 0;
>   }
> diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> index 0455119716ec..f78562a578e5 100644
> --- a/mm/hugetlb.c
> +++ b/mm/hugetlb.c
> @@ -7818,12 +7818,12 @@ int get_hwpoison_hugetlb_folio(struct folio *folio, bool *hugetlb, bool unpoison
>   }
>   
>   int get_huge_page_for_hwpoison(unsigned long pfn, int flags,
> -				bool *migratable_cleared)
> +				bool *migratable_cleared, bool *samepg)
>   {
>   	int ret;
>   
>   	spin_lock_irq(&hugetlb_lock);
> -	ret = __get_huge_page_for_hwpoison(pfn, flags, migratable_cleared);
> +	ret = __get_huge_page_for_hwpoison(pfn, flags, migratable_cleared, samepg);
>   	spin_unlock_irq(&hugetlb_lock);
>   	return ret;
>   }
> diff --git a/mm/memory-failure.c b/mm/memory-failure.c
> index 3edebb0cda30..070f43bb110a 100644
> --- a/mm/memory-failure.c
> +++ b/mm/memory-failure.c
> @@ -1873,7 +1873,8 @@ static unsigned long __folio_free_raw_hwp(struct folio *folio, bool move_flag)
>   	return count;
>   }
>   
> -static int folio_set_hugetlb_hwpoison(struct folio *folio, struct page *page)
> +static int folio_set_hugetlb_hwpoison(struct folio *folio, struct page *page,
> +					bool *samepg)
>   {
>   	struct llist_head *head;
>   	struct raw_hwp_page *raw_hwp;
> @@ -1889,17 +1890,16 @@ static int folio_set_hugetlb_hwpoison(struct folio *folio, struct page *page)
>   		return -EHWPOISON;
>   	head = raw_hwp_list_head(folio);
>   	llist_for_each_entry(p, head->first, node) {
> -		if (p->page == page)
> +		if (p->page == page) {
> +			*samepg = true;
>   			return -EHWPOISON;
> +		}
>   	}
>   
>   	raw_hwp = kmalloc(sizeof(struct raw_hwp_page), GFP_ATOMIC);
>   	if (raw_hwp) {
>   		raw_hwp->page = page;
>   		llist_add(&raw_hwp->node, head);
> -		/* the first error event will be counted in action_result(). */
> -		if (ret)
> -			num_poisoned_pages_inc(page_to_pfn(page));
>   	} else {
>   		/*
>   		 * Failed to save raw error info.  We no longer trace all
> @@ -1956,7 +1956,7 @@ void folio_clear_hugetlb_hwpoison(struct folio *folio)
>    *   -EHWPOISON    - the hugepage is already hwpoisoned
>    */
>   int __get_huge_page_for_hwpoison(unsigned long pfn, int flags,
> -				 bool *migratable_cleared)
> +				 bool *migratable_cleared, bool *samepg)
>   {
>   	struct page *page = pfn_to_page(pfn);
>   	struct folio *folio = page_folio(page);
> @@ -1981,7 +1981,7 @@ int __get_huge_page_for_hwpoison(unsigned long pfn, int flags,
>   			goto out;
>   	}
>   
> -	if (folio_set_hugetlb_hwpoison(folio, page)) {
> +	if (folio_set_hugetlb_hwpoison(folio, page, samepg)) {
>   		ret = -EHWPOISON;
>   		goto out;
>   	}
> @@ -2014,11 +2014,12 @@ static int try_memory_failure_hugetlb(unsigned long pfn, int flags, int *hugetlb
>   	struct page *p = pfn_to_page(pfn);
>   	struct folio *folio;
>   	unsigned long page_flags;
> +	bool samepg = false;
>   	bool migratable_cleared = false;
>   
>   	*hugetlb = 1;
>   retry:
> -	res = get_huge_page_for_hwpoison(pfn, flags, &migratable_cleared);
> +	res = get_huge_page_for_hwpoison(pfn, flags, &migratable_cleared, &samepg);
>   	if (res == 2) { /* fallback to normal page handling */
>   		*hugetlb = 0;
>   		return 0;
> @@ -2027,7 +2028,10 @@ static int try_memory_failure_hugetlb(unsigned long pfn, int flags, int *hugetlb
>   			folio = page_folio(p);
>   			res = kill_accessing_process(current, folio_pfn(folio), flags);
>   		}
> -		action_result(pfn, MF_MSG_ALREADY_POISONED, MF_FAILED);
> +		if (samepg)
> +			action_result(pfn, MF_MSG_ALREADY_POISONED, MF_FAILED);
> +		else
> +			action_result(pfn, MF_MSG_HUGE, MF_FAILED);

Can't we somehow return that result from get_huge_page_for_hwpoison() 
... folio_set_hugetlb_hwpoison() differently? E.g., return an enum 
instead of "-EHWPOISON" or magic value "2".

"samepg" is petty much unreadable. Same with what?

What you really mean is "page was already hwpoisoned".

In an enum you might be better able to describe the various scenarios.

-- 
Cheers

David