[PATCH v3 1/3] mm: handle poisoning of pfn without struct pages

Posted by ankita@nvidia.com 3 months, 3 weeks ago
From: Ankit Agrawal <ankita@nvidia.com>

The kernel MM currently does not handle ECC errors / poison on a memory
region that is not backed by struct pages. If a memory region is mapped
using remap_pfn_range(), for example, but not added to the kernel, MM
will not have associated struct pages. Add a new mechanism to handle
memory failure on such memory.

Make the kernel MM expose a function that allows modules managing device
memory to register the device memory SPA range and the address space
associated with it. MM maintains this information as an interval tree. On
poison, MM can search for the range that the poisoned PFN belongs to and
use the address_space to determine the mapping VMAs.
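
A minimal sketch of how a module managing device memory might register its
range with this interface (the mydev_* names and parameters are
hypothetical; only struct pfn_address_space and
register_pfn_address_space() come from this patch):

#include <linux/mm.h>
#include <linux/memory-failure.h>

static struct pfn_address_space mydev_pfn_space;

static int mydev_register_poison_range(phys_addr_t base, size_t size,
				       struct address_space *mapping)
{
	/* Inclusive PFN range covering the device memory SPA */
	mydev_pfn_space.node.start = PHYS_PFN(base);
	mydev_pfn_space.node.last = PHYS_PFN(base + size) - 1;
	/* address_space that the VM_PFNMAP VMAs of this memory are tracked under */
	mydev_pfn_space.mapping = mapping;

	return register_pfn_address_space(&mydev_pfn_space);
}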

In this implementation, the kernel MM follows a sequence that is largely
similar to the memory_failure() handler for struct page backed memory:
1. memory_failure() is triggered on reception of a poison error. The
absence of a struct page is detected and consequently memory_failure_pfn()
is executed.
2. memory_failure_pfn() collects the processes that have the poisoned PFN
mapped.
3. memory_failure_pfn() sends SIGBUS to all the processes mapping the
poisoned PFN using kill_procs().

Note that there is one primary difference versus the handling of poison
on struct page backed memory: unmapping of the faulty PFN is skipped.
This is done to accommodate the huge PFNMAP support added recently [1],
which enables VM_PFNMAP vmas to be mapped at either the PMD or PUD level.
Otherwise, poison on a single PFN would require breaking the huge mapping
into PTEs in order to unmap only the poisoned PFN, which would have a
major performance impact.

Link: https://lore.kernel.org/all/20240826204353.2228736-1-peterx@redhat.com/ [1]

Signed-off-by: Ankit Agrawal <ankita@nvidia.com>
---
 MAINTAINERS                    |   1 +
 include/linux/memory-failure.h |  17 +++++
 include/linux/mm.h             |   1 +
 include/ras/ras_event.h        |   1 +
 mm/Kconfig                     |   1 +
 mm/memory-failure.c            | 128 ++++++++++++++++++++++++++++++++-
 6 files changed, 148 insertions(+), 1 deletion(-)
 create mode 100644 include/linux/memory-failure.h

diff --git a/MAINTAINERS b/MAINTAINERS
index 520fb4e379a3..463d062d0386 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -11359,6 +11359,7 @@ M:	Miaohe Lin <linmiaohe@huawei.com>
 R:	Naoya Horiguchi <nao.horiguchi@gmail.com>
 L:	linux-mm@kvack.org
 S:	Maintained
+F:	include/linux/memory-failure.h
 F:	mm/hwpoison-inject.c
 F:	mm/memory-failure.c
 
diff --git a/include/linux/memory-failure.h b/include/linux/memory-failure.h
new file mode 100644
index 000000000000..bc326503d2d2
--- /dev/null
+++ b/include/linux/memory-failure.h
@@ -0,0 +1,17 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_MEMORY_FAILURE_H
+#define _LINUX_MEMORY_FAILURE_H
+
+#include <linux/interval_tree.h>
+
+struct pfn_address_space;
+
+struct pfn_address_space {
+	struct interval_tree_node node;
+	struct address_space *mapping;
+};
+
+int register_pfn_address_space(struct pfn_address_space *pfn_space);
+void unregister_pfn_address_space(struct pfn_address_space *pfn_space);
+
+#endif /* _LINUX_MEMORY_FAILURE_H */
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 1ae97a0b8ec7..0ab4ea82ce9e 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -4006,6 +4006,7 @@ enum mf_action_page_type {
 	MF_MSG_DAX,
 	MF_MSG_UNSPLIT_THP,
 	MF_MSG_ALREADY_POISONED,
+	MF_MSG_PFN_MAP,
 	MF_MSG_UNKNOWN,
 };
 
diff --git a/include/ras/ras_event.h b/include/ras/ras_event.h
index c8cd0f00c845..fecfeb7c8be7 100644
--- a/include/ras/ras_event.h
+++ b/include/ras/ras_event.h
@@ -375,6 +375,7 @@ TRACE_EVENT(aer_event,
 	EM ( MF_MSG_DAX, "dax page" )					\
 	EM ( MF_MSG_UNSPLIT_THP, "unsplit thp" )			\
 	EM ( MF_MSG_ALREADY_POISONED, "already poisoned" )		\
+	EM ( MF_MSG_PFN_MAP, "non struct page pfn" )                    \
 	EMe ( MF_MSG_UNKNOWN, "unknown page" )
 
 /*
diff --git a/mm/Kconfig b/mm/Kconfig
index e443fe8cd6cf..0b07219390b9 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -777,6 +777,7 @@ config MEMORY_FAILURE
 	depends on ARCH_SUPPORTS_MEMORY_FAILURE
 	bool "Enable recovery from hardware memory errors"
 	select MEMORY_ISOLATION
+	select INTERVAL_TREE
 	select RAS
 	help
 	  Enables code to recover from some memory failures on systems
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index df6ee59527dd..acfe5a9bde1d 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -38,6 +38,7 @@
 
 #include <linux/kernel.h>
 #include <linux/mm.h>
+#include <linux/memory-failure.h>
 #include <linux/page-flags.h>
 #include <linux/sched/signal.h>
 #include <linux/sched/task.h>
@@ -154,6 +155,10 @@ static const struct ctl_table memory_failure_table[] = {
 	}
 };
 
+static struct rb_root_cached pfn_space_itree = RB_ROOT_CACHED;
+
+static DEFINE_MUTEX(pfn_space_lock);
+
 /*
  * Return values:
  *   1:   the page is dissolved (if needed) and taken off from buddy,
@@ -957,6 +962,7 @@ static const char * const action_page_types[] = {
 	[MF_MSG_DAX]			= "dax page",
 	[MF_MSG_UNSPLIT_THP]		= "unsplit thp",
 	[MF_MSG_ALREADY_POISONED]	= "already poisoned page",
+	[MF_MSG_PFN_MAP]                = "non struct page pfn",
 	[MF_MSG_UNKNOWN]		= "unknown page",
 };
 
@@ -1349,7 +1355,7 @@ static int action_result(unsigned long pfn, enum mf_action_page_type type,
 {
 	trace_memory_failure_event(pfn, type, result);
 
-	if (type != MF_MSG_ALREADY_POISONED) {
+	if (type != MF_MSG_ALREADY_POISONED && type != MF_MSG_PFN_MAP) {
 		num_poisoned_pages_inc(pfn);
 		update_per_node_mf_stats(pfn, result);
 	}
@@ -2216,6 +2222,121 @@ static void kill_procs_now(struct page *p, unsigned long pfn, int flags,
 	kill_procs(&tokill, true, pfn, flags);
 }
 
+int register_pfn_address_space(struct pfn_address_space *pfn_space)
+{
+	if (!pfn_space)
+		return -EINVAL;
+
+	mutex_lock(&pfn_space_lock);
+
+	if (interval_tree_iter_first(&pfn_space_itree,
+				     pfn_space->node.start,
+				     pfn_space->node.last)) {
+		mutex_unlock(&pfn_space_lock);
+		return -EBUSY;
+	}
+
+	interval_tree_insert(&pfn_space->node, &pfn_space_itree);
+	mutex_unlock(&pfn_space_lock);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(register_pfn_address_space);
+
+void unregister_pfn_address_space(struct pfn_address_space *pfn_space)
+{
+	if (!pfn_space)
+		return;
+
+	mutex_lock(&pfn_space_lock);
+	interval_tree_remove(&pfn_space->node, &pfn_space_itree);
+	mutex_unlock(&pfn_space_lock);
+}
+EXPORT_SYMBOL_GPL(unregister_pfn_address_space);
+
+static void add_to_kill_pfn(struct task_struct *tsk,
+			    struct vm_area_struct *vma,
+			    struct list_head *to_kill,
+			    unsigned long pfn)
+{
+	struct to_kill *tk;
+
+	tk = kmalloc(sizeof(*tk), GFP_ATOMIC);
+	if (!tk)
+		return;
+
+	/* Check for pgoff not backed by struct page */
+	tk->addr = vma_address(vma, pfn, 1);
+	tk->size_shift = PAGE_SHIFT;
+
+	if (tk->addr == -EFAULT)
+		pr_info("Unable to find address %lx in %s\n",
+			pfn, tsk->comm);
+
+	get_task_struct(tsk);
+	tk->tsk = tsk;
+	list_add_tail(&tk->nd, to_kill);
+}
+
+/*
+ * Collect processes when the error hit a PFN not backed by struct page.
+ */
+static void collect_procs_pfn(struct address_space *mapping,
+			      unsigned long pfn, struct list_head *to_kill)
+{
+	struct vm_area_struct *vma;
+	struct task_struct *tsk;
+
+	i_mmap_lock_read(mapping);
+	rcu_read_lock();
+	for_each_process(tsk) {
+		struct task_struct *t = tsk;
+
+		t = task_early_kill(tsk, true);
+		if (!t)
+			continue;
+		vma_interval_tree_foreach(vma, &mapping->i_mmap, pfn, pfn) {
+			if (vma->vm_mm == t->mm)
+				add_to_kill_pfn(t, vma, to_kill, pfn);
+		}
+	}
+	rcu_read_unlock();
+	i_mmap_unlock_read(mapping);
+}
+
+static int memory_failure_pfn(unsigned long pfn, int flags)
+{
+	struct interval_tree_node *node;
+	LIST_HEAD(tokill);
+
+	mutex_lock(&pfn_space_lock);
+	/*
+	 * Modules registers with MM the address space mapping to the device memory they
+	 * manage. Iterate to identify exactly which address space has mapped to this
+	 * failing PFN.
+	 */
+	for (node = interval_tree_iter_first(&pfn_space_itree, pfn, pfn); node;
+	     node = interval_tree_iter_next(node, pfn, pfn)) {
+		struct pfn_address_space *pfn_space =
+			container_of(node, struct pfn_address_space, node);
+
+		collect_procs_pfn(pfn_space->mapping, pfn, &tokill);
+	}
+	mutex_unlock(&pfn_space_lock);
+
+	/*
+	 * Unlike System-RAM there is no possibility to swap in a different
+	 * physical page at a given virtual address, so all userspace
+	 * consumption of direct PFN memory necessitates SIGBUS (i.e.
+	 * MF_MUST_KILL)
+	 */
+	flags |= MF_ACTION_REQUIRED | MF_MUST_KILL;
+
+	kill_procs(&tokill, true, pfn, flags);
+
+	return action_result(pfn, MF_MSG_PFN_MAP, MF_RECOVERED);
+}
+
 /**
  * memory_failure - Handle memory failure of a page.
  * @pfn: Page Number of the corrupted page
@@ -2259,6 +2380,11 @@ int memory_failure(unsigned long pfn, int flags)
 	if (!(flags & MF_SW_SIMULATED))
 		hw_memory_failure = true;
 
+	if (!pfn_valid(pfn) && !arch_is_platform_page(PFN_PHYS(pfn))) {
+		res = memory_failure_pfn(pfn, flags);
+		goto unlock_mutex;
+	}
+
 	p = pfn_to_online_page(pfn);
 	if (!p) {
 		res = arch_memory_failure(pfn, flags);
-- 
2.34.1
Re: [PATCH v3 1/3] mm: handle poisoning of pfn without struct pages
Posted by Shuai Xue 3 months, 2 weeks ago

On 2025/10/21 18:23, ankita@nvidia.com wrote:
> From: Ankit Agrawal <ankita@nvidia.com>
> 
> The kernel MM currently does not handle ECC errors / poison on a memory
> region that is not backed by struct pages. If a memory region mapped
> using remap_pfn_range() for example, but not added to the kernel, MM
> will not have associated struct pages. Add a new mechanism to handle
> memory failure on such memory.
> 
> Make kernel MM expose a function to allow modules managing the device
> memory to register the device memory SPA and the address space associated
> it. MM maintains this information as an interval tree. On poison, MM can
> search for the range that the poisoned PFN belong and use the address_space
> to determine the mapping VMA.
> 
> In this implementation, kernel MM follows the following sequence that is
> largely similar to the memory_failure() handler for struct page backed
> memory:
> 1. memory_failure() is triggered on reception of a poison error. An
> absence of struct page is detected and consequently memory_failure_pfn()
> is executed. 

This step depends on PATCH 2. I suggest reordering the patches so that
PATCH 2 comes first, which would make the series easier to review and
understand.

> 2. memory_failure_pfn() collects the processes mapped to the PFN.
> 3. memory_failure_pfn() sends SIGBUS to all the processes mapping the
> poisoned PFN using kill_procs().
> 
> Note that there is one primary difference versus the handling of the
> poison on struct pages, which is to skip unmapping to the faulty PFN.
> This is done to handle the huge PFNMAP support added recently [1] that
> enables VM_PFNMAP vmas to map in either PMD level. Otherwise, a poison
> to a PFN would need breaking the PMD mapping into PTEs to unmap only
> the poisoned PFN. This will have a major performance impact.
> 
> Link: https://lore.kernel.org/all/20240826204353.2228736-1-peterx@redhat.com/ [1]
> 
> Signed-off-by: Ankit Agrawal <ankita@nvidia.com>
> ---
>   MAINTAINERS                    |   1 +
>   include/linux/memory-failure.h |  17 +++++
>   include/linux/mm.h             |   1 +
>   include/ras/ras_event.h        |   1 +
>   mm/Kconfig                     |   1 +
>   mm/memory-failure.c            | 128 ++++++++++++++++++++++++++++++++-
>   6 files changed, 148 insertions(+), 1 deletion(-)
>   create mode 100644 include/linux/memory-failure.h
> 
> diff --git a/MAINTAINERS b/MAINTAINERS
> index 520fb4e379a3..463d062d0386 100644
> --- a/MAINTAINERS
> +++ b/MAINTAINERS
> @@ -11359,6 +11359,7 @@ M:	Miaohe Lin <linmiaohe@huawei.com>
>   R:	Naoya Horiguchi <nao.horiguchi@gmail.com>
>   L:	linux-mm@kvack.org
>   S:	Maintained
> +F:	include/linux/memory-failure.h
>   F:	mm/hwpoison-inject.c
>   F:	mm/memory-failure.c
>   
> diff --git a/include/linux/memory-failure.h b/include/linux/memory-failure.h
> new file mode 100644
> index 000000000000..bc326503d2d2
> --- /dev/null
> +++ b/include/linux/memory-failure.h
> @@ -0,0 +1,17 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +#ifndef _LINUX_MEMORY_FAILURE_H
> +#define _LINUX_MEMORY_FAILURE_H
> +
> +#include <linux/interval_tree.h>
> +
> +struct pfn_address_space;
> +
> +struct pfn_address_space {
> +	struct interval_tree_node node;
> +	struct address_space *mapping;
> +};
> +
> +int register_pfn_address_space(struct pfn_address_space *pfn_space);
> +void unregister_pfn_address_space(struct pfn_address_space *pfn_space);
> +
> +#endif /* _LINUX_MEMORY_FAILURE_H */
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index 1ae97a0b8ec7..0ab4ea82ce9e 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -4006,6 +4006,7 @@ enum mf_action_page_type {
>   	MF_MSG_DAX,
>   	MF_MSG_UNSPLIT_THP,
>   	MF_MSG_ALREADY_POISONED,
> +	MF_MSG_PFN_MAP,
>   	MF_MSG_UNKNOWN,
>   };
>   
> diff --git a/include/ras/ras_event.h b/include/ras/ras_event.h
> index c8cd0f00c845..fecfeb7c8be7 100644
> --- a/include/ras/ras_event.h
> +++ b/include/ras/ras_event.h
> @@ -375,6 +375,7 @@ TRACE_EVENT(aer_event,
>   	EM ( MF_MSG_DAX, "dax page" )					\
>   	EM ( MF_MSG_UNSPLIT_THP, "unsplit thp" )			\
>   	EM ( MF_MSG_ALREADY_POISONED, "already poisoned" )		\
> +	EM ( MF_MSG_PFN_MAP, "non struct page pfn" )                    \
>   	EMe ( MF_MSG_UNKNOWN, "unknown page" )
>   
>   /*
> diff --git a/mm/Kconfig b/mm/Kconfig
> index e443fe8cd6cf..0b07219390b9 100644
> --- a/mm/Kconfig
> +++ b/mm/Kconfig
> @@ -777,6 +777,7 @@ config MEMORY_FAILURE
>   	depends on ARCH_SUPPORTS_MEMORY_FAILURE
>   	bool "Enable recovery from hardware memory errors"
>   	select MEMORY_ISOLATION
> +	select INTERVAL_TREE
>   	select RAS
>   	help
>   	  Enables code to recover from some memory failures on systems
> diff --git a/mm/memory-failure.c b/mm/memory-failure.c
> index df6ee59527dd..acfe5a9bde1d 100644
> --- a/mm/memory-failure.c
> +++ b/mm/memory-failure.c
> @@ -38,6 +38,7 @@
>   
>   #include <linux/kernel.h>
>   #include <linux/mm.h>
> +#include <linux/memory-failure.h>
>   #include <linux/page-flags.h>
>   #include <linux/sched/signal.h>
>   #include <linux/sched/task.h>
> @@ -154,6 +155,10 @@ static const struct ctl_table memory_failure_table[] = {
>   	}
>   };
>   
> +static struct rb_root_cached pfn_space_itree = RB_ROOT_CACHED;
> +
> +static DEFINE_MUTEX(pfn_space_lock);
> +
>   /*
>    * Return values:
>    *   1:   the page is dissolved (if needed) and taken off from buddy,
> @@ -957,6 +962,7 @@ static const char * const action_page_types[] = {
>   	[MF_MSG_DAX]			= "dax page",
>   	[MF_MSG_UNSPLIT_THP]		= "unsplit thp",
>   	[MF_MSG_ALREADY_POISONED]	= "already poisoned page",
> +	[MF_MSG_PFN_MAP]                = "non struct page pfn",
>   	[MF_MSG_UNKNOWN]		= "unknown page",
>   };
>   
> @@ -1349,7 +1355,7 @@ static int action_result(unsigned long pfn, enum mf_action_page_type type,
>   {
>   	trace_memory_failure_event(pfn, type, result);
>   
> -	if (type != MF_MSG_ALREADY_POISONED) {
> +	if (type != MF_MSG_ALREADY_POISONED && type != MF_MSG_PFN_MAP) {
>   		num_poisoned_pages_inc(pfn);
>   		update_per_node_mf_stats(pfn, result);
>   	}
> @@ -2216,6 +2222,121 @@ static void kill_procs_now(struct page *p, unsigned long pfn, int flags,
>   	kill_procs(&tokill, true, pfn, flags);
>   }
>   
> +int register_pfn_address_space(struct pfn_address_space *pfn_space)

I have a design consideration here. Non-struct page PFNs typically
represent device memory managed by device drivers through their own
memory allocators. These drivers are responsible for allocation and
deallocation of this memory.

Rather than having MM maintain metadata about these PFNs, have you
considered adding an operation callback similar to
dev_pagemap_ops->memory_failure? This would allow device memory
allocators to:

- Maintain their own metadata tracking poison status (similar to
   TestSetPageHWPoison())
- Handle device-specific requirements for memory failure
- Provide more flexibility for different types of device memory

Thanks.
Shuai
Re: [PATCH v3 1/3] mm: handle poisoning of pfn without struct pages
Posted by Jason Gunthorpe 3 months, 2 weeks ago
On Fri, Oct 24, 2025 at 05:45:45PM +0800, Shuai Xue wrote:
> Rather than having MM maintain metadata about these PFNs, have you
> considered adding an operation callback similar to
> dev_pagemap_ops->memory_failure? 

I think someone could come up with such a proposal on top of this; it
would not be hard to add some ops to pfn_address_space and have the
code call them instead of using the address_space.

This version just needs to link into the existing VMA machinery (i.e.
collect_procs_pfn()); it doesn't make a lot of sense to push that work
into drivers.
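
For illustration, such an extension could look roughly like this
(hypothetical sketch, not part of this series):

struct pfn_address_space_ops {
	/* invoked by memory_failure_pfn() when the registered range is hit */
	int (*memory_failure)(struct pfn_address_space *pfn_space,
			      unsigned long pfn, int flags);
};

struct pfn_address_space {
	struct interval_tree_node node;
	struct address_space *mapping;
	const struct pfn_address_space_ops *ops;	/* optional callbacks */
};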

Jason
Re: [PATCH v3 1/3] mm: handle poisoning of pfn without struct pages
Posted by Jiaqi Yan 3 months, 2 weeks ago
On Tue, Oct 21, 2025 at 3:23 AM <ankita@nvidia.com> wrote:
>
> From: Ankit Agrawal <ankita@nvidia.com>
>
> The kernel MM currently does not handle ECC errors / poison on a memory
> region that is not backed by struct pages. If a memory region mapped
> using remap_pfn_range() for example, but not added to the kernel, MM
> will not have associated struct pages. Add a new mechanism to handle
> memory failure on such memory.
>
> Make kernel MM expose a function to allow modules managing the device
> memory to register the device memory SPA and the address space associated
> it. MM maintains this information as an interval tree. On poison, MM can
> search for the range that the poisoned PFN belong and use the address_space
> to determine the mapping VMA.
>
> In this implementation, kernel MM follows the following sequence that is
> largely similar to the memory_failure() handler for struct page backed
> memory:
> 1. memory_failure() is triggered on reception of a poison error. An
> absence of struct page is detected and consequently memory_failure_pfn()
> is executed.
> 2. memory_failure_pfn() collects the processes mapped to the PFN.
> 3. memory_failure_pfn() sends SIGBUS to all the processes mapping the
> poisoned PFN using kill_procs().
>
> Note that there is one primary difference versus the handling of the
> poison on struct pages, which is to skip unmapping to the faulty PFN.
> This is done to handle the huge PFNMAP support added recently [1] that
> enables VM_PFNMAP vmas to map in either PMD level. Otherwise, a poison
> to a PFN would need breaking the PMD mapping into PTEs to unmap only
> the poisoned PFN. This will have a major performance impact.
>
> Link: https://lore.kernel.org/all/20240826204353.2228736-1-peterx@redhat.com/ [1]
>
> Signed-off-by: Ankit Agrawal <ankita@nvidia.com>
> ---
>  MAINTAINERS                    |   1 +
>  include/linux/memory-failure.h |  17 +++++
>  include/linux/mm.h             |   1 +
>  include/ras/ras_event.h        |   1 +
>  mm/Kconfig                     |   1 +
>  mm/memory-failure.c            | 128 ++++++++++++++++++++++++++++++++-
>  6 files changed, 148 insertions(+), 1 deletion(-)
>  create mode 100644 include/linux/memory-failure.h
>
> diff --git a/MAINTAINERS b/MAINTAINERS
> index 520fb4e379a3..463d062d0386 100644
> --- a/MAINTAINERS
> +++ b/MAINTAINERS
> @@ -11359,6 +11359,7 @@ M:      Miaohe Lin <linmiaohe@huawei.com>
>  R:     Naoya Horiguchi <nao.horiguchi@gmail.com>
>  L:     linux-mm@kvack.org
>  S:     Maintained
> +F:     include/linux/memory-failure.h
>  F:     mm/hwpoison-inject.c
>  F:     mm/memory-failure.c
>
> diff --git a/include/linux/memory-failure.h b/include/linux/memory-failure.h
> new file mode 100644
> index 000000000000..bc326503d2d2
> --- /dev/null
> +++ b/include/linux/memory-failure.h
> @@ -0,0 +1,17 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +#ifndef _LINUX_MEMORY_FAILURE_H
> +#define _LINUX_MEMORY_FAILURE_H
> +
> +#include <linux/interval_tree.h>
> +
> +struct pfn_address_space;
> +
> +struct pfn_address_space {
> +       struct interval_tree_node node;
> +       struct address_space *mapping;
> +};
> +
> +int register_pfn_address_space(struct pfn_address_space *pfn_space);
> +void unregister_pfn_address_space(struct pfn_address_space *pfn_space);
> +
> +#endif /* _LINUX_MEMORY_FAILURE_H */
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index 1ae97a0b8ec7..0ab4ea82ce9e 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -4006,6 +4006,7 @@ enum mf_action_page_type {
>         MF_MSG_DAX,
>         MF_MSG_UNSPLIT_THP,
>         MF_MSG_ALREADY_POISONED,
> +       MF_MSG_PFN_MAP,
>         MF_MSG_UNKNOWN,
>  };
>
> diff --git a/include/ras/ras_event.h b/include/ras/ras_event.h
> index c8cd0f00c845..fecfeb7c8be7 100644
> --- a/include/ras/ras_event.h
> +++ b/include/ras/ras_event.h
> @@ -375,6 +375,7 @@ TRACE_EVENT(aer_event,
>         EM ( MF_MSG_DAX, "dax page" )                                   \
>         EM ( MF_MSG_UNSPLIT_THP, "unsplit thp" )                        \
>         EM ( MF_MSG_ALREADY_POISONED, "already poisoned" )              \
> +       EM ( MF_MSG_PFN_MAP, "non struct page pfn" )                    \
>         EMe ( MF_MSG_UNKNOWN, "unknown page" )
>
>  /*
> diff --git a/mm/Kconfig b/mm/Kconfig
> index e443fe8cd6cf..0b07219390b9 100644
> --- a/mm/Kconfig
> +++ b/mm/Kconfig
> @@ -777,6 +777,7 @@ config MEMORY_FAILURE
>         depends on ARCH_SUPPORTS_MEMORY_FAILURE
>         bool "Enable recovery from hardware memory errors"
>         select MEMORY_ISOLATION
> +       select INTERVAL_TREE
>         select RAS
>         help
>           Enables code to recover from some memory failures on systems
> diff --git a/mm/memory-failure.c b/mm/memory-failure.c
> index df6ee59527dd..acfe5a9bde1d 100644
> --- a/mm/memory-failure.c
> +++ b/mm/memory-failure.c
> @@ -38,6 +38,7 @@
>
>  #include <linux/kernel.h>
>  #include <linux/mm.h>
> +#include <linux/memory-failure.h>
>  #include <linux/page-flags.h>
>  #include <linux/sched/signal.h>
>  #include <linux/sched/task.h>
> @@ -154,6 +155,10 @@ static const struct ctl_table memory_failure_table[] = {
>         }
>  };
>
> +static struct rb_root_cached pfn_space_itree = RB_ROOT_CACHED;
> +
> +static DEFINE_MUTEX(pfn_space_lock);
> +
>  /*
>   * Return values:
>   *   1:   the page is dissolved (if needed) and taken off from buddy,
> @@ -957,6 +962,7 @@ static const char * const action_page_types[] = {
>         [MF_MSG_DAX]                    = "dax page",
>         [MF_MSG_UNSPLIT_THP]            = "unsplit thp",
>         [MF_MSG_ALREADY_POISONED]       = "already poisoned page",
> +       [MF_MSG_PFN_MAP]                = "non struct page pfn",
>         [MF_MSG_UNKNOWN]                = "unknown page",
>  };
>
> @@ -1349,7 +1355,7 @@ static int action_result(unsigned long pfn, enum mf_action_page_type type,
>  {
>         trace_memory_failure_event(pfn, type, result);
>
> -       if (type != MF_MSG_ALREADY_POISONED) {
> +       if (type != MF_MSG_ALREADY_POISONED && type != MF_MSG_PFN_MAP) {
>                 num_poisoned_pages_inc(pfn);
>                 update_per_node_mf_stats(pfn, result);
>         }
> @@ -2216,6 +2222,121 @@ static void kill_procs_now(struct page *p, unsigned long pfn, int flags,
>         kill_procs(&tokill, true, pfn, flags);
>  }
>
> +int register_pfn_address_space(struct pfn_address_space *pfn_space)
> +{
> +       if (!pfn_space)
> +               return -EINVAL;
> +
> +       mutex_lock(&pfn_space_lock);
> +
> +       if (interval_tree_iter_first(&pfn_space_itree,
> +                                    pfn_space->node.start,
> +                                    pfn_space->node.last)) {
> +               mutex_unlock(&pfn_space_lock);
> +               return -EBUSY;
> +       }
> +
> +       interval_tree_insert(&pfn_space->node, &pfn_space_itree);
> +       mutex_unlock(&pfn_space_lock);
> +
> +       return 0;
> +}
> +EXPORT_SYMBOL_GPL(register_pfn_address_space);
> +
> +void unregister_pfn_address_space(struct pfn_address_space *pfn_space)
> +{
> +       if (!pfn_space)
> +               return;
> +
> +       mutex_lock(&pfn_space_lock);
> +       interval_tree_remove(&pfn_space->node, &pfn_space_itree);

IIRC, removing something that is not in the interval tree will panic the
kernel. If I am not mistaken, should this do something like
interval_tree_iter_first() before interval_tree_remove(), to avoid an
ill-behaved driver crashing the system?
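
Something along these lines (an untested sketch) would let unregister
tolerate a node that was never inserted, or was already removed, instead
of removing a node that is not in the tree:

void unregister_pfn_address_space(struct pfn_address_space *pfn_space)
{
	struct interval_tree_node *node;

	if (!pfn_space)
		return;

	mutex_lock(&pfn_space_lock);
	/* Only remove the node if it is actually present in the tree */
	for (node = interval_tree_iter_first(&pfn_space_itree,
					     pfn_space->node.start,
					     pfn_space->node.last);
	     node;
	     node = interval_tree_iter_next(node, pfn_space->node.start,
					    pfn_space->node.last)) {
		if (node == &pfn_space->node) {
			interval_tree_remove(node, &pfn_space_itree);
			break;
		}
	}
	mutex_unlock(&pfn_space_lock);
}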

> +       mutex_unlock(&pfn_space_lock);
> +}
> +EXPORT_SYMBOL_GPL(unregister_pfn_address_space);
> +
> +static void add_to_kill_pfn(struct task_struct *tsk,
> +                           struct vm_area_struct *vma,
> +                           struct list_head *to_kill,
> +                           unsigned long pfn)
> +{
> +       struct to_kill *tk;
> +
> +       tk = kmalloc(sizeof(*tk), GFP_ATOMIC);
> +       if (!tk)
> +               return;
> +
> +       /* Check for pgoff not backed by struct page */
> +       tk->addr = vma_address(vma, pfn, 1);
> +       tk->size_shift = PAGE_SHIFT;
> +
> +       if (tk->addr == -EFAULT)
> +               pr_info("Unable to find address %lx in %s\n",
> +                       pfn, tsk->comm);
> +
> +       get_task_struct(tsk);
> +       tk->tsk = tsk;
> +       list_add_tail(&tk->nd, to_kill);
> +}
> +
> +/*
> + * Collect processes when the error hit a PFN not backed by struct page.
> + */
> +static void collect_procs_pfn(struct address_space *mapping,
> +                             unsigned long pfn, struct list_head *to_kill)
> +{
> +       struct vm_area_struct *vma;
> +       struct task_struct *tsk;
> +
> +       i_mmap_lock_read(mapping);
> +       rcu_read_lock();
> +       for_each_process(tsk) {
> +               struct task_struct *t = tsk;
> +
> +               t = task_early_kill(tsk, true);
> +               if (!t)
> +                       continue;
> +               vma_interval_tree_foreach(vma, &mapping->i_mmap, pfn, pfn) {
> +                       if (vma->vm_mm == t->mm)
> +                               add_to_kill_pfn(t, vma, to_kill, pfn);
> +               }
> +       }
> +       rcu_read_unlock();
> +       i_mmap_unlock_read(mapping);
> +}
> +
> +static int memory_failure_pfn(unsigned long pfn, int flags)
> +{
> +       struct interval_tree_node *node;
> +       LIST_HEAD(tokill);
> +
> +       mutex_lock(&pfn_space_lock);
> +       /*
> +        * Modules registers with MM the address space mapping to the device memory they
> +        * manage. Iterate to identify exactly which address space has mapped to this
> +        * failing PFN.
> +        */
> +       for (node = interval_tree_iter_first(&pfn_space_itree, pfn, pfn); node;
> +            node = interval_tree_iter_next(node, pfn, pfn)) {
> +               struct pfn_address_space *pfn_space =
> +                       container_of(node, struct pfn_address_space, node);
> +
> +               collect_procs_pfn(pfn_space->mapping, pfn, &tokill);
> +       }
> +       mutex_unlock(&pfn_space_lock);
> +
> +       /*
> +        * Unlike System-RAM there is no possibility to swap in a different
> +        * physical page at a given virtual address, so all userspace
> +        * consumption of direct PFN memory necessitates SIGBUS (i.e.
> +        * MF_MUST_KILL)
> +        */
> +       flags |= MF_ACTION_REQUIRED | MF_MUST_KILL;
> +
> +       kill_procs(&tokill, true, pfn, flags);
> +
> +       return action_result(pfn, MF_MSG_PFN_MAP, MF_RECOVERED);
> +}
> +
>  /**
>   * memory_failure - Handle memory failure of a page.
>   * @pfn: Page Number of the corrupted page
> @@ -2259,6 +2380,11 @@ int memory_failure(unsigned long pfn, int flags)
>         if (!(flags & MF_SW_SIMULATED))
>                 hw_memory_failure = true;
>
> +       if (!pfn_valid(pfn) && !arch_is_platform_page(PFN_PHYS(pfn))) {
> +               res = memory_failure_pfn(pfn, flags);
> +               goto unlock_mutex;
> +       }
> +
>         p = pfn_to_online_page(pfn);
>         if (!p) {
>                 res = arch_memory_failure(pfn, flags);
> --
> 2.34.1
>
>
Re: [PATCH v3 1/3] mm: handle poisoning of pfn without struct pages
Posted by Ira Weiny 3 months, 2 weeks ago
ankita@ wrote:
> From: Ankit Agrawal <ankita@nvidia.com>
> 
> The kernel MM currently does not handle ECC errors / poison on a memory
> region that is not backed by struct pages. If a memory region mapped
> using remap_pfn_range() for example, but not added to the kernel, MM
> will not have associated struct pages. Add a new mechanism to handle
> memory failure on such memory.
> 
> Make kernel MM expose a function to allow modules managing the device
> memory to register the device memory SPA and the address space associated
> it. MM maintains this information as an interval tree. On poison, MM can
> search for the range that the poisoned PFN belong and use the address_space
> to determine the mapping VMA.
> 
> In this implementation, kernel MM follows the following sequence that is
> largely similar to the memory_failure() handler for struct page backed
> memory:
> 1. memory_failure() is triggered on reception of a poison error. An
> absence of struct page is detected and consequently memory_failure_pfn()
> is executed.
> 2. memory_failure_pfn() collects the processes mapped to the PFN.
> 3. memory_failure_pfn() sends SIGBUS to all the processes mapping the
> poisoned PFN using kill_procs().
> 
> Note that there is one primary difference versus the handling of the
> poison on struct pages, which is to skip unmapping to the faulty PFN.
> This is done to handle the huge PFNMAP support added recently [1] that
> enables VM_PFNMAP vmas to map in either PMD level. Otherwise, a poison
> to a PFN would need breaking the PMD mapping into PTEs to unmap only
> the poisoned PFN. This will have a major performance impact.
> 
> Link: https://lore.kernel.org/all/20240826204353.2228736-1-peterx@redhat.com/ [1]
> 

[snip]

> @@ -2216,6 +2222,121 @@ static void kill_procs_now(struct page *p, unsigned long pfn, int flags,
>  	kill_procs(&tokill, true, pfn, flags);
>  }
>  
> +int register_pfn_address_space(struct pfn_address_space *pfn_space)
> +{
> +	if (!pfn_space)
> +		return -EINVAL;

Is there a reason callers may be passing NULL?

> +
> +	mutex_lock(&pfn_space_lock);
> +
> +	if (interval_tree_iter_first(&pfn_space_itree,
> +				     pfn_space->node.start,
> +				     pfn_space->node.last)) {
> +		mutex_unlock(&pfn_space_lock);

Register and unregister are good places to use guard().
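
For example, a minimal sketch using guard() from <linux/cleanup.h>
(untested, shown only to illustrate the suggestion):

int register_pfn_address_space(struct pfn_address_space *pfn_space)
{
	if (!pfn_space)
		return -EINVAL;

	/* mutex is dropped automatically on every return path */
	guard(mutex)(&pfn_space_lock);

	if (interval_tree_iter_first(&pfn_space_itree,
				     pfn_space->node.start,
				     pfn_space->node.last))
		return -EBUSY;

	interval_tree_insert(&pfn_space->node, &pfn_space_itree);
	return 0;
}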

> +		return -EBUSY;
> +	}
> +
> +	interval_tree_insert(&pfn_space->node, &pfn_space_itree);
> +	mutex_unlock(&pfn_space_lock);
> +
> +	return 0;
> +}
> +EXPORT_SYMBOL_GPL(register_pfn_address_space);
> +
> +void unregister_pfn_address_space(struct pfn_address_space *pfn_space)
> +{
> +	if (!pfn_space)
> +		return;
> +
> +	mutex_lock(&pfn_space_lock);
> +	interval_tree_remove(&pfn_space->node, &pfn_space_itree);
> +	mutex_unlock(&pfn_space_lock);
> +}
> +EXPORT_SYMBOL_GPL(unregister_pfn_address_space);
> +
> +static void add_to_kill_pfn(struct task_struct *tsk,
> +			    struct vm_area_struct *vma,
> +			    struct list_head *to_kill,
> +			    unsigned long pfn)
> +{
> +	struct to_kill *tk;
> +
> +	tk = kmalloc(sizeof(*tk), GFP_ATOMIC);
> +	if (!tk)
> +		return;
> +
> +	/* Check for pgoff not backed by struct page */
> +	tk->addr = vma_address(vma, pfn, 1);
> +	tk->size_shift = PAGE_SHIFT;
> +
> +	if (tk->addr == -EFAULT)
> +		pr_info("Unable to find address %lx in %s\n",
> +			pfn, tsk->comm);

If the pfn is not mapped in the process, why is it added to the kill list?

> +
> +	get_task_struct(tsk);
> +	tk->tsk = tsk;
> +	list_add_tail(&tk->nd, to_kill);
> +}
> +
> +/*
> + * Collect processes when the error hit a PFN not backed by struct page.
> + */
> +static void collect_procs_pfn(struct address_space *mapping,
> +			      unsigned long pfn, struct list_head *to_kill)
> +{
> +	struct vm_area_struct *vma;
> +	struct task_struct *tsk;
> +
> +	i_mmap_lock_read(mapping);
> +	rcu_read_lock();
> +	for_each_process(tsk) {
> +		struct task_struct *t = tsk;
> +
> +		t = task_early_kill(tsk, true);
> +		if (!t)
> +			continue;
> +		vma_interval_tree_foreach(vma, &mapping->i_mmap, pfn, pfn) {
> +			if (vma->vm_mm == t->mm)
> +				add_to_kill_pfn(t, vma, to_kill, pfn);
> +		}
> +	}
> +	rcu_read_unlock();
> +	i_mmap_unlock_read(mapping);
> +}
> +
> +static int memory_failure_pfn(unsigned long pfn, int flags)
> +{
> +	struct interval_tree_node *node;
> +	LIST_HEAD(tokill);
> +
> +	mutex_lock(&pfn_space_lock);

scoped_guard()? Or probably wrap this part in a guarded function.

Ira

> +	/*
> +	 * Modules registers with MM the address space mapping to the device memory they
> +	 * manage. Iterate to identify exactly which address space has mapped to this
> +	 * failing PFN.
> +	 */
> +	for (node = interval_tree_iter_first(&pfn_space_itree, pfn, pfn); node;
> +	     node = interval_tree_iter_next(node, pfn, pfn)) {
> +		struct pfn_address_space *pfn_space =
> +			container_of(node, struct pfn_address_space, node);
> +
> +		collect_procs_pfn(pfn_space->mapping, pfn, &tokill);
> +	}
> +	mutex_unlock(&pfn_space_lock);
> +
> +	/*
> +	 * Unlike System-RAM there is no possibility to swap in a different
> +	 * physical page at a given virtual address, so all userspace
> +	 * consumption of direct PFN memory necessitates SIGBUS (i.e.
> +	 * MF_MUST_KILL)
> +	 */
> +	flags |= MF_ACTION_REQUIRED | MF_MUST_KILL;
> +
> +	kill_procs(&tokill, true, pfn, flags);
> +
> +	return action_result(pfn, MF_MSG_PFN_MAP, MF_RECOVERED);
> +}
> +
>  /**
>   * memory_failure - Handle memory failure of a page.
>   * @pfn: Page Number of the corrupted page
> @@ -2259,6 +2380,11 @@ int memory_failure(unsigned long pfn, int flags)
>  	if (!(flags & MF_SW_SIMULATED))
>  		hw_memory_failure = true;
>  
> +	if (!pfn_valid(pfn) && !arch_is_platform_page(PFN_PHYS(pfn))) {
> +		res = memory_failure_pfn(pfn, flags);
> +		goto unlock_mutex;
> +	}
> +
>  	p = pfn_to_online_page(pfn);
>  	if (!p) {
>  		res = arch_memory_failure(pfn, flags);
> -- 
> 2.34.1
> 
>