[PATCH RFC v2 12/15] vfio/nvgrace-egm: Introduce ioctl to share retired pages

ankita@nvidia.com posted 15 patches 1 month, 1 week ago
[PATCH RFC v2 12/15] vfio/nvgrace-egm: Introduce ioctl to share retired pages
Posted by ankita@nvidia.com 1 month, 1 week ago
From: Ankit Agrawal <ankita@nvidia.com>

The nvgrace-egm module stores the list of retired page offsets that is
to be made available to usermode processes. Introduce an ioctl to share
this information with userspace.

The ioctl is called by usermode apps such as QEMU to get the retired
page offsets. The usermode apps are expected to take appropriate action
to communicate the list to the VM.

Signed-off-by: Ankit Agrawal <ankita@nvidia.com>
---
 MAINTAINERS                        |  1 +
 drivers/vfio/pci/nvgrace-gpu/egm.c | 67 ++++++++++++++++++++++++++++++
 include/uapi/linux/egm.h           | 28 +++++++++++++
 3 files changed, 96 insertions(+)
 create mode 100644 include/uapi/linux/egm.h

diff --git a/MAINTAINERS b/MAINTAINERS
index 1fc551d7d667..94cf15a1e82c 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -27389,6 +27389,7 @@ M:	Ankit Agrawal <ankita@nvidia.com>
 L:	kvm@vger.kernel.org
 S:	Supported
 F:	drivers/vfio/pci/nvgrace-gpu/egm.c
+F:	include/uapi/linux/egm.h
 
 VFIO PCI DEVICE SPECIFIC DRIVERS
 R:	Jason Gunthorpe <jgg@nvidia.com>
diff --git a/drivers/vfio/pci/nvgrace-gpu/egm.c b/drivers/vfio/pci/nvgrace-gpu/egm.c
index 077de3833046..918979d8fcd4 100644
--- a/drivers/vfio/pci/nvgrace-gpu/egm.c
+++ b/drivers/vfio/pci/nvgrace-gpu/egm.c
@@ -5,6 +5,7 @@
 
 #include <linux/vfio_pci_core.h>
 #include <linux/nvgrace-egm.h>
+#include <linux/egm.h>
 
 #define MAX_EGM_NODES 4
 
@@ -119,11 +120,77 @@ static int nvgrace_egm_mmap(struct file *file, struct vm_area_struct *vma)
 			       vma->vm_page_prot);
 }
 
+/*
+ * Handle ioctls on the EGM char device. Only EGM_RETIRED_PAGES_LIST is
+ * supported; any other command fails with -EINVAL.
+ *
+ * If argsz cannot hold every retired page, argsz is set to the size
+ * needed, count to 0, and 0 is returned so the caller can retry. Else
+ * one entry per page is copied out after the header and count is set.
+ * Returns -EFAULT when a user copy fails, -EINVAL on a short argsz.
+ */
+static long nvgrace_egm_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
+{
+	unsigned long minsz = offsetofend(struct egm_retired_pages_list, count);
+	unsigned long entsz = sizeof(struct egm_retired_pages_info);
+	struct chardev *egm_chardev = file->private_data;
+	void __user *uarg = (void __user *)arg;
+	struct egm_retired_pages_list info;
+	struct egm_retired_pages_info entry;
+	unsigned long needed, off = minsz;
+	struct h_node *cur_page;
+	unsigned int count = 0;
+	unsigned long bkt;
+
+	if (cmd != EGM_RETIRED_PAGES_LIST)
+		return -EINVAL;
+
+	if (copy_from_user(&info, uarg, minsz))
+		return -EFAULT;
+
+	if (info.argsz < minsz || !egm_chardev)
+		return -EINVAL;
+
+	/* NOTE(review): no lock is held here; the table may change under us. */
+	hash_for_each(egm_chardev->htbl, bkt, cur_page, node)
+		count++;
+
+	needed = minsz + count * entsz;
+	info.count = 0;
+	if (info.argsz < needed) {
+		/* Report the size to retry with; no entries are returned. */
+		info.argsz = needed;
+		goto done;
+	}
+
+	hash_for_each(egm_chardev->htbl, bkt, cur_page, node) {
+		/*
+		 * Pages may have been retired since the count above. Require
+		 * room for a whole entry so the copy never runs past argsz.
+		 */
+		if (off + entsz > info.argsz) {
+			info.argsz = off;
+			goto done;
+		}
+
+		entry.offset = cur_page->mem_offset;
+		entry.size = PAGE_SIZE;
+		if (copy_to_user(uarg + off, &entry, entsz))
+			return -EFAULT;
+		off += entsz;
+		info.count++;
+	}
+
+done:
+	return copy_to_user(uarg, &info, minsz) ? -EFAULT : 0;
+}
+
 static const struct file_operations file_ops = {
 	.owner = THIS_MODULE,
 	.open = nvgrace_egm_open,
 	.release = nvgrace_egm_release,
 	.mmap = nvgrace_egm_mmap,
+	.unlocked_ioctl = nvgrace_egm_ioctl,
 };
 
 static void egm_chardev_release(struct device *dev)
diff --git a/include/uapi/linux/egm.h b/include/uapi/linux/egm.h
new file mode 100644
index 000000000000..4d3a2304d4f0
--- /dev/null
+++ b/include/uapi/linux/egm.h
@@ -0,0 +1,28 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+/*
+ * Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved
+ */
+
+#ifndef _UAPI_LINUX_EGM_H
+#define _UAPI_LINUX_EGM_H
+
+#include <linux/types.h>
+
+#define EGM_TYPE ('E')
+
+struct egm_retired_pages_info {
+	__aligned_u64 offset;	/* offset of the retired page, as recorded by the driver */
+	__aligned_u64 size;	/* size of the region in bytes; the driver reports PAGE_SIZE */
+};
+
+struct egm_retired_pages_list {
+	__u32 argsz;	/* in: size of the user buffer; out: rewritten when it is too small */
+	/* out: number of entries written to retired_pages[] */
+	__u32 count;
+	/* out: one entry per retired page, stored right after count */
+	struct egm_retired_pages_info retired_pages[];
+};
+
+#define EGM_RETIRED_PAGES_LIST     _IO(EGM_TYPE, 100)
+
+#endif /* _UAPI_LINUX_EGM_H */
-- 
2.34.1
Re: [PATCH RFC v2 12/15] vfio/nvgrace-egm: Introduce ioctl to share retired pages
Posted by Alex Williamson 1 month ago
On Mon, 23 Feb 2026 15:55:11 +0000
<ankita@nvidia.com> wrote:

> From: Ankit Agrawal <ankita@nvidia.com>
> 
> nvgrace-egm module stores the list of retired page offsets to be made
> available for usermode processes. Introduce an ioctl to share the
> information with the userspace.
> 
> The ioctl is called by usermode apps such as QEMU to get the retired
> page offsets. The usermode apps are expected to take appropriate action
> to communicate the list to the VM.
> 
> Signed-off-by: Ankit Agrawal <ankita@nvidia.com>
> ---
>  MAINTAINERS                        |  1 +
>  drivers/vfio/pci/nvgrace-gpu/egm.c | 67 ++++++++++++++++++++++++++++++
>  include/uapi/linux/egm.h           | 28 +++++++++++++
>  3 files changed, 96 insertions(+)
>  create mode 100644 include/uapi/linux/egm.h
> 
> diff --git a/MAINTAINERS b/MAINTAINERS
> index 1fc551d7d667..94cf15a1e82c 100644
> --- a/MAINTAINERS
> +++ b/MAINTAINERS
> @@ -27389,6 +27389,7 @@ M:	Ankit Agrawal <ankita@nvidia.com>
>  L:	kvm@vger.kernel.org
>  S:	Supported
>  F:	drivers/vfio/pci/nvgrace-gpu/egm.c
> +F:	include/uapi/linux/egm.h
>  
>  VFIO PCI DEVICE SPECIFIC DRIVERS
>  R:	Jason Gunthorpe <jgg@nvidia.com>
> diff --git a/drivers/vfio/pci/nvgrace-gpu/egm.c b/drivers/vfio/pci/nvgrace-gpu/egm.c
> index 077de3833046..918979d8fcd4 100644
> --- a/drivers/vfio/pci/nvgrace-gpu/egm.c
> +++ b/drivers/vfio/pci/nvgrace-gpu/egm.c
> @@ -5,6 +5,7 @@
>  
>  #include <linux/vfio_pci_core.h>
>  #include <linux/nvgrace-egm.h>
> +#include <linux/egm.h>
>  
>  #define MAX_EGM_NODES 4
>  
> @@ -119,11 +120,77 @@ static int nvgrace_egm_mmap(struct file *file, struct vm_area_struct *vma)
>  			       vma->vm_page_prot);
>  }
>  
> +static long nvgrace_egm_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
> +{
> +	unsigned long minsz = offsetofend(struct egm_retired_pages_list, count);
> +	struct egm_retired_pages_list info;
> +	void __user *uarg = (void __user *)arg;
> +	struct chardev *egm_chardev = file->private_data;
> +
> +	if (copy_from_user(&info, uarg, minsz))
> +		return -EFAULT;
> +
> +	if (info.argsz < minsz || !egm_chardev)
> +		return -EINVAL;

How could we get here with !egm_chardev?

> +
> +	switch (cmd) {
> +	case EGM_RETIRED_PAGES_LIST:
> +		int ret;
> +		unsigned long retired_page_struct_size = sizeof(struct egm_retired_pages_info);
> +		struct egm_retired_pages_info tmp;
> +		struct h_node *cur_page;
> +		struct hlist_node *tmp_node;
> +		unsigned long bkt;
> +		int count = 0, index = 0;

No brackets for inline declarations.  Ordering could be improved.

> +
> +		hash_for_each_safe(egm_chardev->htbl, bkt, tmp_node, cur_page, node)
> +			count++;

Why not keep track of the count as they're added?

Neither loop needs the _safe variant here since we're not removing
entries.

> +
> +		if (info.argsz < (minsz + count * retired_page_struct_size)) {
> +			info.argsz = minsz + count * retired_page_struct_size;
> +			info.count = 0;

vfio returns success when there's not enough space for compatibility
for new capabilities.  For a new ioctl just set argsz and count and
return -ENOSPC.

> +			goto done;
> +		} else {

We don't need an else if the previous branch unconditionally goes
somewhere else.

> +			hash_for_each_safe(egm_chardev->htbl, bkt, tmp_node, cur_page, node) {
> +				/*
> +				 * This check fails if there was an ECC error
> +				 * after the usermode app read the count of
> +				 * bad pages through this ioctl.
> +				 */
> +				if (minsz + index * retired_page_struct_size >= info.argsz) {
> +					info.argsz = minsz + index * retired_page_struct_size;
> +					info.count = index;

If only we had locking to prevent such races...

> +					goto done;
> +				}
> +
> +				tmp.offset = cur_page->mem_offset;
> +				tmp.size = PAGE_SIZE;

Is firmware recording 4K or 64K pages in this table?

The above comment alludes to runtime ECC faults; are those a different
page size from the granularity firmware reports in the table?

> +
> +				ret = copy_to_user(uarg + minsz +
> +						   index * retired_page_struct_size,
> +						   &tmp, retired_page_struct_size);
> +				if (ret)
> +					return -EFAULT;
> +				index++;
> +			}
> +
> +			info.count = index;
> +		}
> +		break;
> +	default:
> +		return -EINVAL;
> +	}
> +
> +done:
> +	return copy_to_user(uarg, &info, minsz) ? -EFAULT : 0;
> +}
> +
>  static const struct file_operations file_ops = {
>  	.owner = THIS_MODULE,
>  	.open = nvgrace_egm_open,
>  	.release = nvgrace_egm_release,
>  	.mmap = nvgrace_egm_mmap,
> +	.unlocked_ioctl = nvgrace_egm_ioctl,
>  };
>  
>  static void egm_chardev_release(struct device *dev)
> diff --git a/include/uapi/linux/egm.h b/include/uapi/linux/egm.h
> new file mode 100644
> index 000000000000..4d3a2304d4f0
> --- /dev/null
> +++ b/include/uapi/linux/egm.h
> @@ -0,0 +1,28 @@
> +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
> +/*
> + * Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved

2026

> + */
> +
> +#ifndef _UAPI_LINUX_EGM_H
> +#define _UAPI_LINUX_EGM_H
> +
> +#include <linux/types.h>
> +
> +#define EGM_TYPE ('E')

Arbitrarily chosen?  Update ioctl-number.rst?

> +
> +struct egm_retired_pages_info {
> +	__aligned_u64 offset;
> +	__aligned_u64 size;
> +};
> +
> +struct egm_retired_pages_list {
> +	__u32 argsz;
> +	/* out */
> +	__u32 count;
> +	/* out */
> +	struct egm_retired_pages_info retired_pages[];
> +};

I imagine you want some uapi description of this ioctl.  Thanks,

Alex

> +
> +#define EGM_RETIRED_PAGES_LIST     _IO(EGM_TYPE, 100)
> +
> +#endif /* _UAPI_LINUX_EGM_H */