From: Vipin Sharma <vipinsh@google.com>
Enable userspace to retrieve preserved VFIO device files from VFIO after
a Live Update by implementing the retrieve() and finish() file handler
callbacks.
Use an anonymous inode when creating the file, since the retrieved
device file is not opened through any particular cdev inode, and the
cdev inode does not matter in practice.
For now the retrieved file is functionally equivalent to opening the
corresponding VFIO cdev file. Subsequent commits will leverage the
preserved state associated with the retrieved file to preserve bits of
the device across Live Update.
Signed-off-by: Vipin Sharma <vipinsh@google.com>
Co-Developed-by: David Matlack <dmatlack@google.com>
Signed-off-by: David Matlack <dmatlack@google.com>
---
drivers/vfio/device_cdev.c | 21 +++++---
drivers/vfio/pci/vfio_pci_liveupdate.c | 75 +++++++++++++++++++++++++-
drivers/vfio/vfio_main.c | 13 +++++
include/linux/vfio.h | 12 +++++
4 files changed, 113 insertions(+), 8 deletions(-)
diff --git a/drivers/vfio/device_cdev.c b/drivers/vfio/device_cdev.c
index 480cac3a0c27..0a6e972f322b 100644
--- a/drivers/vfio/device_cdev.c
+++ b/drivers/vfio/device_cdev.c
@@ -16,14 +16,8 @@ void vfio_init_device_cdev(struct vfio_device *device)
device->cdev.owner = THIS_MODULE;
}
-/*
- * device access via the fd opened by this function is blocked until
- * .open_device() is called successfully during BIND_IOMMUFD.
- */
-int vfio_device_fops_cdev_open(struct inode *inode, struct file *filep)
+int __vfio_device_fops_cdev_open(struct vfio_device *device, struct file *filep)
{
- struct vfio_device *device = container_of(inode->i_cdev,
- struct vfio_device, cdev);
struct vfio_device_file *df;
int ret;
@@ -52,6 +46,19 @@ int vfio_device_fops_cdev_open(struct inode *inode, struct file *filep)
vfio_device_put_registration(device);
return ret;
}
+EXPORT_SYMBOL_GPL(__vfio_device_fops_cdev_open);
+
+/*
+ * device access via the fd opened by this function is blocked until
+ * .open_device() is called successfully during BIND_IOMMUFD.
+ */
+int vfio_device_fops_cdev_open(struct inode *inode, struct file *filep)
+{
+ struct vfio_device *device = container_of(inode->i_cdev,
+ struct vfio_device, cdev);
+
+ return __vfio_device_fops_cdev_open(device, filep);
+}
static void vfio_df_get_kvm_safe(struct vfio_device_file *df)
{
diff --git a/drivers/vfio/pci/vfio_pci_liveupdate.c b/drivers/vfio/pci/vfio_pci_liveupdate.c
index a0147dee8c0f..b7451007fca4 100644
--- a/drivers/vfio/pci/vfio_pci_liveupdate.c
+++ b/drivers/vfio/pci/vfio_pci_liveupdate.c
@@ -8,6 +8,8 @@
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <linux/anon_inodes.h>
+#include <linux/file.h>
#include <linux/kexec_handover.h>
#include <linux/kho/abi/vfio_pci.h>
#include <linux/liveupdate.h>
@@ -124,13 +126,83 @@ static int vfio_pci_liveupdate_freeze(struct liveupdate_file_op_args *args)
return ret;
}
+static int match_device(struct device *dev, const void *arg)
+{
+ struct vfio_device *device = container_of(dev, struct vfio_device, device);
+ const struct vfio_pci_core_device_ser *ser = arg;
+ struct vfio_pci_core_device *vdev;
+ struct pci_dev *pdev;
+
+ vdev = container_of(device, struct vfio_pci_core_device, vdev);
+ pdev = vdev->pdev;
+
+ return ser->bdf == pci_dev_id(pdev) && ser->domain == pci_domain_nr(pdev->bus);
+}
+
static int vfio_pci_liveupdate_retrieve(struct liveupdate_file_op_args *args)
{
- return -EOPNOTSUPP;
+ struct vfio_pci_core_device_ser *ser;
+ struct vfio_device *device;
+ struct folio *folio;
+ struct file *file;
+ int ret;
+
+ folio = kho_restore_folio(args->serialized_data);
+ if (!folio)
+ return -ENOENT;
+
+ ser = folio_address(folio);
+
+ device = vfio_find_device(ser, match_device);
+ if (!device)
+ return -ENODEV;
+
+ /*
+ * During a Live Update userspace retrieves preserved VFIO cdev files by
+ * issuing an ioctl on /dev/liveupdate rather than by opening VFIO
+ * character devices.
+ *
+ * To handle that scenario, this routine simulates opening the VFIO
+ * character device for userspace with an anonymous inode. The returned
+ * file has the same properties as a cdev file (e.g. operations are
+ * blocked until BIND_IOMMUFD is called), aside from the inode
+ * association.
+ */
+ file = anon_inode_getfile_fmode("[vfio-device-liveupdate]",
+ &vfio_device_fops, NULL,
+ O_RDWR, FMODE_PREAD | FMODE_PWRITE);
+
+ if (IS_ERR(file)) {
+ ret = PTR_ERR(file);
+ goto out;
+ }
+
+ ret = __vfio_device_fops_cdev_open(device, file);
+ if (ret) {
+ fput(file);
+ goto out;
+ }
+
+ args->file = file;
+
+out:
+ /* Drop the reference from vfio_find_device() */
+ put_device(&device->device);
+
+ return ret;
+}
+
+static bool vfio_pci_liveupdate_can_finish(struct liveupdate_file_op_args *args)
+{
+ return args->retrieved;
}
static void vfio_pci_liveupdate_finish(struct liveupdate_file_op_args *args)
{
+ struct folio *folio;
+
+ folio = virt_to_folio(phys_to_virt(args->serialized_data));
+ folio_put(folio);
}
static const struct liveupdate_file_ops vfio_pci_liveupdate_file_ops = {
@@ -139,6 +211,7 @@ static const struct liveupdate_file_ops vfio_pci_liveupdate_file_ops = {
.unpreserve = vfio_pci_liveupdate_unpreserve,
.freeze = vfio_pci_liveupdate_freeze,
.retrieve = vfio_pci_liveupdate_retrieve,
+ .can_finish = vfio_pci_liveupdate_can_finish,
.finish = vfio_pci_liveupdate_finish,
.owner = THIS_MODULE,
};
diff --git a/drivers/vfio/vfio_main.c b/drivers/vfio/vfio_main.c
index 9182dc46d73f..c5b5eb509474 100644
--- a/drivers/vfio/vfio_main.c
+++ b/drivers/vfio/vfio_main.c
@@ -13,6 +13,7 @@
#include <linux/cdev.h>
#include <linux/compat.h>
#include <linux/device.h>
+#include <linux/device/class.h>
#include <linux/fs.h>
#include <linux/idr.h>
#include <linux/iommu.h>
@@ -1706,6 +1707,18 @@ int vfio_dma_rw(struct vfio_device *device, dma_addr_t iova, void *data,
}
EXPORT_SYMBOL(vfio_dma_rw);
+struct vfio_device *vfio_find_device(const void *data, device_match_t match)
+{
+ struct device *device;
+
+ device = class_find_device(vfio.device_class, NULL, data, match);
+ if (!device)
+ return NULL;
+
+ return container_of(device, struct vfio_device, device);
+}
+EXPORT_SYMBOL_GPL(vfio_find_device);
+
/*
* Module/class support
*/
diff --git a/include/linux/vfio.h b/include/linux/vfio.h
index f09da3bdf786..4e400a7219ea 100644
--- a/include/linux/vfio.h
+++ b/include/linux/vfio.h
@@ -413,4 +413,16 @@ int vfio_virqfd_enable(void *opaque, int (*handler)(void *, void *),
void vfio_virqfd_disable(struct virqfd **pvirqfd);
void vfio_virqfd_flush_thread(struct virqfd **pvirqfd);
+#if IS_ENABLED(CONFIG_VFIO_DEVICE_CDEV)
+int __vfio_device_fops_cdev_open(struct vfio_device *device, struct file *filep);
+#else
+static inline int __vfio_device_fops_cdev_open(struct vfio_device *device,
+ struct file *filep)
+{
+ return -EOPNOTSUPP;
+}
+#endif /* IS_ENABLED(CONFIG_VFIO_DEVICE_CDEV) */
+
+struct vfio_device *vfio_find_device(const void *data, device_match_t match);
+
#endif /* VFIO_H */
--
2.52.0.487.g5c8c507ade-goog
On Wed, Nov 26, 2025 at 07:35:53PM +0000, David Matlack wrote:
> From: Vipin Sharma <vipinsh@google.com>
> static int vfio_pci_liveupdate_retrieve(struct liveupdate_file_op_args *args)
> {
> - return -EOPNOTSUPP;
> + struct vfio_pci_core_device_ser *ser;
> + struct vfio_device *device;
> + struct folio *folio;
> + struct file *file;
> + int ret;
> +
> + folio = kho_restore_folio(args->serialized_data);
> + if (!folio)
> + return -ENOENT;
Should this be consistent with the behavior of pci_flb_retrieve() which panics
on failure? The short circuit failure paths which follow leak the folio,
which seems like a hygiene issue, but the practical significance is moot if
vfio_pci_liveupdate_retrieve() failure is catastrophic anyways?
> +
> + ser = folio_address(folio);
> +
> + device = vfio_find_device(ser, match_device);
> + if (!device)
> + return -ENODEV;
> +
> + /*
> + * During a Live Update userspace retrieves preserved VFIO cdev files by
> + * issuing an ioctl on /dev/liveupdate rather than by opening VFIO
> + * character devices.
> + *
> + * To handle that scenario, this routine simulates opening the VFIO
> + * character device for userspace with an anonymous inode. The returned
> + * file has the same properties as a cdev file (e.g. operations are
> + * blocked until BIND_IOMMUFD is called), aside from the inode
> + * association.
> + */
> + file = anon_inode_getfile_fmode("[vfio-device-liveupdate]",
> + &vfio_device_fops, NULL,
> + O_RDWR, FMODE_PREAD | FMODE_PWRITE);
> +
> + if (IS_ERR(file)) {
> + ret = PTR_ERR(file);
> + goto out;
> + }
> +
> + ret = __vfio_device_fops_cdev_open(device, file);
> + if (ret) {
> + fput(file);
> + goto out;
> + }
> +
> + args->file = file;
> +
> +out:
> + /* Drop the reference from vfio_find_device() */
> + put_device(&device->device);
> +
> + return ret;
> +}
On Wed, Dec 3, 2025 at 7:55 AM Alex Mastro <amastro@fb.com> wrote:
>
> On Wed, Nov 26, 2025 at 07:35:53PM +0000, David Matlack wrote:
> > From: Vipin Sharma <vipinsh@google.com>
> > static int vfio_pci_liveupdate_retrieve(struct liveupdate_file_op_args *args)
> > {
> > - return -EOPNOTSUPP;
> > + struct vfio_pci_core_device_ser *ser;
> > + struct vfio_device *device;
> > + struct folio *folio;
> > + struct file *file;
> > + int ret;
> > +
> > + folio = kho_restore_folio(args->serialized_data);
> > + if (!folio)
> > + return -ENOENT;
>
> Should this be consistent with the behavior of pci_flb_retrieve() which panics
> on failure? The short circuit failure paths which follow leak the folio,
> which seems like a hygiene issue, but the practical significance is moot if
> vfio_pci_liveupdate_retrieve() failure is catastrophic anyways?
pci_flb_retrieve() is used during boot. If it fails, we risk DMA
corrupting any memory region, so a panic makes sense. In contrast,
this retrieval happens once we are already in userspace, allowing the
user to decide how to handle the failure to recover the preserved
cdev.
Pasha
>
> > +
> > + ser = folio_address(folio);
> > +
> > + device = vfio_find_device(ser, match_device);
> > + if (!device)
> > + return -ENODEV;
> > +
> > + /*
> > + * During a Live Update userspace retrieves preserved VFIO cdev files by
> > + * issuing an ioctl on /dev/liveupdate rather than by opening VFIO
> > + * character devices.
> > + *
> > + * To handle that scenario, this routine simulates opening the VFIO
> > + * character device for userspace with an anonymous inode. The returned
> > + * file has the same properties as a cdev file (e.g. operations are
> > + * blocked until BIND_IOMMUFD is called), aside from the inode
> > + * association.
> > + */
> > + file = anon_inode_getfile_fmode("[vfio-device-liveupdate]",
> > + &vfio_device_fops, NULL,
> > + O_RDWR, FMODE_PREAD | FMODE_PWRITE);
> > +
> > + if (IS_ERR(file)) {
> > + ret = PTR_ERR(file);
> > + goto out;
> > + }
> > +
> > + ret = __vfio_device_fops_cdev_open(device, file);
> > + if (ret) {
> > + fput(file);
> > + goto out;
> > + }
> > +
> > + args->file = file;
> > +
> > +out:
> > + /* Drop the reference from vfio_find_device() */
> > + put_device(&device->device);
> > +
> > + return ret;
> > +}
On Wed, Dec 3, 2025 at 7:46 AM Pasha Tatashin <pasha.tatashin@soleen.com> wrote:
>
> On Wed, Dec 3, 2025 at 7:55 AM Alex Mastro <amastro@fb.com> wrote:
> >
> > On Wed, Nov 26, 2025 at 07:35:53PM +0000, David Matlack wrote:
> > > From: Vipin Sharma <vipinsh@google.com>
> > > static int vfio_pci_liveupdate_retrieve(struct liveupdate_file_op_args *args)
> > > {
> > > - return -EOPNOTSUPP;
> > > + struct vfio_pci_core_device_ser *ser;
> > > + struct vfio_device *device;
> > > + struct folio *folio;
> > > + struct file *file;
> > > + int ret;
> > > +
> > > + folio = kho_restore_folio(args->serialized_data);
> > > + if (!folio)
> > > + return -ENOENT;
> >
> > Should this be consistent with the behavior of pci_flb_retrieve() which panics
> > on failure? The short circuit failure paths which follow leak the folio,
Thanks for catching the leaked folio. I'll fix that in the next version.
> > which seems like a hygiene issue, but the practical significance is moot if
> > vfio_pci_liveupdate_retrieve() failure is catastrophic anyways?
>
> pci_flb_retrieve() is used during boot. If it fails, we risk DMA
> corrupting any memory region, so a panic makes sense. In contrast,
> this retrieval happens once we are already in userspace, allowing the
> user to decide how to handle the failure to recover the preserved
> cdev.
This is what I was thinking as well. vfio_pci_liveupdate_retrieve()
runs in the context of the ioctl LIVEUPDATE_SESSION_RETRIEVE_FD, so we
can just return an error up to userspace if anything goes wrong and
let userspace initiate the reboot to recover the device if/when it's
ready.
OTOH, pci_flb_retrieve() gets called by the kernel during early boot
to determine what devices the previous kernel preserved. If the kernel
can't determine which devices were preserved by the previous kernel
and once the kernel starts preserving I/O page tables, that could lead
to corruption, so panicking is warranted.
On Wed, Dec 03, 2025 at 09:29:27AM -0800, David Matlack wrote:
> On Wed, Dec 3, 2025 at 7:46 AM Pasha Tatashin <pasha.tatashin@soleen.com> wrote:
> >
> > On Wed, Dec 3, 2025 at 7:55 AM Alex Mastro <amastro@fb.com> wrote:
> > >
> > > On Wed, Nov 26, 2025 at 07:35:53PM +0000, David Matlack wrote:
> > > > From: Vipin Sharma <vipinsh@google.com>
> > > > static int vfio_pci_liveupdate_retrieve(struct liveupdate_file_op_args *args)
> > > > {
> > > > - return -EOPNOTSUPP;
> > > > + struct vfio_pci_core_device_ser *ser;
> > > > + struct vfio_device *device;
> > > > + struct folio *folio;
> > > > + struct file *file;
> > > > + int ret;
> > > > +
> > > > + folio = kho_restore_folio(args->serialized_data);
> > > > + if (!folio)
> > > > + return -ENOENT;
> > >
> > > Should this be consistent with the behavior of pci_flb_retrieve() which panics
> > > on failure? The short circuit failure paths which follow leak the folio,
>
> Thanks for catching the leaked folio. I'll fix that in the next version.
>
> > > which seems like a hygiene issue, but the practical significance is moot if
> > > vfio_pci_liveupdate_retrieve() failure is catastrophic anyways?
> >
> > pci_flb_retrieve() is used during boot. If it fails, we risk DMA
> > corrupting any memory region, so a panic makes sense. In contrast,
> > this retrieval happens once we are already in userspace, allowing the
> > user to decide how to handle the failure to recover the preserved
> > cdev.
>
> This is what I was thinking as well. vfio_pci_liveupdate_retrieve()
> runs in the context of the ioctl LIVEUPDATE_SESSION_RETRIEVE_FD, so we
> can just return an error up to userspace if anything goes wrong and
> let userspace initiate the reboot to recover the device if/when it's
> ready.
>
> OTOH, pci_flb_retrieve() gets called by the kernel during early boot
> to determine what devices the previous kernel preserved. If the kernel
> can't determine which devices were preserved by the previous kernel
> and once the kernel starts preserving I/O page tables, that could lead
> to corruption, so panicking is warranted.
Makes sense, thanks for elaborating, David and Pasha.
© 2016 - 2026 Red Hat, Inc.