Now actually implementing the serialise callback for iommufd.
On KHO activate, iterate through all persisted domains and write their
metadata to the device tree format. For now just a few fields are
serialised to demonstrate the concept. To actually make this useful a
lot more field and related objects will need to be serialised too.
---
drivers/iommu/iommufd/iommufd_private.h | 2 +
drivers/iommu/iommufd/main.c | 2 +-
drivers/iommu/iommufd/serialise.c | 81 ++++++++++++++++++++++++-
3 files changed, 81 insertions(+), 4 deletions(-)
diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h
index a26728646a22..ad8d180269bd 100644
--- a/drivers/iommu/iommufd/iommufd_private.h
+++ b/drivers/iommu/iommufd/iommufd_private.h
@@ -18,6 +18,8 @@ struct iommu_group;
struct iommu_option;
struct iommufd_device;
+extern struct xarray persistent_iommufds;
+
struct iommufd_ctx {
struct file *file;
struct xarray objects;
diff --git a/drivers/iommu/iommufd/main.c b/drivers/iommu/iommufd/main.c
index fa4f0fe336ad..21a7e1ad40d1 100644
--- a/drivers/iommu/iommufd/main.c
+++ b/drivers/iommu/iommufd/main.c
@@ -30,7 +30,7 @@ struct iommufd_object_ops {
static const struct iommufd_object_ops iommufd_object_ops[];
static struct miscdevice vfio_misc_dev;
-static DEFINE_XARRAY_ALLOC(persistent_iommufds);
+DEFINE_XARRAY_ALLOC(persistent_iommufds);
struct iommufd_object *_iommufd_object_alloc(struct iommufd_ctx *ictx,
size_t size,
diff --git a/drivers/iommu/iommufd/serialise.c b/drivers/iommu/iommufd/serialise.c
index 6e8bcc384771..6b4c306dce40 100644
--- a/drivers/iommu/iommufd/serialise.c
+++ b/drivers/iommu/iommufd/serialise.c
@@ -1,19 +1,94 @@
// SPDX-License-Identifier: GPL-2.0-only
#include <linux/kexec.h>
+#include <linux/libfdt.h>
#include "iommufd_private.h"
+#include "io_pagetable.h"
+
+/**
+ * Serialised format:
+ * /iommufd
+ * compatible = "iommufd-v0",
+ * iommufds = [
+ * persistent_id = {
+ * account_mode = u8
+ * ioases = [
+ * {
+ * areas = [
+ * ]
+ * }
+ * ]
+ * }
+ * ]
+ */
+static int serialise_iommufd(void *fdt, struct iommufd_ctx *ictx)
+{
+ int err = 0;
+ char name[24];
+ struct iommufd_object *obj;
+ unsigned long obj_idx;
+
+ snprintf(name, sizeof(name), "%lu", ictx->persistent_id);
+ err |= fdt_begin_node(fdt, name);
+ err |= fdt_begin_node(fdt, "ioases");
+ xa_for_each(&ictx->objects, obj_idx, obj) {
+ struct iommufd_ioas *ioas;
+ struct iopt_area *area;
+ int area_idx = 0;
+
+ if (obj->type != IOMMUFD_OBJ_IOAS)
+ continue;
+
+ ioas = (struct iommufd_ioas *) obj;
+ snprintf(name, sizeof(name), "%lu", obj_idx);
+ err |= fdt_begin_node(fdt, name);
+
+ for (area = iopt_area_iter_first(&ioas->iopt, 0, ULONG_MAX); area;
+ area = iopt_area_iter_next(area, 0, ULONG_MAX)) {
+ unsigned long iova_start, iova_len;
+
+ snprintf(name, sizeof(name), "%i", area_idx);
+ err |= fdt_begin_node(fdt, name);
+ iova_start = iopt_area_iova(area);
+ iova_len = iopt_area_length(area);
+ err |= fdt_property(fdt, "iova-start",
+ &iova_start, sizeof(iova_start));
+ err |= fdt_property(fdt, "iova-len",
+ &iova_len, sizeof(iova_len));
+ err |= fdt_property(fdt, "iommu-prot",
+ &area->iommu_prot, sizeof(area->iommu_prot));
+ err |= fdt_end_node(fdt); /* area_idx */
+ ++area_idx;
+ }
+ err |= fdt_end_node(fdt); /* ioas obj_idx */
+ }
+ err |= fdt_end_node(fdt); /* ioases*/
+ err |= fdt_end_node(fdt); /* ictx->persistent_id */
+ return 0;
+}
int iommufd_serialise_kho(struct notifier_block *self, unsigned long cmd,
void *fdt)
{
- pr_info("would serialise here\n");
+ static const char compatible[] = "iommufd-v0";
+ struct iommufd_ctx *ictx;
+ unsigned long xa_idx;
+ int err = 0;
+
switch (cmd) {
case KEXEC_KHO_ABORT:
/* Would do serialise rollback here. */
return NOTIFY_DONE;
case KEXEC_KHO_DUMP:
- /* Would do serialise here. */
- return NOTIFY_DONE;
+ err |= fdt_begin_node(fdt, "iommufd");
+ fdt_property(fdt, "compatible", compatible, sizeof(compatible));
+ err |= fdt_begin_node(fdt, "iommufds");
+ xa_for_each(&persistent_iommufds, xa_idx, ictx) {
+ err |= serialise_iommufd(fdt, ictx);
+ }
+ err |= fdt_end_node(fdt); /* iommufds */
+ err |= fdt_end_node(fdt); /* iommufd */
+ return err? NOTIFY_BAD : NOTIFY_DONE;
default:
return NOTIFY_BAD;
}
--
2.34.1
Hi James, On Mon, 16 Sep 2024 13:30:54 +0200 James Gowans <jgowans@amazon.com> wrote: > +static int serialise_iommufd(void *fdt, struct iommufd_ctx *ictx) > +{ > + int err = 0; > + char name[24]; > + struct iommufd_object *obj; > + unsigned long obj_idx; > + > + snprintf(name, sizeof(name), "%lu", ictx->persistent_id); > + err |= fdt_begin_node(fdt, name); > + err |= fdt_begin_node(fdt, "ioases"); > + xa_for_each(&ictx->objects, obj_idx, obj) { > + struct iommufd_ioas *ioas; > + struct iopt_area *area; > + int area_idx = 0; > + > + if (obj->type != IOMMUFD_OBJ_IOAS) > + continue; I was wondering how device state persistency is managed here. Is it correct to assume that all devices bound to an iommufd context should be persistent? If so, should we be serializing IOMMUFD_OBJ_DEVICE as well? I'm considering this from the perspective of user mode drivers, including those that use noiommu mode (need to be added to iommufd cdev). In this scenario, we only need to maintain the device states persistently without IOAS.
Hi James, Just a gentle reminder. Let me also explain the problem we are trying to solve for the live update of OpenHCL paravisor[1]. OpenHCL has user space drivers based on VFIO noiommu mode, we are in the process of converting to iommufd cdev. Similarly, running DMA continuously across updates is required, but unlike your case, OpenHCL updates do not involve preserving the IO page tables in that it is managed by the hypervisor which is not part of the update. It seems reasonable to share the device persistence code path with the plan laid out in your cover letter. IOAS code path will be different since noiommu option does not have IOAS. If we were to revive noiommu support for iommufd cdev[2], can we use the persistent iommufd context to allow device persistence? Perhaps through IOMMUFD_OBJ_DEVICE and IOMMUFD_OBJ_ACCESS(used in [2])? @David, @Jason, @Alex, @Yi, any comments or suggestions? Thanks, Jacob 1. (openvmm/Guide/src/reference/architecture/openhcl.md at main · microsoft/openvmm. 2. [PATCH v11 00/23] Add vfio_device cdev for iommufd support - Yi Liu On Wed, 16 Oct 2024 15:20:47 -0700 Jacob Pan <jacob.pan@linux.microsoft.com> wrote: > Hi James, > > On Mon, 16 Sep 2024 13:30:54 +0200 > James Gowans <jgowans@amazon.com> wrote: > > > +static int serialise_iommufd(void *fdt, struct iommufd_ctx *ictx) > > +{ > > + int err = 0; > > + char name[24]; > > + struct iommufd_object *obj; > > + unsigned long obj_idx; > > + > > + snprintf(name, sizeof(name), "%lu", ictx->persistent_id); > > + err |= fdt_begin_node(fdt, name); > > + err |= fdt_begin_node(fdt, "ioases"); > > + xa_for_each(&ictx->objects, obj_idx, obj) { > > + struct iommufd_ioas *ioas; > > + struct iopt_area *area; > > + int area_idx = 0; > > + > > + if (obj->type != IOMMUFD_OBJ_IOAS) > > + continue; > I was wondering how device state persistency is managed here. Is it > correct to assume that all devices bound to an iommufd context should > be persistent? If so, should we be serializing IOMMUFD_OBJ_DEVICE as > well? > > I'm considering this from the perspective of user mode drivers, > including those that use noiommu mode (need to be added to iommufd > cdev). In this scenario, we only need to maintain the device states > persistently without IOAS.
Hi Jacob, apologies for the late reply. On Mon, 2024-10-28 at 09:03 -0700, Jacob Pan wrote: > Hi James, > > Just a gentle reminder. Let me also explain the problem we are trying > to solve for the live update of OpenHCL paravisor[1]. OpenHCL has user > space drivers based on VFIO noiommu mode, we are in the process of > converting to iommufd cdev. > > Similarly, running DMA continuously across updates is required, but > unlike your case, OpenHCL updates do not involve preserving the IO page > tables in that it is managed by the hypervisor which is not part of the > update. > > It seems reasonable to share the device persistence code path > with the plan laid out in your cover letter. IOAS code path will be > different since noiommu option does not have IOAS. > > If we were to revive noiommu support for iommufd cdev[2], can we use > the persistent iommufd context to allow device persistence? Perhaps > through IOMMUFD_OBJ_DEVICE and IOMMUFD_OBJ_ACCESS(used in [2])? > > @David, @Jason, @Alex, @Yi, any comments or suggestions? IIRC we did discuss this device persistence use case with some of your colleagues at Linux Plumbers. Adding Jinank to this thread as he was part of the conversation too. Yes, I think the guidance was to bind a device to iommufd in noiommu mode. It does seem a bit weird to use iommufd with noiommu, but we agreed it's the best/simplest way to get the functionality. Then as you suggest below the IOMMUFD_OBJ_DEVICE would be serialised too in some way, probably by iommufd telling the PCI layer that this device must be persistent and hence not to re-probe it on kexec. I think this would get you what you want? Specifically you want to make sure that the device is not reset during kexec so it can keep running? And some handle for userspace to pick it up again and continue interacting with it after kexec. It's all a bit hand wavy at the moment, but something along those lines probably makes sense. I need to work on rev2 of this RFC as per Jason's feedback in the other thread. Rev2 will make the restore path more userspace driven, with fresh iommufd and pgtables objects being created and then atomically swapped over too. I'll also get the PCI layer involved with rev2. Once that's out (it'll be a few weeks as I'm on leave) then let's take a look at how the noiommu device persistence case would fit in. JG > > > Thanks, > > Jacob > > 1. (openvmm/Guide/src/reference/architecture/openhcl.md at main · > microsoft/openvmm. > 2. [PATCH v11 00/23] Add vfio_device cdev for > iommufd support - Yi Liu > > On Wed, 16 Oct 2024 15:20:47 -0700 Jacob Pan > <jacob.pan@linux.microsoft.com> wrote: > > > Hi James, > > > > On Mon, 16 Sep 2024 13:30:54 +0200 > > James Gowans <jgowans@amazon.com> wrote: > > > > > +static int serialise_iommufd(void *fdt, struct iommufd_ctx *ictx) > > > +{ > > > + int err = 0; > > > + char name[24]; > > > + struct iommufd_object *obj; > > > + unsigned long obj_idx; > > > + > > > + snprintf(name, sizeof(name), "%lu", ictx->persistent_id); > > > + err |= fdt_begin_node(fdt, name); > > > + err |= fdt_begin_node(fdt, "ioases"); > > > + xa_for_each(&ictx->objects, obj_idx, obj) { > > > + struct iommufd_ioas *ioas; > > > + struct iopt_area *area; > > > + int area_idx = 0; > > > + > > > + if (obj->type != IOMMUFD_OBJ_IOAS) > > > + continue; > > I was wondering how device state persistency is managed here. Is it > > correct to assume that all devices bound to an iommufd context should > > be persistent? If so, should we be serializing IOMMUFD_OBJ_DEVICE as > > well? > > > > I'm considering this from the perspective of user mode drivers, > > including those that use noiommu mode (need to be added to iommufd > > cdev). In this scenario, we only need to maintain the device states > > persistently without IOAS. >
On Sat, Nov 02, 2024 at 10:22:54AM +0000, Gowans, James wrote: > Yes, I think the guidance was to bind a device to iommufd in noiommu > mode. It does seem a bit weird to use iommufd with noiommu, but we > agreed it's the best/simplest way to get the functionality. noiommu should still have an ioas and still have kernel managed page pinning. My remark to bring it to iommufd was to also make it a fully architected feature and stop relying on mprotect and /proc/ tricks. > Then as you suggest below the IOMMUFD_OBJ_DEVICE would be serialised > too in some way, probably by iommufd telling the PCI layer that this > device must be persistent and hence not to re-probe it on kexec. Presumably VFIO would be doing some/most of this part since it is the driver that will be binding? > It's all a bit hand wavy at the moment, but something along those lines > probably makes sense. I need to work on rev2 of this RFC as per Jason's > feedback in the other thread. Rev2 will make the restore path more > userspace driven, with fresh iommufd and pgtables objects being created > and then atomically swapped over too. I'll also get the PCI layer > involved with rev2. Once that's out (it'll be a few weeks as I'm on > leave) then let's take a look at how the noiommu device persistence case > would fit in. In a certain sense it would be nice to see the noiommu flow as it breaks apart the problem into the first dependency: How to get the device handed across the kexec and safely land back in VFIO, and only VFIO's hands. Preserving the iommu HW configuration is an incremental step built on that base line. Also, FWIW, this needs to follow good open source practices - we need an open userspace for the feature and the kernel stuff should be merged in a logical order. Jason
Hi Jason, On Mon, 4 Nov 2024 09:00:11 -0400 Jason Gunthorpe <jgg@ziepe.ca> wrote: > On Sat, Nov 02, 2024 at 10:22:54AM +0000, Gowans, James wrote: > > > Yes, I think the guidance was to bind a device to iommufd in noiommu > > mode. It does seem a bit weird to use iommufd with noiommu, but we > > agreed it's the best/simplest way to get the functionality. > > noiommu should still have an ioas and still have kernel managed page > pinning. > > My remark to bring it to iommufd was to also make it a fully > architected feature and stop relying on mprotect and /proc/ tricks. > Just to clarify my tentative understanding with more details(please correct): 1. create an iommufd access object for noiommu device when binding to an iommufd ctx. 2. all user memory used by the device under noiommu mode should be pinned by iommufd, i.e. iommufd_access_pin_pages(). I guess you meant stop doing mlock instead of mprotect trick? I think openHCL is using /dev/mem trick. 3. ioas can be attched to the noiommu iommufd_access object, similar to emulated device, mdev. What kind/source of memory should be supported here? e.g. device meory regions exposed by PCI BARs. > > Then as you suggest below the IOMMUFD_OBJ_DEVICE would be serialised > > too in some way, probably by iommufd telling the PCI layer that this > > device must be persistent and hence not to re-probe it on kexec. > > Presumably VFIO would be doing some/most of this part since it is the > driver that will be binding? > Yes, it is the user mode driver that initiates the binding. I was thinking since the granularity for persistency is per iommufd ctx, the VFIO device flag to mark keep_alive can come from iommufd ctx. > > It's all a bit hand wavy at the moment, but something along those > > lines probably makes sense. I need to work on rev2 of this RFC as > > per Jason's feedback in the other thread. Rev2 will make the > > restore path more userspace driven, with fresh iommufd and pgtables > > objects being created and then atomically swapped over too. I'll > > also get the PCI layer involved with rev2. Once that's out (it'll > > be a few weeks as I'm on leave) then let's take a look at how the > > noiommu device persistence case would fit in. > > In a certain sense it would be nice to see the noiommu flow as it > breaks apart the problem into the first dependency: > > How to get the device handed across the kexec and safely land back in > VFIO, and only VFIO's hands. > > Preserving the iommu HW configuration is an incremental step built on > that base line. Makes sense, I need to catch up on the KHO series and hook up noiommu at the first step. > Also, FWIW, this needs to follow good open source practices - we need > an open userspace for the feature and the kernel stuff should be > merged in a logical order. > Yes, we will have matching userspace in openHCL https://github.com/microsoft/openvmm Thanks, Jacob
On Mon, Sep 16, 2024 at 01:30:54PM +0200, James Gowans wrote: > Now actually implementing the serialise callback for iommufd. > On KHO activate, iterate through all persisted domains and write their > metadata to the device tree format. For now just a few fields are > serialised to demonstrate the concept. To actually make this useful a > lot more field and related objects will need to be serialised too. But isn't that a rather difficult problem? The "a lot more fields" include things like pointers to the mm struct, the user_struct and task_struct, then all the pinning accounting as well. Coming work extends this to memfds and more is coming. I would expect this KHO stuff to use the memfd-like path to access the physical VM memory too. I think expecting to serialize and restore everything like this is probably much too complicated. If you could just retain a small portion and then directly reconstruct the missing parts it seems like it would be more maintainable. Ie "recover" a HWPT from a KHO on a manually created a IOAS with the right "memfd" for the backing storage. Then the recovery can just validate that things are correct and adopt the iommu_domain as the hwpt. Eventually you'll want this to work for the viommus as well, and that seems like a lot more tricky complexity.. Jason
On Wed, 2024-10-02 at 15:55 -0300, Jason Gunthorpe wrote: > On Mon, Sep 16, 2024 at 01:30:54PM +0200, James Gowans wrote: > > Now actually implementing the serialise callback for iommufd. > > On KHO activate, iterate through all persisted domains and write their > > metadata to the device tree format. For now just a few fields are > > serialised to demonstrate the concept. To actually make this useful a > > lot more field and related objects will need to be serialised too. > > But isn't that a rather difficult problem? The "a lot more fields" > include things like pointers to the mm struct, the user_struct and > task_struct, then all the pinning accounting as well. > > Coming work extends this to memfds and more is coming. I would expect > this KHO stuff to use the memfd-like path to access the physical VM > memory too. > > I think expecting to serialize and restore everything like this is > probably much too complicated. On reflection I think you're right - this will be complex both from a development and a maintenance perspective, trying to make sure we serialise all the necessary state and reconstruct it correctly. Even more complex when structs are refactored/changed across kernel versions. An important requirement of this functionality is the ability to kexec between different kernel versions including going back to an older kernel version in the case of a rollback. So, let's look at other options: > > If you could just retain a small portion and then directly reconstruct > the missing parts it seems like it would be more maintainable. I think we have two other possible approaches here: 1. What this RFC is sketching out, serialising fields from the structs and setting those fields again on deserialise. As you point out this will be complicated. 2. Get userspace to do the work: userspace needs to re-do the ioctls after kexec to reconstruct the objects. My main issue with this approach is that the kernel needs to do some sort of trust but verify approach to ensure that userspace constructs everything the same way after kexec as it was before kexec. We don't want to end up in a state where the iommufd objects don't match the persisted page tables. 3. Serialise and reply the ioctls. Ioctl APIs and payloads should (must?) be stable across kernel versions. If IOMMUFD records the ioctls executed by userspace then it could replay them as part of deserialise and give userspace a handle to the resulting objects after kexec. This way we are guaranteed consistent iommufd / IOAS objects. By "consistent" I mean they are the same as before kexec and match the persisted page tables. By having the kernel do this it means it doesn't need to depend on userspace doing the correct thing. What do you think of this 3rd approach? I can try to sketch it out and send another RFC if you think it sounds reasonable. > > Ie "recover" a HWPT from a KHO on a manually created a IOAS with the > right "memfd" for the backing storage. Then the recovery can just > validate that things are correct and adopt the iommu_domain as the > hwpt. This sounds more like option 2 where we expect userspace to re-drive the ioctls, but verify that they have corresponding payloads as before kexec so that iommufd objects are consistent with persisted page tables. If the kernel is doing verification wouldn't it be better for the kernel to do the ioctl work itself and give the resulting objects to userspace? > > Eventually you'll want this to work for the viommus as well, and that > seems like a lot more tricky complexity.. > > Jason
On Mon, Oct 07, 2024 at 08:39:53AM +0000, Gowans, James wrote: > 2. Get userspace to do the work: userspace needs to re-do the ioctls > after kexec to reconstruct the objects. My main issue with this approach > is that the kernel needs to do some sort of trust but verify approach to > ensure that userspace constructs everything the same way after kexec as > it was before kexec. We don't want to end up in a state where the > iommufd objects don't match the persisted page tables. I think the verification is not so bad, it is just extracting the physical addresses from the IOAS and comparing to what is stored in the iommu_domain. If they don't match then the domain can't be adopted to the IOAS. We actually don't care about anything else, if userspace creates different objects with different parameters who cares? All that matters is that the radix tree contains the same expected information. > What do you think of this 3rd approach? I can try to sketch it out and > send another RFC if you think it sounds reasonable. I think it is the same problem, just in a more maintainable wrapper. You still have to serialize lots and lots of different objects and their relationships. > > Ie "recover" a HWPT from a KHO on a manually created a IOAS with the > > right "memfd" for the backing storage. Then the recovery can just > > validate that things are correct and adopt the iommu_domain as the > > hwpt. > > This sounds more like option 2 where we expect userspace to re-drive the > ioctls, but verify that they have corresponding payloads as before kexec > so that iommufd objects are consistent with persisted page tables. Yes > If the kernel is doing verification wouldn't it be better for the kernel > to do the ioctl work itself and give the resulting objects to > userspace? No :) It is so much easier to validate the IOPTEs in a radix tree. At the very worst you just create a HWPT and iommu_domain for validation, do validation and then throw it away. Compare for two radix trees is about 50 lines in generic pt, I have it already. Jason
On Mon, 2024-10-07 at 08:39 +0000, Gowans, James wrote: > > I think we have two other possible approaches here: > > 1. What this RFC is sketching out, serialising fields from the structs > and setting those fields again on deserialise. As you point out this > will be complicated. > > 2. Get userspace to do the work: userspace needs to re-do the ioctls > after kexec to reconstruct the objects. My main issue with this approach > is that the kernel needs to do some sort of trust but verify approach to > ensure that userspace constructs everything the same way after kexec as > it was before kexec. We don't want to end up in a state where the > iommufd objects don't match the persisted page tables. To what extent does the kernel really need to trust or verify? At LPC we seemed to speak of a model where userspace builds a "new" address space for each device and then atomically switches to the new page tables instead of the original ones inherited from the previous kernel. That does involve having space for another set of page tables, of course, but that's not impossible. > 3. Serialise and reply the ioctls. Ioctl APIs and payloads should > (must?) be stable across kernel versions. If IOMMUFD records the ioctls > executed by userspace then it could replay them as part of deserialise > and give userspace a handle to the resulting objects after kexec. This > way we are guaranteed consistent iommufd / IOAS objects. By "consistent" > I mean they are the same as before kexec and match the persisted page > tables. By having the kernel do this it means it doesn't need to depend > on userspace doing the correct thing. > > What do you think of this 3rd approach? I can try to sketch it out and > send another RFC if you think it sounds reasonable.
On Mon, Oct 07, 2024 at 09:47:53AM +0100, David Woodhouse wrote: > On Mon, 2024-10-07 at 08:39 +0000, Gowans, James wrote: > > > > I think we have two other possible approaches here: > > > > 1. What this RFC is sketching out, serialising fields from the structs > > and setting those fields again on deserialise. As you point out this > > will be complicated. > > > > 2. Get userspace to do the work: userspace needs to re-do the ioctls > > after kexec to reconstruct the objects. My main issue with this approach > > is that the kernel needs to do some sort of trust but verify approach to > > ensure that userspace constructs everything the same way after kexec as > > it was before kexec. We don't want to end up in a state where the > > iommufd objects don't match the persisted page tables. > > To what extent does the kernel really need to trust or verify? If iommufd is going to adopt an existing iommu_domain then that iommu_domain must have exactly the IOPTEs it expects it to have otherwise there will be functional problems in iommufd. So, IMHO, some kind of validation would be needed to ensure that userspace has created the same structure as the old kernel had. >At LPC we seemed to speak of a model where userspace builds a "new" > address space for each device and then atomically switches to the > new page tables instead of the original ones inherited from the > previous kernel. The hitless replace model would leave the old translation in place while userspace builds up a replacement translation that is equivalent. Then hitless replace would adopt the new translation and we discard the old ones memory. IMHO this is easiest to make correct and least maintenance burden because the only kernel thing you are asking for in iommufd is hitless iommu_domain replace, which we already want to add to the drivers anyhow. (ARM already has it) Jason
On Mon, 2024-10-07 at 09:47 +0100, David Woodhouse wrote: > On Mon, 2024-10-07 at 08:39 +0000, Gowans, James wrote: > > > > I think we have two other possible approaches here: > > > > 1. What this RFC is sketching out, serialising fields from the structs > > and setting those fields again on deserialise. As you point out this > > will be complicated. > > > > 2. Get userspace to do the work: userspace needs to re-do the ioctls > > after kexec to reconstruct the objects. My main issue with this approach > > is that the kernel needs to do some sort of trust but verify approach to > > ensure that userspace constructs everything the same way after kexec as > > it was before kexec. We don't want to end up in a state where the > > iommufd objects don't match the persisted page tables. > > To what extent does the kernel really need to trust or verify? At LPC > we seemed to speak of a model where userspace builds a "new" address > space for each device and then atomically switches to the new page > tables instead of the original ones inherited from the previous kernel. > > That does involve having space for another set of page tables, of > course, but that's not impossible. The idea of constructing fresh page tables and then swapping over to that is indeed appealing, but I don't know if that's always possible. With the ARM SMMUv3 for example I think there are break-before-make requirement, so is it possible to do an atomic switch of the SMMUv3 page table PGD in a hitless way? Everything here must be hitless - serialise and deserialise must not cause any DMA faults. If it's not possible to do a hitless atomic switch (I am unsure about this, need to RTFM) then we're compelled to re-use the existing page tables and if that's the case I think the kernel MUST ensure that the iommufd IOAS object exactly match the ones before kexec. I can imagine all sorts of mess if those go out of sync!
On Mon, Oct 07, 2024 at 08:57:07AM +0000, Gowans, James wrote: > With the ARM SMMUv3 for example I think there are break-before-make > requirement, so is it possible to do an atomic switch of the SMMUv3 page > table PGD in a hitless way? The BBM rules are only about cached translations. If all your IOPTEs result in the same translation *size* then you are safe. You can change the radix memory storing the IOPTEs freely, AFAIK. BBM level 2 capable HW doesn't have those limitations either and everything is possible. Jason
On Mon, 2024-10-07 at 12:01 -0300, Jason Gunthorpe wrote: > On Mon, Oct 07, 2024 at 08:57:07AM +0000, Gowans, James wrote: > > With the ARM SMMUv3 for example I think there are break-before-make > > requirement, so is it possible to do an atomic switch of the SMMUv3 page > > table PGD in a hitless way? > > The BBM rules are only about cached translations. If all your IOPTEs > result in the same translation *size* then you are safe. You can > change the radix memory storing the IOPTEs freely, AFAIK. Okay, but in general this still means that the page tables must have exactly the same translations if we try to switch from one set to another. If it is possible to change translations then translation table entries could be created at different granularity (PTE, PMD, PUD) level which would violate this requirement. It's also possible for different IOMMU driver versions to set up the the same translations, but at different page table levels. Perhaps an older version did not coalesce come PTEs, but a newer version does coalesce. Would the same translations but at a different size violate BBM? If we say that to be safe/correct in the general case then it is necessary for the translations to be *exactly* the same before and after kexec, is there any benefit to building new translation tables and switching to them? We may as well continue to use the exact same page tables and construct iommufd objects (IOAS, etc) to match. There is also a performance consideration here: when doing live update every millisecond of down time matters. I'm not sure if this iommufd re- initialisation will end up being in the hot path of things that need to be done before the VM can start running again. If it in the hot path then it would be useful to avoid rebuilding identical tables. Maybe it ends up being in the "warm" path - the VM can start running but will sleep if taking a page fault before IOMMUFD is re-initalised... So overall my point is that I think we end up with a requirement that the pgtables are identical before and after kexec so there is any benefit in rebuilding them? JG
On Wed, Oct 09, 2024 at 11:44:30AM +0000, Gowans, James wrote: > Okay, but in general this still means that the page tables must have > exactly the same translations if we try to switch from one set to > another. If it is possible to change translations then translation table > entries could be created at different granularity (PTE, PMD, PUD) level > which would violate this requirement. Yes, but we strive to make page tables consistently and it isn't that often that we get new features that would chang the layout (contig bit for instance). I'd suggest in these cases you'd add some creation flag to the HWPT that can inhibit the new feature and your VMM will deal with it. Or you sweep it and manually split/join to deal with BBM < level 2. Generic pt will have code to do all of this so it is not that bad. If this little issue already scares you then I don't think I want to see you serialize anything more complex, there are endless scenarios for compatibility problems :\ > It's also possible for different IOMMU driver versions to set up the the > same translations, but at different page table levels. Perhaps an older > version did not coalesce come PTEs, but a newer version does coalesce. > Would the same translations but at a different size violate BBM? Yes, that is the only thing that violates BBM. > If we say that to be safe/correct in the general case then it is > necessary for the translations to be *exactly* the same before and after > kexec, is there any benefit to building new translation tables and > switching to them? We may as well continue to use the exact same page > tables and construct iommufd objects (IOAS, etc) to match. The benifit is principally that you did all the machinery to get up to that point, including re-pinning and so forth all the memory, instead of trying to magically recover that additional state. This is the philosophy that you replay instead of de-serialize, so you have to replay into a page table at some level to make that work. > There is also a performance consideration here: when doing live update > every millisecond of down time matters. I'm not sure if this iommufd re- > initialisation will end up being in the hot path of things that need to > be done before the VM can start running again. As we talked about in the session, your KVM can start running immediately, you don't need iommufd to be fully setup. You only need iommufd fully working again if you intend to do certain operations, like memory hotplug or something that requires an address map change. So you can operate in a degraded state that is largely invisible to the guest while recovering this stuff. It shouldn't be on your critical path. > then it would be useful to avoid rebuilding identical tables. Maybe it > ends up being in the "warm" path - the VM can start running but will > sleep if taking a page fault before IOMMUFD is re-initalised... I didn't think you'd support page faults? There are bigger issues here if you expect to have a vIOMMU in the guest. Jason
On Wed, 2024-10-09 at 09:28 -0300, Jason Gunthorpe wrote: > On Wed, Oct 09, 2024 at 11:44:30AM +0000, Gowans, James wrote: > > > Okay, but in general this still means that the page tables must have > > exactly the same translations if we try to switch from one set to > > another. If it is possible to change translations then translation table > > entries could be created at different granularity (PTE, PMD, PUD) level > > which would violate this requirement. > > Yes, but we strive to make page tables consistently and it isn't that > often that we get new features that would chang the layout (contig bit > for instance). I'd suggest in these cases you'd add some creation flag > to the HWPT that can inhibit the new feature and your VMM will deal > with it. > > Or you sweep it and manually split/join to deal with BBM < level > 2. Generic pt will have code to do all of this so it is not that bad. > > If this little issue already scares you then I don't think I want to > see you serialize anything more complex, there are endless scenarios > for compatibility problems :\ The things that scare me is some subtle page table difference which causes silent data corruption... This is one of the reasons I liked re- using the existing tables, there is no way for this sort of subtle bug to happen. In general I agree with your point about reducing the complexity of what we serialise and rather exercising the machinery again after kexec to create fresh objects. The only (?) counter point to that is about performance of anything in the hot path (discussed more below). > > > If we say that to be safe/correct in the general case then it is > > necessary for the translations to be *exactly* the same before and after > > kexec, is there any benefit to building new translation tables and > > switching to them? We may as well continue to use the exact same page > > tables and construct iommufd objects (IOAS, etc) to match. > > The benifit is principally that you did all the machinery to get up to > that point, including re-pinning and so forth all the memory, instead > of trying to magically recover that additional state. > > This is the philosophy that you replay instead of de-serialize, so you > have to replay into a page table at some level to make that work. We could have some "skip_pgtable_update" flag which the replay machinery sets, allowing IOMMUFD to create fresh objects internally and leave the page tables alone? Anyway, that sort of thing is a potential future optimisation we could do. In the interests of making progress I'll re-work this RFC to re- create everything (iommufd objects, IOMMU driver domains and page tables) and do the atomic handover of page tables after re- initialisation. > > > > then it would be useful to avoid rebuilding identical tables. Maybe it > > ends up being in the "warm" path - the VM can start running but will > > sleep if taking a page fault before IOMMUFD is re-initalised... > > I didn't think you'd support page faults? There are bigger issues here > if you expect to have a vIOMMU in the guest. vIOMMU is one case, but another is memory oversubscription. With PRI/ATS we can oversubscribe memory which is DMA mapped. In that case a page fault would be a blocking operation until IOMMUFD is all set up and ready to go. I suspect there will be benefit in getting this fast, but as long as we have a path to optimise it in future I'm totally fine to start with re-creating everything. JG
On Thu, Oct 10, 2024 at 03:12:09PM +0000, Gowans, James wrote: > > If this little issue already scares you then I don't think I want to > > see you serialize anything more complex, there are endless scenarios > > for compatibility problems :\ > > The things that scare me is some subtle page table difference which > causes silent data corruption... This is one of the reasons I liked re- > using the existing tables, there is no way for this sort of subtle bug > to happen. > > > If we say that to be safe/correct in the general case then it is > > > necessary for the translations to be *exactly* the same before and after > > > kexec, is there any benefit to building new translation tables and > > > switching to them? We may as well continue to use the exact same page > > > tables and construct iommufd objects (IOAS, etc) to match. > > > > The benifit is principally that you did all the machinery to get up to > > that point, including re-pinning and so forth all the memory, instead > > of trying to magically recover that additional state. > > > > This is the philosophy that you replay instead of de-serialize, so you > > have to replay into a page table at some level to make that work. > > We could have some "skip_pgtable_update" flag which the replay machinery > sets, allowing IOMMUFD to create fresh objects internally and leave the > page tables alone? The point made before was that iommufd hard depends on the content of the iommu_domain for correctness since it uses it as the storage for the PFNs. Making an assumption that the prior kernle domain matches what iommufd requires opens up the easy possibility of hypervisor kernel corruption. I think this is a bad direction.. You have to at least validate that userspace has set things up in a way that is consistent with the prior domain before adopting it. It would be easier to understand this if the performance costs of doing such a validation was more understood. Perhaps it can be optimized somehow. > > > then it would be useful to avoid rebuilding identical tables. Maybe it > > > ends up being in the "warm" path - the VM can start running but will > > > sleep if taking a page fault before IOMMUFD is re-initalised... > > > > I didn't think you'd support page faults? There are bigger issues here > > if you expect to have a vIOMMU in the guest. > > vIOMMU is one case, but another is memory oversubscription. With PRI/ATS > we can oversubscribe memory which is DMA mapped. In that case a page > fault would be a blocking operation until IOMMUFD is all set up and > ready to go. I suspect there will be benefit in getting this fast, but > as long as we have a path to optimise it in future I'm totally fine to > start with re-creating everything. Yes, this is true, but if you intend to do this kind of manipulation of the page table then it really should be in the exact format the new kernel is tested to understand. Expecting the new kernel to interwork with the old kernel's page table is likely to be OK, but also along the same lines of your fear there could be differences :\ Still, PRI/ATS for backing guests storage is a pretty advanced concept, we don't have support for that yet. Jason
© 2016 - 2024 Red Hat, Inc.