[RFC PATCH 08/15] iommu/riscv: Add IRQ domain for interrupt remapping

Andrew Jones posted 15 patches 1 week, 1 day ago
[RFC PATCH 08/15] iommu/riscv: Add IRQ domain for interrupt remapping
Posted by Andrew Jones 1 week, 1 day ago
This is just a skeleton. Until irq_set_vcpu_affinity() is
implemented the IRQ domain doesn't serve any purpose.

Signed-off-by: Andrew Jones <ajones@ventanamicro.com>
---
 drivers/iommu/riscv/Makefile   |   2 +-
 drivers/iommu/riscv/iommu-ir.c | 209 +++++++++++++++++++++++++++++++++
 drivers/iommu/riscv/iommu.c    |  43 ++++++-
 drivers/iommu/riscv/iommu.h    |  21 ++++
 4 files changed, 270 insertions(+), 5 deletions(-)
 create mode 100644 drivers/iommu/riscv/iommu-ir.c

diff --git a/drivers/iommu/riscv/Makefile b/drivers/iommu/riscv/Makefile
index f54c9ed17d41..8420dd1776cb 100644
--- a/drivers/iommu/riscv/Makefile
+++ b/drivers/iommu/riscv/Makefile
@@ -1,3 +1,3 @@
 # SPDX-License-Identifier: GPL-2.0-only
-obj-$(CONFIG_RISCV_IOMMU) += iommu.o iommu-platform.o
+obj-$(CONFIG_RISCV_IOMMU) += iommu.o iommu-ir.o iommu-platform.o
 obj-$(CONFIG_RISCV_IOMMU_PCI) += iommu-pci.o
diff --git a/drivers/iommu/riscv/iommu-ir.c b/drivers/iommu/riscv/iommu-ir.c
new file mode 100644
index 000000000000..c177e064b205
--- /dev/null
+++ b/drivers/iommu/riscv/iommu-ir.c
@@ -0,0 +1,209 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * IOMMU Interrupt Remapping
+ *
+ * Copyright © 2024 Ventana Micro Systems Inc.
+ */
+#include <linux/irqdomain.h>
+#include <linux/msi.h>
+
+#include "../iommu-pages.h"
+#include "iommu.h"
+
+static size_t riscv_iommu_ir_get_msipte_idx(struct riscv_iommu_domain *domain,
+					    phys_addr_t msi_pa)
+{
+	phys_addr_t addr = msi_pa >> 12;
+	size_t idx;
+
+	if (domain->group_index_bits) {
+		phys_addr_t group_mask = BIT(domain->group_index_bits) - 1;
+		phys_addr_t group_shift = domain->group_index_shift - 12;
+		phys_addr_t group = (addr >> group_shift) & group_mask;
+		phys_addr_t mask = domain->msiptp.msi_addr_mask & ~(group_mask << group_shift);
+
+		idx = addr & mask;
+		idx |= group << fls64(mask);
+	} else {
+		idx = addr & domain->msiptp.msi_addr_mask;
+	}
+
+	return idx;
+}
+
+static struct riscv_iommu_msipte *riscv_iommu_ir_get_msipte(struct riscv_iommu_domain *domain,
+							    phys_addr_t msi_pa)
+{
+	size_t idx;
+
+	if (((msi_pa >> 12) & ~domain->msiptp.msi_addr_mask) != domain->msiptp.msi_addr_pattern)
+		return NULL;
+
+	idx = riscv_iommu_ir_get_msipte_idx(domain, msi_pa);
+	return &domain->msi_root[idx];
+}
+
+static size_t riscv_iommu_ir_nr_msiptes(struct riscv_iommu_domain *domain)
+{
+	phys_addr_t base = domain->msiptp.msi_addr_pattern << 12;
+	phys_addr_t max_addr = base | (domain->msiptp.msi_addr_mask << 12);
+	size_t max_idx = riscv_iommu_ir_get_msipte_idx(domain, max_addr);
+
+	return max_idx + 1;
+}
+
+static struct irq_chip riscv_iommu_irq_chip = {
+	.name			= "IOMMU-IR",
+	.irq_mask		= irq_chip_mask_parent,
+	.irq_unmask		= irq_chip_unmask_parent,
+};
+
+static int riscv_iommu_irq_domain_alloc_irqs(struct irq_domain *irqdomain,
+					     unsigned int irq_base, unsigned int nr_irqs,
+					     void *arg)
+{
+	struct irq_data *data;
+	int i, ret;
+
+	ret = irq_domain_alloc_irqs_parent(irqdomain, irq_base, nr_irqs, arg);
+	if (ret)
+		return ret;
+
+	for (i = 0; i < nr_irqs; i++) {
+		data = irq_domain_get_irq_data(irqdomain, irq_base + i);
+		data->chip = &riscv_iommu_irq_chip;
+	}
+
+	return 0;
+}
+
+static const struct irq_domain_ops riscv_iommu_irq_domain_ops = {
+	.alloc = riscv_iommu_irq_domain_alloc_irqs,
+	.free = irq_domain_free_irqs_parent,
+};
+
+static const struct msi_parent_ops riscv_iommu_msi_parent_ops = {
+	.prefix			= "IR-",
+	.supported_flags	= MSI_GENERIC_FLAGS_MASK |
+				  MSI_FLAG_PCI_MSIX,
+	.required_flags		= MSI_FLAG_USE_DEF_DOM_OPS |
+				  MSI_FLAG_USE_DEF_CHIP_OPS,
+	.init_dev_msi_info	= msi_parent_init_dev_msi_info,
+};
+
+int riscv_iommu_irq_domain_create(struct riscv_iommu_domain *domain,
+				  struct device *dev)
+{
+	struct riscv_iommu_device *iommu = dev_to_iommu(dev);
+	struct fwnode_handle *fn;
+	char *fwname;
+
+	if (domain->irqdomain) {
+		dev_set_msi_domain(dev, domain->irqdomain);
+		return 0;
+	}
+
+	if (!(iommu->caps & RISCV_IOMMU_CAPABILITIES_MSI_FLAT)) {
+		dev_warn(iommu->dev, "Cannot enable interrupt remapping\n");
+		return 0;
+	}
+
+	spin_lock_init(&domain->msi_lock);
+	/*
+	 * TODO: The hypervisor should be in control of this size. For now
+	 * we just allocate enough space for 512 VCPUs.
+	 */
+	domain->msi_order = 1;
+	domain->msi_root = iommu_alloc_pages_node(domain->numa_node,
+						  GFP_KERNEL_ACCOUNT, domain->msi_order);
+	if (!domain->msi_root)
+		return -ENOMEM;
+
+	fwname = kasprintf(GFP_KERNEL, "IOMMU-IR-%s", dev_name(dev));
+	if (!fwname) {
+		iommu_free_pages(domain->msi_root, domain->msi_order);
+		return -ENOMEM;
+	}
+
+	fn = irq_domain_alloc_named_fwnode(fwname);
+	kfree(fwname);
+	if (!fn) {
+		dev_err(iommu->dev, "Couldn't allocate fwnode\n");
+		iommu_free_pages(domain->msi_root, domain->msi_order);
+		return -ENOMEM;
+	}
+
+	domain->irqdomain = irq_domain_create_hierarchy(dev_get_msi_domain(dev),
+							0, 0, fn,
+							&riscv_iommu_irq_domain_ops,
+							domain);
+	if (!domain->irqdomain) {
+		dev_err(iommu->dev, "Failed to create IOMMU irq domain\n");
+		iommu_free_pages(domain->msi_root, domain->msi_order);
+		irq_domain_free_fwnode(fn);
+		return -ENOMEM;
+	}
+
+	domain->irqdomain->flags |= IRQ_DOMAIN_FLAG_MSI_PARENT |
+				    IRQ_DOMAIN_FLAG_ISOLATED_MSI;
+	domain->irqdomain->msi_parent_ops = &riscv_iommu_msi_parent_ops;
+	irq_domain_update_bus_token(domain->irqdomain, DOMAIN_BUS_MSI_REMAP);
+	dev_set_msi_domain(dev, domain->irqdomain);
+
+	return 0;
+}
+
+void riscv_iommu_ir_get_resv_regions(struct device *dev, struct list_head *head)
+{
+	struct riscv_iommu_info *info = dev_iommu_priv_get(dev);
+	struct riscv_iommu_domain *domain = info->domain;
+	struct iommu_resv_region *reg;
+	phys_addr_t base, addr;
+	size_t nr_pages, i;
+
+	if (!domain || !domain->msiptp.msiptp)
+		return;
+
+	base = domain->msiptp.msi_addr_pattern << 12;
+
+	if (domain->group_index_bits) {
+		phys_addr_t group_mask = BIT(domain->group_index_bits) - 1;
+		phys_addr_t group_shift = domain->group_index_shift - 12;
+		phys_addr_t mask = domain->msiptp.msi_addr_mask & ~(group_mask << group_shift);
+
+		nr_pages = mask + 1;
+	} else {
+		nr_pages = domain->msiptp.msi_addr_mask + 1;
+	}
+
+	for (i = 0; i < BIT(domain->group_index_bits); i++) {
+		addr = base | (i << domain->group_index_shift);
+		reg = iommu_alloc_resv_region(addr, nr_pages * 4096,
+					      0, IOMMU_RESV_MSI, GFP_KERNEL);
+		if (reg)
+			list_add_tail(&reg->list, head);
+	}
+}
+
+void riscv_iommu_irq_domain_remove(struct riscv_iommu_domain *domain)
+{
+	struct fwnode_handle *fn;
+
+	if (!domain->irqdomain)
+		return;
+
+	iommu_free_pages(domain->msi_root, domain->msi_order);
+
+	fn = domain->irqdomain->fwnode;
+	irq_domain_remove(domain->irqdomain);
+	irq_domain_free_fwnode(fn);
+}
+
+void riscv_iommu_irq_domain_unlink(struct riscv_iommu_domain *domain,
+				   struct device *dev)
+{
+	if (!domain || !domain->irqdomain)
+		return;
+
+	dev_set_msi_domain(dev, domain->irqdomain->parent);
+}
diff --git a/drivers/iommu/riscv/iommu.c b/drivers/iommu/riscv/iommu.c
index 6e8ea3d22ff5..c4a47b21c58f 100644
--- a/drivers/iommu/riscv/iommu.c
+++ b/drivers/iommu/riscv/iommu.c
@@ -943,7 +943,8 @@ static void riscv_iommu_iotlb_inval(struct riscv_iommu_domain *domain,
 	rcu_read_unlock();
 }
 
-#define RISCV_IOMMU_FSC_BARE 0
+#define RISCV_IOMMU_FSC_BARE		0
+#define RISCV_IOMMU_IOHGATP_BARE	0
 
 /*
  * Update IODIR for the device.
@@ -1245,6 +1246,8 @@ static void riscv_iommu_free_paging_domain(struct iommu_domain *iommu_domain)
 
 	WARN_ON(!list_empty(&domain->bonds));
 
+	riscv_iommu_irq_domain_remove(domain);
+
 	if (domain->pscid > 0)
 		ida_free(&riscv_iommu_pscids, domain->pscid);
 	if (domain->gscid > 0)
@@ -1276,10 +1279,30 @@ static int riscv_iommu_attach_paging_domain(struct iommu_domain *iommu_domain,
 	struct riscv_iommu_device *iommu = dev_to_iommu(dev);
 	struct riscv_iommu_info *info = dev_iommu_priv_get(dev);
 	struct riscv_iommu_dc dc = {0};
+	int ret;
 
 	if (!riscv_iommu_pt_supported(iommu, domain->pgd_mode))
 		return -ENODEV;
 
+	if (riscv_iommu_bond_link(domain, dev))
+		return -ENOMEM;
+
+	if (iommu_domain->type == IOMMU_DOMAIN_UNMANAGED) {
+		domain->gscid = ida_alloc_range(&riscv_iommu_gscids, 1,
+						RISCV_IOMMU_MAX_GSCID, GFP_KERNEL);
+		if (domain->gscid < 0) {
+			riscv_iommu_bond_unlink(domain, dev);
+			return -ENOMEM;
+		}
+
+		ret = riscv_iommu_irq_domain_create(domain, dev);
+		if (ret) {
+			riscv_iommu_bond_unlink(domain, dev);
+			ida_free(&riscv_iommu_gscids, domain->gscid);
+			return ret;
+		}
+	}
+
 	if (domain->gscid) {
 		dc.iohgatp = FIELD_PREP(RISCV_IOMMU_DC_IOHGATP_MODE, domain->pgd_mode) |
 			     FIELD_PREP(RISCV_IOMMU_DC_IOHGATP_GSCID, domain->gscid) |
@@ -1292,10 +1315,9 @@ static int riscv_iommu_attach_paging_domain(struct iommu_domain *iommu_domain,
 	dc.ta = FIELD_PREP(RISCV_IOMMU_PC_TA_PSCID, domain->pscid) |
 			   RISCV_IOMMU_PC_TA_V;
 
-	if (riscv_iommu_bond_link(domain, dev))
-		return -ENOMEM;
-
 	riscv_iommu_iodir_update(iommu, dev, &dc);
+
+	riscv_iommu_irq_domain_unlink(info->domain, dev);
 	riscv_iommu_bond_unlink(info->domain, dev);
 	info->domain = domain;
 
@@ -1389,9 +1411,12 @@ static int riscv_iommu_attach_blocking_domain(struct iommu_domain *iommu_domain,
 	struct riscv_iommu_dc dc = {0};
 
 	dc.fsc = RISCV_IOMMU_FSC_BARE;
+	dc.iohgatp = RISCV_IOMMU_IOHGATP_BARE;
 
 	/* Make device context invalid, translation requests will fault w/ #258 */
 	riscv_iommu_iodir_update(iommu, dev, &dc);
+
+	riscv_iommu_irq_domain_unlink(info->domain, dev);
 	riscv_iommu_bond_unlink(info->domain, dev);
 	info->domain = NULL;
 
@@ -1413,15 +1438,24 @@ static int riscv_iommu_attach_identity_domain(struct iommu_domain *iommu_domain,
 	struct riscv_iommu_dc dc = {0};
 
 	dc.fsc = RISCV_IOMMU_FSC_BARE;
+	dc.iohgatp = RISCV_IOMMU_IOHGATP_BARE;
 	dc.ta = RISCV_IOMMU_PC_TA_V;
 
 	riscv_iommu_iodir_update(iommu, dev, &dc);
+
+	riscv_iommu_irq_domain_unlink(info->domain, dev);
 	riscv_iommu_bond_unlink(info->domain, dev);
 	info->domain = NULL;
 
 	return 0;
 }
 
+static void riscv_iommu_get_resv_regions(struct device *dev,
+					 struct list_head *head)
+{
+	riscv_iommu_ir_get_resv_regions(dev, head);
+}
+
 static struct iommu_domain riscv_iommu_identity_domain = {
 	.type = IOMMU_DOMAIN_IDENTITY,
 	.ops = &(const struct iommu_domain_ops) {
@@ -1516,6 +1550,7 @@ static const struct iommu_ops riscv_iommu_ops = {
 	.blocked_domain = &riscv_iommu_blocking_domain,
 	.release_domain = &riscv_iommu_blocking_domain,
 	.domain_alloc_paging = riscv_iommu_alloc_paging_domain,
+	.get_resv_regions = riscv_iommu_get_resv_regions,
 	.device_group = riscv_iommu_device_group,
 	.probe_device = riscv_iommu_probe_device,
 	.release_device	= riscv_iommu_release_device,
diff --git a/drivers/iommu/riscv/iommu.h b/drivers/iommu/riscv/iommu.h
index dd538b19fbb7..6ce71095781c 100644
--- a/drivers/iommu/riscv/iommu.h
+++ b/drivers/iommu/riscv/iommu.h
@@ -23,6 +23,12 @@
 #define RISCV_IOMMU_DDTP_TIMEOUT	10000000
 #define RISCV_IOMMU_IOTINVAL_TIMEOUT	90000000
 
+struct riscv_iommu_msiptp_state {
+	u64 msiptp;
+	u64 msi_addr_mask;
+	u64 msi_addr_pattern;
+};
+
 /* This struct contains protection domain specific IOMMU driver data. */
 struct riscv_iommu_domain {
 	struct iommu_domain domain;
@@ -34,6 +40,13 @@ struct riscv_iommu_domain {
 	int numa_node;
 	unsigned int pgd_mode;
 	unsigned long *pgd_root;
+	u32 group_index_bits;
+	u32 group_index_shift;
+	int msi_order;
+	struct riscv_iommu_msipte *msi_root;
+	spinlock_t msi_lock;
+	struct riscv_iommu_msiptp_state msiptp;
+	struct irq_domain *irqdomain;
 };
 
 /* Private IOMMU data for managed devices, dev_iommu_priv_* */
@@ -119,6 +132,14 @@ void riscv_iommu_cmd_send(struct riscv_iommu_device *iommu,
 void riscv_iommu_cmd_sync(struct riscv_iommu_device *iommu,
 			  unsigned int timeout_us);
 
+int riscv_iommu_irq_domain_create(struct riscv_iommu_domain *domain,
+				  struct device *dev);
+void riscv_iommu_irq_domain_remove(struct riscv_iommu_domain *domain);
+void riscv_iommu_irq_domain_unlink(struct riscv_iommu_domain *domain,
+				   struct device *dev);
+void riscv_iommu_ir_get_resv_regions(struct device *dev,
+				     struct list_head *head);
+
 #define riscv_iommu_readl(iommu, addr) \
 	readl_relaxed((iommu)->reg + (addr))
 
-- 
2.47.0

Re: [RFC PATCH 08/15] iommu/riscv: Add IRQ domain for interrupt remapping
Posted by Jason Gunthorpe 4 days, 7 hours ago
On Thu, Nov 14, 2024 at 05:18:53PM +0100, Andrew Jones wrote:
> @@ -1276,10 +1279,30 @@ static int riscv_iommu_attach_paging_domain(struct iommu_domain *iommu_domain,
>  	struct riscv_iommu_device *iommu = dev_to_iommu(dev);
>  	struct riscv_iommu_info *info = dev_iommu_priv_get(dev);
>  	struct riscv_iommu_dc dc = {0};
> +	int ret;
>  
>  	if (!riscv_iommu_pt_supported(iommu, domain->pgd_mode))
>  		return -ENODEV;
>  
> +	if (riscv_iommu_bond_link(domain, dev))
> +		return -ENOMEM;
> +
> +	if (iommu_domain->type == IOMMU_DOMAIN_UNMANAGED) {

Drivers should not be making tests like this.

> +		domain->gscid = ida_alloc_range(&riscv_iommu_gscids, 1,
> +						RISCV_IOMMU_MAX_GSCID, GFP_KERNEL);
> +		if (domain->gscid < 0) {
> +			riscv_iommu_bond_unlink(domain, dev);
> +			return -ENOMEM;
> +		}
> +
> +		ret = riscv_iommu_irq_domain_create(domain, dev);
> +		if (ret) {
> +			riscv_iommu_bond_unlink(domain, dev);
> +			ida_free(&riscv_iommu_gscids, domain->gscid);
> +			return ret;
> +		}
> +	}

What are you trying to do? Make something behave different for VFIO?
That isn't OK, we are trying to remove all the hacky VFIO special
cases in drivers.

What is the HW issue here? It is very very strange (and probably not
going to work right) that the irq domains change when domain
attachment changes.

The IRQ setup should really be fixed before any device drivers probe
onto the device.

Jason
Re: [RFC PATCH 08/15] iommu/riscv: Add IRQ domain for interrupt remapping
Posted by Andrew Jones 3 days, 18 hours ago
On Mon, Nov 18, 2024 at 02:43:36PM -0400, Jason Gunthorpe wrote:
> On Thu, Nov 14, 2024 at 05:18:53PM +0100, Andrew Jones wrote:
> > @@ -1276,10 +1279,30 @@ static int riscv_iommu_attach_paging_domain(struct iommu_domain *iommu_domain,
> >  	struct riscv_iommu_device *iommu = dev_to_iommu(dev);
> >  	struct riscv_iommu_info *info = dev_iommu_priv_get(dev);
> >  	struct riscv_iommu_dc dc = {0};
> > +	int ret;
> >  
> >  	if (!riscv_iommu_pt_supported(iommu, domain->pgd_mode))
> >  		return -ENODEV;
> >  
> > +	if (riscv_iommu_bond_link(domain, dev))
> > +		return -ENOMEM;
> > +
> > +	if (iommu_domain->type == IOMMU_DOMAIN_UNMANAGED) {
> 
> Drivers should not be making tests like this.
> 
> > +		domain->gscid = ida_alloc_range(&riscv_iommu_gscids, 1,
> > +						RISCV_IOMMU_MAX_GSCID, GFP_KERNEL);
> > +		if (domain->gscid < 0) {
> > +			riscv_iommu_bond_unlink(domain, dev);
> > +			return -ENOMEM;
> > +		}
> > +
> > +		ret = riscv_iommu_irq_domain_create(domain, dev);
> > +		if (ret) {
> > +			riscv_iommu_bond_unlink(domain, dev);
> > +			ida_free(&riscv_iommu_gscids, domain->gscid);
> > +			return ret;
> > +		}
> > +	}
> 
> What are you trying to do? Make something behave different for VFIO?
> That isn't OK, we are trying to remove all the hacky VFIO special
> cases in drivers.
> 
> What is the HW issue here? It is very very strange (and probably not
> going to work right) that the irq domains change when domain
> attachment changes.
> 
> The IRQ setup should really be fixed before any device drivers probe
> onto the device.

I can't disagree with the statement that this looks hacky, but considering
a VFIO domain needs to use the g-stage for its single-stage translation
and a paging domain for the host would use s-stage, then it seems we need
to identify the VFIO domains for their special treatment. Is there an
example of converting VFIO special casing in other drivers to something
cleaner that you can point me at?

The IRQ domain will only be useful for device assignment, as that's when
an MSI translation will be needed. I can't think of any problems that
could arise from only creating the IRQ domain when probing assigned
devices, but I could certainly be missing something. Do you have some
potential problems in mind?

Thanks,
drew
Re: [RFC PATCH 08/15] iommu/riscv: Add IRQ domain for interrupt remapping
Posted by Jason Gunthorpe 3 days, 12 hours ago
On Tue, Nov 19, 2024 at 08:49:37AM +0100, Andrew Jones wrote:
> On Mon, Nov 18, 2024 at 02:43:36PM -0400, Jason Gunthorpe wrote:
> > On Thu, Nov 14, 2024 at 05:18:53PM +0100, Andrew Jones wrote:
> > > @@ -1276,10 +1279,30 @@ static int riscv_iommu_attach_paging_domain(struct iommu_domain *iommu_domain,
> > >  	struct riscv_iommu_device *iommu = dev_to_iommu(dev);
> > >  	struct riscv_iommu_info *info = dev_iommu_priv_get(dev);
> > >  	struct riscv_iommu_dc dc = {0};
> > > +	int ret;
> > >  
> > >  	if (!riscv_iommu_pt_supported(iommu, domain->pgd_mode))
> > >  		return -ENODEV;
> > >  
> > > +	if (riscv_iommu_bond_link(domain, dev))
> > > +		return -ENOMEM;
> > > +
> > > +	if (iommu_domain->type == IOMMU_DOMAIN_UNMANAGED) {
> > 
> > Drivers should not be making tests like this.
> > 
> > > +		domain->gscid = ida_alloc_range(&riscv_iommu_gscids, 1,
> > > +						RISCV_IOMMU_MAX_GSCID, GFP_KERNEL);
> > > +		if (domain->gscid < 0) {
> > > +			riscv_iommu_bond_unlink(domain, dev);
> > > +			return -ENOMEM;
> > > +		}
> > > +
> > > +		ret = riscv_iommu_irq_domain_create(domain, dev);
> > > +		if (ret) {
> > > +			riscv_iommu_bond_unlink(domain, dev);
> > > +			ida_free(&riscv_iommu_gscids, domain->gscid);
> > > +			return ret;
> > > +		}
> > > +	}
> > 
> > What are you trying to do? Make something behave different for VFIO?
> > That isn't OK, we are trying to remove all the hacky VFIO special
> > cases in drivers.
> > 
> > What is the HW issue here? It is very very strange (and probably not
> > going to work right) that the irq domains change when domain
> > attachment changes.
> > 
> > The IRQ setup should really be fixed before any device drivers probe
> > onto the device.
> 
> I can't disagree with the statement that this looks hacky, but considering
> a VFIO domain needs to use the g-stage for its single-stage translation
> and a paging domain for the host would use s-stage, then it seems we need
> to identify the VFIO domains for their special treatment.

This is the wrong thinking entirely. There is no such thing as a "VFIO
domain".

Default VFIO created domains should act exactly the same as a DMA API
domain.

If you want your system to have irq remapping, then it should be on by
default and DMA API gets remapping too. There would need to be a very
strong reason not to do that in order to make something special for
riscv. If so you'd need to add some kind of flag to select it.

Until you reach nested translation there is no "need" for VFIO to use
any particular stage. The design is that default VFIO uses the same
stage as the DMA API because it is doing the same basic default
translation function.

Nested translation has a control to select the stage, and you can
then force the g-stage for VFIO users at that point.

Regardless, you must not use UNMANAGED as some indication of VFIO,
that is not what it means, that is not what it is for.

> Is there an example of converting VFIO special casing in other
> drivers to something cleaner that you can point me at?

Nobody has had an issue where they want interrupt remapping on/off
depending on VFIO. I think that is inherently wrong.

> The IRQ domain will only be useful for device assignment, as that's when
> an MSI translation will be needed. I can't think of any problems that
> could arise from only creating the IRQ domain when probing assigned
> devices, but I could certainly be missing something. Do you have some
> potential problems in mind?

I'm not an expert in the interrupt subsystem, but my understanding was
we expect the interrupt domains/etc to be static once a device driver
is probed. Changing things during iommu domain attach is after drivers
are probed. I don't really expect it to work correctly in all corner
cases.

VFIO is allowed to change the translation as it operates and we expect
that interrupts are not disturbed.

Jason
Re: [RFC PATCH 08/15] iommu/riscv: Add IRQ domain for interrupt remapping
Posted by Andrew Jones 3 days, 11 hours ago
On November 19, 2024 3:00:47 PM GMT+01:00, Jason Gunthorpe <jgg@ziepe.ca> wrote:
>On Tue, Nov 19, 2024 at 08:49:37AM +0100, Andrew Jones wrote:
>> On Mon, Nov 18, 2024 at 02:43:36PM -0400, Jason Gunthorpe wrote:
>> > On Thu, Nov 14, 2024 at 05:18:53PM +0100, Andrew Jones wrote:
>> > > @@ -1276,10 +1279,30 @@ static int riscv_iommu_attach_paging_domain(struct iommu_domain *iommu_domain,
>> > >  	struct riscv_iommu_device *iommu = dev_to_iommu(dev);
>> > >  	struct riscv_iommu_info *info = dev_iommu_priv_get(dev);
>> > >  	struct riscv_iommu_dc dc = {0};
>> > > +	int ret;
>> > >  
>> > >  	if (!riscv_iommu_pt_supported(iommu, domain->pgd_mode))
>> > >  		return -ENODEV;
>> > >  
>> > > +	if (riscv_iommu_bond_link(domain, dev))
>> > > +		return -ENOMEM;
>> > > +
>> > > +	if (iommu_domain->type == IOMMU_DOMAIN_UNMANAGED) {
>> > 
>> > Drivers should not be making tests like this.
>> > 
>> > > +		domain->gscid = ida_alloc_range(&riscv_iommu_gscids, 1,
>> > > +						RISCV_IOMMU_MAX_GSCID, GFP_KERNEL);
>> > > +		if (domain->gscid < 0) {
>> > > +			riscv_iommu_bond_unlink(domain, dev);
>> > > +			return -ENOMEM;
>> > > +		}
>> > > +
>> > > +		ret = riscv_iommu_irq_domain_create(domain, dev);
>> > > +		if (ret) {
>> > > +			riscv_iommu_bond_unlink(domain, dev);
>> > > +			ida_free(&riscv_iommu_gscids, domain->gscid);
>> > > +			return ret;
>> > > +		}
>> > > +	}
>> > 
>> > What are you trying to do? Make something behave different for VFIO?
>> > That isn't OK, we are trying to remove all the hacky VFIO special
>> > cases in drivers.
>> > 
>> > What is the HW issue here? It is very very strange (and probably not
>> > going to work right) that the irq domains change when domain
>> > attachment changes.
>> > 
>> > The IRQ setup should really be fixed before any device drivers probe
>> > onto the device.
>> 
>> I can't disagree with the statement that this looks hacky, but considering
>> a VFIO domain needs to use the g-stage for its single-stage translation
>> and a paging domain for the host would use s-stage, then it seems we need
>> to identify the VFIO domains for their special treatment.
>
>This is the wrong thinking entirely. There is no such thing as a "VFIO
>domain".
>
>Default VFIO created domains should act excatly the same as a DMA API
>domain.
>
>If you want your system to have irq remapping, then it should be on by
>default and DMA API gets remapping too. There would need to be a very
>strong reason not to do that in order to make something special for
>riscv. If so you'd need to add some kind of flag to select it.
>
>Until you reach nested translation there is no "need" for VFIO to use
>any particular stage. The design is that default VFIO uses the same
>stage as the DMA API because it is doing the same basic default
>translation function.

The RISC-V IOMMU needs to use g-stage for device assignment, if we also want to enable irqbypass, because the IOMMU is specified to only look at the MSI table when g-stage is in use. This is actually another reason the irq domain only makes sense for device assignment.

>
>Nested translation has a control to select the stage, and you can
>then force the g-stage for VFIO users at that point.

We could force riscv device assignment to always be nested, and when not providing an iommu to the guest, it will still be single-stage — just g-stage rather than s-stage — but I don't think that's currently possible with VFIO, is it?

>
>Regardless, you must not use UNMANAGED as some indication of VFIO,
>that is not what it means, that is not what it is for.
>
>> Is there an example of converting VFIO special casing in other
>> drivers to something cleaner that you can point me at?
>
>Nobody has had an issue where they want interrupt remapping on/off
>depending on VFIO. I think that is inherently wrong.
>
>> The IRQ domain will only be useful for device assignment, as that's when
>> an MSI translation will be needed. I can't think of any problems that
>> could arise from only creating the IRQ domain when probing assigned
>> devices, but I could certainly be missing something. Do you have some
>> potential problems in mind?
>
>I'm not an expert in the interrupt subsystem, but my understanding was
>we expect the interrupt domains/etc to be static once a device driver
>is probed. Changing things during iommu domain attach is after drivers
>are probed. I don't really expect it to work correctly in all corner
>cases.

With VFIO the iommu domain attach comes after an unbind/bind, so the new driver is probed. I think that's a safe time. However, if there could be cases where the attach does not follow an unbind/bind, then I agree that wouldn't be safe. I'll consider always creating an IRQ domain, even if it won't provide any additional functionality unless the device is assigned.

>
>VFIO is allowed to change the translation as it operates and we expect
>that interrupts are not disturbed.
>

The IRQ domain stays the same during operation, the only changes are the mappings from what the guest believes are its s-mode interrupt files to the hypervisor selected guest interrupt files, and these changes are made possible by the IRQ domain's vcpu-affinity support.

Thanks,
drew
Re: [RFC PATCH 08/15] iommu/riscv: Add IRQ domain for interrupt remapping
Posted by Jason Gunthorpe 3 days, 10 hours ago
On Tue, Nov 19, 2024 at 04:03:05PM +0100, Andrew Jones wrote:

> >This is the wrong thinking entirely. There is no such thing as a "VFIO
> >domain".
> >
> >Default VFIO created domains should act excatly the same as a DMA API
> >domain.
> >
> >If you want your system to have irq remapping, then it should be on by
> >default and DMA API gets remapping too. There would need to be a very
> >strong reason not to do that in order to make something special for
> >riscv. If so you'd need to add some kind of flag to select it.
> >
> >Until you reach nested translation there is no "need" for VFIO to use
> >any particular stage. The design is that default VFIO uses the same
> >stage as the DMA API because it is doing the same basic default
> >translation function.
> 
> The RISC-V IOMMU needs to use g-stage for device assignment, if we
> also want to enable irqbypass, because the IOMMU is specified to
> only look at the MSI table when g-stage is in use. This is actually
> another reason the irq domain only makes sense for device
> assignment.

Most HW has interrupt remapping that can be enabled, and typically
Linux just turns it always on.

Is there a reason the DMA API shouldn't use this translation mode too?
That seems to be the main issue here, you are trying to avoid
interrupt remapping for DMA API and use it only for VFIO, and haven't
explained why we need such complexity. Just use it always?

> >Nested translation has a control to select the stage, and you can
> >then force the g-stage for VFIO users at that point.
> 
> We could force riscv device assignment to always be nested, and when
> not providing an iommu to the guest, it will still be single-stage,
> but g-stage, but I don't think that's currently possible with VFIO,
> is it?

That isn't what I mean, I mean you should not be forcing the kind of
domain being created until you get to special cases like nested.

Default VFIO should work the same as the DMA API.

> >> The IRQ domain will only be useful for device assignment, as that's when
> >> an MSI translation will be needed. I can't think of any problems that
> >> could arise from only creating the IRQ domain when probing assigned
> >> devices, but I could certainly be missing something. Do you have some
> >> potential problems in mind?
> >
> >I'm not an expert in the interrupt subsystem, but my understanding was
> >we expect the interrupt domains/etc to be static once a device driver
> >is probed. Changing things during iommu domain attach is after drivers
> >are probed. I don't really expect it to work correctly in all corner
> >cases.
> 
> With VFIO the iommu domain attach comes after an unbind/bind, so the
> new driver is probed.

That's the opposite of what I mean. The irq domain should be setup
*before* VFIO binds to the driver.

Changing the active irq_domain while VFIO is already probed to the
device is highly unlikely to work right in all cases.

> I think that's a safe time. However, if there
> could be cases where the attach does not follow an unbind/bind, then
> I agree that wouldn't be safe.

These cases exist.

> I'll consider always creating an IRQ
> domain, even if it won't provide any additional functionality unless
> the device is assigned.

That isn't ideal, the translation under the IRQs shouldn't really be
changing as the translation under the IOMMU changes.

Further, VFIO assumes iommu_group_has_isolated_msi(), ie
IRQ_DOMAIN_FLAG_ISOLATED_MSI, is fixed while it is bound. Will that
be true if the iommu is flapping all about? What will you do when VFIO
has it attached to a blocked domain?

It just doesn't make sense to change something so fundamental as the
interrupt path on an iommu domain attachment. :\

> >VFIO is allowed to change the translation as it operates and we expect
> >that interrupts are not disturbed.
> 
> The IRQ domain stays the same during operation, the only changes are
> the mappings from what the guest believes are its s-mode interrupt
> files to the hypervisor selected guest interrupt files, and these
> changes are made possible by the IRQ domain's vcpu-affinity support.

That is only the case when talking about kvm, this all still has to
work fully for non-kvm VFIO uses cases too.

Jason
Re: [RFC PATCH 08/15] iommu/riscv: Add IRQ domain for interrupt remapping
Posted by Andrew Jones 11 hours ago
On Tue, Nov 19, 2024 at 11:36:22AM -0400, Jason Gunthorpe wrote:
> On Tue, Nov 19, 2024 at 04:03:05PM +0100, Andrew Jones wrote:
> 
> > >This is the wrong thinking entirely. There is no such thing as a "VFIO
> > >domain".
> > >
> > >Default VFIO created domains should act excatly the same as a DMA API
> > >domain.
> > >
> > >If you want your system to have irq remapping, then it should be on by
> > >default and DMA API gets remapping too. There would need to be a very
> > >strong reason not to do that in order to make something special for
> > >riscv. If so you'd need to add some kind of flag to select it.
> > >
> > >Until you reach nested translation there is no "need" for VFIO to use
> > >any particular stage. The design is that default VFIO uses the same
> > >stage as the DMA API because it is doing the same basic default
> > >translation function.
> > 
> > The RISC-V IOMMU needs to use g-stage for device assignment, if we
> > also want to enable irqbypass, because the IOMMU is specified to
> > only look at the MSI table when g-stage is in use. This is actually
> > another reason the irq domain only makes sense for device
> > assignment.
> 
> Most HW has enablable interrupt remapping and typically Linux just
> turns it always on.
> 
> Is there a reason the DMA API shouldn't use this translation mode too?
> That seems to be the main issue here, you are trying to avoid
> interrupt remapping for DMA API and use it only for VFIO, and haven't
> explained why we need such complexity. Just use it always?

The reason is that the RISC-V IOMMU only checks the MSI table, i.e.
enables its support for MSI remapping, when the g-stage (second-stage)
page table is in use. However, the expected virtual memory scheme for an
OS to use for DMA would be to have s-stage (first-stage) in use and the
g-stage set to 'Bare' (not in use). OIOW, it doesn't appear the spec
authors expected MSI remapping to be enabled for the host DMA use case.
That does make some sense, since it's actually not necessary. For the
host DMA use case, providing mappings for each s-mode interrupt file
which the device is allowed to write to in the s-stage page table
sufficiently enables MSIs to be delivered.

> 
> > >Nested translation has a control to select the stage, and you can
> > >then force the g-stage for VFIO users at that point.
> > 
> > We could force riscv device assignment to always be nested, and when
> > not providing an iommu to the guest, it will still be single-stage,
> > but g-stage, but I don't think that's currently possible with VFIO,
> > is it?
> 
> That isn't what I mean, I mean you should not be forcing the kind of
> domain being created until you get to special cases like nested.
> 
> Default VFIO should work the same as the DMA API.

If "default VFIO" means VFIO without irqbypass, then it would work the
same as the DMA API, assuming all mappings for all necessary s-mode
interrupt files are created (something the DMA API needs as well).
However, VFIO would also need 'vfio_iommu_type1.allow_unsafe_interrupts=1'
to be set for this no-irqbypass configuration.

> 
> > >> The IRQ domain will only be useful for device assignment, as that's when
> > >> an MSI translation will be needed. I can't think of any problems that
> > >> could arise from only creating the IRQ domain when probing assigned
> > >> devices, but I could certainly be missing something. Do you have some
> > >> potential problems in mind?
> > >
> > >I'm not an expert in the interrupt subsystem, but my understanding was
> > >we expect the interrupt domains/etc to be static once a device driver
> > >is probed. Changing things during iommu domain attach is after drivers
> > >are probed. I don't really expect it to work correctly in all corner
> > >cases.
> > 
> > With VFIO the iommu domain attach comes after an unbind/bind, so the
> > new driver is probed.
> 
> That's the opposite of what I mean. The irq domain should be setup
> *before* VFIO binds to the driver.
> 
> Changing the active irq_domain while VFIO is already probed to the
> device is highly unlikely to work right in all cases.
> 
> > I think that's a safe time. However, if there
> > could be cases where the attach does not follow an unbind/bind, then
> > I agree that wouldn't be safe.
> 
> These cases exist.
> 
> > I'll consider always creating an IRQ
> > domain, even if it won't provide any additional functionality unless
> > the device is assigned.
> 
> That isn't ideal, the translation under the IRQs shouldn't really be
> changing as the translation under the IOMMU changes.

Unless the device is assigned to a guest, then the IRQ domain wouldn't
do anything at all (it'd just sit between the device and the device's
old MSI parent domain), but it also wouldn't come and go, risking issues
with anything sensitive to changes in the IRQ domain hierarchy.

> 
> Further, VFIO assumes iommu_group_has_isolated_msi(), ie
> IRQ_DOMAIN_FLAG_ISOLATED_MSI, is fixed while it is bound. Will that
> be true if the iommu is flapping all about? What will you do when VFIO
> has it attached to a blocked domain?
> 
> It just doesn't make sense to change something so fundamental as the
> interrupt path on an iommu domain attachment. :\

Yes, it does appear I should be doing this at iommu device probe time
instead. It won't provide any additional functionality to use cases which
aren't assigning devices to guests, but it also won't hurt, and it should
avoid the risks you point out.

> 
> > >VFIO is allowed to change the translation as it operates and we expect
> > >that interrupts are not disturbed.
> > 
> > The IRQ domain stays the same during operation, the only changes are
> > the mappings from what the guest believes are its s-mode interrupt
> > files to the hypervisor selected guest interrupt files, and these
> > changes are made possible by the IRQ domain's vcpu-affinity support.
> 
> That is only the case when talking about kvm, this all still has to
> work fully for non-kvm VFIO use cases too.
> 

I'll rework the irq domain creation to be at iommu device probe time for
the next version.

Thanks,
drew
Re: [RFC PATCH 08/15] iommu/riscv: Add IRQ domain for interrupt remapping
Posted by Jason Gunthorpe 10 hours ago
On Fri, Nov 22, 2024 at 04:11:36PM +0100, Andrew Jones wrote:

> The reason is that the RISC-V IOMMU only checks the MSI table, i.e.
> enables its support for MSI remapping, when the g-stage (second-stage)
> page table is in use. However, the expected virtual memory scheme for an
> OS to use for DMA would be to have s-stage (first-stage) in use and the
> g-stage set to 'Bare' (not in use). 

That isn't really a technical reason.

> IOW, it doesn't appear the spec authors expected MSI remapping to
> be enabled for the host DMA use case.  That does make some sense,
> since it's actually not necessary. For the host DMA use case,
> providing mappings for each s-mode interrupt file which the device
> is allowed to write to in the s-stage page table sufficiently
> enables MSIs to be delivered.

Well, that seems to be the main problem here. You are grappling with a
spec design that doesn't match the SW expectations. Since it has
deviated from what everyone else has done you now have extra
challenges to resolve in some way.

Just always using interrupt remapping if the HW is capable of
interrupt remapping and ignoring the spec "expectation" is a nice and
simple way to make things work with existing Linux.

> If "default VFIO" means VFIO without irqbypass, then it would work the
> same as the DMA API, assuming all mappings for all necessary s-mode
> interrupt files are created (something the DMA API needs as well).
> However, VFIO would also need 'vfio_iommu_type1.allow_unsafe_interrupts=1'
> to be set for this no-irqbypass configuration.

Which isn't what anyone wants, you need to make the DMA API domain be
fully functional so that VFIO works.

> > That isn't ideal, the translation under the IRQs shouldn't really be
> > changing as the translation under the IOMMU changes.
> 
> Unless the device is assigned to a guest, then the IRQ domain wouldn't
> do anything at all (it'd just sit between the device and the device's
> old MSI parent domain), but it also wouldn't come and go, risking issues
> with anything sensitive to changes in the IRQ domain hierarchy.

VFIO isn't restricted to such a simple use model. You have to support
all the generality, which includes fully supporting changing the iommu
translation on the fly.

> > Further, VFIO assumes iommu_group_has_isolated_msi(), ie
> > IRQ_DOMAIN_FLAG_ISOLATED_MSI, is fixed while it is bound. Will that
> > be true if the iommu is flapping all about? What will you do when VFIO
> > has it attached to a blocked domain?
> > 
> > It just doesn't make sense to change something so fundamental as the
> > interrupt path on an iommu domain attachment. :\
> 
> Yes, it does appear I should be doing this at iommu device probe time
> instead. It won't provide any additional functionality to use cases which
> aren't assigning devices to guests, but it also won't hurt, and it should
> avoid the risks you point out.

Even if you statically create the domain you can't change the value of
IRQ_DOMAIN_FLAG_ISOLATED_MSI depending on what is currently attached
to the IOMMU.

What you are trying to do is not supported by the software stack right
now. You need to make much bigger, more intrusive changes, if you
really want to make interrupt remapping dynamic.

Jason
Re: [RFC PATCH 08/15] iommu/riscv: Add IRQ domain for interrupt remapping
Posted by Andrew Jones 9 hours ago
On Fri, Nov 22, 2024 at 11:33:40AM -0400, Jason Gunthorpe wrote:
> On Fri, Nov 22, 2024 at 04:11:36PM +0100, Andrew Jones wrote:
> 
> > The reason is that the RISC-V IOMMU only checks the MSI table, i.e.
> > enables its support for MSI remapping, when the g-stage (second-stage)
> > page table is in use. However, the expected virtual memory scheme for an
> > OS to use for DMA would be to have s-stage (first-stage) in use and the
> > g-stage set to 'Bare' (not in use). 
> 
> That isn't really a technical reason.
> 
> > IOW, it doesn't appear the spec authors expected MSI remapping to
> > be enabled for the host DMA use case.  That does make some sense,
> > since it's actually not necessary. For the host DMA use case,
> > providing mappings for each s-mode interrupt file which the device
> > is allowed to write to in the s-stage page table sufficiently
> > enables MSIs to be delivered.
> 
> Well, that seems to be the main problem here. You are grappling with a
> spec design that doesn't match the SW expectations. Since it has
> deviated from what everyone else has done you now have extra
> challenges to resolve in some way.
> 
> Just always using interrupt remapping if the HW is capable of
> interrupt remapping and ignoring the spec "expectation" is a nice and
> simple way to make things work with existing Linux.
> 
> > If "default VFIO" means VFIO without irqbypass, then it would work the
> > same as the DMA API, assuming all mappings for all necessary s-mode
> > interrupt files are created (something the DMA API needs as well).
> > However, VFIO would also need 'vfio_iommu_type1.allow_unsafe_interrupts=1'
> > to be set for this no-irqbypass configuration.
> 
> Which isn't what anyone wants, you need to make the DMA API domain be
> fully functional so that VFIO works.
> 
> > > That isn't ideal, the translation under the IRQs shouldn't really be
> > > changing as the translation under the IOMMU changes.
> > 
> > Unless the device is assigned to a guest, then the IRQ domain wouldn't
> > do anything at all (it'd just sit between the device and the device's
> > old MSI parent domain), but it also wouldn't come and go, risking issues
> > with anything sensitive to changes in the IRQ domain hierarchy.
> 
> VFIO isn't restricted to such a simple use model. You have to support
> all the generality, which includes fully supporting changing the iommu
> translation on the fly.
> 
> > > Further, VFIO assumes iommu_group_has_isolated_msi(), ie
> > > IRQ_DOMAIN_FLAG_ISOLATED_MSI, is fixed while it is bound. Will that
> > > be true if the iommu is flapping all about? What will you do when VFIO
> > > has it attached to a blocked domain?
> > > 
> > > It just doesn't make sense to change something so fundamental as the
> > > interrupt path on an iommu domain attachment. :\
> > 
> > Yes, it does appear I should be doing this at iommu device probe time
> > instead. It won't provide any additional functionality to use cases which
> > aren't assigning devices to guests, but it also won't hurt, and it should
> > avoid the risks you point out.
> 
> Even if you statically create the domain you can't change the value of
> IRQ_DOMAIN_FLAG_ISOLATED_MSI depending on what is currently attached
> to the IOMMU.
> 
> What you are trying to do is not supported by the software stack right
> now. You need to make much bigger, more intrusive changes, if you
> really want to make interrupt remapping dynamic.
>

Let the fun begin. I'll look into this more. It also looks like I need to
collect some test cases to ensure I can support all use cases with
whatever I propose next. Pointers for those would be welcome.

Thanks,
drew