[PATCH v7 2/2] s390/pci: Migrate s390 IRQ logic to IRQ domain API

Tobias Schumacher posted 2 patches 2 months, 1 week ago
There is a newer version of this series
[PATCH v7 2/2] s390/pci: Migrate s390 IRQ logic to IRQ domain API
Posted by Tobias Schumacher 2 months, 1 week ago
s390 is one of the last architectures using the legacy API for setup and
teardown of PCI MSI IRQs. Migrate the s390 IRQ allocation and teardown
to the MSI parent domain API. For details, see:

https://lore.kernel.org/lkml/20221111120501.026511281@linutronix.de

In detail, create an MSI parent domain for each PCI domain. When a PCI
device sets up MSI or MSI-X IRQs, the library creates a per-device IRQ
domain for this device, which is used by the device for allocating and
freeing IRQs.

The per-device domain delegates this allocation and freeing to the
parent-domain. In the end, the corresponding callbacks of the parent
domain are responsible for allocating and freeing the IRQs.

The allocation is split into two parts:
- zpci_msi_prepare() is called once for each device and allocates the
  required resources. On s390, each PCI function has its own airq
  vector and a summary bit, which must be configured once per function.
  This is done in prepare().
- zpci_msi_alloc() can be called multiple times for allocating one or
  more MSI/MSI-X IRQs. This creates a mapping between the virtual IRQ
  number in the kernel and the hardware IRQ number.

Freeing is split into two counterparts:
- zpci_msi_free() reverts the effects of zpci_msi_alloc() and
- zpci_msi_teardown() reverts the effects of zpci_msi_prepare(). This is
  called once when all IRQs are freed before a device is removed.

Since the parent domain in the end allocates the IRQs, the hwirq
encoding must be unambiguous for all IRQs of all devices. This is
achieved by encoding the hwirq using the devfn and the MSI index.

Signed-off-by: Tobias Schumacher <ts@linux.ibm.com>
---
 arch/s390/Kconfig           |   1 +
 arch/s390/include/asm/pci.h |   5 +
 arch/s390/pci/pci.c         |   6 +
 arch/s390/pci/pci_bus.c     |  18 ++-
 arch/s390/pci/pci_irq.c     | 310 ++++++++++++++++++++++++++++----------------
 5 files changed, 224 insertions(+), 116 deletions(-)

diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
index 778ce20d34046cad84dd4ef57cab5a662e5796d9..fc82dd4f893d78f12837f36ab82a05f2c52e0501 100644
--- a/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
@@ -251,6 +251,7 @@ config S390
 	select HOTPLUG_SMT
 	select IOMMU_HELPER		if PCI
 	select IOMMU_SUPPORT		if PCI
+	select IRQ_MSI_LIB		if PCI
 	select KASAN_VMALLOC if KASAN
 	select LOCK_MM_AND_FIND_VMA
 	select MMU_GATHER_MERGE_VMAS
diff --git a/arch/s390/include/asm/pci.h b/arch/s390/include/asm/pci.h
index a32f465ecf73a5cc3408a312d94ec888d62848cc..0aa6915346a50077f22868cef39638919979d478 100644
--- a/arch/s390/include/asm/pci.h
+++ b/arch/s390/include/asm/pci.h
@@ -5,6 +5,7 @@
 #include <linux/pci.h>
 #include <linux/mutex.h>
 #include <linux/iommu.h>
+#include <linux/irqdomain.h>
 #include <linux/pci_hotplug.h>
 #include <asm/pci_clp.h>
 #include <asm/pci_debug.h>
@@ -109,6 +110,7 @@ struct zpci_bus {
 	struct list_head	resources;
 	struct list_head	bus_next;
 	struct resource		bus_resource;
+	struct irq_domain       *msi_parent_domain;
 	int			topo;		/* TID if topo_is_tid, PCHID otherwise */
 	int			domain_nr;
 	u8			multifunction	: 1;
@@ -310,6 +312,9 @@ int zpci_dma_exit_device(struct zpci_dev *zdev);
 /* IRQ */
 int __init zpci_irq_init(void);
 void __init zpci_irq_exit(void);
+int zpci_set_irq(struct zpci_dev *zdev);
+int zpci_create_parent_msi_domain(struct zpci_bus *zbus);
+void zpci_remove_parent_msi_domain(struct zpci_bus *zbus);
 
 /* FMB */
 int zpci_fmb_enable_device(struct zpci_dev *);
diff --git a/arch/s390/pci/pci.c b/arch/s390/pci/pci.c
index c82c577db2bcd2143476cb8189fd89b9a4dc9836..2e47bf6a3289615307c71cae7b04ef77d964144a 100644
--- a/arch/s390/pci/pci.c
+++ b/arch/s390/pci/pci.c
@@ -709,6 +709,12 @@ int zpci_reenable_device(struct zpci_dev *zdev)
 	if (rc)
 		return rc;
 
+	if (zdev->msi_nr_irqs > 0) {
+		rc = zpci_set_irq(zdev);
+		if (rc)
+			return rc;
+	}
+
 	rc = zpci_iommu_register_ioat(zdev, &status);
 	if (rc)
 		zpci_disable_device(zdev);
diff --git a/arch/s390/pci/pci_bus.c b/arch/s390/pci/pci_bus.c
index be8c697fea0cc755cfdb4fb0a9e3b95183bec0dc..2d7b389f36e8682c3f0a10befe87698751596584 100644
--- a/arch/s390/pci/pci_bus.c
+++ b/arch/s390/pci/pci_bus.c
@@ -15,6 +15,7 @@
 #include <linux/err.h>
 #include <linux/delay.h>
 #include <linux/seq_file.h>
+#include <linux/irqdomain.h>
 #include <linux/jump_label.h>
 #include <linux/pci.h>
 #include <linux/printk.h>
@@ -199,19 +200,27 @@ static int zpci_bus_create_pci_bus(struct zpci_bus *zbus, struct zpci_dev *fr, s
 	zbus->multifunction = zpci_bus_is_multifunction_root(fr);
 	zbus->max_bus_speed = fr->max_bus_speed;
 
+	if (zpci_create_parent_msi_domain(zbus))
+		goto out_free_domain;
+
 	/*
 	 * Note that the zbus->resources are taken over and zbus->resources
 	 * is empty after a successful call
 	 */
 	bus = pci_create_root_bus(NULL, ZPCI_BUS_NR, ops, zbus, &zbus->resources);
-	if (!bus) {
-		zpci_free_domain(zbus->domain_nr);
-		return -EFAULT;
-	}
+	if (!bus)
+		goto out_remove_msi_domain;
 
 	zbus->bus = bus;
+	dev_set_msi_domain(&zbus->bus->dev, zbus->msi_parent_domain);
 
 	return 0;
+
+out_remove_msi_domain:
+	zpci_remove_parent_msi_domain(zbus);
+out_free_domain:
+	zpci_free_domain(zbus->domain_nr);
+	return -ENOMEM;
 }
 
 static void zpci_bus_release(struct kref *kref)
@@ -232,6 +241,7 @@ static void zpci_bus_release(struct kref *kref)
 	mutex_lock(&zbus_list_lock);
 	list_del(&zbus->bus_next);
 	mutex_unlock(&zbus_list_lock);
+	zpci_remove_parent_msi_domain(zbus);
 	kfree(zbus);
 }
 
diff --git a/arch/s390/pci/pci_irq.c b/arch/s390/pci/pci_irq.c
index e73be96ce5fe6473fc193d65b8f0ff635d6a98ba..2ac0fab605a83a2f06be6a0a68718e528125ced6 100644
--- a/arch/s390/pci/pci_irq.c
+++ b/arch/s390/pci/pci_irq.c
@@ -7,6 +7,7 @@
 #include <linux/kernel_stat.h>
 #include <linux/pci.h>
 #include <linux/msi.h>
+#include <linux/irqchip/irq-msi-lib.h>
 #include <linux/smp.h>
 
 #include <asm/isc.h>
@@ -98,7 +99,7 @@ static int zpci_clear_directed_irq(struct zpci_dev *zdev)
 }
 
 /* Register adapter interruptions */
-static int zpci_set_irq(struct zpci_dev *zdev)
+int zpci_set_irq(struct zpci_dev *zdev)
 {
 	int rc;
 
@@ -126,27 +127,53 @@ static int zpci_clear_irq(struct zpci_dev *zdev)
 static int zpci_set_irq_affinity(struct irq_data *data, const struct cpumask *dest,
 				 bool force)
 {
-	struct msi_desc *entry = irq_data_get_msi_desc(data);
-	struct msi_msg msg = entry->msg;
-	int cpu_addr = smp_cpu_get_cpu_address(cpumask_first(dest));
+	irq_data_update_affinity(data, dest);
+	return IRQ_SET_MASK_OK;
+}
+
+/*
+ * Encode the hwirq number for the parent domain. The encoding must be unique
+ * for each IRQ of each device in the parent domain, so it uses the devfn to
+ * identify the device and the msi_index to identify the IRQ within that device.
+ */
+static inline u32 zpci_encode_hwirq(u8 devfn, u16 msi_index)
+{
+	return (devfn << 16) | msi_index;
+}
+
+static inline u16 zpci_decode_hwirq_msi_index(u32 hwirq)
+{
+	return hwirq & 0xffff;
+}
 
-	msg.address_lo &= 0xff0000ff;
-	msg.address_lo |= (cpu_addr << 8);
-	pci_write_msi_msg(data->irq, &msg);
+static void zpci_compose_msi_msg(struct irq_data *data, struct msi_msg *msg)
+{
+	struct msi_desc *desc = irq_data_get_msi_desc(data);
+	struct zpci_dev *zdev = to_zpci_dev(desc->dev);
 
-	return IRQ_SET_MASK_OK;
+	if (irq_delivery == DIRECTED) {
+		int cpu = cpumask_first(irq_data_get_affinity_mask(data));
+
+		msg->address_lo = zdev->msi_addr & 0xff0000ff;
+		msg->address_lo |= (smp_cpu_get_cpu_address(cpu) << 8);
+	} else {
+		msg->address_lo = zdev->msi_addr & 0xffffffff;
+	}
+	msg->address_hi = zdev->msi_addr >> 32;
+	msg->data = zpci_decode_hwirq_msi_index(data->hwirq);
 }
 
 static struct irq_chip zpci_irq_chip = {
 	.name = "PCI-MSI",
-	.irq_unmask = pci_msi_unmask_irq,
-	.irq_mask = pci_msi_mask_irq,
+	.irq_compose_msi_msg = zpci_compose_msi_msg,
 };
 
 static void zpci_handle_cpu_local_irq(bool rescan)
 {
 	struct airq_iv *dibv = zpci_ibv[smp_processor_id()];
 	union zpci_sic_iib iib = {{0}};
+	struct irq_domain *msi_domain;
+	irq_hw_number_t hwirq;
 	unsigned long bit;
 	int irqs_on = 0;
 
@@ -164,7 +191,9 @@ static void zpci_handle_cpu_local_irq(bool rescan)
 			continue;
 		}
 		inc_irq_stat(IRQIO_MSI);
-		generic_handle_irq(airq_iv_get_data(dibv, bit));
+		hwirq = airq_iv_get_data(dibv, bit);
+		msi_domain = (struct irq_domain *)airq_iv_get_ptr(dibv, bit);
+		generic_handle_domain_irq(msi_domain, hwirq);
 	}
 }
 
@@ -229,6 +258,8 @@ static void zpci_floating_irq_handler(struct airq_struct *airq,
 				      struct tpi_info *tpi_info)
 {
 	union zpci_sic_iib iib = {{0}};
+	struct irq_domain *msi_domain;
+	irq_hw_number_t hwirq;
 	unsigned long si, ai;
 	struct airq_iv *aibv;
 	int irqs_on = 0;
@@ -256,7 +287,9 @@ static void zpci_floating_irq_handler(struct airq_struct *airq,
 				break;
 			inc_irq_stat(IRQIO_MSI);
 			airq_iv_lock(aibv, ai);
-			generic_handle_irq(airq_iv_get_data(aibv, ai));
+			hwirq = airq_iv_get_data(aibv, ai);
+			msi_domain = (struct irq_domain *)airq_iv_get_ptr(aibv, ai);
+			generic_handle_domain_irq(msi_domain, hwirq);
 			airq_iv_unlock(aibv, ai);
 		}
 	}
@@ -278,7 +311,9 @@ static int __alloc_airq(struct zpci_dev *zdev, int msi_vecs,
 		zdev->aisb = *bit;
 
 		/* Create adapter interrupt vector */
-		zdev->aibv = airq_iv_create(msi_vecs, AIRQ_IV_DATA | AIRQ_IV_BITLOCK, NULL);
+		zdev->aibv = airq_iv_create(msi_vecs,
+					    AIRQ_IV_PTR | AIRQ_IV_DATA | AIRQ_IV_BITLOCK,
+					    NULL);
 		if (!zdev->aibv)
 			return -ENOMEM;
 
@@ -290,146 +325,196 @@ static int __alloc_airq(struct zpci_dev *zdev, int msi_vecs,
 	return 0;
 }
 
-int arch_setup_msi_irqs(struct pci_dev *pdev, int nvec, int type)
+bool arch_restore_msi_irqs(struct pci_dev *pdev)
 {
-	unsigned int hwirq, msi_vecs, irqs_per_msi, i, cpu;
 	struct zpci_dev *zdev = to_zpci(pdev);
-	struct msi_desc *msi;
-	struct msi_msg msg;
-	unsigned long bit;
-	int cpu_addr;
-	int rc, irq;
 
+	zpci_set_irq(zdev);
+	return true;
+}
+
+static struct airq_struct zpci_airq = {
+	.handler = zpci_floating_irq_handler,
+	.isc = PCI_ISC,
+};
+
+static void zpci_msi_teardown_directed(struct zpci_dev *zdev)
+{
+	airq_iv_free(zpci_ibv[0], zdev->msi_first_bit, zdev->max_msi);
+	zdev->msi_first_bit = -1U;
+	zdev->msi_nr_irqs = 0;
+}
+
+static void zpci_msi_teardown_floating(struct zpci_dev *zdev)
+{
+	airq_iv_release(zdev->aibv);
+	zdev->aibv = NULL;
+	airq_iv_free_bit(zpci_sbv, zdev->aisb);
 	zdev->aisb = -1UL;
 	zdev->msi_first_bit = -1U;
+	zdev->msi_nr_irqs = 0;
+}
+
+static void zpci_msi_teardown(struct irq_domain *domain, msi_alloc_info_t *arg)
+{
+	struct zpci_dev *zdev = to_zpci_dev(domain->dev);
+
+	zpci_clear_irq(zdev);
+	if (irq_delivery == DIRECTED)
+		zpci_msi_teardown_directed(zdev);
+	else
+		zpci_msi_teardown_floating(zdev);
+}
+
+static int zpci_msi_prepare(struct irq_domain *domain,
+			    struct device *dev, int nvec,
+			    msi_alloc_info_t *info)
+{
+	struct zpci_dev *zdev = to_zpci_dev(dev);
+	struct pci_dev *pdev = to_pci_dev(dev);
+	unsigned long bit;
+	int msi_vecs, rc;
 
 	msi_vecs = min_t(unsigned int, nvec, zdev->max_msi);
 	if (msi_vecs < nvec) {
-		pr_info("%s requested %d irqs, allocate system limit of %d",
+		pr_info("%s requested %d IRQs, allocate system limit of %d\n",
 			pci_name(pdev), nvec, zdev->max_msi);
 	}
 
 	rc = __alloc_airq(zdev, msi_vecs, &bit);
-	if (rc < 0)
+	if (rc) {
+		pr_err("Allocating adapter IRQs for %s failed\n", pci_name(pdev));
 		return rc;
+	}
 
-	/*
-	 * Request MSI interrupts:
-	 * When using MSI, nvec_used interrupt sources and their irq
-	 * descriptors are controlled through one msi descriptor.
-	 * Thus the outer loop over msi descriptors shall run only once,
-	 * while two inner loops iterate over the interrupt vectors.
-	 * When using MSI-X, each interrupt vector/irq descriptor
-	 * is bound to exactly one msi descriptor (nvec_used is one).
-	 * So the inner loops are executed once, while the outer iterates
-	 * over the MSI-X descriptors.
-	 */
-	hwirq = bit;
-	msi_for_each_desc(msi, &pdev->dev, MSI_DESC_NOTASSOCIATED) {
-		if (hwirq - bit >= msi_vecs)
-			break;
-		irqs_per_msi = min_t(unsigned int, msi_vecs, msi->nvec_used);
-		irq = __irq_alloc_descs(-1, 0, irqs_per_msi, 0, THIS_MODULE,
-					(irq_delivery == DIRECTED) ?
-					msi->affinity : NULL);
-		if (irq < 0)
-			return -ENOMEM;
+	zdev->msi_first_bit = bit;
+	zdev->msi_nr_irqs = msi_vecs;
+	rc = zpci_set_irq(zdev);
+	if (rc) {
+		pr_err("Registering adapter IRQs for %s failed\n",
+		       pci_name(pdev));
+
+		if (irq_delivery == DIRECTED)
+			zpci_msi_teardown_directed(zdev);
+		else
+			zpci_msi_teardown_floating(zdev);
+		return rc;
+	}
+	return 0;
+}
 
-		for (i = 0; i < irqs_per_msi; i++) {
-			rc = irq_set_msi_desc_off(irq, i, msi);
-			if (rc)
-				return rc;
-			irq_set_chip_and_handler(irq + i, &zpci_irq_chip,
-						 handle_percpu_irq);
-		}
+static int zpci_msi_domain_alloc(struct irq_domain *domain, unsigned int virq,
+				 unsigned int nr_irqs, void *args)
+{
+	struct msi_desc *desc = ((msi_alloc_info_t *)args)->desc;
+	struct zpci_dev *zdev = to_zpci_dev(desc->dev);
+	struct zpci_bus *zbus = zdev->zbus;
+	unsigned int cpu, hwirq;
+	unsigned long bit;
+	int i;
 
-		msg.data = hwirq - bit;
-		if (irq_delivery == DIRECTED) {
-			if (msi->affinity)
-				cpu = cpumask_first(&msi->affinity->mask);
-			else
-				cpu = 0;
-			cpu_addr = smp_cpu_get_cpu_address(cpu);
+	bit = zdev->msi_first_bit + desc->msi_index;
+	hwirq = zpci_encode_hwirq(zdev->devfn, desc->msi_index);
 
-			msg.address_lo = zdev->msi_addr & 0xff0000ff;
-			msg.address_lo |= (cpu_addr << 8);
+	if (desc->msi_index + nr_irqs > zdev->max_msi)
+		return -EINVAL;
 
+	for (i = 0; i < nr_irqs; i++) {
+		irq_domain_set_info(domain, virq + i, hwirq + i,
+				    &zpci_irq_chip, zdev,
+				    handle_percpu_irq, NULL, NULL);
+
+		if (irq_delivery == DIRECTED) {
 			for_each_possible_cpu(cpu) {
-				for (i = 0; i < irqs_per_msi; i++)
-					airq_iv_set_data(zpci_ibv[cpu],
-							 hwirq + i, irq + i);
+				airq_iv_set_ptr(zpci_ibv[cpu], bit + i,
+						(unsigned long)zbus->msi_parent_domain);
+				airq_iv_set_data(zpci_ibv[cpu], bit + i, hwirq + i);
 			}
 		} else {
-			msg.address_lo = zdev->msi_addr & 0xffffffff;
-			for (i = 0; i < irqs_per_msi; i++)
-				airq_iv_set_data(zdev->aibv, hwirq + i, irq + i);
+			airq_iv_set_ptr(zdev->aibv, bit + i,
+					(unsigned long)zbus->msi_parent_domain);
+			airq_iv_set_data(zdev->aibv, bit + i, hwirq + i);
 		}
-		msg.address_hi = zdev->msi_addr >> 32;
-		pci_write_msi_msg(irq, &msg);
-		hwirq += irqs_per_msi;
 	}
 
-	zdev->msi_first_bit = bit;
-	zdev->msi_nr_irqs = hwirq - bit;
+	return 0;
+}
 
-	rc = zpci_set_irq(zdev);
-	if (rc)
-		return rc;
+static void zpci_msi_domain_free(struct irq_domain *domain, unsigned int virq,
+				 unsigned int nr_irqs)
+{
+	struct irq_data *d;
+	int i;
 
-	return (zdev->msi_nr_irqs == nvec) ? 0 : zdev->msi_nr_irqs;
+	for (i = 0; i < nr_irqs; i++) {
+		d = irq_domain_get_irq_data(domain, virq + i);
+		irq_domain_reset_irq_data(d);
+	}
 }
 
-void arch_teardown_msi_irqs(struct pci_dev *pdev)
+static const struct irq_domain_ops zpci_msi_domain_ops = {
+	.alloc = zpci_msi_domain_alloc,
+	.free  = zpci_msi_domain_free,
+};
+
+static bool zpci_init_dev_msi_info(struct device *dev, struct irq_domain *domain,
+				   struct irq_domain *real_parent,
+				   struct msi_domain_info *info)
 {
-	struct zpci_dev *zdev = to_zpci(pdev);
-	struct msi_desc *msi;
-	unsigned int i;
-	int rc;
+	if (!msi_lib_init_dev_msi_info(dev, domain, real_parent, info))
+		return false;
 
-	/* Disable interrupts */
-	rc = zpci_clear_irq(zdev);
-	if (rc)
-		return;
+	info->ops->msi_prepare = zpci_msi_prepare;
+	info->ops->msi_teardown = zpci_msi_teardown;
 
-	/* Release MSI interrupts */
-	msi_for_each_desc(msi, &pdev->dev, MSI_DESC_ASSOCIATED) {
-		for (i = 0; i < msi->nvec_used; i++) {
-			irq_set_msi_desc(msi->irq + i, NULL);
-			irq_free_desc(msi->irq + i);
-		}
-		msi->msg.address_lo = 0;
-		msi->msg.address_hi = 0;
-		msi->msg.data = 0;
-		msi->irq = 0;
-	}
+	return true;
+}
+
+static struct msi_parent_ops zpci_msi_parent_ops = {
+	.supported_flags   = MSI_GENERIC_FLAGS_MASK	|
+			     MSI_FLAG_PCI_MSIX		|
+			     MSI_FLAG_MULTI_PCI_MSI,
+	.required_flags	   = MSI_FLAG_USE_DEF_DOM_OPS  |
+			     MSI_FLAG_USE_DEF_CHIP_OPS,
+	.init_dev_msi_info = zpci_init_dev_msi_info,
+};
+
+int zpci_create_parent_msi_domain(struct zpci_bus *zbus)
+{
+	char fwnode_name[18];
 
-	if (zdev->aisb != -1UL) {
-		zpci_ibv[zdev->aisb] = NULL;
-		airq_iv_free_bit(zpci_sbv, zdev->aisb);
-		zdev->aisb = -1UL;
+	snprintf(fwnode_name, sizeof(fwnode_name), "ZPCI_MSI_DOM_%04x", zbus->domain_nr);
+	struct irq_domain_info info = {
+		.fwnode		= irq_domain_alloc_named_fwnode(fwnode_name),
+		.ops		= &zpci_msi_domain_ops,
+	};
+
+	if (!info.fwnode) {
+		pr_err("Failed to allocate fwnode for MSI IRQ domain\n");
+		return -ENOMEM;
 	}
-	if (zdev->aibv) {
-		airq_iv_release(zdev->aibv);
-		zdev->aibv = NULL;
+
+	if (irq_delivery == FLOATING)
+		zpci_msi_parent_ops.required_flags |= MSI_FLAG_NO_AFFINITY;
+	zbus->msi_parent_domain = msi_create_parent_irq_domain(&info, &zpci_msi_parent_ops);
+	if (!zbus->msi_parent_domain) {
+		irq_domain_free_fwnode(info.fwnode);
+		pr_err("Failed to create MSI IRQ domain\n");
+		return -ENOMEM;
 	}
 
-	if ((irq_delivery == DIRECTED) && zdev->msi_first_bit != -1U)
-		airq_iv_free(zpci_ibv[0], zdev->msi_first_bit, zdev->msi_nr_irqs);
+	return 0;
 }
 
-bool arch_restore_msi_irqs(struct pci_dev *pdev)
+void zpci_remove_parent_msi_domain(struct zpci_bus *zbus)
 {
-	struct zpci_dev *zdev = to_zpci(pdev);
+	struct fwnode_handle *fn;
 
-	zpci_set_irq(zdev);
-	return true;
+	fn = zbus->msi_parent_domain->fwnode;
+	irq_domain_remove(zbus->msi_parent_domain);
+	irq_domain_free_fwnode(fn);
 }
 
-static struct airq_struct zpci_airq = {
-	.handler = zpci_floating_irq_handler,
-	.isc = PCI_ISC,
-};
-
 static void __init cpu_enable_directed_irq(void *unused)
 {
 	union zpci_sic_iib iib = {{0}};
@@ -466,6 +551,7 @@ static int __init zpci_directed_irq_init(void)
 		 * is only done on the first vector.
 		 */
 		zpci_ibv[cpu] = airq_iv_create(cache_line_size() * BITS_PER_BYTE,
+					       AIRQ_IV_PTR |
 					       AIRQ_IV_DATA |
 					       AIRQ_IV_CACHELINE |
 					       (!cpu ? AIRQ_IV_ALLOC : 0), NULL);

-- 
2.51.0
Re: [PATCH v7 2/2] s390/pci: Migrate s390 IRQ logic to IRQ domain API
Posted by Gerd Bayer 2 months, 1 week ago
On Thu, 2025-11-27 at 16:07 +0100, Tobias Schumacher wrote:
> s390 is one of the last architectures using the legacy API for setup and
> teardown of PCI MSI IRQs. Migrate the s390 IRQ allocation and teardown
> to the MSI parent domain API. For details, see:
> 
> https://lore.kernel.org/lkml/20221111120501.026511281@linutronix.de
> 
> In detail, create an MSI parent domain for each PCI domain. When a PCI
> device sets up MSI or MSI-X IRQs, the library creates a per-device IRQ
> domain for this device, which is used by the device for allocating and
> freeing IRQs.
> 
> The per-device domain delegates this allocation and freeing to the
> parent-domain. In the end, the corresponding callbacks of the parent
> domain are responsible for allocating and freeing the IRQs.
> 
> The allocation is split into two parts:
> - zpci_msi_prepare() is called once for each device and allocates the
>   required resources. On s390, each PCI function has its own airq
>   vector and a summary bit, which must be configured once per function.
>   This is done in prepare().
> - zpci_msi_alloc() can be called multiple times for allocating one or
>   more MSI/MSI-X IRQs. This creates a mapping between the virtual IRQ
>   number in the kernel and the hardware IRQ number.
> 
> Freeing is split into two counterparts:
> - zpci_msi_free() reverts the effects of zpci_msi_alloc() and
> - zpci_msi_teardown() reverts the effects of zpci_msi_prepare(). This is
>   called once when all IRQs are freed before a device is removed.
> 
> Since the parent domain in the end allocates the IRQs, the hwirq
> encoding must be unambiguous for all IRQs of all devices. This is
> achieved by encoding the hwirq using the devfn and the MSI index.
> 
> Signed-off-by: Tobias Schumacher <ts@linux.ibm.com>
> ---
>  arch/s390/Kconfig           |   1 +
>  arch/s390/include/asm/pci.h |   5 +
>  arch/s390/pci/pci.c         |   6 +
>  arch/s390/pci/pci_bus.c     |  18 ++-
>  arch/s390/pci/pci_irq.c     | 310 ++++++++++++++++++++++++++++----------------
>  5 files changed, 224 insertions(+), 116 deletions(-)
> 

  [ ... snip ... ]

> diff --git a/arch/s390/pci/pci_irq.c b/arch/s390/pci/pci_irq.c
> index e73be96ce5fe6473fc193d65b8f0ff635d6a98ba..2ac0fab605a83a2f06be6a0a68718e528125ced6 100644
> --- a/arch/s390/pci/pci_irq.c
> +++ b/arch/s390/pci/pci_irq.c
> @@ -290,146 +325,196 @@ static int __alloc_airq(struct zpci_dev *zdev, int msi_vecs,
>  	return 0;
>  }
>  
> -int arch_setup_msi_irqs(struct pci_dev *pdev, int nvec, int type)
> +bool arch_restore_msi_irqs(struct pci_dev *pdev)
>  {
> -	unsigned int hwirq, msi_vecs, irqs_per_msi, i, cpu;
>  	struct zpci_dev *zdev = to_zpci(pdev);
> -	struct msi_desc *msi;
> -	struct msi_msg msg;
> -	unsigned long bit;
> -	int cpu_addr;
> -	int rc, irq;
>  
> +	zpci_set_irq(zdev);
> +	return true;
> +}
> 

It's always a little tricky to distinguish which code handles both MSI
and MSI-X or just MSI proper when routines have _msi_ in their name.
But apparently, both __pci_restore_msi_state() and
__pci_restore_msix_state() inside pci_restore_msi_state() do call
arch_restore_msi_irqs() - so life is good!


  [ ... snip ... ]

> +static void zpci_msi_domain_free(struct irq_domain *domain, unsigned int virq,
> +				 unsigned int nr_irqs)
> +{
> +	struct irq_data *d;
> +	int i;
>  
> -	return (zdev->msi_nr_irqs == nvec) ? 0 : zdev->msi_nr_irqs;
> +	for (i = 0; i < nr_irqs; i++) {
> +		d = irq_domain_get_irq_data(domain, virq + i);
> +		irq_domain_reset_irq_data(d);

Question: zpci_msi_alloc_domain() did modify airq data, can this be
left as is in zpci_msi_domain_free()?

> +	}
>  }
> 

   [ ... snip ... ]


> +
> +int zpci_create_parent_msi_domain(struct zpci_bus *zbus)
> +{
> +	char fwnode_name[18];
>  
> -	if (zdev->aisb != -1UL) {
> -		zpci_ibv[zdev->aisb] = NULL;
> -		airq_iv_free_bit(zpci_sbv, zdev->aisb);
> -		zdev->aisb = -1UL;
> +	snprintf(fwnode_name, sizeof(fwnode_name), "ZPCI_MSI_DOM_%04x", zbus->domain_nr);
> +	struct irq_domain_info info = {
> +		.fwnode		= irq_domain_alloc_named_fwnode(fwnode_name),
> +		.ops		= &zpci_msi_domain_ops,
> +	};
> +
> +	if (!info.fwnode) {
> +		pr_err("Failed to allocate fwnode for MSI IRQ domain\n");
> +		return -ENOMEM;
>  	}
> -	if (zdev->aibv) {
> -		airq_iv_release(zdev->aibv);
> -		zdev->aibv = NULL;
> +
> +	if (irq_delivery == FLOATING)
> +		zpci_msi_parent_ops.required_flags |= MSI_FLAG_NO_AFFINITY;

Add empty line here, so the intent is clear that the following
assignment is executed unconditionally.

> +	zbus->msi_parent_domain = msi_create_parent_irq_domain(&info, &zpci_msi_parent_ops);
> +	if (!zbus->msi_parent_domain) {
> +		irq_domain_free_fwnode(info.fwnode);
> +		pr_err("Failed to create MSI IRQ domain\n");
> +		return -ENOMEM;
>  	}
>  
> -	if ((irq_delivery == DIRECTED) && zdev->msi_first_bit != -1U)
> -		airq_iv_free(zpci_ibv[0], zdev->msi_first_bit, zdev->msi_nr_irqs);
> +	return 0;
>  }
> 

   [ ... snip ... ]
 
> @@ -466,6 +551,7 @@ static int __init zpci_directed_irq_init(void)
>  		 * is only done on the first vector.
>  		 */
>  		zpci_ibv[cpu] = airq_iv_create(cache_line_size() * BITS_PER_BYTE,
> +					       AIRQ_IV_PTR |
>  					       AIRQ_IV_DATA |
>  					       AIRQ_IV_CACHELINE |
>  					       (!cpu ? AIRQ_IV_ALLOC : 0), NULL);


This looks very good to me already. Unfortunately, I was unable to
relieve my MSI vs. MSI-X anxiety regarding arch_restore_msi_irqs() with
a test since the only MSI-using PCI function (ISM) is not supporting
PCI auto-recovery :(

But a mlx5 VF now recovers just fine!

Thanks,
Gerd
Re: [PATCH v7 2/2] s390/pci: Migrate s390 IRQ logic to IRQ domain API
Posted by Tobias Schumacher 2 months, 1 week ago
On Tue Dec 2, 2025 at 7:14 PM CET, Gerd Bayer wrote:
> On Thu, 2025-11-27 at 16:07 +0100, Tobias Schumacher wrote:
>   [ ... snip ... ]
>
>> diff --git a/arch/s390/pci/pci_irq.c b/arch/s390/pci/pci_irq.c
>> index e73be96ce5fe6473fc193d65b8f0ff635d6a98ba..2ac0fab605a83a2f06be6a0a68718e528125ced6 100644
>> --- a/arch/s390/pci/pci_irq.c
>> +++ b/arch/s390/pci/pci_irq.c
>> @@ -290,146 +325,196 @@ static int __alloc_airq(struct zpci_dev *zdev, int msi_vecs,
>>  	return 0;
>>  }
>>  
>> -int arch_setup_msi_irqs(struct pci_dev *pdev, int nvec, int type)
>> +bool arch_restore_msi_irqs(struct pci_dev *pdev)
>>  {
>> -	unsigned int hwirq, msi_vecs, irqs_per_msi, i, cpu;
>>  	struct zpci_dev *zdev = to_zpci(pdev);
>> -	struct msi_desc *msi;
>> -	struct msi_msg msg;
>> -	unsigned long bit;
>> -	int cpu_addr;
>> -	int rc, irq;
>>  
>> +	zpci_set_irq(zdev);
>> +	return true;
>> +}
>> 
>
> It's always a little tricky to distinguish which code handles both MSI
> and MSI-X or just MSI proper when routines have _msi_ in their name.
> But apparently, both __pci_restore_msi_state() and
> __pci_restore_msix_state() inside pci_restore_msi_state() do call
> arch_restore_msi_irqs() - so life is good!

Regarding arch_restore_msi_irqs() the main change in the patchset is
that it is now also conditionally  called from zpci_reenable_device().
This is becasue in the recovery case, __pci_restore_msix_state() does
not call arch_restore_msi_irqs(), it exits directly at the beginning
because dev->msix_enabled evaluates to false.

With the legacy API, IRQs are later re-enabled using
arch_setup_msi_irqs(), which also registers the airq with the hw. With
the MSI parent domain, zpci_msi_prepare() would register the airq, but
is not called in the recovery path. This is why it is now added to
zpci_reenable_device()


>   [ ... snip ... ]
>
>> +static void zpci_msi_domain_free(struct irq_domain *domain, unsigned int virq,
>> +				 unsigned int nr_irqs)
>> +{
>> +	struct irq_data *d;
>> +	int i;
>>  
>> -	return (zdev->msi_nr_irqs == nvec) ? 0 : zdev->msi_nr_irqs;
>> +	for (i = 0; i < nr_irqs; i++) {
>> +		d = irq_domain_get_irq_data(domain, virq + i);
>> +		irq_domain_reset_irq_data(d);
>
> Question: zpci_msi_alloc_domain() did modify airq data, can this be
> left as is in zpci_msi_domain_free()?

I was thinking about this myself and came to the conclusion that it is
fine. zpci_msi_domain_alloc() sets the ptr to the msi parent domain and
data to the encoded hwirq. Both fields are only required in the IRQ
handler.
* When free() is called, the corresponding interrupt was already
  deactivated by the hardware, so hardware shouldn't generate it
  anymore anyway.
* If, for whatever reason, hw still generates the interrupt,
  generic_handle_domain_irq returns an error since the hwirq cannot be
  resolved.
* If the IRQ gets allocated again, the fields are written again before
  the IRQ is activated. The data written will even be the same
  as it was before.

>    [ ... snip ... ]
>
>
>> +
>> +int zpci_create_parent_msi_domain(struct zpci_bus *zbus)
>> +{
>> +	char fwnode_name[18];
>>  
>> -	if (zdev->aisb != -1UL) {
>> -		zpci_ibv[zdev->aisb] = NULL;
>> -		airq_iv_free_bit(zpci_sbv, zdev->aisb);
>> -		zdev->aisb = -1UL;
>> +	snprintf(fwnode_name, sizeof(fwnode_name), "ZPCI_MSI_DOM_%04x", zbus->domain_nr);
>> +	struct irq_domain_info info = {
>> +		.fwnode		= irq_domain_alloc_named_fwnode(fwnode_name),
>> +		.ops		= &zpci_msi_domain_ops,
>> +	};
>> +
>> +	if (!info.fwnode) {
>> +		pr_err("Failed to allocate fwnode for MSI IRQ domain\n");
>> +		return -ENOMEM;
>>  	}
>> -	if (zdev->aibv) {
>> -		airq_iv_release(zdev->aibv);
>> -		zdev->aibv = NULL;
>> +
>> +	if (irq_delivery == FLOATING)
>> +		zpci_msi_parent_ops.required_flags |= MSI_FLAG_NO_AFFINITY;
>
> Add empty line here, so the intent is clear that the following
> assignment is executed unconditionally.

Ok.

>    [ ... snip ... ]
>  
>> @@ -466,6 +551,7 @@ static int __init zpci_directed_irq_init(void)
>>  		 * is only done on the first vector.
>>  		 */
>>  		zpci_ibv[cpu] = airq_iv_create(cache_line_size() * BITS_PER_BYTE,
>> +					       AIRQ_IV_PTR |
>>  					       AIRQ_IV_DATA |
>>  					       AIRQ_IV_CACHELINE |
>>  					       (!cpu ? AIRQ_IV_ALLOC : 0), NULL);
>
>
> This looks very good to me already. Unfortunately, I was unable to
> relieve my MSI vs. MSI-X anxiety regarding arch_restore_msi_irqs() with
> a test since the only MSI-using PCI function (ISM) is not supporting
> PCI auto-recovery :(
>
> But a mlx5 VF now recovers just fine!

Did my expanation above help with this?

Thanks
Tobias
Re: [PATCH v7 2/2] s390/pci: Migrate s390 IRQ logic to IRQ domain API
Posted by Gerd Bayer 2 months, 1 week ago
On Wed, 2025-12-03 at 08:53 +0100, Tobias Schumacher wrote:
> On Tue Dec 2, 2025 at 7:14 PM CET, Gerd Bayer wrote:
> > On Thu, 2025-11-27 at 16:07 +0100, Tobias Schumacher wrote:
> >   [ ... snip ... ]
> > 
> > > diff --git a/arch/s390/pci/pci_irq.c b/arch/s390/pci/pci_irq.c
> > > index e73be96ce5fe6473fc193d65b8f0ff635d6a98ba..2ac0fab605a83a2f06be6a0a68718e528125ced6 100644
> > > --- a/arch/s390/pci/pci_irq.c
> > > +++ b/arch/s390/pci/pci_irq.c
> > > @@ -290,146 +325,196 @@ static int __alloc_airq(struct zpci_dev *zdev, int msi_vecs,
> > >  	return 0;
> > >  }
> > >  
> > > -int arch_setup_msi_irqs(struct pci_dev *pdev, int nvec, int type)
> > > +bool arch_restore_msi_irqs(struct pci_dev *pdev)
> > >  {
> > > -	unsigned int hwirq, msi_vecs, irqs_per_msi, i, cpu;
> > >  	struct zpci_dev *zdev = to_zpci(pdev);
> > > -	struct msi_desc *msi;
> > > -	struct msi_msg msg;
> > > -	unsigned long bit;
> > > -	int cpu_addr;
> > > -	int rc, irq;
> > >  
> > > +	zpci_set_irq(zdev);
> > > +	return true;
> > > +}
> > > 
> > 
> > It's always a little tricky to distinguish which code handles both MSI
> > and MSI-X or just MSI proper when routines have _msi_ in their name.
> > But apparently, both __pci_restore_msi_state() and
> > __pci_restore_msix_state() inside pci_restore_msi_state() do call
> > arch_restore_msi_irqs() - so life is good!
> 
> Regarding arch_restore_msi_irqs() the main change in the patchset is
> that it is now also conditionally  called from zpci_reenable_device().

Sorry, I don't follow: This patch adds a conditional call to
zpci_set_irg() to zpci_reenable_device() - not arch_restore_msi_irqs().

> This is becasue in the recovery case, __pci_restore_msix_state() does
> not call arch_restore_msi_irqs(), it exits directly at the beginning
> because dev->msix_enabled evaluates to false.

Does that mean arch_restore_msi_irqs() is dead code?
After re-reading pci_save_state()/pci_restore_state(), it sounds as if
arch_restore_msi_irqs() may be useful afterall, with device drivers
that consider the MSI/MSI-X interrupt setup part of their save/restore
snapshot? And we just happen to have not executed any of those, maybe?

So probably just leave it in.

> With the legacy API, IRQs are later re-enabled using
> arch_setup_msi_irqs(), which also registers the airq with the hw. With
> the MSI parent domain, zpci_msi_prepare() would register the airq, but
> is not called in the recovery path. This is why it is now added to
> zpci_reenable_device()
> 
> 
> >   [ ... snip ... ]
> > 
> > > +static void zpci_msi_domain_free(struct irq_domain *domain, unsigned int virq,
> > > +				 unsigned int nr_irqs)
> > > +{
> > > +	struct irq_data *d;
> > > +	int i;
> > >  
> > > -	return (zdev->msi_nr_irqs == nvec) ? 0 : zdev->msi_nr_irqs;
> > > +	for (i = 0; i < nr_irqs; i++) {
> > > +		d = irq_domain_get_irq_data(domain, virq + i);
> > > +		irq_domain_reset_irq_data(d);
> > 
> > Question: zpci_msi_alloc_domain() did modify airq data, can this be
> > left as is in zpci_msi_domain_free()?
> 
> I was thinking about this myself and came to the conclusion that it is
> fine. zpci_msi_domain_alloc() sets the ptr to the msi parent domain and
> data to the encoded hwirq. Both fields are only required in the IRQ
> handler.
> * When free() is called, the corresponding interrupt was already
>   deactivated by the hardware, so hardware shouldn't generate it
>   anymore anyway.
> * If, for whatever reason, hw still generates the interrupt,
>   generic_handle_domain_irq returns an error since the hwirq cannot be
>   resolved.
> * If the IRQ gets allocated again, the fields are written again before
>   the IRQ is activated. The data written will even be the same
>   as it was before.

Well, this is all assuming no further errors in the code...
I'd still vote to clean up airq resources when they are no longer
needed - just act defensively in case some weird (future) path still
tries to use these after they got put to rest - or you have to do some
post-mortem dump analysis and try to make sense of this "garbage".

> 
> >    [ ... snip ... ]
> > 
> > 
> > > +
> > > +int zpci_create_parent_msi_domain(struct zpci_bus *zbus)
> > > +{
> > > +	char fwnode_name[18];
> > >  
> > > -	if (zdev->aisb != -1UL) {
> > > -		zpci_ibv[zdev->aisb] = NULL;
> > > -		airq_iv_free_bit(zpci_sbv, zdev->aisb);
> > > -		zdev->aisb = -1UL;
> > > +	snprintf(fwnode_name, sizeof(fwnode_name), "ZPCI_MSI_DOM_%04x", zbus->domain_nr);
> > > +	struct irq_domain_info info = {
> > > +		.fwnode		= irq_domain_alloc_named_fwnode(fwnode_name),
> > > +		.ops		= &zpci_msi_domain_ops,
> > > +	};
> > > +
> > > +	if (!info.fwnode) {
> > > +		pr_err("Failed to allocate fwnode for MSI IRQ domain\n");
> > > +		return -ENOMEM;
> > >  	}
> > > -	if (zdev->aibv) {
> > > -		airq_iv_release(zdev->aibv);
> > > -		zdev->aibv = NULL;
> > > +
> > > +	if (irq_delivery == FLOATING)
> > > +		zpci_msi_parent_ops.required_flags |= MSI_FLAG_NO_AFFINITY;
> > 
> > Add empty line here, so the intent is clear that the following
> > assignment is executed unconditionally.
> 
> Ok.
> 
> >    [ ... snip ... ]
> >  
> > > @@ -466,6 +551,7 @@ static int __init zpci_directed_irq_init(void)
> > >  		 * is only done on the first vector.
> > >  		 */
> > >  		zpci_ibv[cpu] = airq_iv_create(cache_line_size() * BITS_PER_BYTE,
> > > +					       AIRQ_IV_PTR |
> > >  					       AIRQ_IV_DATA |
> > >  					       AIRQ_IV_CACHELINE |
> > >  					       (!cpu ? AIRQ_IV_ALLOC : 0), NULL);
> > 
> > 
> > This looks very good to me already. Unfortunately, I was unable to
> > relieve my MSI vs. MSI-X anxiety regarding arch_restore_msi_irqs() with
> > a test since the only MSI-using PCI function (ISM) is not supporting
> > PCI auto-recovery :(
> > 
> > But a mlx5 VF now recovers just fine!
> 
> Did my expanation above help with this?

Yes, thank you. But I still would request to address the airq cleanup
in zpci_msi_domain_free().

> Thanks
> Tobias

Thanks,
Gerd
Re: [PATCH v7 2/2] s390/pci: Migrate s390 IRQ logic to IRQ domain API
Posted by Tobias Schumacher 2 months, 1 week ago
On Wed Dec 3, 2025 at 1:32 PM CET, Gerd Bayer wrote:
> On Wed, 2025-12-03 at 08:53 +0100, Tobias Schumacher wrote:
>> On Tue Dec 2, 2025 at 7:14 PM CET, Gerd Bayer wrote:
>> > On Thu, 2025-11-27 at 16:07 +0100, Tobias Schumacher wrote:
>> >   [ ... snip ... ]
>> > 
>> > > diff --git a/arch/s390/pci/pci_irq.c b/arch/s390/pci/pci_irq.c
>> > > index e73be96ce5fe6473fc193d65b8f0ff635d6a98ba..2ac0fab605a83a2f06be6a0a68718e528125ced6 100644
>> > > --- a/arch/s390/pci/pci_irq.c
>> > > +++ b/arch/s390/pci/pci_irq.c
>> > > @@ -290,146 +325,196 @@ static int __alloc_airq(struct zpci_dev *zdev, int msi_vecs,
>> > >  	return 0;
>> > >  }
>> > >  
>> > > -int arch_setup_msi_irqs(struct pci_dev *pdev, int nvec, int type)
>> > > +bool arch_restore_msi_irqs(struct pci_dev *pdev)
>> > >  {
>> > > -	unsigned int hwirq, msi_vecs, irqs_per_msi, i, cpu;
>> > >  	struct zpci_dev *zdev = to_zpci(pdev);
>> > > -	struct msi_desc *msi;
>> > > -	struct msi_msg msg;
>> > > -	unsigned long bit;
>> > > -	int cpu_addr;
>> > > -	int rc, irq;
>> > >  
>> > > +	zpci_set_irq(zdev);
>> > > +	return true;
>> > > +}
>> > > 
>> > 
>> > It's always a little tricky to distinguish which code handles both MSI
>> > and MSI-X or just MSI proper when routines have _msi_ in their name.
>> > But apparently, both __pci_restore_msi_state() and
>> > __pci_restore_msix_state() inside pci_restore_msi_state() do call
>> > arch_restore_msi_irqs() - so life is good!
>> 
>> Regarding arch_restore_msi_irqs() the main change in the patchset is
>> that it is now also conditionally  called from zpci_reenable_device().

Right, sorry. 

> Sorry, I don't follow: This patch adds a conditional call to
> zpci_set_irg() to zpci_reenable_device() - not arch_restore_msi_irqs().
>
>> This is becasue in the recovery case, __pci_restore_msix_state() does
>> not call arch_restore_msi_irqs(), it exits directly at the beginning
>> because dev->msix_enabled evaluates to false.
>
> Does that mean arch_restore_msi_irqs() is dead code?
> After re-reading pci_save_state()/pci_restore_state(), it sounds as if
> arch_restore_msi_irqs() may be useful afterall, with device drivers
> that consider the MSI/MSI-X interrupt setup part of their save/restore
> snapshot? And we just happen to have not executed any of those, maybe?
>
> So probably just leave it in.

No, it's not dead code. After the zpcictl --reset-fw, MSI-X interrupts
are shutdown before the pci_restore_state(), which is why
arch_restore_msi_irqs() is not called. But a driver can still call
pci_save_state() and pci_restore_state() without shutting down MSI IRQs
before, in which case arch_restore_msi_irqs() is called.

>> With the legacy API, IRQs are later re-enabled using
>> arch_setup_msi_irqs(), which also registers the airq with the hw. With
>> the MSI parent domain, zpci_msi_prepare() would register the airq, but
>> is not called in the recovery path. This is why it is now added to
>> zpci_reenable_device()
>> 
>> 
>> >   [ ... snip ... ]
>> > 
>> > > +static void zpci_msi_domain_free(struct irq_domain *domain, unsigned int virq,
>> > > +				 unsigned int nr_irqs)
>> > > +{
>> > > +	struct irq_data *d;
>> > > +	int i;
>> > >  
>> > > -	return (zdev->msi_nr_irqs == nvec) ? 0 : zdev->msi_nr_irqs;
>> > > +	for (i = 0; i < nr_irqs; i++) {
>> > > +		d = irq_domain_get_irq_data(domain, virq + i);
>> > > +		irq_domain_reset_irq_data(d);
>> > 
>> > Question: zpci_msi_alloc_domain() did modify airq data, can this be
>> > left as is in zpci_msi_domain_free()?
>> 
>> I was thinking about this myself and came to the conclusion that it is
>> fine. zpci_msi_domain_alloc() sets the ptr to the msi parent domain and
>> data to the encoded hwirq. Both fields are only required in the IRQ
>> handler.
>> * When free() is called, the corresponding interrupt was already
>>   deactivated by the hardware, so hardware shouldn't generate it
>>   anymore anyway.
>> * If, for whatever reason, hw still generates the interrupt,
>>   generic_handle_domain_irq returns an error since the hwirq cannot be
>>   resolved.
>> * If the IRQ gets allocated again, the fields are written again before
>>   the IRQ is activated. The data written will even be the same
>>   as it was before.
>
> Well, this is all assuming no further errors in the code...
> I'd still vote to clean up airq resources when they are no longer
> needed - just act defensively in case some weird (future) path still
> tries to use these after they got put to rest - or you have to do some
> post-mortem dump analysis and try to make sense of this "garbage".

Ok, I can do that.

>> >    [ ... snip ... ]
>> >  
>> > > @@ -466,6 +551,7 @@ static int __init zpci_directed_irq_init(void)
>> > >  		 * is only done on the first vector.
>> > >  		 */
>> > >  		zpci_ibv[cpu] = airq_iv_create(cache_line_size() * BITS_PER_BYTE,
>> > > +					       AIRQ_IV_PTR |
>> > >  					       AIRQ_IV_DATA |
>> > >  					       AIRQ_IV_CACHELINE |
>> > >  					       (!cpu ? AIRQ_IV_ALLOC : 0), NULL);
>> > 
>> > 
>> > This looks very good to me already. Unfortunately, I was unable to
>> > relieve my MSI vs. MSI-X anxiety regarding arch_restore_msi_irqs() with
>> > a test since the only MSI-using PCI function (ISM) is not supporting
>> > PCI auto-recovery :(
>> > 
>> > But a mlx5 VF now recovers just fine!
>> 
>> Did my expanation above help with this?
>
> Yes, thank you. But I still would request to address the airq cleanup
> in zpci_msi_domain_free().

Ok, will do.

Thanks
Tobias
Re: [PATCH v7 2/2] s390/pci: Migrate s390 IRQ logic to IRQ domain API
Posted by Gerd Bayer 2 months, 1 week ago
On Wed, 2025-12-03 at 14:55 +0100, Tobias Schumacher wrote:
> On Wed Dec 3, 2025 at 1:32 PM CET, Gerd Bayer wrote:
> > On Wed, 2025-12-03 at 08:53 +0100, Tobias Schumacher wrote:
> > > On Tue Dec 2, 2025 at 7:14 PM CET, Gerd Bayer wrote:
> > > > On Thu, 2025-11-27 at 16:07 +0100, Tobias Schumacher wrote:
> > > >   [ ... snip ... ]
> > > > 
> > > > > diff --git a/arch/s390/pci/pci_irq.c b/arch/s390/pci/pci_irq.c
> > > > > index e73be96ce5fe6473fc193d65b8f0ff635d6a98ba..2ac0fab605a83a2f06be6a0a68718e528125ced6 100644
> > > > > --- a/arch/s390/pci/pci_irq.c
> > > > > +++ b/arch/s390/pci/pci_irq.c
> > > > > @@ -290,146 +325,196 @@ static int __alloc_airq(struct zpci_dev *zdev, int msi_vecs,
> > > > >  	return 0;
> > > > >  }
> > > > >  
> > > > > -int arch_setup_msi_irqs(struct pci_dev *pdev, int nvec, int type)
> > > > > +bool arch_restore_msi_irqs(struct pci_dev *pdev)
> > > > >  {
> > > > > -	unsigned int hwirq, msi_vecs, irqs_per_msi, i, cpu;
> > > > >  	struct zpci_dev *zdev = to_zpci(pdev);
> > > > > -	struct msi_desc *msi;
> > > > > -	struct msi_msg msg;
> > > > > -	unsigned long bit;
> > > > > -	int cpu_addr;
> > > > > -	int rc, irq;
> > > > >  
> > > > > +	zpci_set_irq(zdev);
> > > > > +	return true;
> > > > > +}
> > > > > 
> > > > 
> > > > It's always a little tricky to distinguish which code handles both MSI
> > > > and MSI-X or just MSI proper when routines have _msi_ in their name.
> > > > But apparently, both __pci_restore_msi_state() and
> > > > __pci_restore_msix_state() inside pci_restore_msi_state() do call
> > > > arch_restore_msi_irqs() - so life is good!
> > > 
> > > Regarding arch_restore_msi_irqs() the main change in the patchset is
> > > that it is now also conditionally  called from zpci_reenable_device().
> 
> Right, sorry. 
> 
> > Sorry, I don't follow: This patch adds a conditional call to
> > zpci_set_irg() to zpci_reenable_device() - not arch_restore_msi_irqs().
> > 
> > > This is becasue in the recovery case, __pci_restore_msix_state() does
> > > not call arch_restore_msi_irqs(), it exits directly at the beginning
> > > because dev->msix_enabled evaluates to false.
> > 
> > Does that mean arch_restore_msi_irqs() is dead code?
> > After re-reading pci_save_state()/pci_restore_state(), it sounds as if
> > arch_restore_msi_irqs() may be useful afterall, with device drivers
> > that consider the MSI/MSI-X interrupt setup part of their save/restore
> > snapshot? And we just happen to have not executed any of those, maybe?
> > 
> > So probably just leave it in.
> 
> No, it's not dead code. After the zpcictl --reset-fw, MSI-X interrupts
> are shutdown before the pci_restore_state(), which is why
> arch_restore_msi_irqs() is not called. But a driver can still call
> pci_save_state() and pci_restore_state() without shutting down MSI IRQs
> before, in which case arch_restore_msi_irqs() is called.

Yes, that's the scenario that I thought of when I wrote that device
drivers (other than mlx5_core) may want to have their MSI/MSI-X
interrupt setup be restored through pci_restore_state()

> > > With the legacy API, IRQs are later re-enabled using
> > > arch_setup_msi_irqs(), which also registers the airq with the hw. With
> > > the MSI parent domain, zpci_msi_prepare() would register the airq, but
> > > is not called in the recovery path. This is why it is now added to
> > > zpci_reenable_device()
> > > 
> > > 
> > > >   [ ... snip ... ]
> > > > 
> > > > > +static void zpci_msi_domain_free(struct irq_domain *domain, unsigned int virq,
> > > > > +				 unsigned int nr_irqs)
> > > > > +{
> > > > > +	struct irq_data *d;
> > > > > +	int i;
> > > > >  
> > > > > -	return (zdev->msi_nr_irqs == nvec) ? 0 : zdev->msi_nr_irqs;
> > > > > +	for (i = 0; i < nr_irqs; i++) {
> > > > > +		d = irq_domain_get_irq_data(domain, virq + i);
> > > > > +		irq_domain_reset_irq_data(d);
> > > > 
> > > > Question: zpci_msi_alloc_domain() did modify airq data, can this be
> > > > left as is in zpci_msi_domain_free()?
> > > 
> > > I was thinking about this myself and came to the conclusion that it is
> > > fine. zpci_msi_domain_alloc() sets the ptr to the msi parent domain and
> > > data to the encoded hwirq. Both fields are only required in the IRQ
> > > handler.
> > > * When free() is called, the corresponding interrupt was already
> > >   deactivated by the hardware, so hardware shouldn't generate it
> > >   anymore anyway.
> > > * If, for whatever reason, hw still generates the interrupt,
> > >   generic_handle_domain_irq returns an error since the hwirq cannot be
> > >   resolved.
> > > * If the IRQ gets allocated again, the fields are written again before
> > >   the IRQ is activated. The data written will even be the same
> > >   as it was before.
> > 
> > Well, this is all assuming no further errors in the code...
> > I'd still vote to clean up airq resources when they are no longer
> > needed - just act defensively in case some weird (future) path still
> > tries to use these after they got put to rest - or you have to do some
> > post-mortem dump analysis and try to make sense of this "garbage".
> 
> Ok, I can do that.
> 
> > > >    [ ... snip ... ]
> > > >  
> > > > > @@ -466,6 +551,7 @@ static int __init zpci_directed_irq_init(void)
> > > > >  		 * is only done on the first vector.
> > > > >  		 */
> > > > >  		zpci_ibv[cpu] = airq_iv_create(cache_line_size() * BITS_PER_BYTE,
> > > > > +					       AIRQ_IV_PTR |
> > > > >  					       AIRQ_IV_DATA |
> > > > >  					       AIRQ_IV_CACHELINE |
> > > > >  					       (!cpu ? AIRQ_IV_ALLOC : 0), NULL);
> > > > 
> > > > 
> > > > This looks very good to me already. Unfortunately, I was unable to
> > > > relieve my MSI vs. MSI-X anxiety regarding arch_restore_msi_irqs() with
> > > > a test since the only MSI-using PCI function (ISM) is not supporting
> > > > PCI auto-recovery :(
> > > > 
> > > > But a mlx5 VF now recovers just fine!
> > > 
> > > Did my expanation above help with this?
> > 
> > Yes, thank you. But I still would request to address the airq cleanup
> > in zpci_msi_domain_free().
> 
> Ok, will do.

Thank you,
Gerd
Re: [PATCH v7 2/2] s390/pci: Migrate s390 IRQ logic to IRQ domain API
Posted by Farhan Ali 2 months, 1 week ago
On 11/27/2025 7:07 AM, Tobias Schumacher wrote:
> s390 is one of the last architectures using the legacy API for setup and
> teardown of PCI MSI IRQs. Migrate the s390 IRQ allocation and teardown
> to the MSI parent domain API. For details, see:
>
> https://lore.kernel.org/lkml/20221111120501.026511281@linutronix.de
>
> In detail, create an MSI parent domain for each PCI domain. When a PCI
> device sets up MSI or MSI-X IRQs, the library creates a per-device IRQ
> domain for this device, which is used by the device for allocating and
> freeing IRQs.
>
> The per-device domain delegates this allocation and freeing to the
> parent-domain. In the end, the corresponding callbacks of the parent
> domain are responsible for allocating and freeing the IRQs.
>
> The allocation is split into two parts:
> - zpci_msi_prepare() is called once for each device and allocates the
>    required resources. On s390, each PCI function has its own airq
>    vector and a summary bit, which must be configured once per function.
>    This is done in prepare().
> - zpci_msi_alloc() can be called multiple times for allocating one or
>    more MSI/MSI-X IRQs. This creates a mapping between the virtual IRQ
>    number in the kernel and the hardware IRQ number.
>
> Freeing is split into two counterparts:
> - zpci_msi_free() reverts the effects of zpci_msi_alloc() and
> - zpci_msi_teardown() reverts the effects of zpci_msi_prepare(). This is
>    called once when all IRQs are freed before a device is removed.
>
> Since the parent domain in the end allocates the IRQs, the hwirq
> encoding must be unambiguous for all IRQs of all devices. This is
> achieved by encoding the hwirq using the devfn and the MSI index.
>
> Signed-off-by: Tobias Schumacher<ts@linux.ibm.com>
> ---
>   arch/s390/Kconfig           |   1 +
>   arch/s390/include/asm/pci.h |   5 +
>   arch/s390/pci/pci.c         |   6 +
>   arch/s390/pci/pci_bus.c     |  18 ++-
>   arch/s390/pci/pci_irq.c     | 310 ++++++++++++++++++++++++++++----------------
>   5 files changed, 224 insertions(+), 116 deletions(-)

LGTM and testing this patch it does fix the issue found in v6.

Reviewed-by: Farhan Ali<alifm@linux.ibm.com>
Re: [PATCH v7 2/2] s390/pci: Migrate s390 IRQ logic to IRQ domain API
Posted by Niklas Schnelle 2 months, 1 week ago
On Thu, 2025-11-27 at 16:07 +0100, Tobias Schumacher wrote:
> s390 is one of the last architectures using the legacy API for setup and
> teardown of PCI MSI IRQs. Migrate the s390 IRQ allocation and teardown
> to the MSI parent domain API. For details, see:
> 
> https://lore.kernel.org/lkml/20221111120501.026511281@linutronix.de
> 
> In detail, create an MSI parent domain for each PCI domain. When a PCI
> device sets up MSI or MSI-X IRQs, the library creates a per-device IRQ
> domain for this device, which is used by the device for allocating and
> freeing IRQs.
> 
> The per-device domain delegates this allocation and freeing to the
> parent-domain. In the end, the corresponding callbacks of the parent
> domain are responsible for allocating and freeing the IRQs.
> 
> The allocation is split into two parts:
> - zpci_msi_prepare() is called once for each device and allocates the
>   required resources. On s390, each PCI function has its own airq
>   vector and a summary bit, which must be configured once per function.
>   This is done in prepare().
> - zpci_msi_alloc() can be called multiple times for allocating one or
>   more MSI/MSI-X IRQs. This creates a mapping between the virtual IRQ
>   number in the kernel and the hardware IRQ number.
> 
> Freeing is split into two counterparts:
> - zpci_msi_free() reverts the effects of zpci_msi_alloc() and
> - zpci_msi_teardown() reverts the effects of zpci_msi_prepare(). This is
>   called once when all IRQs are freed before a device is removed.
> 
> Since the parent domain in the end allocates the IRQs, the hwirq
> encoding must be unambiguous for all IRQs of all devices. This is
> achieved by encoding the hwirq using the devfn and the MSI index.
> 
> Signed-off-by: Tobias Schumacher <ts@linux.ibm.com>
> ---
>  arch/s390/Kconfig           |   1 +
>  arch/s390/include/asm/pci.h |   5 +
>  arch/s390/pci/pci.c         |   6 +
>  arch/s390/pci/pci_bus.c     |  18 ++-
>  arch/s390/pci/pci_irq.c     | 310 ++++++++++++++++++++++++++++----------------
>  5 files changed, 224 insertions(+), 116 deletions(-)
> 
--- snip ---

> +
> +static inline u16 zpci_decode_hwirq_msi_index(u32 hwirq)

I think the parameter's type should by irq_hw_number_t here. It doesn't
matter for correctness since we're only using 32 bits now and the cast
just cuts off the upper 32 bits but I'd like to preserve the type until
you explicitly mask off the bits we don't use.

I also considered making zpci_encode_hwirq()'s return type
irq_hw_number_t but I think it's not needed. This is because there
we're still in the process of encoding the hwirq and want to emphasize
that our encoding output uses 32 bits.

> +{
> +	return hwirq & 0xffff;
> +}
>  

The changes versus v6 look good to me and I agree that putting the
zpci_set_irq() in zpci_reenable_device() looks like the cleanest fix
for the recovery issue. Also good catch on the msg->data assignment. So
feel free to (re-)add my R-b as per below.

Reviewed-by: Niklas Schnelle <schnelle@linux.ibm.com>
Re: [PATCH v7 2/2] s390/pci: Migrate s390 IRQ logic to IRQ domain API
Posted by Tobias Schumacher 2 months, 1 week ago
On Fri Nov 28, 2025 at 10:47 AM CET, Niklas Schnelle wrote:
> On Thu, 2025-11-27 at 16:07 +0100, Tobias Schumacher wrote:
>> s390 is one of the last architectures using the legacy API for setup and
>> teardown of PCI MSI IRQs. Migrate the s390 IRQ allocation and teardown
>> to the MSI parent domain API. For details, see:
>> 
>> https://lore.kernel.org/lkml/20221111120501.026511281@linutronix.de
>> 
>> In detail, create an MSI parent domain for each PCI domain. When a PCI
>> device sets up MSI or MSI-X IRQs, the library creates a per-device IRQ
>> domain for this device, which is used by the device for allocating and
>> freeing IRQs.
>> 
>> The per-device domain delegates this allocation and freeing to the
>> parent-domain. In the end, the corresponding callbacks of the parent
>> domain are responsible for allocating and freeing the IRQs.
>> 
>> The allocation is split into two parts:
>> - zpci_msi_prepare() is called once for each device and allocates the
>>   required resources. On s390, each PCI function has its own airq
>>   vector and a summary bit, which must be configured once per function.
>>   This is done in prepare().
>> - zpci_msi_alloc() can be called multiple times for allocating one or
>>   more MSI/MSI-X IRQs. This creates a mapping between the virtual IRQ
>>   number in the kernel and the hardware IRQ number.
>> 
>> Freeing is split into two counterparts:
>> - zpci_msi_free() reverts the effects of zpci_msi_alloc() and
>> - zpci_msi_teardown() reverts the effects of zpci_msi_prepare(). This is
>>   called once when all IRQs are freed before a device is removed.
>> 
>> Since the parent domain in the end allocates the IRQs, the hwirq
>> encoding must be unambiguous for all IRQs of all devices. This is
>> achieved by encoding the hwirq using the devfn and the MSI index.
>> 
>> Signed-off-by: Tobias Schumacher <ts@linux.ibm.com>
>> ---
>>  arch/s390/Kconfig           |   1 +
>>  arch/s390/include/asm/pci.h |   5 +
>>  arch/s390/pci/pci.c         |   6 +
>>  arch/s390/pci/pci_bus.c     |  18 ++-
>>  arch/s390/pci/pci_irq.c     | 310 ++++++++++++++++++++++++++++----------------
>>  5 files changed, 224 insertions(+), 116 deletions(-)
>> 
> --- snip ---
>
>> +
>> +static inline u16 zpci_decode_hwirq_msi_index(u32 hwirq)
>
> I think the parameter's type should by irq_hw_number_t here. It doesn't
> matter for correctness since we're only using 32 bits now and the cast
> just cuts off the upper 32 bits but I'd like to preserve the type until
> you explicitly mask off the bits we don't use.

Makes sense, I'll change that.

> I also considered making zpci_encode_hwirq()'s return type
> irq_hw_number_t but I think it's not needed. This is because there
> we're still in the process of encoding the hwirq and want to emphasize
> that our encoding output uses 32 bits.

Agreed.

>> +{
>> +	return hwirq & 0xffff;
>> +}
>>  
>
> The changes versus v6 look good to me and I agree that putting the
> zpci_set_irq() in zpci_reenable_device() looks like the cleanest fix
> for the recovery issue. Also good catch on the msg->data assignment. So
> feel free to (re-)add my R-b as per below.
>
> Reviewed-by: Niklas Schnelle <schnelle@linux.ibm.com>

Thanks, since this is only a minor change let's wait if Gerd and Farhan
want to add anything before sending a new version.

Tobias