Setting pci_msi_ignore_mask inhibits the toggling of the mask bit for both
MSI and MSI-X entries globally, regardless of the IRQ chip they are using.
Only Xen sets the pci_msi_ignore_mask when routing physical interrupts over
event channels, to prevent PCI code from attempting to toggle the maskbit,
as it's Xen that controls the bit.
However, the pci_msi_ignore_mask being global will affect devices that use
MSI interrupts but are not routing those interrupts over event channels
(not using the Xen pIRQ chip). One example is devices behind a VMD PCI
bridge. In that scenario the VMD bridge configures MSI(-X) using the
normal IRQ chip (the pIRQ one in the Xen case), and devices behind the
bridge configure the MSI entries using indexes into the VMD bridge MSI
table. The VMD bridge then demultiplexes such interrupts and delivers to
the destination device(s). Having pci_msi_ignore_mask set in that scenario
prevents (un)masking of MSI entries for devices behind the VMD bridge.
Move the signaling of no entry masking into the MSI domain flags, as that
allows setting it on a per-domain basis. Set it for the Xen MSI domain
that uses the pIRQ chip, while leaving it unset for the rest of the
cases.
Remove pci_msi_ignore_mask at once, since it was only used by Xen code, and
with Xen dropping usage the variable is unneeded.
This fixes using devices behind a VMD bridge on Xen PV hardware domains.
Although devices behind a VMD bridge are not known to Xen, that doesn't mean
Linux cannot use them. By inhibiting the usage of
VMD_FEAT_CAN_BYPASS_MSI_REMAP and removing the pci_msi_ignore_mask
bodge, devices behind a VMD bridge do work fine when used from a Linux Xen
hardware domain. That's the whole point of the series.
Signed-off-by: Roger Pau Monné <roger.pau@citrix.com>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Juergen Gross <jgross@suse.com>
---
Changes since v2:
- Fix subject line.
Changes since v1:
- Fix build.
- Expand commit message.
---
arch/x86/pci/xen.c | 8 ++------
drivers/pci/msi/msi.c | 37 +++++++++++++++++++++----------------
include/linux/msi.h | 3 ++-
kernel/irq/msi.c | 2 +-
4 files changed, 26 insertions(+), 24 deletions(-)
diff --git a/arch/x86/pci/xen.c b/arch/x86/pci/xen.c
index 0f2fe524f60d..b8755cde2419 100644
--- a/arch/x86/pci/xen.c
+++ b/arch/x86/pci/xen.c
@@ -436,7 +436,8 @@ static struct msi_domain_ops xen_pci_msi_domain_ops = {
};
static struct msi_domain_info xen_pci_msi_domain_info = {
- .flags = MSI_FLAG_PCI_MSIX | MSI_FLAG_FREE_MSI_DESCS | MSI_FLAG_DEV_SYSFS,
+ .flags = MSI_FLAG_PCI_MSIX | MSI_FLAG_FREE_MSI_DESCS |
+ MSI_FLAG_DEV_SYSFS | MSI_FLAG_NO_MASK,
.ops = &xen_pci_msi_domain_ops,
};
@@ -484,11 +485,6 @@ static __init void xen_setup_pci_msi(void)
* in allocating the native domain and never use it.
*/
x86_init.irqs.create_pci_msi_domain = xen_create_pci_msi_domain;
- /*
- * With XEN PIRQ/Eventchannels in use PCI/MSI[-X] masking is solely
- * controlled by the hypervisor.
- */
- pci_msi_ignore_mask = 1;
}
#else /* CONFIG_PCI_MSI */
diff --git a/drivers/pci/msi/msi.c b/drivers/pci/msi/msi.c
index 2f647cac4cae..4c8c2b57b5f6 100644
--- a/drivers/pci/msi/msi.c
+++ b/drivers/pci/msi/msi.c
@@ -10,12 +10,12 @@
#include <linux/err.h>
#include <linux/export.h>
#include <linux/irq.h>
+#include <linux/irqdomain.h>
#include "../pci.h"
#include "msi.h"
int pci_msi_enable = 1;
-int pci_msi_ignore_mask;
/**
* pci_msi_supported - check whether MSI may be enabled on a device
@@ -285,6 +285,8 @@ static void pci_msi_set_enable(struct pci_dev *dev, int enable)
static int msi_setup_msi_desc(struct pci_dev *dev, int nvec,
struct irq_affinity_desc *masks)
{
+ const struct irq_domain *d = dev_get_msi_domain(&dev->dev);
+ const struct msi_domain_info *info = d->host_data;
struct msi_desc desc;
u16 control;
@@ -295,8 +297,7 @@ static int msi_setup_msi_desc(struct pci_dev *dev, int nvec,
/* Lies, damned lies, and MSIs */
if (dev->dev_flags & PCI_DEV_FLAGS_HAS_MSI_MASKING)
control |= PCI_MSI_FLAGS_MASKBIT;
- /* Respect XEN's mask disabling */
- if (pci_msi_ignore_mask)
+ if (info->flags & MSI_FLAG_NO_MASK)
control &= ~PCI_MSI_FLAGS_MASKBIT;
desc.nvec_used = nvec;
@@ -604,12 +605,15 @@ static void __iomem *msix_map_region(struct pci_dev *dev,
*/
void msix_prepare_msi_desc(struct pci_dev *dev, struct msi_desc *desc)
{
+ const struct irq_domain *d = dev_get_msi_domain(&dev->dev);
+ const struct msi_domain_info *info = d->host_data;
+
desc->nvec_used = 1;
desc->pci.msi_attrib.is_msix = 1;
desc->pci.msi_attrib.is_64 = 1;
desc->pci.msi_attrib.default_irq = dev->irq;
desc->pci.mask_base = dev->msix_base;
- desc->pci.msi_attrib.can_mask = !pci_msi_ignore_mask &&
+ desc->pci.msi_attrib.can_mask = !(info->flags & MSI_FLAG_NO_MASK) &&
!desc->pci.msi_attrib.is_virtual;
if (desc->pci.msi_attrib.can_mask) {
@@ -659,9 +663,6 @@ static void msix_mask_all(void __iomem *base, int tsize)
u32 ctrl = PCI_MSIX_ENTRY_CTRL_MASKBIT;
int i;
- if (pci_msi_ignore_mask)
- return;
-
for (i = 0; i < tsize; i++, base += PCI_MSIX_ENTRY_SIZE)
writel(ctrl, base + PCI_MSIX_ENTRY_VECTOR_CTRL);
}
@@ -714,6 +715,8 @@ static int msix_setup_interrupts(struct pci_dev *dev, struct msix_entry *entries
static int msix_capability_init(struct pci_dev *dev, struct msix_entry *entries,
int nvec, struct irq_affinity *affd)
{
+ const struct irq_domain *d = dev_get_msi_domain(&dev->dev);
+ const struct msi_domain_info *info = d->host_data;
int ret, tsize;
u16 control;
@@ -744,15 +747,17 @@ static int msix_capability_init(struct pci_dev *dev, struct msix_entry *entries,
/* Disable INTX */
pci_intx_for_msi(dev, 0);
- /*
- * Ensure that all table entries are masked to prevent
- * stale entries from firing in a crash kernel.
- *
- * Done late to deal with a broken Marvell NVME device
- * which takes the MSI-X mask bits into account even
- * when MSI-X is disabled, which prevents MSI delivery.
- */
- msix_mask_all(dev->msix_base, tsize);
+ if (!(info->flags & MSI_FLAG_NO_MASK)) {
+ /*
+ * Ensure that all table entries are masked to prevent
+ * stale entries from firing in a crash kernel.
+ *
+ * Done late to deal with a broken Marvell NVME device
+ * which takes the MSI-X mask bits into account even
+ * when MSI-X is disabled, which prevents MSI delivery.
+ */
+ msix_mask_all(dev->msix_base, tsize);
+ }
pci_msix_clear_and_set_ctrl(dev, PCI_MSIX_FLAGS_MASKALL, 0);
pcibios_free_irq(dev);
diff --git a/include/linux/msi.h b/include/linux/msi.h
index b10093c4d00e..59a421fc42bf 100644
--- a/include/linux/msi.h
+++ b/include/linux/msi.h
@@ -73,7 +73,6 @@ struct msi_msg {
};
};
-extern int pci_msi_ignore_mask;
/* Helper functions */
struct msi_desc;
struct pci_dev;
@@ -556,6 +555,8 @@ enum {
MSI_FLAG_PCI_MSIX_ALLOC_DYN = (1 << 20),
/* PCI MSIs cannot be steered separately to CPU cores */
MSI_FLAG_NO_AFFINITY = (1 << 21),
+ /* Inhibit usage of entry masking */
+ MSI_FLAG_NO_MASK = (1 << 22),
};
/**
diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c
index 396a067a8a56..7682c36cbccc 100644
--- a/kernel/irq/msi.c
+++ b/kernel/irq/msi.c
@@ -1143,7 +1143,7 @@ static bool msi_check_reservation_mode(struct irq_domain *domain,
if (!(info->flags & MSI_FLAG_MUST_REACTIVATE))
return false;
- if (IS_ENABLED(CONFIG_PCI_MSI) && pci_msi_ignore_mask)
+ if (info->flags & MSI_FLAG_NO_MASK)
return false;
/*
--
2.46.0
+ Linus so that he can whack it before it spreads any further.
On Wed, Feb 19, 2025 at 10:20:57AM +0100, Roger Pau Monne wrote:
> Setting pci_msi_ignore_mask inhibits the toggling of the mask bit for both
> MSI and MSI-X entries globally, regardless of the IRQ chip they are using.
> Only Xen sets the pci_msi_ignore_mask when routing physical interrupts over
> event channels, to prevent PCI code from attempting to toggle the maskbit,
> as it's Xen that controls the bit.
>
> However, the pci_msi_ignore_mask being global will affect devices that use
> MSI interrupts but are not routing those interrupts over event channels
> (not using the Xen pIRQ chip). One example is devices behind a VMD PCI
> bridge. In that scenario the VMD bridge configures MSI(-X) using the
> normal IRQ chip (the pIRQ one in the Xen case), and devices behind the
> bridge configure the MSI entries using indexes into the VMD bridge MSI
> table. The VMD bridge then demultiplexes such interrupts and delivers to
> the destination device(s). Having pci_msi_ignore_mask set in that scenario
> prevents (un)masking of MSI entries for devices behind the VMD bridge.
>
> Move the signaling of no entry masking into the MSI domain flags, as that
> allows setting it on a per-domain basis. Set it for the Xen MSI domain
> that uses the pIRQ chip, while leaving it unset for the rest of the
> cases.
>
> Remove pci_msi_ignore_mask at once, since it was only used by Xen code, and
> with Xen dropping usage the variable is unneeded.
>
> This fixes using devices behind a VMD bridge on Xen PV hardware domains.
>
> Albeit Devices behind a VMD bridge are not known to Xen, that doesn't mean
> Linux cannot use them. By inhibiting the usage of
> VMD_FEAT_CAN_BYPASS_MSI_REMAP and the removal of the pci_msi_ignore_mask
> bodge devices behind a VMD bridge do work fine when use from a Linux Xen
> hardware domain. That's the whole point of the series.
>
> Signed-off-by: Roger Pau Monné <roger.pau@citrix.com>
> Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
> Acked-by: Juergen Gross <jgross@suse.com>
Did anyone actually test this on a normal KVM guest?
c3164d2e0d181027da8fc94f8179d8607c3d440f is the first bad commit
commit c3164d2e0d181027da8fc94f8179d8607c3d440f
Author: Roger Pau Monne <roger.pau@citrix.com>
Date: Wed Feb 19 10:20:57 2025 +0100
PCI/MSI: Convert pci_msi_ignore_mask to per MSI domain flag
Setting pci_msi_ignore_mask inhibits the toggling of the mask bit for both
MSI and MSI-X entries globally, regardless of the IRQ chip they are using.
Only Xen sets the pci_msi_ignore_mask when routing physical interrupts over
event channels, to prevent PCI code from attempting to toggle the maskbit,
as it's Xen that controls the bit.
However, the pci_msi_ignore_mask being global will affect devices that use
MSI interrupts but are not routing those interrupts over event channels
(not using the Xen pIRQ chip). One example is devices behind a VMD PCI
bridge. In that scenario the VMD bridge configures MSI(-X) using the
normal IRQ chip (the pIRQ one in the Xen case), and devices behind the
bridge configure the MSI entries using indexes into the VMD bridge MSI
table. The VMD bridge then demultiplexes such interrupts and delivers to
the destination device(s). Having pci_msi_ignore_mask set in that scenario
prevents (un)masking of MSI entries for devices behind the VMD bridge.
Move the signaling of no entry masking into the MSI domain flags, as that
allows setting it on a per-domain basis. Set it for the Xen MSI domain
that uses the pIRQ chip, while leaving it unset for the rest of the
cases.
Remove pci_msi_ignore_mask at once, since it was only used by Xen code, and
with Xen dropping usage the variable is unneeded.
This fixes using devices behind a VMD bridge on Xen PV hardware domains.
Albeit Devices behind a VMD bridge are not known to Xen, that doesn't mean
Linux cannot use them. By inhibiting the usage of
VMD_FEAT_CAN_BYPASS_MSI_REMAP and the removal of the pci_msi_ignore_mask
bodge devices behind a VMD bridge do work fine when use from a Linux Xen
hardware domain. That's the whole point of the series.
Signed-off-by: Roger Pau Monné <roger.pau@citrix.com>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Juergen Gross <jgross@suse.com>
Acked-by: Bjorn Helgaas <bhelgaas@google.com>
Message-ID: <20250219092059.90850-4-roger.pau@citrix.com>
Signed-off-by: Juergen Gross <jgross@suse.com>
arch/x86/pci/xen.c | 8 ++------
drivers/pci/msi/msi.c | 37 +++++++++++++++++++++----------------
include/linux/msi.h | 3 ++-
kernel/irq/msi.c | 2 +-
4 files changed, 26 insertions(+), 24 deletions(-)
[ 1.254066] zram: Added device: zram0
[ 1.255093] st: Version 20160209, fixed bufsize 32768, s/g segs 256
[ 1.257577] ahci 0000:00:1f.2: version 3.0
[ 1.259050] BUG: kernel NULL pointer dereference, address: 0000000000000000
[ 1.261239] #PF: supervisor read access in kernel mode
[ 1.261544] #PF: error_code(0x0000) - not-present page
[ 1.261544] PGD 0
[ 1.261544] Oops: Oops: 0000 [#1] SMP
[ 1.261544] CPU: 0 UID: 0 PID: 1 Comm: swapper/0 Not tainted 6.14.0+ #1 PREEMPT(voluntary)
[ 1.261544] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 2023.11-8 02/21/2024
[ 1.261544] RIP: 0010:msi_setup_msi_desc+0x75/0x170
[ 1.261544] Code: 89 54 24 06 48 8d 54 24 06 f3 48 ab 48 89 ef e8 c1 6a fe ff 0f b7 44 24 06 f6 85 3f 08 00 00 10 74 08 80 cc 01 66 89 44 24 06 <41> 8b 16 81 e2 00 00 40 00 0f 85 bb 00 00 00 89 c6 66 c1 ee 08 83
[ 1.261544] RSP: 0018:ffa0000000023980 EFLAGS: 00010246
[ 1.261544] RAX: 0000000000000080 RBX: ffa0000000023988 RCX: 0000000000000082
[ 1.261544] RDX: 0000000000000000 RSI: 0000000000000293 RDI: ffffffff83a16138
[ 1.261544] RBP: ff11000006fdd000 R08: 0000000000000002 R09: ffa0000000023964
[ 1.261544] R10: 0000000000000000 R11: ffffffff81dd71e0 R12: 0000000000000000
[ 1.261544] R13: 0000000000000001 R14: 0000000000000000 R15: 0000000000000001
[ 1.261544] FS: 0000000000000000(0000) GS:ff110000f0b78000(0000) knlGS:0000000000000000
[ 1.261544] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 1.261544] CR2: 0000000000000000 CR3: 0000000002e23001 CR4: 0000000000771ef0
[ 1.261544] PKRU: 55555554
[ 1.261544] Call Trace:
[ 1.261544] <TASK>
[ 1.261544] ? __die_body.cold+0x19/0x29
[ 1.261544] ? page_fault_oops+0x15a/0x260
[ 1.261544] ? exc_page_fault+0x81/0x1b0
[ 1.261544] ? asm_exc_page_fault+0x26/0x30
[ 1.261544] ? pci_mmcfg_arch_unmap+0x40/0x40
[ 1.261544] ? msi_setup_msi_desc+0x75/0x170
[ 1.261544] __msi_capability_init+0x2d/0x2a0
[ 1.261544] ? srso_alias_return_thunk+0x5/0xfbef5
[ 1.261544] ? irq_domain_update_bus_token+0x6b/0x80
[ 1.261544] ? srso_alias_return_thunk+0x5/0xfbef5
[ 1.261544] ? srso_alias_return_thunk+0x5/0xfbef5
[ 1.261544] ? srso_alias_return_thunk+0x5/0xfbef5
[ 1.261544] ? pci_conf1_read+0xb2/0xf0
[ 1.261544] ? srso_alias_return_thunk+0x5/0xfbef5
[ 1.261544] __pci_enable_msi_range+0x271/0x380
[ 1.261544] pci_alloc_irq_vectors_affinity+0xc2/0x110
[ 1.261544] ahci_init_one+0x701/0xd20
[ 1.261544] ? srso_alias_return_thunk+0x5/0xfbef5
[ 1.261544] ? srso_alias_return_thunk+0x5/0xfbef5
[ 1.261544] ? __kernfs_new_node.isra.0+0xcb/0x200
[ 1.261544] local_pci_probe+0x42/0x90
[ 1.261544] pci_device_probe+0xdc/0x260
[ 1.261544] ? sysfs_do_create_link_sd+0x6e/0xe0
[ 1.261544] really_probe+0xdb/0x340
[ 1.261544] ? pm_runtime_barrier+0x54/0x90
[ 1.261544] ? __device_attach_driver+0x110/0x110
[ 1.261544] __driver_probe_device+0x78/0x110
[ 1.261544] driver_probe_device+0x1f/0xa0
[ 1.261544] __driver_attach+0xba/0x1c0
[ 1.261544] bus_for_each_dev+0x8b/0xe0
[ 1.261544] bus_add_driver+0x112/0x1f0
[ 1.261544] driver_register+0x72/0xd0
[ 1.261544] ? ata_sff_init+0x40/0x40
[ 1.261544] do_one_initcall+0x57/0x300
[ 1.261544] kernel_init_freeable+0x237/0x2c0
[ 1.261544] ? rest_init+0xd0/0xd0
[ 1.261544] kernel_init+0x1a/0x130
[ 1.261544] ret_from_fork+0x31/0x50
[ 1.261544] ? rest_init+0xd0/0xd0
[ 1.261544] ret_from_fork_asm+0x11/0x20
[ 1.261544] </TASK>
[ 1.261544] Modules linked in:
[ 1.261544] CR2: 0000000000000000
[ 1.261544] ---[ end trace 0000000000000000 ]---
[ 1.261544] RIP: 0010:msi_setup_msi_desc+0x75/0x170
[ 1.261544] Code: 89 54 24 06 48 8d 54 24 06 f3 48 ab 48 89 ef e8 c1 6a fe ff 0f b7 44 24 06 f6 85 3f 08 00 00 10 74 08 80 cc 01 66 89 44 24 06 <41> 8b 16 81 e2 00 00 40 00 0f 85 bb 00 00 00 89 c6 66 c1 ee 08 83
[ 1.261544] RSP: 0018:ffa0000000023980 EFLAGS: 00010246
[ 1.261544] RAX: 0000000000000080 RBX: ffa0000000023988 RCX: 0000000000000082
[ 1.261544] RDX: 0000000000000000 RSI: 0000000000000293 RDI: ffffffff83a16138
[ 1.261544] RBP: ff11000006fdd000 R08: 0000000000000002 R09: ffa0000000023964
[ 1.261544] R10: 0000000000000000 R11: ffffffff81dd71e0 R12: 0000000000000000
[ 1.261544] R13: 0000000000000001 R14: 0000000000000000 R15: 0000000000000001
[ 1.261544] FS: 0000000000000000(0000) GS:ff110000f0b78000(0000) knlGS:0000000000000000
[ 1.261544] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 1.261544] CR2: 0000000000000000 CR3: 0000000002e23001 CR4: 0000000000771ef0
[ 1.261544] PKRU: 55555554
[ 1.261544] note: swapper/0[1] exited with irqs disabled
[ 1.374076] Kernel panic - not syncing: Attempted to kill init! exitcode=0x00000009
[ 1.378062] Kernel Offset: disabled
[ 1.378062] ---[ end Kernel panic - not syncing: Attempted to kill init! exitcode=0x00000009 ]---
> ---
> Changes since v2:
> - Fix subject line.
>
> Changes since v1:
> - Fix build.
> - Expand commit message.
> ---
> arch/x86/pci/xen.c | 8 ++------
> drivers/pci/msi/msi.c | 37 +++++++++++++++++++++----------------
> include/linux/msi.h | 3 ++-
> kernel/irq/msi.c | 2 +-
> 4 files changed, 26 insertions(+), 24 deletions(-)
>
> diff --git a/arch/x86/pci/xen.c b/arch/x86/pci/xen.c
> index 0f2fe524f60d..b8755cde2419 100644
> --- a/arch/x86/pci/xen.c
> +++ b/arch/x86/pci/xen.c
> @@ -436,7 +436,8 @@ static struct msi_domain_ops xen_pci_msi_domain_ops = {
> };
>
> static struct msi_domain_info xen_pci_msi_domain_info = {
> - .flags = MSI_FLAG_PCI_MSIX | MSI_FLAG_FREE_MSI_DESCS | MSI_FLAG_DEV_SYSFS,
> + .flags = MSI_FLAG_PCI_MSIX | MSI_FLAG_FREE_MSI_DESCS |
> + MSI_FLAG_DEV_SYSFS | MSI_FLAG_NO_MASK,
> .ops = &xen_pci_msi_domain_ops,
> };
>
> @@ -484,11 +485,6 @@ static __init void xen_setup_pci_msi(void)
> * in allocating the native domain and never use it.
> */
> x86_init.irqs.create_pci_msi_domain = xen_create_pci_msi_domain;
> - /*
> - * With XEN PIRQ/Eventchannels in use PCI/MSI[-X] masking is solely
> - * controlled by the hypervisor.
> - */
> - pci_msi_ignore_mask = 1;
> }
>
> #else /* CONFIG_PCI_MSI */
> diff --git a/drivers/pci/msi/msi.c b/drivers/pci/msi/msi.c
> index 2f647cac4cae..4c8c2b57b5f6 100644
> --- a/drivers/pci/msi/msi.c
> +++ b/drivers/pci/msi/msi.c
> @@ -10,12 +10,12 @@
> #include <linux/err.h>
> #include <linux/export.h>
> #include <linux/irq.h>
> +#include <linux/irqdomain.h>
>
> #include "../pci.h"
> #include "msi.h"
>
> int pci_msi_enable = 1;
> -int pci_msi_ignore_mask;
>
> /**
> * pci_msi_supported - check whether MSI may be enabled on a device
> @@ -285,6 +285,8 @@ static void pci_msi_set_enable(struct pci_dev *dev, int enable)
> static int msi_setup_msi_desc(struct pci_dev *dev, int nvec,
> struct irq_affinity_desc *masks)
> {
> + const struct irq_domain *d = dev_get_msi_domain(&dev->dev);
> + const struct msi_domain_info *info = d->host_data;
> struct msi_desc desc;
> u16 control;
>
> @@ -295,8 +297,7 @@ static int msi_setup_msi_desc(struct pci_dev *dev, int nvec,
> /* Lies, damned lies, and MSIs */
> if (dev->dev_flags & PCI_DEV_FLAGS_HAS_MSI_MASKING)
> control |= PCI_MSI_FLAGS_MASKBIT;
> - /* Respect XEN's mask disabling */
> - if (pci_msi_ignore_mask)
> + if (info->flags & MSI_FLAG_NO_MASK)
> control &= ~PCI_MSI_FLAGS_MASKBIT;
>
> desc.nvec_used = nvec;
> @@ -604,12 +605,15 @@ static void __iomem *msix_map_region(struct pci_dev *dev,
> */
> void msix_prepare_msi_desc(struct pci_dev *dev, struct msi_desc *desc)
> {
> + const struct irq_domain *d = dev_get_msi_domain(&dev->dev);
> + const struct msi_domain_info *info = d->host_data;
> +
> desc->nvec_used = 1;
> desc->pci.msi_attrib.is_msix = 1;
> desc->pci.msi_attrib.is_64 = 1;
> desc->pci.msi_attrib.default_irq = dev->irq;
> desc->pci.mask_base = dev->msix_base;
> - desc->pci.msi_attrib.can_mask = !pci_msi_ignore_mask &&
> + desc->pci.msi_attrib.can_mask = !(info->flags & MSI_FLAG_NO_MASK) &&
> !desc->pci.msi_attrib.is_virtual;
>
> if (desc->pci.msi_attrib.can_mask) {
> @@ -659,9 +663,6 @@ static void msix_mask_all(void __iomem *base, int tsize)
> u32 ctrl = PCI_MSIX_ENTRY_CTRL_MASKBIT;
> int i;
>
> - if (pci_msi_ignore_mask)
> - return;
> -
> for (i = 0; i < tsize; i++, base += PCI_MSIX_ENTRY_SIZE)
> writel(ctrl, base + PCI_MSIX_ENTRY_VECTOR_CTRL);
> }
> @@ -714,6 +715,8 @@ static int msix_setup_interrupts(struct pci_dev *dev, struct msix_entry *entries
> static int msix_capability_init(struct pci_dev *dev, struct msix_entry *entries,
> int nvec, struct irq_affinity *affd)
> {
> + const struct irq_domain *d = dev_get_msi_domain(&dev->dev);
> + const struct msi_domain_info *info = d->host_data;
> int ret, tsize;
> u16 control;
>
> @@ -744,15 +747,17 @@ static int msix_capability_init(struct pci_dev *dev, struct msix_entry *entries,
> /* Disable INTX */
> pci_intx_for_msi(dev, 0);
>
> - /*
> - * Ensure that all table entries are masked to prevent
> - * stale entries from firing in a crash kernel.
> - *
> - * Done late to deal with a broken Marvell NVME device
> - * which takes the MSI-X mask bits into account even
> - * when MSI-X is disabled, which prevents MSI delivery.
> - */
> - msix_mask_all(dev->msix_base, tsize);
> + if (!(info->flags & MSI_FLAG_NO_MASK)) {
> + /*
> + * Ensure that all table entries are masked to prevent
> + * stale entries from firing in a crash kernel.
> + *
> + * Done late to deal with a broken Marvell NVME device
> + * which takes the MSI-X mask bits into account even
> + * when MSI-X is disabled, which prevents MSI delivery.
> + */
> + msix_mask_all(dev->msix_base, tsize);
> + }
> pci_msix_clear_and_set_ctrl(dev, PCI_MSIX_FLAGS_MASKALL, 0);
>
> pcibios_free_irq(dev);
> diff --git a/include/linux/msi.h b/include/linux/msi.h
> index b10093c4d00e..59a421fc42bf 100644
> --- a/include/linux/msi.h
> +++ b/include/linux/msi.h
> @@ -73,7 +73,6 @@ struct msi_msg {
> };
> };
>
> -extern int pci_msi_ignore_mask;
> /* Helper functions */
> struct msi_desc;
> struct pci_dev;
> @@ -556,6 +555,8 @@ enum {
> MSI_FLAG_PCI_MSIX_ALLOC_DYN = (1 << 20),
> /* PCI MSIs cannot be steered separately to CPU cores */
> MSI_FLAG_NO_AFFINITY = (1 << 21),
> + /* Inhibit usage of entry masking */
> + MSI_FLAG_NO_MASK = (1 << 22),
> };
>
> /**
> diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c
> index 396a067a8a56..7682c36cbccc 100644
> --- a/kernel/irq/msi.c
> +++ b/kernel/irq/msi.c
> @@ -1143,7 +1143,7 @@ static bool msi_check_reservation_mode(struct irq_domain *domain,
> if (!(info->flags & MSI_FLAG_MUST_REACTIVATE))
> return false;
>
> - if (IS_ENABLED(CONFIG_PCI_MSI) && pci_msi_ignore_mask)
> + if (info->flags & MSI_FLAG_NO_MASK)
> return false;
>
> /*
> --
> 2.46.0
>
>
--
Regards/Gruss,
Boris.
https://people.kernel.org/tglx/notes-about-netiquette
On Wed, Mar 26, 2025 at 12:04:55PM +0100, Borislav Petkov wrote: > + Linus so that he can whack it before it spreads any further. > > On Wed, Feb 19, 2025 at 10:20:57AM +0100, Roger Pau Monne wrote: > > Setting pci_msi_ignore_mask inhibits the toggling of the mask bit for both > > MSI and MSI-X entries globally, regardless of the IRQ chip they are using. > > Only Xen sets the pci_msi_ignore_mask when routing physical interrupts over > > event channels, to prevent PCI code from attempting to toggle the maskbit, > > as it's Xen that controls the bit. > > > > However, the pci_msi_ignore_mask being global will affect devices that use > > MSI interrupts but are not routing those interrupts over event channels > > (not using the Xen pIRQ chip). One example is devices behind a VMD PCI > > bridge. In that scenario the VMD bridge configures MSI(-X) using the > > normal IRQ chip (the pIRQ one in the Xen case), and devices behind the > > bridge configure the MSI entries using indexes into the VMD bridge MSI > > table. The VMD bridge then demultiplexes such interrupts and delivers to > > the destination device(s). Having pci_msi_ignore_mask set in that scenario > > prevents (un)masking of MSI entries for devices behind the VMD bridge. > > > > Move the signaling of no entry masking into the MSI domain flags, as that > > allows setting it on a per-domain basis. Set it for the Xen MSI domain > > that uses the pIRQ chip, while leaving it unset for the rest of the > > cases. > > > > Remove pci_msi_ignore_mask at once, since it was only used by Xen code, and > > with Xen dropping usage the variable is unneeded. > > > > This fixes using devices behind a VMD bridge on Xen PV hardware domains. > > > > Albeit Devices behind a VMD bridge are not known to Xen, that doesn't mean > > Linux cannot use them. 
By inhibiting the usage of > > VMD_FEAT_CAN_BYPASS_MSI_REMAP and the removal of the pci_msi_ignore_mask > > bodge devices behind a VMD bridge do work fine when use from a Linux Xen > > hardware domain. That's the whole point of the series. > > > > Signed-off-by: Roger Pau Monné <roger.pau@citrix.com> > > Reviewed-by: Thomas Gleixner <tglx@linutronix.de> > > Acked-by: Juergen Gross <jgross@suse.com> > > Did anyone actually test this on a normal KVM guest? Sorry, not on KVM, I've tested on Xen and native. It also seems to be somewhat tied to the Kconfig, as I couldn't reproduce it with my Kconfig, maybe didn't have the required VirtIO options enabled. It's fixed by: https://lore.kernel.org/xen-devel/87v7rxzct0.ffs@tglx/ Waiting for Thomas to formally send that. Thanks, Roger.
On Wed, Mar 26, 2025 at 12:14:09PM +0100, Roger Pau Monné wrote:
> Sorry, not on KVM, I've tested on Xen and native. It also seems to be
> somewhat tied to the Kconfig, as I couldn't reproduce it with my
> Kconfig, maybe didn't have the required VirtIO options enabled.
Right.
> It's fixed by:
>
> https://lore.kernel.org/xen-devel/87v7rxzct0.ffs@tglx/
>
> Waiting for Thomas to formally sent that.
Yap, he just pointed me to that one.
Tested-by: Borislav Petkov (AMD) <bp@alien8.de>
Thx.
--
Regards/Gruss,
Boris.
https://people.kernel.org/tglx/notes-about-netiquette
On Wed, Feb 19, 2025 at 10:20:57AM +0100, Roger Pau Monne wrote:
> Setting pci_msi_ignore_mask inhibits the toggling of the mask bit for both
> MSI and MSI-X entries globally, regardless of the IRQ chip they are using.
> Only Xen sets the pci_msi_ignore_mask when routing physical interrupts over
> event channels, to prevent PCI code from attempting to toggle the maskbit,
> as it's Xen that controls the bit.
>
> However, the pci_msi_ignore_mask being global will affect devices that use
> MSI interrupts but are not routing those interrupts over event channels
> (not using the Xen pIRQ chip). One example is devices behind a VMD PCI
> bridge. In that scenario the VMD bridge configures MSI(-X) using the
> normal IRQ chip (the pIRQ one in the Xen case), and devices behind the
> bridge configure the MSI entries using indexes into the VMD bridge MSI
> table. The VMD bridge then demultiplexes such interrupts and delivers to
> the destination device(s). Having pci_msi_ignore_mask set in that scenario
> prevents (un)masking of MSI entries for devices behind the VMD bridge.
>
> Move the signaling of no entry masking into the MSI domain flags, as that
> allows setting it on a per-domain basis. Set it for the Xen MSI domain
> that uses the pIRQ chip, while leaving it unset for the rest of the
> cases.
>
> Remove pci_msi_ignore_mask at once, since it was only used by Xen code, and
> with Xen dropping usage the variable is unneeded.
>
> This fixes using devices behind a VMD bridge on Xen PV hardware domains.
>
> Albeit Devices behind a VMD bridge are not known to Xen, that doesn't mean
> Linux cannot use them. By inhibiting the usage of
> VMD_FEAT_CAN_BYPASS_MSI_REMAP and the removal of the pci_msi_ignore_mask
> bodge devices behind a VMD bridge do work fine when use from a Linux Xen
> hardware domain. That's the whole point of the series.
>
> Signed-off-by: Roger Pau Monné <roger.pau@citrix.com>
> Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
> Acked-by: Juergen Gross <jgross@suse.com>
Acked-by: Bjorn Helgaas <bhelgaas@google.com>
I assume you'll merge this series via the Xen tree. Let me know if
otherwise.
> ---
> Changes since v2:
> - Fix subject line.
>
> Changes since v1:
> - Fix build.
> - Expand commit message.
> ---
> arch/x86/pci/xen.c | 8 ++------
> drivers/pci/msi/msi.c | 37 +++++++++++++++++++++----------------
> include/linux/msi.h | 3 ++-
> kernel/irq/msi.c | 2 +-
> 4 files changed, 26 insertions(+), 24 deletions(-)
>
> diff --git a/arch/x86/pci/xen.c b/arch/x86/pci/xen.c
> index 0f2fe524f60d..b8755cde2419 100644
> --- a/arch/x86/pci/xen.c
> +++ b/arch/x86/pci/xen.c
> @@ -436,7 +436,8 @@ static struct msi_domain_ops xen_pci_msi_domain_ops = {
> };
>
> static struct msi_domain_info xen_pci_msi_domain_info = {
> - .flags = MSI_FLAG_PCI_MSIX | MSI_FLAG_FREE_MSI_DESCS | MSI_FLAG_DEV_SYSFS,
> + .flags = MSI_FLAG_PCI_MSIX | MSI_FLAG_FREE_MSI_DESCS |
> + MSI_FLAG_DEV_SYSFS | MSI_FLAG_NO_MASK,
> .ops = &xen_pci_msi_domain_ops,
> };
>
> @@ -484,11 +485,6 @@ static __init void xen_setup_pci_msi(void)
> * in allocating the native domain and never use it.
> */
> x86_init.irqs.create_pci_msi_domain = xen_create_pci_msi_domain;
> - /*
> - * With XEN PIRQ/Eventchannels in use PCI/MSI[-X] masking is solely
> - * controlled by the hypervisor.
> - */
> - pci_msi_ignore_mask = 1;
> }
>
> #else /* CONFIG_PCI_MSI */
> diff --git a/drivers/pci/msi/msi.c b/drivers/pci/msi/msi.c
> index 2f647cac4cae..4c8c2b57b5f6 100644
> --- a/drivers/pci/msi/msi.c
> +++ b/drivers/pci/msi/msi.c
> @@ -10,12 +10,12 @@
> #include <linux/err.h>
> #include <linux/export.h>
> #include <linux/irq.h>
> +#include <linux/irqdomain.h>
>
> #include "../pci.h"
> #include "msi.h"
>
> int pci_msi_enable = 1;
> -int pci_msi_ignore_mask;
>
> /**
> * pci_msi_supported - check whether MSI may be enabled on a device
> @@ -285,6 +285,8 @@ static void pci_msi_set_enable(struct pci_dev *dev, int enable)
> static int msi_setup_msi_desc(struct pci_dev *dev, int nvec,
> struct irq_affinity_desc *masks)
> {
> + const struct irq_domain *d = dev_get_msi_domain(&dev->dev);
> + const struct msi_domain_info *info = d->host_data;
> struct msi_desc desc;
> u16 control;
>
> @@ -295,8 +297,7 @@ static int msi_setup_msi_desc(struct pci_dev *dev, int nvec,
> /* Lies, damned lies, and MSIs */
> if (dev->dev_flags & PCI_DEV_FLAGS_HAS_MSI_MASKING)
> control |= PCI_MSI_FLAGS_MASKBIT;
> - /* Respect XEN's mask disabling */
> - if (pci_msi_ignore_mask)
> + if (info->flags & MSI_FLAG_NO_MASK)
> control &= ~PCI_MSI_FLAGS_MASKBIT;
>
> desc.nvec_used = nvec;
> @@ -604,12 +605,15 @@ static void __iomem *msix_map_region(struct pci_dev *dev,
> */
> void msix_prepare_msi_desc(struct pci_dev *dev, struct msi_desc *desc)
> {
> + const struct irq_domain *d = dev_get_msi_domain(&dev->dev);
> + const struct msi_domain_info *info = d->host_data;
> +
> desc->nvec_used = 1;
> desc->pci.msi_attrib.is_msix = 1;
> desc->pci.msi_attrib.is_64 = 1;
> desc->pci.msi_attrib.default_irq = dev->irq;
> desc->pci.mask_base = dev->msix_base;
> - desc->pci.msi_attrib.can_mask = !pci_msi_ignore_mask &&
> + desc->pci.msi_attrib.can_mask = !(info->flags & MSI_FLAG_NO_MASK) &&
> !desc->pci.msi_attrib.is_virtual;
>
> if (desc->pci.msi_attrib.can_mask) {
> @@ -659,9 +663,6 @@ static void msix_mask_all(void __iomem *base, int tsize)
> u32 ctrl = PCI_MSIX_ENTRY_CTRL_MASKBIT;
> int i;
>
> - if (pci_msi_ignore_mask)
> - return;
> -
> for (i = 0; i < tsize; i++, base += PCI_MSIX_ENTRY_SIZE)
> writel(ctrl, base + PCI_MSIX_ENTRY_VECTOR_CTRL);
> }
> @@ -714,6 +715,8 @@ static int msix_setup_interrupts(struct pci_dev *dev, struct msix_entry *entries
> static int msix_capability_init(struct pci_dev *dev, struct msix_entry *entries,
> int nvec, struct irq_affinity *affd)
> {
> + const struct irq_domain *d = dev_get_msi_domain(&dev->dev);
> + const struct msi_domain_info *info = d->host_data;
> int ret, tsize;
> u16 control;
>
> @@ -744,15 +747,17 @@ static int msix_capability_init(struct pci_dev *dev, struct msix_entry *entries,
> /* Disable INTX */
> pci_intx_for_msi(dev, 0);
>
> - /*
> - * Ensure that all table entries are masked to prevent
> - * stale entries from firing in a crash kernel.
> - *
> - * Done late to deal with a broken Marvell NVME device
> - * which takes the MSI-X mask bits into account even
> - * when MSI-X is disabled, which prevents MSI delivery.
> - */
> - msix_mask_all(dev->msix_base, tsize);
> + if (!(info->flags & MSI_FLAG_NO_MASK)) {
> + /*
> + * Ensure that all table entries are masked to prevent
> + * stale entries from firing in a crash kernel.
> + *
> + * Done late to deal with a broken Marvell NVME device
> + * which takes the MSI-X mask bits into account even
> + * when MSI-X is disabled, which prevents MSI delivery.
> + */
> + msix_mask_all(dev->msix_base, tsize);
> + }
> pci_msix_clear_and_set_ctrl(dev, PCI_MSIX_FLAGS_MASKALL, 0);
>
> pcibios_free_irq(dev);
> diff --git a/include/linux/msi.h b/include/linux/msi.h
> index b10093c4d00e..59a421fc42bf 100644
> --- a/include/linux/msi.h
> +++ b/include/linux/msi.h
> @@ -73,7 +73,6 @@ struct msi_msg {
> };
> };
>
> -extern int pci_msi_ignore_mask;
> /* Helper functions */
> struct msi_desc;
> struct pci_dev;
> @@ -556,6 +555,8 @@ enum {
> MSI_FLAG_PCI_MSIX_ALLOC_DYN = (1 << 20),
> /* PCI MSIs cannot be steered separately to CPU cores */
> MSI_FLAG_NO_AFFINITY = (1 << 21),
> + /* Inhibit usage of entry masking */
> + MSI_FLAG_NO_MASK = (1 << 22),
> };
>
> /**
> diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c
> index 396a067a8a56..7682c36cbccc 100644
> --- a/kernel/irq/msi.c
> +++ b/kernel/irq/msi.c
> @@ -1143,7 +1143,7 @@ static bool msi_check_reservation_mode(struct irq_domain *domain,
> if (!(info->flags & MSI_FLAG_MUST_REACTIVATE))
> return false;
>
> - if (IS_ENABLED(CONFIG_PCI_MSI) && pci_msi_ignore_mask)
> + if (info->flags & MSI_FLAG_NO_MASK)
> return false;
>
> /*
> --
> 2.46.0
>
On 20.03.25 22:07, Bjorn Helgaas wrote:
> On Wed, Feb 19, 2025 at 10:20:57AM +0100, Roger Pau Monne wrote:
>> Setting pci_msi_ignore_mask inhibits the toggling of the mask bit for both
>> MSI and MSI-X entries globally, regardless of the IRQ chip they are using.
>> Only Xen sets the pci_msi_ignore_mask when routing physical interrupts over
>> event channels, to prevent PCI code from attempting to toggle the maskbit,
>> as it's Xen that controls the bit.
>>
>> However, the pci_msi_ignore_mask being global will affect devices that use
>> MSI interrupts but are not routing those interrupts over event channels
>> (not using the Xen pIRQ chip). One example is devices behind a VMD PCI
>> bridge. In that scenario the VMD bridge configures MSI(-X) using the
>> normal IRQ chip (the pIRQ one in the Xen case), and devices behind the
>> bridge configure the MSI entries using indexes into the VMD bridge MSI
>> table. The VMD bridge then demultiplexes such interrupts and delivers to
>> the destination device(s). Having pci_msi_ignore_mask set in that scenario
>> prevents (un)masking of MSI entries for devices behind the VMD bridge.
>>
>> Move the signaling of no entry masking into the MSI domain flags, as that
>> allows setting it on a per-domain basis. Set it for the Xen MSI domain
>> that uses the pIRQ chip, while leaving it unset for the rest of the
>> cases.
>>
>> Remove pci_msi_ignore_mask at once, since it was only used by Xen code, and
>> with Xen dropping usage the variable is unneeded.
>>
>> This fixes using devices behind a VMD bridge on Xen PV hardware domains.
>>
>> Albeit Devices behind a VMD bridge are not known to Xen, that doesn't mean
>> Linux cannot use them. By inhibiting the usage of
>> VMD_FEAT_CAN_BYPASS_MSI_REMAP and the removal of the pci_msi_ignore_mask
>> bodge devices behind a VMD bridge do work fine when use from a Linux Xen
>> hardware domain. That's the whole point of the series.
>>
>> Signed-off-by: Roger Pau Monné <roger.pau@citrix.com>
>> Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
>> Acked-by: Juergen Gross <jgross@suse.com>
>
> Acked-by: Bjorn Helgaas <bhelgaas@google.com>
>
> I assume you'll merge this series via the Xen tree. Let me know if
> otherwise.

I've pushed the series to the linux-next branch of the Xen tree.


Juergen
Hi,
On Fri, Mar 21, 2025 at 09:00:09AM +0100, Jürgen Groß wrote:
> On 20.03.25 22:07, Bjorn Helgaas wrote:
> > On Wed, Feb 19, 2025 at 10:20:57AM +0100, Roger Pau Monne wrote:
> > > Setting pci_msi_ignore_mask inhibits the toggling of the mask bit for both
> > > MSI and MSI-X entries globally, regardless of the IRQ chip they are using.
> > > Only Xen sets the pci_msi_ignore_mask when routing physical interrupts over
> > > event channels, to prevent PCI code from attempting to toggle the maskbit,
> > > as it's Xen that controls the bit.
> > >
> > > However, the pci_msi_ignore_mask being global will affect devices that use
> > > MSI interrupts but are not routing those interrupts over event channels
> > > (not using the Xen pIRQ chip). One example is devices behind a VMD PCI
> > > bridge. In that scenario the VMD bridge configures MSI(-X) using the
> > > normal IRQ chip (the pIRQ one in the Xen case), and devices behind the
> > > bridge configure the MSI entries using indexes into the VMD bridge MSI
> > > table. The VMD bridge then demultiplexes such interrupts and delivers to
> > > the destination device(s). Having pci_msi_ignore_mask set in that scenario
> > > prevents (un)masking of MSI entries for devices behind the VMD bridge.
> > >
> > > Move the signaling of no entry masking into the MSI domain flags, as that
> > > allows setting it on a per-domain basis. Set it for the Xen MSI domain
> > > that uses the pIRQ chip, while leaving it unset for the rest of the
> > > cases.
> > >
> > > Remove pci_msi_ignore_mask at once, since it was only used by Xen code, and
> > > with Xen dropping usage the variable is unneeded.
> > >
> > > This fixes using devices behind a VMD bridge on Xen PV hardware domains.
> > >
> > > Albeit Devices behind a VMD bridge are not known to Xen, that doesn't mean
> > > Linux cannot use them. By inhibiting the usage of
> > > VMD_FEAT_CAN_BYPASS_MSI_REMAP and the removal of the pci_msi_ignore_mask
> > > bodge devices behind a VMD bridge do work fine when use from a Linux Xen
> > > hardware domain. That's the whole point of the series.
> > >
> > > Signed-off-by: Roger Pau Monné <roger.pau@citrix.com>
> > > Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
> > > Acked-by: Juergen Gross <jgross@suse.com>
> >
> > Acked-by: Bjorn Helgaas <bhelgaas@google.com>
> >
> > I assume you'll merge this series via the Xen tree. Let me know if
> > otherwise.
>
> I've pushed the series to the linux-next branch of the Xen tree.
>
>
> Juergen
This patch landed in latest next-20250324 tag causing this crash:
[ 0.753426] BUG: kernel NULL pointer dereference, address: 0000000000000002
[ 0.753921] #PF: supervisor read access in kernel mode
[ 0.754286] #PF: error_code(0x0000) - not-present page
[ 0.754656] PGD 0 P4D 0
[ 0.754842] Oops: Oops: 0000 [#1]
[ 0.755080] CPU: 0 UID: 0 PID: 1 Comm: swapper Not tainted 6.14.0-rc7-next-20250324 #1 NONE
[ 0.755691] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.16.3-debian-1.16.3-2 04/01/2014
[ 0.756349] RIP: 0010:msix_prepare_msi_desc+0x39/0x80
[ 0.756390] Code: 20 c7 46 04 01 00 00 00 8b 56 4c 89 d0 0d 01 01 00 00 66 89 46 4c 8b 8f 64 02 00 00 89 4e 50 48 8b 8f 70 06 00 00 48 89 4e 58 <41> f6 40 02 40 75 2a c1 ea 02 bf 80 00 00 00 21 fa 25 7f ff ff ff
[ 0.756390] RSP: 0000:ffff8881002a76e0 EFLAGS: 00010202
[ 0.756390] RAX: 0000000000000101 RBX: ffff88810074d000 RCX: ffffc9000002e000
[ 0.756390] RDX: 0000000000000000 RSI: ffff8881002a7710 RDI: ffff88810074d000
[ 0.756390] RBP: ffff8881002a7710 R08: 0000000000000000 R09: ffff8881002a76b4
[ 0.756390] R10: 000000701000c001 R11: ffffffff82a3dc01 R12: 0000000000000000
[ 0.756390] R13: 0000000000000005 R14: 0000000000000000 R15: 0000000000000002
[ 0.756390] FS: 0000000000000000(0000) GS:0000000000000000(0000) knlGS:0000000000000000
[ 0.756390] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 0.756390] CR2: 0000000000000002 CR3: 0000000002a3d001 CR4: 00000000003706b0
[ 0.756390] Call Trace:
[ 0.756390] <TASK>
[ 0.756390] ? __die_body+0x1b/0x60
[ 0.756390] ? page_fault_oops+0x2d0/0x310
[ 0.756390] ? exc_page_fault+0x59/0xc0
[ 0.756390] ? asm_exc_page_fault+0x22/0x30
[ 0.756390] ? msix_prepare_msi_desc+0x39/0x80
[ 0.756390] ? msix_capability_init+0x172/0x2c0
[ 0.756390] ? __pci_enable_msix_range+0x1a8/0x1d0
[ 0.756390] ? pci_alloc_irq_vectors_affinity+0x7c/0xf0
[ 0.756390] ? vp_find_vqs_msix+0x187/0x400
[ 0.756390] ? vp_find_vqs+0x2f/0x250
[ 0.756390] ? snprintf+0x3e/0x50
[ 0.756390] ? vp_modern_find_vqs+0x13/0x60
[ 0.756390] ? init_vq+0x184/0x1e0
[ 0.756390] ? vp_get_status+0x20/0x20
[ 0.756390] ? virtblk_probe+0xeb/0x8d0
[ 0.756390] ? __kernfs_new_node+0x122/0x160
[ 0.756390] ? vp_get_status+0x20/0x20
[ 0.756390] ? virtio_dev_probe+0x171/0x1c0
[ 0.756390] ? really_probe+0xc2/0x240
[ 0.756390] ? driver_probe_device+0x1d/0x70
[ 0.756390] ? __driver_attach+0x96/0xe0
[ 0.756390] ? driver_attach+0x20/0x20
[ 0.756390] ? bus_for_each_dev+0x7b/0xb0
[ 0.756390] ? bus_add_driver+0xe6/0x200
[ 0.756390] ? driver_register+0x5e/0xf0
[ 0.756390] ? virtio_blk_init+0x4d/0x90
[ 0.756390] ? add_boot_memory_block+0x90/0x90
[ 0.756390] ? do_one_initcall+0xe2/0x250
[ 0.756390] ? xas_store+0x4b/0x4b0
[ 0.756390] ? number+0x13b/0x260
[ 0.756390] ? ida_alloc_range+0x36a/0x3b0
[ 0.756390] ? parameq+0x13/0x90
[ 0.756390] ? parse_args+0x10f/0x2a0
[ 0.756390] ? do_initcall_level+0x83/0xb0
[ 0.756390] ? do_initcalls+0x43/0x70
[ 0.756390] ? rest_init+0x80/0x80
[ 0.756390] ? kernel_init_freeable+0x70/0xb0
[ 0.756390] ? kernel_init+0x16/0x110
[ 0.756390] ? ret_from_fork+0x30/0x40
[ 0.756390] ? rest_init+0x80/0x80
[ 0.756390] ? ret_from_fork_asm+0x11/0x20
[ 0.756390] </TASK>
[ 0.756390] Modules linked in:
[ 0.756390] CR2: 0000000000000002
[ 0.756390] ---[ end trace 0000000000000000 ]---
[ 0.756390] RIP: 0010:msix_prepare_msi_desc+0x39/0x80
[ 0.756390] Code: 20 c7 46 04 01 00 00 00 8b 56 4c 89 d0 0d 01 01 00 00 66 89 46 4c 8b 8f 64 02 00 00 89 4e 50 48 8b 8f 70 06 00 00 48 89 4e 58 <41> f6 40 02 40 75 2a c1 ea 02 bf 80 00 00 00 21 fa 25 7f ff ff ff
[ 0.756390] RSP: 0000:ffff8881002a76e0 EFLAGS: 00010202
[ 0.756390] RAX: 0000000000000101 RBX: ffff88810074d000 RCX: ffffc9000002e000
[ 0.756390] RDX: 0000000000000000 RSI: ffff8881002a7710 RDI: ffff88810074d000
[ 0.756390] RBP: ffff8881002a7710 R08: 0000000000000000 R09: ffff8881002a76b4
[ 0.756390] R10: 000000701000c001 R11: ffffffff82a3dc01 R12: 0000000000000000
[ 0.756390] R13: 0000000000000005 R14: 0000000000000000 R15: 0000000000000002
[ 0.756390] FS: 0000000000000000(0000) GS:0000000000000000(0000) knlGS:0000000000000000
[ 0.756390] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 0.756390] CR2: 0000000000000002 CR3: 0000000002a3d001 CR4: 00000000003706b0
[ 0.756390] note: swapper[1] exited with irqs disabled
[ 0.782774] Kernel panic - not syncing: Attempted to kill init! exitcode=0x00000009
[ 0.783560] Kernel Offset: disabled
[ 0.783909] ---[ end Kernel panic - not syncing: Attempted to kill init! exitcode=0x00000009 ]---
msix_prepare_msi_desc+0x39/0x80:
msix_prepare_msi_desc at drivers/pci/msi/msi.c:616
611 desc->nvec_used = 1;
612 desc->pci.msi_attrib.is_msix = 1;
613 desc->pci.msi_attrib.is_64 = 1;
614 desc->pci.msi_attrib.default_irq = dev->irq;
615 desc->pci.mask_base = dev->msix_base;
>616< desc->pci.msi_attrib.can_mask = !(info->flags & MSI_FLAG_NO_MASK) &&
617 !desc->pci.msi_attrib.is_virtual;
618
619 if (desc->pci.msi_attrib.can_mask) {
620 void __iomem *addr = pci_msix_desc_addr(desc);
621
Reverting patch 3 fixes the issue.
Daniel
On Mon, Mar 24, 2025 at 03:29:46PM +0100, Daniel Gomez wrote:
>
> Hi,
>
> On Fri, Mar 21, 2025 at 09:00:09AM +0100, Jürgen Groß wrote:
> > On 20.03.25 22:07, Bjorn Helgaas wrote:
> > > On Wed, Feb 19, 2025 at 10:20:57AM +0100, Roger Pau Monne wrote:
> > > > Setting pci_msi_ignore_mask inhibits the toggling of the mask bit for both
> > > > MSI and MSI-X entries globally, regardless of the IRQ chip they are using.
> > > > Only Xen sets the pci_msi_ignore_mask when routing physical interrupts over
> > > > event channels, to prevent PCI code from attempting to toggle the maskbit,
> > > > as it's Xen that controls the bit.
> > > >
> > > > However, the pci_msi_ignore_mask being global will affect devices that use
> > > > MSI interrupts but are not routing those interrupts over event channels
> > > > (not using the Xen pIRQ chip). One example is devices behind a VMD PCI
> > > > bridge. In that scenario the VMD bridge configures MSI(-X) using the
> > > > normal IRQ chip (the pIRQ one in the Xen case), and devices behind the
> > > > bridge configure the MSI entries using indexes into the VMD bridge MSI
> > > > table. The VMD bridge then demultiplexes such interrupts and delivers to
> > > > the destination device(s). Having pci_msi_ignore_mask set in that scenario
> > > > prevents (un)masking of MSI entries for devices behind the VMD bridge.
> > > >
> > > > Move the signaling of no entry masking into the MSI domain flags, as that
> > > > allows setting it on a per-domain basis. Set it for the Xen MSI domain
> > > > that uses the pIRQ chip, while leaving it unset for the rest of the
> > > > cases.
> > > >
> > > > Remove pci_msi_ignore_mask at once, since it was only used by Xen code, and
> > > > with Xen dropping usage the variable is unneeded.
> > > >
> > > > This fixes using devices behind a VMD bridge on Xen PV hardware domains.
> > > >
> > > > Albeit Devices behind a VMD bridge are not known to Xen, that doesn't mean
> > > > Linux cannot use them. By inhibiting the usage of
> > > > VMD_FEAT_CAN_BYPASS_MSI_REMAP and the removal of the pci_msi_ignore_mask
> > > > bodge devices behind a VMD bridge do work fine when use from a Linux Xen
> > > > hardware domain. That's the whole point of the series.
> > > >
> > > > Signed-off-by: Roger Pau Monné <roger.pau@citrix.com>
> > > > Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
> > > > Acked-by: Juergen Gross <jgross@suse.com>
> > >
> > > Acked-by: Bjorn Helgaas <bhelgaas@google.com>
> > >
> > > I assume you'll merge this series via the Xen tree. Let me know if
> > > otherwise.
> >
> > I've pushed the series to the linux-next branch of the Xen tree.
> >
> >
> > Juergen
>
> This patch landed in latest next-20250324 tag causing this crash:
>
> [ 0.753426] BUG: kernel NULL pointer dereference, address: 0000000000000002
> [ 0.753921] #PF: supervisor read access in kernel mode
> [ 0.754286] #PF: error_code(0x0000) - not-present page
> [ 0.754656] PGD 0 P4D 0
> [ 0.754842] Oops: Oops: 0000 [#1]
> [ 0.755080] CPU: 0 UID: 0 PID: 1 Comm: swapper Not tainted 6.14.0-rc7-next-20250324 #1 NONE
> [ 0.755691] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.16.3-debian-1.16.3-2 04/01/2014
> [ 0.756349] RIP: 0010:msix_prepare_msi_desc+0x39/0x80
> [ 0.756390] Code: 20 c7 46 04 01 00 00 00 8b 56 4c 89 d0 0d 01 01 00 00 66 89 46 4c 8b 8f 64 02 00 00 89 4e 50 48 8b 8f 70 06 00 00 48 89 4e 58 <41> f6 40 02 40 75 2a c1 ea 02 bf 80 00 00 00 21 fa 25 7f ff ff ff
> [ 0.756390] RSP: 0000:ffff8881002a76e0 EFLAGS: 00010202
> [ 0.756390] RAX: 0000000000000101 RBX: ffff88810074d000 RCX: ffffc9000002e000
> [ 0.756390] RDX: 0000000000000000 RSI: ffff8881002a7710 RDI: ffff88810074d000
> [ 0.756390] RBP: ffff8881002a7710 R08: 0000000000000000 R09: ffff8881002a76b4
> [ 0.756390] R10: 000000701000c001 R11: ffffffff82a3dc01 R12: 0000000000000000
> [ 0.756390] R13: 0000000000000005 R14: 0000000000000000 R15: 0000000000000002
> [ 0.756390] FS: 0000000000000000(0000) GS:0000000000000000(0000) knlGS:0000000000000000
> [ 0.756390] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> [ 0.756390] CR2: 0000000000000002 CR3: 0000000002a3d001 CR4: 00000000003706b0
> [ 0.756390] Call Trace:
> [ 0.756390] <TASK>
> [ 0.756390] ? __die_body+0x1b/0x60
> [ 0.756390] ? page_fault_oops+0x2d0/0x310
> [ 0.756390] ? exc_page_fault+0x59/0xc0
> [ 0.756390] ? asm_exc_page_fault+0x22/0x30
> [ 0.756390] ? msix_prepare_msi_desc+0x39/0x80
> [ 0.756390] ? msix_capability_init+0x172/0x2c0
> [ 0.756390] ? __pci_enable_msix_range+0x1a8/0x1d0
> [ 0.756390] ? pci_alloc_irq_vectors_affinity+0x7c/0xf0
> [ 0.756390] ? vp_find_vqs_msix+0x187/0x400
> [ 0.756390] ? vp_find_vqs+0x2f/0x250
> [ 0.756390] ? snprintf+0x3e/0x50
> [ 0.756390] ? vp_modern_find_vqs+0x13/0x60
> [ 0.756390] ? init_vq+0x184/0x1e0
> [ 0.756390] ? vp_get_status+0x20/0x20
> [ 0.756390] ? virtblk_probe+0xeb/0x8d0
> [ 0.756390] ? __kernfs_new_node+0x122/0x160
> [ 0.756390] ? vp_get_status+0x20/0x20
> [ 0.756390] ? virtio_dev_probe+0x171/0x1c0
> [ 0.756390] ? really_probe+0xc2/0x240
> [ 0.756390] ? driver_probe_device+0x1d/0x70
> [ 0.756390] ? __driver_attach+0x96/0xe0
> [ 0.756390] ? driver_attach+0x20/0x20
> [ 0.756390] ? bus_for_each_dev+0x7b/0xb0
> [ 0.756390] ? bus_add_driver+0xe6/0x200
> [ 0.756390] ? driver_register+0x5e/0xf0
> [ 0.756390] ? virtio_blk_init+0x4d/0x90
> [ 0.756390] ? add_boot_memory_block+0x90/0x90
> [ 0.756390] ? do_one_initcall+0xe2/0x250
> [ 0.756390] ? xas_store+0x4b/0x4b0
> [ 0.756390] ? number+0x13b/0x260
> [ 0.756390] ? ida_alloc_range+0x36a/0x3b0
> [ 0.756390] ? parameq+0x13/0x90
> [ 0.756390] ? parse_args+0x10f/0x2a0
> [ 0.756390] ? do_initcall_level+0x83/0xb0
> [ 0.756390] ? do_initcalls+0x43/0x70
> [ 0.756390] ? rest_init+0x80/0x80
> [ 0.756390] ? kernel_init_freeable+0x70/0xb0
> [ 0.756390] ? kernel_init+0x16/0x110
> [ 0.756390] ? ret_from_fork+0x30/0x40
> [ 0.756390] ? rest_init+0x80/0x80
> [ 0.756390] ? ret_from_fork_asm+0x11/0x20
> [ 0.756390] </TASK>
> [ 0.756390] Modules linked in:
> [ 0.756390] CR2: 0000000000000002
> [ 0.756390] ---[ end trace 0000000000000000 ]---
> [ 0.756390] RIP: 0010:msix_prepare_msi_desc+0x39/0x80
> [ 0.756390] Code: 20 c7 46 04 01 00 00 00 8b 56 4c 89 d0 0d 01 01 00 00 66 89 46 4c 8b 8f 64 02 00 00 89 4e 50 48 8b 8f 70 06 00 00 48 89 4e 58 <41> f6 40 02 40 75 2a c1 ea 02 bf 80 00 00 00 21 fa 25 7f ff ff ff
> [ 0.756390] RSP: 0000:ffff8881002a76e0 EFLAGS: 00010202
> [ 0.756390] RAX: 0000000000000101 RBX: ffff88810074d000 RCX: ffffc9000002e000
> [ 0.756390] RDX: 0000000000000000 RSI: ffff8881002a7710 RDI: ffff88810074d000
> [ 0.756390] RBP: ffff8881002a7710 R08: 0000000000000000 R09: ffff8881002a76b4
> [ 0.756390] R10: 000000701000c001 R11: ffffffff82a3dc01 R12: 0000000000000000
> [ 0.756390] R13: 0000000000000005 R14: 0000000000000000 R15: 0000000000000002
> [ 0.756390] FS: 0000000000000000(0000) GS:0000000000000000(0000) knlGS:0000000000000000
> [ 0.756390] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> [ 0.756390] CR2: 0000000000000002 CR3: 0000000002a3d001 CR4: 00000000003706b0
> [ 0.756390] note: swapper[1] exited with irqs disabled
> [ 0.782774] Kernel panic - not syncing: Attempted to kill init! exitcode=0x00000009
> [ 0.783560] Kernel Offset: disabled
> [ 0.783909] ---[ end Kernel panic - not syncing: Attempted to kill init! exitcode=0x00000009 ]---
>
>
> msix_prepare_msi_desc+0x39/0x80:
> msix_prepare_msi_desc at drivers/pci/msi/msi.c:616
> 611 desc->nvec_used = 1;
> 612 desc->pci.msi_attrib.is_msix = 1;
> 613 desc->pci.msi_attrib.is_64 = 1;
> 614 desc->pci.msi_attrib.default_irq = dev->irq;
> 615 desc->pci.mask_base = dev->msix_base;
> >616< desc->pci.msi_attrib.can_mask = !(info->flags & MSI_FLAG_NO_MASK) &&
> 617 !desc->pci.msi_attrib.is_virtual;
> 618
> 619 if (desc->pci.msi_attrib.can_mask) {
> 620 void __iomem *addr = pci_msix_desc_addr(desc);
> 621
>
> Reverting patch 3 fixes the issue.
Thanks for the report and sorry for the breakage. Do you have a QEMU
command line I can use to try to reproduce this locally?
Will work on a patch ASAP.
Regards, Roger.
On Mon, Mar 24, 2025 at 06:51:54PM +0100, Roger Pau Monné wrote:
> On Mon, Mar 24, 2025 at 03:29:46PM +0100, Daniel Gomez wrote:
> >
> > Hi,
> >
> > On Fri, Mar 21, 2025 at 09:00:09AM +0100, Jürgen Groß wrote:
> > > On 20.03.25 22:07, Bjorn Helgaas wrote:
> > > > On Wed, Feb 19, 2025 at 10:20:57AM +0100, Roger Pau Monne wrote:
> > > > > Setting pci_msi_ignore_mask inhibits the toggling of the mask bit for both
> > > > > MSI and MSI-X entries globally, regardless of the IRQ chip they are using.
> > > > > Only Xen sets the pci_msi_ignore_mask when routing physical interrupts over
> > > > > event channels, to prevent PCI code from attempting to toggle the maskbit,
> > > > > as it's Xen that controls the bit.
> > > > >
> > > > > However, the pci_msi_ignore_mask being global will affect devices that use
> > > > > MSI interrupts but are not routing those interrupts over event channels
> > > > > (not using the Xen pIRQ chip). One example is devices behind a VMD PCI
> > > > > bridge. In that scenario the VMD bridge configures MSI(-X) using the
> > > > > normal IRQ chip (the pIRQ one in the Xen case), and devices behind the
> > > > > bridge configure the MSI entries using indexes into the VMD bridge MSI
> > > > > table. The VMD bridge then demultiplexes such interrupts and delivers to
> > > > > the destination device(s). Having pci_msi_ignore_mask set in that scenario
> > > > > prevents (un)masking of MSI entries for devices behind the VMD bridge.
> > > > >
> > > > > Move the signaling of no entry masking into the MSI domain flags, as that
> > > > > allows setting it on a per-domain basis. Set it for the Xen MSI domain
> > > > > that uses the pIRQ chip, while leaving it unset for the rest of the
> > > > > cases.
> > > > >
> > > > > Remove pci_msi_ignore_mask at once, since it was only used by Xen code, and
> > > > > with Xen dropping usage the variable is unneeded.
> > > > >
> > > > > This fixes using devices behind a VMD bridge on Xen PV hardware domains.
> > > > >
> > > > > Albeit Devices behind a VMD bridge are not known to Xen, that doesn't mean
> > > > > Linux cannot use them. By inhibiting the usage of
> > > > > VMD_FEAT_CAN_BYPASS_MSI_REMAP and the removal of the pci_msi_ignore_mask
> > > > > bodge devices behind a VMD bridge do work fine when use from a Linux Xen
> > > > > hardware domain. That's the whole point of the series.
> > > > >
> > > > > Signed-off-by: Roger Pau Monné <roger.pau@citrix.com>
> > > > > Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
> > > > > Acked-by: Juergen Gross <jgross@suse.com>
> > > >
> > > > Acked-by: Bjorn Helgaas <bhelgaas@google.com>
> > > >
> > > > I assume you'll merge this series via the Xen tree. Let me know if
> > > > otherwise.
> > >
> > > I've pushed the series to the linux-next branch of the Xen tree.
> > >
> > >
> > > Juergen
> >
> > This patch landed in latest next-20250324 tag causing this crash:
> >
> > [ 0.753426] BUG: kernel NULL pointer dereference, address: 0000000000000002
> > [ 0.753921] #PF: supervisor read access in kernel mode
> > [ 0.754286] #PF: error_code(0x0000) - not-present page
> > [ 0.754656] PGD 0 P4D 0
> > [ 0.754842] Oops: Oops: 0000 [#1]
> > [ 0.755080] CPU: 0 UID: 0 PID: 1 Comm: swapper Not tainted 6.14.0-rc7-next-20250324 #1 NONE
> > [ 0.755691] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.16.3-debian-1.16.3-2 04/01/2014
> > [ 0.756349] RIP: 0010:msix_prepare_msi_desc+0x39/0x80
> > [ 0.756390] Code: 20 c7 46 04 01 00 00 00 8b 56 4c 89 d0 0d 01 01 00 00 66 89 46 4c 8b 8f 64 02 00 00 89 4e 50 48 8b 8f 70 06 00 00 48 89 4e 58 <41> f6 40 02 40 75 2a c1 ea 02 bf 80 00 00 00 21 fa 25 7f ff ff ff
> > [ 0.756390] RSP: 0000:ffff8881002a76e0 EFLAGS: 00010202
> > [ 0.756390] RAX: 0000000000000101 RBX: ffff88810074d000 RCX: ffffc9000002e000
> > [ 0.756390] RDX: 0000000000000000 RSI: ffff8881002a7710 RDI: ffff88810074d000
> > [ 0.756390] RBP: ffff8881002a7710 R08: 0000000000000000 R09: ffff8881002a76b4
> > [ 0.756390] R10: 000000701000c001 R11: ffffffff82a3dc01 R12: 0000000000000000
> > [ 0.756390] R13: 0000000000000005 R14: 0000000000000000 R15: 0000000000000002
> > [ 0.756390] FS: 0000000000000000(0000) GS:0000000000000000(0000) knlGS:0000000000000000
> > [ 0.756390] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> > [ 0.756390] CR2: 0000000000000002 CR3: 0000000002a3d001 CR4: 00000000003706b0
> > [ 0.756390] Call Trace:
> > [ 0.756390] <TASK>
> > [ 0.756390] ? __die_body+0x1b/0x60
> > [ 0.756390] ? page_fault_oops+0x2d0/0x310
> > [ 0.756390] ? exc_page_fault+0x59/0xc0
> > [ 0.756390] ? asm_exc_page_fault+0x22/0x30
> > [ 0.756390] ? msix_prepare_msi_desc+0x39/0x80
> > [ 0.756390] ? msix_capability_init+0x172/0x2c0
> > [ 0.756390] ? __pci_enable_msix_range+0x1a8/0x1d0
> > [ 0.756390] ? pci_alloc_irq_vectors_affinity+0x7c/0xf0
> > [ 0.756390] ? vp_find_vqs_msix+0x187/0x400
> > [ 0.756390] ? vp_find_vqs+0x2f/0x250
> > [ 0.756390] ? snprintf+0x3e/0x50
> > [ 0.756390] ? vp_modern_find_vqs+0x13/0x60
> > [ 0.756390] ? init_vq+0x184/0x1e0
> > [ 0.756390] ? vp_get_status+0x20/0x20
> > [ 0.756390] ? virtblk_probe+0xeb/0x8d0
> > [ 0.756390] ? __kernfs_new_node+0x122/0x160
> > [ 0.756390] ? vp_get_status+0x20/0x20
> > [ 0.756390] ? virtio_dev_probe+0x171/0x1c0
> > [ 0.756390] ? really_probe+0xc2/0x240
> > [ 0.756390] ? driver_probe_device+0x1d/0x70
> > [ 0.756390] ? __driver_attach+0x96/0xe0
> > [ 0.756390] ? driver_attach+0x20/0x20
> > [ 0.756390] ? bus_for_each_dev+0x7b/0xb0
> > [ 0.756390] ? bus_add_driver+0xe6/0x200
> > [ 0.756390] ? driver_register+0x5e/0xf0
> > [ 0.756390] ? virtio_blk_init+0x4d/0x90
> > [ 0.756390] ? add_boot_memory_block+0x90/0x90
> > [ 0.756390] ? do_one_initcall+0xe2/0x250
> > [ 0.756390] ? xas_store+0x4b/0x4b0
> > [ 0.756390] ? number+0x13b/0x260
> > [ 0.756390] ? ida_alloc_range+0x36a/0x3b0
> > [ 0.756390] ? parameq+0x13/0x90
> > [ 0.756390] ? parse_args+0x10f/0x2a0
> > [ 0.756390] ? do_initcall_level+0x83/0xb0
> > [ 0.756390] ? do_initcalls+0x43/0x70
> > [ 0.756390] ? rest_init+0x80/0x80
> > [ 0.756390] ? kernel_init_freeable+0x70/0xb0
> > [ 0.756390] ? kernel_init+0x16/0x110
> > [ 0.756390] ? ret_from_fork+0x30/0x40
> > [ 0.756390] ? rest_init+0x80/0x80
> > [ 0.756390] ? ret_from_fork_asm+0x11/0x20
> > [ 0.756390] </TASK>
> > [ 0.756390] Modules linked in:
> > [ 0.756390] CR2: 0000000000000002
> > [ 0.756390] ---[ end trace 0000000000000000 ]---
> > [ 0.756390] RIP: 0010:msix_prepare_msi_desc+0x39/0x80
> > [ 0.756390] Code: 20 c7 46 04 01 00 00 00 8b 56 4c 89 d0 0d 01 01 00 00 66 89 46 4c 8b 8f 64 02 00 00 89 4e 50 48 8b 8f 70 06 00 00 48 89 4e 58 <41> f6 40 02 40 75 2a c1 ea 02 bf 80 00 00 00 21 fa 25 7f ff ff ff
> > [ 0.756390] RSP: 0000:ffff8881002a76e0 EFLAGS: 00010202
> > [ 0.756390] RAX: 0000000000000101 RBX: ffff88810074d000 RCX: ffffc9000002e000
> > [ 0.756390] RDX: 0000000000000000 RSI: ffff8881002a7710 RDI: ffff88810074d000
> > [ 0.756390] RBP: ffff8881002a7710 R08: 0000000000000000 R09: ffff8881002a76b4
> > [ 0.756390] R10: 000000701000c001 R11: ffffffff82a3dc01 R12: 0000000000000000
> > [ 0.756390] R13: 0000000000000005 R14: 0000000000000000 R15: 0000000000000002
> > [ 0.756390] FS: 0000000000000000(0000) GS:0000000000000000(0000) knlGS:0000000000000000
> > [ 0.756390] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> > [ 0.756390] CR2: 0000000000000002 CR3: 0000000002a3d001 CR4: 00000000003706b0
> > [ 0.756390] note: swapper[1] exited with irqs disabled
> > [ 0.782774] Kernel panic - not syncing: Attempted to kill init! exitcode=0x00000009
> > [ 0.783560] Kernel Offset: disabled
> > [ 0.783909] ---[ end Kernel panic - not syncing: Attempted to kill init! exitcode=0x00000009 ]---
> >
> >
> > msix_prepare_msi_desc+0x39/0x80:
> > msix_prepare_msi_desc at drivers/pci/msi/msi.c:616
> > 611 desc->nvec_used = 1;
> > 612 desc->pci.msi_attrib.is_msix = 1;
> > 613 desc->pci.msi_attrib.is_64 = 1;
> > 614 desc->pci.msi_attrib.default_irq = dev->irq;
> > 615 desc->pci.mask_base = dev->msix_base;
> > >616< desc->pci.msi_attrib.can_mask = !(info->flags & MSI_FLAG_NO_MASK) &&
> > 617 !desc->pci.msi_attrib.is_virtual;
> > 618
> > 619 if (desc->pci.msi_attrib.can_mask) {
> > 620 void __iomem *addr = pci_msix_desc_addr(desc);
> > 621
> >
> > Reverting patch 3 fixes the issue.
>
> Thanks for the report and sorry for the breakage. Do you have a QEMU
> command line I can use to try to reproduce this locally?
>
> Will work on a patch ASAP.
Thanks for the quick reply.
The issue is that info appears to be NULL (the oops is a NULL pointer dereference while reading info->flags). So, this worked for me:
diff --git a/drivers/pci/msi/msi.c b/drivers/pci/msi/msi.c
index dcbb4f9ac578..b76c7ec33602 100644
--- a/drivers/pci/msi/msi.c
+++ b/drivers/pci/msi/msi.c
@@ -609,8 +609,10 @@ void msix_prepare_msi_desc(struct pci_dev *dev, struct msi_desc *desc)
desc->pci.msi_attrib.is_64 = 1;
desc->pci.msi_attrib.default_irq = dev->irq;
desc->pci.mask_base = dev->msix_base;
- desc->pci.msi_attrib.can_mask = !(info->flags & MSI_FLAG_NO_MASK) &&
- !desc->pci.msi_attrib.is_virtual;
+ desc->pci.msi_attrib.can_mask =
+ info ? !(info->flags & MSI_FLAG_NO_MASK) &&
+ !desc->pci.msi_attrib.is_virtual :
+ 1;
if (desc->pci.msi_attrib.can_mask) {
void __iomem *addr = pci_msix_desc_addr(desc);
@@ -743,7 +745,7 @@ static int msix_capability_init(struct pci_dev *dev, struct msix_entry *entries,
/* Disable INTX */
pci_intx_for_msi(dev, 0);
- if (!(info->flags & MSI_FLAG_NO_MASK)) {
+ if (info && !(info->flags & MSI_FLAG_NO_MASK)) {
/*
* Ensure that all table entries are masked to prevent
* stale entries from firing in a crash kernel.
I also noticed that d (the struct irq_domain returned by dev_get_msi_domain())
can be NULL if CONFIG_GENERIC_MSI_IRQ is not set, and we are not checking for
that either.
I run QEMU with vmctl [1]. This is my command:
[1] https://github.com/SamsungDS/vmctl
/usr/bin/qemu-system-x86_64 \
-nodefaults \
-display "none" \
-machine "q35,accel=kvm,kernel-irqchip=split" \
-cpu "host" \
-smp "4" \
-m "8G" \
-device "intel-iommu,intremap=on" \
-netdev "user,id=net0,hostfwd=tcp::2222-:22" \
-device "virtio-net-pci,netdev=net0" \
-device "virtio-rng-pci" \
-drive "id=boot,file=file.qcow2,format=qcow2,if=virtio,discard=unmap,media=disk,read-only=no" \
-device "pcie-root-port,id=pcie_root_port0,chassis=1,slot=0" \
-device "nvme,id=nvme0,serial=deadbeef,bus=pcie_root_port0,mdts=7" \
-drive "id=nvm,file=~/nvm.img,format=raw,if=none,discard=unmap,media=disk,read-only=no" \
-device "nvme-ns,id=nvm,drive=nvm,bus=nvme0,nsid=1,logical_block_size=4096,physical_block_size=4096" \
-pidfile "~/vmctl/confdir/run/nvme/pidfile" \
-kernel "~/src/kernel/linux/arch/x86_64/boot/bzImage" \
-append "root=/dev/vda1 console=ttyS0,115200 audit=0" \
-virtfs "local,path=~/linux,security_model=none,readonly=on,mount_tag=kernel_dir" \
-serial "mon:stdio" \
-d "guest_errors" \
-D "~/vmctl/confdir/log/nvme/qemu.log"
Daniel
>
> Regards, Roger.
On Mon, Mar 24, 2025 at 07:58:14PM +0100, Daniel Gomez wrote:
> On Mon, Mar 24, 2025 at 06:51:54PM +0100, Roger Pau Monné wrote:
> > On Mon, Mar 24, 2025 at 03:29:46PM +0100, Daniel Gomez wrote:
> > >
> > > Hi,
> > >
> > > On Fri, Mar 21, 2025 at 09:00:09AM +0100, Jürgen Groß wrote:
> > > > On 20.03.25 22:07, Bjorn Helgaas wrote:
> > > > > On Wed, Feb 19, 2025 at 10:20:57AM +0100, Roger Pau Monne wrote:
> > > > > > Setting pci_msi_ignore_mask inhibits the toggling of the mask bit for both
> > > > > > MSI and MSI-X entries globally, regardless of the IRQ chip they are using.
> > > > > > Only Xen sets the pci_msi_ignore_mask when routing physical interrupts over
> > > > > > event channels, to prevent PCI code from attempting to toggle the maskbit,
> > > > > > as it's Xen that controls the bit.
> > > > > >
> > > > > > However, the pci_msi_ignore_mask being global will affect devices that use
> > > > > > MSI interrupts but are not routing those interrupts over event channels
> > > > > > (not using the Xen pIRQ chip). One example is devices behind a VMD PCI
> > > > > > bridge. In that scenario the VMD bridge configures MSI(-X) using the
> > > > > > normal IRQ chip (the pIRQ one in the Xen case), and devices behind the
> > > > > > bridge configure the MSI entries using indexes into the VMD bridge MSI
> > > > > > table. The VMD bridge then demultiplexes such interrupts and delivers to
> > > > > > the destination device(s). Having pci_msi_ignore_mask set in that scenario
> > > > > > prevents (un)masking of MSI entries for devices behind the VMD bridge.
> > > > > >
> > > > > > Move the signaling of no entry masking into the MSI domain flags, as that
> > > > > > allows setting it on a per-domain basis. Set it for the Xen MSI domain
> > > > > > that uses the pIRQ chip, while leaving it unset for the rest of the
> > > > > > cases.
> > > > > >
> > > > > > Remove pci_msi_ignore_mask at once, since it was only used by Xen code, and
> > > > > > with Xen dropping usage the variable is unneeded.
> > > > > >
> > > > > > This fixes using devices behind a VMD bridge on Xen PV hardware domains.
> > > > > >
> > > > > > Albeit Devices behind a VMD bridge are not known to Xen, that doesn't mean
> > > > > > Linux cannot use them. By inhibiting the usage of
> > > > > > VMD_FEAT_CAN_BYPASS_MSI_REMAP and the removal of the pci_msi_ignore_mask
> > > > > > bodge devices behind a VMD bridge do work fine when use from a Linux Xen
> > > > > > hardware domain. That's the whole point of the series.
> > > > > >
> > > > > > Signed-off-by: Roger Pau Monné <roger.pau@citrix.com>
> > > > > > Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
> > > > > > Acked-by: Juergen Gross <jgross@suse.com>
> > > > >
> > > > > Acked-by: Bjorn Helgaas <bhelgaas@google.com>
> > > > >
> > > > > I assume you'll merge this series via the Xen tree. Let me know if
> > > > > otherwise.
> > > >
> > > > I've pushed the series to the linux-next branch of the Xen tree.
> > > >
> > > >
> > > > Juergen
> > >
> > > This patch landed in latest next-20250324 tag causing this crash:
> > >
> > > [ 0.753426] BUG: kernel NULL pointer dereference, address: 0000000000000002
> > > [ 0.753921] #PF: supervisor read access in kernel mode
> > > [ 0.754286] #PF: error_code(0x0000) - not-present page
> > > [ 0.754656] PGD 0 P4D 0
> > > [ 0.754842] Oops: Oops: 0000 [#1]
> > > [ 0.755080] CPU: 0 UID: 0 PID: 1 Comm: swapper Not tainted 6.14.0-rc7-next-20250324 #1 NONE
> > > [ 0.755691] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.16.3-debian-1.16.3-2 04/01/2014
> > > [ 0.756349] RIP: 0010:msix_prepare_msi_desc+0x39/0x80
> > > [ 0.756390] Code: 20 c7 46 04 01 00 00 00 8b 56 4c 89 d0 0d 01 01 00 00 66 89 46 4c 8b 8f 64 02 00 00 89 4e 50 48 8b 8f 70 06 00 00 48 89 4e 58 <41> f6 40 02 40 75 2a c1 ea 02 bf 80 00 00 00 21 fa 25 7f ff ff ff
> > > [ 0.756390] RSP: 0000:ffff8881002a76e0 EFLAGS: 00010202
> > > [ 0.756390] RAX: 0000000000000101 RBX: ffff88810074d000 RCX: ffffc9000002e000
> > > [ 0.756390] RDX: 0000000000000000 RSI: ffff8881002a7710 RDI: ffff88810074d000
> > > [ 0.756390] RBP: ffff8881002a7710 R08: 0000000000000000 R09: ffff8881002a76b4
> > > [ 0.756390] R10: 000000701000c001 R11: ffffffff82a3dc01 R12: 0000000000000000
> > > [ 0.756390] R13: 0000000000000005 R14: 0000000000000000 R15: 0000000000000002
> > > [ 0.756390] FS: 0000000000000000(0000) GS:0000000000000000(0000) knlGS:0000000000000000
> > > [ 0.756390] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> > > [ 0.756390] CR2: 0000000000000002 CR3: 0000000002a3d001 CR4: 00000000003706b0
> > > [ 0.756390] Call Trace:
> > > [ 0.756390] <TASK>
> > > [ 0.756390] ? __die_body+0x1b/0x60
> > > [ 0.756390] ? page_fault_oops+0x2d0/0x310
> > > [ 0.756390] ? exc_page_fault+0x59/0xc0
> > > [ 0.756390] ? asm_exc_page_fault+0x22/0x30
> > > [ 0.756390] ? msix_prepare_msi_desc+0x39/0x80
> > > [ 0.756390] ? msix_capability_init+0x172/0x2c0
> > > [ 0.756390] ? __pci_enable_msix_range+0x1a8/0x1d0
> > > [ 0.756390] ? pci_alloc_irq_vectors_affinity+0x7c/0xf0
> > > [ 0.756390] ? vp_find_vqs_msix+0x187/0x400
> > > [ 0.756390] ? vp_find_vqs+0x2f/0x250
> > > [ 0.756390] ? snprintf+0x3e/0x50
> > > [ 0.756390] ? vp_modern_find_vqs+0x13/0x60
> > > [ 0.756390] ? init_vq+0x184/0x1e0
> > > [ 0.756390] ? vp_get_status+0x20/0x20
> > > [ 0.756390] ? virtblk_probe+0xeb/0x8d0
> > > [ 0.756390] ? __kernfs_new_node+0x122/0x160
> > > [ 0.756390] ? vp_get_status+0x20/0x20
> > > [ 0.756390] ? virtio_dev_probe+0x171/0x1c0
> > > [ 0.756390] ? really_probe+0xc2/0x240
> > > [ 0.756390] ? driver_probe_device+0x1d/0x70
> > > [ 0.756390] ? __driver_attach+0x96/0xe0
> > > [ 0.756390] ? driver_attach+0x20/0x20
> > > [ 0.756390] ? bus_for_each_dev+0x7b/0xb0
> > > [ 0.756390] ? bus_add_driver+0xe6/0x200
> > > [ 0.756390] ? driver_register+0x5e/0xf0
> > > [ 0.756390] ? virtio_blk_init+0x4d/0x90
> > > [ 0.756390] ? add_boot_memory_block+0x90/0x90
> > > [ 0.756390] ? do_one_initcall+0xe2/0x250
> > > [ 0.756390] ? xas_store+0x4b/0x4b0
> > > [ 0.756390] ? number+0x13b/0x260
> > > [ 0.756390] ? ida_alloc_range+0x36a/0x3b0
> > > [ 0.756390] ? parameq+0x13/0x90
> > > [ 0.756390] ? parse_args+0x10f/0x2a0
> > > [ 0.756390] ? do_initcall_level+0x83/0xb0
> > > [ 0.756390] ? do_initcalls+0x43/0x70
> > > [ 0.756390] ? rest_init+0x80/0x80
> > > [ 0.756390] ? kernel_init_freeable+0x70/0xb0
> > > [ 0.756390] ? kernel_init+0x16/0x110
> > > [ 0.756390] ? ret_from_fork+0x30/0x40
> > > [ 0.756390] ? rest_init+0x80/0x80
> > > [ 0.756390] ? ret_from_fork_asm+0x11/0x20
> > > [ 0.756390] </TASK>
> > > [ 0.756390] Modules linked in:
> > > [ 0.756390] CR2: 0000000000000002
> > > [ 0.756390] ---[ end trace 0000000000000000 ]---
> > > [ 0.756390] RIP: 0010:msix_prepare_msi_desc+0x39/0x80
> > > [ 0.756390] Code: 20 c7 46 04 01 00 00 00 8b 56 4c 89 d0 0d 01 01 00 00 66 89 46 4c 8b 8f 64 02 00 00 89 4e 50 48 8b 8f 70 06 00 00 48 89 4e 58 <41> f6 40 02 40 75 2a c1 ea 02 bf 80 00 00 00 21 fa 25 7f ff ff ff
> > > [ 0.756390] RSP: 0000:ffff8881002a76e0 EFLAGS: 00010202
> > > [ 0.756390] RAX: 0000000000000101 RBX: ffff88810074d000 RCX: ffffc9000002e000
> > > [ 0.756390] RDX: 0000000000000000 RSI: ffff8881002a7710 RDI: ffff88810074d000
> > > [ 0.756390] RBP: ffff8881002a7710 R08: 0000000000000000 R09: ffff8881002a76b4
> > > [ 0.756390] R10: 000000701000c001 R11: ffffffff82a3dc01 R12: 0000000000000000
> > > [ 0.756390] R13: 0000000000000005 R14: 0000000000000000 R15: 0000000000000002
> > > [ 0.756390] FS: 0000000000000000(0000) GS:0000000000000000(0000) knlGS:0000000000000000
> > > [ 0.756390] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> > > [ 0.756390] CR2: 0000000000000002 CR3: 0000000002a3d001 CR4: 00000000003706b0
> > > [ 0.756390] note: swapper[1] exited with irqs disabled
> > > [ 0.782774] Kernel panic - not syncing: Attempted to kill init! exitcode=0x00000009
> > > [ 0.783560] Kernel Offset: disabled
> > > [ 0.783909] ---[ end Kernel panic - not syncing: Attempted to kill init! exitcode=0x00000009 ]---
> > >
> > >
> > > msix_prepare_msi_desc+0x39/0x80:
> > > msix_prepare_msi_desc at drivers/pci/msi/msi.c:616
> > > 611 desc->nvec_used = 1;
> > > 612 desc->pci.msi_attrib.is_msix = 1;
> > > 613 desc->pci.msi_attrib.is_64 = 1;
> > > 614 desc->pci.msi_attrib.default_irq = dev->irq;
> > > 615 desc->pci.mask_base = dev->msix_base;
> > > >616< desc->pci.msi_attrib.can_mask = !(info->flags & MSI_FLAG_NO_MASK) &&
> > > 617 !desc->pci.msi_attrib.is_virtual;
> > > 618
> > > 619 if (desc->pci.msi_attrib.can_mask) {
> > > 620 void __iomem *addr = pci_msix_desc_addr(desc);
> > > 621
> > >
> > > Reverting patch 3 fixes the issue.
> >
> > Thanks for the report and sorry for the breakage. Do you have a QEMU
> > command line I can use to try to reproduce this locally?
> >
> > Will work on a patch ASAP.
>
> Thanks for the quick reply.
>
> The issue is that info appears to be uninitialized. So, this worked for me:
Indeed, irq_domain->host_data is NULL, there's no msi_domain_info. As
this is x86, I was expecting x86 to always use
x86_init_dev_msi_info(), but that doesn't seem to be the case. I
would like to better understand this.
> diff --git a/drivers/pci/msi/msi.c b/drivers/pci/msi/msi.c
> index dcbb4f9ac578..b76c7ec33602 100644
> --- a/drivers/pci/msi/msi.c
> +++ b/drivers/pci/msi/msi.c
> @@ -609,8 +609,10 @@ void msix_prepare_msi_desc(struct pci_dev *dev, struct msi_desc *desc)
> desc->pci.msi_attrib.is_64 = 1;
> desc->pci.msi_attrib.default_irq = dev->irq;
> desc->pci.mask_base = dev->msix_base;
> - desc->pci.msi_attrib.can_mask = !(info->flags & MSI_FLAG_NO_MASK) &&
> - !desc->pci.msi_attrib.is_virtual;
> + desc->pci.msi_attrib.can_mask =
> + info ? !(info->flags & MSI_FLAG_NO_MASK) &&
> + !desc->pci.msi_attrib.is_virtual :
> + 1;
>
> if (desc->pci.msi_attrib.can_mask) {
> void __iomem *addr = pci_msix_desc_addr(desc);
> @@ -743,7 +745,7 @@ static int msix_capability_init(struct pci_dev *dev, struct msix_entry *entries,
> /* Disable INTX */
> pci_intx_for_msi(dev, 0);
>
> - if (!(info->flags & MSI_FLAG_NO_MASK)) {
> + if (info && !(info->flags & MSI_FLAG_NO_MASK)) {
I think this should rather be:
if (!info || !(info->flags & MSI_FLAG_NO_MASK)) {
So that in case of no info the default action is to mask the entries.
> /*
> * Ensure that all table entries are masked to prevent
> * stale entries from firing in a crash kernel.
>
> I also noticed d (struct irq_domain) can return NULL if CONFIG_GENERIC_MSI_IRQ
> is not set and we are not checking that either.
>
> I run QEMU with vmctl [1]. This is my command:
>
> [1] https://github.com/SamsungDS/vmctl
>
> /usr/bin/qemu-system-x86_64 \
> -nodefaults \
> -display "none" \
> -machine "q35,accel=kvm,kernel-irqchip=split" \
> -cpu "host" \
> -smp "4" \
> -m "8G" \
> -device "intel-iommu,intremap=on" \
> -netdev "user,id=net0,hostfwd=tcp::2222-:22" \
> -device "virtio-net-pci,netdev=net0" \
> -device "virtio-rng-pci" \
> -drive "id=boot,file=file.qcow2,format=qcow2,if=virtio,discard=unmap,media=disk,read-only=no" \
> -device "pcie-root-port,id=pcie_root_port0,chassis=1,slot=0" \
> -device "nvme,id=nvme0,serial=deadbeef,bus=pcie_root_port0,mdts=7" \
> -drive "id=nvm,file=~/nvm.img,format=raw,if=none,discard=unmap,media=disk,read-only=no" \
> -device "nvme-ns,id=nvm,drive=nvm,bus=nvme0,nsid=1,logical_block_size=4096,physical_block_size=4096" \
> -pidfile "~/vmctl/confdir/run/nvme/pidfile" \
> -kernel "~/src/kernel/linux/arch/x86_64/boot/bzImage" \
> -append "root=/dev/vda1 console=ttyS0,115200 audit=0" \
> -virtfs "local,path=~/linux,security_model=none,readonly=on,mount_tag=kernel_dir" \
> -serial "mon:stdio" \
> -d "guest_errors" \
> -D "~/vmctl/confdir/log/nvme/qemu.log"
Can you narrow down the command line to the minimum required to
reproduce the issue?
Can you attach the Kconfig used to build the crashing kernel?
Thanks, Roger.
On Mon, Mar 24 2025 at 20:18, Roger Pau Monné wrote:
> On Mon, Mar 24, 2025 at 07:58:14PM +0100, Daniel Gomez wrote:
>> The issue is that info appears to be uninitialized. So, this worked for me:
>
> Indeed, irq_domain->host_data is NULL, there's no msi_domain_info. As
> this is x86, I was expecting x86 ot always use
> x86_init_dev_msi_info(), but that doesn't seem to be the case. I
> would like to better understand this.
Indeed. On x86 this should not happen at all. On architectures, which do
not use (hierarchical) interrupt domains, it will return NULL.
So I really want to understand why this happens on x86 before such a
"fix" is deployed.
Thanks,
tglx
On Tue, Mar 25 2025 at 09:11, Thomas Gleixner wrote:
> On Mon, Mar 24 2025 at 20:18, Roger Pau Monné wrote:
>> On Mon, Mar 24, 2025 at 07:58:14PM +0100, Daniel Gomez wrote:
>>> The issue is that info appears to be uninitialized. So, this worked for me:
>>
>> Indeed, irq_domain->host_data is NULL, there's no msi_domain_info. As
>> this is x86, I was expecting x86 ot always use
>> x86_init_dev_msi_info(), but that doesn't seem to be the case. I
>> would like to better understand this.
>
> Indeed. On x86 this should not happen at all. On architectures, which do
> not use (hierarchical) interrupt domains, it will return NULL.
>
> So I really want to understand why this happens on x86 before such a
> "fix" is deployed.
So after staring at it some more it's clear. Without XEN, the domain
returned is the MSI parent domain, which is the vector domain in that
setup. That does not have a domain info set. But on legacy architectures
there is not even a domain.
It's really wonderful that we have a gazillion ways to manage the
backends of PCI/MSI....
So none of the suggested pointer checks will cover it correctly. However,
there is already a function which allows querying MSI domain flags
independently of the underlying insanity. Sorry for not catching it in
review.
Untested patch below.
Thanks,
tglx
---
drivers/pci/msi/msi.c | 18 ++++++------------
1 file changed, 6 insertions(+), 12 deletions(-)
--- a/drivers/pci/msi/msi.c
+++ b/drivers/pci/msi/msi.c
@@ -285,8 +285,6 @@ static void pci_msi_set_enable(struct pc
static int msi_setup_msi_desc(struct pci_dev *dev, int nvec,
struct irq_affinity_desc *masks)
{
- const struct irq_domain *d = dev_get_msi_domain(&dev->dev);
- const struct msi_domain_info *info = d->host_data;
struct msi_desc desc;
u16 control;
@@ -297,7 +295,7 @@ static int msi_setup_msi_desc(struct pci
/* Lies, damned lies, and MSIs */
if (dev->dev_flags & PCI_DEV_FLAGS_HAS_MSI_MASKING)
control |= PCI_MSI_FLAGS_MASKBIT;
- if (info->flags & MSI_FLAG_NO_MASK)
+ if (pci_msi_domain_supports(dev, MSI_FLAG_NO_MASK, DENY_LEGACY))
control &= ~PCI_MSI_FLAGS_MASKBIT;
desc.nvec_used = nvec;
@@ -605,20 +603,18 @@ static void __iomem *msix_map_region(str
*/
void msix_prepare_msi_desc(struct pci_dev *dev, struct msi_desc *desc)
{
- const struct irq_domain *d = dev_get_msi_domain(&dev->dev);
- const struct msi_domain_info *info = d->host_data;
-
desc->nvec_used = 1;
desc->pci.msi_attrib.is_msix = 1;
desc->pci.msi_attrib.is_64 = 1;
desc->pci.msi_attrib.default_irq = dev->irq;
desc->pci.mask_base = dev->msix_base;
- desc->pci.msi_attrib.can_mask = !(info->flags & MSI_FLAG_NO_MASK) &&
- !desc->pci.msi_attrib.is_virtual;
- if (desc->pci.msi_attrib.can_mask) {
+
+ if (!pci_msi_domain_supports(dev, MSI_FLAG_NO_MASK, DENY_LEGACY) &&
+ !desc->pci.msi_attrib.is_virtual) {
void __iomem *addr = pci_msix_desc_addr(desc);
+ desc->pci.msi_attrib.can_mask = true;
desc->pci.msix_ctrl = readl(addr + PCI_MSIX_ENTRY_VECTOR_CTRL);
}
}
@@ -715,8 +711,6 @@ static int msix_setup_interrupts(struct
static int msix_capability_init(struct pci_dev *dev, struct msix_entry *entries,
int nvec, struct irq_affinity *affd)
{
- const struct irq_domain *d = dev_get_msi_domain(&dev->dev);
- const struct msi_domain_info *info = d->host_data;
int ret, tsize;
u16 control;
@@ -747,7 +741,7 @@ static int msix_capability_init(struct p
/* Disable INTX */
pci_intx_for_msi(dev, 0);
- if (!(info->flags & MSI_FLAG_NO_MASK)) {
+ if (!pci_msi_domain_supports(dev, MSI_FLAG_NO_MASK, DENY_LEGACY)) {
/*
* Ensure that all table entries are masked to prevent
* stale entries from firing in a crash kernel.
On 25.03.2025 10:20, Thomas Gleixner wrote:
> On Tue, Mar 25 2025 at 09:11, Thomas Gleixner wrote:
>> On Mon, Mar 24 2025 at 20:18, Roger Pau Monné wrote:
>>> On Mon, Mar 24, 2025 at 07:58:14PM +0100, Daniel Gomez wrote:
>>>> The issue is that info appears to be uninitialized. So, this worked for me:
>>> Indeed, irq_domain->host_data is NULL, there's no msi_domain_info. As
>>> this is x86, I was expecting x86 ot always use
>>> x86_init_dev_msi_info(), but that doesn't seem to be the case. I
>>> would like to better understand this.
>> Indeed. On x86 this should not happen at all. On architectures, which do
>> not use (hierarchical) interrupt domains, it will return NULL.
>>
>> So I really want to understand why this happens on x86 before such a
>> "fix" is deployed.
> So after staring at it some more it's clear. Without XEN, the domain
> returned is the MSI parent domain, which is the vector domain in that
> setup. That does not have a domain info set. But on legacy architectures
> there is not even a domain.
>
> It's really wonderful that we have a gazillion ways to manage the
> backends of PCI/MSI....
>
> So none of the suggested pointer checks will cover it correctly. Though
> there is already a function which allows to query MSI domain flags
> independent of the underlying insanity. Sorry for not catching it in
> review.
>
> Untested patch below.
This fixes the panic observed on ARM64 RK3568-based Odroid-M1 board
(arch/arm64/boot/dts/rockchip/rk3568-odroid-m1.dts) on next-20250325.
Thanks!
Feel free to add to the final patch:
Tested-by: Marek Szyprowski <m.szyprowski@samsung.com>
>
> Thanks,
>
> tglx
> ---
> drivers/pci/msi/msi.c | 18 ++++++------------
> 1 file changed, 6 insertions(+), 12 deletions(-)
>
> --- a/drivers/pci/msi/msi.c
> +++ b/drivers/pci/msi/msi.c
> @@ -285,8 +285,6 @@ static void pci_msi_set_enable(struct pc
> static int msi_setup_msi_desc(struct pci_dev *dev, int nvec,
> struct irq_affinity_desc *masks)
> {
> - const struct irq_domain *d = dev_get_msi_domain(&dev->dev);
> - const struct msi_domain_info *info = d->host_data;
> struct msi_desc desc;
> u16 control;
>
> @@ -297,7 +295,7 @@ static int msi_setup_msi_desc(struct pci
> /* Lies, damned lies, and MSIs */
> if (dev->dev_flags & PCI_DEV_FLAGS_HAS_MSI_MASKING)
> control |= PCI_MSI_FLAGS_MASKBIT;
> - if (info->flags & MSI_FLAG_NO_MASK)
> + if (pci_msi_domain_supports(dev, MSI_FLAG_NO_MASK, DENY_LEGACY))
> control &= ~PCI_MSI_FLAGS_MASKBIT;
>
> desc.nvec_used = nvec;
> @@ -605,20 +603,18 @@ static void __iomem *msix_map_region(str
> */
> void msix_prepare_msi_desc(struct pci_dev *dev, struct msi_desc *desc)
> {
> - const struct irq_domain *d = dev_get_msi_domain(&dev->dev);
> - const struct msi_domain_info *info = d->host_data;
> -
> desc->nvec_used = 1;
> desc->pci.msi_attrib.is_msix = 1;
> desc->pci.msi_attrib.is_64 = 1;
> desc->pci.msi_attrib.default_irq = dev->irq;
> desc->pci.mask_base = dev->msix_base;
> - desc->pci.msi_attrib.can_mask = !(info->flags & MSI_FLAG_NO_MASK) &&
> - !desc->pci.msi_attrib.is_virtual;
>
> - if (desc->pci.msi_attrib.can_mask) {
> +
> + if (!pci_msi_domain_supports(dev, MSI_FLAG_NO_MASK, DENY_LEGACY) &&
> + !desc->pci.msi_attrib.is_virtual) {
> void __iomem *addr = pci_msix_desc_addr(desc);
>
> + desc->pci.msi_attrib.can_mask = true;
> desc->pci.msix_ctrl = readl(addr + PCI_MSIX_ENTRY_VECTOR_CTRL);
> }
> }
> @@ -715,8 +711,6 @@ static int msix_setup_interrupts(struct
> static int msix_capability_init(struct pci_dev *dev, struct msix_entry *entries,
> int nvec, struct irq_affinity *affd)
> {
> - const struct irq_domain *d = dev_get_msi_domain(&dev->dev);
> - const struct msi_domain_info *info = d->host_data;
> int ret, tsize;
> u16 control;
>
> @@ -747,7 +741,7 @@ static int msix_capability_init(struct p
> /* Disable INTX */
> pci_intx_for_msi(dev, 0);
>
> - if (!(info->flags & MSI_FLAG_NO_MASK)) {
> + if (!pci_msi_domain_supports(dev, MSI_FLAG_NO_MASK, DENY_LEGACY)) {
> /*
> * Ensure that all table entries are masked to prevent
> * stale entries from firing in a crash kernel.
>
>
Best regards
--
Marek Szyprowski, PhD
Samsung R&D Institute Poland
On Tue, Mar 25, 2025 at 10:20:43AM +0100, Thomas Gleixner wrote:
> On Tue, Mar 25 2025 at 09:11, Thomas Gleixner wrote:
>
> > On Mon, Mar 24 2025 at 20:18, Roger Pau Monné wrote:
> >> On Mon, Mar 24, 2025 at 07:58:14PM +0100, Daniel Gomez wrote:
> >>> The issue is that info appears to be uninitialized. So, this worked for me:
> >>
> >> Indeed, irq_domain->host_data is NULL, there's no msi_domain_info. As
> >> this is x86, I was expecting x86 ot always use
> >> x86_init_dev_msi_info(), but that doesn't seem to be the case. I
> >> would like to better understand this.
> >
> > Indeed. On x86 this should not happen at all. On architectures, which do
> > not use (hierarchical) interrupt domains, it will return NULL.
> >
> > So I really want to understand why this happens on x86 before such a
> > "fix" is deployed.
>
> So after staring at it some more it's clear. Without XEN, the domain
> returned is the MSI parent domain, which is the vector domain in that
> setup. That does not have a domain info set. But on legacy architectures
> there is not even a domain.
>
> It's really wonderful that we have a gazillion ways to manage the
> backends of PCI/MSI....
>
> So none of the suggested pointer checks will cover it correctly. Though
> there is already a function which allows to query MSI domain flags
> independent of the underlying insanity. Sorry for not catching it in
> review.
>
> Untested patch below.
As I'm getting reports of other people hitting this issue, is there
anything that needs to be done from my side to get the fix into
linux-next?
Thanks, Roger.
On Tue, Mar 25, 2025 at 10:20:43AM +0100, Thomas Gleixner wrote:
> On Tue, Mar 25 2025 at 09:11, Thomas Gleixner wrote:
>
> > On Mon, Mar 24 2025 at 20:18, Roger Pau Monné wrote:
> >> On Mon, Mar 24, 2025 at 07:58:14PM +0100, Daniel Gomez wrote:
> >>> The issue is that info appears to be uninitialized. So, this worked for me:
> >>
> >> Indeed, irq_domain->host_data is NULL, there's no msi_domain_info. As
> >> this is x86, I was expecting x86 ot always use
> >> x86_init_dev_msi_info(), but that doesn't seem to be the case. I
> >> would like to better understand this.
> >
> > Indeed. On x86 this should not happen at all. On architectures, which do
> > not use (hierarchical) interrupt domains, it will return NULL.
> >
> > So I really want to understand why this happens on x86 before such a
> > "fix" is deployed.
>
> So after staring at it some more it's clear. Without XEN, the domain
> returned is the MSI parent domain, which is the vector domain in that
> setup. That does not have a domain info set. But on legacy architectures
> there is not even a domain.
>
> It's really wonderful that we have a gazillion ways to manage the
> backends of PCI/MSI....
I'm a bit confused by what msi_create_device_irq_domain() does, as it
does allocate an irq_domain with an associated msi_domain_info
structure, however that irq_domain is set in
dev->msi.data->__domains[domid].domain rather than dev->msi.domain,
and doesn't override the default irq_domain set by
pcibios_device_add().
And the default x86 irq_domain (set by pcibios_device_add()) doesn't
have an associated msi_domain_info.
> So none of the suggested pointer checks will cover it correctly. Though
> there is already a function which allows to query MSI domain flags
> independent of the underlying insanity. Sorry for not catching it in
> review.
Oh, that's nice, I didn't know about that helper.
> Untested patch below.
LGTM, but (as you can see) I'm not an expert in that area.
Thanks, Roger.
On Tue, Mar 25 2025 at 11:22, Roger Pau Monné wrote:
> On Tue, Mar 25, 2025 at 10:20:43AM +0100, Thomas Gleixner wrote:
> I'm a bit confused by what msi_create_device_irq_domain() does, as it
> does allocate an irq_domain with an associated msi_domain_info
> structure, however that irq_domain is set in
> dev->msi.data->__domains[domid].domain rather than dev->msi.domain,
> and doesn't override the default irq_domain set by
> pcibios_device_add().
The default irq domain is a parent domain in that case on top of which
the per device domains are built. And those are private to the device.
The XEN variant uses the original global PCI/MSI domain concept with
this outrageous domain wrapper hack. A crime committed by some tglx
dude.
> And the default x86 irq_domain (set by pcibios_device_add()) doesn't
> have an associated msi_domain_info.
It does not need one.
Thanks,
tglx
On Tue, Mar 25, 2025 at 11:27:51AM +0100, Thomas Gleixner wrote:
> On Tue, Mar 25 2025 at 11:22, Roger Pau Monné wrote:
> > On Tue, Mar 25, 2025 at 10:20:43AM +0100, Thomas Gleixner wrote:
> > I'm a bit confused by what msi_create_device_irq_domain() does, as it
> > does allocate an irq_domain with an associated msi_domain_info
> > structure, however that irq_domain is set in
> > dev->msi.data->__domains[domid].domain rather than dev->msi.domain,
> > and doesn't override the default irq_domain set by
> > pcibios_device_add().
>
> The default irq domain is a parent domain in that case on top of which
> the per device domains are built. And those are private to the device.
Sorry to ask, but shouldn't dev_get_msi_domain() return the specific
device domain rather than the parent one? Otherwise I feel the
function should rather be named dev_get_parent_msi_domain().
> The XEN variant uses the original global PCI/MSI domain concept with
> this outrageous domain wrapper hack. A crime committed by some tglx
> dude.
I see. So the proper way would be for Xen to not override the default
x86 irq_domain in dev->msi.domain (so don't have a Xen PV specific
version of x86_init.irqs.create_pci_msi_domain) and instead do
something similar to what VMD does?
Thanks, Roger.
On Tue, Mar 25 2025 at 11:55, Roger Pau Monné wrote:
> On Tue, Mar 25, 2025 at 11:27:51AM +0100, Thomas Gleixner wrote:
>> On Tue, Mar 25 2025 at 11:22, Roger Pau Monné wrote:
>> > On Tue, Mar 25, 2025 at 10:20:43AM +0100, Thomas Gleixner wrote:
>> > I'm a bit confused by what msi_create_device_irq_domain() does, as it
>> > does allocate an irq_domain with an associated msi_domain_info
>> > structure, however that irq_domain is set in
>> > dev->msi.data->__domains[domid].domain rather than dev->msi.domain,
>> > and doesn't override the default irq_domain set by
>> > pcibios_device_add().
>>
>> The default irq domain is a parent domain in that case on top of which
>> the per device domains are built. And those are private to the device.
>
> Sorry to ask, but shouldn't dev_get_msi_domain() return the specific
> device domain rather than the parent one? Otherwise I feel the
> function should rather be named dev_get_parent_msi_domain().
The function returns the MSI domain pointer which is associated to the
device. That can be either a global MSI domain or a parent MSI domain.
The few places which actually care about it have the proper checks in
place and until we consolidate the MSI handling to per device domains,
this will unfortunately remain slightly confusing.
>> The XEN variant uses the original global PCI/MSI domain concept with
>> this outrageous domain wrapper hack. A crime committed by some tglx
>> dude.
>
> I see. So the proper way would be for Xen to not override the default
> x86 irq_domain in dev->msi.domain (so don't have a Xen PV specific
> version of x86_init.irqs.create_pci_msi_domain) and instead do
> something similar to what VMD does?
No. Xen should override it as it provides the default domain for the
system. VMD is a special case as it provides its own magic on top.
If XEN would not override it as the global default, then you'd need a
lot of extra hackery to do the override at the end.
Thanks,
tglx
On Tue, Mar 25, 2025 at 10:20:43AM +0100, Thomas Gleixner wrote:
> On Tue, Mar 25 2025 at 09:11, Thomas Gleixner wrote:
>
> > On Mon, Mar 24 2025 at 20:18, Roger Pau Monné wrote:
> >> On Mon, Mar 24, 2025 at 07:58:14PM +0100, Daniel Gomez wrote:
> >>> The issue is that info appears to be uninitialized. So, this worked for me:
> >>
> >> Indeed, irq_domain->host_data is NULL, there's no msi_domain_info. As
> >> this is x86, I was expecting x86 ot always use
> >> x86_init_dev_msi_info(), but that doesn't seem to be the case. I
> >> would like to better understand this.
> >
> > Indeed. On x86 this should not happen at all. On architectures, which do
> > not use (hierarchical) interrupt domains, it will return NULL.
> >
> > So I really want to understand why this happens on x86 before such a
> > "fix" is deployed.
>
> So after staring at it some more it's clear. Without XEN, the domain
> returned is the MSI parent domain, which is the vector domain in that
> setup. That does not have a domain info set. But on legacy architectures
> there is not even a domain.
>
> It's really wonderful that we have a gazillion ways to manage the
> backends of PCI/MSI....
>
> So none of the suggested pointer checks will cover it correctly. Though
> there is already a function which allows to query MSI domain flags
> independent of the underlying insanity. Sorry for not catching it in
> review.
>
> Untested patch below.
>
> Thanks,
>
> tglx
> ---
> drivers/pci/msi/msi.c | 18 ++++++------------
> 1 file changed, 6 insertions(+), 12 deletions(-)
>
> --- a/drivers/pci/msi/msi.c
> +++ b/drivers/pci/msi/msi.c
> @@ -285,8 +285,6 @@ static void pci_msi_set_enable(struct pc
> static int msi_setup_msi_desc(struct pci_dev *dev, int nvec,
> struct irq_affinity_desc *masks)
> {
> - const struct irq_domain *d = dev_get_msi_domain(&dev->dev);
> - const struct msi_domain_info *info = d->host_data;
> struct msi_desc desc;
> u16 control;
>
> @@ -297,7 +295,7 @@ static int msi_setup_msi_desc(struct pci
> /* Lies, damned lies, and MSIs */
> if (dev->dev_flags & PCI_DEV_FLAGS_HAS_MSI_MASKING)
> control |= PCI_MSI_FLAGS_MASKBIT;
> - if (info->flags & MSI_FLAG_NO_MASK)
> + if (pci_msi_domain_supports(dev, MSI_FLAG_NO_MASK, DENY_LEGACY))
> control &= ~PCI_MSI_FLAGS_MASKBIT;
>
> desc.nvec_used = nvec;
> @@ -605,20 +603,18 @@ static void __iomem *msix_map_region(str
> */
> void msix_prepare_msi_desc(struct pci_dev *dev, struct msi_desc *desc)
> {
> - const struct irq_domain *d = dev_get_msi_domain(&dev->dev);
> - const struct msi_domain_info *info = d->host_data;
> -
> desc->nvec_used = 1;
> desc->pci.msi_attrib.is_msix = 1;
> desc->pci.msi_attrib.is_64 = 1;
> desc->pci.msi_attrib.default_irq = dev->irq;
> desc->pci.mask_base = dev->msix_base;
> - desc->pci.msi_attrib.can_mask = !(info->flags & MSI_FLAG_NO_MASK) &&
> - !desc->pci.msi_attrib.is_virtual;
>
> - if (desc->pci.msi_attrib.can_mask) {
> +
> + if (!pci_msi_domain_supports(dev, MSI_FLAG_NO_MASK, DENY_LEGACY) &&
> + !desc->pci.msi_attrib.is_virtual) {
> void __iomem *addr = pci_msix_desc_addr(desc);
>
> + desc->pci.msi_attrib.can_mask = true;
can_mask is u8.
> desc->pci.msix_ctrl = readl(addr + PCI_MSIX_ENTRY_VECTOR_CTRL);
> }
> }
> @@ -715,8 +711,6 @@ static int msix_setup_interrupts(struct
> static int msix_capability_init(struct pci_dev *dev, struct msix_entry *entries,
> int nvec, struct irq_affinity *affd)
> {
> - const struct irq_domain *d = dev_get_msi_domain(&dev->dev);
> - const struct msi_domain_info *info = d->host_data;
> int ret, tsize;
> u16 control;
>
> @@ -747,7 +741,7 @@ static int msix_capability_init(struct p
> /* Disable INTX */
> pci_intx_for_msi(dev, 0);
>
> - if (!(info->flags & MSI_FLAG_NO_MASK)) {
> + if (!pci_msi_domain_supports(dev, MSI_FLAG_NO_MASK, DENY_LEGACY)) {
> /*
> * Ensure that all table entries are masked to prevent
> * stale entries from firing in a crash kernel.
It works, thanks!
For the final patch:
Tested-by: Daniel Gomez <da.gomez@samsung.com>
Daniel
This also fixes a timeout error in the nvme driver introduced by commit
c3164d2e0d1. In linux-next-2025032{5,6} booting hangs in about 50% of boot
attempts with the message:
nvme nvme1: I/O tag 4 (1004) QID 0 timeout, completion polled
nvme nvme0: I/O tag 20 (1014) QID 0 timeout, completion polled
After some more time I get a message about the udev-worker task blocking for
more than 61 seconds, and I get dropped to an initramfs shell. I bisected this to
commit c3164d2e0d1. As this error does not occur in linux-next-20250328, I
searched for commits that might fix this error, and indeed cherry-picking commit
dbc5d00074fd on top of linux-next-20250326 fixes the issue for me. Neither Xen nor
KVM was used in my case.
Bert Karwatzki
The conversion of the XEN specific global variable pci_msi_ignore_mask to an
MSI domain flag missed the facts that:
1) Legacy architectures do not provide an interrupt domain
2) Parent MSI domains do not necessarily have a domain info attached
Both cases result in an unconditional NULL pointer dereference.
Cure this by using the existing pci_msi_domain_supports() helper, which
handles all possible cases correctly.
Fixes: c3164d2e0d18 ("PCI/MSI: Convert pci_msi_ignore_mask to per MSI domain flag")
Reported-by: Daniel Gomez <da.gomez@kernel.org>
Reported-by: Borislav Petkov <bp@alien8.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Marek Szyprowski <m.szyprowski@samsung.com>
Tested-by: Borislav Petkov <bp@alien8.de>
Tested-by: Daniel Gomez <da.gomez@kernel.org>
---
drivers/pci/msi/msi.c | 18 ++++++------------
1 file changed, 6 insertions(+), 12 deletions(-)
--- a/drivers/pci/msi/msi.c
+++ b/drivers/pci/msi/msi.c
@@ -285,8 +285,6 @@ static void pci_msi_set_enable(struct pc
static int msi_setup_msi_desc(struct pci_dev *dev, int nvec,
struct irq_affinity_desc *masks)
{
- const struct irq_domain *d = dev_get_msi_domain(&dev->dev);
- const struct msi_domain_info *info = d->host_data;
struct msi_desc desc;
u16 control;
@@ -297,7 +295,7 @@ static int msi_setup_msi_desc(struct pci
/* Lies, damned lies, and MSIs */
if (dev->dev_flags & PCI_DEV_FLAGS_HAS_MSI_MASKING)
control |= PCI_MSI_FLAGS_MASKBIT;
- if (info->flags & MSI_FLAG_NO_MASK)
+ if (pci_msi_domain_supports(dev, MSI_FLAG_NO_MASK, DENY_LEGACY))
control &= ~PCI_MSI_FLAGS_MASKBIT;
desc.nvec_used = nvec;
@@ -604,20 +602,18 @@ static void __iomem *msix_map_region(str
*/
void msix_prepare_msi_desc(struct pci_dev *dev, struct msi_desc *desc)
{
- const struct irq_domain *d = dev_get_msi_domain(&dev->dev);
- const struct msi_domain_info *info = d->host_data;
-
desc->nvec_used = 1;
desc->pci.msi_attrib.is_msix = 1;
desc->pci.msi_attrib.is_64 = 1;
desc->pci.msi_attrib.default_irq = dev->irq;
desc->pci.mask_base = dev->msix_base;
- desc->pci.msi_attrib.can_mask = !(info->flags & MSI_FLAG_NO_MASK) &&
- !desc->pci.msi_attrib.is_virtual;
- if (desc->pci.msi_attrib.can_mask) {
+
+ if (!pci_msi_domain_supports(dev, MSI_FLAG_NO_MASK, DENY_LEGACY) &&
+ !desc->pci.msi_attrib.is_virtual) {
void __iomem *addr = pci_msix_desc_addr(desc);
+ desc->pci.msi_attrib.can_mask = 1;
desc->pci.msix_ctrl = readl(addr + PCI_MSIX_ENTRY_VECTOR_CTRL);
}
}
@@ -715,8 +711,6 @@ static int msix_setup_interrupts(struct
static int msix_capability_init(struct pci_dev *dev, struct msix_entry *entries,
int nvec, struct irq_affinity *affd)
{
- const struct irq_domain *d = dev_get_msi_domain(&dev->dev);
- const struct msi_domain_info *info = d->host_data;
int ret, tsize;
u16 control;
@@ -747,7 +741,7 @@ static int msix_capability_init(struct p
/* Disable INTX */
pci_intx_for_msi(dev, 0);
- if (!(info->flags & MSI_FLAG_NO_MASK)) {
+ if (!pci_msi_domain_supports(dev, MSI_FLAG_NO_MASK, DENY_LEGACY)) {
/*
* Ensure that all table entries are masked to prevent
* stale entries from firing in a crash kernel.
On 26.03.25 13:05, Thomas Gleixner wrote:
> The conversion of the XEN specific global variable pci_msi_ignore_mask to a
> MSI domain flag, missed the facts that:
>
> 1) Legacy architectures do not provide an interrupt domain
> 2) Parent MSI domains do not necessarily have a domain info attached
>
> Both cases result in an unconditional NULL pointer dereference.
>
> Cure this by using the existing pci_msi_domain_supports() helper, which
> handles all possible cases correctly.
>
> Fixes: c3164d2e0d18 ("PCI/MSI: Convert pci_msi_ignore_mask to per MSI domain flag")
> Reported-by: Daniel Gomez <da.gomez@kernel.org>
> Reported-by: Borislav Petkov <bp@alien8.de>
> Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
> Tested-by: Marek Szyprowski <m.szyprowski@samsung.com>
> Tested-by: Borislav Petkov <bp@alien8.de>
> Tested-by: Daniel Gomez <da.gomez@kernel.org>
Reviewed-by: Juergen Gross <jgross@suse.com>
Juergen
On 26.03.25 13:05, Thomas Gleixner wrote:
> The conversion of the XEN specific global variable pci_msi_ignore_mask to a
> MSI domain flag, missed the facts that:
>
> 1) Legacy architectures do not provide an interrupt domain
> 2) Parent MSI domains do not necessarily have a domain info attached
>
> Both cases result in an unconditional NULL pointer dereference.
>
> Cure this by using the existing pci_msi_domain_supports() helper, which
> handles all possible cases correctly.
>
> Fixes: c3164d2e0d18 ("PCI/MSI: Convert pci_msi_ignore_mask to per MSI domain flag")
> Reported-by: Daniel Gomez <da.gomez@kernel.org>
> Reported-by: Borislav Petkov <bp@alien8.de>
> Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
> Tested-by: Marek Szyprowski <m.szyprowski@samsung.com>
> Tested-by: Borislav Petkov <bp@alien8.de>
> Tested-by: Daniel Gomez <da.gomez@kernel.org>
As the patch introducing the problem went in via the Xen tree, should
this fix go in via the Xen tree, too?
Juergen
On Wed, Mar 26 2025 at 13:09, Jürgen Groß wrote:
> On 26.03.25 13:05, Thomas Gleixner wrote:
>> The conversion of the XEN specific global variable pci_msi_ignore_mask to a
>> MSI domain flag, missed the facts that:
>>
>> 1) Legacy architectures do not provide an interrupt domain
>> 2) Parent MSI domains do not necessarily have a domain info attached
>>
>> Both cases result in an unconditional NULL pointer dereference.
>>
>> Cure this by using the existing pci_msi_domain_supports() helper, which
>> handles all possible cases correctly.
>>
>> Fixes: c3164d2e0d18 ("PCI/MSI: Convert pci_msi_ignore_mask to per MSI domain flag")
>> Reported-by: Daniel Gomez <da.gomez@kernel.org>
>> Reported-by: Borislav Petkov <bp@alien8.de>
>> Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
>> Tested-by: Marek Szyprowski <m.szyprowski@samsung.com>
>> Tested-by: Borislav Petkov <bp@alien8.de>
>> Tested-by: Daniel Gomez <da.gomez@kernel.org>
>
> As the patch introducing the problem went in via the Xen tree, should
> this fix go in via the Xen tree, too?
I'll queue it up now and send Linus a pull request.
The following commit has been merged into the timers/urgent branch of tip:
Commit-ID: 3ece3e8e5976c49c3f887e5923f998eabd54ff40
Gitweb: https://git.kernel.org/tip/3ece3e8e5976c49c3f887e5923f998eabd54ff40
Author: Thomas Gleixner <tglx@linutronix.de>
AuthorDate: Wed, 26 Mar 2025 13:05:35 +01:00
Committer: Thomas Gleixner <tglx@linutronix.de>
CommitterDate: Wed, 26 Mar 2025 15:28:43 +01:00
PCI/MSI: Handle the NOMASK flag correctly for all PCI/MSI backends
The conversion of the XEN specific global variable pci_msi_ignore_mask to a
MSI domain flag, missed the facts that:
1) Legacy architectures do not provide an interrupt domain
2) Parent MSI domains do not necessarily have a domain info attached
Both cases result in an unconditional NULL pointer dereference. This was
unfortunately missed in review and testing revealed it late.
Cure this by using the existing pci_msi_domain_supports() helper, which
handles all possible cases correctly.
Fixes: c3164d2e0d18 ("PCI/MSI: Convert pci_msi_ignore_mask to per MSI domain flag")
Reported-by: Daniel Gomez <da.gomez@kernel.org>
Reported-by: Borislav Petkov <bp@alien8.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Juergen Gross <jgross@suse.com>
Tested-by: Marek Szyprowski <m.szyprowski@samsung.com>
Tested-by: Borislav Petkov <bp@alien8.de>
Tested-by: Daniel Gomez <da.gomez@kernel.org>
Link: https://lore.kernel.org/all/87iknwyp2o.ffs@tglx
Closes: https://lore.kernel.org/all/qn7fzggcj6qe6r6gdbwcz23pzdz2jx64aldccmsuheabhmjgrt@tawf5nfwuvw7
---
drivers/pci/msi/msi.c | 18 ++++++------------
1 file changed, 6 insertions(+), 12 deletions(-)
diff --git a/drivers/pci/msi/msi.c b/drivers/pci/msi/msi.c
index d741628..7058d59 100644
--- a/drivers/pci/msi/msi.c
+++ b/drivers/pci/msi/msi.c
@@ -285,8 +285,6 @@ static void pci_msi_set_enable(struct pci_dev *dev, int enable)
static int msi_setup_msi_desc(struct pci_dev *dev, int nvec,
struct irq_affinity_desc *masks)
{
- const struct irq_domain *d = dev_get_msi_domain(&dev->dev);
- const struct msi_domain_info *info = d->host_data;
struct msi_desc desc;
u16 control;
@@ -297,7 +295,7 @@ static int msi_setup_msi_desc(struct pci_dev *dev, int nvec,
/* Lies, damned lies, and MSIs */
if (dev->dev_flags & PCI_DEV_FLAGS_HAS_MSI_MASKING)
control |= PCI_MSI_FLAGS_MASKBIT;
- if (info->flags & MSI_FLAG_NO_MASK)
+ if (pci_msi_domain_supports(dev, MSI_FLAG_NO_MASK, DENY_LEGACY))
control &= ~PCI_MSI_FLAGS_MASKBIT;
desc.nvec_used = nvec;
@@ -604,20 +602,18 @@ static void __iomem *msix_map_region(struct pci_dev *dev,
*/
void msix_prepare_msi_desc(struct pci_dev *dev, struct msi_desc *desc)
{
- const struct irq_domain *d = dev_get_msi_domain(&dev->dev);
- const struct msi_domain_info *info = d->host_data;
-
desc->nvec_used = 1;
desc->pci.msi_attrib.is_msix = 1;
desc->pci.msi_attrib.is_64 = 1;
desc->pci.msi_attrib.default_irq = dev->irq;
desc->pci.mask_base = dev->msix_base;
- desc->pci.msi_attrib.can_mask = !(info->flags & MSI_FLAG_NO_MASK) &&
- !desc->pci.msi_attrib.is_virtual;
- if (desc->pci.msi_attrib.can_mask) {
+
+ if (!pci_msi_domain_supports(dev, MSI_FLAG_NO_MASK, DENY_LEGACY) &&
+ !desc->pci.msi_attrib.is_virtual) {
void __iomem *addr = pci_msix_desc_addr(desc);
+ desc->pci.msi_attrib.can_mask = 1;
desc->pci.msix_ctrl = readl(addr + PCI_MSIX_ENTRY_VECTOR_CTRL);
}
}
@@ -715,8 +711,6 @@ static int msix_setup_interrupts(struct pci_dev *dev, struct msix_entry *entries
static int msix_capability_init(struct pci_dev *dev, struct msix_entry *entries,
int nvec, struct irq_affinity *affd)
{
- const struct irq_domain *d = dev_get_msi_domain(&dev->dev);
- const struct msi_domain_info *info = d->host_data;
int ret, tsize;
u16 control;
@@ -747,7 +741,7 @@ static int msix_capability_init(struct pci_dev *dev, struct msix_entry *entries,
/* Disable INTX */
pci_intx_for_msi(dev, 0);
- if (!(info->flags & MSI_FLAG_NO_MASK)) {
+ if (!pci_msi_domain_supports(dev, MSI_FLAG_NO_MASK, DENY_LEGACY)) {
/*
* Ensure that all table entries are masked to prevent
* stale entries from firing in a crash kernel.
On Mon, Mar 24, 2025 at 08:18:01PM +0100, Roger Pau Monné wrote:
> On Mon, Mar 24, 2025 at 07:58:14PM +0100, Daniel Gomez wrote:
> > On Mon, Mar 24, 2025 at 06:51:54PM +0100, Roger Pau Monné wrote:
> > > On Mon, Mar 24, 2025 at 03:29:46PM +0100, Daniel Gomez wrote:
> > > >
> > > > Hi,
> > > >
> > > > On Fri, Mar 21, 2025 at 09:00:09AM +0100, Jürgen Groß wrote:
> > > > > On 20.03.25 22:07, Bjorn Helgaas wrote:
> > > > > > On Wed, Feb 19, 2025 at 10:20:57AM +0100, Roger Pau Monne wrote:
> > > > > > > Setting pci_msi_ignore_mask inhibits the toggling of the mask bit for both
> > > > > > > MSI and MSI-X entries globally, regardless of the IRQ chip they are using.
> > > > > > > Only Xen sets the pci_msi_ignore_mask when routing physical interrupts over
> > > > > > > event channels, to prevent PCI code from attempting to toggle the maskbit,
> > > > > > > as it's Xen that controls the bit.
> > > > > > >
> > > > > > > However, the pci_msi_ignore_mask being global will affect devices that use
> > > > > > > MSI interrupts but are not routing those interrupts over event channels
> > > > > > > (not using the Xen pIRQ chip). One example is devices behind a VMD PCI
> > > > > > > bridge. In that scenario the VMD bridge configures MSI(-X) using the
> > > > > > > normal IRQ chip (the pIRQ one in the Xen case), and devices behind the
> > > > > > > bridge configure the MSI entries using indexes into the VMD bridge MSI
> > > > > > > table. The VMD bridge then demultiplexes such interrupts and delivers to
> > > > > > > the destination device(s). Having pci_msi_ignore_mask set in that scenario
> > > > > > > prevents (un)masking of MSI entries for devices behind the VMD bridge.
> > > > > > >
> > > > > > > Move the signaling of no entry masking into the MSI domain flags, as that
> > > > > > > allows setting it on a per-domain basis. Set it for the Xen MSI domain
> > > > > > > that uses the pIRQ chip, while leaving it unset for the rest of the
> > > > > > > cases.
> > > > > > >
> > > > > > > Remove pci_msi_ignore_mask at once, since it was only used by Xen code, and
> > > > > > > with Xen dropping usage the variable is unneeded.
> > > > > > >
> > > > > > > This fixes using devices behind a VMD bridge on Xen PV hardware domains.
> > > > > > >
> > > > > > > Albeit Devices behind a VMD bridge are not known to Xen, that doesn't mean
> > > > > > > Linux cannot use them. By inhibiting the usage of
> > > > > > > VMD_FEAT_CAN_BYPASS_MSI_REMAP and the removal of the pci_msi_ignore_mask
> > > > > > > > bodge devices behind a VMD bridge do work fine when used from a Linux Xen
> > > > > > > hardware domain. That's the whole point of the series.
> > > > > > >
> > > > > > > Signed-off-by: Roger Pau Monné <roger.pau@citrix.com>
> > > > > > > Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
> > > > > > > Acked-by: Juergen Gross <jgross@suse.com>
> > > > > >
> > > > > > Acked-by: Bjorn Helgaas <bhelgaas@google.com>
> > > > > >
> > > > > > I assume you'll merge this series via the Xen tree. Let me know if
> > > > > > otherwise.
> > > > >
> > > > > I've pushed the series to the linux-next branch of the Xen tree.
> > > > >
> > > > >
> > > > > Juergen
> > > >
> > > > This patch landed in latest next-20250324 tag causing this crash:
> > > >
> > > > [ 0.753426] BUG: kernel NULL pointer dereference, address: 0000000000000002
> > > > [ 0.753921] #PF: supervisor read access in kernel mode
> > > > [ 0.754286] #PF: error_code(0x0000) - not-present page
> > > > [ 0.754656] PGD 0 P4D 0
> > > > [ 0.754842] Oops: Oops: 0000 [#1]
> > > > [ 0.755080] CPU: 0 UID: 0 PID: 1 Comm: swapper Not tainted 6.14.0-rc7-next-20250324 #1 NONE
> > > > [ 0.755691] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.16.3-debian-1.16.3-2 04/01/2014
> > > > [ 0.756349] RIP: 0010:msix_prepare_msi_desc+0x39/0x80
> > > > [ 0.756390] Code: 20 c7 46 04 01 00 00 00 8b 56 4c 89 d0 0d 01 01 00 00 66 89 46 4c 8b 8f 64 02 00 00 89 4e 50 48 8b 8f 70 06 00 00 48 89 4e 58 <41> f6 40 02 40 75 2a c1 ea 02 bf 80 00 00 00 21 fa 25 7f ff ff ff
> > > > [ 0.756390] RSP: 0000:ffff8881002a76e0 EFLAGS: 00010202
> > > > [ 0.756390] RAX: 0000000000000101 RBX: ffff88810074d000 RCX: ffffc9000002e000
> > > > [ 0.756390] RDX: 0000000000000000 RSI: ffff8881002a7710 RDI: ffff88810074d000
> > > > [ 0.756390] RBP: ffff8881002a7710 R08: 0000000000000000 R09: ffff8881002a76b4
> > > > [ 0.756390] R10: 000000701000c001 R11: ffffffff82a3dc01 R12: 0000000000000000
> > > > [ 0.756390] R13: 0000000000000005 R14: 0000000000000000 R15: 0000000000000002
> > > > [ 0.756390] FS: 0000000000000000(0000) GS:0000000000000000(0000) knlGS:0000000000000000
> > > > [ 0.756390] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> > > > [ 0.756390] CR2: 0000000000000002 CR3: 0000000002a3d001 CR4: 00000000003706b0
> > > > [ 0.756390] Call Trace:
> > > > [ 0.756390] <TASK>
> > > > [ 0.756390] ? __die_body+0x1b/0x60
> > > > [ 0.756390] ? page_fault_oops+0x2d0/0x310
> > > > [ 0.756390] ? exc_page_fault+0x59/0xc0
> > > > [ 0.756390] ? asm_exc_page_fault+0x22/0x30
> > > > [ 0.756390] ? msix_prepare_msi_desc+0x39/0x80
> > > > [ 0.756390] ? msix_capability_init+0x172/0x2c0
> > > > [ 0.756390] ? __pci_enable_msix_range+0x1a8/0x1d0
> > > > [ 0.756390] ? pci_alloc_irq_vectors_affinity+0x7c/0xf0
> > > > [ 0.756390] ? vp_find_vqs_msix+0x187/0x400
> > > > [ 0.756390] ? vp_find_vqs+0x2f/0x250
> > > > [ 0.756390] ? snprintf+0x3e/0x50
> > > > [ 0.756390] ? vp_modern_find_vqs+0x13/0x60
> > > > [ 0.756390] ? init_vq+0x184/0x1e0
> > > > [ 0.756390] ? vp_get_status+0x20/0x20
> > > > [ 0.756390] ? virtblk_probe+0xeb/0x8d0
> > > > [ 0.756390] ? __kernfs_new_node+0x122/0x160
> > > > [ 0.756390] ? vp_get_status+0x20/0x20
> > > > [ 0.756390] ? virtio_dev_probe+0x171/0x1c0
> > > > [ 0.756390] ? really_probe+0xc2/0x240
> > > > [ 0.756390] ? driver_probe_device+0x1d/0x70
> > > > [ 0.756390] ? __driver_attach+0x96/0xe0
> > > > [ 0.756390] ? driver_attach+0x20/0x20
> > > > [ 0.756390] ? bus_for_each_dev+0x7b/0xb0
> > > > [ 0.756390] ? bus_add_driver+0xe6/0x200
> > > > [ 0.756390] ? driver_register+0x5e/0xf0
> > > > [ 0.756390] ? virtio_blk_init+0x4d/0x90
> > > > [ 0.756390] ? add_boot_memory_block+0x90/0x90
> > > > [ 0.756390] ? do_one_initcall+0xe2/0x250
> > > > [ 0.756390] ? xas_store+0x4b/0x4b0
> > > > [ 0.756390] ? number+0x13b/0x260
> > > > [ 0.756390] ? ida_alloc_range+0x36a/0x3b0
> > > > [ 0.756390] ? parameq+0x13/0x90
> > > > [ 0.756390] ? parse_args+0x10f/0x2a0
> > > > [ 0.756390] ? do_initcall_level+0x83/0xb0
> > > > [ 0.756390] ? do_initcalls+0x43/0x70
> > > > [ 0.756390] ? rest_init+0x80/0x80
> > > > [ 0.756390] ? kernel_init_freeable+0x70/0xb0
> > > > [ 0.756390] ? kernel_init+0x16/0x110
> > > > [ 0.756390] ? ret_from_fork+0x30/0x40
> > > > [ 0.756390] ? rest_init+0x80/0x80
> > > > [ 0.756390] ? ret_from_fork_asm+0x11/0x20
> > > > [ 0.756390] </TASK>
> > > > [ 0.756390] Modules linked in:
> > > > [ 0.756390] CR2: 0000000000000002
> > > > [ 0.756390] ---[ end trace 0000000000000000 ]---
> > > > [ 0.756390] RIP: 0010:msix_prepare_msi_desc+0x39/0x80
> > > > [ 0.756390] Code: 20 c7 46 04 01 00 00 00 8b 56 4c 89 d0 0d 01 01 00 00 66 89 46 4c 8b 8f 64 02 00 00 89 4e 50 48 8b 8f 70 06 00 00 48 89 4e 58 <41> f6 40 02 40 75 2a c1 ea 02 bf 80 00 00 00 21 fa 25 7f ff ff ff
> > > > [ 0.756390] RSP: 0000:ffff8881002a76e0 EFLAGS: 00010202
> > > > [ 0.756390] RAX: 0000000000000101 RBX: ffff88810074d000 RCX: ffffc9000002e000
> > > > [ 0.756390] RDX: 0000000000000000 RSI: ffff8881002a7710 RDI: ffff88810074d000
> > > > [ 0.756390] RBP: ffff8881002a7710 R08: 0000000000000000 R09: ffff8881002a76b4
> > > > [ 0.756390] R10: 000000701000c001 R11: ffffffff82a3dc01 R12: 0000000000000000
> > > > [ 0.756390] R13: 0000000000000005 R14: 0000000000000000 R15: 0000000000000002
> > > > [ 0.756390] FS: 0000000000000000(0000) GS:0000000000000000(0000) knlGS:0000000000000000
> > > > [ 0.756390] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> > > > [ 0.756390] CR2: 0000000000000002 CR3: 0000000002a3d001 CR4: 00000000003706b0
> > > > [ 0.756390] note: swapper[1] exited with irqs disabled
> > > > [ 0.782774] Kernel panic - not syncing: Attempted to kill init! exitcode=0x00000009
> > > > [ 0.783560] Kernel Offset: disabled
> > > > [ 0.783909] ---[ end Kernel panic - not syncing: Attempted to kill init! exitcode=0x00000009 ]---
> > > >
> > > >
> > > > msix_prepare_msi_desc+0x39/0x80:
> > > > msix_prepare_msi_desc at drivers/pci/msi/msi.c:616
> > > > 611 desc->nvec_used = 1;
> > > > 612 desc->pci.msi_attrib.is_msix = 1;
> > > > 613 desc->pci.msi_attrib.is_64 = 1;
> > > > 614 desc->pci.msi_attrib.default_irq = dev->irq;
> > > > 615 desc->pci.mask_base = dev->msix_base;
> > > > >616< desc->pci.msi_attrib.can_mask = !(info->flags & MSI_FLAG_NO_MASK) &&
> > > > 617 !desc->pci.msi_attrib.is_virtual;
> > > > 618
> > > > 619 if (desc->pci.msi_attrib.can_mask) {
> > > > 620 void __iomem *addr = pci_msix_desc_addr(desc);
> > > > 621
> > > >
> > > > Reverting patch 3 fixes the issue.
> > >
> > > Thanks for the report and sorry for the breakage. Do you have a QEMU
> > > command line I can use to try to reproduce this locally?
> > >
> > > Will work on a patch ASAP.
> >
> > Thanks for the quick reply.
> >
> > The issue is that info appears to be uninitialized. So, this worked for me:
>
> Indeed, irq_domain->host_data is NULL, there's no msi_domain_info. As
> this is x86, I was expecting x86 to always use
> x86_init_dev_msi_info(), but that doesn't seem to be the case. I
> would like to better understand this.
>
> > diff --git a/drivers/pci/msi/msi.c b/drivers/pci/msi/msi.c
> > index dcbb4f9ac578..b76c7ec33602 100644
> > --- a/drivers/pci/msi/msi.c
> > +++ b/drivers/pci/msi/msi.c
> > @@ -609,8 +609,10 @@ void msix_prepare_msi_desc(struct pci_dev *dev, struct msi_desc *desc)
> > desc->pci.msi_attrib.is_64 = 1;
> > desc->pci.msi_attrib.default_irq = dev->irq;
> > desc->pci.mask_base = dev->msix_base;
> > - desc->pci.msi_attrib.can_mask = !(info->flags & MSI_FLAG_NO_MASK) &&
> > - !desc->pci.msi_attrib.is_virtual;
> > + desc->pci.msi_attrib.can_mask =
> > + info ? !(info->flags & MSI_FLAG_NO_MASK) &&
> > + !desc->pci.msi_attrib.is_virtual :
> > + 1;
> >
> > if (desc->pci.msi_attrib.can_mask) {
> > void __iomem *addr = pci_msix_desc_addr(desc);
> > @@ -743,7 +745,7 @@ static int msix_capability_init(struct pci_dev *dev, struct msix_entry *entries,
> > /* Disable INTX */
> > pci_intx_for_msi(dev, 0);
> >
> > - if (!(info->flags & MSI_FLAG_NO_MASK)) {
> > + if (info && !(info->flags & MSI_FLAG_NO_MASK)) {
>
> I think this should rather be:
>
> if (!info || !(info->flags & MSI_FLAG_NO_MASK)) {
>
> So that in case of no info the default action is to mask the entries.
>
> > /*
> > * Ensure that all table entries are masked to prevent
> > * stale entries from firing in a crash kernel.
> >
> > I also noticed d (struct irq_domain) can return NULL if CONFIG_GENERIC_MSI_IRQ
> > is not set and we are not checking that either.
> >
> > I run QEMU with vmctl [1]. This is my command:
> >
> > [1] https://github.com/SamsungDS/vmctl
> >
> > /usr/bin/qemu-system-x86_64 \
> > -nodefaults \
> > -display "none" \
> > -machine "q35,accel=kvm,kernel-irqchip=split" \
> > -cpu "host" \
> > -smp "4" \
> > -m "8G" \
> > -device "intel-iommu,intremap=on" \
> > -netdev "user,id=net0,hostfwd=tcp::2222-:22" \
> > -device "virtio-net-pci,netdev=net0" \
> > -device "virtio-rng-pci" \
> > -drive "id=boot,file=file.qcow2,format=qcow2,if=virtio,discard=unmap,media=disk,read-only=no" \
> > -device "pcie-root-port,id=pcie_root_port0,chassis=1,slot=0" \
> > -device "nvme,id=nvme0,serial=deadbeef,bus=pcie_root_port0,mdts=7" \
> > -drive "id=nvm,file=~/nvm.img,format=raw,if=none,discard=unmap,media=disk,read-only=no" \
> > -device "nvme-ns,id=nvm,drive=nvm,bus=nvme0,nsid=1,logical_block_size=4096,physical_block_size=4096" \
> > -pidfile "~/vmctl/confdir/run/nvme/pidfile" \
> > -kernel "~/src/kernel/linux/arch/x86_64/boot/bzImage" \
> > -append "root=/dev/vda1 console=ttyS0,115200 audit=0" \
> > -virtfs "local,path=~/linux,security_model=none,readonly=on,mount_tag=kernel_dir" \
> > -serial "mon:stdio" \
> > -d "guest_errors" \
> > -D "~/vmctl/confdir/log/nvme/qemu.log"
>
> Can you narrow down the command line to the minimum required to
> reproduce the issue?
/usr/bin/qemu-system-x86_64 \
-nodefaults \
-display "none" \
-machine "q35,accel=kvm" \
-cpu "host" \
-drive "id=boot,file=file.qcow2,format=qcow2,if=virtio,discard=unmap,media=disk,read-only=no" \
-kernel "~/src/kernel/linux/arch/x86_64/boot/bzImage" \
-append "root=/dev/vda1 console=ttyS0,115200 audit=0" \
-serial "mon:stdio"
>
> Can you attach the Kconfig used to build the crashing kernel?
I'm using these fragments [1]:
tinyconfig kvm_guest.config virtio-fs.config systemd.config distro.config \
storage.config localauto.config
[1] https://github.com/dkruces/linux-config-fragments/
>
> Thanks, Roger.
© 2016 - 2025 Red Hat, Inc.