When, during boot, we have already correctly determined availability of
the MMCFG access method for a given bus range, there's then no need to
invoke pci_check_extcfg() again for every of the devices. This in
particular avoids ->ext_cfg to transiently indicate the wrong state.
Switch to using Xen style on lines being touched and immediately adjacent
ones.
Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v3: New.
--- a/xen/arch/x86/physdev.c
+++ b/xen/arch/x86/physdev.c
@@ -528,6 +528,8 @@ ret_t do_physdev_op(int cmd, XEN_GUEST_H
if ( !ret )
ret = pci_segment_iterate(info.segment, physdev_check_pci_extcfg,
&info);
+ else if ( ret > 0 ) /* Indication of "no change". */
+ ret = 0;
if ( !ret && has_vpci(currd) && (info.flags & XEN_PCI_MMCFG_RESERVED) )
{
--- a/xen/arch/x86/x86_64/mmconfig.h
+++ b/xen/arch/x86/x86_64/mmconfig.h
@@ -74,6 +74,6 @@ int pci_mmcfg_reserved(uint64_t address,
unsigned int flags);
int pci_mmcfg_arch_init(void);
int pci_mmcfg_arch_enable(unsigned int idx);
-void pci_mmcfg_arch_disable(unsigned int idx);
+int pci_mmcfg_arch_disable(unsigned int idx);
#endif /* X86_64_MMCONFIG_H */
--- a/xen/arch/x86/x86_64/mmconfig-shared.c
+++ b/xen/arch/x86/x86_64/mmconfig-shared.c
@@ -388,8 +388,9 @@ static bool __init pci_mmcfg_reject_brok
(unsigned int)cfg->start_bus_number,
(unsigned int)cfg->end_bus_number);
- if (!is_mmconf_reserved(addr, size, i, cfg) ||
- pci_mmcfg_arch_enable(i)) {
+ if ( !is_mmconf_reserved(addr, size, i, cfg) ||
+ pci_mmcfg_arch_enable(i) < 0 )
+ {
pci_mmcfg_arch_disable(i);
valid = 0;
}
@@ -417,8 +418,8 @@ void __init acpi_mmcfg_init(void)
unsigned int i;
pci_mmcfg_arch_init();
- for (i = 0; i < pci_mmcfg_config_num; ++i)
- if (pci_mmcfg_arch_enable(i))
+ for ( i = 0; i < pci_mmcfg_config_num; ++i )
+ if ( pci_mmcfg_arch_enable(i) < 0 )
valid = 0;
} else {
acpi_table_parse(ACPI_SIG_MCFG, acpi_parse_mcfg);
@@ -458,10 +459,11 @@ int pci_mmcfg_reserved(uint64_t address,
segment, start_bus, end_bus, address, cfg->address);
return -EIO;
}
- if (flags & XEN_PCI_MMCFG_RESERVED)
+
+ if ( flags & XEN_PCI_MMCFG_RESERVED )
return pci_mmcfg_arch_enable(i);
- pci_mmcfg_arch_disable(i);
- return 0;
+
+ return pci_mmcfg_arch_disable(i);
}
}
--- a/xen/arch/x86/x86_64/mmconfig_64.c
+++ b/xen/arch/x86/x86_64/mmconfig_64.c
@@ -138,8 +138,9 @@ int pci_mmcfg_arch_enable(unsigned int i
const typeof(pci_mmcfg_config[0]) *cfg = pci_mmcfg_virt[idx].cfg;
unsigned long start_mfn, end_mfn;
- if (pci_mmcfg_virt[idx].virt)
- return 0;
+ if ( pci_mmcfg_virt[idx].virt )
+ return 1;
+
pci_mmcfg_virt[idx].virt = mcfg_ioremap(cfg, idx, PAGE_HYPERVISOR_UC);
if (!pci_mmcfg_virt[idx].virt) {
printk(KERN_ERR "PCI: Cannot map MCFG aperture for segment %04x\n",
@@ -160,10 +161,13 @@ int pci_mmcfg_arch_enable(unsigned int i
return 0;
}
-void pci_mmcfg_arch_disable(unsigned int idx)
+int pci_mmcfg_arch_disable(unsigned int idx)
{
const typeof(pci_mmcfg_config[0]) *cfg = pci_mmcfg_virt[idx].cfg;
+ if ( !pci_mmcfg_virt[idx].virt )
+ return 1;
+
pci_mmcfg_virt[idx].virt = NULL;
/*
* Don't use destroy_xen_mappings() here, or make sure that at least
@@ -173,6 +177,8 @@ void pci_mmcfg_arch_disable(unsigned int
mcfg_ioremap(cfg, idx, 0);
printk(KERN_WARNING "PCI: Not using MCFG for segment %04x bus %02x-%02x\n",
cfg->pci_segment, cfg->start_bus_number, cfg->end_bus_number);
+
+ return 0;
}
bool pci_mmcfg_decode(unsigned long mfn, unsigned int *seg, unsigned int *bdf)
On 29.01.2026 14:10, Jan Beulich wrote:
> @@ -160,10 +161,13 @@ int pci_mmcfg_arch_enable(unsigned int i
> return 0;
> }
>
> -void pci_mmcfg_arch_disable(unsigned int idx)
> +int pci_mmcfg_arch_disable(unsigned int idx)
> {
> const typeof(pci_mmcfg_config[0]) *cfg = pci_mmcfg_virt[idx].cfg;
>
> + if ( !pci_mmcfg_virt[idx].virt )
> + return 1;
Afaict this is what causes CI (adl-*) to say no here:
(XEN) [ 4.132689] PCI: Using MCFG for segment 0000 bus 00-ff
(XEN) [ 4.132697] ----[ Xen-4.22-unstable x86_64 debug=y ubsan=y Not tainted ]----
(XEN) [ 4.132700] CPU: 12
(XEN) [ 4.132702] RIP: e008:[<ffff82d0405779bd>] pci_mmcfg_read+0x19e/0x1c7
(XEN) [ 4.132708] RFLAGS: 0000000000010286 CONTEXT: hypervisor (d0v0)
(XEN) [ 4.132711] rax: 0000000000300000 rbx: ffff808000300100 rcx: 0000000000000000
(XEN) [ 4.132714] rdx: ffff808000300100 rsi: 0000000000000000 rdi: ffff8304959ffcec
(XEN) [ 4.132716] rbp: ffff8304959ffd18 rsp: ffff8304959ffce8 r8: 0000000000000004
(XEN) [ 4.132718] r9: ffff8304959ffd2c r10: 0000000000000000 r11: 0000000000000000
(XEN) [ 4.132720] r12: 0000000000000100 r13: 0000000000000004 r14: ffff8304959ffd2c
(XEN) [ 4.132723] r15: ffff808000000000 cr0: 0000000080050033 cr4: 0000000000b526e0
(XEN) [ 4.132725] cr3: 0000000492a30000 cr2: ffff808000300100
(XEN) [ 4.132727] fsb: 0000000000000000 gsb: ffff8881b9a00000 gss: 0000000000000000
(XEN) [ 4.132729] ds: 0000 es: 0000 fs: 0000 gs: 0000 ss: e010 cs: e008
(XEN) [ 4.132733] Xen code around <ffff82d0405779bd> (pci_mmcfg_read+0x19e/0x1c7):
(XEN) [ 4.132734] 48 39 d3 72 ea 4c 01 e3 <8b> 03 89 c3 4d 85 f6 74 0d 41 89 1e b8 00 00 00
(XEN) [ 4.132744] Xen stack trace from rsp=ffff8304959ffce8:
(XEN) [ 4.132745] 0000000300000286 ffff830495bd8010 0000000000000003 ffff830495bd8010
(XEN) [ 4.132749] ffff8304959ffdd0 ffff82d0405fa7ef ffff8304959ffd30 ffff82d040576877
(XEN) [ 4.132753] 000000000000000c ffff8304959ffd58 ffff82d04039b81d ffff8304959ffe28
(XEN) [ 4.132756] 0000000000000003 ffff830495bd8010 ffff8304959ffd80 ffff82d0405fa90b
(XEN) [ 4.132760] ffff8304959ffdc8 ffff830495bd8010 ffff830498019650 ffff8304959ffdb8
(XEN) [ 4.132764] ffff82d0403983e0 ffff830498019650 ffff8304959ffe28 ffff82d0405fa7ef
(XEN) [ 4.132767] 0000000000000018 ffffc9004002b900 ffff8304959ffdf8 ffff82d04039feba
(XEN) [ 4.132771] ffff82d0405fa7ef ffff8304959ffe28 0000000000000000 ffffc9004002b900
(XEN) [ 4.132774] 0000000000000000 ffff8304959bb000 ffff8304959ffe78 ffff82d0405ff666
(XEN) [ 4.132778] ffff82d0405713b8 00000000ffffffff 00a0fb0081f456e0 ffff8304959b3010
(XEN) [ 4.132782] 00000000c0000000 00000001ff000000 ffff8304959fff08 0000000000000040
(XEN) [ 4.132785] 000000ec00000001 ffff8304959fff08 ffff8304959a4000 0000000000000021
(XEN) [ 4.132789] 0000000000000000 ffffc9004002b900 ffff8304959ffef8 ffff82d0405711b2
(XEN) [ 4.132792] 0000000000000000 ffff888100456938 ffff8881001470b8 0000000000000018
(XEN) [ 4.132795] 0000000000000000 ffff8304959ffef8 ffff82d0406213f9 ffff8304959a4000
(XEN) [ 4.132799] 0000000000000000 ffff8304959a4000 0000000000000000 0000000000000000
(XEN) [ 4.132802] ffff8304959fffff 0000000000000000 00007cfb6a6000d7 ffff82d0402012d3
(XEN) [ 4.132806] 0000000000000000 00000000ffffffff ffff8881001470b8 ffff888100b88900
(XEN) [ 4.132809] ffffc9004002b900 ffff8881001470b8 0000000000000283 ffff888100456938
(XEN) [ 4.132813] ffff888100065410 0000000000000000 0000000000000021 ffffffff81f7842a
(XEN) [ 4.132816] Xen call trace:
(XEN) [ 4.132819] [<ffff82d0405779bd>] R pci_mmcfg_read+0x19e/0x1c7
(XEN) [ 4.132822] [<ffff82d040576877>] F pci_conf_read32+0x55/0x5e
(XEN) [ 4.132826] [<ffff82d04039b81d>] F pci_check_extcfg+0xb1/0x13b
(XEN) [ 4.132831] [<ffff82d0405fa90b>] F physdev_check_pci_extcfg+0x11c/0x121
(XEN) [ 4.132833] [<ffff82d0403983e0>] F drivers/passthrough/pci.c#iterate_all+0xa2/0xe2
(XEN) [ 4.132836] [<ffff82d04039feba>] F pci_segment_iterate+0x4e/0x74
(XEN) [ 4.132839] [<ffff82d0405ff666>] F do_physdev_op+0x362a/0x4161
(XEN) [ 4.132842] [<ffff82d0405711b2>] F pv_hypercall+0x6be/0x838
(XEN) [ 4.132845] [<ffff82d0402012d3>] F lstar_enter+0x143/0x148
(XEN) [ 4.132847]
(XEN) [ 4.132848] Pagetable walk from ffff808000300100:
(XEN) [ 4.132851] L4[0x101] = 0000000000000000 ffffffffffffffff
There is an important comment in pci_mmcfg_arch_disable():
/*
* Don't use destroy_xen_mappings() here, or make sure that at least
* the necessary L4 entries get populated (so that they get properly
* propagated to guest domains' page tables).
*/
Hence it is wrong to bypass
mcfg_ioremap(cfg, idx, 0);
. v4 coming.
Jan
On 2/2/26 03:51, Jan Beulich wrote:
> On 29.01.2026 14:10, Jan Beulich wrote:
>> @@ -160,10 +161,13 @@ int pci_mmcfg_arch_enable(unsigned int i
>> return 0;
>> }
>>
>> -void pci_mmcfg_arch_disable(unsigned int idx)
>> +int pci_mmcfg_arch_disable(unsigned int idx)
>> {
>> const typeof(pci_mmcfg_config[0]) *cfg = pci_mmcfg_virt[idx].cfg;
>>
>> + if ( !pci_mmcfg_virt[idx].virt )
>> + return 1;
>
> Afaict this is what causes CI (adl-*) to say no here:
>
> (XEN) [ 4.132689] PCI: Using MCFG for segment 0000 bus 00-ff
> (XEN) [ 4.132697] ----[ Xen-4.22-unstable x86_64 debug=y ubsan=y Not tainted ]----
> (XEN) [ 4.132700] CPU: 12
> (XEN) [ 4.132702] RIP: e008:[<ffff82d0405779bd>] pci_mmcfg_read+0x19e/0x1c7
> (XEN) [ 4.132708] RFLAGS: 0000000000010286 CONTEXT: hypervisor (d0v0)
> (XEN) [ 4.132711] rax: 0000000000300000 rbx: ffff808000300100 rcx: 0000000000000000
> (XEN) [ 4.132714] rdx: ffff808000300100 rsi: 0000000000000000 rdi: ffff8304959ffcec
> (XEN) [ 4.132716] rbp: ffff8304959ffd18 rsp: ffff8304959ffce8 r8: 0000000000000004
> (XEN) [ 4.132718] r9: ffff8304959ffd2c r10: 0000000000000000 r11: 0000000000000000
> (XEN) [ 4.132720] r12: 0000000000000100 r13: 0000000000000004 r14: ffff8304959ffd2c
> (XEN) [ 4.132723] r15: ffff808000000000 cr0: 0000000080050033 cr4: 0000000000b526e0
> (XEN) [ 4.132725] cr3: 0000000492a30000 cr2: ffff808000300100
> (XEN) [ 4.132727] fsb: 0000000000000000 gsb: ffff8881b9a00000 gss: 0000000000000000
> (XEN) [ 4.132729] ds: 0000 es: 0000 fs: 0000 gs: 0000 ss: e010 cs: e008
> (XEN) [ 4.132733] Xen code around <ffff82d0405779bd> (pci_mmcfg_read+0x19e/0x1c7):
> (XEN) [ 4.132734] 48 39 d3 72 ea 4c 01 e3 <8b> 03 89 c3 4d 85 f6 74 0d 41 89 1e b8 00 00 00
> (XEN) [ 4.132744] Xen stack trace from rsp=ffff8304959ffce8:
> (XEN) [ 4.132745] 0000000300000286 ffff830495bd8010 0000000000000003 ffff830495bd8010
> (XEN) [ 4.132749] ffff8304959ffdd0 ffff82d0405fa7ef ffff8304959ffd30 ffff82d040576877
> (XEN) [ 4.132753] 000000000000000c ffff8304959ffd58 ffff82d04039b81d ffff8304959ffe28
> (XEN) [ 4.132756] 0000000000000003 ffff830495bd8010 ffff8304959ffd80 ffff82d0405fa90b
> (XEN) [ 4.132760] ffff8304959ffdc8 ffff830495bd8010 ffff830498019650 ffff8304959ffdb8
> (XEN) [ 4.132764] ffff82d0403983e0 ffff830498019650 ffff8304959ffe28 ffff82d0405fa7ef
> (XEN) [ 4.132767] 0000000000000018 ffffc9004002b900 ffff8304959ffdf8 ffff82d04039feba
> (XEN) [ 4.132771] ffff82d0405fa7ef ffff8304959ffe28 0000000000000000 ffffc9004002b900
> (XEN) [ 4.132774] 0000000000000000 ffff8304959bb000 ffff8304959ffe78 ffff82d0405ff666
> (XEN) [ 4.132778] ffff82d0405713b8 00000000ffffffff 00a0fb0081f456e0 ffff8304959b3010
> (XEN) [ 4.132782] 00000000c0000000 00000001ff000000 ffff8304959fff08 0000000000000040
> (XEN) [ 4.132785] 000000ec00000001 ffff8304959fff08 ffff8304959a4000 0000000000000021
> (XEN) [ 4.132789] 0000000000000000 ffffc9004002b900 ffff8304959ffef8 ffff82d0405711b2
> (XEN) [ 4.132792] 0000000000000000 ffff888100456938 ffff8881001470b8 0000000000000018
> (XEN) [ 4.132795] 0000000000000000 ffff8304959ffef8 ffff82d0406213f9 ffff8304959a4000
> (XEN) [ 4.132799] 0000000000000000 ffff8304959a4000 0000000000000000 0000000000000000
> (XEN) [ 4.132802] ffff8304959fffff 0000000000000000 00007cfb6a6000d7 ffff82d0402012d3
> (XEN) [ 4.132806] 0000000000000000 00000000ffffffff ffff8881001470b8 ffff888100b88900
> (XEN) [ 4.132809] ffffc9004002b900 ffff8881001470b8 0000000000000283 ffff888100456938
> (XEN) [ 4.132813] ffff888100065410 0000000000000000 0000000000000021 ffffffff81f7842a
> (XEN) [ 4.132816] Xen call trace:
> (XEN) [ 4.132819] [<ffff82d0405779bd>] R pci_mmcfg_read+0x19e/0x1c7
> (XEN) [ 4.132822] [<ffff82d040576877>] F pci_conf_read32+0x55/0x5e
> (XEN) [ 4.132826] [<ffff82d04039b81d>] F pci_check_extcfg+0xb1/0x13b
> (XEN) [ 4.132831] [<ffff82d0405fa90b>] F physdev_check_pci_extcfg+0x11c/0x121
> (XEN) [ 4.132833] [<ffff82d0403983e0>] F drivers/passthrough/pci.c#iterate_all+0xa2/0xe2
> (XEN) [ 4.132836] [<ffff82d04039feba>] F pci_segment_iterate+0x4e/0x74
> (XEN) [ 4.132839] [<ffff82d0405ff666>] F do_physdev_op+0x362a/0x4161
> (XEN) [ 4.132842] [<ffff82d0405711b2>] F pv_hypercall+0x6be/0x838
> (XEN) [ 4.132845] [<ffff82d0402012d3>] F lstar_enter+0x143/0x148
> (XEN) [ 4.132847]
> (XEN) [ 4.132848] Pagetable walk from ffff808000300100:
> (XEN) [ 4.132851] L4[0x101] = 0000000000000000 ffffffffffffffff
FWIW, I can reproduce this on my test machine.
> There is an important comment in pci_mmcfg_arch_disable():
>
> /*
> * Don't use destroy_xen_mappings() here, or make sure that at least
> * the necessary L4 entries get populated (so that they get properly
> * propagated to guest domains' page tables).
> */
>
> Hence it is wrong to bypass
>
> mcfg_ioremap(cfg, idx, 0);
Indeed, restoring this call resolves it.
On Mon, Feb 02, 2026 at 09:51:18AM +0100, Jan Beulich wrote:
> On 29.01.2026 14:10, Jan Beulich wrote:
> > @@ -160,10 +161,13 @@ int pci_mmcfg_arch_enable(unsigned int i
> > return 0;
> > }
> >
> > -void pci_mmcfg_arch_disable(unsigned int idx)
> > +int pci_mmcfg_arch_disable(unsigned int idx)
> > {
> > const typeof(pci_mmcfg_config[0]) *cfg = pci_mmcfg_virt[idx].cfg;
> >
> > + if ( !pci_mmcfg_virt[idx].virt )
> > + return 1;
>
> Afaict this is what causes CI (adl-*) to say no here:
>
> (XEN) [ 4.132689] PCI: Using MCFG for segment 0000 bus 00-ff
> (XEN) [ 4.132697] ----[ Xen-4.22-unstable x86_64 debug=y ubsan=y Not tainted ]----
> (XEN) [ 4.132700] CPU: 12
> (XEN) [ 4.132702] RIP: e008:[<ffff82d0405779bd>] pci_mmcfg_read+0x19e/0x1c7
> (XEN) [ 4.132708] RFLAGS: 0000000000010286 CONTEXT: hypervisor (d0v0)
> (XEN) [ 4.132711] rax: 0000000000300000 rbx: ffff808000300100 rcx: 0000000000000000
> (XEN) [ 4.132714] rdx: ffff808000300100 rsi: 0000000000000000 rdi: ffff8304959ffcec
> (XEN) [ 4.132716] rbp: ffff8304959ffd18 rsp: ffff8304959ffce8 r8: 0000000000000004
> (XEN) [ 4.132718] r9: ffff8304959ffd2c r10: 0000000000000000 r11: 0000000000000000
> (XEN) [ 4.132720] r12: 0000000000000100 r13: 0000000000000004 r14: ffff8304959ffd2c
> (XEN) [ 4.132723] r15: ffff808000000000 cr0: 0000000080050033 cr4: 0000000000b526e0
> (XEN) [ 4.132725] cr3: 0000000492a30000 cr2: ffff808000300100
> (XEN) [ 4.132727] fsb: 0000000000000000 gsb: ffff8881b9a00000 gss: 0000000000000000
> (XEN) [ 4.132729] ds: 0000 es: 0000 fs: 0000 gs: 0000 ss: e010 cs: e008
> (XEN) [ 4.132733] Xen code around <ffff82d0405779bd> (pci_mmcfg_read+0x19e/0x1c7):
> (XEN) [ 4.132734] 48 39 d3 72 ea 4c 01 e3 <8b> 03 89 c3 4d 85 f6 74 0d 41 89 1e b8 00 00 00
> (XEN) [ 4.132744] Xen stack trace from rsp=ffff8304959ffce8:
> (XEN) [ 4.132745] 0000000300000286 ffff830495bd8010 0000000000000003 ffff830495bd8010
> (XEN) [ 4.132749] ffff8304959ffdd0 ffff82d0405fa7ef ffff8304959ffd30 ffff82d040576877
> (XEN) [ 4.132753] 000000000000000c ffff8304959ffd58 ffff82d04039b81d ffff8304959ffe28
> (XEN) [ 4.132756] 0000000000000003 ffff830495bd8010 ffff8304959ffd80 ffff82d0405fa90b
> (XEN) [ 4.132760] ffff8304959ffdc8 ffff830495bd8010 ffff830498019650 ffff8304959ffdb8
> (XEN) [ 4.132764] ffff82d0403983e0 ffff830498019650 ffff8304959ffe28 ffff82d0405fa7ef
> (XEN) [ 4.132767] 0000000000000018 ffffc9004002b900 ffff8304959ffdf8 ffff82d04039feba
> (XEN) [ 4.132771] ffff82d0405fa7ef ffff8304959ffe28 0000000000000000 ffffc9004002b900
> (XEN) [ 4.132774] 0000000000000000 ffff8304959bb000 ffff8304959ffe78 ffff82d0405ff666
> (XEN) [ 4.132778] ffff82d0405713b8 00000000ffffffff 00a0fb0081f456e0 ffff8304959b3010
> (XEN) [ 4.132782] 00000000c0000000 00000001ff000000 ffff8304959fff08 0000000000000040
> (XEN) [ 4.132785] 000000ec00000001 ffff8304959fff08 ffff8304959a4000 0000000000000021
> (XEN) [ 4.132789] 0000000000000000 ffffc9004002b900 ffff8304959ffef8 ffff82d0405711b2
> (XEN) [ 4.132792] 0000000000000000 ffff888100456938 ffff8881001470b8 0000000000000018
> (XEN) [ 4.132795] 0000000000000000 ffff8304959ffef8 ffff82d0406213f9 ffff8304959a4000
> (XEN) [ 4.132799] 0000000000000000 ffff8304959a4000 0000000000000000 0000000000000000
> (XEN) [ 4.132802] ffff8304959fffff 0000000000000000 00007cfb6a6000d7 ffff82d0402012d3
> (XEN) [ 4.132806] 0000000000000000 00000000ffffffff ffff8881001470b8 ffff888100b88900
> (XEN) [ 4.132809] ffffc9004002b900 ffff8881001470b8 0000000000000283 ffff888100456938
> (XEN) [ 4.132813] ffff888100065410 0000000000000000 0000000000000021 ffffffff81f7842a
> (XEN) [ 4.132816] Xen call trace:
> (XEN) [ 4.132819] [<ffff82d0405779bd>] R pci_mmcfg_read+0x19e/0x1c7
> (XEN) [ 4.132822] [<ffff82d040576877>] F pci_conf_read32+0x55/0x5e
> (XEN) [ 4.132826] [<ffff82d04039b81d>] F pci_check_extcfg+0xb1/0x13b
> (XEN) [ 4.132831] [<ffff82d0405fa90b>] F physdev_check_pci_extcfg+0x11c/0x121
> (XEN) [ 4.132833] [<ffff82d0403983e0>] F drivers/passthrough/pci.c#iterate_all+0xa2/0xe2
> (XEN) [ 4.132836] [<ffff82d04039feba>] F pci_segment_iterate+0x4e/0x74
> (XEN) [ 4.132839] [<ffff82d0405ff666>] F do_physdev_op+0x362a/0x4161
> (XEN) [ 4.132842] [<ffff82d0405711b2>] F pv_hypercall+0x6be/0x838
> (XEN) [ 4.132845] [<ffff82d0402012d3>] F lstar_enter+0x143/0x148
> (XEN) [ 4.132847]
> (XEN) [ 4.132848] Pagetable walk from ffff808000300100:
> (XEN) [ 4.132851] L4[0x101] = 0000000000000000 ffffffffffffffff
>
> There is an important comment in pci_mmcfg_arch_disable():
>
> /*
> * Don't use destroy_xen_mappings() here, or make sure that at least
> * the necessary L4 entries get populated (so that they get properly
> * propagated to guest domains' page tables).
> */
>
> Hence it is wrong to bypass
>
> mcfg_ioremap(cfg, idx, 0);
Hm, I see. The L4 slot must be unconditionally populated before we
clone the idle page-table, otherwise the mappings won't propagate.
What about unconditionally populating the L4 slot in
subarch_init_memory()? That seems less fragile than doing it in
pci_mmcfg_arch_disable().
Thanks, Roger.
On 02.02.2026 10:14, Roger Pau Monné wrote:
> On Mon, Feb 02, 2026 at 09:51:18AM +0100, Jan Beulich wrote:
>> On 29.01.2026 14:10, Jan Beulich wrote:
>>> @@ -160,10 +161,13 @@ int pci_mmcfg_arch_enable(unsigned int i
>>> return 0;
>>> }
>>>
>>> -void pci_mmcfg_arch_disable(unsigned int idx)
>>> +int pci_mmcfg_arch_disable(unsigned int idx)
>>> {
>>> const typeof(pci_mmcfg_config[0]) *cfg = pci_mmcfg_virt[idx].cfg;
>>>
>>> + if ( !pci_mmcfg_virt[idx].virt )
>>> + return 1;
>>
>> Afaict this is what causes CI (adl-*) to say no here:
>>
>> (XEN) [ 4.132689] PCI: Using MCFG for segment 0000 bus 00-ff
>> (XEN) [ 4.132697] ----[ Xen-4.22-unstable x86_64 debug=y ubsan=y Not tainted ]----
>> (XEN) [ 4.132700] CPU: 12
>> (XEN) [ 4.132702] RIP: e008:[<ffff82d0405779bd>] pci_mmcfg_read+0x19e/0x1c7
>> (XEN) [ 4.132708] RFLAGS: 0000000000010286 CONTEXT: hypervisor (d0v0)
>> (XEN) [ 4.132711] rax: 0000000000300000 rbx: ffff808000300100 rcx: 0000000000000000
>> (XEN) [ 4.132714] rdx: ffff808000300100 rsi: 0000000000000000 rdi: ffff8304959ffcec
>> (XEN) [ 4.132716] rbp: ffff8304959ffd18 rsp: ffff8304959ffce8 r8: 0000000000000004
>> (XEN) [ 4.132718] r9: ffff8304959ffd2c r10: 0000000000000000 r11: 0000000000000000
>> (XEN) [ 4.132720] r12: 0000000000000100 r13: 0000000000000004 r14: ffff8304959ffd2c
>> (XEN) [ 4.132723] r15: ffff808000000000 cr0: 0000000080050033 cr4: 0000000000b526e0
>> (XEN) [ 4.132725] cr3: 0000000492a30000 cr2: ffff808000300100
>> (XEN) [ 4.132727] fsb: 0000000000000000 gsb: ffff8881b9a00000 gss: 0000000000000000
>> (XEN) [ 4.132729] ds: 0000 es: 0000 fs: 0000 gs: 0000 ss: e010 cs: e008
>> (XEN) [ 4.132733] Xen code around <ffff82d0405779bd> (pci_mmcfg_read+0x19e/0x1c7):
>> (XEN) [ 4.132734] 48 39 d3 72 ea 4c 01 e3 <8b> 03 89 c3 4d 85 f6 74 0d 41 89 1e b8 00 00 00
>> (XEN) [ 4.132744] Xen stack trace from rsp=ffff8304959ffce8:
>> (XEN) [ 4.132745] 0000000300000286 ffff830495bd8010 0000000000000003 ffff830495bd8010
>> (XEN) [ 4.132749] ffff8304959ffdd0 ffff82d0405fa7ef ffff8304959ffd30 ffff82d040576877
>> (XEN) [ 4.132753] 000000000000000c ffff8304959ffd58 ffff82d04039b81d ffff8304959ffe28
>> (XEN) [ 4.132756] 0000000000000003 ffff830495bd8010 ffff8304959ffd80 ffff82d0405fa90b
>> (XEN) [ 4.132760] ffff8304959ffdc8 ffff830495bd8010 ffff830498019650 ffff8304959ffdb8
>> (XEN) [ 4.132764] ffff82d0403983e0 ffff830498019650 ffff8304959ffe28 ffff82d0405fa7ef
>> (XEN) [ 4.132767] 0000000000000018 ffffc9004002b900 ffff8304959ffdf8 ffff82d04039feba
>> (XEN) [ 4.132771] ffff82d0405fa7ef ffff8304959ffe28 0000000000000000 ffffc9004002b900
>> (XEN) [ 4.132774] 0000000000000000 ffff8304959bb000 ffff8304959ffe78 ffff82d0405ff666
>> (XEN) [ 4.132778] ffff82d0405713b8 00000000ffffffff 00a0fb0081f456e0 ffff8304959b3010
>> (XEN) [ 4.132782] 00000000c0000000 00000001ff000000 ffff8304959fff08 0000000000000040
>> (XEN) [ 4.132785] 000000ec00000001 ffff8304959fff08 ffff8304959a4000 0000000000000021
>> (XEN) [ 4.132789] 0000000000000000 ffffc9004002b900 ffff8304959ffef8 ffff82d0405711b2
>> (XEN) [ 4.132792] 0000000000000000 ffff888100456938 ffff8881001470b8 0000000000000018
>> (XEN) [ 4.132795] 0000000000000000 ffff8304959ffef8 ffff82d0406213f9 ffff8304959a4000
>> (XEN) [ 4.132799] 0000000000000000 ffff8304959a4000 0000000000000000 0000000000000000
>> (XEN) [ 4.132802] ffff8304959fffff 0000000000000000 00007cfb6a6000d7 ffff82d0402012d3
>> (XEN) [ 4.132806] 0000000000000000 00000000ffffffff ffff8881001470b8 ffff888100b88900
>> (XEN) [ 4.132809] ffffc9004002b900 ffff8881001470b8 0000000000000283 ffff888100456938
>> (XEN) [ 4.132813] ffff888100065410 0000000000000000 0000000000000021 ffffffff81f7842a
>> (XEN) [ 4.132816] Xen call trace:
>> (XEN) [ 4.132819] [<ffff82d0405779bd>] R pci_mmcfg_read+0x19e/0x1c7
>> (XEN) [ 4.132822] [<ffff82d040576877>] F pci_conf_read32+0x55/0x5e
>> (XEN) [ 4.132826] [<ffff82d04039b81d>] F pci_check_extcfg+0xb1/0x13b
>> (XEN) [ 4.132831] [<ffff82d0405fa90b>] F physdev_check_pci_extcfg+0x11c/0x121
>> (XEN) [ 4.132833] [<ffff82d0403983e0>] F drivers/passthrough/pci.c#iterate_all+0xa2/0xe2
>> (XEN) [ 4.132836] [<ffff82d04039feba>] F pci_segment_iterate+0x4e/0x74
>> (XEN) [ 4.132839] [<ffff82d0405ff666>] F do_physdev_op+0x362a/0x4161
>> (XEN) [ 4.132842] [<ffff82d0405711b2>] F pv_hypercall+0x6be/0x838
>> (XEN) [ 4.132845] [<ffff82d0402012d3>] F lstar_enter+0x143/0x148
>> (XEN) [ 4.132847]
>> (XEN) [ 4.132848] Pagetable walk from ffff808000300100:
>> (XEN) [ 4.132851] L4[0x101] = 0000000000000000 ffffffffffffffff
>>
>> There is an important comment in pci_mmcfg_arch_disable():
>>
>> /*
>> * Don't use destroy_xen_mappings() here, or make sure that at least
>> * the necessary L4 entries get populated (so that they get properly
>> * propagated to guest domains' page tables).
>> */
>>
>> Hence it is wrong to bypass
>>
>> mcfg_ioremap(cfg, idx, 0);
>
> Hm, I see. The L4 slot must be unconditionally populated before we
> clone the idle page-table, otherwise the mappings won't propagate.
>
> What about unconditionally populating the L4 slot in
> subarch_init_memory()? That seems less fragile than doing it in
> pci_mmcfg_arch_disable().
Less fragile - perhaps. Yet I don't see why we should populate the field if
we wouldn't ever need it. Of course with violating layering some, we could
know in subarch_init_memory(), as pci_setup() runs earlier.
Jan
On Mon, Feb 02, 2026 at 10:30:29AM +0100, Jan Beulich wrote:
> On 02.02.2026 10:14, Roger Pau Monné wrote:
> > On Mon, Feb 02, 2026 at 09:51:18AM +0100, Jan Beulich wrote:
> >> On 29.01.2026 14:10, Jan Beulich wrote:
> >>> @@ -160,10 +161,13 @@ int pci_mmcfg_arch_enable(unsigned int i
> >>> return 0;
> >>> }
> >>>
> >>> -void pci_mmcfg_arch_disable(unsigned int idx)
> >>> +int pci_mmcfg_arch_disable(unsigned int idx)
> >>> {
> >>> const typeof(pci_mmcfg_config[0]) *cfg = pci_mmcfg_virt[idx].cfg;
> >>>
> >>> + if ( !pci_mmcfg_virt[idx].virt )
> >>> + return 1;
> >>
> >> Afaict this is what causes CI (adl-*) to say no here:
> >>
> >> (XEN) [ 4.132689] PCI: Using MCFG for segment 0000 bus 00-ff
> >> (XEN) [ 4.132697] ----[ Xen-4.22-unstable x86_64 debug=y ubsan=y Not tainted ]----
> >> (XEN) [ 4.132700] CPU: 12
> >> (XEN) [ 4.132702] RIP: e008:[<ffff82d0405779bd>] pci_mmcfg_read+0x19e/0x1c7
> >> (XEN) [ 4.132708] RFLAGS: 0000000000010286 CONTEXT: hypervisor (d0v0)
> >> (XEN) [ 4.132711] rax: 0000000000300000 rbx: ffff808000300100 rcx: 0000000000000000
> >> (XEN) [ 4.132714] rdx: ffff808000300100 rsi: 0000000000000000 rdi: ffff8304959ffcec
> >> (XEN) [ 4.132716] rbp: ffff8304959ffd18 rsp: ffff8304959ffce8 r8: 0000000000000004
> >> (XEN) [ 4.132718] r9: ffff8304959ffd2c r10: 0000000000000000 r11: 0000000000000000
> >> (XEN) [ 4.132720] r12: 0000000000000100 r13: 0000000000000004 r14: ffff8304959ffd2c
> >> (XEN) [ 4.132723] r15: ffff808000000000 cr0: 0000000080050033 cr4: 0000000000b526e0
> >> (XEN) [ 4.132725] cr3: 0000000492a30000 cr2: ffff808000300100
> >> (XEN) [ 4.132727] fsb: 0000000000000000 gsb: ffff8881b9a00000 gss: 0000000000000000
> >> (XEN) [ 4.132729] ds: 0000 es: 0000 fs: 0000 gs: 0000 ss: e010 cs: e008
> >> (XEN) [ 4.132733] Xen code around <ffff82d0405779bd> (pci_mmcfg_read+0x19e/0x1c7):
> >> (XEN) [ 4.132734] 48 39 d3 72 ea 4c 01 e3 <8b> 03 89 c3 4d 85 f6 74 0d 41 89 1e b8 00 00 00
> >> (XEN) [ 4.132744] Xen stack trace from rsp=ffff8304959ffce8:
> >> (XEN) [ 4.132745] 0000000300000286 ffff830495bd8010 0000000000000003 ffff830495bd8010
> >> (XEN) [ 4.132749] ffff8304959ffdd0 ffff82d0405fa7ef ffff8304959ffd30 ffff82d040576877
> >> (XEN) [ 4.132753] 000000000000000c ffff8304959ffd58 ffff82d04039b81d ffff8304959ffe28
> >> (XEN) [ 4.132756] 0000000000000003 ffff830495bd8010 ffff8304959ffd80 ffff82d0405fa90b
> >> (XEN) [ 4.132760] ffff8304959ffdc8 ffff830495bd8010 ffff830498019650 ffff8304959ffdb8
> >> (XEN) [ 4.132764] ffff82d0403983e0 ffff830498019650 ffff8304959ffe28 ffff82d0405fa7ef
> >> (XEN) [ 4.132767] 0000000000000018 ffffc9004002b900 ffff8304959ffdf8 ffff82d04039feba
> >> (XEN) [ 4.132771] ffff82d0405fa7ef ffff8304959ffe28 0000000000000000 ffffc9004002b900
> >> (XEN) [ 4.132774] 0000000000000000 ffff8304959bb000 ffff8304959ffe78 ffff82d0405ff666
> >> (XEN) [ 4.132778] ffff82d0405713b8 00000000ffffffff 00a0fb0081f456e0 ffff8304959b3010
> >> (XEN) [ 4.132782] 00000000c0000000 00000001ff000000 ffff8304959fff08 0000000000000040
> >> (XEN) [ 4.132785] 000000ec00000001 ffff8304959fff08 ffff8304959a4000 0000000000000021
> >> (XEN) [ 4.132789] 0000000000000000 ffffc9004002b900 ffff8304959ffef8 ffff82d0405711b2
> >> (XEN) [ 4.132792] 0000000000000000 ffff888100456938 ffff8881001470b8 0000000000000018
> >> (XEN) [ 4.132795] 0000000000000000 ffff8304959ffef8 ffff82d0406213f9 ffff8304959a4000
> >> (XEN) [ 4.132799] 0000000000000000 ffff8304959a4000 0000000000000000 0000000000000000
> >> (XEN) [ 4.132802] ffff8304959fffff 0000000000000000 00007cfb6a6000d7 ffff82d0402012d3
> >> (XEN) [ 4.132806] 0000000000000000 00000000ffffffff ffff8881001470b8 ffff888100b88900
> >> (XEN) [ 4.132809] ffffc9004002b900 ffff8881001470b8 0000000000000283 ffff888100456938
> >> (XEN) [ 4.132813] ffff888100065410 0000000000000000 0000000000000021 ffffffff81f7842a
> >> (XEN) [ 4.132816] Xen call trace:
> >> (XEN) [ 4.132819] [<ffff82d0405779bd>] R pci_mmcfg_read+0x19e/0x1c7
> >> (XEN) [ 4.132822] [<ffff82d040576877>] F pci_conf_read32+0x55/0x5e
> >> (XEN) [ 4.132826] [<ffff82d04039b81d>] F pci_check_extcfg+0xb1/0x13b
> >> (XEN) [ 4.132831] [<ffff82d0405fa90b>] F physdev_check_pci_extcfg+0x11c/0x121
> >> (XEN) [ 4.132833] [<ffff82d0403983e0>] F drivers/passthrough/pci.c#iterate_all+0xa2/0xe2
> >> (XEN) [ 4.132836] [<ffff82d04039feba>] F pci_segment_iterate+0x4e/0x74
> >> (XEN) [ 4.132839] [<ffff82d0405ff666>] F do_physdev_op+0x362a/0x4161
> >> (XEN) [ 4.132842] [<ffff82d0405711b2>] F pv_hypercall+0x6be/0x838
> >> (XEN) [ 4.132845] [<ffff82d0402012d3>] F lstar_enter+0x143/0x148
> >> (XEN) [ 4.132847]
> >> (XEN) [ 4.132848] Pagetable walk from ffff808000300100:
> >> (XEN) [ 4.132851] L4[0x101] = 0000000000000000 ffffffffffffffff
> >>
> >> There is an important comment in pci_mmcfg_arch_disable():
> >>
> >> /*
> >> * Don't use destroy_xen_mappings() here, or make sure that at least
> >> * the necessary L4 entries get populated (so that they get properly
> >> * propagated to guest domains' page tables).
> >> */
> >>
> >> Hence it is wrong to bypass
> >>
> >> mcfg_ioremap(cfg, idx, 0);
> >
> > Hm, I see. The L4 slot must be unconditionally populated before we
> > clone the idle page-table, otherwise the mappings won't propagate.
> >
> > What about unconditionally populating the L4 slot in
> > subarch_init_memory()? That seems less fragile than doing it in
> > pci_mmcfg_arch_disable().
>
> Less fragile - perhaps. Yet I don't see why we should populate the field if
> we wouldn't ever need it. Of course with violating layering some, we could
> know in subarch_init_memory(), as pci_setup() runs earlier.
How can Xen be sure at setup time that the slot will never be used?
The MMCFG could be empty initially when parsed by Xen, but MMCFG
regions might appear as a result of AML method execution (_CBA and
_CRS methods in hotplug host bridges).
Thanks, Roger.
On 02.02.2026 11:13, Roger Pau Monné wrote:
> On Mon, Feb 02, 2026 at 10:30:29AM +0100, Jan Beulich wrote:
>> On 02.02.2026 10:14, Roger Pau Monné wrote:
>>> On Mon, Feb 02, 2026 at 09:51:18AM +0100, Jan Beulich wrote:
>>>> On 29.01.2026 14:10, Jan Beulich wrote:
>>>>> @@ -160,10 +161,13 @@ int pci_mmcfg_arch_enable(unsigned int i
>>>>> return 0;
>>>>> }
>>>>>
>>>>> -void pci_mmcfg_arch_disable(unsigned int idx)
>>>>> +int pci_mmcfg_arch_disable(unsigned int idx)
>>>>> {
>>>>> const typeof(pci_mmcfg_config[0]) *cfg = pci_mmcfg_virt[idx].cfg;
>>>>>
>>>>> + if ( !pci_mmcfg_virt[idx].virt )
>>>>> + return 1;
>>>>
>>>> Afaict this is what causes CI (adl-*) to say no here:
>>>>
>>>> (XEN) [ 4.132689] PCI: Using MCFG for segment 0000 bus 00-ff
>>>> (XEN) [ 4.132697] ----[ Xen-4.22-unstable x86_64 debug=y ubsan=y Not tainted ]----
>>>> (XEN) [ 4.132700] CPU: 12
>>>> (XEN) [ 4.132702] RIP: e008:[<ffff82d0405779bd>] pci_mmcfg_read+0x19e/0x1c7
>>>> (XEN) [ 4.132708] RFLAGS: 0000000000010286 CONTEXT: hypervisor (d0v0)
>>>> (XEN) [ 4.132711] rax: 0000000000300000 rbx: ffff808000300100 rcx: 0000000000000000
>>>> (XEN) [ 4.132714] rdx: ffff808000300100 rsi: 0000000000000000 rdi: ffff8304959ffcec
>>>> (XEN) [ 4.132716] rbp: ffff8304959ffd18 rsp: ffff8304959ffce8 r8: 0000000000000004
>>>> (XEN) [ 4.132718] r9: ffff8304959ffd2c r10: 0000000000000000 r11: 0000000000000000
>>>> (XEN) [ 4.132720] r12: 0000000000000100 r13: 0000000000000004 r14: ffff8304959ffd2c
>>>> (XEN) [ 4.132723] r15: ffff808000000000 cr0: 0000000080050033 cr4: 0000000000b526e0
>>>> (XEN) [ 4.132725] cr3: 0000000492a30000 cr2: ffff808000300100
>>>> (XEN) [ 4.132727] fsb: 0000000000000000 gsb: ffff8881b9a00000 gss: 0000000000000000
>>>> (XEN) [ 4.132729] ds: 0000 es: 0000 fs: 0000 gs: 0000 ss: e010 cs: e008
>>>> (XEN) [ 4.132733] Xen code around <ffff82d0405779bd> (pci_mmcfg_read+0x19e/0x1c7):
>>>> (XEN) [ 4.132734] 48 39 d3 72 ea 4c 01 e3 <8b> 03 89 c3 4d 85 f6 74 0d 41 89 1e b8 00 00 00
>>>> (XEN) [ 4.132744] Xen stack trace from rsp=ffff8304959ffce8:
>>>> (XEN) [ 4.132745] 0000000300000286 ffff830495bd8010 0000000000000003 ffff830495bd8010
>>>> (XEN) [ 4.132749] ffff8304959ffdd0 ffff82d0405fa7ef ffff8304959ffd30 ffff82d040576877
>>>> (XEN) [ 4.132753] 000000000000000c ffff8304959ffd58 ffff82d04039b81d ffff8304959ffe28
>>>> (XEN) [ 4.132756] 0000000000000003 ffff830495bd8010 ffff8304959ffd80 ffff82d0405fa90b
>>>> (XEN) [ 4.132760] ffff8304959ffdc8 ffff830495bd8010 ffff830498019650 ffff8304959ffdb8
>>>> (XEN) [ 4.132764] ffff82d0403983e0 ffff830498019650 ffff8304959ffe28 ffff82d0405fa7ef
>>>> (XEN) [ 4.132767] 0000000000000018 ffffc9004002b900 ffff8304959ffdf8 ffff82d04039feba
>>>> (XEN) [ 4.132771] ffff82d0405fa7ef ffff8304959ffe28 0000000000000000 ffffc9004002b900
>>>> (XEN) [ 4.132774] 0000000000000000 ffff8304959bb000 ffff8304959ffe78 ffff82d0405ff666
>>>> (XEN) [ 4.132778] ffff82d0405713b8 00000000ffffffff 00a0fb0081f456e0 ffff8304959b3010
>>>> (XEN) [ 4.132782] 00000000c0000000 00000001ff000000 ffff8304959fff08 0000000000000040
>>>> (XEN) [ 4.132785] 000000ec00000001 ffff8304959fff08 ffff8304959a4000 0000000000000021
>>>> (XEN) [ 4.132789] 0000000000000000 ffffc9004002b900 ffff8304959ffef8 ffff82d0405711b2
>>>> (XEN) [ 4.132792] 0000000000000000 ffff888100456938 ffff8881001470b8 0000000000000018
>>>> (XEN) [ 4.132795] 0000000000000000 ffff8304959ffef8 ffff82d0406213f9 ffff8304959a4000
>>>> (XEN) [ 4.132799] 0000000000000000 ffff8304959a4000 0000000000000000 0000000000000000
>>>> (XEN) [ 4.132802] ffff8304959fffff 0000000000000000 00007cfb6a6000d7 ffff82d0402012d3
>>>> (XEN) [ 4.132806] 0000000000000000 00000000ffffffff ffff8881001470b8 ffff888100b88900
>>>> (XEN) [ 4.132809] ffffc9004002b900 ffff8881001470b8 0000000000000283 ffff888100456938
>>>> (XEN) [ 4.132813] ffff888100065410 0000000000000000 0000000000000021 ffffffff81f7842a
>>>> (XEN) [ 4.132816] Xen call trace:
>>>> (XEN) [ 4.132819] [<ffff82d0405779bd>] R pci_mmcfg_read+0x19e/0x1c7
>>>> (XEN) [ 4.132822] [<ffff82d040576877>] F pci_conf_read32+0x55/0x5e
>>>> (XEN) [ 4.132826] [<ffff82d04039b81d>] F pci_check_extcfg+0xb1/0x13b
>>>> (XEN) [ 4.132831] [<ffff82d0405fa90b>] F physdev_check_pci_extcfg+0x11c/0x121
>>>> (XEN) [ 4.132833] [<ffff82d0403983e0>] F drivers/passthrough/pci.c#iterate_all+0xa2/0xe2
>>>> (XEN) [ 4.132836] [<ffff82d04039feba>] F pci_segment_iterate+0x4e/0x74
>>>> (XEN) [ 4.132839] [<ffff82d0405ff666>] F do_physdev_op+0x362a/0x4161
>>>> (XEN) [ 4.132842] [<ffff82d0405711b2>] F pv_hypercall+0x6be/0x838
>>>> (XEN) [ 4.132845] [<ffff82d0402012d3>] F lstar_enter+0x143/0x148
>>>> (XEN) [ 4.132847]
>>>> (XEN) [ 4.132848] Pagetable walk from ffff808000300100:
>>>> (XEN) [ 4.132851] L4[0x101] = 0000000000000000 ffffffffffffffff
>>>>
>>>> There is an important comment in pci_mmcfg_arch_disable():
>>>>
>>>> /*
>>>> * Don't use destroy_xen_mappings() here, or make sure that at least
>>>> * the necessary L4 entries get populated (so that they get properly
>>>> * propagated to guest domains' page tables).
>>>> */
>>>>
>>>> Hence it is wrong to bypass
>>>>
>>>> mcfg_ioremap(cfg, idx, 0);
>>>
>>> Hm, I see. The L4 slot must be unconditionally populated before we
>>> clone the idle page-table, otherwise the mappings won't propagate.
>>>
>>> What about unconditionally populating the L4 slot in
>>> subarch_init_memory()? That seems less fragile than doing it in
>>> pci_mmcfg_arch_disable().
>>
>> Less fragile - perhaps. Yet I don't see why we should populate the field if
>> we wouldn't ever need it. Of course with violating layering some, we could
>> know in subarch_init_memory(), as pci_setup() runs earlier.
>
> How can Xen be sure at setup time that the slot will never be used?
> The MMCFG could be empty initially when parsed by Xen, but MMCFG
> regions might appear as a result of AML method execution (_CBA and
> _CRS methods in hotplug host bridges).
Their usability may change, but how many of them there are (going to be) we
need to know at boot time. See how pci_mmcfg_config_num, pci_mmcfg_config[],
and pci_mmcfg_virt[] are initialized (all by __init functions). If regions
could truly appear "out of the blue", we'd have a bigger problem.
Jan
On Mon, Feb 02, 2026 at 03:40:39PM +0100, Jan Beulich wrote:
> On 02.02.2026 11:13, Roger Pau Monné wrote:
> > On Mon, Feb 02, 2026 at 10:30:29AM +0100, Jan Beulich wrote:
> >> On 02.02.2026 10:14, Roger Pau Monné wrote:
> >>> On Mon, Feb 02, 2026 at 09:51:18AM +0100, Jan Beulich wrote:
> >>>> On 29.01.2026 14:10, Jan Beulich wrote:
> >>>>> @@ -160,10 +161,13 @@ int pci_mmcfg_arch_enable(unsigned int i
> >>>>> return 0;
> >>>>> }
> >>>>>
> >>>>> -void pci_mmcfg_arch_disable(unsigned int idx)
> >>>>> +int pci_mmcfg_arch_disable(unsigned int idx)
> >>>>> {
> >>>>> const typeof(pci_mmcfg_config[0]) *cfg = pci_mmcfg_virt[idx].cfg;
> >>>>>
> >>>>> + if ( !pci_mmcfg_virt[idx].virt )
> >>>>> + return 1;
> >>>>
> >>>> Afaict this is what causes CI (adl-*) to say no here:
> >>>>
> >>>> (XEN) [ 4.132689] PCI: Using MCFG for segment 0000 bus 00-ff
> >>>> (XEN) [ 4.132697] ----[ Xen-4.22-unstable x86_64 debug=y ubsan=y Not tainted ]----
> >>>> (XEN) [ 4.132700] CPU: 12
> >>>> (XEN) [ 4.132702] RIP: e008:[<ffff82d0405779bd>] pci_mmcfg_read+0x19e/0x1c7
> >>>> (XEN) [ 4.132708] RFLAGS: 0000000000010286 CONTEXT: hypervisor (d0v0)
> >>>> (XEN) [ 4.132711] rax: 0000000000300000 rbx: ffff808000300100 rcx: 0000000000000000
> >>>> (XEN) [ 4.132714] rdx: ffff808000300100 rsi: 0000000000000000 rdi: ffff8304959ffcec
> >>>> (XEN) [ 4.132716] rbp: ffff8304959ffd18 rsp: ffff8304959ffce8 r8: 0000000000000004
> >>>> (XEN) [ 4.132718] r9: ffff8304959ffd2c r10: 0000000000000000 r11: 0000000000000000
> >>>> (XEN) [ 4.132720] r12: 0000000000000100 r13: 0000000000000004 r14: ffff8304959ffd2c
> >>>> (XEN) [ 4.132723] r15: ffff808000000000 cr0: 0000000080050033 cr4: 0000000000b526e0
> >>>> (XEN) [ 4.132725] cr3: 0000000492a30000 cr2: ffff808000300100
> >>>> (XEN) [ 4.132727] fsb: 0000000000000000 gsb: ffff8881b9a00000 gss: 0000000000000000
> >>>> (XEN) [ 4.132729] ds: 0000 es: 0000 fs: 0000 gs: 0000 ss: e010 cs: e008
> >>>> (XEN) [ 4.132733] Xen code around <ffff82d0405779bd> (pci_mmcfg_read+0x19e/0x1c7):
> >>>> (XEN) [ 4.132734] 48 39 d3 72 ea 4c 01 e3 <8b> 03 89 c3 4d 85 f6 74 0d 41 89 1e b8 00 00 00
> >>>> (XEN) [ 4.132744] Xen stack trace from rsp=ffff8304959ffce8:
> >>>> (XEN) [ 4.132745] 0000000300000286 ffff830495bd8010 0000000000000003 ffff830495bd8010
> >>>> (XEN) [ 4.132749] ffff8304959ffdd0 ffff82d0405fa7ef ffff8304959ffd30 ffff82d040576877
> >>>> (XEN) [ 4.132753] 000000000000000c ffff8304959ffd58 ffff82d04039b81d ffff8304959ffe28
> >>>> (XEN) [ 4.132756] 0000000000000003 ffff830495bd8010 ffff8304959ffd80 ffff82d0405fa90b
> >>>> (XEN) [ 4.132760] ffff8304959ffdc8 ffff830495bd8010 ffff830498019650 ffff8304959ffdb8
> >>>> (XEN) [ 4.132764] ffff82d0403983e0 ffff830498019650 ffff8304959ffe28 ffff82d0405fa7ef
> >>>> (XEN) [ 4.132767] 0000000000000018 ffffc9004002b900 ffff8304959ffdf8 ffff82d04039feba
> >>>> (XEN) [ 4.132771] ffff82d0405fa7ef ffff8304959ffe28 0000000000000000 ffffc9004002b900
> >>>> (XEN) [ 4.132774] 0000000000000000 ffff8304959bb000 ffff8304959ffe78 ffff82d0405ff666
> >>>> (XEN) [ 4.132778] ffff82d0405713b8 00000000ffffffff 00a0fb0081f456e0 ffff8304959b3010
> >>>> (XEN) [ 4.132782] 00000000c0000000 00000001ff000000 ffff8304959fff08 0000000000000040
> >>>> (XEN) [ 4.132785] 000000ec00000001 ffff8304959fff08 ffff8304959a4000 0000000000000021
> >>>> (XEN) [ 4.132789] 0000000000000000 ffffc9004002b900 ffff8304959ffef8 ffff82d0405711b2
> >>>> (XEN) [ 4.132792] 0000000000000000 ffff888100456938 ffff8881001470b8 0000000000000018
> >>>> (XEN) [ 4.132795] 0000000000000000 ffff8304959ffef8 ffff82d0406213f9 ffff8304959a4000
> >>>> (XEN) [ 4.132799] 0000000000000000 ffff8304959a4000 0000000000000000 0000000000000000
> >>>> (XEN) [ 4.132802] ffff8304959fffff 0000000000000000 00007cfb6a6000d7 ffff82d0402012d3
> >>>> (XEN) [ 4.132806] 0000000000000000 00000000ffffffff ffff8881001470b8 ffff888100b88900
> >>>> (XEN) [ 4.132809] ffffc9004002b900 ffff8881001470b8 0000000000000283 ffff888100456938
> >>>> (XEN) [ 4.132813] ffff888100065410 0000000000000000 0000000000000021 ffffffff81f7842a
> >>>> (XEN) [ 4.132816] Xen call trace:
> >>>> (XEN) [ 4.132819] [<ffff82d0405779bd>] R pci_mmcfg_read+0x19e/0x1c7
> >>>> (XEN) [ 4.132822] [<ffff82d040576877>] F pci_conf_read32+0x55/0x5e
> >>>> (XEN) [ 4.132826] [<ffff82d04039b81d>] F pci_check_extcfg+0xb1/0x13b
> >>>> (XEN) [ 4.132831] [<ffff82d0405fa90b>] F physdev_check_pci_extcfg+0x11c/0x121
> >>>> (XEN) [ 4.132833] [<ffff82d0403983e0>] F drivers/passthrough/pci.c#iterate_all+0xa2/0xe2
> >>>> (XEN) [ 4.132836] [<ffff82d04039feba>] F pci_segment_iterate+0x4e/0x74
> >>>> (XEN) [ 4.132839] [<ffff82d0405ff666>] F do_physdev_op+0x362a/0x4161
> >>>> (XEN) [ 4.132842] [<ffff82d0405711b2>] F pv_hypercall+0x6be/0x838
> >>>> (XEN) [ 4.132845] [<ffff82d0402012d3>] F lstar_enter+0x143/0x148
> >>>> (XEN) [ 4.132847]
> >>>> (XEN) [ 4.132848] Pagetable walk from ffff808000300100:
> >>>> (XEN) [ 4.132851] L4[0x101] = 0000000000000000 ffffffffffffffff
> >>>>
> >>>> There is an important comment in pci_mmcfg_arch_disable():
> >>>>
> >>>> /*
> >>>> * Don't use destroy_xen_mappings() here, or make sure that at least
> >>>> * the necessary L4 entries get populated (so that they get properly
> >>>> * propagated to guest domains' page tables).
> >>>> */
> >>>>
> >>>> Hence it is wrong to bypass
> >>>>
> >>>> mcfg_ioremap(cfg, idx, 0);
> >>>
> >>> Hm, I see. The L4 slot must be unconditionally populated before we
> >>> clone the idle page-table, otherwise the mappings won't propagate.
> >>>
> >>> What about unconditionally populating the L4 slot in
> >>> subarch_init_memory()? That seems less fragile than doing it in
> >>> pci_mmcfg_arch_disable().
> >>
> >> Less fragile - perhaps. Yet I don't see why we should populate the field if
> >> we wouldn't ever need it. Of course with violating layering some, we could
> >> know in subarch_init_memory(), as pci_setup() runs earlier.
> >
> > How can Xen be sure at setup time that the slot will never be used?
> > The MMCFG could be empty initially when parsed by Xen, but MMCFG
> > regions might appear as a result of AML method execution (_CBA and
> > _CRS methods in hotplug host bridges).
>
> Their usability may change, but how many of them there are (going to be) we
> need to know at boot time. See how pci_mmcfg_config_num, pci_mmcfg_config[],
> and pci_mmcfg_virt[] are initialized (all by __init functions). If regions
> could truly appear "out of the blue", we'd have a bigger problem.
My copy of the PCI Firmware Spec v3.3 contains:
"4.1.2. MCFG Table Description
The MCFG table is an ACPI table that is used to communicate the base
addresses corresponding to the non-hot removable PCI Segment Groups
range within a PCI Segment Group available to the operating system at
boot.
[...]
4.1.3. The _CBA Method
Some systems may support hot plug of host bridges that introduce
either a range of buses within an existing PCI Segment Group or
introduce a new PCI Segment Group. For example, each I/O chip in a
multi-chip PCI Express root complex implementation could start a new
PCI Segment Group."
Together with this:
"The MCFG table format allows for more than one memory mapped base
address entry provided each entry (memory mapped configuration space
base address allocation structure) corresponds to a unique PCI Segment
Group consisting of 256 PCI buses. Multiple entries corresponding to a
single PCI Segment Group is not allowed."
Given that each segment group can only appear once in the MCFG, and
that the _CBA method can introduce new segment groups, it would seem
to me the spec does allow for new segments appearing exclusively as
the return of _CBA method? It does read as if hot-removable segment
groups must not appear in the MCFG table. I'm not finding any clear
statement in the spec that says that ECAM areas must previously appear
in the MCFG table.
I'm not sure how common that is, but it doesn't seem impossible given
my reading of the spec.
Thanks, Roger.
On 02.02.2026 16:18, Roger Pau Monné wrote: > My copy of the PCI Firmware Spec v3.3 contains: > > "4.1.2. MCFG Table Description > > The MCFG table is an ACPI table that is used to communicate the base > addresses corresponding to the non-hot removable PCI Segment Groups > range within a PCI Segment Group available to the operating system at > boot. > > [...] > > 4.1.3. The _CBA Method > > Some systems may support hot plug of host bridges that introduce > either a range of buses within an existing PCI Segment Group or > introduce a new PCI Segment Group. For example, each I/O chip in a > multi-chip PCI Express root complex implementation could start a new > PCI Segment Group." > > Together with this: > > "The MCFG table format allows for more than one memory mapped base > address entry provided each entry (memory mapped configuration space > base address allocation structure) corresponds to a unique PCI Segment > Group consisting of 256 PCI buses. Multiple entries corresponding to a > single PCI Segment Group is not allowed." > > Given that each segment group can only appear once in the MCFG, and > that the _CBA method can introduce new segment groups, it would seem > to me the spec does allow for new segments appearing exclusively as > the return of _CBA method? It does read as if hot-removable segment > groups must not appear in the MCFG table. I'm not finding any clear > statement in the spec that says that ECAM areas must previously appear > in the MCFG table. > > I'm not sure how common that is, but it doesn't seem impossible given > my reading of the spec. Hmm, that'll be a bit of work then, as Dom0 will also need to propagate the necessary data into Xen. Jan
On Tue, Feb 03, 2026 at 03:48:52PM +0100, Jan Beulich wrote: > On 02.02.2026 16:18, Roger Pau Monné wrote: > > My copy of the PCI Firmware Spec v3.3 contains: > > > > "4.1.2. MCFG Table Description > > > > The MCFG table is an ACPI table that is used to communicate the base > > addresses corresponding to the non-hot removable PCI Segment Groups > > range within a PCI Segment Group available to the operating system at > > boot. > > > > [...] > > > > 4.1.3. The _CBA Method > > > > Some systems may support hot plug of host bridges that introduce > > either a range of buses within an existing PCI Segment Group or > > introduce a new PCI Segment Group. For example, each I/O chip in a > > multi-chip PCI Express root complex implementation could start a new > > PCI Segment Group." > > > > Together with this: > > > > "The MCFG table format allows for more than one memory mapped base > > address entry provided each entry (memory mapped configuration space > > base address allocation structure) corresponds to a unique PCI Segment > > Group consisting of 256 PCI buses. Multiple entries corresponding to a > > single PCI Segment Group is not allowed." > > > > Given that each segment group can only appear once in the MCFG, and > > that the _CBA method can introduce new segment groups, it would seem > > to me the spec does allow for new segments appearing exclusively as > > the return of _CBA method? It does read as if hot-removable segment > > groups must not appear in the MCFG table. I'm not finding any clear > > statement in the spec that says that ECAM areas must previously appear > > in the MCFG table. > > > > I'm not sure how common that is, but it doesn't seem impossible given > > my reading of the spec. > > Hmm, that'll be a bit of work then, as Dom0 will also need to propagate > the necessary data into Xen. TBH this is what the spec says, but I've never encountered such a system. In fact I've never tested hotplug of a PCI host bridge. Not sure this can be simulated with QEMU so that we could at least test whatever fixes we plan to do in that area? I guess we could "fake" a bodge where Xen ignores the MCFG completely and only becomes aware of the ECAM areas from what the hardware domain reports back. Thanks, Roger.
On Thu, Jan 29, 2026 at 02:10:01PM +0100, Jan Beulich wrote: > When, during boot, we have already correctly determined availability of > the MMCFG access method for a given bus range, there's then no need to > invoke pci_check_extcfg() again for every of the devices. This in > particular avoids ->ext_cfg to transiently indicate the wrong state. > > Switch to using Xen style on lines being touched and immediately adjacent > ones. > > Signed-off-by: Jan Beulich <jbeulich@suse.com> Acked-by: Roger Pau Monné <roger.pau@citrix.com> One suggestion for a further addition below. > --- > v3: New. > > --- a/xen/arch/x86/physdev.c > +++ b/xen/arch/x86/physdev.c > @@ -528,6 +528,8 @@ ret_t do_physdev_op(int cmd, XEN_GUEST_H > if ( !ret ) > ret = pci_segment_iterate(info.segment, physdev_check_pci_extcfg, > &info); > + else if ( ret > 0 ) /* Indication of "no change". */ > + ret = 0; > > if ( !ret && has_vpci(currd) && (info.flags & XEN_PCI_MMCFG_RESERVED) ) Maybe it doesn't need to be strictly done here, but now that pci_mmcfg_reserved() signals whether the MMCFG was already registered, could you also restrict the register_vpci_mmcfg_handler() call to ret == 0? That will also simplify the logic in register_vpci_mmcfg_handler() since we no longer need to return 0 when the region is already registered, returning -EEXIST should be fine if the caller is adjusted. Thanks, Roger.
On 29.01.2026 16:32, Roger Pau Monné wrote: > On Thu, Jan 29, 2026 at 02:10:01PM +0100, Jan Beulich wrote: >> When, during boot, we have already correctly determined availability of >> the MMCFG access method for a given bus range, there's then no need to >> invoke pci_check_extcfg() again for every of the devices. This in >> particular avoids ->ext_cfg to transiently indicate the wrong state. >> >> Switch to using Xen style on lines being touched and immediately adjacent >> ones. >> >> Signed-off-by: Jan Beulich <jbeulich@suse.com> > > Acked-by: Roger Pau Monné <roger.pau@citrix.com> Thanks. > One suggestion for a further addition below. > >> --- >> v3: New. >> >> --- a/xen/arch/x86/physdev.c >> +++ b/xen/arch/x86/physdev.c >> @@ -528,6 +528,8 @@ ret_t do_physdev_op(int cmd, XEN_GUEST_H >> if ( !ret ) >> ret = pci_segment_iterate(info.segment, physdev_check_pci_extcfg, >> &info); >> + else if ( ret > 0 ) /* Indication of "no change". */ >> + ret = 0; >> >> if ( !ret && has_vpci(currd) && (info.flags & XEN_PCI_MMCFG_RESERVED) ) > > Maybe it doesn't need to be strictly done here, but now that > pci_mmcfg_reserved() signals whether the MMCFG was already registered, > could you also restrict the register_vpci_mmcfg_handler() call to ret > == 0? Possibly; you know vPCI better than I do. > That will also simplify the logic in register_vpci_mmcfg_handler() > since we no longer need to return 0 when the region is already > registered, returning -EEXIST should be fine if the caller is > adjusted. I think this then will want to be a separate change, with its own justification. Jan
© 2016 - 2026 Red Hat, Inc.