:p
atchew
Login
This is a follow-on to 'vPCI: avoid bogus "overlap in extended cap list" warnings', addressing further issues noted there. 1: PCI: determine whether a device has extended config space 2: PCI: pass pdev to pci_ats_{device,enabled}() 3: x86/MSI: pass pdev to read_pci_mem_bar() 4: PCI: pass pdev to pci_find_{,next_}ext_capability() 5: PCI: don't look for ext-caps when there's no extended cfg space 6: vPCI/DomU: really no ext-caps without extended config space Jan
Legacy PCI devices don't have any extended config space. Reading any part thereof may return all ones or other arbitrary data, e.g. in some cases base config space contents repeatedly. Logic follows Linux 6.19-rc's pci_cfg_space_size(), albeit leveraging our determination of device type; in particular some comments are taken verbatim from there. Signed-off-by: Jan Beulich <jbeulich@suse.com> --- Note that alloc_pdev()'s switch doesn't handle DEV_TYPE_PCI2PCIe_BRIDGE at all. Such bridges will therefore not have ->ext_cfg set (which is likely wrong). Shouldn't we handle them like DEV_TYPE_LEGACY_PCI_BRIDGE (or DEV_TYPE_PCI?) anyway (just like VT-d's set_msi_source_id() does)? Linux also has CONFIG_PCI_QUIRKS, allowing to compile out the slightly risky code (as reads may in principle have side effects). Should we gain such, too? --- a/xen/drivers/passthrough/pci.c +++ b/xen/drivers/passthrough/pci.c @@ -XXX,XX +XXX,XX @@ static void apply_quirks(struct pci_dev * from trying to size the BARs or add handlers to trap accesses. */ pdev->ignore_bars = true; + + if ( pdev->ext_cfg ) + { + unsigned int pos; + + /* + * PCI Express to PCI/PCI-X Bridge Specification, rev 1.0, 4.1.4 says + * that when forwarding a type1 configuration request the bridge must + * check that the extended register address field is zero. The bridge + * is not permitted to forward the transactions and must handle it as + * an Unsupported Request. Some bridges do not follow this rule and + * simply drop the extended register bits, resulting in the standard + * config space being aliased, every 256 bytes across the entire + * configuration space. Test for this condition by comparing the first + * dword of each potential alias to the vendor/device ID. + * Known offenders: + * ASM1083/1085 PCIe-to-PCI Reversible Bridge (1b21:1080, rev 01 & 03) + * AMD/ATI SBx00 PCI to PCI Bridge (1002:4384, rev 40) + */ + for ( pos = PCI_CFG_SPACE_SIZE; + pos < PCI_CFG_SPACE_EXP_SIZE; pos += PCI_CFG_SPACE_SIZE ) + { + if ( pci_conf_read16(pdev->sbdf, pos) != vendor || + pci_conf_read16(pdev->sbdf, pos + 2) != device ) + break; + } + + if ( pos >= PCI_CFG_SPACE_EXP_SIZE ) + { + printk(XENLOG_WARNING + "%pp: extended config space aliases base one\n", + &pdev->sbdf); + pdev->ext_cfg = false; + } + } } static struct pci_dev *alloc_pdev(struct pci_seg *pseg, u8 bus, u8 devfn) @@ -XXX,XX +XXX,XX @@ static struct pci_dev *alloc_pdev(struct unsigned long flags; case DEV_TYPE_PCIe2PCI_BRIDGE: + pdev->ext_cfg = true; + fallthrough; case DEV_TYPE_LEGACY_PCI_BRIDGE: sec_bus = pci_conf_read8(pdev->sbdf, PCI_SECONDARY_BUS); sub_bus = pci_conf_read8(pdev->sbdf, PCI_SUBORDINATE_BUS); @@ -XXX,XX +XXX,XX @@ static struct pci_dev *alloc_pdev(struct pseg->bus2bridge[sec_bus].devfn = devfn; } spin_unlock_irqrestore(&pseg->bus2bridge_lock, flags); + + fallthrough; + case DEV_TYPE_PCI: + pos = pci_find_cap_offset(pdev->sbdf, PCI_CAP_ID_PCIX); + if ( pos && + (pci_conf_read32(pdev->sbdf, pos + PCI_X_STATUS) & + (PCI_X_STATUS_266MHZ | PCI_X_STATUS_533MHZ)) ) + pdev->ext_cfg = true; break; case DEV_TYPE_PCIe_ENDPOINT: + pdev->ext_cfg = true; + pos = pci_find_cap_offset(pdev->sbdf, PCI_CAP_ID_EXP); BUG_ON(!pos); cap = pci_conf_read16(pdev->sbdf, pos + PCI_EXP_DEVCAP); @@ -XXX,XX +XXX,XX @@ static struct pci_dev *alloc_pdev(struct } break; - case DEV_TYPE_PCI: case DEV_TYPE_PCIe_BRIDGE: case DEV_TYPE_PCI_HOST_BRIDGE: + pdev->ext_cfg = true; break; default: @@ -XXX,XX +XXX,XX @@ static struct pci_dev *alloc_pdev(struct break; } + if ( pdev->ext_cfg && + /* + * Regular PCI devices have 256 bytes, but PCI-X 2 and PCI Express + * devices have 4096 bytes. Even if the device is capable, that + * doesn't mean we can access it. Maybe we don't have a way to + * generate extended config space accesses, or the device is behind a + * reverse Express bridge. So we try reading the dword at + * PCI_CFG_SPACE_SIZE which must either be 0 or a valid extended + * capability header. + */ + pci_conf_read32(pdev->sbdf, PCI_CFG_SPACE_SIZE) == 0xffffffffU ) + pdev->ext_cfg = false; + apply_quirks(pdev); check_pdev(pdev); @@ -XXX,XX +XXX,XX @@ int pci_add_device(u16 seg, u8 bus, u8 d list_add(&pdev->vf_list, &pf_pdev->vf_list); } + + if ( !pdev->ext_cfg ) + printk(XENLOG_WARNING + "%pp: VF without extended config space?\n", + &pdev->sbdf); } } --- a/xen/include/xen/pci.h +++ b/xen/include/xen/pci.h @@ -XXX,XX +XXX,XX @@ struct pci_dev { nodeid_t node; /* NUMA node */ + /* Whether the device has extended config space. */ + bool ext_cfg; + /* Device to be quarantined, don't automatically re-assign to dom0 */ bool quarantine;
This not only brings both in sync with {en,dis}able_ats_device() but also prepares for doing the same to pci_find_{,next_}ext_capability(). Signed-off-by: Jan Beulich <jbeulich@suse.com> --- a/xen/drivers/passthrough/amd/iommu_cmd.c +++ b/xen/drivers/passthrough/amd/iommu_cmd.c @@ -XXX,XX +XXX,XX @@ void amd_iommu_flush_iotlb(u8 devfn, con if ( !ats_enabled ) return; - if ( !pci_ats_enabled(pdev->seg, pdev->bus, pdev->devfn) ) + if ( !pci_ats_enabled(pdev) ) return; iommu = find_iommu_for_device(pdev->sbdf); --- a/xen/drivers/passthrough/amd/pci_amd_iommu.c +++ b/xen/drivers/passthrough/amd/pci_amd_iommu.c @@ -XXX,XX +XXX,XX @@ static bool use_ats( { return !ivrs_dev->block_ats && iommu_has_cap(iommu, PCI_CAP_IOTLB_SHIFT) && - pci_ats_device(iommu->sbdf.seg, pdev->bus, pdev->devfn); + pci_ats_device(pdev); } static int __must_check amd_iommu_setup_domain_device( @@ -XXX,XX +XXX,XX @@ static int __must_check amd_iommu_setup_ ASSERT(pcidevs_locked()); - if ( use_ats(pdev, iommu, ivrs_dev) && - !pci_ats_enabled(iommu->sbdf.seg, bus, pdev->devfn) ) + if ( use_ats(pdev, iommu, ivrs_dev) && !pci_ats_enabled(pdev) ) { if ( devfn == pdev->devfn ) enable_ats_device(pdev, &iommu->ats_devices); @@ -XXX,XX +XXX,XX @@ static void amd_iommu_disable_domain_dev ASSERT(pcidevs_locked()); - if ( pci_ats_device(iommu->sbdf.seg, bus, pdev->devfn) && - pci_ats_enabled(iommu->sbdf.seg, bus, pdev->devfn) ) + if ( pci_ats_device(pdev) && pci_ats_enabled(pdev) ) disable_ats_device(pdev); BUG_ON ( iommu->dev_table.buffer == NULL ); --- a/xen/drivers/passthrough/ats.h +++ b/xen/drivers/passthrough/ats.h @@ -XXX,XX +XXX,XX @@ extern bool ats_enabled; int enable_ats_device(struct pci_dev *pdev, struct list_head *ats_list); void disable_ats_device(struct pci_dev *pdev); -static inline int pci_ats_enabled(int seg, int bus, int devfn) +static inline int pci_ats_enabled(const struct pci_dev *pdev) { u32 value; int pos; - pos = pci_find_ext_capability(PCI_SBDF(seg, bus, devfn), - PCI_EXT_CAP_ID_ATS); + pos = pci_find_ext_capability(pdev->sbdf, PCI_EXT_CAP_ID_ATS); BUG_ON(!pos); - value = pci_conf_read16(PCI_SBDF(seg, bus, devfn), pos + ATS_REG_CTL); + value = pci_conf_read16(pdev->sbdf, pos + ATS_REG_CTL); return value & ATS_ENABLE; } -static inline int pci_ats_device(int seg, int bus, int devfn) +static inline int pci_ats_device(const struct pci_dev *pdev) { if ( !ats_enabled ) return 0; - return pci_find_ext_capability(PCI_SBDF(seg, bus, devfn), - PCI_EXT_CAP_ID_ATS); + return pci_find_ext_capability(pdev->sbdf, PCI_EXT_CAP_ID_ATS); } #endif /* DRIVERS__PASSTHROUGH__ATS_H */
This not only reduces the number of parameters and local variables, but also prepares for doing the same to pci_find_{,next_}ext_capability(). Signed-off-by: Jan Beulich <jbeulich@suse.com> --- a/xen/arch/x86/msi.c +++ b/xen/arch/x86/msi.c @@ -XXX,XX +XXX,XX @@ static int msi_capability_init(struct pc return 0; } -static uint64_t read_pci_mem_bar(pci_sbdf_t sbdf, uint8_t bir, int vf, - const struct pf_info *pf_info) +static uint64_t read_pci_mem_bar(const struct pci_dev *pdev, uint8_t bir, + int vf) { - uint16_t seg = sbdf.seg; - uint8_t bus = sbdf.bus, slot = sbdf.dev, func = sbdf.fn; + uint16_t seg = pdev->sbdf.seg; + uint8_t bus = pdev->sbdf.bus, slot = pdev->sbdf.dev, func = pdev->sbdf.fn; u8 limit; u32 addr, base = PCI_BASE_ADDRESS_0; u64 disp = 0; @@ -XXX,XX +XXX,XX @@ static uint64_t read_pci_mem_bar(pci_sbd unsigned int pos; uint16_t ctrl, num_vf, offset, stride; - ASSERT(pf_info); - - pos = pci_find_ext_capability(sbdf, PCI_EXT_CAP_ID_SRIOV); - ctrl = pci_conf_read16(sbdf, pos + PCI_SRIOV_CTRL); - num_vf = pci_conf_read16(sbdf, pos + PCI_SRIOV_NUM_VF); - offset = pci_conf_read16(sbdf, pos + PCI_SRIOV_VF_OFFSET); - stride = pci_conf_read16(sbdf, pos + PCI_SRIOV_VF_STRIDE); + pos = pci_find_ext_capability(pdev->sbdf, PCI_EXT_CAP_ID_SRIOV); + ctrl = pci_conf_read16(pdev->sbdf, pos + PCI_SRIOV_CTRL); + num_vf = pci_conf_read16(pdev->sbdf, pos + PCI_SRIOV_NUM_VF); + offset = pci_conf_read16(pdev->sbdf, pos + PCI_SRIOV_VF_OFFSET); + stride = pci_conf_read16(pdev->sbdf, pos + PCI_SRIOV_VF_STRIDE); if ( !pos || !(ctrl & PCI_SRIOV_CTRL_VFE) || !(ctrl & PCI_SRIOV_CTRL_MSE) || !num_vf || !offset || (num_vf > 1 && !stride) || bir >= PCI_SRIOV_NUM_BARS || - !pf_info->vf_rlen[bir] ) + !pdev->physfn.vf_rlen[bir] ) return 0; base = pos + PCI_SRIOV_BAR; vf -= PCI_BDF(bus, slot, func) + offset; @@ -XXX,XX +XXX,XX @@ static uint64_t read_pci_mem_bar(pci_sbd } if ( vf >= num_vf ) return 0; - BUILD_BUG_ON(ARRAY_SIZE(pf_info->vf_rlen) != PCI_SRIOV_NUM_BARS); - disp = vf * pf_info->vf_rlen[bir]; + BUILD_BUG_ON(ARRAY_SIZE(pdev->physfn.vf_rlen) != PCI_SRIOV_NUM_BARS); + disp = vf * pdev->physfn.vf_rlen[bir]; limit = PCI_SRIOV_NUM_BARS; } else switch ( pci_conf_read8(PCI_SBDF(seg, bus, slot, func), @@ -XXX,XX +XXX,XX @@ static int msix_capability_init(struct p u16 control; u64 table_paddr; u32 table_offset; - u16 seg = dev->seg; - u8 bus = dev->bus; - u8 slot = PCI_SLOT(dev->devfn); - u8 func = PCI_FUNC(dev->devfn); bool maskall = msix->host_maskall, zap_on_error = false; unsigned int pos = dev->msix_pos; @@ -XXX,XX +XXX,XX @@ static int msix_capability_init(struct p (is_hardware_domain(current->domain) && (dev->domain == current->domain || dev->domain == dom_io))) ) { - unsigned int bir = table_offset & PCI_MSIX_BIRMASK, pbus, pslot, pfunc; - int vf; + unsigned int bir = table_offset & PCI_MSIX_BIRMASK; + int vf = -1; + const struct pci_dev *pf_dev = dev; paddr_t pba_paddr; unsigned int pba_offset; - const struct pf_info *pf_info; - if ( !dev->info.is_virtfn ) - { - pbus = bus; - pslot = slot; - pfunc = func; - vf = -1; - pf_info = NULL; - } - else + if ( dev->info.is_virtfn ) { - pbus = dev->info.physfn.bus; - pslot = PCI_SLOT(dev->info.physfn.devfn); - pfunc = PCI_FUNC(dev->info.physfn.devfn); vf = dev->sbdf.bdf; ASSERT(dev->pf_pdev); - pf_info = &dev->pf_pdev->physfn; + pf_dev = dev->pf_pdev; } - table_paddr = read_pci_mem_bar(PCI_SBDF(seg, pbus, pslot, pfunc), bir, - vf, pf_info); + table_paddr = read_pci_mem_bar(pf_dev, bir, vf); WARN_ON(msi && msi->table_base != table_paddr); if ( !table_paddr ) { @@ -XXX,XX +XXX,XX @@ static int msix_capability_init(struct p pba_offset = pci_conf_read32(dev->sbdf, msix_pba_offset_reg(pos)); bir = (u8)(pba_offset & PCI_MSIX_BIRMASK); - pba_paddr = read_pci_mem_bar(PCI_SBDF(seg, pbus, pslot, pfunc), bir, vf, - pf_info); + pba_paddr = read_pci_mem_bar(pf_dev, bir, vf); WARN_ON(!pba_paddr); pba_paddr += pba_offset & ~PCI_MSIX_BIRMASK;
This is in preparation of using attributes recorded for devices. Additionally locating (extended) capabilities of non-devices (e.g. phantom functions) makes no sense. While there also eliminate open-coding of PCI_CFG_SPACE_SIZE in adjacent code. Signed-off-by: Jan Beulich <jbeulich@suse.com> --- a/xen/arch/x86/msi.c +++ b/xen/arch/x86/msi.c @@ -XXX,XX +XXX,XX @@ static uint64_t read_pci_mem_bar(const s unsigned int pos; uint16_t ctrl, num_vf, offset, stride; - pos = pci_find_ext_capability(pdev->sbdf, PCI_EXT_CAP_ID_SRIOV); + pos = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_SRIOV); ctrl = pci_conf_read16(pdev->sbdf, pos + PCI_SRIOV_CTRL); num_vf = pci_conf_read16(pdev->sbdf, pos + PCI_SRIOV_NUM_VF); offset = pci_conf_read16(pdev->sbdf, pos + PCI_SRIOV_VF_OFFSET); --- a/xen/drivers/passthrough/ats.c +++ b/xen/drivers/passthrough/ats.c @@ -XXX,XX +XXX,XX @@ int enable_ats_device(struct pci_dev *pd u32 value; int pos; - pos = pci_find_ext_capability(pdev->sbdf, PCI_EXT_CAP_ID_ATS); + pos = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_ATS); BUG_ON(!pos); if ( iommu_verbose ) --- a/xen/drivers/passthrough/ats.h +++ b/xen/drivers/passthrough/ats.h @@ -XXX,XX +XXX,XX @@ static inline int pci_ats_enabled(const u32 value; int pos; - pos = pci_find_ext_capability(pdev->sbdf, PCI_EXT_CAP_ID_ATS); + pos = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_ATS); BUG_ON(!pos); value = pci_conf_read16(pdev->sbdf, pos + ATS_REG_CTL); @@ -XXX,XX +XXX,XX @@ static inline int pci_ats_device(const s if ( !ats_enabled ) return 0; - return pci_find_ext_capability(pdev->sbdf, PCI_EXT_CAP_ID_ATS); + return pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_ATS); } #endif /* DRIVERS__PASSTHROUGH__ATS_H */ --- a/xen/drivers/passthrough/pci.c +++ b/xen/drivers/passthrough/pci.c @@ -XXX,XX +XXX,XX @@ static void pci_enable_acs(struct pci_de if ( !is_iommu_enabled(pdev->domain) ) return; - pos = pci_find_ext_capability(pdev->sbdf, PCI_EXT_CAP_ID_ACS); + pos = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_ACS); if (!pos) return; @@ -XXX,XX +XXX,XX @@ int pci_add_device(u16 seg, u8 bus, u8 d if ( !pdev->info.is_virtfn && !pdev->physfn.vf_rlen[0] ) { - unsigned int pos = pci_find_ext_capability(pdev->sbdf, - PCI_EXT_CAP_ID_SRIOV); + unsigned int pos = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_SRIOV); uint16_t ctrl = pci_conf_read16(pdev->sbdf, pos + PCI_SRIOV_CTRL); if ( !pos ) --- a/xen/drivers/passthrough/vtd/x86/ats.c +++ b/xen/drivers/passthrough/vtd/x86/ats.c @@ -XXX,XX +XXX,XX @@ int ats_device(const struct pci_dev *pde return 0; ats_drhd = find_ats_dev_drhd(drhd->iommu); - pos = pci_find_ext_capability(pdev->sbdf, PCI_EXT_CAP_ID_ATS); + pos = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_ATS); if ( pos && (ats_drhd == NULL) ) { --- a/xen/drivers/passthrough/vtd/quirks.c +++ b/xen/drivers/passthrough/vtd/quirks.c @@ -XXX,XX +XXX,XX @@ void pci_vtd_quirk(const struct pci_dev /* Sandybridge-EP (Romley) */ case 0x3c00: /* host bridge */ case 0x3c01 ... 0x3c0b: /* root ports */ - pos = pci_find_ext_capability(pdev->sbdf, PCI_EXT_CAP_ID_ERR); + pos = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_ERR); if ( !pos ) { - pos = pci_find_ext_capability(pdev->sbdf, PCI_EXT_CAP_ID_VNDR); + pos = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_VNDR); while ( pos ) { val = pci_conf_read32(pdev->sbdf, pos + PCI_VNDR_HEADER); @@ -XXX,XX +XXX,XX @@ void pci_vtd_quirk(const struct pci_dev pos += PCI_VNDR_HEADER; break; } - pos = pci_find_next_ext_capability(pdev->sbdf, pos, + pos = pci_find_next_ext_capability(pdev, pos, PCI_EXT_CAP_ID_VNDR); } ff = 0; --- a/xen/drivers/pci/pci.c +++ b/xen/drivers/pci/pci.c @@ -XXX,XX +XXX,XX @@ unsigned int pci_find_next_cap(pci_sbdf_ * within the device's PCI configuration space or 0 if the device does * not support it. */ -unsigned int pci_find_ext_capability(pci_sbdf_t sbdf, unsigned int cap) +unsigned int pci_find_ext_capability(const struct pci_dev *pdev, + unsigned int cap) { - return pci_find_next_ext_capability(sbdf, 0, cap); + return pci_find_next_ext_capability(pdev, 0, cap); } /** @@ -XXX,XX +XXX,XX @@ unsigned int pci_find_ext_capability(pci * within the device's PCI configuration space or 0 if the device does * not support it. */ -unsigned int pci_find_next_ext_capability(pci_sbdf_t sbdf, unsigned int start, +unsigned int pci_find_next_ext_capability(const struct pci_dev *pdev, + unsigned int start, unsigned int cap) { u32 header; int ttl = 480; /* 3840 bytes, minimum 8 bytes per capability */ - unsigned int pos = max(start, 0x100U); + unsigned int pos = max(start, PCI_CFG_SPACE_SIZE + 0U); - header = pci_conf_read32(sbdf, pos); + header = pci_conf_read32(pdev->sbdf, pos); /* * If we have no capabilities, this is indicated by cap ID, @@ -XXX,XX +XXX,XX @@ unsigned int pci_find_next_ext_capabilit if ( PCI_EXT_CAP_ID(header) == cap && pos != start ) return pos; pos = PCI_EXT_CAP_NEXT(header); - if ( pos < 0x100 ) + if ( pos < PCI_CFG_SPACE_SIZE ) break; - header = pci_conf_read32(sbdf, pos); + header = pci_conf_read32(pdev->sbdf, pos); } return 0; } --- a/xen/drivers/vpci/rebar.c +++ b/xen/drivers/vpci/rebar.c @@ -XXX,XX +XXX,XX @@ static int cf_check init_rebar(struct pc { uint32_t ctrl; unsigned int nbars; - unsigned int rebar_offset = pci_find_ext_capability(pdev->sbdf, + unsigned int rebar_offset = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_REBAR); if ( !rebar_offset ) --- a/xen/drivers/vpci/vpci.c +++ b/xen/drivers/vpci/vpci.c @@ -XXX,XX +XXX,XX @@ static struct vpci_register *vpci_get_pr static int vpci_ext_capability_hide( const struct pci_dev *pdev, unsigned int cap) { - const unsigned int offset = pci_find_ext_capability(pdev->sbdf, cap); + const unsigned int offset = pci_find_ext_capability(pdev, cap); struct vpci_register *r, *prev_r; struct vpci *vpci = pdev->vpci; uint32_t header, pre_header; @@ -XXX,XX +XXX,XX @@ static int vpci_init_capabilities(struct if ( !is_ext ) pos = pci_find_cap_offset(pdev->sbdf, cap); else if ( is_hardware_domain(pdev->domain) ) - pos = pci_find_ext_capability(pdev->sbdf, cap); + pos = pci_find_ext_capability(pdev, cap); if ( !pos ) continue; @@ -XXX,XX +XXX,XX @@ void vpci_deassign_device(struct pci_dev if ( !capability->is_ext ) pos = pci_find_cap_offset(pdev->sbdf, cap); else if ( is_hardware_domain(pdev->domain) ) - pos = pci_find_ext_capability(pdev->sbdf, cap); + pos = pci_find_ext_capability(pdev, cap); if ( pos ) { int rc = capability->cleanup(pdev, false); --- a/xen/include/xen/pci.h +++ b/xen/include/xen/pci.h @@ -XXX,XX +XXX,XX @@ unsigned int pci_find_next_cap_ttl(pci_s unsigned int *ttl); unsigned int pci_find_next_cap(pci_sbdf_t sbdf, unsigned int pos, unsigned int cap); -unsigned int pci_find_ext_capability(pci_sbdf_t sbdf, unsigned int cap); -unsigned int pci_find_next_ext_capability(pci_sbdf_t sbdf, unsigned int start, +unsigned int pci_find_ext_capability(const struct pci_dev *pdev, + unsigned int cap); +unsigned int pci_find_next_ext_capability(const struct pci_dev *pdev, + unsigned int start, unsigned int cap); const char *parse_pci(const char *s, unsigned int *seg_p, unsigned int *bus_p, unsigned int *dev_p, unsigned int *func_p);
Avoid interpreting as extended capabilities what may be about anything. In doing so, vPCI then also won't mis-interpret data from beyond base config space anymore. Fixes: 3b35911d709e ("Enable pci mmcfg and ATS for x86_64") Fixes: a845b50c12f3 ("vpci/header: Emulate extended capability list for dom0") Signed-off-by: Jan Beulich <jbeulich@suse.com> --- Because of the multiple prereq changes, despite the Fixes: tags I'm not quite sure whether to backport this. (I'm leaning towards "no".) --- a/xen/drivers/pci/pci.c +++ b/xen/drivers/pci/pci.c @@ -XXX,XX +XXX,XX @@ unsigned int pci_find_next_ext_capabilit int ttl = 480; /* 3840 bytes, minimum 8 bytes per capability */ unsigned int pos = max(start, PCI_CFG_SPACE_SIZE + 0U); + if ( !pdev->ext_cfg ) + { + ASSERT(!start); + return 0; + } + header = pci_conf_read32(pdev->sbdf, pos); /*
Whether to emulate accesses to the first 32 bits of extended config space as read-as-zero or read-as-all-ones depends on whether a device actually has extended config space. If it doesn't, read-as-zero isn't correct; not getting this right may confuse functions like Linux 6.19-rc's pci_ext_cfg_is_aliased(). Fixes: a845b50c12f3 ("vpci/header: Emulate extended capability list for dom0") Signed-off-by: Jan Beulich <jbeulich@suse.com> --- a/xen/drivers/vpci/header.c +++ b/xen/drivers/vpci/header.c @@ -XXX,XX +XXX,XX @@ static int vpci_init_ext_capability_list unsigned int pos = PCI_CFG_SPACE_SIZE; if ( !is_hardware_domain(pdev->domain) ) + { + if ( !pdev->ext_cfg ) + return 0; + /* Extended capabilities read as zero, write ignore for DomU */ return vpci_add_register(pdev->vpci, vpci_read_val, NULL, pos, 4, (void *)0); + } do {
This is a follow-on to 'vPCI: avoid bogus "overlap in extended cap list" warnings', addressing further issues noted there. v3: Three new patches and some other re-work. See individual patches. 1: PCI: handle PCI->PCIe bridges as well in free_pdev() 2: PCI: determine whether a device has extended config space 3: PCI: don't look for ext-caps when there's no extended cfg space 4: vPCI/DomU: really no ext-caps without extended config space 5: x86/PCI: avoid re-evaluation of extended config space accessibility 6: vPCI: re-init extended-capability lists when MMCFG availability changed Jan
Don't know how I managed to overlook the freeing side when adding the case to alloc_pdev(). Fixes: cd2b9f0b1986 ("PCI: handle PCI->PCIe bridges as well in alloc_pdev()") Signed-off-by: Jan Beulich <jbeulich@suse.com> --- v3: New. --- Noticed due to the original patch still applying cleanly, just with an offset of a few dozen lines. --- a/xen/drivers/passthrough/pci.c +++ b/xen/drivers/passthrough/pci.c @@ -XXX,XX +XXX,XX @@ static void free_pdev(struct pci_seg *ps unsigned long flags; case DEV_TYPE_PCIe2PCI_BRIDGE: + case DEV_TYPE_PCI2PCIe_BRIDGE: case DEV_TYPE_LEGACY_PCI_BRIDGE: sec_bus = pci_conf_read8(pdev->sbdf, PCI_SECONDARY_BUS); sub_bus = pci_conf_read8(pdev->sbdf, PCI_SUBORDINATE_BUS);
Legacy PCI devices don't have any extended config space. Reading any part thereof may return all ones or other arbitrary data, e.g. in some cases base config space contents repeatedly. Logic follows Linux 6.19-rc's pci_cfg_space_size(), albeit leveraging our determination of device type; in particular some comments are taken verbatim from there. Like with Linux'es CONFIG_PCI_QUIRKS, only the alias detection logic is covered by the new "pci=no-quirks". The singular access at PCI_CFG_SPACE_SIZE is left unconditional. Signed-off-by: Jan Beulich <jbeulich@suse.com> Acked-by: Roger Pau Monné <roger.pau@citrix.com> --- The warning near the bottom of pci_check_extcfg() may be issued multiple times for a single device now. Should we try to avoid that? Note that no vPCI adjustments are done here, but they're going to be needed: Whatever requires extended capabilities will need re- evaluating / newly establishing / tearing down in case an invocation of PHYSDEVOP_pci_mmcfg_reserved alters global state. --- v3: Add command line (sub-)option. v2: Major re-work to also check upon PHYSDEVOP_pci_mmcfg_reserved invocation. --- a/docs/misc/xen-command-line.pandoc +++ b/docs/misc/xen-command-line.pandoc @@ -XXX,XX +XXX,XX @@ Only effective if CONFIG_PARTIAL_EMULATI behavior.** ### pci - = List of [ serr=<bool>, perr=<bool> ] + = List of [ serr=<bool>, perr=<bool>, quirks=<bool> ] + +* `serr` and `perr` Default: Signaling left as set by firmware. -Override the firmware settings, and explicitly enable or disable the -signalling of PCI System and Parity errors. + Override the firmware settings, and explicitly enable or disable the + signalling of PCI System and Parity errors. + +* `quirks` + + Default: `on` + + In its negative form, allows to suppress certain quirk workarounds, in case + they cause issues. ### pci-phantom > `=[<seg>:]<bus>:<device>,<stride>` --- a/xen/arch/x86/physdev.c +++ b/xen/arch/x86/physdev.c @@ -XXX,XX +XXX,XX @@ int physdev_map_pirq(struct domain *d, i struct msi_info *msi); int physdev_unmap_pirq(struct domain *d, int pirq); +int cf_check physdev_check_pci_extcfg(struct pci_dev *pdev, void *arg); + #include "x86_64/mmconfig.h" #ifndef COMPAT @@ -XXX,XX +XXX,XX @@ int physdev_unmap_pirq(struct domain *d, return ret; } + +int cf_check physdev_check_pci_extcfg(struct pci_dev *pdev, void *arg) +{ + const struct physdev_pci_mmcfg_reserved *info = arg; + + ASSERT(pdev->seg == info->segment); + if ( pdev->bus >= info->start_bus && pdev->bus <= info->end_bus ) + pci_check_extcfg(pdev); + + return 0; +} #endif /* COMPAT */ ret_t do_physdev_op(int cmd, XEN_GUEST_HANDLE_PARAM(void) arg) @@ -XXX,XX +XXX,XX @@ ret_t do_physdev_op(int cmd, XEN_GUEST_H ret = pci_mmcfg_reserved(info.address, info.segment, info.start_bus, info.end_bus, info.flags); + + if ( !ret ) + ret = pci_segment_iterate(info.segment, physdev_check_pci_extcfg, + &info); + if ( !ret && has_vpci(currd) && (info.flags & XEN_PCI_MMCFG_RESERVED) ) { /* --- a/xen/drivers/passthrough/pci.c +++ b/xen/drivers/passthrough/pci.c @@ -XXX,XX +XXX,XX @@ custom_param("pci-phantom", parse_phanto static u16 __read_mostly command_mask; static u16 __read_mostly bridge_ctl_mask; +static bool __ro_after_init opt_pci_quirks = true; static int __init cf_check parse_pci_param(const char *s) { @@ -XXX,XX +XXX,XX @@ static int __init cf_check parse_pci_par cmd_mask = PCI_COMMAND_PARITY; brctl_mask = PCI_BRIDGE_CTL_PARITY; } + else if ( (val = parse_boolean("quirks", s, ss)) >= 0 ) + opt_pci_quirks = val; else rc = -EINVAL; @@ -XXX,XX +XXX,XX @@ static struct pci_dev *alloc_pdev(struct } apply_quirks(pdev); + + pci_check_extcfg(pdev); + check_pdev(pdev); return pdev; @@ -XXX,XX +XXX,XX @@ int pci_add_device(u16 seg, u8 bus, u8 d list_add(&pdev->vf_list, &pf_pdev->vf_list); } + + if ( !pdev->ext_cfg ) + printk(XENLOG_WARNING + "%pp: VF without extended config space?\n", + &pdev->sbdf); } } @@ -XXX,XX +XXX,XX @@ enum pdev_type pdev_type(u16 seg, u8 bus return pos ? DEV_TYPE_PCIe_ENDPOINT : DEV_TYPE_PCI; } +void pci_check_extcfg(struct pci_dev *pdev) +{ + unsigned int pos; + + pdev->ext_cfg = false; + + switch ( pdev->type ) + { + case DEV_TYPE_PCIe_ENDPOINT: + case DEV_TYPE_PCIe_BRIDGE: + case DEV_TYPE_PCI_HOST_BRIDGE: + case DEV_TYPE_PCIe2PCI_BRIDGE: + case DEV_TYPE_PCI2PCIe_BRIDGE: + break; + + case DEV_TYPE_LEGACY_PCI_BRIDGE: + case DEV_TYPE_PCI: + pos = pci_find_cap_offset(pdev->sbdf, PCI_CAP_ID_PCIX); + if ( !pos || + !(pci_conf_read32(pdev->sbdf, pos + PCI_X_STATUS) & + (PCI_X_STATUS_266MHZ | PCI_X_STATUS_533MHZ)) ) + return; + break; + + default: + return; + } + + /* + * Regular PCI devices have 256 bytes, but PCI-X 2 and PCI Express devices + * have 4096 bytes. Even if the device is capable, that doesn't mean we + * can access it. Maybe we don't have a way to generate extended config + * space accesses, or the device is behind a reverse Express bridge. So + * we try reading the dword at PCI_CFG_SPACE_SIZE which must either be 0 + * or a valid extended capability header. + */ + if ( pci_conf_read32(pdev->sbdf, PCI_CFG_SPACE_SIZE) == 0xffffffffU ) + return; + + if ( opt_pci_quirks ) + { + /* + * PCI Express to PCI/PCI-X Bridge Specification, rev 1.0, 4.1.4 says + * that when forwarding a type1 configuration request the bridge must + * check that the extended register address field is zero. The bridge + * is not permitted to forward the transactions and must handle it as + * an Unsupported Request. Some bridges do not follow this rule and + * simply drop the extended register bits, resulting in the standard + * config space being aliased, every 256 bytes across the entire + * configuration space. Test for this condition by comparing the first + * dword of each potential alias to the vendor/device ID. + * Known offenders: + * ASM1083/1085 PCIe-to-PCI Reversible Bridge (1b21:1080, rev 01 & 03) + * AMD/ATI SBx00 PCI to PCI Bridge (1002:4384, rev 40) + */ + unsigned int sig = pci_conf_read32(pdev->sbdf, PCI_VENDOR_ID); + + for ( pos = PCI_CFG_SPACE_SIZE; + pos < PCI_CFG_SPACE_EXP_SIZE; pos += PCI_CFG_SPACE_SIZE ) + if ( pci_conf_read32(pdev->sbdf, pos) != sig ) + break; + + if ( pos >= PCI_CFG_SPACE_EXP_SIZE ) + { + printk(XENLOG_WARNING "%pp: extended config space aliases base one\n", + &pdev->sbdf); + return; + } + } + + pdev->ext_cfg = true; +} + /* * find the upstream PCIe-to-PCI/PCIX bridge or PCI legacy bridge * return 0: the device is integrated PCI device or PCIe @@ -XXX,XX +XXX,XX @@ int pci_iterate_devices(int (*handler)(s return pci_segments_iterate(iterate_all, &iter) ?: iter.rc; } +/* Iterate a single PCI segment, with locking but without preemption. */ +int pci_segment_iterate(unsigned int segment, + int (*handler)(struct pci_dev *pdev, void *arg), + void *arg) +{ + struct pci_seg *seg = get_pseg(segment); + struct segment_iter iter = { + .handler = handler, + .arg = arg, + }; + + if ( !seg ) + return -ENODEV; + + pcidevs_lock(); + + iter.rc = iterate_all(seg, &iter) ?: iter.rc; + + pcidevs_unlock(); + + return iter.rc; +} + /* * Local variables: * mode: C --- a/xen/include/xen/pci.h +++ b/xen/include/xen/pci.h @@ -XXX,XX +XXX,XX @@ struct pci_dev { nodeid_t node; /* NUMA node */ + /* Whether the device has (accessible) extended config space. */ + bool ext_cfg; + /* Device to be quarantined, don't automatically re-assign to dom0 */ bool quarantine; @@ -XXX,XX +XXX,XX @@ void pci_check_disable_device(u16 seg, u int pci_iterate_devices(int (*handler)(struct pci_dev *pdev, void *arg), void *arg); +/* Iterate a single PCI segment, with locking but without preemption. */ +int pci_segment_iterate(unsigned int segment, + int (*handler)(struct pci_dev *pdev, void *arg), + void *arg); + uint8_t pci_conf_read8(pci_sbdf_t sbdf, unsigned int reg); uint16_t pci_conf_read16(pci_sbdf_t sbdf, unsigned int reg); uint32_t pci_conf_read32(pci_sbdf_t sbdf, unsigned int reg); @@ -XXX,XX +XXX,XX @@ unsigned int pci_find_next_cap_ttl(pci_s unsigned int *ttl); unsigned int pci_find_next_cap(pci_sbdf_t sbdf, unsigned int pos, unsigned int cap); +void pci_check_extcfg(struct pci_dev *pdev); unsigned int pci_find_ext_capability(const struct pci_dev *pdev, unsigned int cap); unsigned int pci_find_next_ext_capability(const struct pci_dev *pdev,
Avoid interpreting as extended capabilities what may be about anything. In doing so, vPCI then also won't mis-interpret data from beyond base config space anymore. Fixes: 3b35911d709e ("Enable pci mmcfg and ATS for x86_64") Fixes: a845b50c12f3 ("vpci/header: Emulate extended capability list for dom0") Signed-off-by: Jan Beulich <jbeulich@suse.com> Reviewed-by: Stewart Hildebrand <stewart.hildebrand@amd.com> Acked-by: Roger Pau Monné <roger.pau@citrix.com> --- Because of the multiple prereq changes, despite the Fixes: tags I'm not quite sure whether to backport this. (I'm leaning towards "no".) --- a/xen/drivers/pci/pci.c +++ b/xen/drivers/pci/pci.c @@ -XXX,XX +XXX,XX @@ unsigned int pci_find_next_ext_capabilit int ttl = 480; /* 3840 bytes, minimum 8 bytes per capability */ unsigned int pos = max(start, PCI_CFG_SPACE_SIZE + 0U); + if ( !pdev->ext_cfg ) + { + ASSERT(!start); + return 0; + } + header = pci_conf_read32(pdev->sbdf, pos); /*
For DomU-s, whether to emulate accesses to the first 32 bits of extended config space as read-as-zero or read-as-all-ones depends on whether a device actually has extended config space. If it doesn't, read-as-zero isn't correct; not getting this right may confuse functions like Linux 6.19-rc's pci_ext_cfg_is_aliased(). For Dom0 this then simply allows dropping a later conditional. Fixes: a845b50c12f3 ("vpci/header: Emulate extended capability list for dom0") Signed-off-by: Jan Beulich <jbeulich@suse.com> --- v3: Move code condition to top-level function scope. Eliminate a later conditional in exchange. --- a/xen/drivers/vpci/header.c +++ b/xen/drivers/vpci/header.c @@ -XXX,XX +XXX,XX @@ static int vpci_init_ext_capability_list { unsigned int pos = PCI_CFG_SPACE_SIZE; + if ( !pdev->ext_cfg ) + return 0; + if ( !is_hardware_domain(pdev->domain) ) /* Extended capabilities read as zero, write ignore for DomU */ return vpci_add_register(pdev->vpci, vpci_read_val, NULL, @@ -XXX,XX +XXX,XX @@ static int vpci_init_ext_capability_list if ( header == 0xffffffffU ) { - if ( pos != PCI_CFG_SPACE_SIZE ) - printk(XENLOG_WARNING - "%pd %pp: broken extended cap list, offset %#x\n", - pdev->domain, &pdev->sbdf, pos); + printk(XENLOG_WARNING + "%pd %pp: broken extended cap list, offset %#x\n", + pdev->domain, &pdev->sbdf, pos); return 0; }
When, during boot, we have already correctly determined availability of the MMCFG access method for a given bus range, there's then no need to invoke pci_check_extcfg() again for every of the devices. This in particular avoids ->ext_cfg to transiently indicate the wrong state. Switch to using Xen style on lines being touched and immediately adjacent ones. Signed-off-by: Jan Beulich <jbeulich@suse.com> --- v3: New. --- a/xen/arch/x86/physdev.c +++ b/xen/arch/x86/physdev.c @@ -XXX,XX +XXX,XX @@ ret_t do_physdev_op(int cmd, XEN_GUEST_H if ( !ret ) ret = pci_segment_iterate(info.segment, physdev_check_pci_extcfg, &info); + else if ( ret > 0 ) /* Indication of "no change". */ + ret = 0; if ( !ret && has_vpci(currd) && (info.flags & XEN_PCI_MMCFG_RESERVED) ) { --- a/xen/arch/x86/x86_64/mmconfig.h +++ b/xen/arch/x86/x86_64/mmconfig.h @@ -XXX,XX +XXX,XX @@ int pci_mmcfg_reserved(uint64_t address, unsigned int flags); int pci_mmcfg_arch_init(void); int pci_mmcfg_arch_enable(unsigned int idx); -void pci_mmcfg_arch_disable(unsigned int idx); +int pci_mmcfg_arch_disable(unsigned int idx); #endif /* X86_64_MMCONFIG_H */ --- a/xen/arch/x86/x86_64/mmconfig-shared.c +++ b/xen/arch/x86/x86_64/mmconfig-shared.c @@ -XXX,XX +XXX,XX @@ static bool __init pci_mmcfg_reject_brok (unsigned int)cfg->start_bus_number, (unsigned int)cfg->end_bus_number); - if (!is_mmconf_reserved(addr, size, i, cfg) || - pci_mmcfg_arch_enable(i)) { + if ( !is_mmconf_reserved(addr, size, i, cfg) || + pci_mmcfg_arch_enable(i) < 0 ) + { pci_mmcfg_arch_disable(i); valid = 0; } @@ -XXX,XX +XXX,XX @@ void __init acpi_mmcfg_init(void) unsigned int i; pci_mmcfg_arch_init(); - for (i = 0; i < pci_mmcfg_config_num; ++i) - if (pci_mmcfg_arch_enable(i)) + for ( i = 0; i < pci_mmcfg_config_num; ++i ) + if ( pci_mmcfg_arch_enable(i) < 0 ) valid = 0; } else { acpi_table_parse(ACPI_SIG_MCFG, acpi_parse_mcfg); @@ -XXX,XX +XXX,XX @@ int pci_mmcfg_reserved(uint64_t address, segment, start_bus, end_bus, address, cfg->address); return -EIO; } - if (flags & XEN_PCI_MMCFG_RESERVED) + + if ( flags & XEN_PCI_MMCFG_RESERVED ) return pci_mmcfg_arch_enable(i); - pci_mmcfg_arch_disable(i); - return 0; + + return pci_mmcfg_arch_disable(i); } } --- a/xen/arch/x86/x86_64/mmconfig_64.c +++ b/xen/arch/x86/x86_64/mmconfig_64.c @@ -XXX,XX +XXX,XX @@ int pci_mmcfg_arch_enable(unsigned int i const typeof(pci_mmcfg_config[0]) *cfg = pci_mmcfg_virt[idx].cfg; unsigned long start_mfn, end_mfn; - if (pci_mmcfg_virt[idx].virt) - return 0; + if ( pci_mmcfg_virt[idx].virt ) + return 1; + pci_mmcfg_virt[idx].virt = mcfg_ioremap(cfg, idx, PAGE_HYPERVISOR_UC); if (!pci_mmcfg_virt[idx].virt) { printk(KERN_ERR "PCI: Cannot map MCFG aperture for segment %04x\n", @@ -XXX,XX +XXX,XX @@ int pci_mmcfg_arch_enable(unsigned int i return 0; } -void pci_mmcfg_arch_disable(unsigned int idx) +int pci_mmcfg_arch_disable(unsigned int idx) { const typeof(pci_mmcfg_config[0]) *cfg = pci_mmcfg_virt[idx].cfg; + if ( !pci_mmcfg_virt[idx].virt ) + return 1; + pci_mmcfg_virt[idx].virt = NULL; /* * Don't use destroy_xen_mappings() here, or make sure that at least @@ -XXX,XX +XXX,XX @@ void pci_mmcfg_arch_disable(unsigned int mcfg_ioremap(cfg, idx, 0); printk(KERN_WARNING "PCI: Not using MCFG for segment %04x bus %02x-%02x\n", cfg->pci_segment, cfg->start_bus_number, cfg->end_bus_number); + + return 0; } bool pci_mmcfg_decode(unsigned long mfn, unsigned int *seg, unsigned int *bdf)
When Dom0 informs up about MMCFG usability, this may change whether extended capabilities are available (accessible) for devices. Zap what might be on record, and re-initialize the list. No synchronization is added for the case where devices may already be in use. That'll need sorting when (a) DomU support was added and (b) DomU-s may run already while Dom0 / hwdom still boots (dom0less, Hyperlaunch). Signed-off-by: Jan Beulich <jbeulich@suse.com> --- vpci_reinit_ext_capability_list()'s return value isn't checked, as it doesn't feel quite right to fail the hypercall because of this. At the same time it also doesn't feel quite right to have the function return "void". Thoughts? --- v3: New. --- a/xen/arch/x86/physdev.c +++ b/xen/arch/x86/physdev.c @@ -XXX,XX +XXX,XX @@ #include <xen/guest_access.h> #include <xen/iocap.h> #include <xen/serial.h> +#include <xen/vpci.h> + #include <asm/current.h> #include <asm/io_apic.h> #include <asm/msi.h> @@ -XXX,XX +XXX,XX @@ int cf_check physdev_check_pci_extcfg(st ASSERT(pdev->seg == info->segment); if ( pdev->bus >= info->start_bus && pdev->bus <= info->end_bus ) + { pci_check_extcfg(pdev); + vpci_reinit_ext_capability_list(pdev); + } return 0; } --- a/xen/drivers/vpci/header.c +++ b/xen/drivers/vpci/header.c @@ -XXX,XX +XXX,XX @@ static int vpci_init_ext_capability_list return 0; } +int vpci_reinit_ext_capability_list(const struct pci_dev *pdev) +{ + if ( !pdev->vpci ) + return 0; + + if ( vpci_remove_registers(pdev->vpci, PCI_CFG_SPACE_SIZE, + PCI_CFG_SPACE_EXP_SIZE - PCI_CFG_SPACE_SIZE) ) + ASSERT_UNREACHABLE(); + + return vpci_init_ext_capability_list(pdev); +} + int vpci_init_header(struct pci_dev *pdev) { uint16_t cmd; --- a/xen/include/xen/vpci.h +++ b/xen/include/xen/vpci.h @@ -XXX,XX +XXX,XX @@ typedef struct { REGISTER_VPCI_CAPABILITY(PCI_EXT_CAP_ID_##name, name, finit, fclean, true) int __must_check vpci_init_header(struct pci_dev *pdev); +int vpci_reinit_ext_capability_list(const struct pci_dev *pdev); /* Assign vPCI to device by adding handlers. */ int __must_check vpci_assign_device(struct pci_dev *pdev); @@ -XXX,XX +XXX,XX @@ bool vpci_ecam_read(pci_sbdf_t sbdf, uns #else /* !CONFIG_HAS_VPCI */ struct vpci_vcpu {}; +static inline int vpci_reinit_ext_capability_list(const struct pci_dev *pdev) +{ + return 0; +} + static inline int vpci_assign_device(struct pci_dev *pdev) { return 0;