[PATCH v2 5/7] xen/pci: initialize BARs

Mykyta Poturai posted 7 patches 1 week ago
[PATCH v2 5/7] xen/pci: initialize BARs
Posted by Mykyta Poturai 1 week ago
From: Stewart Hildebrand <stewart.hildebrand@amd.com>

A PCI device must have valid BARs in order to assign it to a domain.  On
ARM, firmware is unlikely to have initialized the BARs, so we must do
this in Xen. During setup_hwdom_pci_devices(), check if each BAR is
valid. If the BAR happens to already be valid, remove the BAR range from
a rangeset of valid PCI ranges so as to avoid overlap when reserving a
new BAR. If not valid, reserve a new BAR address from the rangeset and
write it to the device.

Avaliable ranges are read from DT during init and stored in distinct
rangesets.

Signed-off-by: Stewart Hildebrand <stewart.hildebrand@amd.com>
Signed-off-by: Mykyta Poturai <mykyta_poturai@epam.com>
---
v1->v2:
* move hwdom_uses_vpci to this patch
* fixup error reporting
---
 xen/arch/arm/include/asm/pci.h     |  7 +++
 xen/arch/arm/pci/pci-host-common.c | 76 ++++++++++++++++++++++++++++
 xen/arch/x86/include/asm/pci.h     | 20 ++++++++
 xen/common/rangeset.c              | 35 +++++++++++++
 xen/drivers/passthrough/pci.c      | 79 ++++++++++++++++++++++++++++++
 xen/include/xen/rangeset.h         |  6 ++-
 6 files changed, 222 insertions(+), 1 deletion(-)

diff --git a/xen/arch/arm/include/asm/pci.h b/xen/arch/arm/include/asm/pci.h
index 7db36e7bc3..46bdc7713b 100644
--- a/xen/arch/arm/include/asm/pci.h
+++ b/xen/arch/arm/include/asm/pci.h
@@ -74,6 +74,8 @@ struct pci_host_bridge {
     struct pci_config_window *child_cfg;
     const struct pci_ops *child_ops;
     void *priv;                      /* Private data of the bridge. */
+    struct rangeset *bar_ranges;
+    struct rangeset *bar_ranges_prefetch;
 };
 
 struct pci_ops {
@@ -154,6 +156,11 @@ void pci_generic_init_bus_range_child(struct dt_device_node *dev,
 
 bool arch_pci_device_physdevop(void);
 
+uint64_t pci_get_new_bar_addr(const struct pci_dev *pdev, uint64_t size,
+                              bool is_64bit, bool prefetch);
+int pci_reserve_bar_range(const struct pci_dev *pdev, uint64_t addr,
+                          uint64_t size, bool prefetch);
+
 #else   /*!CONFIG_HAS_PCI*/
 
 struct pci_dev;
diff --git a/xen/arch/arm/pci/pci-host-common.c b/xen/arch/arm/pci/pci-host-common.c
index 67447d8696..86467e7cf6 100644
--- a/xen/arch/arm/pci/pci-host-common.c
+++ b/xen/arch/arm/pci/pci-host-common.c
@@ -21,6 +21,7 @@
 #include <xen/rwlock.h>
 #include <xen/sched.h>
 #include <xen/vmap.h>
+#include <xen/resource.h>
 
 #include <asm/setup.h>
 
@@ -232,6 +233,21 @@ static int pci_bus_find_domain_nr(struct dt_device_node *dev)
     return domain;
 }
 
+static int add_bar_range(const struct dt_device_node *dev, uint32_t flags,
+                         uint64_t addr, uint64_t len, void *data)
+{
+    struct pci_host_bridge *bridge = data;
+
+    if ( !(flags & IORESOURCE_MEM) )
+        return 0;
+
+    if ( flags & IORESOURCE_PREFETCH )
+        return rangeset_add_range(bridge->bar_ranges_prefetch, addr,
+                                  addr + len - 1);
+    else
+        return rangeset_add_range(bridge->bar_ranges, addr, addr + len - 1);
+}
+
 struct pci_host_bridge *
 pci_host_common_probe(struct dt_device_node *dev,
                       const struct pci_ecam_ops *ops,
@@ -286,6 +302,14 @@ pci_host_common_probe(struct dt_device_node *dev,
     pci_add_host_bridge(bridge);
     pci_add_segment(bridge->segment);
 
+    bridge->bar_ranges = rangeset_new(NULL, "BAR ranges",
+                                      RANGESETF_prettyprint_hex);
+    bridge->bar_ranges_prefetch = rangeset_new(NULL,
+                                               "BAR ranges (prefetchable)",
+                                               RANGESETF_prettyprint_hex);
+    if ( bridge->bar_ranges && bridge->bar_ranges_prefetch )
+        dt_for_each_range(bridge->dt_node, add_bar_range, bridge);
+
     return bridge;
 
 err_child:
@@ -476,6 +500,58 @@ bool pci_check_bar(const struct pci_dev *pdev, mfn_t start, mfn_t end)
 
     return bar_data.is_valid;
 }
+
+uint64_t pci_get_new_bar_addr(const struct pci_dev *pdev, uint64_t size,
+                              bool is_64bit, bool prefetch)
+{
+    struct pci_host_bridge *bridge;
+    struct rangeset *range;
+    uint64_t addr;
+
+    bridge = pci_find_host_bridge(pdev->seg, pdev->bus);
+    if ( !bridge )
+        return 0;
+
+    range = prefetch ? bridge->bar_ranges_prefetch : bridge->bar_ranges;
+
+    if ( size < PAGE_SIZE )
+        size = PAGE_SIZE;
+
+    if ( is_64bit && !rangeset_find_aligned_range(range, size, GB(4), &addr) )
+    {
+        if ( rangeset_remove_range(range, addr, addr + size - 1) )
+            printk(XENLOG_ERR "Failed to remove BAR range %lx-%lx from rangeset\n",
+                   addr, addr + size - 1);
+        return addr;
+    }
+    if ( !rangeset_find_aligned_range(range, size, 0, &addr) )
+    {
+        if ( !is_64bit && addr >= GB(4) )
+            return 0;
+        if ( rangeset_remove_range(range, addr, addr + size - 1) )
+            printk(XENLOG_ERR "Failed to remove BAR range %lx-%lx from rangeset\n",
+                   addr, addr + size - 1);
+        return addr;
+    }
+
+    return 0;
+}
+
+int pci_reserve_bar_range(const struct pci_dev *pdev, uint64_t addr,
+                          uint64_t size, bool prefetch)
+{
+    struct pci_host_bridge *bridge;
+    struct rangeset *range;
+
+    bridge = pci_find_host_bridge(pdev->seg, pdev->bus);
+    if ( !bridge )
+        return 0;
+
+    range = prefetch ? bridge->bar_ranges_prefetch : bridge->bar_ranges;
+
+    return rangeset_remove_range(range, addr, addr + size - 1);
+}
+
 /*
  * Local variables:
  * mode: C
diff --git a/xen/arch/x86/include/asm/pci.h b/xen/arch/x86/include/asm/pci.h
index 0b98081aea..4cc3f2e853 100644
--- a/xen/arch/x86/include/asm/pci.h
+++ b/xen/arch/x86/include/asm/pci.h
@@ -76,4 +76,24 @@ int pci_sanitize_bar_memory(struct rangeset *r);
 
 void pci_setup(void);
 
+/* Unlike ARM, HW domain alyways uses vpci for x86 */
+static inline bool hwdom_uses_vpci(void)
+{
+    return true;
+}
+
+static inline uint64_t pci_get_new_bar_addr(const struct pci_dev *pdev,
+                                            uint64_t size, bool is_64bit,
+                                            bool prefetch)
+{
+    return 0;
+}
+
+static inline int pci_reserve_bar_range(const struct pci_dev *pdev,
+                                        uint64_t addr, uint64_t size,
+                                        bool prefetch)
+{
+    return 0;
+}
+
 #endif /* __X86_PCI_H__ */
diff --git a/xen/common/rangeset.c b/xen/common/rangeset.c
index 0e3b9acd35..c1c6f8610c 100644
--- a/xen/common/rangeset.c
+++ b/xen/common/rangeset.c
@@ -357,6 +357,41 @@ int rangeset_claim_range(struct rangeset *r, unsigned long size,
     return 0;
 }
 
+int rangeset_find_aligned_range(struct rangeset *r, unsigned long size,
+                                unsigned long min, unsigned long *s)
+{
+    struct range *x;
+
+    /* Power of 2 check */
+    if ( (size & (size - 1)) != 0 )
+    {
+        *s = 0;
+        return -EINVAL;
+    }
+
+    read_lock(&r->lock);
+
+    for ( x = first_range(r); x; x = next_range(r, x) )
+    {
+        /* Assumes size is a power of 2 */
+        unsigned long start_aligned = (x->s + size - 1) & ~(size - 1);
+
+        if ( x->e > start_aligned &&
+             (x->e - start_aligned) >= size &&
+             start_aligned >= min )
+        {
+            read_unlock(&r->lock);
+            *s = start_aligned;
+            return 0;
+        }
+    }
+
+    read_unlock(&r->lock);
+    *s = 0;
+
+    return -ENOSPC;
+}
+
 int rangeset_consume_ranges(struct rangeset *r,
                             int (*cb)(unsigned long s, unsigned long e,
                                       void *ctxt, unsigned long *c),
diff --git a/xen/drivers/passthrough/pci.c b/xen/drivers/passthrough/pci.c
index 3edcfa8a04..4f5de9a542 100644
--- a/xen/drivers/passthrough/pci.c
+++ b/xen/drivers/passthrough/pci.c
@@ -1172,6 +1172,80 @@ int __init scan_pci_devices(void)
     return ret;
 }
 
+static void __init cf_check reserve_bar_range(struct pci_dev *pdev, uint8_t reg,
+                                              uint64_t addr, uint64_t size,
+                                              bool is_64bit, bool prefetch)
+{
+    if ( pci_check_bar(pdev, maddr_to_mfn(addr),
+                       maddr_to_mfn(addr + size - 1)) )
+        pci_reserve_bar_range(pdev, addr, size, prefetch);
+}
+
+static void __init cf_check get_new_bar_addr(struct pci_dev *pdev, uint8_t reg,
+                                             uint64_t addr, uint64_t size,
+                                             bool is_64bit, bool prefetch)
+{
+    if ( !pci_check_bar(pdev, maddr_to_mfn(addr),
+                        maddr_to_mfn(addr + size - 1)) )
+    {
+        uint16_t cmd = pci_conf_read16(pdev->sbdf, PCI_COMMAND);
+
+        addr = pci_get_new_bar_addr(pdev, size, is_64bit, prefetch);
+
+        pci_conf_write16(pdev->sbdf, PCI_COMMAND,
+                         cmd & ~(PCI_COMMAND_MEMORY | PCI_COMMAND_IO));
+
+        pci_conf_write32(pdev->sbdf, reg,
+                         (addr & GENMASK(31, 0)) |
+                         (is_64bit ? PCI_BASE_ADDRESS_MEM_TYPE_64 : 0));
+
+        if ( is_64bit )
+            pci_conf_write32(pdev->sbdf, reg + 4, addr >> 32);
+
+        pci_conf_write16(pdev->sbdf, PCI_COMMAND, cmd);
+    }
+}
+
+static int __init cf_check bars_iterate(struct pci_seg *pseg, void *arg)
+{
+    struct pci_dev *pdev;
+    unsigned int i, ret, num_bars = PCI_HEADER_NORMAL_NR_BARS;
+    uint64_t addr, size;
+    void (*cb)(struct pci_dev *, uint8_t, uint64_t, uint64_t, bool, bool) = arg;
+
+    list_for_each_entry ( pdev, &pseg->alldevs_list, alldevs_list )
+    {
+        if ( (pci_conf_read8(pdev->sbdf, PCI_HEADER_TYPE) & 0x7f) ==
+             PCI_HEADER_TYPE_NORMAL )
+        {
+            for ( i = 0; i < num_bars; i += ret )
+            {
+                uint8_t reg = PCI_BASE_ADDRESS_0 + i * 4;
+                bool prefetch;
+
+                if ( (pci_conf_read32(pdev->sbdf, reg) & PCI_BASE_ADDRESS_SPACE)
+                     == PCI_BASE_ADDRESS_SPACE_IO )
+                {
+                    ret = 1;
+                    continue;
+                }
+
+                ret = pci_size_mem_bar(pdev->sbdf, reg, &addr, &size,
+                                      (i == num_bars - 1) ? PCI_BAR_LAST : 0);
+
+                if ( !size )
+                    continue;
+                prefetch = !!(pci_conf_read32(pdev->sbdf, reg) &
+                              PCI_BASE_ADDRESS_MEM_PREFETCH);
+
+                cb(pdev, reg, addr, size, ret == 2, prefetch);
+            }
+        }
+    }
+
+    return 0;
+}
+
 struct setup_hwdom {
     struct domain *d;
     int (*handler)(uint8_t devfn, struct pci_dev *pdev);
@@ -1263,6 +1337,11 @@ void __hwdom_init setup_hwdom_pci_devices(
     struct setup_hwdom ctxt = { .d = d, .handler = handler };
 
     pcidevs_lock();
+    if ( hwdom_uses_vpci() )
+    {
+        pci_segments_iterate(bars_iterate, reserve_bar_range);
+        pci_segments_iterate(bars_iterate, get_new_bar_addr);
+    }
     pci_segments_iterate(_setup_hwdom_pci_devices, &ctxt);
     pcidevs_unlock();
 }
diff --git a/xen/include/xen/rangeset.h b/xen/include/xen/rangeset.h
index 817505badf..e71e810f82 100644
--- a/xen/include/xen/rangeset.h
+++ b/xen/include/xen/rangeset.h
@@ -56,11 +56,15 @@ void rangeset_limit(
 bool __must_check rangeset_is_empty(
     const struct rangeset *r);
 
-/* Add/claim/remove/query/purge a numeric range. */
+/* Add/claim/find/remove/query/purge a numeric range. */
 int __must_check rangeset_add_range(
     struct rangeset *r, unsigned long s, unsigned long e);
 int __must_check rangeset_claim_range(struct rangeset *r, unsigned long size,
                                       unsigned long *s);
+int __must_check rangeset_find_aligned_range(struct rangeset *r,
+                                             unsigned long size,
+                                             unsigned long min,
+                                             unsigned long *s);
 int __must_check rangeset_remove_range(
     struct rangeset *r, unsigned long s, unsigned long e);
 bool __must_check rangeset_contains_range(
-- 
2.34.1
Re: [PATCH v2 5/7] xen/pci: initialize BARs
Posted by Jan Beulich 1 week ago
On 22.10.2025 15:56, Mykyta Poturai wrote:
> @@ -232,6 +233,21 @@ static int pci_bus_find_domain_nr(struct dt_device_node *dev)
>      return domain;
>  }
>  
> +static int add_bar_range(const struct dt_device_node *dev, uint32_t flags,
> +                         uint64_t addr, uint64_t len, void *data)
> +{
> +    struct pci_host_bridge *bridge = data;
> +
> +    if ( !(flags & IORESOURCE_MEM) )
> +        return 0;
> +
> +    if ( flags & IORESOURCE_PREFETCH )
> +        return rangeset_add_range(bridge->bar_ranges_prefetch, addr,
> +                                  addr + len - 1);
> +    else
> +        return rangeset_add_range(bridge->bar_ranges, addr, addr + len - 1);

Here and everywhere else you use rangesets: This loses significant bits on
Arm32. Iirc the plan was to select HAS_PCI only for Arm64, but that's entirely
invisible here. I think there want to be perhaps multiple BUILD_BUG_ON()s, at
the very least.

> --- a/xen/arch/x86/include/asm/pci.h
> +++ b/xen/arch/x86/include/asm/pci.h
> @@ -76,4 +76,24 @@ int pci_sanitize_bar_memory(struct rangeset *r);
>  
>  void pci_setup(void);
>  
> +/* Unlike ARM, HW domain alyways uses vpci for x86 */
> +static inline bool hwdom_uses_vpci(void)
> +{
> +    return true;
> +}

What the comment says is not true. It is true for PVH Dom0. The sole use
of the predicate therefore is questionable, too.

> +static inline uint64_t pci_get_new_bar_addr(const struct pci_dev *pdev,
> +                                            uint64_t size, bool is_64bit,
> +                                            bool prefetch)
> +{
> +    return 0;
> +}
> +
> +static inline int pci_reserve_bar_range(const struct pci_dev *pdev,
> +                                        uint64_t addr, uint64_t size,
> +                                        bool prefetch)
> +{
> +    return 0;
> +}

Neither here nor elsewhere is any word said on what these do, what a
"new BAR range" is, or what "reserving" would mean.

> --- a/xen/common/rangeset.c
> +++ b/xen/common/rangeset.c
> @@ -357,6 +357,41 @@ int rangeset_claim_range(struct rangeset *r, unsigned long size,
>      return 0;
>  }
>  
> +int rangeset_find_aligned_range(struct rangeset *r, unsigned long size,
> +                                unsigned long min, unsigned long *s)
> +{
> +    struct range *x;
> +
> +    /* Power of 2 check */
> +    if ( (size & (size - 1)) != 0 )
> +    {
> +        *s = 0;
> +        return -EINVAL;
> +    }
> +
> +    read_lock(&r->lock);
> +
> +    for ( x = first_range(r); x; x = next_range(r, x) )
> +    {
> +        /* Assumes size is a power of 2 */
> +        unsigned long start_aligned = (x->s + size - 1) & ~(size - 1);
> +
> +        if ( x->e > start_aligned &&
> +             (x->e - start_aligned) >= size &&
> +             start_aligned >= min )
> +        {
> +            read_unlock(&r->lock);

With this and ...

> +            *s = start_aligned;
> +            return 0;
> +        }
> +    }
> +
> +    read_unlock(&r->lock);

... this, how can the caller be sure the result they receive is not stale by
the time they get to look at and use it?

> --- a/xen/drivers/passthrough/pci.c
> +++ b/xen/drivers/passthrough/pci.c
> @@ -1172,6 +1172,80 @@ int __init scan_pci_devices(void)
>      return ret;
>  }
>  
> +static void __init cf_check reserve_bar_range(struct pci_dev *pdev, uint8_t reg,
> +                                              uint64_t addr, uint64_t size,
> +                                              bool is_64bit, bool prefetch)
> +{
> +    if ( pci_check_bar(pdev, maddr_to_mfn(addr),
> +                       maddr_to_mfn(addr + size - 1)) )
> +        pci_reserve_bar_range(pdev, addr, size, prefetch);
> +}
> +
> +static void __init cf_check get_new_bar_addr(struct pci_dev *pdev, uint8_t reg,
> +                                             uint64_t addr, uint64_t size,
> +                                             bool is_64bit, bool prefetch)
> +{
> +    if ( !pci_check_bar(pdev, maddr_to_mfn(addr),
> +                        maddr_to_mfn(addr + size - 1)) )
> +    {
> +        uint16_t cmd = pci_conf_read16(pdev->sbdf, PCI_COMMAND);
> +
> +        addr = pci_get_new_bar_addr(pdev, size, is_64bit, prefetch);
> +
> +        pci_conf_write16(pdev->sbdf, PCI_COMMAND,
> +                         cmd & ~(PCI_COMMAND_MEMORY | PCI_COMMAND_IO));
> +
> +        pci_conf_write32(pdev->sbdf, reg,
> +                         (addr & GENMASK(31, 0)) |
> +                         (is_64bit ? PCI_BASE_ADDRESS_MEM_TYPE_64 : 0));
> +
> +        if ( is_64bit )
> +            pci_conf_write32(pdev->sbdf, reg + 4, addr >> 32);
> +
> +        pci_conf_write16(pdev->sbdf, PCI_COMMAND, cmd);
> +    }
> +}
> +
> +static int __init cf_check bars_iterate(struct pci_seg *pseg, void *arg)
> +{
> +    struct pci_dev *pdev;
> +    unsigned int i, ret, num_bars = PCI_HEADER_NORMAL_NR_BARS;
> +    uint64_t addr, size;
> +    void (*cb)(struct pci_dev *, uint8_t, uint64_t, uint64_t, bool, bool) = arg;

There needs to be some build-time checking that this and the two callbacks
above actually match in type. Likely by way of using a function typedef.

> --- a/xen/include/xen/rangeset.h
> +++ b/xen/include/xen/rangeset.h
> @@ -56,11 +56,15 @@ void rangeset_limit(
>  bool __must_check rangeset_is_empty(
>      const struct rangeset *r);
>  
> -/* Add/claim/remove/query/purge a numeric range. */
> +/* Add/claim/find/remove/query/purge a numeric range. */
>  int __must_check rangeset_add_range(
>      struct rangeset *r, unsigned long s, unsigned long e);
>  int __must_check rangeset_claim_range(struct rangeset *r, unsigned long size,
>                                        unsigned long *s);
> +int __must_check rangeset_find_aligned_range(struct rangeset *r,
> +                                             unsigned long size,
> +                                             unsigned long min,
> +                                             unsigned long *s);

From these parameter names I'm unable to tell what the "aligned" part of
the name is referring to. IOW it may be necessary to have an accompanying
comment.

Jan