[PATCH v2 1/3] vpci: allow queueing of mapping operations
Posted by Stewart Hildebrand 3 months, 1 week ago
Introduce vPCI BAR mapping task queue. Decouple map operation state from
general vPCI state: in particular, move the per-BAR rangeset out of
struct vpci and into the map task struct.

This is preparatory work for further changes that need to perform
multiple unmap/map operations before returning to guest.

Signed-off-by: Stewart Hildebrand <stewart.hildebrand@amd.com>
---
v1->v2:
* new patch

Related: 622bdd962822 ("vpci/header: handle p2m range sets per BAR")
---
 xen/common/domain.c       |   4 +
 xen/drivers/vpci/header.c | 197 +++++++++++++++++++++++---------------
 xen/drivers/vpci/vpci.c   |   3 -
 xen/include/xen/vpci.h    |  16 +++-
 4 files changed, 139 insertions(+), 81 deletions(-)

diff --git a/xen/common/domain.c b/xen/common/domain.c
index 303c338ef293..214795e2d2fe 100644
--- a/xen/common/domain.c
+++ b/xen/common/domain.c
@@ -459,6 +459,10 @@ struct vcpu *vcpu_create(struct domain *d, unsigned int vcpu_id)
         d->vcpu[prev_id]->next_in_list = v;
     }
 
+#ifdef CONFIG_HAS_VPCI
+    INIT_LIST_HEAD(&v->vpci.task_queue);
+#endif
+
     /* Must be called after making new vcpu visible to for_each_vcpu(). */
     vcpu_check_shutdown(v);
 
diff --git a/xen/drivers/vpci/header.c b/xen/drivers/vpci/header.c
index bb76e707992c..df065a5f5faf 100644
--- a/xen/drivers/vpci/header.c
+++ b/xen/drivers/vpci/header.c
@@ -34,7 +34,7 @@
 
 struct map_data {
     struct domain *d;
-    const struct vpci_bar *bar;
+    const struct vpci_bar_map *bar;
     bool map;
 };
 
@@ -173,31 +173,23 @@ static void modify_decoding(const struct pci_dev *pdev, uint16_t cmd,
         ASSERT_UNREACHABLE();
 }
 
-bool vpci_process_pending(struct vcpu *v)
+static bool vpci_process_map_task(struct vpci_map_task *task)
 {
-    const struct pci_dev *pdev = v->vpci.pdev;
-    struct vpci_header *header = NULL;
+    const struct pci_dev *pdev = task->pdev;
     unsigned int i;
 
     if ( !pdev )
         return false;
 
-    read_lock(&v->domain->pci_lock);
-
-    if ( !pdev->vpci || (v->domain != pdev->domain) )
-    {
-        v->vpci.pdev = NULL;
-        read_unlock(&v->domain->pci_lock);
+    if ( !pdev->vpci || (task->domain != pdev->domain) )
         return false;
-    }
 
-    header = &pdev->vpci->header;
-    for ( i = 0; i < ARRAY_SIZE(header->bars); i++ )
+    for ( i = 0; i < ARRAY_SIZE(task->bars); i++ )
     {
-        struct vpci_bar *bar = &header->bars[i];
+        struct vpci_bar_map *bar = &task->bars[i];
         struct map_data data = {
-            .d = v->domain,
-            .map = v->vpci.cmd & PCI_COMMAND_MEMORY,
+            .d = task->domain,
+            .map = task->cmd & PCI_COMMAND_MEMORY,
             .bar = bar,
         };
         int rc;
@@ -208,57 +200,79 @@ bool vpci_process_pending(struct vcpu *v)
         rc = rangeset_consume_ranges(bar->mem, map_range, &data);
 
         if ( rc == -ERESTART )
-        {
-            read_unlock(&v->domain->pci_lock);
             return true;
-        }
 
         if ( rc )
         {
             spin_lock(&pdev->vpci->lock);
             /* Disable memory decoding unconditionally on failure. */
-            modify_decoding(pdev, v->vpci.cmd & ~PCI_COMMAND_MEMORY,
+            modify_decoding(pdev, task->cmd & ~PCI_COMMAND_MEMORY,
                             false);
             spin_unlock(&pdev->vpci->lock);
 
-            /* Clean all the rangesets */
-            for ( i = 0; i < ARRAY_SIZE(header->bars); i++ )
-                if ( !rangeset_is_empty(header->bars[i].mem) )
-                     rangeset_purge(header->bars[i].mem);
-
-            v->vpci.pdev = NULL;
-
-            read_unlock(&v->domain->pci_lock);
-
-            if ( !is_hardware_domain(v->domain) )
-                domain_crash(v->domain);
+            if ( !is_hardware_domain(task->domain) )
+                domain_crash(task->domain);
 
             return false;
         }
     }
-    v->vpci.pdev = NULL;
 
     spin_lock(&pdev->vpci->lock);
-    modify_decoding(pdev, v->vpci.cmd, v->vpci.rom_only);
+    modify_decoding(pdev, task->cmd, task->rom_only);
     spin_unlock(&pdev->vpci->lock);
 
-    read_unlock(&v->domain->pci_lock);
+    return false;
+}
+
+static void destroy_map_task(struct vpci_map_task *task)
+{
+    unsigned int i;
 
+    if ( !task )
+        return;
+
+    for ( i = 0; i < ARRAY_SIZE(task->bars); i++ )
+        rangeset_destroy(task->bars[i].mem);
+
+    xfree(task);
+}
+
+bool vpci_process_pending(struct vcpu *v)
+{
+    struct vpci_map_task *task;
+    read_lock(&v->domain->pci_lock);
+
+    while ( (task = list_first_entry_or_null(&v->vpci.task_queue,
+                                             struct vpci_map_task,
+                                             next)) != NULL )
+    {
+        if ( vpci_process_map_task(task) )
+        {
+            read_unlock(&v->domain->pci_lock);
+            return true;
+        }
+
+        list_del(&task->next);
+        destroy_map_task(task);
+    }
+
+    read_unlock(&v->domain->pci_lock);
     return false;
 }
 
-static int __init apply_map(struct domain *d, const struct pci_dev *pdev,
-                            uint16_t cmd)
+static int __init apply_map(struct vpci_map_task *task)
 {
-    struct vpci_header *header = &pdev->vpci->header;
+    struct domain *d = task->domain;
+    const struct pci_dev *pdev = task->pdev;
+    uint16_t cmd = task->cmd;
     int rc = 0;
     unsigned int i;
 
     ASSERT(rw_is_write_locked(&d->pci_lock));
 
-    for ( i = 0; i < ARRAY_SIZE(header->bars); i++ )
+    for ( i = 0; i < ARRAY_SIZE(task->bars); i++ )
     {
-        struct vpci_bar *bar = &header->bars[i];
+        struct vpci_bar_map *bar = &task->bars[i];
         struct map_data data = { .d = d, .map = true, .bar = bar };
 
         if ( rangeset_is_empty(bar->mem) )
@@ -283,7 +297,48 @@ static int __init apply_map(struct domain *d, const struct pci_dev *pdev,
     return rc;
 }
 
-static void defer_map(const struct pci_dev *pdev, uint16_t cmd, bool rom_only)
+static struct vpci_map_task *alloc_map_task(const struct pci_dev *pdev,
+                                            uint16_t cmd, bool rom_only)
+{
+    struct vpci_map_task *task = xzalloc(struct vpci_map_task);
+    unsigned int i;
+
+    if ( !task )
+        return NULL;
+
+    BUILD_BUG_ON(ARRAY_SIZE(task->bars) != ARRAY_SIZE(pdev->vpci->header.bars));
+
+    for ( i = 0; i < ARRAY_SIZE(task->bars); i++ )
+    {
+        if ( pdev->vpci->header.bars[i].type >= VPCI_BAR_MEM32 )
+        {
+            char str[32];
+
+            snprintf(str, sizeof(str), "%pp:BAR%u", &pdev->sbdf, i);
+
+            task->bars[i].mem = rangeset_new(pdev->domain, str,
+                                             RANGESETF_no_print);
+
+            if ( !task->bars[i].mem )
+            {
+                destroy_map_task(task);
+                return NULL;
+            }
+
+            task->bars[i].addr = pdev->vpci->header.bars[i].addr;
+            task->bars[i].guest_addr = pdev->vpci->header.bars[i].guest_addr;
+        }
+    }
+
+    task->pdev = pdev;
+    task->domain = pdev->domain;
+    task->cmd = cmd;
+    task->rom_only = rom_only;
+
+    return task;
+}
+
+static void defer_map(struct vpci_map_task *task)
 {
     struct vcpu *curr = current;
 
@@ -293,9 +348,9 @@ static void defer_map(const struct pci_dev *pdev, uint16_t cmd, bool rom_only)
      * is mapped. This can lead to parallel mapping operations being
      * started for the same device if the domain is not well-behaved.
      */
-    curr->vpci.pdev = pdev;
-    curr->vpci.cmd = cmd;
-    curr->vpci.rom_only = rom_only;
+
+    list_add_tail(&task->next, &curr->vpci.task_queue);
+
     /*
      * Raise a scheduler softirq in order to prevent the guest from resuming
      * execution with pending mapping operations, to trigger the invocation
@@ -310,11 +365,15 @@ static int modify_bars(const struct pci_dev *pdev, uint16_t cmd, bool rom_only)
     struct pci_dev *tmp;
     const struct domain *d;
     const struct vpci_msix *msix = pdev->vpci->msix;
+    struct vpci_map_task *task = alloc_map_task(pdev, cmd, rom_only);
     unsigned int i, j;
     int rc;
 
     ASSERT(rw_is_write_locked(&pdev->domain->pci_lock));
 
+    if ( !task )
+        return -ENOMEM;
+
     /*
      * Create a rangeset per BAR that represents the current device memory
      * region and compare it against all the currently active BAR memory
@@ -330,12 +389,13 @@ static int modify_bars(const struct pci_dev *pdev, uint16_t cmd, bool rom_only)
     for ( i = 0; i < ARRAY_SIZE(header->bars); i++ )
     {
         struct vpci_bar *bar = &header->bars[i];
+        struct rangeset *mem = task->bars[i].mem;
         unsigned long start = PFN_DOWN(bar->addr);
         unsigned long end = PFN_DOWN(bar->addr + bar->size - 1);
         unsigned long start_guest = PFN_DOWN(bar->guest_addr);
         unsigned long end_guest = PFN_DOWN(bar->guest_addr + bar->size - 1);
 
-        if ( !bar->mem )
+        if ( !mem )
             continue;
 
         if ( !MAPPABLE_BAR(bar) ||
@@ -353,7 +413,7 @@ static int modify_bars(const struct pci_dev *pdev, uint16_t cmd, bool rom_only)
             continue;
         }
 
-        ASSERT(rangeset_is_empty(bar->mem));
+        ASSERT(rangeset_is_empty(mem));
 
         /*
          * Make sure that the guest set address has the same page offset
@@ -365,21 +425,23 @@ static int modify_bars(const struct pci_dev *pdev, uint16_t cmd, bool rom_only)
             gprintk(XENLOG_G_WARNING,
                     "%pp: can't map BAR%u - offset mismatch: %#lx vs %#lx\n",
                     &pdev->sbdf, i, bar->guest_addr, bar->addr);
+            destroy_map_task(task);
             return -EINVAL;
         }
 
-        rc = rangeset_add_range(bar->mem, start_guest, end_guest);
+        rc = rangeset_add_range(mem, start_guest, end_guest);
         if ( rc )
         {
             printk(XENLOG_G_WARNING "Failed to add [%lx, %lx]: %d\n",
                    start_guest, end_guest, rc);
+            destroy_map_task(task);
             return rc;
         }
 
         /* Check for overlap with the already setup BAR ranges. */
         for ( j = 0; j < i; j++ )
         {
-            struct vpci_bar *prev_bar = &header->bars[j];
+            struct vpci_bar_map *prev_bar = &task->bars[j];
 
             if ( rangeset_is_empty(prev_bar->mem) )
                 continue;
@@ -390,16 +452,18 @@ static int modify_bars(const struct pci_dev *pdev, uint16_t cmd, bool rom_only)
                 gprintk(XENLOG_WARNING,
                        "%pp: failed to remove overlapping range [%lx, %lx]: %d\n",
                         &pdev->sbdf, start_guest, end_guest, rc);
+                destroy_map_task(task);
                 return rc;
             }
         }
 
-        rc = pci_sanitize_bar_memory(bar->mem);
+        rc = pci_sanitize_bar_memory(mem);
         if ( rc )
         {
             gprintk(XENLOG_WARNING,
                     "%pp: failed to sanitize BAR#%u memory: %d\n",
                     &pdev->sbdf, i, rc);
+            destroy_map_task(task);
             return rc;
         }
     }
@@ -413,7 +477,7 @@ static int modify_bars(const struct pci_dev *pdev, uint16_t cmd, bool rom_only)
 
         for ( j = 0; j < ARRAY_SIZE(header->bars); j++ )
         {
-            const struct vpci_bar *bar = &header->bars[j];
+            const struct vpci_bar_map *bar = &task->bars[j];
 
             if ( rangeset_is_empty(bar->mem) )
                 continue;
@@ -424,6 +488,7 @@ static int modify_bars(const struct pci_dev *pdev, uint16_t cmd, bool rom_only)
                 gprintk(XENLOG_WARNING,
                        "%pp: failed to remove MSIX table [%lx, %lx]: %d\n",
                         &pdev->sbdf, start, end, rc);
+                destroy_map_task(task);
                 return rc;
             }
         }
@@ -468,8 +533,9 @@ static int modify_bars(const struct pci_dev *pdev, uint16_t cmd, bool rom_only)
                 for ( j = 0; j < ARRAY_SIZE(header->bars); j++)
                 {
                     const struct vpci_bar *bar = &header->bars[j];
+                    struct rangeset *mem = task->bars[j].mem;
 
-                    if ( !rangeset_overlaps_range(bar->mem, start, end) ||
+                    if ( !rangeset_overlaps_range(mem, start, end) ||
                          /*
                           * If only the ROM enable bit is toggled check against
                           * other BARs in the same device for overlaps, but not
@@ -480,12 +546,13 @@ static int modify_bars(const struct pci_dev *pdev, uint16_t cmd, bool rom_only)
                           bar->type == VPCI_BAR_ROM) )
                         continue;
 
-                    rc = rangeset_remove_range(bar->mem, start, end);
+                    rc = rangeset_remove_range(mem, start, end);
                     if ( rc )
                     {
                         gprintk(XENLOG_WARNING,
                                 "%pp: failed to remove [%lx, %lx]: %d\n",
                                 &pdev->sbdf, start, end, rc);
+                        destroy_map_task(task);
                         return rc;
                     }
                 }
@@ -509,10 +576,12 @@ static int modify_bars(const struct pci_dev *pdev, uint16_t cmd, bool rom_only)
          * will always be to establish mappings and process all the BARs.
          */
         ASSERT((cmd & PCI_COMMAND_MEMORY) && !rom_only);
-        return apply_map(pdev->domain, pdev, cmd);
+        rc = apply_map(task);
+        destroy_map_task(task);
+        return rc;
     }
 
-    defer_map(pdev, cmd, rom_only);
+    defer_map(task);
 
     return 0;
 }
@@ -731,18 +800,6 @@ static void cf_check rom_write(
     }
 }
 
-static int bar_add_rangeset(const struct pci_dev *pdev, struct vpci_bar *bar,
-                            unsigned int i)
-{
-    char str[32];
-
-    snprintf(str, sizeof(str), "%pp:BAR%u", &pdev->sbdf, i);
-
-    bar->mem = rangeset_new(pdev->domain, str, RANGESETF_no_print);
-
-    return !bar->mem ? -ENOMEM : 0;
-}
-
 static int vpci_init_capability_list(struct pci_dev *pdev)
 {
     int rc;
@@ -947,10 +1004,6 @@ static int cf_check init_header(struct pci_dev *pdev)
         else
             bars[i].type = VPCI_BAR_MEM32;
 
-        rc = bar_add_rangeset(pdev, &bars[i], i);
-        if ( rc )
-            goto fail;
-
         rc = pci_size_mem_bar(pdev->sbdf, reg, &addr, &size,
                               (i == num_bars - 1) ? PCI_BAR_LAST : 0);
         if ( rc < 0 )
@@ -1003,12 +1056,6 @@ static int cf_check init_header(struct pci_dev *pdev)
                                4, rom);
         if ( rc )
             rom->type = VPCI_BAR_EMPTY;
-        else
-        {
-            rc = bar_add_rangeset(pdev, rom, num_bars);
-            if ( rc )
-                goto fail;
-        }
     }
     else if ( !is_hwdom )
     {
diff --git a/xen/drivers/vpci/vpci.c b/xen/drivers/vpci/vpci.c
index 09988f04c27c..7177cfce355d 100644
--- a/xen/drivers/vpci/vpci.c
+++ b/xen/drivers/vpci/vpci.c
@@ -117,9 +117,6 @@ void vpci_deassign_device(struct pci_dev *pdev)
                 iounmap(pdev->vpci->msix->table[i]);
     }
 
-    for ( i = 0; i < ARRAY_SIZE(pdev->vpci->header.bars); i++ )
-        rangeset_destroy(pdev->vpci->header.bars[i].mem);
-
     xfree(pdev->vpci->msix);
     xfree(pdev->vpci->msi);
     xfree(pdev->vpci);
diff --git a/xen/include/xen/vpci.h b/xen/include/xen/vpci.h
index 6a481a4e89d3..c2e75076691f 100644
--- a/xen/include/xen/vpci.h
+++ b/xen/include/xen/vpci.h
@@ -103,7 +103,6 @@ struct vpci {
             uint64_t guest_addr;
             uint64_t size;
             uint64_t resizable_sizes;
-            struct rangeset *mem;
             enum {
                 VPCI_BAR_EMPTY,
                 VPCI_BAR_IO,
@@ -194,14 +193,25 @@ struct vpci {
 #endif
 };
 
-struct vpci_vcpu {
+#ifdef __XEN__
+struct vpci_map_task {
     /* Per-vcpu structure to store state while {un}mapping of PCI BARs. */
+    struct list_head next;
     const struct pci_dev *pdev;
+    struct domain *domain;
+    struct vpci_bar_map {
+        uint64_t addr;
+        uint64_t guest_addr;
+        struct rangeset *mem;
+    } bars[PCI_HEADER_NORMAL_NR_BARS + 1];
     uint16_t cmd;
     bool rom_only : 1;
 };
 
-#ifdef __XEN__
+struct vpci_vcpu {
+    struct list_head task_queue;
+};
+
 void vpci_dump_msi(void);
 
 /* Make sure there's a hole in the p2m for the MSIX mmio areas. */
-- 
2.50.1
Re: [PATCH v2 1/3] vpci: allow queueing of mapping operations
Posted by Roger Pau Monné 3 months, 1 week ago
On Wed, Jul 23, 2025 at 12:37:41PM -0400, Stewart Hildebrand wrote:
> Introduce vPCI BAR mapping task queue. Decouple map operation state from
> general vPCI state: in particular, move the per-BAR rangeset out of
> struct vpci and into the map task struct.
> 
> This is preparatory work for further changes that need to perform
> multiple unmap/map operations before returning to guest.
> 
> Signed-off-by: Stewart Hildebrand <stewart.hildebrand@amd.com>
> ---
> v1->v2:
> * new patch
> 
> Related: 622bdd962822 ("vpci/header: handle p2m range sets per BAR")
> ---
>  xen/common/domain.c       |   4 +
>  xen/drivers/vpci/header.c | 197 +++++++++++++++++++++++---------------
>  xen/drivers/vpci/vpci.c   |   3 -
>  xen/include/xen/vpci.h    |  16 +++-
>  4 files changed, 139 insertions(+), 81 deletions(-)
> 
> diff --git a/xen/common/domain.c b/xen/common/domain.c
> index 303c338ef293..214795e2d2fe 100644
> --- a/xen/common/domain.c
> +++ b/xen/common/domain.c
> @@ -459,6 +459,10 @@ struct vcpu *vcpu_create(struct domain *d, unsigned int vcpu_id)
>          d->vcpu[prev_id]->next_in_list = v;
>      }
>  
> +#ifdef CONFIG_HAS_VPCI
> +    INIT_LIST_HEAD(&v->vpci.task_queue);
> +#endif
> +
>      /* Must be called after making new vcpu visible to for_each_vcpu(). */
>      vcpu_check_shutdown(v);
>  
> diff --git a/xen/drivers/vpci/header.c b/xen/drivers/vpci/header.c
> index bb76e707992c..df065a5f5faf 100644
> --- a/xen/drivers/vpci/header.c
> +++ b/xen/drivers/vpci/header.c
> @@ -34,7 +34,7 @@
>  
>  struct map_data {
>      struct domain *d;
> -    const struct vpci_bar *bar;
> +    const struct vpci_bar_map *bar;
>      bool map;
>  };
>  
> @@ -173,31 +173,23 @@ static void modify_decoding(const struct pci_dev *pdev, uint16_t cmd,
>          ASSERT_UNREACHABLE();
>  }
>  
> -bool vpci_process_pending(struct vcpu *v)
> +static bool vpci_process_map_task(struct vpci_map_task *task)
>  {
> -    const struct pci_dev *pdev = v->vpci.pdev;
> -    struct vpci_header *header = NULL;
> +    const struct pci_dev *pdev = task->pdev;
>      unsigned int i;
>  
>      if ( !pdev )
>          return false;
>  
> -    read_lock(&v->domain->pci_lock);
> -
> -    if ( !pdev->vpci || (v->domain != pdev->domain) )
> -    {
> -        v->vpci.pdev = NULL;
> -        read_unlock(&v->domain->pci_lock);
> +    if ( !pdev->vpci || (task->domain != pdev->domain) )
>          return false;
> -    }
>  
> -    header = &pdev->vpci->header;
> -    for ( i = 0; i < ARRAY_SIZE(header->bars); i++ )
> +    for ( i = 0; i < ARRAY_SIZE(task->bars); i++ )
>      {
> -        struct vpci_bar *bar = &header->bars[i];
> +        struct vpci_bar_map *bar = &task->bars[i];
>          struct map_data data = {
> -            .d = v->domain,
> -            .map = v->vpci.cmd & PCI_COMMAND_MEMORY,
> +            .d = task->domain,
> +            .map = task->cmd & PCI_COMMAND_MEMORY,
>              .bar = bar,
>          };
>          int rc;
> @@ -208,57 +200,79 @@ bool vpci_process_pending(struct vcpu *v)
>          rc = rangeset_consume_ranges(bar->mem, map_range, &data);
>  
>          if ( rc == -ERESTART )
> -        {
> -            read_unlock(&v->domain->pci_lock);
>              return true;
> -        }
>  
>          if ( rc )
>          {
>              spin_lock(&pdev->vpci->lock);
>              /* Disable memory decoding unconditionally on failure. */
> -            modify_decoding(pdev, v->vpci.cmd & ~PCI_COMMAND_MEMORY,
> +            modify_decoding(pdev, task->cmd & ~PCI_COMMAND_MEMORY,
>                              false);
>              spin_unlock(&pdev->vpci->lock);
>  
> -            /* Clean all the rangesets */
> -            for ( i = 0; i < ARRAY_SIZE(header->bars); i++ )
> -                if ( !rangeset_is_empty(header->bars[i].mem) )
> -                     rangeset_purge(header->bars[i].mem);
> -
> -            v->vpci.pdev = NULL;
> -
> -            read_unlock(&v->domain->pci_lock);
> -
> -            if ( !is_hardware_domain(v->domain) )
> -                domain_crash(v->domain);
> +            if ( !is_hardware_domain(task->domain) )
> +                domain_crash(task->domain);
>  
>              return false;
>          }
>      }
> -    v->vpci.pdev = NULL;
>  
>      spin_lock(&pdev->vpci->lock);
> -    modify_decoding(pdev, v->vpci.cmd, v->vpci.rom_only);
> +    modify_decoding(pdev, task->cmd, task->rom_only);
>      spin_unlock(&pdev->vpci->lock);
>  
> -    read_unlock(&v->domain->pci_lock);
> +    return false;
> +}
> +
> +static void destroy_map_task(struct vpci_map_task *task)
> +{
> +    unsigned int i;
>  
> +    if ( !task )
> +        return;

Maybe I'm missing a case, but is there any caller that can possibly
pass a NULL task?  Not that having the check is bad, just wondering
myself.  You might consider adding an ASSERT_UNREACHABLE() if there
are no callers that pass NULL (to ensure correctness of the calling
logic, not that the function can't deal with them).
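Something like (untested sketch):

    if ( !task )
    {
        ASSERT_UNREACHABLE();
        return;
    }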

> +
> +    for ( i = 0; i < ARRAY_SIZE(task->bars); i++ )
> +        rangeset_destroy(task->bars[i].mem);
> +
> +    xfree(task);
> +}
> +
> +bool vpci_process_pending(struct vcpu *v)
> +{
> +    struct vpci_map_task *task;

Newline.

> +    read_lock(&v->domain->pci_lock);
> +
> +    while ( (task = list_first_entry_or_null(&v->vpci.task_queue,
> +                                             struct vpci_map_task,
> +                                             next)) != NULL )
> +    {
> +        if ( vpci_process_map_task(task) )
> +        {
> +            read_unlock(&v->domain->pci_lock);
> +            return true;
> +        }
> +
> +        list_del(&task->next);
> +        destroy_map_task(task);
> +    }
> +
> +    read_unlock(&v->domain->pci_lock);
>      return false;
>  }
>  
> -static int __init apply_map(struct domain *d, const struct pci_dev *pdev,
> -                            uint16_t cmd)
> +static int __init apply_map(struct vpci_map_task *task)
>  {
> -    struct vpci_header *header = &pdev->vpci->header;
> +    struct domain *d = task->domain;
> +    const struct pci_dev *pdev = task->pdev;
> +    uint16_t cmd = task->cmd;
>      int rc = 0;
>      unsigned int i;
>  
>      ASSERT(rw_is_write_locked(&d->pci_lock));
>  
> -    for ( i = 0; i < ARRAY_SIZE(header->bars); i++ )
> +    for ( i = 0; i < ARRAY_SIZE(task->bars); i++ )
>      {
> -        struct vpci_bar *bar = &header->bars[i];
> +        struct vpci_bar_map *bar = &task->bars[i];
>          struct map_data data = { .d = d, .map = true, .bar = bar };
>  
>          if ( rangeset_is_empty(bar->mem) )
> @@ -283,7 +297,48 @@ static int __init apply_map(struct domain *d, const struct pci_dev *pdev,
>      return rc;
>  }
>  
> -static void defer_map(const struct pci_dev *pdev, uint16_t cmd, bool rom_only)
> +static struct vpci_map_task *alloc_map_task(const struct pci_dev *pdev,
> +                                            uint16_t cmd, bool rom_only)
> +{
> +    struct vpci_map_task *task = xzalloc(struct vpci_map_task);

xvzalloc() preferably.

This however introduces run-time allocations as a result of guest
actions, which is not ideal IMO.  It would be preferable to do those
allocations as part of the header initialization, and re-use them.
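I.e. (sketch):

    struct vpci_map_task *task = xvzalloc(struct vpci_map_task);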

> +    unsigned int i;
> +
> +    if ( !task )
> +        return NULL;
> +
> +    BUILD_BUG_ON(ARRAY_SIZE(task->bars) != ARRAY_SIZE(pdev->vpci->header.bars));
> +
> +    for ( i = 0; i < ARRAY_SIZE(task->bars); i++ )
> +    {
> +        if ( pdev->vpci->header.bars[i].type >= VPCI_BAR_MEM32 )

You possibly don't need to prepare this for VPCI_BAR_MEM64_HI BAR
types?

I think you want to use the existing MAPPABLE_BAR() macro to check
whether a BAR requires backing mapping infrastructure.

Also, a nit, but you might want to do something like:

if ( !MAPPABLE_BAR(&pdev->vpci->header.bars[i]) )
    continue;

To reduce a level of indentation.

> +        {
> +            char str[32];
> +
> +            snprintf(str, sizeof(str), "%pp:BAR%u", &pdev->sbdf, i);
> +
> +            task->bars[i].mem = rangeset_new(pdev->domain, str,
> +                                             RANGESETF_no_print);
> +
> +            if ( !task->bars[i].mem )
> +            {
> +                destroy_map_task(task);
> +                return NULL;
> +            }
> +
> +            task->bars[i].addr = pdev->vpci->header.bars[i].addr;
> +            task->bars[i].guest_addr = pdev->vpci->header.bars[i].guest_addr;
> +        }
> +    }
> +
> +    task->pdev = pdev;
> +    task->domain = pdev->domain;
> +    task->cmd = cmd;
> +    task->rom_only = rom_only;
> +
> +    return task;
> +}
> +
> +static void defer_map(struct vpci_map_task *task)
>  {
>      struct vcpu *curr = current;
>  
> @@ -293,9 +348,9 @@ static void defer_map(const struct pci_dev *pdev, uint16_t cmd, bool rom_only)
>       * is mapped. This can lead to parallel mapping operations being
>       * started for the same device if the domain is not well-behaved.
>       */
> -    curr->vpci.pdev = pdev;
> -    curr->vpci.cmd = cmd;
> -    curr->vpci.rom_only = rom_only;
> +
> +    list_add_tail(&task->next, &curr->vpci.task_queue);

You could probably assert the list is empty before adding, albeit that
won't be the case after further patches are added?
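E.g. (valid for now, to be dropped by later patches):

    ASSERT(list_empty(&curr->vpci.task_queue));
    list_add_tail(&task->next, &curr->vpci.task_queue);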

>      /*
>       * Raise a scheduler softirq in order to prevent the guest from resuming
>       * execution with pending mapping operations, to trigger the invocation
> @@ -310,11 +365,15 @@ static int modify_bars(const struct pci_dev *pdev, uint16_t cmd, bool rom_only)
>      struct pci_dev *tmp;
>      const struct domain *d;
>      const struct vpci_msix *msix = pdev->vpci->msix;
> +    struct vpci_map_task *task = alloc_map_task(pdev, cmd, rom_only);
>      unsigned int i, j;
>      int rc;
>  
>      ASSERT(rw_is_write_locked(&pdev->domain->pci_lock));
>  
> +    if ( !task )
> +        return -ENOMEM;
> +
>      /*
>       * Create a rangeset per BAR that represents the current device memory
>       * region and compare it against all the currently active BAR memory
> @@ -330,12 +389,13 @@ static int modify_bars(const struct pci_dev *pdev, uint16_t cmd, bool rom_only)
>      for ( i = 0; i < ARRAY_SIZE(header->bars); i++ )
>      {
>          struct vpci_bar *bar = &header->bars[i];
> +        struct rangeset *mem = task->bars[i].mem;
>          unsigned long start = PFN_DOWN(bar->addr);
>          unsigned long end = PFN_DOWN(bar->addr + bar->size - 1);
>          unsigned long start_guest = PFN_DOWN(bar->guest_addr);
>          unsigned long end_guest = PFN_DOWN(bar->guest_addr + bar->size - 1);
>  
> -        if ( !bar->mem )
> +        if ( !mem )
>              continue;
>  
>          if ( !MAPPABLE_BAR(bar) ||
> @@ -353,7 +413,7 @@ static int modify_bars(const struct pci_dev *pdev, uint16_t cmd, bool rom_only)
>              continue;
>          }
>  
> -        ASSERT(rangeset_is_empty(bar->mem));
> +        ASSERT(rangeset_is_empty(mem));
>  
>          /*
>           * Make sure that the guest set address has the same page offset
> @@ -365,21 +425,23 @@ static int modify_bars(const struct pci_dev *pdev, uint16_t cmd, bool rom_only)
>              gprintk(XENLOG_G_WARNING,
>                      "%pp: can't map BAR%u - offset mismatch: %#lx vs %#lx\n",
>                      &pdev->sbdf, i, bar->guest_addr, bar->addr);
> +            destroy_map_task(task);
>              return -EINVAL;
>          }
>  
> -        rc = rangeset_add_range(bar->mem, start_guest, end_guest);
> +        rc = rangeset_add_range(mem, start_guest, end_guest);
>          if ( rc )
>          {
>              printk(XENLOG_G_WARNING "Failed to add [%lx, %lx]: %d\n",
>                     start_guest, end_guest, rc);
> +            destroy_map_task(task);
>              return rc;
>          }
>  
>          /* Check for overlap with the already setup BAR ranges. */
>          for ( j = 0; j < i; j++ )
>          {
> -            struct vpci_bar *prev_bar = &header->bars[j];
> +            struct vpci_bar_map *prev_bar = &task->bars[j];
>  
>              if ( rangeset_is_empty(prev_bar->mem) )
>                  continue;
> @@ -390,16 +452,18 @@ static int modify_bars(const struct pci_dev *pdev, uint16_t cmd, bool rom_only)
>                  gprintk(XENLOG_WARNING,
>                         "%pp: failed to remove overlapping range [%lx, %lx]: %d\n",
>                          &pdev->sbdf, start_guest, end_guest, rc);
> +                destroy_map_task(task);
>                  return rc;
>              }
>          }
>  
> -        rc = pci_sanitize_bar_memory(bar->mem);
> +        rc = pci_sanitize_bar_memory(mem);
>          if ( rc )
>          {
>              gprintk(XENLOG_WARNING,
>                      "%pp: failed to sanitize BAR#%u memory: %d\n",
>                      &pdev->sbdf, i, rc);
> +            destroy_map_task(task);
>              return rc;
>          }
>      }
> @@ -413,7 +477,7 @@ static int modify_bars(const struct pci_dev *pdev, uint16_t cmd, bool rom_only)
>  
>          for ( j = 0; j < ARRAY_SIZE(header->bars); j++ )
>          {
> -            const struct vpci_bar *bar = &header->bars[j];
> +            const struct vpci_bar_map *bar = &task->bars[j];
>  
>              if ( rangeset_is_empty(bar->mem) )
>                  continue;
> @@ -424,6 +488,7 @@ static int modify_bars(const struct pci_dev *pdev, uint16_t cmd, bool rom_only)
>                  gprintk(XENLOG_WARNING,
>                         "%pp: failed to remove MSIX table [%lx, %lx]: %d\n",
>                          &pdev->sbdf, start, end, rc);
> +                destroy_map_task(task);
>                  return rc;
>              }
>          }
> @@ -468,8 +533,9 @@ static int modify_bars(const struct pci_dev *pdev, uint16_t cmd, bool rom_only)
>                  for ( j = 0; j < ARRAY_SIZE(header->bars); j++)
>                  {
>                      const struct vpci_bar *bar = &header->bars[j];
> +                    struct rangeset *mem = task->bars[j].mem;
>  
> -                    if ( !rangeset_overlaps_range(bar->mem, start, end) ||
> +                    if ( !rangeset_overlaps_range(mem, start, end) ||
>                           /*
>                            * If only the ROM enable bit is toggled check against
>                            * other BARs in the same device for overlaps, but not
> @@ -480,12 +546,13 @@ static int modify_bars(const struct pci_dev *pdev, uint16_t cmd, bool rom_only)
>                            bar->type == VPCI_BAR_ROM) )
>                          continue;
>  
> -                    rc = rangeset_remove_range(bar->mem, start, end);
> +                    rc = rangeset_remove_range(mem, start, end);
>                      if ( rc )
>                      {
>                          gprintk(XENLOG_WARNING,
>                                  "%pp: failed to remove [%lx, %lx]: %d\n",
>                                  &pdev->sbdf, start, end, rc);
> +                        destroy_map_task(task);
>                          return rc;
>                      }
>                  }
> @@ -509,10 +576,12 @@ static int modify_bars(const struct pci_dev *pdev, uint16_t cmd, bool rom_only)
>           * will always be to establish mappings and process all the BARs.
>           */
>          ASSERT((cmd & PCI_COMMAND_MEMORY) && !rom_only);
> -        return apply_map(pdev->domain, pdev, cmd);
> +        rc = apply_map(task);
> +        destroy_map_task(task);

All the destroy calls are a bit repetitive; you could consider using
an error label or similar.
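Something like (untested sketch):

 fail:
    destroy_map_task(task);
    return rc;

with the failure paths becoming e.g. "rc = -EINVAL; goto fail;".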

> +        return rc;
>      }
>  
> -    defer_map(pdev, cmd, rom_only);
> +    defer_map(task);
>  
>      return 0;
>  }
> @@ -731,18 +800,6 @@ static void cf_check rom_write(
>      }
>  }
>  
> -static int bar_add_rangeset(const struct pci_dev *pdev, struct vpci_bar *bar,
> -                            unsigned int i)
> -{
> -    char str[32];
> -
> -    snprintf(str, sizeof(str), "%pp:BAR%u", &pdev->sbdf, i);
> -
> -    bar->mem = rangeset_new(pdev->domain, str, RANGESETF_no_print);
> -
> -    return !bar->mem ? -ENOMEM : 0;
> -}
> -
>  static int vpci_init_capability_list(struct pci_dev *pdev)
>  {
>      int rc;
> @@ -947,10 +1004,6 @@ static int cf_check init_header(struct pci_dev *pdev)
>          else
>              bars[i].type = VPCI_BAR_MEM32;
>  
> -        rc = bar_add_rangeset(pdev, &bars[i], i);
> -        if ( rc )
> -            goto fail;
> -
>          rc = pci_size_mem_bar(pdev->sbdf, reg, &addr, &size,
>                                (i == num_bars - 1) ? PCI_BAR_LAST : 0);
>          if ( rc < 0 )
> @@ -1003,12 +1056,6 @@ static int cf_check init_header(struct pci_dev *pdev)
>                                 4, rom);
>          if ( rc )
>              rom->type = VPCI_BAR_EMPTY;
> -        else
> -        {
> -            rc = bar_add_rangeset(pdev, rom, num_bars);
> -            if ( rc )
> -                goto fail;
> -        }
>      }
>      else if ( !is_hwdom )
>      {
> diff --git a/xen/drivers/vpci/vpci.c b/xen/drivers/vpci/vpci.c
> index 09988f04c27c..7177cfce355d 100644
> --- a/xen/drivers/vpci/vpci.c
> +++ b/xen/drivers/vpci/vpci.c
> @@ -117,9 +117,6 @@ void vpci_deassign_device(struct pci_dev *pdev)
>                  iounmap(pdev->vpci->msix->table[i]);
>      }
>  
> -    for ( i = 0; i < ARRAY_SIZE(pdev->vpci->header.bars); i++ )
> -        rangeset_destroy(pdev->vpci->header.bars[i].mem);
> -
>      xfree(pdev->vpci->msix);
>      xfree(pdev->vpci->msi);
>      xfree(pdev->vpci);
> diff --git a/xen/include/xen/vpci.h b/xen/include/xen/vpci.h
> index 6a481a4e89d3..c2e75076691f 100644
> --- a/xen/include/xen/vpci.h
> +++ b/xen/include/xen/vpci.h
> @@ -103,7 +103,6 @@ struct vpci {
>              uint64_t guest_addr;
>              uint64_t size;
>              uint64_t resizable_sizes;
> -            struct rangeset *mem;
>              enum {
>                  VPCI_BAR_EMPTY,
>                  VPCI_BAR_IO,
> @@ -194,14 +193,25 @@ struct vpci {
>  #endif
>  };
>  
> -struct vpci_vcpu {
> +#ifdef __XEN__
> +struct vpci_map_task {
>      /* Per-vcpu structure to store state while {un}mapping of PCI BARs. */
> +    struct list_head next;
>      const struct pci_dev *pdev;
> +    struct domain *domain;
> +    struct vpci_bar_map {
> +        uint64_t addr;
> +        uint64_t guest_addr;
> +        struct rangeset *mem;
> +    } bars[PCI_HEADER_NORMAL_NR_BARS + 1];

You might be able to use ARRAY_SIZE() to derive the number of elements
from the existing array in vpci_header struct.
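E.g. (untested):

    } bars[ARRAY_SIZE(((struct vpci *)NULL)->header.bars)];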

Thanks, Roger.
Re: [PATCH v2 1/3] vpci: allow queueing of mapping operations
Posted by Roger Pau Monné 3 months, 1 week ago
On Thu, Jul 24, 2025 at 06:44:32PM +0200, Roger Pau Monné wrote:
> On Wed, Jul 23, 2025 at 12:37:41PM -0400, Stewart Hildebrand wrote:
> > @@ -283,7 +297,48 @@ static int __init apply_map(struct domain *d, const struct pci_dev *pdev,
> >      return rc;
> >  }
> >  
> > -static void defer_map(const struct pci_dev *pdev, uint16_t cmd, bool rom_only)
> > +static struct vpci_map_task *alloc_map_task(const struct pci_dev *pdev,
> > +                                            uint16_t cmd, bool rom_only)
> > +{
> > +    struct vpci_map_task *task = xzalloc(struct vpci_map_task);
> 
> xvzalloc() preferably.
> 
> This however introduces run-time allocations as a result of guest
> actions, which is not ideal IMO.  It would be preferable to do those
> allocations as part of the header initialization, and re-use them.

I've been thinking over this, as I've realized that while commenting
on it, I didn't provide any alternatives.

The usage of rangesets to figure out the regions to map is already not
optimal, as adding/removing from a rangeset can lead to memory
allocations.  It would be good if we could create rangesets with a
pre-allocated number of ranges (iow: a pool of struct ranges), but
that's for another patchset.  I think Jan already commented on this
aspect long time ago.

I'm considering whether to allocate the deferred mapping structures
per-vCPU instead of per-device.  That would for example mean moving
the current vpci_bar->mem rangeset so it's allocated in vpci_vcpu
struct instead.  The point would be to not have the rangesets per
device (because there can be a lot of devices, specially for the
hardware domain), but instead have those per-vCPU.  This should work
because a vCPU can only queue a single vPCI operation, from a single
device.

It should then be possible to allocate the deferred mapping structures
at vCPU creation.  I also ponder if we really need a linked list to
queue them; AFAIK there can only ever be an unmapping and a mapping
operation pending (so 2 operations at most).  Hence we could use a
more "fixed" structure like an array.  For example in struct vpci_vcpu
you could introduce a struct vpci_map_task task[2] field?

Sorry, I know this is not a minor change to request.  It shouldn't
change the overall logic much, but it would inevitably affect the
code.  Let me know what you think.

Thanks, Roger.

Re: [PATCH v2 1/3] vpci: allow queueing of mapping operations
Posted by Stewart Hildebrand 2 months, 4 weeks ago
On 7/25/25 03:58, Roger Pau Monné wrote:
> On Thu, Jul 24, 2025 at 06:44:32PM +0200, Roger Pau Monné wrote:
>> On Wed, Jul 23, 2025 at 12:37:41PM -0400, Stewart Hildebrand wrote:
>>> @@ -283,7 +297,48 @@ static int __init apply_map(struct domain *d, const struct pci_dev *pdev,
>>>      return rc;
>>>  }
>>>  
>>> -static void defer_map(const struct pci_dev *pdev, uint16_t cmd, bool rom_only)
>>> +static struct vpci_map_task *alloc_map_task(const struct pci_dev *pdev,
>>> +                                            uint16_t cmd, bool rom_only)
>>> +{
>>> +    struct vpci_map_task *task = xzalloc(struct vpci_map_task);
>>
>> xvzalloc() preferably.
>>
>> This however introduces run-time allocations as a result of guest
>> actions, which is not ideal IMO.  It would be preferable to do those
>> allocations as part of the header initialization, and re-use them.
> 
> I've been thinking over this, as I've realized that while commenting
> on it, I didn't provide any alternatives.
> 
> The usage of rangesets to figure out the regions to map is already not
> optimal, as adding/removing from a rangeset can lead to memory
> allocations.  It would be good if we could create rangesets with a
> pre-allocated number of ranges (iow: a pool of struct ranges), but
> that's for another patchset.  I think Jan already commented on this
> aspect long time ago.

+1

> I'm considering whether to allocate the deferred mapping structures
> per-vCPU instead of per-device.  That would for example mean moving
> the current vpci_bar->mem rangeset so it's allocated in vpci_vcpu
> struct instead.  The point would be to not have the rangesets per
> device (because there can be a lot of devices, specially for the
> hardware domain), but instead have those per-vCPU.  This should work
> because a vCPU can only queue a single vPCI operation, from a single
> device.
> 
> It should then be possible to allocate the deferred mapping structures
> at vCPU creation.  I also ponder if we really need a linked list to
> queue them; AFAIK there can only ever be an unmapping and a mapping
> operation pending (so 2 operations at most).  Hence we could use a
> more "fixed" structure like an array.  For example in struct vpci_vcpu
> you could introduce a struct vpci_map_task task[2] field?
> 
> Sorry, I know this is not a minor change to request.  It shouldn't
> change the overall logic much, but it would inevitably affect the
> code.  Let me know what you think.

Thanks for the feedback and suggestion. Yeah, I'll give this a try.
Here's roughly what I'm thinking so far. I'll keep playing with it.

diff --git a/xen/common/domain.c b/xen/common/domain.c
index 5241a1629eeb..942c9fe7d364 100644
--- a/xen/common/domain.c
+++ b/xen/common/domain.c
@@ -387,6 +387,16 @@ static int vmtrace_alloc_buffer(struct vcpu *v)
  */
 static int vcpu_teardown(struct vcpu *v)
 {
+#ifdef CONFIG_HAS_VPCI
+    for ( unsigned int i = 0; i < ARRAY_SIZE(v->vpci.task); i++ )
+    {
+        struct vpci_map_task *task = &v->vpci.task[i];
+
+        for ( unsigned int j = 0; j < ARRAY_SIZE(task->bars); j++ )
+            rangeset_destroy(task->bars[j].mem);
+    }
+#endif
+
     vmtrace_free_buffer(v);
 
     return 0;
@@ -467,6 +477,26 @@ struct vcpu *vcpu_create(struct domain *d, unsigned int vcpu_id)
         d->vcpu[prev_id]->next_in_list = v;
     }
 
+#ifdef CONFIG_HAS_VPCI
+    for ( unsigned int i = 0; i < ARRAY_SIZE(v->vpci.task); i++ )
+    {
+        struct vpci_map_task *task = &v->vpci.task[i];
+
+        for ( unsigned int j = 0; j < ARRAY_SIZE(task->bars); j++ )
+        {
+            struct vpci_bar_map *bar = &task->bars[j];
+            char str[32];
+
+            snprintf(str, sizeof(str), "PCI map vcpu %u task %u BAR %u", vcpu_id, i, j);
+
+            bar->mem = rangeset_new(v->domain, str, RANGESETF_no_print);
+
+            if ( !bar->mem )
+                goto fail_sched;
+        }
+    }
+#endif
+
     /* Must be called after making new vcpu visible to for_each_vcpu(). */
     vcpu_check_shutdown(v);
 
diff --git a/xen/include/xen/vpci.h b/xen/include/xen/vpci.h
index 17cfecb0aabf..afe78b00ffc9 100644
--- a/xen/include/xen/vpci.h
+++ b/xen/include/xen/vpci.h
@@ -116,7 +116,6 @@ struct vpci {
             uint64_t guest_addr;
             uint64_t size;
             uint64_t resizable_sizes;
-            struct rangeset *mem;
             enum {
                 VPCI_BAR_EMPTY,
                 VPCI_BAR_IO,
@@ -207,14 +206,23 @@ struct vpci {
 #endif
 };
 
+#ifdef __XEN__
 struct vpci_vcpu {
     /* Per-vcpu structure to store state while {un}mapping of PCI BARs. */
     const struct pci_dev *pdev;
-    uint16_t cmd;
-    bool rom_only : 1;
+    struct domain *domain;
+    unsigned int nr_pending_ops;
+    struct vpci_map_task {
+        struct vpci_bar_map {
+            uint64_t addr;
+            uint64_t guest_addr;
+            struct rangeset *mem;
+        } bars[PCI_HEADER_NORMAL_NR_BARS + 1];
+        uint16_t cmd;
+        bool rom_only : 1;
+    } task[2];
 };
 
-#ifdef __XEN__
 void vpci_dump_msi(void);
 
 /* Make sure there's a hole in the p2m for the MSIX mmio areas. */

Re: [PATCH v2 1/3] vpci: allow queueing of mapping operations
Posted by Roger Pau Monné 2 months, 3 weeks ago
On Fri, Aug 01, 2025 at 05:06:32PM -0400, Stewart Hildebrand wrote:
> On 7/25/25 03:58, Roger Pau Monné wrote:
> > On Thu, Jul 24, 2025 at 06:44:32PM +0200, Roger Pau Monné wrote:
> >> On Wed, Jul 23, 2025 at 12:37:41PM -0400, Stewart Hildebrand wrote:
> >>> @@ -283,7 +297,48 @@ static int __init apply_map(struct domain *d, const struct pci_dev *pdev,
> >>>      return rc;
> >>>  }
> >>>  
> >>> -static void defer_map(const struct pci_dev *pdev, uint16_t cmd, bool rom_only)
> >>> +static struct vpci_map_task *alloc_map_task(const struct pci_dev *pdev,
> >>> +                                            uint16_t cmd, bool rom_only)
> >>> +{
> >>> +    struct vpci_map_task *task = xzalloc(struct vpci_map_task);
> >>
> >> xvzalloc() preferably.
> >>
> >> This however introduces run-time allocations as a result of guest
> >> actions, which is not ideal IMO.  It would be preferable to do those
> >> allocations as part of the header initialization, and re-use them.
> > 
> > I've been thinking over this, as I've realized that while commenting
> > on it, I didn't provide any alternatives.
> > 
> > The usage of rangesets to figure out the regions to map is already not
> > optimal, as adding/removing from a rangeset can lead to memory
> > allocations.  It would be good if we could create rangesets with a
> > pre-allocated number of ranges (iow: a pool of struct ranges), but
> > that's for another patchset.  I think Jan already commented on this
> > aspect long time ago.
> 
> +1
> 
> > I'm considering whether to allocate the deferred mapping structures
> > per-vCPU instead of per-device.  That would for example mean moving
> > the current vpci_bar->mem rangeset so it's allocated in vpci_vcpu
> > struct instead.  The point would be to not have the rangesets per
> > device (because there can be a lot of devices, specially for the
> > hardware domain), but instead have those per-vCPU.  This should work
> > because a vCPU can only queue a single vPCI operation, from a single
> > device.
> > 
> > It should then be possible to allocate the deferred mapping structures
> > at vCPU creation.  I also ponder if we really need a linked list to
> > queue them; AFAIK there can only ever be an unmapping and a mapping
> > operation pending (so 2 operations at most).  Hence we could use a
> > more "fixed" structure like an array.  For example in struct vpci_vcpu
> > you could introduce a struct vpci_map_task task[2] field?
> > 
> > Sorry, I know this is not a minor change to request.  It shouldn't
> > change the overall logic much, but it would inevitably affect the
> > code.  Let me know what you think.
> 
> Thanks for the feedback and suggestion. Yeah, I'll give this a try.
> Here's roughly what I'm thinking so far. I'll keep playing with it.
> 
> diff --git a/xen/common/domain.c b/xen/common/domain.c
> index 5241a1629eeb..942c9fe7d364 100644
> --- a/xen/common/domain.c
> +++ b/xen/common/domain.c
> @@ -387,6 +387,16 @@ static int vmtrace_alloc_buffer(struct vcpu *v)
>   */
>  static int vcpu_teardown(struct vcpu *v)
>  {
> +#ifdef CONFIG_HAS_VPCI
> +    for ( unsigned int i = 0; i < ARRAY_SIZE(v->vpci.task); i++ )
> +    {
> +        struct vpci_map_task *task = &v->vpci.task[i];
> +
> +        for ( unsigned int j = 0; j < ARRAY_SIZE(task->bars); j++ )
> +            rangeset_destroy(task->bars[j].mem);

You might want to additionally do:

task->bars[j].mem = NULL;

> +    }
> +#endif
> +
>      vmtrace_free_buffer(v);
>  
>      return 0;
> @@ -467,6 +477,26 @@ struct vcpu *vcpu_create(struct domain *d, unsigned int vcpu_id)
>          d->vcpu[prev_id]->next_in_list = v;
>      }
>  
> +#ifdef CONFIG_HAS_VPCI
> +    for ( unsigned int i = 0; i < ARRAY_SIZE(v->vpci.task); i++ )
> +    {
> +        struct vpci_map_task *task = &v->vpci.task[i];
> +
> +        for ( unsigned int j = 0; j < ARRAY_SIZE(task->bars); j++ )
> +        {
> +            struct vpci_bar_map *bar = &task->bars[j];
> +            char str[32];
> +
> +            snprintf(str, sizeof(str), "PCI map vcpu %u task %u BAR %u", vcpu_id, i, j);
> +
> +            bar->mem = rangeset_new(v->domain, str, RANGESETF_no_print);

Not sure there's much point in naming those with that much detail -
those are scratch space for mapping calculations.  You already pass
RANGESETF_no_print, which means the contents of the rangeset won't be
dumped, and hence the name is kind of meaningless.  I shouldn't have
named those either when allocated in bar_add_rangeset().
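IIRC rangeset_new() copes with a NULL name, so e.g.:

    bar->mem = rangeset_new(v->domain, NULL, RANGESETF_no_print);

would be enough.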

> +
> +            if ( !bar->mem )
> +                goto fail_sched;
> +        }
> +    }
> +#endif
> +
>      /* Must be called after making new vcpu visible to for_each_vcpu(). */
>      vcpu_check_shutdown(v);
>  
> diff --git a/xen/include/xen/vpci.h b/xen/include/xen/vpci.h
> index 17cfecb0aabf..afe78b00ffc9 100644
> --- a/xen/include/xen/vpci.h
> +++ b/xen/include/xen/vpci.h
> @@ -116,7 +116,6 @@ struct vpci {
>              uint64_t guest_addr;
>              uint64_t size;
>              uint64_t resizable_sizes;
> -            struct rangeset *mem;
>              enum {
>                  VPCI_BAR_EMPTY,
>                  VPCI_BAR_IO,
> @@ -207,14 +206,23 @@ struct vpci {
>  #endif
>  };
>  
> +#ifdef __XEN__
>  struct vpci_vcpu {
>      /* Per-vcpu structure to store state while {un}mapping of PCI BARs. */
>      const struct pci_dev *pdev;
> -    uint16_t cmd;
> -    bool rom_only : 1;
> +    struct domain *domain;
> +    unsigned int nr_pending_ops;

Not sure you really need a pending ops counter?  Hard to tell without
seeing the code that makes use of it.

> +    struct vpci_map_task {
> +        struct vpci_bar_map {
> +            uint64_t addr;
> +            uint64_t guest_addr;
> +            struct rangeset *mem;
> +        } bars[PCI_HEADER_NORMAL_NR_BARS + 1];
> +        uint16_t cmd;
> +        bool rom_only : 1;
> +    } task[2];

Don't you need a way to differentiate between map/unmap operations?
Do you plan to use slot 0 as unmap and slot 1 as map?  Or would you
rather introduce a boolean field to signal it in struct vpci_map_task?
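If going the boolean route, something like (field name purely
illustrative):

    uint16_t cmd;
    bool rom_only : 1;
    bool unmap    : 1;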

Overall seems OK, but obviously it all needs to fit together with the
current code :).

Thanks, Roger.

Re: [PATCH v2 1/3] vpci: allow queueing of mapping operations
Posted by Jan Beulich 2 months, 3 weeks ago
On 04.08.2025 15:55, Roger Pau Monné wrote:
> On Fri, Aug 01, 2025 at 05:06:32PM -0400, Stewart Hildebrand wrote:
>> On 7/25/25 03:58, Roger Pau Monné wrote:
>>> On Thu, Jul 24, 2025 at 06:44:32PM +0200, Roger Pau Monné wrote:
>>>> On Wed, Jul 23, 2025 at 12:37:41PM -0400, Stewart Hildebrand wrote:
>>>>> @@ -283,7 +297,48 @@ static int __init apply_map(struct domain *d, const struct pci_dev *pdev,
>>>>>      return rc;
>>>>>  }
>>>>>  
>>>>> -static void defer_map(const struct pci_dev *pdev, uint16_t cmd, bool rom_only)
>>>>> +static struct vpci_map_task *alloc_map_task(const struct pci_dev *pdev,
>>>>> +                                            uint16_t cmd, bool rom_only)
>>>>> +{
>>>>> +    struct vpci_map_task *task = xzalloc(struct vpci_map_task);
>>>>
>>>> xvzalloc() preferably.
>>>>
>>>> This however introduces run-time allocations as a result of guest
>>>> actions, which is not ideal IMO.  It would be preferable to do those
>>>> allocations as part of the header initialization, and re-use them.
>>>
>>> I've been thinking over this, as I've realized that while commenting
>>> on it, I didn't provide any alternatives.
>>>
>>> The usage of rangesets to figure out the regions to map is already not
>>> optimal, as adding/removing from a rangeset can lead to memory
>>> allocations.  It would be good if we could create rangesets with a
>>> pre-allocated number of ranges (iow: a pool of struct ranges), but
>>> that's for another patchset.  I think Jan already commented on this
>>> aspect long time ago.
>>
>> +1
>>
>>> I'm considering whether to allocate the deferred mapping structures
>>> per-vCPU instead of per-device.  That would for example mean moving
>>> the current vpci_bar->mem rangeset so it's allocated in vpci_vcpu
>>> struct instead.  The point would be to not have the rangesets per
>>> device (because there can be a lot of devices, specially for the
>>> hardware domain), but instead have those per-vCPU.  This should work
>>> because a vCPU can only queue a single vPCI operation, from a single
>>> device.
>>>
>>> It should then be possible to allocate the deferred mapping structures
>>> at vCPU creation.  I also ponder if we really need a linked list to
>>> queue them; AFAIK there can only ever be an unmapping and a mapping
>>> operation pending (so 2 operations at most).  Hence we could use a
>>> more "fixed" structure like an array.  For example in struct vpci_vcpu
>>> you could introduce a struct vpci_map_task task[2] field?
>>>
>>> Sorry, I know this is not a minor change to request.  It shouldn't
>>> change the overall logic much, but it would inevitably affect the
>>> code.  Let me know what you think.
>>
>> Thanks for the feedback and suggestion. Yeah, I'll give this a try.
>> Here's roughly what I'm thinking so far. I'll keep playing with it.
>>
>> diff --git a/xen/common/domain.c b/xen/common/domain.c
>> index 5241a1629eeb..942c9fe7d364 100644
>> --- a/xen/common/domain.c
>> +++ b/xen/common/domain.c
>> @@ -387,6 +387,16 @@ static int vmtrace_alloc_buffer(struct vcpu *v)
>>   */
>>  static int vcpu_teardown(struct vcpu *v)
>>  {
>> +#ifdef CONFIG_HAS_VPCI
>> +    for ( unsigned int i = 0; i < ARRAY_SIZE(v->vpci.task); i++ )
>> +    {
>> +        struct vpci_map_task *task = &v->vpci.task[i];
>> +
>> +        for ( unsigned int j = 0; j < ARRAY_SIZE(task->bars); j++ )
>> +            rangeset_destroy(task->bars[j].mem);
> 
> You might want to additionally do:
> 
> task->bars[j].mem = NULL;

Should we perhaps introduce RANGESET_DESTROY() along the lines of XFREE() et al?
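I.e. something along the lines of (sketch):

#define RANGESET_DESTROY(r) do { \
    rangeset_destroy(r);         \
    (r) = NULL;                  \
} while ( false )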

Jan

Re: [PATCH v2 1/3] vpci: allow queueing of mapping operations
Posted by Roger Pau Monné 2 months, 3 weeks ago
On Mon, Aug 04, 2025 at 03:57:24PM +0200, Jan Beulich wrote:
> On 04.08.2025 15:55, Roger Pau Monné wrote:
> > On Fri, Aug 01, 2025 at 05:06:32PM -0400, Stewart Hildebrand wrote:
> >> On 7/25/25 03:58, Roger Pau Monné wrote:
> >>> On Thu, Jul 24, 2025 at 06:44:32PM +0200, Roger Pau Monné wrote:
> >>>> On Wed, Jul 23, 2025 at 12:37:41PM -0400, Stewart Hildebrand wrote:
> >>>>> @@ -283,7 +297,48 @@ static int __init apply_map(struct domain *d, const struct pci_dev *pdev,
> >>>>>      return rc;
> >>>>>  }
> >>>>>  
> >>>>> -static void defer_map(const struct pci_dev *pdev, uint16_t cmd, bool rom_only)
> >>>>> +static struct vpci_map_task *alloc_map_task(const struct pci_dev *pdev,
> >>>>> +                                            uint16_t cmd, bool rom_only)
> >>>>> +{
> >>>>> +    struct vpci_map_task *task = xzalloc(struct vpci_map_task);
> >>>>
> >>>> xvzalloc() preferably.
> >>>>
> >>>> This however introduces run-time allocations as a result of guest
> >>>> actions, which is not ideal IMO.  It would be preferable to do those
> >>>> allocations as part of the header initialization, and re-use them.
> >>>
> >>> I've been thinking over this, as I've realized that while commenting
> >>> on it, I didn't provide any alternatives.
> >>>
> >>> The usage of rangesets to figure out the regions to map is already not
> >>> optimal, as adding/removing from a rangeset can lead to memory
> >>> allocations.  It would be good if we could create rangesets with a
> >>> pre-allocated number of ranges (iow: a pool of struct ranges), but
> >>> that's for another patchset.  I think Jan already commented on this
> >>> aspect long time ago.
> >>
> >> +1
> >>
> >>> I'm considering whether to allocate the deferred mapping structures
> >>> per-vCPU instead of per-device.  That would for example mean moving
> >>> the current vpci_bar->mem rangeset so it's allocated in vpci_vcpu
> >>> struct instead.  The point would be to not have the rangesets per
> >>> device (because there can be a lot of devices, specially for the
> >>> hardware domain), but instead have those per-vCPU.  This should work
> >>> because a vCPU can only queue a single vPCI operation, from a single
> >>> device.
> >>>
> >>> It should then be possible to allocate the deferred mapping structures
> >>> at vCPU creation.  I also ponder if we really need a linked list to
> >>> queue them; AFAIK there can only ever be an unmapping and a mapping
> >>> operation pending (so 2 operations at most).  Hence we could use a
> >>> more "fixed" structure like an array.  For example in struct vpci_vcpu
> >>> you could introduce a struct vpci_map_task task[2] field?
> >>>
> >>> Sorry, I know this is not a minor change to request.  It shouldn't
> >>> change the overall logic much, but it would inevitably affect the
> >>> code.  Let me know what you think.
> >>
> >> Thanks for the feedback and suggestion. Yeah, I'll give this a try.
> >> Here's roughly what I'm thinking so far. I'll keep playing with it.
> >>
> >> diff --git a/xen/common/domain.c b/xen/common/domain.c
> >> index 5241a1629eeb..942c9fe7d364 100644
> >> --- a/xen/common/domain.c
> >> +++ b/xen/common/domain.c
> >> @@ -387,6 +387,16 @@ static int vmtrace_alloc_buffer(struct vcpu *v)
> >>   */
> >>  static int vcpu_teardown(struct vcpu *v)
> >>  {
> >> +#ifdef CONFIG_HAS_VPCI
> >> +    for ( unsigned int i = 0; i < ARRAY_SIZE(v->vpci.task); i++ )
> >> +    {
> >> +        struct vpci_map_task *task = &v->vpci.task[i];
> >> +
> >> +        for ( unsigned int j = 0; j < ARRAY_SIZE(task->bars); j++ )
> >> +            rangeset_destroy(task->bars[j].mem);
> > 
> > You might want to additionally do:
> > 
> > task->bars[j].mem = NULL;
> 
> Should we perhaps introduce RANGESET_DESTROY() along the lines of XFREE() et al?

Yes, I was wondering whether to recommend it here, but didn't want to
add noise, so I was planning on adding this to my queue.  But yes, if
you can, please do it, Stewart.
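
Something along these lines, perhaps (untested sketch, mirroring how
XFREE() clears the pointer after freeing):

    /* Destroy a rangeset and clear the (lvalue) pointer, a la XFREE(). */
    #define RANGESET_DESTROY(p) do { \
        rangeset_destroy(p);         \
        (p) = NULL;                  \
    } while ( false )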

Thanks, Roger.

Re: [PATCH v2 1/3] vpci: allow queueing of mapping operations
Posted by dmkhn@proton.me 3 months, 1 week ago
On Wed, Jul 23, 2025 at 12:37:41PM -0400, Stewart Hildebrand wrote:
> Introduce vPCI BAR mapping task queue. Decouple map operation state from
> general vPCI state: in particular, move the per-BAR rangeset out of
> struct vpci and into the map task struct.
> 
> This is preparatory work for further changes that need to perform
> multiple unmap/map operations before returning to guest.
> 
> Signed-off-by: Stewart Hildebrand <stewart.hildebrand@amd.com>
> ---
> v1->v2:
> * new patch
> 
> Related: 622bdd962822 ("vpci/header: handle p2m range sets per BAR")
> ---
>  xen/common/domain.c       |   4 +
>  xen/drivers/vpci/header.c | 197 +++++++++++++++++++++++---------------
>  xen/drivers/vpci/vpci.c   |   3 -
>  xen/include/xen/vpci.h    |  16 +++-
>  4 files changed, 139 insertions(+), 81 deletions(-)
> 
> diff --git a/xen/common/domain.c b/xen/common/domain.c
> index 303c338ef293..214795e2d2fe 100644
> --- a/xen/common/domain.c
> +++ b/xen/common/domain.c
> @@ -459,6 +459,10 @@ struct vcpu *vcpu_create(struct domain *d, unsigned int vcpu_id)
>          d->vcpu[prev_id]->next_in_list = v;
>      }
> 
> +#ifdef CONFIG_HAS_VPCI
> +    INIT_LIST_HEAD(&v->vpci.task_queue);
> +#endif
> +
>      /* Must be called after making new vcpu visible to for_each_vcpu(). */
>      vcpu_check_shutdown(v);
> 
> diff --git a/xen/drivers/vpci/header.c b/xen/drivers/vpci/header.c
> index bb76e707992c..df065a5f5faf 100644
> --- a/xen/drivers/vpci/header.c
> +++ b/xen/drivers/vpci/header.c
> @@ -34,7 +34,7 @@
> 
>  struct map_data {
>      struct domain *d;
> -    const struct vpci_bar *bar;
> +    const struct vpci_bar_map *bar;

Perhaps s/vpci_bar_map/vpci_bar_mmio/g to highlight the BAR type?

>      bool map;
>  };
> 
> @@ -173,31 +173,23 @@ static void modify_decoding(const struct pci_dev *pdev, uint16_t cmd,
>          ASSERT_UNREACHABLE();
>  }
> 
> -bool vpci_process_pending(struct vcpu *v)
> +static bool vpci_process_map_task(struct vpci_map_task *task)
>  {
> -    const struct pci_dev *pdev = v->vpci.pdev;
> -    struct vpci_header *header = NULL;
> +    const struct pci_dev *pdev = task->pdev;
>      unsigned int i;
> 
>      if ( !pdev )
>          return false;
> 
> -    read_lock(&v->domain->pci_lock);
> -
> -    if ( !pdev->vpci || (v->domain != pdev->domain) )
> -    {
> -        v->vpci.pdev = NULL;
> -        read_unlock(&v->domain->pci_lock);
> +    if ( !pdev->vpci || (task->domain != pdev->domain) )
>          return false;
> -    }
> 
> -    header = &pdev->vpci->header;
> -    for ( i = 0; i < ARRAY_SIZE(header->bars); i++ )
> +    for ( i = 0; i < ARRAY_SIZE(task->bars); i++ )
>      {
> -        struct vpci_bar *bar = &header->bars[i];
> +        struct vpci_bar_map *bar = &task->bars[i];
>          struct map_data data = {
> -            .d = v->domain,
> -            .map = v->vpci.cmd & PCI_COMMAND_MEMORY,
> +            .d = task->domain,
> +            .map = task->cmd & PCI_COMMAND_MEMORY,
>              .bar = bar,
>          };
>          int rc;
> @@ -208,57 +200,79 @@ bool vpci_process_pending(struct vcpu *v)
>          rc = rangeset_consume_ranges(bar->mem, map_range, &data);
> 
>          if ( rc == -ERESTART )
> -        {
> -            read_unlock(&v->domain->pci_lock);
>              return true;
> -        }
> 
>          if ( rc )
>          {
>              spin_lock(&pdev->vpci->lock);
>              /* Disable memory decoding unconditionally on failure. */
> -            modify_decoding(pdev, v->vpci.cmd & ~PCI_COMMAND_MEMORY,
> +            modify_decoding(pdev, task->cmd & ~PCI_COMMAND_MEMORY,
>                              false);
>              spin_unlock(&pdev->vpci->lock);
> 
> -            /* Clean all the rangesets */
> -            for ( i = 0; i < ARRAY_SIZE(header->bars); i++ )
> -                if ( !rangeset_is_empty(header->bars[i].mem) )
> -                     rangeset_purge(header->bars[i].mem);
> -
> -            v->vpci.pdev = NULL;
> -
> -            read_unlock(&v->domain->pci_lock);
> -
> -            if ( !is_hardware_domain(v->domain) )
> -                domain_crash(v->domain);
> +            if ( !is_hardware_domain(task->domain) )
> +                domain_crash(task->domain);
> 
>              return false;
>          }
>      }
> -    v->vpci.pdev = NULL;
> 
>      spin_lock(&pdev->vpci->lock);
> -    modify_decoding(pdev, v->vpci.cmd, v->vpci.rom_only);
> +    modify_decoding(pdev, task->cmd, task->rom_only);
>      spin_unlock(&pdev->vpci->lock);
> 
> -    read_unlock(&v->domain->pci_lock);
> +    return false;
> +}
> +
> +static void destroy_map_task(struct vpci_map_task *task)
> +{
> +    unsigned int i;
> 
> +    if ( !task )

Looks like task is never NULL in the code; suggest using ASSERT() instead.
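
E.g.:

    ASSERT(task);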

> +        return;
> +
> +    for ( i = 0; i < ARRAY_SIZE(task->bars); i++ )
> +        rangeset_destroy(task->bars[i].mem);
> +
> +    xfree(task);
> +}
> +
> +bool vpci_process_pending(struct vcpu *v)
> +{
> +    struct vpci_map_task *task;
> +    read_lock(&v->domain->pci_lock);
> +
> +    while ( (task = list_first_entry_or_null(&v->vpci.task_queue,
> +                                             struct vpci_map_task,
> +                                             next)) != NULL )
> +    {
> +        if ( vpci_process_map_task(task) )
> +        {
> +            read_unlock(&v->domain->pci_lock);

I would add a local variable and use it here for shorter code (initialized
to false, so that an empty queue still returns false):

    bool rc = false;

    read_lock(&v->domain->pci_lock);

    while ( ... )
    {
        rc = vpci_process_map_task(task);
        if ( rc )
            break;

        ...
    }

    read_unlock(&v->domain->pci_lock);

    return rc;

> +            return true;
> +        }
> +
> +        list_del(&task->next);
> +        destroy_map_task(task);
> +    }
> +
> +    read_unlock(&v->domain->pci_lock);
>      return false;
>  }
> 
> -static int __init apply_map(struct domain *d, const struct pci_dev *pdev,
> -                            uint16_t cmd)
> +static int __init apply_map(struct vpci_map_task *task)
>  {
> -    struct vpci_header *header = &pdev->vpci->header;
> +    struct domain *d = task->domain;
> +    const struct pci_dev *pdev = task->pdev;
> +    uint16_t cmd = task->cmd;
>      int rc = 0;
>      unsigned int i;
> 
>      ASSERT(rw_is_write_locked(&d->pci_lock));
> 
> -    for ( i = 0; i < ARRAY_SIZE(header->bars); i++ )
> +    for ( i = 0; i < ARRAY_SIZE(task->bars); i++ )
>      {
> -        struct vpci_bar *bar = &header->bars[i];
> +        struct vpci_bar_map *bar = &task->bars[i];
>          struct map_data data = { .d = d, .map = true, .bar = bar };
> 
>          if ( rangeset_is_empty(bar->mem) )
> @@ -283,7 +297,48 @@ static int __init apply_map(struct domain *d, const struct pci_dev *pdev,
>      return rc;
>  }
> 
> -static void defer_map(const struct pci_dev *pdev, uint16_t cmd, bool rom_only)
> +static struct vpci_map_task *alloc_map_task(const struct pci_dev *pdev,
> +                                            uint16_t cmd, bool rom_only)
> +{
> +    struct vpci_map_task *task = xzalloc(struct vpci_map_task);
> +    unsigned int i;
> +

I would move the allocation out of the initializer and use the preferred
xvzalloc() variant:

       task = xvzalloc(struct vpci_map_task);
> +    if ( !task )
> +        return NULL;
> +
> +    BUILD_BUG_ON(ARRAY_SIZE(task->bars) != ARRAY_SIZE(pdev->vpci->header.bars));
> +
> +    for ( i = 0; i < ARRAY_SIZE(task->bars); i++ )
> +    {
> +        if ( pdev->vpci->header.bars[i].type >= VPCI_BAR_MEM32 )

I would reduce one level of indentation with:

           if ( pdev->vpci->header.bars[i].type < VPCI_BAR_MEM32 )
               continue;


> +        {
> +            char str[32];
> +
> +            snprintf(str, sizeof(str), "%pp:BAR%u", &pdev->sbdf, i);
> +
> +            task->bars[i].mem = rangeset_new(pdev->domain, str,
> +                                             RANGESETF_no_print);
> +
> +            if ( !task->bars[i].mem )
> +            {
> +                destroy_map_task(task);

What if allocation fails in the middle of the loop, e.g. at i == 5?
I think the rangesets already allocated earlier in this loop should be freed.

> +                return NULL;
> +            }
> +
> +            task->bars[i].addr = pdev->vpci->header.bars[i].addr;
> +            task->bars[i].guest_addr = pdev->vpci->header.bars[i].guest_addr;
> +        }
> +    }
> +
> +    task->pdev = pdev;
> +    task->domain = pdev->domain;
> +    task->cmd = cmd;
> +    task->rom_only = rom_only;
> +
> +    return task;
> +}
> +
> +static void defer_map(struct vpci_map_task *task)
>  {
>      struct vcpu *curr = current;
> 
> @@ -293,9 +348,9 @@ static void defer_map(const struct pci_dev *pdev, uint16_t cmd, bool rom_only)
>       * is mapped. This can lead to parallel mapping operations being
>       * started for the same device if the domain is not well-behaved.
>       */
> -    curr->vpci.pdev = pdev;
> -    curr->vpci.cmd = cmd;
> -    curr->vpci.rom_only = rom_only;
> +
> +    list_add_tail(&task->next, &curr->vpci.task_queue);
> +
>      /*
>       * Raise a scheduler softirq in order to prevent the guest from resuming
>       * execution with pending mapping operations, to trigger the invocation
> @@ -310,11 +365,15 @@ static int modify_bars(const struct pci_dev *pdev, uint16_t cmd, bool rom_only)
>      struct pci_dev *tmp;
>      const struct domain *d;
>      const struct vpci_msix *msix = pdev->vpci->msix;
> +    struct vpci_map_task *task = alloc_map_task(pdev, cmd, rom_only);

Same comment re: having allocation on a separate line.

>      unsigned int i, j;
>      int rc;
> 
>      ASSERT(rw_is_write_locked(&pdev->domain->pci_lock));
> 
> +    if ( !task )
> +        return -ENOMEM;
> +
>      /*
>       * Create a rangeset per BAR that represents the current device memory
>       * region and compare it against all the currently active BAR memory
> @@ -330,12 +389,13 @@ static int modify_bars(const struct pci_dev *pdev, uint16_t cmd, bool rom_only)
>      for ( i = 0; i < ARRAY_SIZE(header->bars); i++ )
>      {
>          struct vpci_bar *bar = &header->bars[i];
> +        struct rangeset *mem = task->bars[i].mem;
>          unsigned long start = PFN_DOWN(bar->addr);
>          unsigned long end = PFN_DOWN(bar->addr + bar->size - 1);
>          unsigned long start_guest = PFN_DOWN(bar->guest_addr);
>          unsigned long end_guest = PFN_DOWN(bar->guest_addr + bar->size - 1);
> 
> -        if ( !bar->mem )
> +        if ( !mem )
>              continue;
> 
>          if ( !MAPPABLE_BAR(bar) ||
> @@ -353,7 +413,7 @@ static int modify_bars(const struct pci_dev *pdev, uint16_t cmd, bool rom_only)
>              continue;
>          }
> 
> -        ASSERT(rangeset_is_empty(bar->mem));
> +        ASSERT(rangeset_is_empty(mem));
> 
>          /*
>           * Make sure that the guest set address has the same page offset
> @@ -365,21 +425,23 @@ static int modify_bars(const struct pci_dev *pdev, uint16_t cmd, bool rom_only)
>              gprintk(XENLOG_G_WARNING,
>                      "%pp: can't map BAR%u - offset mismatch: %#lx vs %#lx\n",
>                      &pdev->sbdf, i, bar->guest_addr, bar->addr);
> +            destroy_map_task(task);

I think using goto would be justified here, to do the cleanup in one place.
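
E.g. (rough sketch of the pattern; each error path would then just set rc
and jump to the label):

            rc = -EINVAL;
            goto fail;

    /* ... and at the end of modify_bars(): */

 fail:
    destroy_map_task(task);
    return rc;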

>              return -EINVAL;
>          }
> 
> -        rc = rangeset_add_range(bar->mem, start_guest, end_guest);
> +        rc = rangeset_add_range(mem, start_guest, end_guest);
>          if ( rc )
>          {
>              printk(XENLOG_G_WARNING "Failed to add [%lx, %lx]: %d\n",
>                     start_guest, end_guest, rc);
> +            destroy_map_task(task);
>              return rc;
>          }
> 
>          /* Check for overlap with the already setup BAR ranges. */
>          for ( j = 0; j < i; j++ )
>          {
> -            struct vpci_bar *prev_bar = &header->bars[j];
> +            struct vpci_bar_map *prev_bar = &task->bars[j];
> 
>              if ( rangeset_is_empty(prev_bar->mem) )
>                  continue;
> @@ -390,16 +452,18 @@ static int modify_bars(const struct pci_dev *pdev, uint16_t cmd, bool rom_only)
>                  gprintk(XENLOG_WARNING,
>                         "%pp: failed to remove overlapping range [%lx, %lx]: %d\n",
>                          &pdev->sbdf, start_guest, end_guest, rc);
> +                destroy_map_task(task);
>                  return rc;
>              }
>          }
> 
> -        rc = pci_sanitize_bar_memory(bar->mem);
> +        rc = pci_sanitize_bar_memory(mem);
>          if ( rc )
>          {
>              gprintk(XENLOG_WARNING,
>                      "%pp: failed to sanitize BAR#%u memory: %d\n",
>                      &pdev->sbdf, i, rc);
> +            destroy_map_task(task);
>              return rc;
>          }
>      }
> @@ -413,7 +477,7 @@ static int modify_bars(const struct pci_dev *pdev, uint16_t cmd, bool rom_only)
> 
>          for ( j = 0; j < ARRAY_SIZE(header->bars); j++ )
>          {
> -            const struct vpci_bar *bar = &header->bars[j];
> +            const struct vpci_bar_map *bar = &task->bars[j];
> 
>              if ( rangeset_is_empty(bar->mem) )
>                  continue;
> @@ -424,6 +488,7 @@ static int modify_bars(const struct pci_dev *pdev, uint16_t cmd, bool rom_only)
>                  gprintk(XENLOG_WARNING,
>                         "%pp: failed to remove MSIX table [%lx, %lx]: %d\n",
>                          &pdev->sbdf, start, end, rc);
> +                destroy_map_task(task);
>                  return rc;
>              }
>          }
> @@ -468,8 +533,9 @@ static int modify_bars(const struct pci_dev *pdev, uint16_t cmd, bool rom_only)
>                  for ( j = 0; j < ARRAY_SIZE(header->bars); j++)
>                  {
>                      const struct vpci_bar *bar = &header->bars[j];
> +                    struct rangeset *mem = task->bars[j].mem;
> 
> -                    if ( !rangeset_overlaps_range(bar->mem, start, end) ||
> +                    if ( !rangeset_overlaps_range(mem, start, end) ||
>                           /*
>                            * If only the ROM enable bit is toggled check against
>                            * other BARs in the same device for overlaps, but not
> @@ -480,12 +546,13 @@ static int modify_bars(const struct pci_dev *pdev, uint16_t cmd, bool rom_only)
>                            bar->type == VPCI_BAR_ROM) )
>                          continue;
> 
> -                    rc = rangeset_remove_range(bar->mem, start, end);
> +                    rc = rangeset_remove_range(mem, start, end);
>                      if ( rc )
>                      {
>                          gprintk(XENLOG_WARNING,
>                                  "%pp: failed to remove [%lx, %lx]: %d\n",
>                                  &pdev->sbdf, start, end, rc);
> +                        destroy_map_task(task);
>                          return rc;
>                      }
>                  }
> @@ -509,10 +576,12 @@ static int modify_bars(const struct pci_dev *pdev, uint16_t cmd, bool rom_only)
>           * will always be to establish mappings and process all the BARs.
>           */
>          ASSERT((cmd & PCI_COMMAND_MEMORY) && !rom_only);
> -        return apply_map(pdev->domain, pdev, cmd);
> +        rc = apply_map(task);
> +        destroy_map_task(task);
> +        return rc;
>      }
> 
> -    defer_map(pdev, cmd, rom_only);
> +    defer_map(task);
> 
>      return 0;
>  }
> @@ -731,18 +800,6 @@ static void cf_check rom_write(
>      }
>  }
> 
> -static int bar_add_rangeset(const struct pci_dev *pdev, struct vpci_bar *bar,
> -                            unsigned int i)
> -{
> -    char str[32];
> -
> -    snprintf(str, sizeof(str), "%pp:BAR%u", &pdev->sbdf, i);
> -
> -    bar->mem = rangeset_new(pdev->domain, str, RANGESETF_no_print);
> -
> -    return !bar->mem ? -ENOMEM : 0;
> -}
> -
>  static int vpci_init_capability_list(struct pci_dev *pdev)
>  {
>      int rc;
> @@ -947,10 +1004,6 @@ static int cf_check init_header(struct pci_dev *pdev)
>          else
>              bars[i].type = VPCI_BAR_MEM32;
> 
> -        rc = bar_add_rangeset(pdev, &bars[i], i);
> -        if ( rc )
> -            goto fail;
> -
>          rc = pci_size_mem_bar(pdev->sbdf, reg, &addr, &size,
>                                (i == num_bars - 1) ? PCI_BAR_LAST : 0);
>          if ( rc < 0 )
> @@ -1003,12 +1056,6 @@ static int cf_check init_header(struct pci_dev *pdev)
>                                 4, rom);
>          if ( rc )
>              rom->type = VPCI_BAR_EMPTY;
> -        else
> -        {
> -            rc = bar_add_rangeset(pdev, rom, num_bars);
> -            if ( rc )
> -                goto fail;
> -        }
>      }
>      else if ( !is_hwdom )
>      {
> diff --git a/xen/drivers/vpci/vpci.c b/xen/drivers/vpci/vpci.c
> index 09988f04c27c..7177cfce355d 100644
> --- a/xen/drivers/vpci/vpci.c
> +++ b/xen/drivers/vpci/vpci.c
> @@ -117,9 +117,6 @@ void vpci_deassign_device(struct pci_dev *pdev)
>                  iounmap(pdev->vpci->msix->table[i]);
>      }
> 
> -    for ( i = 0; i < ARRAY_SIZE(pdev->vpci->header.bars); i++ )
> -        rangeset_destroy(pdev->vpci->header.bars[i].mem);
> -
>      xfree(pdev->vpci->msix);
>      xfree(pdev->vpci->msi);
>      xfree(pdev->vpci);
> diff --git a/xen/include/xen/vpci.h b/xen/include/xen/vpci.h
> index 6a481a4e89d3..c2e75076691f 100644
> --- a/xen/include/xen/vpci.h
> +++ b/xen/include/xen/vpci.h
> @@ -103,7 +103,6 @@ struct vpci {
>              uint64_t guest_addr;
>              uint64_t size;
>              uint64_t resizable_sizes;
> -            struct rangeset *mem;
>              enum {
>                  VPCI_BAR_EMPTY,
>                  VPCI_BAR_IO,
> @@ -194,14 +193,25 @@ struct vpci {
>  #endif
>  };
> 
> -struct vpci_vcpu {
> +#ifdef __XEN__
> +struct vpci_map_task {
>      /* Per-vcpu structure to store state while {un}mapping of PCI BARs. */
> +    struct list_head next;
>      const struct pci_dev *pdev;
> +    struct domain *domain;
> +    struct vpci_bar_map {
> +        uint64_t addr;
> +        uint64_t guest_addr;

Perhaps use hpa/gpa notation for naming address fields?
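
I.e. something like (sketch):

    struct vpci_bar_map {
        uint64_t hpa;
        uint64_t gpa;
        struct rangeset *mem;
    };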

> +        struct rangeset *mem;
> +    } bars[PCI_HEADER_NORMAL_NR_BARS + 1];

I would add a brief comment explaining the `+ 1`.
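
E.g. (assuming the extra slot is for the ROM BAR, matching init_header()
using index num_bars for the ROM rangeset):

    } bars[PCI_HEADER_NORMAL_NR_BARS + 1]; /* +1 for the ROM BAR */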

>      uint16_t cmd;
>      bool rom_only : 1;
>  };
> 
> -#ifdef __XEN__
> +struct vpci_vcpu {
> +    struct list_head task_queue;
> +};
> +
>  void vpci_dump_msi(void);
> 
>  /* Make sure there's a hole in the p2m for the MSIX mmio areas. */
> --
> 2.50.1
> 
> 
Re: [PATCH v2 1/3] vpci: allow queueing of mapping operations
Posted by dmkhn@proton.me 3 months, 1 week ago
On Thu, Jul 24, 2025 at 03:00:46AM +0000, dmkhn@proton.me wrote:
> On Wed, Jul 23, 2025 at 12:37:41PM -0400, Stewart Hildebrand wrote:
[...]
> > +            if ( !task->bars[i].mem )
> > +            {
> > +                destroy_map_task(task);
> 
> What if allocation fails in the middle of the loop, e.g. at i == 5?
> I think the rangesets already allocated earlier in this loop should be freed.

Oh, I see that is done in destroy_map_task(); please disregard.

Re: [PATCH v2 1/3] vpci: allow queueing of mapping operations
Posted by Jan Beulich 3 months, 1 week ago
On 24.07.2025 05:42, dmkhn@proton.me wrote:
> On Thu, Jul 24, 2025 at 03:00:46AM +0000, dmkhn@proton.me wrote:
>> On Wed, Jul 23, 2025 at 12:37:41PM -0400, Stewart Hildebrand wrote:
>>> Introduce vPCI BAR mapping task queue. Decouple map operation state from
>>> general vPCI state: in particular, move the per-BAR rangeset out of
>>> struct vpci and into the map task struct.
>>>
>>> This is preparatory work for further changes that need to perform
>>> multiple unmap/map operations before returning to guest.
>>>
>>> Signed-off-by: Stewart Hildebrand <stewart.hildebrand@amd.com>
>>> ---
>>> v1->v2:
>>> * new patch
>>>
>>> Related: 622bdd962822 ("vpci/header: handle p2m range sets per BAR")
>>> ---
>>>  xen/common/domain.c       |   4 +
>>>  xen/drivers/vpci/header.c | 197 +++++++++++++++++++++++---------------
>>>  xen/drivers/vpci/vpci.c   |   3 -
>>>  xen/include/xen/vpci.h    |  16 +++-
>>>  4 files changed, 139 insertions(+), 81 deletions(-)

Why did I (and many others) end up on the To: list of this reply? (Breaks my
rules for sorting incoming mail into appropriate folders, for context.) Also,
please trim reply context suitably. Without you doing so, every single reader
will need to scroll through the entirety of a long mail just to find (in this
case) a single line of reply (somewhere in the middle). Of course you
shouldn't be too aggressive with trimming, to retain proper context for your
reply.

Jan
Re: [PATCH v2 1/3] vpci: allow queueing of mapping operations
Posted by dmkhn@proton.me 3 months ago
On Thu, Jul 24, 2025 at 09:47:24AM +0200, Jan Beulich wrote:
> On 24.07.2025 05:42, dmkhn@proton.me wrote:
> > On Thu, Jul 24, 2025 at 03:00:46AM +0000, dmkhn@proton.me wrote:
> >> On Wed, Jul 23, 2025 at 12:37:41PM -0400, Stewart Hildebrand wrote:
> >>> [...]
> 
> Why did I (and many others) end up on the To: list of this reply? (Breaks my

Apologies for that; I might have missed something when sending the email out.

> rules for sorting incoming mail into appropriate folders, for context.) Also,
> please trim reply context suitably. Without you doing so, every single reader
> will need to scroll through the entirety of a long mail just to find (in this
> case) a single line of reply (somewhere in the middle). Of course you
> shouldn't be too aggressive with trimming, to retain proper context for your
> reply.

Ack.

> 
> Jan