[PATCH v3 2/4] vpci: allow queueing of mapping operations

Stewart Hildebrand posted 4 patches 1 week, 3 days ago
[PATCH v3 2/4] vpci: allow queueing of mapping operations
Posted by Stewart Hildebrand 1 week, 3 days ago
Introduce a vPCI BAR mapping task queue. Store the information necessary
to perform multiple p2m operations associated with a single device in an
array in struct vpci_vcpu.

This is preparatory work for further changes that need to perform
multiple unmap/map operations before returning to guest.

At the moment, only a single slot is needed in the array. However, once
multiple operations can be queued and pending, the check in
modify_bars() that skips BARs already in the requested state is no
longer accurate. Remove this check.

Signed-off-by: Stewart Hildebrand <stewart.hildebrand@amd.com>
---
apply_map() and vpci_process_map_task() are very similar. Should we try
to combine them into a single function?

v2->v3:
* base on ("vpci: Use pervcpu ranges for BAR mapping") from [1]
* rework with fixed array of map/unmap slots

[1] https://lore.kernel.org/xen-devel/cover.1772806036.git.mykyta_poturai@epam.com/T/#t

v1->v2:
* new patch
---
 xen/drivers/vpci/header.c | 154 +++++++++++++++++++++++---------------
 xen/drivers/vpci/vpci.c   |  27 ++++---
 xen/include/xen/vpci.h    |  15 +++-
 3 files changed, 122 insertions(+), 74 deletions(-)

diff --git a/xen/drivers/vpci/header.c b/xen/drivers/vpci/header.c
index 89dce932f3b1..e57c00839841 100644
--- a/xen/drivers/vpci/header.c
+++ b/xen/drivers/vpci/header.c
@@ -35,7 +35,7 @@
 
 struct map_data {
     struct domain *d;
-    const struct vpci_bar *bar;
+    const struct vpci_bar_map *bar;
     bool map;
 };
 
@@ -174,32 +174,23 @@ static void modify_decoding(const struct pci_dev *pdev, uint16_t cmd,
         ASSERT_UNREACHABLE();
 }
 
-bool vpci_process_pending(struct vcpu *v)
+static int vpci_process_map_task(const struct pci_dev *pdev,
+                                 struct vpci_map_task *task)
 {
-    const struct pci_dev *pdev = v->vpci.pdev;
-    struct vpci_header *header = NULL;
     unsigned int i;
 
-    if ( !pdev )
-        return false;
+    ASSERT(rw_is_locked(&pdev->domain->pci_lock));
 
-    read_lock(&v->domain->pci_lock);
-
-    if ( !pdev->vpci || (v->domain != pdev->domain) )
-    {
-        v->vpci.pdev = NULL;
-        read_unlock(&v->domain->pci_lock);
-        return false;
-    }
+    if ( !task->pending )
+        return 0;
 
-    header = &pdev->vpci->header;
-    for ( i = 0; i < ARRAY_SIZE(header->bars); i++ )
+    for ( i = 0; i < ARRAY_SIZE(task->bars); i++ )
     {
-        struct vpci_bar *bar = &header->bars[i];
-        struct rangeset *mem = v->vpci.mem[i];
+        struct vpci_bar_map *bar = &task->bars[i];
+        struct rangeset *mem = bar->mem;
         struct map_data data = {
-            .d = v->domain,
-            .map = v->vpci.cmd & PCI_COMMAND_MEMORY,
+            .d = pdev->domain,
+            .map = task->cmd & PCI_COMMAND_MEMORY,
             .bar = bar,
         };
         int rc;
@@ -210,58 +201,91 @@ bool vpci_process_pending(struct vcpu *v)
         rc = rangeset_consume_ranges(mem, map_range, &data);
 
         if ( rc == -ERESTART )
-        {
-            read_unlock(&v->domain->pci_lock);
-            return true;
-        }
+            return rc;
 
         if ( rc )
         {
             spin_lock(&pdev->vpci->lock);
             /* Disable memory decoding unconditionally on failure. */
-            modify_decoding(pdev, v->vpci.cmd & ~PCI_COMMAND_MEMORY,
-                            false);
+            modify_decoding(pdev, task->cmd & ~PCI_COMMAND_MEMORY, false);
             spin_unlock(&pdev->vpci->lock);
 
             /* Clean all the rangesets */
-            for ( i = 0; i < ARRAY_SIZE(header->bars); i++ )
-                if ( !rangeset_is_empty(v->vpci.mem[i]) )
-                     rangeset_purge(v->vpci.mem[i]);
-
-            v->vpci.pdev = NULL;
+            for ( i = 0; i < ARRAY_SIZE(task->bars); i++ )
+                if ( !rangeset_is_empty(task->bars[i].mem) )
+                     rangeset_purge(task->bars[i].mem);
 
-            read_unlock(&v->domain->pci_lock);
+            if ( !is_hardware_domain(pdev->domain) )
+                domain_crash(pdev->domain);
 
-            if ( !is_hardware_domain(v->domain) )
-                domain_crash(v->domain);
+            task->pending = false;
 
-            return false;
+            return rc;
         }
     }
-    v->vpci.pdev = NULL;
 
     spin_lock(&pdev->vpci->lock);
-    modify_decoding(pdev, v->vpci.cmd, v->vpci.rom_only);
+    modify_decoding(pdev, task->cmd, task->rom_only);
     spin_unlock(&pdev->vpci->lock);
 
+    task->pending = false;
+
+    return 0;
+}
+
+bool vpci_process_pending(struct vcpu *v)
+{
+    const struct pci_dev *pdev = v->vpci.pdev;
+    unsigned int i;
+
+    if ( !pdev )
+        return false;
+
+    read_lock(&v->domain->pci_lock);
+
+    if ( !pdev->vpci || (v->domain != pdev->domain) )
+    {
+        for ( i = 0; i < ARRAY_SIZE(v->vpci.task); i++ )
+            v->vpci.task[i].pending = false;
+
+        v->vpci.pdev = NULL;
+        read_unlock(&v->domain->pci_lock);
+        return false;
+    }
+
+    for ( i = 0; i < ARRAY_SIZE(v->vpci.task); i++ )
+    {
+        struct vpci_map_task *task = &(v->vpci.task[i]);
+        int rc = vpci_process_map_task(pdev, task);
+
+        if ( rc == -ERESTART )
+        {
+            read_unlock(&v->domain->pci_lock);
+            return true;
+        }
+
+        if ( rc )
+            break;
+    }
+    v->vpci.pdev = NULL;
+
     read_unlock(&v->domain->pci_lock);
 
     return false;
 }
 
 static int __init apply_map(struct domain *d, const struct pci_dev *pdev,
-                            uint16_t cmd)
+                            struct vpci_map_task *task)
 {
-    struct vpci_header *header = &pdev->vpci->header;
     int rc = 0;
     unsigned int i;
 
     ASSERT(rw_is_write_locked(&d->pci_lock));
 
-    for ( i = 0; i < ARRAY_SIZE(header->bars); i++ )
+    for ( i = 0; i < ARRAY_SIZE(task->bars); i++ )
     {
-        struct vpci_bar *bar = &header->bars[i];
-        struct rangeset *mem = current->vpci.mem[i];
+        struct vpci_bar_map *bar = &task->bars[i];
+        struct rangeset *mem = bar->mem;
         struct map_data data = { .d = d, .map = true, .bar = bar };
 
         if ( rangeset_is_empty(mem) )
@@ -281,12 +305,14 @@ static int __init apply_map(struct domain *d, const struct pci_dev *pdev,
         }
     }
     if ( !rc )
-        modify_decoding(pdev, cmd, false);
+        modify_decoding(pdev, task->cmd, false);
+
+    task->pending = false;
 
     return rc;
 }
 
-static void defer_map(const struct pci_dev *pdev, uint16_t cmd, bool rom_only)
+static void defer_map(const struct pci_dev *pdev)
 {
     struct vcpu *curr = current;
 
@@ -297,8 +323,7 @@ static void defer_map(const struct pci_dev *pdev, uint16_t cmd, bool rom_only)
      * started for the same device if the domain is not well-behaved.
      */
     curr->vpci.pdev = pdev;
-    curr->vpci.cmd = cmd;
-    curr->vpci.rom_only = rom_only;
+
     /*
      * Raise a scheduler softirq in order to prevent the guest from resuming
      * execution with pending mapping operations, to trigger the invocation
@@ -307,16 +332,20 @@ static void defer_map(const struct pci_dev *pdev, uint16_t cmd, bool rom_only)
     raise_softirq(SCHEDULE_SOFTIRQ);
 }
 
-static int modify_bars(const struct pci_dev *pdev, uint16_t cmd, bool rom_only)
+static int modify_bars(const struct pci_dev *pdev, uint16_t cmd, bool rom_only,
+                       unsigned int map_slot)
 {
     struct vpci_header *header = &pdev->vpci->header;
     struct pci_dev *tmp;
     const struct domain *d;
     const struct vpci_msix *msix = pdev->vpci->msix;
+    struct vpci_map_task *task = &current->vpci.task[map_slot];
     unsigned int i, j;
     int rc;
 
     ASSERT(rw_is_write_locked(&pdev->domain->pci_lock));
+    ASSERT(map_slot < ARRAY_SIZE(current->vpci.task));
+    ASSERT(!task->pending);
 
     /*
      * Create a rangeset per BAR that represents the current device memory
@@ -333,7 +362,7 @@ static int modify_bars(const struct pci_dev *pdev, uint16_t cmd, bool rom_only)
     for ( i = 0; i < ARRAY_SIZE(header->bars); i++ )
     {
         struct vpci_bar *bar = &header->bars[i];
-        struct rangeset *mem = current->vpci.mem[i];
+        struct rangeset *mem = task->bars[i].mem;
         unsigned long start = PFN_DOWN(bar->addr);
         unsigned long end = PFN_DOWN(bar->addr + bar->size - 1);
         unsigned long start_guest = PFN_DOWN(bar->guest_addr);
@@ -343,9 +372,7 @@ static int modify_bars(const struct pci_dev *pdev, uint16_t cmd, bool rom_only)
 
         if ( !MAPPABLE_BAR(bar) ||
              (rom_only ? bar->type != VPCI_BAR_ROM
-                       : (bar->type == VPCI_BAR_ROM && !header->rom_enabled)) ||
-             /* Skip BARs already in the requested state. */
-             bar->enabled == !!(cmd & PCI_COMMAND_MEMORY) )
+                       : (bar->type == VPCI_BAR_ROM && !header->rom_enabled)) )
             continue;
 
         if ( !pci_check_bar(pdev, _mfn(start), _mfn(end)) )
@@ -382,7 +409,7 @@ static int modify_bars(const struct pci_dev *pdev, uint16_t cmd, bool rom_only)
         /* Check for overlap with the already setup BAR ranges. */
         for ( j = 0; j < i; j++ )
         {
-            struct rangeset *prev_mem = current->vpci.mem[j];
+            struct rangeset *prev_mem = task->bars[j].mem;
 
             if ( rangeset_is_empty(prev_mem) )
                 continue;
@@ -405,6 +432,9 @@ static int modify_bars(const struct pci_dev *pdev, uint16_t cmd, bool rom_only)
                     &pdev->sbdf, i, rc);
             return rc;
         }
+
+        task->bars[i].addr = bar->addr;
+        task->bars[i].guest_addr = bar->guest_addr;
     }
 
     /* Remove any MSIX regions if present. */
@@ -414,9 +444,9 @@ static int modify_bars(const struct pci_dev *pdev, uint16_t cmd, bool rom_only)
         unsigned long end = PFN_DOWN(vmsix_table_addr(pdev->vpci, i) +
                                      vmsix_table_size(pdev->vpci, i) - 1);
 
-        for ( j = 0; j < ARRAY_SIZE(current->vpci.mem); j++ )
+        for ( j = 0; j < ARRAY_SIZE(task->bars); j++ )
         {
-            struct rangeset *mem = current->vpci.mem[j];
+            struct rangeset *mem = task->bars[j].mem;
 
             if ( rangeset_is_empty(mem) )
                 continue;
@@ -471,7 +501,7 @@ static int modify_bars(const struct pci_dev *pdev, uint16_t cmd, bool rom_only)
                 for ( j = 0; j < ARRAY_SIZE(header->bars); j++)
                 {
                     const struct vpci_bar *bar = &header->bars[j];
-                    struct rangeset *mem = current->vpci.mem[j];
+                    struct rangeset *mem = task->bars[j].mem;
 
                     if ( !rangeset_overlaps_range(mem, start, end) ||
                          /*
@@ -502,6 +532,10 @@ static int modify_bars(const struct pci_dev *pdev, uint16_t cmd, bool rom_only)
         d = dom_xen;
     }
 
+    task->cmd = cmd;
+    task->rom_only = rom_only;
+    task->pending = true;
+
     if ( system_state < SYS_STATE_active )
     {
         /*
@@ -513,10 +547,10 @@ static int modify_bars(const struct pci_dev *pdev, uint16_t cmd, bool rom_only)
          * will always be to establish mappings and process all the BARs.
          */
         ASSERT((cmd & PCI_COMMAND_MEMORY) && !rom_only);
-        return apply_map(pdev->domain, pdev, cmd);
+        return apply_map(pdev->domain, pdev, task);
     }
 
-    defer_map(pdev, cmd, rom_only);
+    defer_map(pdev);
 
     return 0;
 }
@@ -549,7 +583,7 @@ static void cf_check cmd_write(
          * memory decoding bit has not been changed, so leave everything as-is,
          * hoping the guest will realize and try again.
          */
-        modify_bars(pdev, cmd, false);
+        modify_bars(pdev, cmd, false, 0);
     else
         pci_conf_write16(pdev->sbdf, reg, cmd);
 }
@@ -717,7 +751,7 @@ static void cf_check rom_write(
      * Pass PCI_COMMAND_MEMORY or 0 to signal a map/unmap request, note that
      * this fabricated command is never going to be written to the register.
      */
-    else if ( modify_bars(pdev, new_enabled ? PCI_COMMAND_MEMORY : 0, true) )
+    else if ( modify_bars(pdev, new_enabled ? PCI_COMMAND_MEMORY : 0, true, 0) )
         /*
          * No memory has been added or removed from the p2m (because the actual
          * p2m changes are deferred in defer_map) and the ROM enable bit has
@@ -907,7 +941,7 @@ int vpci_init_header(struct pci_dev *pdev)
             goto fail;
     }
 
-    return (cmd & PCI_COMMAND_MEMORY) ? modify_bars(pdev, cmd, false) : 0;
+    return (cmd & PCI_COMMAND_MEMORY) ? modify_bars(pdev, cmd, false, 0) : 0;
 
  fail:
     pci_conf_write16(pdev->sbdf, PCI_COMMAND, cmd);
diff --git a/xen/drivers/vpci/vpci.c b/xen/drivers/vpci/vpci.c
index 8e6343653078..54a4ed46387c 100644
--- a/xen/drivers/vpci/vpci.c
+++ b/xen/drivers/vpci/vpci.c
@@ -26,30 +26,37 @@
 
 void vpci_vcpu_destroy(struct vcpu *v)
 {
-    unsigned int i;
+    unsigned int i, j;
 
     if ( !has_vpci(v->domain) && !is_idle_domain(v->domain) )
         return;
 
-    for ( i = 0; i < ARRAY_SIZE(v->vpci.mem); i++ )
-        RANGESET_DESTROY(v->vpci.mem[i]);
+    for ( i = 0; i < ARRAY_SIZE(v->vpci.task); i++ )
+    {
+        for ( j = 0; j < ARRAY_SIZE(v->vpci.task[i].bars); j++ )
+            RANGESET_DESTROY(v->vpci.task[i].bars[j].mem);
+    }
 }
 
 int vpci_vcpu_init(struct vcpu *v)
 {
-    unsigned int i;
+    unsigned int i, j;
 
     if ( !has_vpci(v->domain) && !is_idle_domain(v->domain) )
         return 0;
 
-    for ( i = 0; i < ARRAY_SIZE(v->vpci.mem); i++ )
+    for ( i = 0; i < ARRAY_SIZE(v->vpci.task); i++ )
     {
-        char str[32];
+        for ( j = 0; j < ARRAY_SIZE(v->vpci.task[i].bars); j++ )
+        {
+            char str[32];
 
-        snprintf(str, sizeof(str), "%pv:BAR%u", v, i);
-        v->vpci.mem[i] = rangeset_new(v->domain, str, RANGESETF_no_print);
-        if ( !v->vpci.mem[i] )
-            return -ENOMEM;
+            snprintf(str, sizeof(str), "%pv:BAR[%u][%u]", v, i, j);
+            v->vpci.task[i].bars[j].mem = rangeset_new(v->domain, str,
+                                                       RANGESETF_no_print);
+            if ( !v->vpci.task[i].bars[j].mem )
+                return -ENOMEM;
+        }
     }
 
     return 0;
diff --git a/xen/include/xen/vpci.h b/xen/include/xen/vpci.h
index b55bacbe6e01..e4fbf7b702d6 100644
--- a/xen/include/xen/vpci.h
+++ b/xen/include/xen/vpci.h
@@ -156,11 +156,18 @@ struct vpci {
 
 #ifdef __XEN__
 struct vpci_vcpu {
-    /* Per-vcpu structure to store state while {un}mapping of PCI BARs. */
     const struct pci_dev *pdev;
-    struct rangeset *mem[ARRAY_SIZE(((struct vpci_header *)NULL)->bars)];
-    uint16_t cmd;
-    bool rom_only : 1;
+    /* Per-vcpu structure to store state while {un}mapping of PCI BARs. */
+    struct vpci_map_task {
+        struct vpci_bar_map {
+            uint64_t addr;
+            uint64_t guest_addr;
+            struct rangeset *mem;
+        } bars[ARRAY_SIZE(((struct vpci_header *)NULL)->bars)];
+        uint16_t cmd;
+        bool rom_only : 1;
+        bool pending : 1;
+    } task[1];
 };
 
 void vpci_dump_msi(void);
-- 
2.53.0
Re: [PATCH v3 2/4] vpci: allow queueing of mapping operations
Posted by Mykyta Poturai 21 hours ago
On 3/24/26 05:04, Stewart Hildebrand wrote:
> Introduce vPCI BAR mapping task queue. Store information necessary in an
> array in struct vpci_vcpu to perform multiple p2m operations associated
> with single device.
> 
> This is preparatory work for further changes that need to perform
> multiple unmap/map operations before returning to guest.
> 
> At the moment, only a single slot is needed in the array. However, when
> multiple operations are queued and pending, there is a check in
> modify_bars() to skip BARs already in the requested state that is not
> accurate. Remove this check.
> 
> Signed-off-by: Stewart Hildebrand <stewart.hildebrand@amd.com>
> ---
> apply_map() and vpci_process_map_task() are very similar. Should we try
> to combine them into a single function?
> 
> v2->v3:
> * base on ("vpci: Use pervcpu ranges for BAR mapping") from [1]
> * rework with fixed array of map/unmap slots
> 
> [1] https://lore.kernel.org/xen-devel/cover.1772806036.git.mykyta_poturai@epam.com/T/#t
> 
> v1->v2:
> * new patch

Hi everyone,


Would it be possible to move back to a dynamically allocated number of 
tasks? This would help with mapping SR-IOV virtual functions a lot. 
@Stewart @Roger, what are your thoughts?

Alternatively, I can continue with the approach described in the SR-IOV
series, where VFs are handled separately. I figured out how to return to
do_softirq after mapping each VF, so it should not block the CPU for too
long.


-- 
Mykyta