[PATCH v2 4/8] vpci: add a wait operation to the vpci vcpu pending actions

Mykyta Poturai posted 8 patches 1 month ago
There is a newer version of this series
[PATCH v2 4/8] vpci: add a wait operation to the vpci vcpu pending actions
Posted by Mykyta Poturai 1 month ago
This allows waiting until a specified time on the vcpu. Once the
wait has finished, a callback is executed.

Note that this is still not used, but introduced here in order to
simplify the complexity of the patches that actually make use of it.

Signed-off-by: Roger Pau Monné <roger.pau@citrix.com>
Signed-off-by: Mykyta Poturai <mykyta_poturai@epam.com>
---
v1->v2:
* new patch
---
 xen/drivers/vpci/header.c | 125 ++++++++++++++++++++++----------------
 xen/include/xen/vpci.h    |  19 ++++++
 2 files changed, 90 insertions(+), 54 deletions(-)

diff --git a/xen/drivers/vpci/header.c b/xen/drivers/vpci/header.c
index cb64d9b9fc..284964f0d4 100644
--- a/xen/drivers/vpci/header.c
+++ b/xen/drivers/vpci/header.c
@@ -175,76 +175,92 @@ static void modify_decoding(const struct pci_dev *pdev, uint16_t cmd,
 
 bool vpci_process_pending(struct vcpu *v)
 {
-    const struct pci_dev *pdev = v->vpci.pdev;
-    struct vpci_header *header = NULL;
-    unsigned int i;
-
-    if ( !pdev )
-        return false;
-
-    read_lock(&v->domain->pci_lock);
-
-    if ( !pdev->vpci || (v->domain != pdev->domain) )
+    switch ( v->vpci.task )
     {
-        v->vpci.pdev = NULL;
-        read_unlock(&v->domain->pci_lock);
-        return false;
-    }
-
-    header = &pdev->vpci->header;
-    for ( i = 0; i < ARRAY_SIZE(header->bars); i++ )
+    case MODIFY_MEMORY:
     {
-        struct vpci_bar *bar = &header->bars[i];
-        struct rangeset *mem = v->vpci.bar_mem[i];
-        struct map_data data = {
-            .d = v->domain,
-            .map = v->vpci.cmd & PCI_COMMAND_MEMORY,
-            .bar = bar,
-        };
-        int rc;
+        const struct pci_dev *pdev = v->vpci.memory.pdev;
+        struct vpci_header *header = NULL;
+        unsigned int i;
 
-        if ( rangeset_is_empty(mem) )
-            continue;
+        if ( !pdev )
+            break;
 
-        rc = rangeset_consume_ranges(mem, map_range, &data);
+        read_lock(&v->domain->pci_lock);
 
-        if ( rc == -ERESTART )
+        if ( !pdev->vpci || (v->domain != pdev->domain) )
         {
+            v->vpci.memory.pdev = NULL;
             read_unlock(&v->domain->pci_lock);
-            return true;
+            break;
         }
 
-        if ( rc )
+        header = &pdev->vpci->header;
+        for ( i = 0; i < ARRAY_SIZE(header->bars); i++ )
         {
-            spin_lock(&pdev->vpci->lock);
-            /* Disable memory decoding unconditionally on failure. */
-            modify_decoding(pdev, v->vpci.cmd & ~PCI_COMMAND_MEMORY,
-                            false);
-            spin_unlock(&pdev->vpci->lock);
+            struct vpci_bar *bar = &header->bars[i];
+            struct rangeset *mem = v->vpci.bar_mem[i];
+            struct map_data data = {
+                .d = v->domain,
+                .map = v->vpci.memory.cmd & PCI_COMMAND_MEMORY,
+                .bar = bar,
+            };
+            int rc;
+
+            if ( rangeset_is_empty(mem) )
+                continue;
 
-            /* Clean all the rangesets */
-            for ( i = 0; i < ARRAY_SIZE(header->bars); i++ )
-                if ( !rangeset_is_empty(v->vpci.bar_mem[i]) )
-                     rangeset_purge(v->vpci.bar_mem[i]);
+            rc = rangeset_consume_ranges(mem, map_range, &data);
 
-            v->vpci.pdev = NULL;
+            if ( rc == -ERESTART )
+            {
+                read_unlock(&v->domain->pci_lock);
+                return true;
+            }
 
-            read_unlock(&v->domain->pci_lock);
+            if ( rc )
+            {
+                spin_lock(&pdev->vpci->lock);
+                /* Disable memory decoding unconditionally on failure. */
+                modify_decoding(pdev, v->vpci.memory.cmd & ~PCI_COMMAND_MEMORY,
+                                false);
+                spin_unlock(&pdev->vpci->lock);
+
+                /* Clean all the rangesets */
+                for ( i = 0; i < ARRAY_SIZE(header->bars); i++ )
+                    if ( !rangeset_is_empty(v->vpci.bar_mem[i]) )
+                        rangeset_purge(v->vpci.bar_mem[i]);
+
+                v->vpci.memory.pdev = NULL;
+
+                read_unlock(&v->domain->pci_lock);
 
-            if ( !is_hardware_domain(v->domain) )
-                domain_crash(v->domain);
+                if ( !is_hardware_domain(v->domain) )
+                    domain_crash(v->domain);
 
-            return false;
+                break;
+            }
         }
-    }
-    v->vpci.pdev = NULL;
+        v->vpci.memory.pdev = NULL;
 
-    spin_lock(&pdev->vpci->lock);
-    modify_decoding(pdev, v->vpci.cmd, v->vpci.rom_only);
-    spin_unlock(&pdev->vpci->lock);
+        spin_lock(&pdev->vpci->lock);
+        modify_decoding(pdev, v->vpci.memory.cmd, v->vpci.memory.rom_only);
+        spin_unlock(&pdev->vpci->lock);
 
-    read_unlock(&v->domain->pci_lock);
+        read_unlock(&v->domain->pci_lock);
+
+        break;
+    }
+    case WAIT:
+        if ( NOW() < v->vpci.wait.end )
+            return true;
+        v->vpci.wait.callback(v->vpci.wait.data);
+        break;
+    case NONE:
+        return false;
+    }
 
+    v->vpci.task = NONE;
     return false;
 }
 
@@ -295,9 +311,10 @@ static void defer_map(const struct pci_dev *pdev, uint16_t cmd, bool rom_only)
      * is mapped. This can lead to parallel mapping operations being
      * started for the same device if the domain is not well-behaved.
      */
-    curr->vpci.pdev = pdev;
-    curr->vpci.cmd = cmd;
-    curr->vpci.rom_only = rom_only;
+    curr->vpci.memory.pdev = pdev;
+    curr->vpci.memory.cmd = cmd;
+    curr->vpci.memory.rom_only = rom_only;
+    curr->vpci.task = MODIFY_MEMORY;
     /*
      * Raise a scheduler softirq in order to prevent the guest from resuming
      * execution with pending mapping operations, to trigger the invocation
diff --git a/xen/include/xen/vpci.h b/xen/include/xen/vpci.h
index fa654545e5..47cdb54d42 100644
--- a/xen/include/xen/vpci.h
+++ b/xen/include/xen/vpci.h
@@ -212,7 +212,26 @@ struct vpci_vcpu {
     /* Per-vcpu structure to store state while {un}mapping of PCI BARs. */
     const struct pci_dev *pdev;
 #ifdef __XEN__
+    enum {
+        NONE,
+        MODIFY_MEMORY,
+        WAIT,
+    } task;
     struct rangeset *bar_mem[PCI_HEADER_NORMAL_NR_BARS + 1];
+    union {
+        struct {
+            /* Store state while {un}mapping of PCI BARs. */
+            const struct pci_dev *pdev;
+            uint16_t cmd;
+            bool rom_only : 1;
+        } memory;
+        struct {
+            /* Store wait state. */
+            s_time_t end;
+            void (*callback)(void *);
+            void *data;
+        } wait;
+    };
 #endif
     uint16_t cmd;
     bool rom_only : 1;
-- 
2.51.2
Re: [PATCH v2 4/8] vpci: add a wait operation to the vpci vcpu pending actions
Posted by Jan Beulich 1 week, 2 days ago
On 09.03.2026 12:08, Mykyta Poturai wrote:
> --- a/xen/drivers/vpci/header.c
> +++ b/xen/drivers/vpci/header.c
> @@ -175,76 +175,92 @@ static void modify_decoding(const struct pci_dev *pdev, uint16_t cmd,
>  
>  bool vpci_process_pending(struct vcpu *v)
>  {
> -    const struct pci_dev *pdev = v->vpci.pdev;
> -    struct vpci_header *header = NULL;
> -    unsigned int i;
> -
> -    if ( !pdev )
> -        return false;
> -
> -    read_lock(&v->domain->pci_lock);
> -
> -    if ( !pdev->vpci || (v->domain != pdev->domain) )
> +    switch ( v->vpci.task )
>      {
> -        v->vpci.pdev = NULL;
> -        read_unlock(&v->domain->pci_lock);
> -        return false;
> -    }
> -
> -    header = &pdev->vpci->header;
> -    for ( i = 0; i < ARRAY_SIZE(header->bars); i++ )
> +    case MODIFY_MEMORY:
>      {
> -        struct vpci_bar *bar = &header->bars[i];
> -        struct rangeset *mem = v->vpci.bar_mem[i];
> -        struct map_data data = {
> -            .d = v->domain,
> -            .map = v->vpci.cmd & PCI_COMMAND_MEMORY,
> -            .bar = bar,
> -        };
> -        int rc;
> +        const struct pci_dev *pdev = v->vpci.memory.pdev;
> +        struct vpci_header *header = NULL;
> +        unsigned int i;
>  
> -        if ( rangeset_is_empty(mem) )
> -            continue;
> +        if ( !pdev )
> +            break;
>  
> -        rc = rangeset_consume_ranges(mem, map_range, &data);
> +        read_lock(&v->domain->pci_lock);
>  
> -        if ( rc == -ERESTART )
> +        if ( !pdev->vpci || (v->domain != pdev->domain) )
>          {
> +            v->vpci.memory.pdev = NULL;
>              read_unlock(&v->domain->pci_lock);
> -            return true;
> +            break;
>          }
>  
> -        if ( rc )
> +        header = &pdev->vpci->header;
> +        for ( i = 0; i < ARRAY_SIZE(header->bars); i++ )
>          {
> -            spin_lock(&pdev->vpci->lock);
> -            /* Disable memory decoding unconditionally on failure. */
> -            modify_decoding(pdev, v->vpci.cmd & ~PCI_COMMAND_MEMORY,
> -                            false);
> -            spin_unlock(&pdev->vpci->lock);
> +            struct vpci_bar *bar = &header->bars[i];
> +            struct rangeset *mem = v->vpci.bar_mem[i];
> +            struct map_data data = {
> +                .d = v->domain,
> +                .map = v->vpci.memory.cmd & PCI_COMMAND_MEMORY,
> +                .bar = bar,
> +            };
> +            int rc;
> +
> +            if ( rangeset_is_empty(mem) )
> +                continue;
>  
> -            /* Clean all the rangesets */
> -            for ( i = 0; i < ARRAY_SIZE(header->bars); i++ )
> -                if ( !rangeset_is_empty(v->vpci.bar_mem[i]) )
> -                     rangeset_purge(v->vpci.bar_mem[i]);
> +            rc = rangeset_consume_ranges(mem, map_range, &data);
>  
> -            v->vpci.pdev = NULL;
> +            if ( rc == -ERESTART )
> +            {
> +                read_unlock(&v->domain->pci_lock);
> +                return true;
> +            }
>  
> -            read_unlock(&v->domain->pci_lock);
> +            if ( rc )
> +            {
> +                spin_lock(&pdev->vpci->lock);
> +                /* Disable memory decoding unconditionally on failure. */
> +                modify_decoding(pdev, v->vpci.memory.cmd & ~PCI_COMMAND_MEMORY,
> +                                false);
> +                spin_unlock(&pdev->vpci->lock);
> +
> +                /* Clean all the rangesets */
> +                for ( i = 0; i < ARRAY_SIZE(header->bars); i++ )
> +                    if ( !rangeset_is_empty(v->vpci.bar_mem[i]) )
> +                        rangeset_purge(v->vpci.bar_mem[i]);
> +
> +                v->vpci.memory.pdev = NULL;
> +
> +                read_unlock(&v->domain->pci_lock);
>  
> -            if ( !is_hardware_domain(v->domain) )
> -                domain_crash(v->domain);
> +                if ( !is_hardware_domain(v->domain) )
> +                    domain_crash(v->domain);
>  
> -            return false;
> +                break;
> +            }
>          }
> -    }
> -    v->vpci.pdev = NULL;
> +        v->vpci.memory.pdev = NULL;
>  
> -    spin_lock(&pdev->vpci->lock);
> -    modify_decoding(pdev, v->vpci.cmd, v->vpci.rom_only);
> -    spin_unlock(&pdev->vpci->lock);
> +        spin_lock(&pdev->vpci->lock);
> +        modify_decoding(pdev, v->vpci.memory.cmd, v->vpci.memory.rom_only);
> +        spin_unlock(&pdev->vpci->lock);
>  
> -    read_unlock(&v->domain->pci_lock);
> +        read_unlock(&v->domain->pci_lock);
> +
> +        break;
> +    }
> +    case WAIT:
> +        if ( NOW() < v->vpci.wait.end )
> +            return true;
> +        v->vpci.wait.callback(v->vpci.wait.data);
> +        break;

As just indicated in reply to patch 6, busy waiting isn't really acceptable.
This is even more so when the waiting exceeds the typical length of a
scheduling timeslice.

In that other reply I said to put the vCPU to sleep, but you need to be careful
there too: The domain may not expect its vCPU to not make any progress for such
an extended period of time. This may need doing entirely differently: Once the
command register was written, you may want to record the time after which
accesses to the VF registers are permitted. Earlier accesses would simply be
terminated. You may still additionally need a timer, in order to kick off BAR
mapping after that time. (Yet better would be if the BAR mapping could be
done during those 100ms. After all that may be a reason why this long a delay
is specified: Firmware on the device may also require some time to set up the
BARs accordingly.)

Jan
Re: [PATCH v2 4/8] vpci: add a wait operation to the vpci vcpu pending actions
Posted by Mykyta Poturai 1 week, 1 day ago
On 3/31/26 17:55, Jan Beulich wrote:
> On 09.03.2026 12:08, Mykyta Poturai wrote:
>> --- a/xen/drivers/vpci/header.c
>> +++ b/xen/drivers/vpci/header.c
>> @@ -175,76 +175,92 @@ static void modify_decoding(const struct pci_dev *pdev, uint16_t cmd,
>>   
>>   bool vpci_process_pending(struct vcpu *v)
>>   {
>> -    const struct pci_dev *pdev = v->vpci.pdev;
>> -    struct vpci_header *header = NULL;
>> -    unsigned int i;
>> -
>> -    if ( !pdev )
>> -        return false;
>> -
>> -    read_lock(&v->domain->pci_lock);
>> -
>> -    if ( !pdev->vpci || (v->domain != pdev->domain) )
>> +    switch ( v->vpci.task )
>>       {
>> -        v->vpci.pdev = NULL;
>> -        read_unlock(&v->domain->pci_lock);
>> -        return false;
>> -    }
>> -
>> -    header = &pdev->vpci->header;
>> -    for ( i = 0; i < ARRAY_SIZE(header->bars); i++ )
>> +    case MODIFY_MEMORY:
>>       {
>> -        struct vpci_bar *bar = &header->bars[i];
>> -        struct rangeset *mem = v->vpci.bar_mem[i];
>> -        struct map_data data = {
>> -            .d = v->domain,
>> -            .map = v->vpci.cmd & PCI_COMMAND_MEMORY,
>> -            .bar = bar,
>> -        };
>> -        int rc;
>> +        const struct pci_dev *pdev = v->vpci.memory.pdev;
>> +        struct vpci_header *header = NULL;
>> +        unsigned int i;
>>   
>> -        if ( rangeset_is_empty(mem) )
>> -            continue;
>> +        if ( !pdev )
>> +            break;
>>   
>> -        rc = rangeset_consume_ranges(mem, map_range, &data);
>> +        read_lock(&v->domain->pci_lock);
>>   
>> -        if ( rc == -ERESTART )
>> +        if ( !pdev->vpci || (v->domain != pdev->domain) )
>>           {
>> +            v->vpci.memory.pdev = NULL;
>>               read_unlock(&v->domain->pci_lock);
>> -            return true;
>> +            break;
>>           }
>>   
>> -        if ( rc )
>> +        header = &pdev->vpci->header;
>> +        for ( i = 0; i < ARRAY_SIZE(header->bars); i++ )
>>           {
>> -            spin_lock(&pdev->vpci->lock);
>> -            /* Disable memory decoding unconditionally on failure. */
>> -            modify_decoding(pdev, v->vpci.cmd & ~PCI_COMMAND_MEMORY,
>> -                            false);
>> -            spin_unlock(&pdev->vpci->lock);
>> +            struct vpci_bar *bar = &header->bars[i];
>> +            struct rangeset *mem = v->vpci.bar_mem[i];
>> +            struct map_data data = {
>> +                .d = v->domain,
>> +                .map = v->vpci.memory.cmd & PCI_COMMAND_MEMORY,
>> +                .bar = bar,
>> +            };
>> +            int rc;
>> +
>> +            if ( rangeset_is_empty(mem) )
>> +                continue;
>>   
>> -            /* Clean all the rangesets */
>> -            for ( i = 0; i < ARRAY_SIZE(header->bars); i++ )
>> -                if ( !rangeset_is_empty(v->vpci.bar_mem[i]) )
>> -                     rangeset_purge(v->vpci.bar_mem[i]);
>> +            rc = rangeset_consume_ranges(mem, map_range, &data);
>>   
>> -            v->vpci.pdev = NULL;
>> +            if ( rc == -ERESTART )
>> +            {
>> +                read_unlock(&v->domain->pci_lock);
>> +                return true;
>> +            }
>>   
>> -            read_unlock(&v->domain->pci_lock);
>> +            if ( rc )
>> +            {
>> +                spin_lock(&pdev->vpci->lock);
>> +                /* Disable memory decoding unconditionally on failure. */
>> +                modify_decoding(pdev, v->vpci.memory.cmd & ~PCI_COMMAND_MEMORY,
>> +                                false);
>> +                spin_unlock(&pdev->vpci->lock);
>> +
>> +                /* Clean all the rangesets */
>> +                for ( i = 0; i < ARRAY_SIZE(header->bars); i++ )
>> +                    if ( !rangeset_is_empty(v->vpci.bar_mem[i]) )
>> +                        rangeset_purge(v->vpci.bar_mem[i]);
>> +
>> +                v->vpci.memory.pdev = NULL;
>> +
>> +                read_unlock(&v->domain->pci_lock);
>>   
>> -            if ( !is_hardware_domain(v->domain) )
>> -                domain_crash(v->domain);
>> +                if ( !is_hardware_domain(v->domain) )
>> +                    domain_crash(v->domain);
>>   
>> -            return false;
>> +                break;
>> +            }
>>           }
>> -    }
>> -    v->vpci.pdev = NULL;
>> +        v->vpci.memory.pdev = NULL;
>>   
>> -    spin_lock(&pdev->vpci->lock);
>> -    modify_decoding(pdev, v->vpci.cmd, v->vpci.rom_only);
>> -    spin_unlock(&pdev->vpci->lock);
>> +        spin_lock(&pdev->vpci->lock);
>> +        modify_decoding(pdev, v->vpci.memory.cmd, v->vpci.memory.rom_only);
>> +        spin_unlock(&pdev->vpci->lock);
>>   
>> -    read_unlock(&v->domain->pci_lock);
>> +        read_unlock(&v->domain->pci_lock);
>> +
>> +        break;
>> +    }
>> +    case WAIT:
>> +        if ( NOW() < v->vpci.wait.end )
>> +            return true;
>> +        v->vpci.wait.callback(v->vpci.wait.data);
>> +        break;
> 
> As just indicated in reply to patch 6, busy waiting isn't really acceptable.
> This is even more so when the waiting exceeds the typical length of a
> scheduling timeslice.
> 
> In that other reply I said to put the vCPU to sleep, but you need to be careful
> there too: The domain may not expect its vCPU to not make any progress for such
> an extended period of time. This may need doing entirely differently: Once the
> command register was written, you may want to record the time after which
> accesses to the VF registers are permitted. Earlier accesses would simply be
> terminated. You may still additionally need a timer, in order to kick off BAR
> mapping after that time. (Yet better would  be if the BAR mapping could be
> done during those 100ms. After all that may be a reason why this long a delay
> is specified: Firmware on the device may also require some time to set up the
> BARs accordingly.)
> 
> Jan

I am not sure it would work that way. If we look at how Linux 
initializes SR-IOV, it writes the VFE and MSE bits, waits 100ms and then 
expects VFs to be operational. If they are not operational at that 
moment, then it considers the operation failed and removes all VFs. If 
we also wait 100ms before enabling access, the probability of a guest 
trying to access something before we allow it would be very high.

So I think there is no way to add VFs in Xen without blocking the 
guest’s vCPU in some way. We can revert back to the old variant and rely 
on physdev op to add VFs one by one as they are discovered by Dom0, then 
we will not need to explicitly wait.
@Roger are you okay with that?


Snippet from Linux:

static int sriov_enable(struct pci_dev *dev, int nr_virtfn)
{
	...
	pci_iov_set_numvfs(dev, nr_virtfn);
	iov->ctrl |= PCI_SRIOV_CTRL_VFE | PCI_SRIOV_CTRL_MSE;
	pci_cfg_access_lock(dev);
	pci_write_config_word(dev, iov->pos + PCI_SRIOV_CTRL, iov->ctrl);
	msleep(100);
	pci_cfg_access_unlock(dev);

	rc = sriov_add_vfs(dev, initial);
	if (rc)
		goto err_pcibios;
	...
}





-- 
Mykyta
Re: [PATCH v2 4/8] vpci: add a wait operation to the vpci vcpu pending actions
Posted by Jan Beulich 1 week, 1 day ago
On 01.04.2026 09:59, Mykyta Poturai wrote:
> On 3/31/26 17:55, Jan Beulich wrote:
>> On 09.03.2026 12:08, Mykyta Poturai wrote:
>>> --- a/xen/drivers/vpci/header.c
>>> +++ b/xen/drivers/vpci/header.c
>>> @@ -175,76 +175,92 @@ static void modify_decoding(const struct pci_dev *pdev, uint16_t cmd,
>>>   
>>>   bool vpci_process_pending(struct vcpu *v)
>>>   {
>>> -    const struct pci_dev *pdev = v->vpci.pdev;
>>> -    struct vpci_header *header = NULL;
>>> -    unsigned int i;
>>> -
>>> -    if ( !pdev )
>>> -        return false;
>>> -
>>> -    read_lock(&v->domain->pci_lock);
>>> -
>>> -    if ( !pdev->vpci || (v->domain != pdev->domain) )
>>> +    switch ( v->vpci.task )
>>>       {
>>> -        v->vpci.pdev = NULL;
>>> -        read_unlock(&v->domain->pci_lock);
>>> -        return false;
>>> -    }
>>> -
>>> -    header = &pdev->vpci->header;
>>> -    for ( i = 0; i < ARRAY_SIZE(header->bars); i++ )
>>> +    case MODIFY_MEMORY:
>>>       {
>>> -        struct vpci_bar *bar = &header->bars[i];
>>> -        struct rangeset *mem = v->vpci.bar_mem[i];
>>> -        struct map_data data = {
>>> -            .d = v->domain,
>>> -            .map = v->vpci.cmd & PCI_COMMAND_MEMORY,
>>> -            .bar = bar,
>>> -        };
>>> -        int rc;
>>> +        const struct pci_dev *pdev = v->vpci.memory.pdev;
>>> +        struct vpci_header *header = NULL;
>>> +        unsigned int i;
>>>   
>>> -        if ( rangeset_is_empty(mem) )
>>> -            continue;
>>> +        if ( !pdev )
>>> +            break;
>>>   
>>> -        rc = rangeset_consume_ranges(mem, map_range, &data);
>>> +        read_lock(&v->domain->pci_lock);
>>>   
>>> -        if ( rc == -ERESTART )
>>> +        if ( !pdev->vpci || (v->domain != pdev->domain) )
>>>           {
>>> +            v->vpci.memory.pdev = NULL;
>>>               read_unlock(&v->domain->pci_lock);
>>> -            return true;
>>> +            break;
>>>           }
>>>   
>>> -        if ( rc )
>>> +        header = &pdev->vpci->header;
>>> +        for ( i = 0; i < ARRAY_SIZE(header->bars); i++ )
>>>           {
>>> -            spin_lock(&pdev->vpci->lock);
>>> -            /* Disable memory decoding unconditionally on failure. */
>>> -            modify_decoding(pdev, v->vpci.cmd & ~PCI_COMMAND_MEMORY,
>>> -                            false);
>>> -            spin_unlock(&pdev->vpci->lock);
>>> +            struct vpci_bar *bar = &header->bars[i];
>>> +            struct rangeset *mem = v->vpci.bar_mem[i];
>>> +            struct map_data data = {
>>> +                .d = v->domain,
>>> +                .map = v->vpci.memory.cmd & PCI_COMMAND_MEMORY,
>>> +                .bar = bar,
>>> +            };
>>> +            int rc;
>>> +
>>> +            if ( rangeset_is_empty(mem) )
>>> +                continue;
>>>   
>>> -            /* Clean all the rangesets */
>>> -            for ( i = 0; i < ARRAY_SIZE(header->bars); i++ )
>>> -                if ( !rangeset_is_empty(v->vpci.bar_mem[i]) )
>>> -                     rangeset_purge(v->vpci.bar_mem[i]);
>>> +            rc = rangeset_consume_ranges(mem, map_range, &data);
>>>   
>>> -            v->vpci.pdev = NULL;
>>> +            if ( rc == -ERESTART )
>>> +            {
>>> +                read_unlock(&v->domain->pci_lock);
>>> +                return true;
>>> +            }
>>>   
>>> -            read_unlock(&v->domain->pci_lock);
>>> +            if ( rc )
>>> +            {
>>> +                spin_lock(&pdev->vpci->lock);
>>> +                /* Disable memory decoding unconditionally on failure. */
>>> +                modify_decoding(pdev, v->vpci.memory.cmd & ~PCI_COMMAND_MEMORY,
>>> +                                false);
>>> +                spin_unlock(&pdev->vpci->lock);
>>> +
>>> +                /* Clean all the rangesets */
>>> +                for ( i = 0; i < ARRAY_SIZE(header->bars); i++ )
>>> +                    if ( !rangeset_is_empty(v->vpci.bar_mem[i]) )
>>> +                        rangeset_purge(v->vpci.bar_mem[i]);
>>> +
>>> +                v->vpci.memory.pdev = NULL;
>>> +
>>> +                read_unlock(&v->domain->pci_lock);
>>>   
>>> -            if ( !is_hardware_domain(v->domain) )
>>> -                domain_crash(v->domain);
>>> +                if ( !is_hardware_domain(v->domain) )
>>> +                    domain_crash(v->domain);
>>>   
>>> -            return false;
>>> +                break;
>>> +            }
>>>           }
>>> -    }
>>> -    v->vpci.pdev = NULL;
>>> +        v->vpci.memory.pdev = NULL;
>>>   
>>> -    spin_lock(&pdev->vpci->lock);
>>> -    modify_decoding(pdev, v->vpci.cmd, v->vpci.rom_only);
>>> -    spin_unlock(&pdev->vpci->lock);
>>> +        spin_lock(&pdev->vpci->lock);
>>> +        modify_decoding(pdev, v->vpci.memory.cmd, v->vpci.memory.rom_only);
>>> +        spin_unlock(&pdev->vpci->lock);
>>>   
>>> -    read_unlock(&v->domain->pci_lock);
>>> +        read_unlock(&v->domain->pci_lock);
>>> +
>>> +        break;
>>> +    }
>>> +    case WAIT:
>>> +        if ( NOW() < v->vpci.wait.end )
>>> +            return true;
>>> +        v->vpci.wait.callback(v->vpci.wait.data);
>>> +        break;
>>
>> As just indicated in reply to patch 6, busy waiting isn't really acceptable.
>> This is even more so when the waiting exceeds the typical length of a
>> scheduling timeslice.
>>
>> In that other reply I said to put the vCPU to sleep, but you need to be careful
>> there too: The domain may not expect its vCPU to not make any progress for such
>> an extended period of time. This may need doing entirely differently: Once the
>> command register was written, you may want to record the time after which
>> accesses to the VF registers are permitted. Earlier accesses would simply be
>> terminated. You may still additionally need a timer, in order to kick off BAR
>> mapping after that time. (Yet better would  be if the BAR mapping could be
>> done during those 100ms. After all that may be a reason why this long a delay
>> is specified: Firmware on the device may also require some time to set up the
>> BARs accordingly.)
> 
> I am not sure it would work that way. If we look at how linux 
> initialized sriov, it writes VFE and MSE bits, waits 100ms and then 
> expects VFs to be operational. If they are not operational at that 
> moment, then it considers the operation failed and removes all VFs. If 
> we also wait 100ms before enabling access, the probability of a guest 
> trying to access something before we allow it would be very high.

Well, not really. Our counting of the 100ms necessarily starts before Dom0's.
Furthermore it may be acceptable (or even appropriate) to stall premature
accesses (because they shouldn't occur in the first place), by blocking the
vCPU at that point. A middle route may be possible: Terminate accesses in,
say, the first 90ms, and stall the vCPU for any access past that, but before
the 100ms expired.

Jan
Re: [PATCH v2 4/8] vpci: add a wait operation to the vpci vcpu pending actions
Posted by Mykyta Poturai 1 week, 1 day ago

On 4/1/26 11:21, Jan Beulich wrote:
> On 01.04.2026 09:59, Mykyta Poturai wrote:
>> On 3/31/26 17:55, Jan Beulich wrote:
>>> On 09.03.2026 12:08, Mykyta Poturai wrote:
>>>> --- a/xen/drivers/vpci/header.c
>>>> +++ b/xen/drivers/vpci/header.c
>>>> @@ -175,76 +175,92 @@ static void modify_decoding(const struct pci_dev *pdev, uint16_t cmd,
>>>>    
>>>>    bool vpci_process_pending(struct vcpu *v)
>>>>    {
>>>> -    const struct pci_dev *pdev = v->vpci.pdev;
>>>> -    struct vpci_header *header = NULL;
>>>> -    unsigned int i;
>>>> -
>>>> -    if ( !pdev )
>>>> -        return false;
>>>> -
>>>> -    read_lock(&v->domain->pci_lock);
>>>> -
>>>> -    if ( !pdev->vpci || (v->domain != pdev->domain) )
>>>> +    switch ( v->vpci.task )
>>>>        {
>>>> -        v->vpci.pdev = NULL;
>>>> -        read_unlock(&v->domain->pci_lock);
>>>> -        return false;
>>>> -    }
>>>> -
>>>> -    header = &pdev->vpci->header;
>>>> -    for ( i = 0; i < ARRAY_SIZE(header->bars); i++ )
>>>> +    case MODIFY_MEMORY:
>>>>        {
>>>> -        struct vpci_bar *bar = &header->bars[i];
>>>> -        struct rangeset *mem = v->vpci.bar_mem[i];
>>>> -        struct map_data data = {
>>>> -            .d = v->domain,
>>>> -            .map = v->vpci.cmd & PCI_COMMAND_MEMORY,
>>>> -            .bar = bar,
>>>> -        };
>>>> -        int rc;
>>>> +        const struct pci_dev *pdev = v->vpci.memory.pdev;
>>>> +        struct vpci_header *header = NULL;
>>>> +        unsigned int i;
>>>>    
>>>> -        if ( rangeset_is_empty(mem) )
>>>> -            continue;
>>>> +        if ( !pdev )
>>>> +            break;
>>>>    
>>>> -        rc = rangeset_consume_ranges(mem, map_range, &data);
>>>> +        read_lock(&v->domain->pci_lock);
>>>>    
>>>> -        if ( rc == -ERESTART )
>>>> +        if ( !pdev->vpci || (v->domain != pdev->domain) )
>>>>            {
>>>> +            v->vpci.memory.pdev = NULL;
>>>>                read_unlock(&v->domain->pci_lock);
>>>> -            return true;
>>>> +            break;
>>>>            }
>>>>    
>>>> -        if ( rc )
>>>> +        header = &pdev->vpci->header;
>>>> +        for ( i = 0; i < ARRAY_SIZE(header->bars); i++ )
>>>>            {
>>>> -            spin_lock(&pdev->vpci->lock);
>>>> -            /* Disable memory decoding unconditionally on failure. */
>>>> -            modify_decoding(pdev, v->vpci.cmd & ~PCI_COMMAND_MEMORY,
>>>> -                            false);
>>>> -            spin_unlock(&pdev->vpci->lock);
>>>> +            struct vpci_bar *bar = &header->bars[i];
>>>> +            struct rangeset *mem = v->vpci.bar_mem[i];
>>>> +            struct map_data data = {
>>>> +                .d = v->domain,
>>>> +                .map = v->vpci.memory.cmd & PCI_COMMAND_MEMORY,
>>>> +                .bar = bar,
>>>> +            };
>>>> +            int rc;
>>>> +
>>>> +            if ( rangeset_is_empty(mem) )
>>>> +                continue;
>>>>    
>>>> -            /* Clean all the rangesets */
>>>> -            for ( i = 0; i < ARRAY_SIZE(header->bars); i++ )
>>>> -                if ( !rangeset_is_empty(v->vpci.bar_mem[i]) )
>>>> -                     rangeset_purge(v->vpci.bar_mem[i]);
>>>> +            rc = rangeset_consume_ranges(mem, map_range, &data);
>>>>    
>>>> -            v->vpci.pdev = NULL;
>>>> +            if ( rc == -ERESTART )
>>>> +            {
>>>> +                read_unlock(&v->domain->pci_lock);
>>>> +                return true;
>>>> +            }
>>>>    
>>>> -            read_unlock(&v->domain->pci_lock);
>>>> +            if ( rc )
>>>> +            {
>>>> +                spin_lock(&pdev->vpci->lock);
>>>> +                /* Disable memory decoding unconditionally on failure. */
>>>> +                modify_decoding(pdev, v->vpci.memory.cmd & ~PCI_COMMAND_MEMORY,
>>>> +                                false);
>>>> +                spin_unlock(&pdev->vpci->lock);
>>>> +
>>>> +                /* Clean all the rangesets */
>>>> +                for ( i = 0; i < ARRAY_SIZE(header->bars); i++ )
>>>> +                    if ( !rangeset_is_empty(v->vpci.bar_mem[i]) )
>>>> +                        rangeset_purge(v->vpci.bar_mem[i]);
>>>> +
>>>> +                v->vpci.memory.pdev = NULL;
>>>> +
>>>> +                read_unlock(&v->domain->pci_lock);
>>>>    
>>>> -            if ( !is_hardware_domain(v->domain) )
>>>> -                domain_crash(v->domain);
>>>> +                if ( !is_hardware_domain(v->domain) )
>>>> +                    domain_crash(v->domain);
>>>>    
>>>> -            return false;
>>>> +                break;
>>>> +            }
>>>>            }
>>>> -    }
>>>> -    v->vpci.pdev = NULL;
>>>> +        v->vpci.memory.pdev = NULL;
>>>>    
>>>> -    spin_lock(&pdev->vpci->lock);
>>>> -    modify_decoding(pdev, v->vpci.cmd, v->vpci.rom_only);
>>>> -    spin_unlock(&pdev->vpci->lock);
>>>> +        spin_lock(&pdev->vpci->lock);
>>>> +        modify_decoding(pdev, v->vpci.memory.cmd, v->vpci.memory.rom_only);
>>>> +        spin_unlock(&pdev->vpci->lock);
>>>>    
>>>> -    read_unlock(&v->domain->pci_lock);
>>>> +        read_unlock(&v->domain->pci_lock);
>>>> +
>>>> +        break;
>>>> +    }
>>>> +    case WAIT:
>>>> +        if ( NOW() < v->vpci.wait.end )
>>>> +            return true;
>>>> +        v->vpci.wait.callback(v->vpci.wait.data);
>>>> +        break;
>>>
>>> As just indicated in reply to patch 6, busy waiting isn't really acceptable.
>>> This is even more so when the waiting exceeds the typical length of a
>>> scheduling timeslice.
>>>
>>> In that other reply I said to put the vCPU to sleep, but you need to be careful
>>> there too: The domain may not expect its vCPU to not make any progress for such
>>> an extended period of time. This may need doing entirely differently: Once the
>>> command register was written, you may want to record the time after which
>>> accesses to the VF registers are permitted. Earlier accesses would simply be
>>> terminated. You may still additionally need a timer, in order to kick off BAR
>>> mapping after that time. (Yet better would  be if the BAR mapping could be
>>> done during those 100ms. After all that may be a reason why this long a delay
>>> is specified: Firmware on the device may also require some time to set up the
>>> BARs accordingly.)
>>
>> I am not sure it would work that way. If we look at how linux
>> initialized sriov, it writes VFE and MSE bits, waits 100ms and then
>> expects VFs to be operational. If they are not operational at that
>> moment, then it considers the operation failed and removes all VFs. If
>> we also wait 100ms before enabling access, the probability of a guest
>> trying to access something before we allow it would be very high.
> 
> Well, not really. Our counting of the 100ms necessarily starts before Dom0's.
> Furthermore it may be acceptable (or even appropriate) to stall premature
> accesses (because they shouldn't occur in the first place), by blocking the
> vCPU at that point. A middle route may be possible: Terminate accesses in,
> say, the first 90ms, and stall the vCPU for any access past that, but before
> the 100ms expired.
> 

Is there any real benefit to doing all this work instead of just waiting 
for Dom0 to register the VFs? Implementing it the way you described 
would require a relatively complex state machine and two timers per 
sriov-capable device. And will also probably require some hacks to 
handle partially initialized VFs in Xen. This adds a lot of work and 
many possible bugs for not a lot of benefit in my opinion.

-- 
Mykyta
Re: [PATCH v2 4/8] vpci: add a wait operation to the vpci vcpu pending actions
Posted by Jan Beulich 1 week, 1 day ago
On 01.04.2026 16:07, Mykyta Poturai wrote:
> On 4/1/26 11:21, Jan Beulich wrote:
>> On 01.04.2026 09:59, Mykyta Poturai wrote:
>>> On 3/31/26 17:55, Jan Beulich wrote:
>>>> On 09.03.2026 12:08, Mykyta Poturai wrote:
>>>>> --- a/xen/drivers/vpci/header.c
>>>>> +++ b/xen/drivers/vpci/header.c
>>>>> @@ -175,76 +175,92 @@ static void modify_decoding(const struct pci_dev *pdev, uint16_t cmd,
>>>>>    
>>>>>    bool vpci_process_pending(struct vcpu *v)
>>>>>    {
>>>>> -    const struct pci_dev *pdev = v->vpci.pdev;
>>>>> -    struct vpci_header *header = NULL;
>>>>> -    unsigned int i;
>>>>> -
>>>>> -    if ( !pdev )
>>>>> -        return false;
>>>>> -
>>>>> -    read_lock(&v->domain->pci_lock);
>>>>> -
>>>>> -    if ( !pdev->vpci || (v->domain != pdev->domain) )
>>>>> +    switch ( v->vpci.task )
>>>>>        {
>>>>> -        v->vpci.pdev = NULL;
>>>>> -        read_unlock(&v->domain->pci_lock);
>>>>> -        return false;
>>>>> -    }
>>>>> -
>>>>> -    header = &pdev->vpci->header;
>>>>> -    for ( i = 0; i < ARRAY_SIZE(header->bars); i++ )
>>>>> +    case MODIFY_MEMORY:
>>>>>        {
>>>>> -        struct vpci_bar *bar = &header->bars[i];
>>>>> -        struct rangeset *mem = v->vpci.bar_mem[i];
>>>>> -        struct map_data data = {
>>>>> -            .d = v->domain,
>>>>> -            .map = v->vpci.cmd & PCI_COMMAND_MEMORY,
>>>>> -            .bar = bar,
>>>>> -        };
>>>>> -        int rc;
>>>>> +        const struct pci_dev *pdev = v->vpci.memory.pdev;
>>>>> +        struct vpci_header *header = NULL;
>>>>> +        unsigned int i;
>>>>>    
>>>>> -        if ( rangeset_is_empty(mem) )
>>>>> -            continue;
>>>>> +        if ( !pdev )
>>>>> +            break;
>>>>>    
>>>>> -        rc = rangeset_consume_ranges(mem, map_range, &data);
>>>>> +        read_lock(&v->domain->pci_lock);
>>>>>    
>>>>> -        if ( rc == -ERESTART )
>>>>> +        if ( !pdev->vpci || (v->domain != pdev->domain) )
>>>>>            {
>>>>> +            v->vpci.memory.pdev = NULL;
>>>>>                read_unlock(&v->domain->pci_lock);
>>>>> -            return true;
>>>>> +            break;
>>>>>            }
>>>>>    
>>>>> -        if ( rc )
>>>>> +        header = &pdev->vpci->header;
>>>>> +        for ( i = 0; i < ARRAY_SIZE(header->bars); i++ )
>>>>>            {
>>>>> -            spin_lock(&pdev->vpci->lock);
>>>>> -            /* Disable memory decoding unconditionally on failure. */
>>>>> -            modify_decoding(pdev, v->vpci.cmd & ~PCI_COMMAND_MEMORY,
>>>>> -                            false);
>>>>> -            spin_unlock(&pdev->vpci->lock);
>>>>> +            struct vpci_bar *bar = &header->bars[i];
>>>>> +            struct rangeset *mem = v->vpci.bar_mem[i];
>>>>> +            struct map_data data = {
>>>>> +                .d = v->domain,
>>>>> +                .map = v->vpci.memory.cmd & PCI_COMMAND_MEMORY,
>>>>> +                .bar = bar,
>>>>> +            };
>>>>> +            int rc;
>>>>> +
>>>>> +            if ( rangeset_is_empty(mem) )
>>>>> +                continue;
>>>>>    
>>>>> -            /* Clean all the rangesets */
>>>>> -            for ( i = 0; i < ARRAY_SIZE(header->bars); i++ )
>>>>> -                if ( !rangeset_is_empty(v->vpci.bar_mem[i]) )
>>>>> -                     rangeset_purge(v->vpci.bar_mem[i]);
>>>>> +            rc = rangeset_consume_ranges(mem, map_range, &data);
>>>>>    
>>>>> -            v->vpci.pdev = NULL;
>>>>> +            if ( rc == -ERESTART )
>>>>> +            {
>>>>> +                read_unlock(&v->domain->pci_lock);
>>>>> +                return true;
>>>>> +            }
>>>>>    
>>>>> -            read_unlock(&v->domain->pci_lock);
>>>>> +            if ( rc )
>>>>> +            {
>>>>> +                spin_lock(&pdev->vpci->lock);
>>>>> +                /* Disable memory decoding unconditionally on failure. */
>>>>> +                modify_decoding(pdev, v->vpci.memory.cmd & ~PCI_COMMAND_MEMORY,
>>>>> +                                false);
>>>>> +                spin_unlock(&pdev->vpci->lock);
>>>>> +
>>>>> +                /* Clean all the rangesets */
>>>>> +                for ( i = 0; i < ARRAY_SIZE(header->bars); i++ )
>>>>> +                    if ( !rangeset_is_empty(v->vpci.bar_mem[i]) )
>>>>> +                        rangeset_purge(v->vpci.bar_mem[i]);
>>>>> +
>>>>> +                v->vpci.memory.pdev = NULL;
>>>>> +
>>>>> +                read_unlock(&v->domain->pci_lock);
>>>>>    
>>>>> -            if ( !is_hardware_domain(v->domain) )
>>>>> -                domain_crash(v->domain);
>>>>> +                if ( !is_hardware_domain(v->domain) )
>>>>> +                    domain_crash(v->domain);
>>>>>    
>>>>> -            return false;
>>>>> +                break;
>>>>> +            }
>>>>>            }
>>>>> -    }
>>>>> -    v->vpci.pdev = NULL;
>>>>> +        v->vpci.memory.pdev = NULL;
>>>>>    
>>>>> -    spin_lock(&pdev->vpci->lock);
>>>>> -    modify_decoding(pdev, v->vpci.cmd, v->vpci.rom_only);
>>>>> -    spin_unlock(&pdev->vpci->lock);
>>>>> +        spin_lock(&pdev->vpci->lock);
>>>>> +        modify_decoding(pdev, v->vpci.memory.cmd, v->vpci.memory.rom_only);
>>>>> +        spin_unlock(&pdev->vpci->lock);
>>>>>    
>>>>> -    read_unlock(&v->domain->pci_lock);
>>>>> +        read_unlock(&v->domain->pci_lock);
>>>>> +
>>>>> +        break;
>>>>> +    }
>>>>> +    case WAIT:
>>>>> +        if ( NOW() < v->vpci.wait.end )
>>>>> +            return true;
>>>>> +        v->vpci.wait.callback(v->vpci.wait.data);
>>>>> +        break;
>>>>
>>>> As just indicated in reply to patch 6, busy waiting isn't really acceptable.
>>>> This is even more so when the waiting exceeds the typical length of a
>>>> scheduling timeslice.
>>>>
>>>> In that other reply I said to put the vCPU to sleep, but you need to be careful
>>>> there too: The domain may not expect its vCPU to not make any progress for such
>>>> an extended period of time. This may need doing entirely differently: Once the
>>>> command register was written, you may want to record the time after which
>>>> accesses to the VF registers are permitted. Earlier accesses would simply be
>>>> terminated. You may still additionally need a timer, in order to kick off BAR
>>>> mapping after that time. (Yet better would  be if the BAR mapping could be
>>>> done during those 100ms. After all that may be a reason why this long a delay
>>>> is specified: Firmware on the device may also require some time to set up the
>>>> BARs accordingly.)
>>>
>>> I am not sure it would work that way. If we look at how linux
>>> initialized sriov, it writes VFE and MSE bits, waits 100ms and then
>>> expects VFs to be operational. If they are not operational at that
>>> moment, then it considers the operation failed and removes all VFs. If
>>> we also wait 100ms before enabling access, the probability of a guest
>>> trying to access something before we allow it would be very high.
>>
>> Well, not really. Our counting of the 100ms necessarily starts before Dom0's.
>> Furthermore it may be acceptable (or even appropriate) to stall premature
>> accesses (because they shouldn't occur in the first place), by blocking the
>> vCPU at that point. A middle route may be possible: Terminate accesses in,
>> say, the first 90ms, and stall the vCPU for any access past that, but before
>> the 100ms expired.
> 
> Is there any real benefit to doing all this work instead of just waiting 
> for Dom0 to register the FVs? Implementing it the way you described 
> would require a relatively complex state machine and two timers per 
> sriov-capable device. And will also probably require some hacks to 
> handle partially initialized VFs in Xen. This adds a lot of work and 
> many possible bugs for not a lot of benefit in my opinion.

Odd that you ask me this question. If there was no benefit, why did you do
it this way?

Jan
Re: [PATCH v2 4/8] vpci: add a wait operation to the vpci vcpu pending actions
Posted by Mykyta Poturai 1 week, 1 day ago

On 4/1/26 17:14, Jan Beulich wrote:
> On 01.04.2026 16:07, Mykyta Poturai wrote:
>> On 4/1/26 11:21, Jan Beulich wrote:
>>> On 01.04.2026 09:59, Mykyta Poturai wrote:
>>>> On 3/31/26 17:55, Jan Beulich wrote:
>>>>> On 09.03.2026 12:08, Mykyta Poturai wrote:
>>>>>> --- a/xen/drivers/vpci/header.c
>>>>>> +++ b/xen/drivers/vpci/header.c
>>>>>> @@ -175,76 +175,92 @@ static void modify_decoding(const struct pci_dev *pdev, uint16_t cmd,
>>>>>>     
>>>>>>     bool vpci_process_pending(struct vcpu *v)
>>>>>>     {
>>>>>> -    const struct pci_dev *pdev = v->vpci.pdev;
>>>>>> -    struct vpci_header *header = NULL;
>>>>>> -    unsigned int i;
>>>>>> -
>>>>>> -    if ( !pdev )
>>>>>> -        return false;
>>>>>> -
>>>>>> -    read_lock(&v->domain->pci_lock);
>>>>>> -
>>>>>> -    if ( !pdev->vpci || (v->domain != pdev->domain) )
>>>>>> +    switch ( v->vpci.task )
>>>>>>         {
>>>>>> -        v->vpci.pdev = NULL;
>>>>>> -        read_unlock(&v->domain->pci_lock);
>>>>>> -        return false;
>>>>>> -    }
>>>>>> -
>>>>>> -    header = &pdev->vpci->header;
>>>>>> -    for ( i = 0; i < ARRAY_SIZE(header->bars); i++ )
>>>>>> +    case MODIFY_MEMORY:
>>>>>>         {
>>>>>> -        struct vpci_bar *bar = &header->bars[i];
>>>>>> -        struct rangeset *mem = v->vpci.bar_mem[i];
>>>>>> -        struct map_data data = {
>>>>>> -            .d = v->domain,
>>>>>> -            .map = v->vpci.cmd & PCI_COMMAND_MEMORY,
>>>>>> -            .bar = bar,
>>>>>> -        };
>>>>>> -        int rc;
>>>>>> +        const struct pci_dev *pdev = v->vpci.memory.pdev;
>>>>>> +        struct vpci_header *header = NULL;
>>>>>> +        unsigned int i;
>>>>>>     
>>>>>> -        if ( rangeset_is_empty(mem) )
>>>>>> -            continue;
>>>>>> +        if ( !pdev )
>>>>>> +            break;
>>>>>>     
>>>>>> -        rc = rangeset_consume_ranges(mem, map_range, &data);
>>>>>> +        read_lock(&v->domain->pci_lock);
>>>>>>     
>>>>>> -        if ( rc == -ERESTART )
>>>>>> +        if ( !pdev->vpci || (v->domain != pdev->domain) )
>>>>>>             {
>>>>>> +            v->vpci.memory.pdev = NULL;
>>>>>>                 read_unlock(&v->domain->pci_lock);
>>>>>> -            return true;
>>>>>> +            break;
>>>>>>             }
>>>>>>     
>>>>>> -        if ( rc )
>>>>>> +        header = &pdev->vpci->header;
>>>>>> +        for ( i = 0; i < ARRAY_SIZE(header->bars); i++ )
>>>>>>             {
>>>>>> -            spin_lock(&pdev->vpci->lock);
>>>>>> -            /* Disable memory decoding unconditionally on failure. */
>>>>>> -            modify_decoding(pdev, v->vpci.cmd & ~PCI_COMMAND_MEMORY,
>>>>>> -                            false);
>>>>>> -            spin_unlock(&pdev->vpci->lock);
>>>>>> +            struct vpci_bar *bar = &header->bars[i];
>>>>>> +            struct rangeset *mem = v->vpci.bar_mem[i];
>>>>>> +            struct map_data data = {
>>>>>> +                .d = v->domain,
>>>>>> +                .map = v->vpci.memory.cmd & PCI_COMMAND_MEMORY,
>>>>>> +                .bar = bar,
>>>>>> +            };
>>>>>> +            int rc;
>>>>>> +
>>>>>> +            if ( rangeset_is_empty(mem) )
>>>>>> +                continue;
>>>>>>     
>>>>>> -            /* Clean all the rangesets */
>>>>>> -            for ( i = 0; i < ARRAY_SIZE(header->bars); i++ )
>>>>>> -                if ( !rangeset_is_empty(v->vpci.bar_mem[i]) )
>>>>>> -                     rangeset_purge(v->vpci.bar_mem[i]);
>>>>>> +            rc = rangeset_consume_ranges(mem, map_range, &data);
>>>>>>     
>>>>>> -            v->vpci.pdev = NULL;
>>>>>> +            if ( rc == -ERESTART )
>>>>>> +            {
>>>>>> +                read_unlock(&v->domain->pci_lock);
>>>>>> +                return true;
>>>>>> +            }
>>>>>>     
>>>>>> -            read_unlock(&v->domain->pci_lock);
>>>>>> +            if ( rc )
>>>>>> +            {
>>>>>> +                spin_lock(&pdev->vpci->lock);
>>>>>> +                /* Disable memory decoding unconditionally on failure. */
>>>>>> +                modify_decoding(pdev, v->vpci.memory.cmd & ~PCI_COMMAND_MEMORY,
>>>>>> +                                false);
>>>>>> +                spin_unlock(&pdev->vpci->lock);
>>>>>> +
>>>>>> +                /* Clean all the rangesets */
>>>>>> +                for ( i = 0; i < ARRAY_SIZE(header->bars); i++ )
>>>>>> +                    if ( !rangeset_is_empty(v->vpci.bar_mem[i]) )
>>>>>> +                        rangeset_purge(v->vpci.bar_mem[i]);
>>>>>> +
>>>>>> +                v->vpci.memory.pdev = NULL;
>>>>>> +
>>>>>> +                read_unlock(&v->domain->pci_lock);
>>>>>>     
>>>>>> -            if ( !is_hardware_domain(v->domain) )
>>>>>> -                domain_crash(v->domain);
>>>>>> +                if ( !is_hardware_domain(v->domain) )
>>>>>> +                    domain_crash(v->domain);
>>>>>>     
>>>>>> -            return false;
>>>>>> +                break;
>>>>>> +            }
>>>>>>             }
>>>>>> -    }
>>>>>> -    v->vpci.pdev = NULL;
>>>>>> +        v->vpci.memory.pdev = NULL;
>>>>>>     
>>>>>> -    spin_lock(&pdev->vpci->lock);
>>>>>> -    modify_decoding(pdev, v->vpci.cmd, v->vpci.rom_only);
>>>>>> -    spin_unlock(&pdev->vpci->lock);
>>>>>> +        spin_lock(&pdev->vpci->lock);
>>>>>> +        modify_decoding(pdev, v->vpci.memory.cmd, v->vpci.memory.rom_only);
>>>>>> +        spin_unlock(&pdev->vpci->lock);
>>>>>>     
>>>>>> -    read_unlock(&v->domain->pci_lock);
>>>>>> +        read_unlock(&v->domain->pci_lock);
>>>>>> +
>>>>>> +        break;
>>>>>> +    }
>>>>>> +    case WAIT:
>>>>>> +        if ( NOW() < v->vpci.wait.end )
>>>>>> +            return true;
>>>>>> +        v->vpci.wait.callback(v->vpci.wait.data);
>>>>>> +        break;
>>>>>
>>>>> As just indicated in reply to patch 6, busy waiting isn't really acceptable.
>>>>> This is even more so when the waiting exceeds the typical length of a
>>>>> scheduling timeslice.
>>>>>
>>>>> In that other reply I said to put the vCPU to sleep, but you need to be careful
>>>>> there too: The domain may not expect its vCPU to not make any progress for such
>>>>> an extended period of time. This may need doing entirely differently: Once the
>>>>> command register was written, you may want to record the time after which
>>>>> accesses to the VF registers are permitted. Earlier accesses would simply be
>>>>> terminated. You may still additionally need a timer, in order to kick off BAR
>>>>> mapping after that time. (Yet better would  be if the BAR mapping could be
>>>>> done during those 100ms. After all that may be a reason why this long a delay
>>>>> is specified: Firmware on the device may also require some time to set up the
>>>>> BARs accordingly.)
>>>>
>>>> I am not sure it would work that way. If we look at how linux
>>>> initialized sriov, it writes VFE and MSE bits, waits 100ms and then
>>>> expects VFs to be operational. If they are not operational at that
>>>> moment, then it considers the operation failed and removes all VFs. If
>>>> we also wait 100ms before enabling access, the probability of a guest
>>>> trying to access something before we allow it would be very high.
>>>
>>> Well, not really. Our counting of the 100ms necessarily starts before Dom0's.
>>> Furthermore it may be acceptable (or even appropriate) to stall premature
>>> accesses (because they shouldn't occur in the first place), by blocking the
>>> vCPU at that point. A middle route may be possible: Terminate accesses in,
>>> say, the first 90ms, and stall the vCPU for any access past that, but before
>>> the 100ms expired.
>>
>> Is there any real benefit to doing all this work instead of just waiting
>> for Dom0 to register the FVs? Implementing it the way you described
>> would require a relatively complex state machine and two timers per
>> sriov-capable device. And will also probably require some hacks to
>> handle partially initialized VFs in Xen. This adds a lot of work and
>> many possible bugs for not a lot of benefit in my opinion.
> 
> Odd that you ask me this question. If there was no benefit, why did you do
> it this way?
> 

Roger asked for this approach in V1, and I saw that it can be done in a 
relatively straightforward way, so I implemented it. I didn’t exactly 
get what the benefits were, but I assumed that there are some and the 
effort is not too big to just do it if the maintainers are asking for it.

Now with the possibility of redoing everything *again* and making it much 
more complex, I started to really think about whether it's really needed. So I want 
to know your and others' opinions on registering VFs with Dom0.


-- 
Mykyta
Re: [PATCH v2 4/8] vpci: add a wait operation to the vpci vcpu pending actions
Posted by Jan Beulich 1 week, 1 day ago
On 01.04.2026 16:40, Mykyta Poturai wrote:
> 
> 
> On 4/1/26 17:14, Jan Beulich wrote:
>> On 01.04.2026 16:07, Mykyta Poturai wrote:
>>> On 4/1/26 11:21, Jan Beulich wrote:
>>>> On 01.04.2026 09:59, Mykyta Poturai wrote:
>>>>> On 3/31/26 17:55, Jan Beulich wrote:
>>>>>> On 09.03.2026 12:08, Mykyta Poturai wrote:
>>>>>>> --- a/xen/drivers/vpci/header.c
>>>>>>> +++ b/xen/drivers/vpci/header.c
>>>>>>> @@ -175,76 +175,92 @@ static void modify_decoding(const struct pci_dev *pdev, uint16_t cmd,
>>>>>>>     
>>>>>>>     bool vpci_process_pending(struct vcpu *v)
>>>>>>>     {
>>>>>>> -    const struct pci_dev *pdev = v->vpci.pdev;
>>>>>>> -    struct vpci_header *header = NULL;
>>>>>>> -    unsigned int i;
>>>>>>> -
>>>>>>> -    if ( !pdev )
>>>>>>> -        return false;
>>>>>>> -
>>>>>>> -    read_lock(&v->domain->pci_lock);
>>>>>>> -
>>>>>>> -    if ( !pdev->vpci || (v->domain != pdev->domain) )
>>>>>>> +    switch ( v->vpci.task )
>>>>>>>         {
>>>>>>> -        v->vpci.pdev = NULL;
>>>>>>> -        read_unlock(&v->domain->pci_lock);
>>>>>>> -        return false;
>>>>>>> -    }
>>>>>>> -
>>>>>>> -    header = &pdev->vpci->header;
>>>>>>> -    for ( i = 0; i < ARRAY_SIZE(header->bars); i++ )
>>>>>>> +    case MODIFY_MEMORY:
>>>>>>>         {
>>>>>>> -        struct vpci_bar *bar = &header->bars[i];
>>>>>>> -        struct rangeset *mem = v->vpci.bar_mem[i];
>>>>>>> -        struct map_data data = {
>>>>>>> -            .d = v->domain,
>>>>>>> -            .map = v->vpci.cmd & PCI_COMMAND_MEMORY,
>>>>>>> -            .bar = bar,
>>>>>>> -        };
>>>>>>> -        int rc;
>>>>>>> +        const struct pci_dev *pdev = v->vpci.memory.pdev;
>>>>>>> +        struct vpci_header *header = NULL;
>>>>>>> +        unsigned int i;
>>>>>>>     
>>>>>>> -        if ( rangeset_is_empty(mem) )
>>>>>>> -            continue;
>>>>>>> +        if ( !pdev )
>>>>>>> +            break;
>>>>>>>     
>>>>>>> -        rc = rangeset_consume_ranges(mem, map_range, &data);
>>>>>>> +        read_lock(&v->domain->pci_lock);
>>>>>>>     
>>>>>>> -        if ( rc == -ERESTART )
>>>>>>> +        if ( !pdev->vpci || (v->domain != pdev->domain) )
>>>>>>>             {
>>>>>>> +            v->vpci.memory.pdev = NULL;
>>>>>>>                 read_unlock(&v->domain->pci_lock);
>>>>>>> -            return true;
>>>>>>> +            break;
>>>>>>>             }
>>>>>>>     
>>>>>>> -        if ( rc )
>>>>>>> +        header = &pdev->vpci->header;
>>>>>>> +        for ( i = 0; i < ARRAY_SIZE(header->bars); i++ )
>>>>>>>             {
>>>>>>> -            spin_lock(&pdev->vpci->lock);
>>>>>>> -            /* Disable memory decoding unconditionally on failure. */
>>>>>>> -            modify_decoding(pdev, v->vpci.cmd & ~PCI_COMMAND_MEMORY,
>>>>>>> -                            false);
>>>>>>> -            spin_unlock(&pdev->vpci->lock);
>>>>>>> +            struct vpci_bar *bar = &header->bars[i];
>>>>>>> +            struct rangeset *mem = v->vpci.bar_mem[i];
>>>>>>> +            struct map_data data = {
>>>>>>> +                .d = v->domain,
>>>>>>> +                .map = v->vpci.memory.cmd & PCI_COMMAND_MEMORY,
>>>>>>> +                .bar = bar,
>>>>>>> +            };
>>>>>>> +            int rc;
>>>>>>> +
>>>>>>> +            if ( rangeset_is_empty(mem) )
>>>>>>> +                continue;
>>>>>>>     
>>>>>>> -            /* Clean all the rangesets */
>>>>>>> -            for ( i = 0; i < ARRAY_SIZE(header->bars); i++ )
>>>>>>> -                if ( !rangeset_is_empty(v->vpci.bar_mem[i]) )
>>>>>>> -                     rangeset_purge(v->vpci.bar_mem[i]);
>>>>>>> +            rc = rangeset_consume_ranges(mem, map_range, &data);
>>>>>>>     
>>>>>>> -            v->vpci.pdev = NULL;
>>>>>>> +            if ( rc == -ERESTART )
>>>>>>> +            {
>>>>>>> +                read_unlock(&v->domain->pci_lock);
>>>>>>> +                return true;
>>>>>>> +            }
>>>>>>>     
>>>>>>> -            read_unlock(&v->domain->pci_lock);
>>>>>>> +            if ( rc )
>>>>>>> +            {
>>>>>>> +                spin_lock(&pdev->vpci->lock);
>>>>>>> +                /* Disable memory decoding unconditionally on failure. */
>>>>>>> +                modify_decoding(pdev, v->vpci.memory.cmd & ~PCI_COMMAND_MEMORY,
>>>>>>> +                                false);
>>>>>>> +                spin_unlock(&pdev->vpci->lock);
>>>>>>> +
>>>>>>> +                /* Clean all the rangesets */
>>>>>>> +                for ( i = 0; i < ARRAY_SIZE(header->bars); i++ )
>>>>>>> +                    if ( !rangeset_is_empty(v->vpci.bar_mem[i]) )
>>>>>>> +                        rangeset_purge(v->vpci.bar_mem[i]);
>>>>>>> +
>>>>>>> +                v->vpci.memory.pdev = NULL;
>>>>>>> +
>>>>>>> +                read_unlock(&v->domain->pci_lock);
>>>>>>>     
>>>>>>> -            if ( !is_hardware_domain(v->domain) )
>>>>>>> -                domain_crash(v->domain);
>>>>>>> +                if ( !is_hardware_domain(v->domain) )
>>>>>>> +                    domain_crash(v->domain);
>>>>>>>     
>>>>>>> -            return false;
>>>>>>> +                break;
>>>>>>> +            }
>>>>>>>             }
>>>>>>> -    }
>>>>>>> -    v->vpci.pdev = NULL;
>>>>>>> +        v->vpci.memory.pdev = NULL;
>>>>>>>     
>>>>>>> -    spin_lock(&pdev->vpci->lock);
>>>>>>> -    modify_decoding(pdev, v->vpci.cmd, v->vpci.rom_only);
>>>>>>> -    spin_unlock(&pdev->vpci->lock);
>>>>>>> +        spin_lock(&pdev->vpci->lock);
>>>>>>> +        modify_decoding(pdev, v->vpci.memory.cmd, v->vpci.memory.rom_only);
>>>>>>> +        spin_unlock(&pdev->vpci->lock);
>>>>>>>     
>>>>>>> -    read_unlock(&v->domain->pci_lock);
>>>>>>> +        read_unlock(&v->domain->pci_lock);
>>>>>>> +
>>>>>>> +        break;
>>>>>>> +    }
>>>>>>> +    case WAIT:
>>>>>>> +        if ( NOW() < v->vpci.wait.end )
>>>>>>> +            return true;
>>>>>>> +        v->vpci.wait.callback(v->vpci.wait.data);
>>>>>>> +        break;
>>>>>>
>>>>>> As just indicated in reply to patch 6, busy waiting isn't really acceptable.
>>>>>> This is even more so when the waiting exceeds the typical length of a
>>>>>> scheduling timeslice.
>>>>>>
>>>>>> In that other reply I said to put the vCPU to sleep, but you need to be careful
>>>>>> there too: The domain may not expect its vCPU to not make any progress for such
>>>>>> an extended period of time. This may need doing entirely differently: Once the
>>>>>> command register was written, you may want to record the time after which
>>>>>> accesses to the VF registers are permitted. Earlier accesses would simply be
>>>>>> terminated. You may still additionally need a timer, in order to kick off BAR
>>>>>> mapping after that time. (Yet better would  be if the BAR mapping could be
>>>>>> done during those 100ms. After all that may be a reason why this long a delay
>>>>>> is specified: Firmware on the device may also require some time to set up the
>>>>>> BARs accordingly.)
>>>>>
>>>>> I am not sure it would work that way. If we look at how linux
>>>>> initialized sriov, it writes VFE and MSE bits, waits 100ms and then
>>>>> expects VFs to be operational. If they are not operational at that
>>>>> moment, then it considers the operation failed and removes all VFs. If
>>>>> we also wait 100ms before enabling access, the probability of a guest
>>>>> trying to access something before we allow it would be very high.
>>>>
>>>> Well, not really. Our counting of the 100ms necessarily starts before Dom0's.
>>>> Furthermore it may be acceptable (or even appropriate) to stall premature
>>>> accesses (because they shouldn't occur in the first place), by blocking the
>>>> vCPU at that point. A middle route may be possible: Terminate accesses in,
>>>> say, the first 90ms, and stall the vCPU for any access past that, but before
>>>> the 100ms expired.
>>>
>>> Is there any real benefit to doing all this work instead of just waiting
>>> for Dom0 to register the FVs? Implementing it the way you described
>>> would require a relatively complex state machine and two timers per
>>> sriov-capable device. And will also probably require some hacks to
>>> handle partially initialized VFs in Xen. This adds a lot of work and
>>> many possible bugs for not a lot of benefit in my opinion.
>>
>> Odd that you ask me this question. If there was no benefit, why did you do
>> it this way?
> 
> Roger asked for this approach in V1, and I saw that it can be done in a 
> relatively straightforward way, so I implemented it. I didn’t exactly 
> get what the benefits were, but I assumed that there are some and the 
> effort is not too big to just do it if the maintainers are asking for it.
> 
> Now with the posibility of redoing everything *again* and making it much 
> more complex I started to really think if its really needed. So I want 
> to know your and others' opinions on registering VFs with Dom0.

Well, before I voice any view here I'd need to understand why Roger wanted
it done like that. (Apparently you must have agreed sufficiently to not
ask back.)

Jan

Re: [PATCH v2 4/8] vpci: add a wait operation to the vpci vcpu pending actions
Posted by Jan Beulich 1 week, 2 days ago
On 09.03.2026 12:08, Mykyta Poturai wrote:
> --- a/xen/include/xen/vpci.h
> +++ b/xen/include/xen/vpci.h
> @@ -212,7 +212,26 @@ struct vpci_vcpu {
>      /* Per-vcpu structure to store state while {un}mapping of PCI BARs. */
>      const struct pci_dev *pdev;
>  #ifdef __XEN__
> +    enum {
> +        NONE,
> +        MODIFY_MEMORY,
> +        WAIT,
> +    } task;

Unlike structure or union fields, the scope of enumerators is global. I
don't think generic names like NONE and WAIT should be introduced into
global scope. At the very least VPCI_ wants prefixing to them, albeit
VPCI_NONE of course isn't to going to read very well either. Hence
either replace "NONE" there as well, or use e.g. VPCI_OP_* as a prefix.

Jan
Re: [PATCH v2 4/8] vpci: add a wait operation to the vpci vcpu pending actions
Posted by Stewart Hildebrand 3 weeks, 2 days ago
On 3/9/26 07:08, Mykyta Poturai wrote:
> This allows waiting a specified number of cycles on the vcpu. Once the
> wait has finished a callback is executed.
> 
> Note that this is still not used, but introduced here in order to
> simplify the complexity of the patches that actually make use of it.
> 
> Signed-off-by: Roger Pau Monné <roger.pau@citrix.com>
> Signed-off-by: Mykyta Poturai <mykyta_poturai@epam.com>
> ---
> v1->v2:
> * new patch
> ---
>  xen/drivers/vpci/header.c | 125 ++++++++++++++++++++++----------------
>  xen/include/xen/vpci.h    |  19 ++++++
>  2 files changed, 90 insertions(+), 54 deletions(-)
> 
> diff --git a/xen/drivers/vpci/header.c b/xen/drivers/vpci/header.c
> index cb64d9b9fc..284964f0d4 100644
> --- a/xen/drivers/vpci/header.c
> +++ b/xen/drivers/vpci/header.c
> @@ -175,76 +175,92 @@ static void modify_decoding(const struct pci_dev *pdev, uint16_t cmd,
>  
>  bool vpci_process_pending(struct vcpu *v)
>  {
> -    const struct pci_dev *pdev = v->vpci.pdev;
> -    struct vpci_header *header = NULL;
> -    unsigned int i;
> -
> -    if ( !pdev )
> -        return false;
> -
> -    read_lock(&v->domain->pci_lock);
> -
> -    if ( !pdev->vpci || (v->domain != pdev->domain) )
> +    switch ( v->vpci.task )
>      {
> -        v->vpci.pdev = NULL;
> -        read_unlock(&v->domain->pci_lock);
> -        return false;
> -    }
> -
> -    header = &pdev->vpci->header;
> -    for ( i = 0; i < ARRAY_SIZE(header->bars); i++ )
> +    case MODIFY_MEMORY:
>      {
> -        struct vpci_bar *bar = &header->bars[i];
> -        struct rangeset *mem = v->vpci.bar_mem[i];
> -        struct map_data data = {
> -            .d = v->domain,
> -            .map = v->vpci.cmd & PCI_COMMAND_MEMORY,
> -            .bar = bar,
> -        };
> -        int rc;
> +        const struct pci_dev *pdev = v->vpci.memory.pdev;
> +        struct vpci_header *header = NULL;
> +        unsigned int i;
>  
> -        if ( rangeset_is_empty(mem) )
> -            continue;
> +        if ( !pdev )
> +            break;
>  
> -        rc = rangeset_consume_ranges(mem, map_range, &data);
> +        read_lock(&v->domain->pci_lock);
>  
> -        if ( rc == -ERESTART )
> +        if ( !pdev->vpci || (v->domain != pdev->domain) )
>          {
> +            v->vpci.memory.pdev = NULL;
>              read_unlock(&v->domain->pci_lock);
> -            return true;
> +            break;
>          }
>  
> -        if ( rc )
> +        header = &pdev->vpci->header;
> +        for ( i = 0; i < ARRAY_SIZE(header->bars); i++ )
>          {
> -            spin_lock(&pdev->vpci->lock);
> -            /* Disable memory decoding unconditionally on failure. */
> -            modify_decoding(pdev, v->vpci.cmd & ~PCI_COMMAND_MEMORY,
> -                            false);
> -            spin_unlock(&pdev->vpci->lock);
> +            struct vpci_bar *bar = &header->bars[i];
> +            struct rangeset *mem = v->vpci.bar_mem[i];
> +            struct map_data data = {
> +                .d = v->domain,
> +                .map = v->vpci.memory.cmd & PCI_COMMAND_MEMORY,
> +                .bar = bar,
> +            };
> +            int rc;
> +
> +            if ( rangeset_is_empty(mem) )
> +                continue;
>  
> -            /* Clean all the rangesets */
> -            for ( i = 0; i < ARRAY_SIZE(header->bars); i++ )
> -                if ( !rangeset_is_empty(v->vpci.bar_mem[i]) )
> -                     rangeset_purge(v->vpci.bar_mem[i]);
> +            rc = rangeset_consume_ranges(mem, map_range, &data);
>  
> -            v->vpci.pdev = NULL;
> +            if ( rc == -ERESTART )
> +            {
> +                read_unlock(&v->domain->pci_lock);
> +                return true;
> +            }
>  
> -            read_unlock(&v->domain->pci_lock);
> +            if ( rc )
> +            {
> +                spin_lock(&pdev->vpci->lock);
> +                /* Disable memory decoding unconditionally on failure. */
> +                modify_decoding(pdev, v->vpci.memory.cmd & ~PCI_COMMAND_MEMORY,
> +                                false);
> +                spin_unlock(&pdev->vpci->lock);
> +
> +                /* Clean all the rangesets */
> +                for ( i = 0; i < ARRAY_SIZE(header->bars); i++ )
> +                    if ( !rangeset_is_empty(v->vpci.bar_mem[i]) )
> +                        rangeset_purge(v->vpci.bar_mem[i]);
> +
> +                v->vpci.memory.pdev = NULL;
> +
> +                read_unlock(&v->domain->pci_lock);
>  
> -            if ( !is_hardware_domain(v->domain) )
> -                domain_crash(v->domain);
> +                if ( !is_hardware_domain(v->domain) )
> +                    domain_crash(v->domain);
>  
> -            return false;
> +                break;
> +            }
>          }
> -    }
> -    v->vpci.pdev = NULL;
> +        v->vpci.memory.pdev = NULL;
>  
> -    spin_lock(&pdev->vpci->lock);
> -    modify_decoding(pdev, v->vpci.cmd, v->vpci.rom_only);
> -    spin_unlock(&pdev->vpci->lock);
> +        spin_lock(&pdev->vpci->lock);
> +        modify_decoding(pdev, v->vpci.memory.cmd, v->vpci.memory.rom_only);
> +        spin_unlock(&pdev->vpci->lock);
>  
> -    read_unlock(&v->domain->pci_lock);
> +        read_unlock(&v->domain->pci_lock);
> +
> +        break;
> +    }

Nit: this is a lot of churn for a relatively small number of functional changes.
Could the original indentation level be retained (reducing the diff churn) by
moving the body of the MODIFY_MEMORY case into a new helper function?

> +    case WAIT:
> +        if ( NOW() < v->vpci.wait.end )
> +            return true;
> +        v->vpci.wait.callback(v->vpci.wait.data);
> +        break;
> +    case NONE:
> +        return false;
> +    }
>  
> +    v->vpci.task = NONE;
>      return false;
>  }
>  
> @@ -295,9 +311,10 @@ static void defer_map(const struct pci_dev *pdev, uint16_t cmd, bool rom_only)
>       * is mapped. This can lead to parallel mapping operations being
>       * started for the same device if the domain is not well-behaved.
>       */
> -    curr->vpci.pdev = pdev;
> -    curr->vpci.cmd = cmd;
> -    curr->vpci.rom_only = rom_only;
> +    curr->vpci.memory.pdev = pdev;
> +    curr->vpci.memory.cmd = cmd;
> +    curr->vpci.memory.rom_only = rom_only;
> +    curr->vpci.task = MODIFY_MEMORY;
>      /*
>       * Raise a scheduler softirq in order to prevent the guest from resuming
>       * execution with pending mapping operations, to trigger the invocation
> diff --git a/xen/include/xen/vpci.h b/xen/include/xen/vpci.h
> index fa654545e5..47cdb54d42 100644
> --- a/xen/include/xen/vpci.h
> +++ b/xen/include/xen/vpci.h
> @@ -212,7 +212,26 @@ struct vpci_vcpu {
>      /* Per-vcpu structure to store state while {un}mapping of PCI BARs. */
>      const struct pci_dev *pdev;

The `pdev` field can now be removed from here, since it has been moved into the
`memory` member of the new union below.

>  #ifdef __XEN__
> +    enum {
> +        NONE,
> +        MODIFY_MEMORY,
> +        WAIT,
> +    } task;
>      struct rangeset *bar_mem[PCI_HEADER_NORMAL_NR_BARS + 1];
> +    union {
> +        struct {
> +            /* Store state while {un}mapping of PCI BARs. */
> +            const struct pci_dev *pdev;
> +            uint16_t cmd;
> +            bool rom_only : 1;
> +        } memory;
> +        struct {
> +            /* Store wait state. */
> +            s_time_t end;
> +            void (*callback)(void *);
> +            void *data;
> +        } wait;
> +    };
>  #endif
>      uint16_t cmd;
>      bool rom_only : 1;

The `cmd` and `rom_only` fields can likewise be removed from here, since they
have also been moved into the `memory` member of the new union.