[Qemu-devel] [RFC/RFT PATCH 2/5] vfio/pci: Add base BAR MemoryRegion

Alex Williamson posted 5 patches 7 years, 10 months ago
There is a newer version of this series
[Qemu-devel] [RFC/RFT PATCH 2/5] vfio/pci: Add base BAR MemoryRegion
Posted by Alex Williamson 7 years, 10 months ago
Add one more layer to our stack of MemoryRegions, this base region
allows us to register BARs independently of the vfio region or to
extend the size of BARs which do map to a region.  This will be
useful when we want hypervisor defined BARs or sections of BARs,
for purposes such as relocating MSI-X emulation.  We therefore call
msix_init() based on this new base MemoryRegion, while the quirks,
which only modify regions still operate on those sub-MemoryRegions.

Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
---
 hw/vfio/pci.c |   74 ++++++++++++++++++++++++++++++++++++++++++++-------------
 hw/vfio/pci.h |    3 ++
 2 files changed, 60 insertions(+), 17 deletions(-)

diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
index c977ee327f94..8f46fdd1d391 100644
--- a/hw/vfio/pci.c
+++ b/hw/vfio/pci.c
@@ -1440,9 +1440,9 @@ static int vfio_msix_setup(VFIOPCIDevice *vdev, int pos, Error **errp)
     vdev->msix->pending = g_malloc0(BITS_TO_LONGS(vdev->msix->entries) *
                                     sizeof(unsigned long));
     ret = msix_init(&vdev->pdev, vdev->msix->entries,
-                    vdev->bars[vdev->msix->table_bar].region.mem,
+                    vdev->bars[vdev->msix->table_bar].mr,
                     vdev->msix->table_bar, vdev->msix->table_offset,
-                    vdev->bars[vdev->msix->pba_bar].region.mem,
+                    vdev->bars[vdev->msix->pba_bar].mr,
                     vdev->msix->pba_bar, vdev->msix->pba_offset, pos,
                     &err);
     if (ret < 0) {
@@ -1482,8 +1482,8 @@ static void vfio_teardown_msi(VFIOPCIDevice *vdev)
 
     if (vdev->msix) {
         msix_uninit(&vdev->pdev,
-                    vdev->bars[vdev->msix->table_bar].region.mem,
-                    vdev->bars[vdev->msix->pba_bar].region.mem);
+                    vdev->bars[vdev->msix->table_bar].mr,
+                    vdev->bars[vdev->msix->pba_bar].mr);
         g_free(vdev->msix->pending);
     }
 }
@@ -1500,12 +1500,11 @@ static void vfio_mmap_set_enabled(VFIOPCIDevice *vdev, bool enabled)
     }
 }
 
-static void vfio_bar_setup(VFIOPCIDevice *vdev, int nr)
+static void vfio_bar_prepare(VFIOPCIDevice *vdev, int nr)
 {
     VFIOBAR *bar = &vdev->bars[nr];
 
     uint32_t pci_bar;
-    uint8_t type;
     int ret;
 
     /* Skip both unimplemented BARs and the upper half of 64bit BARS. */
@@ -1524,23 +1523,52 @@ static void vfio_bar_setup(VFIOPCIDevice *vdev, int nr)
     pci_bar = le32_to_cpu(pci_bar);
     bar->ioport = (pci_bar & PCI_BASE_ADDRESS_SPACE_IO);
     bar->mem64 = bar->ioport ? 0 : (pci_bar & PCI_BASE_ADDRESS_MEM_TYPE_64);
-    type = pci_bar & (bar->ioport ? ~PCI_BASE_ADDRESS_IO_MASK :
-                                    ~PCI_BASE_ADDRESS_MEM_MASK);
+    bar->type = pci_bar & (bar->ioport ? ~PCI_BASE_ADDRESS_IO_MASK :
+                                         ~PCI_BASE_ADDRESS_MEM_MASK);
+    bar->size = bar->region.size;
+}
 
-    if (vfio_region_mmap(&bar->region)) {
-        error_report("Failed to mmap %s BAR %d. Performance may be slow",
-                     vdev->vbasedev.name, nr);
+static void vfio_bars_prepare(VFIOPCIDevice *vdev)
+{
+    int i;
+
+    for (i = 0; i < PCI_ROM_SLOT; i++) {
+        vfio_bar_prepare(vdev, i);
     }
+}
 
-    pci_register_bar(&vdev->pdev, nr, type, bar->region.mem);
+static void vfio_bar_register(VFIOPCIDevice *vdev, int nr)
+{
+    VFIOBAR *bar = &vdev->bars[nr];
+    char *name;
+
+    if (!bar->size) {
+        return;
+    }
+
+    bar->mr = g_new0(MemoryRegion, 1);
+    name = g_strdup_printf("%s base BAR %d", vdev->vbasedev.name, nr);
+    memory_region_init_io(bar->mr, OBJECT(vdev), NULL, NULL, name, bar->size);
+    g_free(name);
+
+    if (bar->region.size) {
+        memory_region_add_subregion(bar->mr, 0, bar->region.mem);
+
+        if (vfio_region_mmap(&bar->region)) {
+            error_report("Failed to mmap %s BAR %d. Performance may be slow",
+                         vdev->vbasedev.name, nr);
+        }
+    }
+
+    pci_register_bar(&vdev->pdev, nr, bar->type, bar->mr);
 }
 
-static void vfio_bars_setup(VFIOPCIDevice *vdev)
+static void vfio_bars_register(VFIOPCIDevice *vdev)
 {
     int i;
 
     for (i = 0; i < PCI_ROM_SLOT; i++) {
-        vfio_bar_setup(vdev, i);
+        vfio_bar_register(vdev, i);
     }
 }
 
@@ -1549,8 +1577,13 @@ static void vfio_bars_exit(VFIOPCIDevice *vdev)
     int i;
 
     for (i = 0; i < PCI_ROM_SLOT; i++) {
+        VFIOBAR *bar = &vdev->bars[i];
+
         vfio_bar_quirk_exit(vdev, i);
-        vfio_region_exit(&vdev->bars[i].region);
+        vfio_region_exit(&bar->region);
+        if (bar->region.size) {
+            memory_region_del_subregion(bar->mr, bar->region.mem);
+        }
     }
 
     if (vdev->vga) {
@@ -1564,8 +1597,14 @@ static void vfio_bars_finalize(VFIOPCIDevice *vdev)
     int i;
 
     for (i = 0; i < PCI_ROM_SLOT; i++) {
+        VFIOBAR *bar = &vdev->bars[i];
+
         vfio_bar_quirk_finalize(vdev, i);
-        vfio_region_finalize(&vdev->bars[i].region);
+        vfio_region_finalize(&bar->region);
+        if (bar->size) {
+           object_unparent(OBJECT(bar->mr));
+           g_free(bar->mr);
+        }
     }
 
     if (vdev->vga) {
@@ -2810,7 +2849,8 @@ static void vfio_realize(PCIDevice *pdev, Error **errp)
         goto error;
     }
 
-    vfio_bars_setup(vdev);
+    vfio_bars_prepare(vdev);
+    vfio_bars_register(vdev);
 
     ret = vfio_add_capabilities(vdev, errp);
     if (ret) {
diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h
index 3d753222ca4c..dcdb1a806769 100644
--- a/hw/vfio/pci.h
+++ b/hw/vfio/pci.h
@@ -33,6 +33,9 @@ typedef struct VFIOQuirk {
 
 typedef struct VFIOBAR {
     VFIORegion region;
+    MemoryRegion *mr;
+    size_t size;
+    uint8_t type;
     bool ioport;
     bool mem64;
     QLIST_HEAD(, VFIOQuirk) quirks;


Re: [Qemu-devel] [RFC/RFT PATCH 2/5] vfio/pci: Add base BAR MemoryRegion
Posted by Alexey Kardashevskiy 7 years, 10 months ago
On 18/12/17 16:02, Alex Williamson wrote:
> Add one more layer to our stack of MemoryRegions, this base region
> allows us to register BARs independently of the vfio region or to
> extend the size of BARs which do map to a region.  This will be
> useful when we want hypervisor defined BARs or sections of BARs,
> for purposes such as relocating MSI-X emulation.  We therefore call
> msix_init() based on this new base MemoryRegion, while the quirks,
> which only modify regions still operate on those sub-MemoryRegions.


Looks ok, one worry though - the default config produces this:


memory-region: pci@800000020000000.mmio
  0000000000000000-ffffffffffffffff (prio 0, i/o): pci@800000020000000.mmio
    0000210000000000-000021000000ffff (prio 1, i/o): 0001:03:00.0 base BAR 1
      0000210000000000-000021000000ffff (prio 0, i/o): 0001:03:00.0 BAR 1
      000021000000e000-000021000000e5ff (prio 0, i/o): msix-table
      000021000000f000-000021000000f00f (prio 0, i/o): msix-pba [disabled]
    0000210000040000-000021000007ffff (prio 1, i/o): 0001:03:00.0 base BAR 3
      0000210000040000-000021000007ffff (prio 0, i/o): 0001:03:00.0 BAR 3
        0000210000040000-000021000007ffff (prio 0, ramd): 0001:03:00.0 BAR
3 mmaps[0]


Where "BAR 1" and "msix-table" overlap. It resolves correctly:


FlatView #1
 AS "memory", root: system
 AS "cpu-memory", root: system
 Root memory region: system
  0000000000000000-000000007fffffff (prio 0, ram): ppc_spapr.ram
  0000210000000000-000021000000dfff (prio 0, i/o): 0001:03:00.0 BAR 1
  000021000000e000-000021000000e5ff (prio 0, i/o): msix-table
  000021000000e600-000021000000ffff (prio 0, i/o): 0001:03:00.0 BAR 1
@000000000000e600
  0000210000040000-000021000007ffff (prio 0, ramd): 0001:03:00.0 BAR 3 mmaps[0]


but this looks like an accident - should not we raise the msix-table
priority or lower the BAR1 priority (to -1)?



> 
> Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
> ---
>  hw/vfio/pci.c |   74 ++++++++++++++++++++++++++++++++++++++++++++-------------
>  hw/vfio/pci.h |    3 ++
>  2 files changed, 60 insertions(+), 17 deletions(-)
> 
> diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
> index c977ee327f94..8f46fdd1d391 100644
> --- a/hw/vfio/pci.c
> +++ b/hw/vfio/pci.c
> @@ -1440,9 +1440,9 @@ static int vfio_msix_setup(VFIOPCIDevice *vdev, int pos, Error **errp)
>      vdev->msix->pending = g_malloc0(BITS_TO_LONGS(vdev->msix->entries) *
>                                      sizeof(unsigned long));
>      ret = msix_init(&vdev->pdev, vdev->msix->entries,
> -                    vdev->bars[vdev->msix->table_bar].region.mem,
> +                    vdev->bars[vdev->msix->table_bar].mr,
>                      vdev->msix->table_bar, vdev->msix->table_offset,
> -                    vdev->bars[vdev->msix->pba_bar].region.mem,
> +                    vdev->bars[vdev->msix->pba_bar].mr,
>                      vdev->msix->pba_bar, vdev->msix->pba_offset, pos,
>                      &err);
>      if (ret < 0) {
> @@ -1482,8 +1482,8 @@ static void vfio_teardown_msi(VFIOPCIDevice *vdev)
>  
>      if (vdev->msix) {
>          msix_uninit(&vdev->pdev,
> -                    vdev->bars[vdev->msix->table_bar].region.mem,
> -                    vdev->bars[vdev->msix->pba_bar].region.mem);
> +                    vdev->bars[vdev->msix->table_bar].mr,
> +                    vdev->bars[vdev->msix->pba_bar].mr);
>          g_free(vdev->msix->pending);
>      }
>  }
> @@ -1500,12 +1500,11 @@ static void vfio_mmap_set_enabled(VFIOPCIDevice *vdev, bool enabled)
>      }
>  }
>  
> -static void vfio_bar_setup(VFIOPCIDevice *vdev, int nr)
> +static void vfio_bar_prepare(VFIOPCIDevice *vdev, int nr)
>  {
>      VFIOBAR *bar = &vdev->bars[nr];
>  
>      uint32_t pci_bar;
> -    uint8_t type;
>      int ret;
>  
>      /* Skip both unimplemented BARs and the upper half of 64bit BARS. */
> @@ -1524,23 +1523,52 @@ static void vfio_bar_setup(VFIOPCIDevice *vdev, int nr)
>      pci_bar = le32_to_cpu(pci_bar);
>      bar->ioport = (pci_bar & PCI_BASE_ADDRESS_SPACE_IO);
>      bar->mem64 = bar->ioport ? 0 : (pci_bar & PCI_BASE_ADDRESS_MEM_TYPE_64);
> -    type = pci_bar & (bar->ioport ? ~PCI_BASE_ADDRESS_IO_MASK :
> -                                    ~PCI_BASE_ADDRESS_MEM_MASK);
> +    bar->type = pci_bar & (bar->ioport ? ~PCI_BASE_ADDRESS_IO_MASK :
> +                                         ~PCI_BASE_ADDRESS_MEM_MASK);
> +    bar->size = bar->region.size;
> +}
>  
> -    if (vfio_region_mmap(&bar->region)) {
> -        error_report("Failed to mmap %s BAR %d. Performance may be slow",
> -                     vdev->vbasedev.name, nr);
> +static void vfio_bars_prepare(VFIOPCIDevice *vdev)
> +{
> +    int i;
> +
> +    for (i = 0; i < PCI_ROM_SLOT; i++) {
> +        vfio_bar_prepare(vdev, i);
>      }
> +}
>  
> -    pci_register_bar(&vdev->pdev, nr, type, bar->region.mem);
> +static void vfio_bar_register(VFIOPCIDevice *vdev, int nr)
> +{
> +    VFIOBAR *bar = &vdev->bars[nr];
> +    char *name;
> +
> +    if (!bar->size) {
> +        return;
> +    }
> +
> +    bar->mr = g_new0(MemoryRegion, 1);
> +    name = g_strdup_printf("%s base BAR %d", vdev->vbasedev.name, nr);
> +    memory_region_init_io(bar->mr, OBJECT(vdev), NULL, NULL, name, bar->size);
> +    g_free(name);
> +
> +    if (bar->region.size) {
> +        memory_region_add_subregion(bar->mr, 0, bar->region.mem);
> +
> +        if (vfio_region_mmap(&bar->region)) {
> +            error_report("Failed to mmap %s BAR %d. Performance may be slow",
> +                         vdev->vbasedev.name, nr);
> +        }
> +    }
> +
> +    pci_register_bar(&vdev->pdev, nr, bar->type, bar->mr);
>  }
>  
> -static void vfio_bars_setup(VFIOPCIDevice *vdev)
> +static void vfio_bars_register(VFIOPCIDevice *vdev)
>  {
>      int i;
>  
>      for (i = 0; i < PCI_ROM_SLOT; i++) {
> -        vfio_bar_setup(vdev, i);
> +        vfio_bar_register(vdev, i);
>      }
>  }
>  
> @@ -1549,8 +1577,13 @@ static void vfio_bars_exit(VFIOPCIDevice *vdev)
>      int i;
>  
>      for (i = 0; i < PCI_ROM_SLOT; i++) {
> +        VFIOBAR *bar = &vdev->bars[i];
> +
>          vfio_bar_quirk_exit(vdev, i);
> -        vfio_region_exit(&vdev->bars[i].region);
> +        vfio_region_exit(&bar->region);
> +        if (bar->region.size) {
> +            memory_region_del_subregion(bar->mr, bar->region.mem);
> +        }
>      }
>  
>      if (vdev->vga) {
> @@ -1564,8 +1597,14 @@ static void vfio_bars_finalize(VFIOPCIDevice *vdev)
>      int i;
>  
>      for (i = 0; i < PCI_ROM_SLOT; i++) {
> +        VFIOBAR *bar = &vdev->bars[i];
> +
>          vfio_bar_quirk_finalize(vdev, i);
> -        vfio_region_finalize(&vdev->bars[i].region);
> +        vfio_region_finalize(&bar->region);
> +        if (bar->size) {
> +           object_unparent(OBJECT(bar->mr));
> +           g_free(bar->mr);
> +        }
>      }
>  
>      if (vdev->vga) {
> @@ -2810,7 +2849,8 @@ static void vfio_realize(PCIDevice *pdev, Error **errp)
>          goto error;
>      }
>  
> -    vfio_bars_setup(vdev);
> +    vfio_bars_prepare(vdev);
> +    vfio_bars_register(vdev);
>  
>      ret = vfio_add_capabilities(vdev, errp);
>      if (ret) {
> diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h
> index 3d753222ca4c..dcdb1a806769 100644
> --- a/hw/vfio/pci.h
> +++ b/hw/vfio/pci.h
> @@ -33,6 +33,9 @@ typedef struct VFIOQuirk {
>  
>  typedef struct VFIOBAR {
>      VFIORegion region;
> +    MemoryRegion *mr;
> +    size_t size;
> +    uint8_t type;
>      bool ioport;
>      bool mem64;
>      QLIST_HEAD(, VFIOQuirk) quirks;
> 
> 


-- 
Alexey

Re: [Qemu-devel] [RFC/RFT PATCH 2/5] vfio/pci: Add base BAR MemoryRegion
Posted by Alex Williamson 7 years, 10 months ago
On Tue, 19 Dec 2017 14:44:51 +1100
Alexey Kardashevskiy <aik@ozlabs.ru> wrote:

> On 18/12/17 16:02, Alex Williamson wrote:
> > Add one more layer to our stack of MemoryRegions, this base region
> > allows us to register BARs independently of the vfio region or to
> > extend the size of BARs which do map to a region.  This will be
> > useful when we want hypervisor defined BARs or sections of BARs,
> > for purposes such as relocating MSI-X emulation.  We therefore call
> > msix_init() based on this new base MemoryRegion, while the quirks,
> > which only modify regions still operate on those sub-MemoryRegions.  
> 
> 
> Looks ok, one worry though - the default config produces this:
> 
> 
> memory-region: pci@800000020000000.mmio
>   0000000000000000-ffffffffffffffff (prio 0, i/o): pci@800000020000000.mmio
>     0000210000000000-000021000000ffff (prio 1, i/o): 0001:03:00.0 base BAR 1
>       0000210000000000-000021000000ffff (prio 0, i/o): 0001:03:00.0 BAR 1
>       000021000000e000-000021000000e5ff (prio 0, i/o): msix-table
>       000021000000f000-000021000000f00f (prio 0, i/o): msix-pba [disabled]
>     0000210000040000-000021000007ffff (prio 1, i/o): 0001:03:00.0 base BAR 3
>       0000210000040000-000021000007ffff (prio 0, i/o): 0001:03:00.0 BAR 3
>         0000210000040000-000021000007ffff (prio 0, ramd): 0001:03:00.0 BAR
> 3 mmaps[0]
> 
> 
> Where "BAR 1" and "msix-table" overlap. It resolves correctly:
> 
> 
> FlatView #1
>  AS "memory", root: system
>  AS "cpu-memory", root: system
>  Root memory region: system
>   0000000000000000-000000007fffffff (prio 0, ram): ppc_spapr.ram
>   0000210000000000-000021000000dfff (prio 0, i/o): 0001:03:00.0 BAR 1
>   000021000000e000-000021000000e5ff (prio 0, i/o): msix-table
>   000021000000e600-000021000000ffff (prio 0, i/o): 0001:03:00.0 BAR 1
> @000000000000e600
>   0000210000040000-000021000007ffff (prio 0, ramd): 0001:03:00.0 BAR 3 mmaps[0]
> 
> 
> but this looks like an accident - should not we raise the msix-table
> priority or lower the BAR1 priority (to -1)?

I was worried about this too, but the way I read the code is that the
last registered subregion with the same priority wins.  Therefore so
long as msix_init() is called after we layer anything else onto the base
BAR MemoryRegion that might interfere with it, everything should work
correctly.  It certainly might make sense for msix_init() to add
subregions with a higher priority, but then you immediately get into
the road block of what priority should it use, which is a bit of a rat
hole since we can make it work as-is with proper ordering.  Thanks,

Alex

Re: [Qemu-devel] [RFC/RFT PATCH 2/5] vfio/pci: Add base BAR MemoryRegion
Posted by Alexey Kardashevskiy 7 years, 10 months ago
On 19/12/17 14:56, Alex Williamson wrote:
> On Tue, 19 Dec 2017 14:44:51 +1100
> Alexey Kardashevskiy <aik@ozlabs.ru> wrote:
> 
>> On 18/12/17 16:02, Alex Williamson wrote:
>>> Add one more layer to our stack of MemoryRegions, this base region
>>> allows us to register BARs independently of the vfio region or to
>>> extend the size of BARs which do map to a region.  This will be
>>> useful when we want hypervisor defined BARs or sections of BARs,
>>> for purposes such as relocating MSI-X emulation.  We therefore call
>>> msix_init() based on this new base MemoryRegion, while the quirks,
>>> which only modify regions still operate on those sub-MemoryRegions.  
>>
>>
>> Looks ok, one worry though - the default config produces this:
>>
>>
>> memory-region: pci@800000020000000.mmio
>>   0000000000000000-ffffffffffffffff (prio 0, i/o): pci@800000020000000.mmio
>>     0000210000000000-000021000000ffff (prio 1, i/o): 0001:03:00.0 base BAR 1
>>       0000210000000000-000021000000ffff (prio 0, i/o): 0001:03:00.0 BAR 1
>>       000021000000e000-000021000000e5ff (prio 0, i/o): msix-table
>>       000021000000f000-000021000000f00f (prio 0, i/o): msix-pba [disabled]
>>     0000210000040000-000021000007ffff (prio 1, i/o): 0001:03:00.0 base BAR 3
>>       0000210000040000-000021000007ffff (prio 0, i/o): 0001:03:00.0 BAR 3
>>         0000210000040000-000021000007ffff (prio 0, ramd): 0001:03:00.0 BAR
>> 3 mmaps[0]
>>
>>
>> Where "BAR 1" and "msix-table" overlap. It resolves correctly:
>>
>>
>> FlatView #1
>>  AS "memory", root: system
>>  AS "cpu-memory", root: system
>>  Root memory region: system
>>   0000000000000000-000000007fffffff (prio 0, ram): ppc_spapr.ram
>>   0000210000000000-000021000000dfff (prio 0, i/o): 0001:03:00.0 BAR 1
>>   000021000000e000-000021000000e5ff (prio 0, i/o): msix-table
>>   000021000000e600-000021000000ffff (prio 0, i/o): 0001:03:00.0 BAR 1
>> @000000000000e600
>>   0000210000040000-000021000007ffff (prio 0, ramd): 0001:03:00.0 BAR 3 mmaps[0]
>>
>>
>> but this looks like an accident - should not we raise the msix-table
>> priority or lower the BAR1 priority (to -1)?
> 
> I was worried about this too, but the way I read the code is that the
> last registered subregion with the same priority wins.  Therefore so
> long as msix_init() is called after we layer anything else onto the base
> BAR MemoryRegion that might interfere with it, everything should work
> correctly.  It certainly might make sense for msix_init() to add
> subregions with a higher priority, but then you immediately get into
> the road block of what priority should it use, which is a bit of a rat
> hole since we can make it work as-is with proper ordering.

The order is not obvious from any variant of "info mtree" (I may post a
patch with a "-u" switch to disable sorting).

And docs/devel/memory.txt allows using negative priorities so the "BAR 1"
region could have -1 to make things predictable and more self-documented imho.



>  Thanks,
> 
> Alex
> 


-- 
Alexey