[SeaBIOS] [RFC v3] pciinit: setup mcfg for pxb-pcie to support multiple pci domains

Zihan Yang posted 1 patch 5 years, 7 months ago
Patches applied successfully
git fetch https://github.com/patchew-project/seabios tags/patchew/1537196579-12825-2-git-send-email-whois.zihan.yang@gmail.com
[SeaBIOS] [RFC v3] pciinit: setup mcfg for pxb-pcie to support multiple pci domains
Posted by Zihan Yang 5 years, 7 months ago
To support multiple PCI domains for the pxb-pcie device in QEMU, we need to
set up an MCFG range in SeaBIOS. For now we use [0x80000000, 0xb0000000) to
hold the MCFG tables of the new domains, and we retrieve the desired MCFG
size of each pxb-pcie from a hidden BAR, because a domain may not need the
whole 256 buses; this also lets us support more domains within the limited
768MB range.

Signed-off-by: Zihan Yang <whois.zihan.yang@gmail.com>
---
 src/fw/dev-q35.h |  7 +++++++
 src/fw/pciinit.c | 32 ++++++++++++++++++++++++++++++++
 src/hw/pci_ids.h |  1 +
 3 files changed, 40 insertions(+)

diff --git a/src/fw/dev-q35.h b/src/fw/dev-q35.h
index 201825d..229cd81 100644
--- a/src/fw/dev-q35.h
+++ b/src/fw/dev-q35.h
@@ -49,4 +49,11 @@
 #define ICH9_APM_ACPI_ENABLE           0x2
 #define ICH9_APM_ACPI_DISABLE          0x3
 
+#define PXB_PCIE_HOST_BRIDGE_MCFG_BAR              0x50    /* 64bit register */
+#define PXB_PCIE_HOST_BRIDGE_MCFG_SIZE             0x58    /* 32bit register */
+#define PXB_PCIE_HOST_BRIDGE_ENABLE                Q35_HOST_BRIDGE_PCIEXBAREN
+/* pxb-pcie can use [0x80000000, 0xb0000000), be careful not to overflow */
+#define PXB_PCIE_HOST_BRIDGE_MCFG_SIZE_ADDR        0x80000000
+#define PXB_PCIE_HOST_BRIDGE_MCFG_SIZE_ADDR_UPPER Q35_HOST_BRIDGE_PCIEXBAR_ADDR
+
 #endif // dev-q35.h
diff --git a/src/fw/pciinit.c b/src/fw/pciinit.c
index c0634bc..e0ac22c 100644
--- a/src/fw/pciinit.c
+++ b/src/fw/pciinit.c
@@ -51,6 +51,7 @@ u64 pcimem_end     = BUILD_PCIMEM_END;
 u64 pcimem64_start = BUILD_PCIMEM64_START;
 u64 pcimem64_end   = BUILD_PCIMEM64_END;
 u64 pci_io_low_end = 0xa000;
+u64 pxb_pcie_mcfg_base = PXB_PCIE_HOST_BRIDGE_MCFG_SIZE_ADDR;
 
 struct pci_region_entry {
     struct pci_device *dev;
@@ -507,11 +508,42 @@ static void mch_mem_addr_setup(struct pci_device *dev, void *arg)
         pci_io_low_end = acpi_pm_base;
 }
 
+static void pxb_pcie_mem_addr_setup(struct pci_device *dev, void *arg)
+{
+    u64 mcfg_base;
+    u32 mcfg_size = pci_config_readl(dev->bdf, PXB_PCIE_HOST_BRIDGE_MCFG_SIZE);
+
+    /* 0 means this pxb-pcie still resides in pci domain 0 */
+    if (mcfg_size == 0)
+        return;
+
+    if (pxb_pcie_mcfg_base + mcfg_size >
+                             PXB_PCIE_HOST_BRIDGE_MCFG_SIZE_ADDR_UPPER) {
+        dprintf(1, "PCI: Not enough space to hold new pci domains\n");
+        return;
+    }
+
+    mcfg_base = pxb_pcie_mcfg_base;
+    pxb_pcie_mcfg_base += mcfg_size;
+
+    /* First clear old mmio, taken care of by QEMU */
+    pci_config_writel(dev->bdf, PXB_PCIE_HOST_BRIDGE_MCFG_BAR, 0);
+    /* Update MCFG base */
+    pci_config_writel(dev->bdf, PXB_PCIE_HOST_BRIDGE_MCFG_BAR + 4,
+                      mcfg_base >> 32);
+    pci_config_writel(dev->bdf, PXB_PCIE_HOST_BRIDGE_MCFG_BAR,
+                      (mcfg_base & 0xffffffff) | PXB_PCIE_HOST_BRIDGE_ENABLE);
+
+    e820_add(mcfg_base, mcfg_size, E820_RESERVED);
+}
+
 static const struct pci_device_id pci_platform_tbl[] = {
     PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82441,
                i440fx_mem_addr_setup),
     PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_Q35_MCH,
                mch_mem_addr_setup),
+    PCI_DEVICE(PCI_VENDOR_ID_REDHAT, PCI_DEVICE_ID_REDHAT_PXB_HOST,
+               pxb_pcie_mem_addr_setup),
     PCI_DEVICE_END
 };
 
diff --git a/src/hw/pci_ids.h b/src/hw/pci_ids.h
index 1096461..b495920 100644
--- a/src/hw/pci_ids.h
+++ b/src/hw/pci_ids.h
@@ -2266,6 +2266,7 @@
 #define PCI_VENDOR_ID_REDHAT		0x1b36
 #define PCI_DEVICE_ID_REDHAT_ROOT_PORT	0x000C
 #define PCI_DEVICE_ID_REDHAT_BRIDGE	0x0001
+#define PCI_DEVICE_ID_REDHAT_PXB_HOST   0x000B
 
 #define PCI_VENDOR_ID_TEKRAM		0x1de1
 #define PCI_DEVICE_ID_TEKRAM_DC290	0xdc29
-- 
2.7.4
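
For reference (not part of the patch itself): once the per-pxb MCFG base
above is programmed and reported to the OS, a config access in the new
domain is decoded with the standard PCIe ECAM layout. A minimal sketch
using SeaBIOS-style fixed-width types; the helper name below is only an
illustration, not something in pciinit.c:

    /* ECAM: 4KB of config space per function, 1MB per bus. */
    static u64 ecam_config_addr(u64 mcfg_base, u8 bus, u8 dev, u8 fn, u16 off)
    {
        return mcfg_base
               + ((u64)bus << 20)   /* 1MB per bus      */
               + ((u64)dev << 15)   /* 32KB per device  */
               + ((u64)fn  << 12)   /* 4KB per function */
               + off;
    }

At 1MB per bus, the [0x80000000, 0xb0000000) window can cover at most 768
bus numbers in total, which is why shrinking each domain's MCFG below the
full 256-bus/256MB size lets more domains fit.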


Re: [SeaBIOS] [RFC v3] pciinit: setup mcfg for pxb-pcie to support multiple pci domains
Posted by Kevin O'Connor 5 years, 7 months ago
On Mon, Sep 17, 2018 at 11:02:59PM +0800, Zihan Yang wrote:
> To support multiple pci domains of pxb-pcie device in qemu, we need to setup
> mcfg range in seabios. We use [0x80000000, 0xb0000000) to hold new domain mcfg
> table for now, and we need to retrieve the desired mcfg size of each pxb-pcie
> from a hidden bar because they may not need the whole 256 busses, which also
> enables us to support more domains within a limited range (768MB)

At a highlevel, this looks okay to me.  I'd like to see additional
reviews from others more familiar with the QEMU PCI code, though.

Is the plan to do the same thing for OVMF?

-Kevin

> 
> Signed-off-by: Zihan Yang <whois.zihan.yang@gmail.com>
> ---
>  src/fw/dev-q35.h |  7 +++++++
>  src/fw/pciinit.c | 32 ++++++++++++++++++++++++++++++++
>  src/hw/pci_ids.h |  1 +
>  3 files changed, 40 insertions(+)
> 
> diff --git a/src/fw/dev-q35.h b/src/fw/dev-q35.h
> index 201825d..229cd81 100644
> --- a/src/fw/dev-q35.h
> +++ b/src/fw/dev-q35.h
> @@ -49,4 +49,11 @@
>  #define ICH9_APM_ACPI_ENABLE           0x2
>  #define ICH9_APM_ACPI_DISABLE          0x3
>  
> +#define PXB_PCIE_HOST_BRIDGE_MCFG_BAR              0x50    /* 64bit register */
> +#define PXB_PCIE_HOST_BRIDGE_MCFG_SIZE             0x58    /* 32bit register */
> +#define PXB_PCIE_HOST_BRIDGE_ENABLE                Q35_HOST_BRIDGE_PCIEXBAREN
> +/* pxb-pcie can use [0x80000000, 0xb0000000), be careful not to overflow */
> +#define PXB_PCIE_HOST_BRIDGE_MCFG_SIZE_ADDR        0x80000000
> +#define PXB_PCIE_HOST_BRIDGE_MCFG_SIZE_ADDR_UPPER Q35_HOST_BRIDGE_PCIEXBAR_ADDR
> +
>  #endif // dev-q35.h
> diff --git a/src/fw/pciinit.c b/src/fw/pciinit.c
> index c0634bc..e0ac22c 100644
> --- a/src/fw/pciinit.c
> +++ b/src/fw/pciinit.c
> @@ -51,6 +51,7 @@ u64 pcimem_end     = BUILD_PCIMEM_END;
>  u64 pcimem64_start = BUILD_PCIMEM64_START;
>  u64 pcimem64_end   = BUILD_PCIMEM64_END;
>  u64 pci_io_low_end = 0xa000;
> +u64 pxb_pcie_mcfg_base = PXB_PCIE_HOST_BRIDGE_MCFG_SIZE_ADDR;
>  
>  struct pci_region_entry {
>      struct pci_device *dev;
> @@ -507,11 +508,42 @@ static void mch_mem_addr_setup(struct pci_device *dev, void *arg)
>          pci_io_low_end = acpi_pm_base;
>  }
>  
> +static void pxb_pcie_mem_addr_setup(struct pci_device *dev, void *arg)
> +{
> +    u64 mcfg_base;
> +    u32 mcfg_size = pci_config_readl(dev->bdf, PXB_PCIE_HOST_BRIDGE_MCFG_SIZE);
> +
> +    /* 0 means this pxb-pcie still resides in pci domain 0 */
> +    if (mcfg_size == 0)
> +        return;
> +
> +    if (pxb_pcie_mcfg_base + mcfg_size >
> +                             PXB_PCIE_HOST_BRIDGE_MCFG_SIZE_ADDR_UPPER) {
> +        dprintf(1, "PCI: Not enough space to hold new pci domains\n");
> +        return;
> +    }
> +
> +    mcfg_base = pxb_pcie_mcfg_base;
> +    pxb_pcie_mcfg_base += mcfg_size;
> +
> +    /* First clear old mmio, taken care of by QEMU */
> +    pci_config_writel(dev->bdf, PXB_PCIE_HOST_BRIDGE_MCFG_BAR, 0);
> +    /* Update MCFG base */
> +    pci_config_writel(dev->bdf, PXB_PCIE_HOST_BRIDGE_MCFG_BAR + 4,
> +                      mcfg_base >> 32);
> +    pci_config_writel(dev->bdf, PXB_PCIE_HOST_BRIDGE_MCFG_BAR,
> +                      (mcfg_base & 0xffffffff) | PXB_PCIE_HOST_BRIDGE_ENABLE);
> +
> +    e820_add(mcfg_base, mcfg_size, E820_RESERVED);
> +}
> +
>  static const struct pci_device_id pci_platform_tbl[] = {
>      PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82441,
>                 i440fx_mem_addr_setup),
>      PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_Q35_MCH,
>                 mch_mem_addr_setup),
> +    PCI_DEVICE(PCI_VENDOR_ID_REDHAT, PCI_DEVICE_ID_REDHAT_PXB_HOST,
> +               pxb_pcie_mem_addr_setup),
>      PCI_DEVICE_END
>  };
>  
> diff --git a/src/hw/pci_ids.h b/src/hw/pci_ids.h
> index 1096461..b495920 100644
> --- a/src/hw/pci_ids.h
> +++ b/src/hw/pci_ids.h
> @@ -2266,6 +2266,7 @@
>  #define PCI_VENDOR_ID_REDHAT		0x1b36
>  #define PCI_DEVICE_ID_REDHAT_ROOT_PORT	0x000C
>  #define PCI_DEVICE_ID_REDHAT_BRIDGE	0x0001
> +#define PCI_DEVICE_ID_REDHAT_PXB_HOST   0x000B
>  
>  #define PCI_VENDOR_ID_TEKRAM		0x1de1
>  #define PCI_DEVICE_ID_TEKRAM_DC290	0xdc29
> -- 
> 2.7.4
> 
> 

Re: [SeaBIOS] [RFC v3] pciinit: setup mcfg for pxb-pcie to support multiple pci domains
Posted by Laszlo Ersek 5 years, 7 months ago
On 09/25/18 17:38, Kevin O'Connor wrote:
> On Mon, Sep 17, 2018 at 11:02:59PM +0800, Zihan Yang wrote:
>> To support multiple pci domains of pxb-pcie device in qemu, we need to setup
>> mcfg range in seabios. We use [0x80000000, 0xb0000000) to hold new domain mcfg
>> table for now, and we need to retrieve the desired mcfg size of each pxb-pcie
>> from a hidden bar because they may not need the whole 256 busses, which also
>> enables us to support more domains within a limited range (768MB)
> 
> At a highlevel, this looks okay to me.  I'd like to see additional
> reviews from others more familiar with the QEMU PCI code, though.
> 
> Is the plan to do the same thing for OVMF?

I remain entirely unconvinced that this feature is useful. (I've stated
so before.)

I believe the latest QEMU RFC posting (v5) is here:

[Qemu-devel] [RFC v5 0/6] pci_expander_brdige: support separate pci
domain for pxb-pcie

http://mid.mail-archive.com/1537196258-12581-1-git-send-email-whois.zihan.yang@gmail.com

First, I fail to see the use case where ~256 PCI bus numbers aren't
enough. If I strain myself, perhaps I can imagine using ~200 PCIe root
ports on Q35 (each of which requires a separate bus number), so that we
can independently hot-plug 200 devices then. And that's supposedly not
enough, because we want... 300? 400? A thousand? Doesn't sound realistic
to me. (This is not meant to be a strawman argument, I really have no
idea what the feature would be useful for.)

Second, the v5 RFC doesn't actually address the alleged bus number
shortage. IIUC, it supports a low number of ECAM ranges under 4GB, but
those are (individually) limited in the bus number ranges they can
accommodate (due to 32-bit address space shortage). So more or less the
current approach just fragments the bus number space we already have, to
multiple domains.

Third, should a subsequent iteration of the QEMU series put those extra
ECAMs above 4GB, with the intent to leave the enumeration of those
hierarchies to the "guest OS", it would present an incredible
implementation mess for OVMF. If people gained the ability to attach
storage or network to those domains, on the QEMU command line, they
would expect to boot off of them, using UEFI. Then OVMF would have to
make sure the controllers could be bound by their respective UEFI
drivers. That in turn would require functional config space access
(ECAM) at semi-random 64-bit addresses.

The layout of the 64-bit address space is already pretty darn
complicated in OVMF; it depends on guest RAM size, DIMM hotplug area
size, and it already dictates the base of the 64-bit MMIO aperture for
BAR allocations. It also dictates how high up the DXE Core should build
the page tables (you can only access 64-bit addresses if the 1:1 page
tables built by the DXE Core cover them).

Obviously, UEFI on physical machines does support multiple PCI domains.
There are a number of differences however:

- Both the ECAM ranges, and the MMIO apertures (for BAR allocation) of
the disparate host bridges are distinct. On QEMU / OVMF, the latter part
is not true (about the MMIO apertures), and this has already required us
to write some nasty quirks for supposedly platform-independent core code
in edk2.

- The same address ranges mentioned in the previous bullet are known in
advance (they are "static"). That's a *huge* simplification opportunity
to physical platform code (which is most of the time not even open
source, let alone upstreamed to edk2), because the engineers can lay out
the 64-bit address range, and deduce all the related artifacts from that
layout, on paper, at the office.

It's a proper mess with a lot of opportunity for regressions, and I just
don't see the bang for the buck.

(I didn't mean to re-hash my opinion yet again -- in the QEMU RFC v5
thread, I saw references to OVMF, stating that OVMF would not support
this. I was 100% fine with those mentions, but here you asked
explicitly... Some ideas just make me rant, my apologies.)

Thanks
Laszlo

> -Kevin
> 
>>
>> Signed-off-by: Zihan Yang <whois.zihan.yang@gmail.com>
>> ---
>>  src/fw/dev-q35.h |  7 +++++++
>>  src/fw/pciinit.c | 32 ++++++++++++++++++++++++++++++++
>>  src/hw/pci_ids.h |  1 +
>>  3 files changed, 40 insertions(+)
>>
>> diff --git a/src/fw/dev-q35.h b/src/fw/dev-q35.h
>> index 201825d..229cd81 100644
>> --- a/src/fw/dev-q35.h
>> +++ b/src/fw/dev-q35.h
>> @@ -49,4 +49,11 @@
>>  #define ICH9_APM_ACPI_ENABLE           0x2
>>  #define ICH9_APM_ACPI_DISABLE          0x3
>>  
>> +#define PXB_PCIE_HOST_BRIDGE_MCFG_BAR              0x50    /* 64bit register */
>> +#define PXB_PCIE_HOST_BRIDGE_MCFG_SIZE             0x58    /* 32bit register */
>> +#define PXB_PCIE_HOST_BRIDGE_ENABLE                Q35_HOST_BRIDGE_PCIEXBAREN
>> +/* pxb-pcie can use [0x80000000, 0xb0000000), be careful not to overflow */
>> +#define PXB_PCIE_HOST_BRIDGE_MCFG_SIZE_ADDR        0x80000000
>> +#define PXB_PCIE_HOST_BRIDGE_MCFG_SIZE_ADDR_UPPER Q35_HOST_BRIDGE_PCIEXBAR_ADDR
>> +
>>  #endif // dev-q35.h
>> diff --git a/src/fw/pciinit.c b/src/fw/pciinit.c
>> index c0634bc..e0ac22c 100644
>> --- a/src/fw/pciinit.c
>> +++ b/src/fw/pciinit.c
>> @@ -51,6 +51,7 @@ u64 pcimem_end     = BUILD_PCIMEM_END;
>>  u64 pcimem64_start = BUILD_PCIMEM64_START;
>>  u64 pcimem64_end   = BUILD_PCIMEM64_END;
>>  u64 pci_io_low_end = 0xa000;
>> +u64 pxb_pcie_mcfg_base = PXB_PCIE_HOST_BRIDGE_MCFG_SIZE_ADDR;
>>  
>>  struct pci_region_entry {
>>      struct pci_device *dev;
>> @@ -507,11 +508,42 @@ static void mch_mem_addr_setup(struct pci_device *dev, void *arg)
>>          pci_io_low_end = acpi_pm_base;
>>  }
>>  
>> +static void pxb_pcie_mem_addr_setup(struct pci_device *dev, void *arg)
>> +{
>> +    u64 mcfg_base;
>> +    u32 mcfg_size = pci_config_readl(dev->bdf, PXB_PCIE_HOST_BRIDGE_MCFG_SIZE);
>> +
>> +    /* 0 means this pxb-pcie still resides in pci domain 0 */
>> +    if (mcfg_size == 0)
>> +        return;
>> +
>> +    if (pxb_pcie_mcfg_base + mcfg_size >
>> +                             PXB_PCIE_HOST_BRIDGE_MCFG_SIZE_ADDR_UPPER) {
>> +        dprintf(1, "PCI: Not enough space to hold new pci domains\n");
>> +        return;
>> +    }
>> +
>> +    mcfg_base = pxb_pcie_mcfg_base;
>> +    pxb_pcie_mcfg_base += mcfg_size;
>> +
>> +    /* First clear old mmio, taken care of by QEMU */
>> +    pci_config_writel(dev->bdf, PXB_PCIE_HOST_BRIDGE_MCFG_BAR, 0);
>> +    /* Update MCFG base */
>> +    pci_config_writel(dev->bdf, PXB_PCIE_HOST_BRIDGE_MCFG_BAR + 4,
>> +                      mcfg_base >> 32);
>> +    pci_config_writel(dev->bdf, PXB_PCIE_HOST_BRIDGE_MCFG_BAR,
>> +                      (mcfg_base & 0xffffffff) | PXB_PCIE_HOST_BRIDGE_ENABLE);
>> +
>> +    e820_add(mcfg_base, mcfg_size, E820_RESERVED);
>> +}
>> +
>>  static const struct pci_device_id pci_platform_tbl[] = {
>>      PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82441,
>>                 i440fx_mem_addr_setup),
>>      PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_Q35_MCH,
>>                 mch_mem_addr_setup),
>> +    PCI_DEVICE(PCI_VENDOR_ID_REDHAT, PCI_DEVICE_ID_REDHAT_PXB_HOST,
>> +               pxb_pcie_mem_addr_setup),
>>      PCI_DEVICE_END
>>  };
>>  
>> diff --git a/src/hw/pci_ids.h b/src/hw/pci_ids.h
>> index 1096461..b495920 100644
>> --- a/src/hw/pci_ids.h
>> +++ b/src/hw/pci_ids.h
>> @@ -2266,6 +2266,7 @@
>>  #define PCI_VENDOR_ID_REDHAT		0x1b36
>>  #define PCI_DEVICE_ID_REDHAT_ROOT_PORT	0x000C
>>  #define PCI_DEVICE_ID_REDHAT_BRIDGE	0x0001
>> +#define PCI_DEVICE_ID_REDHAT_PXB_HOST   0x000B
>>  
>>  #define PCI_VENDOR_ID_TEKRAM		0x1de1
>>  #define PCI_DEVICE_ID_TEKRAM_DC290	0xdc29
>> -- 
>> 2.7.4
>>
>>


Re: [SeaBIOS] [RFC v3] pciinit: setup mcfg for pxb-pcie to support multiple pci domains
Posted by Gerd Hoffmann 5 years, 7 months ago
  Hi,

> Second, the v5 RFC doesn't actually address the alleged bus number
> shortage. IIUC, it supports a low number of ECAM ranges under 4GB, but
> those are (individually) limited in the bus number ranges they can
> accommodate (due to 32-bit address space shortage). So more or less the
> current approach just fragments the bus number space we already have, to
> multiple domains.

Haven't looked at the qemu side too closely yet, but as I understand things
the firmware programs the ECAM location (similar to the q35 mmconf bar),
and this is just a limitation of the current seabios patch.

So, no, *that* part wouldn't be messy in ovmf, you can simply place the
ECAMs where you want.

cheers,
  Gerd


Re: [SeaBIOS] [RFC v3] pciinit: setup mcfg for pxb-pcie to support multiple pci domains
Posted by Laszlo Ersek 5 years, 7 months ago
On 09/26/18 06:44, Gerd Hoffmann wrote:
>   Hi,
> 
>> Second, the v5 RFC doesn't actually address the alleged bus number
>> shortage. IIUC, it supports a low number of ECAM ranges under 4GB, but
>> those are (individually) limited in the bus number ranges they can
>> accommodate (due to 32-bit address space shortage). So more or less the
>> current approach just fragments the bus number space we already have, to
>> multiple domains.
> 
> Havn't looked at the qemu side too close yet, but as I understand things
> the firmware programs the ECAM location (simliar to the q35 mmconf bar),
> and this is just a limitation of the current seabios patch.
> 
> So, no, *that* part wouldn't be messy in ovmf, you can simply place the
> ECAMs where you want.

Figuring out "wherever I want" is the problem. It's not simple. The
64-bit MMIO aperture (for BAR allocation) can also be placed mostly
"wherever the firmware wants", and that wasn't simple either. All these
things end up depending on each other.

https://bugzilla.redhat.com/show_bug.cgi?id=1353591#c8

(

The placement of the q35 MMCONF BAR was difficult too; I needed your
help with the low RAM split that QEMU would choose.

v1 discussion:

http://mid.mail-archive.com/1457340448.25423.43.camel@redhat.com

v2 patch (ended up as commit 7b8fe63561b4):

http://mid.mail-archive.com/1457446804-18892-4-git-send-email-lersek@redhat.com

These things add up :(

)

Laszlo

Re: [SeaBIOS] [RFC v3] pciinit: setup mcfg for pxb-pcie to support multiple pci domains
Posted by Gerd Hoffmann 5 years, 7 months ago
On Wed, Sep 26, 2018 at 10:47:42AM +0200, Laszlo Ersek wrote:
> On 09/26/18 06:44, Gerd Hoffmann wrote:
> >   Hi,
> > 
> >> Second, the v5 RFC doesn't actually address the alleged bus number
> >> shortage. IIUC, it supports a low number of ECAM ranges under 4GB, but
> >> those are (individually) limited in the bus number ranges they can
> >> accommodate (due to 32-bit address space shortage). So more or less the
> >> current approach just fragments the bus number space we already have, to
> >> multiple domains.
> > 
> > Havn't looked at the qemu side too close yet, but as I understand things
> > the firmware programs the ECAM location (simliar to the q35 mmconf bar),
> > and this is just a limitation of the current seabios patch.
> > 
> > So, no, *that* part wouldn't be messy in ovmf, you can simply place the
> > ECAMs where you want.
> 
> Figuring out "wherever I want" is the problem. It's not simple. The
> 64-bit MMIO aperture (for BAR allocation) can also be placed mostly
> "wherever the firmware wants", and that wasn't simple either. All these
> things end up depending on each other.

Agree.  Due to the very dynamic nature of virtual hardware in qemu, ovmf has
to adapt to *a lot* more stuff at runtime than physical hardware
platforms, where you for the most part know what hardware is there and
can handle a lot of things at compile time (without nasty initialization
order issues).

Physical address space bits not being reliable is rather bad too.  It
makes address space management harder, because we squeeze everything
below 64G if possible, just in case we have only 36 physbits.

It's the reason we need stuff like address space reservation for
memory hotplug in the first place, instead of simply reserving 1/4 or
1/8 of the physical address space at the topmost location for the 64bit
mmio aperture.

Oh, and after the rant, back to the original topic:  I just wanted to point
out that you don't have to be worried about all the 32bit address space
issues described in the v5 cover letter.  That is how the seabios patch
handles things.  OVMF can handle things differently.  For example, totally
ignore pxb-pcie in 32bit builds, and place the ECAMs above 4G
(take away some room from the 64-bit MMIO aperture) unconditionally in
64bit builds.

cheers,
  Gerd


Re: [SeaBIOS] [RFC v3] pciinit: setup mcfg for pxb-pcie to support multiple pci domains
Posted by Zihan Yang 5 years, 7 months ago
Hi Laszlo,
Laszlo Ersek <lersek@redhat.com> wrote on Wed, Sep 26, 2018 at 1:17 AM:
>
> On 09/25/18 17:38, Kevin O'Connor wrote:
> > On Mon, Sep 17, 2018 at 11:02:59PM +0800, Zihan Yang wrote:
> >> To support multiple pci domains of pxb-pcie device in qemu, we need to setup
> >> mcfg range in seabios. We use [0x80000000, 0xb0000000) to hold new domain mcfg
> >> table for now, and we need to retrieve the desired mcfg size of each pxb-pcie
> >> from a hidden bar because they may not need the whole 256 busses, which also
> >> enables us to support more domains within a limited range (768MB)
> >
> > At a highlevel, this looks okay to me.  I'd like to see additional
> > reviews from others more familiar with the QEMU PCI code, though.
> >
> > Is the plan to do the same thing for OVMF?
>
> I remain entirely unconvinced that this feature is useful. (I've stated
> so before.)
>
> I believe the latest QEMU RFC posting (v5) is here:
>
> [Qemu-devel] [RFC v5 0/6] pci_expander_brdige: support separate pci
> domain for pxb-pcie
>
> http://mid.mail-archive.com/1537196258-12581-1-git-send-email-whois.zihan.yang@gmail.com
>
> First, I fail to see the use case where ~256 PCI bus numbers aren't
> enough. If I strain myself, perhaps I can imagine using ~200 PCIe root
> ports on Q35 (each of which requires a separate bus number), so that we
> can independently hot-plug 200 devices then. And that's supposedly not
> enough, because we want... 300? 400? A thousand? Doesn't sound realistic
> to me. (This is not meant to be a strawman argument, I really have no
> idea what the feature would be useful for.)

It might not be very intuitive, but such use cases do exist. The very first
discussion about 4 months ago mentioned some possible use cases, which I
paste here:

- We have Ray from Intel trying to use 1000 virtio-net devices
- We may have a VM managing some backups (tapes), we may have a lot of these.
- We may want indeed to create a nested solution as Michael mentioned.

The thread can be found at
https://lists.gnu.org/archive/html/qemu-devel/2018-05/msg04667.html

Also, a later post on the list from a person at Dell stated that he would
need this feature for Intel VMD in Dell EMC. I have no idea about the
details, but since they came to us for help, I guess they can indeed benefit
from it somehow.

> Second, the v5 RFC doesn't actually address the alleged bus number
> shortage. IIUC, it supports a low number of ECAM ranges under 4GB, but
> those are (individually) limited in the bus number ranges they can
> accommodate (due to 32-bit address space shortage). So more or less the
> current approach just fragments the bus number space we already have, to
> multiple domains.
>
> Third, should a subsequent iteration of the QEMU series put those extra
> ECAMs above 4GB, with the intent to leave the enumeration of those
> hierarchies to the "guest OS", it would present an incredible
> implementation mess for OVMF. If people gained the ability to attach
> storage or network to those domains, on the QEMU command line, they
> would expect to boot off of them, using UEFI. Then OVMF would have to
> make sure the controllers could be bound by their respective UEFI
> drivers. That in turn would require functional config space access
> (ECAM) at semi-random 64-bit addresses.

I'm not familiar with OVMF, so I'm afraid I don't know how to make it easier
for OVMF; the division of the 64bit space in OVMF is outside my scope. There
is no plan to implement it in OVMF for now, we just want to make the
seabios/qemu patch a proof of concept.

As for seabios, it accesses devices through ports 0xcf8/0xcfc, which are
bound to the q35 host bridge in qemu. If we want to change the mmconfig size
of a pxb-pcie (instead of using the whole 256MB), we must know its desired
size, which is passed as a hidden bar. Unfortunately the configuration space
of the pxb-pcie device cannot be accessed with 0xcf8/0xcfc because it sits
behind a different host bridge, and at this point the ECAM is not configured
yet, so we cannot use MMIO either. In a previous version I tried to bind the
pxb host to other ports in qemu, so that we could use port io to access the
config space of pxb-pcie, but it seemed a little dirty.
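
To make the limitation concrete: the legacy mechanism encodes only
bus/device/function in the 0xcf8 address word, with no segment/domain
field, so only domain 0 is reachable that way. Roughly (a simplified
sketch of what pci_config_readl boils down to, not the exact code):

    static u32 legacy_config_readl(u8 bus, u8 dev, u8 fn, u8 off)
    {
        u32 addr = 0x80000000          /* enable bit           */
                   | ((u32)bus << 16)  /* bus number           */
                   | ((u32)dev << 11)  /* device number        */
                   | ((u32)fn << 8)    /* function number      */
                   | (off & 0xfc);     /* dword-aligned offset */
        outl(addr, 0xcf8);
        return inl(0xcfc);
    }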

Another issue is how seabios initializes things. It runs pci_setup before
things like the RSDP are loaded, so it is inconvenient to retrieve the MCFG
table and other information; we cannot infer the mmconfig addr and size from
the MCFG table in seabios.

Therefore we fall back to an alternative: support 4x as many devices as a
first step, and let the guest os do the initialization. The inability to boot
from devices in another domain is indeed an issue, and we don't have a very
good solution to it yet.
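
(The 4x comes from the window size: 0xb0000000 - 0x80000000 = 0x30000000 =
768MB of ECAM space, and at 1MB per bus that gives up to 768 extra bus
numbers on top of the 256 in domain 0, i.e. roughly 1024 buses in total.)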

Things might change in the future if we can figure out a better approach, and
I hope we can have an easier and more elegant solution in OVMF. But for now
we are just trying to offer a possible solution as a PoC.

Thanks
Zihan

Re: [SeaBIOS] [RFC v3] pciinit: setup mcfg for pxb-pcie to support multiple pci domains
Posted by Dr. David Alan Gilbert 5 years, 7 months ago
* Zihan Yang (whois.zihan.yang@gmail.com) wrote:
> HI Laszlo
> Laszlo Ersek <lersek@redhat.com> wrote on Wed, Sep 26, 2018 at 1:17 AM:
> >
> > On 09/25/18 17:38, Kevin O'Connor wrote:
> > > On Mon, Sep 17, 2018 at 11:02:59PM +0800, Zihan Yang wrote:
> > >> To support multiple pci domains of pxb-pcie device in qemu, we need to setup
> > >> mcfg range in seabios. We use [0x80000000, 0xb0000000) to hold new domain mcfg
> > >> table for now, and we need to retrieve the desired mcfg size of each pxb-pcie
> > >> from a hidden bar because they may not need the whole 256 busses, which also
> > >> enables us to support more domains within a limited range (768MB)
> > >
> > > At a highlevel, this looks okay to me.  I'd like to see additional
> > > reviews from others more familiar with the QEMU PCI code, though.
> > >
> > > Is the plan to do the same thing for OVMF?
> >
> > I remain entirely unconvinced that this feature is useful. (I've stated
> > so before.)
> >
> > I believe the latest QEMU RFC posting (v5) is here:
> >
> > [Qemu-devel] [RFC v5 0/6] pci_expander_brdige: support separate pci
> > domain for pxb-pcie
> >
> > http://mid.mail-archive.com/1537196258-12581-1-git-send-email-whois.zihan.yang@gmail.com
> >
> > First, I fail to see the use case where ~256 PCI bus numbers aren't
> > enough. If I strain myself, perhaps I can imagine using ~200 PCIe root
> > ports on Q35 (each of which requires a separate bus number), so that we
> > can independently hot-plug 200 devices then. And that's supposedly not
> > enough, because we want... 300? 400? A thousand? Doesn't sound realistic
> > to me. (This is not meant to be a strawman argument, I really have no
> > idea what the feature would be useful for.)
> 
> It might not be very intuitive, but it indeed exists. The very
> beginning discussion
> about 4 months ago has mentioned a possible use case, and I paste it here
> 
> - We have Ray from Intel trying to use 1000 virtio-net devices

why that many?

> - We may have a VM managing some backups (tapes), we may have a lot of these.

I'm curious; what does tape backup have to do with the number of PCI
slots/busses?

Dave

> - We may want indeed to create a nested solution as Michael mentioned.
> 
> The thread can be found in
> https://lists.gnu.org/archive/html/qemu-devel/2018-05/msg04667.html
> 
> Also, a later post from a person in Dell stated in the list that he would need
> this feature for Intel VMD In Dell EMC? I have no idea about the details,
> but since they went here for help, I guess they do can benefit from it somehow.
> 
> > Second, the v5 RFC doesn't actually address the alleged bus number
> > shortage. IIUC, it supports a low number of ECAM ranges under 4GB, but
> > those are (individually) limited in the bus number ranges they can
> > accommodate (due to 32-bit address space shortage). So more or less the
> > current approach just fragments the bus number space we already have, to
> > multiple domains.
> >
> > Third, should a subsequent iteration of the QEMU series put those extra
> > ECAMs above 4GB, with the intent to leave the enumeration of those
> > hierarchies to the "guest OS", it would present an incredible
> > implementation mess for OVMF. If people gained the ability to attach
> > storage or network to those domains, on the QEMU command line, they
> > would expect to boot off of them, using UEFI. Then OVMF would have to
> > make sure the controllers could be bound by their respective UEFI
> > drivers. That in turn would require functional config space access
> > (ECAM) at semi-random 64-bit addresses.
> 
> I'm not familiar with OVMF, so I'm afraid I don't know how to make it easier
> for OVMF, the division of 64bit space in OVMF is out of my purpose. There
> is no plan to implement it in OVMF for now, we just want to make the
> seabios/qemu patch a proof of concept.
> 
> As for the seabios, it access devices through port 0xcf8/0xcfc, which is binded
> to q35 host in qemu. If we want to change the mmconfig size of pxb-pcie
> (instead of using whole 256MB), we must know its desired size, which is passed
> as a hidden bar. Unfortunately the configuration space of pxb-pcie device
> cannot be accessed with 0xcf8/0xcfc because they are in different host bridge.
> At this time the ECAM is not configured yet, so no able to use MMIO too.
> In previous version, I tried to bind pxb host to other ports in qemu, so that we
> can use port io to access the config space of pxb-pcie, but it seems a
> little dirty,
> 
> Another issue is how seabios initialize things. It will first do pci_setup when
> things like RSDP is not loaded. It is inconvenient to retrieve MCFG table and
> other information, so we cannot infer the mmconfig addr and size from MCFG
> table in seabios.
> 
> Therefore we fall back to an alternative that we support 4x of devices as the
> first step, and let the guest os do the initialization. The inability to boot
> from devices in another domain is indeed an issue, and we don't have very
> good solution to it yet.
> 
> Things might change in the future if we can figure out a better solution, and I
> hope we can have an easier and more elegant solution in OVMF. But now
> we are just trying to give a possible solution as a poc.
> 
> Thanks
> Zihan
> 
--
Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK

Re: [SeaBIOS] [RFC v3] pciinit: setup mcfg for pxb-pcie to support multiple pci domains
Posted by Zihan Yang 5 years, 6 months ago
Dr. David Alan Gilbert <dgilbert@redhat.com> wrote on Fri, Sep 28, 2018 at 1:53 AM:
>
> * Zihan Yang (whois.zihan.yang@gmail.com) wrote:
> > HI Laszlo
> > Laszlo Ersek <lersek@redhat.com> wrote on Wed, Sep 26, 2018 at 1:17 AM:
> > >
> > > On 09/25/18 17:38, Kevin O'Connor wrote:
> > > > On Mon, Sep 17, 2018 at 11:02:59PM +0800, Zihan Yang wrote:
> > > >> To support multiple pci domains of pxb-pcie device in qemu, we need to setup
> > > >> mcfg range in seabios. We use [0x80000000, 0xb0000000) to hold new domain mcfg
> > > >> table for now, and we need to retrieve the desired mcfg size of each pxb-pcie
> > > >> from a hidden bar because they may not need the whole 256 busses, which also
> > > >> enables us to support more domains within a limited range (768MB)
> > > >
> > > > At a highlevel, this looks okay to me.  I'd like to see additional
> > > > reviews from others more familiar with the QEMU PCI code, though.
> > > >
> > > > Is the plan to do the same thing for OVMF?
> > >
> > > I remain entirely unconvinced that this feature is useful. (I've stated
> > > so before.)
> > >
> > > I believe the latest QEMU RFC posting (v5) is here:
> > >
> > > [Qemu-devel] [RFC v5 0/6] pci_expander_brdige: support separate pci
> > > domain for pxb-pcie
> > >
> > > http://mid.mail-archive.com/1537196258-12581-1-git-send-email-whois.zihan.yang@gmail.com
> > >
> > > First, I fail to see the use case where ~256 PCI bus numbers aren't
> > > enough. If I strain myself, perhaps I can imagine using ~200 PCIe root
> > > ports on Q35 (each of which requires a separate bus number), so that we
> > > can independently hot-plug 200 devices then. And that's supposedly not
> > > enough, because we want... 300? 400? A thousand? Doesn't sound realistic
> > > to me. (This is not meant to be a strawman argument, I really have no
> > > idea what the feature would be useful for.)
> >
> > It might not be very intuitive, but it indeed exists. The very
> > beginning discussion
> > about 4 months ago has mentioned a possible use case, and I paste it here
> >
> > - We have Ray from Intel trying to use 1000 virtio-net devices
>
> why that many?

I think I'll cc Marcel for the details, but I remember that he is not aware
of Intel's purpose either. A guess would be that Intel has NFV and DPDK
projects; maybe they are doing some magic on the network side? I wish I
knew, but I doubt Intel would let us know about their internal projects.

> > - We may have a VM managing some backups (tapes), we may have a lot of these.
>
> I'm curious; what does tape backup have to do with the number of PCI
> slots/busses?

I'm not very clear about how tape works in qemu, but the problem is pcie
devices under q35. The pcie topology requires one device per bus, therefore
256 buses might not be enough if we have many pcie devices. The current
pxb-pcie still resides in domain 0 and is limited by that number of 256. I
think he means these tape devices would consume all the available buses
we have in domain 0.

Thanks
Zihan

Re: [SeaBIOS] [RFC v3] pciinit: setup mcfg for pxb-pcie to support multiple pci domains
Posted by Gerd Hoffmann 5 years, 6 months ago
  Hi,

> > I'm curious; what does tape backup have to do with the number of PCI
> > slots/busses?
> 
> I'm not very clear about how tape works in qemu, but the problem is pcie
> devices under q35. The pcie topology requires one device per bus, therefore
> the 256 bus might not be enough if we have many pcie devices.

Note that you can have multifunction pcie devices, so you can have up to
8 per pcie slot, which allows close to 2000 devices.  Limitation: they
are not individually hotpluggable.
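
For example, something along these lines (IDs and addresses are just
placeholders) puts eight virtio-net functions behind one root port; the
remaining functions continue the same pattern up to addr=0x0.7:

    qemu-system-x86_64 -M q35 ... \
      -device pcie-root-port,id=rp0,chassis=1,slot=0 \
      -device virtio-net-pci,bus=rp0,addr=0x0.0,multifunction=on \
      -device virtio-net-pci,bus=rp0,addr=0x0.1 \
      -device virtio-net-pci,bus=rp0,addr=0x0.2

Function 0 must carry multifunction=on, and hot-unplug then works only at
the slot granularity, as noted above.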

cheers,
  Gerd


Re: [SeaBIOS] [RFC v3] pciinit: setup mcfg for pxb-pcie to support multiple pci domains
Posted by Kevin O'Connor 5 years, 7 months ago
On Thu, Sep 27, 2018 at 11:05:13PM +0800, Zihan Yang wrote:
> Laszlo Ersek <lersek@redhat.com> wrote on Wed, Sep 26, 2018 at 1:17 AM:
> > First, I fail to see the use case where ~256 PCI bus numbers aren't
> > enough. If I strain myself, perhaps I can imagine using ~200 PCIe root
> > ports on Q35 (each of which requires a separate bus number), so that we
> > can independently hot-plug 200 devices then. And that's supposedly not
> > enough, because we want... 300? 400? A thousand? Doesn't sound realistic
> > to me. (This is not meant to be a strawman argument, I really have no
> > idea what the feature would be useful for.)
> 
> It might not be very intuitive, but it indeed exists. The very
> beginning discussion
> about 4 months ago has mentioned a possible use case, and I paste it here
[...]
> Things might change in the future if we can figure out a better solution, and I
> hope we can have an easier and more elegant solution in OVMF. But now
> we are just trying to give a possible solution as a poc.

Thanks.  I wasn't aware this was a proof of concept.  (Nor have I been
following the discussions on the qemu list.)  I don't think it makes
sense to merge this into the main SeaBIOS repository.  The
QEMU/firmware interface is already complex and I don't think we should
complicate it further without a more concrete use case.  In
particular, it seems unclear if 256 buses is enough or if 1024 buses
is too little.

-Kevin

Re: [SeaBIOS] [RFC v3] pciinit: setup mcfg for pxb-pcie to support multiple pci domains
Posted by Zihan Yang 5 years, 6 months ago
Kevin O'Connor <kevin@koconnor.net> wrote on Fri, Sep 28, 2018 at 6:30 AM:
>
> On Thu, Sep 27, 2018 at 11:05:13PM +0800, Zihan Yang wrote:
> > Laszlo Ersek <lersek@redhat.com> wrote on Wed, Sep 26, 2018 at 1:17 AM:
> > > First, I fail to see the use case where ~256 PCI bus numbers aren't
> > > enough. If I strain myself, perhaps I can imagine using ~200 PCIe root
> > > ports on Q35 (each of which requires a separate bus number), so that we
> > > can independently hot-plug 200 devices then. And that's supposedly not
> > > enough, because we want... 300? 400? A thousand? Doesn't sound realistic
> > > to me. (This is not meant to be a strawman argument, I really have no
> > > idea what the feature would be useful for.)
> >
> > It might not be very intuitive, but it indeed exists. The very
> > beginning discussion
> > about 4 months ago has mentioned a possible use case, and I paste it here
> [...]
> > Things might change in the future if we can figure out a better solution, and I
> > hope we can have an easier and more elegant solution in OVMF. But now
> > we are just trying to give a possible solution as a poc.
>
> Thanks.  I wasn't aware this was a proof of concept.  (Nor have I been
> following the discussions on the qemu list.)  I don't think it makes
> sense to merge this into the main SeaBIOS repository.  The
> QEMU/firmware interface is already complex and I don't think we should
> complicate it further without a more concrete use case.  In
> particular, it seems unclear if 256 buses is enough or if 1024 buses
> is too little.

Yes, 1024 is indeed an arbitrary bound for now, but can you elaborate on how
concrete the use case should be? For example, do we need to know what
they are going to do with so many network/storage devices? I think
normal users rarely need so many devices, but those who really want them
might not always be willing to tell us their internal project status.

Thanks
Zihan

> -Kevin
