[Qemu-devel] [PATCH] spapr/pci: populate PCI DT in reverse order

Greg Kurz posted 1 patch 7 years, 1 month ago
Patches applied successfully (tree, apply log)
git fetch https://github.com/patchew-project/qemu tags/patchew/148776029578.5865.5785337570950575739.stgit@bahia
Test checkpatch passed
Test docker passed
Test s390x passed
hw/pci/pci.c         |   28 ++++++++++++++++++++++++++++
hw/ppc/spapr_pci.c   |   12 ++++++------
include/hw/pci/pci.h |    4 ++++
3 files changed, 38 insertions(+), 6 deletions(-)
[Qemu-devel] [PATCH] spapr/pci: populate PCI DT in reverse order
Posted by Greg Kurz 7 years, 1 month ago
From: Greg Kurz <gkurz@linux.vnet.ibm.com>

Since commit 1d2d974244c6 "spapr_pci: enumerate and add PCI device tree", QEMU
populates the PCI device tree in the opposite order compared to SLOF.

Before 1d2d974244c6:

Populating /pci@800000020000000
                     00 0000 (D) : 1af4 1000    virtio [ net ]
                     00 0800 (D) : 1af4 1001    virtio [ block ]
                     00 1000 (D) : 1af4 1009    virtio [ network ]
Populating /pci@800000020000000/unknown-legacy-device@2

7e5294b8 :  /pci@800000020000000
7e52b998 :  |-- ethernet@0
7e52c0c8 :  |-- scsi@1
7e52c7e8 :  +-- unknown-legacy-device@2 ok

Since 1d2d974244c6:

Populating /pci@800000020000000
                     00 1000 (D) : 1af4 1009    virtio [ network ]
Populating /pci@800000020000000/unknown-legacy-device@2
                     00 0800 (D) : 1af4 1001    virtio [ block ]
                     00 0000 (D) : 1af4 1000    virtio [ net ]

7e5e8118 :  /pci@800000020000000
7e5ea6a0 :  |-- unknown-legacy-device@2
7e5eadb8 :  |-- scsi@1
7e5eb4d8 :  +-- ethernet@0 ok

This behaviour change is not actually a bug since no assumptions should be
made on DT ordering. But it has no real justification either, other than
being the consequence of the way fdt_add_subnode() inserts new elements
to the front of the FDT rather than adding them to the tail.

This patch reverts to the historical SLOF ordering by walking PCI devices
in reverse order. This reconciles pseries with x86 machine types behavior.
It is expected to make things easier when porting existing applications to
power.

Signed-off-by: Greg Kurz <gkurz@linux.vnet.ibm.com>
Tested-by: Thomas Huth <thuth@redhat.com>
Reviewed-by: Nikunj A Dadhania <nikunj@linux.vnet.ibm.com>
(slight update to the changelog)
Signed-off-by: Greg Kurz <groug@kaod.org>
---
 hw/pci/pci.c         |   28 ++++++++++++++++++++++++++++
 hw/ppc/spapr_pci.c   |   12 ++++++------
 include/hw/pci/pci.h |    4 ++++
 3 files changed, 38 insertions(+), 6 deletions(-)

David,

This patch was posted and already discussed during 2.5 development:

http://patchwork.ozlabs.org/patch/549925/

The "consensus" at the time was that guests should not rely on device
ordering (i.e. use persistent naming instead).

I was recently contacted by OpenStack people who had several complaints
about the reverse ordering of PCI devices in pseries: different behavior
between ppc64 and x86, lots of time spent in debugging when porting
applications from x86 to ppc64 before realizing that it is caused by the
reverse ordering, necessity to carry hacky workarounds...

One strong argument against handling this properly with persistent naming
is that it requires systemd/udev. This option is considered painful
with CirrOS, which aims at remaining as minimal as possible and is widely
used in the OpenStack ecosystem.

Would you re-consider your position and apply this patch ?

Cheers.

diff --git a/hw/pci/pci.c b/hw/pci/pci.c
index a563555e7da7..273f1e46025a 100644
--- a/hw/pci/pci.c
+++ b/hw/pci/pci.c
@@ -1530,6 +1530,34 @@ static const pci_class_desc pci_class_descriptions[] =
     { 0, NULL}
 };
 
+static void pci_for_each_device_under_bus_reverse(PCIBus *bus,
+                                                  void (*fn)(PCIBus *b,
+                                                             PCIDevice *d,
+                                                             void *opaque),
+                                                  void *opaque)
+{
+    PCIDevice *d;
+    int devfn;
+
+    for (devfn = 0; devfn < ARRAY_SIZE(bus->devices); devfn++) {
+        d = bus->devices[ARRAY_SIZE(bus->devices) - 1 - devfn];
+        if (d) {
+            fn(bus, d, opaque);
+        }
+    }
+}
+
+void pci_for_each_device_reverse(PCIBus *bus, int bus_num,
+                         void (*fn)(PCIBus *b, PCIDevice *d, void *opaque),
+                         void *opaque)
+{
+    bus = pci_find_bus_nr(bus, bus_num);
+
+    if (bus) {
+        pci_for_each_device_under_bus_reverse(bus, fn, opaque);
+    }
+}
+
 static void pci_for_each_device_under_bus(PCIBus *bus,
                                           void (*fn)(PCIBus *b, PCIDevice *d,
                                                      void *opaque),
diff --git a/hw/ppc/spapr_pci.c b/hw/ppc/spapr_pci.c
index fd6fc1d95344..2a20c2a140fc 100644
--- a/hw/ppc/spapr_pci.c
+++ b/hw/ppc/spapr_pci.c
@@ -1782,9 +1782,9 @@ static void spapr_populate_pci_devices_dt(PCIBus *bus, PCIDevice *pdev,
     s_fdt.fdt = p->fdt;
     s_fdt.node_off = offset;
     s_fdt.sphb = p->sphb;
-    pci_for_each_device(sec_bus, pci_bus_num(sec_bus),
-                        spapr_populate_pci_devices_dt,
-                        &s_fdt);
+    pci_for_each_device_reverse(sec_bus, pci_bus_num(sec_bus),
+                                spapr_populate_pci_devices_dt,
+                                &s_fdt);
 }
 
 static void spapr_phb_pci_enumerate_bridge(PCIBus *bus, PCIDevice *pdev,
@@ -1953,9 +1953,9 @@ int spapr_populate_pci_dt(sPAPRPHBState *phb,
     s_fdt.fdt = fdt;
     s_fdt.node_off = bus_off;
     s_fdt.sphb = phb;
-    pci_for_each_device(bus, pci_bus_num(bus),
-                        spapr_populate_pci_devices_dt,
-                        &s_fdt);
+    pci_for_each_device_reverse(bus, pci_bus_num(bus),
+                                spapr_populate_pci_devices_dt,
+                                &s_fdt);
 
     ret = spapr_drc_populate_dt(fdt, bus_off, OBJECT(phb),
                                 SPAPR_DR_CONNECTOR_TYPE_PCI);
diff --git a/include/hw/pci/pci.h b/include/hw/pci/pci.h
index 6983f13745a5..9349acbfb278 100644
--- a/include/hw/pci/pci.h
+++ b/include/hw/pci/pci.h
@@ -429,6 +429,10 @@ int pci_bus_numa_node(PCIBus *bus);
 void pci_for_each_device(PCIBus *bus, int bus_num,
                          void (*fn)(PCIBus *bus, PCIDevice *d, void *opaque),
                          void *opaque);
+void pci_for_each_device_reverse(PCIBus *bus, int bus_num,
+                                 void (*fn)(PCIBus *bus, PCIDevice *d,
+                                            void *opaque),
+                                 void *opaque);
 void pci_for_each_bus_depth_first(PCIBus *bus,
                                   void *(*begin)(PCIBus *bus, void *parent_state),
                                   void (*end)(PCIBus *bus, void *state),


Re: [Qemu-devel] [PATCH] spapr/pci: populate PCI DT in reverse order
Posted by Thomas Huth 7 years, 1 month ago
On 22.02.2017 11:56, Greg Kurz wrote:
> From: Greg Kurz <gkurz@linux.vnet.ibm.com>
[...]
> This patch reverts to the historical SLOF ordering by walking PCI devices
> in reverse order. This reconciles pseries with x86 machine types behavior.
> It is expected to make things easier when porting existing applications to
> power.
[...]
> This patch was posted and already discussed during 2.5 development:
> 
> http://patchwork.ozlabs.org/patch/549925/
> 
> The "consensus" at the time was that guests should not rely on device
> ordering (i.e. use persistent naming instead).
> 
> I got recently contacted by OpenStack people who had several complaints
> about the reverse ordering of PCI devices in pseries: different behavior
> between ppc64 and x86, lots of time spent in debugging when porting
> applications from x86 to ppc64 before realizing that it is caused by the
> reverse ordering, necessity to carry hacky workarounds...
> 
> One strong argument against handling this properly with persistent naming
> is that it requires systemd/udev. This option is considered as painful
> with CirrOS, which aims at remaining as minimal as possible and is widely
> used in the OpenStack ecosystem.
> 
> Would you re-consider your position and apply this patch ?

+1 for applying the patch.

During the past months, I've also run one or two times into issues with
the reversed ordering... fortunately, I was able to work around them (or
fix other bugs triggered by this), but I think it would be better to
return to the ascending order again to avoid further future problems.

 Thomas


Re: [Qemu-devel] [PATCH] spapr/pci: populate PCI DT in reverse order
Posted by Nikunj A Dadhania 7 years, 1 month ago
Greg Kurz <groug@kaod.org> writes:

> From: Greg Kurz <gkurz@linux.vnet.ibm.com>
>
> Since commit 1d2d974244c6 "spapr_pci: enumerate and add PCI device tree", QEMU
> populates the PCI device tree in the opposite order compared to SLOF.
>
> Before 1d2d974244c6:
>
> Populating /pci@800000020000000
>                      00 0000 (D) : 1af4 1000    virtio [ net ]
>                      00 0800 (D) : 1af4 1001    virtio [ block ]
>                      00 1000 (D) : 1af4 1009    virtio [ network ]
> Populating /pci@800000020000000/unknown-legacy-device@2
>
> 7e5294b8 :  /pci@800000020000000
> 7e52b998 :  |-- ethernet@0
> 7e52c0c8 :  |-- scsi@1
> 7e52c7e8 :  +-- unknown-legacy-device@2 ok
>
> Since 1d2d974244c6:
>
> Populating /pci@800000020000000
>                      00 1000 (D) : 1af4 1009    virtio [ network ]
> Populating /pci@800000020000000/unknown-legacy-device@2
>                      00 0800 (D) : 1af4 1001    virtio [ block ]
>                      00 0000 (D) : 1af4 1000    virtio [ net ]
>
> 7e5e8118 :  /pci@800000020000000
> 7e5ea6a0 :  |-- unknown-legacy-device@2
> 7e5eadb8 :  |-- scsi@1
> 7e5eb4d8 :  +-- ethernet@0 ok
>
> This behaviour change is not actually a bug since no assumptions should be
> made on DT ordering. But it has no real justification either, other than
> being the consequence of the way fdt_add_subnode() inserts new elements
> to the front of the FDT rather than adding them to the tail.
>
> This patch reverts to the historical SLOF ordering by walking PCI devices
> in reverse order. This reconciles pseries with x86 machine types behavior.
> It is expected to make things easier when porting existing applications to
> power.
>
> Signed-off-by: Greg Kurz <gkurz@linux.vnet.ibm.com>
> Tested-by: Thomas Huth <thuth@redhat.com>
> Reviewed-by: Nikunj A Dadhania <nikunj@linux.vnet.ibm.com>
> (slight update to the changelog)
> Signed-off-by: Greg Kurz <groug@kaod.org>
> ---
>  hw/pci/pci.c         |   28 ++++++++++++++++++++++++++++
>  hw/ppc/spapr_pci.c   |   12 ++++++------
>  include/hw/pci/pci.h |    4 ++++
>  3 files changed, 38 insertions(+), 6 deletions(-)
>
> David,
>
> This patch was posted and already discussed during 2.5 development:
>
> http://patchwork.ozlabs.org/patch/549925/
>
> The "consensus" at the time was that guests should not rely on device
> ordering (i.e. use persistent naming instead).
>
> I got recently contacted by OpenStack people who had several complaints
> about the reverse ordering of PCI devices in pseries: different behavior
> between ppc64 and x86, lots of time spent in debugging when porting
> applications from x86 to ppc64 before realizing that it is caused by the
> reverse ordering, necessity to carry hacky workarounds...
>
> One strong argument against handling this properly with persistent naming
> is that it requires systemd/udev. This option is considered as painful
> with CirrOS, which aims at remaining as minimal as possible and is widely
> used in the OpenStack ecosystem.
>
> Would you re-consider your position and apply this patch ?

+1

I was the one who introduced the reverse ordering inadvertently.

Regards
Nikunj


Re: [Qemu-devel] [PATCH] spapr/pci: populate PCI DT in reverse order
Posted by Alexey Kardashevskiy 7 years, 1 month ago
On 22/02/17 21:56, Greg Kurz wrote:
> From: Greg Kurz <gkurz@linux.vnet.ibm.com>
> 
> Since commit 1d2d974244c6 "spapr_pci: enumerate and add PCI device tree", QEMU
> populates the PCI device tree in the opposite order compared to SLOF.
> 
> Before 1d2d974244c6:
> 
> Populating /pci@800000020000000
>                      00 0000 (D) : 1af4 1000    virtio [ net ]
>                      00 0800 (D) : 1af4 1001    virtio [ block ]
>                      00 1000 (D) : 1af4 1009    virtio [ network ]
> Populating /pci@800000020000000/unknown-legacy-device@2
> 
> 7e5294b8 :  /pci@800000020000000
> 7e52b998 :  |-- ethernet@0
> 7e52c0c8 :  |-- scsi@1
> 7e52c7e8 :  +-- unknown-legacy-device@2 ok
> 
> Since 1d2d974244c6:
> 
> Populating /pci@800000020000000
>                      00 1000 (D) : 1af4 1009    virtio [ network ]
> Populating /pci@800000020000000/unknown-legacy-device@2
>                      00 0800 (D) : 1af4 1001    virtio [ block ]
>                      00 0000 (D) : 1af4 1000    virtio [ net ]
> 
> 7e5e8118 :  /pci@800000020000000
> 7e5ea6a0 :  |-- unknown-legacy-device@2
> 7e5eadb8 :  |-- scsi@1
> 7e5eb4d8 :  +-- ethernet@0 ok
> 
> This behaviour change is not actually a bug since no assumptions should be
> made on DT ordering. But it has no real justification either, other than
> being the consequence of the way fdt_add_subnode() inserts new elements
> to the front of the FDT rather than adding them to the tail.
> 
> This patch reverts to the historical SLOF ordering by walking PCI devices
> in reverse order. This reconciles pseries with x86 machine types behavior.
> It is expected to make things easier when porting existing applications to
> power.
> 
> Signed-off-by: Greg Kurz <gkurz@linux.vnet.ibm.com>
> Tested-by: Thomas Huth <thuth@redhat.com>
> Reviewed-by: Nikunj A Dadhania <nikunj@linux.vnet.ibm.com>
> (slight update to the changelog)
> Signed-off-by: Greg Kurz <groug@kaod.org>
> ---
>  hw/pci/pci.c         |   28 ++++++++++++++++++++++++++++
>  hw/ppc/spapr_pci.c   |   12 ++++++------
>  include/hw/pci/pci.h |    4 ++++
>  3 files changed, 38 insertions(+), 6 deletions(-)
> 
> David,
> 
> This patch was posted and already discussed during 2.5 development:
> 
> http://patchwork.ozlabs.org/patch/549925/
> 
> The "consensus" at the time was that guests should not rely on device
> ordering (i.e. use persistent naming instead).
> 
> I got recently contacted by OpenStack people who had several complaints
> about the reverse ordering of PCI devices in pseries: different behavior
> between ppc64 and x86, lots of time spent in debugging when porting
> applications from x86 to ppc64 before realizing that it is caused by the
> reverse ordering, necessity to carry hacky workarounds...


x86 does not have a device tree, and PCI id (bus:slot:fn) is the same
regardless of the scanning order, i.e. "lspci" will show the same picture with
either order.

How could OpenStack tell the difference, and what precisely would it
need a workaround for?

I am definitely missing the point here...


> 
> One strong argument against handling this properly with persistent naming
> is that it requires systemd/udev. This option is considered as painful
> with CirrOS, which aims at remaining as minimal as possible and is widely
> used in the OpenStack ecosystem.
> 
> Would you re-consider your position and apply this patch ?
> 
> Cheers.
> 
> diff --git a/hw/pci/pci.c b/hw/pci/pci.c
> index a563555e7da7..273f1e46025a 100644
> --- a/hw/pci/pci.c
> +++ b/hw/pci/pci.c
> @@ -1530,6 +1530,34 @@ static const pci_class_desc pci_class_descriptions[] =
>      { 0, NULL}
>  };
>  
> +static void pci_for_each_device_under_bus_reverse(PCIBus *bus,
> +                                                  void (*fn)(PCIBus *b,
> +                                                             PCIDevice *d,
> +                                                             void *opaque),
> +                                                  void *opaque)
> +{
> +    PCIDevice *d;
> +    int devfn;
> +
> +    for (devfn = 0; devfn < ARRAY_SIZE(bus->devices); devfn++) {
> +        d = bus->devices[ARRAY_SIZE(bus->devices) - 1 - devfn];
> +        if (d) {
> +            fn(bus, d, opaque);
> +        }
> +    }
> +}
> +
> +void pci_for_each_device_reverse(PCIBus *bus, int bus_num,
> +                         void (*fn)(PCIBus *b, PCIDevice *d, void *opaque),
> +                         void *opaque)
> +{
> +    bus = pci_find_bus_nr(bus, bus_num);
> +
> +    if (bus) {
> +        pci_for_each_device_under_bus_reverse(bus, fn, opaque);
> +    }
> +}
> +
>  static void pci_for_each_device_under_bus(PCIBus *bus,
>                                            void (*fn)(PCIBus *b, PCIDevice *d,
>                                                       void *opaque),
> diff --git a/hw/ppc/spapr_pci.c b/hw/ppc/spapr_pci.c
> index fd6fc1d95344..2a20c2a140fc 100644
> --- a/hw/ppc/spapr_pci.c
> +++ b/hw/ppc/spapr_pci.c
> @@ -1782,9 +1782,9 @@ static void spapr_populate_pci_devices_dt(PCIBus *bus, PCIDevice *pdev,
>      s_fdt.fdt = p->fdt;
>      s_fdt.node_off = offset;
>      s_fdt.sphb = p->sphb;
> -    pci_for_each_device(sec_bus, pci_bus_num(sec_bus),
> -                        spapr_populate_pci_devices_dt,
> -                        &s_fdt);
> +    pci_for_each_device_reverse(sec_bus, pci_bus_num(sec_bus),
> +                                spapr_populate_pci_devices_dt,
> +                                &s_fdt);
>  }
>  
>  static void spapr_phb_pci_enumerate_bridge(PCIBus *bus, PCIDevice *pdev,
> @@ -1953,9 +1953,9 @@ int spapr_populate_pci_dt(sPAPRPHBState *phb,
>      s_fdt.fdt = fdt;
>      s_fdt.node_off = bus_off;
>      s_fdt.sphb = phb;
> -    pci_for_each_device(bus, pci_bus_num(bus),
> -                        spapr_populate_pci_devices_dt,
> -                        &s_fdt);
> +    pci_for_each_device_reverse(bus, pci_bus_num(bus),
> +                                spapr_populate_pci_devices_dt,
> +                                &s_fdt);
>  
>      ret = spapr_drc_populate_dt(fdt, bus_off, OBJECT(phb),
>                                  SPAPR_DR_CONNECTOR_TYPE_PCI);
> diff --git a/include/hw/pci/pci.h b/include/hw/pci/pci.h
> index 6983f13745a5..9349acbfb278 100644
> --- a/include/hw/pci/pci.h
> +++ b/include/hw/pci/pci.h
> @@ -429,6 +429,10 @@ int pci_bus_numa_node(PCIBus *bus);
>  void pci_for_each_device(PCIBus *bus, int bus_num,
>                           void (*fn)(PCIBus *bus, PCIDevice *d, void *opaque),
>                           void *opaque);
> +void pci_for_each_device_reverse(PCIBus *bus, int bus_num,
> +                                 void (*fn)(PCIBus *bus, PCIDevice *d,
> +                                            void *opaque),
> +                                 void *opaque);
>  void pci_for_each_bus_depth_first(PCIBus *bus,
>                                    void *(*begin)(PCIBus *bus, void *parent_state),
>                                    void (*end)(PCIBus *bus, void *state),
> 
> 


-- 
Alexey

Re: [Qemu-devel] [PATCH] spapr/pci: populate PCI DT in reverse order
Posted by Greg Kurz 7 years, 1 month ago
On Sat, 25 Feb 2017 20:39:18 +1100
Alexey Kardashevskiy <aik@ozlabs.ru> wrote:

> On 22/02/17 21:56, Greg Kurz wrote:
> > From: Greg Kurz <gkurz@linux.vnet.ibm.com>
> > 
> > Since commit 1d2d974244c6 "spapr_pci: enumerate and add PCI device tree", QEMU
> > populates the PCI device tree in the opposite order compared to SLOF.
> > 
> > Before 1d2d974244c6:
> > 
> > Populating /pci@800000020000000
> >                      00 0000 (D) : 1af4 1000    virtio [ net ]
> >                      00 0800 (D) : 1af4 1001    virtio [ block ]
> >                      00 1000 (D) : 1af4 1009    virtio [ network ]
> > Populating /pci@800000020000000/unknown-legacy-device@2
> > 
> > 7e5294b8 :  /pci@800000020000000
> > 7e52b998 :  |-- ethernet@0
> > 7e52c0c8 :  |-- scsi@1
> > 7e52c7e8 :  +-- unknown-legacy-device@2 ok
> > 
> > Since 1d2d974244c6:
> > 
> > Populating /pci@800000020000000
> >                      00 1000 (D) : 1af4 1009    virtio [ network ]
> > Populating /pci@800000020000000/unknown-legacy-device@2
> >                      00 0800 (D) : 1af4 1001    virtio [ block ]
> >                      00 0000 (D) : 1af4 1000    virtio [ net ]
> > 
> > 7e5e8118 :  /pci@800000020000000
> > 7e5ea6a0 :  |-- unknown-legacy-device@2
> > 7e5eadb8 :  |-- scsi@1
> > 7e5eb4d8 :  +-- ethernet@0 ok
> > 
> > This behaviour change is not actually a bug since no assumptions should be
> > made on DT ordering. But it has no real justification either, other than
> > being the consequence of the way fdt_add_subnode() inserts new elements
> > to the front of the FDT rather than adding them to the tail.
> > 
> > This patch reverts to the historical SLOF ordering by walking PCI devices
> > in reverse order. This reconciles pseries with x86 machine types behavior.
> > It is expected to make things easier when porting existing applications to
> > power.
> > 
> > Signed-off-by: Greg Kurz <gkurz@linux.vnet.ibm.com>
> > Tested-by: Thomas Huth <thuth@redhat.com>
> > Reviewed-by: Nikunj A Dadhania <nikunj@linux.vnet.ibm.com>
> > (slight update to the changelog)
> > Signed-off-by: Greg Kurz <groug@kaod.org>
> > ---
> >  hw/pci/pci.c         |   28 ++++++++++++++++++++++++++++
> >  hw/ppc/spapr_pci.c   |   12 ++++++------
> >  include/hw/pci/pci.h |    4 ++++
> >  3 files changed, 38 insertions(+), 6 deletions(-)
> > 
> > David,
> > 
> > This patch was posted and already discussed during 2.5 development:
> > 
> > http://patchwork.ozlabs.org/patch/549925/
> > 
> > The "consensus" at the time was that guests should not rely on device
> > ordering (i.e. use persistent naming instead).
> > 
> > I got recently contacted by OpenStack people who had several complaints
> > about the reverse ordering of PCI devices in pseries: different behavior
> > between ppc64 and x86, lots of time spent in debugging when porting
> > applications from x86 to ppc64 before realizing that it is caused by the
> > reverse ordering, necessity to carry hacky workarounds...  
> 
> 
> x86 does not have a device tree, and PCI id (bus:slot:fn) is the same
> regardless the scanning order, i.e. "lspci" will show the same picture with
> either order.
> 
> How could OpenStack tell the difference and require workaround for what
> precisely?
> 
> I am definitely missing the point here...
> 

NICs get probed in reverse order and are assigned different names compared
to the same setup on x86 (i.e. eth0 becomes eth1). They end up using wrong
network settings.

The same happens when using Nova libvirt driver which puts each disk on its
own PCI slot (vda becomes vdb).

This is usually avoided with persistent naming but as mentioned below, this
would require a lot of extra work, *just* because ppc64 guests don't behave
like everybody else.

> 
> > 
> > One strong argument against handling this properly with persistent naming
> > is that it requires systemd/udev. This option is considered as painful
> > with CirrOS, which aims at remaining as minimal as possible and is widely
> > used in the OpenStack ecosystem.
> > 
> > Would you re-consider your position and apply this patch ?
> > 
> > Cheers.
> > 
> > diff --git a/hw/pci/pci.c b/hw/pci/pci.c
> > index a563555e7da7..273f1e46025a 100644
> > --- a/hw/pci/pci.c
> > +++ b/hw/pci/pci.c
> > @@ -1530,6 +1530,34 @@ static const pci_class_desc pci_class_descriptions[] =
> >      { 0, NULL}
> >  };
> >  
> > +static void pci_for_each_device_under_bus_reverse(PCIBus *bus,
> > +                                                  void (*fn)(PCIBus *b,
> > +                                                             PCIDevice *d,
> > +                                                             void *opaque),
> > +                                                  void *opaque)
> > +{
> > +    PCIDevice *d;
> > +    int devfn;
> > +
> > +    for (devfn = 0; devfn < ARRAY_SIZE(bus->devices); devfn++) {
> > +        d = bus->devices[ARRAY_SIZE(bus->devices) - 1 - devfn];
> > +        if (d) {
> > +            fn(bus, d, opaque);
> > +        }
> > +    }
> > +}
> > +
> > +void pci_for_each_device_reverse(PCIBus *bus, int bus_num,
> > +                         void (*fn)(PCIBus *b, PCIDevice *d, void *opaque),
> > +                         void *opaque)
> > +{
> > +    bus = pci_find_bus_nr(bus, bus_num);
> > +
> > +    if (bus) {
> > +        pci_for_each_device_under_bus_reverse(bus, fn, opaque);
> > +    }
> > +}
> > +
> >  static void pci_for_each_device_under_bus(PCIBus *bus,
> >                                            void (*fn)(PCIBus *b, PCIDevice *d,
> >                                                       void *opaque),
> > diff --git a/hw/ppc/spapr_pci.c b/hw/ppc/spapr_pci.c
> > index fd6fc1d95344..2a20c2a140fc 100644
> > --- a/hw/ppc/spapr_pci.c
> > +++ b/hw/ppc/spapr_pci.c
> > @@ -1782,9 +1782,9 @@ static void spapr_populate_pci_devices_dt(PCIBus *bus, PCIDevice *pdev,
> >      s_fdt.fdt = p->fdt;
> >      s_fdt.node_off = offset;
> >      s_fdt.sphb = p->sphb;
> > -    pci_for_each_device(sec_bus, pci_bus_num(sec_bus),
> > -                        spapr_populate_pci_devices_dt,
> > -                        &s_fdt);
> > +    pci_for_each_device_reverse(sec_bus, pci_bus_num(sec_bus),
> > +                                spapr_populate_pci_devices_dt,
> > +                                &s_fdt);
> >  }
> >  
> >  static void spapr_phb_pci_enumerate_bridge(PCIBus *bus, PCIDevice *pdev,
> > @@ -1953,9 +1953,9 @@ int spapr_populate_pci_dt(sPAPRPHBState *phb,
> >      s_fdt.fdt = fdt;
> >      s_fdt.node_off = bus_off;
> >      s_fdt.sphb = phb;
> > -    pci_for_each_device(bus, pci_bus_num(bus),
> > -                        spapr_populate_pci_devices_dt,
> > -                        &s_fdt);
> > +    pci_for_each_device_reverse(bus, pci_bus_num(bus),
> > +                                spapr_populate_pci_devices_dt,
> > +                                &s_fdt);
> >  
> >      ret = spapr_drc_populate_dt(fdt, bus_off, OBJECT(phb),
> >                                  SPAPR_DR_CONNECTOR_TYPE_PCI);
> > diff --git a/include/hw/pci/pci.h b/include/hw/pci/pci.h
> > index 6983f13745a5..9349acbfb278 100644
> > --- a/include/hw/pci/pci.h
> > +++ b/include/hw/pci/pci.h
> > @@ -429,6 +429,10 @@ int pci_bus_numa_node(PCIBus *bus);
> >  void pci_for_each_device(PCIBus *bus, int bus_num,
> >                           void (*fn)(PCIBus *bus, PCIDevice *d, void *opaque),
> >                           void *opaque);
> > +void pci_for_each_device_reverse(PCIBus *bus, int bus_num,
> > +                                 void (*fn)(PCIBus *bus, PCIDevice *d,
> > +                                            void *opaque),
> > +                                 void *opaque);
> >  void pci_for_each_bus_depth_first(PCIBus *bus,
> >                                    void *(*begin)(PCIBus *bus, void *parent_state),
> >                                    void (*end)(PCIBus *bus, void *state),
> > 
> >   
> 
> 

Re: [Qemu-devel] [PATCH] spapr/pci: populate PCI DT in reverse order
Posted by Alexey Kardashevskiy 7 years ago
On 25/02/17 21:40, Greg Kurz wrote:
> On Sat, 25 Feb 2017 20:39:18 +1100
> Alexey Kardashevskiy <aik@ozlabs.ru> wrote:
> 
>> On 22/02/17 21:56, Greg Kurz wrote:
>>> From: Greg Kurz <gkurz@linux.vnet.ibm.com>
>>>
>>> Since commit 1d2d974244c6 "spapr_pci: enumerate and add PCI device tree", QEMU
>>> populates the PCI device tree in the opposite order compared to SLOF.
>>>
>>> Before 1d2d974244c6:
>>>
>>> Populating /pci@800000020000000
>>>                      00 0000 (D) : 1af4 1000    virtio [ net ]
>>>                      00 0800 (D) : 1af4 1001    virtio [ block ]
>>>                      00 1000 (D) : 1af4 1009    virtio [ network ]
>>> Populating /pci@800000020000000/unknown-legacy-device@2
>>>
>>> 7e5294b8 :  /pci@800000020000000
>>> 7e52b998 :  |-- ethernet@0
>>> 7e52c0c8 :  |-- scsi@1
>>> 7e52c7e8 :  +-- unknown-legacy-device@2 ok
>>>
>>> Since 1d2d974244c6:
>>>
>>> Populating /pci@800000020000000
>>>                      00 1000 (D) : 1af4 1009    virtio [ network ]
>>> Populating /pci@800000020000000/unknown-legacy-device@2
>>>                      00 0800 (D) : 1af4 1001    virtio [ block ]
>>>                      00 0000 (D) : 1af4 1000    virtio [ net ]
>>>
>>> 7e5e8118 :  /pci@800000020000000
>>> 7e5ea6a0 :  |-- unknown-legacy-device@2
>>> 7e5eadb8 :  |-- scsi@1
>>> 7e5eb4d8 :  +-- ethernet@0 ok
>>>
>>> This behaviour change is not actually a bug since no assumptions should be
>>> made on DT ordering. But it has no real justification either, other than
>>> being the consequence of the way fdt_add_subnode() inserts new elements
>>> to the front of the FDT rather than adding them to the tail.
>>>
>>> This patch reverts to the historical SLOF ordering by walking PCI devices
>>> in reverse order. This reconciles pseries with x86 machine types behavior.
>>> It is expected to make things easier when porting existing applications to
>>> power.
>>>
>>> Signed-off-by: Greg Kurz <gkurz@linux.vnet.ibm.com>
>>> Tested-by: Thomas Huth <thuth@redhat.com>
>>> Reviewed-by: Nikunj A Dadhania <nikunj@linux.vnet.ibm.com>
>>> (slight update to the changelog)
>>> Signed-off-by: Greg Kurz <groug@kaod.org>
>>> ---
>>>  hw/pci/pci.c         |   28 ++++++++++++++++++++++++++++
>>>  hw/ppc/spapr_pci.c   |   12 ++++++------
>>>  include/hw/pci/pci.h |    4 ++++
>>>  3 files changed, 38 insertions(+), 6 deletions(-)
>>>
>>> David,
>>>
>>> This patch was posted and already discussed during 2.5 development:
>>>
>>> http://patchwork.ozlabs.org/patch/549925/
>>>
>>> The "consensus" at the time was that guests should not rely on device
>>> ordering (i.e. use persistent naming instead).
>>>
>>> I got recently contacted by OpenStack people who had several complaints
>>> about the reverse ordering of PCI devices in pseries: different behavior
>>> between ppc64 and x86, lots of time spent in debugging when porting
>>> applications from x86 to ppc64 before realizing that it is caused by the
>>> reverse ordering, necessity to carry hacky workarounds...  
>>
>>
>> x86 does not have a device tree, and PCI id (bus:slot:fn) is the same
>> regardless the scanning order, i.e. "lspci" will show the same picture with
>> either order.
>>
>> How could OpenStack tell the difference and require workaround for what
>> precisely?
>>
>> I am definitely missing the point here...
>>
> 
> NICs get probed in reverse order and are assigned different names compared
> to the same setup on x86 (i.e. eth0 becomes eth1). They end up using wrong
> network settings.


The answer I was looking for is that the guest probes devices in the order
given by the device tree rather than doing a PCI scan itself, and this is why
the order in the device tree matters :)

+1 for the change.


-- 
Alexey

Re: [Qemu-devel] [Qemu-ppc] [PATCH] spapr/pci: populate PCI DT in reverse order
Posted by Greg Kurz 7 years ago
David,

Any chance of having this in 2.9?

On Wed, 22 Feb 2017 11:56:53 +0100
Greg Kurz <groug@kaod.org> wrote:

> From: Greg Kurz <gkurz@linux.vnet.ibm.com>
> 
> Since commit 1d2d974244c6 "spapr_pci: enumerate and add PCI device tree", QEMU
> populates the PCI device tree in the opposite order compared to SLOF.
> 
> Before 1d2d974244c6:
> 
> Populating /pci@800000020000000
>                      00 0000 (D) : 1af4 1000    virtio [ net ]
>                      00 0800 (D) : 1af4 1001    virtio [ block ]
>                      00 1000 (D) : 1af4 1009    virtio [ network ]
> Populating /pci@800000020000000/unknown-legacy-device@2
> 
> 7e5294b8 :  /pci@800000020000000
> 7e52b998 :  |-- ethernet@0
> 7e52c0c8 :  |-- scsi@1
> 7e52c7e8 :  +-- unknown-legacy-device@2 ok
> 
> Since 1d2d974244c6:
> 
> Populating /pci@800000020000000
>                      00 1000 (D) : 1af4 1009    virtio [ network ]
> Populating /pci@800000020000000/unknown-legacy-device@2
>                      00 0800 (D) : 1af4 1001    virtio [ block ]
>                      00 0000 (D) : 1af4 1000    virtio [ net ]
> 
> 7e5e8118 :  /pci@800000020000000
> 7e5ea6a0 :  |-- unknown-legacy-device@2
> 7e5eadb8 :  |-- scsi@1
> 7e5eb4d8 :  +-- ethernet@0 ok
> 
> This behaviour change is not actually a bug since no assumptions should be
> made on DT ordering. But it has no real justification either, other than
> being the consequence of the way fdt_add_subnode() inserts new elements
> to the front of the FDT rather than adding them to the tail.
> 
> This patch reverts to the historical SLOF ordering by walking PCI devices
> in reverse order. This reconciles pseries with x86 machine types behavior.
> It is expected to make things easier when porting existing applications to
> power.
> 
> Signed-off-by: Greg Kurz <gkurz@linux.vnet.ibm.com>
> Tested-by: Thomas Huth <thuth@redhat.com>
> Reviewed-by: Nikunj A Dadhania <nikunj@linux.vnet.ibm.com>
> (slight update to the changelog)
> Signed-off-by: Greg Kurz <groug@kaod.org>
> ---
>  hw/pci/pci.c         |   28 ++++++++++++++++++++++++++++
>  hw/ppc/spapr_pci.c   |   12 ++++++------
>  include/hw/pci/pci.h |    4 ++++
>  3 files changed, 38 insertions(+), 6 deletions(-)
> 
> David,
> 
> This patch was posted and already discussed during 2.5 development:
> 
> http://patchwork.ozlabs.org/patch/549925/
> 
> The "consensus" at the time was that guests should not rely on device
> ordering (i.e. use persistent naming instead).
> 
> I got recently contacted by OpenStack people who had several complaints
> about the reverse ordering of PCI devices in pseries: different behavior
> between ppc64 and x86, lots of time spent in debugging when porting
> applications from x86 to ppc64 before realizing that it is caused by the
> reverse ordering, necessity to carry hacky workarounds...
> 
> One strong argument against handling this properly with persistent naming
> is that it requires systemd/udev. This option is considered as painful
> with CirrOS, which aims at remaining as minimal as possible and is widely
> used in the OpenStack ecosystem.
> 
> Would you re-consider your position and apply this patch ?
> 
> Cheers.
> 
> diff --git a/hw/pci/pci.c b/hw/pci/pci.c
> index a563555e7da7..273f1e46025a 100644
> --- a/hw/pci/pci.c
> +++ b/hw/pci/pci.c
> @@ -1530,6 +1530,34 @@ static const pci_class_desc pci_class_descriptions[] =
>      { 0, NULL}
>  };
>  
> +static void pci_for_each_device_under_bus_reverse(PCIBus *bus,
> +                                                  void (*fn)(PCIBus *b,
> +                                                             PCIDevice *d,
> +                                                             void *opaque),
> +                                                  void *opaque)
> +{
> +    PCIDevice *d;
> +    int devfn;
> +
> +    for (devfn = 0; devfn < ARRAY_SIZE(bus->devices); devfn++) {
> +        d = bus->devices[ARRAY_SIZE(bus->devices) - 1 - devfn];
> +        if (d) {
> +            fn(bus, d, opaque);
> +        }
> +    }
> +}
> +
> +void pci_for_each_device_reverse(PCIBus *bus, int bus_num,
> +                         void (*fn)(PCIBus *b, PCIDevice *d, void *opaque),
> +                         void *opaque)
> +{
> +    bus = pci_find_bus_nr(bus, bus_num);
> +
> +    if (bus) {
> +        pci_for_each_device_under_bus_reverse(bus, fn, opaque);
> +    }
> +}
> +
>  static void pci_for_each_device_under_bus(PCIBus *bus,
>                                            void (*fn)(PCIBus *b, PCIDevice *d,
>                                                       void *opaque),
> diff --git a/hw/ppc/spapr_pci.c b/hw/ppc/spapr_pci.c
> index fd6fc1d95344..2a20c2a140fc 100644
> --- a/hw/ppc/spapr_pci.c
> +++ b/hw/ppc/spapr_pci.c
> @@ -1782,9 +1782,9 @@ static void spapr_populate_pci_devices_dt(PCIBus *bus, PCIDevice *pdev,
>      s_fdt.fdt = p->fdt;
>      s_fdt.node_off = offset;
>      s_fdt.sphb = p->sphb;
> -    pci_for_each_device(sec_bus, pci_bus_num(sec_bus),
> -                        spapr_populate_pci_devices_dt,
> -                        &s_fdt);
> +    pci_for_each_device_reverse(sec_bus, pci_bus_num(sec_bus),
> +                                spapr_populate_pci_devices_dt,
> +                                &s_fdt);
>  }
>  
>  static void spapr_phb_pci_enumerate_bridge(PCIBus *bus, PCIDevice *pdev,
> @@ -1953,9 +1953,9 @@ int spapr_populate_pci_dt(sPAPRPHBState *phb,
>      s_fdt.fdt = fdt;
>      s_fdt.node_off = bus_off;
>      s_fdt.sphb = phb;
> -    pci_for_each_device(bus, pci_bus_num(bus),
> -                        spapr_populate_pci_devices_dt,
> -                        &s_fdt);
> +    pci_for_each_device_reverse(bus, pci_bus_num(bus),
> +                                spapr_populate_pci_devices_dt,
> +                                &s_fdt);
>  
>      ret = spapr_drc_populate_dt(fdt, bus_off, OBJECT(phb),
>                                  SPAPR_DR_CONNECTOR_TYPE_PCI);
> diff --git a/include/hw/pci/pci.h b/include/hw/pci/pci.h
> index 6983f13745a5..9349acbfb278 100644
> --- a/include/hw/pci/pci.h
> +++ b/include/hw/pci/pci.h
> @@ -429,6 +429,10 @@ int pci_bus_numa_node(PCIBus *bus);
>  void pci_for_each_device(PCIBus *bus, int bus_num,
>                           void (*fn)(PCIBus *bus, PCIDevice *d, void *opaque),
>                           void *opaque);
> +void pci_for_each_device_reverse(PCIBus *bus, int bus_num,
> +                                 void (*fn)(PCIBus *bus, PCIDevice *d,
> +                                            void *opaque),
> +                                 void *opaque);
>  void pci_for_each_bus_depth_first(PCIBus *bus,
>                                    void *(*begin)(PCIBus *bus, void *parent_state),
>                                    void (*end)(PCIBus *bus, void *state),
> 
> 

Re: [Qemu-devel] [Qemu-ppc] [PATCH] spapr/pci: populate PCI DT in reverse order
Posted by David Gibson 7 years ago
On Mon, Feb 27, 2017 at 11:20:16PM +0100, Greg Kurz wrote:
> David,
> 
> Any chances to have this in 2.9 ?

Yes.  I've put it in my tree and I'm hoping to send a pull request
shortly.  Actually, I was hoping to send it yesterday, but hit a bug
during testing which I'm now chasing.

> 
> On Wed, 22 Feb 2017 11:56:53 +0100
> Greg Kurz <groug@kaod.org> wrote:
> 
> > From: Greg Kurz <gkurz@linux.vnet.ibm.com>
> > 
> > Since commit 1d2d974244c6 "spapr_pci: enumerate and add PCI device tree", QEMU
> > populates the PCI device tree in the opposite order compared to SLOF.
> > 
> > Before 1d2d974244c6:
> > 
> > Populating /pci@800000020000000
> >                      00 0000 (D) : 1af4 1000    virtio [ net ]
> >                      00 0800 (D) : 1af4 1001    virtio [ block ]
> >                      00 1000 (D) : 1af4 1009    virtio [ network ]
> > Populating /pci@800000020000000/unknown-legacy-device@2
> > 
> > 7e5294b8 :  /pci@800000020000000
> > 7e52b998 :  |-- ethernet@0
> > 7e52c0c8 :  |-- scsi@1
> > 7e52c7e8 :  +-- unknown-legacy-device@2 ok
> > 
> > Since 1d2d974244c6:
> > 
> > Populating /pci@800000020000000
> >                      00 1000 (D) : 1af4 1009    virtio [ network ]
> > Populating /pci@800000020000000/unknown-legacy-device@2
> >                      00 0800 (D) : 1af4 1001    virtio [ block ]
> >                      00 0000 (D) : 1af4 1000    virtio [ net ]
> > 
> > 7e5e8118 :  /pci@800000020000000
> > 7e5ea6a0 :  |-- unknown-legacy-device@2
> > 7e5eadb8 :  |-- scsi@1
> > 7e5eb4d8 :  +-- ethernet@0 ok
> > 
> > This behaviour change is not actually a bug since no assumptions should be
> > made on DT ordering. But it has no real justification either, other than
> > being the consequence of the way fdt_add_subnode() inserts new elements
> > to the front of the FDT rather than adding them to the tail.
> > 
> > This patch reverts to the historical SLOF ordering by walking PCI devices
> > in reverse order. This reconciles pseries with x86 machine types behavior.
> > It is expected to make things easier when porting existing applications to
> > power.
> > 
> > Signed-off-by: Greg Kurz <gkurz@linux.vnet.ibm.com>
> > Tested-by: Thomas Huth <thuth@redhat.com>
> > Reviewed-by: Nikunj A Dadhania <nikunj@linux.vnet.ibm.com>
> > (slight update to the changelog)
> > Signed-off-by: Greg Kurz <groug@kaod.org>
> > ---
> >  hw/pci/pci.c         |   28 ++++++++++++++++++++++++++++
> >  hw/ppc/spapr_pci.c   |   12 ++++++------
> >  include/hw/pci/pci.h |    4 ++++
> >  3 files changed, 38 insertions(+), 6 deletions(-)
> > 
> > David,
> > 
> > This patch was posted and already discussed during 2.5 development:
> > 
> > http://patchwork.ozlabs.org/patch/549925/
> > 
> > The "consensus" at the time was that guests should not rely on device
> > ordering (i.e. use persistent naming instead).
> > 
> > I got recently contacted by OpenStack people who had several complaints
> > about the reverse ordering of PCI devices in pseries: different behavior
> > between ppc64 and x86, lots of time spent in debugging when porting
> > applications from x86 to ppc64 before realizing that it is caused by the
> > reverse ordering, necessity to carry hacky workarounds...
> > 
> > One strong argument against handling this properly with persistent naming
> > is that it requires systemd/udev. This option is considered as painful
> > with CirrOS, which aims at remaining as minimal as possible and is widely
> > used in the OpenStack ecosystem.
> > 
> > Would you re-consider your position and apply this patch ?
> > 
> > Cheers.
> > 
> > diff --git a/hw/pci/pci.c b/hw/pci/pci.c
> > index a563555e7da7..273f1e46025a 100644
> > --- a/hw/pci/pci.c
> > +++ b/hw/pci/pci.c
> > @@ -1530,6 +1530,34 @@ static const pci_class_desc pci_class_descriptions[] =
> >      { 0, NULL}
> >  };
> >  
> > +static void pci_for_each_device_under_bus_reverse(PCIBus *bus,
> > +                                                  void (*fn)(PCIBus *b,
> > +                                                             PCIDevice *d,
> > +                                                             void *opaque),
> > +                                                  void *opaque)
> > +{
> > +    PCIDevice *d;
> > +    int devfn;
> > +
> > +    for (devfn = 0; devfn < ARRAY_SIZE(bus->devices); devfn++) {
> > +        d = bus->devices[ARRAY_SIZE(bus->devices) - 1 - devfn];
> > +        if (d) {
> > +            fn(bus, d, opaque);
> > +        }
> > +    }
> > +}
> > +
> > +void pci_for_each_device_reverse(PCIBus *bus, int bus_num,
> > +                         void (*fn)(PCIBus *b, PCIDevice *d, void *opaque),
> > +                         void *opaque)
> > +{
> > +    bus = pci_find_bus_nr(bus, bus_num);
> > +
> > +    if (bus) {
> > +        pci_for_each_device_under_bus_reverse(bus, fn, opaque);
> > +    }
> > +}
> > +
> >  static void pci_for_each_device_under_bus(PCIBus *bus,
> >                                            void (*fn)(PCIBus *b, PCIDevice *d,
> >                                                       void *opaque),
> > diff --git a/hw/ppc/spapr_pci.c b/hw/ppc/spapr_pci.c
> > index fd6fc1d95344..2a20c2a140fc 100644
> > --- a/hw/ppc/spapr_pci.c
> > +++ b/hw/ppc/spapr_pci.c
> > @@ -1782,9 +1782,9 @@ static void spapr_populate_pci_devices_dt(PCIBus *bus, PCIDevice *pdev,
> >      s_fdt.fdt = p->fdt;
> >      s_fdt.node_off = offset;
> >      s_fdt.sphb = p->sphb;
> > -    pci_for_each_device(sec_bus, pci_bus_num(sec_bus),
> > -                        spapr_populate_pci_devices_dt,
> > -                        &s_fdt);
> > +    pci_for_each_device_reverse(sec_bus, pci_bus_num(sec_bus),
> > +                                spapr_populate_pci_devices_dt,
> > +                                &s_fdt);
> >  }
> >  
> >  static void spapr_phb_pci_enumerate_bridge(PCIBus *bus, PCIDevice *pdev,
> > @@ -1953,9 +1953,9 @@ int spapr_populate_pci_dt(sPAPRPHBState *phb,
> >      s_fdt.fdt = fdt;
> >      s_fdt.node_off = bus_off;
> >      s_fdt.sphb = phb;
> > -    pci_for_each_device(bus, pci_bus_num(bus),
> > -                        spapr_populate_pci_devices_dt,
> > -                        &s_fdt);
> > +    pci_for_each_device_reverse(bus, pci_bus_num(bus),
> > +                                spapr_populate_pci_devices_dt,
> > +                                &s_fdt);
> >  
> >      ret = spapr_drc_populate_dt(fdt, bus_off, OBJECT(phb),
> >                                  SPAPR_DR_CONNECTOR_TYPE_PCI);
> > diff --git a/include/hw/pci/pci.h b/include/hw/pci/pci.h
> > index 6983f13745a5..9349acbfb278 100644
> > --- a/include/hw/pci/pci.h
> > +++ b/include/hw/pci/pci.h
> > @@ -429,6 +429,10 @@ int pci_bus_numa_node(PCIBus *bus);
> >  void pci_for_each_device(PCIBus *bus, int bus_num,
> >                           void (*fn)(PCIBus *bus, PCIDevice *d, void *opaque),
> >                           void *opaque);
> > +void pci_for_each_device_reverse(PCIBus *bus, int bus_num,
> > +                                 void (*fn)(PCIBus *bus, PCIDevice *d,
> > +                                            void *opaque),
> > +                                 void *opaque);
> >  void pci_for_each_bus_depth_first(PCIBus *bus,
> >                                    void *(*begin)(PCIBus *bus, void *parent_state),
> >                                    void (*end)(PCIBus *bus, void *state),
> > 
> > 
> 



-- 
David Gibson			| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au	| minimalist, thank you.  NOT _the_ _other_
				| _way_ _around_!
http://www.ozlabs.org/~dgibson
Re: [Qemu-devel] [PATCH] spapr/pci: populate PCI DT in reverse order
Posted by David Gibson 7 years ago
On Wed, Feb 22, 2017 at 11:56:53AM +0100, Greg Kurz wrote:
> From: Greg Kurz <gkurz@linux.vnet.ibm.com>
> 
> Since commit 1d2d974244c6 "spapr_pci: enumerate and add PCI device tree", QEMU
> populates the PCI device tree in the opposite order compared to SLOF.
> 
> Before 1d2d974244c6:
> 
> Populating /pci@800000020000000
>                      00 0000 (D) : 1af4 1000    virtio [ net ]
>                      00 0800 (D) : 1af4 1001    virtio [ block ]
>                      00 1000 (D) : 1af4 1009    virtio [ network ]
> Populating /pci@800000020000000/unknown-legacy-device@2
> 
> 7e5294b8 :  /pci@800000020000000
> 7e52b998 :  |-- ethernet@0
> 7e52c0c8 :  |-- scsi@1
> 7e52c7e8 :  +-- unknown-legacy-device@2 ok
> 
> Since 1d2d974244c6:
> 
> Populating /pci@800000020000000
>                      00 1000 (D) : 1af4 1009    virtio [ network ]
> Populating /pci@800000020000000/unknown-legacy-device@2
>                      00 0800 (D) : 1af4 1001    virtio [ block ]
>                      00 0000 (D) : 1af4 1000    virtio [ net ]
> 
> 7e5e8118 :  /pci@800000020000000
> 7e5ea6a0 :  |-- unknown-legacy-device@2
> 7e5eadb8 :  |-- scsi@1
> 7e5eb4d8 :  +-- ethernet@0 ok
> 
> This behaviour change is not actually a bug since no assumptions should be
> made on DT ordering. But it has no real justification either, other than
> being the consequence of the way fdt_add_subnode() inserts new elements
> to the front of the FDT rather than adding them to the tail.
> 
> This patch reverts to the historical SLOF ordering by walking PCI devices
> in reverse order. This reconciles pseries with x86 machine types behavior.
> It is expected to make things easier when porting existing applications to
> power.
> 
> Signed-off-by: Greg Kurz <gkurz@linux.vnet.ibm.com>
> Tested-by: Thomas Huth <thuth@redhat.com>
> Reviewed-by: Nikunj A Dadhania <nikunj@linux.vnet.ibm.com>
> (slight update to the changelog)
> Signed-off-by: Greg Kurz <groug@kaod.org>
> ---
>  hw/pci/pci.c         |   28 ++++++++++++++++++++++++++++
>  hw/ppc/spapr_pci.c   |   12 ++++++------
>  include/hw/pci/pci.h |    4 ++++
>  3 files changed, 38 insertions(+), 6 deletions(-)
> 
> David,
> 
> This patch was posted and already discussed during 2.5 development:
> 
> http://patchwork.ozlabs.org/patch/549925/
> 
> The "consensus" at the time was that guests should not rely on device
> ordering (i.e. use persistent naming instead).
> 
> I got recently contacted by OpenStack people who had several complaints
> about the reverse ordering of PCI devices in pseries: different behavior
> between ppc64 and x86, lots of time spent in debugging when porting
> applications from x86 to ppc64 before realizing that it is caused by the
> reverse ordering, necessity to carry hacky workarounds...
> 
> One strong argument against handling this properly with persistent naming
> is that it requires systemd/udev. This option is considered as painful
> with CirrOS, which aims at remaining as minimal as possible and is widely
> used in the OpenStack ecosystem.
> 
> Would you re-consider your position and apply this patch ?

As it happens, I'd thought about this from time to time already, and
concluded that (re-)reversing the DT order was probably the least bad
approach.

So, applied to ppc-for-2.9.

> 
> Cheers.
> 
> diff --git a/hw/pci/pci.c b/hw/pci/pci.c
> index a563555e7da7..273f1e46025a 100644
> --- a/hw/pci/pci.c
> +++ b/hw/pci/pci.c
> @@ -1530,6 +1530,34 @@ static const pci_class_desc pci_class_descriptions[] =
>      { 0, NULL}
>  };
>  
> +static void pci_for_each_device_under_bus_reverse(PCIBus *bus,
> +                                                  void (*fn)(PCIBus *b,
> +                                                             PCIDevice *d,
> +                                                             void *opaque),
> +                                                  void *opaque)
> +{
> +    PCIDevice *d;
> +    int devfn;
> +
> +    for (devfn = 0; devfn < ARRAY_SIZE(bus->devices); devfn++) {
> +        d = bus->devices[ARRAY_SIZE(bus->devices) - 1 - devfn];
> +        if (d) {
> +            fn(bus, d, opaque);
> +        }
> +    }
> +}
> +
> +void pci_for_each_device_reverse(PCIBus *bus, int bus_num,
> +                         void (*fn)(PCIBus *b, PCIDevice *d, void *opaque),
> +                         void *opaque)
> +{
> +    bus = pci_find_bus_nr(bus, bus_num);
> +
> +    if (bus) {
> +        pci_for_each_device_under_bus_reverse(bus, fn, opaque);
> +    }
> +}
> +
>  static void pci_for_each_device_under_bus(PCIBus *bus,
>                                            void (*fn)(PCIBus *b, PCIDevice *d,
>                                                       void *opaque),
> diff --git a/hw/ppc/spapr_pci.c b/hw/ppc/spapr_pci.c
> index fd6fc1d95344..2a20c2a140fc 100644
> --- a/hw/ppc/spapr_pci.c
> +++ b/hw/ppc/spapr_pci.c
> @@ -1782,9 +1782,9 @@ static void spapr_populate_pci_devices_dt(PCIBus *bus, PCIDevice *pdev,
>      s_fdt.fdt = p->fdt;
>      s_fdt.node_off = offset;
>      s_fdt.sphb = p->sphb;
> -    pci_for_each_device(sec_bus, pci_bus_num(sec_bus),
> -                        spapr_populate_pci_devices_dt,
> -                        &s_fdt);
> +    pci_for_each_device_reverse(sec_bus, pci_bus_num(sec_bus),
> +                                spapr_populate_pci_devices_dt,
> +                                &s_fdt);
>  }
>  
>  static void spapr_phb_pci_enumerate_bridge(PCIBus *bus, PCIDevice *pdev,
> @@ -1953,9 +1953,9 @@ int spapr_populate_pci_dt(sPAPRPHBState *phb,
>      s_fdt.fdt = fdt;
>      s_fdt.node_off = bus_off;
>      s_fdt.sphb = phb;
> -    pci_for_each_device(bus, pci_bus_num(bus),
> -                        spapr_populate_pci_devices_dt,
> -                        &s_fdt);
> +    pci_for_each_device_reverse(bus, pci_bus_num(bus),
> +                                spapr_populate_pci_devices_dt,
> +                                &s_fdt);
>  
>      ret = spapr_drc_populate_dt(fdt, bus_off, OBJECT(phb),
>                                  SPAPR_DR_CONNECTOR_TYPE_PCI);
> diff --git a/include/hw/pci/pci.h b/include/hw/pci/pci.h
> index 6983f13745a5..9349acbfb278 100644
> --- a/include/hw/pci/pci.h
> +++ b/include/hw/pci/pci.h
> @@ -429,6 +429,10 @@ int pci_bus_numa_node(PCIBus *bus);
>  void pci_for_each_device(PCIBus *bus, int bus_num,
>                           void (*fn)(PCIBus *bus, PCIDevice *d, void *opaque),
>                           void *opaque);
> +void pci_for_each_device_reverse(PCIBus *bus, int bus_num,
> +                                 void (*fn)(PCIBus *bus, PCIDevice *d,
> +                                            void *opaque),
> +                                 void *opaque);
>  void pci_for_each_bus_depth_first(PCIBus *bus,
>                                    void *(*begin)(PCIBus *bus, void *parent_state),
>                                    void (*end)(PCIBus *bus, void *state),
> 

-- 
David Gibson			| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au	| minimalist, thank you.  NOT _the_ _other_
				| _way_ _around_!
http://www.ozlabs.org/~dgibson