[Qemu-devel] [PATCH] spapr/irq: Only claim VALID interrupts at the KVM level

Cédric Le Goater posted 1 patch 4 years, 7 months ago
Test docker-clang@ubuntu passed
Test FreeBSD passed
Test checkpatch passed
Test docker-quick@centos7 passed
Patches applied successfully (tree, apply log)
git fetch https://github.com/patchew-project/qemu tags/patchew/20190910061326.25366-1-clg@kaod.org
Maintainers: "Cédric Le Goater" <clg@kaod.org>, David Gibson <david@gibson.dropbear.id.au>
include/hw/ppc/spapr.h      |  1 +
include/hw/ppc/spapr_xive.h |  1 +
include/hw/ppc/xics.h       |  6 ++++++
hw/intc/spapr_xive.c        |  1 +
hw/intc/spapr_xive_kvm.c    | 34 +++++++++++++++++++++++++++++++---
hw/intc/xics.c              |  1 +
hw/intc/xics_kvm.c          | 14 ++++++++++++++
hw/ppc/spapr.c              |  1 +
hw/ppc/spapr_irq.c          | 13 +++++++------
9 files changed, 63 insertions(+), 9 deletions(-)
[Qemu-devel] [PATCH] spapr/irq: Only claim VALID interrupts at the KVM level
Posted by Cédric Le Goater 4 years, 7 months ago
A typical pseries VM with 16 vCPUs, one disk, one network adapater
uses less than 100 interrupts but the whole IRQ number space of the
QEMU machine is allocated at reset time and it is 8K wide. This is
wasting considerably the global IRQ space of the overall system which
has 1M interrupts per socket on a POWER9.

To optimise the HW resources, only request at the KVM level interrupts
which have been claimed. This will help up increase the maximum number
of VMs per system.

To keep migration compatibility, we introduce a machine class
attribute to adapt the reset behavior on older pseries machines.

Signed-off-by: Cédric Le Goater <clg@kaod.org>
---
 include/hw/ppc/spapr.h      |  1 +
 include/hw/ppc/spapr_xive.h |  1 +
 include/hw/ppc/xics.h       |  6 ++++++
 hw/intc/spapr_xive.c        |  1 +
 hw/intc/spapr_xive_kvm.c    | 34 +++++++++++++++++++++++++++++++---
 hw/intc/xics.c              |  1 +
 hw/intc/xics_kvm.c          | 14 ++++++++++++++
 hw/ppc/spapr.c              |  1 +
 hw/ppc/spapr_irq.c          | 13 +++++++------
 9 files changed, 63 insertions(+), 9 deletions(-)

diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h
index 03111fd55bc8..6c622954a60c 100644
--- a/include/hw/ppc/spapr.h
+++ b/include/hw/ppc/spapr.h
@@ -131,6 +131,7 @@ struct SpaprMachineClass {
     SpaprResizeHpt resize_hpt_default;
     SpaprCapabilities default_caps;
     SpaprIrq *irq;
+    bool irq_reset_all;
 };
 
 /**
diff --git a/include/hw/ppc/spapr_xive.h b/include/hw/ppc/spapr_xive.h
index bfd40f01d882..b33913eb0f28 100644
--- a/include/hw/ppc/spapr_xive.h
+++ b/include/hw/ppc/spapr_xive.h
@@ -45,6 +45,7 @@ typedef struct SpaprXive {
     void          *tm_mmap;
     MemoryRegion  tm_mmio_kvm;
     VMChangeStateEntry *change;
+    bool          reset_all;
 } SpaprXive;
 
 /*
diff --git a/include/hw/ppc/xics.h b/include/hw/ppc/xics.h
index f2a8d6a4b4f9..856815362406 100644
--- a/include/hw/ppc/xics.h
+++ b/include/hw/ppc/xics.h
@@ -117,6 +117,7 @@ struct ICSState {
     DeviceState parent_obj;
     /*< public >*/
     uint32_t nr_irqs;
+    bool reset_all;
     uint32_t offset;
     ICSIRQState *irqs;
     XICSFabric *xics;
@@ -179,6 +180,11 @@ void ics_simple_write_xive(ICSState *ics, int nr, int server,
                            uint8_t priority, uint8_t saved_priority);
 void ics_simple_set_irq(void *opaque, int srcno, int val);
 
+static inline bool ics_irq_free(ICSState *ics, uint32_t srcno)
+{
+    return !(ics->irqs[srcno].flags & XICS_FLAGS_IRQ_MASK);
+}
+
 void ics_set_irq_type(ICSState *ics, int srcno, bool lsi);
 void icp_pic_print_info(ICPState *icp, Monitor *mon);
 void ics_pic_print_info(ICSState *ics, Monitor *mon);
diff --git a/hw/intc/spapr_xive.c b/hw/intc/spapr_xive.c
index c1c97192a7d2..b717d9e09314 100644
--- a/hw/intc/spapr_xive.c
+++ b/hw/intc/spapr_xive.c
@@ -492,6 +492,7 @@ static Property spapr_xive_properties[] = {
     DEFINE_PROP_UINT32("nr-ends", SpaprXive, nr_ends, 0),
     DEFINE_PROP_UINT64("vc-base", SpaprXive, vc_base, SPAPR_XIVE_VC_BASE),
     DEFINE_PROP_UINT64("tm-base", SpaprXive, tm_base, SPAPR_XIVE_TM_BASE),
+    DEFINE_PROP_BOOL("reset-all", ICSState, reset_all, false),
     DEFINE_PROP_END_OF_LIST(),
 };
 
diff --git a/hw/intc/spapr_xive_kvm.c b/hw/intc/spapr_xive_kvm.c
index 17af4d19f54e..225abce36270 100644
--- a/hw/intc/spapr_xive_kvm.c
+++ b/hw/intc/spapr_xive_kvm.c
@@ -253,13 +253,23 @@ void kvmppc_xive_source_reset_one(XiveSource *xsrc, int srcno, Error **errp)
                       true, errp);
 }
 
+static bool xive_source_skip_reset(SpaprXive *xive, int srcno)
+{
+    return !xive->reset_all && !xive_eas_is_valid(&xive->eat[srcno]);
+}
+
 static void kvmppc_xive_source_reset(XiveSource *xsrc, Error **errp)
 {
+    SpaprXive *xive = SPAPR_XIVE(xsrc->xive);
     int i;
 
     for (i = 0; i < xsrc->nr_irqs; i++) {
         Error *local_err = NULL;
 
+        if (xive_source_skip_reset(xive, i)) {
+            continue;
+        }
+
         kvmppc_xive_source_reset_one(xsrc, i, &local_err);
         if (local_err) {
             error_propagate(errp, local_err);
@@ -328,11 +338,18 @@ uint64_t kvmppc_xive_esb_rw(XiveSource *xsrc, int srcno, uint32_t offset,
 
 static void kvmppc_xive_source_get_state(XiveSource *xsrc)
 {
+    SpaprXive *xive = SPAPR_XIVE(xsrc->xive);
     int i;
 
     for (i = 0; i < xsrc->nr_irqs; i++) {
+        uint8_t pq;
+
+        if (xive_source_skip_reset(xive, i)) {
+            continue;
+        }
+
         /* Perform a load without side effect to retrieve the PQ bits */
-        uint8_t pq = xive_esb_read(xsrc, i, XIVE_ESB_GET);
+        pq = xive_esb_read(xsrc, i, XIVE_ESB_GET);
 
         /* and save PQ locally */
         xive_source_esb_set(xsrc, i, pq);
@@ -521,9 +538,14 @@ static void kvmppc_xive_change_state_handler(void *opaque, int running,
      */
     if (running) {
         for (i = 0; i < xsrc->nr_irqs; i++) {
-            uint8_t pq = xive_source_esb_get(xsrc, i);
+            uint8_t pq;
             uint8_t old_pq;
 
+            if (xive_source_skip_reset(xive, i)) {
+                continue;
+            }
+
+            pq = xive_source_esb_get(xsrc, i);
             old_pq = xive_esb_read(xsrc, i, XIVE_ESB_SET_PQ_00 + (pq << 8));
 
             /*
@@ -545,7 +567,13 @@ static void kvmppc_xive_change_state_handler(void *opaque, int running,
      * migration is in progress.
      */
     for (i = 0; i < xsrc->nr_irqs; i++) {
-        uint8_t pq = xive_esb_read(xsrc, i, XIVE_ESB_GET);
+        uint8_t pq;
+
+        if (xive_source_skip_reset(xive, i)) {
+            continue;
+        }
+
+        pq = xive_esb_read(xsrc, i, XIVE_ESB_GET);
 
         /*
          * PQ is set to PENDING to possibly catch a triggered
diff --git a/hw/intc/xics.c b/hw/intc/xics.c
index b2fca2975cc4..ae3f83cf0aad 100644
--- a/hw/intc/xics.c
+++ b/hw/intc/xics.c
@@ -746,6 +746,7 @@ static const VMStateDescription vmstate_ics_base = {
 
 static Property ics_base_properties[] = {
     DEFINE_PROP_UINT32("nr-irqs", ICSState, nr_irqs, 0),
+    DEFINE_PROP_BOOL("reset-all", ICSState, reset_all, false),
     DEFINE_PROP_END_OF_LIST(),
 };
 
diff --git a/hw/intc/xics_kvm.c b/hw/intc/xics_kvm.c
index a4d2e876cc5f..aa017b99801c 100644
--- a/hw/intc/xics_kvm.c
+++ b/hw/intc/xics_kvm.c
@@ -177,6 +177,12 @@ void icp_kvm_realize(DeviceState *dev, Error **errp)
 /*
  * ICS-KVM
  */
+
+static bool ics_irq_skip_reset(ICSState *ics, int srcno)
+{
+    return !ics->reset_all && ics_irq_free(ics, srcno);
+}
+
 void ics_get_kvm_state(ICSState *ics)
 {
     uint64_t state;
@@ -190,6 +196,10 @@ void ics_get_kvm_state(ICSState *ics)
     for (i = 0; i < ics->nr_irqs; i++) {
         ICSIRQState *irq = &ics->irqs[i];
 
+        if (ics_irq_skip_reset(ics, i)) {
+            continue;
+        }
+
         kvm_device_access(kernel_xics_fd, KVM_DEV_XICS_GRP_SOURCES,
                           i + ics->offset, &state, false, &error_fatal);
 
@@ -301,6 +311,10 @@ int ics_set_kvm_state(ICSState *ics, Error **errp)
         Error *local_err = NULL;
         int ret;
 
+        if (ics_irq_skip_reset(ics, i)) {
+            continue;
+        }
+
         ret = ics_set_kvm_state_one(ics, i, &local_err);
         if (ret < 0) {
             error_propagate(errp, local_err);
diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
index 7124053b43b7..af89f3b6c698 100644
--- a/hw/ppc/spapr.c
+++ b/hw/ppc/spapr.c
@@ -4594,6 +4594,7 @@ static void spapr_machine_4_1_class_options(MachineClass *mc)
     smc->linux_pci_probe = false;
     compat_props_add(mc->compat_props, hw_compat_4_1, hw_compat_4_1_len);
     compat_props_add(mc->compat_props, compat, G_N_ELEMENTS(compat));
+    smc->irq_reset_all = true;
 }
 
 DEFINE_SPAPR_MACHINE(4_1, "4.1", false);
diff --git a/hw/ppc/spapr_irq.c b/hw/ppc/spapr_irq.c
index 06fe2432bae5..8c3682613404 100644
--- a/hw/ppc/spapr_irq.c
+++ b/hw/ppc/spapr_irq.c
@@ -97,11 +97,13 @@ static void spapr_irq_init_xics(SpaprMachineState *spapr, int nr_irqs,
 {
     Object *obj;
     Error *local_err = NULL;
+    bool reset_all = SPAPR_MACHINE_GET_CLASS(spapr)->irq_reset_all;
 
     obj = object_new(TYPE_ICS_SIMPLE);
     object_property_add_child(OBJECT(spapr), "ics", obj, &error_abort);
     object_property_add_const_link(obj, ICS_PROP_XICS, OBJECT(spapr),
                                    &error_fatal);
+    object_property_set_bool(obj, reset_all, "reset-all",  &error_fatal);
     object_property_set_int(obj, nr_irqs, "nr-irqs",  &error_fatal);
     object_property_set_bool(obj, true, "realized", &local_err);
     if (local_err) {
@@ -114,9 +116,6 @@ static void spapr_irq_init_xics(SpaprMachineState *spapr, int nr_irqs,
     xics_spapr_init(spapr);
 }
 
-#define ICS_IRQ_FREE(ics, srcno)   \
-    (!((ics)->irqs[(srcno)].flags & (XICS_FLAGS_IRQ_MASK)))
-
 static int spapr_irq_claim_xics(SpaprMachineState *spapr, int irq, bool lsi,
                                 Error **errp)
 {
@@ -129,7 +128,7 @@ static int spapr_irq_claim_xics(SpaprMachineState *spapr, int irq, bool lsi,
         return -1;
     }
 
-    if (!ICS_IRQ_FREE(ics, irq - ics->offset)) {
+    if (!ics_irq_free(ics, irq - ics->offset)) {
         error_setg(errp, "IRQ %d is not free", irq);
         return -1;
     }
@@ -147,7 +146,7 @@ static void spapr_irq_free_xics(SpaprMachineState *spapr, int irq, int num)
     if (ics_valid_irq(ics, irq)) {
         trace_spapr_irq_free(0, irq, num);
         for (i = srcno; i < srcno + num; ++i) {
-            if (ICS_IRQ_FREE(ics, i)) {
+            if (ics_irq_free(ics, i)) {
                 trace_spapr_irq_free_warn(0, i);
             }
             memset(&ics->irqs[i], 0, sizeof(ICSIRQState));
@@ -270,9 +269,11 @@ static void spapr_irq_init_xive(SpaprMachineState *spapr, int nr_irqs,
 {
     uint32_t nr_servers = spapr_max_server_number(spapr);
     DeviceState *dev;
+    bool reset_all = SPAPR_MACHINE_GET_CLASS(spapr)->irq_reset_all;
     int i;
 
     dev = qdev_create(NULL, TYPE_SPAPR_XIVE);
+    object_property_set_bool(OBJECT(dev), reset_all, "reset-all", &error_fatal);
     qdev_prop_set_uint32(dev, "nr-irqs", nr_irqs);
     /*
      * 8 XIVE END structures per CPU. One for each available priority
@@ -767,7 +768,7 @@ static int ics_find_free_block(ICSState *ics, int num, int alignnum)
             return -1;
         }
         for (i = first; i < first + num; ++i) {
-            if (!ICS_IRQ_FREE(ics, i)) {
+            if (!ics_irq_free(ics, i)) {
                 break;
             }
         }
-- 
2.21.0


Re: [Qemu-devel] [PATCH] spapr/irq: Only claim VALID interrupts at the KVM level
Posted by Greg Kurz 4 years, 7 months ago
On Tue, 10 Sep 2019 08:13:26 +0200
Cédric Le Goater <clg@kaod.org> wrote:

> A typical pseries VM with 16 vCPUs, one disk, one network adapater
> uses less than 100 interrupts but the whole IRQ number space of the
> QEMU machine is allocated at reset time and it is 8K wide. This is
> wasting considerably the global IRQ space of the overall system which
> has 1M interrupts per socket on a POWER9.
> 
> To optimise the HW resources, only request at the KVM level interrupts
> which have been claimed. This will help up increase the maximum number
> of VMs per system.
> 
> To keep migration compatibility, we introduce a machine class
> attribute to adapt the reset behavior on older pseries machines.
> 

This can be achieved in a simpler way, see below.

> Signed-off-by: Cédric Le Goater <clg@kaod.org>
> ---
>  include/hw/ppc/spapr.h      |  1 +
>  include/hw/ppc/spapr_xive.h |  1 +
>  include/hw/ppc/xics.h       |  6 ++++++
>  hw/intc/spapr_xive.c        |  1 +
>  hw/intc/spapr_xive_kvm.c    | 34 +++++++++++++++++++++++++++++++---
>  hw/intc/xics.c              |  1 +
>  hw/intc/xics_kvm.c          | 14 ++++++++++++++
>  hw/ppc/spapr.c              |  1 +
>  hw/ppc/spapr_irq.c          | 13 +++++++------
>  9 files changed, 63 insertions(+), 9 deletions(-)
> 
> diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h
> index 03111fd55bc8..6c622954a60c 100644
> --- a/include/hw/ppc/spapr.h
> +++ b/include/hw/ppc/spapr.h
> @@ -131,6 +131,7 @@ struct SpaprMachineClass {
>      SpaprResizeHpt resize_hpt_default;
>      SpaprCapabilities default_caps;
>      SpaprIrq *irq;
> +    bool irq_reset_all;
>  };
>  
>  /**
> diff --git a/include/hw/ppc/spapr_xive.h b/include/hw/ppc/spapr_xive.h
> index bfd40f01d882..b33913eb0f28 100644
> --- a/include/hw/ppc/spapr_xive.h
> +++ b/include/hw/ppc/spapr_xive.h
> @@ -45,6 +45,7 @@ typedef struct SpaprXive {
>      void          *tm_mmap;
>      MemoryRegion  tm_mmio_kvm;
>      VMChangeStateEntry *change;
> +    bool          reset_all;
>  } SpaprXive;
>  
>  /*
> diff --git a/include/hw/ppc/xics.h b/include/hw/ppc/xics.h
> index f2a8d6a4b4f9..856815362406 100644
> --- a/include/hw/ppc/xics.h
> +++ b/include/hw/ppc/xics.h
> @@ -117,6 +117,7 @@ struct ICSState {
>      DeviceState parent_obj;
>      /*< public >*/
>      uint32_t nr_irqs;
> +    bool reset_all;
>      uint32_t offset;
>      ICSIRQState *irqs;
>      XICSFabric *xics;
> @@ -179,6 +180,11 @@ void ics_simple_write_xive(ICSState *ics, int nr, int server,
>                             uint8_t priority, uint8_t saved_priority);
>  void ics_simple_set_irq(void *opaque, int srcno, int val);
>  
> +static inline bool ics_irq_free(ICSState *ics, uint32_t srcno)
> +{
> +    return !(ics->irqs[srcno].flags & XICS_FLAGS_IRQ_MASK);
> +}
> +
>  void ics_set_irq_type(ICSState *ics, int srcno, bool lsi);
>  void icp_pic_print_info(ICPState *icp, Monitor *mon);
>  void ics_pic_print_info(ICSState *ics, Monitor *mon);
> diff --git a/hw/intc/spapr_xive.c b/hw/intc/spapr_xive.c
> index c1c97192a7d2..b717d9e09314 100644
> --- a/hw/intc/spapr_xive.c
> +++ b/hw/intc/spapr_xive.c
> @@ -492,6 +492,7 @@ static Property spapr_xive_properties[] = {
>      DEFINE_PROP_UINT32("nr-ends", SpaprXive, nr_ends, 0),
>      DEFINE_PROP_UINT64("vc-base", SpaprXive, vc_base, SPAPR_XIVE_VC_BASE),
>      DEFINE_PROP_UINT64("tm-base", SpaprXive, tm_base, SPAPR_XIVE_TM_BASE),
> +    DEFINE_PROP_BOOL("reset-all", ICSState, reset_all, false),

s/ICSState/SpaprXive

>      DEFINE_PROP_END_OF_LIST(),
>  };
>  
> diff --git a/hw/intc/spapr_xive_kvm.c b/hw/intc/spapr_xive_kvm.c
> index 17af4d19f54e..225abce36270 100644
> --- a/hw/intc/spapr_xive_kvm.c
> +++ b/hw/intc/spapr_xive_kvm.c
> @@ -253,13 +253,23 @@ void kvmppc_xive_source_reset_one(XiveSource *xsrc, int srcno, Error **errp)
>                        true, errp);
>  }
>  
> +static bool xive_source_skip_reset(SpaprXive *xive, int srcno)
> +{
> +    return !xive->reset_all && !xive_eas_is_valid(&xive->eat[srcno]);
> +}
> +
>  static void kvmppc_xive_source_reset(XiveSource *xsrc, Error **errp)
>  {
> +    SpaprXive *xive = SPAPR_XIVE(xsrc->xive);
>      int i;
>  
>      for (i = 0; i < xsrc->nr_irqs; i++) {
>          Error *local_err = NULL;
>  
> +        if (xive_source_skip_reset(xive, i)) {
> +            continue;
> +        }
> +
>          kvmppc_xive_source_reset_one(xsrc, i, &local_err);
>          if (local_err) {
>              error_propagate(errp, local_err);
> @@ -328,11 +338,18 @@ uint64_t kvmppc_xive_esb_rw(XiveSource *xsrc, int srcno, uint32_t offset,
>  
>  static void kvmppc_xive_source_get_state(XiveSource *xsrc)
>  {
> +    SpaprXive *xive = SPAPR_XIVE(xsrc->xive);
>      int i;
>  
>      for (i = 0; i < xsrc->nr_irqs; i++) {
> +        uint8_t pq;
> +
> +        if (xive_source_skip_reset(xive, i)) {

This looks a bit weird to "skip reset" in a function that isn't
supposed to reset anything... Is it really necessary to check the
reset_all flag actually ? I guess checking the EAS is valid should
be enough.

> +            continue;
> +        }
> +
>          /* Perform a load without side effect to retrieve the PQ bits */
> -        uint8_t pq = xive_esb_read(xsrc, i, XIVE_ESB_GET);
> +        pq = xive_esb_read(xsrc, i, XIVE_ESB_GET);
>  
>          /* and save PQ locally */
>          xive_source_esb_set(xsrc, i, pq);
> @@ -521,9 +538,14 @@ static void kvmppc_xive_change_state_handler(void *opaque, int running,
>       */
>      if (running) {
>          for (i = 0; i < xsrc->nr_irqs; i++) {
> -            uint8_t pq = xive_source_esb_get(xsrc, i);
> +            uint8_t pq;
>              uint8_t old_pq;
>  
> +            if (xive_source_skip_reset(xive, i)) {

Same here ?

> +                continue;
> +            }
> +
> +            pq = xive_source_esb_get(xsrc, i);
>              old_pq = xive_esb_read(xsrc, i, XIVE_ESB_SET_PQ_00 + (pq << 8));
>  
>              /*
> @@ -545,7 +567,13 @@ static void kvmppc_xive_change_state_handler(void *opaque, int running,
>       * migration is in progress.
>       */
>      for (i = 0; i < xsrc->nr_irqs; i++) {
> -        uint8_t pq = xive_esb_read(xsrc, i, XIVE_ESB_GET);
> +        uint8_t pq;
> +
> +        if (xive_source_skip_reset(xive, i)) {

and here ?

> +            continue;
> +        }
> +
> +        pq = xive_esb_read(xsrc, i, XIVE_ESB_GET);
>  
>          /*
>           * PQ is set to PENDING to possibly catch a triggered
> diff --git a/hw/intc/xics.c b/hw/intc/xics.c
> index b2fca2975cc4..ae3f83cf0aad 100644
> --- a/hw/intc/xics.c
> +++ b/hw/intc/xics.c
> @@ -746,6 +746,7 @@ static const VMStateDescription vmstate_ics_base = {
>  
>  static Property ics_base_properties[] = {
>      DEFINE_PROP_UINT32("nr-irqs", ICSState, nr_irqs, 0),
> +    DEFINE_PROP_BOOL("reset-all", ICSState, reset_all, false),
>      DEFINE_PROP_END_OF_LIST(),
>  };
>  
> diff --git a/hw/intc/xics_kvm.c b/hw/intc/xics_kvm.c
> index a4d2e876cc5f..aa017b99801c 100644
> --- a/hw/intc/xics_kvm.c
> +++ b/hw/intc/xics_kvm.c
> @@ -177,6 +177,12 @@ void icp_kvm_realize(DeviceState *dev, Error **errp)
>  /*
>   * ICS-KVM
>   */
> +
> +static bool ics_irq_skip_reset(ICSState *ics, int srcno)
> +{
> +    return !ics->reset_all && ics_irq_free(ics, srcno);
> +}
> +
>  void ics_get_kvm_state(ICSState *ics)
>  {
>      uint64_t state;
> @@ -190,6 +196,10 @@ void ics_get_kvm_state(ICSState *ics)
>      for (i = 0; i < ics->nr_irqs; i++) {
>          ICSIRQState *irq = &ics->irqs[i];
>  
> +        if (ics_irq_skip_reset(ics, i)) {

And here ?

> +            continue;
> +        }
> +
>          kvm_device_access(kernel_xics_fd, KVM_DEV_XICS_GRP_SOURCES,
>                            i + ics->offset, &state, false, &error_fatal);
>  
> @@ -301,6 +311,10 @@ int ics_set_kvm_state(ICSState *ics, Error **errp)
>          Error *local_err = NULL;
>          int ret;
>  
> +        if (ics_irq_skip_reset(ics, i)) {
> +            continue;
> +        }
> +
>          ret = ics_set_kvm_state_one(ics, i, &local_err);
>          if (ret < 0) {
>              error_propagate(errp, local_err);
> diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
> index 7124053b43b7..af89f3b6c698 100644
> --- a/hw/ppc/spapr.c
> +++ b/hw/ppc/spapr.c
> @@ -4594,6 +4594,7 @@ static void spapr_machine_4_1_class_options(MachineClass *mc)
>      smc->linux_pci_probe = false;
>      compat_props_add(mc->compat_props, hw_compat_4_1, hw_compat_4_1_len);
>      compat_props_add(mc->compat_props, compat, G_N_ELEMENTS(compat));
> +    smc->irq_reset_all = true;

Drop this property and add two lines in compat static:

    static GlobalProperty compat[] = {
        /* Only allow 4kiB and 64kiB IOMMU pagesizes */
        { TYPE_SPAPR_PCI_HOST_BRIDGE, "pgsz", "0x11000" },
        { TYPE_SPAPR_XIVE, "reset-all", "on" },
        { TYPE_ICS_BASE, "reset-all", "on" },
    };

This ensures that any instance of SpaprXive or ICSState created by
a pseries-4.1 machine has reset_all set to true.

>  }
>  
>  DEFINE_SPAPR_MACHINE(4_1, "4.1", false);
> diff --git a/hw/ppc/spapr_irq.c b/hw/ppc/spapr_irq.c
> index 06fe2432bae5..8c3682613404 100644
> --- a/hw/ppc/spapr_irq.c
> +++ b/hw/ppc/spapr_irq.c
> @@ -97,11 +97,13 @@ static void spapr_irq_init_xics(SpaprMachineState *spapr, int nr_irqs,
>  {
>      Object *obj;
>      Error *local_err = NULL;
> +    bool reset_all = SPAPR_MACHINE_GET_CLASS(spapr)->irq_reset_all;
>  

... and you can drop all the propagation code here...

>      obj = object_new(TYPE_ICS_SIMPLE);
>      object_property_add_child(OBJECT(spapr), "ics", obj, &error_abort);
>      object_property_add_const_link(obj, ICS_PROP_XICS, OBJECT(spapr),
>                                     &error_fatal);
> +    object_property_set_bool(obj, reset_all, "reset-all",  &error_fatal);
>      object_property_set_int(obj, nr_irqs, "nr-irqs",  &error_fatal);
>      object_property_set_bool(obj, true, "realized", &local_err);
>      if (local_err) {
> @@ -114,9 +116,6 @@ static void spapr_irq_init_xics(SpaprMachineState *spapr, int nr_irqs,
>      xics_spapr_init(spapr);
>  }
>  
> -#define ICS_IRQ_FREE(ics, srcno)   \
> -    (!((ics)->irqs[(srcno)].flags & (XICS_FLAGS_IRQ_MASK)))
> -
>  static int spapr_irq_claim_xics(SpaprMachineState *spapr, int irq, bool lsi,
>                                  Error **errp)
>  {
> @@ -129,7 +128,7 @@ static int spapr_irq_claim_xics(SpaprMachineState *spapr, int irq, bool lsi,
>          return -1;
>      }
>  
> -    if (!ICS_IRQ_FREE(ics, irq - ics->offset)) {
> +    if (!ics_irq_free(ics, irq - ics->offset)) {
>          error_setg(errp, "IRQ %d is not free", irq);
>          return -1;
>      }
> @@ -147,7 +146,7 @@ static void spapr_irq_free_xics(SpaprMachineState *spapr, int irq, int num)
>      if (ics_valid_irq(ics, irq)) {
>          trace_spapr_irq_free(0, irq, num);
>          for (i = srcno; i < srcno + num; ++i) {
> -            if (ICS_IRQ_FREE(ics, i)) {
> +            if (ics_irq_free(ics, i)) {
>                  trace_spapr_irq_free_warn(0, i);
>              }
>              memset(&ics->irqs[i], 0, sizeof(ICSIRQState));
> @@ -270,9 +269,11 @@ static void spapr_irq_init_xive(SpaprMachineState *spapr, int nr_irqs,
>  {
>      uint32_t nr_servers = spapr_max_server_number(spapr);
>      DeviceState *dev;
> +    bool reset_all = SPAPR_MACHINE_GET_CLASS(spapr)->irq_reset_all;

... and here.

>      int i;
>  
>      dev = qdev_create(NULL, TYPE_SPAPR_XIVE);
> +    object_property_set_bool(OBJECT(dev), reset_all, "reset-all", &error_fatal);
>      qdev_prop_set_uint32(dev, "nr-irqs", nr_irqs);
>      /*
>       * 8 XIVE END structures per CPU. One for each available priority
> @@ -767,7 +768,7 @@ static int ics_find_free_block(ICSState *ics, int num, int alignnum)
>              return -1;
>          }
>          for (i = first; i < first + num; ++i) {
> -            if (!ICS_IRQ_FREE(ics, i)) {
> +            if (!ics_irq_free(ics, i)) {
>                  break;
>              }
>          }


Re: [Qemu-devel] [PATCH] spapr/irq: Only claim VALID interrupts at the KVM level
Posted by David Gibson 4 years, 7 months ago
On Tue, Sep 10, 2019 at 08:13:26AM +0200, Cédric Le Goater wrote:
> A typical pseries VM with 16 vCPUs, one disk, one network adapater
> uses less than 100 interrupts but the whole IRQ number space of the
> QEMU machine is allocated at reset time and it is 8K wide. This is
> wasting considerably the global IRQ space of the overall system which
> has 1M interrupts per socket on a POWER9.
> 
> To optimise the HW resources, only request at the KVM level interrupts
> which have been claimed. This will help up increase the maximum number
> of VMs per system.
> 
> To keep migration compatibility, we introduce a machine class
> attribute to adapt the reset behavior on older pseries machines.
> 
> Signed-off-by: Cédric Le Goater <clg@kaod.org>

The overall idea looks good to me.

I don't really see what the point of the reset-all property is.  How
this is handled shouldn't be guest visible at all AFAICT, so there's
no need to grandfather it for older machine types.

Since ics_irq_free() has uses in the existing code unrelated to this,
it would be nice to pull introducing that out into a separate patch.

> ---
>  include/hw/ppc/spapr.h      |  1 +
>  include/hw/ppc/spapr_xive.h |  1 +
>  include/hw/ppc/xics.h       |  6 ++++++
>  hw/intc/spapr_xive.c        |  1 +
>  hw/intc/spapr_xive_kvm.c    | 34 +++++++++++++++++++++++++++++++---
>  hw/intc/xics.c              |  1 +
>  hw/intc/xics_kvm.c          | 14 ++++++++++++++
>  hw/ppc/spapr.c              |  1 +
>  hw/ppc/spapr_irq.c          | 13 +++++++------
>  9 files changed, 63 insertions(+), 9 deletions(-)
> 
> diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h
> index 03111fd55bc8..6c622954a60c 100644
> --- a/include/hw/ppc/spapr.h
> +++ b/include/hw/ppc/spapr.h
> @@ -131,6 +131,7 @@ struct SpaprMachineClass {
>      SpaprResizeHpt resize_hpt_default;
>      SpaprCapabilities default_caps;
>      SpaprIrq *irq;
> +    bool irq_reset_all;
>  };
>  
>  /**
> diff --git a/include/hw/ppc/spapr_xive.h b/include/hw/ppc/spapr_xive.h
> index bfd40f01d882..b33913eb0f28 100644
> --- a/include/hw/ppc/spapr_xive.h
> +++ b/include/hw/ppc/spapr_xive.h
> @@ -45,6 +45,7 @@ typedef struct SpaprXive {
>      void          *tm_mmap;
>      MemoryRegion  tm_mmio_kvm;
>      VMChangeStateEntry *change;
> +    bool          reset_all;
>  } SpaprXive;
>  
>  /*
> diff --git a/include/hw/ppc/xics.h b/include/hw/ppc/xics.h
> index f2a8d6a4b4f9..856815362406 100644
> --- a/include/hw/ppc/xics.h
> +++ b/include/hw/ppc/xics.h
> @@ -117,6 +117,7 @@ struct ICSState {
>      DeviceState parent_obj;
>      /*< public >*/
>      uint32_t nr_irqs;
> +    bool reset_all;
>      uint32_t offset;
>      ICSIRQState *irqs;
>      XICSFabric *xics;
> @@ -179,6 +180,11 @@ void ics_simple_write_xive(ICSState *ics, int nr, int server,
>                             uint8_t priority, uint8_t saved_priority);
>  void ics_simple_set_irq(void *opaque, int srcno, int val);
>  
> +static inline bool ics_irq_free(ICSState *ics, uint32_t srcno)
> +{
> +    return !(ics->irqs[srcno].flags & XICS_FLAGS_IRQ_MASK);
> +}
> +
>  void ics_set_irq_type(ICSState *ics, int srcno, bool lsi);
>  void icp_pic_print_info(ICPState *icp, Monitor *mon);
>  void ics_pic_print_info(ICSState *ics, Monitor *mon);
> diff --git a/hw/intc/spapr_xive.c b/hw/intc/spapr_xive.c
> index c1c97192a7d2..b717d9e09314 100644
> --- a/hw/intc/spapr_xive.c
> +++ b/hw/intc/spapr_xive.c
> @@ -492,6 +492,7 @@ static Property spapr_xive_properties[] = {
>      DEFINE_PROP_UINT32("nr-ends", SpaprXive, nr_ends, 0),
>      DEFINE_PROP_UINT64("vc-base", SpaprXive, vc_base, SPAPR_XIVE_VC_BASE),
>      DEFINE_PROP_UINT64("tm-base", SpaprXive, tm_base, SPAPR_XIVE_TM_BASE),
> +    DEFINE_PROP_BOOL("reset-all", ICSState, reset_all, false),
>      DEFINE_PROP_END_OF_LIST(),
>  };
>  
> diff --git a/hw/intc/spapr_xive_kvm.c b/hw/intc/spapr_xive_kvm.c
> index 17af4d19f54e..225abce36270 100644
> --- a/hw/intc/spapr_xive_kvm.c
> +++ b/hw/intc/spapr_xive_kvm.c
> @@ -253,13 +253,23 @@ void kvmppc_xive_source_reset_one(XiveSource *xsrc, int srcno, Error **errp)
>                        true, errp);
>  }
>  
> +static bool xive_source_skip_reset(SpaprXive *xive, int srcno)
> +{
> +    return !xive->reset_all && !xive_eas_is_valid(&xive->eat[srcno]);
> +}
> +
>  static void kvmppc_xive_source_reset(XiveSource *xsrc, Error **errp)
>  {
> +    SpaprXive *xive = SPAPR_XIVE(xsrc->xive);
>      int i;
>  
>      for (i = 0; i < xsrc->nr_irqs; i++) {
>          Error *local_err = NULL;
>  
> +        if (xive_source_skip_reset(xive, i)) {
> +            continue;
> +        }
> +
>          kvmppc_xive_source_reset_one(xsrc, i, &local_err);
>          if (local_err) {
>              error_propagate(errp, local_err);
> @@ -328,11 +338,18 @@ uint64_t kvmppc_xive_esb_rw(XiveSource *xsrc, int srcno, uint32_t offset,
>  
>  static void kvmppc_xive_source_get_state(XiveSource *xsrc)
>  {
> +    SpaprXive *xive = SPAPR_XIVE(xsrc->xive);
>      int i;
>  
>      for (i = 0; i < xsrc->nr_irqs; i++) {
> +        uint8_t pq;
> +
> +        if (xive_source_skip_reset(xive, i)) {
> +            continue;
> +        }
> +
>          /* Perform a load without side effect to retrieve the PQ bits */
> -        uint8_t pq = xive_esb_read(xsrc, i, XIVE_ESB_GET);
> +        pq = xive_esb_read(xsrc, i, XIVE_ESB_GET);
>  
>          /* and save PQ locally */
>          xive_source_esb_set(xsrc, i, pq);
> @@ -521,9 +538,14 @@ static void kvmppc_xive_change_state_handler(void *opaque, int running,
>       */
>      if (running) {
>          for (i = 0; i < xsrc->nr_irqs; i++) {
> -            uint8_t pq = xive_source_esb_get(xsrc, i);
> +            uint8_t pq;
>              uint8_t old_pq;
>  
> +            if (xive_source_skip_reset(xive, i)) {
> +                continue;
> +            }
> +
> +            pq = xive_source_esb_get(xsrc, i);
>              old_pq = xive_esb_read(xsrc, i, XIVE_ESB_SET_PQ_00 + (pq << 8));
>  
>              /*
> @@ -545,7 +567,13 @@ static void kvmppc_xive_change_state_handler(void *opaque, int running,
>       * migration is in progress.
>       */
>      for (i = 0; i < xsrc->nr_irqs; i++) {
> -        uint8_t pq = xive_esb_read(xsrc, i, XIVE_ESB_GET);
> +        uint8_t pq;
> +
> +        if (xive_source_skip_reset(xive, i)) {
> +            continue;
> +        }
> +
> +        pq = xive_esb_read(xsrc, i, XIVE_ESB_GET);
>  
>          /*
>           * PQ is set to PENDING to possibly catch a triggered
> diff --git a/hw/intc/xics.c b/hw/intc/xics.c
> index b2fca2975cc4..ae3f83cf0aad 100644
> --- a/hw/intc/xics.c
> +++ b/hw/intc/xics.c
> @@ -746,6 +746,7 @@ static const VMStateDescription vmstate_ics_base = {
>  
>  static Property ics_base_properties[] = {
>      DEFINE_PROP_UINT32("nr-irqs", ICSState, nr_irqs, 0),
> +    DEFINE_PROP_BOOL("reset-all", ICSState, reset_all, false),
>      DEFINE_PROP_END_OF_LIST(),
>  };
>  
> diff --git a/hw/intc/xics_kvm.c b/hw/intc/xics_kvm.c
> index a4d2e876cc5f..aa017b99801c 100644
> --- a/hw/intc/xics_kvm.c
> +++ b/hw/intc/xics_kvm.c
> @@ -177,6 +177,12 @@ void icp_kvm_realize(DeviceState *dev, Error **errp)
>  /*
>   * ICS-KVM
>   */
> +
> +static bool ics_irq_skip_reset(ICSState *ics, int srcno)
> +{
> +    return !ics->reset_all && ics_irq_free(ics, srcno);
> +}
> +
>  void ics_get_kvm_state(ICSState *ics)
>  {
>      uint64_t state;
> @@ -190,6 +196,10 @@ void ics_get_kvm_state(ICSState *ics)
>      for (i = 0; i < ics->nr_irqs; i++) {
>          ICSIRQState *irq = &ics->irqs[i];
>  
> +        if (ics_irq_skip_reset(ics, i)) {
> +            continue;
> +        }
> +
>          kvm_device_access(kernel_xics_fd, KVM_DEV_XICS_GRP_SOURCES,
>                            i + ics->offset, &state, false, &error_fatal);
>  
> @@ -301,6 +311,10 @@ int ics_set_kvm_state(ICSState *ics, Error **errp)
>          Error *local_err = NULL;
>          int ret;
>  
> +        if (ics_irq_skip_reset(ics, i)) {
> +            continue;
> +        }
> +
>          ret = ics_set_kvm_state_one(ics, i, &local_err);
>          if (ret < 0) {
>              error_propagate(errp, local_err);
> diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
> index 7124053b43b7..af89f3b6c698 100644
> --- a/hw/ppc/spapr.c
> +++ b/hw/ppc/spapr.c
> @@ -4594,6 +4594,7 @@ static void spapr_machine_4_1_class_options(MachineClass *mc)
>      smc->linux_pci_probe = false;
>      compat_props_add(mc->compat_props, hw_compat_4_1, hw_compat_4_1_len);
>      compat_props_add(mc->compat_props, compat, G_N_ELEMENTS(compat));
> +    smc->irq_reset_all = true;
>  }
>  
>  DEFINE_SPAPR_MACHINE(4_1, "4.1", false);
> diff --git a/hw/ppc/spapr_irq.c b/hw/ppc/spapr_irq.c
> index 06fe2432bae5..8c3682613404 100644
> --- a/hw/ppc/spapr_irq.c
> +++ b/hw/ppc/spapr_irq.c
> @@ -97,11 +97,13 @@ static void spapr_irq_init_xics(SpaprMachineState *spapr, int nr_irqs,
>  {
>      Object *obj;
>      Error *local_err = NULL;
> +    bool reset_all = SPAPR_MACHINE_GET_CLASS(spapr)->irq_reset_all;
>  
>      obj = object_new(TYPE_ICS_SIMPLE);
>      object_property_add_child(OBJECT(spapr), "ics", obj, &error_abort);
>      object_property_add_const_link(obj, ICS_PROP_XICS, OBJECT(spapr),
>                                     &error_fatal);
> +    object_property_set_bool(obj, reset_all, "reset-all",  &error_fatal);
>      object_property_set_int(obj, nr_irqs, "nr-irqs",  &error_fatal);
>      object_property_set_bool(obj, true, "realized", &local_err);
>      if (local_err) {
> @@ -114,9 +116,6 @@ static void spapr_irq_init_xics(SpaprMachineState *spapr, int nr_irqs,
>      xics_spapr_init(spapr);
>  }
>  
> -#define ICS_IRQ_FREE(ics, srcno)   \
> -    (!((ics)->irqs[(srcno)].flags & (XICS_FLAGS_IRQ_MASK)))
> -
>  static int spapr_irq_claim_xics(SpaprMachineState *spapr, int irq, bool lsi,
>                                  Error **errp)
>  {
> @@ -129,7 +128,7 @@ static int spapr_irq_claim_xics(SpaprMachineState *spapr, int irq, bool lsi,
>          return -1;
>      }
>  
> -    if (!ICS_IRQ_FREE(ics, irq - ics->offset)) {
> +    if (!ics_irq_free(ics, irq - ics->offset)) {
>          error_setg(errp, "IRQ %d is not free", irq);
>          return -1;
>      }
> @@ -147,7 +146,7 @@ static void spapr_irq_free_xics(SpaprMachineState *spapr, int irq, int num)
>      if (ics_valid_irq(ics, irq)) {
>          trace_spapr_irq_free(0, irq, num);
>          for (i = srcno; i < srcno + num; ++i) {
> -            if (ICS_IRQ_FREE(ics, i)) {
> +            if (ics_irq_free(ics, i)) {
>                  trace_spapr_irq_free_warn(0, i);
>              }
>              memset(&ics->irqs[i], 0, sizeof(ICSIRQState));
> @@ -270,9 +269,11 @@ static void spapr_irq_init_xive(SpaprMachineState *spapr, int nr_irqs,
>  {
>      uint32_t nr_servers = spapr_max_server_number(spapr);
>      DeviceState *dev;
> +    bool reset_all = SPAPR_MACHINE_GET_CLASS(spapr)->irq_reset_all;
>      int i;
>  
>      dev = qdev_create(NULL, TYPE_SPAPR_XIVE);
> +    object_property_set_bool(OBJECT(dev), reset_all, "reset-all", &error_fatal);
>      qdev_prop_set_uint32(dev, "nr-irqs", nr_irqs);
>      /*
>       * 8 XIVE END structures per CPU. One for each available priority
> @@ -767,7 +768,7 @@ static int ics_find_free_block(ICSState *ics, int num, int alignnum)
>              return -1;
>          }
>          for (i = first; i < first + num; ++i) {
> -            if (!ICS_IRQ_FREE(ics, i)) {
> +            if (!ics_irq_free(ics, i)) {
>                  break;
>              }
>          }

-- 
David Gibson			| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au	| minimalist, thank you.  NOT _the_ _other_
				| _way_ _around_!
http://www.ozlabs.org/~dgibson