[Qemu-devel] [PATCH v9 6/6] migration: Include migration support for machine check handling

Aravinda Prasad posted 6 patches 6 years, 5 months ago
Maintainers: David Gibson <david@gibson.dropbear.id.au>, Richard Henderson <rth@twiddle.net>, Paolo Bonzini <pbonzini@redhat.com>
There is a newer version of this series
[Qemu-devel] [PATCH v9 6/6] migration: Include migration support for machine check handling
Posted by Aravinda Prasad 6 years, 5 months ago
This patch includes migration support for machine check
handling. Especially this patch blocks VM migration
requests until the machine check error handling is
complete as (i) these errors are specific to the source
hardware and is irrelevant on the target hardware,
(ii) these errors cause data corruption and should
be handled before migration.

Signed-off-by: Aravinda Prasad <aravinda@linux.vnet.ibm.com>
---
 hw/ppc/spapr.c         |   20 ++++++++++++++++++++
 hw/ppc/spapr_events.c  |   17 +++++++++++++++++
 hw/ppc/spapr_rtas.c    |    4 ++++
 include/hw/ppc/spapr.h |    2 ++
 4 files changed, 43 insertions(+)

diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
index e8a77636..31c4850 100644
--- a/hw/ppc/spapr.c
+++ b/hw/ppc/spapr.c
@@ -2104,6 +2104,25 @@ static const VMStateDescription vmstate_spapr_dtb = {
     },
 };
 
+static bool spapr_fwnmi_needed(void *opaque)
+{
+    SpaprMachineState *spapr = (SpaprMachineState *)opaque;
+
+    return (spapr->guest_machine_check_addr == -1) ? 0 : 1;
+}
+
+static const VMStateDescription vmstate_spapr_machine_check = {
+    .name = "spapr_machine_check",
+    .version_id = 1,
+    .minimum_version_id = 1,
+    .needed = spapr_fwnmi_needed,
+    .fields = (VMStateField[]) {
+        VMSTATE_UINT64(guest_machine_check_addr, SpaprMachineState),
+        VMSTATE_INT32(mc_status, SpaprMachineState),
+        VMSTATE_END_OF_LIST()
+    },
+};
+
 static const VMStateDescription vmstate_spapr = {
     .name = "spapr",
     .version_id = 3,
@@ -2137,6 +2156,7 @@ static const VMStateDescription vmstate_spapr = {
         &vmstate_spapr_dtb,
         &vmstate_spapr_cap_large_decr,
         &vmstate_spapr_cap_ccf_assist,
+        &vmstate_spapr_machine_check,
         NULL
     }
 };
diff --git a/hw/ppc/spapr_events.c b/hw/ppc/spapr_events.c
index 573c0b7..35e21e4 100644
--- a/hw/ppc/spapr_events.c
+++ b/hw/ppc/spapr_events.c
@@ -41,6 +41,7 @@
 #include "qemu/bcd.h"
 #include "hw/ppc/spapr_ovec.h"
 #include <libfdt.h>
+#include "migration/blocker.h"
 
 #define RTAS_LOG_VERSION_MASK                   0xff000000
 #define   RTAS_LOG_VERSION_6                    0x06000000
@@ -855,6 +856,22 @@ static void spapr_mce_dispatch_elog(PowerPCCPU *cpu, bool recovered)
 void spapr_mce_req_event(PowerPCCPU *cpu, bool recovered)
 {
     SpaprMachineState *spapr = SPAPR_MACHINE(qdev_get_machine());
+    int ret;
+    Error *local_err = NULL;
+
+    error_setg(&spapr->fwnmi_migration_blocker,
+            "Live migration not supported during machine check handling");
+    ret = migrate_add_blocker(spapr->fwnmi_migration_blocker, &local_err);
+    if (ret < 0) {
+        /*
+         * We don't want to abort and let the migration to continue. In a
+         * rare case, the machine check handler will run on the target
+         * hardware. Though this is not preferable, it is better than aborting
+         * the migration or killing the VM.
+         */
+        error_free(spapr->fwnmi_migration_blocker);
+        warn_report_err(local_err);
+    }
 
     while (spapr->mc_status != -1) {
         /*
diff --git a/hw/ppc/spapr_rtas.c b/hw/ppc/spapr_rtas.c
index 91a7ab9..c849223 100644
--- a/hw/ppc/spapr_rtas.c
+++ b/hw/ppc/spapr_rtas.c
@@ -50,6 +50,7 @@
 #include "target/ppc/mmu-hash64.h"
 #include "target/ppc/mmu-book3s-v3.h"
 #include "kvm_ppc.h"
+#include "migration/blocker.h"
 
 static void rtas_display_character(PowerPCCPU *cpu, SpaprMachineState *spapr,
                                    uint32_t token, uint32_t nargs,
@@ -404,6 +405,9 @@ static void rtas_ibm_nmi_interlock(PowerPCCPU *cpu,
         spapr->mc_status = -1;
         qemu_cond_signal(&spapr->mc_delivery_cond);
         rtas_st(rets, 0, RTAS_OUT_SUCCESS);
+        migrate_del_blocker(spapr->fwnmi_migration_blocker);
+        error_free(spapr->fwnmi_migration_blocker);
+        spapr->fwnmi_migration_blocker = NULL;
     }
 }
 
diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h
index bd75d4b..6c0cfd8 100644
--- a/include/hw/ppc/spapr.h
+++ b/include/hw/ppc/spapr.h
@@ -214,6 +214,8 @@ struct SpaprMachineState {
     SpaprCapabilities def, eff, mig;
 
     unsigned gpu_numa_id;
+
+    Error *fwnmi_migration_blocker;
 };
 
 #define H_SUCCESS         0


Re: [Qemu-devel] [Qemu-ppc] [PATCH v9 6/6] migration: Include migration support for machine check handling
Posted by Greg Kurz 6 years, 5 months ago
On Wed, 29 May 2019 11:10:57 +0530
Aravinda Prasad <aravinda@linux.vnet.ibm.com> wrote:

> This patch includes migration support for machine check
> handling. Especially this patch blocks VM migration
> requests until the machine check error handling is
> complete as (i) these errors are specific to the source
> hardware and is irrelevant on the target hardware,
> (ii) these errors cause data corruption and should
> be handled before migration.
> 
> Signed-off-by: Aravinda Prasad <aravinda@linux.vnet.ibm.com>
> ---

LGTM, just one issue: machine reset should del and free the blocker as well,
otherwise QEMU would crash if spapr_mce_req_event() is called again.

>  hw/ppc/spapr.c         |   20 ++++++++++++++++++++
>  hw/ppc/spapr_events.c  |   17 +++++++++++++++++
>  hw/ppc/spapr_rtas.c    |    4 ++++
>  include/hw/ppc/spapr.h |    2 ++
>  4 files changed, 43 insertions(+)
> 
> diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
> index e8a77636..31c4850 100644
> --- a/hw/ppc/spapr.c
> +++ b/hw/ppc/spapr.c
> @@ -2104,6 +2104,25 @@ static const VMStateDescription vmstate_spapr_dtb = {
>      },
>  };
>  
> +static bool spapr_fwnmi_needed(void *opaque)
> +{
> +    SpaprMachineState *spapr = (SpaprMachineState *)opaque;
> +
> +    return (spapr->guest_machine_check_addr == -1) ? 0 : 1;
> +}
> +
> +static const VMStateDescription vmstate_spapr_machine_check = {
> +    .name = "spapr_machine_check",
> +    .version_id = 1,
> +    .minimum_version_id = 1,
> +    .needed = spapr_fwnmi_needed,
> +    .fields = (VMStateField[]) {
> +        VMSTATE_UINT64(guest_machine_check_addr, SpaprMachineState),
> +        VMSTATE_INT32(mc_status, SpaprMachineState),
> +        VMSTATE_END_OF_LIST()
> +    },
> +};
> +
>  static const VMStateDescription vmstate_spapr = {
>      .name = "spapr",
>      .version_id = 3,
> @@ -2137,6 +2156,7 @@ static const VMStateDescription vmstate_spapr = {
>          &vmstate_spapr_dtb,
>          &vmstate_spapr_cap_large_decr,
>          &vmstate_spapr_cap_ccf_assist,
> +        &vmstate_spapr_machine_check,
>          NULL
>      }
>  };
> diff --git a/hw/ppc/spapr_events.c b/hw/ppc/spapr_events.c
> index 573c0b7..35e21e4 100644
> --- a/hw/ppc/spapr_events.c
> +++ b/hw/ppc/spapr_events.c
> @@ -41,6 +41,7 @@
>  #include "qemu/bcd.h"
>  #include "hw/ppc/spapr_ovec.h"
>  #include <libfdt.h>
> +#include "migration/blocker.h"
>  
>  #define RTAS_LOG_VERSION_MASK                   0xff000000
>  #define   RTAS_LOG_VERSION_6                    0x06000000
> @@ -855,6 +856,22 @@ static void spapr_mce_dispatch_elog(PowerPCCPU *cpu, bool recovered)
>  void spapr_mce_req_event(PowerPCCPU *cpu, bool recovered)
>  {
>      SpaprMachineState *spapr = SPAPR_MACHINE(qdev_get_machine());
> +    int ret;
> +    Error *local_err = NULL;
> +
> +    error_setg(&spapr->fwnmi_migration_blocker,
> +            "Live migration not supported during machine check handling");
> +    ret = migrate_add_blocker(spapr->fwnmi_migration_blocker, &local_err);
> +    if (ret < 0) {
> +        /*
> +         * We don't want to abort and let the migration to continue. In a
> +         * rare case, the machine check handler will run on the target
> +         * hardware. Though this is not preferable, it is better than aborting
> +         * the migration or killing the VM.
> +         */
> +        error_free(spapr->fwnmi_migration_blocker);
> +        warn_report_err(local_err);
> +    }
>  
>      while (spapr->mc_status != -1) {
>          /*
> diff --git a/hw/ppc/spapr_rtas.c b/hw/ppc/spapr_rtas.c
> index 91a7ab9..c849223 100644
> --- a/hw/ppc/spapr_rtas.c
> +++ b/hw/ppc/spapr_rtas.c
> @@ -50,6 +50,7 @@
>  #include "target/ppc/mmu-hash64.h"
>  #include "target/ppc/mmu-book3s-v3.h"
>  #include "kvm_ppc.h"
> +#include "migration/blocker.h"
>  
>  static void rtas_display_character(PowerPCCPU *cpu, SpaprMachineState *spapr,
>                                     uint32_t token, uint32_t nargs,
> @@ -404,6 +405,9 @@ static void rtas_ibm_nmi_interlock(PowerPCCPU *cpu,
>          spapr->mc_status = -1;
>          qemu_cond_signal(&spapr->mc_delivery_cond);
>          rtas_st(rets, 0, RTAS_OUT_SUCCESS);
> +        migrate_del_blocker(spapr->fwnmi_migration_blocker);
> +        error_free(spapr->fwnmi_migration_blocker);
> +        spapr->fwnmi_migration_blocker = NULL;
>      }
>  }
>  
> diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h
> index bd75d4b..6c0cfd8 100644
> --- a/include/hw/ppc/spapr.h
> +++ b/include/hw/ppc/spapr.h
> @@ -214,6 +214,8 @@ struct SpaprMachineState {
>      SpaprCapabilities def, eff, mig;
>  
>      unsigned gpu_numa_id;
> +
> +    Error *fwnmi_migration_blocker;
>  };
>  
>  #define H_SUCCESS         0
> 
> 


Re: [Qemu-devel] [Qemu-ppc] [PATCH v9 6/6] migration: Include migration support for machine check handling
Posted by Aravinda Prasad 6 years, 5 months ago

On Monday 03 June 2019 09:10 PM, Greg Kurz wrote:
> On Wed, 29 May 2019 11:10:57 +0530
> Aravinda Prasad <aravinda@linux.vnet.ibm.com> wrote:
> 
>> This patch includes migration support for machine check
>> handling. Especially this patch blocks VM migration
>> requests until the machine check error handling is
>> complete as (i) these errors are specific to the source
>> hardware and is irrelevant on the target hardware,
>> (ii) these errors cause data corruption and should
>> be handled before migration.
>>
>> Signed-off-by: Aravinda Prasad <aravinda@linux.vnet.ibm.com>
>> ---
> 
> LGTM, just one issue: machine reset should del and free the blocker as well,
> otherwise QEMU would crash if spapr_mce_req_event() is called again.

Sure.


> 
>>  hw/ppc/spapr.c         |   20 ++++++++++++++++++++
>>  hw/ppc/spapr_events.c  |   17 +++++++++++++++++
>>  hw/ppc/spapr_rtas.c    |    4 ++++
>>  include/hw/ppc/spapr.h |    2 ++
>>  4 files changed, 43 insertions(+)
>>
>> diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
>> index e8a77636..31c4850 100644
>> --- a/hw/ppc/spapr.c
>> +++ b/hw/ppc/spapr.c
>> @@ -2104,6 +2104,25 @@ static const VMStateDescription vmstate_spapr_dtb = {
>>      },
>>  };
>>  
>> +static bool spapr_fwnmi_needed(void *opaque)
>> +{
>> +    SpaprMachineState *spapr = (SpaprMachineState *)opaque;
>> +
>> +    return (spapr->guest_machine_check_addr == -1) ? 0 : 1;
>> +}
>> +
>> +static const VMStateDescription vmstate_spapr_machine_check = {
>> +    .name = "spapr_machine_check",
>> +    .version_id = 1,
>> +    .minimum_version_id = 1,
>> +    .needed = spapr_fwnmi_needed,
>> +    .fields = (VMStateField[]) {
>> +        VMSTATE_UINT64(guest_machine_check_addr, SpaprMachineState),
>> +        VMSTATE_INT32(mc_status, SpaprMachineState),
>> +        VMSTATE_END_OF_LIST()
>> +    },
>> +};
>> +
>>  static const VMStateDescription vmstate_spapr = {
>>      .name = "spapr",
>>      .version_id = 3,
>> @@ -2137,6 +2156,7 @@ static const VMStateDescription vmstate_spapr = {
>>          &vmstate_spapr_dtb,
>>          &vmstate_spapr_cap_large_decr,
>>          &vmstate_spapr_cap_ccf_assist,
>> +        &vmstate_spapr_machine_check,
>>          NULL
>>      }
>>  };
>> diff --git a/hw/ppc/spapr_events.c b/hw/ppc/spapr_events.c
>> index 573c0b7..35e21e4 100644
>> --- a/hw/ppc/spapr_events.c
>> +++ b/hw/ppc/spapr_events.c
>> @@ -41,6 +41,7 @@
>>  #include "qemu/bcd.h"
>>  #include "hw/ppc/spapr_ovec.h"
>>  #include <libfdt.h>
>> +#include "migration/blocker.h"
>>  
>>  #define RTAS_LOG_VERSION_MASK                   0xff000000
>>  #define   RTAS_LOG_VERSION_6                    0x06000000
>> @@ -855,6 +856,22 @@ static void spapr_mce_dispatch_elog(PowerPCCPU *cpu, bool recovered)
>>  void spapr_mce_req_event(PowerPCCPU *cpu, bool recovered)
>>  {
>>      SpaprMachineState *spapr = SPAPR_MACHINE(qdev_get_machine());
>> +    int ret;
>> +    Error *local_err = NULL;
>> +
>> +    error_setg(&spapr->fwnmi_migration_blocker,
>> +            "Live migration not supported during machine check handling");
>> +    ret = migrate_add_blocker(spapr->fwnmi_migration_blocker, &local_err);
>> +    if (ret < 0) {
>> +        /*
>> +         * We don't want to abort and let the migration to continue. In a
>> +         * rare case, the machine check handler will run on the target
>> +         * hardware. Though this is not preferable, it is better than aborting
>> +         * the migration or killing the VM.
>> +         */
>> +        error_free(spapr->fwnmi_migration_blocker);
>> +        warn_report_err(local_err);
>> +    }
>>  
>>      while (spapr->mc_status != -1) {
>>          /*
>> diff --git a/hw/ppc/spapr_rtas.c b/hw/ppc/spapr_rtas.c
>> index 91a7ab9..c849223 100644
>> --- a/hw/ppc/spapr_rtas.c
>> +++ b/hw/ppc/spapr_rtas.c
>> @@ -50,6 +50,7 @@
>>  #include "target/ppc/mmu-hash64.h"
>>  #include "target/ppc/mmu-book3s-v3.h"
>>  #include "kvm_ppc.h"
>> +#include "migration/blocker.h"
>>  
>>  static void rtas_display_character(PowerPCCPU *cpu, SpaprMachineState *spapr,
>>                                     uint32_t token, uint32_t nargs,
>> @@ -404,6 +405,9 @@ static void rtas_ibm_nmi_interlock(PowerPCCPU *cpu,
>>          spapr->mc_status = -1;
>>          qemu_cond_signal(&spapr->mc_delivery_cond);
>>          rtas_st(rets, 0, RTAS_OUT_SUCCESS);
>> +        migrate_del_blocker(spapr->fwnmi_migration_blocker);
>> +        error_free(spapr->fwnmi_migration_blocker);
>> +        spapr->fwnmi_migration_blocker = NULL;
>>      }
>>  }
>>  
>> diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h
>> index bd75d4b..6c0cfd8 100644
>> --- a/include/hw/ppc/spapr.h
>> +++ b/include/hw/ppc/spapr.h
>> @@ -214,6 +214,8 @@ struct SpaprMachineState {
>>      SpaprCapabilities def, eff, mig;
>>  
>>      unsigned gpu_numa_id;
>> +
>> +    Error *fwnmi_migration_blocker;
>>  };
>>  
>>  #define H_SUCCESS         0
>>
>>
> 

-- 
Regards,
Aravinda


Re: [Qemu-devel] [Qemu-ppc] [PATCH v9 6/6] migration: Include migration support for machine check handling
Posted by Greg Kurz 6 years, 5 months ago
On Tue, 4 Jun 2019 12:34:37 +0530
Aravinda Prasad <aravinda@linux.vnet.ibm.com> wrote:

> On Monday 03 June 2019 09:10 PM, Greg Kurz wrote:
> > On Wed, 29 May 2019 11:10:57 +0530
> > Aravinda Prasad <aravinda@linux.vnet.ibm.com> wrote:
> >   
> >> This patch includes migration support for machine check
> >> handling. Especially this patch blocks VM migration
> >> requests until the machine check error handling is
> >> complete as (i) these errors are specific to the source
> >> hardware and is irrelevant on the target hardware,
> >> (ii) these errors cause data corruption and should
> >> be handled before migration.
> >>
> >> Signed-off-by: Aravinda Prasad <aravinda@linux.vnet.ibm.com>
> >> ---  
> > 
> > LGTM, just one issue: machine reset should del and free the blocker as well,
> > otherwise QEMU would crash if spapr_mce_req_event() is called again.  
> 
> Sure.
> 
> 
> >   
> >>  hw/ppc/spapr.c         |   20 ++++++++++++++++++++
> >>  hw/ppc/spapr_events.c  |   17 +++++++++++++++++
> >>  hw/ppc/spapr_rtas.c    |    4 ++++
> >>  include/hw/ppc/spapr.h |    2 ++
> >>  4 files changed, 43 insertions(+)
> >>
> >> diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
> >> index e8a77636..31c4850 100644
> >> --- a/hw/ppc/spapr.c
> >> +++ b/hw/ppc/spapr.c
> >> @@ -2104,6 +2104,25 @@ static const VMStateDescription vmstate_spapr_dtb = {
> >>      },
> >>  };
> >>  
> >> +static bool spapr_fwnmi_needed(void *opaque)
> >> +{
> >> +    SpaprMachineState *spapr = (SpaprMachineState *)opaque;
> >> +
> >> +    return (spapr->guest_machine_check_addr == -1) ? 0 : 1;

And also you can drop the parens since == as precedence over ?:

> >> +}
> >> +
> >> +static const VMStateDescription vmstate_spapr_machine_check = {
> >> +    .name = "spapr_machine_check",
> >> +    .version_id = 1,
> >> +    .minimum_version_id = 1,
> >> +    .needed = spapr_fwnmi_needed,
> >> +    .fields = (VMStateField[]) {
> >> +        VMSTATE_UINT64(guest_machine_check_addr, SpaprMachineState),
> >> +        VMSTATE_INT32(mc_status, SpaprMachineState),
> >> +        VMSTATE_END_OF_LIST()
> >> +    },
> >> +};
> >> +
> >>  static const VMStateDescription vmstate_spapr = {
> >>      .name = "spapr",
> >>      .version_id = 3,
> >> @@ -2137,6 +2156,7 @@ static const VMStateDescription vmstate_spapr = {
> >>          &vmstate_spapr_dtb,
> >>          &vmstate_spapr_cap_large_decr,
> >>          &vmstate_spapr_cap_ccf_assist,
> >> +        &vmstate_spapr_machine_check,
> >>          NULL
> >>      }
> >>  };
> >> diff --git a/hw/ppc/spapr_events.c b/hw/ppc/spapr_events.c
> >> index 573c0b7..35e21e4 100644
> >> --- a/hw/ppc/spapr_events.c
> >> +++ b/hw/ppc/spapr_events.c
> >> @@ -41,6 +41,7 @@
> >>  #include "qemu/bcd.h"
> >>  #include "hw/ppc/spapr_ovec.h"
> >>  #include <libfdt.h>
> >> +#include "migration/blocker.h"
> >>  
> >>  #define RTAS_LOG_VERSION_MASK                   0xff000000
> >>  #define   RTAS_LOG_VERSION_6                    0x06000000
> >> @@ -855,6 +856,22 @@ static void spapr_mce_dispatch_elog(PowerPCCPU *cpu, bool recovered)
> >>  void spapr_mce_req_event(PowerPCCPU *cpu, bool recovered)
> >>  {
> >>      SpaprMachineState *spapr = SPAPR_MACHINE(qdev_get_machine());
> >> +    int ret;
> >> +    Error *local_err = NULL;
> >> +
> >> +    error_setg(&spapr->fwnmi_migration_blocker,
> >> +            "Live migration not supported during machine check handling");
> >> +    ret = migrate_add_blocker(spapr->fwnmi_migration_blocker, &local_err);
> >> +    if (ret < 0) {
> >> +        /*
> >> +         * We don't want to abort and let the migration to continue. In a
> >> +         * rare case, the machine check handler will run on the target
> >> +         * hardware. Though this is not preferable, it is better than aborting
> >> +         * the migration or killing the VM.
> >> +         */
> >> +        error_free(spapr->fwnmi_migration_blocker);
> >> +        warn_report_err(local_err);
> >> +    }
> >>  
> >>      while (spapr->mc_status != -1) {
> >>          /*
> >> diff --git a/hw/ppc/spapr_rtas.c b/hw/ppc/spapr_rtas.c
> >> index 91a7ab9..c849223 100644
> >> --- a/hw/ppc/spapr_rtas.c
> >> +++ b/hw/ppc/spapr_rtas.c
> >> @@ -50,6 +50,7 @@
> >>  #include "target/ppc/mmu-hash64.h"
> >>  #include "target/ppc/mmu-book3s-v3.h"
> >>  #include "kvm_ppc.h"
> >> +#include "migration/blocker.h"
> >>  
> >>  static void rtas_display_character(PowerPCCPU *cpu, SpaprMachineState *spapr,
> >>                                     uint32_t token, uint32_t nargs,
> >> @@ -404,6 +405,9 @@ static void rtas_ibm_nmi_interlock(PowerPCCPU *cpu,
> >>          spapr->mc_status = -1;
> >>          qemu_cond_signal(&spapr->mc_delivery_cond);
> >>          rtas_st(rets, 0, RTAS_OUT_SUCCESS);
> >> +        migrate_del_blocker(spapr->fwnmi_migration_blocker);
> >> +        error_free(spapr->fwnmi_migration_blocker);
> >> +        spapr->fwnmi_migration_blocker = NULL;
> >>      }
> >>  }
> >>  
> >> diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h
> >> index bd75d4b..6c0cfd8 100644
> >> --- a/include/hw/ppc/spapr.h
> >> +++ b/include/hw/ppc/spapr.h
> >> @@ -214,6 +214,8 @@ struct SpaprMachineState {
> >>      SpaprCapabilities def, eff, mig;
> >>  
> >>      unsigned gpu_numa_id;
> >> +
> >> +    Error *fwnmi_migration_blocker;
> >>  };
> >>  
> >>  #define H_SUCCESS         0
> >>
> >>  
> >   
> 


Re: [Qemu-devel] [Qemu-ppc] [PATCH v9 6/6] migration: Include migration support for machine check handling
Posted by Greg Kurz 6 years, 5 months ago
On Tue, 4 Jun 2019 22:04:21 +0200
Greg Kurz <groug@kaod.org> wrote:

> On Tue, 4 Jun 2019 12:34:37 +0530
> Aravinda Prasad <aravinda@linux.vnet.ibm.com> wrote:
> 
> > On Monday 03 June 2019 09:10 PM, Greg Kurz wrote:  
> > > On Wed, 29 May 2019 11:10:57 +0530
> > > Aravinda Prasad <aravinda@linux.vnet.ibm.com> wrote:
> > >     
> > >> This patch includes migration support for machine check
> > >> handling. Especially this patch blocks VM migration
> > >> requests until the machine check error handling is
> > >> complete as (i) these errors are specific to the source
> > >> hardware and is irrelevant on the target hardware,
> > >> (ii) these errors cause data corruption and should
> > >> be handled before migration.
> > >>
> > >> Signed-off-by: Aravinda Prasad <aravinda@linux.vnet.ibm.com>
> > >> ---    
> > > 
> > > LGTM, just one issue: machine reset should del and free the blocker as well,
> > > otherwise QEMU would crash if spapr_mce_req_event() is called again.    
> > 
> > Sure.
> > 
> >   
> > >     
> > >>  hw/ppc/spapr.c         |   20 ++++++++++++++++++++
> > >>  hw/ppc/spapr_events.c  |   17 +++++++++++++++++
> > >>  hw/ppc/spapr_rtas.c    |    4 ++++
> > >>  include/hw/ppc/spapr.h |    2 ++
> > >>  4 files changed, 43 insertions(+)
> > >>
> > >> diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
> > >> index e8a77636..31c4850 100644
> > >> --- a/hw/ppc/spapr.c
> > >> +++ b/hw/ppc/spapr.c
> > >> @@ -2104,6 +2104,25 @@ static const VMStateDescription vmstate_spapr_dtb = {
> > >>      },
> > >>  };
> > >>  
> > >> +static bool spapr_fwnmi_needed(void *opaque)
> > >> +{
> > >> +    SpaprMachineState *spapr = (SpaprMachineState *)opaque;
> > >> +
> > >> +    return (spapr->guest_machine_check_addr == -1) ? 0 : 1;  
> 
> And also you can drop the parens since == as precedence over ?:
> 

... or even better make it spapr->guest_machine_check_addr != -1 :)

> > >> +}
> > >> +
> > >> +static const VMStateDescription vmstate_spapr_machine_check = {
> > >> +    .name = "spapr_machine_check",
> > >> +    .version_id = 1,
> > >> +    .minimum_version_id = 1,
> > >> +    .needed = spapr_fwnmi_needed,
> > >> +    .fields = (VMStateField[]) {
> > >> +        VMSTATE_UINT64(guest_machine_check_addr, SpaprMachineState),
> > >> +        VMSTATE_INT32(mc_status, SpaprMachineState),
> > >> +        VMSTATE_END_OF_LIST()
> > >> +    },
> > >> +};
> > >> +
> > >>  static const VMStateDescription vmstate_spapr = {
> > >>      .name = "spapr",
> > >>      .version_id = 3,
> > >> @@ -2137,6 +2156,7 @@ static const VMStateDescription vmstate_spapr = {
> > >>          &vmstate_spapr_dtb,
> > >>          &vmstate_spapr_cap_large_decr,
> > >>          &vmstate_spapr_cap_ccf_assist,
> > >> +        &vmstate_spapr_machine_check,
> > >>          NULL
> > >>      }
> > >>  };
> > >> diff --git a/hw/ppc/spapr_events.c b/hw/ppc/spapr_events.c
> > >> index 573c0b7..35e21e4 100644
> > >> --- a/hw/ppc/spapr_events.c
> > >> +++ b/hw/ppc/spapr_events.c
> > >> @@ -41,6 +41,7 @@
> > >>  #include "qemu/bcd.h"
> > >>  #include "hw/ppc/spapr_ovec.h"
> > >>  #include <libfdt.h>
> > >> +#include "migration/blocker.h"
> > >>  
> > >>  #define RTAS_LOG_VERSION_MASK                   0xff000000
> > >>  #define   RTAS_LOG_VERSION_6                    0x06000000
> > >> @@ -855,6 +856,22 @@ static void spapr_mce_dispatch_elog(PowerPCCPU *cpu, bool recovered)
> > >>  void spapr_mce_req_event(PowerPCCPU *cpu, bool recovered)
> > >>  {
> > >>      SpaprMachineState *spapr = SPAPR_MACHINE(qdev_get_machine());
> > >> +    int ret;
> > >> +    Error *local_err = NULL;
> > >> +
> > >> +    error_setg(&spapr->fwnmi_migration_blocker,
> > >> +            "Live migration not supported during machine check handling");
> > >> +    ret = migrate_add_blocker(spapr->fwnmi_migration_blocker, &local_err);
> > >> +    if (ret < 0) {
> > >> +        /*
> > >> +         * We don't want to abort and let the migration to continue. In a
> > >> +         * rare case, the machine check handler will run on the target
> > >> +         * hardware. Though this is not preferable, it is better than aborting
> > >> +         * the migration or killing the VM.
> > >> +         */
> > >> +        error_free(spapr->fwnmi_migration_blocker);
> > >> +        warn_report_err(local_err);
> > >> +    }
> > >>  
> > >>      while (spapr->mc_status != -1) {
> > >>          /*
> > >> diff --git a/hw/ppc/spapr_rtas.c b/hw/ppc/spapr_rtas.c
> > >> index 91a7ab9..c849223 100644
> > >> --- a/hw/ppc/spapr_rtas.c
> > >> +++ b/hw/ppc/spapr_rtas.c
> > >> @@ -50,6 +50,7 @@
> > >>  #include "target/ppc/mmu-hash64.h"
> > >>  #include "target/ppc/mmu-book3s-v3.h"
> > >>  #include "kvm_ppc.h"
> > >> +#include "migration/blocker.h"
> > >>  
> > >>  static void rtas_display_character(PowerPCCPU *cpu, SpaprMachineState *spapr,
> > >>                                     uint32_t token, uint32_t nargs,
> > >> @@ -404,6 +405,9 @@ static void rtas_ibm_nmi_interlock(PowerPCCPU *cpu,
> > >>          spapr->mc_status = -1;
> > >>          qemu_cond_signal(&spapr->mc_delivery_cond);
> > >>          rtas_st(rets, 0, RTAS_OUT_SUCCESS);
> > >> +        migrate_del_blocker(spapr->fwnmi_migration_blocker);
> > >> +        error_free(spapr->fwnmi_migration_blocker);
> > >> +        spapr->fwnmi_migration_blocker = NULL;
> > >>      }
> > >>  }
> > >>  
> > >> diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h
> > >> index bd75d4b..6c0cfd8 100644
> > >> --- a/include/hw/ppc/spapr.h
> > >> +++ b/include/hw/ppc/spapr.h
> > >> @@ -214,6 +214,8 @@ struct SpaprMachineState {
> > >>      SpaprCapabilities def, eff, mig;
> > >>  
> > >>      unsigned gpu_numa_id;
> > >> +
> > >> +    Error *fwnmi_migration_blocker;
> > >>  };
> > >>  
> > >>  #define H_SUCCESS         0
> > >>
> > >>    
> > >     
> >   
> 
> 


Re: [Qemu-devel] [PATCH v9 6/6] migration: Include migration support for machine check handling
Posted by David Gibson 6 years, 5 months ago
On Wed, May 29, 2019 at 11:10:57AM +0530, Aravinda Prasad wrote:
> This patch includes migration support for machine check
> handling. Especially this patch blocks VM migration
> requests until the machine check error handling is
> complete as (i) these errors are specific to the source
> hardware and is irrelevant on the target hardware,
> (ii) these errors cause data corruption and should
> be handled before migration.
> 
> Signed-off-by: Aravinda Prasad <aravinda@linux.vnet.ibm.com>
> ---
>  hw/ppc/spapr.c         |   20 ++++++++++++++++++++
>  hw/ppc/spapr_events.c  |   17 +++++++++++++++++
>  hw/ppc/spapr_rtas.c    |    4 ++++
>  include/hw/ppc/spapr.h |    2 ++
>  4 files changed, 43 insertions(+)
> 
> diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
> index e8a77636..31c4850 100644
> --- a/hw/ppc/spapr.c
> +++ b/hw/ppc/spapr.c
> @@ -2104,6 +2104,25 @@ static const VMStateDescription vmstate_spapr_dtb = {
>      },
>  };
>  
> +static bool spapr_fwnmi_needed(void *opaque)
> +{
> +    SpaprMachineState *spapr = (SpaprMachineState *)opaque;
> +
> +    return (spapr->guest_machine_check_addr == -1) ? 0 : 1;

Since we're introducing a PAPR capability to enable this, it would
actually be better to check that here, rather than the runtime state.
That leads to less cases and easier to understand semantics for the
migration stream.

> +}
> +
> +static const VMStateDescription vmstate_spapr_machine_check = {
> +    .name = "spapr_machine_check",
> +    .version_id = 1,
> +    .minimum_version_id = 1,
> +    .needed = spapr_fwnmi_needed,
> +    .fields = (VMStateField[]) {
> +        VMSTATE_UINT64(guest_machine_check_addr, SpaprMachineState),
> +        VMSTATE_INT32(mc_status, SpaprMachineState),
> +        VMSTATE_END_OF_LIST()
> +    },
> +};
> +
>  static const VMStateDescription vmstate_spapr = {
>      .name = "spapr",
>      .version_id = 3,
> @@ -2137,6 +2156,7 @@ static const VMStateDescription vmstate_spapr = {
>          &vmstate_spapr_dtb,
>          &vmstate_spapr_cap_large_decr,
>          &vmstate_spapr_cap_ccf_assist,
> +        &vmstate_spapr_machine_check,
>          NULL
>      }
>  };
> diff --git a/hw/ppc/spapr_events.c b/hw/ppc/spapr_events.c
> index 573c0b7..35e21e4 100644
> --- a/hw/ppc/spapr_events.c
> +++ b/hw/ppc/spapr_events.c
> @@ -41,6 +41,7 @@
>  #include "qemu/bcd.h"
>  #include "hw/ppc/spapr_ovec.h"
>  #include <libfdt.h>
> +#include "migration/blocker.h"
>  
>  #define RTAS_LOG_VERSION_MASK                   0xff000000
>  #define   RTAS_LOG_VERSION_6                    0x06000000
> @@ -855,6 +856,22 @@ static void spapr_mce_dispatch_elog(PowerPCCPU *cpu, bool recovered)
>  void spapr_mce_req_event(PowerPCCPU *cpu, bool recovered)
>  {
>      SpaprMachineState *spapr = SPAPR_MACHINE(qdev_get_machine());
> +    int ret;
> +    Error *local_err = NULL;
> +
> +    error_setg(&spapr->fwnmi_migration_blocker,
> +            "Live migration not supported during machine check handling");
> +    ret = migrate_add_blocker(spapr->fwnmi_migration_blocker, &local_err);
> +    if (ret < 0) {
> +        /*
> +         * We don't want to abort and let the migration to continue. In a
> +         * rare case, the machine check handler will run on the target
> +         * hardware. Though this is not preferable, it is better than aborting
> +         * the migration or killing the VM.
> +         */
> +        error_free(spapr->fwnmi_migration_blocker);

You should set fwnmi_migration_blocker to NULL here as well.

As mentioned on an earlier iteration, the migration blocker is the
same every time.  Couldn't you just create it once and free at final
teardown, rather than recreating it for every NMI?

> +        warn_report_err(local_err);
> +    }
>  
>      while (spapr->mc_status != -1) {
>          /*
> diff --git a/hw/ppc/spapr_rtas.c b/hw/ppc/spapr_rtas.c
> index 91a7ab9..c849223 100644
> --- a/hw/ppc/spapr_rtas.c
> +++ b/hw/ppc/spapr_rtas.c
> @@ -50,6 +50,7 @@
>  #include "target/ppc/mmu-hash64.h"
>  #include "target/ppc/mmu-book3s-v3.h"
>  #include "kvm_ppc.h"
> +#include "migration/blocker.h"
>  
>  static void rtas_display_character(PowerPCCPU *cpu, SpaprMachineState *spapr,
>                                     uint32_t token, uint32_t nargs,
> @@ -404,6 +405,9 @@ static void rtas_ibm_nmi_interlock(PowerPCCPU *cpu,
>          spapr->mc_status = -1;
>          qemu_cond_signal(&spapr->mc_delivery_cond);
>          rtas_st(rets, 0, RTAS_OUT_SUCCESS);
> +        migrate_del_blocker(spapr->fwnmi_migration_blocker);
> +        error_free(spapr->fwnmi_migration_blocker);
> +        spapr->fwnmi_migration_blocker = NULL;
>      }
>  }
>  
> diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h
> index bd75d4b..6c0cfd8 100644
> --- a/include/hw/ppc/spapr.h
> +++ b/include/hw/ppc/spapr.h
> @@ -214,6 +214,8 @@ struct SpaprMachineState {
>      SpaprCapabilities def, eff, mig;
>  
>      unsigned gpu_numa_id;
> +
> +    Error *fwnmi_migration_blocker;
>  };
>  
>  #define H_SUCCESS         0
> 

-- 
David Gibson			| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au	| minimalist, thank you.  NOT _the_ _other_
				| _way_ _around_!
http://www.ozlabs.org/~dgibson
Re: [Qemu-devel] [PATCH v9 6/6] migration: Include migration support for machine check handling
Posted by Greg Kurz 6 years, 5 months ago
On Thu, 6 Jun 2019 13:06:14 +1000
David Gibson <david@gibson.dropbear.id.au> wrote:

> On Wed, May 29, 2019 at 11:10:57AM +0530, Aravinda Prasad wrote:
> > This patch includes migration support for machine check
> > handling. Especially this patch blocks VM migration
> > requests until the machine check error handling is
> > complete as (i) these errors are specific to the source
> > hardware and is irrelevant on the target hardware,
> > (ii) these errors cause data corruption and should
> > be handled before migration.
> > 
> > Signed-off-by: Aravinda Prasad <aravinda@linux.vnet.ibm.com>
> > ---
> >  hw/ppc/spapr.c         |   20 ++++++++++++++++++++
> >  hw/ppc/spapr_events.c  |   17 +++++++++++++++++
> >  hw/ppc/spapr_rtas.c    |    4 ++++
> >  include/hw/ppc/spapr.h |    2 ++
> >  4 files changed, 43 insertions(+)
> > 
> > diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
> > index e8a77636..31c4850 100644
> > --- a/hw/ppc/spapr.c
> > +++ b/hw/ppc/spapr.c
> > @@ -2104,6 +2104,25 @@ static const VMStateDescription vmstate_spapr_dtb = {
> >      },
> >  };
> >  
> > +static bool spapr_fwnmi_needed(void *opaque)
> > +{
> > +    SpaprMachineState *spapr = (SpaprMachineState *)opaque;
> > +
> > +    return (spapr->guest_machine_check_addr == -1) ? 0 : 1;  
> 
> Since we're introducing a PAPR capability to enable this, it would
> actually be better to check that here, rather than the runtime state.
> That leads to less cases and easier to understand semantics for the
> migration stream.
> 

Hmmm... the purpose of needed() VMState callbacks is precisely about
runtime state: the subsection should only be migrated if an MCE is
pending, ie. spapr->guest_machine_check_addr != -1.

> > +}
> > +
> > +static const VMStateDescription vmstate_spapr_machine_check = {
> > +    .name = "spapr_machine_check",
> > +    .version_id = 1,
> > +    .minimum_version_id = 1,
> > +    .needed = spapr_fwnmi_needed,
> > +    .fields = (VMStateField[]) {
> > +        VMSTATE_UINT64(guest_machine_check_addr, SpaprMachineState),
> > +        VMSTATE_INT32(mc_status, SpaprMachineState),
> > +        VMSTATE_END_OF_LIST()
> > +    },
> > +};
> > +
> >  static const VMStateDescription vmstate_spapr = {
> >      .name = "spapr",
> >      .version_id = 3,
> > @@ -2137,6 +2156,7 @@ static const VMStateDescription vmstate_spapr = {
> >          &vmstate_spapr_dtb,
> >          &vmstate_spapr_cap_large_decr,
> >          &vmstate_spapr_cap_ccf_assist,
> > +        &vmstate_spapr_machine_check,
> >          NULL
> >      }
> >  };
> > diff --git a/hw/ppc/spapr_events.c b/hw/ppc/spapr_events.c
> > index 573c0b7..35e21e4 100644
> > --- a/hw/ppc/spapr_events.c
> > +++ b/hw/ppc/spapr_events.c
> > @@ -41,6 +41,7 @@
> >  #include "qemu/bcd.h"
> >  #include "hw/ppc/spapr_ovec.h"
> >  #include <libfdt.h>
> > +#include "migration/blocker.h"
> >  
> >  #define RTAS_LOG_VERSION_MASK                   0xff000000
> >  #define   RTAS_LOG_VERSION_6                    0x06000000
> > @@ -855,6 +856,22 @@ static void spapr_mce_dispatch_elog(PowerPCCPU *cpu, bool recovered)
> >  void spapr_mce_req_event(PowerPCCPU *cpu, bool recovered)
> >  {
> >      SpaprMachineState *spapr = SPAPR_MACHINE(qdev_get_machine());
> > +    int ret;
> > +    Error *local_err = NULL;
> > +
> > +    error_setg(&spapr->fwnmi_migration_blocker,
> > +            "Live migration not supported during machine check handling");
> > +    ret = migrate_add_blocker(spapr->fwnmi_migration_blocker, &local_err);
> > +    if (ret < 0) {
> > +        /*
> > +         * We don't want to abort and let the migration to continue. In a
> > +         * rare case, the machine check handler will run on the target
> > +         * hardware. Though this is not preferable, it is better than aborting
> > +         * the migration or killing the VM.
> > +         */
> > +        error_free(spapr->fwnmi_migration_blocker);  
> 
> You should set fwnmi_migration_blocker to NULL here as well.
> 
> As mentioned on an earlier iteration, the migration blocker is the
> same every time.  Couldn't you just create it once and free at final
> teardown, rather than recreating it for every NMI?
> 
> > +        warn_report_err(local_err);
> > +    }
> >  
> >      while (spapr->mc_status != -1) {
> >          /*
> > diff --git a/hw/ppc/spapr_rtas.c b/hw/ppc/spapr_rtas.c
> > index 91a7ab9..c849223 100644
> > --- a/hw/ppc/spapr_rtas.c
> > +++ b/hw/ppc/spapr_rtas.c
> > @@ -50,6 +50,7 @@
> >  #include "target/ppc/mmu-hash64.h"
> >  #include "target/ppc/mmu-book3s-v3.h"
> >  #include "kvm_ppc.h"
> > +#include "migration/blocker.h"
> >  
> >  static void rtas_display_character(PowerPCCPU *cpu, SpaprMachineState *spapr,
> >                                     uint32_t token, uint32_t nargs,
> > @@ -404,6 +405,9 @@ static void rtas_ibm_nmi_interlock(PowerPCCPU *cpu,
> >          spapr->mc_status = -1;
> >          qemu_cond_signal(&spapr->mc_delivery_cond);
> >          rtas_st(rets, 0, RTAS_OUT_SUCCESS);
> > +        migrate_del_blocker(spapr->fwnmi_migration_blocker);
> > +        error_free(spapr->fwnmi_migration_blocker);
> > +        spapr->fwnmi_migration_blocker = NULL;
> >      }
> >  }
> >  
> > diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h
> > index bd75d4b..6c0cfd8 100644
> > --- a/include/hw/ppc/spapr.h
> > +++ b/include/hw/ppc/spapr.h
> > @@ -214,6 +214,8 @@ struct SpaprMachineState {
> >      SpaprCapabilities def, eff, mig;
> >  
> >      unsigned gpu_numa_id;
> > +
> > +    Error *fwnmi_migration_blocker;
> >  };
> >  
> >  #define H_SUCCESS         0
> >   
> 

Re: [Qemu-devel] [PATCH v9 6/6] migration: Include migration support for machine check handling
Posted by Aravinda Prasad 6 years, 5 months ago

On Thursday 06 June 2019 11:36 AM, Greg Kurz wrote:
> On Thu, 6 Jun 2019 13:06:14 +1000
> David Gibson <david@gibson.dropbear.id.au> wrote:
> 
>> On Wed, May 29, 2019 at 11:10:57AM +0530, Aravinda Prasad wrote:
>>> This patch includes migration support for machine check
>>> handling. Especially this patch blocks VM migration
>>> requests until the machine check error handling is
>>> complete as (i) these errors are specific to the source
>>> hardware and is irrelevant on the target hardware,
>>> (ii) these errors cause data corruption and should
>>> be handled before migration.
>>>
>>> Signed-off-by: Aravinda Prasad <aravinda@linux.vnet.ibm.com>
>>> ---
>>>  hw/ppc/spapr.c         |   20 ++++++++++++++++++++
>>>  hw/ppc/spapr_events.c  |   17 +++++++++++++++++
>>>  hw/ppc/spapr_rtas.c    |    4 ++++
>>>  include/hw/ppc/spapr.h |    2 ++
>>>  4 files changed, 43 insertions(+)
>>>
>>> diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
>>> index e8a77636..31c4850 100644
>>> --- a/hw/ppc/spapr.c
>>> +++ b/hw/ppc/spapr.c
>>> @@ -2104,6 +2104,25 @@ static const VMStateDescription vmstate_spapr_dtb = {
>>>      },
>>>  };
>>>  
>>> +static bool spapr_fwnmi_needed(void *opaque)
>>> +{
>>> +    SpaprMachineState *spapr = (SpaprMachineState *)opaque;
>>> +
>>> +    return (spapr->guest_machine_check_addr == -1) ? 0 : 1;  
>>
>> Since we're introducing a PAPR capability to enable this, it would
>> actually be better to check that here, rather than the runtime state.
>> That leads to less cases and easier to understand semantics for the
>> migration stream.
>>
> 
> Hmmm... the purpose of needed() VMState callbacks is precisely about
> runtime state: the subsection should only be migrated if an MCE is
> pending, ie. spapr->guest_machine_check_addr != -1.

spapr->guest_machine_check_addr is set when fwnmi is registered. Hence
an MCE might not be pending if it is set.

I am fine with both the approaches (checking for
guest_machine_check_addr or for SPAPR_CAP_FWNMI_MCE).

Regards,
Aravinda

> 
>>> +}
>>> +
>>> +static const VMStateDescription vmstate_spapr_machine_check = {
>>> +    .name = "spapr_machine_check",
>>> +    .version_id = 1,
>>> +    .minimum_version_id = 1,
>>> +    .needed = spapr_fwnmi_needed,
>>> +    .fields = (VMStateField[]) {
>>> +        VMSTATE_UINT64(guest_machine_check_addr, SpaprMachineState),
>>> +        VMSTATE_INT32(mc_status, SpaprMachineState),
>>> +        VMSTATE_END_OF_LIST()
>>> +    },
>>> +};
>>> +
>>>  static const VMStateDescription vmstate_spapr = {
>>>      .name = "spapr",
>>>      .version_id = 3,
>>> @@ -2137,6 +2156,7 @@ static const VMStateDescription vmstate_spapr = {
>>>          &vmstate_spapr_dtb,
>>>          &vmstate_spapr_cap_large_decr,
>>>          &vmstate_spapr_cap_ccf_assist,
>>> +        &vmstate_spapr_machine_check,
>>>          NULL
>>>      }
>>>  };
>>> diff --git a/hw/ppc/spapr_events.c b/hw/ppc/spapr_events.c
>>> index 573c0b7..35e21e4 100644
>>> --- a/hw/ppc/spapr_events.c
>>> +++ b/hw/ppc/spapr_events.c
>>> @@ -41,6 +41,7 @@
>>>  #include "qemu/bcd.h"
>>>  #include "hw/ppc/spapr_ovec.h"
>>>  #include <libfdt.h>
>>> +#include "migration/blocker.h"
>>>  
>>>  #define RTAS_LOG_VERSION_MASK                   0xff000000
>>>  #define   RTAS_LOG_VERSION_6                    0x06000000
>>> @@ -855,6 +856,22 @@ static void spapr_mce_dispatch_elog(PowerPCCPU *cpu, bool recovered)
>>>  void spapr_mce_req_event(PowerPCCPU *cpu, bool recovered)
>>>  {
>>>      SpaprMachineState *spapr = SPAPR_MACHINE(qdev_get_machine());
>>> +    int ret;
>>> +    Error *local_err = NULL;
>>> +
>>> +    error_setg(&spapr->fwnmi_migration_blocker,
>>> +            "Live migration not supported during machine check handling");
>>> +    ret = migrate_add_blocker(spapr->fwnmi_migration_blocker, &local_err);
>>> +    if (ret < 0) {
>>> +        /*
>>> +         * We don't want to abort and let the migration to continue. In a
>>> +         * rare case, the machine check handler will run on the target
>>> +         * hardware. Though this is not preferable, it is better than aborting
>>> +         * the migration or killing the VM.
>>> +         */
>>> +        error_free(spapr->fwnmi_migration_blocker);  
>>
>> You should set fwnmi_migration_blocker to NULL here as well.
>>
>> As mentioned on an earlier iteration, the migration blocker is the
>> same every time.  Couldn't you just create it once and free at final
>> teardown, rather than recreating it for every NMI?
>>
>>> +        warn_report_err(local_err);
>>> +    }
>>>  
>>>      while (spapr->mc_status != -1) {
>>>          /*
>>> diff --git a/hw/ppc/spapr_rtas.c b/hw/ppc/spapr_rtas.c
>>> index 91a7ab9..c849223 100644
>>> --- a/hw/ppc/spapr_rtas.c
>>> +++ b/hw/ppc/spapr_rtas.c
>>> @@ -50,6 +50,7 @@
>>>  #include "target/ppc/mmu-hash64.h"
>>>  #include "target/ppc/mmu-book3s-v3.h"
>>>  #include "kvm_ppc.h"
>>> +#include "migration/blocker.h"
>>>  
>>>  static void rtas_display_character(PowerPCCPU *cpu, SpaprMachineState *spapr,
>>>                                     uint32_t token, uint32_t nargs,
>>> @@ -404,6 +405,9 @@ static void rtas_ibm_nmi_interlock(PowerPCCPU *cpu,
>>>          spapr->mc_status = -1;
>>>          qemu_cond_signal(&spapr->mc_delivery_cond);
>>>          rtas_st(rets, 0, RTAS_OUT_SUCCESS);
>>> +        migrate_del_blocker(spapr->fwnmi_migration_blocker);
>>> +        error_free(spapr->fwnmi_migration_blocker);
>>> +        spapr->fwnmi_migration_blocker = NULL;
>>>      }
>>>  }
>>>  
>>> diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h
>>> index bd75d4b..6c0cfd8 100644
>>> --- a/include/hw/ppc/spapr.h
>>> +++ b/include/hw/ppc/spapr.h
>>> @@ -214,6 +214,8 @@ struct SpaprMachineState {
>>>      SpaprCapabilities def, eff, mig;
>>>  
>>>      unsigned gpu_numa_id;
>>> +
>>> +    Error *fwnmi_migration_blocker;
>>>  };
>>>  
>>>  #define H_SUCCESS         0
>>>   
>>
> 

-- 
Regards,
Aravinda


Re: [Qemu-devel] [PATCH v9 6/6] migration: Include migration support for machine check handling
Posted by Greg Kurz 6 years, 5 months ago
On Thu, 6 Jun 2019 16:45:30 +0530
Aravinda Prasad <aravinda@linux.vnet.ibm.com> wrote:

> On Thursday 06 June 2019 11:36 AM, Greg Kurz wrote:
> > On Thu, 6 Jun 2019 13:06:14 +1000
> > David Gibson <david@gibson.dropbear.id.au> wrote:
> >   
> >> On Wed, May 29, 2019 at 11:10:57AM +0530, Aravinda Prasad wrote:  
> >>> This patch includes migration support for machine check
> >>> handling. Especially this patch blocks VM migration
> >>> requests until the machine check error handling is
> >>> complete as (i) these errors are specific to the source
> >>> hardware and is irrelevant on the target hardware,
> >>> (ii) these errors cause data corruption and should
> >>> be handled before migration.
> >>>
> >>> Signed-off-by: Aravinda Prasad <aravinda@linux.vnet.ibm.com>
> >>> ---
> >>>  hw/ppc/spapr.c         |   20 ++++++++++++++++++++
> >>>  hw/ppc/spapr_events.c  |   17 +++++++++++++++++
> >>>  hw/ppc/spapr_rtas.c    |    4 ++++
> >>>  include/hw/ppc/spapr.h |    2 ++
> >>>  4 files changed, 43 insertions(+)
> >>>
> >>> diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
> >>> index e8a77636..31c4850 100644
> >>> --- a/hw/ppc/spapr.c
> >>> +++ b/hw/ppc/spapr.c
> >>> @@ -2104,6 +2104,25 @@ static const VMStateDescription vmstate_spapr_dtb = {
> >>>      },
> >>>  };
> >>>  
> >>> +static bool spapr_fwnmi_needed(void *opaque)
> >>> +{
> >>> +    SpaprMachineState *spapr = (SpaprMachineState *)opaque;
> >>> +
> >>> +    return (spapr->guest_machine_check_addr == -1) ? 0 : 1;    
> >>
> >> Since we're introducing a PAPR capability to enable this, it would
> >> actually be better to check that here, rather than the runtime state.
> >> That leads to less cases and easier to understand semantics for the
> >> migration stream.
> >>  
> > 
> > Hmmm... the purpose of needed() VMState callbacks is precisely about
> > runtime state: the subsection should only be migrated if an MCE is
> > pending, ie. spapr->guest_machine_check_addr != -1.  
> 
> spapr->guest_machine_check_addr is set when fwnmi is registered. Hence
> an MCE might not be pending if it is set.
> 

Oops sorry, got confused... should have written "if the guest has
registered FWNMI", but the idea is the same. We only need to migrate
the subsection if the state is different from reset. This is the way
needed() callbacks are generally implemented.

> I am fine with both the approaches (checking for
> guest_machine_check_addr or for SPAPR_CAP_FWNMI_MCE).
> 

I would find ackward to migrate this always for new machine types,
even if the guest doesn't register FWNMI...

> Regards,
> Aravinda
> 
> >   
> >>> +}
> >>> +
> >>> +static const VMStateDescription vmstate_spapr_machine_check = {
> >>> +    .name = "spapr_machine_check",
> >>> +    .version_id = 1,
> >>> +    .minimum_version_id = 1,
> >>> +    .needed = spapr_fwnmi_needed,
> >>> +    .fields = (VMStateField[]) {
> >>> +        VMSTATE_UINT64(guest_machine_check_addr, SpaprMachineState),
> >>> +        VMSTATE_INT32(mc_status, SpaprMachineState),
> >>> +        VMSTATE_END_OF_LIST()
> >>> +    },
> >>> +};
> >>> +
> >>>  static const VMStateDescription vmstate_spapr = {
> >>>      .name = "spapr",
> >>>      .version_id = 3,
> >>> @@ -2137,6 +2156,7 @@ static const VMStateDescription vmstate_spapr = {
> >>>          &vmstate_spapr_dtb,
> >>>          &vmstate_spapr_cap_large_decr,
> >>>          &vmstate_spapr_cap_ccf_assist,
> >>> +        &vmstate_spapr_machine_check,
> >>>          NULL
> >>>      }
> >>>  };
> >>> diff --git a/hw/ppc/spapr_events.c b/hw/ppc/spapr_events.c
> >>> index 573c0b7..35e21e4 100644
> >>> --- a/hw/ppc/spapr_events.c
> >>> +++ b/hw/ppc/spapr_events.c
> >>> @@ -41,6 +41,7 @@
> >>>  #include "qemu/bcd.h"
> >>>  #include "hw/ppc/spapr_ovec.h"
> >>>  #include <libfdt.h>
> >>> +#include "migration/blocker.h"
> >>>  
> >>>  #define RTAS_LOG_VERSION_MASK                   0xff000000
> >>>  #define   RTAS_LOG_VERSION_6                    0x06000000
> >>> @@ -855,6 +856,22 @@ static void spapr_mce_dispatch_elog(PowerPCCPU *cpu, bool recovered)
> >>>  void spapr_mce_req_event(PowerPCCPU *cpu, bool recovered)
> >>>  {
> >>>      SpaprMachineState *spapr = SPAPR_MACHINE(qdev_get_machine());
> >>> +    int ret;
> >>> +    Error *local_err = NULL;
> >>> +
> >>> +    error_setg(&spapr->fwnmi_migration_blocker,
> >>> +            "Live migration not supported during machine check handling");
> >>> +    ret = migrate_add_blocker(spapr->fwnmi_migration_blocker, &local_err);
> >>> +    if (ret < 0) {
> >>> +        /*
> >>> +         * We don't want to abort and let the migration to continue. In a
> >>> +         * rare case, the machine check handler will run on the target
> >>> +         * hardware. Though this is not preferable, it is better than aborting
> >>> +         * the migration or killing the VM.
> >>> +         */
> >>> +        error_free(spapr->fwnmi_migration_blocker);    
> >>
> >> You should set fwnmi_migration_blocker to NULL here as well.
> >>
> >> As mentioned on an earlier iteration, the migration blocker is the
> >> same every time.  Couldn't you just create it once and free at final
> >> teardown, rather than recreating it for every NMI?
> >>  
> >>> +        warn_report_err(local_err);
> >>> +    }
> >>>  
> >>>      while (spapr->mc_status != -1) {
> >>>          /*
> >>> diff --git a/hw/ppc/spapr_rtas.c b/hw/ppc/spapr_rtas.c
> >>> index 91a7ab9..c849223 100644
> >>> --- a/hw/ppc/spapr_rtas.c
> >>> +++ b/hw/ppc/spapr_rtas.c
> >>> @@ -50,6 +50,7 @@
> >>>  #include "target/ppc/mmu-hash64.h"
> >>>  #include "target/ppc/mmu-book3s-v3.h"
> >>>  #include "kvm_ppc.h"
> >>> +#include "migration/blocker.h"
> >>>  
> >>>  static void rtas_display_character(PowerPCCPU *cpu, SpaprMachineState *spapr,
> >>>                                     uint32_t token, uint32_t nargs,
> >>> @@ -404,6 +405,9 @@ static void rtas_ibm_nmi_interlock(PowerPCCPU *cpu,
> >>>          spapr->mc_status = -1;
> >>>          qemu_cond_signal(&spapr->mc_delivery_cond);
> >>>          rtas_st(rets, 0, RTAS_OUT_SUCCESS);
> >>> +        migrate_del_blocker(spapr->fwnmi_migration_blocker);
> >>> +        error_free(spapr->fwnmi_migration_blocker);
> >>> +        spapr->fwnmi_migration_blocker = NULL;
> >>>      }
> >>>  }
> >>>  
> >>> diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h
> >>> index bd75d4b..6c0cfd8 100644
> >>> --- a/include/hw/ppc/spapr.h
> >>> +++ b/include/hw/ppc/spapr.h
> >>> @@ -214,6 +214,8 @@ struct SpaprMachineState {
> >>>      SpaprCapabilities def, eff, mig;
> >>>  
> >>>      unsigned gpu_numa_id;
> >>> +
> >>> +    Error *fwnmi_migration_blocker;
> >>>  };
> >>>  
> >>>  #define H_SUCCESS         0
> >>>     
> >>  
> >   
> 


Re: [Qemu-devel] [PATCH v9 6/6] migration: Include migration support for machine check handling
Posted by David Gibson 6 years, 5 months ago
On Thu, Jun 06, 2019 at 02:10:48PM +0200, Greg Kurz wrote:
> On Thu, 6 Jun 2019 16:45:30 +0530
> Aravinda Prasad <aravinda@linux.vnet.ibm.com> wrote:
> 
> > On Thursday 06 June 2019 11:36 AM, Greg Kurz wrote:
> > > On Thu, 6 Jun 2019 13:06:14 +1000
> > > David Gibson <david@gibson.dropbear.id.au> wrote:
> > >   
> > >> On Wed, May 29, 2019 at 11:10:57AM +0530, Aravinda Prasad wrote:  
> > >>> This patch includes migration support for machine check
> > >>> handling. Especially this patch blocks VM migration
> > >>> requests until the machine check error handling is
> > >>> complete as (i) these errors are specific to the source
> > >>> hardware and is irrelevant on the target hardware,
> > >>> (ii) these errors cause data corruption and should
> > >>> be handled before migration.
> > >>>
> > >>> Signed-off-by: Aravinda Prasad <aravinda@linux.vnet.ibm.com>
> > >>> ---
> > >>>  hw/ppc/spapr.c         |   20 ++++++++++++++++++++
> > >>>  hw/ppc/spapr_events.c  |   17 +++++++++++++++++
> > >>>  hw/ppc/spapr_rtas.c    |    4 ++++
> > >>>  include/hw/ppc/spapr.h |    2 ++
> > >>>  4 files changed, 43 insertions(+)
> > >>>
> > >>> diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
> > >>> index e8a77636..31c4850 100644
> > >>> --- a/hw/ppc/spapr.c
> > >>> +++ b/hw/ppc/spapr.c
> > >>> @@ -2104,6 +2104,25 @@ static const VMStateDescription vmstate_spapr_dtb = {
> > >>>      },
> > >>>  };
> > >>>  
> > >>> +static bool spapr_fwnmi_needed(void *opaque)
> > >>> +{
> > >>> +    SpaprMachineState *spapr = (SpaprMachineState *)opaque;
> > >>> +
> > >>> +    return (spapr->guest_machine_check_addr == -1) ? 0 : 1;    
> > >>
> > >> Since we're introducing a PAPR capability to enable this, it would
> > >> actually be better to check that here, rather than the runtime state.
> > >> That leads to less cases and easier to understand semantics for the
> > >> migration stream.
> > >>  
> > > 
> > > Hmmm... the purpose of needed() VMState callbacks is precisely about
> > > runtime state: the subsection should only be migrated if an MCE is
> > > pending, ie. spapr->guest_machine_check_addr != -1.  
> > 
> > spapr->guest_machine_check_addr is set when fwnmi is registered. Hence
> > an MCE might not be pending if it is set.
> > 
> 
> Oops sorry, got confused... should have written "if the guest has
> registered FWNMI", but the idea is the same. We only need to migrate
> the subsection if the state is different from reset. This is the way
> needed() callbacks are generally implemented.

Yes, but usually that's because we need to omit the section if it's
not actively in use in order to maintain backwards compatiblity with
old migration streams.  If the cap is enabled we already need
something that implements it at both ends to have a sane migration.

> > I am fine with both the approaches (checking for
> > guest_machine_check_addr or for SPAPR_CAP_FWNMI_MCE).
> > 
> 
> I would find ackward to migrate this always for new machine types,
> even if the guest doesn't register FWNMI...

How so?

-- 
David Gibson			| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au	| minimalist, thank you.  NOT _the_ _other_
				| _way_ _around_!
http://www.ozlabs.org/~dgibson
Re: [Qemu-devel] [PATCH v9 6/6] migration: Include migration support for machine check handling
Posted by Greg Kurz 6 years, 5 months ago
On Fri, 7 Jun 2019 10:22:40 +1000
David Gibson <david@gibson.dropbear.id.au> wrote:

> On Thu, Jun 06, 2019 at 02:10:48PM +0200, Greg Kurz wrote:
> > On Thu, 6 Jun 2019 16:45:30 +0530
> > Aravinda Prasad <aravinda@linux.vnet.ibm.com> wrote:
> >   
> > > On Thursday 06 June 2019 11:36 AM, Greg Kurz wrote:  
> > > > On Thu, 6 Jun 2019 13:06:14 +1000
> > > > David Gibson <david@gibson.dropbear.id.au> wrote:
> > > >     
> > > >> On Wed, May 29, 2019 at 11:10:57AM +0530, Aravinda Prasad wrote:    
> > > >>> This patch includes migration support for machine check
> > > >>> handling. Especially this patch blocks VM migration
> > > >>> requests until the machine check error handling is
> > > >>> complete as (i) these errors are specific to the source
> > > >>> hardware and is irrelevant on the target hardware,
> > > >>> (ii) these errors cause data corruption and should
> > > >>> be handled before migration.
> > > >>>
> > > >>> Signed-off-by: Aravinda Prasad <aravinda@linux.vnet.ibm.com>
> > > >>> ---
> > > >>>  hw/ppc/spapr.c         |   20 ++++++++++++++++++++
> > > >>>  hw/ppc/spapr_events.c  |   17 +++++++++++++++++
> > > >>>  hw/ppc/spapr_rtas.c    |    4 ++++
> > > >>>  include/hw/ppc/spapr.h |    2 ++
> > > >>>  4 files changed, 43 insertions(+)
> > > >>>
> > > >>> diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
> > > >>> index e8a77636..31c4850 100644
> > > >>> --- a/hw/ppc/spapr.c
> > > >>> +++ b/hw/ppc/spapr.c
> > > >>> @@ -2104,6 +2104,25 @@ static const VMStateDescription vmstate_spapr_dtb = {
> > > >>>      },
> > > >>>  };
> > > >>>  
> > > >>> +static bool spapr_fwnmi_needed(void *opaque)
> > > >>> +{
> > > >>> +    SpaprMachineState *spapr = (SpaprMachineState *)opaque;
> > > >>> +
> > > >>> +    return (spapr->guest_machine_check_addr == -1) ? 0 : 1;      
> > > >>
> > > >> Since we're introducing a PAPR capability to enable this, it would
> > > >> actually be better to check that here, rather than the runtime state.
> > > >> That leads to less cases and easier to understand semantics for the
> > > >> migration stream.
> > > >>    
> > > > 
> > > > Hmmm... the purpose of needed() VMState callbacks is precisely about
> > > > runtime state: the subsection should only be migrated if an MCE is
> > > > pending, ie. spapr->guest_machine_check_addr != -1.    
> > > 
> > > spapr->guest_machine_check_addr is set when fwnmi is registered. Hence
> > > an MCE might not be pending if it is set.
> > >   
> > 
> > Oops sorry, got confused... should have written "if the guest has
> > registered FWNMI", but the idea is the same. We only need to migrate
> > the subsection if the state is different from reset. This is the way
> > needed() callbacks are generally implemented.  
> 
> Yes, but usually that's because we need to omit the section if it's
> not actively in use in order to maintain backwards compatiblity with
> old migration streams.  If the cap is enabled we already need
> something that implements it at both ends to have a sane migration.
> 

I see it the opposite way. A subsection is something that is optional
for the destination only. In the present case, an older QEMU wont send
the "spapr_machine_check" subsection, which is interpreted by a newer
QEMU as "the guest didn't register FWNMI".

On the source side, if some internal state got used it should always
be migrated. We maintain backwards compatibility to an older QEMU
by not using the new state at all, thanks to the versioned machine
property.

> > > I am fine with both the approaches (checking for
> > > guest_machine_check_addr or for SPAPR_CAP_FWNMI_MCE).
> > >   
> > 
> > I would find ackward to migrate this always for new machine types,
> > even if the guest doesn't register FWNMI...  
> 
> How so?
> 

Well, I just don't see the point in migrating something that
is not state since its value is the reset default that the
destination already knows.

This being said, I won't make more fuss about it, as long as
it works :)
Re: [Qemu-devel] [PATCH v9 6/6] migration: Include migration support for machine check handling
Posted by Aravinda Prasad 6 years, 5 months ago

On Thursday 06 June 2019 08:36 AM, David Gibson wrote:
> On Wed, May 29, 2019 at 11:10:57AM +0530, Aravinda Prasad wrote:
>> This patch includes migration support for machine check
>> handling. Especially this patch blocks VM migration
>> requests until the machine check error handling is
>> complete as (i) these errors are specific to the source
>> hardware and is irrelevant on the target hardware,
>> (ii) these errors cause data corruption and should
>> be handled before migration.
>>
>> Signed-off-by: Aravinda Prasad <aravinda@linux.vnet.ibm.com>
>> ---
>>  hw/ppc/spapr.c         |   20 ++++++++++++++++++++
>>  hw/ppc/spapr_events.c  |   17 +++++++++++++++++
>>  hw/ppc/spapr_rtas.c    |    4 ++++
>>  include/hw/ppc/spapr.h |    2 ++
>>  4 files changed, 43 insertions(+)
>>
>> diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
>> index e8a77636..31c4850 100644
>> --- a/hw/ppc/spapr.c
>> +++ b/hw/ppc/spapr.c
>> @@ -2104,6 +2104,25 @@ static const VMStateDescription vmstate_spapr_dtb = {
>>      },
>>  };
>>  
>> +static bool spapr_fwnmi_needed(void *opaque)
>> +{
>> +    SpaprMachineState *spapr = (SpaprMachineState *)opaque;
>> +
>> +    return (spapr->guest_machine_check_addr == -1) ? 0 : 1;
> 
> Since we're introducing a PAPR capability to enable this, it would
> actually be better to check that here, rather than the runtime state.
> That leads to less cases and easier to understand semantics for the
> migration stream.

I am fine with this approach as well.

> 
>> +}
>> +
>> +static const VMStateDescription vmstate_spapr_machine_check = {
>> +    .name = "spapr_machine_check",
>> +    .version_id = 1,
>> +    .minimum_version_id = 1,
>> +    .needed = spapr_fwnmi_needed,
>> +    .fields = (VMStateField[]) {
>> +        VMSTATE_UINT64(guest_machine_check_addr, SpaprMachineState),
>> +        VMSTATE_INT32(mc_status, SpaprMachineState),
>> +        VMSTATE_END_OF_LIST()
>> +    },
>> +};
>> +
>>  static const VMStateDescription vmstate_spapr = {
>>      .name = "spapr",
>>      .version_id = 3,
>> @@ -2137,6 +2156,7 @@ static const VMStateDescription vmstate_spapr = {
>>          &vmstate_spapr_dtb,
>>          &vmstate_spapr_cap_large_decr,
>>          &vmstate_spapr_cap_ccf_assist,
>> +        &vmstate_spapr_machine_check,
>>          NULL
>>      }
>>  };
>> diff --git a/hw/ppc/spapr_events.c b/hw/ppc/spapr_events.c
>> index 573c0b7..35e21e4 100644
>> --- a/hw/ppc/spapr_events.c
>> +++ b/hw/ppc/spapr_events.c
>> @@ -41,6 +41,7 @@
>>  #include "qemu/bcd.h"
>>  #include "hw/ppc/spapr_ovec.h"
>>  #include <libfdt.h>
>> +#include "migration/blocker.h"
>>  
>>  #define RTAS_LOG_VERSION_MASK                   0xff000000
>>  #define   RTAS_LOG_VERSION_6                    0x06000000
>> @@ -855,6 +856,22 @@ static void spapr_mce_dispatch_elog(PowerPCCPU *cpu, bool recovered)
>>  void spapr_mce_req_event(PowerPCCPU *cpu, bool recovered)
>>  {
>>      SpaprMachineState *spapr = SPAPR_MACHINE(qdev_get_machine());
>> +    int ret;
>> +    Error *local_err = NULL;
>> +
>> +    error_setg(&spapr->fwnmi_migration_blocker,
>> +            "Live migration not supported during machine check handling");
>> +    ret = migrate_add_blocker(spapr->fwnmi_migration_blocker, &local_err);
>> +    if (ret < 0) {
>> +        /*
>> +         * We don't want to abort and let the migration to continue. In a
>> +         * rare case, the machine check handler will run on the target
>> +         * hardware. Though this is not preferable, it is better than aborting
>> +         * the migration or killing the VM.
>> +         */
>> +        error_free(spapr->fwnmi_migration_blocker);
> 
> You should set fwnmi_migration_blocker to NULL here as well.

ok.

> 
> As mentioned on an earlier iteration, the migration blocker is the
> same every time.  Couldn't you just create it once and free at final
> teardown, rather than recreating it for every NMI?

That means, we create the error string at the time when ibm,nmi-register
is invoked and tear it down during machine reset?

Regards,
Aravinda

> 
>> +        warn_report_err(local_err);
>> +    }
>>  
>>      while (spapr->mc_status != -1) {
>>          /*
>> diff --git a/hw/ppc/spapr_rtas.c b/hw/ppc/spapr_rtas.c
>> index 91a7ab9..c849223 100644
>> --- a/hw/ppc/spapr_rtas.c
>> +++ b/hw/ppc/spapr_rtas.c
>> @@ -50,6 +50,7 @@
>>  #include "target/ppc/mmu-hash64.h"
>>  #include "target/ppc/mmu-book3s-v3.h"
>>  #include "kvm_ppc.h"
>> +#include "migration/blocker.h"
>>  
>>  static void rtas_display_character(PowerPCCPU *cpu, SpaprMachineState *spapr,
>>                                     uint32_t token, uint32_t nargs,
>> @@ -404,6 +405,9 @@ static void rtas_ibm_nmi_interlock(PowerPCCPU *cpu,
>>          spapr->mc_status = -1;
>>          qemu_cond_signal(&spapr->mc_delivery_cond);
>>          rtas_st(rets, 0, RTAS_OUT_SUCCESS);
>> +        migrate_del_blocker(spapr->fwnmi_migration_blocker);
>> +        error_free(spapr->fwnmi_migration_blocker);
>> +        spapr->fwnmi_migration_blocker = NULL;
>>      }
>>  }
>>  
>> diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h
>> index bd75d4b..6c0cfd8 100644
>> --- a/include/hw/ppc/spapr.h
>> +++ b/include/hw/ppc/spapr.h
>> @@ -214,6 +214,8 @@ struct SpaprMachineState {
>>      SpaprCapabilities def, eff, mig;
>>  
>>      unsigned gpu_numa_id;
>> +
>> +    Error *fwnmi_migration_blocker;
>>  };
>>  
>>  #define H_SUCCESS         0
>>
> 

-- 
Regards,
Aravinda


Re: [Qemu-devel] [PATCH v9 6/6] migration: Include migration support for machine check handling
Posted by Greg Kurz 6 years, 5 months ago
On Thu, 6 Jun 2019 16:55:18 +0530
Aravinda Prasad <aravinda@linux.vnet.ibm.com> wrote:

> On Thursday 06 June 2019 08:36 AM, David Gibson wrote:
> > On Wed, May 29, 2019 at 11:10:57AM +0530, Aravinda Prasad wrote:  
> >> This patch includes migration support for machine check
> >> handling. Especially this patch blocks VM migration
> >> requests until the machine check error handling is
> >> complete as (i) these errors are specific to the source
> >> hardware and is irrelevant on the target hardware,
> >> (ii) these errors cause data corruption and should
> >> be handled before migration.
> >>
> >> Signed-off-by: Aravinda Prasad <aravinda@linux.vnet.ibm.com>
> >> ---
> >>  hw/ppc/spapr.c         |   20 ++++++++++++++++++++
> >>  hw/ppc/spapr_events.c  |   17 +++++++++++++++++
> >>  hw/ppc/spapr_rtas.c    |    4 ++++
> >>  include/hw/ppc/spapr.h |    2 ++
> >>  4 files changed, 43 insertions(+)
> >>
> >> diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
> >> index e8a77636..31c4850 100644
> >> --- a/hw/ppc/spapr.c
> >> +++ b/hw/ppc/spapr.c
> >> @@ -2104,6 +2104,25 @@ static const VMStateDescription vmstate_spapr_dtb = {
> >>      },
> >>  };
> >>  
> >> +static bool spapr_fwnmi_needed(void *opaque)
> >> +{
> >> +    SpaprMachineState *spapr = (SpaprMachineState *)opaque;
> >> +
> >> +    return (spapr->guest_machine_check_addr == -1) ? 0 : 1;  
> > 
> > Since we're introducing a PAPR capability to enable this, it would
> > actually be better to check that here, rather than the runtime state.
> > That leads to less cases and easier to understand semantics for the
> > migration stream.  
> 
> I am fine with this approach as well.
> 
> >   
> >> +}
> >> +
> >> +static const VMStateDescription vmstate_spapr_machine_check = {
> >> +    .name = "spapr_machine_check",
> >> +    .version_id = 1,
> >> +    .minimum_version_id = 1,
> >> +    .needed = spapr_fwnmi_needed,
> >> +    .fields = (VMStateField[]) {
> >> +        VMSTATE_UINT64(guest_machine_check_addr, SpaprMachineState),
> >> +        VMSTATE_INT32(mc_status, SpaprMachineState),
> >> +        VMSTATE_END_OF_LIST()
> >> +    },
> >> +};
> >> +
> >>  static const VMStateDescription vmstate_spapr = {
> >>      .name = "spapr",
> >>      .version_id = 3,
> >> @@ -2137,6 +2156,7 @@ static const VMStateDescription vmstate_spapr = {
> >>          &vmstate_spapr_dtb,
> >>          &vmstate_spapr_cap_large_decr,
> >>          &vmstate_spapr_cap_ccf_assist,
> >> +        &vmstate_spapr_machine_check,
> >>          NULL
> >>      }
> >>  };
> >> diff --git a/hw/ppc/spapr_events.c b/hw/ppc/spapr_events.c
> >> index 573c0b7..35e21e4 100644
> >> --- a/hw/ppc/spapr_events.c
> >> +++ b/hw/ppc/spapr_events.c
> >> @@ -41,6 +41,7 @@
> >>  #include "qemu/bcd.h"
> >>  #include "hw/ppc/spapr_ovec.h"
> >>  #include <libfdt.h>
> >> +#include "migration/blocker.h"
> >>  
> >>  #define RTAS_LOG_VERSION_MASK                   0xff000000
> >>  #define   RTAS_LOG_VERSION_6                    0x06000000
> >> @@ -855,6 +856,22 @@ static void spapr_mce_dispatch_elog(PowerPCCPU *cpu, bool recovered)
> >>  void spapr_mce_req_event(PowerPCCPU *cpu, bool recovered)
> >>  {
> >>      SpaprMachineState *spapr = SPAPR_MACHINE(qdev_get_machine());
> >> +    int ret;
> >> +    Error *local_err = NULL;
> >> +
> >> +    error_setg(&spapr->fwnmi_migration_blocker,
> >> +            "Live migration not supported during machine check handling");
> >> +    ret = migrate_add_blocker(spapr->fwnmi_migration_blocker, &local_err);
> >> +    if (ret < 0) {
> >> +        /*
> >> +         * We don't want to abort and let the migration to continue. In a
> >> +         * rare case, the machine check handler will run on the target
> >> +         * hardware. Though this is not preferable, it is better than aborting
> >> +         * the migration or killing the VM.
> >> +         */
> >> +        error_free(spapr->fwnmi_migration_blocker);  
> > 
> > You should set fwnmi_migration_blocker to NULL here as well.  
> 
> ok.
> 
> > 
> > As mentioned on an earlier iteration, the migration blocker is the
> > same every time.  Couldn't you just create it once and free at final
> > teardown, rather than recreating it for every NMI?  
> 
> That means, we create the error string at the time when ibm,nmi-register
> is invoked and tear it down during machine reset?
> 

No, I think David is asking to create the error string during machine init,
for all the machine lifetime. In which case, we don't even need to call
error_free() at all.

> Regards,
> Aravinda
> 
> >   
> >> +        warn_report_err(local_err);
> >> +    }
> >>  
> >>      while (spapr->mc_status != -1) {
> >>          /*
> >> diff --git a/hw/ppc/spapr_rtas.c b/hw/ppc/spapr_rtas.c
> >> index 91a7ab9..c849223 100644
> >> --- a/hw/ppc/spapr_rtas.c
> >> +++ b/hw/ppc/spapr_rtas.c
> >> @@ -50,6 +50,7 @@
> >>  #include "target/ppc/mmu-hash64.h"
> >>  #include "target/ppc/mmu-book3s-v3.h"
> >>  #include "kvm_ppc.h"
> >> +#include "migration/blocker.h"
> >>  
> >>  static void rtas_display_character(PowerPCCPU *cpu, SpaprMachineState *spapr,
> >>                                     uint32_t token, uint32_t nargs,
> >> @@ -404,6 +405,9 @@ static void rtas_ibm_nmi_interlock(PowerPCCPU *cpu,
> >>          spapr->mc_status = -1;
> >>          qemu_cond_signal(&spapr->mc_delivery_cond);
> >>          rtas_st(rets, 0, RTAS_OUT_SUCCESS);
> >> +        migrate_del_blocker(spapr->fwnmi_migration_blocker);
> >> +        error_free(spapr->fwnmi_migration_blocker);
> >> +        spapr->fwnmi_migration_blocker = NULL;
> >>      }
> >>  }
> >>  
> >> diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h
> >> index bd75d4b..6c0cfd8 100644
> >> --- a/include/hw/ppc/spapr.h
> >> +++ b/include/hw/ppc/spapr.h
> >> @@ -214,6 +214,8 @@ struct SpaprMachineState {
> >>      SpaprCapabilities def, eff, mig;
> >>  
> >>      unsigned gpu_numa_id;
> >> +
> >> +    Error *fwnmi_migration_blocker;
> >>  };
> >>  
> >>  #define H_SUCCESS         0
> >>  
> >   
> 


Re: [Qemu-devel] [PATCH v9 6/6] migration: Include migration support for machine check handling
Posted by David Gibson 6 years, 5 months ago
On Thu, Jun 06, 2019 at 04:55:18PM +0530, Aravinda Prasad wrote:
> 
> 
> On Thursday 06 June 2019 08:36 AM, David Gibson wrote:
> > On Wed, May 29, 2019 at 11:10:57AM +0530, Aravinda Prasad wrote:
> >> This patch includes migration support for machine check
> >> handling. Especially this patch blocks VM migration
> >> requests until the machine check error handling is
> >> complete as (i) these errors are specific to the source
> >> hardware and is irrelevant on the target hardware,
> >> (ii) these errors cause data corruption and should
> >> be handled before migration.
> >>
> >> Signed-off-by: Aravinda Prasad <aravinda@linux.vnet.ibm.com>
> >> ---
> >>  hw/ppc/spapr.c         |   20 ++++++++++++++++++++
> >>  hw/ppc/spapr_events.c  |   17 +++++++++++++++++
> >>  hw/ppc/spapr_rtas.c    |    4 ++++
> >>  include/hw/ppc/spapr.h |    2 ++
> >>  4 files changed, 43 insertions(+)
> >>
> >> diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
> >> index e8a77636..31c4850 100644
> >> --- a/hw/ppc/spapr.c
> >> +++ b/hw/ppc/spapr.c
> >> @@ -2104,6 +2104,25 @@ static const VMStateDescription vmstate_spapr_dtb = {
> >>      },
> >>  };
> >>  
> >> +static bool spapr_fwnmi_needed(void *opaque)
> >> +{
> >> +    SpaprMachineState *spapr = (SpaprMachineState *)opaque;
> >> +
> >> +    return (spapr->guest_machine_check_addr == -1) ? 0 : 1;
> > 
> > Since we're introducing a PAPR capability to enable this, it would
> > actually be better to check that here, rather than the runtime state.
> > That leads to less cases and easier to understand semantics for the
> > migration stream.
> 
> I am fine with this approach as well.
> 
> > 
> >> +}
> >> +
> >> +static const VMStateDescription vmstate_spapr_machine_check = {
> >> +    .name = "spapr_machine_check",
> >> +    .version_id = 1,
> >> +    .minimum_version_id = 1,
> >> +    .needed = spapr_fwnmi_needed,
> >> +    .fields = (VMStateField[]) {
> >> +        VMSTATE_UINT64(guest_machine_check_addr, SpaprMachineState),
> >> +        VMSTATE_INT32(mc_status, SpaprMachineState),
> >> +        VMSTATE_END_OF_LIST()
> >> +    },
> >> +};
> >> +
> >>  static const VMStateDescription vmstate_spapr = {
> >>      .name = "spapr",
> >>      .version_id = 3,
> >> @@ -2137,6 +2156,7 @@ static const VMStateDescription vmstate_spapr = {
> >>          &vmstate_spapr_dtb,
> >>          &vmstate_spapr_cap_large_decr,
> >>          &vmstate_spapr_cap_ccf_assist,
> >> +        &vmstate_spapr_machine_check,
> >>          NULL
> >>      }
> >>  };
> >> diff --git a/hw/ppc/spapr_events.c b/hw/ppc/spapr_events.c
> >> index 573c0b7..35e21e4 100644
> >> --- a/hw/ppc/spapr_events.c
> >> +++ b/hw/ppc/spapr_events.c
> >> @@ -41,6 +41,7 @@
> >>  #include "qemu/bcd.h"
> >>  #include "hw/ppc/spapr_ovec.h"
> >>  #include <libfdt.h>
> >> +#include "migration/blocker.h"
> >>  
> >>  #define RTAS_LOG_VERSION_MASK                   0xff000000
> >>  #define   RTAS_LOG_VERSION_6                    0x06000000
> >> @@ -855,6 +856,22 @@ static void spapr_mce_dispatch_elog(PowerPCCPU *cpu, bool recovered)
> >>  void spapr_mce_req_event(PowerPCCPU *cpu, bool recovered)
> >>  {
> >>      SpaprMachineState *spapr = SPAPR_MACHINE(qdev_get_machine());
> >> +    int ret;
> >> +    Error *local_err = NULL;
> >> +
> >> +    error_setg(&spapr->fwnmi_migration_blocker,
> >> +            "Live migration not supported during machine check handling");
> >> +    ret = migrate_add_blocker(spapr->fwnmi_migration_blocker, &local_err);
> >> +    if (ret < 0) {
> >> +        /*
> >> +         * We don't want to abort and let the migration to continue. In a
> >> +         * rare case, the machine check handler will run on the target
> >> +         * hardware. Though this is not preferable, it is better than aborting
> >> +         * the migration or killing the VM.
> >> +         */
> >> +        error_free(spapr->fwnmi_migration_blocker);
> > 
> > You should set fwnmi_migration_blocker to NULL here as well.
> 
> ok.
> 
> > 
> > As mentioned on an earlier iteration, the migration blocker is the
> > same every time.  Couldn't you just create it once and free at final
> > teardown, rather than recreating it for every NMI?
> 
> That means, we create the error string at the time when ibm,nmi-register
> is invoked and tear it down during machine reset?

Or you could even just create it at machine_init time, and tear it
down never, just add/remove it from the blocker slot.

-- 
David Gibson			| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au	| minimalist, thank you.  NOT _the_ _other_
				| _way_ _around_!
http://www.ozlabs.org/~dgibson