[v4] Prepare GHES driver to support error injection

[PATCH v4 13/15] acpi/ghes: move offset calculus to a separate function

Posted by Mauro Carvalho Chehab 1 year, 2 months ago

Currently, CPER address location is calculated as an offset of
the hardware_errors table. It is also badly named, as the
offset actually used is the address where the CPER data starts,
and not the beginning of the error source.

Move the logic which calculates such offset to a separate
function, in preparation for a patch that will be changing the
logic to calculate it from the HEST table.

While here, properly name the variable which stores the cper
address.

Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
---
 hw/acpi/ghes.c | 41 ++++++++++++++++++++++++++++++++---------
 1 file changed, 32 insertions(+), 9 deletions(-)

diff --git a/hw/acpi/ghes.c b/hw/acpi/ghes.c
index 87fd3feedd2a..d99697b20164 100644
--- a/hw/acpi/ghes.c
+++ b/hw/acpi/ghes.c
@@ -364,10 +364,37 @@ void acpi_ghes_add_fw_cfg(AcpiGhesState *ags, FWCfgState *s,
     ags->present = true;
 }
 
+static void get_hw_error_offsets(uint64_t ghes_addr,
+                                 uint64_t *cper_addr,
+                                 uint64_t *read_ack_register_addr)
+{
+    if (!ghes_addr) {
+        return;
+    }
+
+    /*
+     * non-HEST version supports only one source, so no need to change
+     * the start offset based on the source ID. Also, we can't validate
+     * the source ID, as it is stored inside the HEST table.
+     */
+
+    cpu_physical_memory_read(ghes_addr, cper_addr,
+                             sizeof(*cper_addr));
+
+    *cper_addr = le64_to_cpu(*cper_addr);
+
+    /*
+     * As the current version supports only one source, the ack offset is
+     * just sizeof(uint64_t).
+     */
+    *read_ack_register_addr = ghes_addr +
+			      ACPI_GHES_ERROR_SOURCE_COUNT * sizeof(uint64_t);
+}
+
 void ghes_record_cper_errors(const void *cper, size_t len,
                              uint16_t source_id, Error **errp)
 {
-    uint64_t error_block_addr, read_ack_register_addr, read_ack_register = 0;
+    uint64_t cper_addr = 0, read_ack_register_addr = 0, read_ack_register;
     uint64_t start_addr;
     AcpiGedState *acpi_ged_state;
     AcpiGhesState *ags;
@@ -389,18 +416,14 @@ void ghes_record_cper_errors(const void *cper, size_t len,
 
     start_addr += source_id * sizeof(uint64_t);
 
-    cpu_physical_memory_read(start_addr, &error_block_addr,
-                             sizeof(error_block_addr));
+    get_hw_error_offsets(start_addr, &cper_addr, &read_ack_register_addr);
 
-    error_block_addr = le64_to_cpu(error_block_addr);
-    if (!error_block_addr) {
+    cper_addr = le64_to_cpu(cper_addr);
+    if (!cper_addr) {
         error_setg(errp, "can not find Generic Error Status Block");
         return;
     }
 
-    read_ack_register_addr = start_addr +
-                             ACPI_GHES_ERROR_SOURCE_COUNT * sizeof(uint64_t);
-
     cpu_physical_memory_read(read_ack_register_addr,
                              &read_ack_register, sizeof(read_ack_register));
 
@@ -421,7 +444,7 @@ void ghes_record_cper_errors(const void *cper, size_t len,
         &read_ack_register, sizeof(uint64_t));
 
     /* Write the generic error data entry into guest memory */
-    cpu_physical_memory_write(error_block_addr, cper, len);
+    cpu_physical_memory_write(cper_addr, cper, len);
 
     return;
 }
-- 
2.47.0

Re: [PATCH v4 13/15] acpi/ghes: move offset calculus to a separate function

Posted by Igor Mammedov 1 year, 2 months ago

On Fri, 22 Nov 2024 10:11:30 +0100
Mauro Carvalho Chehab <mchehab+huawei@kernel.org> wrote:

> Currently, CPER address location is calculated as an offset of
> the hardware_errors table. It is also badly named, as the
> offset actually used is the address where the CPER data starts,
> and not the beginning of the error source.
> 
> Move the logic which calculates such offset to a separate
> function, in preparation for a patch that will be changing the
> logic to calculate it from the HEST table.
> 
> While here, properly name the variable which stores the cper
> address.
> 
> Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
> Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
> ---
>  hw/acpi/ghes.c | 41 ++++++++++++++++++++++++++++++++---------
>  1 file changed, 32 insertions(+), 9 deletions(-)
> 
> diff --git a/hw/acpi/ghes.c b/hw/acpi/ghes.c
> index 87fd3feedd2a..d99697b20164 100644
> --- a/hw/acpi/ghes.c
> +++ b/hw/acpi/ghes.c
> @@ -364,10 +364,37 @@ void acpi_ghes_add_fw_cfg(AcpiGhesState *ags, FWCfgState *s,
>      ags->present = true;
>  }
>  
> +static void get_hw_error_offsets(uint64_t ghes_addr,
> +                                 uint64_t *cper_addr,
> +                                 uint64_t *read_ack_register_addr)
> +{


> +    if (!ghes_addr) {
> +        return;
> +    }

why do we need this check?

> +
> +    /*
> +     * non-HEST version supports only one source, so no need to change
> +     * the start offset based on the source ID. Also, we can't validate
> +     * the source ID, as it is stored inside the HEST table.
> +     */
> +
> +    cpu_physical_memory_read(ghes_addr, cper_addr,
> +                             sizeof(*cper_addr));
> +
> +    *cper_addr = le64_to_cpu(*cper_addr);
        1st bits flip, and then see later

> +
> +    /*
> +     * As the current version supports only one source, the ack offset is
> +     * just sizeof(uint64_t).
> +     */
> +    *read_ack_register_addr = ghes_addr +
> +			      ACPI_GHES_ERROR_SOURCE_COUNT * sizeof(uint64_t);
> +}
> +
>  void ghes_record_cper_errors(const void *cper, size_t len,
>                               uint16_t source_id, Error **errp)
>  {
> -    uint64_t error_block_addr, read_ack_register_addr, read_ack_register = 0;
> +    uint64_t cper_addr = 0, read_ack_register_addr = 0, read_ack_register;

if get_hw_error_offsets() isn't supposed to fail, then we do not need to initialize
above. So this hunk doesn't belong to this patch.

>      uint64_t start_addr;
>      AcpiGedState *acpi_ged_state;
>      AcpiGhesState *ags;
> @@ -389,18 +416,14 @@ void ghes_record_cper_errors(const void *cper, size_t len,
>  
>      start_addr += source_id * sizeof(uint64_t);
>  
> -    cpu_physical_memory_read(start_addr, &error_block_addr,
> -                             sizeof(error_block_addr));
> +    get_hw_error_offsets(start_addr, &cper_addr, &read_ack_register_addr);
>  
> -    error_block_addr = le64_to_cpu(error_block_addr);
> -    if (!error_block_addr) {
> +    cper_addr = le64_to_cpu(cper_addr);
                   ^^^^ 2nd bits flip turning it back to guest byte order again

suggest to keep only one of them in get_hw_error_offsets()

> +    if (!cper_addr) {
>          error_setg(errp, "can not find Generic Error Status Block");
>          return;
>      }
>  
> -    read_ack_register_addr = start_addr +
> -                             ACPI_GHES_ERROR_SOURCE_COUNT * sizeof(uint64_t);
> -
>      cpu_physical_memory_read(read_ack_register_addr,
>                               &read_ack_register, sizeof(read_ack_register));
>  
> @@ -421,7 +444,7 @@ void ghes_record_cper_errors(const void *cper, size_t len,
>          &read_ack_register, sizeof(uint64_t));
>  
>      /* Write the generic error data entry into guest memory */
> -    cpu_physical_memory_write(error_block_addr, cper, len);
> +    cpu_physical_memory_write(cper_addr, cper, len);
>  
>      return;
>  }

Re: [PATCH v4 13/15] acpi/ghes: move offset calculus to a separate function

Posted by Mauro Carvalho Chehab 1 year, 2 months ago

Em Tue, 3 Dec 2024 12:51:43 +0100
Igor Mammedov <imammedo@redhat.com> escreveu:

> On Fri, 22 Nov 2024 10:11:30 +0100
> Mauro Carvalho Chehab <mchehab+huawei@kernel.org> wrote:
> 
> > Currently, CPER address location is calculated as an offset of
> > the hardware_errors table. It is also badly named, as the
> > offset actually used is the address where the CPER data starts,
> > and not the beginning of the error source.
> > 
> > Move the logic which calculates such offset to a separate
> > function, in preparation for a patch that will be changing the
> > logic to calculate it from the HEST table.
> > 
> > While here, properly name the variable which stores the cper
> > address.
> > 
> > Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
> > Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
> > ---
> >  hw/acpi/ghes.c | 41 ++++++++++++++++++++++++++++++++---------
> >  1 file changed, 32 insertions(+), 9 deletions(-)
> > 
> > diff --git a/hw/acpi/ghes.c b/hw/acpi/ghes.c
> > index 87fd3feedd2a..d99697b20164 100644
> > --- a/hw/acpi/ghes.c
> > +++ b/hw/acpi/ghes.c
> > @@ -364,10 +364,37 @@ void acpi_ghes_add_fw_cfg(AcpiGhesState *ags, FWCfgState *s,
> >      ags->present = true;
> >  }
> >  
> > +static void get_hw_error_offsets(uint64_t ghes_addr,
> > +                                 uint64_t *cper_addr,
> > +                                 uint64_t *read_ack_register_addr)
> > +{  
> 
> 
> > +    if (!ghes_addr) {
> > +        return;
> > +    }  
> 
> why do we need this check?

It is a safeguard measure to avoid crashes and OOM access. If fw_cfg 
callback doesn't fill it properly, this will be zero.

> > +
> > +    /*
> > +     * non-HEST version supports only one source, so no need to change
> > +     * the start offset based on the source ID. Also, we can't validate
> > +     * the source ID, as it is stored inside the HEST table.
> > +     */
> > +
> > +    cpu_physical_memory_read(ghes_addr, cper_addr,
> > +                             sizeof(*cper_addr));
> > +
> > +    *cper_addr = le64_to_cpu(*cper_addr);  
>         1st bits flip, and then see later
> 
> > +
> > +    /*
> > +     * As the current version supports only one source, the ack offset is
> > +     * just sizeof(uint64_t).
> > +     */
> > +    *read_ack_register_addr = ghes_addr +
> > +			      ACPI_GHES_ERROR_SOURCE_COUNT * sizeof(uint64_t);
> > +}
> > +
> >  void ghes_record_cper_errors(const void *cper, size_t len,
> >                               uint16_t source_id, Error **errp)
> >  {
> > -    uint64_t error_block_addr, read_ack_register_addr, read_ack_register = 0;
> > +    uint64_t cper_addr = 0, read_ack_register_addr = 0, read_ack_register;  
> 
> if get_hw_error_offsets() isn't supposed to fail, then we do not need to initialize
> above. So this hunk doesn't belong to this patch.

It may fail due to:

    if (!ghes_addr) {
        return;
    }

> 
> >      uint64_t start_addr;
> >      AcpiGedState *acpi_ged_state;
> >      AcpiGhesState *ags;
> > @@ -389,18 +416,14 @@ void ghes_record_cper_errors(const void *cper, size_t len,
> >  
> >      start_addr += source_id * sizeof(uint64_t);
> >  
> > -    cpu_physical_memory_read(start_addr, &error_block_addr,
> > -                             sizeof(error_block_addr));
> > +    get_hw_error_offsets(start_addr, &cper_addr, &read_ack_register_addr);
> >  
> > -    error_block_addr = le64_to_cpu(error_block_addr);
> > -    if (!error_block_addr) {
> > +    cper_addr = le64_to_cpu(cper_addr);  
>                    ^^^^ 2nd bits flip turning it back to guest byte order again
> 
> suggest to keep only one of them in get_hw_error_offsets()

Ok, I'll drop this one.

> > +    if (!cper_addr) {
> >          error_setg(errp, "can not find Generic Error Status Block");
> >          return;
> >      }
> >  
> > -    read_ack_register_addr = start_addr +
> > -                             ACPI_GHES_ERROR_SOURCE_COUNT * sizeof(uint64_t);
> > -
> >      cpu_physical_memory_read(read_ack_register_addr,
> >                               &read_ack_register, sizeof(read_ack_register));
> >  
> > @@ -421,7 +444,7 @@ void ghes_record_cper_errors(const void *cper, size_t len,
> >          &read_ack_register, sizeof(uint64_t));
> >  
> >      /* Write the generic error data entry into guest memory */
> > -    cpu_physical_memory_write(error_block_addr, cper, len);
> > +    cpu_physical_memory_write(cper_addr, cper, len);
> >  
> >      return;
> >  }  
> 

Thanks,
Mauro

Re: [PATCH v4 13/15] acpi/ghes: move offset calculus to a separate function

Posted by Igor Mammedov 1 year, 2 months ago

On Tue, 3 Dec 2024 14:47:30 +0100
Mauro Carvalho Chehab <mchehab+huawei@kernel.org> wrote:

> Em Tue, 3 Dec 2024 12:51:43 +0100
> Igor Mammedov <imammedo@redhat.com> escreveu:
> 
> > On Fri, 22 Nov 2024 10:11:30 +0100
> > Mauro Carvalho Chehab <mchehab+huawei@kernel.org> wrote:
> >   
> > > Currently, CPER address location is calculated as an offset of
> > > the hardware_errors table. It is also badly named, as the
> > > offset actually used is the address where the CPER data starts,
> > > and not the beginning of the error source.
> > > 
> > > Move the logic which calculates such offset to a separate
> > > function, in preparation for a patch that will be changing the
> > > logic to calculate it from the HEST table.
> > > 
> > > While here, properly name the variable which stores the cper
> > > address.
> > > 
> > > Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
> > > Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
> > > ---
> > >  hw/acpi/ghes.c | 41 ++++++++++++++++++++++++++++++++---------
> > >  1 file changed, 32 insertions(+), 9 deletions(-)
> > > 
> > > diff --git a/hw/acpi/ghes.c b/hw/acpi/ghes.c
> > > index 87fd3feedd2a..d99697b20164 100644
> > > --- a/hw/acpi/ghes.c
> > > +++ b/hw/acpi/ghes.c
> > > @@ -364,10 +364,37 @@ void acpi_ghes_add_fw_cfg(AcpiGhesState *ags, FWCfgState *s,
> > >      ags->present = true;
> > >  }
> > >  
> > > +static void get_hw_error_offsets(uint64_t ghes_addr,
> > > +                                 uint64_t *cper_addr,
> > > +                                 uint64_t *read_ack_register_addr)
> > > +{    
> > 
> >   
> > > +    if (!ghes_addr) {
> > > +        return;
> > > +    }    
> > 
> > why do we need this check?  
> 
> It is a safeguard measure to avoid crashes and OOM access. If fw_cfg 
> callback doesn't fill it properly, this will be zero.

shouldn't happen, but yeah it firmware job to write back addr
which might happen for whatever reason (a bug for example).

Perhaps push this up to the stack, so we don't have to deal
with scattered checks in ghes code.

kvm_arch_on_sigbus_vcpu() looks like a goo candidate for check
and warn_once if that ever happens.
It already calls acpi_ghes_present() which resolves GED device
and then later we duplicate this job in ghes_record_cper_errors()

so maybe rename acpi_ghes_present to something like AcpiGhesState* acpi_ghes_get_state()
and call it instead. And then move ghes_addr check/warn_once there.
This way the rest of ghes code won't have to deal handling practically
impossible error conditions that cause reader to wonder why it might happen.

> > > +
> > > +    /*
> > > +     * non-HEST version supports only one source, so no need to change
> > > +     * the start offset based on the source ID. Also, we can't validate
> > > +     * the source ID, as it is stored inside the HEST table.
> > > +     */
> > > +
> > > +    cpu_physical_memory_read(ghes_addr, cper_addr,
> > > +                             sizeof(*cper_addr));
> > > +
> > > +    *cper_addr = le64_to_cpu(*cper_addr);    
> >         1st bits flip, and then see later
> >   
> > > +
> > > +    /*
> > > +     * As the current version supports only one source, the ack offset is
> > > +     * just sizeof(uint64_t).
> > > +     */
> > > +    *read_ack_register_addr = ghes_addr +
> > > +			      ACPI_GHES_ERROR_SOURCE_COUNT * sizeof(uint64_t);
> > > +}
> > > +
> > >  void ghes_record_cper_errors(const void *cper, size_t len,
> > >                               uint16_t source_id, Error **errp)
> > >  {
> > > -    uint64_t error_block_addr, read_ack_register_addr, read_ack_register = 0;
> > > +    uint64_t cper_addr = 0, read_ack_register_addr = 0, read_ack_register;    
> > 
> > if get_hw_error_offsets() isn't supposed to fail, then we do not need to initialize
> > above. So this hunk doesn't belong to this patch.  
> 
> It may fail due to:
> 
>     if (!ghes_addr) {
>         return;
>     }
> 
> >   
> > >      uint64_t start_addr;
> > >      AcpiGedState *acpi_ged_state;
> > >      AcpiGhesState *ags;
> > > @@ -389,18 +416,14 @@ void ghes_record_cper_errors(const void *cper, size_t len,
> > >  
> > >      start_addr += source_id * sizeof(uint64_t);
> > >  
> > > -    cpu_physical_memory_read(start_addr, &error_block_addr,
> > > -                             sizeof(error_block_addr));
> > > +    get_hw_error_offsets(start_addr, &cper_addr, &read_ack_register_addr);
> > >  
> > > -    error_block_addr = le64_to_cpu(error_block_addr);
> > > -    if (!error_block_addr) {
> > > +    cper_addr = le64_to_cpu(cper_addr);    
> >                    ^^^^ 2nd bits flip turning it back to guest byte order again
> > 
> > suggest to keep only one of them in get_hw_error_offsets()  
> 
> Ok, I'll drop this one.
> 
> > > +    if (!cper_addr) {
> > >          error_setg(errp, "can not find Generic Error Status Block");
> > >          return;
> > >      }
> > >  
> > > -    read_ack_register_addr = start_addr +
> > > -                             ACPI_GHES_ERROR_SOURCE_COUNT * sizeof(uint64_t);
> > > -
> > >      cpu_physical_memory_read(read_ack_register_addr,
> > >                               &read_ack_register, sizeof(read_ack_register));
> > >  
> > > @@ -421,7 +444,7 @@ void ghes_record_cper_errors(const void *cper, size_t len,
> > >          &read_ack_register, sizeof(uint64_t));
> > >  
> > >      /* Write the generic error data entry into guest memory */
> > > -    cpu_physical_memory_write(error_block_addr, cper, len);
> > > +    cpu_physical_memory_write(cper_addr, cper, len);
> > >  
> > >      return;
> > >  }    
> >   
> 
> Thanks,
> Mauro
>

Re: [PATCH v4 13/15] acpi/ghes: move offset calculus to a separate function

Posted by Mauro Carvalho Chehab 1 year, 2 months ago

Em Wed, 4 Dec 2024 08:54:40 +0100
Igor Mammedov <imammedo@redhat.com> escreveu:

> On Tue, 3 Dec 2024 14:47:30 +0100
> Mauro Carvalho Chehab <mchehab+huawei@kernel.org> wrote:
> 
> > Em Tue, 3 Dec 2024 12:51:43 +0100
> > Igor Mammedov <imammedo@redhat.com> escreveu:
> >   
> > > On Fri, 22 Nov 2024 10:11:30 +0100
> > > Mauro Carvalho Chehab <mchehab+huawei@kernel.org> wrote:
> > >     
> > > > Currently, CPER address location is calculated as an offset of
> > > > the hardware_errors table. It is also badly named, as the
> > > > offset actually used is the address where the CPER data starts,
> > > > and not the beginning of the error source.
> > > > 
> > > > Move the logic which calculates such offset to a separate
> > > > function, in preparation for a patch that will be changing the
> > > > logic to calculate it from the HEST table.
> > > > 
> > > > While here, properly name the variable which stores the cper
> > > > address.
> > > > 
> > > > Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
> > > > Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
> > > > ---
> > > >  hw/acpi/ghes.c | 41 ++++++++++++++++++++++++++++++++---------
> > > >  1 file changed, 32 insertions(+), 9 deletions(-)
> > > > 
> > > > diff --git a/hw/acpi/ghes.c b/hw/acpi/ghes.c
> > > > index 87fd3feedd2a..d99697b20164 100644
> > > > --- a/hw/acpi/ghes.c
> > > > +++ b/hw/acpi/ghes.c
> > > > @@ -364,10 +364,37 @@ void acpi_ghes_add_fw_cfg(AcpiGhesState *ags, FWCfgState *s,
> > > >      ags->present = true;
> > > >  }
> > > >  
> > > > +static void get_hw_error_offsets(uint64_t ghes_addr,
> > > > +                                 uint64_t *cper_addr,
> > > > +                                 uint64_t *read_ack_register_addr)
> > > > +{      
> > > 
> > >     
> > > > +    if (!ghes_addr) {
> > > > +        return;
> > > > +    }      
> > > 
> > > why do we need this check?    
> > 
> > It is a safeguard measure to avoid crashes and OOM access. If fw_cfg 
> > callback doesn't fill it properly, this will be zero.  
> 
> shouldn't happen, but yeah it firmware job to write back addr
> which might happen for whatever reason (a bug for example).
>

The main reason I added it is that, after the second series, it could 
also happen if there's something wrong with the backward compat logic.

So, both here and after switching to HEST-based offsets, I opted
to explicitly test.

> Perhaps push this up to the stack, so we don't have to deal
> with scattered checks in ghes code.
> 
> kvm_arch_on_sigbus_vcpu() looks like a goo candidate for check
> and warn_once if that ever happens.
> It already calls acpi_ghes_present() which resolves GED device
> and then later we duplicate this job in ghes_record_cper_errors()
> 
> so maybe rename acpi_ghes_present to something like AcpiGhesState* acpi_ghes_get_state()
> and call it instead. And then move ghes_addr check/warn_once there.
> This way the rest of ghes code won't have to deal handling practically
> impossible error conditions that cause reader to wonder why it might happen.

I'll look on it. Yet, if ok for you, I would prefer dealing with this
once we have a bigger picture, e.g. once we merge those tree series:

	- cleanup series (this one);
	- HEST offset (I'll be sending a new version today);
	- error_inject.

Thanks,
Mauro

Re: [PATCH v4 13/15] acpi/ghes: move offset calculus to a separate function

Posted by Igor Mammedov 1 year, 2 months ago

On Wed, 4 Dec 2024 09:56:35 +0100
Mauro Carvalho Chehab <mchehab+huawei@kernel.org> wrote:

> Em Wed, 4 Dec 2024 08:54:40 +0100
> Igor Mammedov <imammedo@redhat.com> escreveu:
> 
> > On Tue, 3 Dec 2024 14:47:30 +0100
> > Mauro Carvalho Chehab <mchehab+huawei@kernel.org> wrote:
> >   
> > > Em Tue, 3 Dec 2024 12:51:43 +0100
> > > Igor Mammedov <imammedo@redhat.com> escreveu:
> > >     
> > > > On Fri, 22 Nov 2024 10:11:30 +0100
> > > > Mauro Carvalho Chehab <mchehab+huawei@kernel.org> wrote:
> > > >       
> > > > > Currently, CPER address location is calculated as an offset of
> > > > > the hardware_errors table. It is also badly named, as the
> > > > > offset actually used is the address where the CPER data starts,
> > > > > and not the beginning of the error source.
> > > > > 
> > > > > Move the logic which calculates such offset to a separate
> > > > > function, in preparation for a patch that will be changing the
> > > > > logic to calculate it from the HEST table.
> > > > > 
> > > > > While here, properly name the variable which stores the cper
> > > > > address.
> > > > > 
> > > > > Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
> > > > > Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
> > > > > ---
> > > > >  hw/acpi/ghes.c | 41 ++++++++++++++++++++++++++++++++---------
> > > > >  1 file changed, 32 insertions(+), 9 deletions(-)
> > > > > 
> > > > > diff --git a/hw/acpi/ghes.c b/hw/acpi/ghes.c
> > > > > index 87fd3feedd2a..d99697b20164 100644
> > > > > --- a/hw/acpi/ghes.c
> > > > > +++ b/hw/acpi/ghes.c
> > > > > @@ -364,10 +364,37 @@ void acpi_ghes_add_fw_cfg(AcpiGhesState *ags, FWCfgState *s,
> > > > >      ags->present = true;
> > > > >  }
> > > > >  
> > > > > +static void get_hw_error_offsets(uint64_t ghes_addr,
> > > > > +                                 uint64_t *cper_addr,
> > > > > +                                 uint64_t *read_ack_register_addr)
> > > > > +{        
> > > > 
> > > >       
> > > > > +    if (!ghes_addr) {
> > > > > +        return;
> > > > > +    }        
> > > > 
> > > > why do we need this check?      
> > > 
> > > It is a safeguard measure to avoid crashes and OOM access. If fw_cfg 
> > > callback doesn't fill it properly, this will be zero.    
> > 
> > shouldn't happen, but yeah it firmware job to write back addr
> > which might happen for whatever reason (a bug for example).
> >  
> 
> The main reason I added it is that, after the second series, it could 
> also happen if there's something wrong with the backward compat logic.
> 
> So, both here and after switching to HEST-based offsets, I opted
> to explicitly test.
> 
> > Perhaps push this up to the stack, so we don't have to deal
> > with scattered checks in ghes code.
> > 
> > kvm_arch_on_sigbus_vcpu() looks like a goo candidate for check
> > and warn_once if that ever happens.
> > It already calls acpi_ghes_present() which resolves GED device
> > and then later we duplicate this job in ghes_record_cper_errors()
> > 
> > so maybe rename acpi_ghes_present to something like AcpiGhesState* acpi_ghes_get_state()
> > and call it instead. And then move ghes_addr check/warn_once there.
> > This way the rest of ghes code won't have to deal handling practically
> > impossible error conditions that cause reader to wonder why it might happen.  
> 
> I'll look on it. Yet, if ok for you, I would prefer dealing with this
> once we have a bigger picture, e.g. once we merge those tree series:
> 
> 	- cleanup series (this one);
> 	- HEST offset (I'll be sending a new version today);
ok, lets revisit this point after this series.
Since at this point we should have a clean picture of how new code
works.

> 	- error_inject.
> 
> Thanks,
> Mauro
>

Re: [PATCH v4 13/15] acpi/ghes: move offset calculus to a separate function

Posted by Mauro Carvalho Chehab 1 year, 2 months ago

Em Wed, 4 Dec 2024 10:24:13 +0100
Igor Mammedov <imammedo@redhat.com> escreveu:

> On Wed, 4 Dec 2024 09:56:35 +0100
> Mauro Carvalho Chehab <mchehab+huawei@kernel.org> wrote:
> 
> > Em Wed, 4 Dec 2024 08:54:40 +0100
> > Igor Mammedov <imammedo@redhat.com> escreveu:
> >   
> > > On Tue, 3 Dec 2024 14:47:30 +0100
> > > Mauro Carvalho Chehab <mchehab+huawei@kernel.org> wrote:
> > >     
> > > > Em Tue, 3 Dec 2024 12:51:43 +0100
> > > > Igor Mammedov <imammedo@redhat.com> escreveu:
> > > >       
> > > > > On Fri, 22 Nov 2024 10:11:30 +0100
> > > > > Mauro Carvalho Chehab <mchehab+huawei@kernel.org> wrote:
> > > > >         

...

> > > > > > +static void get_hw_error_offsets(uint64_t ghes_addr,
> > > > > > +                                 uint64_t *cper_addr,
> > > > > > +                                 uint64_t *read_ack_register_addr)
> > > > > > +{          
> > > > > 
> > > > >         
> > > > > > +    if (!ghes_addr) {
> > > > > > +        return;
> > > > > > +    }          
> > > > > 
> > > > > why do we need this check?        
> > > > 
> > > > It is a safeguard measure to avoid crashes and OOM access. If fw_cfg 
> > > > callback doesn't fill it properly, this will be zero.      
> > > 
> > > shouldn't happen, but yeah it firmware job to write back addr
> > > which might happen for whatever reason (a bug for example).
> > >    
> > 
> > The main reason I added it is that, after the second series, it could 
> > also happen if there's something wrong with the backward compat logic.
> > 
> > So, both here and after switching to HEST-based offsets, I opted
> > to explicitly test.
> >   
> > > Perhaps push this up to the stack, so we don't have to deal
> > > with scattered checks in ghes code.
> > > 
> > > kvm_arch_on_sigbus_vcpu() looks like a goo candidate for check
> > > and warn_once if that ever happens.
> > > It already calls acpi_ghes_present() which resolves GED device
> > > and then later we duplicate this job in ghes_record_cper_errors()
> > > 
> > > so maybe rename acpi_ghes_present to something like AcpiGhesState* acpi_ghes_get_state()
> > > and call it instead. And then move ghes_addr check/warn_once there.
> > > This way the rest of ghes code won't have to deal handling practically
> > > impossible error conditions that cause reader to wonder why it might happen.    
> > 
> > I'll look on it.

Wrote the cleanup patch. See enclosed. I'll place it at the end of the
second series.

> > Yet, if ok for you, I would prefer dealing with this
> > once we have a bigger picture, e.g. once we merge those tree series:
> > 
> > 	- cleanup series (this one);
> > 	- HEST offset (I'll be sending a new version today);  
> ok, lets revisit this point after this series.
> Since at this point we should have a clean picture of how new code
> works.

 
Thanks,
Mauro

[PATCH] acpi/ghes: Cleanup the code which gets ghes ged state

Move the check logic into a common function and simplify the
code which checks if GHES is enabled and was properly setup.

Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>

diff --git a/hw/acpi/ghes-stub.c b/hw/acpi/ghes-stub.c
index 7cec1812dad9..fbabf955155a 100644
--- a/hw/acpi/ghes-stub.c
+++ b/hw/acpi/ghes-stub.c
@@ -16,7 +16,7 @@ int acpi_ghes_memory_errors(uint16_t source_id, uint64_t physical_address)
     return -1;
 }
 
-bool acpi_ghes_present(void)
+AcpiGhesState *acpi_ghes_get_state(void)
 {
-    return false;
+    return NULL;
 }
diff --git a/hw/acpi/ghes.c b/hw/acpi/ghes.c
index a9c5315c1936..17aada9ee352 100644
--- a/hw/acpi/ghes.c
+++ b/hw/acpi/ghes.c
@@ -420,10 +420,6 @@ static void get_hw_error_offsets(uint64_t ghes_addr,
                                  uint64_t *cper_addr,
                                  uint64_t *read_ack_register_addr)
 {
-    if (!ghes_addr) {
-        return;
-    }
-
     /*
      * non-HEST version supports only one source, so no need to change
      * the start offset based on the source ID. Also, we can't validate
@@ -451,10 +447,6 @@ static void get_ghes_source_offsets(uint16_t source_id, uint64_t hest_addr,
     uint64_t err_source_struct, error_block_addr;
     uint32_t num_sources, i;
 
-    if (!hest_addr) {
-        return;
-    }
-
     cpu_physical_memory_read(hest_addr, &num_sources, sizeof(num_sources));
     num_sources = le32_to_cpu(num_sources);
 
@@ -513,7 +505,6 @@ void ghes_record_cper_errors(const void *cper, size_t len,
                              uint16_t source_id, Error **errp)
 {
     uint64_t cper_addr = 0, read_ack_register_addr = 0, read_ack_register;
-    AcpiGedState *acpi_ged_state;
     AcpiGhesState *ags;
 
     if (len > ACPI_GHES_MAX_RAW_DATA_LENGTH) {
@@ -521,13 +512,10 @@ void ghes_record_cper_errors(const void *cper, size_t len,
         return;
     }
 
-    acpi_ged_state = ACPI_GED(object_resolve_path_type("", TYPE_ACPI_GED,
-                                                       NULL));
-    if (!acpi_ged_state) {
-        error_setg(errp, "Can't find ACPI_GED object");
+    ags = acpi_ghes_get_state();
+    if (!ags) {
         return;
     }
-    ags = &acpi_ged_state->ghes_state;
 
     if (!ags->hest_lookup) {
         get_hw_error_offsets(le64_to_cpu(ags->hw_error_le),
@@ -537,11 +525,6 @@ void ghes_record_cper_errors(const void *cper, size_t len,
                                 &cper_addr, &read_ack_register_addr, errp);
     }
 
-    if (!cper_addr) {
-        error_setg(errp, "can not find Generic Error Status Block");
-        return;
-    }
-
     cpu_physical_memory_read(read_ack_register_addr,
                              &read_ack_register, sizeof(read_ack_register));
 
@@ -606,7 +589,7 @@ int acpi_ghes_memory_errors(uint16_t source_id, uint64_t physical_address)
     return 0;
 }
 
-bool acpi_ghes_present(void)
+AcpiGhesState *acpi_ghes_get_state(void)
 {
     AcpiGedState *acpi_ged_state;
     AcpiGhesState *ags;
@@ -615,8 +598,14 @@ bool acpi_ghes_present(void)
                                                        NULL));
 
     if (!acpi_ged_state) {
-        return false;
+        return NULL;
     }
     ags = &acpi_ged_state->ghes_state;
-    return ags->present;
+    if (!ags->present) {
+        return NULL;
+    }
+    if (!ags->hw_error_le && !ags->hest_addr_le) {
+        return NULL;
+    }
+    return ags;
 }
diff --git a/include/hw/acpi/ghes.h b/include/hw/acpi/ghes.h
index 2e8405edfe27..64fe2b5bea65 100644
--- a/include/hw/acpi/ghes.h
+++ b/include/hw/acpi/ghes.h
@@ -91,10 +91,11 @@ void ghes_record_cper_errors(const void *cper, size_t len,
                              uint16_t source_id, Error **errp);
 
 /**
- * acpi_ghes_present: Report whether ACPI GHES table is present
+ * acpi_ghes_get_state: Get a pointer for ACPI ghes state
  *
- * Returns: true if the system has an ACPI GHES table and it is
- * safe to call acpi_ghes_memory_errors() to record a memory error.
+ * Returns: a pointer to ghes state if the system has an ACPI GHES table,
+ * it is enabled and it is safe to call acpi_ghes_memory_errors() to record
+ * a memory error. Returns false, otherwise.
  */
-bool acpi_ghes_present(void);
+AcpiGhesState *acpi_ghes_get_state(void);
 #endif
diff --git a/target/arm/kvm.c b/target/arm/kvm.c
index b4260467f8b9..7802c32fb7e0 100644
--- a/target/arm/kvm.c
+++ b/target/arm/kvm.c
@@ -2369,7 +2369,7 @@ void kvm_arch_on_sigbus_vcpu(CPUState *c, int code, void *addr)
 
     assert(code == BUS_MCEERR_AR || code == BUS_MCEERR_AO);
 
-    if (acpi_ghes_present() && addr) {
+    if (acpi_ghes_get_state() && addr) {
         ram_addr = qemu_ram_addr_from_host(addr);
         if (ram_addr != RAM_ADDR_INVALID &&
             kvm_physical_memory_addr_from_host(c->kvm_state, addr, &paddr)) {