[Qemu-devel] [PATCH v2] spapr: fix memory hotplug error path

Greg Kurz posted 1 patch 6 years, 9 months ago
Failed in applying to current master (apply log)
hw/ppc/spapr.c |   26 ++++++++++++++++++++++----
1 file changed, 22 insertions(+), 4 deletions(-)
[Qemu-devel] [PATCH v2] spapr: fix memory hotplug error path
Posted by Greg Kurz 6 years, 9 months ago
QEMU shouldn't abort if spapr_add_lmbs()->spapr_drc_attach() fails.
Let's propagate the error instead, like it is done everywhere else
where spapr_drc_attach() is called.

Signed-off-by: Greg Kurz <groug@kaod.org>
---
Changes in v2: - added rollback code
---
 hw/ppc/spapr.c |   26 ++++++++++++++++++++++----
 1 file changed, 22 insertions(+), 4 deletions(-)

diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
index 70b3fd374e2b..ba8f57a5a054 100644
--- a/hw/ppc/spapr.c
+++ b/hw/ppc/spapr.c
@@ -2601,6 +2601,7 @@ static void spapr_add_lmbs(DeviceState *dev, uint64_t addr_start, uint64_t size,
     int i, fdt_offset, fdt_size;
     void *fdt;
     uint64_t addr = addr_start;
+    Error *local_err = NULL;
 
     for (i = 0; i < nr_lmbs; i++) {
         drc = spapr_drc_by_id(TYPE_SPAPR_DRC_LMB,
@@ -2611,7 +2612,18 @@ static void spapr_add_lmbs(DeviceState *dev, uint64_t addr_start, uint64_t size,
         fdt_offset = spapr_populate_memory_node(fdt, node, addr,
                                                 SPAPR_MEMORY_BLOCK_SIZE);
 
-        spapr_drc_attach(drc, dev, fdt, fdt_offset, errp);
+        spapr_drc_attach(drc, dev, fdt, fdt_offset, &local_err);
+        if (local_err) {
+            while (addr > addr_start) {
+                addr -= SPAPR_MEMORY_BLOCK_SIZE;
+                drc = spapr_drc_by_id(TYPE_SPAPR_DRC_LMB,
+                                      addr / SPAPR_MEMORY_BLOCK_SIZE);
+                spapr_drc_detach(drc, dev, NULL);
+            }
+            g_free(fdt);
+            error_propagate(errp, local_err);
+            return;
+        }
         addr += SPAPR_MEMORY_BLOCK_SIZE;
     }
     /* send hotplug notification to the
@@ -2651,14 +2663,20 @@ static void spapr_memory_plug(HotplugHandler *hotplug_dev, DeviceState *dev,
     addr = object_property_get_uint(OBJECT(dimm),
                                     PC_DIMM_ADDR_PROP, &local_err);
     if (local_err) {
-        pc_dimm_memory_unplug(dev, &ms->hotplug_memory, mr);
-        goto out;
+        goto out_unplug;
     }
 
     spapr_add_lmbs(dev, addr, size, node,
                    spapr_ovec_test(ms->ov5_cas, OV5_HP_EVT),
-                   &error_abort);
+                   &local_err);
+    if (local_err) {
+        goto out_unplug;
+    }
+
+    return;
 
+out_unplug:
+    pc_dimm_memory_unplug(dev, &ms->hotplug_memory, mr);
 out:
     error_propagate(errp, local_err);
 }


Re: [Qemu-devel] [Qemu-ppc] [PATCH v2] spapr: fix memory hotplug error path
Posted by Daniel Henrique Barboza 6 years, 9 months ago

On 07/03/2017 11:54 AM, Greg Kurz wrote:
> QEMU shouldn't abort if spapr_add_lmbs()->spapr_drc_attach() fails.
> Let's propagate the error instead, like it is done everywhere else
> where spapr_drc_attach() is called.
>
> Signed-off-by: Greg Kurz <groug@kaod.org>
> ---
> Changes in v2: - added rollback code
> ---
>   hw/ppc/spapr.c |   26 ++++++++++++++++++++++----
>   1 file changed, 22 insertions(+), 4 deletions(-)
>
> diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
> index 70b3fd374e2b..ba8f57a5a054 100644
> --- a/hw/ppc/spapr.c
> +++ b/hw/ppc/spapr.c
> @@ -2601,6 +2601,7 @@ static void spapr_add_lmbs(DeviceState *dev, uint64_t addr_start, uint64_t size,
>       int i, fdt_offset, fdt_size;
>       void *fdt;
>       uint64_t addr = addr_start;
> +    Error *local_err = NULL;
>
>       for (i = 0; i < nr_lmbs; i++) {
>           drc = spapr_drc_by_id(TYPE_SPAPR_DRC_LMB,
> @@ -2611,7 +2612,18 @@ static void spapr_add_lmbs(DeviceState *dev, uint64_t addr_start, uint64_t size,
>           fdt_offset = spapr_populate_memory_node(fdt, node, addr,
>                                                   SPAPR_MEMORY_BLOCK_SIZE);
>
> -        spapr_drc_attach(drc, dev, fdt, fdt_offset, errp);
> +        spapr_drc_attach(drc, dev, fdt, fdt_offset, &local_err);
> +        if (local_err) {
> +            while (addr > addr_start) {
> +                addr -= SPAPR_MEMORY_BLOCK_SIZE;
> +                drc = spapr_drc_by_id(TYPE_SPAPR_DRC_LMB,
> +                                      addr / SPAPR_MEMORY_BLOCK_SIZE);
> +                spapr_drc_detach(drc, dev, NULL);

Question: why pass NULL instead of '&local_err' in detach() ? Can we 
ignore any
any error that happens in detach() because you're propagating the 
local_err set
in spapr_drc_attach() already?

ps: I am aware that ATM detach() does not propagate anything to the errp 
being
passed. I don't think it's wise to write new code based on this 
assumption though.


Thanks,


Daniel

> +            }
> +            g_free(fdt);
> +            error_propagate(errp, local_err);
> +            return;
> +        }
>           addr += SPAPR_MEMORY_BLOCK_SIZE;
>       }
>       /* send hotplug notification to the
> @@ -2651,14 +2663,20 @@ static void spapr_memory_plug(HotplugHandler *hotplug_dev, DeviceState *dev,
>       addr = object_property_get_uint(OBJECT(dimm),
>                                       PC_DIMM_ADDR_PROP, &local_err);
>       if (local_err) {
> -        pc_dimm_memory_unplug(dev, &ms->hotplug_memory, mr);
> -        goto out;
> +        goto out_unplug;
>       }
>
>       spapr_add_lmbs(dev, addr, size, node,
>                      spapr_ovec_test(ms->ov5_cas, OV5_HP_EVT),
> -                   &error_abort);
> +                   &local_err);
> +    if (local_err) {
> +        goto out_unplug;
> +    }
> +
> +    return;
>
> +out_unplug:
> +    pc_dimm_memory_unplug(dev, &ms->hotplug_memory, mr);
>   out:
>       error_propagate(errp, local_err);
>   }
>
>


Re: [Qemu-devel] [Qemu-ppc] [PATCH v2] spapr: fix memory hotplug error path
Posted by Greg Kurz 6 years, 9 months ago
On Mon, 3 Jul 2017 14:03:07 -0300
Daniel Henrique Barboza <danielhb@linux.vnet.ibm.com> wrote:

> On 07/03/2017 11:54 AM, Greg Kurz wrote:
> > QEMU shouldn't abort if spapr_add_lmbs()->spapr_drc_attach() fails.
> > Let's propagate the error instead, like it is done everywhere else
> > where spapr_drc_attach() is called.
> >
> > Signed-off-by: Greg Kurz <groug@kaod.org>
> > ---
> > Changes in v2: - added rollback code
> > ---
> >   hw/ppc/spapr.c |   26 ++++++++++++++++++++++----
> >   1 file changed, 22 insertions(+), 4 deletions(-)
> >
> > diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
> > index 70b3fd374e2b..ba8f57a5a054 100644
> > --- a/hw/ppc/spapr.c
> > +++ b/hw/ppc/spapr.c
> > @@ -2601,6 +2601,7 @@ static void spapr_add_lmbs(DeviceState *dev, uint64_t addr_start, uint64_t size,
> >       int i, fdt_offset, fdt_size;
> >       void *fdt;
> >       uint64_t addr = addr_start;
> > +    Error *local_err = NULL;
> >
> >       for (i = 0; i < nr_lmbs; i++) {
> >           drc = spapr_drc_by_id(TYPE_SPAPR_DRC_LMB,
> > @@ -2611,7 +2612,18 @@ static void spapr_add_lmbs(DeviceState *dev, uint64_t addr_start, uint64_t size,
> >           fdt_offset = spapr_populate_memory_node(fdt, node, addr,
> >                                                   SPAPR_MEMORY_BLOCK_SIZE);
> >
> > -        spapr_drc_attach(drc, dev, fdt, fdt_offset, errp);
> > +        spapr_drc_attach(drc, dev, fdt, fdt_offset, &local_err);
> > +        if (local_err) {
> > +            while (addr > addr_start) {
> > +                addr -= SPAPR_MEMORY_BLOCK_SIZE;
> > +                drc = spapr_drc_by_id(TYPE_SPAPR_DRC_LMB,
> > +                                      addr / SPAPR_MEMORY_BLOCK_SIZE);
> > +                spapr_drc_detach(drc, dev, NULL);  
> 
> Question: why pass NULL instead of '&local_err' in detach() ?

local_err already contains the error returned by spapr_drc_attach(), it
cannot be used anymore.

> Can we ignore any
> any error that happens in detach() because you're propagating the 
> local_err set
> in spapr_drc_attach() already?
> 

This is a rollback path: the best we can do is to try to undo the
already attached DRCs...

> ps: I am aware that ATM detach() does not propagate anything to the errp 
> being
> passed. I don't think it's wise to write new code based on this 
> assumption though.
> 

... and indeed detach() cannot fail with the current code base. :)

David was even planning to drop the errp argument in the "spapr:
Refactor spapr_drc_detach()" patch.

Otherwise, maybe pass &error_abort since failing to rollback is
likely to be a bug ?

Cheers,

--
Greg

> 
> Thanks,
> 
> 
> Daniel
> 
> > +            }
> > +            g_free(fdt);
> > +            error_propagate(errp, local_err);
> > +            return;
> > +        }
> >           addr += SPAPR_MEMORY_BLOCK_SIZE;
> >       }
> >       /* send hotplug notification to the
> > @@ -2651,14 +2663,20 @@ static void spapr_memory_plug(HotplugHandler *hotplug_dev, DeviceState *dev,
> >       addr = object_property_get_uint(OBJECT(dimm),
> >                                       PC_DIMM_ADDR_PROP, &local_err);
> >       if (local_err) {
> > -        pc_dimm_memory_unplug(dev, &ms->hotplug_memory, mr);
> > -        goto out;
> > +        goto out_unplug;
> >       }
> >
> >       spapr_add_lmbs(dev, addr, size, node,
> >                      spapr_ovec_test(ms->ov5_cas, OV5_HP_EVT),
> > -                   &error_abort);
> > +                   &local_err);
> > +    if (local_err) {
> > +        goto out_unplug;
> > +    }
> > +
> > +    return;
> >
> > +out_unplug:
> > +    pc_dimm_memory_unplug(dev, &ms->hotplug_memory, mr);
> >   out:
> >       error_propagate(errp, local_err);
> >   }
> >
> >  
> 

Re: [Qemu-devel] [Qemu-ppc] [PATCH v2] spapr: fix memory hotplug error path
Posted by Daniel Henrique Barboza 6 years, 9 months ago

On 07/04/2017 04:42 AM, Greg Kurz wrote:
> On Mon, 3 Jul 2017 14:03:07 -0300
> Daniel Henrique Barboza <danielhb@linux.vnet.ibm.com> wrote:
>
>> On 07/03/2017 11:54 AM, Greg Kurz wrote:
>>> QEMU shouldn't abort if spapr_add_lmbs()->spapr_drc_attach() fails.
>>> Let's propagate the error instead, like it is done everywhere else
>>> where spapr_drc_attach() is called.
>>>
>>> Signed-off-by: Greg Kurz <groug@kaod.org>
>>> ---
>>> Changes in v2: - added rollback code
>>> ---
>>>    hw/ppc/spapr.c |   26 ++++++++++++++++++++++----
>>>    1 file changed, 22 insertions(+), 4 deletions(-)
>>>
>>> diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
>>> index 70b3fd374e2b..ba8f57a5a054 100644
>>> --- a/hw/ppc/spapr.c
>>> +++ b/hw/ppc/spapr.c
>>> @@ -2601,6 +2601,7 @@ static void spapr_add_lmbs(DeviceState *dev, uint64_t addr_start, uint64_t size,
>>>        int i, fdt_offset, fdt_size;
>>>        void *fdt;
>>>        uint64_t addr = addr_start;
>>> +    Error *local_err = NULL;
>>>
>>>        for (i = 0; i < nr_lmbs; i++) {
>>>            drc = spapr_drc_by_id(TYPE_SPAPR_DRC_LMB,
>>> @@ -2611,7 +2612,18 @@ static void spapr_add_lmbs(DeviceState *dev, uint64_t addr_start, uint64_t size,
>>>            fdt_offset = spapr_populate_memory_node(fdt, node, addr,
>>>                                                    SPAPR_MEMORY_BLOCK_SIZE);
>>>
>>> -        spapr_drc_attach(drc, dev, fdt, fdt_offset, errp);
>>> +        spapr_drc_attach(drc, dev, fdt, fdt_offset, &local_err);
>>> +        if (local_err) {
>>> +            while (addr > addr_start) {
>>> +                addr -= SPAPR_MEMORY_BLOCK_SIZE;
>>> +                drc = spapr_drc_by_id(TYPE_SPAPR_DRC_LMB,
>>> +                                      addr / SPAPR_MEMORY_BLOCK_SIZE);
>>> +                spapr_drc_detach(drc, dev, NULL);
>> Question: why pass NULL instead of '&local_err' in detach() ?
> local_err already contains the error returned by spapr_drc_attach(), it
> cannot be used anymore.
>
>> Can we ignore any
>> any error that happens in detach() because you're propagating the
>> local_err set
>> in spapr_drc_attach() already?
>>
> This is a rollback path: the best we can do is to try to undo the
> already attached DRCs...
>
>> ps: I am aware that ATM detach() does not propagate anything to the errp
>> being
>> passed. I don't think it's wise to write new code based on this
>> assumption though.
>>
> ... and indeed detach() cannot fail with the current code base. :)
>
> David was even planning to drop the errp argument in the "spapr:
> Refactor spapr_drc_detach()" patch.

Dropping it would be good. There are places in the code that are
passing non-null pointers tof the Errp parameter of detach().

>
> Otherwise, maybe pass &error_abort since failing to rollback is
> likely to be a bug ?

It would be saner for the reader but in reality it would be of no avail, 
given
this info that detach() will probably never make use of it. AFAIC you 
can leave
it as NULL. If you end up sending a v3 for other reasons then you can 
change it
for &error_abort in the process.


Reviewed-by: Daniel Barboza <danielhb@linux.vnet.ibm.com>

>
> Cheers,
>
> --
> Greg
>
>> Thanks,
>>
>>
>> Daniel
>>
>>> +            }
>>> +            g_free(fdt);
>>> +            error_propagate(errp, local_err);
>>> +            return;
>>> +        }
>>>            addr += SPAPR_MEMORY_BLOCK_SIZE;
>>>        }
>>>        /* send hotplug notification to the
>>> @@ -2651,14 +2663,20 @@ static void spapr_memory_plug(HotplugHandler *hotplug_dev, DeviceState *dev,
>>>        addr = object_property_get_uint(OBJECT(dimm),
>>>                                        PC_DIMM_ADDR_PROP, &local_err);
>>>        if (local_err) {
>>> -        pc_dimm_memory_unplug(dev, &ms->hotplug_memory, mr);
>>> -        goto out;
>>> +        goto out_unplug;
>>>        }
>>>
>>>        spapr_add_lmbs(dev, addr, size, node,
>>>                       spapr_ovec_test(ms->ov5_cas, OV5_HP_EVT),
>>> -                   &error_abort);
>>> +                   &local_err);
>>> +    if (local_err) {
>>> +        goto out_unplug;
>>> +    }
>>> +
>>> +    return;
>>>
>>> +out_unplug:
>>> +    pc_dimm_memory_unplug(dev, &ms->hotplug_memory, mr);
>>>    out:
>>>        error_propagate(errp, local_err);
>>>    }
>>>
>>>   


Re: [Qemu-devel] [Qemu-ppc] [PATCH v2] spapr: fix memory hotplug error path
Posted by David Gibson 6 years, 9 months ago
On Tue, Jul 04, 2017 at 09:42:52AM +0200, Greg Kurz wrote:
> On Mon, 3 Jul 2017 14:03:07 -0300
> Daniel Henrique Barboza <danielhb@linux.vnet.ibm.com> wrote:
> 
> > On 07/03/2017 11:54 AM, Greg Kurz wrote:
> > > QEMU shouldn't abort if spapr_add_lmbs()->spapr_drc_attach() fails.
> > > Let's propagate the error instead, like it is done everywhere else
> > > where spapr_drc_attach() is called.
> > >
> > > Signed-off-by: Greg Kurz <groug@kaod.org>
> > > ---
> > > Changes in v2: - added rollback code
> > > ---
> > >   hw/ppc/spapr.c |   26 ++++++++++++++++++++++----
> > >   1 file changed, 22 insertions(+), 4 deletions(-)
> > >
> > > diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
> > > index 70b3fd374e2b..ba8f57a5a054 100644
> > > --- a/hw/ppc/spapr.c
> > > +++ b/hw/ppc/spapr.c
> > > @@ -2601,6 +2601,7 @@ static void spapr_add_lmbs(DeviceState *dev, uint64_t addr_start, uint64_t size,
> > >       int i, fdt_offset, fdt_size;
> > >       void *fdt;
> > >       uint64_t addr = addr_start;
> > > +    Error *local_err = NULL;
> > >
> > >       for (i = 0; i < nr_lmbs; i++) {
> > >           drc = spapr_drc_by_id(TYPE_SPAPR_DRC_LMB,
> > > @@ -2611,7 +2612,18 @@ static void spapr_add_lmbs(DeviceState *dev, uint64_t addr_start, uint64_t size,
> > >           fdt_offset = spapr_populate_memory_node(fdt, node, addr,
> > >                                                   SPAPR_MEMORY_BLOCK_SIZE);
> > >
> > > -        spapr_drc_attach(drc, dev, fdt, fdt_offset, errp);
> > > +        spapr_drc_attach(drc, dev, fdt, fdt_offset, &local_err);
> > > +        if (local_err) {
> > > +            while (addr > addr_start) {
> > > +                addr -= SPAPR_MEMORY_BLOCK_SIZE;
> > > +                drc = spapr_drc_by_id(TYPE_SPAPR_DRC_LMB,
> > > +                                      addr / SPAPR_MEMORY_BLOCK_SIZE);
> > > +                spapr_drc_detach(drc, dev, NULL);  
> > 
> > Question: why pass NULL instead of '&local_err' in detach() ?
> 
> local_err already contains the error returned by spapr_drc_attach(), it
> cannot be used anymore.
> 
> > Can we ignore any
> > any error that happens in detach() because you're propagating the 
> > local_err set
> > in spapr_drc_attach() already?
> 
> This is a rollback path: the best we can do is to try to undo the
> already attached DRCs...

Yeah, this is basically the classic exception-during-destructor
problem.  Dealing with an error condition while in the middle of
rolling back another error can get really messy.

> > ps: I am aware that ATM detach() does not propagate anything to the errp 
> > being
> > passed. I don't think it's wise to write new code based on this 
> > assumption though.
> > 
> 
> ... and indeed detach() cannot fail with the current code base. :)

> David was even planning to drop the errp argument in the "spapr:
> Refactor spapr_drc_detach()" patch.

Yeah, but as you pointed out in comments on that patch, that's
probably not a good idea.  Except.. then I looked deeper still and the
story gets more complex again.  I'll revisit that when I respin those
DRC patches.

> Otherwise, maybe pass &error_abort since failing to rollback is
> likely to be a bug ?

Yeah, I think the only sane options here are either to ignore (second
order) errors, or abort.

My usual inclination would be to abort, but I'm a bit concerned that
provides a means by which the guest can crash qemu.  I'm going to
apply the patch as is for now, since I'm pretty confident that it
makes things better than they weren.  I hope we can continue to
improve the detach error handling as a follow up.

-- 
David Gibson			| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au	| minimalist, thank you.  NOT _the_ _other_
				| _way_ _around_!
http://www.ozlabs.org/~dgibson