[PATCH] libxl: Retry QMP PCI device_add

Jason Andryuk posted 1 patch 2 years, 8 months ago
Test gitlab-ci failed
Patches applied successfully (tree, apply log)
git fetch https://gitlab.com/xen-project/patchew/xen tags/patchew/20220330194656.39051-1-jandryuk@gmail.com
There is a newer version of this series
tools/libs/light/libxl_pci.c | 32 ++++++++++++++++++++++++++++++--
1 file changed, 30 insertions(+), 2 deletions(-)
[PATCH] libxl: Retry QMP PCI device_add
Posted by Jason Andryuk 2 years, 8 months ago
PCI device assignment to an HVM with stubdom is potentially racy.  First
the PCI device is assigned to the stubdom via the PV PCI protocol.  Then
QEMU is sent a QMP command to attach the PCI device to QEMU running
within the stubdom.  However, the sysfs entries within the stubdom may
not have appeared by the time QEMU receives the device_add command
resulting in errors like:

libxl_qmp.c:1838:qmp_ev_parse_error_messages:Domain 10:Could not open '/sys/bus/pci/devices/0000:00:1f.3/config': No such file or directory

This patch retries the device assignment up to 10 times with a 1 second
delay between.  That roughly matches the overall hotplug timeout.

The qmp_ev_parse_error_messages error is still printed since it happens
at a lower level than the pci code controlling the retries.  With that,
the "Retrying PCI add %d" message is also printed at ERROR level to
clarify what is happening.

Signed-off-by: Jason Andryuk <jandryuk@gmail.com>
---
 tools/libs/light/libxl_pci.c | 32 ++++++++++++++++++++++++++++++--
 1 file changed, 30 insertions(+), 2 deletions(-)

diff --git a/tools/libs/light/libxl_pci.c b/tools/libs/light/libxl_pci.c
index 4bbbfe9f16..8d6a0e65cc 100644
--- a/tools/libs/light/libxl_pci.c
+++ b/tools/libs/light/libxl_pci.c
@@ -1109,8 +1109,10 @@ typedef struct pci_add_state {
     libxl__xswait_state xswait;
     libxl__ev_qmp qmp;
     libxl__ev_time timeout;
+    libxl__ev_time timeout_retries;
     libxl_device_pci pci;
     libxl_domid pci_domid;
+    int retries;
 } pci_add_state;
 
 static void pci_add_qemu_trad_watch_state_cb(libxl__egc *egc,
@@ -1118,6 +1120,8 @@ static void pci_add_qemu_trad_watch_state_cb(libxl__egc *egc,
 static void pci_add_qmp_device_add(libxl__egc *, pci_add_state *);
 static void pci_add_qmp_device_add_cb(libxl__egc *,
     libxl__ev_qmp *, const libxl__json_object *, int rc);
+static void pci_add_qmp_device_add_retry(libxl__egc *egc, libxl__ev_time *ev,
+    const struct timeval *requested_abs, int rc);
 static void pci_add_qmp_query_pci_cb(libxl__egc *,
     libxl__ev_qmp *, const libxl__json_object *, int rc);
 static void pci_add_timeout(libxl__egc *egc, libxl__ev_time *ev,
@@ -1137,7 +1141,9 @@ static void do_pci_add(libxl__egc *egc,
     libxl__xswait_init(&pas->xswait);
     libxl__ev_qmp_init(&pas->qmp);
     pas->pci_domid = domid;
+    pas->retries = 0;
     libxl__ev_time_init(&pas->timeout);
+    libxl__ev_time_init(&pas->timeout_retries);
 
     if (type == LIBXL_DOMAIN_TYPE_INVALID) {
         rc = ERROR_FAIL;
@@ -1252,10 +1258,22 @@ static void pci_add_qmp_device_add_cb(libxl__egc *egc,
                                       const libxl__json_object *response,
                                       int rc)
 {
-    EGC_GC;
     pci_add_state *pas = CONTAINER_OF(qmp, *pas, qmp);
+    STATE_AO_GC(pas->aodev->ao);
 
-    if (rc) goto out;
+    if (rc) {
+        if (pas->retries++ < 10) {
+            LOGD(ERROR, qmp->domid, "Retrying PCI add %d", pas->retries);
+            rc = libxl__ev_time_register_rel(ao, &pas->timeout_retries,
+                                             pci_add_qmp_device_add_retry,
+                                             1000);
+            if (rc) goto out;
+
+            return; /* wait for the timeout to then retry */
+        } else {
+            goto out;
+        }
+    }
 
     qmp->callback = pci_add_qmp_query_pci_cb;
     rc = libxl__ev_qmp_send(egc, qmp, "query-pci", NULL);
@@ -1266,6 +1284,15 @@ out:
     pci_add_dm_done(egc, pas, rc); /* must be last */
 }
 
+static void pci_add_qmp_device_add_retry(libxl__egc *egc, libxl__ev_time *ev,
+                                         const struct timeval *requested_abs,
+                                         int rc)
+{
+    pci_add_state *pas = CONTAINER_OF(ev, *pas, timeout_retries);
+
+    pci_add_qmp_device_add(egc, pas);
+}
+
 static void pci_add_qmp_query_pci_cb(libxl__egc *egc,
                                      libxl__ev_qmp *qmp,
                                      const libxl__json_object *response,
@@ -1507,6 +1534,7 @@ out_no_irq:
         rc = 0;
 out:
     libxl__ev_time_deregister(gc, &pas->timeout);
+    libxl__ev_time_deregister(gc, &pas->timeout_retries);
     pas->callback(egc, pas, rc);
 }
 
-- 
2.35.1
Re: [PATCH] libxl: Retry QMP PCI device_add
Posted by Anthony PERARD 2 years, 7 months ago
On Wed, Mar 30, 2022 at 03:46:56PM -0400, Jason Andryuk wrote:
> PCI device assignment to an HVM with stubdom is potentially racy.  First
> the PCI device is assigned to the stubdom via the PV PCI protocol.  Then
> QEMU is sent a QMP command to attach the PCI device to QEMU running
> within the stubdom.  However, the sysfs entries within the stubdom may
> not have appeared by the time QEMU receives the device_add command
> resulting in errors like:
> 
> libxl_qmp.c:1838:qmp_ev_parse_error_messages:Domain 10:Could not open '/sys/bus/pci/devices/0000:00:1f.3/config': No such file or directory
> 
> This patch retries the device assignment up to 10 times with a 1 second
> delay between.  That roughly matches the overall hotplug timeout.
> 
> The qmp_ev_parse_error_messages error is still printed since it happens
> at a lower level than the pci code controlling the retries.  With that,
> the "Retrying PCI add %d" message is also printed at ERROR level to
> clarify what is happening.
> 
> Signed-off-by: Jason Andryuk <jandryuk@gmail.com>
> ---
> @@ -1252,10 +1258,22 @@ static void pci_add_qmp_device_add_cb(libxl__egc *egc,
>                                        const libxl__json_object *response,
>                                        int rc)
>  {
> -    EGC_GC;
>      pci_add_state *pas = CONTAINER_OF(qmp, *pas, qmp);
> +    STATE_AO_GC(pas->aodev->ao);

I think this changes are wrong, what is the reason to use STATE_AO_GC
instead of EGC_GC?
I think in this case, it is fine to keep using EGC_GC, as there doesn't
seems to be any allocation that needs to live after this function
returns. And you could just pass `pas->aodev->ao` to
libxl__ev_time_register_rel().

>  
> -    if (rc) goto out;
> +    if (rc) {
> +        if (pas->retries++ < 10) {
> +            LOGD(ERROR, qmp->domid, "Retrying PCI add %d", pas->retries);
> +            rc = libxl__ev_time_register_rel(ao, &pas->timeout_retries,
> +                                             pci_add_qmp_device_add_retry,
> +                                             1000);
> +            if (rc) goto out;
> +
> +            return; /* wait for the timeout to then retry */
> +        } else {
> +            goto out;
> +        }
> +    }

So this retry logic would be done regardless of whether stubdom is in
use or not. It's not going to be useful in the non-stubdom case, is it?

When adding a pci device to a domain that has its device model in a
stubdomain, there's a first call to do_pci_add() which works fine,
right? Then there's a callback to device_pci_add_stubdom_wait(), which
is supposed to wait for the stubdom to be ready, but the sysfs entry
might still be missing at that time, right? Then, there is a second call
to do_pci_add() for the guest, and it's the one that fail in your case,
right?

If my reasoning is correct, could we enable the retry logic only for the
second do_pci_add() call? So that guests without stubdom aren't impacted
as I don't think retrying in this case would be useful and would just
delay the error.

Cheers,

-- 
Anthony PERARD
Re: [PATCH] libxl: Retry QMP PCI device_add
Posted by Jason Andryuk 2 years, 7 months ago
On Fri, Apr 8, 2022 at 10:56 AM Anthony PERARD
<anthony.perard@citrix.com> wrote:
>
> On Wed, Mar 30, 2022 at 03:46:56PM -0400, Jason Andryuk wrote:
> > PCI device assignment to an HVM with stubdom is potentially racy.  First
> > the PCI device is assigned to the stubdom via the PV PCI protocol.  Then
> > QEMU is sent a QMP command to attach the PCI device to QEMU running
> > within the stubdom.  However, the sysfs entries within the stubdom may
> > not have appeared by the time QEMU receives the device_add command
> > resulting in errors like:
> >
> > libxl_qmp.c:1838:qmp_ev_parse_error_messages:Domain 10:Could not open '/sys/bus/pci/devices/0000:00:1f.3/config': No such file or directory
> >
> > This patch retries the device assignment up to 10 times with a 1 second
> > delay between.  That roughly matches the overall hotplug timeout.
> >
> > The qmp_ev_parse_error_messages error is still printed since it happens
> > at a lower level than the pci code controlling the retries.  With that,
> > the "Retrying PCI add %d" message is also printed at ERROR level to
> > clarify what is happening.
> >
> > Signed-off-by: Jason Andryuk <jandryuk@gmail.com>
> > ---
> > @@ -1252,10 +1258,22 @@ static void pci_add_qmp_device_add_cb(libxl__egc *egc,
> >                                        const libxl__json_object *response,
> >                                        int rc)
> >  {
> > -    EGC_GC;
> >      pci_add_state *pas = CONTAINER_OF(qmp, *pas, qmp);
> > +    STATE_AO_GC(pas->aodev->ao);
>
> I think this changes are wrong, what is the reason to use STATE_AO_GC
> instead of EGC_GC?
> I think in this case, it is fine to keep using EGC_GC, as there doesn't
> seems to be any allocation that needs to live after this function
> returns. And you could just pass `pas->aodev->ao` to
> libxl__ev_time_register_rel().

Yes, I think you are correct.  I think I made this change to use a
libxl macro to declare the local ao variable, but as you say that is
not needed.

> >
> > -    if (rc) goto out;
> > +    if (rc) {
> > +        if (pas->retries++ < 10) {
> > +            LOGD(ERROR, qmp->domid, "Retrying PCI add %d", pas->retries);
> > +            rc = libxl__ev_time_register_rel(ao, &pas->timeout_retries,
> > +                                             pci_add_qmp_device_add_retry,
> > +                                             1000);
> > +            if (rc) goto out;
> > +
> > +            return; /* wait for the timeout to then retry */
> > +        } else {
> > +            goto out;
> > +        }
> > +    }
>
> So this retry logic would be done regardless of whether stubdom is in
> use or not. It's not going to be useful in the non-stubdom case, is it?

Yes, it should not be applicable for non-stubdom since the sysfs
entries should already be present.

> When adding a pci device to a domain that has its device model in a
> stubdomain, there's a first call to do_pci_add() which works fine,
> right? Then there's a callback to device_pci_add_stubdom_wait(), which
> is supposed to wait for the stubdom to be ready, but the sysfs entry
> might still be missing at that time, right? Then, there is a second call
> to do_pci_add() for the guest, and it's the one that fail in your case,
> right?

Yes, I think that is all correct.

> If my reasoning is correct, could we enable the retry logic only for the
> second do_pci_add() call? So that guests without stubdom aren't impacted
> as I don't think retrying in this case would be useful and would just
> delay the error.

That should be possible.  Instead of differentiating between the
do_pci_add invocation, I'm thinking pci_add_qmp_device_add_cb would
gain a stubdom check:

if (rc) {
    /* Retry only applicable for HVM with stubdom. */
    if (libxl_get_stubdom_id(CTX, domid) == 0)
        goto out;
    /* retry logic */
}

I just noticed that pci_add_qmp_device_add() registers the
pci_add_timeout, and pci_add_qmp_device_add_retry() calls back into
pci_add_qmp_device_add().  I expected a single 10 second timeout
across all the entire add operation, but the retry logic is
re-registering it.  The duplicate registration may also be corrupting
lists?  I plan to move the pci_add_timeout registration into
do_pci_add()

I went with 10 retries of 1 second each since that matches the
sleep(1) loop patch coming out of OpenXT.  It seems like it could be
optimized, but trying for longer and succeeding seems better than
shorter and failing.

Thanks,
Jason