tools/libs/light/libxl_pci.c | 32 ++++++++++++++++++++++++++++++-- 1 file changed, 30 insertions(+), 2 deletions(-)
PCI device assignment to an HVM with stubdom is potentially racy. First
the PCI device is assigned to the stubdom via the PV PCI protocol. Then
QEMU is sent a QMP command to attach the PCI device to QEMU running
within the stubdom. However, the sysfs entries within the stubdom may
not have appeared by the time QEMU receives the device_add command
resulting in errors like:
libxl_qmp.c:1838:qmp_ev_parse_error_messages:Domain 10:Could not open '/sys/bus/pci/devices/0000:00:1f.3/config': No such file or directory
This patch retries the device assignment up to 10 times with a 1 second
delay between. That roughly matches the overall hotplug timeout.
The qmp_ev_parse_error_messages error is still printed since it happens
at a lower level than the pci code controlling the retries. With that,
the "Retrying PCI add %d" message is also printed at ERROR level to
clarify what is happening.
Signed-off-by: Jason Andryuk <jandryuk@gmail.com>
---
tools/libs/light/libxl_pci.c | 32 ++++++++++++++++++++++++++++++--
1 file changed, 30 insertions(+), 2 deletions(-)
diff --git a/tools/libs/light/libxl_pci.c b/tools/libs/light/libxl_pci.c
index 4bbbfe9f16..8d6a0e65cc 100644
--- a/tools/libs/light/libxl_pci.c
+++ b/tools/libs/light/libxl_pci.c
@@ -1109,8 +1109,10 @@ typedef struct pci_add_state {
libxl__xswait_state xswait;
libxl__ev_qmp qmp;
libxl__ev_time timeout;
+ libxl__ev_time timeout_retries;
libxl_device_pci pci;
libxl_domid pci_domid;
+ int retries;
} pci_add_state;
static void pci_add_qemu_trad_watch_state_cb(libxl__egc *egc,
@@ -1118,6 +1120,8 @@ static void pci_add_qemu_trad_watch_state_cb(libxl__egc *egc,
static void pci_add_qmp_device_add(libxl__egc *, pci_add_state *);
static void pci_add_qmp_device_add_cb(libxl__egc *,
libxl__ev_qmp *, const libxl__json_object *, int rc);
+static void pci_add_qmp_device_add_retry(libxl__egc *egc, libxl__ev_time *ev,
+ const struct timeval *requested_abs, int rc);
static void pci_add_qmp_query_pci_cb(libxl__egc *,
libxl__ev_qmp *, const libxl__json_object *, int rc);
static void pci_add_timeout(libxl__egc *egc, libxl__ev_time *ev,
@@ -1137,7 +1141,9 @@ static void do_pci_add(libxl__egc *egc,
libxl__xswait_init(&pas->xswait);
libxl__ev_qmp_init(&pas->qmp);
pas->pci_domid = domid;
+ pas->retries = 0;
libxl__ev_time_init(&pas->timeout);
+ libxl__ev_time_init(&pas->timeout_retries);
if (type == LIBXL_DOMAIN_TYPE_INVALID) {
rc = ERROR_FAIL;
@@ -1252,10 +1258,22 @@ static void pci_add_qmp_device_add_cb(libxl__egc *egc,
const libxl__json_object *response,
int rc)
{
- EGC_GC;
pci_add_state *pas = CONTAINER_OF(qmp, *pas, qmp);
+ STATE_AO_GC(pas->aodev->ao);
- if (rc) goto out;
+ if (rc) {
+ if (pas->retries++ < 10) {
+ LOGD(ERROR, qmp->domid, "Retrying PCI add %d", pas->retries);
+ rc = libxl__ev_time_register_rel(ao, &pas->timeout_retries,
+ pci_add_qmp_device_add_retry,
+ 1000);
+ if (rc) goto out;
+
+ return; /* wait for the timeout to then retry */
+ } else {
+ goto out;
+ }
+ }
qmp->callback = pci_add_qmp_query_pci_cb;
rc = libxl__ev_qmp_send(egc, qmp, "query-pci", NULL);
@@ -1266,6 +1284,15 @@ out:
pci_add_dm_done(egc, pas, rc); /* must be last */
}
+static void pci_add_qmp_device_add_retry(libxl__egc *egc, libxl__ev_time *ev,
+ const struct timeval *requested_abs,
+ int rc)
+{
+ pci_add_state *pas = CONTAINER_OF(ev, *pas, timeout_retries);
+
+ pci_add_qmp_device_add(egc, pas);
+}
+
static void pci_add_qmp_query_pci_cb(libxl__egc *egc,
libxl__ev_qmp *qmp,
const libxl__json_object *response,
@@ -1507,6 +1534,7 @@ out_no_irq:
rc = 0;
out:
libxl__ev_time_deregister(gc, &pas->timeout);
+ libxl__ev_time_deregister(gc, &pas->timeout_retries);
pas->callback(egc, pas, rc);
}
--
2.35.1
On Wed, Mar 30, 2022 at 03:46:56PM -0400, Jason Andryuk wrote: > PCI device assignment to an HVM with stubdom is potentially racy. First > the PCI device is assigned to the stubdom via the PV PCI protocol. Then > QEMU is sent a QMP command to attach the PCI device to QEMU running > within the stubdom. However, the sysfs entries within the stubdom may > not have appeared by the time QEMU receives the device_add command > resulting in errors like: > > libxl_qmp.c:1838:qmp_ev_parse_error_messages:Domain 10:Could not open '/sys/bus/pci/devices/0000:00:1f.3/config': No such file or directory > > This patch retries the device assignment up to 10 times with a 1 second > delay between. That roughly matches the overall hotplug timeout. > > The qmp_ev_parse_error_messages error is still printed since it happens > at a lower level than the pci code controlling the retries. With that, > the "Retrying PCI add %d" message is also printed at ERROR level to > clarify what is happening. > > Signed-off-by: Jason Andryuk <jandryuk@gmail.com> > --- > @@ -1252,10 +1258,22 @@ static void pci_add_qmp_device_add_cb(libxl__egc *egc, > const libxl__json_object *response, > int rc) > { > - EGC_GC; > pci_add_state *pas = CONTAINER_OF(qmp, *pas, qmp); > + STATE_AO_GC(pas->aodev->ao); I think this changes are wrong, what is the reason to use STATE_AO_GC instead of EGC_GC? I think in this case, it is fine to keep using EGC_GC, as there doesn't seems to be any allocation that needs to live after this function returns. And you could just pass `pas->aodev->ao` to libxl__ev_time_register_rel(). > > - if (rc) goto out; > + if (rc) { > + if (pas->retries++ < 10) { > + LOGD(ERROR, qmp->domid, "Retrying PCI add %d", pas->retries); > + rc = libxl__ev_time_register_rel(ao, &pas->timeout_retries, > + pci_add_qmp_device_add_retry, > + 1000); > + if (rc) goto out; > + > + return; /* wait for the timeout to then retry */ > + } else { > + goto out; > + } > + } So this retry logic would be done regardless of whether stubdom is in use or not. It's not going to be useful in the non-stubdom case, is it? When adding a pci device to a domain that has its device model in a stubdomain, there's a first call to do_pci_add() which works fine, right? Then there's a callback to device_pci_add_stubdom_wait(), which is supposed to wait for the stubdom to be ready, but the sysfs entry might still be missing at that time, right? Then, there is a second call to do_pci_add() for the guest, and it's the one that fail in your case, right? If my reasoning is correct, could we enable the retry logic only for the second do_pci_add() call? So that guests without stubdom aren't impacted as I don't think retrying in this case would be useful and would just delay the error. Cheers, -- Anthony PERARD
On Fri, Apr 8, 2022 at 10:56 AM Anthony PERARD <anthony.perard@citrix.com> wrote: > > On Wed, Mar 30, 2022 at 03:46:56PM -0400, Jason Andryuk wrote: > > PCI device assignment to an HVM with stubdom is potentially racy. First > > the PCI device is assigned to the stubdom via the PV PCI protocol. Then > > QEMU is sent a QMP command to attach the PCI device to QEMU running > > within the stubdom. However, the sysfs entries within the stubdom may > > not have appeared by the time QEMU receives the device_add command > > resulting in errors like: > > > > libxl_qmp.c:1838:qmp_ev_parse_error_messages:Domain 10:Could not open '/sys/bus/pci/devices/0000:00:1f.3/config': No such file or directory > > > > This patch retries the device assignment up to 10 times with a 1 second > > delay between. That roughly matches the overall hotplug timeout. > > > > The qmp_ev_parse_error_messages error is still printed since it happens > > at a lower level than the pci code controlling the retries. With that, > > the "Retrying PCI add %d" message is also printed at ERROR level to > > clarify what is happening. > > > > Signed-off-by: Jason Andryuk <jandryuk@gmail.com> > > --- > > @@ -1252,10 +1258,22 @@ static void pci_add_qmp_device_add_cb(libxl__egc *egc, > > const libxl__json_object *response, > > int rc) > > { > > - EGC_GC; > > pci_add_state *pas = CONTAINER_OF(qmp, *pas, qmp); > > + STATE_AO_GC(pas->aodev->ao); > > I think this changes are wrong, what is the reason to use STATE_AO_GC > instead of EGC_GC? > I think in this case, it is fine to keep using EGC_GC, as there doesn't > seems to be any allocation that needs to live after this function > returns. And you could just pass `pas->aodev->ao` to > libxl__ev_time_register_rel(). Yes, I think you are correct. I think I made this change to use a libxl macro to declare the local ao variable, but as you say that is not needed. > > > > - if (rc) goto out; > > + if (rc) { > > + if (pas->retries++ < 10) { > > + LOGD(ERROR, qmp->domid, "Retrying PCI add %d", pas->retries); > > + rc = libxl__ev_time_register_rel(ao, &pas->timeout_retries, > > + pci_add_qmp_device_add_retry, > > + 1000); > > + if (rc) goto out; > > + > > + return; /* wait for the timeout to then retry */ > > + } else { > > + goto out; > > + } > > + } > > So this retry logic would be done regardless of whether stubdom is in > use or not. It's not going to be useful in the non-stubdom case, is it? Yes, it should not be applicable for non-stubdom since the sysfs entries should already be present. > When adding a pci device to a domain that has its device model in a > stubdomain, there's a first call to do_pci_add() which works fine, > right? Then there's a callback to device_pci_add_stubdom_wait(), which > is supposed to wait for the stubdom to be ready, but the sysfs entry > might still be missing at that time, right? Then, there is a second call > to do_pci_add() for the guest, and it's the one that fail in your case, > right? Yes, I think that is all correct. > If my reasoning is correct, could we enable the retry logic only for the > second do_pci_add() call? So that guests without stubdom aren't impacted > as I don't think retrying in this case would be useful and would just > delay the error. That should be possible. Instead of differentiating between the do_pci_add invocation, I'm thinking pci_add_qmp_device_add_cb would gain a stubdom check: if (rc) { /* Retry only applicable for HVM with stubdom. */ if (libxl_get_stubdom_id(CTX, domid) == 0) goto out; /* retry logic */ } I just noticed that pci_add_qmp_device_add() registers the pci_add_timeout, and pci_add_qmp_device_add_retry() calls back into pci_add_qmp_device_add(). I expected a single 10 second timeout across all the entire add operation, but the retry logic is re-registering it. The duplicate registration may also be corrupting lists? I plan to move the pci_add_timeout registration into do_pci_add() I went with 10 retries of 1 second each since that matches the sleep(1) loop patch coming out of OpenXT. It seems like it could be optimized, but trying for longer and succeeding seems better than shorter and failing. Thanks, Jason
© 2016 - 2024 Red Hat, Inc.