From: Peter Xu <peterx@redhat.com>
QEMU resets do not have a way to order reset hooks. Add one coarse grained
reset stage so that some devices can be reset later than some others.
Signed-off-by: Peter Xu <peterx@redhat.com>
---
include/sysemu/reset.h | 5 ++++
hw/core/reset.c | 60 +++++++++++++++++++++++++++++++-----------
2 files changed, 49 insertions(+), 16 deletions(-)
diff --git a/include/sysemu/reset.h b/include/sysemu/reset.h
index 609e4d50c2..0de697ce9f 100644
--- a/include/sysemu/reset.h
+++ b/include/sysemu/reset.h
@@ -5,9 +5,14 @@
typedef void QEMUResetHandler(void *opaque);
+#define QEMU_RESET_STAGES_N 2
+
void qemu_register_reset(QEMUResetHandler *func, void *opaque);
+void qemu_register_reset_one(QEMUResetHandler *func, void *opaque,
+ bool skip_snap, int stage);
void qemu_register_reset_nosnapshotload(QEMUResetHandler *func, void *opaque);
void qemu_unregister_reset(QEMUResetHandler *func, void *opaque);
+void qemu_unregister_reset_one(QEMUResetHandler *func, void *opaque, int stage);
void qemu_devices_reset(ShutdownCause reason);
#endif
diff --git a/hw/core/reset.c b/hw/core/reset.c
index 8cf60b2b09..a84c9bee84 100644
--- a/hw/core/reset.c
+++ b/hw/core/reset.c
@@ -36,55 +36,83 @@ typedef struct QEMUResetEntry {
bool skip_on_snapshot_load;
} QEMUResetEntry;
-static QTAILQ_HEAD(, QEMUResetEntry) reset_handlers =
- QTAILQ_HEAD_INITIALIZER(reset_handlers);
+typedef QTAILQ_HEAD(QEMUResetList, QEMUResetEntry) QEMUResetList;
+static QEMUResetList reset_handlers[QEMU_RESET_STAGES_N];
-static void qemu_register_reset_one(QEMUResetHandler *func, void *opaque,
- bool skip_snap)
+static void __attribute__((__constructor__)) qemu_reset_handlers_init(void)
+{
+ QEMUResetList *head;
+ int i = 0;
+
+ for (i = 0; i < QEMU_RESET_STAGES_N; i++) {
+ head = &reset_handlers[i];
+ QTAILQ_INIT(head);
+ }
+}
+
+void qemu_register_reset_one(QEMUResetHandler *func, void *opaque,
+ bool skip_snap, int stage)
{
QEMUResetEntry *re = g_new0(QEMUResetEntry, 1);
+ QEMUResetList *head;
+
+ assert(stage >= 0 && stage < QEMU_RESET_STAGES_N);
+ head = &reset_handlers[stage];
re->func = func;
re->opaque = opaque;
re->skip_on_snapshot_load = skip_snap;
- QTAILQ_INSERT_TAIL(&reset_handlers, re, entry);
+ QTAILQ_INSERT_TAIL(head, re, entry);
}
void qemu_register_reset(QEMUResetHandler *func, void *opaque)
{
- /* By default, do not skip during load of a snapshot */
- qemu_register_reset_one(func, opaque, false);
+ qemu_register_reset_one(func, opaque, false, 0);
}
void qemu_register_reset_nosnapshotload(QEMUResetHandler *func, void *opaque)
{
- qemu_register_reset_one(func, opaque, true);
+ qemu_register_reset_one(func, opaque, true, 0);
}
-void qemu_unregister_reset(QEMUResetHandler *func, void *opaque)
+void qemu_unregister_reset_one(QEMUResetHandler *func, void *opaque, int stage)
{
+ QEMUResetList *head;
QEMUResetEntry *re;
- QTAILQ_FOREACH(re, &reset_handlers, entry) {
+ assert(stage >= 0 && stage < QEMU_RESET_STAGES_N);
+ head = &reset_handlers[stage];
+
+ QTAILQ_FOREACH(re, head, entry) {
if (re->func == func && re->opaque == opaque) {
- QTAILQ_REMOVE(&reset_handlers, re, entry);
+ QTAILQ_REMOVE(head, re, entry);
g_free(re);
return;
}
}
}
+void qemu_unregister_reset(QEMUResetHandler *func, void *opaque)
+{
+ qemu_unregister_reset_one(func, opaque, 0);
+}
+
void qemu_devices_reset(ShutdownCause reason)
{
QEMUResetEntry *re, *nre;
+ QEMUResetList *head;
+ int stage;
/* reset all devices */
- QTAILQ_FOREACH_SAFE(re, &reset_handlers, entry, nre) {
- if (reason == SHUTDOWN_CAUSE_SNAPSHOT_LOAD &&
- re->skip_on_snapshot_load) {
- continue;
+ for (stage = 0; stage < QEMU_RESET_STAGES_N; stage++) {
+ head = &reset_handlers[stage];
+ QTAILQ_FOREACH_SAFE(re, head, entry, nre) {
+ if (reason == SHUTDOWN_CAUSE_SNAPSHOT_LOAD &&
+ re->skip_on_snapshot_load) {
+ continue;
+ }
+ re->func(re->opaque);
}
- re->func(re->opaque);
}
}
--
2.43.0
On Wed, 17 Jan 2024 at 09:16, <peterx@redhat.com> wrote: > > From: Peter Xu <peterx@redhat.com> > > QEMU resets do not have a way to order reset hooks. Add one coarse grained > reset stage so that some devices can be reset later than some others. > > Signed-off-by: Peter Xu <peterx@redhat.com> > --- > include/sysemu/reset.h | 5 ++++ > hw/core/reset.c | 60 +++++++++++++++++++++++++++++++----------- > 2 files changed, 49 insertions(+), 16 deletions(-) > > diff --git a/include/sysemu/reset.h b/include/sysemu/reset.h > index 609e4d50c2..0de697ce9f 100644 > --- a/include/sysemu/reset.h > +++ b/include/sysemu/reset.h > @@ -5,9 +5,14 @@ > > typedef void QEMUResetHandler(void *opaque); > > +#define QEMU_RESET_STAGES_N 2 > + Our reset handling APIs are already pretty complicated, and raw qemu_register_reset() is kind of a legacy API that I would prefer that we try to move away from, not add extra complexity to. Our device reset design already has a multiple-phase system (see docs/devel/reset.rst), part of the point of which is to try to give us a way to deal with reset-ordering problems. I feel like the right way to handle the issue you're trying to address is to ensure that the thing that needs to happen last is done in the 'exit' phase rather than the 'hold' phase (which is where legacy reset methods get called). There are some annoying wrinkles here, notably that legacy qemu_register_reset() doesn't support 3-phase reset and so the phasing only happens for devices reset via the device/bus tree hierarchy. But I think the way to go is to try to move forward with that design (i.e. expand 3-phase reset to qemu_register_reset() and/or move things using qemu_register_reset() to device reset where that makes sense), not to ignore it and put a completely different reset-ordering API in a different place. thanks -- PMM
On 17/1/24 18:46, Peter Maydell wrote: > On Wed, 17 Jan 2024 at 09:16, <peterx@redhat.com> wrote: >> >> From: Peter Xu <peterx@redhat.com> >> >> QEMU resets do not have a way to order reset hooks. Add one coarse grained >> reset stage so that some devices can be reset later than some others. >> >> Signed-off-by: Peter Xu <peterx@redhat.com> >> --- >> include/sysemu/reset.h | 5 ++++ >> hw/core/reset.c | 60 +++++++++++++++++++++++++++++++----------- >> 2 files changed, 49 insertions(+), 16 deletions(-) >> >> diff --git a/include/sysemu/reset.h b/include/sysemu/reset.h >> index 609e4d50c2..0de697ce9f 100644 >> --- a/include/sysemu/reset.h >> +++ b/include/sysemu/reset.h >> @@ -5,9 +5,14 @@ >> >> typedef void QEMUResetHandler(void *opaque); >> >> +#define QEMU_RESET_STAGES_N 2 >> + > > Our reset handling APIs are already pretty complicated, and > raw qemu_register_reset() is kind of a legacy API that I would > prefer that we try to move away from, not add extra complexity to. > > Our device reset design already has a multiple-phase system > (see docs/devel/reset.rst), part of the point of which is to > try to give us a way to deal with reset-ordering problems. > I feel like the right way to handle the issue you're trying to > address is to ensure that the thing that needs to happen last is > done in the 'exit' phase rather than the 'hold' phase (which is > where legacy reset methods get called). I concur. Devices reset is hard, but bus reset is even harder. Having a quick look, the issues tracked by Alex & Peter might come from the PCI bridges using the legacy DeviceReset. In particular functions like: - hw/pci-bridge/pcie_root_port.c 46 static void rp_reset_hold(Object *obj) 47 { 48 PCIDevice *d = PCI_DEVICE(obj); 49 DeviceState *qdev = DEVICE(obj); 50 51 rp_aer_vector_update(d); 52 pcie_cap_root_reset(d); 53 pcie_cap_deverr_reset(d); 54 pcie_cap_slot_reset(d); 55 pcie_cap_arifwd_reset(d); 56 pcie_acs_reset(d); 57 pcie_aer_root_reset(d); 58 pci_bridge_reset(qdev); 59 pci_bridge_disable_base_limit(d); 60 } - hw/pci-bridge/pcie_pci_bridge.c 107 static void pcie_pci_bridge_reset(DeviceState *qdev) 108 { 109 PCIDevice *d = PCI_DEVICE(qdev); 110 pci_bridge_reset(qdev); 111 if (msi_present(d)) { 112 msi_reset(d); 113 } 114 shpc_reset(d); 115 } - hw/pci-bridge/xio3130_downstream.c 56 static void xio3130_downstream_reset(DeviceState *qdev) 57 { 58 PCIDevice *d = PCI_DEVICE(qdev); 59 60 pcie_cap_deverr_reset(d); 61 pcie_cap_slot_reset(d); 62 pcie_cap_arifwd_reset(d); 63 pci_bridge_reset(qdev); 64 } should really be split and converted as ResettablePhases. pci_bridge_reset() is likely a ResettableExitPhase one. > There are some annoying wrinkles here, notably that legacy > qemu_register_reset() doesn't support 3-phase reset and so > the phasing only happens for devices reset via the device/bus > tree hierarchy. But I think the way to go is to try to move > forward with that design (i.e. expand 3-phase reset to > qemu_register_reset() and/or move things using qemu_register_reset() > to device reset where that makes sense), not to ignore it and > put a completely different reset-ordering API in a different place. Unfortunately despite DeviceReset being deprecated since 4 years in commit c11256aa6f ("hw/core: add Resettable support to BusClass and DeviceClass"), we keep adding code using this legacy API; for example in the last 4 months: - e029bb00a7 ("hw/pci-host: Add Astro system bus adapter found on PA-RISC machines") - 2880e676c0 ("Add virtio-sound device stub") - 4a58330343 ("hw/cxl: Add a switch mailbox CCI function") - 95e1019a4a ("vhost-user-scsi: free the inflight area when reset") - 6f9c3aaa34 ("fsl-imx: add simple RTC emulation for i.MX6 and i.MX7 boards") Regards, Phil.
On Thu, 18 Jan 2024 at 15:53, Philippe Mathieu-Daudé <philmd@linaro.org> wrote: > Unfortunately despite DeviceReset being deprecated since 4 years > in commit c11256aa6f ("hw/core: add Resettable support to BusClass > and DeviceClass"), we keep adding code using this legacy API; for > example in the last 4 months: > > - e029bb00a7 ("hw/pci-host: Add Astro system bus adapter found on > PA-RISC machines") > - 2880e676c0 ("Add virtio-sound device stub") > - 4a58330343 ("hw/cxl: Add a switch mailbox CCI function") > - 95e1019a4a ("vhost-user-scsi: free the inflight area when reset") > - 6f9c3aaa34 ("fsl-imx: add simple RTC emulation for i.MX6 and i.MX7 > boards") For plain old leaf device reset methods, I'm not too fussed about this, because a reset method is exactly equivalent to a single 'hold' phase method, and the relatively small amount of code to convert from one to the other isn't introducing a lot of extra complication. The cleanup I really want to get back to is the bus reset handling, because we only have a few buses that don't use 3-phase reset, and if we convert those then we can get rid of the handling of transitional reset from BusClass completely. Unfortunately I ran into a regression somewhere in the PCI reset handling in the patchsets I was working on which I never got back to to track down the problem. thanks -- PMM
Hi Peter, On 1/17/24 10:15, peterx@redhat.com wrote: > From: Peter Xu <peterx@redhat.com> > > QEMU resets do not have a way to order reset hooks. Add one coarse grained > reset stage so that some devices can be reset later than some others. I would precise that the lowest stage has the highest priority and is handled first. > > Signed-off-by: Peter Xu <peterx@redhat.com> > --- > include/sysemu/reset.h | 5 ++++ > hw/core/reset.c | 60 +++++++++++++++++++++++++++++++----------- > 2 files changed, 49 insertions(+), 16 deletions(-) > > diff --git a/include/sysemu/reset.h b/include/sysemu/reset.h > index 609e4d50c2..0de697ce9f 100644 > --- a/include/sysemu/reset.h > +++ b/include/sysemu/reset.h > @@ -5,9 +5,14 @@ > > typedef void QEMUResetHandler(void *opaque); > > +#define QEMU_RESET_STAGES_N 2 > + > void qemu_register_reset(QEMUResetHandler *func, void *opaque); > +void qemu_register_reset_one(QEMUResetHandler *func, void *opaque, > + bool skip_snap, int stage); > void qemu_register_reset_nosnapshotload(QEMUResetHandler *func, void *opaque); > void qemu_unregister_reset(QEMUResetHandler *func, void *opaque); > +void qemu_unregister_reset_one(QEMUResetHandler *func, void *opaque, int stage); > void qemu_devices_reset(ShutdownCause reason); > > #endif > diff --git a/hw/core/reset.c b/hw/core/reset.c > index 8cf60b2b09..a84c9bee84 100644 > --- a/hw/core/reset.c > +++ b/hw/core/reset.c > @@ -36,55 +36,83 @@ typedef struct QEMUResetEntry { > bool skip_on_snapshot_load; > } QEMUResetEntry; > > -static QTAILQ_HEAD(, QEMUResetEntry) reset_handlers = > - QTAILQ_HEAD_INITIALIZER(reset_handlers); > +typedef QTAILQ_HEAD(QEMUResetList, QEMUResetEntry) QEMUResetList; > +static QEMUResetList reset_handlers[QEMU_RESET_STAGES_N]; > > -static void qemu_register_reset_one(QEMUResetHandler *func, void *opaque, > - bool skip_snap) > +static void __attribute__((__constructor__)) qemu_reset_handlers_init(void) > +{ > + QEMUResetList *head; > + int i = 0; nit: you may put the declarations within the block > + > + for (i = 0; i < QEMU_RESET_STAGES_N; i++) { > + head = &reset_handlers[i]; > + QTAILQ_INIT(head); > + } > +} > + > +void qemu_register_reset_one(QEMUResetHandler *func, void *opaque, > + bool skip_snap, int stage) > { > QEMUResetEntry *re = g_new0(QEMUResetEntry, 1); > + QEMUResetList *head; > + > + assert(stage >= 0 && stage < QEMU_RESET_STAGES_N); > + head = &reset_handlers[stage]; > > re->func = func; > re->opaque = opaque; > re->skip_on_snapshot_load = skip_snap; > - QTAILQ_INSERT_TAIL(&reset_handlers, re, entry); > + QTAILQ_INSERT_TAIL(head, re, entry); > } > > void qemu_register_reset(QEMUResetHandler *func, void *opaque) > { > - /* By default, do not skip during load of a snapshot */ Shouldn't the above comment stay since the statement is not affected by this patch? Or remove it in previous patch? > - qemu_register_reset_one(func, opaque, false); > + qemu_register_reset_one(func, opaque, false, 0); > } > > void qemu_register_reset_nosnapshotload(QEMUResetHandler *func, void *opaque) > { > - qemu_register_reset_one(func, opaque, true); > + qemu_register_reset_one(func, opaque, true, 0); > } > > -void qemu_unregister_reset(QEMUResetHandler *func, void *opaque) > +void qemu_unregister_reset_one(QEMUResetHandler *func, void *opaque, int stage) > { > + QEMUResetList *head; > QEMUResetEntry *re; > > - QTAILQ_FOREACH(re, &reset_handlers, entry) { > + assert(stage >= 0 && stage < QEMU_RESET_STAGES_N); > + head = &reset_handlers[stage]; > + > + QTAILQ_FOREACH(re, head, entry) { > if (re->func == func && re->opaque == opaque) { > - QTAILQ_REMOVE(&reset_handlers, re, entry); > + QTAILQ_REMOVE(head, re, entry); > g_free(re); > return; > } > } > } > > +void qemu_unregister_reset(QEMUResetHandler *func, void *opaque) > +{ > + qemu_unregister_reset_one(func, opaque, 0); > +} > + > void qemu_devices_reset(ShutdownCause reason) > { > QEMUResetEntry *re, *nre; > + QEMUResetList *head; > + int stage; > > /* reset all devices */ > - QTAILQ_FOREACH_SAFE(re, &reset_handlers, entry, nre) { > - if (reason == SHUTDOWN_CAUSE_SNAPSHOT_LOAD && > - re->skip_on_snapshot_load) { > - continue; > + for (stage = 0; stage < QEMU_RESET_STAGES_N; stage++) { > + head = &reset_handlers[stage]; > + QTAILQ_FOREACH_SAFE(re, head, entry, nre) { > + if (reason == SHUTDOWN_CAUSE_SNAPSHOT_LOAD && > + re->skip_on_snapshot_load) { > + continue; > + } > + re->func(re->opaque); > } > - re->func(re->opaque); > } > } > Thanks Eric
On 1/17/24 11:28, Eric Auger wrote: > Hi Peter, > On 1/17/24 10:15, peterx@redhat.com wrote: >> From: Peter Xu <peterx@redhat.com> >> >> QEMU resets do not have a way to order reset hooks. Add one coarse grained >> reset stage so that some devices can be reset later than some others. > I would precise that the lowest stage has the highest priority and is > handled first. yes. May be add an enum like we have for migration : typedef enum { MIG_PRI_DEFAULT = 0, MIG_PRI_IOMMU, /* Must happen before PCI devices */ MIG_PRI_PCI_BUS, /* Must happen before IOMMU */ MIG_PRI_VIRTIO_MEM, /* Must happen before IOMMU */ MIG_PRI_GICV3_ITS, /* Must happen before PCI devices */ MIG_PRI_GICV3, /* Must happen before the ITS */ MIG_PRI_MAX, } MigrationPriority; I think it would help understand the reset ordering and maintenance when grepping qemu_register_reset_one(). Thanks, C. >> >> Signed-off-by: Peter Xu <peterx@redhat.com> >> --- >> include/sysemu/reset.h | 5 ++++ >> hw/core/reset.c | 60 +++++++++++++++++++++++++++++++----------- >> 2 files changed, 49 insertions(+), 16 deletions(-) >> >> diff --git a/include/sysemu/reset.h b/include/sysemu/reset.h >> index 609e4d50c2..0de697ce9f 100644 >> --- a/include/sysemu/reset.h >> +++ b/include/sysemu/reset.h >> @@ -5,9 +5,14 @@ >> >> typedef void QEMUResetHandler(void *opaque); >> >> +#define QEMU_RESET_STAGES_N 2 >> + >> void qemu_register_reset(QEMUResetHandler *func, void *opaque); >> +void qemu_register_reset_one(QEMUResetHandler *func, void *opaque, >> + bool skip_snap, int stage); >> void qemu_register_reset_nosnapshotload(QEMUResetHandler *func, void *opaque); >> void qemu_unregister_reset(QEMUResetHandler *func, void *opaque); >> +void qemu_unregister_reset_one(QEMUResetHandler *func, void *opaque, int stage); >> void qemu_devices_reset(ShutdownCause reason); >> >> #endif >> diff --git a/hw/core/reset.c b/hw/core/reset.c >> index 8cf60b2b09..a84c9bee84 100644 >> --- a/hw/core/reset.c >> +++ b/hw/core/reset.c >> @@ -36,55 +36,83 @@ typedef struct QEMUResetEntry { >> bool skip_on_snapshot_load; >> } QEMUResetEntry; >> >> -static QTAILQ_HEAD(, QEMUResetEntry) reset_handlers = >> - QTAILQ_HEAD_INITIALIZER(reset_handlers); >> +typedef QTAILQ_HEAD(QEMUResetList, QEMUResetEntry) QEMUResetList; >> +static QEMUResetList reset_handlers[QEMU_RESET_STAGES_N]; >> >> -static void qemu_register_reset_one(QEMUResetHandler *func, void *opaque, >> - bool skip_snap) >> +static void __attribute__((__constructor__)) qemu_reset_handlers_init(void) >> +{ >> + QEMUResetList *head; >> + int i = 0; > nit: you may put the declarations within the block >> + >> + for (i = 0; i < QEMU_RESET_STAGES_N; i++) { >> + head = &reset_handlers[i]; >> + QTAILQ_INIT(head); >> + } >> +} >> + >> +void qemu_register_reset_one(QEMUResetHandler *func, void *opaque, >> + bool skip_snap, int stage) >> { >> QEMUResetEntry *re = g_new0(QEMUResetEntry, 1); >> + QEMUResetList *head; >> + >> + assert(stage >= 0 && stage < QEMU_RESET_STAGES_N); >> + head = &reset_handlers[stage]; >> >> re->func = func; >> re->opaque = opaque; >> re->skip_on_snapshot_load = skip_snap; >> - QTAILQ_INSERT_TAIL(&reset_handlers, re, entry); >> + QTAILQ_INSERT_TAIL(head, re, entry); >> } >> >> void qemu_register_reset(QEMUResetHandler *func, void *opaque) >> { >> - /* By default, do not skip during load of a snapshot */ > Shouldn't the above comment stay since the statement is not affected by > this patch? Or remove it in previous patch? >> - qemu_register_reset_one(func, opaque, false); >> + qemu_register_reset_one(func, opaque, false, 0); >> } >> >> void qemu_register_reset_nosnapshotload(QEMUResetHandler *func, void *opaque) >> { >> - qemu_register_reset_one(func, opaque, true); >> + qemu_register_reset_one(func, opaque, true, 0); >> } >> >> -void qemu_unregister_reset(QEMUResetHandler *func, void *opaque) >> +void qemu_unregister_reset_one(QEMUResetHandler *func, void *opaque, int stage) >> { >> + QEMUResetList *head; >> QEMUResetEntry *re; >> >> - QTAILQ_FOREACH(re, &reset_handlers, entry) { >> + assert(stage >= 0 && stage < QEMU_RESET_STAGES_N); >> + head = &reset_handlers[stage]; >> + >> + QTAILQ_FOREACH(re, head, entry) { >> if (re->func == func && re->opaque == opaque) { >> - QTAILQ_REMOVE(&reset_handlers, re, entry); >> + QTAILQ_REMOVE(head, re, entry); >> g_free(re); >> return; >> } >> } >> } >> >> +void qemu_unregister_reset(QEMUResetHandler *func, void *opaque) >> +{ >> + qemu_unregister_reset_one(func, opaque, 0); >> +} >> + >> void qemu_devices_reset(ShutdownCause reason) >> { >> QEMUResetEntry *re, *nre; >> + QEMUResetList *head; >> + int stage; >> >> /* reset all devices */ >> - QTAILQ_FOREACH_SAFE(re, &reset_handlers, entry, nre) { >> - if (reason == SHUTDOWN_CAUSE_SNAPSHOT_LOAD && >> - re->skip_on_snapshot_load) { >> - continue; >> + for (stage = 0; stage < QEMU_RESET_STAGES_N; stage++) { >> + head = &reset_handlers[stage]; >> + QTAILQ_FOREACH_SAFE(re, head, entry, nre) { >> + if (reason == SHUTDOWN_CAUSE_SNAPSHOT_LOAD && >> + re->skip_on_snapshot_load) { >> + continue; >> + } >> + re->func(re->opaque); >> } >> - re->func(re->opaque); >> } >> } >> > Thanks > > Eric > >
© 2016 - 2024 Red Hat, Inc.