It is not safe to call drain_call_rcu() from qmp_device_add() because
some call stacks are not prepared for drain_call_rcu() to drop the Big
QEMU Lock (BQL).
For example, device emulation code is protected by the BQL but when it
calls aio_poll() -> ... -> qmp_device_add() -> drain_call_rcu() then the
BQL is dropped. See bz#2215192 below for a concrete bug of this type.
Another limitation of drain_call_rcu() is that it cannot be invoked
within an RCU read-side critical section since the reclamation phase
cannot complete until the end of the critical section. Unfortunately,
call stacks have been seen where this happens (see bz#2214985 below).
Switch to call_drain_rcu_co() to avoid these problems. This requires
making qmp_device_add() a coroutine. qdev_device_add() is not designed
to be called from coroutines, so it must be invoked from a BH and then
switch back to the coroutine.
Fixes: 7bed89958bfbf40df9ca681cefbdca63abdde39d ("device_core: use drain_call_rcu in in qmp_device_add")
Buglink: https://bugzilla.redhat.com/show_bug.cgi?id=2215192
Buglink: https://bugzilla.redhat.com/show_bug.cgi?id=2214985
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
qapi/qdev.json | 1 +
include/monitor/qdev.h | 3 ++-
monitor/qmp-cmds.c | 2 +-
softmmu/qdev-monitor.c | 34 ++++++++++++++++++++++++++++++----
hmp-commands.hx | 1 +
5 files changed, 35 insertions(+), 6 deletions(-)
diff --git a/qapi/qdev.json b/qapi/qdev.json
index 6bc5a733b8..78e9d7f7b8 100644
--- a/qapi/qdev.json
+++ b/qapi/qdev.json
@@ -79,6 +79,7 @@
##
{ 'command': 'device_add',
'data': {'driver': 'str', '*bus': 'str', '*id': 'str'},
+ 'coroutine': true,
'gen': false, # so we can get the additional arguments
'features': ['json-cli', 'json-cli-hotplug'] }
diff --git a/include/monitor/qdev.h b/include/monitor/qdev.h
index 1d57bf6577..1fed9eb9ea 100644
--- a/include/monitor/qdev.h
+++ b/include/monitor/qdev.h
@@ -5,7 +5,8 @@
void hmp_info_qtree(Monitor *mon, const QDict *qdict);
void hmp_info_qdm(Monitor *mon, const QDict *qdict);
-void qmp_device_add(QDict *qdict, QObject **ret_data, Error **errp);
+void coroutine_fn
+qmp_device_add(QDict *qdict, QObject **ret_data, Error **errp);
int qdev_device_help(QemuOpts *opts);
DeviceState *qdev_device_add(QemuOpts *opts, Error **errp);
diff --git a/monitor/qmp-cmds.c b/monitor/qmp-cmds.c
index b0f948d337..a7419226fe 100644
--- a/monitor/qmp-cmds.c
+++ b/monitor/qmp-cmds.c
@@ -202,7 +202,7 @@ static void __attribute__((__constructor__)) monitor_init_qmp_commands(void)
qmp_init_marshal(&qmp_commands);
qmp_register_command(&qmp_commands, "device_add",
- qmp_device_add, 0, 0);
+ qmp_device_add, QCO_COROUTINE, 0);
QTAILQ_INIT(&qmp_cap_negotiation_commands);
qmp_register_command(&qmp_cap_negotiation_commands, "qmp_capabilities",
diff --git a/softmmu/qdev-monitor.c b/softmmu/qdev-monitor.c
index 74f4e41338..85ae62f7cf 100644
--- a/softmmu/qdev-monitor.c
+++ b/softmmu/qdev-monitor.c
@@ -839,8 +839,28 @@ void hmp_info_qdm(Monitor *mon, const QDict *qdict)
qdev_print_devinfos(true);
}
-void qmp_device_add(QDict *qdict, QObject **ret_data, Error **errp)
+typedef struct {
+ Coroutine *co;
+ QemuOpts *opts;
+ Error **errp;
+ DeviceState *dev;
+} QmpDeviceAdd;
+
+static void qmp_device_add_bh(void *opaque)
{
+ QmpDeviceAdd *data = opaque;
+
+ data->dev = qdev_device_add(data->opts, data->errp);
+ aio_co_wake(data->co);
+}
+
+void coroutine_fn
+qmp_device_add(QDict *qdict, QObject **ret_data, Error **errp)
+{
+ QmpDeviceAdd data = {
+ .co = qemu_coroutine_self(),
+ .errp = errp,
+ };
QemuOpts *opts;
DeviceState *dev;
@@ -852,7 +872,13 @@ void qmp_device_add(QDict *qdict, QObject **ret_data, Error **errp)
qemu_opts_del(opts);
return;
}
- dev = qdev_device_add(opts, errp);
+
+ /* Perform qdev_device_add() call outside coroutine context */
+ data.opts = opts;
+ aio_bh_schedule_oneshot(qemu_coroutine_get_aio_context(data.co),
+ qmp_device_add_bh, &data);
+ qemu_coroutine_yield();
+ dev = data.dev;
/*
* Drain all pending RCU callbacks. This is done because
@@ -863,7 +889,7 @@ void qmp_device_add(QDict *qdict, QObject **ret_data, Error **errp)
* will finish its job completely once qmp command returns result
* to the user
*/
- drain_call_rcu();
+ drain_call_rcu_co();
if (!dev) {
qemu_opts_del(opts);
@@ -956,7 +982,7 @@ void qmp_device_del(const char *id, Error **errp)
}
}
-void hmp_device_add(Monitor *mon, const QDict *qdict)
+void coroutine_fn hmp_device_add(Monitor *mon, const QDict *qdict)
{
Error *err = NULL;
diff --git a/hmp-commands.hx b/hmp-commands.hx
index 2cbd0f77a0..c737d1fd64 100644
--- a/hmp-commands.hx
+++ b/hmp-commands.hx
@@ -695,6 +695,7 @@ ERST
.params = "driver[,prop=value][,...]",
.help = "add device, like -device on the command line",
.cmd = hmp_device_add,
+ .coroutine = true,
.command_completion = device_add_completion,
},
--
2.41.0
Am 06.09.2023 um 21:01 hat Stefan Hajnoczi geschrieben: > It is not safe to call drain_call_rcu() from qmp_device_add() because > some call stacks are not prepared for drain_call_rcu() to drop the Big > QEMU Lock (BQL). > > For example, device emulation code is protected by the BQL but when it > calls aio_poll() -> ... -> qmp_device_add() -> drain_call_rcu() then the > BQL is dropped. See bz#2215192 below for a concrete bug of this type. > > Another limitation of drain_call_rcu() is that it cannot be invoked > within an RCU read-side critical section since the reclamation phase > cannot complete until the end of the critical section. Unfortunately, > call stacks have been seen where this happens (see bz#2214985 below). > > Switch to call_drain_rcu_co() to avoid these problems. This requires > making qmp_device_add() a coroutine. qdev_device_add() is not designed > to be called from coroutines, so it must be invoked from a BH and then > switch back to the coroutine. > > Fixes: 7bed89958bfbf40df9ca681cefbdca63abdde39d ("device_core: use drain_call_rcu in in qmp_device_add") > Buglink: https://bugzilla.redhat.com/show_bug.cgi?id=2215192 > Buglink: https://bugzilla.redhat.com/show_bug.cgi?id=2214985 > Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com> Can you please include the relevant information directly in the commit message instead of only referencing Bugzilla? Both bugs only contain half of the story - I'm not even sure if the link with the stack trace is publically accessible - and then I think you got some information only from reproducing it yourself, and this information is missing from the bug reports. (The other question is how long the information will still be available in Bugzilla.) > qapi/qdev.json | 1 + > include/monitor/qdev.h | 3 ++- > monitor/qmp-cmds.c | 2 +- > softmmu/qdev-monitor.c | 34 ++++++++++++++++++++++++++++++---- > hmp-commands.hx | 1 + > 5 files changed, 35 insertions(+), 6 deletions(-) > > diff --git a/qapi/qdev.json b/qapi/qdev.json > index 6bc5a733b8..78e9d7f7b8 100644 > --- a/qapi/qdev.json > +++ b/qapi/qdev.json > @@ -79,6 +79,7 @@ > ## > { 'command': 'device_add', > 'data': {'driver': 'str', '*bus': 'str', '*id': 'str'}, > + 'coroutine': true, > 'gen': false, # so we can get the additional arguments > 'features': ['json-cli', 'json-cli-hotplug'] } > > diff --git a/include/monitor/qdev.h b/include/monitor/qdev.h > index 1d57bf6577..1fed9eb9ea 100644 > --- a/include/monitor/qdev.h > +++ b/include/monitor/qdev.h > @@ -5,7 +5,8 @@ > > void hmp_info_qtree(Monitor *mon, const QDict *qdict); > void hmp_info_qdm(Monitor *mon, const QDict *qdict); > -void qmp_device_add(QDict *qdict, QObject **ret_data, Error **errp); > +void coroutine_fn > +qmp_device_add(QDict *qdict, QObject **ret_data, Error **errp); > > int qdev_device_help(QemuOpts *opts); > DeviceState *qdev_device_add(QemuOpts *opts, Error **errp); > diff --git a/monitor/qmp-cmds.c b/monitor/qmp-cmds.c > index b0f948d337..a7419226fe 100644 > --- a/monitor/qmp-cmds.c > +++ b/monitor/qmp-cmds.c > @@ -202,7 +202,7 @@ static void __attribute__((__constructor__)) monitor_init_qmp_commands(void) > qmp_init_marshal(&qmp_commands); > > qmp_register_command(&qmp_commands, "device_add", > - qmp_device_add, 0, 0); > + qmp_device_add, QCO_COROUTINE, 0); > > QTAILQ_INIT(&qmp_cap_negotiation_commands); > qmp_register_command(&qmp_cap_negotiation_commands, "qmp_capabilities", > diff --git a/softmmu/qdev-monitor.c b/softmmu/qdev-monitor.c > index 74f4e41338..85ae62f7cf 100644 > --- a/softmmu/qdev-monitor.c > +++ b/softmmu/qdev-monitor.c > @@ -839,8 +839,28 @@ void hmp_info_qdm(Monitor *mon, const QDict *qdict) > qdev_print_devinfos(true); > } > > -void qmp_device_add(QDict *qdict, QObject **ret_data, Error **errp) > +typedef struct { > + Coroutine *co; > + QemuOpts *opts; > + Error **errp; > + DeviceState *dev; > +} QmpDeviceAdd; > + > +static void qmp_device_add_bh(void *opaque) > { > + QmpDeviceAdd *data = opaque; > + > + data->dev = qdev_device_add(data->opts, data->errp); > + aio_co_wake(data->co); > +} > + > +void coroutine_fn > +qmp_device_add(QDict *qdict, QObject **ret_data, Error **errp) > +{ > + QmpDeviceAdd data = { > + .co = qemu_coroutine_self(), > + .errp = errp, > + }; > QemuOpts *opts; > DeviceState *dev; > > @@ -852,7 +872,13 @@ void qmp_device_add(QDict *qdict, QObject **ret_data, Error **errp) > qemu_opts_del(opts); > return; > } > - dev = qdev_device_add(opts, errp); > + > + /* Perform qdev_device_add() call outside coroutine context */ > + data.opts = opts; > + aio_bh_schedule_oneshot(qemu_coroutine_get_aio_context(data.co), > + qmp_device_add_bh, &data); > + qemu_coroutine_yield(); > + dev = data.dev; I wonder if we should make no_co_wrapper etc. available outside of the block layer, so we could just declare a qdev_co_device_add() and call it here and the code would automatically be generated. This doesn't work today because the script generates only a single source file for all wrappers, which is linked with all of the tools. So putting qdev functions there would make the build fail. I had a similar case in the virtio_load() fix where I decided to write the wrapper manually instead. But having two cases in such a short time frame might be a sign that we actually have enough potential users that making the generator work for it would be worth it. Kevin
On Tue, 12 Sept 2023 at 12:47, Kevin Wolf <kwolf@redhat.com> wrote: > > Am 06.09.2023 um 21:01 hat Stefan Hajnoczi geschrieben: > > It is not safe to call drain_call_rcu() from qmp_device_add() because > > some call stacks are not prepared for drain_call_rcu() to drop the Big > > QEMU Lock (BQL). > > > > For example, device emulation code is protected by the BQL but when it > > calls aio_poll() -> ... -> qmp_device_add() -> drain_call_rcu() then the > > BQL is dropped. See bz#2215192 below for a concrete bug of this type. > > > > Another limitation of drain_call_rcu() is that it cannot be invoked > > within an RCU read-side critical section since the reclamation phase > > cannot complete until the end of the critical section. Unfortunately, > > call stacks have been seen where this happens (see bz#2214985 below). > > > > Switch to call_drain_rcu_co() to avoid these problems. This requires > > making qmp_device_add() a coroutine. qdev_device_add() is not designed > > to be called from coroutines, so it must be invoked from a BH and then > > switch back to the coroutine. > > > > Fixes: 7bed89958bfbf40df9ca681cefbdca63abdde39d ("device_core: use drain_call_rcu in in qmp_device_add") > > Buglink: https://bugzilla.redhat.com/show_bug.cgi?id=2215192 > > Buglink: https://bugzilla.redhat.com/show_bug.cgi?id=2214985 > > Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com> > > Can you please include the relevant information directly in the commit > message instead of only referencing Bugzilla? Both bugs only contain > half of the story - I'm not even sure if the link with the stack trace > is publically accessible - and then I think you got some information > only from reproducing it yourself, and this information is missing from > the bug reports. (The other question is how long the information will > still be available in Bugzilla.) Yes, I'll include the details in the commit description. > > > qapi/qdev.json | 1 + > > include/monitor/qdev.h | 3 ++- > > monitor/qmp-cmds.c | 2 +- > > softmmu/qdev-monitor.c | 34 ++++++++++++++++++++++++++++++---- > > hmp-commands.hx | 1 + > > 5 files changed, 35 insertions(+), 6 deletions(-) > > > > diff --git a/qapi/qdev.json b/qapi/qdev.json > > index 6bc5a733b8..78e9d7f7b8 100644 > > --- a/qapi/qdev.json > > +++ b/qapi/qdev.json > > @@ -79,6 +79,7 @@ > > ## > > { 'command': 'device_add', > > 'data': {'driver': 'str', '*bus': 'str', '*id': 'str'}, > > + 'coroutine': true, > > 'gen': false, # so we can get the additional arguments > > 'features': ['json-cli', 'json-cli-hotplug'] } > > > > diff --git a/include/monitor/qdev.h b/include/monitor/qdev.h > > index 1d57bf6577..1fed9eb9ea 100644 > > --- a/include/monitor/qdev.h > > +++ b/include/monitor/qdev.h > > @@ -5,7 +5,8 @@ > > > > void hmp_info_qtree(Monitor *mon, const QDict *qdict); > > void hmp_info_qdm(Monitor *mon, const QDict *qdict); > > -void qmp_device_add(QDict *qdict, QObject **ret_data, Error **errp); > > +void coroutine_fn > > +qmp_device_add(QDict *qdict, QObject **ret_data, Error **errp); > > > > int qdev_device_help(QemuOpts *opts); > > DeviceState *qdev_device_add(QemuOpts *opts, Error **errp); > > diff --git a/monitor/qmp-cmds.c b/monitor/qmp-cmds.c > > index b0f948d337..a7419226fe 100644 > > --- a/monitor/qmp-cmds.c > > +++ b/monitor/qmp-cmds.c > > @@ -202,7 +202,7 @@ static void __attribute__((__constructor__)) monitor_init_qmp_commands(void) > > qmp_init_marshal(&qmp_commands); > > > > qmp_register_command(&qmp_commands, "device_add", > > - qmp_device_add, 0, 0); > > + qmp_device_add, QCO_COROUTINE, 0); > > > > QTAILQ_INIT(&qmp_cap_negotiation_commands); > > qmp_register_command(&qmp_cap_negotiation_commands, "qmp_capabilities", > > diff --git a/softmmu/qdev-monitor.c b/softmmu/qdev-monitor.c > > index 74f4e41338..85ae62f7cf 100644 > > --- a/softmmu/qdev-monitor.c > > +++ b/softmmu/qdev-monitor.c > > @@ -839,8 +839,28 @@ void hmp_info_qdm(Monitor *mon, const QDict *qdict) > > qdev_print_devinfos(true); > > } > > > > -void qmp_device_add(QDict *qdict, QObject **ret_data, Error **errp) > > +typedef struct { > > + Coroutine *co; > > + QemuOpts *opts; > > + Error **errp; > > + DeviceState *dev; > > +} QmpDeviceAdd; > > + > > +static void qmp_device_add_bh(void *opaque) > > { > > + QmpDeviceAdd *data = opaque; > > + > > + data->dev = qdev_device_add(data->opts, data->errp); > > + aio_co_wake(data->co); > > +} > > + > > +void coroutine_fn > > +qmp_device_add(QDict *qdict, QObject **ret_data, Error **errp) > > +{ > > + QmpDeviceAdd data = { > > + .co = qemu_coroutine_self(), > > + .errp = errp, > > + }; > > QemuOpts *opts; > > DeviceState *dev; > > > > @@ -852,7 +872,13 @@ void qmp_device_add(QDict *qdict, QObject **ret_data, Error **errp) > > qemu_opts_del(opts); > > return; > > } > > - dev = qdev_device_add(opts, errp); > > + > > + /* Perform qdev_device_add() call outside coroutine context */ > > + data.opts = opts; > > + aio_bh_schedule_oneshot(qemu_coroutine_get_aio_context(data.co), > > + qmp_device_add_bh, &data); > > + qemu_coroutine_yield(); > > + dev = data.dev; > > I wonder if we should make no_co_wrapper etc. available outside of the > block layer, so we could just declare a qdev_co_device_add() and call it > here and the code would automatically be generated. > > This doesn't work today because the script generates only a single > source file for all wrappers, which is linked with all of the tools. So > putting qdev functions there would make the build fail. > > I had a similar case in the virtio_load() fix where I decided to write > the wrapper manually instead. But having two cases in such a short time > frame might be a sign that we actually have enough potential users that > making the generator work for it would be worth it. In principal I'm happy to do that. Before I continue working on the coroutine version of qmp_device_add(), please let us know your thoughts about Paolo's idea. If I understand correctly, Paolo's idea is to refactor the monitor code so that non-coroutine monitor commands run in the iohandler AioContext, thus avoiding the drain_call_rcu() vs nested event loops issue. It would not be necessary to make qmp_device_add() a coroutine anymore since drain_call_rcu() could be called safely. Does that sound okay or are you aware of a case where this doesn't work? Stefan
© 2016 - 2024 Red Hat, Inc.