include/qemu/timer.h | 4 ++++ util/qemu-timer.c | 21 +++++++++++++++++++++ 2 files changed, 25 insertions(+)
`timerlist_run_timers` provides no mechanism to
make sure the data pointed to by `opaque` is valid
when calling timer's callback: there could be
another thread running which is destroying
timer's opaque data.
With this change `timer_del` becomes blocking if
timer's callback is running and it will be safe
to destroy timer's data once `timer_del` returns.
Signed-off-by: Roman Kiryanov <rkir@google.com>
---
v2: rebased to the right branch and removed
Google specific tags from the commit message.
include/qemu/timer.h | 4 ++++
util/qemu-timer.c | 21 +++++++++++++++++++++
2 files changed, 25 insertions(+)
diff --git a/include/qemu/timer.h b/include/qemu/timer.h
index 5ce83c7911..efd0e95d20 100644
--- a/include/qemu/timer.h
+++ b/include/qemu/timer.h
@@ -3,6 +3,7 @@
#include "qemu/bitops.h"
#include "qemu/notify.h"
+#include "qemu/thread.h"
#include "qemu/host-utils.h"
#define NANOSECONDS_PER_SECOND 1000000000LL
@@ -86,9 +87,12 @@ struct QEMUTimer {
QEMUTimerList *timer_list;
QEMUTimerCB *cb;
void *opaque;
+ QemuMutex opaque_lock;
+ QemuCond cb_done;
QEMUTimer *next;
int attributes;
int scale;
+ bool cb_running;
};
extern QEMUTimerListGroup main_loop_tlg;
diff --git a/util/qemu-timer.c b/util/qemu-timer.c
index 213114be68..95af255519 100644
--- a/util/qemu-timer.c
+++ b/util/qemu-timer.c
@@ -369,7 +369,10 @@ void timer_init_full(QEMUTimer *ts,
ts->opaque = opaque;
ts->scale = scale;
ts->attributes = attributes;
+ qemu_mutex_init(&ts->opaque_lock);
+ qemu_cond_init(&ts->cb_done);
ts->expire_time = -1;
+ ts->cb_running = false;
}
void timer_deinit(QEMUTimer *ts)
@@ -394,6 +397,12 @@ static void timer_del_locked(QEMUTimerList *timer_list, QEMUTimer *ts)
}
pt = &t->next;
}
+
+ qemu_mutex_lock(&ts->opaque_lock);
+ while (ts->cb_running) {
+ qemu_cond_wait(&ts->cb_done, &ts->opaque_lock);
+ }
+ qemu_mutex_unlock(&ts->opaque_lock);
}
static bool timer_mod_ns_locked(QEMUTimerList *timer_list,
@@ -571,11 +580,23 @@ bool timerlist_run_timers(QEMUTimerList *timer_list)
cb = ts->cb;
opaque = ts->opaque;
+ /* Mark the callback as running to prevent
+ * destroying `opaque` in another thread.
+ */
+ qemu_mutex_lock(&ts->opaque_lock);
+ ts->cb_running = true;
+ qemu_mutex_unlock(&ts->opaque_lock);
+
/* run the callback (the timer list can be modified) */
qemu_mutex_unlock(&timer_list->active_timers_lock);
cb(opaque);
qemu_mutex_lock(&timer_list->active_timers_lock);
+ qemu_mutex_lock(&ts->opaque_lock);
+ ts->cb_running = false;
+ qemu_cond_broadcast(&ts->cb_done);
+ qemu_mutex_unlock(&ts->opaque_lock);
+
progress = true;
}
qemu_mutex_unlock(&timer_list->active_timers_lock);
--
2.45.2.741.gdbec12cfda-goog
On 6/26/24 23:52, Roman Kiryanov wrote: > `timerlist_run_timers` provides no mechanism to > make sure the data pointed by `opaque` is valid > when calling timer's callback: there could be > another thread running which is destroying > timer's opaque data. > > With this change `timer_del` becomes blocking if > timer's callback is running and it will be safe > to destroy timer's data once `timer_del` returns. > > Signed-off-by: Roman Kiryanov <rkir@google.com> Generally QEMU is running event loop code under the "big QEMU lock" (BQL). If both timer_del() and the callback run under the BQL, the race cannot happen. If you're using multiple threads, however, this code is generally very performance sensitive; and adding a mutex and broadcast on every timer that fires is probably too much. A simpler possibility (and the QemuEvent is already there, even) could be to use qemu_event_wait(&timer_list->timers_done_ev); but I think this situation is not specific to timers, and tied more to the code that creates the timer; therefore it's easier to handle it outside common code. If you only need to synchronize freeing against callbacks, you can use aio_bh_schedule_oneshot() and do the free in the bottom half. If instead you need the cleanup to be synchronous, the same idea of the bottom half can be used, via aio_wait_bh_oneshot(). Paolo > --- > v2: rebased to the right branch and removed > Google specific tags from the commit message. 
> > include/qemu/timer.h | 4 ++++ > util/qemu-timer.c | 21 +++++++++++++++++++++ > 2 files changed, 25 insertions(+) > > diff --git a/include/qemu/timer.h b/include/qemu/timer.h > index 5ce83c7911..efd0e95d20 100644 > --- a/include/qemu/timer.h > +++ b/include/qemu/timer.h > @@ -3,6 +3,7 @@ > > #include "qemu/bitops.h" > #include "qemu/notify.h" > +#include "qemu/thread.h" > #include "qemu/host-utils.h" > > #define NANOSECONDS_PER_SECOND 1000000000LL > @@ -86,9 +87,12 @@ struct QEMUTimer { > QEMUTimerList *timer_list; > QEMUTimerCB *cb; > void *opaque; > + QemuMutex opaque_lock; > + QemuCond cb_done; > QEMUTimer *next; > int attributes; > int scale; > + bool cb_running; > }; > > extern QEMUTimerListGroup main_loop_tlg; > diff --git a/util/qemu-timer.c b/util/qemu-timer.c > index 213114be68..95af255519 100644 > --- a/util/qemu-timer.c > +++ b/util/qemu-timer.c > @@ -369,7 +369,10 @@ void timer_init_full(QEMUTimer *ts, > ts->opaque = opaque; > ts->scale = scale; > ts->attributes = attributes; > + qemu_mutex_init(&ts->opaque_lock); > + qemu_cond_init(&ts->cb_done); > ts->expire_time = -1; > + ts->cb_running = false; > } > > void timer_deinit(QEMUTimer *ts) > @@ -394,6 +397,12 @@ static void timer_del_locked(QEMUTimerList *timer_list, QEMUTimer *ts) > } > pt = &t->next; > } > + > + qemu_mutex_lock(&ts->opaque_lock); > + while (ts->cb_running) { > + qemu_cond_wait(&ts->cb_done, &ts->opaque_lock); > + } > + qemu_mutex_unlock(&ts->opaque_lock); > } > > static bool timer_mod_ns_locked(QEMUTimerList *timer_list, > @@ -571,11 +580,23 @@ bool timerlist_run_timers(QEMUTimerList *timer_list) > cb = ts->cb; > opaque = ts->opaque; > > + /* Mark the callback as running to prevent > + * destroying `opaque` in another thread. 
> + */ > + qemu_mutex_lock(&ts->opaque_lock); > + ts->cb_running = true; > + qemu_mutex_unlock(&ts->opaque_lock); > + > /* run the callback (the timer list can be modified) */ > qemu_mutex_unlock(&timer_list->active_timers_lock); > cb(opaque); > qemu_mutex_lock(&timer_list->active_timers_lock); > > + qemu_mutex_lock(&ts->opaque_lock); > + ts->cb_running = false; > + qemu_cond_broadcast(&ts->cb_done); > + qemu_mutex_unlock(&ts->opaque_lock); > + > progress = true; > } > qemu_mutex_unlock(&timer_list->active_timers_lock);
`timerlist_run_timers` provides no mechanism to
make sure the data pointed to by `opaque` is valid
when calling timer's callback: there could be
another thread running which is destroying
timer's opaque data.
With this change `timer_del` becomes blocking if
timer's callback is running and it will be safe
to destroy timer's data once `timer_del` returns.
Signed-off-by: Roman Kiryanov <rkir@google.com>
---
v2: rebased to the right branch and removed
Google specific tags from the commit message.
v3: if a timer's callback happens to be running
(cb_running) wait until all timers are done.
qatomic_read/qemu_event_reset could be racing
and timer_del might wait one extra loop of
timers to be done.
include/qemu/timer.h | 1 +
util/qemu-timer.c | 11 +++++++++++
2 files changed, 12 insertions(+)
diff --git a/include/qemu/timer.h b/include/qemu/timer.h
index 5ce83c7911..c2c98f79f4 100644
--- a/include/qemu/timer.h
+++ b/include/qemu/timer.h
@@ -89,6 +89,7 @@ struct QEMUTimer {
QEMUTimer *next;
int attributes;
int scale;
+ bool cb_running;
};
extern QEMUTimerListGroup main_loop_tlg;
diff --git a/util/qemu-timer.c b/util/qemu-timer.c
index 213114be68..5ec379dc43 100644
--- a/util/qemu-timer.c
+++ b/util/qemu-timer.c
@@ -370,6 +370,7 @@ void timer_init_full(QEMUTimer *ts,
ts->scale = scale;
ts->attributes = attributes;
ts->expire_time = -1;
+ ts->cb_running = false;
}
void timer_deinit(QEMUTimer *ts)
@@ -435,6 +436,10 @@ void timer_del(QEMUTimer *ts)
qemu_mutex_lock(&timer_list->active_timers_lock);
timer_del_locked(timer_list, ts);
qemu_mutex_unlock(&timer_list->active_timers_lock);
+
+ if (qatomic_read(&ts->cb_running)) {
+ qemu_event_wait(&timer_list->timers_done_ev);
+ }
}
}
@@ -571,9 +576,15 @@ bool timerlist_run_timers(QEMUTimerList *timer_list)
cb = ts->cb;
opaque = ts->opaque;
+ /* prevent timer_del from returning while cb(opaque)
+ * is still running (by waiting for timers_done_ev).
+ */
+ qatomic_set(&ts->cb_running, true);
+
/* run the callback (the timer list can be modified) */
qemu_mutex_unlock(&timer_list->active_timers_lock);
cb(opaque);
+ qatomic_set(&ts->cb_running, false);
qemu_mutex_lock(&timer_list->active_timers_lock);
progress = true;
--
2.45.2.741.gdbec12cfda-goog
On Thu, Jun 27, 2024 at 2:32 AM Roman Kiryanov <rkir@google.com> wrote: > + if (qatomic_read(&ts->cb_running)) { > + qemu_event_wait(&timer_list->timers_done_ev); > + } qemu_event_wait() already has the right atomic magic, and ts->cb_running is both redundant (in general), and I think racy (as implemented in this patch). This stuff is really hard to get right. At the very least you need a store-release when clearing the flag, and I think it also has to be read under the mutex (I'm not sure if there's anything else, I haven't done a full analysis). But especially, you haven't justified in the commit message _why_ you need this. Since this problem is not timer-specific, using aio_bh_schedule_oneshot() or aio_wait_bh_oneshot() to synchronize everything with the AioContext thread seems like a superior solution to me. Paolo > } > } > > @@ -571,9 +576,15 @@ bool timerlist_run_timers(QEMUTimerList *timer_list) > cb = ts->cb; > opaque = ts->opaque; > > + /* prevent timer_del from returning while cb(opaque) > + * is still running (by waiting for timers_done_ev). > + */ > + qatomic_set(&ts->cb_running, true); > + > /* run the callback (the timer list can be modified) */ > qemu_mutex_unlock(&timer_list->active_timers_lock); > cb(opaque); > + qatomic_set(&ts->cb_running, false); > qemu_mutex_lock(&timer_list->active_timers_lock); > > progress = true; > -- > 2.45.2.741.gdbec12cfda-goog >
On Thu, Jun 27, 2024 at 6:27 AM Paolo Bonzini <pbonzini@redhat.com> wrote: > > On Thu, Jun 27, 2024 at 2:32 AM Roman Kiryanov <rkir@google.com> wrote: > > + if (qatomic_read(&ts->cb_running)) { > > + qemu_event_wait(&timer_list->timers_done_ev); > > + } > > qemu_event_wait() already has the right atomic magic, and > ts->cb_running is both redundant (in general), and I think racy (as > implemented in this patch). I added cb_running to avoid waiting for timers_done_ev if we know our cb is done. > But especially, you haven't justified in the commit message _why_ you > need this. I mentioned the problem of cleanup racing with the timer's callback function in the current shape of QEMU. > Since this problem is not timer-specific, It would be nice to fix all the places then. > using > aio_bh_schedule_oneshot() or aio_wait_bh_oneshot() to synchronize > everything with the AioContext thread seems like a superior solution > to me. Could you please elaborate? The problem we want to solve is this: void myThreadFunc() { CallbackState callbackState; QEMUTimer timer; timer_init(&timer, myClockType, myScale, &myTimerCallbackFunc, &callbackState); ... timer_del(&timer); } Currently, myTimerCallbackFunc could fire after myThreadFunc exits (if timer_del runs between qemu_mutex_unlock and cb(opaque) in timerlist_run_timers) and callbackState gets destroyed.
On Thu, Jun 27, 2024 at 6:12 PM Roman Kiryanov <rkir@google.com> wrote: > > On Thu, Jun 27, 2024 at 6:27 AM Paolo Bonzini <pbonzini@redhat.com> wrote: > > > > On Thu, Jun 27, 2024 at 2:32 AM Roman Kiryanov <rkir@google.com> wrote: > > > + if (qatomic_read(&ts->cb_running)) { > > > + qemu_event_wait(&timer_list->timers_done_ev); > > > + } > > > > qemu_event_wait() already has the right atomic magic, and > > ts->cb_running is both redundant (in general), and I think racy (as > > implemented in this patch). > > I added cb_running to avoid waiting for timers_done_ev if we know our > cb is done. Yes, but it's very tricky. Assuming we want to fix it in the timer core, the QemuEvent should be enough, no need to optimize it. On the other hand, I'm still worried about deadlocks (more below). > > But especially, you haven't justified in the commit message _why_ you > > need this. > > I mentioned the problem of cleanup racing with the timer's callback function > in the current shape of QEMU. Yes, but it was not clear what are the involved threads. It is clear now that you have a function in a separate thread, creating a timer in the main QEMU event loop. > > using > > aio_bh_schedule_oneshot() or aio_wait_bh_oneshot() to synchronize > > everything with the AioContext thread seems like a superior solution > > to me. > > Could you please elaborate? The problem we want to solve is this: > > void myThreadFunc() { > CallbackState callbackState; > QEMUTimer timer; > > timer_init(&timer, myClockType, myScale, &myTimerCallbackFunc, > &callbackState); > ... > timer_del(&timer); > } > > Currently, myTimerCallbackFunc could fire after myThreadFunc exits > (if timer_del runs between qemu_mutex_unlock and cb(opaque) in > timerlist_run_timers) and callbackState gets destroyed. Ok, got it now. I agree that qemu_event_wait() is safe for you here because you are in a completely separate thread. 
But I'm worried that it causes deadlocks in QEMU where the timer callback and the timer_del run in the same thread. I think the easiest options would be: 1) if possible, allocate the timer and the callbackState statically in the device. 2) use "aio_wait_bh_oneshot(qemu_get_aio_context(), [](void *opaque){}, NULL);" after timer_del(). You can also put the timer and the callbackState in a RAII wrapper, so that aio_wait_bh_oneshot() is executed when the RAII wrapper is destructed Another thing that you could do is to use a shared_ptr<> for the timer+callbackState combo, and pass a weak_ptr<> to the timer. Then: - at the beginning of the timer, you upgrade the weak_ptr with lock() and if it fails, return - at the end of myThreadfunc, you destruct the shared_ptr before deleting the timer. I'm not sure how you'd pass the weak_ptr/shared_ptr to a callback (Rust has Weak::into_raw/Weak::from_raw, but I don't know C++ well enough). That may be overkill. Paolo
Paolo, thank you for your comments. On Thu, Jun 27, 2024 at 10:16 AM Paolo Bonzini <pbonzini@redhat.com> wrote: > I think the easiest options would be: > > 1) if possible, allocate the timer and the callbackState statically in > the device. I think this assumption is not good for both QEMU and us. > 2) use "aio_wait_bh_oneshot(qemu_get_aio_context(), [](void > *opaque){}, NULL);" after timer_del(). You can also put the timer and > the callbackState in a RAII wrapper, so that aio_wait_bh_oneshot() is > executed when the RAII wrapper is destructed My understanding is that this will work as a fence waiting for all timers to finish. If so, maybe there is a value to put it into QEMU (as times_joins() or even as timer_join(QEMUTimer *ts)) if one day you decide to implement it in a more efficient way? > Another thing that you could do is to use a shared_ptr<> for the > timer+callbackState combo, and pass a weak_ptr<> to the timer. Then: > > - at the beginning of the timer, you upgrade the weak_ptr with lock() > and if it fails, return > > I'm not sure how you'd pass the weak_ptr/shared_ptr to a callback I suspect this is not possible in plain C++ without modifying QEMU or code generating at runtime. I would go with your aio_wait_bh_oneshot suggestion. Please consider adding it to QEMU as I pointed above. I can send a patch.
On Thu, Jun 27, 2024 at 10:55 AM Roman Kiryanov <rkir@google.com> wrote: > Paolo, thank you for your comments. > > On Thu, Jun 27, 2024 at 10:16 AM Paolo Bonzini <pbonzini@redhat.com> > wrote: > > I think the easiest options would be: > > > > 1) if possible, allocate the timer and the callbackState statically in > > the device. > > I think this assumption is not good for both QEMU and us. > > > 2) use "aio_wait_bh_oneshot(qemu_get_aio_context(), [](void > > *opaque){}, NULL);" after timer_del(). You can also put the timer and > > the callbackState in a RAII wrapper, so that aio_wait_bh_oneshot() is > > executed when the RAII wrapper is destructed > > My understanding is that this will work as a fence waiting for all > timers to finish. If so, maybe there is a value to put it into QEMU > (as times_joins() or even as timer_join(QEMUTimer *ts)) if one day > you decide to implement it in a more efficient way? > > > Another thing that you could do is to use a shared_ptr<> for the > > timer+callbackState combo, and pass a weak_ptr<> to the timer. Then: > > > > - at the beginning of the timer, you upgrade the weak_ptr with lock() > > and if it fails, return > > > > I'm not sure how you'd pass the weak_ptr/shared_ptr to a callback > > I suspect this is not possible in plain C++ without modifying QEMU or > code generating at runtime. > > I would go with your aio_wait_bh_oneshot suggestion. Please consider > adding it to QEMU as I pointed above. I can send a patch. > Hey - this is an important race condition to resolve, can we get some attention back on this patchset please. It's pressing. Patrick
On 8/14/24 23:12, Patrick Leis wrote: > I would go with your aio_wait_bh_oneshot suggestion. Please consider > adding it to QEMU as I pointed above. I can send a patch. > > Hey - this is an important race condition to resolve, can we get some > attention back on this patchset please. It's pressing. Sorry about the delay. The patch is good, and there is no reason why it should break. But without a user (and as far as I understand there is no occurrence of this race condition in upstream QEMU code) there is no reason to include it in QEMU. QEMU is not a library, and therefore there should be no unused functions in the code base. You can keep it as a solution for the race condition in your code; depending on how your fork is organized, it may be kept in qemu-timer.c or in a file that is part of your additional device models. Thanks, Paolo
Currently there is no mechanism guaranteeing
that it is safe to delete the object pointed
to by opaque in timer_init.
This race condition happens if a timer is
created on a separate thread and timer_del
is called between qemu_mutex_unlock and
cb(opaque) in timerlist_run_timers.
In this case the user thread will continue
executing (once timer_del returns) which could
cause destruction of the object pointed
to by opaque while the callback is (or will be)
using it. See the example below:
void myThreadFunc() {
void *opaque = malloc(42);
QEMUTimer timer;
timer_init(&timer, myClockType, myScale,
&myTimerCallbackFunc, opaque);
...
timer_del(&timer);
free(opaque);
}
This change adds a function, timer_join,
that makes sure that the timer callback
(all timer callbacks, to be precise; this
is an implementation detail and could be
adjusted later) is done and it is safe to
destroy the object pointed by opaque.
Once this function is available, the example
above will look like this:
timer_del(&timer);
timer_join(&timer);
free(opaque);
Signed-off-by: Roman Kiryanov <rkir@google.com>
---
v2: rebased to the right branch and removed
Google specific tags from the commit message.
v3: if a timer's callback happens to be running
(cb_running) wait until all timers are done.
qatomic_read/qemu_event_reset could be racing
and timer_del might wait one extra loop of
timers to be done.
v4: add timer_join implemented via aio_wait_bh_oneshot
as suggested by pbonzini@redhat.com.
include/qemu/timer.h | 11 +++++++++++
util/qemu-timer.c | 14 ++++++++++++++
2 files changed, 25 insertions(+)
diff --git a/include/qemu/timer.h b/include/qemu/timer.h
index 5ce83c7911..5f7d673830 100644
--- a/include/qemu/timer.h
+++ b/include/qemu/timer.h
@@ -635,6 +635,17 @@ void timer_deinit(QEMUTimer *ts);
*/
void timer_del(QEMUTimer *ts);
+/**
+ * timer_join:
+ * @ts: the timer
+ *
+ * Wait for the timer callback to finish (if it is running).
+ *
+ * This function is thread-safe but the timer and its timer list must not be
+ * freed while this function is running.
+ */
+void timer_join(QEMUTimer *ts);
+
/**
* timer_free:
* @ts: the timer
diff --git a/util/qemu-timer.c b/util/qemu-timer.c
index 213114be68..22759be580 100644
--- a/util/qemu-timer.c
+++ b/util/qemu-timer.c
@@ -30,6 +30,8 @@
#include "sysemu/replay.h"
#include "sysemu/cpus.h"
+#include "block/aio-wait.h"
+
#ifdef CONFIG_POSIX
#include <pthread.h>
#endif
@@ -438,6 +440,18 @@ void timer_del(QEMUTimer *ts)
}
}
+static void timer_join_noop(void *unused) {}
+
+/* Make sure the timer callback is done */
+void timer_join(QEMUTimer *ts)
+{
+ /* A side effect of aio_wait_bh_oneshot is all
+ * timer callbacks are done once it returns.
+ */
+ aio_wait_bh_oneshot(qemu_get_aio_context(),
+ &timer_join_noop, NULL);
+}
+
/* modify the current timer so that it will be fired when current_time
>= expire_time. The corresponding callback will be called. */
void timer_mod_ns(QEMUTimer *ts, int64_t expire_time)
--
2.45.2.803.g4e1b14247a-goog
© 2016 - 2024 Red Hat, Inc.