Compiler optimizations can cache TLS values across coroutine yield
points, resulting in stale values from the previous thread when a
coroutine is re-entered by a new thread.
Serge Guelton developed an __attribute__((noinline)) wrapper and tested
it with clang and gcc. I formatted his idea according to QEMU's coding
style and wrote documentation.
Richard Henderson developed an alternative approach that can be inlined
by the compiler. This is included for architectures where we have inline
assembly that determines the address of a TLS variable.
These macros must be used instead of __thread from now on to prevent
coroutine TLS bugs. Here is an x86_64 TLS variable access before this patch:
mov %fs:-0x19c,%edx
And here is the same access using Richard's approach:
rdfsbase %rax # %fs contains the base address
lea -0x1a8(%rax),%rax # -0x1a8 is the offset of our variable
mov 0xc(%rax),%edx # here we access the TLS variable via %rax
Buglink: https://bugzilla.redhat.com/show_bug.cgi?id=1952483
Suggested-by: Serge Guelton <sguelton@redhat.com>
Suggested-by: Richard Henderson <richard.henderson@linaro.org>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
Richard's suggested code used a MOV instruction on x86_64 but we need
LEA semantics. LEA doesn't support %fs so I switched to RDFSBASE+LEA.
Otherwise Richard's approach is unchanged.
---
include/qemu/coroutine-tls.h | 202 +++++++++++++++++++++++++++++++++++
1 file changed, 202 insertions(+)
create mode 100644 include/qemu/coroutine-tls.h
diff --git a/include/qemu/coroutine-tls.h b/include/qemu/coroutine-tls.h
new file mode 100644
index 0000000000..3158f9c0eb
--- /dev/null
+++ b/include/qemu/coroutine-tls.h
@@ -0,0 +1,202 @@
+/*
+ * QEMU Thread Local Storage for coroutines
+ *
+ * Copyright Red Hat
+ *
+ * SPDX-License-Identifier: LGPL-2.1-or-later
+ *
+ * This work is licensed under the terms of the GNU LGPL, version 2.1 or later.
+ * See the COPYING.LIB file in the top-level directory.
+ *
+ * It is forbidden to access Thread Local Storage in coroutines because
+ * compiler optimizations may cause values to be cached across coroutine
+ * re-entry. Coroutines can run in more than one thread through the course of
+ * their life, leading to bugs when stale TLS values from the wrong thread are
+ * used as a result of compiler optimization.
+ *
+ * An example is:
+ *
+ * .. code-block:: c
+ * :caption: A coroutine that may see the wrong TLS value
+ *
+ * static __thread AioContext *current_aio_context;
+ * ...
+ * static void coroutine_fn foo(void)
+ * {
+ * aio_notify(current_aio_context);
+ * qemu_coroutine_yield();
+ * aio_notify(current_aio_context); // <-- may be stale after yielding!
+ * }
+ *
+ * This header provides macros for safely defining variables in Thread Local
+ * Storage:
+ *
+ * .. code-block:: c
+ * :caption: A coroutine that safely uses TLS
+ *
+ * QEMU_DEFINE_STATIC_CO_TLS(AioContext *, current_aio_context)
+ * ...
+ * static void coroutine_fn foo(void)
+ * {
+ * aio_notify(get_current_aio_context());
+ * qemu_coroutine_yield();
+ * aio_notify(get_current_aio_context()); // <-- safe
+ * }
+ */
+
+#ifndef QEMU_COROUTINE_TLS_H
+#define QEMU_COROUTINE_TLS_H
+
+/*
+ * Two techniques are available to stop the compiler from caching TLS values:
+ * 1. Accessor functions with __attribute__((noinline)). This is portable but
+ * prevents inlining optimizations.
+ * 2. TLS address-of implemented as asm volatile so it can be inlined safely.
+ * This enables inlining optimizations but requires architecture-specific
+ * inline assembly.
+ */
+#if defined(__aarch64__)
+#define QEMU_CO_TLS_ADDR(ret, var) \
+ asm volatile("mrs %0, tpidr_el0\n\t" \
+ "add %0, %0, #:tprel_hi12:"#var", lsl #12\n\t" \
+ "add %0, %0, #:tprel_lo12_nc:"#var \
+ : "=r"(ret))
+#elif defined(__powerpc64__)
+#define QEMU_CO_TLS_ADDR(ret, var) \
+ asm volatile("addis %0,13,"#var"@tprel@ha\n\t" \
+ "add %0,%0,"#var"@tprel@l" \
+ : "=r"(ret))
+#elif defined(__riscv)
+#define QEMU_CO_TLS_ADDR(ret, var) \
+ asm volatile("lui %0,%%tprel_hi("#var")\n\t" \
+ "add %0,%0,%%tprel_add("#var")\n\t" \
+ "addi %0,%0,%%tprel_lo("#var")" \
+ : "=r"(ret))
+#elif defined(__x86_64__)
+#define QEMU_CO_TLS_ADDR(ret, var) \
+ asm volatile("rdfsbase %0\n\t" \
+ "lea "#var"@tpoff(%0), %0" : "=r"(ret))
+#endif
+
+/**
+ * QEMU_DECLARE_CO_TLS:
+ * @type: the variable's C type
+ * @var: the variable name
+ *
+ * Declare an extern variable in Thread Local Storage from a header file:
+ *
+ * .. code-block:: c
+ * :caption: Declaring an extern variable in Thread Local Storage
+ *
+ * QEMU_DECLARE_CO_TLS(int, my_count)
+ * ...
+ * int c = get_my_count();
+ * set_my_count(c + 1);
+ * *get_ptr_my_count() = 0;
+ *
+ * Use this instead of:
+ *
+ * .. code-block:: c
+ * :caption: Declaring a TLS variable using __thread
+ *
+ * extern __thread int my_count;
+ * ...
+ * int c = my_count;
+ * my_count = c + 1;
+ * *(&my_count) = 0;
+ */
+#ifdef QEMU_CO_TLS_ADDR
+#define QEMU_DECLARE_CO_TLS(type, var) \
+ extern __thread type co_tls_##var; \
+ static inline type get_##var(void) \
+ { type *p; QEMU_CO_TLS_ADDR(p, co_tls_##var); return *p; } \
+ static inline void set_##var(type v) \
+ { type *p; QEMU_CO_TLS_ADDR(p, co_tls_##var); *p = v; } \
+ static inline type *get_ptr_##var(void) \
+ { type *p; QEMU_CO_TLS_ADDR(p, co_tls_##var); return p; }
+#else
+#define QEMU_DECLARE_CO_TLS(type, var) \
+ __attribute__((noinline)) type get_##var(void); \
+ __attribute__((noinline)) void set_##var(type v); \
+ __attribute__((noinline)) type *get_ptr_##var(void);
+#endif
+
+/**
+ * QEMU_DEFINE_CO_TLS:
+ * @type: the variable's C type
+ * @var: the variable name
+ *
+ * Define a variable in Thread Local Storage that was previously declared from
+ * a header file with QEMU_DECLARE_CO_TLS():
+ *
+ * .. code-block:: c
+ * :caption: Defining a variable in Thread Local Storage
+ *
+ * QEMU_DEFINE_CO_TLS(int, my_count)
+ *
+ * Use this instead of:
+ *
+ * .. code-block:: c
+ * :caption: Defining a TLS variable using __thread
+ *
+ * __thread int my_count;
+ */
+#ifdef QEMU_CO_TLS_ADDR
+#define QEMU_DEFINE_CO_TLS(type, var) \
+ __thread type co_tls_##var;
+#else
+#define QEMU_DEFINE_CO_TLS(type, var) \
+ static __thread type co_tls_##var; \
+ type get_##var(void) { return co_tls_##var; } \
+ void set_##var(type v) { co_tls_##var = v; } \
+ type *get_ptr_##var(void) { return &co_tls_##var; }
+#endif
+
+/**
+ * QEMU_DEFINE_STATIC_CO_TLS:
+ * @type: the variable's C type
+ * @var: the variable name
+ *
+ * Define a static variable in Thread Local Storage:
+ *
+ * .. code-block:: c
+ * :caption: Defining a static variable in Thread Local Storage
+ *
+ * QEMU_DEFINE_STATIC_CO_TLS(int, my_count)
+ * ...
+ * int c = get_my_count();
+ * set_my_count(c + 1);
+ * *get_ptr_my_count() = 0;
+ *
+ * Use this instead of:
+ *
+ * .. code-block:: c
+ * :caption: Defining a static TLS variable using __thread
+ *
+ * static __thread int my_count;
+ * ...
+ * int c = my_count;
+ * my_count = c + 1;
+ * *(&my_count) = 0;
+ */
+#ifdef QEMU_CO_TLS_ADDR
+#define QEMU_DEFINE_STATIC_CO_TLS(type, var) \
+ __thread type co_tls_##var; \
+ static inline type get_##var(void) \
+ { type *p; QEMU_CO_TLS_ADDR(p, co_tls_##var); return *p; } \
+ static inline void set_##var(type v) \
+ { type *p; QEMU_CO_TLS_ADDR(p, co_tls_##var); *p = v; } \
+ static inline type *get_ptr_##var(void) \
+ { type *p; QEMU_CO_TLS_ADDR(p, co_tls_##var); return p; }
+#else
+#define QEMU_DEFINE_STATIC_CO_TLS(type, var) \
+ static __thread type co_tls_##var; \
+ static __attribute__((noinline, unused)) type get_##var(void) \
+ { return co_tls_##var; } \
+ static __attribute__((noinline, unused)) void set_##var(type v) \
+ { co_tls_##var = v; } \
+ static __attribute__((noinline, unused)) type *get_ptr_##var(void) \
+ { return &co_tls_##var; }
+#endif
+
+#endif /* QEMU_COROUTINE_TLS_H */
--
2.33.1
* Stefan Hajnoczi:
> +#elif defined(__x86_64__)
> +#define QEMU_CO_TLS_ADDR(ret, var) \
> + asm volatile("rdfsbase %0\n\t" \
> + "lea "#var"@tpoff(%0), %0" : "=r"(ret))
> +#endif
RDFSBASE needs quite recent kernels. I think you should use
movq %%fs:0, %0
instead, which is equivalent for the x86-64 psABI.
Thanks,
Florian
On Wed, 1 Dec 2021 at 17:19, Stefan Hajnoczi <stefanha@redhat.com> wrote:
>
> Compiler optimizations can cache TLS values across coroutine yield
> points, resulting in stale values from the previous thread when a
> coroutine is re-entered by a new thread.
>
> Serge Guelton developed an __attribute__((noinline)) wrapper and tested
> it with clang and gcc. I formatted his idea according to QEMU's coding
> style and wrote documentation.
> +#ifdef QEMU_CO_TLS_ADDR
> +#define QEMU_DEFINE_STATIC_CO_TLS(type, var) \
> + __thread type co_tls_##var; \
> + static inline type get_##var(void) \
> + { type *p; QEMU_CO_TLS_ADDR(p, co_tls_##var); return *p; } \
> + static inline void set_##var(type v) \
> + { type *p; QEMU_CO_TLS_ADDR(p, co_tls_##var); *p = v; } \
> + static inline type *get_ptr_##var(void) \
> + { type *p; QEMU_CO_TLS_ADDR(p, co_tls_##var); return p; }
> +#else
> +#define QEMU_DEFINE_STATIC_CO_TLS(type, var) \
> + static __thread type co_tls_##var; \
> + static __attribute__((noinline, unused)) type get_##var(void) \
> + { return co_tls_##var; } \
> + static __attribute__((noinline, unused)) void set_##var(type v) \
> + { co_tls_##var = v; } \
> + static __attribute__((noinline, unused)) type *get_ptr_##var(void) \
> + { return &co_tls_##var; }
> +#endif
My compiler-developer colleagues present the following case where
'noinline' is not sufficient for the compiler to definitely
use different values of the address-of-the-TLS-var across an
intervening function call:
__thread int i;
__attribute__((noinline)) long get_ptr_i()
{
return (long)&i;
}
void switcher();
int g()
{
long a = get_ptr_i();
switcher();
return a == get_ptr_i();
}
Trunk clang optimizes the comparison of the two pointers down
to "must be always true" even though they might not be if the
switcher() function has resulted in a change-of-thread:
https://godbolt.org/z/hd67zh6jW
The 'optnone' attribute (clang-specific) seems to be able
to suppress this. The equivalent attribute on gcc may
(or may not) be 'noipa'.
-- PMM
On Thu, 2 Dec 2021 at 14:44, Peter Maydell <peter.maydell@linaro.org> wrote:
> My compiler-developer colleagues present the following case where
> 'noinline' is not sufficient for the compiler to definitely
> use different values of the address-of-the-TLS-var across an
> intervening function call:
>
> __thread int i;
>
> __attribute__((noinline)) long get_ptr_i()
> {
> return (long)&i;
> }
>
> void switcher();
>
> int g()
> {
> long a = get_ptr_i();
> switcher();
> return a == get_ptr_i();
> }
>
> Trunk clang optimizes the comparison of the two pointers down
> to "must be always true" even though they might not be if the
> switcher() function has resulted in a change-of-thread:
> https://godbolt.org/z/hd67zh6jW
>
> The 'optnone' attribute (clang-specific) seems to be able
> to suppress this
...no it doesn't -- although the get_ptr_i() function is
called both before and after, its return value is ignored
and g() always still returns 'true'.
-- PMM
* Peter Maydell:
> On Thu, 2 Dec 2021 at 14:44, Peter Maydell <peter.maydell@linaro.org> wrote:
>> My compiler-developer colleagues present the following case where
>> 'noinline' is not sufficient for the compiler to definitely
>> use different values of the address-of-the-TLS-var across an
>> intervening function call:
>>
>> __thread int i;
>>
>> __attribute__((noinline)) long get_ptr_i()
>> {
>> return (long)&i;
>> }
>>
>> void switcher();
>>
>> int g()
>> {
>> long a = get_ptr_i();
>> switcher();
>> return a == get_ptr_i();
>> }
>>
>> Trunk clang optimizes the comparison of the two pointers down
>> to "must be always true" even though they might not be if the
>> switcher() function has resulted in a change-of-thread:
>> https://godbolt.org/z/hd67zh6jW
>>
>> The 'optnone' attribute (clang-specific) seems to be able
>> to suppress this
>
> ...no it doesn't -- although the get_ptr_i() function is
> called both before and after, its return value is ignored
> and g() always still returns 'true'.
__attribute__ ((weak)) would mark get_ptr_i as interposable and should
act as an optimization barrier for any compiler. Unless the compiler
defaults -fno-semantic-interposition *and* feels very, very strongly
about its meaning (clang 13 doesn't seem to, despite the
-fno-semantic-interposition default even with -fpic).
Thanks,
Florian
On Thu, Dec 02, 2021 at 02:44:42PM +0000, Peter Maydell wrote:
> On Wed, 1 Dec 2021 at 17:19, Stefan Hajnoczi <stefanha@redhat.com> wrote:
> >
> > Compiler optimizations can cache TLS values across coroutine yield
> > points, resulting in stale values from the previous thread when a
> > coroutine is re-entered by a new thread.
> >
> > Serge Guelton developed an __attribute__((noinline)) wrapper and tested
> > it with clang and gcc. I formatted his idea according to QEMU's coding
> > style and wrote documentation.
>
> > +#ifdef QEMU_CO_TLS_ADDR
> > +#define QEMU_DEFINE_STATIC_CO_TLS(type, var) \
> > + __thread type co_tls_##var; \
> > + static inline type get_##var(void) \
> > + { type *p; QEMU_CO_TLS_ADDR(p, co_tls_##var); return *p; } \
> > + static inline void set_##var(type v) \
> > + { type *p; QEMU_CO_TLS_ADDR(p, co_tls_##var); *p = v; } \
> > + static inline type *get_ptr_##var(void) \
> > + { type *p; QEMU_CO_TLS_ADDR(p, co_tls_##var); return p; }
> > +#else
> > +#define QEMU_DEFINE_STATIC_CO_TLS(type, var) \
> > + static __thread type co_tls_##var; \
> > + static __attribute__((noinline, unused)) type get_##var(void) \
> > + { return co_tls_##var; } \
> > + static __attribute__((noinline, unused)) void set_##var(type v) \
> > + { co_tls_##var = v; } \
> > + static __attribute__((noinline, unused)) type *get_ptr_##var(void) \
> > + { return &co_tls_##var; }
> > +#endif
>
> My compiler-developer colleagues present the following case where
> 'noinline' is not sufficient for the compiler to definitely
> use different values of the address-of-the-TLS-var across an
> intervening function call:
>
> __thread int i;
>
> __attribute__((noinline)) long get_ptr_i()
> {
> return (long)&i;
> }
>
> void switcher();
>
> int g()
> {
> long a = get_ptr_i();
> switcher();
> return a == get_ptr_i();
> }
You can also force an extra mov through `volatile` as in
https://godbolt.org/z/hWvdb7o9G
© 2016 - 2026 Red Hat, Inc.