[v2] Provide emulation for one--byte cmpxchg()

[PATCH v2 cmpxchg 09/13] lib: Add one-byte emulation function

Posted by Paul E. McKenney 1 year, 9 months ago

Architectures are required to provide four-byte cmpxchg() and 64-bit
architectures are additionally required to provide eight-byte cmpxchg().
However, there are cases where one-byte cmpxchg() would be extremely
useful.  Therefore, provide cmpxchg_emu_u8() that emulates one-byte
cmpxchg() in terms of four-byte cmpxchg().

Note that this emulations is fully ordered, and can (for example) cause
one-byte cmpxchg_relaxed() to incur the overhead of full ordering.
If this causes problems for a given architecture, that architecture is
free to provide its own lighter-weight primitives.

[ paulmck: Apply Marco Elver feedback. ]
[ paulmck: Apply kernel test robot feedback. ]
[ paulmck: Drop two-byte support per Arnd Bergmann feedback. ]

Link: https://lore.kernel.org/all/0733eb10-5e7a-4450-9b8a-527b97c842ff@paulmck-laptop/

Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
Acked-by: Marco Elver <elver@google.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: "Peter Zijlstra (Intel)" <peterz@infradead.org>
Cc: Douglas Anderson <dianders@chromium.org>
Cc: Petr Mladek <pmladek@suse.com>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: <linux-arch@vger.kernel.org>
---
 arch/Kconfig                |  3 +++
 include/linux/cmpxchg-emu.h | 15 +++++++++++++
 lib/Makefile                |  1 +
 lib/cmpxchg-emu.c           | 45 +++++++++++++++++++++++++++++++++++++
 4 files changed, 64 insertions(+)
 create mode 100644 include/linux/cmpxchg-emu.h
 create mode 100644 lib/cmpxchg-emu.c

diff --git a/arch/Kconfig b/arch/Kconfig
index 9f066785bb71d..284663392eef8 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -1609,4 +1609,7 @@ config CC_HAS_SANE_FUNCTION_ALIGNMENT
 	# strict alignment always, even with -falign-functions.
 	def_bool CC_HAS_MIN_FUNCTION_ALIGNMENT || CC_IS_CLANG
 
+config ARCH_NEED_CMPXCHG_1_EMU
+	bool
+
 endmenu
diff --git a/include/linux/cmpxchg-emu.h b/include/linux/cmpxchg-emu.h
new file mode 100644
index 0000000000000..998deec67740a
--- /dev/null
+++ b/include/linux/cmpxchg-emu.h
@@ -0,0 +1,15 @@
+/* SPDX-License-Identifier: GPL-2.0+ */
+/*
+ * Emulated 1-byte and 2-byte cmpxchg operations for architectures
+ * lacking direct support for these sizes.  These are implemented in terms
+ * of 4-byte cmpxchg operations.
+ *
+ * Copyright (C) 2024 Paul E. McKenney.
+ */
+
+#ifndef __LINUX_CMPXCHG_EMU_H
+#define __LINUX_CMPXCHG_EMU_H
+
+uintptr_t cmpxchg_emu_u8(volatile u8 *p, uintptr_t old, uintptr_t new);
+
+#endif /* __LINUX_CMPXCHG_EMU_H */
diff --git a/lib/Makefile b/lib/Makefile
index ffc6b2341b45a..cc3d52fdb477d 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -236,6 +236,7 @@ obj-$(CONFIG_FUNCTION_ERROR_INJECTION) += error-inject.o
 lib-$(CONFIG_GENERIC_BUG) += bug.o
 
 obj-$(CONFIG_HAVE_ARCH_TRACEHOOK) += syscall.o
+obj-$(CONFIG_ARCH_NEED_CMPXCHG_1_EMU) += cmpxchg-emu.o
 
 obj-$(CONFIG_DYNAMIC_DEBUG_CORE) += dynamic_debug.o
 #ensure exported functions have prototypes
diff --git a/lib/cmpxchg-emu.c b/lib/cmpxchg-emu.c
new file mode 100644
index 0000000000000..27f6f97cb60dd
--- /dev/null
+++ b/lib/cmpxchg-emu.c
@@ -0,0 +1,45 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * Emulated 1-byte cmpxchg operation for architectures lacking direct
+ * support for this size.  This is implemented in terms of 4-byte cmpxchg
+ * operations.
+ *
+ * Copyright (C) 2024 Paul E. McKenney.
+ */
+
+#include <linux/types.h>
+#include <linux/export.h>
+#include <linux/instrumented.h>
+#include <linux/atomic.h>
+#include <linux/panic.h>
+#include <linux/bug.h>
+#include <asm-generic/rwonce.h>
+#include <linux/cmpxchg-emu.h>
+
+union u8_32 {
+	u8 b[4];
+	u32 w;
+};
+
+/* Emulate one-byte cmpxchg() in terms of 4-byte cmpxchg. */
+uintptr_t cmpxchg_emu_u8(volatile u8 *p, uintptr_t old, uintptr_t new)
+{
+	u32 *p32 = (u32 *)(((uintptr_t)p) & ~0x3);
+	int i = ((uintptr_t)p) & 0x3;
+	union u8_32 old32;
+	union u8_32 new32;
+	u32 ret;
+
+	ret = READ_ONCE(*p32);
+	do {
+		old32.w = ret;
+		if (old32.b[i] != old)
+			return old32.b[i];
+		new32.w = old32.w;
+		new32.b[i] = new;
+		instrument_atomic_read_write(p, 1);
+		ret = data_race(cmpxchg(p32, old32.w, new32.w)); // Overridden above.
+	} while (ret != old32.w);
+	return old;
+}
+EXPORT_SYMBOL_GPL(cmpxchg_emu_u8);
-- 
2.40.1

Re: [PATCH v2 cmpxchg 09/13] lib: Add one-byte emulation function

Posted by Boqun Feng 1 year, 9 months ago

On Wed, May 01, 2024 at 04:01:26PM -0700, Paul E. McKenney wrote:
> Architectures are required to provide four-byte cmpxchg() and 64-bit
> architectures are additionally required to provide eight-byte cmpxchg().
> However, there are cases where one-byte cmpxchg() would be extremely
> useful.  Therefore, provide cmpxchg_emu_u8() that emulates one-byte
> cmpxchg() in terms of four-byte cmpxchg().
> 
> Note that this emulations is fully ordered, and can (for example) cause
> one-byte cmpxchg_relaxed() to incur the overhead of full ordering.
> If this causes problems for a given architecture, that architecture is
> free to provide its own lighter-weight primitives.
> 
> [ paulmck: Apply Marco Elver feedback. ]
> [ paulmck: Apply kernel test robot feedback. ]
> [ paulmck: Drop two-byte support per Arnd Bergmann feedback. ]
> 
> Link: https://lore.kernel.org/all/0733eb10-5e7a-4450-9b8a-527b97c842ff@paulmck-laptop/
> 
> Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
> Acked-by: Marco Elver <elver@google.com>
> Cc: Andrew Morton <akpm@linux-foundation.org>
> Cc: Thomas Gleixner <tglx@linutronix.de>
> Cc: "Peter Zijlstra (Intel)" <peterz@infradead.org>
> Cc: Douglas Anderson <dianders@chromium.org>
> Cc: Petr Mladek <pmladek@suse.com>
> Cc: Arnd Bergmann <arnd@arndb.de>
> Cc: <linux-arch@vger.kernel.org>
> ---
>  arch/Kconfig                |  3 +++
>  include/linux/cmpxchg-emu.h | 15 +++++++++++++
>  lib/Makefile                |  1 +
>  lib/cmpxchg-emu.c           | 45 +++++++++++++++++++++++++++++++++++++
>  4 files changed, 64 insertions(+)
>  create mode 100644 include/linux/cmpxchg-emu.h
>  create mode 100644 lib/cmpxchg-emu.c
> 
> diff --git a/arch/Kconfig b/arch/Kconfig
> index 9f066785bb71d..284663392eef8 100644
> --- a/arch/Kconfig
> +++ b/arch/Kconfig
> @@ -1609,4 +1609,7 @@ config CC_HAS_SANE_FUNCTION_ALIGNMENT
>  	# strict alignment always, even with -falign-functions.
>  	def_bool CC_HAS_MIN_FUNCTION_ALIGNMENT || CC_IS_CLANG
>  
> +config ARCH_NEED_CMPXCHG_1_EMU
> +	bool
> +
>  endmenu
> diff --git a/include/linux/cmpxchg-emu.h b/include/linux/cmpxchg-emu.h
> new file mode 100644
> index 0000000000000..998deec67740a
> --- /dev/null
> +++ b/include/linux/cmpxchg-emu.h
> @@ -0,0 +1,15 @@
> +/* SPDX-License-Identifier: GPL-2.0+ */
> +/*
> + * Emulated 1-byte and 2-byte cmpxchg operations for architectures
> + * lacking direct support for these sizes.  These are implemented in terms
> + * of 4-byte cmpxchg operations.
> + *
> + * Copyright (C) 2024 Paul E. McKenney.
> + */
> +
> +#ifndef __LINUX_CMPXCHG_EMU_H
> +#define __LINUX_CMPXCHG_EMU_H
> +
> +uintptr_t cmpxchg_emu_u8(volatile u8 *p, uintptr_t old, uintptr_t new);
> +
> +#endif /* __LINUX_CMPXCHG_EMU_H */
> diff --git a/lib/Makefile b/lib/Makefile
> index ffc6b2341b45a..cc3d52fdb477d 100644
> --- a/lib/Makefile
> +++ b/lib/Makefile
> @@ -236,6 +236,7 @@ obj-$(CONFIG_FUNCTION_ERROR_INJECTION) += error-inject.o
>  lib-$(CONFIG_GENERIC_BUG) += bug.o
>  
>  obj-$(CONFIG_HAVE_ARCH_TRACEHOOK) += syscall.o
> +obj-$(CONFIG_ARCH_NEED_CMPXCHG_1_EMU) += cmpxchg-emu.o
>  
>  obj-$(CONFIG_DYNAMIC_DEBUG_CORE) += dynamic_debug.o
>  #ensure exported functions have prototypes
> diff --git a/lib/cmpxchg-emu.c b/lib/cmpxchg-emu.c
> new file mode 100644
> index 0000000000000..27f6f97cb60dd
> --- /dev/null
> +++ b/lib/cmpxchg-emu.c
> @@ -0,0 +1,45 @@
> +// SPDX-License-Identifier: GPL-2.0+
> +/*
> + * Emulated 1-byte cmpxchg operation for architectures lacking direct
> + * support for this size.  This is implemented in terms of 4-byte cmpxchg
> + * operations.
> + *
> + * Copyright (C) 2024 Paul E. McKenney.
> + */
> +
> +#include <linux/types.h>
> +#include <linux/export.h>
> +#include <linux/instrumented.h>
> +#include <linux/atomic.h>
> +#include <linux/panic.h>
> +#include <linux/bug.h>
> +#include <asm-generic/rwonce.h>
> +#include <linux/cmpxchg-emu.h>
> +
> +union u8_32 {
> +	u8 b[4];
> +	u32 w;
> +};
> +
> +/* Emulate one-byte cmpxchg() in terms of 4-byte cmpxchg. */
> +uintptr_t cmpxchg_emu_u8(volatile u8 *p, uintptr_t old, uintptr_t new)
> +{
> +	u32 *p32 = (u32 *)(((uintptr_t)p) & ~0x3);
> +	int i = ((uintptr_t)p) & 0x3;
> +	union u8_32 old32;
> +	union u8_32 new32;
> +	u32 ret;
> +
> +	ret = READ_ONCE(*p32);
> +	do {
> +		old32.w = ret;
> +		if (old32.b[i] != old)
> +			return old32.b[i];
> +		new32.w = old32.w;
> +		new32.b[i] = new;
> +		instrument_atomic_read_write(p, 1);
> +		ret = data_race(cmpxchg(p32, old32.w, new32.w)); // Overridden above.

Just out of curiosity, why is this `data_race` needed? cmpxchg is atomic
so there should be no chance for a data race?

Regards,
Boqun

> +	} while (ret != old32.w);
> +	return old;
> +}
> +EXPORT_SYMBOL_GPL(cmpxchg_emu_u8);
> -- 
> 2.40.1
>

Re: [PATCH v2 cmpxchg 09/13] lib: Add one-byte emulation function

Posted by Paul E. McKenney 1 year, 9 months ago

On Mon, May 13, 2024 at 07:44:00AM -0700, Boqun Feng wrote:
> On Wed, May 01, 2024 at 04:01:26PM -0700, Paul E. McKenney wrote:
> > Architectures are required to provide four-byte cmpxchg() and 64-bit
> > architectures are additionally required to provide eight-byte cmpxchg().
> > However, there are cases where one-byte cmpxchg() would be extremely
> > useful.  Therefore, provide cmpxchg_emu_u8() that emulates one-byte
> > cmpxchg() in terms of four-byte cmpxchg().
> > 
> > Note that this emulations is fully ordered, and can (for example) cause
> > one-byte cmpxchg_relaxed() to incur the overhead of full ordering.
> > If this causes problems for a given architecture, that architecture is
> > free to provide its own lighter-weight primitives.
> > 
> > [ paulmck: Apply Marco Elver feedback. ]
> > [ paulmck: Apply kernel test robot feedback. ]
> > [ paulmck: Drop two-byte support per Arnd Bergmann feedback. ]
> > 
> > Link: https://lore.kernel.org/all/0733eb10-5e7a-4450-9b8a-527b97c842ff@paulmck-laptop/
> > 
> > Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
> > Acked-by: Marco Elver <elver@google.com>
> > Cc: Andrew Morton <akpm@linux-foundation.org>
> > Cc: Thomas Gleixner <tglx@linutronix.de>
> > Cc: "Peter Zijlstra (Intel)" <peterz@infradead.org>
> > Cc: Douglas Anderson <dianders@chromium.org>
> > Cc: Petr Mladek <pmladek@suse.com>
> > Cc: Arnd Bergmann <arnd@arndb.de>
> > Cc: <linux-arch@vger.kernel.org>
> > ---
> >  arch/Kconfig                |  3 +++
> >  include/linux/cmpxchg-emu.h | 15 +++++++++++++
> >  lib/Makefile                |  1 +
> >  lib/cmpxchg-emu.c           | 45 +++++++++++++++++++++++++++++++++++++
> >  4 files changed, 64 insertions(+)
> >  create mode 100644 include/linux/cmpxchg-emu.h
> >  create mode 100644 lib/cmpxchg-emu.c
> > 
> > diff --git a/arch/Kconfig b/arch/Kconfig
> > index 9f066785bb71d..284663392eef8 100644
> > --- a/arch/Kconfig
> > +++ b/arch/Kconfig
> > @@ -1609,4 +1609,7 @@ config CC_HAS_SANE_FUNCTION_ALIGNMENT
> >  	# strict alignment always, even with -falign-functions.
> >  	def_bool CC_HAS_MIN_FUNCTION_ALIGNMENT || CC_IS_CLANG
> >  
> > +config ARCH_NEED_CMPXCHG_1_EMU
> > +	bool
> > +
> >  endmenu
> > diff --git a/include/linux/cmpxchg-emu.h b/include/linux/cmpxchg-emu.h
> > new file mode 100644
> > index 0000000000000..998deec67740a
> > --- /dev/null
> > +++ b/include/linux/cmpxchg-emu.h
> > @@ -0,0 +1,15 @@
> > +/* SPDX-License-Identifier: GPL-2.0+ */
> > +/*
> > + * Emulated 1-byte and 2-byte cmpxchg operations for architectures
> > + * lacking direct support for these sizes.  These are implemented in terms
> > + * of 4-byte cmpxchg operations.
> > + *
> > + * Copyright (C) 2024 Paul E. McKenney.
> > + */
> > +
> > +#ifndef __LINUX_CMPXCHG_EMU_H
> > +#define __LINUX_CMPXCHG_EMU_H
> > +
> > +uintptr_t cmpxchg_emu_u8(volatile u8 *p, uintptr_t old, uintptr_t new);
> > +
> > +#endif /* __LINUX_CMPXCHG_EMU_H */
> > diff --git a/lib/Makefile b/lib/Makefile
> > index ffc6b2341b45a..cc3d52fdb477d 100644
> > --- a/lib/Makefile
> > +++ b/lib/Makefile
> > @@ -236,6 +236,7 @@ obj-$(CONFIG_FUNCTION_ERROR_INJECTION) += error-inject.o
> >  lib-$(CONFIG_GENERIC_BUG) += bug.o
> >  
> >  obj-$(CONFIG_HAVE_ARCH_TRACEHOOK) += syscall.o
> > +obj-$(CONFIG_ARCH_NEED_CMPXCHG_1_EMU) += cmpxchg-emu.o
> >  
> >  obj-$(CONFIG_DYNAMIC_DEBUG_CORE) += dynamic_debug.o
> >  #ensure exported functions have prototypes
> > diff --git a/lib/cmpxchg-emu.c b/lib/cmpxchg-emu.c
> > new file mode 100644
> > index 0000000000000..27f6f97cb60dd
> > --- /dev/null
> > +++ b/lib/cmpxchg-emu.c
> > @@ -0,0 +1,45 @@
> > +// SPDX-License-Identifier: GPL-2.0+
> > +/*
> > + * Emulated 1-byte cmpxchg operation for architectures lacking direct
> > + * support for this size.  This is implemented in terms of 4-byte cmpxchg
> > + * operations.
> > + *
> > + * Copyright (C) 2024 Paul E. McKenney.
> > + */
> > +
> > +#include <linux/types.h>
> > +#include <linux/export.h>
> > +#include <linux/instrumented.h>
> > +#include <linux/atomic.h>
> > +#include <linux/panic.h>
> > +#include <linux/bug.h>
> > +#include <asm-generic/rwonce.h>
> > +#include <linux/cmpxchg-emu.h>
> > +
> > +union u8_32 {
> > +	u8 b[4];
> > +	u32 w;
> > +};
> > +
> > +/* Emulate one-byte cmpxchg() in terms of 4-byte cmpxchg. */
> > +uintptr_t cmpxchg_emu_u8(volatile u8 *p, uintptr_t old, uintptr_t new)
> > +{
> > +	u32 *p32 = (u32 *)(((uintptr_t)p) & ~0x3);
> > +	int i = ((uintptr_t)p) & 0x3;
> > +	union u8_32 old32;
> > +	union u8_32 new32;
> > +	u32 ret;
> > +
> > +	ret = READ_ONCE(*p32);
> > +	do {
> > +		old32.w = ret;
> > +		if (old32.b[i] != old)
> > +			return old32.b[i];
> > +		new32.w = old32.w;
> > +		new32.b[i] = new;
> > +		instrument_atomic_read_write(p, 1);
> > +		ret = data_race(cmpxchg(p32, old32.w, new32.w)); // Overridden above.
> 
> Just out of curiosity, why is this `data_race` needed? cmpxchg is atomic
> so there should be no chance for a data race?

That is what I thought, too.  ;-)

The problem is that the cmpxchg() covers 32 bits, and so without that
data_race(), KCSAN would complain about data races with perfectly
legitimate concurrent accesses to the other three bytes.

The instrument_atomic_read_write(p, 1) beforehand tells KCSAN to complain
about concurrent accesses, but only to that one byte.

							Thanx, Paul

> Regards,
> Boqun
> 
> > +	} while (ret != old32.w);
> > +	return old;
> > +}
> > +EXPORT_SYMBOL_GPL(cmpxchg_emu_u8);
> > -- 
> > 2.40.1
> >

Re: [PATCH v2 cmpxchg 09/13] lib: Add one-byte emulation function

Posted by Boqun Feng 1 year, 9 months ago

On Mon, May 13, 2024 at 08:41:27AM -0700, Paul E. McKenney wrote:
[...]
> > > +#include <linux/types.h>
> > > +#include <linux/export.h>
> > > +#include <linux/instrumented.h>
> > > +#include <linux/atomic.h>
> > > +#include <linux/panic.h>
> > > +#include <linux/bug.h>
> > > +#include <asm-generic/rwonce.h>
> > > +#include <linux/cmpxchg-emu.h>
> > > +
> > > +union u8_32 {
> > > +	u8 b[4];
> > > +	u32 w;
> > > +};
> > > +
> > > +/* Emulate one-byte cmpxchg() in terms of 4-byte cmpxchg. */
> > > +uintptr_t cmpxchg_emu_u8(volatile u8 *p, uintptr_t old, uintptr_t new)
> > > +{
> > > +	u32 *p32 = (u32 *)(((uintptr_t)p) & ~0x3);
> > > +	int i = ((uintptr_t)p) & 0x3;
> > > +	union u8_32 old32;
> > > +	union u8_32 new32;
> > > +	u32 ret;
> > > +
> > > +	ret = READ_ONCE(*p32);
> > > +	do {
> > > +		old32.w = ret;
> > > +		if (old32.b[i] != old)
> > > +			return old32.b[i];
> > > +		new32.w = old32.w;
> > > +		new32.b[i] = new;
> > > +		instrument_atomic_read_write(p, 1);
> > > +		ret = data_race(cmpxchg(p32, old32.w, new32.w)); // Overridden above.
> > 
> > Just out of curiosity, why is this `data_race` needed? cmpxchg is atomic
> > so there should be no chance for a data race?
> 
> That is what I thought, too.  ;-)
> 
> The problem is that the cmpxchg() covers 32 bits, and so without that
> data_race(), KCSAN would complain about data races with perfectly
> legitimate concurrent accesses to the other three bytes.
> 
> The instrument_atomic_read_write(p, 1) beforehand tells KCSAN to complain
> about concurrent accesses, but only to that one byte.
> 

Oh, I see. For that purpose, maybe we can just use raw_cmpxchg() here,
i.e. a cmpxchg() without any instrument in it. Cc Mark in case I'm
missing something.

Regards,
Boqun

> 							Thanx, Paul
>

Re: [PATCH v2 cmpxchg 09/13] lib: Add one-byte emulation function

Posted by Boqun Feng 1 year, 9 months ago

On Mon, May 13, 2024 at 08:57:16AM -0700, Boqun Feng wrote:
> On Mon, May 13, 2024 at 08:41:27AM -0700, Paul E. McKenney wrote:
> [...]
> > > > +#include <linux/types.h>
> > > > +#include <linux/export.h>
> > > > +#include <linux/instrumented.h>
> > > > +#include <linux/atomic.h>
> > > > +#include <linux/panic.h>
> > > > +#include <linux/bug.h>
> > > > +#include <asm-generic/rwonce.h>
> > > > +#include <linux/cmpxchg-emu.h>
> > > > +
> > > > +union u8_32 {
> > > > +	u8 b[4];
> > > > +	u32 w;
> > > > +};
> > > > +
> > > > +/* Emulate one-byte cmpxchg() in terms of 4-byte cmpxchg. */
> > > > +uintptr_t cmpxchg_emu_u8(volatile u8 *p, uintptr_t old, uintptr_t new)
> > > > +{
> > > > +	u32 *p32 = (u32 *)(((uintptr_t)p) & ~0x3);
> > > > +	int i = ((uintptr_t)p) & 0x3;
> > > > +	union u8_32 old32;
> > > > +	union u8_32 new32;
> > > > +	u32 ret;
> > > > +
> > > > +	ret = READ_ONCE(*p32);
> > > > +	do {
> > > > +		old32.w = ret;
> > > > +		if (old32.b[i] != old)
> > > > +			return old32.b[i];
> > > > +		new32.w = old32.w;
> > > > +		new32.b[i] = new;
> > > > +		instrument_atomic_read_write(p, 1);
> > > > +		ret = data_race(cmpxchg(p32, old32.w, new32.w)); // Overridden above.
> > > 
> > > Just out of curiosity, why is this `data_race` needed? cmpxchg is atomic
> > > so there should be no chance for a data race?
> > 
> > That is what I thought, too.  ;-)
> > 
> > The problem is that the cmpxchg() covers 32 bits, and so without that
> > data_race(), KCSAN would complain about data races with perfectly
> > legitimate concurrent accesses to the other three bytes.
> > 
> > The instrument_atomic_read_write(p, 1) beforehand tells KCSAN to complain
> > about concurrent accesses, but only to that one byte.
> > 
> 
> Oh, I see. For that purpose, maybe we can just use raw_cmpxchg() here,
> i.e. a cmpxchg() without any instrument in it. Cc Mark in case I'm
> missing something.
> 

I just realized that the KCSAN instrumentation is already done in
cmpxchg() layer:

	#define cmpxchg(ptr, ...) \
	({ \
		typeof(ptr) __ai_ptr = (ptr); \
		kcsan_mb(); \
		instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \
		raw_cmpxchg(__ai_ptr, __VA_ARGS__); \
	})

and, this function is lower in the layer, so it shouldn't have the
instrumentation itself. How about the following (based on today's RCU
dev branch)?

Regards,
Boqun

-------------------------------------------->8
Subject: [PATCH] lib: cmpxchg-emu: Make cmpxchg_emu_u8() noinstr

Currently, cmpxchg_emu_u8() is called via cmpxchg() or raw_cmpxchg()
which already makes the instrumentation decision:

* cmpxchg() case:

	cmpxchg():
	  kcsan_mb();
	  instrument_atomic_read_write(...);
	  raw_cmpxchg():
	    arch_cmpxchg():
	      cmpxchg_emu_u8();

... should have KCSAN instrumentation.

* raw_cmpxchg() case:

	raw_cmpxchg():
	  arch_cmpxchg():
	    cmpxchg_emu_u8();

... shouldn't have KCSAN instrumentation.

Therefore it's redundant to put KCSAN instrumentation in
cmpxchg_emu_u8() (along with the data_race() to get away the
instrumentation).

So make cmpxchg_emu_u8() a noinstr function, and remove the KCSAN
instrumentation inside it.

Signed-off-by: Boqun Feng <boqun.feng@gmail.com>
---
 include/linux/cmpxchg-emu.h |  4 +++-
 lib/cmpxchg-emu.c           | 14 ++++++++++----
 2 files changed, 13 insertions(+), 5 deletions(-)

diff --git a/include/linux/cmpxchg-emu.h b/include/linux/cmpxchg-emu.h
index 998deec67740..c4c85f41d9f4 100644
--- a/include/linux/cmpxchg-emu.h
+++ b/include/linux/cmpxchg-emu.h
@@ -10,6 +10,8 @@
 #ifndef __LINUX_CMPXCHG_EMU_H
 #define __LINUX_CMPXCHG_EMU_H
 
-uintptr_t cmpxchg_emu_u8(volatile u8 *p, uintptr_t old, uintptr_t new);
+#include <linux/compiler.h>
+
+noinstr uintptr_t cmpxchg_emu_u8(volatile u8 *p, uintptr_t old, uintptr_t new);
 
 #endif /* __LINUX_CMPXCHG_EMU_H */
diff --git a/lib/cmpxchg-emu.c b/lib/cmpxchg-emu.c
index 27f6f97cb60d..788c22cd4462 100644
--- a/lib/cmpxchg-emu.c
+++ b/lib/cmpxchg-emu.c
@@ -21,8 +21,13 @@ union u8_32 {
 	u32 w;
 };
 
-/* Emulate one-byte cmpxchg() in terms of 4-byte cmpxchg. */
-uintptr_t cmpxchg_emu_u8(volatile u8 *p, uintptr_t old, uintptr_t new)
+/*
+ * Emulate one-byte cmpxchg() in terms of 4-byte cmpxchg.
+ *
+ * This function is marked as 'noinstr' as the instrumentation should be done at
+ * outer layer.
+ */
+noinstr uintptr_t cmpxchg_emu_u8(volatile u8 *p, uintptr_t old, uintptr_t new)
 {
 	u32 *p32 = (u32 *)(((uintptr_t)p) & ~0x3);
 	int i = ((uintptr_t)p) & 0x3;
@@ -37,8 +42,9 @@ uintptr_t cmpxchg_emu_u8(volatile u8 *p, uintptr_t old, uintptr_t new)
 			return old32.b[i];
 		new32.w = old32.w;
 		new32.b[i] = new;
-		instrument_atomic_read_write(p, 1);
-		ret = data_race(cmpxchg(p32, old32.w, new32.w)); // Overridden above.
+
+		// raw_cmpxchg() is used here to avoid instrumentation.
+		ret = raw_cmpxchg(p32, old32.w, new32.w); // Overridden above.
 	} while (ret != old32.w);
 	return old;
 }
-- 
2.44.0

Re: [PATCH v2 cmpxchg 09/13] lib: Add one-byte emulation function

Posted by Paul E. McKenney 1 year, 9 months ago

On Mon, May 13, 2024 at 02:19:37PM -0700, Boqun Feng wrote:
> On Mon, May 13, 2024 at 08:57:16AM -0700, Boqun Feng wrote:
> > On Mon, May 13, 2024 at 08:41:27AM -0700, Paul E. McKenney wrote:
> > [...]
> > > > > +#include <linux/types.h>
> > > > > +#include <linux/export.h>
> > > > > +#include <linux/instrumented.h>
> > > > > +#include <linux/atomic.h>
> > > > > +#include <linux/panic.h>
> > > > > +#include <linux/bug.h>
> > > > > +#include <asm-generic/rwonce.h>
> > > > > +#include <linux/cmpxchg-emu.h>
> > > > > +
> > > > > +union u8_32 {
> > > > > +	u8 b[4];
> > > > > +	u32 w;
> > > > > +};
> > > > > +
> > > > > +/* Emulate one-byte cmpxchg() in terms of 4-byte cmpxchg. */
> > > > > +uintptr_t cmpxchg_emu_u8(volatile u8 *p, uintptr_t old, uintptr_t new)
> > > > > +{
> > > > > +	u32 *p32 = (u32 *)(((uintptr_t)p) & ~0x3);
> > > > > +	int i = ((uintptr_t)p) & 0x3;
> > > > > +	union u8_32 old32;
> > > > > +	union u8_32 new32;
> > > > > +	u32 ret;
> > > > > +
> > > > > +	ret = READ_ONCE(*p32);
> > > > > +	do {
> > > > > +		old32.w = ret;
> > > > > +		if (old32.b[i] != old)
> > > > > +			return old32.b[i];
> > > > > +		new32.w = old32.w;
> > > > > +		new32.b[i] = new;
> > > > > +		instrument_atomic_read_write(p, 1);
> > > > > +		ret = data_race(cmpxchg(p32, old32.w, new32.w)); // Overridden above.
> > > > 
> > > > Just out of curiosity, why is this `data_race` needed? cmpxchg is atomic
> > > > so there should be no chance for a data race?
> > > 
> > > That is what I thought, too.  ;-)
> > > 
> > > The problem is that the cmpxchg() covers 32 bits, and so without that
> > > data_race(), KCSAN would complain about data races with perfectly
> > > legitimate concurrent accesses to the other three bytes.
> > > 
> > > The instrument_atomic_read_write(p, 1) beforehand tells KCSAN to complain
> > > about concurrent accesses, but only to that one byte.
> > > 
> > 
> > Oh, I see. For that purpose, maybe we can just use raw_cmpxchg() here,
> > i.e. a cmpxchg() without any instrument in it. Cc Mark in case I'm
> > missing something.
> > 
> 
> I just realized that the KCSAN instrumentation is already done in
> cmpxchg() layer:
> 
> 	#define cmpxchg(ptr, ...) \
> 	({ \
> 		typeof(ptr) __ai_ptr = (ptr); \
> 		kcsan_mb(); \
> 		instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \
> 		raw_cmpxchg(__ai_ptr, __VA_ARGS__); \
> 	})
> 
> and, this function is lower in the layer, so it shouldn't have the
> instrumentation itself. How about the following (based on today's RCU
> dev branch)?

The raw_cmpxchg() looks nicer than the added data_race()!

One question below, though.

							Thanx, Paul

> Regards,
> Boqun
> 
> -------------------------------------------->8
> Subject: [PATCH] lib: cmpxchg-emu: Make cmpxchg_emu_u8() noinstr
> 
> Currently, cmpxchg_emu_u8() is called via cmpxchg() or raw_cmpxchg()
> which already makes the instrumentation decision:
> 
> * cmpxchg() case:
> 
> 	cmpxchg():
> 	  kcsan_mb();
> 	  instrument_atomic_read_write(...);
> 	  raw_cmpxchg():
> 	    arch_cmpxchg():
> 	      cmpxchg_emu_u8();
> 
> ... should have KCSAN instrumentation.
> 
> * raw_cmpxchg() case:
> 
> 	raw_cmpxchg():
> 	  arch_cmpxchg():
> 	    cmpxchg_emu_u8();
> 
> ... shouldn't have KCSAN instrumentation.
> 
> Therefore it's redundant to put KCSAN instrumentation in
> cmpxchg_emu_u8() (along with the data_race() to get away the
> instrumentation).
> 
> So make cmpxchg_emu_u8() a noinstr function, and remove the KCSAN
> instrumentation inside it.
> 
> Signed-off-by: Boqun Feng <boqun.feng@gmail.com>
> ---
>  include/linux/cmpxchg-emu.h |  4 +++-
>  lib/cmpxchg-emu.c           | 14 ++++++++++----
>  2 files changed, 13 insertions(+), 5 deletions(-)
> 
> diff --git a/include/linux/cmpxchg-emu.h b/include/linux/cmpxchg-emu.h
> index 998deec67740..c4c85f41d9f4 100644
> --- a/include/linux/cmpxchg-emu.h
> +++ b/include/linux/cmpxchg-emu.h
> @@ -10,6 +10,8 @@
>  #ifndef __LINUX_CMPXCHG_EMU_H
>  #define __LINUX_CMPXCHG_EMU_H
>  
> -uintptr_t cmpxchg_emu_u8(volatile u8 *p, uintptr_t old, uintptr_t new);
> +#include <linux/compiler.h>
> +
> +noinstr uintptr_t cmpxchg_emu_u8(volatile u8 *p, uintptr_t old, uintptr_t new);
>  
>  #endif /* __LINUX_CMPXCHG_EMU_H */
> diff --git a/lib/cmpxchg-emu.c b/lib/cmpxchg-emu.c
> index 27f6f97cb60d..788c22cd4462 100644
> --- a/lib/cmpxchg-emu.c
> +++ b/lib/cmpxchg-emu.c
> @@ -21,8 +21,13 @@ union u8_32 {
>  	u32 w;
>  };
>  
> -/* Emulate one-byte cmpxchg() in terms of 4-byte cmpxchg. */
> -uintptr_t cmpxchg_emu_u8(volatile u8 *p, uintptr_t old, uintptr_t new)
> +/*
> + * Emulate one-byte cmpxchg() in terms of 4-byte cmpxchg.
> + *
> + * This function is marked as 'noinstr' as the instrumentation should be done at
> + * outer layer.
> + */
> +noinstr uintptr_t cmpxchg_emu_u8(volatile u8 *p, uintptr_t old, uintptr_t new)
>  {
>  	u32 *p32 = (u32 *)(((uintptr_t)p) & ~0x3);
>  	int i = ((uintptr_t)p) & 0x3;
> @@ -37,8 +42,9 @@ uintptr_t cmpxchg_emu_u8(volatile u8 *p, uintptr_t old, uintptr_t new)
>  			return old32.b[i];
>  		new32.w = old32.w;
>  		new32.b[i] = new;
> -		instrument_atomic_read_write(p, 1);

Don't we need to keep that instrument_atomic_read_write() in order
to allow KCSAN to detect data races with plain C-language reads?
Or is that being handled some other way?

> -		ret = data_race(cmpxchg(p32, old32.w, new32.w)); // Overridden above.
> +
> +		// raw_cmpxchg() is used here to avoid instrumentation.
> +		ret = raw_cmpxchg(p32, old32.w, new32.w); // Overridden above.
>  	} while (ret != old32.w);
>  	return old;
>  }
> -- 
> 2.44.0
>

Re: [PATCH v2 cmpxchg 09/13] lib: Add one-byte emulation function

Posted by Boqun Feng 1 year, 9 months ago

On Tue, May 14, 2024 at 07:22:47AM -0700, Paul E. McKenney wrote:
> On Mon, May 13, 2024 at 02:19:37PM -0700, Boqun Feng wrote:
> > On Mon, May 13, 2024 at 08:57:16AM -0700, Boqun Feng wrote:
> > > On Mon, May 13, 2024 at 08:41:27AM -0700, Paul E. McKenney wrote:
> > > [...]
> > > > > > +#include <linux/types.h>
> > > > > > +#include <linux/export.h>
> > > > > > +#include <linux/instrumented.h>
> > > > > > +#include <linux/atomic.h>
> > > > > > +#include <linux/panic.h>
> > > > > > +#include <linux/bug.h>
> > > > > > +#include <asm-generic/rwonce.h>
> > > > > > +#include <linux/cmpxchg-emu.h>
> > > > > > +
> > > > > > +union u8_32 {
> > > > > > +	u8 b[4];
> > > > > > +	u32 w;
> > > > > > +};
> > > > > > +
> > > > > > +/* Emulate one-byte cmpxchg() in terms of 4-byte cmpxchg. */
> > > > > > +uintptr_t cmpxchg_emu_u8(volatile u8 *p, uintptr_t old, uintptr_t new)
> > > > > > +{
> > > > > > +	u32 *p32 = (u32 *)(((uintptr_t)p) & ~0x3);
> > > > > > +	int i = ((uintptr_t)p) & 0x3;
> > > > > > +	union u8_32 old32;
> > > > > > +	union u8_32 new32;
> > > > > > +	u32 ret;
> > > > > > +
> > > > > > +	ret = READ_ONCE(*p32);
> > > > > > +	do {
> > > > > > +		old32.w = ret;
> > > > > > +		if (old32.b[i] != old)
> > > > > > +			return old32.b[i];
> > > > > > +		new32.w = old32.w;
> > > > > > +		new32.b[i] = new;
> > > > > > +		instrument_atomic_read_write(p, 1);
> > > > > > +		ret = data_race(cmpxchg(p32, old32.w, new32.w)); // Overridden above.
> > > > > 
> > > > > Just out of curiosity, why is this `data_race` needed? cmpxchg is atomic
> > > > > so there should be no chance for a data race?
> > > > 
> > > > That is what I thought, too.  ;-)
> > > > 
> > > > The problem is that the cmpxchg() covers 32 bits, and so without that
> > > > data_race(), KCSAN would complain about data races with perfectly
> > > > legitimate concurrent accesses to the other three bytes.
> > > > 
> > > > The instrument_atomic_read_write(p, 1) beforehand tells KCSAN to complain
> > > > about concurrent accesses, but only to that one byte.
> > > > 
> > > 
> > > Oh, I see. For that purpose, maybe we can just use raw_cmpxchg() here,
> > > i.e. a cmpxchg() without any instrument in it. Cc Mark in case I'm
> > > missing something.
> > > 
> > 
> > I just realized that the KCSAN instrumentation is already done in
> > cmpxchg() layer:
> > 
> > 	#define cmpxchg(ptr, ...) \
> > 	({ \
> > 		typeof(ptr) __ai_ptr = (ptr); \
> > 		kcsan_mb(); \
> > 		instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \
> > 		raw_cmpxchg(__ai_ptr, __VA_ARGS__); \
> > 	})
> > 
> > and, this function is lower in the layer, so it shouldn't have the
> > instrumentation itself. How about the following (based on today's RCU
> > dev branch)?
> 
> The raw_cmpxchg() looks nicer than the added data_race()!
> 
> One question below, though.
> 
> 							Thanx, Paul
> 
> > Regards,
> > Boqun
> > 
> > -------------------------------------------->8
> > Subject: [PATCH] lib: cmpxchg-emu: Make cmpxchg_emu_u8() noinstr
> > 
> > Currently, cmpxchg_emu_u8() is called via cmpxchg() or raw_cmpxchg()
> > which already makes the instrumentation decision:
> > 
> > * cmpxchg() case:
> > 
> > 	cmpxchg():
> > 	  kcsan_mb();
> > 	  instrument_atomic_read_write(...);
> > 	  raw_cmpxchg():
> > 	    arch_cmpxchg():
> > 	      cmpxchg_emu_u8();
> > 
> > ... should have KCSAN instrumentation.
> > 
> > * raw_cmpxchg() case:
> > 
> > 	raw_cmpxchg():
> > 	  arch_cmpxchg():
> > 	    cmpxchg_emu_u8();
> > 
> > ... shouldn't have KCSAN instrumentation.
> > 
> > Therefore it's redundant to put KCSAN instrumentation in
> > cmpxchg_emu_u8() (along with the data_race() to get away the
> > instrumentation).
> > 
> > So make cmpxchg_emu_u8() a noinstr function, and remove the KCSAN
> > instrumentation inside it.
> > 
> > Signed-off-by: Boqun Feng <boqun.feng@gmail.com>
> > ---
> >  include/linux/cmpxchg-emu.h |  4 +++-
> >  lib/cmpxchg-emu.c           | 14 ++++++++++----
> >  2 files changed, 13 insertions(+), 5 deletions(-)
> > 
> > diff --git a/include/linux/cmpxchg-emu.h b/include/linux/cmpxchg-emu.h
> > index 998deec67740..c4c85f41d9f4 100644
> > --- a/include/linux/cmpxchg-emu.h
> > +++ b/include/linux/cmpxchg-emu.h
> > @@ -10,6 +10,8 @@
> >  #ifndef __LINUX_CMPXCHG_EMU_H
> >  #define __LINUX_CMPXCHG_EMU_H
> >  
> > -uintptr_t cmpxchg_emu_u8(volatile u8 *p, uintptr_t old, uintptr_t new);
> > +#include <linux/compiler.h>
> > +
> > +noinstr uintptr_t cmpxchg_emu_u8(volatile u8 *p, uintptr_t old, uintptr_t new);
> >  
> >  #endif /* __LINUX_CMPXCHG_EMU_H */
> > diff --git a/lib/cmpxchg-emu.c b/lib/cmpxchg-emu.c
> > index 27f6f97cb60d..788c22cd4462 100644
> > --- a/lib/cmpxchg-emu.c
> > +++ b/lib/cmpxchg-emu.c
> > @@ -21,8 +21,13 @@ union u8_32 {
> >  	u32 w;
> >  };
> >  
> > -/* Emulate one-byte cmpxchg() in terms of 4-byte cmpxchg. */
> > -uintptr_t cmpxchg_emu_u8(volatile u8 *p, uintptr_t old, uintptr_t new)
> > +/*
> > + * Emulate one-byte cmpxchg() in terms of 4-byte cmpxchg.
> > + *
> > + * This function is marked as 'noinstr' as the instrumentation should be done at
> > + * outer layer.
> > + */
> > +noinstr uintptr_t cmpxchg_emu_u8(volatile u8 *p, uintptr_t old, uintptr_t new)
> >  {
> >  	u32 *p32 = (u32 *)(((uintptr_t)p) & ~0x3);
> >  	int i = ((uintptr_t)p) & 0x3;
> > @@ -37,8 +42,9 @@ uintptr_t cmpxchg_emu_u8(volatile u8 *p, uintptr_t old, uintptr_t new)
> >  			return old32.b[i];
> >  		new32.w = old32.w;
> >  		new32.b[i] = new;
> > -		instrument_atomic_read_write(p, 1);
> 
> Don't we need to keep that instrument_atomic_read_write() in order
> to allow KCSAN to detect data races with plain C-language reads?
> Or is that being handled some other way?
> 

I think that's already covered by the current code, cmpxchg_emu_u8() is
called from cmpxchg() macro, which has a:

	instrument_atomic_read_write(p, sizeof(*p));

in it, and in the cmpxchg((u8*), ..) case, 'sizeof(*p)' is obviously 1
;-)

Regards,
Boqun

> > -		ret = data_race(cmpxchg(p32, old32.w, new32.w)); // Overridden above.
> > +
> > +		// raw_cmpxchg() is used here to avoid instrumentation.
> > +		ret = raw_cmpxchg(p32, old32.w, new32.w); // Overridden above.
> >  	} while (ret != old32.w);
> >  	return old;
> >  }
> > -- 
> > 2.44.0
> >

Re: [PATCH v2 cmpxchg 09/13] lib: Add one-byte emulation function

Posted by Paul E. McKenney 1 year, 9 months ago

On Tue, May 14, 2024 at 07:53:20AM -0700, Boqun Feng wrote:
> On Tue, May 14, 2024 at 07:22:47AM -0700, Paul E. McKenney wrote:
> > On Mon, May 13, 2024 at 02:19:37PM -0700, Boqun Feng wrote:
> > > On Mon, May 13, 2024 at 08:57:16AM -0700, Boqun Feng wrote:
> > > > On Mon, May 13, 2024 at 08:41:27AM -0700, Paul E. McKenney wrote:
> > > > [...]
> > > > > > > +#include <linux/types.h>
> > > > > > > +#include <linux/export.h>
> > > > > > > +#include <linux/instrumented.h>
> > > > > > > +#include <linux/atomic.h>
> > > > > > > +#include <linux/panic.h>
> > > > > > > +#include <linux/bug.h>
> > > > > > > +#include <asm-generic/rwonce.h>
> > > > > > > +#include <linux/cmpxchg-emu.h>
> > > > > > > +
> > > > > > > +union u8_32 {
> > > > > > > +	u8 b[4];
> > > > > > > +	u32 w;
> > > > > > > +};
> > > > > > > +
> > > > > > > +/* Emulate one-byte cmpxchg() in terms of 4-byte cmpxchg. */
> > > > > > > +uintptr_t cmpxchg_emu_u8(volatile u8 *p, uintptr_t old, uintptr_t new)
> > > > > > > +{
> > > > > > > +	u32 *p32 = (u32 *)(((uintptr_t)p) & ~0x3);
> > > > > > > +	int i = ((uintptr_t)p) & 0x3;
> > > > > > > +	union u8_32 old32;
> > > > > > > +	union u8_32 new32;
> > > > > > > +	u32 ret;
> > > > > > > +
> > > > > > > +	ret = READ_ONCE(*p32);
> > > > > > > +	do {
> > > > > > > +		old32.w = ret;
> > > > > > > +		if (old32.b[i] != old)
> > > > > > > +			return old32.b[i];
> > > > > > > +		new32.w = old32.w;
> > > > > > > +		new32.b[i] = new;
> > > > > > > +		instrument_atomic_read_write(p, 1);
> > > > > > > +		ret = data_race(cmpxchg(p32, old32.w, new32.w)); // Overridden above.
> > > > > > 
> > > > > > Just out of curiosity, why is this `data_race` needed? cmpxchg is atomic
> > > > > > so there should be no chance for a data race?
> > > > > 
> > > > > That is what I thought, too.  ;-)
> > > > > 
> > > > > The problem is that the cmpxchg() covers 32 bits, and so without that
> > > > > data_race(), KCSAN would complain about data races with perfectly
> > > > > legitimate concurrent accesses to the other three bytes.
> > > > > 
> > > > > The instrument_atomic_read_write(p, 1) beforehand tells KCSAN to complain
> > > > > about concurrent accesses, but only to that one byte.
> > > > > 
> > > > 
> > > > Oh, I see. For that purpose, maybe we can just use raw_cmpxchg() here,
> > > > i.e. a cmpxchg() without any instrument in it. Cc Mark in case I'm
> > > > missing something.
> > > > 
> > > 
> > > I just realized that the KCSAN instrumentation is already done in
> > > cmpxchg() layer:
> > > 
> > > 	#define cmpxchg(ptr, ...) \
> > > 	({ \
> > > 		typeof(ptr) __ai_ptr = (ptr); \
> > > 		kcsan_mb(); \
> > > 		instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \
> > > 		raw_cmpxchg(__ai_ptr, __VA_ARGS__); \
> > > 	})
> > > 
> > > and, this function is lower in the layer, so it shouldn't have the
> > > instrumentation itself. How about the following (based on today's RCU
> > > dev branch)?
> > 
> > The raw_cmpxchg() looks nicer than the added data_race()!
> > 
> > One question below, though.
> > 
> > 							Thanx, Paul
> > 
> > > Regards,
> > > Boqun
> > > 
> > > -------------------------------------------->8
> > > Subject: [PATCH] lib: cmpxchg-emu: Make cmpxchg_emu_u8() noinstr
> > > 
> > > Currently, cmpxchg_emu_u8() is called via cmpxchg() or raw_cmpxchg()
> > > which already makes the instrumentation decision:
> > > 
> > > * cmpxchg() case:
> > > 
> > > 	cmpxchg():
> > > 	  kcsan_mb();
> > > 	  instrument_atomic_read_write(...);
> > > 	  raw_cmpxchg():
> > > 	    arch_cmpxchg():
> > > 	      cmpxchg_emu_u8();
> > > 
> > > ... should have KCSAN instrumentation.
> > > 
> > > * raw_cmpxchg() case:
> > > 
> > > 	raw_cmpxchg():
> > > 	  arch_cmpxchg():
> > > 	    cmpxchg_emu_u8();
> > > 
> > > ... shouldn't have KCSAN instrumentation.
> > > 
> > > Therefore it's redundant to put KCSAN instrumentation in
> > > cmpxchg_emu_u8() (along with the data_race() to get away the
> > > instrumentation).
> > > 
> > > So make cmpxchg_emu_u8() a noinstr function, and remove the KCSAN
> > > instrumentation inside it.
> > > 
> > > Signed-off-by: Boqun Feng <boqun.feng@gmail.com>
> > > ---
> > >  include/linux/cmpxchg-emu.h |  4 +++-
> > >  lib/cmpxchg-emu.c           | 14 ++++++++++----
> > >  2 files changed, 13 insertions(+), 5 deletions(-)
> > > 
> > > diff --git a/include/linux/cmpxchg-emu.h b/include/linux/cmpxchg-emu.h
> > > index 998deec67740..c4c85f41d9f4 100644
> > > --- a/include/linux/cmpxchg-emu.h
> > > +++ b/include/linux/cmpxchg-emu.h
> > > @@ -10,6 +10,8 @@
> > >  #ifndef __LINUX_CMPXCHG_EMU_H
> > >  #define __LINUX_CMPXCHG_EMU_H
> > >  
> > > -uintptr_t cmpxchg_emu_u8(volatile u8 *p, uintptr_t old, uintptr_t new);
> > > +#include <linux/compiler.h>
> > > +
> > > +noinstr uintptr_t cmpxchg_emu_u8(volatile u8 *p, uintptr_t old, uintptr_t new);
> > >  
> > >  #endif /* __LINUX_CMPXCHG_EMU_H */
> > > diff --git a/lib/cmpxchg-emu.c b/lib/cmpxchg-emu.c
> > > index 27f6f97cb60d..788c22cd4462 100644
> > > --- a/lib/cmpxchg-emu.c
> > > +++ b/lib/cmpxchg-emu.c
> > > @@ -21,8 +21,13 @@ union u8_32 {
> > >  	u32 w;
> > >  };
> > >  
> > > -/* Emulate one-byte cmpxchg() in terms of 4-byte cmpxchg. */
> > > -uintptr_t cmpxchg_emu_u8(volatile u8 *p, uintptr_t old, uintptr_t new)
> > > +/*
> > > + * Emulate one-byte cmpxchg() in terms of 4-byte cmpxchg.
> > > + *
> > > + * This function is marked as 'noinstr' as the instrumentation should be done at
> > > + * outer layer.
> > > + */
> > > +noinstr uintptr_t cmpxchg_emu_u8(volatile u8 *p, uintptr_t old, uintptr_t new)
> > >  {
> > >  	u32 *p32 = (u32 *)(((uintptr_t)p) & ~0x3);
> > >  	int i = ((uintptr_t)p) & 0x3;
> > > @@ -37,8 +42,9 @@ uintptr_t cmpxchg_emu_u8(volatile u8 *p, uintptr_t old, uintptr_t new)
> > >  			return old32.b[i];
> > >  		new32.w = old32.w;
> > >  		new32.b[i] = new;
> > > -		instrument_atomic_read_write(p, 1);
> > 
> > Don't we need to keep that instrument_atomic_read_write() in order
> > to allow KCSAN to detect data races with plain C-language reads?
> > Or is that being handled some other way?
> > 
> 
> I think that's already covered by the current code, cmpxchg_emu_u8() is
> called from cmpxchg() macro, which has a:
> 
> 	instrument_atomic_read_write(p, sizeof(*p));
> 
> in it, and in the cmpxchg((u8*), ..) case, 'sizeof(*p)' is obviously 1
> ;-)

I believe you, but could you nevertheless please test it?  ;-)

							Thanx, Paul

> Regards,
> Boqun
> 
> > > -		ret = data_race(cmpxchg(p32, old32.w, new32.w)); // Overridden above.
> > > +
> > > +		// raw_cmpxchg() is used here to avoid instrumentation.
> > > +		ret = raw_cmpxchg(p32, old32.w, new32.w); // Overridden above.
> > >  	} while (ret != old32.w);
> > >  	return old;
> > >  }
> > > -- 
> > > 2.44.0
> > >