Add common headers (atomic, bitops, barrier and locking) for basic
kvx support.
CC: Will Deacon <will@kernel.org>
CC: Peter Zijlstra <peterz@infradead.org>
CC: Boqun Feng <boqun.feng@gmail.com>
CC: Mark Rutland <mark.rutland@arm.com>
CC: linux-kernel@vger.kernel.org
Co-developed-by: Clement Leger <clement.leger@bootlin.com>
Signed-off-by: Clement Leger <clement.leger@bootlin.com>
Co-developed-by: Jules Maselbas <jmaselbas@kalray.eu>
Signed-off-by: Jules Maselbas <jmaselbas@kalray.eu>
Co-developed-by: Julian Vetter <jvetter@kalray.eu>
Signed-off-by: Julian Vetter <jvetter@kalray.eu>
Co-developed-by: Julien Villette <jvillette@kalray.eu>
Signed-off-by: Julien Villette <jvillette@kalray.eu>
Co-developed-by: Yann Sionneau <ysionneau@kalray.eu>
Signed-off-by: Yann Sionneau <ysionneau@kalray.eu>
---
arch/kvx/include/asm/atomic.h | 104 +++++++++++++++++
arch/kvx/include/asm/barrier.h | 15 +++
arch/kvx/include/asm/bitops.h | 207 +++++++++++++++++++++++++++++++++
arch/kvx/include/asm/bitrev.h | 32 +++++
arch/kvx/include/asm/cmpxchg.h | 185 +++++++++++++++++++++++++++++
5 files changed, 543 insertions(+)
create mode 100644 arch/kvx/include/asm/atomic.h
create mode 100644 arch/kvx/include/asm/barrier.h
create mode 100644 arch/kvx/include/asm/bitops.h
create mode 100644 arch/kvx/include/asm/bitrev.h
create mode 100644 arch/kvx/include/asm/cmpxchg.h
diff --git a/arch/kvx/include/asm/atomic.h b/arch/kvx/include/asm/atomic.h
new file mode 100644
index 000000000000..eb8acbcbc70d
--- /dev/null
+++ b/arch/kvx/include/asm/atomic.h
@@ -0,0 +1,104 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (C) 2017-2023 Kalray Inc.
+ * Author(s): Clement Leger
+ */
+
+#ifndef _ASM_KVX_ATOMIC_H
+#define _ASM_KVX_ATOMIC_H
+
+#include <linux/types.h>
+
+#include <asm/cmpxchg.h>
+
+#define ATOMIC64_INIT(i) { (i) }
+
+#define arch_atomic64_cmpxchg(v, old, new) (arch_cmpxchg(&((v)->counter), old, new))
+#define arch_atomic64_xchg(v, new) (arch_xchg(&((v)->counter), new))
+
+static inline long arch_atomic64_read(const atomic64_t *v)
+{
+ return v->counter;
+}
+
+static inline void arch_atomic64_set(atomic64_t *v, long i)
+{
+ v->counter = i;
+}
+
+#define ATOMIC64_RETURN_OP(op, c_op) \
+static inline long arch_atomic64_##op##_return(long i, atomic64_t *v) \
+{ \
+ long new, old, ret; \
+ \
+ do { \
+ old = v->counter; \
+ new = old c_op i; \
+ ret = arch_cmpxchg(&v->counter, old, new); \
+ } while (ret != old); \
+ \
+ return new; \
+}
+
+#define ATOMIC64_OP(op, c_op) \
+static inline void arch_atomic64_##op(long i, atomic64_t *v) \
+{ \
+ long new, old, ret; \
+ \
+ do { \
+ old = v->counter; \
+ new = old c_op i; \
+ ret = arch_cmpxchg(&v->counter, old, new); \
+ } while (ret != old); \
+}
+
+#define ATOMIC64_FETCH_OP(op, c_op) \
+static inline long arch_atomic64_fetch_##op(long i, atomic64_t *v) \
+{ \
+ long new, old, ret; \
+ \
+ do { \
+ old = v->counter; \
+ new = old c_op i; \
+ ret = arch_cmpxchg(&v->counter, old, new); \
+ } while (ret != old); \
+ \
+ return old; \
+}
+
+#define ATOMIC64_OPS(op, c_op) \
+ ATOMIC64_OP(op, c_op) \
+ ATOMIC64_RETURN_OP(op, c_op) \
+ ATOMIC64_FETCH_OP(op, c_op)
+
+ATOMIC64_OPS(and, &)
+ATOMIC64_OPS(or, |)
+ATOMIC64_OPS(xor, ^)
+ATOMIC64_OPS(add, +)
+ATOMIC64_OPS(sub, -)
+
+#undef ATOMIC64_OPS
+#undef ATOMIC64_FETCH_OP
+#undef ATOMIC64_OP
+
+static inline int arch_atomic_add_return(int i, atomic_t *v)
+{
+ int new, old, ret;
+
+ do {
+ old = v->counter;
+ new = old + i;
+ ret = arch_cmpxchg(&v->counter, old, new);
+ } while (ret != old);
+
+ return new;
+}
+
+static inline int arch_atomic_sub_return(int i, atomic_t *v)
+{
+ return arch_atomic_add_return(-i, v);
+}
+
+#include <asm-generic/atomic.h>
+
+#endif /* _ASM_KVX_ATOMIC_H */
diff --git a/arch/kvx/include/asm/barrier.h b/arch/kvx/include/asm/barrier.h
new file mode 100644
index 000000000000..371f1c70746d
--- /dev/null
+++ b/arch/kvx/include/asm/barrier.h
@@ -0,0 +1,15 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (C) 2017-2023 Kalray Inc.
+ * Author(s): Clement Leger
+ */
+
+#ifndef _ASM_KVX_BARRIER_H
+#define _ASM_KVX_BARRIER_H
+
+/* fence is sufficient to guarantee write ordering */
+#define mb() __builtin_kvx_fence()
+
+#include <asm-generic/barrier.h>
+
+#endif /* _ASM_KVX_BARRIER_H */
diff --git a/arch/kvx/include/asm/bitops.h b/arch/kvx/include/asm/bitops.h
new file mode 100644
index 000000000000..03b861169ce8
--- /dev/null
+++ b/arch/kvx/include/asm/bitops.h
@@ -0,0 +1,207 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (C) 2017-2023 Kalray Inc.
+ * Author(s): Clement Leger
+ * Yann Sionneau
+ */
+
+#ifndef _ASM_KVX_BITOPS_H
+#define _ASM_KVX_BITOPS_H
+
+#ifdef __KERNEL__
+
+#ifndef _LINUX_BITOPS_H
+#error only <linux/bitops.h> can be included directly
+#endif
+
+#include <asm/cmpxchg.h>
+
+static inline unsigned long __ffs(unsigned long word);
+
+#include <asm-generic/bitops/const_hweight.h>
+#include <asm-generic/bitops/ffz.h>
+#include <asm-generic/bitops/non-atomic.h>
+#include <asm-generic/bitops/lock.h>
+#include <asm-generic/bitops/sched.h>
+
+static inline int fls(int x)
+{
+ return 32 - __builtin_kvx_clzw(x);
+}
+
+static inline int fls64(__u64 x)
+{
+ return 64 - __builtin_kvx_clzd(x);
+}
+
+/**
+ * __ffs - find first set bit in word
+ * @word: The word to search
+ *
+ * Undefined if no set bit exists, so code should check against 0 first.
+ */
+static inline unsigned long __ffs(unsigned long word)
+{
+ return __builtin_kvx_ctzd(word);
+}
+
+/**
+ * __fls - find last set bit in word
+ * @word: The word to search
+ *
+ * Undefined if no set bit exists, so code should check against 0 first.
+ */
+static inline unsigned long __fls(unsigned long word)
+{
+ return 63 - __builtin_kvx_clzd(word);
+}
+
+
+/**
+ * ffs - find first set bit in word
+ * @x: the word to search
+ *
+ * This is defined the same way as the libc and compiler builtin ffs
+ * routines, therefore differs in spirit from the other bitops.
+ *
+ * ffs(value) returns 0 if value is 0 or the position of the first
+ * set bit if value is nonzero. The first (least significant) bit
+ * is at position 1.
+ */
+static inline int ffs(int x)
+{
+ if (!x)
+ return 0;
+ return __builtin_kvx_ctzw(x) + 1;
+}
+
+
+static inline unsigned int __arch_hweight32(unsigned int w)
+{
+ unsigned int count;
+
+ asm volatile ("cbsw %0 = %1\n\t;;"
+ : "=r" (count)
+ : "r" (w));
+
+ return count;
+}
+
+static inline unsigned int __arch_hweight64(__u64 w)
+{
+ unsigned int count;
+
+ asm volatile ("cbsd %0 = %1\n\t;;"
+ : "=r" (count)
+ : "r" (w));
+
+ return count;
+}
+
+static inline unsigned int __arch_hweight16(unsigned int w)
+{
+ return __arch_hweight32(w & 0xffff);
+}
+
+static inline unsigned int __arch_hweight8(unsigned int w)
+{
+ return __arch_hweight32(w & 0xff);
+}
+
+
+/* Bitmask modifiers */
+#define __NOP(x) (x)
+#define __NOT(x) (~(x))
+
+
+#define __test_and_op_bit(nr, addr, op, mod) \
+({ \
+ unsigned long __mask = BIT_MASK(nr); \
+ unsigned long __new, __old, __ret; \
+ do { \
+ __old = *(&addr[BIT_WORD(nr)]); \
+ __new = __old op mod(__mask); \
+ __ret = cmpxchg(addr, __old, __new); \
+ } while (__ret != __old); \
+ (__old & __mask); \
+})
+
+/**
+ * test_and_set_bit - Set a bit and return its old value
+ * @nr: Bit to set
+ * @addr: Address to count from
+ *
+ * This operation may be reordered on other architectures than x86.
+ */
+static inline int test_and_set_bit(int nr, volatile unsigned long *addr)
+{
+ return __test_and_op_bit(nr, addr, |, __NOP) != 0;
+}
+
+/**
+ * test_and_clear_bit - Clear a bit and return its old value
+ * @nr: Bit to clear
+ * @addr: Address to count from
+ *
+ * This operation can be reordered on other architectures other than x86.
+ */
+static inline int test_and_clear_bit(int nr, volatile unsigned long *addr)
+{
+ return __test_and_op_bit(nr, addr, &, __NOT);
+}
+
+#define __atomic_op(nr, addr, op, mod) \
+({ \
+ unsigned long __new, __old, __ret; \
+ __ret = addr[BIT_WORD(nr)]; \
+ do { \
+ __old = __ret; \
+ __new = __old op mod(BIT_MASK(nr)); \
+ __ret = cmpxchg(&addr[BIT_WORD(nr)], __old, __new); \
+ } while (__ret != __old); \
+})
+
+/**
+ * set_bit - Atomically set a bit in memory
+ * @nr: the bit to set
+ * @addr: the address to start counting from
+ *
+ * Note that @nr may be almost arbitrarily large; this function is not
+ * restricted to acting on a single-word quantity.
+ */
+static inline void set_bit(int nr, volatile unsigned long *addr)
+{
+ __atomic_op(nr, addr, |, __NOP);
+}
+
+/**
+ * clear_bit - Clears a bit in memory
+ * @nr: Bit to clear
+ * @addr: Address to start counting from
+ */
+static inline void clear_bit(int nr, volatile unsigned long *addr)
+{
+ __atomic_op(nr, addr, &, __NOT);
+}
+
+/**
+ * change_bit - Toggle a bit in memory
+ * @nr: Bit to change
+ * @addr: Address to start counting from
+ *
+ * Note that @nr may be almost arbitrarily large; this function is not
+ * restricted to acting on a single-word quantity.
+ */
+static inline void change_bit(int nr, volatile unsigned long *addr)
+{
+ __atomic_op(nr, addr, ^, __NOP);
+}
+
+#include <asm-generic/bitops/lock.h>
+#include <asm-generic/bitops/non-atomic.h>
+#include <asm-generic/bitops/le.h>
+#include <asm-generic/bitops/ext2-atomic.h>
+
+#endif
+
+#endif
diff --git a/arch/kvx/include/asm/bitrev.h b/arch/kvx/include/asm/bitrev.h
new file mode 100644
index 000000000000..79865081905a
--- /dev/null
+++ b/arch/kvx/include/asm/bitrev.h
@@ -0,0 +1,32 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (C) 2017-2023 Kalray Inc.
+ * Author(s): Clement Leger
+ */
+
+#ifndef _ASM_KVX_BITREV_H
+#define _ASM_KVX_BITREV_H
+
+#include <linux/swab.h>
+
+/* Bit reversal constant for matrix multiply */
+#define BIT_REVERSE 0x0102040810204080ULL
+
+static __always_inline __attribute_const__ u32 __arch_bitrev32(u32 x)
+{
+ /* Reverse all bits for each bytes and then byte-reverse the 32 LSB */
+ return swab32(__builtin_kvx_sbmm8(BIT_REVERSE, x));
+}
+
+static __always_inline __attribute_const__ u16 __arch_bitrev16(u16 x)
+{
+ /* Reverse all bits for each bytes and then byte-reverse the 16 LSB */
+ return swab16(__builtin_kvx_sbmm8(BIT_REVERSE, x));
+}
+
+static __always_inline __attribute_const__ u8 __arch_bitrev8(u8 x)
+{
+ return __builtin_kvx_sbmm8(BIT_REVERSE, x);
+}
+
+#endif
diff --git a/arch/kvx/include/asm/cmpxchg.h b/arch/kvx/include/asm/cmpxchg.h
new file mode 100644
index 000000000000..b1d128b060a2
--- /dev/null
+++ b/arch/kvx/include/asm/cmpxchg.h
@@ -0,0 +1,185 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (C) 2017-2023 Kalray Inc.
+ * Author(s): Clement Leger
+ * Yann Sionneau
+ */
+
+#ifndef _ASM_KVX_CMPXCHG_H
+#define _ASM_KVX_CMPXCHG_H
+
+#include <linux/bits.h>
+#include <linux/types.h>
+#include <linux/build_bug.h>
+
+/*
+ * On kvx, we have a boolean compare and swap which means that the operation
+ * returns only the success of operation.
+ * If operation succeed, this is simple, we just need to return the provided
+ * old value. However, if it fails, we need to load the value to return it for
+ * the caller. If the loaded value is different from the "old" provided by the
+ * caller, we can return it since it will means it failed.
+ * However, if for some reason the value we read is equal to the old value
+ * provided by the caller, we can't simply return it or the caller will think it
+ * succeeded. So if the value we read is the same as the "old" provided by
+ * the caller, we try again until either we succeed or we fail with a different
+ * value than the provided one.
+ */
+#define __cmpxchg(ptr, old, new, op_suffix, load_suffix) \
+({ \
+ register unsigned long __rn asm("r62"); \
+ register unsigned long __ro asm("r63"); \
+ __asm__ __volatile__ ( \
+ /* Fence to guarantee previous store to be committed */ \
+ "fence\n" \
+ /* Init "expect" with previous value */ \
+ "copyd $r63 = %[rOld]\n" \
+ ";;\n" \
+ "1:\n" \
+ /* Init "update" value with new */ \
+ "copyd $r62 = %[rNew]\n" \
+ ";;\n" \
+ "acswap" #op_suffix " 0[%[rPtr]], $r62r63\n" \
+ ";;\n" \
+ /* if acswap succeed, simply return */ \
+ "cb.dnez $r62? 2f\n" \
+ ";;\n" \
+ /* We failed, load old value */ \
+ "l" #op_suffix #load_suffix" $r63 = 0[%[rPtr]]\n" \
+ ";;\n" \
+ /* Check if equal to "old" one */ \
+ "comp" #op_suffix ".ne $r62 = $r63, %[rOld]\n" \
+ ";;\n" \
+ /* If different from "old", return it to caller */ \
+ "cb.deqz $r62? 1b\n" \
+ ";;\n" \
+ "2:\n" \
+ : "+r" (__rn), "+r" (__ro) \
+ : [rPtr] "r" (ptr), [rOld] "r" (old), [rNew] "r" (new) \
+ : "memory"); \
+ (__ro); \
+})
+
+#define arch_cmpxchg(ptr, o, n) \
+({ \
+ unsigned long __ret; \
+ BUILD_BUG_ON(sizeof(*(ptr)) != 4 && sizeof(*(ptr)) != 8); \
+ switch (sizeof(*(ptr))) { \
+ case 4: \
+ __ret = __cmpxchg((ptr), (o), (n), w, s); \
+ break; \
+ case 8: \
+ __ret = __cmpxchg((ptr), (o), (n), d, ); \
+ break; \
+ } \
+ (__typeof__(*(ptr))) (__ret); \
+})
+
+/*
+ * In order to optimize xchg for 16 byte, we can use insf/extfs if we know the
+ * bounds. This way, we only take one more bundle than standard xchg.
+ * We simply do a read modify acswap on a 32 bit word.
+ */
+#define __xchg_small_asm(ptr, new, start, stop) \
+({ \
+ register unsigned long __rn asm("r62"); \
+ register unsigned long __ro asm("r63"); \
+ __asm__ __volatile__ ( \
+ "fence\n" \
+ ";;\n" \
+ "1:\n" \
+ /* Load original old value */ \
+ "lws $r62 = 0[%[rPtr]]\n" \
+ ";;\n" \
+ /* Copy read value into "expect" */ \
+ "copyd $r63 = $r62\n" \
+ /* Prepare new value with insf */ \
+ "insf $r62 = %[rNew], " #stop "," #start "\n" \
+ ";;\n" \
+ /* Try compare & swap with loaded value */ \
+ "acswapw 0[%[rPtr]], $r62r63\n" \
+ ";;\n" \
+ /* Did we succeed ?, if no, try again */ \
+ "cb.deqz $r62? 1b\n" \
+ /* Extract old value for ret value */ \
+ "extfs $r63 = $r63, " #stop "," #start "\n" \
+ ";;\n" \
+ : "+r" (__rn), "+r" (__ro) \
+ : [rPtr] "r" (ptr), [rNew] "r" (new) \
+ : "memory"); \
+ (__ro); \
+})
+
+/* Needed for generic qspinlock implementation */
+static inline unsigned long xchg_u16(volatile void *ptr, unsigned long new,
+ int size)
+{
+ int off = (unsigned long)ptr % sizeof(u32);
+ volatile u32 *p = ptr - off;
+
+ /*
+ * GCC is smart enough to eliminate the dead branches by detecting
+ * the offset statically
+ */
+ if (off == 0)
+ return __xchg_small_asm(p, new, 0, 15);
+ else
+ return __xchg_small_asm(p, new, 16, 31);
+}
+
+#define __xchg_asm(ptr, new, op_suffix, load_suffix) \
+({ \
+ register unsigned long __rn asm("r62") = (unsigned long) (new); \
+ register unsigned long __ro asm("r63"); \
+ __asm__ __volatile__ ( \
+ "fence\n" \
+ ";;\n" \
+ "1:\n" \
+ /* Load original old value */ \
+ "l" #op_suffix #load_suffix " $r63 = 0[%[rPtr]]\n" \
+ ";;\n" \
+ /* Try compare & swap with loaded value */ \
+ "acswap" #op_suffix " 0[%[rPtr]], $r62r63\n" \
+ ";;\n" \
+ /* Did we succeed ?, if no, try again */ \
+ "cb.deqz $r62? 1b\n" \
+ /* $r62 has been cloberred by acswap, restore it */ \
+ "copyd $r62 = %[rNew]\n" \
+ ";;\n" \
+ : "+r" (__rn), "+r" (__ro) \
+ : [rPtr] "r" (ptr), [rNew] "r" (new) \
+ : "memory"); \
+ (__ro); \
+})
+
+/*
+ * This function doesn't exist, so you'll get a linker error if
+ * something tries to do an invalidly-sized xchg().
+ */
+extern unsigned long __xchg_called_with_bad_pointer(void)
+ __compiletime_error("Bad argument size for xchg");
+
+static inline unsigned long __xchg(volatile void *ptr, unsigned long val,
+ int size)
+{
+ switch (size) {
+ case 2:
+ return xchg_u16(ptr, val, size);
+ case 4:
+ return __xchg_asm(ptr, val, w, s);
+ case 8:
+ return __xchg_asm(ptr, val, d, );
+ }
+ __xchg_called_with_bad_pointer();
+
+ return val;
+}
+
+#define arch_xchg(ptr, with) \
+ ({ \
+ (__typeof__(*(ptr))) __xchg((ptr), \
+ (unsigned long)(with), \
+ sizeof(*(ptr))); \
+ })
+
+#endif
--
2.37.2
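[Editorial note: the 16-bit exchange in cmpxchg.h above is the least obvious part of the patch, so here is a plain C sketch of the same strategy: a read-modify-compare-and-swap on the aligned 32-bit word containing the halfword. It assumes little-endian layout, matching the bit ranges (0..15 / 16..31) used by the insf/extfs assembly above. It is an illustration of the algorithm, not the kvx implementation, and the function name is made up for this sketch.]

```c
#include <linux/atomic.h>	/* cmpxchg() */
#include <linux/compiler.h>	/* READ_ONCE() */
#include <linux/types.h>

/*
 * Sketch only: exchange a 16-bit value by doing cmpxchg() on the aligned
 * 32-bit word that contains it, mirroring what __xchg_small_asm() does with
 * insf/extfs.  Assumes little-endian layout, so the halfword at byte offset
 * 0 occupies bits [15:0] and the one at offset 2 occupies bits [31:16].
 */
static inline u16 xchg_u16_sketch(volatile u16 *ptr, u16 new)
{
	unsigned long off = (unsigned long)ptr % sizeof(u32);
	volatile u32 *word = (volatile u32 *)((unsigned long)ptr - off);
	unsigned int shift = off * 8;		/* 0 or 16 */
	u32 mask = 0xffffU << shift;
	u32 old, repl;

	do {
		old = READ_ONCE(*word);
		repl = (old & ~mask) | ((u32)new << shift);
	} while (cmpxchg(word, old, repl) != old);

	return (old & mask) >> shift;		/* previous halfword */
}
```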
On Tue, Jan 03, 2023 at 05:43:39PM +0100, Yann Sionneau wrote:
> Add common headers (atomic, bitops, barrier and locking) for basic
> kvx support.
>
> [...]
>
> diff --git a/arch/kvx/include/asm/atomic.h b/arch/kvx/include/asm/atomic.h
> new file mode 100644
> index 000000000000..eb8acbcbc70d
> --- /dev/null
> +++ b/arch/kvx/include/asm/atomic.h
> [...]
> +static inline long arch_atomic64_read(const atomic64_t *v)
> +{
> + return v->counter;
> +}

This is a plain read, and is *not* atomic.

The compiler can replay a plain read an arbitrary number of times, and is
permitted to split it into smaller accesses.

At minimum this needs to be

    READ_ONCE(v->counter)

... which will prevent replay. Whether or not that's actually atomic will
depend on the instructions the compiler generates, and how those instructions
are defined in your architecture.

Do you have a single instruction that can read a 64-bit memory location, and
is it guaranteed to result in a single access that cannot be split?

> +static inline void arch_atomic64_set(atomic64_t *v, long i)
> +{
> + v->counter = i;
> +}

Same comments as for arch_atomic64_read(); at minimum this needs to be:

    WRITE_ONCE(v->counter, i)

... but that may or may not actually be atomic on your architecture.

> +#define ATOMIC64_RETURN_OP(op, c_op) \
> +static inline long arch_atomic64_##op##_return(long i, atomic64_t *v) \
> [...]
> +#define ATOMIC64_OP(op, c_op) \
> +static inline void arch_atomic64_##op(long i, atomic64_t *v) \
> [...]
> +#define ATOMIC64_FETCH_OP(op, c_op) \
> +static inline long arch_atomic64_fetch_##op(long i, atomic64_t *v) \
> [...]

These look ok, but it'd be nicer if we could teach the generic atomic64 code
to do this, like the generic atomic code does.

We could rename the existing asm-generic/atomic64 code to atomic64-spinlock,
and add a separate atomic64-cmpxchg (and likewise for the 32-bit code) to
make that clearer and consistent.

> +static inline int arch_atomic_add_return(int i, atomic_t *v)
> [...]
> +static inline int arch_atomic_sub_return(int i, atomic_t *v)
> +{
> + return arch_atomic_add_return(-i, v);
> +}

Likewise for these two.

> +#define __test_and_op_bit(nr, addr, op, mod) \
> +({ \
> + unsigned long __mask = BIT_MASK(nr); \
> + unsigned long __new, __old, __ret; \
> + do { \
> + __old = *(&addr[BIT_WORD(nr)]); \
> + __new = __old op mod(__mask); \
> + __ret = cmpxchg(addr, __old, __new); \
> + } while (__ret != __old); \
> + (__old & __mask); \
> +})

Please use <asm-generic/bitops/atomic.h> which should give you the common
bit operations "for free" atop your regular atomics.

[...]

> +#define __cmpxchg(ptr, old, new, op_suffix, load_suffix) \
> +({ \
> + register unsigned long __rn asm("r62"); \
> + register unsigned long __ro asm("r63"); \

Why do you need to specify the exact registers?

e.g. does some instruction use these implicitly, or do you need two adjacent
registers for encoding reasons?

> + __asm__ __volatile__ ( \
> + /* Fence to guarantee previous store to be committed */ \
> + "fence\n" \

This implies you can implement the relaxed form of cmpxchg().

What ordering do you get by default, and do you have any other barriers (e.g.
for acquire/release semantics), or just "fence" ?

Thanks,
Mark.
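[Editorial note: a minimal sketch of the change Mark asks for above, assuming (as the thread later confirms) that the kvx 64-bit load/store instructions are single accesses that cannot be split. The hunk below is illustrative only, not the actual v2 patch.]

```c
#include <linux/compiler.h>	/* READ_ONCE() / WRITE_ONCE() */
#include <linux/types.h>	/* atomic64_t */

static inline long arch_atomic64_read(const atomic64_t *v)
{
	/* Forces a single, non-replayable access to v->counter. */
	return READ_ONCE(v->counter);
}

static inline void arch_atomic64_set(atomic64_t *v, long i)
{
	WRITE_ONCE(v->counter, i);
}
```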
Hi Mark,

On Wed, Jan 04, 2023 at 09:53:24AM +0000, Mark Rutland wrote:
> On Tue, Jan 03, 2023 at 05:43:39PM +0100, Yann Sionneau wrote:
> > Add common headers (atomic, bitops, barrier and locking) for basic
> > kvx support.
> >
> > [...]
> >
> > +static inline long arch_atomic64_read(const atomic64_t *v)
> > +{
> > + return v->counter;
> > +}
>
> This is a plain read, and is *not* atomic.
>
> The compiler can replay a plain read an arbitrary number of times, and is
> permitted to split it into smaller accesses.
>
> At minimum this needs to be
>
>     READ_ONCE(v->counter)
>
> ... which will prevent replay. Whether or not that's actually atomic will
> depend on the instructions the compiler generates, and how those
> instructions are defined in your architecture.

Good point, we are going to use {READ,WRITE}_ONCE macros.

> Do you have a single instruction that can read a 64-bit memory location,
> and is it guaranteed to result in a single access that cannot be split?

We do have a single instruction that can read a 64-bit memory location
(supported sizes are 8, 16, 32, 64, 128, 256).
All accesses are guaranteed to not be split, unless they are misaligned.
Furthermore, misaligned write accesses crossing a 32-byte boundary may
complete in a non-atomic way.

> These look ok, but it'd be nicer if we could teach the generic atomic64
> code to do this, like the generic atomic code does.
>
> We could rename the existing asm-generic/atomic64 code to atomic64-spinlock,
> and add a separate atomic64-cmpxchg (and likewise for the 32-bit code) to
> make that clearer and consistent.

I am not sure what this implies and how big this change might be,
but I'll take a look at this.

> Please use <asm-generic/bitops/atomic.h> which should give you the common
> bit operations "for free" atop your regular atomics.

Yes.

> > +#define __cmpxchg(ptr, old, new, op_suffix, load_suffix) \
> > +({ \
> > + register unsigned long __rn asm("r62"); \
> > + register unsigned long __ro asm("r63"); \
>
> Why do you need to specify the exact registers?

r62 and r63 are hardcoded in the inline assembly, they are caller saved.
I have a C implementation that uses builtins, however this is not merged
into our tree yet (but I want to).

> e.g. does some instruction use these implicitly, or do you need two
> adjacent registers for encoding reasons?

The atomic compare and swap (acswap) instruction needs a register "pair",
which can only exist with "adjacent" registers: $r0r1, $r2r3, etc.

> > + __asm__ __volatile__ ( \
> > + /* Fence to guarantee previous store to be committed */ \
> > + "fence\n" \
>
> This implies you can implement the relaxed form of cmpxchg().
>
> What ordering do you get by default, and do you have any other barriers
> (e.g. for acquire/release semantics), or just "fence" ?

We have two barrier types:
 - fence: ensures that all uncached memory operations are committed to
   memory, typically used to ensure a write to memory is visible to
   other cores.
 - barrier: flushes the core instruction pipeline and memory system.

Thanks,
-- Jules
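[Editorial note: for reference, this is roughly the shape the bitops header takes once the open-coded cmpxchg loops are replaced by the generic layer Mark points at. It is a sketch of the direction only, not the actual v2 patch; the kvx-specific fls/ffs/hweight helpers from the original file are assumed to stay and are elided here.]

```c
/* Sketch: arch/kvx/include/asm/bitops.h built on the generic atomic bitops. */
#ifndef _ASM_KVX_BITOPS_H
#define _ASM_KVX_BITOPS_H

#include <asm/cmpxchg.h>

/* ... kvx fls()/fls64()/ffs()/__ffs()/__fls()/__arch_hweight*() kept as-is ... */

/*
 * The generic helpers implement set_bit()/clear_bit()/test_and_*() on top of
 * the architecture's atomics; depending on the kernel version the
 * <asm-generic/bitops/instrumented-atomic.h> wrappers may also be needed.
 */
#include <asm-generic/bitops/atomic.h>
#include <asm-generic/bitops/lock.h>
#include <asm-generic/bitops/non-atomic.h>
#include <asm-generic/bitops/const_hweight.h>
#include <asm-generic/bitops/ffz.h>
#include <asm-generic/bitops/sched.h>
#include <asm-generic/bitops/le.h>
#include <asm-generic/bitops/ext2-atomic.h>

#endif /* _ASM_KVX_BITOPS_H */
```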
On Fri, Jan 06, 2023 at 03:11:58PM +0100, Jules Maselbas wrote:
> Hi Mark,
>
> On Wed, Jan 04, 2023 at 09:53:24AM +0000, Mark Rutland wrote:
> > [...]
> >
> > Do you have a single instruction that can read a 64-bit memory location,
> > and is it guaranteed to result in a single access that cannot be split?
>
> We do have a single instruction that can read a 64-bit memory location
> (supported sizes are 8, 16, 32, 64, 128, 256).
> All accesses are guaranteed to not be split, unless they are misaligned.
> Furthermore, misaligned write accesses crossing a 32-byte boundary may
> complete in a non-atomic way.

Perfect, thanks for confirming!

[...]

> > These look ok, but it'd be nicer if we could teach the generic atomic64
> > code to do this, like the generic atomic code does.
> >
> > We could rename the existing asm-generic/atomic64 code to
> > atomic64-spinlock, and add a separate atomic64-cmpxchg (and likewise for
> > the 32-bit code) to make that clearer and consistent.
>
> I am not sure what this implies and how big this change might be,
> but I'll take a look at this.

Hmm... from a quick attempt just now it looks like that will be a bit more
churny than I thought.

We can always factor this out later, so feel free to leave it as-is, though
if we could make this generic (and have it look like asm-generic/atomic.h),
it'd be nice for consistency and maintenance.

> > > +static inline int arch_atomic_add_return(int i, atomic_t *v)
> > > [...]
> > > +static inline int arch_atomic_sub_return(int i, atomic_t *v)
> > > +{
> > > + return arch_atomic_add_return(-i, v);
> > > +}
> >
> > Likewise for these two.

The 32-bit atomics should come from asm-generic/atomic.h and not be
necessary; that has both arch_atomic_add_return() and
arch_atomic_sub_return().

> > Why do you need to specify the exact registers?
>
> r62 and r63 are hardcoded in the inline assembly, they are caller saved.
> I have a C implementation that uses builtins, however this is not merged
> into our tree yet (but I want to).
>
> > e.g. does some instruction use these implicitly, or do you need two
> > adjacent registers for encoding reasons?
>
> The atomic compare and swap (acswap) instruction needs a register "pair",
> which can only exist with "adjacent" registers: $r0r1, $r2r3, etc.

Ok; and you don't have a way to ask GCC for an arbitrary register pair, so
you chose r62+r63 as they can be clobbered?

It might be worth looking into using an __int128_t to give you a register
pair, assuming your calling convention mandates adjacency of the two halves.
That could give you the pair while giving the compiler freedom to choose any
suitable pair (assuming you have a suitable operand modifier to extract the
low/high registers from the asm operand).

> > This implies you can implement the relaxed form of cmpxchg().
> >
> > What ordering do you get by default, and do you have any other barriers
> > (e.g. for acquire/release semantics), or just "fence" ?
>
> We have two barrier types:
>  - fence: ensures that all uncached memory operations are committed to
>    memory, typically used to ensure a write to memory is visible to
>    other cores.
>  - barrier: flushes the core instruction pipeline and memory system.

Ok.

In the absence of barriers do you have any ordering guarantee? e.g. weak like
arm/power, TSO like x86, or something else entirely?

When you say "uncached memory operations", does that mean that you have
cached memory operations which are not affected by the barriers, or is
"uncached memory operations" just meant to read as "outstanding/incomplete
memory operations"?

Thanks,
Mark.
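[Editorial note: to make the register-pair point concrete, here is the shape a C-level cmpxchg could take once the asm is replaced by a builtin, as Jules mentions exists out of tree. The builtin name and signature below (__builtin_kvx_acswapd() returning a success flag) are assumptions made for illustration; only __builtin_kvx_fence() is taken from the patch itself.]

```c
/*
 * Sketch of a 64-bit cmpxchg written against a hypothetical boolean
 * compare-and-swap builtin.  __builtin_kvx_acswapd(ptr, new, expect) is
 * assumed to atomically store 'new' iff *ptr == 'expect' and to return
 * nonzero on success; the compiler is then free to pick any adjacent
 * register pair for the operands instead of hard-coded $r62/$r63.
 */
static inline unsigned long cmpxchg64_sketch(volatile unsigned long *ptr,
					     unsigned long old,
					     unsigned long new)
{
	unsigned long cur;

	__builtin_kvx_fence();	/* commit prior stores, as the asm version does */

	for (;;) {
		if (__builtin_kvx_acswapd(ptr, new, old))
			return old;		/* swap happened */

		cur = READ_ONCE(*ptr);
		if (cur != old)
			return cur;		/* clearly failed */
		/*
		 * Re-read equals 'old' again: retry so a failure is not
		 * mistaken for a success, as the cmpxchg.h comment explains.
		 */
	}
}
```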
Hi Mark,

On 10/01/2023 14:24, Mark Rutland wrote:
> On Fri, Jan 06, 2023 at 03:11:58PM +0100, Jules Maselbas wrote:
>> Hi Mark,
>>
>> On Wed, Jan 04, 2023 at 09:53:24AM +0000, Mark Rutland wrote:
>>> [...]
>>>
>>> At minimum this needs to be
>>>
>>>     READ_ONCE(v->counter)
>>>
>>> ... which will prevent replay. Whether or not that's actually atomic
>>> will depend on the instructions the compiler generates, and how those
>>> instructions are defined in your architecture.
>> Good point, we are going to use {READ,WRITE}_ONCE macros.

Done for V2 (which we will send pretty soon).

>>> We could rename the existing asm-generic/atomic64 code to
>>> atomic64-spinlock, and add a separate atomic64-cmpxchg (and likewise
>>> for the 32-bit code) to make that clearer and consistent.
>> I am not sure what this implies and how big this change might be,
>> but I'll take a look at this.
> Hmm... from a quick attempt just now it looks like that will be a bit more
> churny than I thought.
>
> We can always factor this out later, so feel free to leave it as-is, though
> if we could make this generic (and have it look like asm-generic/atomic.h),
> it'd be nice for consistency and maintenance.

Ack for doing it later.

> Ok; and you don't have a way to ask GCC for an arbitrary register pair, so
> you chose r62+r63 as they can be clobbered?
>
> It might be worth looking into using an __int128_t to give you a register
> pair, assuming your calling convention mandates adjacency of the two
> halves. That could give you the pair while giving the compiler freedom to
> choose any suitable pair (assuming you have a suitable operand modifier to
> extract the low/high registers from the asm operand).

In the V2 we replaced the assembly code with C code. The acswap{d,w}
instruction is generated by means of a compiler builtin. This way the code
is much more readable and it allows us to not hard-code a register pair,
letting the compiler allocate it.

> In the absence of barriers do you have any ordering guarantee? e.g. weak
> like arm/power, TSO like x86, or something else entirely?

Our memory ordering model is weak.

```
store @data_addr = new_value
;;
store @data_is_valid_addr = 1
;;
```

By running the previous pseudo-code, there is no guarantee that another CPU
would see *data_addr == new_value just because it sees
*data_is_valid_addr == 1.
The correct code would need an extra bundle containing a `fence` instruction
in-between the 2 other bundles.

> When you say "uncached memory operations", does that mean that you have
> cached memory operations which are not affected by the barriers, or is
> "uncached memory operations" just meant to read as "outstanding/incomplete
> memory operations"?

The fence instruction will force all outstanding memory write operations to
finish and reach their target.

 * For an uncached access it means that after the fence the DDR has been
   updated (or the target device register).
 * For a cached access it means the L2 cache has been updated, and therefore
   all the corresponding L1D lines are invalidated. (L1I is untouched!)

Bottom line: after the fence, all other cores of the cluster running Linux
will see the new value. Except of course if the write is done through an
uncached MMU mapping and the read is done through a cached MMU mapping, but
this seems normal.

Thanks for the review, it's much appreciated!

-- Yann
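[Editorial note: expressed with the primitives from this patch (mb() expands to __builtin_kvx_fence() in barrier.h), the publication pattern Yann describes looks roughly like the sketch below. The struct and function names are illustrative only, and the reader-side barrier is the generic kernel idiom rather than anything kvx-specific; in real kernel code smp_wmb() or smp_store_release() would normally be used on the writer side, mb() is shown because it maps directly to the fence in the pseudo-code above.]

```c
#include <linux/compiler.h>	/* READ_ONCE() / WRITE_ONCE() */
#include <linux/processor.h>	/* cpu_relax() */
#include <linux/types.h>
#include <asm/barrier.h>	/* mb(), smp_rmb() via asm-generic */

struct kvx_mbox {		/* illustrative names */
	u64 data;
	int ready;
};

/* Writer: make the payload globally visible before raising the flag. */
static void mbox_publish(struct kvx_mbox *m, u64 value)
{
	WRITE_ONCE(m->data, value);
	mb();			/* fence: payload reaches memory first */
	WRITE_ONCE(m->ready, 1);
}

/* Reader: only touch the payload after observing the flag. */
static u64 mbox_consume(struct kvx_mbox *m)
{
	while (!READ_ONCE(m->ready))
		cpu_relax();
	smp_rmb();		/* order the flag read before the data read */
	return READ_ONCE(m->data);
}
```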