[PATCH 1/9] s390/percpu: Provide arch_raw_cpu_ptr()

Heiko Carstens posted 9 patches 2 weeks, 6 days ago
There is a newer version of this series
[PATCH 1/9] s390/percpu: Provide arch_raw_cpu_ptr()
Posted by Heiko Carstens 2 weeks, 6 days ago
Provide an s390 specific arch_raw_cpu_ptr() implementation which avoids the
detour over get_lowcore() to get the lowcore pointer. The inline assembly
is implemented with an alternative so that relocated lowcore (percpu offset
is at a different address) is handled correctly.

This turns code like this

  102f78:       a7 39 00 00             lghi    %r3,0
  102f7c:       e3 20 33 b8 00 08       ag      %r2,952(%r3)

into code which adds the percpu offset to register r2 with a single instruction

  102f7c:       e3 20 03 b8 00 08       ag      %r2,952(%r0)

and also avoids the need for a base register, thus reducing register
pressure.

With defconfig bloat-o-meter -t provides this result:

add/remove: 12/26 grow/shrink: 183/3391 up/down: 14880/-41950 (-27070)

Signed-off-by: Heiko Carstens <hca@linux.ibm.com>
---
 arch/s390/include/asm/percpu.h | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

diff --git a/arch/s390/include/asm/percpu.h b/arch/s390/include/asm/percpu.h
index 5899f57f17d1..b18a96f3a334 100644
--- a/arch/s390/include/asm/percpu.h
+++ b/arch/s390/include/asm/percpu.h
@@ -12,6 +12,24 @@
  */
 #define __my_cpu_offset get_lowcore()->percpu_offset
 
+#define arch_raw_cpu_ptr(_ptr)						\
+({									\
+	unsigned long lc_percpu, tcp_ptr__;				\
+									\
+	tcp_ptr__ = (__force unsigned long)(_ptr);			\
+	lc_percpu = offsetof(struct lowcore, percpu_offset);		\
+	asm_inline volatile(						\
+	ALTERNATIVE("ag		%[__ptr__],%[offzero](%%r0)\n",		\
+		    "ag		%[__ptr__],%[offalt](%%r0)\n",		\
+		    ALT_FEATURE(MFEATURE_LOWCORE))			\
+	: [__ptr__] "+d" (tcp_ptr__)					\
+	: [offzero] "i" (lc_percpu),					\
+	  [offalt] "i" (lc_percpu + LOWCORE_ALT_ADDRESS),		\
+	  "m" (((struct lowcore *)0)->percpu_offset)			\
+	: "cc");							\
+	(TYPEOF_UNQUAL(*(_ptr)) __force __kernel *)tcp_ptr__;		\
+})
+
 /*
  * We use a compare-and-swap loop since that uses less cpu cycles than
  * disabling and enabling interrupts like the generic variant would do.
-- 
2.51.0