[PATCH v2 3/2] RISC-V: sbi: remove sbi_ecall tracepoints

Radim Krčmář posted 2 patches 3 months, 3 weeks ago
[PATCH v2 3/2] RISC-V: sbi: remove sbi_ecall tracepoints
Posted by Radim Krčmář 3 months, 3 weeks ago
Tracepoits generate bad code in the non-trace path.

The acceptable tracepoint overhead in the non-tracing path is a nop, and
possibly a second 2 byte nop for alignment, but the actual overhead is
way higher.  For example, the sbi_fwft_set with tracepoints:
   0xffffffff80022ee8 <+0>:	auipc	a5,0x2cec
   0xffffffff80022eec <+4>:	lbu	a5,1704(a5) # 0xffffffff82d0f590 <sbi_fwft_supported>
   0xffffffff80022ef0 <+8>:	beqz	a5,0xffffffff80022fa0 <sbi_fwft_set+184>
   0xffffffff80022ef2 <+10>:	addi	sp,sp,-48
   0xffffffff80022ef4 <+12>:	sd	s0,32(sp)
   0xffffffff80022ef6 <+14>:	sd	s1,24(sp)
   0xffffffff80022ef8 <+16>:	sd	s2,16(sp)
   0xffffffff80022efa <+18>:	sd	ra,40(sp)
   0xffffffff80022efc <+20>:	addi	s0,sp,48
   0xffffffff80022efe <+22>:	slli	s1,a0,0x20
   0xffffffff80022f02 <+26>:	mv	s2,a1
   0xffffffff80022f04 <+28>:	srli	s1,s1,0x20
   0xffffffff80022f06 <+30>:	nop
   0xffffffff80022f08 <+32>:	nop
   0xffffffff80022f0c <+36>:	lui	a7,0x46574
   0xffffffff80022f10 <+40>:	mv	a0,s1
   0xffffffff80022f12 <+42>:	mv	a1,s2
   0xffffffff80022f14 <+44>:	addi	a7,a7,1620 # 0x46574654
   0xffffffff80022f18 <+48>:	li	a6,0
   0xffffffff80022f1a <+50>:	ecall
   0xffffffff80022f1e <+54>:	mv	s1,a0
   0xffffffff80022f20 <+56>:	nop
   0xffffffff80022f24 <+60>:	addiw	a0,s1,14
   0xffffffff80022f28 <+64>:	li	a5,14
   0xffffffff80022f2a <+66>:	bltu	a5,a0,0xffffffff80022f9a <sbi_fwft_set+178>
   0xffffffff80022f2e <+70>:	slli	a5,a0,0x20
   0xffffffff80022f32 <+74>:	srli	a0,a5,0x1e
   0xffffffff80022f36 <+78>:	auipc	a5,0x1c75
   0xffffffff80022f3a <+82>:	addi	a5,a5,-886 # 0xffffffff81c97bc0 <CSWTCH.177>
   0xffffffff80022f3e <+86>:	add	a5,a5,a0
   0xffffffff80022f40 <+88>:	lw	a0,0(a5)
   0xffffffff80022f42 <+90>:	ld	ra,40(sp)
   0xffffffff80022f44 <+92>:	ld	s0,32(sp)
   0xffffffff80022f46 <+94>:	ld	s1,24(sp)
   0xffffffff80022f48 <+96>:	ld	s2,16(sp)
   0xffffffff80022f4a <+98>:	addi	sp,sp,48
   0xffffffff80022f4c <+100>:	ret
   [tracepoint slowpaths]
   0xffffffff80022f9a <+178>:	li	a0,-524
   0xffffffff80022f9e <+182>:	j	0xffffffff80022f42 <sbi_fwft_set+90>
   0xffffffff80022fa0 <+184>:	li	a0,-95
   0xffffffff80022fa4 <+188>:	ret

Without tracepoints:
   0xffffffff80022b40 <+0>:	addi	sp,sp,-16
   0xffffffff80022b42 <+2>:	sd	s0,0(sp)
   0xffffffff80022b44 <+4>:	sd	ra,8(sp)
   0xffffffff80022b46 <+6>:	addi	s0,sp,16
   0xffffffff80022b48 <+8>:	auipc	a5,0x2ced
   0xffffffff80022b4c <+12>:	lbu	a5,-1464(a5) # 0xffffffff82d0f590 <sbi_fwft_supported>
   0xffffffff80022b50 <+16>:	beqz	a5,0xffffffff80022b8e <sbi_fwft_set+78>
   0xffffffff80022b52 <+18>:	lui	a7,0x46574
   0xffffffff80022b56 <+22>:	slli	a0,a0,0x20
   0xffffffff80022b58 <+24>:	srli	a0,a0,0x20
   0xffffffff80022b5a <+26>:	addi	a7,a7,1620 # 0x46574654
   0xffffffff80022b5e <+30>:	li	a6,0
   0xffffffff80022b60 <+32>:	ecall
   0xffffffff80022b64 <+36>:	li	a5,14
   0xffffffff80022b66 <+38>:	addiw	a0,a0,14
   0xffffffff80022b68 <+40>:	bltu	a5,a0,0xffffffff80022b88 <sbi_fwft_set+72>
   0xffffffff80022b6c <+44>:	slli	a5,a0,0x20
   0xffffffff80022b70 <+48>:	srli	a0,a5,0x1e
   0xffffffff80022b74 <+52>:	auipc	a5,0x1c75
   0xffffffff80022b78 <+56>:	addi	a5,a5,-300 # 0xffffffff81c97a48 <CSWTCH.176>
   0xffffffff80022b7c <+60>:	add	a5,a5,a0
   0xffffffff80022b7e <+62>:	lw	a0,0(a5)
   0xffffffff80022b80 <+64>:	ld	ra,8(sp)
   0xffffffff80022b82 <+66>:	ld	s0,0(sp)
   0xffffffff80022b84 <+68>:	addi	sp,sp,16
   0xffffffff80022b86 <+70>:	ret

   0xffffffff80022b88 <+72>:	li	a0,-524
   0xffffffff80022b8c <+76>:	j	0xffffffff80022b80 <sbi_fwft_set+64>
   0xffffffff80022b8e <+78>:	li	a0,-95
   0xffffffff80022b92 <+82>:	j	0xffffffff80022b80 <sbi_fwft_set+64>

It would be nice if RISC-V had a way to tell compilers to generate the
desired code, because if this issue isn't limited to ecall tracepoints,
then disabling CONFIG_TRACEPOINTS is starting to look good. :)

Signed-off-by: Radim Krčmář <rkrcmar@ventanamicro.com>
---
 arch/riscv/include/asm/sbi.h   | 30 ++--------------------------
 arch/riscv/include/asm/trace.h | 36 ----------------------------------
 arch/riscv/kernel/sbi_ecall.c  | 18 -----------------
 3 files changed, 2 insertions(+), 82 deletions(-)

diff --git a/arch/riscv/include/asm/sbi.h b/arch/riscv/include/asm/sbi.h
index 7aff31583a3d..ffab0614d24a 100644
--- a/arch/riscv/include/asm/sbi.h
+++ b/arch/riscv/include/asm/sbi.h
@@ -11,7 +11,6 @@
 #include <linux/types.h>
 #include <linux/cpumask.h>
 #include <linux/jump_label.h>
-#include <linux/tracepoint-defs.h>
 
 #ifdef CONFIG_RISCV_SBI
 enum sbi_ext_id {
@@ -461,16 +460,6 @@ struct sbiret {
 	long value;
 };
 
-#ifdef CONFIG_TRACEPOINTS
-DECLARE_TRACEPOINT(sbi_call);
-DECLARE_TRACEPOINT(sbi_return);
-extern void do_trace_sbi_call(int ext, int fid);
-extern void do_trace_sbi_return(int ext, long error, long value);
-#else
-static inline void do_trace_sbi_call(int ext, int fid) {};
-static inline void do_trace_sbi_return(int ext, long error, long value) {};
-#endif
-
 void sbi_init(void);
 long __sbi_base_ecall(int fid);
 
@@ -509,32 +498,17 @@ long __sbi_base_ecall(int fid);
 #define __sbi_ecall_constraints7  __sbi_ecall_constraints6, "r" (__a4)
 #define __sbi_ecall_constraints8  __sbi_ecall_constraints7, "r" (__a5)
 
-#define __sbi_ecall_trace_call() \
-	if (tracepoint_enabled(sbi_call)) \
-		do_trace_sbi_call(__ta7, __ta6)
-
-#define __sbi_ecall_trace_return() \
-	if (tracepoint_enabled(sbi_return)) \
-		do_trace_sbi_return(__ta7, __ret.error, __ret.value)
-
-/*
- * Clear a1 to avoid leaking unrelated kernel state through tracepoints in case
- * the register doesn't get overwritten by the ecall nor the arguments.
- */
 #define sbi_ecall(A...) \
 ({ \
 	CONCATENATE(__sbi_ecall_args, COUNT_ARGS(A))(A); \
-	__sbi_ecall_trace_call(); \
 	register uintptr_t __r0 asm ("a0"); \
-	register uintptr_t __r1 asm ("a1") = 0; \
+	register uintptr_t __r1 asm ("a1"); \
 	CONCATENATE(__sbi_ecall_regs, COUNT_ARGS(A)); \
 	asm volatile ("ecall" \
 			: "=r" (__r0), "=r" (__r1) \
 			: CONCATENATE(__sbi_ecall_constraints, COUNT_ARGS(A)) \
 			: "memory"); \
-	struct sbiret __ret = {.error = __r0, .value = __r1}; \
-	__sbi_ecall_trace_return(); \
-	__ret; \
+	(struct sbiret){.error = __r0, .value = __r1}; \
 })
 
 #ifdef CONFIG_RISCV_SBI_V01
diff --git a/arch/riscv/include/asm/trace.h b/arch/riscv/include/asm/trace.h
index 6151cee5450c..7c99d91fcce3 100644
--- a/arch/riscv/include/asm/trace.h
+++ b/arch/riscv/include/asm/trace.h
@@ -7,42 +7,6 @@
 
 #include <linux/tracepoint.h>
 
-TRACE_EVENT_CONDITION(sbi_call,
-	TP_PROTO(int ext, int fid),
-	TP_ARGS(ext, fid),
-	TP_CONDITION(ext != SBI_EXT_HSM),
-
-	TP_STRUCT__entry(
-		__field(int, ext)
-		__field(int, fid)
-	),
-
-	TP_fast_assign(
-		__entry->ext = ext;
-		__entry->fid = fid;
-	),
-
-	TP_printk("ext=0x%x fid=%d", __entry->ext, __entry->fid)
-);
-
-TRACE_EVENT_CONDITION(sbi_return,
-	TP_PROTO(int ext, long error, long value),
-	TP_ARGS(ext, error, value),
-	TP_CONDITION(ext != SBI_EXT_HSM),
-
-	TP_STRUCT__entry(
-		__field(long, error)
-		__field(long, value)
-	),
-
-	TP_fast_assign(
-		__entry->error = error;
-		__entry->value = value;
-	),
-
-	TP_printk("error=%ld value=0x%lx", __entry->error, __entry->value)
-);
-
 #endif /* _TRACE_RISCV_H */
 
 #undef TRACE_INCLUDE_PATH
diff --git a/arch/riscv/kernel/sbi_ecall.c b/arch/riscv/kernel/sbi_ecall.c
index b783a46fff1c..62489ffeae2c 100644
--- a/arch/riscv/kernel/sbi_ecall.c
+++ b/arch/riscv/kernel/sbi_ecall.c
@@ -2,8 +2,6 @@
 /* Copyright (c) 2024 Rivos Inc. */
 
 #include <asm/sbi.h>
-#define CREATE_TRACE_POINTS
-#include <asm/trace.h>
 
 long __sbi_base_ecall(int fid)
 {
@@ -16,19 +14,3 @@ long __sbi_base_ecall(int fid)
 		return sbi_err_map_linux_errno(ret.error);
 }
 EXPORT_SYMBOL(__sbi_base_ecall);
-
-#ifdef CONFIG_TRACEPOINTS
-void do_trace_sbi_call(int ext, int fid)
-{
-	trace_sbi_call(ext, fid);
-}
-EXPORT_SYMBOL(do_trace_sbi_call);
-EXPORT_TRACEPOINT_SYMBOL(sbi_call);
-
-void do_trace_sbi_return(int ext, long error, long value)
-{
-	trace_sbi_return(ext, error, value);
-}
-EXPORT_SYMBOL(do_trace_sbi_return);
-EXPORT_TRACEPOINT_SYMBOL(sbi_return);
-#endif
-- 
2.49.0

Re: [PATCH v2 3/2] RISC-V: sbi: remove sbi_ecall tracepoints
Posted by Palmer Dabbelt 3 months, 2 weeks ago
Having patch 3 of 2 is not normal.

On Thu, 19 Jun 2025 12:03:15 PDT (-0700), rkrcmar@ventanamicro.com wrote:
> Tracepoits generate bad code in the non-trace path.
>
> The acceptable tracepoint overhead in the non-tracing path is a nop, and
> possibly a second 2 byte nop for alignment, but the actual overhead is
> way higher.  For example, the sbi_fwft_set with tracepoints:
>    0xffffffff80022ee8 <+0>:	auipc	a5,0x2cec
>    0xffffffff80022eec <+4>:	lbu	a5,1704(a5) # 0xffffffff82d0f590 <sbi_fwft_supported>
>    0xffffffff80022ef0 <+8>:	beqz	a5,0xffffffff80022fa0 <sbi_fwft_set+184>
>    0xffffffff80022ef2 <+10>:	addi	sp,sp,-48
>    0xffffffff80022ef4 <+12>:	sd	s0,32(sp)
>    0xffffffff80022ef6 <+14>:	sd	s1,24(sp)
>    0xffffffff80022ef8 <+16>:	sd	s2,16(sp)
>    0xffffffff80022efa <+18>:	sd	ra,40(sp)
>    0xffffffff80022efc <+20>:	addi	s0,sp,48
>    0xffffffff80022efe <+22>:	slli	s1,a0,0x20
>    0xffffffff80022f02 <+26>:	mv	s2,a1
>    0xffffffff80022f04 <+28>:	srli	s1,s1,0x20
>    0xffffffff80022f06 <+30>:	nop
>    0xffffffff80022f08 <+32>:	nop
>    0xffffffff80022f0c <+36>:	lui	a7,0x46574
>    0xffffffff80022f10 <+40>:	mv	a0,s1
>    0xffffffff80022f12 <+42>:	mv	a1,s2
>    0xffffffff80022f14 <+44>:	addi	a7,a7,1620 # 0x46574654
>    0xffffffff80022f18 <+48>:	li	a6,0
>    0xffffffff80022f1a <+50>:	ecall
>    0xffffffff80022f1e <+54>:	mv	s1,a0
>    0xffffffff80022f20 <+56>:	nop
>    0xffffffff80022f24 <+60>:	addiw	a0,s1,14
>    0xffffffff80022f28 <+64>:	li	a5,14
>    0xffffffff80022f2a <+66>:	bltu	a5,a0,0xffffffff80022f9a <sbi_fwft_set+178>
>    0xffffffff80022f2e <+70>:	slli	a5,a0,0x20
>    0xffffffff80022f32 <+74>:	srli	a0,a5,0x1e
>    0xffffffff80022f36 <+78>:	auipc	a5,0x1c75
>    0xffffffff80022f3a <+82>:	addi	a5,a5,-886 # 0xffffffff81c97bc0 <CSWTCH.177>
>    0xffffffff80022f3e <+86>:	add	a5,a5,a0
>    0xffffffff80022f40 <+88>:	lw	a0,0(a5)
>    0xffffffff80022f42 <+90>:	ld	ra,40(sp)
>    0xffffffff80022f44 <+92>:	ld	s0,32(sp)
>    0xffffffff80022f46 <+94>:	ld	s1,24(sp)
>    0xffffffff80022f48 <+96>:	ld	s2,16(sp)
>    0xffffffff80022f4a <+98>:	addi	sp,sp,48
>    0xffffffff80022f4c <+100>:	ret
>    [tracepoint slowpaths]
>    0xffffffff80022f9a <+178>:	li	a0,-524
>    0xffffffff80022f9e <+182>:	j	0xffffffff80022f42 <sbi_fwft_set+90>
>    0xffffffff80022fa0 <+184>:	li	a0,-95
>    0xffffffff80022fa4 <+188>:	ret
>
> Without tracepoints:
>    0xffffffff80022b40 <+0>:	addi	sp,sp,-16
>    0xffffffff80022b42 <+2>:	sd	s0,0(sp)
>    0xffffffff80022b44 <+4>:	sd	ra,8(sp)
>    0xffffffff80022b46 <+6>:	addi	s0,sp,16
>    0xffffffff80022b48 <+8>:	auipc	a5,0x2ced
>    0xffffffff80022b4c <+12>:	lbu	a5,-1464(a5) # 0xffffffff82d0f590 <sbi_fwft_supported>
>    0xffffffff80022b50 <+16>:	beqz	a5,0xffffffff80022b8e <sbi_fwft_set+78>
>    0xffffffff80022b52 <+18>:	lui	a7,0x46574
>    0xffffffff80022b56 <+22>:	slli	a0,a0,0x20
>    0xffffffff80022b58 <+24>:	srli	a0,a0,0x20
>    0xffffffff80022b5a <+26>:	addi	a7,a7,1620 # 0x46574654
>    0xffffffff80022b5e <+30>:	li	a6,0
>    0xffffffff80022b60 <+32>:	ecall
>    0xffffffff80022b64 <+36>:	li	a5,14
>    0xffffffff80022b66 <+38>:	addiw	a0,a0,14
>    0xffffffff80022b68 <+40>:	bltu	a5,a0,0xffffffff80022b88 <sbi_fwft_set+72>
>    0xffffffff80022b6c <+44>:	slli	a5,a0,0x20
>    0xffffffff80022b70 <+48>:	srli	a0,a5,0x1e
>    0xffffffff80022b74 <+52>:	auipc	a5,0x1c75
>    0xffffffff80022b78 <+56>:	addi	a5,a5,-300 # 0xffffffff81c97a48 <CSWTCH.176>
>    0xffffffff80022b7c <+60>:	add	a5,a5,a0
>    0xffffffff80022b7e <+62>:	lw	a0,0(a5)
>    0xffffffff80022b80 <+64>:	ld	ra,8(sp)
>    0xffffffff80022b82 <+66>:	ld	s0,0(sp)
>    0xffffffff80022b84 <+68>:	addi	sp,sp,16
>    0xffffffff80022b86 <+70>:	ret
>
>    0xffffffff80022b88 <+72>:	li	a0,-524
>    0xffffffff80022b8c <+76>:	j	0xffffffff80022b80 <sbi_fwft_set+64>
>    0xffffffff80022b8e <+78>:	li	a0,-95
>    0xffffffff80022b92 <+82>:	j	0xffffffff80022b80 <sbi_fwft_set+64>
>
> It would be nice if RISC-V had a way to tell compilers to generate the
> desired code, because if this issue isn't limited to ecall tracepoints,
> then disabling CONFIG_TRACEPOINTS is starting to look good. :)

So the issue is the extra save/restore on function entry?  That's the 
sort of think shrink wrapping is supposed to help with.  It's been 
implemented in GCC for a while, but I'm not sure how well it's been 
pushed on (IIRC it was just one of the SPEC workloads).

That said, this is kind of hard to reason about.  Can you pull out a 
smaller example?

> Signed-off-by: Radim Krčmář <rkrcmar@ventanamicro.com>
> ---
>  arch/riscv/include/asm/sbi.h   | 30 ++--------------------------
>  arch/riscv/include/asm/trace.h | 36 ----------------------------------
>  arch/riscv/kernel/sbi_ecall.c  | 18 -----------------
>  3 files changed, 2 insertions(+), 82 deletions(-)
>
> diff --git a/arch/riscv/include/asm/sbi.h b/arch/riscv/include/asm/sbi.h
> index 7aff31583a3d..ffab0614d24a 100644
> --- a/arch/riscv/include/asm/sbi.h
> +++ b/arch/riscv/include/asm/sbi.h
> @@ -11,7 +11,6 @@
>  #include <linux/types.h>
>  #include <linux/cpumask.h>
>  #include <linux/jump_label.h>
> -#include <linux/tracepoint-defs.h>
>
>  #ifdef CONFIG_RISCV_SBI
>  enum sbi_ext_id {
> @@ -461,16 +460,6 @@ struct sbiret {
>  	long value;
>  };
>
> -#ifdef CONFIG_TRACEPOINTS
> -DECLARE_TRACEPOINT(sbi_call);
> -DECLARE_TRACEPOINT(sbi_return);
> -extern void do_trace_sbi_call(int ext, int fid);
> -extern void do_trace_sbi_return(int ext, long error, long value);
> -#else
> -static inline void do_trace_sbi_call(int ext, int fid) {};
> -static inline void do_trace_sbi_return(int ext, long error, long value) {};
> -#endif
> -
>  void sbi_init(void);
>  long __sbi_base_ecall(int fid);
>
> @@ -509,32 +498,17 @@ long __sbi_base_ecall(int fid);
>  #define __sbi_ecall_constraints7  __sbi_ecall_constraints6, "r" (__a4)
>  #define __sbi_ecall_constraints8  __sbi_ecall_constraints7, "r" (__a5)
>
> -#define __sbi_ecall_trace_call() \
> -	if (tracepoint_enabled(sbi_call)) \
> -		do_trace_sbi_call(__ta7, __ta6)
> -
> -#define __sbi_ecall_trace_return() \
> -	if (tracepoint_enabled(sbi_return)) \
> -		do_trace_sbi_return(__ta7, __ret.error, __ret.value)
> -
> -/*
> - * Clear a1 to avoid leaking unrelated kernel state through tracepoints in case
> - * the register doesn't get overwritten by the ecall nor the arguments.
> - */
>  #define sbi_ecall(A...) \
>  ({ \
>  	CONCATENATE(__sbi_ecall_args, COUNT_ARGS(A))(A); \
> -	__sbi_ecall_trace_call(); \
>  	register uintptr_t __r0 asm ("a0"); \
> -	register uintptr_t __r1 asm ("a1") = 0; \
> +	register uintptr_t __r1 asm ("a1"); \
>  	CONCATENATE(__sbi_ecall_regs, COUNT_ARGS(A)); \
>  	asm volatile ("ecall" \
>  			: "=r" (__r0), "=r" (__r1) \
>  			: CONCATENATE(__sbi_ecall_constraints, COUNT_ARGS(A)) \
>  			: "memory"); \
> -	struct sbiret __ret = {.error = __r0, .value = __r1}; \
> -	__sbi_ecall_trace_return(); \
> -	__ret; \
> +	(struct sbiret){.error = __r0, .value = __r1}; \
>  })
>
>  #ifdef CONFIG_RISCV_SBI_V01
> diff --git a/arch/riscv/include/asm/trace.h b/arch/riscv/include/asm/trace.h
> index 6151cee5450c..7c99d91fcce3 100644
> --- a/arch/riscv/include/asm/trace.h
> +++ b/arch/riscv/include/asm/trace.h
> @@ -7,42 +7,6 @@
>
>  #include <linux/tracepoint.h>
>
> -TRACE_EVENT_CONDITION(sbi_call,
> -	TP_PROTO(int ext, int fid),
> -	TP_ARGS(ext, fid),
> -	TP_CONDITION(ext != SBI_EXT_HSM),
> -
> -	TP_STRUCT__entry(
> -		__field(int, ext)
> -		__field(int, fid)
> -	),
> -
> -	TP_fast_assign(
> -		__entry->ext = ext;
> -		__entry->fid = fid;
> -	),
> -
> -	TP_printk("ext=0x%x fid=%d", __entry->ext, __entry->fid)
> -);
> -
> -TRACE_EVENT_CONDITION(sbi_return,
> -	TP_PROTO(int ext, long error, long value),
> -	TP_ARGS(ext, error, value),
> -	TP_CONDITION(ext != SBI_EXT_HSM),
> -
> -	TP_STRUCT__entry(
> -		__field(long, error)
> -		__field(long, value)
> -	),
> -
> -	TP_fast_assign(
> -		__entry->error = error;
> -		__entry->value = value;
> -	),
> -
> -	TP_printk("error=%ld value=0x%lx", __entry->error, __entry->value)
> -);
> -
>  #endif /* _TRACE_RISCV_H */
>
>  #undef TRACE_INCLUDE_PATH
> diff --git a/arch/riscv/kernel/sbi_ecall.c b/arch/riscv/kernel/sbi_ecall.c
> index b783a46fff1c..62489ffeae2c 100644
> --- a/arch/riscv/kernel/sbi_ecall.c
> +++ b/arch/riscv/kernel/sbi_ecall.c
> @@ -2,8 +2,6 @@
>  /* Copyright (c) 2024 Rivos Inc. */
>
>  #include <asm/sbi.h>
> -#define CREATE_TRACE_POINTS
> -#include <asm/trace.h>
>
>  long __sbi_base_ecall(int fid)
>  {
> @@ -16,19 +14,3 @@ long __sbi_base_ecall(int fid)
>  		return sbi_err_map_linux_errno(ret.error);
>  }
>  EXPORT_SYMBOL(__sbi_base_ecall);
> -
> -#ifdef CONFIG_TRACEPOINTS
> -void do_trace_sbi_call(int ext, int fid)
> -{
> -	trace_sbi_call(ext, fid);
> -}
> -EXPORT_SYMBOL(do_trace_sbi_call);
> -EXPORT_TRACEPOINT_SYMBOL(sbi_call);
> -
> -void do_trace_sbi_return(int ext, long error, long value)
> -{
> -	trace_sbi_return(ext, error, value);
> -}
> -EXPORT_SYMBOL(do_trace_sbi_return);
> -EXPORT_TRACEPOINT_SYMBOL(sbi_return);
> -#endif
Re: [PATCH v2 3/2] RISC-V: sbi: remove sbi_ecall tracepoints
Posted by Radim Krčmář 3 months, 2 weeks ago
2025-06-23T15:54:00-07:00, Palmer Dabbelt <palmer@dabbelt.com>:
> Having patch 3 of 2 is not normal.

Sorry, I wanted to distinguish it from the original series without
sending a new one, because it's quite radical proposal I don't
necessarily want to get merged.
Would "[RFC 3/2]", "[RFC 3/3]", or something else look better while
raising the same alarms?

> On Thu, 19 Jun 2025 12:03:15 PDT (-0700), rkrcmar@ventanamicro.com wrote:
> So the issue is the extra save/restore on function entry?  That's the 
> sort of think shrink wrapping is supposed to help with.  It's been 
> implemented in GCC for a while, but I'm not sure how well it's been 
> pushed on (IIRC it was just one of the SPEC workloads).

Yes, shrink wrapping could help if compilers can figure out what to do
with static_keys. It's hopefully going to sort itself out in the future.
We'd ideally have some way to tell the compiler to always keep the
tracepoints inside their branches, to make them less fragile, but that
is probably asking too much from C.

I think GCC 15.1 had some shrink-wrapping improvements, but I've only
been using 14.3 so far...

> That said, this is kind of hard to reason about.  Can you pull out a 
> smaller example?

I posted an example of the original 8 argument ecall in v1:
https://lore.kernel.org/linux-riscv/20250612145754.2126147-2-rkrcmar@ventanamicro.com/T/#m1d441ab3de3e6d6b3b8d120b923f2e2081918a98
For another example, let's have the following function:

  struct sbiret some_sbi_ecall(uintptr_t a0, uintptr_t a1)
  {
    return sbi_ecall(123, 456, a0, a1);
  }

The disassembly without tracepoints (with -fno-omit-frame-pointer):
(It could have been just "li;li;ecall;ret" without frame pointer.)

   0xffffffff80016d48 <+0>:	addi	sp,sp,-16
   0xffffffff80016d4a <+2>:	sd	ra,8(sp)
   0xffffffff80016d4c <+4>:	sd	s0,0(sp)
   0xffffffff80016d4e <+6>:	addi	s0,sp,16
   0xffffffff80016d50 <+8>:	li	a7,123
   0xffffffff80016d54 <+12>:	li	a6,456
   0xffffffff80016d58 <+16>:	ecall
   0xffffffff80016d5c <+20>:	ld	ra,8(sp)
   0xffffffff80016d5e <+22>:	ld	s0,0(sp)
   0xffffffff80016d60 <+24>:	addi	sp,sp,16
   0xffffffff80016d62 <+26>:	ret

With tracepoints, the situation is worse... the optimal outcome would
add two nops, but the actual result is:

   0xffffffff80017720 <+0>:	addi	sp,sp,-48
   0xffffffff80017722 <+2>:	sd	ra,40(sp)
   0xffffffff80017724 <+4>:	sd	s0,32(sp)
   0xffffffff80017726 <+6>:	sd	s1,24(sp)
   0xffffffff80017728 <+8>:	sd	s2,16(sp)
   0xffffffff8001772a <+10>:	sd	s3,8(sp)
   0xffffffff8001772c <+12>:	addi	s0,sp,48
   0xffffffff8001772e <+14>:	nop
   0xffffffff80017730 <+16>:	nop
   0xffffffff80017734 <+20>:	li	a7,123
   0xffffffff80017738 <+24>:	li	a6,456
   0xffffffff8001773c <+28>:	ecall
   0xffffffff80017740 <+32>:	nop
   0xffffffff80017744 <+36>:	ld	ra,40(sp)
   0xffffffff80017746 <+38>:	ld	s0,32(sp)
   0xffffffff80017748 <+40>:	ld	s1,24(sp)
   0xffffffff8001774a <+42>:	ld	s2,16(sp)
   0xffffffff8001774c <+44>:	ld	s3,8(sp)
   0xffffffff8001774e <+46>:	addi	sp,sp,48
   0xffffffff80017750 <+48>:	ret
   [Tracing slowpath continues to 202.]

i.e. we spill 3 extra registers, which is at least better v1.  I'll try
again with GCC 15.1, and get back if it actually improves the situation.
Re: [PATCH v2 3/2] RISC-V: sbi: remove sbi_ecall tracepoints
Posted by Radim Krčmář 3 months, 2 weeks ago
2025-06-24T15:09:09+02:00, Radim Krčmář <rkrcmar@ventanamicro.com>:
> For another example, let's have the following function:
>
>   struct sbiret some_sbi_ecall(uintptr_t a0, uintptr_t a1)
>   {
>     return sbi_ecall(123, 456, a0, a1);
>   }
>
> The disassembly without tracepoints (with -fno-omit-frame-pointer):
> (It could have been just "li;li;ecall;ret" without frame pointer.)
>
>    0xffffffff80016d48 <+0>:	addi	sp,sp,-16
>    0xffffffff80016d4a <+2>:	sd	ra,8(sp)
>    0xffffffff80016d4c <+4>:	sd	s0,0(sp)
>    0xffffffff80016d4e <+6>:	addi	s0,sp,16
>    0xffffffff80016d50 <+8>:	li	a7,123
>    0xffffffff80016d54 <+12>:	li	a6,456
>    0xffffffff80016d58 <+16>:	ecall
>    0xffffffff80016d5c <+20>:	ld	ra,8(sp)
>    0xffffffff80016d5e <+22>:	ld	s0,0(sp)
>    0xffffffff80016d60 <+24>:	addi	sp,sp,16
>    0xffffffff80016d62 <+26>:	ret
>
> [ Removed previous disassembly with tracepoints. ]
>                                                                I'll try
> again with GCC 15.1, and get back if it actually improves the situation.

GCC 15.1 still leaves "mv" outside the branch, but at least seems to be
on the right track (undesired overhead is marked with leading stars):

   0xffffffff800236e8 <+0>:	addi	sp,sp,-48
   0xffffffff800236ea <+2>:	sd	s0,32(sp)
   0xffffffff800236ec <+4>:	sd	ra,40(sp)
   0xffffffff800236ee <+6>:	addi	s0,sp,48
*  0xffffffff800236f0 <+8>:	mv	a4,a0
*  0xffffffff800236f2 <+10>:	mv	a5,a1
   0xffffffff800236f4 <+12>:	nop
*  0xffffffff800236f8 <+16>:	mv	a0,a4
*  0xffffffff800236fa <+18>:	mv	a1,a5
   0xffffffff800236fc <+20>:	li	a7,123
   0xffffffff80023700 <+24>:	li	a6,456
   0xffffffff80023704 <+28>:	ecall
*  0xffffffff80023708 <+32>:	mv	a5,a0
*  0xffffffff8002370a <+34>:	mv	a2,a1
   0xffffffff8002370c <+36>:	nop
   0xffffffff80023710 <+40>:	ld	ra,40(sp)
   0xffffffff80023712 <+42>:	ld	s0,32(sp)
*  0xffffffff80023714 <+44>:	mv	a0,a5
*  0xffffffff80023716 <+46>:	mv	a1,a2
   0xffffffff80023718 <+48>:	addi	sp,sp,48
   0xffffffff8002371a <+50>:	ret
   [Tracing goes to +126]

I realized I had the environment configured for clang in the last mail,
so here is actual GCC 14.3, which also spills in the prologue:

   0xffffffff80023360 <+0>:	addi	sp,sp,-48
   0xffffffff80023362 <+2>:	sd	s0,32(sp)
*  0xffffffff80023364 <+4>:	sd	s1,24(sp)
*  0xffffffff80023366 <+6>:	sd	s2,16(sp)
   0xffffffff80023368 <+8>:	sd	ra,40(sp)
   0xffffffff8002336a <+10>:	addi	s0,sp,48
*  0xffffffff8002336c <+12>:	mv	s2,a0
*  0xffffffff8002336e <+14>:	mv	s1,a1
   0xffffffff80023370 <+16>:	nop
*  0xffffffff80023374 <+20>:	mv	a0,s2
*  0xffffffff80023376 <+22>:	mv	a1,s1
   0xffffffff80023378 <+24>:	li	a7,123
   0xffffffff8002337c <+28>:	li	a6,456
   0xffffffff80023380 <+32>:	ecall
*  0xffffffff80023384 <+36>:	mv	s2,a0
*  0xffffffff80023386 <+38>:	mv	s1,a1
   0xffffffff80023388 <+40>:	nop
   0xffffffff8002338c <+44>:	ld	ra,40(sp)
   0xffffffff8002338e <+46>:	ld	s0,32(sp)
*  0xffffffff80023390 <+48>:	mv	a0,s2
*  0xffffffff80023392 <+50>:	mv	a1,s1
*  0xffffffff80023394 <+52>:	ld	s2,16(sp)
*  0xffffffff80023396 <+54>:	ld	s1,24(sp)
   0xffffffff80023398 <+56>:	addi	sp,sp,48
   0xffffffff8002339a <+58>:	ret
   [Tracing goes to +108]

And clang in the last mail inlined the tracepoints, because I put the
example function in sbi_ecall.c, which bloated the tracing slowpath, and
spilled one more register than needed.
With the function in sbi.c, to better simulate actual use (gcc examples
are already doing this), clang 20.1.6 and 19.1.7 do:

   0xffffffff80016f08 <+0>:	addi	sp,sp,-32
   0xffffffff80016f0a <+2>:	sd	ra,24(sp)
   0xffffffff80016f0c <+4>:	sd	s0,16(sp)
*  0xffffffff80016f0e <+6>:	sd	s1,8(sp)
*  0xffffffff80016f10 <+8>:	sd	s2,0(sp)
   0xffffffff80016f12 <+10>:	addi	s0,sp,32
   0xffffffff80016f14 <+12>:	nop
   0xffffffff80016f18 <+16>:	li	a7,123
   0xffffffff80016f1c <+20>:	li	a6,456
   0xffffffff80016f20 <+24>:	ecall
   0xffffffff80016f24 <+28>:	nop
   0xffffffff80016f28 <+32>:	ld	ra,24(sp)
   0xffffffff80016f2a <+34>:	ld	s0,16(sp)
*  0xffffffff80016f2c <+36>:	ld	s1,8(sp)
*  0xffffffff80016f2e <+38>:	ld	s2,0(sp)
   0xffffffff80016f30 <+40>:	addi	sp,sp,32
   0xffffffff80016f32 <+42>:	ret
   [Tracing goes to +94]

When compared to GCC 15.1, clang spills in the prologue, but doesn't
store around the static branch sites.  The optimal result would be a
combination of what clang and GCC 15.1 do (code without any stars).

When I looked at real code samples, the behavior was roughly similar.
GCC just wasn't always placing the "mv"s as obviously.
Re: [PATCH v2 3/2] RISC-V: sbi: remove sbi_ecall tracepoints
Posted by David Laight 3 months, 2 weeks ago
On Wed, 25 Jun 2025 09:51:45 +0200
Radim Krčmář <rkrcmar@ventanamicro.com> wrote:

> 2025-06-24T15:09:09+02:00, Radim Krčmář <rkrcmar@ventanamicro.com>:
> > For another example, let's have the following function:
> >
> >   struct sbiret some_sbi_ecall(uintptr_t a0, uintptr_t a1)
> >   {
> >     return sbi_ecall(123, 456, a0, a1);
> >   }
> >
...
> 
> GCC 15.1 still leaves "mv" outside the branch, but at least seems to be
> on the right track (undesired overhead is marked with leading stars):
> 
>    0xffffffff800236e8 <+0>:	addi	sp,sp,-48
>    0xffffffff800236ea <+2>:	sd	s0,32(sp)
>    0xffffffff800236ec <+4>:	sd	ra,40(sp)
>    0xffffffff800236ee <+6>:	addi	s0,sp,48
> *  0xffffffff800236f0 <+8>:	mv	a4,a0
> *  0xffffffff800236f2 <+10>:	mv	a5,a1
>    0xffffffff800236f4 <+12>:	nop
> *  0xffffffff800236f8 <+16>:	mv	a0,a4
> *  0xffffffff800236fa <+18>:	mv	a1,a5
>    0xffffffff800236fc <+20>:	li	a7,123
>    0xffffffff80023700 <+24>:	li	a6,456
>    0xffffffff80023704 <+28>:	ecall
> *  0xffffffff80023708 <+32>:	mv	a5,a0
> *  0xffffffff8002370a <+34>:	mv	a2,a1
>    0xffffffff8002370c <+36>:	nop
>    0xffffffff80023710 <+40>:	ld	ra,40(sp)
>    0xffffffff80023712 <+42>:	ld	s0,32(sp)
> *  0xffffffff80023714 <+44>:	mv	a0,a5
> *  0xffffffff80023716 <+46>:	mv	a1,a2
>    0xffffffff80023718 <+48>:	addi	sp,sp,48
>    0xffffffff8002371a <+50>:	ret
>    [Tracing goes to +126]

How much do a few register moves/spills matter compared to the
cost of the called code?
There will but much worse things out there if you look.

	David
Re: [PATCH v2 3/2] RISC-V: sbi: remove sbi_ecall tracepoints
Posted by Radim Krčmář 3 months, 2 weeks ago
2025-06-25T09:34:15+01:00, David Laight <david.laight.linux@gmail.com>:
> On Wed, 25 Jun 2025 09:51:45 +0200
> Radim Krčmář <rkrcmar@ventanamicro.com> wrote:
>> 2025-06-24T15:09:09+02:00, Radim Krčmář <rkrcmar@ventanamicro.com>:
>> > For another example, let's have the following function:
>> >
>> >   struct sbiret some_sbi_ecall(uintptr_t a0, uintptr_t a1)
>> >   {
>> >     return sbi_ecall(123, 456, a0, a1);
>> >   }
>> >
> ...
>> 
>> GCC 15.1 still leaves "mv" outside the branch, but at least seems to be
>> on the right track (undesired overhead is marked with leading stars):
>> 
>>    0xffffffff800236e8 <+0>:	addi	sp,sp,-48
>>    0xffffffff800236ea <+2>:	sd	s0,32(sp)
>>    0xffffffff800236ec <+4>:	sd	ra,40(sp)
>>    0xffffffff800236ee <+6>:	addi	s0,sp,48
>> *  0xffffffff800236f0 <+8>:	mv	a4,a0
>> *  0xffffffff800236f2 <+10>:	mv	a5,a1
>>    0xffffffff800236f4 <+12>:	nop
>> *  0xffffffff800236f8 <+16>:	mv	a0,a4
>> *  0xffffffff800236fa <+18>:	mv	a1,a5
>>    0xffffffff800236fc <+20>:	li	a7,123
>>    0xffffffff80023700 <+24>:	li	a6,456
>>    0xffffffff80023704 <+28>:	ecall
>> *  0xffffffff80023708 <+32>:	mv	a5,a0
>> *  0xffffffff8002370a <+34>:	mv	a2,a1
>>    0xffffffff8002370c <+36>:	nop
>>    0xffffffff80023710 <+40>:	ld	ra,40(sp)
>>    0xffffffff80023712 <+42>:	ld	s0,32(sp)
>> *  0xffffffff80023714 <+44>:	mv	a0,a5
>> *  0xffffffff80023716 <+46>:	mv	a1,a2
>>    0xffffffff80023718 <+48>:	addi	sp,sp,48
>>    0xffffffff8002371a <+50>:	ret
>>    [Tracing goes to +126]
>
> How much do a few register moves/spills matter compared to the
> cost of the called code?

I didn't do any serious analysis... In general, simpler functions are
going to suffer a higher ratio of overhead from adding tracepoints, and
the constant overhead per added tracepoint increases with the number of
its arguments.

For a trap to kernel mode that passes through a dozen tracepoint sites,
we could save under a hundred instructions if disabled tracepoints had
the minimal overhead.  I don't have a good idea how much of the total
trap execution-time/instruction-count/entropy-increase that actually is.

> There will but much worse things out there if you look.

Definitely, I am trying my best not to look, but I sometimes happen to
stumble upon something, and try to understand it.

Waiting till the tracepoint overhead resolves itself sounds fine to me.

Thanks.