[RFC PATCH] accel/tcg: add tracepoints for cpu_loop_exit_atomic

Alex Bennée posted 1 patch 1 month, 2 weeks ago
accel/tcg/user-exec.c          |  2 +-
accel/tcg/ldst_atomicity.c.inc |  9 +++++++++
accel/tcg/trace-events         | 12 ++++++++++++
3 files changed, 22 insertions(+), 1 deletion(-)
[RFC PATCH] accel/tcg: add tracepoints for cpu_loop_exit_atomic
Posted by Alex Bennée 1 month, 2 weeks ago
We try to avoid using cpu_loop_exit_atomic as it brings in an all-core
sync point. However on some cpu/kernel/benchmark combinations it is
starting to show up in the performance profile. To make it easier to
see whats going on add tracepoints for the slow path so we can see
what is triggering the wait.

It seems for a modern CPU it can be quite a bit, for example:

./qemu-system-aarch64 \
           -machine type=virt,virtualization=on,pflash0=rom,pflash1=efivars,gic-version=max \
           -smp 4 \
           -accel tcg \
           -device virtio-net-pci,netdev=unet \
           -device virtio-scsi-pci \
           -device scsi-hd,drive=hd \
           -netdev user,id=unet,hostfwd=tcp::2222-:22 \
           -blockdev driver=raw,node-name=hd,file.driver=host_device,file.filename=/dev/zen-ssd2/trixie-arm64,discard=unmap \
           -serial mon:stdio \
           -blockdev node-name=rom,driver=file,filename=(pwd)/pc-bios/edk2-aarch64-code.fd,read-only=true \
           -blockdev node-name=efivars,driver=file,filename=$HOME/images/qemu-arm64-efivars \
           -m 8192 \
           -object memory-backend-memfd,id=mem,size=8G,share=on \
           -kernel /home/alex/lsrc/linux.git/builds/arm64/arch/arm64/boot/Image -append "root=/dev/sda2 console=ttyAMA0 systemd.unit=benchmark-stress-ng.service" \
           -display none -d trace:load_atom\*_fallback,trace:store_atom\*_fallback

With:

  -cpu neoverse-v1,pauth-impdef=on => 2203343

With:

  -cpu cortex-a76 => 0

Signed-off-by: Alex Bennée <alex.bennee@linaro.org>
Cc: Pierrick Bouvier <pierrick.bouvier@linaro.org>
---
 accel/tcg/user-exec.c          |  2 +-
 accel/tcg/ldst_atomicity.c.inc |  9 +++++++++
 accel/tcg/trace-events         | 12 ++++++++++++
 3 files changed, 22 insertions(+), 1 deletion(-)

diff --git a/accel/tcg/user-exec.c b/accel/tcg/user-exec.c
index 7ddc47b0ba..f3a440ca29 100644
--- a/accel/tcg/user-exec.c
+++ b/accel/tcg/user-exec.c
@@ -29,7 +29,7 @@
 #include "exec/page-protection.h"
 #include "exec/helper-proto.h"
 #include "qemu/atomic128.h"
-#include "trace/trace-root.h"
+#include "trace.h"
 #include "tcg/tcg-ldst.h"
 #include "internal-common.h"
 #include "internal-target.h"
diff --git a/accel/tcg/ldst_atomicity.c.inc b/accel/tcg/ldst_atomicity.c.inc
index 134da3c1da..c735add261 100644
--- a/accel/tcg/ldst_atomicity.c.inc
+++ b/accel/tcg/ldst_atomicity.c.inc
@@ -168,6 +168,7 @@ static uint64_t load_atomic8_or_exit(CPUState *cpu, uintptr_t ra, void *pv)
 #endif
 
     /* Ultimate fallback: re-execute in serial context. */
+    trace_load_atom8_or_exit_fallback(ra);
     cpu_loop_exit_atomic(cpu, ra);
 }
 
@@ -212,6 +213,7 @@ static Int128 load_atomic16_or_exit(CPUState *cpu, uintptr_t ra, void *pv)
     }
 
     /* Ultimate fallback: re-execute in serial context. */
+    trace_load_atom16_or_exit_fallback(ra);
     cpu_loop_exit_atomic(cpu, ra);
 }
 
@@ -519,6 +521,7 @@ static uint64_t load_atom_8(CPUState *cpu, uintptr_t ra,
         if (HAVE_al8) {
             return load_atom_extract_al8x2(pv);
         }
+        trace_load_atom8_fallback(memop, ra);
         cpu_loop_exit_atomic(cpu, ra);
     default:
         g_assert_not_reached();
@@ -563,6 +566,7 @@ static Int128 load_atom_16(CPUState *cpu, uintptr_t ra,
         break;
     case MO_64:
         if (!HAVE_al8) {
+            trace_load_atom16_fallback(memop, ra);
             cpu_loop_exit_atomic(cpu, ra);
         }
         a = load_atomic8(pv);
@@ -570,6 +574,7 @@ static Int128 load_atom_16(CPUState *cpu, uintptr_t ra,
         break;
     case -MO_64:
         if (!HAVE_al8) {
+            trace_load_atom16_fallback(memop, ra);
             cpu_loop_exit_atomic(cpu, ra);
         }
         a = load_atom_extract_al8x2(pv);
@@ -897,6 +902,7 @@ static void store_atom_2(CPUState *cpu, uintptr_t ra,
         g_assert_not_reached();
     }
 
+    trace_store_atom2_fallback(memop, ra);
     cpu_loop_exit_atomic(cpu, ra);
 }
 
@@ -961,6 +967,7 @@ static void store_atom_4(CPUState *cpu, uintptr_t ra,
                 return;
             }
         }
+        trace_store_atom4_fallback(memop, ra);
         cpu_loop_exit_atomic(cpu, ra);
     default:
         g_assert_not_reached();
@@ -1029,6 +1036,7 @@ static void store_atom_8(CPUState *cpu, uintptr_t ra,
     default:
         g_assert_not_reached();
     }
+    trace_store_atom8_fallback(memop, ra);
     cpu_loop_exit_atomic(cpu, ra);
 }
 
@@ -1107,5 +1115,6 @@ static void store_atom_16(CPUState *cpu, uintptr_t ra,
     default:
         g_assert_not_reached();
     }
+    trace_store_atom16_fallback(memop, ra);
     cpu_loop_exit_atomic(cpu, ra);
 }
diff --git a/accel/tcg/trace-events b/accel/tcg/trace-events
index 4e9b450520..0ce69d744f 100644
--- a/accel/tcg/trace-events
+++ b/accel/tcg/trace-events
@@ -12,3 +12,15 @@ memory_notdirty_set_dirty(uint64_t vaddr) "0x%" PRIx64
 
 # translate-all.c
 translate_block(void *tb, uintptr_t pc, const void *tb_code) "tb:%p, pc:0x%"PRIxPTR", tb_code:%p"
+
+# ldst_atomicity
+load_atom2_fallback(uint32_t memop, uintptr_t ra) "mop:0x%"PRIx32", ra:%"PRIxPTR""
+load_atom4_fallback(uint32_t memop, uintptr_t ra) "mop:0x%"PRIx32", ra:%"PRIxPTR""
+load_atom8_or_exit_fallback(uintptr_t ra) "ra:%"PRIxPTR""
+load_atom8_fallback(uint32_t memop, uintptr_t ra) "mop:0x%"PRIx32", ra:%"PRIxPTR""
+load_atom16_fallback(uint32_t memop, uintptr_t ra) "mop:0x%"PRIx32", ra:%"PRIxPTR""
+load_atom16_or_exit_fallback(uintptr_t ra) "ra:%"PRIxPTR""
+store_atom2_fallback(uint32_t memop, uintptr_t ra) "mop:0x%"PRIx32", ra:%"PRIxPTR""
+store_atom4_fallback(uint32_t memop, uintptr_t ra) "mop:0x%"PRIx32", ra:%"PRIxPTR""
+store_atom8_fallback(uint32_t memop, uintptr_t ra) "mop:0x%"PRIx32", ra:%"PRIxPTR""
+store_atom16_fallback(uint32_t memop, uintptr_t ra) "mop:0x%"PRIx32", ra:%"PRIxPTR""
-- 
2.39.5


Re: [RFC PATCH] accel/tcg: add tracepoints for cpu_loop_exit_atomic
Posted by Pierrick Bouvier 1 month, 2 weeks ago
On 10/4/24 06:52, Alex Bennée wrote:
> We try to avoid using cpu_loop_exit_atomic as it brings in an all-core
> sync point. However on some cpu/kernel/benchmark combinations it is
> starting to show up in the performance profile. To make it easier to
> see whats going on add tracepoints for the slow path so we can see
> what is triggering the wait.
> 
> It seems for a modern CPU it can be quite a bit, for example:
> 
> ./qemu-system-aarch64 \
>             -machine type=virt,virtualization=on,pflash0=rom,pflash1=efivars,gic-version=max \
>             -smp 4 \
>             -accel tcg \
>             -device virtio-net-pci,netdev=unet \
>             -device virtio-scsi-pci \
>             -device scsi-hd,drive=hd \
>             -netdev user,id=unet,hostfwd=tcp::2222-:22 \
>             -blockdev driver=raw,node-name=hd,file.driver=host_device,file.filename=/dev/zen-ssd2/trixie-arm64,discard=unmap \
>             -serial mon:stdio \
>             -blockdev node-name=rom,driver=file,filename=(pwd)/pc-bios/edk2-aarch64-code.fd,read-only=true \
>             -blockdev node-name=efivars,driver=file,filename=$HOME/images/qemu-arm64-efivars \
>             -m 8192 \
>             -object memory-backend-memfd,id=mem,size=8G,share=on \
>             -kernel /home/alex/lsrc/linux.git/builds/arm64/arch/arm64/boot/Image -append "root=/dev/sda2 console=ttyAMA0 systemd.unit=benchmark-stress-ng.service" \
>             -display none -d trace:load_atom\*_fallback,trace:store_atom\*_fallback
> 
> With:
> 
>    -cpu neoverse-v1,pauth-impdef=on => 2203343
> 
> With:
> 
>    -cpu cortex-a76 => 0
> 
> Signed-off-by: Alex Bennée <alex.bennee@linaro.org>
> Cc: Pierrick Bouvier <pierrick.bouvier@linaro.org>
> ---
>   accel/tcg/user-exec.c          |  2 +-
>   accel/tcg/ldst_atomicity.c.inc |  9 +++++++++
>   accel/tcg/trace-events         | 12 ++++++++++++
>   3 files changed, 22 insertions(+), 1 deletion(-)
> 
> diff --git a/accel/tcg/user-exec.c b/accel/tcg/user-exec.c
> index 7ddc47b0ba..f3a440ca29 100644
> --- a/accel/tcg/user-exec.c
> +++ b/accel/tcg/user-exec.c
> @@ -29,7 +29,7 @@
>   #include "exec/page-protection.h"
>   #include "exec/helper-proto.h"
>   #include "qemu/atomic128.h"
> -#include "trace/trace-root.h"
> +#include "trace.h"
>   #include "tcg/tcg-ldst.h"
>   #include "internal-common.h"
>   #include "internal-target.h"
> diff --git a/accel/tcg/ldst_atomicity.c.inc b/accel/tcg/ldst_atomicity.c.inc
> index 134da3c1da..c735add261 100644
> --- a/accel/tcg/ldst_atomicity.c.inc
> +++ b/accel/tcg/ldst_atomicity.c.inc
> @@ -168,6 +168,7 @@ static uint64_t load_atomic8_or_exit(CPUState *cpu, uintptr_t ra, void *pv)
>   #endif
>   
>       /* Ultimate fallback: re-execute in serial context. */
> +    trace_load_atom8_or_exit_fallback(ra);
>       cpu_loop_exit_atomic(cpu, ra);
>   }
>   
> @@ -212,6 +213,7 @@ static Int128 load_atomic16_or_exit(CPUState *cpu, uintptr_t ra, void *pv)
>       }
>   
>       /* Ultimate fallback: re-execute in serial context. */
> +    trace_load_atom16_or_exit_fallback(ra);
>       cpu_loop_exit_atomic(cpu, ra);
>   }
>   
> @@ -519,6 +521,7 @@ static uint64_t load_atom_8(CPUState *cpu, uintptr_t ra,
>           if (HAVE_al8) {
>               return load_atom_extract_al8x2(pv);
>           }
> +        trace_load_atom8_fallback(memop, ra);
>           cpu_loop_exit_atomic(cpu, ra);
>       default:
>           g_assert_not_reached();
> @@ -563,6 +566,7 @@ static Int128 load_atom_16(CPUState *cpu, uintptr_t ra,
>           break;
>       case MO_64:
>           if (!HAVE_al8) {
> +            trace_load_atom16_fallback(memop, ra);
>               cpu_loop_exit_atomic(cpu, ra);
>           }
>           a = load_atomic8(pv);
> @@ -570,6 +574,7 @@ static Int128 load_atom_16(CPUState *cpu, uintptr_t ra,
>           break;
>       case -MO_64:
>           if (!HAVE_al8) {
> +            trace_load_atom16_fallback(memop, ra);
>               cpu_loop_exit_atomic(cpu, ra);
>           }
>           a = load_atom_extract_al8x2(pv);
> @@ -897,6 +902,7 @@ static void store_atom_2(CPUState *cpu, uintptr_t ra,
>           g_assert_not_reached();
>       }
>   
> +    trace_store_atom2_fallback(memop, ra);
>       cpu_loop_exit_atomic(cpu, ra);
>   }
>   
> @@ -961,6 +967,7 @@ static void store_atom_4(CPUState *cpu, uintptr_t ra,
>                   return;
>               }
>           }
> +        trace_store_atom4_fallback(memop, ra);
>           cpu_loop_exit_atomic(cpu, ra);
>       default:
>           g_assert_not_reached();
> @@ -1029,6 +1036,7 @@ static void store_atom_8(CPUState *cpu, uintptr_t ra,
>       default:
>           g_assert_not_reached();
>       }
> +    trace_store_atom8_fallback(memop, ra);
>       cpu_loop_exit_atomic(cpu, ra);
>   }
>   
> @@ -1107,5 +1115,6 @@ static void store_atom_16(CPUState *cpu, uintptr_t ra,
>       default:
>           g_assert_not_reached();
>       }
> +    trace_store_atom16_fallback(memop, ra);
>       cpu_loop_exit_atomic(cpu, ra);
>   }
> diff --git a/accel/tcg/trace-events b/accel/tcg/trace-events
> index 4e9b450520..0ce69d744f 100644
> --- a/accel/tcg/trace-events
> +++ b/accel/tcg/trace-events
> @@ -12,3 +12,15 @@ memory_notdirty_set_dirty(uint64_t vaddr) "0x%" PRIx64
>   
>   # translate-all.c
>   translate_block(void *tb, uintptr_t pc, const void *tb_code) "tb:%p, pc:0x%"PRIxPTR", tb_code:%p"
> +
> +# ldst_atomicity
> +load_atom2_fallback(uint32_t memop, uintptr_t ra) "mop:0x%"PRIx32", ra:%"PRIxPTR""
> +load_atom4_fallback(uint32_t memop, uintptr_t ra) "mop:0x%"PRIx32", ra:%"PRIxPTR""
> +load_atom8_or_exit_fallback(uintptr_t ra) "ra:%"PRIxPTR""
> +load_atom8_fallback(uint32_t memop, uintptr_t ra) "mop:0x%"PRIx32", ra:%"PRIxPTR""
> +load_atom16_fallback(uint32_t memop, uintptr_t ra) "mop:0x%"PRIx32", ra:%"PRIxPTR""
> +load_atom16_or_exit_fallback(uintptr_t ra) "ra:%"PRIxPTR""
> +store_atom2_fallback(uint32_t memop, uintptr_t ra) "mop:0x%"PRIx32", ra:%"PRIxPTR""
> +store_atom4_fallback(uint32_t memop, uintptr_t ra) "mop:0x%"PRIx32", ra:%"PRIxPTR""
> +store_atom8_fallback(uint32_t memop, uintptr_t ra) "mop:0x%"PRIx32", ra:%"PRIxPTR""
> +store_atom16_fallback(uint32_t memop, uintptr_t ra) "mop:0x%"PRIx32", ra:%"PRIxPTR""

Reviewed-by: Pierrick Bouvier <pierrick.bouvier@linaro.org>
Re: [RFC PATCH] accel/tcg: add tracepoints for cpu_loop_exit_atomic
Posted by Richard Henderson 1 month, 2 weeks ago
On 10/4/24 06:52, Alex Bennée wrote:
> We try to avoid using cpu_loop_exit_atomic as it brings in an all-core
> sync point. However on some cpu/kernel/benchmark combinations it is
> starting to show up in the performance profile. To make it easier to
> see whats going on add tracepoints for the slow path so we can see
> what is triggering the wait.
> 
> It seems for a modern CPU it can be quite a bit, for example:
> 
> ./qemu-system-aarch64 \
>             -machine type=virt,virtualization=on,pflash0=rom,pflash1=efivars,gic-version=max \
>             -smp 4 \
>             -accel tcg \
>             -device virtio-net-pci,netdev=unet \
>             -device virtio-scsi-pci \
>             -device scsi-hd,drive=hd \
>             -netdev user,id=unet,hostfwd=tcp::2222-:22 \
>             -blockdev driver=raw,node-name=hd,file.driver=host_device,file.filename=/dev/zen-ssd2/trixie-arm64,discard=unmap \
>             -serialmon:stdio \
>             -blockdev node-name=rom,driver=file,filename=(pwd)/pc-bios/edk2-aarch64-code.fd,read-only=true \
>             -blockdev node-name=efivars,driver=file,filename=$HOME/images/qemu-arm64-efivars \
>             -m 8192 \
>             -object memory-backend-memfd,id=mem,size=8G,share=on \
>             -kernel /home/alex/lsrc/linux.git/builds/arm64/arch/arm64/boot/Image -append "root=/dev/sda2 console=ttyAMA0 systemd.unit=benchmark-stress-ng.service" \
>             -display none -dtrace:load_atom\*_fallback,trace:store_atom\*_fallback
> 
> With:
> 
>    -cpu neoverse-v1,pauth-impdef=on => 2203343
> 
> With:
> 
>    -cpu cortex-a76 => 0
> 
> Signed-off-by: Alex Bennée<alex.bennee@linaro.org>
> Cc: Pierrick Bouvier<pierrick.bouvier@linaro.org>
> ---
>   accel/tcg/user-exec.c          |  2 +-
>   accel/tcg/ldst_atomicity.c.inc |  9 +++++++++
>   accel/tcg/trace-events         | 12 ++++++++++++
>   3 files changed, 22 insertions(+), 1 deletion(-)

Reviewed-by: Richard Henderson <richard.henderson@linaro.org>

r~