target/ppc/helper.h | 1 + target/ppc/mem_helper.c | 14 ++++++++++++++ target/ppc/translate.c | 34 ++++++++++++++++++++++++++++------ 3 files changed, 43 insertions(+), 6 deletions(-)
This is an RFC patch, not finished, just to show the idea and test
this approach. I'm not sure it's correct but I'm sure it can be
improved so comments are requested.
The test case I've used came out of a discussion about very slow
access to VRAM of a graphics card passed through with vfio the reason
for which is still not clear but it was already known that dcbz is
often used by MacOS and AmigaOS for clearing memory and to avoid
reading values about to be overwritten which is faster on real CPU but
was found to be slower on QEMU. The optimised copy routines were
posted here:
https://www.amigans.net/modules/newbb/viewtopic.php?post_id=149123#forumpost149123
and the rest of it I've written to make it a test case is here:
http://zero.eik.bme.hu/~balaton/qemu/vramcopy.tar.xz
Replace the body of has_altivec() with just "return false". Sorry for
only giving pieces but the code posted above has a copyright that does
not allow me to include it in the test. This is not measuring VRAM
access now just memory copy but shows the effect of dcbz. I've got
these results with this patch:
Linux user master: Linux user patch:
byte loop: 2.2 sec byte loop: 2.2 sec
memcpy: 2.19 sec memcpy: 2.19 sec
copyToVRAMNoAltivec: 1.7 sec copyToVRAMNoAltivec: 1.71 sec
copyToVRAMAltivec: 2.13 sec copyToVRAMAltivec: 2.12 sec
copyFromVRAMNoAltivec: 5.11 sec copyFromVRAMNoAltivec: 2.79 sec
copyFromVRAMAltivec: 5.87 sec copyFromVRAMAltivec: 3.26 sec
Linux system master: Linux system patch:
byte loop: 5.86 sec byte loop: 5.9 sec
memcpy: 5.45 sec memcpy: 5.47 sec
copyToVRAMNoAltivec: 2.51 sec copyToVRAMNoAltivec: 2.53 sec
copyToVRAMAltivec: 3.84 sec copyToVRAMAltivec: 3.85 sec
copyFromVRAMNoAltivec: 6.11 sec copyFromVRAMNoAltivec: 3.92 sec
copyFromVRAMAltivec: 7.22 sec copyFromVRAMAltivec: 5.51 sec
It could probably be further optimised by using vector instructions
(dcbz_size is between 32 and 128) or by eliminating the check left in
the helper for 970 but I don't know how to do those. (Also the series
that convert AltiVec to use 128 bit access may help but I haven't
tested that, only trying to optimise dcbz here.)
Signed-off-by: BALATON Zoltan <balaton@eik.bme.hu>
---
target/ppc/helper.h | 1 +
target/ppc/mem_helper.c | 14 ++++++++++++++
target/ppc/translate.c | 34 ++++++++++++++++++++++++++++------
3 files changed, 43 insertions(+), 6 deletions(-)
diff --git a/target/ppc/helper.h b/target/ppc/helper.h
index 76b8f25c77..e49681c25b 100644
--- a/target/ppc/helper.h
+++ b/target/ppc/helper.h
@@ -46,6 +46,7 @@ DEF_HELPER_FLAGS_3(stmw, TCG_CALL_NO_WG, void, env, tl, i32)
DEF_HELPER_4(lsw, void, env, tl, i32, i32)
DEF_HELPER_5(lswx, void, env, tl, i32, i32, i32)
DEF_HELPER_FLAGS_4(stsw, TCG_CALL_NO_WG, void, env, tl, i32, i32)
+DEF_HELPER_FLAGS_2(dcbz_size, TCG_CALL_NO_WG_SE, tl, env, i32)
DEF_HELPER_FLAGS_3(dcbz, TCG_CALL_NO_WG, void, env, tl, i32)
DEF_HELPER_FLAGS_3(dcbzep, TCG_CALL_NO_WG, void, env, tl, i32)
DEF_HELPER_FLAGS_2(icbi, TCG_CALL_NO_WG, void, env, tl)
diff --git a/target/ppc/mem_helper.c b/target/ppc/mem_helper.c
index f88155ad45..b06cb2d00e 100644
--- a/target/ppc/mem_helper.c
+++ b/target/ppc/mem_helper.c
@@ -270,6 +270,20 @@ void helper_stsw(CPUPPCState *env, target_ulong addr, uint32_t nb,
}
}
+target_ulong helper_dcbz_size(CPUPPCState *env, uint32_t opcode)
+{
+ target_ulong dcbz_size = env->dcache_line_size;
+
+#if defined(TARGET_PPC64)
+ /* Check for dcbz vs dcbzl on 970 */
+ if (env->excp_model == POWERPC_EXCP_970 &&
+ !(opcode & 0x00200000) && ((env->spr[SPR_970_HID5] >> 7) & 0x3) == 1) {
+ dcbz_size = 32;
+ }
+#endif
+ return dcbz_size;
+}
+
static void dcbz_common(CPUPPCState *env, target_ulong addr,
uint32_t opcode, bool epid, uintptr_t retaddr)
{
diff --git a/target/ppc/translate.c b/target/ppc/translate.c
index 0bc16d7251..49221b8303 100644
--- a/target/ppc/translate.c
+++ b/target/ppc/translate.c
@@ -4445,14 +4445,36 @@ static void gen_dcblc(DisasContext *ctx)
/* dcbz */
static void gen_dcbz(DisasContext *ctx)
{
- TCGv tcgv_addr;
- TCGv_i32 tcgv_op;
+ TCGv addr, mask, dcbz_size, t0;
+ TCGv_i32 op = tcg_constant_i32(ctx->opcode & 0x03FF000);
+ TCGv_i64 z64 = tcg_constant_i64(0);
+ TCGv_i128 z128 = tcg_temp_new_i128();
+ TCGLabel *l;
+
+ addr = tcg_temp_new();
+ mask = tcg_temp_new();
+ dcbz_size = tcg_temp_new();
+ t0 = tcg_temp_new();
+ l = gen_new_label();
gen_set_access_type(ctx, ACCESS_CACHE);
- tcgv_addr = tcg_temp_new();
- tcgv_op = tcg_constant_i32(ctx->opcode & 0x03FF000);
- gen_addr_reg_index(ctx, tcgv_addr);
- gen_helper_dcbz(tcg_env, tcgv_addr, tcgv_op);
+ gen_helper_dcbz_size(dcbz_size, tcg_env, op);
+ tcg_gen_mov_tl(mask, dcbz_size);
+ tcg_gen_subi_tl(mask, mask, 1);
+ tcg_gen_not_tl(mask, mask);
+ gen_addr_reg_index(ctx, addr);
+ tcg_gen_and_tl(addr, addr, mask);
+ tcg_gen_mov_tl(t0, cpu_reserve);
+ tcg_gen_and_tl(t0, t0, mask);
+ tcg_gen_movcond_tl(TCG_COND_EQ, cpu_reserve, addr, t0,
+ tcg_constant_tl(-1), cpu_reserve);
+
+ tcg_gen_concat_i64_i128(z128, z64, z64);
+ gen_set_label(l);
+ tcg_gen_qemu_st_i128(z128, addr, ctx->mem_idx, DEF_MEMOP(MO_128));
+ tcg_gen_addi_tl(addr, addr, 16);
+ tcg_gen_subi_tl(dcbz_size, dcbz_size, 16);
+ tcg_gen_brcondi_tl(TCG_COND_GT, dcbz_size, 0, l);
}
/* dcbzep */
--
2.30.9
Hello,
On Mon, 1 Jul 2024, BALATON Zoltan wrote:
> This is an RFC patch, not finished, just to show the idea and test
> this approach. I'm not sure it's correct but I'm sure it can be
> improved so comments are requested.
Last time I did not get any replies to this so I try again. Some people
recently tried using passed through GPUs with qemu-system-ppc running
AmigaOS again and while it works it was slower than expected. This was
previously found to maybe be related to dcbz and accessing passed-through PCI
memory with 32 bit instead of 128 bit access. The dcbz opcode is also
commonly used for clearing memory in MacOS so optimising it would make
these run faster even without pass through. (This was explored with KVM
here:
https://www.talospace.com/2018/08/making-your-talos-ii-into-power-mac_29.html)
dcbz was improved with user emulation after my last try but system
emulation is still affected. Results from the test case below now on QEMU
master on the same host machine (accessing RAM not passed through VRAM
same as previous tests below which also were to RAM):
qemu-ppc -cpu 7457:
byte loop: 2.22 sec
memcpy: 2.21 sec
copyToVRAMNoAltivec: 1.69 sec
copyToVRAMAltivec: 2.12 sec
copyFromVRAMNoAltivec: 2.24 sec
copyFromVRAMAltivec: 2.82 sec
qemu-system-ppc -machine pegasos2:
byte loop: 5.28 sec
memcpy: 5.06 sec
copyToVRAMNoAltivec: 2.52 sec
copyToVRAMAltivec: 2.66 sec
copyFromVRAMNoAltivec: 6.37 sec
copyFromVRAMAltivec: 6.84 sec
The qemu-system-ppc case is still very much not optimal. Is there anybody
who wants to give this a try or any recommendations on what to do? I think
ideally maybe we should try to implement dcbz with TCG ops to avoid the
helper and try to use vector ops so these may be translated to wider 128
bit access ops on the host, which is believed to be needed for accessing
VRAM to avoid overhead of the transfer that's why these used AltiVec on
PPC. See original message below for more details.
Thank you,
BALATON Zoltan
> The test case I've used came out of a discussion about very slow
> access to VRAM of a graphics card passed through with vfio the reason
> for which is still not clear but it was already known that dcbz is
> often used by MacOS and AmigaOS for clearing memory and to avoid
> reading values about to be overwritten which is faster on real CPU but
> was found to be slower on QEMU. The optimised copy routines were
> posted here:
> https://www.amigans.net/modules/newbb/viewtopic.php?post_id=149123#forumpost149123
> and the rest of it I've written to make it a test case is here:
> http://zero.eik.bme.hu/~balaton/qemu/vramcopy.tar.xz
> Replace the body of has_altivec() with just "return false". Sorry for
> only giving pieces but the code posted above has a copyright that does
> not allow me to include it in the test. This is not measuring VRAM
> access now just memory copy but shows the effect of dcbz. I've got
> these results with this patch:
>
> Linux user master: Linux user patch:
> byte loop: 2.2 sec byte loop: 2.2 sec
> memcpy: 2.19 sec memcpy: 2.19 sec
> copyToVRAMNoAltivec: 1.7 sec copyToVRAMNoAltivec: 1.71 sec
> copyToVRAMAltivec: 2.13 sec copyToVRAMAltivec: 2.12 sec
> copyFromVRAMNoAltivec: 5.11 sec copyFromVRAMNoAltivec: 2.79 sec
> copyFromVRAMAltivec: 5.87 sec copyFromVRAMAltivec: 3.26 sec
>
> Linux system master: Linux system patch:
> byte loop: 5.86 sec byte loop: 5.9 sec
> memcpy: 5.45 sec memcpy: 5.47 sec
> copyToVRAMNoAltivec: 2.51 sec copyToVRAMNoAltivec: 2.53 sec
> copyToVRAMAltivec: 3.84 sec copyToVRAMAltivec: 3.85 sec
> copyFromVRAMNoAltivec: 6.11 sec copyFromVRAMNoAltivec: 3.92 sec
> copyFromVRAMAltivec: 7.22 sec copyFromVRAMAltivec: 5.51 sec
>
> It could probably be further optimised with using vector instuctions
> (dcbz_size is between 32 and 128) or by eliminating the check left in
> the helper for 970 but I don't know how to do those. (Also the series
> that convert AltiVec to use 128 bit access may help but I haven't
> tested that, only trying to optimise dcbz here,)
>
> Signed-off-by: BALATON Zoltan <balaton@eik.bme.hu>
> ---
> target/ppc/helper.h | 1 +
> target/ppc/mem_helper.c | 14 ++++++++++++++
> target/ppc/translate.c | 34 ++++++++++++++++++++++++++++------
> 3 files changed, 43 insertions(+), 6 deletions(-)
>
> diff --git a/target/ppc/helper.h b/target/ppc/helper.h
> index 76b8f25c77..e49681c25b 100644
> --- a/target/ppc/helper.h
> +++ b/target/ppc/helper.h
> @@ -46,6 +46,7 @@ DEF_HELPER_FLAGS_3(stmw, TCG_CALL_NO_WG, void, env, tl, i32)
> DEF_HELPER_4(lsw, void, env, tl, i32, i32)
> DEF_HELPER_5(lswx, void, env, tl, i32, i32, i32)
> DEF_HELPER_FLAGS_4(stsw, TCG_CALL_NO_WG, void, env, tl, i32, i32)
> +DEF_HELPER_FLAGS_2(dcbz_size, TCG_CALL_NO_WG_SE, tl, env, i32)
> DEF_HELPER_FLAGS_3(dcbz, TCG_CALL_NO_WG, void, env, tl, i32)
> DEF_HELPER_FLAGS_3(dcbzep, TCG_CALL_NO_WG, void, env, tl, i32)
> DEF_HELPER_FLAGS_2(icbi, TCG_CALL_NO_WG, void, env, tl)
> diff --git a/target/ppc/mem_helper.c b/target/ppc/mem_helper.c
> index f88155ad45..b06cb2d00e 100644
> --- a/target/ppc/mem_helper.c
> +++ b/target/ppc/mem_helper.c
> @@ -270,6 +270,20 @@ void helper_stsw(CPUPPCState *env, target_ulong addr, uint32_t nb,
> }
> }
>
> +target_ulong helper_dcbz_size(CPUPPCState *env, uint32_t opcode)
> +{
> + target_ulong dcbz_size = env->dcache_line_size;
> +
> +#if defined(TARGET_PPC64)
> + /* Check for dcbz vs dcbzl on 970 */
> + if (env->excp_model == POWERPC_EXCP_970 &&
> + !(opcode & 0x00200000) && ((env->spr[SPR_970_HID5] >> 7) & 0x3) == 1) {
> + dcbz_size = 32;
> + }
> +#endif
> + return dcbz_size;
> +}
> +
> static void dcbz_common(CPUPPCState *env, target_ulong addr,
> uint32_t opcode, bool epid, uintptr_t retaddr)
> {
> diff --git a/target/ppc/translate.c b/target/ppc/translate.c
> index 0bc16d7251..49221b8303 100644
> --- a/target/ppc/translate.c
> +++ b/target/ppc/translate.c
> @@ -4445,14 +4445,36 @@ static void gen_dcblc(DisasContext *ctx)
> /* dcbz */
> static void gen_dcbz(DisasContext *ctx)
> {
> - TCGv tcgv_addr;
> - TCGv_i32 tcgv_op;
> + TCGv addr, mask, dcbz_size, t0;
> + TCGv_i32 op = tcg_constant_i32(ctx->opcode & 0x03FF000);
> + TCGv_i64 z64 = tcg_constant_i64(0);
> + TCGv_i128 z128 = tcg_temp_new_i128();
> + TCGLabel *l;
> +
> + addr = tcg_temp_new();
> + mask = tcg_temp_new();
> + dcbz_size = tcg_temp_new();
> + t0 = tcg_temp_new();
> + l = gen_new_label();
>
> gen_set_access_type(ctx, ACCESS_CACHE);
> - tcgv_addr = tcg_temp_new();
> - tcgv_op = tcg_constant_i32(ctx->opcode & 0x03FF000);
> - gen_addr_reg_index(ctx, tcgv_addr);
> - gen_helper_dcbz(tcg_env, tcgv_addr, tcgv_op);
> + gen_helper_dcbz_size(dcbz_size, tcg_env, op);
> + tcg_gen_mov_tl(mask, dcbz_size);
> + tcg_gen_subi_tl(mask, mask, 1);
> + tcg_gen_not_tl(mask, mask);
> + gen_addr_reg_index(ctx, addr);
> + tcg_gen_and_tl(addr, addr, mask);
> + tcg_gen_mov_tl(t0, cpu_reserve);
> + tcg_gen_and_tl(t0, t0, mask);
> + tcg_gen_movcond_tl(TCG_COND_EQ, cpu_reserve, addr, t0,
> + tcg_constant_tl(-1), cpu_reserve);
> +
> + tcg_gen_concat_i64_i128(z128, z64, z64);
> + gen_set_label(l);
> + tcg_gen_qemu_st_i128(z128, addr, ctx->mem_idx, DEF_MEMOP(MO_128));
> + tcg_gen_addi_tl(addr, addr, 16);
> + tcg_gen_subi_tl(dcbz_size, dcbz_size, 16);
> + tcg_gen_brcondi_tl(TCG_COND_GT, dcbz_size, 0, l);
> }
>
> /* dcbzep */
>
On Thu, 24 Apr 2025, BALATON Zoltan wrote: >> The test case I've used came out of a discussion about very slow >> access to VRAM of a graphics card passed through with vfio the reason >> for which is still not clear but it was already known that dcbz is >> often used by MacOS and AmigaOS for clearing memory and to avoid >> reading values about to be overwritten which is faster on real CPU but >> was found to be slower on QEMU. The optimised copy routines were >> posted here: >> https://www.amigans.net/modules/newbb/viewtopic.php?post_id=149123#forumpost149123 >> and the rest of it I've written to make it a test case is here: >> http://zero.eik.bme.hu/~balaton/qemu/vramcopy.tar.xz >> Replace the body of has_altivec() with just "return false". Sorry for >> only giving pieces but the code posted above has a copyright that does >> not allow me to include it in the test. This is not measuring VRAM >> access now just memory copy but shows the effect of dcbz. I've got >> these results with this patch: >> >> Linux user master: Linux user patch: >> byte loop: 2.2 sec byte loop: 2.2 sec >> memcpy: 2.19 sec memcpy: 2.19 sec >> copyToVRAMNoAltivec: 1.7 sec copyToVRAMNoAltivec: 1.71 sec >> copyToVRAMAltivec: 2.13 sec copyToVRAMAltivec: 2.12 sec >> copyFromVRAMNoAltivec: 5.11 sec copyFromVRAMNoAltivec: 2.79 sec >> copyFromVRAMAltivec: 5.87 sec copyFromVRAMAltivec: 3.26 sec >> >> Linux system master: Linux system patch: >> byte loop: 5.86 sec byte loop: 5.9 sec >> memcpy: 5.45 sec memcpy: 5.47 sec >> copyToVRAMNoAltivec: 2.51 sec copyToVRAMNoAltivec: 2.53 sec >> copyToVRAMAltivec: 3.84 sec copyToVRAMAltivec: 3.85 sec >> copyFromVRAMNoAltivec: 6.11 sec copyFromVRAMNoAltivec: 3.92 sec >> copyFromVRAMAltivec: 7.22 sec copyFromVRAMAltivec: 5.51 sec I did some more benchmarking to identify what slows it down. I noticed that memset uses dcbz too so I added a test for that. 
I've also added a parameter to allow testing actual VRAM and now that I have a card working with vfio-pci passthrough I could also test that. The updated vramcopy.tar.xz is at the same URL as above. These tests were run with the amigaone machine under Linux booted as described here: https://www.qemu.org/docs/master/system/ppc/amigang.html I compiled the benchmark twice, once as in the tar and once replacing dcbz in the copyFromVRAM* routines with dcba (which is noop on QEMU). First two results are with both src and dst in RAM, second two tests are with dst in VRAM (mapped from phys address 0x80800000 where the card's framebuffer is mapped). The left column shows results with emulated ati-vga as in the amigang.html docs. The right column is with real ATI X550 card (old and slow but works with this old PPC Linux) passed through with vfio-pci. with ati-vga with vfio-pci src 0xb79c8008 dst 0xb78c7008 | src 0xb7c92008 dst 0xb7b91008 byte loop: 21.16 sec byte loop: 21.16 sec memset: 3.85 sec | memset: 3.87 sec memcpy: 5.07 sec memcpy: 5.07 sec copyToVRAMNoAltivec: 2.52 sec | copyToVRAMNoAltivec: 2.53 sec copyToVRAMAltivec: 2.42 sec | copyToVRAMAltivec: 2.37 sec copyFromVRAMNoAltivec: 6.39 sec | copyFromVRAMNoAltivec: 6.38 sec copyFromVRAMAltivec: 7.02 sec | copyFromVRAMAltivec: 7 sec using dcba instead of dcbz | using dcba instead of dcbz src 0xb7b69008 dst 0xb7a68008 | src 0xb7c44008 dst 0xb7b43008 byte loop: 21.14 sec byte loop: 21.14 sec memset: 3.85 sec | memset: 3.88 sec memcpy: 5.06 sec | memcpy: 5.07 sec copyToVRAMNoAltivec: 2.53 sec | copyToVRAMNoAltivec: 2.52 sec copyToVRAMAltivec: 2.3 sec copyToVRAMAltivec: 2.3 sec copyFromVRAMNoAltivec: 2.59 sec copyFromVRAMNoAltivec: 2.59 sec copyFromVRAMAltivec: 2.95 sec copyFromVRAMAltivec: 2.95 sec dst in emulated ati-vga | dst in real card vfio vram mapping 0x80800000 mapping 0x80800000 src 0xb78e0008 dst 0xb77de000 | src 0xb7ec5008 dst 0xb7dc3000 byte loop: 21.2 sec | byte loop: 563.98 sec memset: 3.89 sec | memset: 
39.25 sec memcpy: 5.07 sec | memcpy: 140.49 sec copyToVRAMNoAltivec: 2.53 sec | copyToVRAMNoAltivec: 72.03 sec copyToVRAMAltivec: 12.22 sec | copyToVRAMAltivec: 78.12 sec copyFromVRAMNoAltivec: 6.43 sec | copyFromVRAMNoAltivec: 728.52 sec copyFromVRAMAltivec: 35.33 sec | copyFromVRAMAltivec: 754.95 sec dst in emulated ati-vga using dcba | dst in real card vfio vram using dcba mapping 0x80800000 mapping 0x80800000 src 0xb7ba7008 dst 0xb7aa5000 | src 0xb77f4008 dst 0xb76f2000 byte loop: 21.15 sec | byte loop: 577.42 sec memset: 3.85 sec | memset: 39.52 sec memcpy: 5.06 sec | memcpy: 142.8 sec copyToVRAMNoAltivec: 2.53 sec | copyToVRAMNoAltivec: 71.71 sec copyToVRAMAltivec: 12.2 sec | copyToVRAMAltivec: 78.09 sec copyFromVRAMNoAltivec: 2.6 sec | copyFromVRAMNoAltivec: 727.23 sec copyFromVRAMAltivec: 35.03 sec | copyFromVRAMAltivec: 753.15 sec The results show that dcbz has some effect but an even bigger slow down is caused by using AltiVec which is supposed to do wider access to reduce the overhead but maybe it's not translated to host vector instructions correctly. The host in the above test was Intel i7-9700K. So to solve this maybe AltiVec should be improved more than dcbz but I don't know what and how. Regards, BALATON Zoltan
On Mon, 28 Apr 2025, BALATON Zoltan wrote: > On Thu, 24 Apr 2025, BALATON Zoltan wrote: >>> The test case I've used came out of a discussion about very slow >>> access to VRAM of a graphics card passed through with vfio the reason >>> for which is still not clear but it was already known that dcbz is >>> often used by MacOS and AmigaOS for clearing memory and to avoid >>> reading values about to be overwritten which is faster on real CPU but >>> was found to be slower on QEMU. The optimised copy routines were >>> posted here: >>> https://www.amigans.net/modules/newbb/viewtopic.php?post_id=149123#forumpost149123 >>> and the rest of it I've written to make it a test case is here: >>> http://zero.eik.bme.hu/~balaton/qemu/vramcopy.tar.xz >>> Replace the body of has_altivec() with just "return false". Sorry for >>> only giving pieces but the code posted above has a copyright that does >>> not allow me to include it in the test. This is not measuring VRAM >>> access now just memory copy but shows the effect of dcbz. I've got >>> these results with this patch: >>> >>> Linux user master: Linux user patch: >>> byte loop: 2.2 sec byte loop: 2.2 sec >>> memcpy: 2.19 sec memcpy: 2.19 sec >>> copyToVRAMNoAltivec: 1.7 sec copyToVRAMNoAltivec: 1.71 sec >>> copyToVRAMAltivec: 2.13 sec copyToVRAMAltivec: 2.12 sec >>> copyFromVRAMNoAltivec: 5.11 sec copyFromVRAMNoAltivec: 2.79 sec >>> copyFromVRAMAltivec: 5.87 sec copyFromVRAMAltivec: 3.26 sec >>> >>> Linux system master: Linux system patch: >>> byte loop: 5.86 sec byte loop: 5.9 sec >>> memcpy: 5.45 sec memcpy: 5.47 sec >>> copyToVRAMNoAltivec: 2.51 sec copyToVRAMNoAltivec: 2.53 sec >>> copyToVRAMAltivec: 3.84 sec copyToVRAMAltivec: 3.85 sec >>> copyFromVRAMNoAltivec: 6.11 sec copyFromVRAMNoAltivec: 3.92 sec >>> copyFromVRAMAltivec: 7.22 sec copyFromVRAMAltivec: 5.51 sec > > I did some more benchmarking to identify what slows it down. I noticed that > memset uses dcbz too so I added a test for that. 
I've also added a parameter > to allow testing actual VRAM and now that I have a card working with vfio-pci > passthrough I could also test that. The updated vramcopy.tar.xz is at the > same URL as above. These tests were run with the amigaone machine under Linux > booted as described here: > https://www.qemu.org/docs/master/system/ppc/amigang.html > > I compiled the benchmark twice, once as in the tar and once replacing dcbz in > the copyFromVRAM* routines with dcba (which is noop on QEMU). First two > results are with both src and dst in RAM, second two tests are with dst in > VRAM (mapped from phys address 0x80800000 where the card's framebuffer is > mapped). The left column shows results with emulated ati-vga as in the > amigang.html docs. The right column is with real ATI X550 card (old and slow > but works with this old PPC Linux) passed through with vfio-pci. > > with ati-vga with vfio-pci > > src 0xb79c8008 dst 0xb78c7008 | src 0xb7c92008 dst 0xb7b91008 > byte loop: 21.16 sec byte loop: 21.16 sec > memset: 3.85 sec | memset: 3.87 sec > memcpy: 5.07 sec memcpy: 5.07 sec > copyToVRAMNoAltivec: 2.52 sec | copyToVRAMNoAltivec: 2.53 sec > copyToVRAMAltivec: 2.42 sec | copyToVRAMAltivec: 2.37 sec > copyFromVRAMNoAltivec: 6.39 sec | copyFromVRAMNoAltivec: 6.38 sec > copyFromVRAMAltivec: 7.02 sec | copyFromVRAMAltivec: 7 sec > > using dcba instead of dcbz | using dcba instead of dcbz > src 0xb7b69008 dst 0xb7a68008 | src 0xb7c44008 dst 0xb7b43008 > byte loop: 21.14 sec byte loop: 21.14 sec > memset: 3.85 sec | memset: 3.88 sec > memcpy: 5.06 sec | memcpy: 5.07 sec > copyToVRAMNoAltivec: 2.53 sec | copyToVRAMNoAltivec: 2.52 sec > copyToVRAMAltivec: 2.3 sec copyToVRAMAltivec: 2.3 sec > copyFromVRAMNoAltivec: 2.59 sec copyFromVRAMNoAltivec: 2.59 sec > copyFromVRAMAltivec: 2.95 sec copyFromVRAMAltivec: 2.95 sec > > dst in emulated ati-vga | dst in real card vfio vram > mapping 0x80800000 mapping 0x80800000 > src 0xb78e0008 dst 0xb77de000 | src 0xb7ec5008 dst 0xb7dc3000 
> byte loop: 21.2 sec | byte loop: 563.98 sec > memset: 3.89 sec | memset: 39.25 sec > memcpy: 5.07 sec | memcpy: 140.49 sec > copyToVRAMNoAltivec: 2.53 sec | copyToVRAMNoAltivec: 72.03 sec > copyToVRAMAltivec: 12.22 sec | copyToVRAMAltivec: 78.12 sec > copyFromVRAMNoAltivec: 6.43 sec | copyFromVRAMNoAltivec: 728.52 sec > copyFromVRAMAltivec: 35.33 sec | copyFromVRAMAltivec: 754.95 sec > > dst in emulated ati-vga using dcba | dst in real card vfio vram using dcba > mapping 0x80800000 mapping 0x80800000 > src 0xb7ba7008 dst 0xb7aa5000 | src 0xb77f4008 dst 0xb76f2000 > byte loop: 21.15 sec | byte loop: 577.42 sec > memset: 3.85 sec | memset: 39.52 sec > memcpy: 5.06 sec | memcpy: 142.8 sec > copyToVRAMNoAltivec: 2.53 sec | copyToVRAMNoAltivec: 71.71 sec > copyToVRAMAltivec: 12.2 sec | copyToVRAMAltivec: 78.09 sec > copyFromVRAMNoAltivec: 2.6 sec | copyFromVRAMNoAltivec: 727.23 sec > copyFromVRAMAltivec: 35.03 sec | copyFromVRAMAltivec: 753.15 sec > > The results show that dcbz has some effect but an even bigger slow down is > caused by using AltiVec which is supposed to do wider access to reduce the > overhead but maybe it's not translated to host vector instructions correctly. > The host in the above test was Intel i7-9700K. So to solve this maybe AltiVec > should be improved more than dcbz but I don't know what and how. Looking at what AltiVec ops are used there aren't many. lvx and stvx should translate to 128 bit ops so those are probably ok, there are some lvsl lvsr ops which may be ok too and the only other one left is vperm which seems very much unoptimised, so my guess is likely that vperm causes the slow down here (I could try profiling to confirm if needed). Is there a way to improve that? I don't know vector support on different archs. Maybe other archs have less general permutation ops that's why ppc has unoptimised implementation or is it possible just wasn't addressed yet? Regards, BALATON Zoltan
On Mon, 28 Apr 2025, BALATON Zoltan wrote:
> On Mon, 28 Apr 2025, BALATON Zoltan wrote:
>> On Thu, 24 Apr 2025, BALATON Zoltan wrote:
>>>> The test case I've used came out of a discussion about very slow
>>>> access to VRAM of a graphics card passed through with vfio the reason
>>>> for which is still not clear but it was already known that dcbz is
>>>> often used by MacOS and AmigaOS for clearing memory and to avoid
>>>> reading values about to be overwritten which is faster on real CPU but
>>>> was found to be slower on QEMU. The optimised copy routines were
>>>> posted here:
>>>> https://www.amigans.net/modules/newbb/viewtopic.php?post_id=149123#forumpost149123
>>>> and the rest of it I've written to make it a test case is here:
>>>> http://zero.eik.bme.hu/~balaton/qemu/vramcopy.tar.xz
>>>> Replace the body of has_altivec() with just "return false". Sorry for
>>>> only giving pieces but the code posted above has a copyright that does
>>>> not allow me to include it in the test. This is not measuring VRAM
>>>> access now just memory copy but shows the effect of dcbz. I've got
>>>> these results with this patch:
>>>>
>>>> Linux user master: Linux user patch:
>>>> byte loop: 2.2 sec byte loop: 2.2 sec
>>>> memcpy: 2.19 sec memcpy: 2.19 sec
>>>> copyToVRAMNoAltivec: 1.7 sec copyToVRAMNoAltivec: 1.71 sec
>>>> copyToVRAMAltivec: 2.13 sec copyToVRAMAltivec: 2.12 sec
>>>> copyFromVRAMNoAltivec: 5.11 sec copyFromVRAMNoAltivec: 2.79 sec
>>>> copyFromVRAMAltivec: 5.87 sec copyFromVRAMAltivec: 3.26 sec
>>>>
>>>> Linux system master: Linux system patch:
>>>> byte loop: 5.86 sec byte loop: 5.9 sec
>>>> memcpy: 5.45 sec memcpy: 5.47 sec
>>>> copyToVRAMNoAltivec: 2.51 sec copyToVRAMNoAltivec: 2.53 sec
>>>> copyToVRAMAltivec: 3.84 sec copyToVRAMAltivec: 3.85 sec
>>>> copyFromVRAMNoAltivec: 6.11 sec copyFromVRAMNoAltivec: 3.92 sec
>>>> copyFromVRAMAltivec: 7.22 sec copyFromVRAMAltivec: 5.51 sec
>>
>> I did some more benchmarking to identify what slows it down. I noticed that
>> memset uses dcbz too so I added a test for that. I've also added a
>> parameter to allow testing actual VRAM and now that I have a card working
>> with vfio-pci passthrough I could also test that. The updated
>> vramcopy.tar.xz is at the same URL as above. These tests were run with the
>> amigaone machine under Linux booted as described here:
>> https://www.qemu.org/docs/master/system/ppc/amigang.html
>>
>> I compiled the benchmark twice, once as in the tar and once replacing dcbz
>> in the copyFromVRAM* routines with dcba (which is noop on QEMU). First two
>> results are with both src and dst in RAM, second two tests are with dst in
>> VRAM (mapped from phys address 0x80800000 where the card's framebuffer is
>> mapped). The left column shows results with emulated ati-vga as in the
>> amigang.html docs. The right column is with real ATI X550 card (old and
>> slow but works with this old PPC Linux) passed through with vfio-pci.
>>
>> with ati-vga with vfio-pci
>>
>> src 0xb79c8008 dst 0xb78c7008 | src 0xb7c92008 dst 0xb7b91008
>> byte loop: 21.16 sec byte loop: 21.16 sec
>> memset: 3.85 sec | memset: 3.87 sec
>> memcpy: 5.07 sec memcpy: 5.07 sec
>> copyToVRAMNoAltivec: 2.52 sec | copyToVRAMNoAltivec: 2.53 sec
>> copyToVRAMAltivec: 2.42 sec | copyToVRAMAltivec: 2.37 sec
>> copyFromVRAMNoAltivec: 6.39 sec | copyFromVRAMNoAltivec: 6.38
>> sec
>> copyFromVRAMAltivec: 7.02 sec | copyFromVRAMAltivec: 7 sec
>>
>> using dcba instead of dcbz | using dcba instead of dcbz
>> src 0xb7b69008 dst 0xb7a68008 | src 0xb7c44008 dst 0xb7b43008
>> byte loop: 21.14 sec byte loop: 21.14 sec
>> memset: 3.85 sec | memset: 3.88 sec
>> memcpy: 5.06 sec | memcpy: 5.07 sec
>> copyToVRAMNoAltivec: 2.53 sec | copyToVRAMNoAltivec: 2.52 sec
>> copyToVRAMAltivec: 2.3 sec copyToVRAMAltivec: 2.3 sec
>> copyFromVRAMNoAltivec: 2.59 sec copyFromVRAMNoAltivec: 2.59
>> sec
>> copyFromVRAMAltivec: 2.95 sec copyFromVRAMAltivec: 2.95 sec
>>
>> dst in emulated ati-vga | dst in real card vfio vram
>> mapping 0x80800000 mapping 0x80800000
>> src 0xb78e0008 dst 0xb77de000 | src 0xb7ec5008 dst 0xb7dc3000
>> byte loop: 21.2 sec | byte loop: 563.98 sec
>> memset: 3.89 sec | memset: 39.25 sec
>> memcpy: 5.07 sec | memcpy: 140.49 sec
>> copyToVRAMNoAltivec: 2.53 sec | copyToVRAMNoAltivec: 72.03 sec
>> copyToVRAMAltivec: 12.22 sec | copyToVRAMAltivec: 78.12 sec
>> copyFromVRAMNoAltivec: 6.43 sec | copyFromVRAMNoAltivec: 728.52
>> sec
>> copyFromVRAMAltivec: 35.33 sec | copyFromVRAMAltivec: 754.95
>> sec
>>
>> dst in emulated ati-vga using dcba | dst in real card vfio vram
>> using dcba
>> mapping 0x80800000 mapping 0x80800000
>> src 0xb7ba7008 dst 0xb7aa5000 | src 0xb77f4008 dst 0xb76f2000
>> byte loop: 21.15 sec | byte loop: 577.42 sec
>> memset: 3.85 sec | memset: 39.52 sec
>> memcpy: 5.06 sec | memcpy: 142.8 sec
>> copyToVRAMNoAltivec: 2.53 sec | copyToVRAMNoAltivec: 71.71 sec
>> copyToVRAMAltivec: 12.2 sec | copyToVRAMAltivec: 78.09 sec
>> copyFromVRAMNoAltivec: 2.6 sec | copyFromVRAMNoAltivec: 727.23
>> sec
>> copyFromVRAMAltivec: 35.03 sec | copyFromVRAMAltivec: 753.15
>> sec
>>
>> The results show that dcbz has some effect but an even bigger slow down is
>> caused by using AltiVec which is supposed to do wider access to reduce the
>> overhead but maybe it's not translated to host vector instructions
>> correctly. The host in the above test was Intel i7-9700K. So to solve this
>> maybe AltiVec should be improved more than dcbz but I don't know what and
>> how.
>
> Looking at what AltiVec ops are used there aren't many. lvx and stvx should
> translate to 128 bit ops so those are probably ok, there are some lvsl lvsr
> ops which may be ok too and the only other one left is vperm which seems very
> much unoptimised, so my guess is likely that vperm causes the slow down here
> (I could try profiling to confirm if needed). Is there a way to improve that?
I have tried profiling the dst in real card vfio vram with dcbz case (with
100 iterations instead of 10000 in above tests) but I'm not sure I
understand the results. vperm and dcbz show up but not too high. Can
somebody explain what is happening here and where the overhead likely
comes from? Here is the profile result I got:
Samples: 104K of event 'cycles:Pu', Event count (approx.): 122371086557
Children Self Command Shared Object Symbol
- 99.44% 0.95% qemu-system-ppc qemu-system-ppc [.] cpu_exec_loop
- 98.49% cpu_exec_loop
- 98.48% cpu_tb_exec
- 90.95% 0x7f4e705d8f15
helper_ldub_mmu
do_ld_mmio_beN
- cpu_io_recompile
- 45.79% cpu_loop_exit_noexc
- cpu_loop_exit
__longjmp_chk
cpu_exec_setjmp
- cpu_exec_loop
- 45.78% cpu_tb_exec
42.35% 0x7f4e6f3f0000
- 0.72% 0x7f4e99f37037
helper_VPERM
- 0.68% 0x7f4e99f3716d
helper_VPERM
- 45.16% rr_cpu_thread_fn
- 45.16% tcg_cpu_exec
- 45.15% cpu_exec
- 45.15% cpu_exec_setjmp
- cpu_exec_loop
- 45.14% cpu_tb_exec
42.08% 0x7f4e6f3f0000
- 0.72% 0x7f4e99f37037
helper_VPERM
- 0.67% 0x7f4e99f3716d
helper_VPERM
+ 2.40% 0x7f4e74e85bae
+ 2.15% 0x7f4e7060a2dc
+ 0.99% 0x7f4e73d93781
+ 99.32% 0.37% qemu-system-ppc qemu-system-ppc [.] cpu_tb_exec
+ 98.73% 0.00% qemu-system-ppc qemu-system-ppc [.] cpu_exec_setjmp
- 94.11% 0.00% qemu-system-ppc qemu-system-ppc [.] cpu_io_recompile
- 94.11% cpu_io_recompile
- 89.79% rr_cpu_thread_fn
- 89.78% tcg_cpu_exec
- 89.78% cpu_exec
cpu_exec_setjmp
- cpu_exec_loop
- 89.78% cpu_tb_exec
- 88.40% 0x7f4e705d8f15
helper_ldub_mmu
do_ld_mmio_beN
- cpu_io_recompile
- 44.47% cpu_loop_exit_noexc
- cpu_loop_exit
__longjmp_chk
cpu_exec_setjmp
- cpu_exec_loop
- 44.46% cpu_tb_exec
41.22% 0x7f4e6f3f0000
- 0.70% 0x7f4e99f37037
helper_VPERM
- 0.67% 0x7f4e99f3716d
helper_VPERM
- 43.94% rr_cpu_thread_fn
- 43.93% tcg_cpu_exec
- cpu_exec
- 43.93% cpu_exec_setjmp
- cpu_exec_loop
- 43.90% cpu_tb_exec
40.95% 0x7f4e6f3f0000
- 0.71% 0x7f4e99f37037
helper_VPERM
- 0.66% 0x7f4e99f3716d
helper_VPERM
1.23% 0x7f4e6f3f0000
+ 4.32% cpu_loop_exit_noexc
+ 91.90% 0.00% qemu-system-ppc qemu-system-ppc [.] cpu_exec
+ 91.90% 0.00% qemu-system-ppc qemu-system-ppc [.] tcg_cpu_exec
+ 91.88% 0.00% qemu-system-ppc qemu-system-ppc [.] rr_cpu_thread_fn
+ 91.12% 0.00% qemu-system-ppc qemu-system-ppc [.] helper_ldub_mmu
+ 91.12% 0.00% qemu-system-ppc qemu-system-ppc [.] do_ld_mmio_beN
- 91.10% 0.00% qemu-system-ppc [JIT] tid 4074 [.] 0x00007f4e705d8f15
0x7f4e705d8f15
helper_ldub_mmu
do_ld_mmio_beN
- cpu_io_recompile
- 45.93% cpu_loop_exit_noexc
- cpu_loop_exit
__longjmp_chk
cpu_exec_setjmp
- cpu_exec_loop
- 45.92% cpu_tb_exec
42.35% 0x7f4e6f3f0000
- 0.72% 0x7f4e99f37037
helper_VPERM
- 0.68% 0x7f4e99f3716d
helper_VPERM
- 45.18% rr_cpu_thread_fn
- 45.17% tcg_cpu_exec
- 45.17% cpu_exec
- 45.17% cpu_exec_setjmp
- cpu_exec_loop
- 45.14% cpu_tb_exec
42.08% 0x7f4e6f3f0000
- 0.72% 0x7f4e99f37037
helper_VPERM
- 0.67% 0x7f4e99f3716d
helper_VPERM
+ 88.80% 0.00% qemu-system-ppc [JIT] tid 4074 [.] 0x00007f4e6f3f0000
+ 53.56% 0.00% qemu-system-ppc qemu-system-ppc [.] cpu_loop_exit
+ 53.56% 0.00% qemu-system-ppc libc.so.6 [.] __longjmp_chk
+ 48.82% 0.00% qemu-system-ppc qemu-system-ppc [.] cpu_loop_exit_noexc
+ 7.41% 7.41% qemu-system-ppc [JIT] tid 4074 [.] 0x00007f4e99ef5c69
+ 6.89% 6.89% qemu-system-ppc [JIT] tid 4074 [.] 0x00007f4e99f3c0a2
+ 6.37% 6.37% qemu-system-ppc [JIT] tid 4074 [.] 0x00007f4e99ef5d47
+ 6.33% 6.33% qemu-system-ppc [JIT] tid 4074 [.] 0x00007f4e99ef5b9b
+ 6.21% 6.21% qemu-system-ppc [JIT] tid 4074 [.] 0x00007f4e99ef5cdc
+ 5.78% 5.78% qemu-system-ppc [JIT] tid 4074 [.] 0x00007f4e99f3c0a8
+ 5.60% 5.60% qemu-system-ppc [JIT] tid 4074 [.] 0x00007f4e99f3bdd1
+ 5.55% 5.55% qemu-system-ppc [JIT] tid 4074 [.] 0x00007f4e99f3bdd7
+ 5.43% 0.00% qemu-system-ppc qemu-system-ppc [.] cpu_loop_exit_restore
+ 5.32% 5.32% qemu-system-ppc [JIT] tid 4074 [.] 0x00007f4e99ea5beb
+ 5.30% 5.30% qemu-system-ppc [JIT] tid 4074 [.] 0x00007f4e99ea5be5
+ 4.82% 4.82% qemu-system-ppc [JIT] tid 4074 [.] 0x00007f4e99ea5bd5
+ 4.78% 4.78% qemu-system-ppc [JIT] tid 4074 [.] 0x00007f4e99ea5bdd
+ 4.68% 0.00% qemu-system-ppc qemu-system-ppc [.] helper_raise_exception_err
- 3.99% 3.97% qemu-system-ppc qemu-system-ppc [.] helper_VPERM
3.72% do_ld_mmio_beN
cpu_io_recompile
rr_cpu_thread_fn
tcg_cpu_exec
cpu_exec
cpu_exec_setjmp
cpu_exec_loop
cpu_tb_exec
0x7f4e705d8f15
helper_ldub_mmu
do_ld_mmio_beN
- cpu_io_recompile
- 1.90% cpu_loop_exit_noexc
cpu_loop_exit
__longjmp_chk
cpu_exec_setjmp
cpu_exec_loop
- cpu_tb_exec
- 0.69% 0x7f4e99f37037
helper_VPERM
- 0.66% 0x7f4e99f3716d
helper_VPERM
- 1.82% rr_cpu_thread_fn
tcg_cpu_exec
cpu_exec
cpu_exec_setjmp
cpu_exec_loop
- cpu_tb_exec
- 0.70% 0x7f4e99f37037
helper_VPERM
- 0.65% 0x7f4e99f3716d
helper_VPERM
+ 3.65% 0.00% qemu-system-ppc qemu-system-ppc [.] helper_raise_exception
+ 3.51% 0.82% qemu-system-ppc qemu-system-ppc [.] helper_lookup_tb_ptr
[...]
- 1.71% 1.52% qemu-system-ppc qemu-system-ppc [.] probe_access
1.30% do_ld_mmio_beN
cpu_io_recompile
rr_cpu_thread_fn
tcg_cpu_exec
cpu_exec
cpu_exec_setjmp
cpu_exec_loop
cpu_tb_exec
0x7f4e705d8f15
helper_ldub_mmu
do_ld_mmio_beN
- cpu_io_recompile
- 0.66% cpu_loop_exit_noexc
cpu_loop_exit
__longjmp_chk
cpu_exec_setjmp
cpu_exec_loop
cpu_tb_exec
- 0.64% rr_cpu_thread_fn
tcg_cpu_exec
cpu_exec
cpu_exec_setjmp
cpu_exec_loop
cpu_tb_exec
- 1.64% 0.05% qemu-system-ppc qemu-system-ppc [.] helper_dcbz
- 1.58% helper_dcbz
probe_access
Regards,
BALATON Zoltan
BALATON Zoltan <balaton@eik.bme.hu> writes: > On Mon, 28 Apr 2025, BALATON Zoltan wrote: >> On Mon, 28 Apr 2025, BALATON Zoltan wrote: >>> On Thu, 24 Apr 2025, BALATON Zoltan wrote: >>>>> The test case I've used came out of a discussion about very slow >>>>> access to VRAM of a graphics card passed through with vfio the reason >>>>> for which is still not clear but it was already known that dcbz is >>>>> often used by MacOS and AmigaOS for clearing memory and to avoid >>>>> reading values about to be overwritten which is faster on real CPU but >>>>> was found to be slower on QEMU. The optimised copy routines were >>>>> posted here: <snip> > > I have tried profiling the dst in real card vfio vram with dcbz case > (with 100 iterations instead of 10000 in above tests) but I'm not sure > I understand the results. vperm and dcbz show up but not too high. Can > somebody explain what is happening here and where the overhead likely > comes from? Here is the profile result I got: > > Samples: 104K of event 'cycles:Pu', Event count (approx.): 122371086557 > Children Self Command Shared Object Symbol > - 99.44% 0.95% qemu-system-ppc qemu-system-ppc [.] cpu_exec_loop > - 98.49% cpu_exec_loop > - 98.48% cpu_tb_exec > - 90.95% 0x7f4e705d8f15 > helper_ldub_mmu > do_ld_mmio_beN > - cpu_io_recompile This looks like the dbz instructions are being used to clear device memory and tripping over the can_do_io check (normally the translator tries to ensure all device access is at the end of a block). You could try ending the block on dbz instructions and seeing if that helps. Normally I would expect the helper to be more efficient as it can probe the whole address range once and then use host insns to blat the memory. 
> - 45.79% cpu_loop_exit_noexc > - cpu_loop_exit > __longjmp_chk > cpu_exec_setjmp > - cpu_exec_loop > - 45.78% cpu_tb_exec > 42.35% 0x7f4e6f3f0000 > - 0.72% 0x7f4e99f37037 > helper_VPERM > - 0.68% 0x7f4e99f3716d > helper_VPERM > - 45.16% rr_cpu_thread_fn Hmm you seem to be running in icount mode here for some reason. > - 45.16% tcg_cpu_exec > - 45.15% cpu_exec > - 45.15% cpu_exec_setjmp > - cpu_exec_loop > - 45.14% cpu_tb_exec > 42.08% 0x7f4e6f3f0000 > - 0.72% 0x7f4e99f37037 > helper_VPERM > - 0.67% 0x7f4e99f3716d > helper_VPERM <snip> -- Alex Bennée Virtualisation Tech Lead @ Linaro
On 4/29/25 08:27, Alex Bennée wrote: >> - 45.16% rr_cpu_thread_fn > > Hmm you seem to be running in icount mode here for some reason. For some reason ppc32 does not enable mttcg. I'm not sure what's missing to enable it properly. r~
Richard Henderson <richard.henderson@linaro.org> writes: > On 4/29/25 08:27, Alex Bennée wrote: >>> - 45.16% rr_cpu_thread_fn >> Hmm you seem to be running in icount mode here for some reason. > > For some reason ppc32 does not enable mttcg. > I'm not sure what's missing to enable it properly. I seem to recall it may have been reverted due to instability but I can't find the commit. > > > r~ -- Alex Bennée Virtualisation Tech Lead @ Linaro
On Tue, 29 Apr 2025, Alex Bennée wrote:
> Richard Henderson <richard.henderson@linaro.org> writes:
>
>> On 4/29/25 08:27, Alex Bennée wrote:
>>>> - 45.16% rr_cpu_thread_fn
>>> Hmm you seem to be running in icount mode here for some reason.
>>
>> For some reason ppc32 does not enable mttcg.
>> I'm not sure what's missing to enable it properly.
>
> I seem to recall it may have been reverted due to instability but I
> can't find the commit.
Or maybe it was never enabled? We've recently tried mttcg with G4 mac99
machine and it seems to work but the needed patches were not cleaned up
for upstream yet so they are using a fork for that now. But that's a
digression.
I've tried to rerun the benchmark with qemu-system-ppc64 instead of
qemu-system-ppc (no other change in the command) and it did not seem to
help much, it's still slow. Here's the profile:
Children Self Command Shared Object Symbol
- 99.42% 0.78% qemu-system-ppc qemu-system-ppc64 [.] cpu_exec_loop
- 99.32% cpu_exec_loop
- 99.32% cpu_tb_exec
- 91.29% 0x7f25d079f8b4
helper_ldub_mmu
do_ld_mmio_beN
- cpu_io_recompile
- 49.05% mttcg_cpu_thread_fn
- 49.05% tcg_cpu_exec
- 49.05% cpu_exec
- 49.04% cpu_exec_setjmp
- cpu_exec_loop
- 49.03% cpu_tb_exec
38.92% 0x7f25cf3f0000
- 0.63% 0x7f25fe78bd93
helper_VPERM
- 0.61% 0x7f25fe78bed8
helper_VPERM
- 42.24% cpu_loop_exit_noexc
cpu_loop_exit
__longjmp_chk
cpu_exec_setjmp
- cpu_exec_loop
- 42.23% cpu_tb_exec
38.67% 0x7f25cf3f0000
- 0.62% 0x7f25fe78bd93
helper_VPERM
- 0.60% 0x7f25fe78bed8
helper_VPERM
- 5.78% 0x7f25d0625055
helper_raise_exception
mttcg_cpu_thread_fn
tcg_cpu_exec
cpu_exec
cpu_exec_setjmp
cpu_exec_loop
cpu_tb_exec
0x7f25d0625055
helper_raise_exception
mttcg_cpu_thread_fn
tcg_cpu_exec
cpu_exec
cpu_exec_setjmp
cpu_exec_loop
- cpu_tb_exec
- 5.78% 0x7f25d0625055
- helper_raise_exception
- 5.49% mttcg_cpu_thread_fn
- 5.16% tcg_cpu_exec
- 5.11% cpu_exec
- 5.03% cpu_exec_setjmp
- 5.01% cpu_exec_loop
- 4.27% cpu_tb_exec
1.60% 0x7f25cf3f0000
+ 99.41% 0.25% qemu-system-ppc qemu-system-ppc64 [.] cpu_tb_exec
+ 99.41% 0.01% qemu-system-ppc qemu-system-ppc64 [.] cpu_exec_setjmp
+ 98.02% 0.17% qemu-system-ppc qemu-system-ppc64 [.] cpu_exec
+ 97.99% 0.02% qemu-system-ppc qemu-system-ppc64 [.] tcg_cpu_exec
+ 97.98% 0.05% qemu-system-ppc qemu-system-ppc64 [.] mttcg_cpu_thread_fn
+ 92.38% 0.00% qemu-system-ppc qemu-system-ppc64 [.] cpu_io_recompile
+ 91.54% 0.00% qemu-system-ppc qemu-system-ppc64 [.] do_ld_mmio_beN
+ 91.51% 0.00% qemu-system-ppc qemu-system-ppc64 [.] helper_ldub_mmu
+ 91.49% 0.00% qemu-system-ppc [JIT] tid 12410 [.] 0x00007f25d079f8b4
+ 81.15% 0.00% qemu-system-ppc [JIT] tid 12410 [.] 0x00007f25cf3f0000
+ 44.70% 0.00% qemu-system-ppc qemu-system-ppc64 [.] cpu_loop_exit
+ 44.50% 0.01% qemu-system-ppc libc.so.6 [.] __longjmp_chk
+ 43.16% 0.00% qemu-system-ppc qemu-system-ppc64 [.] cpu_loop_exit_noexc
+ 9.57% 0.00% qemu-system-ppc qemu-system-ppc64 [.] helper_raise_exception
+ 8.02% 0.08% qemu-system-ppc qemu-system-ppc64 [.] notdirty_write.isra.0
+ 7.60% 0.05% qemu-system-ppc qemu-system-ppc64 [.] mmu_lookup
+ 7.50% 0.03% qemu-system-ppc qemu-system-ppc64 [.] tb_invalidate_phys_range_fast
+ 7.34% 0.05% qemu-system-ppc qemu-system-ppc64 [.] do_st4_mmu
+ 7.18% 0.02% qemu-system-ppc qemu-system-ppc64 [.] mmu_watch_or_dirty
+ 6.99% 6.99% qemu-system-ppc [JIT] tid 12410 [.] 0x00007f25fe7bba4b
+ 6.82% 6.82% qemu-system-ppc [JIT] tid 12410 [.] 0x00007f25fe7c6545
+ 6.01% 6.01% qemu-system-ppc [JIT] tid 12410 [.] 0x00007f25fe7bbac9
+ 5.94% 5.94% qemu-system-ppc [JIT] tid 12410 [.] 0x00007f25fe7bbb47
+ 5.90% 5.90% qemu-system-ppc [JIT] tid 12410 [.] 0x00007f25fe7bb968
+ 5.85% 0.00% qemu-system-ppc [JIT] tid 12410 [.] 0x00007f25d0625055
+ 5.45% 1.17% qemu-system-ppc qemu-system-ppc64 [.] page_collection_lock
+ 5.13% 5.13% qemu-system-ppc [JIT] tid 12410 [.] 0x00007f25fe7c654b
+ 5.08% 5.08% qemu-system-ppc [JIT] tid 12410 [.] 0x00007f25fe71f74b
+ 5.07% 5.07% qemu-system-ppc [JIT] tid 12410 [.] 0x00007f25fe7c624f
+ 5.05% 5.05% qemu-system-ppc [JIT] tid 12410 [.] 0x00007f25fe7c6249
+ 4.93% 4.93% qemu-system-ppc [JIT] tid 12410 [.] 0x00007f25fe71f740
+ 4.64% 4.64% qemu-system-ppc [JIT] tid 12410 [.] 0x00007f25fe71f890
+ 4.49% 4.49% qemu-system-ppc [JIT] tid 12410 [.] 0x00007f25fe71f885
+ 4.05% 1.51% qemu-system-ppc qemu-system-ppc64 [.] page_trylock_add
+ 3.64% 3.62% qemu-system-ppc qemu-system-ppc64 [.] helper_VPERM
+ 2.43% 1.40% qemu-system-ppc qemu-system-ppc64 [.] probe_access
+ 2.16% 0.51% qemu-system-ppc libglib-2.0.so.0.7600.3 [.] g_tree_lookup
+ 2.09% 0.00% qemu-system-ppc qemu-system-ppc64 [.] cpu_loop_exit_restore
+ 1.66% 0.06% qemu-system-ppc qemu-system-ppc64 [.] helper_store_msr
+ 1.61% 0.12% qemu-system-ppc qemu-system-ppc64 [.] hreg_store_msr
+ 1.52% 1.52% qemu-system-ppc qemu-system-ppc64 [.] tb_invalidate_phys_page_range__locked.constprop.0
+ 1.49% 0.05% qemu-system-ppc qemu-system-ppc64 [.] dcbz_common
The times with 100 iterations were:
mapping 0x80800000
src 0xb773a008 dst 0xb7638000
byte loop: 6.49 sec
memset: 0.44 sec
memcpy: 1.6 sec
copyToVRAMNoAltivec: 0.8 sec
copyToVRAMAltivec: 0.88 sec
copyFromVRAMNoAltivec: 8.15 sec
copyFromVRAMAltivec: 8.41 sec
(previous results were with 10000 iterations but I did not rerun that now,
I assume we can roughly take 100 times these results to compare to that.
Then this may be even slower with qemu-system-ppc64, which could be because some
code is compiled out without TARGET_PPC64 defined.)
I'll try to investigate more but I'm still quite lost.
Regards,
BALATON Zoltan
On Tue, 29 Apr 2025, Alex Bennée wrote: > BALATON Zoltan <balaton@eik.bme.hu> writes: >> On Mon, 28 Apr 2025, BALATON Zoltan wrote: >>> On Mon, 28 Apr 2025, BALATON Zoltan wrote: >>>> On Thu, 24 Apr 2025, BALATON Zoltan wrote: >>>>>> The test case I've used came out of a discussion about very slow >>>>>> access to VRAM of a graphics card passed through with vfio the reason >>>>>> for which is still not clear but it was already known that dcbz is >>>>>> often used by MacOS and AmigaOS for clearing memory and to avoid >>>>>> reading values about to be overwritten which is faster on real CPU but >>>>>> was found to be slower on QEMU. The optimised copy routines were >>>>>> posted here: > <snip> >> >> I have tried profiling the dst in real card vfio vram with dcbz case >> (with 100 iterations instead of 10000 in above tests) but I'm not sure >> I understand the results. vperm and dcbz show up but not too high. Can >> somebody explain what is happening here and where the overhead likely >> comes from? Here is the profile result I got: >> >> Samples: 104K of event 'cycles:Pu', Event count (approx.): 122371086557 >> Children Self Command Shared Object Symbol >> - 99.44% 0.95% qemu-system-ppc qemu-system-ppc [.] cpu_exec_loop >> - 98.49% cpu_exec_loop >> - 98.48% cpu_tb_exec >> - 90.95% 0x7f4e705d8f15 >> helper_ldub_mmu >> do_ld_mmio_beN >> - cpu_io_recompile > > This looks like the dbz instructions are being used to clear device > memory and tripping over the can_do_io check (normally the translator > tries to ensure all device access is at the end of a block). If you look at the benchmark results I posted earlier in this thread in https://lists.nongnu.org/archive/html/qemu-ppc/2025-04/msg00326.html I also tried using dcba instead of dcbz in the CopyFromVRAM* functions but that only helped very little so not sure it's because of dcbz. Then I thought it might be VPERM but the NoAltivec variants are also only a little faster. 
It could be that using 64 bit access instead of 128 bit (the NoAltivec functions use FPU regs) makes it slower while avoiding VPERM makes it faster which cancel each other but the profile also shows VPERM not high and somebody else also tested this with -cpu g3 and only got 1% faster result so maybe it's also not primarily because of VPERM but there's a bigger overhead before these.. > You could try ending the block on dbz instructions and seeing if that > helps. Normally I would expect the helper to be more efficient as it can > probe the whole address range once and then use host insns to blat the > memory. Maybe I could try that if I can do that the same way as done in io_prepare. >> - 45.79% cpu_loop_exit_noexc >> - cpu_loop_exit >> __longjmp_chk >> cpu_exec_setjmp >> - cpu_exec_loop >> - 45.78% cpu_tb_exec >> 42.35% 0x7f4e6f3f0000 >> - 0.72% 0x7f4e99f37037 >> helper_VPERM >> - 0.68% 0x7f4e99f3716d >> helper_VPERM >> - 45.16% rr_cpu_thread_fn > > Hmm you seem to be running in icount mode here for some reason. No idea why. I had no such options and complied without --enable-debug and nothing special on QEMU command just defaults options. How can I check if icount is enabled? Can profiling with perf tool interfere? I thought that only reads CPU performance counters and does not attach to the process otherwise. Regards, BALATON Zoltan >> - 45.16% tcg_cpu_exec >> - 45.15% cpu_exec >> - 45.15% cpu_exec_setjmp >> - cpu_exec_loop >> - 45.14% cpu_tb_exec >> 42.08% 0x7f4e6f3f0000 >> - 0.72% 0x7f4e99f37037 >> helper_VPERM >> - 0.67% 0x7f4e99f3716d >> helper_VPERM > <snip> > >
On 4/28/25 06:26, BALATON Zoltan wrote: > I have tried profiling the dst in real card vfio vram with dcbz case (with 100 iterations > instead of 10000 in above tests) but I'm not sure I understand the results. vperm and dcbz > show up but not too high. Can somebody explain what is happening here and where the > overhead likely comes from? Here is the profile result I got: > > Samples: 104K of event 'cycles:Pu', Event count (approx.): 122371086557 > Children Self Command Shared Object Symbol > - 99.44% 0.95% qemu-system-ppc qemu-system-ppc [.] cpu_exec_loop > - 98.49% cpu_exec_loop > - 98.48% cpu_tb_exec > - 90.95% 0x7f4e705d8f15 > helper_ldub_mmu > do_ld_mmio_beN > - cpu_io_recompile > - 45.79% cpu_loop_exit_noexc I think the real problem is the number of loop exits due to i/o. If I'm reading this rightly, 45% of execution is in cpu_io_recompile. I/O can only happen as the last insn of a translation block. When we detect that it has happened in the middle of a translation block, we abort the block, compile a new one, and restart execution. Where this becomes a bottleneck is when this same translation block is in a loop. Exactly this case of memset/memcpy of VRAM. This could be addressed by invalidating the previous translation block and creating a new one which always ends with the i/o. r~
On Mon, 28 Apr 2025, Richard Henderson wrote: > On 4/28/25 06:26, BALATON Zoltan wrote: >> I have tried profiling the dst in real card vfio vram with dcbz case (with >> 100 iterations instead of 10000 in above tests) but I'm not sure I >> understand the results. vperm and dcbz show up but not too high. Can >> somebody explain what is happening here and where the overhead likely comes >> from? Here is the profile result I got: >> >> Samples: 104K of event 'cycles:Pu', Event count (approx.): 122371086557 >> Children Self Command Shared Object Symbol >> - 99.44% 0.95% qemu-system-ppc qemu-system-ppc [.] >> cpu_exec_loop >> - 98.49% cpu_exec_loop >> - 98.48% cpu_tb_exec >> - 90.95% 0x7f4e705d8f15 >> helper_ldub_mmu >> do_ld_mmio_beN >> - cpu_io_recompile >> - 45.79% cpu_loop_exit_noexc > > I think the real problem is the number of loop exits due to i/o. If I'm > reading this rightly, 45% of execution is in cpu_io_recompile. > > I/O can only happen as the last insn of a translation block. I'm not sure I understand this. A comment above cpu_io_recompile says "In deterministic execution mode, instructions doing device I/Os must be at the end of the TB." Is that wrong? Otherwise shouldn't this only apply if running with icount or something like that? > When we detect > that it has happened in the middle of a translation block, we abort the > block, compile a new one, and restart execution. Where does that happen? The calls of cpu_io_recompile in this case seem to come from io_prepare which is called from do_ld16_mmio_beN if (!cpu->neg.can_do_io) but I don't see how can_do_io is set. > Where this becomes a bottleneck is when this same translation block is in a > loop. Exactly this case of memset/memcpy of VRAM. This could be addressed > by invalidating the previous translation block and creating a new one which > always ends with the i/o. And where to do that? cpu_io_recompile just exits the TB but what generates the new TB? 
I need some more clues to understand how to do this. Regards, BALATON Zoltan
BALATON Zoltan <balaton@eik.bme.hu> writes: > On Mon, 28 Apr 2025, Richard Henderson wrote: >> On 4/28/25 06:26, BALATON Zoltan wrote: >>> I have tried profiling the dst in real card vfio vram with dcbz >>> case (with 100 iterations instead of 10000 in above tests) but I'm >>> not sure I understand the results. vperm and dcbz show up but not >>> too high. Can somebody explain what is happening here and where the >>> overhead likely comes from? Here is the profile result I got: >>> Samples: 104K of event 'cycles:Pu', Event count (approx.): >>> 122371086557 >>> Children Self Command Shared Object Symbol >>> - 99.44% 0.95% qemu-system-ppc qemu-system-ppc [.] >>> cpu_exec_loop >>> - 98.49% cpu_exec_loop >>> - 98.48% cpu_tb_exec >>> - 90.95% 0x7f4e705d8f15 >>> helper_ldub_mmu >>> do_ld_mmio_beN >>> - cpu_io_recompile >>> - 45.79% cpu_loop_exit_noexc >> >> I think the real problem is the number of loop exits due to i/o. If >> I'm reading this rightly, 45% of execution is in cpu_io_recompile. >> >> I/O can only happen as the last insn of a translation block. > > I'm not sure I understand this. A comment above cpu_io_recompile says > "In deterministic execution mode, instructions doing device I/Os must > be at the end of the TB." Is that wrong? Otherwise shouldn't this only > apply if running with icount or something like that? That comment should be fixed. It used to only be the case for icount mode but there was another race bug that meant we need to honour device access as the last insn for both modes. > >> When we detect that it has happened in the middle of a translation >> block, we abort the block, compile a new one, and restart execution. > > Where does that happen? The calls of cpu_io_recompile in this case > seem to come from io_prepare which is called from do_ld16_mmio_beN if > (!cpu->neg.can_do_io) but I don't see how can_do_io is set. Inline by set_can_do_io() >> Where this becomes a bottleneck is when this same translation block >> is in a loop. 
Exactly this case of memset/memcpy of VRAM. This >> could be addressed by invalidating the previous translation block >> and creating a new one which always ends with the i/o. > > And where to do that? cpu_io_recompile just exits the TB but what > generates the new TB? I need some more clues to understands how to do > this. cpu->cflags_next_tb = curr_cflags(cpu) | CF_MEMI_ONLY | CF_NOIRQ | n; sets the cflags for the next cb, which typically will fail to find and then regenerate. Normally cflags_next_tb is empty. > > Regards, > BALATON Zoltan -- Alex Bennée Virtualisation Tech Lead @ Linaro
On Tue, 29 Apr 2025, Alex Bennée wrote: > BALATON Zoltan <balaton@eik.bme.hu> writes: >> On Mon, 28 Apr 2025, Richard Henderson wrote: >>> On 4/28/25 06:26, BALATON Zoltan wrote: >>>> I have tried profiling the dst in real card vfio vram with dcbz >>>> case (with 100 iterations instead of 10000 in above tests) but I'm >>>> not sure I understand the results. vperm and dcbz show up but not >>>> too high. Can somebody explain what is happening here and where the >>>> overhead likely comes from? Here is the profile result I got: >>>> Samples: 104K of event 'cycles:Pu', Event count (approx.): >>>> 122371086557 >>>> Children Self Command Shared Object Symbol >>>> - 99.44% 0.95% qemu-system-ppc qemu-system-ppc [.] >>>> cpu_exec_loop >>>> - 98.49% cpu_exec_loop >>>> - 98.48% cpu_tb_exec >>>> - 90.95% 0x7f4e705d8f15 >>>> helper_ldub_mmu >>>> do_ld_mmio_beN >>>> - cpu_io_recompile >>>> - 45.79% cpu_loop_exit_noexc >>> >>> I think the real problem is the number of loop exits due to i/o. If >>> I'm reading this rightly, 45% of execution is in cpu_io_recompile. >>> >>> I/O can only happen as the last insn of a translation block. >> >> I'm not sure I understand this. A comment above cpu_io_recompile says >> "In deterministic execution mode, instructions doing device I/Os must >> be at the end of the TB." Is that wrong? Otherwise shouldn't this only >> apply if running with icount or something like that? > > That comment should be fixed. It used to only be the case for icount > mode but there was another race bug that meant we need to honour device > access as the last insn for both modes. > >> >>> When we detect that it has happened in the middle of a translation >>> block, we abort the block, compile a new one, and restart execution. >> >> Where does that happen? The calls of cpu_io_recompile in this case >> seem to come from io_prepare which is called from do_ld16_mmio_beN if >> (!cpu->neg.can_do_io) but I don't see how can_do_io is set. 
> > Inline by set_can_do_io() That one I've found but don't know where the cpu_loop_exit returns from the end of cpu_io_recompile. >>> Where this becomes a bottleneck is when this same translation block >>> is in a loop. Exactly this case of memset/memcpy of VRAM. This >>> could be addressed by invalidating the previous translation block >>> and creating a new one which always ends with the i/o. >> >> And where to do that? cpu_io_recompile just exits the TB but what >> generates the new TB? I need some more clues to understands how to do >> this. > > cpu->cflags_next_tb = curr_cflags(cpu) | CF_MEMI_ONLY | CF_NOIRQ | n; > > sets the cflags for the next cb, which typically will fail to find and > then regenerate. Normally cflags_next_tb is empty. Shouldn't this only regenerate the next TB on the first loop iteration and not afterwards? Regards, BALATON Zoltan
BALATON Zoltan <balaton@eik.bme.hu> writes: > On Tue, 29 Apr 2025, Alex Bennée wrote: >> BALATON Zoltan <balaton@eik.bme.hu> writes: >>> On Mon, 28 Apr 2025, Richard Henderson wrote: >>>> On 4/28/25 06:26, BALATON Zoltan wrote: >>>>> I have tried profiling the dst in real card vfio vram with dcbz >>>>> case (with 100 iterations instead of 10000 in above tests) but I'm >>>>> not sure I understand the results. vperm and dcbz show up but not >>>>> too high. Can somebody explain what is happening here and where the >>>>> overhead likely comes from? Here is the profile result I got: >>>>> Samples: 104K of event 'cycles:Pu', Event count (approx.): >>>>> 122371086557 >>>>> Children Self Command Shared Object Symbol >>>>> - 99.44% 0.95% qemu-system-ppc qemu-system-ppc [.] >>>>> cpu_exec_loop >>>>> - 98.49% cpu_exec_loop >>>>> - 98.48% cpu_tb_exec >>>>> - 90.95% 0x7f4e705d8f15 >>>>> helper_ldub_mmu >>>>> do_ld_mmio_beN >>>>> - cpu_io_recompile >>>>> - 45.79% cpu_loop_exit_noexc >>>> >>>> I think the real problem is the number of loop exits due to i/o. If >>>> I'm reading this rightly, 45% of execution is in cpu_io_recompile. >>>> >>>> I/O can only happen as the last insn of a translation block. >>> >>> I'm not sure I understand this. A comment above cpu_io_recompile says >>> "In deterministic execution mode, instructions doing device I/Os must >>> be at the end of the TB." Is that wrong? Otherwise shouldn't this only >>> apply if running with icount or something like that? >> >> That comment should be fixed. It used to only be the case for icount >> mode but there was another race bug that meant we need to honour device >> access as the last insn for both modes. >> >>> >>>> When we detect that it has happened in the middle of a translation >>>> block, we abort the block, compile a new one, and restart execution. >>> >>> Where does that happen? 
The calls of cpu_io_recompile in this case >>> seem to come from io_prepare which is called from do_ld16_mmio_beN if >>> (!cpu->neg.can_do_io) but I don't see how can_do_io is set. >> >> Inline by set_can_do_io() > > That one I've found but don't know where the cpu_loop_exit returns > from the end of cpu_io_recompile. cpu_loop_exit longjmp's back to the top of the execution loop. > >>>> Where this becomes a bottleneck is when this same translation block >>>> is in a loop. Exactly this case of memset/memcpy of VRAM. This >>>> could be addressed by invalidating the previous translation block >>>> and creating a new one which always ends with the i/o. >>> >>> And where to do that? cpu_io_recompile just exits the TB but what >>> generates the new TB? I need some more clues to understands how to do >>> this. >> >> cpu->cflags_next_tb = curr_cflags(cpu) | CF_MEMI_ONLY | CF_NOIRQ | n; >> >> sets the cflags for the next cb, which typically will fail to find and >> then regenerate. Normally cflags_next_tb is empty. > > Shouldn't this only regenerate the next TB on the first loop iteration > and not afterwards? if we've been here before (needing n insn from the base addr) we will have a cached translation we can re-use. It doesn't stop the longer TB being called again as we re-enter a loop. > > Regards, > BALATON Zoltan -- Alex Bennée Virtualisation Tech Lead @ Linaro
On Tue, 29 Apr 2025, Alex Bennée wrote: > BALATON Zoltan <balaton@eik.bme.hu> writes: >> On Tue, 29 Apr 2025, Alex Bennée wrote: >>> BALATON Zoltan <balaton@eik.bme.hu> writes: >>>> On Mon, 28 Apr 2025, Richard Henderson wrote: >>>>> On 4/28/25 06:26, BALATON Zoltan wrote: >>>>>> I have tried profiling the dst in real card vfio vram with dcbz >>>>>> case (with 100 iterations instead of 10000 in above tests) but I'm >>>>>> not sure I understand the results. vperm and dcbz show up but not >>>>>> too high. Can somebody explain what is happening here and where the >>>>>> overhead likely comes from? Here is the profile result I got: >>>>>> Samples: 104K of event 'cycles:Pu', Event count (approx.): >>>>>> 122371086557 >>>>>> Children Self Command Shared Object Symbol >>>>>> - 99.44% 0.95% qemu-system-ppc qemu-system-ppc [.] >>>>>> cpu_exec_loop >>>>>> - 98.49% cpu_exec_loop >>>>>> - 98.48% cpu_tb_exec >>>>>> - 90.95% 0x7f4e705d8f15 >>>>>> helper_ldub_mmu >>>>>> do_ld_mmio_beN >>>>>> - cpu_io_recompile >>>>>> - 45.79% cpu_loop_exit_noexc >>>>> >>>>> I think the real problem is the number of loop exits due to i/o. If >>>>> I'm reading this rightly, 45% of execution is in cpu_io_recompile. >>>>> >>>>> I/O can only happen as the last insn of a translation block. >>>> >>>> I'm not sure I understand this. A comment above cpu_io_recompile says >>>> "In deterministic execution mode, instructions doing device I/Os must >>>> be at the end of the TB." Is that wrong? Otherwise shouldn't this only >>>> apply if running with icount or something like that? >>> >>> That comment should be fixed. It used to only be the case for icount >>> mode but there was another race bug that meant we need to honour device >>> access as the last insn for both modes. >>> >>>> >>>>> When we detect that it has happened in the middle of a translation >>>>> block, we abort the block, compile a new one, and restart execution. >>>> >>>> Where does that happen? 
The calls of cpu_io_recompile in this case >>>> seem to come from io_prepare which is called from do_ld16_mmio_beN if >>>> (!cpu->neg.can_do_io) but I don't see how can_do_io is set. >>> >>> Inline by set_can_do_io() >> >> That one I've found but don't know where the cpu_loop_exit returns >> from the end of cpu_io_recompile. > > cpu_loop_exit longjmp's back to the top of the execution loop. > >> >>>>> Where this becomes a bottleneck is when this same translation block >>>>> is in a loop. Exactly this case of memset/memcpy of VRAM. This >>>>> could be addressed by invalidating the previous translation block >>>>> and creating a new one which always ends with the i/o. >>>> >>>> And where to do that? cpu_io_recompile just exits the TB but what >>>> generates the new TB? I need some more clues to understands how to do >>>> this. >>> >>> cpu->cflags_next_tb = curr_cflags(cpu) | CF_MEMI_ONLY | CF_NOIRQ | n; >>> >>> sets the cflags for the next cb, which typically will fail to find and >>> then regenerate. Normally cflags_next_tb is empty. >> >> Shouldn't this only regenerate the next TB on the first loop iteration >> and not afterwards? > > if we've been here before (needing n insn from the base addr) we will > have a cached translation we can re-use. It doesn't stop the longer TB > being called again as we re-enter a loop. So then maybe it should at least check if there's already a cached TB where it can continue before calling cpu_io_recompile in io_prepare and only recompile if needed? I was thinking maybe we need a flag or counter to see if cpu_io_recompile is called more than once and after a limit invalidate the TB and create two new ones the first ending at the I/O and then what cpu_io_recompile does now which as I understood was what Richard suggested but I don't know how to do that. Regards, BALATON Zoltan
On Wed Apr 30, 2025 at 7:09 AM AEST, BALATON Zoltan wrote: > On Tue, 29 Apr 2025, Alex Bennée wrote: >> BALATON Zoltan <balaton@eik.bme.hu> writes: >>> On Tue, 29 Apr 2025, Alex Bennée wrote: >>>> BALATON Zoltan <balaton@eik.bme.hu> writes: >>>>> On Mon, 28 Apr 2025, Richard Henderson wrote: >>>>>> On 4/28/25 06:26, BALATON Zoltan wrote: >>>>>>> I have tried profiling the dst in real card vfio vram with dcbz >>>>>>> case (with 100 iterations instead of 10000 in above tests) but I'm >>>>>>> not sure I understand the results. vperm and dcbz show up but not >>>>>>> too high. Can somebody explain what is happening here and where the >>>>>>> overhead likely comes from? Here is the profile result I got: >>>>>>> Samples: 104K of event 'cycles:Pu', Event count (approx.): >>>>>>> 122371086557 >>>>>>> Children Self Command Shared Object Symbol >>>>>>> - 99.44% 0.95% qemu-system-ppc qemu-system-ppc [.] >>>>>>> cpu_exec_loop >>>>>>> - 98.49% cpu_exec_loop >>>>>>> - 98.48% cpu_tb_exec >>>>>>> - 90.95% 0x7f4e705d8f15 >>>>>>> helper_ldub_mmu >>>>>>> do_ld_mmio_beN >>>>>>> - cpu_io_recompile >>>>>>> - 45.79% cpu_loop_exit_noexc >>>>>> >>>>>> I think the real problem is the number of loop exits due to i/o. If >>>>>> I'm reading this rightly, 45% of execution is in cpu_io_recompile. >>>>>> >>>>>> I/O can only happen as the last insn of a translation block. >>>>> >>>>> I'm not sure I understand this. A comment above cpu_io_recompile says >>>>> "In deterministic execution mode, instructions doing device I/Os must >>>>> be at the end of the TB." Is that wrong? Otherwise shouldn't this only >>>>> apply if running with icount or something like that? >>>> >>>> That comment should be fixed. It used to only be the case for icount >>>> mode but there was another race bug that meant we need to honour device >>>> access as the last insn for both modes. 
>>>> >>>>> >>>>>> When we detect that it has happened in the middle of a translation >>>>>> block, we abort the block, compile a new one, and restart execution. >>>>> >>>>> Where does that happen? The calls of cpu_io_recompile in this case >>>>> seem to come from io_prepare which is called from do_ld16_mmio_beN if >>>>> (!cpu->neg.can_do_io) but I don't see how can_do_io is set. >>>> >>>> Inline by set_can_do_io() >>> >>> That one I've found but don't know where the cpu_loop_exit returns >>> from the end of cpu_io_recompile. >> >> cpu_loop_exit longjmp's back to the top of the execution loop. >> >>> >>>>>> Where this becomes a bottleneck is when this same translation block >>>>>> is in a loop. Exactly this case of memset/memcpy of VRAM. This >>>>>> could be addressed by invalidating the previous translation block >>>>>> and creating a new one which always ends with the i/o. >>>>> >>>>> And where to do that? cpu_io_recompile just exits the TB but what >>>>> generates the new TB? I need some more clues to understands how to do >>>>> this. >>>> >>>> cpu->cflags_next_tb = curr_cflags(cpu) | CF_MEMI_ONLY | CF_NOIRQ | n; >>>> >>>> sets the cflags for the next cb, which typically will fail to find and >>>> then regenerate. Normally cflags_next_tb is empty. >>> >>> Shouldn't this only regenerate the next TB on the first loop iteration >>> and not afterwards? >> >> if we've been here before (needing n insn from the base addr) we will >> have a cached translation we can re-use. It doesn't stop the longer TB >> being called again as we re-enter a loop. > > So then maybe it should at least check if there's already a cached TB > where it can continue before calling cpu_io_recompile in io_prepare and > only recompile if needed? It basically does do that AFAIKS. cpu_io_recompile() name is misleading it does not cause a recompile, it just updates cflags and exits. Next entry will look up TB that has just 1 insn and enter that. 
> I was thinking maybe we need a flag or counter > to see if cpu_io_recompile is called more than once and after a limit > invalidate the TB and create two new ones the first ending at the I/O and > then what cpu_io_recompile does now which as I understood was what Richard > suggested but I don't know how to do that. memset/cpy routines had kind of the same problem with real hardware. They wanted to use vector instructions for best performance, but when those are used on MMIO they would trap and be very slow. Problem is we don't know ahead of time if some routine will access MMIO or not. You could recompile it with fewer instructions but then it will be slow when used for regular memory. Heuristics are tough because you could have e.g., one initial big memset to clear an MMIO region that iterates many times over inner loop of dcbz instructions, but then is never used again for MMIO but important for regular page clearing. Making something that dynamically decays or periodically recompiles back to the non-IO case could work, perhaps, but then complexity goes up. I would prefer not to do that just for a microbenchmark, but if you think it is a reasonable overall win for average workloads of your users then perhaps. Thanks, Nick
On Wed, 30 Apr 2025, Nicholas Piggin wrote: > On Wed Apr 30, 2025 at 7:09 AM AEST, BALATON Zoltan wrote: >> On Tue, 29 Apr 2025, Alex Bennée wrote: >>> BALATON Zoltan <balaton@eik.bme.hu> writes: >>>> On Tue, 29 Apr 2025, Alex Bennée wrote: >>>>> BALATON Zoltan <balaton@eik.bme.hu> writes: >>>>>> On Mon, 28 Apr 2025, Richard Henderson wrote: >>>>>>> On 4/28/25 06:26, BALATON Zoltan wrote: >>>>>>>> I have tried profiling the dst in real card vfio vram with dcbz >>>>>>>> case (with 100 iterations instead of 10000 in above tests) but I'm >>>>>>>> not sure I understand the results. vperm and dcbz show up but not >>>>>>>> too high. Can somebody explain what is happening here and where the >>>>>>>> overhead likely comes from? Here is the profile result I got: >>>>>>>> Samples: 104K of event 'cycles:Pu', Event count (approx.): >>>>>>>> 122371086557 >>>>>>>> Children Self Command Shared Object Symbol >>>>>>>> - 99.44% 0.95% qemu-system-ppc qemu-system-ppc [.] >>>>>>>> cpu_exec_loop >>>>>>>> - 98.49% cpu_exec_loop >>>>>>>> - 98.48% cpu_tb_exec >>>>>>>> - 90.95% 0x7f4e705d8f15 >>>>>>>> helper_ldub_mmu >>>>>>>> do_ld_mmio_beN >>>>>>>> - cpu_io_recompile >>>>>>>> - 45.79% cpu_loop_exit_noexc >>>>>>> >>>>>>> I think the real problem is the number of loop exits due to i/o. If >>>>>>> I'm reading this rightly, 45% of execution is in cpu_io_recompile. >>>>>>> >>>>>>> I/O can only happen as the last insn of a translation block. >>>>>> >>>>>> I'm not sure I understand this. A comment above cpu_io_recompile says >>>>>> "In deterministic execution mode, instructions doing device I/Os must >>>>>> be at the end of the TB." Is that wrong? Otherwise shouldn't this only >>>>>> apply if running with icount or something like that? >>>>> >>>>> That comment should be fixed. It used to only be the case for icount >>>>> mode but there was another race bug that meant we need to honour device >>>>> access as the last insn for both modes. 
>>>>> >>>>>> >>>>>>> When we detect that it has happened in the middle of a translation >>>>>>> block, we abort the block, compile a new one, and restart execution. >>>>>> >>>>>> Where does that happen? The calls of cpu_io_recompile in this case >>>>>> seem to come from io_prepare which is called from do_ld16_mmio_beN if >>>>>> (!cpu->neg.can_do_io) but I don't see how can_do_io is set. >>>>> >>>>> Inline by set_can_do_io() >>>> >>>> That one I've found but don't know where the cpu_loop_exit returns >>>> from the end of cpu_io_recompile. >>> >>> cpu_loop_exit longjmp's back to the top of the execution loop. >>> >>>> >>>>>>> Where this becomes a bottleneck is when this same translation block >>>>>>> is in a loop. Exactly this case of memset/memcpy of VRAM. This >>>>>>> could be addressed by invalidating the previous translation block >>>>>>> and creating a new one which always ends with the i/o. >>>>>> >>>>>> And where to do that? cpu_io_recompile just exits the TB but what >>>>>> generates the new TB? I need some more clues to understands how to do >>>>>> this. >>>>> >>>>> cpu->cflags_next_tb = curr_cflags(cpu) | CF_MEMI_ONLY | CF_NOIRQ | n; >>>>> >>>>> sets the cflags for the next cb, which typically will fail to find and >>>>> then regenerate. Normally cflags_next_tb is empty. >>>> >>>> Shouldn't this only regenerate the next TB on the first loop iteration >>>> and not afterwards? >>> >>> if we've been here before (needing n insn from the base addr) we will >>> have a cached translation we can re-use. It doesn't stop the longer TB >>> being called again as we re-enter a loop. >> >> So then maybe it should at least check if there's already a cached TB >> where it can continue before calling cpu_io_recompile in io_prepare and >> only recompile if needed? > > It basically does do that AFAIKS. cpu_io_recompile() name is misleading > it does not cause a recompile, it just updates cflags and exits. Next > entry will look up TB that has just 1 insn and enter that. 
After reading it I came to the same conclusion but then I don't understand what causes the problem. Is it just that it will exit the loop for every IO to look up the recompiled TB? It looks like it tries to chain TBs, why does that not work here? >> I was thinking maybe we need a flag or counter >> to see if cpu_io_recompile is called more than once and after a limit >> invalidate the TB and create two new ones the first ending at the I/O and >> then what cpu_io_recompile does now which as I understood was what Richard >> suggested but I don't know how to do that. > > memset/cpy routines had kind of the same problem with real hardware. > They wanted to use vector instructions for best performance, but when > those are used on MMIO they would trap and be very slow. Why do those trap on MMIO on a real machine? These routines were tested on real machines and the reasoning to use the widest possible access was that PCI transfer has overhead and that is minimised by transferring more bits in one op. I think they also verified that it works at least for the 32 bit CPUs up to G4 that were used on real AmigaNG machines. There are some benchmark results here: https://hdrlab.org.nz/benchmark/gfxbench2d/OS/AmigaOS?start=60 which is also where the benchmark I used comes from so this should be similar. I think the MemCopy on that page has plain unoptimised copy as Copy to/from VRAM and optimised routines similar to this benchmark as Read/Write Pixel Array, but it's not easy to search. Some of the machines like Pegasos II and AmigaOne XE were made with either G3 or G4 CPUs so if I find a result from those with the same graphics card that could show if AltiVec is faster (although the G4s were also higher clock so not directly comparable). Some results there are also from QEMU, mostly those that are with SiliconMotion 502 but that does not have this problem only vfio-pci pass through. So maybe it's something with how vfio-pci maps PCI memory BARs? 
> Problem is we don't know ahead of time if some routine will access > MMIO or not. You could recompile it with fewer instructions but then > it will be slow when used for regular memory. > > Heuristics are tough because you could have e.g., one initial big > memset to clear a MMIO region that iterates many times over inner > loop of dcbz instructions, but then is never used again for MMIO but > important for regular page clearing. Making something that dynamically > decays or periodically would recompile to non-IO case perhaps, but > then complexity goes up. > > I would prefer not like to do that just for a microbenchmark, but if > you think it is reasonable overall win for average workloads of your > users then perhaps. I'm still trying to understand what to optimise. So far it looks like that dcbz has the least impact, then vperm a bit bigger but still only about a few percent and the biggest impact is still not known for sure but we see faster access on real machines that run on slower PCIe (only 4x at best) while CPU benchmarks don't show slower performance on QEMU only accessing passed through card's VRAM is slower than expected. But if there's a trap involved I've found before that exceptions are slower with QEMU but I did not see evidence of that in the profile. Regards, BALATON Zoltan
BALATON Zoltan <balaton@eik.bme.hu> writes: > On Wed, 30 Apr 2025, Nicholas Piggin wrote: >> On Wed Apr 30, 2025 at 7:09 AM AEST, BALATON Zoltan wrote: >>> On Tue, 29 Apr 2025, Alex Bennée wrote: >>>> BALATON Zoltan <balaton@eik.bme.hu> writes: >>>>> On Tue, 29 Apr 2025, Alex Bennée wrote: >>>>>> BALATON Zoltan <balaton@eik.bme.hu> writes: >>>>>>> On Mon, 28 Apr 2025, Richard Henderson wrote: >>>>>>>> On 4/28/25 06:26, BALATON Zoltan wrote: <snip> >>>> >>>> if we've been here before (needing n insn from the base addr) we will >>>> have a cached translation we can re-use. It doesn't stop the longer TB >>>> being called again as we re-enter a loop. >>> >>> So then maybe it should at least check if there's already a cached TB >>> where it can continue before calling cpu_io_recompile in io_prepare and >>> only recompile if needed? >> >> It basically does do that AFAIKS. cpu_io_recompile() name is misleading >> it does not cause a recompile, it just updates cflags and exits. Next >> entry will look up TB that has just 1 insn and enter that. > > After reading it I came to the same conclusion but then I don't > understand what causes the problem. Is it just that it will exit the > loop for every IO to look up the recompiled TB? It looks like it tries > to chain TBs, why does that not work here? Any MMIO access has to come via the slow path. Any MMIO also currently has to be the last instruction in a block in case the operation triggers a change in the translation regime that needs to be picked up by the next instruction you execute. This is a pathological case when modelling VRAM on a device because its going to be slow either way. At least if you model the multiple byte access with a helper you can amortise some of the cost of the MMU lookup with a single probe_() call. 
>>> I was thinking maybe we need a flag or counter >>> to see if cpu_io_recompile is called more than once and after a limit >>> invalidate the TB and create two new ones the first ending at the I/O and >>> then what cpu_io_recompile does now which as I understood was what Richard >>> suggested but I don't know how to do that. >> >> memset/cpy routines had kind of the same problem with real hardware. >> They wanted to use vector instructions for best performance, but when >> those are used on MMIO they would trap and be very slow. > > Why do those trap on MMIO on real machine? These routines were tested > on real machines and the reasoning to use the widest possible access > was that PCI transfer has overhead and that is minimised by > transferring more bits in one op. I think they also verifed that it > works at least for the 32 bit CPUs up to G4 that were used on real > AmigaNG machines. There are some benchmark results here: > https://hdrlab.org.nz/benchmark/gfxbench2d/OS/AmigaOS?start=60 which > is also where the benchmark I used comes from so this should be > similar. I think the MemCopy on that page has plain unoptimised copy > as Copy to/from VRAM and optimised routines similar to this benchmark > as Read/Write Pixel Array, but it's not easy to search. Some of the > machines like Pegasos II and AmigaOne XE were made with both G3 or G4 > CPUs so if I find a result from those with same graphics card that > could show if AltiVec is faster (although the G4s were also higher > clock so not directly comparable). Some results there are also from > QEMU, mostly those that are with SiliconMotion 502 but that does not > have this problem only vfio-pci pass through. They don't - what we need is to have a RAM-like-device model for QEMU where we can relax the translation rules because we know we are writing to RAM like things that don't have registers or other state changing behaviour. 
The poor behaviour is because QEMU currently treats all MMIO as potentially system state altering where as for VRAM it doesn't need to. > So maybe it's something > with how vfio-pci maps PCI memory BARs? I don't know about vfio-pci but blob resources mapped via virtio-gpu just appear as chunks of RAM to the guest - hence no trapping. > >> Problem is we don't know ahead of time if some routine will access >> MMIO or not. You could recompile it with fewer instructions but then >> it will be slow when used for regular memory. >> >> Heuristics are tough because you could have e.g., one initial big >> memset to clear a MMIO region that iterates many times over inner >> loop of dcbz instructions, but then is never used again for MMIO but >> important for regular page clearing. Making something that dynamically >> decays or periodically would recompile to non-IO case perhaps, but >> then complexity goes up. We can't have heuristics when we must prioritise correctness. However we could expand the device model to make the exact behaviour of different devices clear and optimise when we know it is safe. >> I would prefer not like to do that just for a microbenchmark, but if >> you think it is reasonable overall win for average workloads of your >> users then perhaps. > > I'm still trying to understand what to optimise. So far it looks like > that dcbz has the least impact, then vperm a bit bigger but still only > about a few percent and the biggest impact is still not known for sure > but we see faster access on real machines that run on slower PCIe > (only 4x at best) while CPU benchmarks don't show slower performance > on QEMU only accessing passed through card's VRAM is slower than > expected. But if there's a trap involved I've found before that > exceptions are slower with QEMU but I did not see evidence of that in > the profile. > > Regards, > BALATON Zoltan -- Alex Bennée Virtualisation Tech Lead @ Linaro
On Wed, 30 Apr 2025, Alex Bennée wrote: > BALATON Zoltan <balaton@eik.bme.hu> writes: >> On Wed, 30 Apr 2025, Nicholas Piggin wrote: >>> On Wed Apr 30, 2025 at 7:09 AM AEST, BALATON Zoltan wrote: >>>> On Tue, 29 Apr 2025, Alex Bennée wrote: >>>>> BALATON Zoltan <balaton@eik.bme.hu> writes: >>>>>> On Tue, 29 Apr 2025, Alex Bennée wrote: >>>>>>> BALATON Zoltan <balaton@eik.bme.hu> writes: >>>>>>>> On Mon, 28 Apr 2025, Richard Henderson wrote: >>>>>>>>> On 4/28/25 06:26, BALATON Zoltan wrote: > <snip> >>>>> >>>>> if we've been here before (needing n insn from the base addr) we will >>>>> have a cached translation we can re-use. It doesn't stop the longer TB >>>>> being called again as we re-enter a loop. >>>> >>>> So then maybe it should at least check if there's already a cached TB >>>> where it can continue before calling cpu_io_recompile in io_prepare and >>>> only recompile if needed? >>> >>> It basically does do that AFAIKS. cpu_io_recompile() name is misleading >>> it does not cause a recompile, it just updates cflags and exits. Next >>> entry will look up TB that has just 1 insn and enter that. >> >> After reading it I came to the same conclusion but then I don't >> understand what causes the problem. Is it just that it will exit the >> loop for every IO to look up the recompiled TB? It looks like it tries >> to chain TBs, why does that not work here? > > Any MMIO access has to come via the slow path. Any MMIO also currently > has to be the last instruction in a block in case the operation triggers > a change in the translation regime that needs to be picked up by the > next instruction you execute. > > This is a pathological case when modelling VRAM on a device because its > going to be slow either way. At least if you model the multiple byte > access with a helper you can amortise some of the cost of the MMU lookup > with a single probe_() call. 
I think there is some mix up here because of all the different scenarios I benchmarked so let me try to clear that up. The goal is to find out why access to vfio-pci passed through graphics card VRAM is slower than expected when the host should be faster than those mostly embedded or old PPCs used on real machines with only 4x PCIe or PCIe to PCI bridges. In this case we are not emulating VRAM but mapping the framebuffer from the real card and access that. To find where the slow down comes from I've benchmarked all the cases upthread but here are the relevant parts again for easier comparison: First both src and dst are in RAM (just malloced buffers so this is the base line): src 0xb79c8008 dst 0xb78c7008 byte loop: 21.16 sec memset: 3.85 sec memcpy: 5.07 sec copyToVRAMNoAltivec: 2.52 sec copyToVRAMAltivec: 2.42 sec copyFromVRAMNoAltivec: 6.39 sec copyFromVRAMAltivec: 7.02 sec The FromVRAM cases use dcbz to avoid loading RAM contents to cache on real machine that is about to be overwritten so dcbz is never applied to MMIO. (Arguably it should use dcba but for some reason nobody remembers why it uses dcbz instead.) The ToVRAM cases have dcbt which is noop on QEMU. I guess the difference we see here is because of probe_access in dcbz as was shown by previous profiling. Replacing that with dcba (which is noop in QEMU) makes ToVRAM and FromVRAM run about the same (you can find that case in the original message). FromVRAM still a bit slower for some reason but most of this overhead can be attributed to dcbz. In second test dst is mmapped from emulated ati-vga framebuffer BAR. We can say we emulate vram here but that's just a ram memory region created in vga.c as: memory_region_init_ram_nomigrate(&s->vram, obj, "vga.vram", s->vram_size, &local_err); it also has dirty tracking enabled, I don't know if that has any effect. 
This is shown in left column here: dst in emulated ati-vga | dst in real card vfio vram mapping 0x80800000 mapping 0x80800000 src 0xb78e0008 dst 0xb77de000 | src 0xb7ec5008 dst 0xb7dc3000 byte loop: 21.2 sec | byte loop: 563.98 sec memset: 3.89 sec | memset: 39.25 sec memcpy: 5.07 sec | memcpy: 140.49 sec copyToVRAMNoAltivec: 2.53 sec | copyToVRAMNoAltivec: 72.03 sec copyToVRAMAltivec: 12.22 sec | copyToVRAMAltivec: 78.12 sec copyFromVRAMNoAltivec: 6.43 sec | copyFromVRAMNoAltivec: 728.52 sec copyFromVRAMAltivec: 35.33 sec | copyFromVRAMAltivec: 754.95 sec Here we see that AltiVec cases have additional overhead which I think is related to vperm as that's the only op that does not seem to be compiled to something sensible but calls an unoptimised helper (although that's also there for RAM so not sure why this is slower). But this shows no other overhead due to MMIO being involved as the NoAltivec cases are the same as with RAM. Last case, shown in right column above, is when instead of ati-vga I have a real ATI card passed through with vfio-pci which is much slower than what is explained only by PCI overhead and I'm trying to find out the source of that slow down. I've now also run 1000 iterations (vs. 10000 above so numbers are 10 times less here than above in right column) of the last case again (using real card with vfio-pci) with qemu-system-ppc vs. 
qemu-system-ppc64 to see if mttcg has any effect: 1000 iterations qemu-system-ppc | qemu-system-ppc64 mapping 0x80800000 mapping 0x80800000 src 0xb7dc6008 dst 0xb7cc4000 | src 0xb78b8008 dst 0xb77b6000 byte loop: 58.44 sec | byte loop: 57.72 sec memset: 3.99 sec | memset: 3.93 sec memcpy: 14.43 sec | memcpy: 14.24 sec copyToVRAMNoAltivec: 7.27 sec | copyToVRAMNoAltivec: 7.15 sec copyToVRAMAltivec: 7.9 sec | copyToVRAMAltivec: 7.78 sec copyFromVRAMNoAltivec: 72.68 sec | copyFromVRAMNoAltivec: 72.69 sec copyFromVRAMAltivec: 75.15 sec | copyFromVRAMAltivec: 75.05 sec This does not seem to have much effect so maybe not having mttcg does not enable icount just uses the same function which were confusing in the profile. Finally I dug up some comparable results from real machine vs QEMU. These are with QEMU with the default -cpu 7454 and -cpu g3 (to check AltiVec overhead but there seems to be only about 1%): https://hdrlab.org.nz/benchmark/gfxbench2d/OS/AmigaOS/Result/2939 https://hdrlab.org.nz/benchmark/gfxbench2d/OS/AmigaOS/Result/2941 and same card on real machine: https://hdrlab.org.nz/benchmark/gfxbench2d/OS/AmigaOS/Result/2414 It seems for larger rectangles we approach the same limits but smaller transfers (what I think VRAM copy also uses) have some big overhead compared to what PCIe communication alone explains. Another card on QEMU: https://hdrlab.org.nz/benchmark/gfxbench2d/OS/AmigaOS/Result/2931 and on real machine: https://hdrlab.org.nz/benchmark/gfxbench2d/OS/AmigaOS/Result/2372 or a similar card (I did not find exactly the same) with slower CPU real machine: https://hdrlab.org.nz/benchmark/gfxbench2d/OS/AmigaOS/Result/1672 Also on real machine using optimised routines does help so using wider transfers is better than default unoptimised case. 
>>>> I was thinking maybe we need a flag or counter >>>> to see if cpu_io_recompile is called more than once and after a limit >>>> invalidate the TB and create two new ones the first ending at the I/O and >>>> then what cpu_io_recompile does now which as I understood was what Richard >>>> suggested but I don't know how to do that. >>> >>> memset/cpy routines had kind of the same problem with real hardware. >>> They wanted to use vector instructions for best performance, but when >>> those are used on MMIO they would trap and be very slow. >> >> Why do those trap on MMIO on real machine? These routines were tested >> on real machines and the reasoning to use the widest possible access >> was that PCI transfer has overhead and that is minimised by >> transferring more bits in one op. I think they also verifed that it >> works at least for the 32 bit CPUs up to G4 that were used on real >> AmigaNG machines. There are some benchmark results here: >> https://hdrlab.org.nz/benchmark/gfxbench2d/OS/AmigaOS?start=60 which >> is also where the benchmark I used comes from so this should be >> similar. I think the MemCopy on that page has plain unoptimised copy >> as Copy to/from VRAM and optimised routines similar to this benchmark >> as Read/Write Pixel Array, but it's not easy to search. Some of the >> machines like Pegasos II and AmigaOne XE were made with both G3 or G4 >> CPUs so if I find a result from those with same graphics card that >> could show if AltiVec is faster (although the G4s were also higher >> clock so not directly comparable). Some results there are also from >> QEMU, mostly those that are with SiliconMotion 502 but that does not >> have this problem only vfio-pci pass through. > > They don't - what we need is to have a RAM-like-device model for QEMU > where we can relax the translation rules because we know we are writing > to RAM like things that don't have registers or other state changing > behaviour. 
> The poor behaviour is because QEMU currently treats all MMIO as > potentially system state altering where as for VRAM it doesn't need to. This does not seem to be the case with emulated ati-vga, and with vfio-pci it should also be mapped memory from the graphics card which technically is MMIO but how does QEMU decide that when it does not seem to consider ati-vga as IO? Typically in QEMU MMIO is an io memory region that goes through memops and that's understandably slow but here we should read/write mapped memory space. Maybe I should try to find out what vfio-pci actually does here but it is used for gaming with KVM and there people get near native performance so I don't think there is an overhead in vfio-pci. So I could explain some small overheads with dcbz and maybe vperm but the biggest one seems to only happen when accessing real card VRAM with vfio-pci that does not seem to happen on a real machine and I could not reproduce with emulated ati-vga either but that's all I could find out so far and I still don't get where the biggest overhead comes from. Regards, BALATON Zoltan >> So maybe it's something >> with how vfio-pci maps PCI memory BARs? > > I don't know about vfio-pci but blob resources mapped via virtio-gpu > just appear as chunks of RAM to the guest - hence no trapping. > >> >>> Problem is we don't know ahead of time if some routine will access >>> MMIO or not. You could recompile it with fewer instructions but then >>> it will be slow when used for regular memory. >>> >>> Heuristics are tough because you could have e.g., one initial big >>> memset to clear a MMIO region that iterates many times over inner >>> loop of dcbz instructions, but then is never used again for MMIO but >>> important for regular page clearing. Making something that dynamically >>> decays or periodically would recompile to non-IO case perhaps, but >>> then complexity goes up. > > We can't have heuristics when we must prioritise correctness. 
However we > could expand the device model to make the exact behaviour of different > devices clear and optimise when we know it is safe. > >>> I would prefer not like to do that just for a microbenchmark, but if >>> you think it is reasonable overall win for average workloads of your >>> users then perhaps. >> >> I'm still trying to understand what to optimise. So far it looks like >> that dcbz has the least impact, then vperm a bit bigger but still only >> about a few percent and the biggest impact is still not known for sure >> but we see faster access on real machines that run on slower PCIe >> (only 4x at best) while CPU benchmarks don't show slower performance >> on QEMU only accessing passed through card's VRAM is slower than >> expected. But if there's a trap involved I've found before that >> exceptions are slower with QEMU but I did not see evidence of that in >> the profile. >> >> Regards, >> BALATON Zoltan > >
© 2016 - 2025 Red Hat, Inc.