[RFC PATCH 0/1] TCTI TCG backend for acceleration on non-JIT AArch64
Posted by Joelle van Dyne 4 months, 3 weeks ago
The following patch is from work by Katherine Temkin adding a JITless backend
for aarch64. The aarch64-tcti TCG target uses pre-compiled "gadgets": snippets
of host code generated for every combination of TCG op and operands. At runtime
the gadgets are "threaded" together, with each gadget ending in a jump to the
next one. This gives a significant speedup over TCI but is still much slower
than JIT.
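
For illustration, each gadget is conceptually a tiny naked function whose body
is the TCG operation followed by a tail-jump to the next gadget. The sketch
below is simplified; the register assignments and naming are invented here and
do not match the generated code exactly:

    /* One gadget per (op, operand registers) combination. */
    __attribute__((naked)) static void gadget_add_x1_x1_x2(void)
    {
        asm volatile(
            "add x1, x1, x2     \n\t"   /* the TCG op itself */
            "ldr x27, [x28], #8 \n\t"   /* fetch the next gadget's address */
            "br  x27"                   /* and jump to it */
        );
    }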

This backend is mainly useful for iOS, which does not allow JIT in distributed
apps. We ported the feature from v5.2 to v10.0 but will rebase it to master if
there is interest. Would this backend be useful for mainline QEMU?

Joelle van Dyne (1):
  tcg/tcti: add TCTI TCG backend for acceleration on non-JIT AArch64

 docs/devel/tcg-tcti.rst                       | 1140 +++++++++
 meson.build                                   |   10 +
 include/accel/tcg/getpc.h                     |    6 +-
 include/disas/dis-asm.h                       |    1 +
 include/tcg/tcg-opc.h                         |   12 +
 include/tcg/tcg.h                             |    2 +-
 tcg/aarch64-tcti/tcg-target-con-set.h         |   32 +
 tcg/aarch64-tcti/tcg-target-con-str.h         |   20 +
 tcg/aarch64-tcti/tcg-target-has.h             |  132 +
 tcg/aarch64-tcti/tcg-target-mo.h              |   13 +
 tcg/aarch64-tcti/tcg-target-reg-bits.h        |   16 +
 tcg/aarch64-tcti/tcg-target.h                 |  107 +
 host/include/generic/host/atomic128-cas.h.inc |    3 +-
 tcg/aarch64-tcti/tcg-target-opc.h.inc         |   15 +
 accel/tcg/cputlb.c                            |    3 +-
 accel/tcg/tcg-accel-ops.c                     |    8 +
 tcg/optimize.c                                |    2 +
 tcg/region.c                                  |   11 +-
 tcg/tcg-op.c                                  |   27 +
 tcg/tcg.c                                     |   19 +-
 tcg/aarch64-tcti/tcg-target.c.inc             | 2250 +++++++++++++++++
 meson_options.txt                             |    2 +
 scripts/meson-buildoptions.sh                 |    5 +
 tcg/aarch64-tcti/tcti-gadget-gen.py           | 1192 +++++++++
 tcg/meson.build                               |   71 +-
 25 files changed, 5082 insertions(+), 17 deletions(-)
 create mode 100644 docs/devel/tcg-tcti.rst
 create mode 100644 tcg/aarch64-tcti/tcg-target-con-set.h
 create mode 100644 tcg/aarch64-tcti/tcg-target-con-str.h
 create mode 100644 tcg/aarch64-tcti/tcg-target-has.h
 create mode 100644 tcg/aarch64-tcti/tcg-target-mo.h
 create mode 100644 tcg/aarch64-tcti/tcg-target-reg-bits.h
 create mode 100644 tcg/aarch64-tcti/tcg-target.h
 create mode 100644 tcg/aarch64-tcti/tcg-target-opc.h.inc
 create mode 100644 tcg/aarch64-tcti/tcg-target.c.inc
 create mode 100755 tcg/aarch64-tcti/tcti-gadget-gen.py

-- 
2.41.0
Re: [RFC PATCH 0/1] TCTI TCG backend for acceleration on non-JIT AArch64
Posted by Richard Henderson 4 months, 3 weeks ago
On 7/22/25 10:42, Joelle van Dyne wrote:
> The following patch is from work by Katherine Temkin adding a JITless backend
> for aarch64. The aarch64-tcti TCG target uses pre-compiled "gadgets": snippets
> of host code generated for every combination of TCG op and operands. At runtime
> the gadgets are "threaded" together, with each gadget ending in a jump to the
> next one. This gives a significant speedup over TCI but is still much slower
> than JIT.
> 
> This backend is mainly useful for iOS, which does not allow JIT in distributed
> apps. We ported the feature from v5.2 to v10.0 but will rebase it to master if
> there is interest. Would this backend be useful for mainline QEMU?

That's certainly interesting.

(1) The generated gadget code is excessively large: 75MiB of code, 18MiB of tables.

$ ld -r -o z.o *gadgets.c.o && size z.o

     text      data
74556216  18358784

Have you done experiments with the number of available registers to see at what point you 
get diminishing returns?  The combinatorics on 16 registers is not in your favor.

I would expect you could make do with e.g. 6 writable registers, plus SP and ENV.  Note 
that SP and ENV may be read in controlled ways, but not written.  You would never generate 
a gadget that writes to either.  They need not be accessed in other ways, such as the 
source of a multiply.
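
To put rough numbers on it, for a three-operand gadget:

    16 interchangeable registers:   16 * 16 * 16 = 4096 variants per op
    6 writable + SP/ENV readable:    6 *  8 *  8 =  384 variants per op

i.e. roughly a tenfold reduction before any of the other suggestions below.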

Have you done experiments with 2-operand matching constraints?
I.e. "d*=m" instead of "d=n*m".
That will also dramatically affect combinatorics.
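
(With a matching constraint a binary op only needs gadgets for the destination
and the one non-matching source, i.e. on the order of N^2 rather than N^3 per
op, at the cost of the register allocator occasionally inserting an extra mov.)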

Have you experimented with, for the most part, generating *only* 64-bit code?  For many 
operations, the 32-bit operation can be implemented by the 64-bit instruction simply by 
ignoring the high 32 bits of the output.  E.g. add, sub, mul, logicals, left-shift.  That 
would avoid repeating some gadgets for _i32 and _i64.
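
For instance, the low 32 bits of "add x0, x1, x2" are identical to the result
of "add w0, w1, w2", so one 64-bit gadget can serve both add_i32 and add_i64,
provided nothing relies on the high half of the i32 result; only ops where the
upper bits bleed into the lower ones (right shifts, division, comparisons,
etc.) still need separate 32-bit variants.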

(2) The use of __attribute__((naked)) means this is clang-specific.

I'm really not sure why you're generating C and using naked, as opposed to simply 
generating assembly directly.  By generating assembly, you could also emit the correct 
unwind info for the gadgets.  Or, one set of unwind info for *all* of the gadgets.
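
A rough sketch of what directly-generated assembly could look like (label and
register choices invented here; the unwind directives are only indicative):

    .text
    .p2align 4
    gadget_add_x1_x1_x2:
        .cfi_startproc
        add     x1, x1, x2
        ldr     x27, [x28], #8
        br      x27
        .cfi_endproc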

I'll also note that it takes up to 5 minutes for clang to compile some of these files, so 
using the assembler instead of the compiler would help with that as well.

(3) The tables of gadgets are writable, placed in .data.

With C, it's trivial to place the "const" correctly so that these tables land 
in mostly-read-only memory (.data.rel.ro; constant after runtime relocation).

With assembly, it's trivial to generate 32-bit pc-relative offsets instead of raw 
pointers, which would allow truly read-only memory (.rodata).  This would require trivial 
changes to the read of the tables within tcg_out_*gadget.
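
A minimal sketch of the offset-table idea (all names here are invented; the
real reader is in tcg_out_*gadget):

    #include <stddef.h>
    #include <stdint.h>

    /*
     * Emitted by the generator into .rodata as 32-bit offsets relative to the
     * start of the table, e.g.:
     *     gadget_table: .long gadget_foo - gadget_table, gadget_bar - gadget_table
     */
    extern const int32_t gadget_table[];

    static const void *gadget_addr(size_t i)
    {
        /* Turn the stored offset back into an absolute gadget address. */
        return (const char *)gadget_table + gadget_table[i];
    }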

(4) Some of the other changes are unacceptable, such as the ifdefs for movcond.

I'm surprised about that one, really.  I suppose you were worried about combinatorics on a 
5-operand operation.  But there's no reason you need to keep comparisons next to uses.

You are currently generating C*N^3 versions of setcond_i64, setcond_i32, negsetcond_i64, 
negsetcond_i32, brcond_i64, and brcond_i32.

But instead you can generate N^2 versions of compare_i32 and compare_i64, and then chain 
that to C versions of brcond, C*N versions of setcond and negsetcond, and C*N^2 versions 
of movcond.  The cpu flags retain the comparison state across the two gadgets.
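
For a sense of scale, taking C = 10 conditions and N = 16 registers purely for
illustration:

    current:  6 ops * C * N^3               = 6 * 10 * 4096 ~ 246k gadgets
    split:    2 * (N^2 + C + 2*C*N + C*N^2) = 2 * 3146      ~ 6.3k gadgets

i.e. roughly a 40x reduction, while gaining movcond rather than ifdef-ing it
out.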

(5) docs/devel/tcg-tcti.rst generates a build error.

This is caused by the new document not being linked from the docs index.

(6) Do you actually see much speed-up from the vector code?

(7) To be properly reviewed, this would have to be split into many many patches.

(8) True interest in this is probably limited to iOS, and thus an aarch64 host.

I guess I'm ok with that.  I would insist that the code remain buildable on any aarch64 
host so that we can test it in CI.

I have thought about using gcc's computed goto instead of assembly gadgets, which would 
allow this general scheme to work on any host.  We would be at the mercy of the quality of 
code generation though, and the size of the generated function might break the compiler -- 
this has happened before for machine-generated interpreters.  But it would avoid two 
interpreter implementations, which would be a big plus.

Here's an incomplete example:

uintptr_t tcg_qemu_tb_exec(CPUArchState *env, const void *v_tb_ptr)
{
     static void * const table[20]
         __asm__("tci_table")  __attribute__((used)) = {
         &&mov_0_1,
         &&mov_0_2,
         &&mov_0_3,
         &&mov_1_0,
         &&mov_1_2,
         &&mov_1_3,
         &&mov_2_0,
         &&mov_2_1,
         &&mov_2_3,
         &&mov_3_0,
         &&mov_3_1,
         &&mov_3_2,
         &&stq_0_sp,
         &&stq_1_sp,
         &&stq_2_sp,
         &&stq_3_sp,
         &&ldq_0_sp,
         &&ldq_1_sp,
         &&ldq_2_sp,
         &&ldq_3_sp,
     };

     const intptr_t *tb_ptr = v_tb_ptr;
     tcg_target_ulong r0 = 0, r1 = 0, r2 = 0, r3 = 0;
     uint64_t stack[(TCG_STATIC_CALL_ARGS_SIZE + TCG_STATIC_FRAME_SIZE)
                    / sizeof(uint64_t)];
     uintptr_t sp = (uintptr_t)stack;

#define NEXT  goto *tb_ptr++

     NEXT;

     mov_0_1: r0 = r1; NEXT;
     mov_0_2: r0 = r2; NEXT;
     mov_0_3: r0 = r3; NEXT;
     mov_1_0: r1 = r0; NEXT;
     mov_1_2: r1 = r2; NEXT;
     mov_1_3: r1 = r3; NEXT;
     mov_2_0: r2 = r0; NEXT;
     mov_2_1: r2 = r1; NEXT;
     mov_2_3: r2 = r3; NEXT;
     mov_3_0: r3 = r0; NEXT;
     mov_3_1: r3 = r1; NEXT;
     mov_3_2: r3 = r2; NEXT;
     stq_0_sp: *(uint64_t *)(sp + *tb_ptr++) = r0; NEXT;
     stq_1_sp: *(uint64_t *)(sp + *tb_ptr++) = r1; NEXT;
     stq_2_sp: *(uint64_t *)(sp + *tb_ptr++) = r2; NEXT;
     stq_3_sp: *(uint64_t *)(sp + *tb_ptr++) = r3; NEXT;
     ldq_0_sp: r0 = *(uint64_t *)(sp + *tb_ptr++); NEXT;
     ldq_1_sp: r1 = *(uint64_t *)(sp + *tb_ptr++); NEXT;
     ldq_2_sp: r2 = *(uint64_t *)(sp + *tb_ptr++); NEXT;
     ldq_3_sp: r3 = *(uint64_t *)(sp + *tb_ptr++); NEXT;

#undef NEXT

     return 0;
}

This compiles to fragments like

   // ldq_x_sp
   3c:   f9400420        ldr     x0, [x1, #8]
   40:   f8410425        ldr     x5, [x1], #16
   44:   f86668a5        ldr     x5, [x5, x6]
   48:   d61f0000        br      x0

   // stq_x_sp
   7c:   aa0103e7        mov     x7, x1
   80:   f84104e0        ldr     x0, [x7], #16
   84:   f8266805        str     x5, [x0, x6]
   88:   f9400420        ldr     x0, [x1, #8]
   8c:   aa0703e1        mov     x1, x7
   90:   d61f0000        br      x0

   // mov_x_y
  154:   f8408420        ldr     x0, [x1], #8
  158:   aa0403e2        mov     x2, x4
  15c:   d61f0000        br      x0

While stq has two unnecessary moves, the other two are optimal.

Obviously, the function would not be hand-written in a complete implementation.


r~