[RFC PATCH] tcg/ppc: Enable direct branching tcg_out_goto_tb with TCG_REG_TB

Jordan Niethe posted 1 patch 9 months, 1 week ago
Failed in applying to current master (apply log)
include/tcg/tcg.h        |  1 +
tcg/ppc/tcg-target.c.inc | 59 ++++++++++++++++++++++++++--------------
tcg/tcg.c                |  3 ++
3 files changed, 42 insertions(+), 21 deletions(-)
[RFC PATCH] tcg/ppc: Enable direct branching tcg_out_goto_tb with TCG_REG_TB
Posted by Jordan Niethe 9 months, 1 week ago
Direct branch patching was disabled when using TCG_REG_TB in commit
736a1588c1 ("tcg/ppc: Fix race in goto_tb implementation"). Commit
7502f89c74 ("tcg/ppc: Use prefixed instructions for tcg_out_goto_tb")
used the support for pc relative ISAv3.1 instructions to re-enable
direct branch patching on POWER10.

The issue with direct branch patching with TCG_REG_TB is the lack of
synchronization between the new TCG_REG_TB being established and the
direct branch being patched in.

If each translation block is responsible for establishing its own
TCG_REG_TB then there can be no synchronization issue.

Make each translation block begin by setting up its own TCG_REG_TB.
ISA v3.0 has addpcis so use that for getting the pc at the beginning of
a translation block (plus offset). For systems without addpcis, use
the preferred 'bcl 20,31,$+4' sequence.

When branching indirectly to a translation block the setup sequence can
be skipped if the caller sets up TCG_REG_TB as there is no possible race
in this case.

Signed-off-by: Jordan Niethe <jniethe5@gmail.com>
---
This is just a proof of concept, not sure that this is the correct way
to do this or even if it is something we'd like to do.

Applies on top of Richard's series [1].

  [1] https://lore.kernel.org/qemu-devel/20230808030250.50602-1-richard.henderson@linaro.org/
---
 include/tcg/tcg.h        |  1 +
 tcg/ppc/tcg-target.c.inc | 59 ++++++++++++++++++++++++++--------------
 tcg/tcg.c                |  3 ++
 3 files changed, 42 insertions(+), 21 deletions(-)

diff --git a/include/tcg/tcg.h b/include/tcg/tcg.h
index 0875971719..337506fea0 100644
--- a/include/tcg/tcg.h
+++ b/include/tcg/tcg.h
@@ -518,6 +518,7 @@ struct TCGContext {
        extension that allows arithmetic on void*.  */
     void *code_gen_buffer;
     size_t code_gen_buffer_size;
+    size_t code_gen_entry_size;
     void *code_gen_ptr;
     void *data_gen_ptr;
 
diff --git a/tcg/ppc/tcg-target.c.inc b/tcg/ppc/tcg-target.c.inc
index b686a68247..4b55751051 100644
--- a/tcg/ppc/tcg-target.c.inc
+++ b/tcg/ppc/tcg-target.c.inc
@@ -382,6 +382,7 @@ static bool tcg_target_const_match(int64_t val, TCGType type, int ct)
 #define CRNAND XO19(225)
 #define CROR   XO19(449)
 #define CRNOR  XO19( 33)
+#define ADDPCIS  XO19( 2)
 
 #define EXTSB  XO31(954)
 #define EXTSH  XO31(922)
@@ -2635,6 +2636,30 @@ static void tcg_target_qemu_prologue(TCGContext *s)
     tcg_out32(s, BCLR | BO_ALWAYS);
 }
 
+
+#define TCG_TARGET_NEED_ENTER_TB
+static void tcg_out_enter_tb(TCGContext *s)
+{
+    if (!USE_REG_TB) {
+        return;
+    }
+
+    if (have_isa_3_00) {
+        /* lnia REG_TB */
+        tcg_out32(s, ADDPCIS | RT(TCG_REG_TB));
+        tcg_out32(s, ADDI | TAI(TCG_REG_TB, TCG_REG_TB, -4));
+    } else {
+        tcg_out32(s, MFSPR | RT(TCG_REG_TMP1) | LR);
+        /* bcl 20,31,$+4 (Preferred form for getting nia.) */
+        tcg_out32(s, BC | BO_ALWAYS | BI(7, CR_SO) | 0x4 | LK);
+        tcg_out32(s, MFSPR | RT(TCG_REG_TB) | LR);
+        tcg_out32(s, ADDI | TAI(TCG_REG_TB, TCG_REG_TB, -8));
+        tcg_out32(s, MTSPR | RS(TCG_REG_TMP1) | LR);
+    }
+
+    s->code_gen_entry_size = tcg_current_code_size(s);
+}
+
 static void tcg_out_exit_tb(TCGContext *s, uintptr_t arg)
 {
     tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_R3, arg);
@@ -2645,23 +2670,6 @@ static void tcg_out_goto_tb(TCGContext *s, int which)
 {
     uintptr_t ptr = get_jmp_target_addr(s, which);
 
-    if (USE_REG_TB) {
-        /*
-         * With REG_TB, we must always use indirect branching,
-         * so that the branch destination and TCG_REG_TB match.
-         */
-        ptrdiff_t offset = tcg_tbrel_diff(s, (void *)ptr);
-        tcg_out_mem_long(s, LD, LDX, TCG_REG_TB, TCG_REG_TB, offset);
-        tcg_out32(s, MTSPR | RS(TCG_REG_TB) | CTR);
-        tcg_out32(s, BCCTR | BO_ALWAYS);
-
-        /* For the unlinked case, need to reset TCG_REG_TB.  */
-        set_jmp_reset_offset(s, which);
-        tcg_out_mem_long(s, ADDI, ADD, TCG_REG_TB, TCG_REG_TB,
-                         -tcg_current_code_size(s));
-        return;
-    }
-
     /* Direct branch will be patched by tb_target_set_jmp_target. */
     set_jmp_insn_offset(s, which);
     tcg_out32(s, NOP);
@@ -2670,6 +2678,13 @@ static void tcg_out_goto_tb(TCGContext *s, int which)
     if (have_isa_3_10) {
         ptrdiff_t offset = tcg_pcrel_diff_for_prefix(s, (void *)ptr);
         tcg_out_8ls_d(s, PLD, TCG_REG_TMP1, 0, offset, 1);
+    } else if (USE_REG_TB) {
+        ptrdiff_t offset = tcg_tbrel_diff(s, (void *)ptr);
+
+        tcg_out_mem_long(s, LD, LDX, TCG_REG_TB, TCG_REG_TB, offset);
+        /* Callee can skip establishing REG_TB for the indirect case. */
+        tcg_out32(s, ADDI | TAI(TCG_REG_TMP1, TCG_REG_TB,
+                                s->code_gen_entry_size));
     } else {
         tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_TMP1, ptr - (int16_t)ptr);
         tcg_out_ld(s, TCG_TYPE_PTR, TCG_REG_TMP1, TCG_REG_TMP1, (int16_t)ptr);
@@ -2678,6 +2693,12 @@ static void tcg_out_goto_tb(TCGContext *s, int which)
     tcg_out32(s, MTSPR | RS(TCG_REG_TMP1) | CTR);
     tcg_out32(s, BCCTR | BO_ALWAYS);
     set_jmp_reset_offset(s, which);
+
+    /* For the unlinked case, need to reset TCG_REG_TB.  */
+    if (USE_REG_TB) {
+        tcg_out_movi_int(s, TCG_TYPE_I64, TCG_REG_TB,
+                         (tcg_target_long)s->code_buf, true);
+    }
 }
 
 void tb_target_set_jmp_target(const TranslationBlock *tb, int n,
@@ -2687,10 +2708,6 @@ void tb_target_set_jmp_target(const TranslationBlock *tb, int n,
     intptr_t diff = addr - jmp_rx;
     tcg_insn_unit insn;
 
-    if (USE_REG_TB) {
-        return;
-    }
-
     if (in_range_b(diff)) {
         insn = B | (diff & 0x3fffffc);
     } else {
diff --git a/tcg/tcg.c b/tcg/tcg.c
index ddfe9a96cb..20698131c2 100644
--- a/tcg/tcg.c
+++ b/tcg/tcg.c
@@ -6010,6 +6010,9 @@ int tcg_gen_code(TCGContext *s, TranslationBlock *tb, uint64_t pc_start)
         tcg_malloc(sizeof(uint64_t) * s->gen_tb->icount * start_words);
 
     num_insns = -1;
+#ifdef TCG_TARGET_NEED_ENTER_TB
+    tcg_out_enter_tb(s);
+#endif
     QTAILQ_FOREACH(op, &s->ops, link) {
         TCGOpcode opc = op->opc;
 
-- 
2.39.3
Re: [RFC PATCH] tcg/ppc: Enable direct branching tcg_out_goto_tb with TCG_REG_TB
Posted by Richard Henderson 9 months ago
On 8/14/23 22:01, Jordan Niethe wrote:
> Direct branch patching was disabled when using TCG_REG_TB in commit
> 736a1588c1 ("tcg/ppc: Fix race in goto_tb implementation"). Commit
> 7502f89c74 ("tcg/ppc: Use prefixed instructions for tcg_out_goto_tb")
> used the support for pc relative ISAv3.1 instructions to re-enable
> direct branch patching on POWER10.
> 
> The issue with direct branch patching with TCG_REG_TB is the lack of
> synchronization between the new TCG_REG_TB being established and the
> direct branch being patched in.
> 
> If each translation block is responsible for establishing its own
> TCG_REG_TB then there can be no synchronization issue.

That's a good idea, and can be used for other things...

It also raises the question of whether power10 should continue to use TCG_REG_TB, loading 
the address with PADDI.  Or whether power9 should, like power10, disable USE_REG_TB and 
use ADDPCIS throughout.

I imagine it depends on usage frequency, whether use of TCG_REG_TB allows 1 insn, where 
addpcis requires 2 insns and prefixed insns require 2 or 3 insn slots (depending on 
alignment).


> +        tcg_out32(s, MFSPR | RT(TCG_REG_TMP1) | LR);
> +        /* bcl 20,31,$+4 (Preferred form for getting nia.) */
> +        tcg_out32(s, BC | BO_ALWAYS | BI(7, CR_SO) | 0x4 | LK);
> +        tcg_out32(s, MFSPR | RT(TCG_REG_TB) | LR);
> +        tcg_out32(s, ADDI | TAI(TCG_REG_TB, TCG_REG_TB, -8));
> +        tcg_out32(s, MTSPR | RS(TCG_REG_TMP1) | LR);

Don't need to save/restore LR.  It is saved in the prologue and may be clobbered within 
the tb itself (as we do for calls).

> @@ -2678,6 +2693,12 @@ static void tcg_out_goto_tb(TCGContext *s, int which)
>       tcg_out32(s, MTSPR | RS(TCG_REG_TMP1) | CTR);
>       tcg_out32(s, BCCTR | BO_ALWAYS);
>       set_jmp_reset_offset(s, which);
> +
> +    /* For the unlinked case, need to reset TCG_REG_TB.  */
> +    if (USE_REG_TB) {
> +        tcg_out_movi_int(s, TCG_TYPE_I64, TCG_REG_TB,
> +                         (tcg_target_long)s->code_buf, true);
> +    }
>   }

Actually, we don't.  The only time we arrive here is when an unlinked TB branches to 
itself.  TCG_REG_TB is still valid.

> diff --git a/tcg/tcg.c b/tcg/tcg.c
> index ddfe9a96cb..20698131c2 100644
> --- a/tcg/tcg.c
> +++ b/tcg/tcg.c
> @@ -6010,6 +6010,9 @@ int tcg_gen_code(TCGContext *s, TranslationBlock *tb, uint64_t pc_start)
>           tcg_malloc(sizeof(uint64_t) * s->gen_tb->icount * start_words);
>   
>       num_insns = -1;
> +#ifdef TCG_TARGET_NEED_ENTER_TB
> +    tcg_out_enter_tb(s);
> +#endif

Better would be to not have the ifdef, and add this symbol as an empty function in all 
other tcg backends.

I might play around with this a bit.


r~
Re: [RFC PATCH] tcg/ppc: Enable direct branching tcg_out_goto_tb with TCG_REG_TB
Posted by Jordan Niethe 9 months ago

On 16/8/23 2:07 am, Richard Henderson wrote:
> On 8/14/23 22:01, Jordan Niethe wrote:
>> Direct branch patching was disabled when using TCG_REG_TB in commit
>> 736a1588c1 ("tcg/ppc: Fix race in goto_tb implementation"). Commit
>> 7502f89c74 ("tcg/ppc: Use prefixed instructions for tcg_out_goto_tb")
>> used the support for pc relative ISAv3.1 instructions to re-enable
>> direct branch patching on POWER10.
>>
>> The issue with direct branch patching with TCG_REG_TB is the lack of
>> synchronization between the new TCG_REG_TB being established and the
>> direct branch being patched in.
>>
>> If each translation block is responsible for establishing its own
>> TCG_REG_TB then there can be no synchronization issue.
> 
> That's a good idea, and can be used for other things...
> 
> It also raises the question of whether power10 should continue to use 
> TCG_REG_TB, loading the address with PADDI.  Or whether power9 should, 
> like power10, disable USE_REG_TB and use ADDPCIS throughout.
> 
> I imagine it depends on usage frequency, whether use of TCG_REG_TB 
> allows 1 insn, where addpcis requires 2 insns and prefixed insns require 
> 2 or 3 insn slots (depending on alignment).

Yes, I agree. Your v3 series looks good, I'll try and get some 
performance numbers with it so we can make a decision about which way to 
go on power10 and power9.

> 
> 
>> +        tcg_out32(s, MFSPR | RT(TCG_REG_TMP1) | LR);
>> +        /* bcl 20,31,$+4 (Preferred form for getting nia.) */
>> +        tcg_out32(s, BC | BO_ALWAYS | BI(7, CR_SO) | 0x4 | LK);
>> +        tcg_out32(s, MFSPR | RT(TCG_REG_TB) | LR);
>> +        tcg_out32(s, ADDI | TAI(TCG_REG_TB, TCG_REG_TB, -8));
>> +        tcg_out32(s, MTSPR | RS(TCG_REG_TMP1) | LR);
> 
> Don't need to save/restore LR.  It is saved in the prologue and may be 
> clobbered within the tb itself (as we do for calls).
> 
>> @@ -2678,6 +2693,12 @@ static void tcg_out_goto_tb(TCGContext *s, int 
>> which)
>>       tcg_out32(s, MTSPR | RS(TCG_REG_TMP1) | CTR);
>>       tcg_out32(s, BCCTR | BO_ALWAYS);
>>       set_jmp_reset_offset(s, which);
>> +
>> +    /* For the unlinked case, need to reset TCG_REG_TB.  */
>> +    if (USE_REG_TB) {
>> +        tcg_out_movi_int(s, TCG_TYPE_I64, TCG_REG_TB,
>> +                         (tcg_target_long)s->code_buf, true);
>> +    }
>>   }
> 
> Actually, we don't.  The only time we arrive here is when an unlinked TB 
> branches to itself.  TCG_REG_TB is still valid.

Ok, I was not sure how that was meant to work.

> 
>> diff --git a/tcg/tcg.c b/tcg/tcg.c
>> index ddfe9a96cb..20698131c2 100644
>> --- a/tcg/tcg.c
>> +++ b/tcg/tcg.c
>> @@ -6010,6 +6010,9 @@ int tcg_gen_code(TCGContext *s, TranslationBlock 
>> *tb, uint64_t pc_start)
>>           tcg_malloc(sizeof(uint64_t) * s->gen_tb->icount * start_words);
>>       num_insns = -1;
>> +#ifdef TCG_TARGET_NEED_ENTER_TB
>> +    tcg_out_enter_tb(s);
>> +#endif
> 
> Better would be to not have the ifdef, and add this symbol as an empty 
> function in all other tcg backends.
> 
> I might play around with this a bit.

Thank you for that, adding pcrel support on POWER9 too is really cool.

> 
> 
> r~
>