[PATCH 4/5] tcg: Expand missing rotri with extract2

[PATCH 4/5] tcg: Expand missing rotri with extract2
Posted by Richard Henderson 1 month, 2 weeks ago
Use extract2 to implement rotri.  To make this easier,
redefine rotli in terms of rotri, rather than the reverse.
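
For illustration (not part of the change itself), the identity relied
on here can be sketched in plain C; the helper names below are
hypothetical:

    #include <stdint.h>

    /* A right rotate of x by n is the low 32 bits of the 64-bit
     * concatenation {x:x} shifted right by n, which is exactly what
     * the extract2 opcode computes: extract2(x, x, n).
     */
    static inline uint32_t rotr32_via_extract2(uint32_t x, unsigned n)
    {
        uint64_t cat = ((uint64_t)x << 32) | x;
        return (uint32_t)(cat >> (n & 31));
    }

    /* rotli is then just rotri by the complementary count. */
    static inline uint32_t rotl32_via_rotr(uint32_t x, unsigned n)
    {
        return rotr32_via_extract2(x, -n & 31);
    }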

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/tcg-op.c | 70 ++++++++++++++++++++++++++++++----------------------
 1 file changed, 41 insertions(+), 29 deletions(-)

diff --git a/tcg/tcg-op.c b/tcg/tcg-op.c
index 4caf77da1e..3e10a3ad16 100644
--- a/tcg/tcg-op.c
+++ b/tcg/tcg-op.c
@@ -834,23 +834,12 @@ void tcg_gen_rotl_i32(TCGv_i32 ret, TCGv_i32 arg1, TCGv_i32 arg2)
 void tcg_gen_rotli_i32(TCGv_i32 ret, TCGv_i32 arg1, int32_t arg2)
 {
     tcg_debug_assert(arg2 >= 0 && arg2 < 32);
-    /* some cases can be optimized here */
     if (arg2 == 0) {
         tcg_gen_mov_i32(ret, arg1);
     } else if (tcg_op_supported(INDEX_op_rotl, TCG_TYPE_I32, 0)) {
-        TCGv_i32 t0 = tcg_constant_i32(arg2);
-        tcg_gen_op3_i32(INDEX_op_rotl, ret, arg1, t0);
-    } else if (tcg_op_supported(INDEX_op_rotr, TCG_TYPE_I32, 0)) {
-        TCGv_i32 t0 = tcg_constant_i32(32 - arg2);
-        tcg_gen_op3_i32(INDEX_op_rotr, ret, arg1, t0);
+        tcg_gen_op3_i32(INDEX_op_rotl, ret, arg1, tcg_constant_i32(arg2));
     } else {
-        TCGv_i32 t0 = tcg_temp_ebb_new_i32();
-        TCGv_i32 t1 = tcg_temp_ebb_new_i32();
-        tcg_gen_shli_i32(t0, arg1, arg2);
-        tcg_gen_shri_i32(t1, arg1, 32 - arg2);
-        tcg_gen_or_i32(ret, t0, t1);
-        tcg_temp_free_i32(t0);
-        tcg_temp_free_i32(t1);
+        tcg_gen_rotri_i32(ret, arg1, -arg2 & 31);
     }
 }
 
@@ -878,7 +867,16 @@ void tcg_gen_rotr_i32(TCGv_i32 ret, TCGv_i32 arg1, TCGv_i32 arg2)
 void tcg_gen_rotri_i32(TCGv_i32 ret, TCGv_i32 arg1, int32_t arg2)
 {
     tcg_debug_assert(arg2 >= 0 && arg2 < 32);
-    tcg_gen_rotli_i32(ret, arg1, -arg2 & 31);
+    if (arg2 == 0) {
+        tcg_gen_mov_i32(ret, arg1);
+    } else if (tcg_op_supported(INDEX_op_rotr, TCG_TYPE_I64, 0)) {
+        tcg_gen_op3_i32(INDEX_op_rotr, ret, arg1, tcg_constant_i32(arg2));
+    } else if (tcg_op_supported(INDEX_op_rotl, TCG_TYPE_I64, 0)) {
+        tcg_gen_op3_i32(INDEX_op_rotl, ret, arg1, tcg_constant_i32(32 - arg2));
+    } else {
+        /* Do not recurse with the rotri simplification. */
+        tcg_gen_op4i_i32(INDEX_op_extract2, ret, arg1, arg1, arg2);
+    }
 }
 
 void tcg_gen_deposit_i32(TCGv_i32 ret, TCGv_i32 arg1, TCGv_i32 arg2,
@@ -2417,23 +2415,12 @@ void tcg_gen_rotl_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2)
 void tcg_gen_rotli_i64(TCGv_i64 ret, TCGv_i64 arg1, int64_t arg2)
 {
     tcg_debug_assert(arg2 >= 0 && arg2 < 64);
-    /* some cases can be optimized here */
     if (arg2 == 0) {
         tcg_gen_mov_i64(ret, arg1);
-    } else if (tcg_op_supported(INDEX_op_rotl, TCG_TYPE_I64, 0)) {
-        TCGv_i64 t0 = tcg_constant_i64(arg2);
-        tcg_gen_op3_i64(INDEX_op_rotl, ret, arg1, t0);
-    } else if (tcg_op_supported(INDEX_op_rotr, TCG_TYPE_I64, 0)) {
-        TCGv_i64 t0 = tcg_constant_i64(64 - arg2);
-        tcg_gen_op3_i64(INDEX_op_rotr, ret, arg1, t0);
+    } else if (tcg_op_supported(INDEX_op_rotl, TCG_TYPE_I32, 0)) {
+        tcg_gen_op3_i64(INDEX_op_rotl, ret, arg1, tcg_constant_i64(arg2));
     } else {
-        TCGv_i64 t0 = tcg_temp_ebb_new_i64();
-        TCGv_i64 t1 = tcg_temp_ebb_new_i64();
-        tcg_gen_shli_i64(t0, arg1, arg2);
-        tcg_gen_shri_i64(t1, arg1, 64 - arg2);
-        tcg_gen_or_i64(ret, t0, t1);
-        tcg_temp_free_i64(t0);
-        tcg_temp_free_i64(t1);
+        tcg_gen_rotri_i64(ret, arg1, -arg2 & 63);
     }
 }
 
@@ -2461,7 +2448,32 @@ void tcg_gen_rotr_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2)
 void tcg_gen_rotri_i64(TCGv_i64 ret, TCGv_i64 arg1, int64_t arg2)
 {
     tcg_debug_assert(arg2 >= 0 && arg2 < 64);
-    tcg_gen_rotli_i64(ret, arg1, -arg2 & 63);
+    if (arg2 == 0) {
+        tcg_gen_mov_i64(ret, arg1);
+    } else if (TCG_TARGET_REG_BITS == 32) {
+        TCGv_i32 rl = tcg_temp_ebb_new_i32();
+        TCGv_i32 rh = TCGV_HIGH(ret);
+        TCGv_i32 t0, t1;
+
+        if (arg2 & 32) {
+            t0 = TCGV_HIGH(arg1);
+            t1 = TCGV_LOW(arg1);
+        } else {
+            t0 = TCGV_LOW(arg1);
+            t1 = TCGV_HIGH(arg1);
+        }
+        tcg_gen_extract2_i32(rl, t0, t1, arg2 & 31);
+        tcg_gen_extract2_i32(rh, t1, t0, arg2 & 31);
+        tcg_gen_mov_i32(TCGV_LOW(ret), rl);
+        tcg_temp_free_i32(rl);
+    } else if (tcg_op_supported(INDEX_op_rotr, TCG_TYPE_I64, 0)) {
+        tcg_gen_op3_i64(INDEX_op_rotr, ret, arg1, tcg_constant_i64(arg2));
+    } else if (tcg_op_supported(INDEX_op_rotl, TCG_TYPE_I64, 0)) {
+        tcg_gen_op3_i64(INDEX_op_rotl, ret, arg1, tcg_constant_i64(64 - arg2));
+    } else {
+        /* Do not recurse with the rotri simplification. */
+        tcg_gen_op4i_i64(INDEX_op_extract2, ret, arg1, arg1, arg2);
+    }
 }
 
 void tcg_gen_deposit_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2,
-- 
2.43.0
Re: [PATCH 4/5] tcg: Expand missing rotri with extract2
Posted by Manos Pitsidianakis 2 weeks, 2 days ago
On Sun, Sep 28, 2025 at 7:33 PM Richard Henderson
<richard.henderson@linaro.org> wrote:
>
> Use extract2 to implement rotri.  To make this easier,
> redefine rotli in terms of rotri, rather than the reverse.
>
> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
> ---
>  tcg/tcg-op.c | 70 ++++++++++++++++++++++++++++++----------------------
>  1 file changed, 41 insertions(+), 29 deletions(-)
>
> diff --git a/tcg/tcg-op.c b/tcg/tcg-op.c
> index 4caf77da1e..3e10a3ad16 100644
> --- a/tcg/tcg-op.c
> +++ b/tcg/tcg-op.c
> @@ -834,23 +834,12 @@ void tcg_gen_rotl_i32(TCGv_i32 ret, TCGv_i32 arg1, TCGv_i32 arg2)
>  void tcg_gen_rotli_i32(TCGv_i32 ret, TCGv_i32 arg1, int32_t arg2)
>  {
>      tcg_debug_assert(arg2 >= 0 && arg2 < 32);
> -    /* some cases can be optimized here */
>      if (arg2 == 0) {
>          tcg_gen_mov_i32(ret, arg1);
>      } else if (tcg_op_supported(INDEX_op_rotl, TCG_TYPE_I32, 0)) {
> -        TCGv_i32 t0 = tcg_constant_i32(arg2);
> -        tcg_gen_op3_i32(INDEX_op_rotl, ret, arg1, t0);
> -    } else if (tcg_op_supported(INDEX_op_rotr, TCG_TYPE_I32, 0)) {
> -        TCGv_i32 t0 = tcg_constant_i32(32 - arg2);
> -        tcg_gen_op3_i32(INDEX_op_rotr, ret, arg1, t0);
> +        tcg_gen_op3_i32(INDEX_op_rotl, ret, arg1, tcg_constant_i32(arg2));
>      } else {
> -        TCGv_i32 t0 = tcg_temp_ebb_new_i32();
> -        TCGv_i32 t1 = tcg_temp_ebb_new_i32();
> -        tcg_gen_shli_i32(t0, arg1, arg2);
> -        tcg_gen_shri_i32(t1, arg1, 32 - arg2);
> -        tcg_gen_or_i32(ret, t0, t1);
> -        tcg_temp_free_i32(t0);
> -        tcg_temp_free_i32(t1);
> +        tcg_gen_rotri_i32(ret, arg1, -arg2 & 31);
>      }
>  }
>
> @@ -878,7 +867,16 @@ void tcg_gen_rotr_i32(TCGv_i32 ret, TCGv_i32 arg1, TCGv_i32 arg2)
>  void tcg_gen_rotri_i32(TCGv_i32 ret, TCGv_i32 arg1, int32_t arg2)
>  {
>      tcg_debug_assert(arg2 >= 0 && arg2 < 32);
> -    tcg_gen_rotli_i32(ret, arg1, -arg2 & 31);
> +    if (arg2 == 0) {
> +        tcg_gen_mov_i32(ret, arg1);
> +    } else if (tcg_op_supported(INDEX_op_rotr, TCG_TYPE_I64, 0)) {
> +        tcg_gen_op3_i32(INDEX_op_rotr, ret, arg1, tcg_constant_i32(arg2));
> +    } else if (tcg_op_supported(INDEX_op_rotl, TCG_TYPE_I64, 0)) {
> +        tcg_gen_op3_i32(INDEX_op_rotl, ret, arg1, tcg_constant_i32(32 - arg2));
> +    } else {
> +        /* Do not recurse with the rotri simplification. */
> +        tcg_gen_op4i_i32(INDEX_op_extract2, ret, arg1, arg1, arg2);
> +    }
>  }
>
>  void tcg_gen_deposit_i32(TCGv_i32 ret, TCGv_i32 arg1, TCGv_i32 arg2,
> @@ -2417,23 +2415,12 @@ void tcg_gen_rotl_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2)
>  void tcg_gen_rotli_i64(TCGv_i64 ret, TCGv_i64 arg1, int64_t arg2)
>  {
>      tcg_debug_assert(arg2 >= 0 && arg2 < 64);
> -    /* some cases can be optimized here */
>      if (arg2 == 0) {
>          tcg_gen_mov_i64(ret, arg1);
> -    } else if (tcg_op_supported(INDEX_op_rotl, TCG_TYPE_I64, 0)) {
> -        TCGv_i64 t0 = tcg_constant_i64(arg2);
> -        tcg_gen_op3_i64(INDEX_op_rotl, ret, arg1, t0);
> -    } else if (tcg_op_supported(INDEX_op_rotr, TCG_TYPE_I64, 0)) {
> -        TCGv_i64 t0 = tcg_constant_i64(64 - arg2);
> -        tcg_gen_op3_i64(INDEX_op_rotr, ret, arg1, t0);
> +    } else if (tcg_op_supported(INDEX_op_rotl, TCG_TYPE_I32, 0)) {

Shouldn't this be:

s/TCG_TYPE_I32/TCG_TYPE_I64

?

> +        tcg_gen_op3_i64(INDEX_op_rotl, ret, arg1, tcg_constant_i64(arg2));
>      } else {
> -        TCGv_i64 t0 = tcg_temp_ebb_new_i64();
> -        TCGv_i64 t1 = tcg_temp_ebb_new_i64();
> -        tcg_gen_shli_i64(t0, arg1, arg2);
> -        tcg_gen_shri_i64(t1, arg1, 64 - arg2);
> -        tcg_gen_or_i64(ret, t0, t1);
> -        tcg_temp_free_i64(t0);
> -        tcg_temp_free_i64(t1);
> +        tcg_gen_rotri_i64(ret, arg1, -arg2 & 63);
>      }
>  }
>
> @@ -2461,7 +2448,32 @@ void tcg_gen_rotr_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2)
>  void tcg_gen_rotri_i64(TCGv_i64 ret, TCGv_i64 arg1, int64_t arg2)
>  {
>      tcg_debug_assert(arg2 >= 0 && arg2 < 64);
> -    tcg_gen_rotli_i64(ret, arg1, -arg2 & 63);
> +    if (arg2 == 0) {
> +        tcg_gen_mov_i64(ret, arg1);
> +    } else if (TCG_TARGET_REG_BITS == 32) {
> +        TCGv_i32 rl = tcg_temp_ebb_new_i32();
> +        TCGv_i32 rh = TCGV_HIGH(ret);
> +        TCGv_i32 t0, t1;
> +
> +        if (arg2 & 32) {
> +            t0 = TCGV_HIGH(arg1);
> +            t1 = TCGV_LOW(arg1);
> +        } else {
> +            t0 = TCGV_LOW(arg1);
> +            t1 = TCGV_HIGH(arg1);
> +        }
> +        tcg_gen_extract2_i32(rl, t0, t1, arg2 & 31);
> +        tcg_gen_extract2_i32(rh, t1, t0, arg2 & 31);
> +        tcg_gen_mov_i32(TCGV_LOW(ret), rl);
> +        tcg_temp_free_i32(rl);
> +    } else if (tcg_op_supported(INDEX_op_rotr, TCG_TYPE_I64, 0)) {
> +        tcg_gen_op3_i64(INDEX_op_rotr, ret, arg1, tcg_constant_i64(arg2));
> +    } else if (tcg_op_supported(INDEX_op_rotl, TCG_TYPE_I64, 0)) {
> +        tcg_gen_op3_i64(INDEX_op_rotl, ret, arg1, tcg_constant_i64(64 - arg2));
> +    } else {
> +        /* Do not recurse with the rotri simplification. */
> +        tcg_gen_op4i_i64(INDEX_op_extract2, ret, arg1, arg1, arg2);
> +    }
>  }
>
>  void tcg_gen_deposit_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2,
> --
> 2.43.0
>
>
Re: [PATCH 4/5] tcg: Expand missing rotri with extract2
Posted by Richard Henderson 2 weeks, 2 days ago
On 10/29/25 10:30, Manos Pitsidianakis wrote:
> On Sun, Sep 28, 2025 at 7:33 PM Richard Henderson
> <richard.henderson@linaro.org> wrote:
>>
>> Use extract2 to implement rotri.  To make this easier,
>> redefine rotli in terms of rotri, rather than the reverse.
>>
>> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
>> ---
>>   tcg/tcg-op.c | 70 ++++++++++++++++++++++++++++++----------------------
>>   1 file changed, 41 insertions(+), 29 deletions(-)
>>
>> diff --git a/tcg/tcg-op.c b/tcg/tcg-op.c
>> index 4caf77da1e..3e10a3ad16 100644
>> --- a/tcg/tcg-op.c
>> +++ b/tcg/tcg-op.c
>> @@ -834,23 +834,12 @@ void tcg_gen_rotl_i32(TCGv_i32 ret, TCGv_i32 arg1, TCGv_i32 arg2)
>>   void tcg_gen_rotli_i32(TCGv_i32 ret, TCGv_i32 arg1, int32_t arg2)
>>   {
>>       tcg_debug_assert(arg2 >= 0 && arg2 < 32);
>> -    /* some cases can be optimized here */
>>       if (arg2 == 0) {
>>           tcg_gen_mov_i32(ret, arg1);
>>       } else if (tcg_op_supported(INDEX_op_rotl, TCG_TYPE_I32, 0)) {
>> -        TCGv_i32 t0 = tcg_constant_i32(arg2);
>> -        tcg_gen_op3_i32(INDEX_op_rotl, ret, arg1, t0);
>> -    } else if (tcg_op_supported(INDEX_op_rotr, TCG_TYPE_I32, 0)) {
>> -        TCGv_i32 t0 = tcg_constant_i32(32 - arg2);
>> -        tcg_gen_op3_i32(INDEX_op_rotr, ret, arg1, t0);
>> +        tcg_gen_op3_i32(INDEX_op_rotl, ret, arg1, tcg_constant_i32(arg2));
>>       } else {
>> -        TCGv_i32 t0 = tcg_temp_ebb_new_i32();
>> -        TCGv_i32 t1 = tcg_temp_ebb_new_i32();
>> -        tcg_gen_shli_i32(t0, arg1, arg2);
>> -        tcg_gen_shri_i32(t1, arg1, 32 - arg2);
>> -        tcg_gen_or_i32(ret, t0, t1);
>> -        tcg_temp_free_i32(t0);
>> -        tcg_temp_free_i32(t1);
>> +        tcg_gen_rotri_i32(ret, arg1, -arg2 & 31);
>>       }
>>   }
>>
>> @@ -878,7 +867,16 @@ void tcg_gen_rotr_i32(TCGv_i32 ret, TCGv_i32 arg1, TCGv_i32 arg2)
>>   void tcg_gen_rotri_i32(TCGv_i32 ret, TCGv_i32 arg1, int32_t arg2)
>>   {
>>       tcg_debug_assert(arg2 >= 0 && arg2 < 32);
>> -    tcg_gen_rotli_i32(ret, arg1, -arg2 & 31);
>> +    if (arg2 == 0) {
>> +        tcg_gen_mov_i32(ret, arg1);
>> +    } else if (tcg_op_supported(INDEX_op_rotr, TCG_TYPE_I64, 0)) {
>> +        tcg_gen_op3_i32(INDEX_op_rotr, ret, arg1, tcg_constant_i32(arg2));
>> +    } else if (tcg_op_supported(INDEX_op_rotl, TCG_TYPE_I64, 0)) {
>> +        tcg_gen_op3_i32(INDEX_op_rotl, ret, arg1, tcg_constant_i32(32 - arg2));
>> +    } else {
>> +        /* Do not recurse with the rotri simplification. */
>> +        tcg_gen_op4i_i32(INDEX_op_extract2, ret, arg1, arg1, arg2);
>> +    }
>>   }
>>
>>   void tcg_gen_deposit_i32(TCGv_i32 ret, TCGv_i32 arg1, TCGv_i32 arg2,
>> @@ -2417,23 +2415,12 @@ void tcg_gen_rotl_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2)
>>   void tcg_gen_rotli_i64(TCGv_i64 ret, TCGv_i64 arg1, int64_t arg2)
>>   {
>>       tcg_debug_assert(arg2 >= 0 && arg2 < 64);
>> -    /* some cases can be optimized here */
>>       if (arg2 == 0) {
>>           tcg_gen_mov_i64(ret, arg1);
>> -    } else if (tcg_op_supported(INDEX_op_rotl, TCG_TYPE_I64, 0)) {
>> -        TCGv_i64 t0 = tcg_constant_i64(arg2);
>> -        tcg_gen_op3_i64(INDEX_op_rotl, ret, arg1, t0);
>> -    } else if (tcg_op_supported(INDEX_op_rotr, TCG_TYPE_I64, 0)) {
>> -        TCGv_i64 t0 = tcg_constant_i64(64 - arg2);
>> -        tcg_gen_op3_i64(INDEX_op_rotr, ret, arg1, t0);
>> +    } else if (tcg_op_supported(INDEX_op_rotl, TCG_TYPE_I32, 0)) {
> 
> Shouldn't this be:
> 
> s/TCG_TYPE_I32/TCG_TYPE_I64
> 
> ?


Oops, yes.


r~