Use tcg_op_imm_match to choose between expanding with AND+SHL vs SHL+SHR.
Suggested-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
tcg/optimize.c | 40 +++++++++++++++++++++++++++++++---------
1 file changed, 31 insertions(+), 9 deletions(-)
diff --git a/tcg/optimize.c b/tcg/optimize.c
index e6a16921c9..2944c5a748 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -1743,10 +1743,17 @@ static bool fold_deposit(OptContext *ctx, TCGOp *op)
goto done;
}
- /* Lower invalid deposit into zero as AND + SHL or SHL + AND. */
+ /* Lower invalid deposit into zero. */
if (!valid) {
- if (TCG_TARGET_extract_valid(ctx->type, 0, ofs + len) &&
- !TCG_TARGET_extract_valid(ctx->type, 0, len)) {
+ if (TCG_TARGET_extract_valid(ctx->type, 0, len)) {
+ /* EXTRACT (at 0) + SHL */
+ op2 = opt_insert_before(ctx, op, INDEX_op_extract, 4);
+ op2->args[0] = ret;
+ op2->args[1] = arg2;
+ op2->args[2] = 0;
+ op2->args[3] = len;
+ } else if (TCG_TARGET_extract_valid(ctx->type, 0, ofs + len)) {
+ /* SHL + EXTRACT (at 0) */
op2 = opt_insert_before(ctx, op, INDEX_op_shl, 3);
op2->args[0] = ret;
op2->args[1] = arg2;
@@ -1757,14 +1764,29 @@ static bool fold_deposit(OptContext *ctx, TCGOp *op)
op->args[2] = 0;
op->args[3] = ofs + len;
goto done;
+ } else if (tcg_op_imm_match(INDEX_op_and, ctx->type, len_mask)) {
+ /* AND + SHL */
+ op2 = opt_insert_before(ctx, op, INDEX_op_and, 3);
+ op2->args[0] = ret;
+ op2->args[1] = arg2;
+ op2->args[2] = arg_new_constant(ctx, len_mask);
+ } else {
+ /* SHL + SHR */
+ int shl = width - len;
+ int shr = width - len - ofs;
+
+ op2 = opt_insert_before(ctx, op, INDEX_op_shl, 3);
+ op2->args[0] = ret;
+ op2->args[1] = arg2;
+ op2->args[2] = arg_new_constant(ctx, shl);
+
+ op->opc = INDEX_op_shr;
+ op->args[1] = ret;
+ op->args[2] = arg_new_constant(ctx, shr);
+ goto done;
}
- op2 = opt_insert_before(ctx, op, INDEX_op_and, 3);
- op2->args[0] = ret;
- op2->args[1] = arg2;
- op2->args[2] = arg_new_constant(ctx, len_mask);
- fold_and(ctx, op2);
-
+ /* Finish the (EXTRACT|AND) + SHL cases. */
op->opc = INDEX_op_shl;
op->args[1] = ret;
op->args[2] = arg_new_constant(ctx, ofs);
--
2.43.0
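
As a quick sanity check of the identities the new lowering relies on, here is a
standalone C sketch (not part of the patch; the function names are invented and
the width is fixed at 64 for illustration):

#include <assert.h>
#include <stdint.h>

/* Deposit the low 'len' bits of 'val' into a zero base at bit offset 'ofs'. */

/* Reference form: AND with len_mask, then SHL by ofs. */
static uint64_t deposit_z_and_shl(uint64_t val, unsigned ofs, unsigned len)
{
    uint64_t len_mask = len == 64 ? ~UINT64_C(0) : (UINT64_C(1) << len) - 1;
    return (val & len_mask) << ofs;
}

/* Shift-only form used when neither an extract nor the AND immediate is
   available: SHL pushes the field against the top of the word, then a
   logical SHR brings it back down so it starts at 'ofs'. */
static uint64_t deposit_z_shl_shr(uint64_t val, unsigned ofs, unsigned len)
{
    unsigned width = 64;
    unsigned shl = width - len;
    unsigned shr = width - len - ofs;
    return (val << shl) >> shr;
}

int main(void)
{
    uint64_t val = UINT64_C(0x0123456789abcdef);

    for (unsigned ofs = 0; ofs < 64; ofs++) {
        for (unsigned len = 1; ofs + len <= 64; len++) {
            assert(deposit_z_and_shl(val, ofs, len)
                   == deposit_z_shl_shr(val, ofs, len));
        }
    }
    return 0;
}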
On 2/4/26 06:24, Richard Henderson wrote:
> Use tcg_op_imm_match to choose between expanding with AND+SHL vs SHL+SHR.
>
> Suggested-by: Paolo Bonzini <pbonzini@redhat.com>
> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
> ---
> tcg/optimize.c | 40 +++++++++++++++++++++++++++++++---------
> 1 file changed, 31 insertions(+), 9 deletions(-)
>
> diff --git a/tcg/optimize.c b/tcg/optimize.c
> index e6a16921c9..2944c5a748 100644
> --- a/tcg/optimize.c
> +++ b/tcg/optimize.c
> @@ -1743,10 +1743,17 @@ static bool fold_deposit(OptContext *ctx, TCGOp *op)
> goto done;
> }
>
> - /* Lower invalid deposit into zero as AND + SHL or SHL + AND. */
> + /* Lower invalid deposit into zero. */
> if (!valid) {
> - if (TCG_TARGET_extract_valid(ctx->type, 0, ofs + len) &&
> - !TCG_TARGET_extract_valid(ctx->type, 0, len)) {
> + if (TCG_TARGET_extract_valid(ctx->type, 0, len)) {
> + /* EXTRACT (at 0) + SHL */
> + op2 = opt_insert_before(ctx, op, INDEX_op_extract, 4);
> + op2->args[0] = ret;
> + op2->args[1] = arg2;
> + op2->args[2] = 0;
> + op2->args[3] = len;
> + } else if (TCG_TARGET_extract_valid(ctx->type, 0, ofs + len)) {
> + /* SHL + EXTRACT (at 0) */
> op2 = opt_insert_before(ctx, op, INDEX_op_shl, 3);
> op2->args[0] = ret;
> op2->args[1] = arg2;
> @@ -1757,14 +1764,29 @@ static bool fold_deposit(OptContext *ctx, TCGOp *op)
> op->args[2] = 0;
> op->args[3] = ofs + len;
> goto done;
> + } else if (tcg_op_imm_match(INDEX_op_and, ctx->type, len_mask)) {
> + /* AND + SHL */
Even if these extracts are valid, can they really be cheaper than an AND
with an immediate argument, or back-to-back shifts? You still have a
dependency between the two instructions. I wouldn't bother with using
EXTRACT here.
Paolo
> + op2 = opt_insert_before(ctx, op, INDEX_op_and, 3);
> + op2->args[0] = ret;
> + op2->args[1] = arg2;
> + op2->args[2] = arg_new_constant(ctx, len_mask);
> + } else {
> + /* SHL + SHR */
> + int shl = width - len;
> + int shr = width - len - ofs;
> +
> + op2 = opt_insert_before(ctx, op, INDEX_op_shl, 3);
> + op2->args[0] = ret;
> + op2->args[1] = arg2;
> + op2->args[2] = arg_new_constant(ctx, shl);
> +
> + op->opc = INDEX_op_shr;
> + op->args[1] = ret;
> + op->args[2] = arg_new_constant(ctx, shr);
> + goto done;
> }
>
> - op2 = opt_insert_before(ctx, op, INDEX_op_and, 3);
> - op2->args[0] = ret;
> - op2->args[1] = arg2;
> - op2->args[2] = arg_new_constant(ctx, len_mask);
> - fold_and(ctx, op2);
> -
> + /* Finish the (EXTRACT|AND) + SHL cases. */
> op->opc = INDEX_op_shl;
> op->args[1] = ret;
> op->args[2] = arg_new_constant(ctx, ofs);
On 2/4/26 18:05, Paolo Bonzini wrote:
> On 2/4/26 06:24, Richard Henderson wrote:
>> Use tcg_op_imm_match to choose between expanding with AND+SHL vs SHL+SHR.
>>
>> Suggested-by: Paolo Bonzini <pbonzini@redhat.com>
>> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
>> ---
>> tcg/optimize.c | 40 +++++++++++++++++++++++++++++++---------
>> 1 file changed, 31 insertions(+), 9 deletions(-)
>>
>> diff --git a/tcg/optimize.c b/tcg/optimize.c
>> index e6a16921c9..2944c5a748 100644
>> --- a/tcg/optimize.c
>> +++ b/tcg/optimize.c
>> @@ -1743,10 +1743,17 @@ static bool fold_deposit(OptContext *ctx, TCGOp *op)
>> goto done;
>> }
>> - /* Lower invalid deposit into zero as AND + SHL or SHL + AND. */
>> + /* Lower invalid deposit into zero. */
>> if (!valid) {
>> - if (TCG_TARGET_extract_valid(ctx->type, 0, ofs + len) &&
>> - !TCG_TARGET_extract_valid(ctx->type, 0, len)) {
>> + if (TCG_TARGET_extract_valid(ctx->type, 0, len)) {
>> + /* EXTRACT (at 0) + SHL */
>> + op2 = opt_insert_before(ctx, op, INDEX_op_extract, 4);
>> + op2->args[0] = ret;
>> + op2->args[1] = arg2;
>> + op2->args[2] = 0;
>> + op2->args[3] = len;
>> + } else if (TCG_TARGET_extract_valid(ctx->type, 0, ofs + len)) {
>> + /* SHL + EXTRACT (at 0) */
>> op2 = opt_insert_before(ctx, op, INDEX_op_shl, 3);
>> op2->args[0] = ret;
>> op2->args[1] = arg2;
>> @@ -1757,14 +1764,29 @@ static bool fold_deposit(OptContext *ctx, TCGOp *op)
>> op->args[2] = 0;
>> op->args[3] = ofs + len;
>> goto done;
>> + } else if (tcg_op_imm_match(INDEX_op_and, ctx->type, len_mask)) {
>> + /* AND + SHL */
>
> Even if these extracts are valid, can they really be cheaper than an AND
> with an immediate argument, or back-to-back shifts?
This is primarily for x86.
(1) movz is 2-operand, so that may avoid clobbering an input,
(2) movz is 3-4 bytes, whereas and r/i32 is 6-7 bytes.
Because of these, there's a comment somewhere that says we'll prefer extract over and
(perhaps in tcg_gen_andi_* or fold_and). IIRC this also happens to simplify ppc and s390x
insn selection (and vs rotate and mask). AFAIK, no other hosts are penalized.
r~
On Wed, Feb 4, 2026, 10:06 Richard Henderson <richard.henderson@linaro.org>
wrote:

> On 2/4/26 18:05, Paolo Bonzini wrote:
> > On 2/4/26 06:24, Richard Henderson wrote:
> >> Use tcg_op_imm_match to choose between expanding with AND+SHL vs SHL+SHR.
> >> [...]
> >
> > Even if these extracts are valid, can they really be cheaper than an AND
> > with an immediate argument, or back-to-back shifts?
>
> This is primarily for x86.
>
> (1) movz is 2-operand, so that may avoid clobbering an input,
> (2) movz is 3-4 bytes, whereas and r/i32 is 6-7 bytes.
>
> Because of these, there's a comment somewhere that says we'll prefer
> extract over and (perhaps in tcg_gen_andi_* or fold_and). IIRC this also
> happens to simplify ppc and s390x insn selection (and vs rotate and mask).
> AFAIK, no other hosts are penalized.

I think it would be better to pick a canonical form for AND with 2^n-1 and
handle conversion to extract (like PPC rotates or movz) in the backend.

Picking AND as the canonical form also makes the macros for extract
validity simpler; adding an extra constraint for immediate 2^n-1 is easier,
and it generalizes to other PPC rotate-and-mask cases.
Paolo
On 2/4/26 20:41, Paolo Bonzini wrote:
> This is primarily for x86.
>
> (1) movz is 2-operand, so that may avoid clobbering an input,
> (2) movz is 3-4 bytes, whereas and r/i32 is 6-7 bytes.
>
> Because of these, there's a comment somewhere that says we'll prefer
> extract over and (perhaps in tcg_gen_andi_* or fold_and). IIRC this also
> happens to simplify ppc and s390x insn selection (and vs rotate and mask).
> AFAIK, no other hosts are penalized.
>
> I think it would be better to pick a canonical form for AND with 2^n-1 and
> handle conversion to extract (like PPC rotates or movz) in the backend.
>
> Picking AND as the canonical form also makes the macros for extract
> validity simpler; adding an extra constraint for immediate 2^n-1 is easier,
> and it generalizes to other PPC rotate-and-mask cases.

Picking AND means we have to use "r,0,ri" for x86, losing register
allocation flexibility.

r~
On Wed, Feb 4, 2026, 21:46 Richard Henderson <richard.henderson@linaro.org>
wrote:

> > I think it would be better to pick a canonical form for AND with 2^n-1
> > and handle conversion to extract (like PPC rotates or movz) in the
> > backend.
> >
> > Picking AND as the canonical form also makes the macros for extract
> > validity simpler; adding an extra constraint for immediate 2^n-1 is
> > easier, and it generalizes to other PPC rotate-and-mask cases.
>
> Picking AND means we have to use "r,0,ri" for x86, losing register
> allocation flexibility.

Then could you wrap the target-specific extract_valid with one that allows
ofs == 0 if AND allows the immediate 2^len-1? That would also simplify this
series.

Paolo
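
One possible shape for such a wrapper, as a hedged sketch only (the helper
name is invented; TCG_TARGET_extract_valid and tcg_op_imm_match are the
interfaces already used in this patch, and MAKE_64BIT_MASK is assumed to be
available from QEMU's bitops):

/*
 * Hypothetical wrapper, not an existing interface: treat an extract at
 * offset 0 as valid if the backend either implements it directly or can
 * encode the equivalent 2^len-1 immediate for AND.
 */
static bool extract_zero_ofs_valid(TCGType type, unsigned len)
{
    return TCG_TARGET_extract_valid(type, 0, len)
        || tcg_op_imm_match(INDEX_op_and, type, MAKE_64BIT_MASK(0, len));
}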
On 2/5/26 18:22, Paolo Bonzini wrote:
> Then could you wrap the target-specific extract_valid with one that allows
> ofs == 0 if AND allows the immediate 2^len-1? That would also simplify
> this series.

I don't understand your suggestion here.

r~
On Thu, Feb 5, 2026, 23:29 Richard Henderson <richard.henderson@linaro.org>
wrote:

> > Then could you wrap the target-specific extract_valid with one that
> > allows ofs == 0 if AND allows the immediate 2^len-1? That would also
> > simplify this series.
>
> I don't understand your suggestion here.

I am not sure about it either... I am just not sure why extract is
guaranteed to be cheaper or have better constraints than AND.

It does happen to be true for x86, though only for len == 8 or 16; but is
it true of all targets that have a more expansive extract instruction?

Paolo
On 2/6/26 09:22, Paolo Bonzini wrote:
> I am not sure about it either... I am just not sure why extract is
> guaranteed to be cheaper or have better constraints than AND.
>
> It does happen to be true for x86, though only for len == 8 or 16; but is
> it true of all targets that have a more expansive extract instruction?

x86 includes len == 32 via 'movl', fwiw.

Similarly, riscv64 has quite a number of filter conditions for extract,
mostly because of a 12-bit signed argument for AND, and a collection of
other zero-extend insns.

AArch64, loongarch64, and ppc64 all emit ANDI if possible during
tgen_extract.

So it really is all about using extract if valid, and allowing the backend
to use the more favorable set of constraints.

r~
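
For the riscv64 point, a hedged standalone illustration of the immediate
constraint involved (the real backend check lives under tcg/riscv/ and is
more involved; the helper name here is invented):

#include <stdbool.h>
#include <stdint.h>

/* RV64I andi takes a 12-bit sign-extended immediate, so an AND-based
 * zero-extension is only encodable for masks 2^len-1 up to 0x7ff
 * (len <= 11); wider low-bit extracts need a dedicated zero-extend
 * insn or a shift pair. */
static bool rv64_andi_imm_ok(int64_t imm)
{
    return imm >= -0x800 && imm <= 0x7ff;
}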