[PATCH] target/loongarch: Add support for dbar hint variants

Song Gao posted 1 patch 6 days, 16 hours ago
Patches applied successfully (tree, apply log)
git fetch https://github.com/patchew-project/qemu tags/patchew/20260327003206.3749780-1-gaosong@loongson.cn
Maintainers: Song Gao <gaosong@loongson.cn>
target/loongarch/cpu.c                        |  4 +
.../tcg/insn_trans/trans_memory.c.inc         | 82 ++++++++++++++++++-
target/loongarch/tcg/translate.c              |  1 +
target/loongarch/translate.h                  |  3 +
4 files changed, 88 insertions(+), 2 deletions(-)
[PATCH] target/loongarch: Add support for dbar hint variants
Posted by Song Gao 6 days, 16 hours ago
LoongArch architecture (since LA664) introduces fine-grained dbar
hints that allow controlling which memory accesses are ordered by
the barrier. Previously, all dbar instructions were treated as a
full barrier (TCG_MO_ALL | TCG_BAR_SC).

This patch adds support for decoding dbar hints and emitting the
appropriate TCG memory barrier flags. For CPUs that do not advertise
the DBAR_HINTS feature (cpucfg3.DBAR_HINTS = 0), all dbar hints
fall back to a full barrier, preserving compatibility.

The hint encoding follows the LoongArch v1.10 specification:
- Bit4: 0 = completion barrier, 1 = ordering barrier
        (ignored by TCG as TCG only supports ordering barriers)
- Bit3: barrier for previous reads (0 = enforce, 1 = relax)
- Bit2: barrier for previous writes (0 = enforce, 1 = relax)
- Bit1: barrier for succeeding reads (0 = enforce, 1 = relax)
- Bit0: barrier for succeeding writes (0 = enforce, 1 = relax)

The mapping to TCG memory order flags is as follows:
- TCG_MO_LD_LD is set if both previous and succeeding reads are ordered.
- TCG_MO_ST_LD is set if previous write and succeeding read are ordered.
- TCG_MO_LD_ST is set if previous read and succeeding write are ordered.
- TCG_MO_ST_ST is set if both previous and succeeding writes are ordered.

If the resulting flags describe an acquire or release barrier,
TCG_BAR_LDAQ or TCG_BAR_STRL is used accordingly; otherwise a
full SC barrier (TCG_BAR_SC) is emitted.

Special hint handling:
- hint 0x700: LL/SC loop barrier, treated as a full barrier as recommended.
- hint 0xf and 0x1f: reserved/no-op, treated as no operation

Signed-off-by: Song Gao <gaosong@loongson.cn>
---
 target/loongarch/cpu.c                        |  4 +
 .../tcg/insn_trans/trans_memory.c.inc         | 82 ++++++++++++++++++-
 target/loongarch/tcg/translate.c              |  1 +
 target/loongarch/translate.h                  |  3 +
 4 files changed, 88 insertions(+), 2 deletions(-)

diff --git a/target/loongarch/cpu.c b/target/loongarch/cpu.c
index e22568c84a..d8d106b07e 100644
--- a/target/loongarch/cpu.c
+++ b/target/loongarch/cpu.c
@@ -455,6 +455,10 @@ static void loongarch_max_initfn(Object *obj)
         data = FIELD_DP32(data, CPUCFG2, LLACQ_SCREL, 1);
         data = FIELD_DP32(data, CPUCFG2, SCQ, 1);
         cpu->env.cpucfg[2] = data;
+
+        data = cpu->env.cpucfg[3];
+        data = FIELD_DP32(data, CPUCFG3, DBAR_HINTS, 1);
+        cpu->env.cpucfg[3] = data;
     }
 }
 
diff --git a/target/loongarch/tcg/insn_trans/trans_memory.c.inc b/target/loongarch/tcg/insn_trans/trans_memory.c.inc
index e287d46363..99bc486119 100644
--- a/target/loongarch/tcg/insn_trans/trans_memory.c.inc
+++ b/target/loongarch/tcg/insn_trans/trans_memory.c.inc
@@ -137,11 +137,89 @@ static bool trans_preldx(DisasContext *ctx, arg_preldx * a)
     return true;
 }
 
+/*
+ * Decode dbar hint and emit appropriate TCG memory barrier.
+ *
+ * The fine-grained hint uses bits 4-0 of the instruction's immediate.
+ * For hint 0x700 (special LL/SC loop barrier), treat as full barrier.
+ *
+ * See LoongArch Reference Manual v1.10, Section 4.2.2 for details.
+ */
 static bool trans_dbar(DisasContext *ctx, arg_dbar * a)
 {
     tcg_gen_mb(TCG_BAR_SC | TCG_MO_ALL);
-    return true;
-}
+    int hint = a->imm;
+    TCGBar bar_flags = 0;
+
+    /* Reserved/no-op hints: 0xf and 0x1f */
+    if (hint == 0xf || hint == 0x1f) {
+        return true;
+    }
+
+    /* If the CPU does not support fine-grained hints, or for the special LL/SC
+     * loop barrier (0x700), emit a full barrier.
+     */
+    if (!avail_DBAR_HINT(ctx) || hint == 0x700) {
+        tcg_gen_mb(TCG_MO_ALL | TCG_BAR_SC);
+        return true;
+    }
+
+    /*
+     * Fine-grained hint decoding:
+     * Bits 3-0 control which accesses must be ordered.
+     *   bit3: barrier previous reads?   (0 = enforce, 1 = relax)
+     *   bit2: barrier previous writes?  (0 = enforce, 1 = relax)
+     *   bit1: barrier succeeding reads? (0 = enforce, 1 = relax)
+     *   bit0: barrier succeeding writes?(0 = enforce, 1 = relax)
+     *
+     * For each combination, we set the corresponding TCG_MO_* flag if both
+     * sides of the barrier require ordering.
+     */
+    bool prev_rd = !(hint & 0x08);   /* need barrier for previous reads */
+    bool prev_wr = !(hint & 0x04);   /* need barrier for previous writes */
+    bool succ_rd = !(hint & 0x02);   /* need barrier for succeeding reads */
+    bool succ_wr = !(hint & 0x01);   /* need barrier for succeeding writes */
+
+    if (prev_rd && succ_rd) {
+        bar_flags |= TCG_MO_LD_LD;
+    }
+    if (prev_wr && succ_rd) {
+        bar_flags |= TCG_MO_ST_LD;
+    }
+    if (prev_rd && succ_wr) {
+        bar_flags |= TCG_MO_LD_ST;
+    }
+    if (prev_wr && succ_wr) {
+        bar_flags |= TCG_MO_ST_ST;
+    }
+
+    /* If no flags were set, this is a no-op barrier */
+    if (bar_flags == 0) {
+        return true;
+    }
+
+    /*
+     * Use acquire/release semantics when possible to generate more efficient
+     * code. Otherwise, fall back to a sequential consistency barrier.
+     *
+     * Acquire: order loads before loads/stores (LD_LD | LD_ST)
+     * Release: order stores before stores/loads (ST_ST | ST_LD)
+     */
+    if ((bar_flags & (TCG_MO_LD_LD | TCG_MO_LD_ST)) &&
+        !(bar_flags & (TCG_MO_ST_ST | TCG_MO_ST_LD))) {
+        /* Only acquire flags present */
+        tcg_gen_mb(bar_flags | TCG_BAR_LDAQ);
+    } else if ((bar_flags & (TCG_MO_ST_ST | TCG_MO_ST_LD)) &&
+               !(bar_flags & (TCG_MO_LD_LD | TCG_MO_LD_ST))) {
+        /* Only release flags present */
+        tcg_gen_mb(bar_flags | TCG_BAR_STRL);
+    } else {
+        /* Mixed or full barrier */
+        tcg_gen_mb(bar_flags | TCG_BAR_SC);
+    }
+
+     return true;
+ }
 
 static bool trans_ibar(DisasContext *ctx, arg_ibar *a)
 {
diff --git a/target/loongarch/tcg/translate.c b/target/loongarch/tcg/translate.c
index b9ed13d19c..49280b1dd3 100644
--- a/target/loongarch/tcg/translate.c
+++ b/target/loongarch/tcg/translate.c
@@ -149,6 +149,7 @@ static void loongarch_tr_init_disas_context(DisasContextBase *dcbase,
 
     ctx->cpucfg1 = env->cpucfg[1];
     ctx->cpucfg2 = env->cpucfg[2];
+    ctx->cpucfg3 = env->cpucfg[3];
 }
 
 static void loongarch_tr_tb_start(DisasContextBase *dcbase, CPUState *cs)
diff --git a/target/loongarch/translate.h b/target/loongarch/translate.h
index ba1c89e57b..8aa8325dc6 100644
--- a/target/loongarch/translate.h
+++ b/target/loongarch/translate.h
@@ -43,6 +43,8 @@
 #define avail_LLACQ_SCREL(C)    (FIELD_EX32((C)->cpucfg2, CPUCFG2, LLACQ_SCREL))
 #define avail_LLACQ_SCREL_64(C) (avail_64(C) && avail_LLACQ_SCREL(C))
 
+#define avail_DBAR_HINT(C) (FIELD_EX32((C)->cpucfg3, CPUCFG3, DBAR_HINTS))
+
 /*
  * If an operation is being performed on less than TARGET_LONG_BITS,
  * it may require the inputs to be sign- or zero-extended; which will
@@ -66,6 +68,7 @@ typedef struct DisasContext {
     bool va32; /* 32-bit virtual address */
     uint32_t cpucfg1;
     uint32_t cpucfg2;
+    uint32_t cpucfg3;
 } DisasContext;
 
 void generate_exception(DisasContext *ctx, int excp);
-- 
2.47.3
Re: [PATCH] target/loongarch: Add support for dbar hint variants
Posted by Bibo Mao 15 hours ago

On 2026/3/27 上午8:32, Song Gao wrote:
> LoongArch architecture (since LA664) introduces fine-grained dbar
> hints that allow controlling which memory accesses are ordered by
> the barrier. Previously, all dbar instructions were treated as a
> full barrier (TCG_MO_ALL | TCG_BAR_SC).
> 
> This patch adds support for decoding dbar hints and emitting the
> appropriate TCG memory barrier flags. For CPUs that do not advertise
> the DBAR_HINTS feature (cpucfg3.DBAR_HINTS = 0), all dbar hints
> fall back to a full barrier, preserving compatibility.
> 
> The hint encoding follows the LoongArch v1.10 specification:
> - Bit4: 0 = completion barrier, 1 = ordering barrier
>          (ignored by TCG as TCG only supports ordering barriers)
> - Bit3: barrier for previous reads (0 = enforce, 1 = relax)
> - Bit2: barrier for previous writes (0 = enforce, 1 = relax)
> - Bit1: barrier for succeeding reads (0 = enforce, 1 = relax)
> - Bit0: barrier for succeeding writes (0 = enforce, 1 = relax)
> 
> The mapping to TCG memory order flags is as follows:
> - TCG_MO_LD_LD is set if both previous and succeeding reads are ordered.
> - TCG_MO_ST_LD is set if previous write and succeeding read are ordered.
> - TCG_MO_LD_ST is set if previous read and succeeding write are ordered.
> - TCG_MO_ST_ST is set if both previous and succeeding writes are ordered.
> 
> If the resulting flags describe an acquire or release barrier,
> TCG_BAR_LDAQ or TCG_BAR_STRL is used accordingly; otherwise a
> full SC barrier (TCG_BAR_SC) is emitted.
> 
> Special hint handling:
> - hint 0x700: LL/SC loop barrier, treated as a full barrier as recommended.
> - hint 0xf and 0x1f: reserved/no-op, treated as no operation
> 
> Signed-off-by: Song Gao <gaosong@loongson.cn>
> ---
>   target/loongarch/cpu.c                        |  4 +
>   .../tcg/insn_trans/trans_memory.c.inc         | 82 ++++++++++++++++++-
>   target/loongarch/tcg/translate.c              |  1 +
>   target/loongarch/translate.h                  |  3 +
>   4 files changed, 88 insertions(+), 2 deletions(-)
> 
> diff --git a/target/loongarch/cpu.c b/target/loongarch/cpu.c
> index e22568c84a..d8d106b07e 100644
> --- a/target/loongarch/cpu.c
> +++ b/target/loongarch/cpu.c
> @@ -455,6 +455,10 @@ static void loongarch_max_initfn(Object *obj)
>           data = FIELD_DP32(data, CPUCFG2, LLACQ_SCREL, 1);
>           data = FIELD_DP32(data, CPUCFG2, SCQ, 1);
>           cpu->env.cpucfg[2] = data;
> +
> +        data = cpu->env.cpucfg[3];
> +        data = FIELD_DP32(data, CPUCFG3, DBAR_HINTS, 1);
> +        cpu->env.cpucfg[3] = data;
>       }
>   }
>   
> diff --git a/target/loongarch/tcg/insn_trans/trans_memory.c.inc b/target/loongarch/tcg/insn_trans/trans_memory.c.inc
> index e287d46363..99bc486119 100644
> --- a/target/loongarch/tcg/insn_trans/trans_memory.c.inc
> +++ b/target/loongarch/tcg/insn_trans/trans_memory.c.inc
> @@ -137,11 +137,89 @@ static bool trans_preldx(DisasContext *ctx, arg_preldx * a)
>       return true;
>   }
>   
> +/*
> + * Decode dbar hint and emit appropriate TCG memory barrier.
> + *
> + * The hint is a 5-bit field (0-31) encoded in the instruction.
> + * For hint 0x700 (special LL/SC loop barrier), treat as full barrier.
> + *
> + * See LoongArch Reference Manual v1.10, Section 4.2.2 for details.
> + */
>   static bool trans_dbar(DisasContext *ctx, arg_dbar * a)
>   {
>       tcg_gen_mb(TCG_BAR_SC | TCG_MO_ALL);
> -    return true;
> -}
> +    int hint = a->imm;
> +    TCGBar bar_flags = 0;
> +
> +    /* Reserved/no-op hints: 0xf and 0x1f */
> +    if (hint == 0xf || hint == 0x1f) {
> +        return true;
> +    }
> +
> +    /* If the CPU does not support fine-grained hints,or for the special LL/SC
> +     * loop barrier (0x700), emit a full barrier.
> +     */
> +    if (!avail_DBAR_HINT(ctx) || hint == 0x700) {
> +        tcg_gen_mb(TCG_MO_ALL | TCG_BAR_SC);
> +        return true;
> +    }
> +
> +    /*
> +     * Fine-grained hint decoding:
> +     * Bits 3-0 control which accesses must be ordered.
> +     *   bit3: barrier previous reads?   (0 = enforce, 1 = relax)
> +     *   bit2: barrier previous writes?  (0 = enforce, 1 = relax)
> +     *   bit1: barrier succeeding reads? (0 = enforce, 1 = relax)
> +     *   bit0: barrier succeeding writes?(0 = enforce, 1 = relax)
> +     *
> +     * For each combination, we set the corresponding TCG_MO_* flag if both
> +     * sides of the barrier require ordering.
> +     */
> +    bool prev_rd = !(hint & 0x08);   /* need barrier for previous reads */
> +    bool prev_wr = !(hint & 0x04);   /* need barrier for previous writes */
> +    bool succ_rd = !(hint & 0x02);   /* need barrier for succeeding reads */
> +    bool succ_wr = !(hint & 0x01);   /* need barrier for succeeding writes */
> +
> +    if (prev_rd && succ_rd) {
> +        bar_flags |= TCG_MO_LD_LD;
> +    }
> +    if (prev_wr && succ_rd) {
> +        bar_flags |= TCG_MO_ST_LD;
> +    }
> +    if (prev_rd && succ_wr) {
> +        bar_flags |= TCG_MO_LD_ST;
> +    }
> +    if (prev_wr && succ_wr) {
> +        bar_flags |= TCG_MO_ST_ST;
> +    }
I do not know the memory order; however, it seems to be different
from other architectures, which use the following when translated to
QEMU TCG code. I do not know which is right.

__smp_rmb() ---> tcg_gen_mb(TCG_BAR_SC | TCG_MO_LD_LD | TCG_MO_LD_ST)
__smp_wmb() ---> tcg_gen_mb(TCG_BAR_SC | TCG_MO_ST_ST)
__smp_mb()  ---> tcg_gen_mb(TCG_BAR_SC | TCG_MO_ALL)

Regards
Bibo Mao
> +
> +    /* If no flags were set, this is a no-op barrier */
> +    if (bar_flags == 0) {
> +        return true;
> +    }
> +
> +    /*
> +     * Use acquire/release semantics when possible to generate more efficient
> +     * code. Otherwise, fall back to a sequential consistency barrier.
> +     *
> +     * Acquire: order loads before loads/stores (LD_LD | LD_ST)
> +     * Release: order stores before stores/loads (ST_ST | ST_LD)
> +     */
> +    if ((bar_flags & (TCG_MO_LD_LD | TCG_MO_LD_ST)) &&
> +        !(bar_flags & (TCG_MO_ST_ST | TCG_MO_ST_LD))) {
> +        /* Only acquire flags present */
> +        tcg_gen_mb(bar_flags | TCG_BAR_LDAQ);
> +    } else if ((bar_flags & (TCG_MO_ST_ST | TCG_MO_ST_LD)) &&
> +               !(bar_flags & (TCG_MO_LD_LD | TCG_MO_LD_ST))) {
> +        /* Only release flags present */
> +        tcg_gen_mb(bar_flags | TCG_BAR_STRL);
> +    } else {
> +        /* Mixed or full barrier */
> +        tcg_gen_mb(bar_flags | TCG_BAR_SC);
> +    }
> +
> +     return true;
> + }
>   
>   static bool trans_ibar(DisasContext *ctx, arg_ibar *a)
>   {
> diff --git a/target/loongarch/tcg/translate.c b/target/loongarch/tcg/translate.c
> index b9ed13d19c..49280b1dd3 100644
> --- a/target/loongarch/tcg/translate.c
> +++ b/target/loongarch/tcg/translate.c
> @@ -149,6 +149,7 @@ static void loongarch_tr_init_disas_context(DisasContextBase *dcbase,
>   
>       ctx->cpucfg1 = env->cpucfg[1];
>       ctx->cpucfg2 = env->cpucfg[2];
> +    ctx->cpucfg3 = env->cpucfg[3];
>   }
>   
>   static void loongarch_tr_tb_start(DisasContextBase *dcbase, CPUState *cs)
> diff --git a/target/loongarch/translate.h b/target/loongarch/translate.h
> index ba1c89e57b..8aa8325dc6 100644
> --- a/target/loongarch/translate.h
> +++ b/target/loongarch/translate.h
> @@ -43,6 +43,8 @@
>   #define avail_LLACQ_SCREL(C)    (FIELD_EX32((C)->cpucfg2, CPUCFG2, LLACQ_SCREL))
>   #define avail_LLACQ_SCREL_64(C) (avail_64(C) && avail_LLACQ_SCREL(C))
>   
> +#define avail_DBAR_HINT(C) (FIELD_EX32((C)->cpucfg3, CPUCFG3, DBAR_HINTS))
> +
>   /*
>    * If an operation is being performed on less than TARGET_LONG_BITS,
>    * it may require the inputs to be sign- or zero-extended; which will
> @@ -66,6 +68,7 @@ typedef struct DisasContext {
>       bool va32; /* 32-bit virtual address */
>       uint32_t cpucfg1;
>       uint32_t cpucfg2;
> +    uint32_t cpucfg3;
>   } DisasContext;
>   
>   void generate_exception(DisasContext *ctx, int excp);
> 


Re: [PATCH] target/loongarch: Add support for dbar hint variants
Posted by gaosong 5 hours ago
在 2026/4/2 上午10:17, Bibo Mao 写道:
>
>
> On 2026/3/27 上午8:32, Song Gao wrote:
>> LoongArch architecture (since LA664) introduces fine-grained dbar
>> hints that allow controlling which memory accesses are ordered by
>> the barrier. Previously, all dbar instructions were treated as a
>> full barrier (TCG_MO_ALL | TCG_BAR_SC).
>>
>> This patch adds support for decoding dbar hints and emitting the
>> appropriate TCG memory barrier flags. For CPUs that do not advertise
>> the DBAR_HINTS feature (cpucfg3.DBAR_HINTS = 0), all dbar hints
>> fall back to a full barrier, preserving compatibility.
>>
>> The hint encoding follows the LoongArch v1.10 specification:
>> - Bit4: 0 = completion barrier, 1 = ordering barrier
>>          (ignored by TCG as TCG only supports ordering barriers)
>> - Bit3: barrier for previous reads (0 = enforce, 1 = relax)
>> - Bit2: barrier for previous writes (0 = enforce, 1 = relax)
>> - Bit1: barrier for succeeding reads (0 = enforce, 1 = relax)
>> - Bit0: barrier for succeeding writes (0 = enforce, 1 = relax)
>>
>> The mapping to TCG memory order flags is as follows:
>> - TCG_MO_LD_LD is set if both previous and succeeding reads are ordered.
>> - TCG_MO_ST_LD is set if previous write and succeeding read are ordered.
>> - TCG_MO_LD_ST is set if previous read and succeeding write are ordered.
>> - TCG_MO_ST_ST is set if both previous and succeeding writes are 
>> ordered.
>>
>> If the resulting flags describe an acquire or release barrier,
>> TCG_BAR_LDAQ or TCG_BAR_STRL is used accordingly; otherwise a
>> full SC barrier (TCG_BAR_SC) is emitted.
>>
>> Special hint handling:
>> - hint 0x700: LL/SC loop barrier, treated as a full barrier as 
>> recommended.
>> - hint 0xf and 0x1f: reserved/no-op, treated as no operation
>>
>> Signed-off-by: Song Gao <gaosong@loongson.cn>
>> ---
>>   target/loongarch/cpu.c                        |  4 +
>>   .../tcg/insn_trans/trans_memory.c.inc         | 82 ++++++++++++++++++-
>>   target/loongarch/tcg/translate.c              |  1 +
>>   target/loongarch/translate.h                  |  3 +
>>   4 files changed, 88 insertions(+), 2 deletions(-)
>>
>> diff --git a/target/loongarch/cpu.c b/target/loongarch/cpu.c
>> index e22568c84a..d8d106b07e 100644
>> --- a/target/loongarch/cpu.c
>> +++ b/target/loongarch/cpu.c
>> @@ -455,6 +455,10 @@ static void loongarch_max_initfn(Object *obj)
>>           data = FIELD_DP32(data, CPUCFG2, LLACQ_SCREL, 1);
>>           data = FIELD_DP32(data, CPUCFG2, SCQ, 1);
>>           cpu->env.cpucfg[2] = data;
>> +
>> +        data = cpu->env.cpucfg[3];
>> +        data = FIELD_DP32(data, CPUCFG3, DBAR_HINTS, 1);
>> +        cpu->env.cpucfg[3] = data;
>>       }
>>   }
>>   diff --git a/target/loongarch/tcg/insn_trans/trans_memory.c.inc 
>> b/target/loongarch/tcg/insn_trans/trans_memory.c.inc
>> index e287d46363..99bc486119 100644
>> --- a/target/loongarch/tcg/insn_trans/trans_memory.c.inc
>> +++ b/target/loongarch/tcg/insn_trans/trans_memory.c.inc
>> @@ -137,11 +137,89 @@ static bool trans_preldx(DisasContext *ctx, 
>> arg_preldx * a)
>>       return true;
>>   }
>>   +/*
>> + * Decode dbar hint and emit appropriate TCG memory barrier.
>> + *
>> + * The hint is a 5-bit field (0-31) encoded in the instruction.
>> + * For hint 0x700 (special LL/SC loop barrier), treat as full barrier.
>> + *
>> + * See LoongArch Reference Manual v1.10, Section 4.2.2 for details.
>> + */
>>   static bool trans_dbar(DisasContext *ctx, arg_dbar * a)
>>   {
>>       tcg_gen_mb(TCG_BAR_SC | TCG_MO_ALL);
>> -    return true;
>> -}
>> +    int hint = a->imm;
>> +    TCGBar bar_flags = 0;
>> +
>> +    /* Reserved/no-op hints: 0xf and 0x1f */
>> +    if (hint == 0xf || hint == 0x1f) {
>> +        return true;
>> +    }
>> +
>> +    /* If the CPU does not support fine-grained hints,or for the 
>> special LL/SC
>> +     * loop barrier (0x700), emit a full barrier.
>> +     */
>> +    if (!avail_DBAR_HINT(ctx) || hint == 0x700) {
>> +        tcg_gen_mb(TCG_MO_ALL | TCG_BAR_SC);
>> +        return true;
>> +    }
>> +
>> +    /*
>> +     * Fine-grained hint decoding:
>> +     * Bits 3-0 control which accesses must be ordered.
>> +     *   bit3: barrier previous reads?   (0 = enforce, 1 = relax)
>> +     *   bit2: barrier previous writes?  (0 = enforce, 1 = relax)
>> +     *   bit1: barrier succeeding reads? (0 = enforce, 1 = relax)
>> +     *   bit0: barrier succeeding writes?(0 = enforce, 1 = relax)
>> +     *
>> +     * For each combination, we set the corresponding TCG_MO_* flag 
>> if both
>> +     * sides of the barrier require ordering.
>> +     */
>> +    bool prev_rd = !(hint & 0x08);   /* need barrier for previous 
>> reads */
>> +    bool prev_wr = !(hint & 0x04);   /* need barrier for previous 
>> writes */
>> +    bool succ_rd = !(hint & 0x02);   /* need barrier for succeeding 
>> reads */
>> +    bool succ_wr = !(hint & 0x01);   /* need barrier for succeeding 
>> writes */
>> +
>> +    if (prev_rd && succ_rd) {
>> +        bar_flags |= TCG_MO_LD_LD;
>> +    }
>> +    if (prev_wr && succ_rd) {
>> +        bar_flags |= TCG_MO_ST_LD;
>> +    }
>> +    if (prev_rd && succ_wr) {
>> +        bar_flags |= TCG_MO_LD_ST;
>> +    }
>> +    if (prev_wr && succ_wr) {
>> +        bar_flags |= TCG_MO_ST_ST;
>> +    }
> I do not know the memory order, however it seems that it is different 
> with other architectures with following usage when it is translated to 
> QEMU TCG code. I do now which is right.
>
> __smp_rmb() ---> tcg_gen_mb(TCG_BAR_SC | TCG_MO_LD_LD | TCG_MO_LD_ST)
> __smp_wmb() ---> tcg_gen_mb(TCG_BAR_SC | TCG_MO_ST_ST)
> __smp_mb()  ---> tcg_gen_mb(TCG_BAR_SC | TCG_MO_ALL)
>
Do you mean that TCG_BAR_SC might be missing here?

It is actually handled later: for non-acquire/release cases we fall back to
tcg_gen_mb(bar_flags | TCG_BAR_SC).

Thanks.
Song Gao

> Regards
> Bibo Mao
>> +
>> +    /* If no flags were set, this is a no-op barrier */
>> +    if (bar_flags == 0) {
>> +        return true;
>> +    }
>> +
>> +    /*
>> +     * Use acquire/release semantics when possible to generate more 
>> efficient
>> +     * code. Otherwise, fall back to a sequential consistency barrier.
>> +     *
>> +     * Acquire: order loads before loads/stores (LD_LD | LD_ST)
>> +     * Release: order stores before stores/loads (ST_ST | ST_LD)
>> +     */
>> +    if ((bar_flags & (TCG_MO_LD_LD | TCG_MO_LD_ST)) &&
>> +        !(bar_flags & (TCG_MO_ST_ST | TCG_MO_ST_LD))) {
>> +        /* Only acquire flags present */
>> +        tcg_gen_mb(bar_flags | TCG_BAR_LDAQ);
>> +    } else if ((bar_flags & (TCG_MO_ST_ST | TCG_MO_ST_LD)) &&
>> +               !(bar_flags & (TCG_MO_LD_LD | TCG_MO_LD_ST))) {
>> +        /* Only release flags present */
>> +        tcg_gen_mb(bar_flags | TCG_BAR_STRL);
>> +    } else {
>> +        /* Mixed or full barrier */
>> +        tcg_gen_mb(bar_flags | TCG_BAR_SC);
>> +    }
>> +
>> +     return true;
>> + }
>>     static bool trans_ibar(DisasContext *ctx, arg_ibar *a)
>>   {
>> diff --git a/target/loongarch/tcg/translate.c 
>> b/target/loongarch/tcg/translate.c
>> index b9ed13d19c..49280b1dd3 100644
>> --- a/target/loongarch/tcg/translate.c
>> +++ b/target/loongarch/tcg/translate.c
>> @@ -149,6 +149,7 @@ static void 
>> loongarch_tr_init_disas_context(DisasContextBase *dcbase,
>>         ctx->cpucfg1 = env->cpucfg[1];
>>       ctx->cpucfg2 = env->cpucfg[2];
>> +    ctx->cpucfg3 = env->cpucfg[3];
>>   }
>>     static void loongarch_tr_tb_start(DisasContextBase *dcbase, 
>> CPUState *cs)
>> diff --git a/target/loongarch/translate.h b/target/loongarch/translate.h
>> index ba1c89e57b..8aa8325dc6 100644
>> --- a/target/loongarch/translate.h
>> +++ b/target/loongarch/translate.h
>> @@ -43,6 +43,8 @@
>>   #define avail_LLACQ_SCREL(C)    (FIELD_EX32((C)->cpucfg2, CPUCFG2, 
>> LLACQ_SCREL))
>>   #define avail_LLACQ_SCREL_64(C) (avail_64(C) && avail_LLACQ_SCREL(C))
>>   +#define avail_DBAR_HINT(C) (FIELD_EX32((C)->cpucfg3, CPUCFG3, 
>> DBAR_HINTS))
>> +
>>   /*
>>    * If an operation is being performed on less than TARGET_LONG_BITS,
>>    * it may require the inputs to be sign- or zero-extended; which will
>> @@ -66,6 +68,7 @@ typedef struct DisasContext {
>>       bool va32; /* 32-bit virtual address */
>>       uint32_t cpucfg1;
>>       uint32_t cpucfg2;
>> +    uint32_t cpucfg3;
>>   } DisasContext;
>>     void generate_exception(DisasContext *ctx, int excp);
>>


Re: [PATCH] target/loongarch: Add support for dbar hint variants
Posted by Bibo Mao 3 days, 9 hours ago

On 2026/3/27 上午8:32, Song Gao wrote:
> LoongArch architecture (since LA664) introduces fine-grained dbar
> hints that allow controlling which memory accesses are ordered by
> the barrier. Previously, all dbar instructions were treated as a
> full barrier (TCG_MO_ALL | TCG_BAR_SC).
> 
> This patch adds support for decoding dbar hints and emitting the
> appropriate TCG memory barrier flags. For CPUs that do not advertise
> the DBAR_HINTS feature (cpucfg3.DBAR_HINTS = 0), all dbar hints
> fall back to a full barrier, preserving compatibility.
> 
> The hint encoding follows the LoongArch v1.10 specification:
> - Bit4: 0 = completion barrier, 1 = ordering barrier
>          (ignored by TCG as TCG only supports ordering barriers)
> - Bit3: barrier for previous reads (0 = enforce, 1 = relax)
> - Bit2: barrier for previous writes (0 = enforce, 1 = relax)
> - Bit1: barrier for succeeding reads (0 = enforce, 1 = relax)
> - Bit0: barrier for succeeding writes (0 = enforce, 1 = relax)
> 
> The mapping to TCG memory order flags is as follows:
> - TCG_MO_LD_LD is set if both previous and succeeding reads are ordered.
> - TCG_MO_ST_LD is set if previous write and succeeding read are ordered.
> - TCG_MO_LD_ST is set if previous read and succeeding write are ordered.
> - TCG_MO_ST_ST is set if both previous and succeeding writes are ordered.
> 
> If the resulting flags describe an acquire or release barrier,
> TCG_BAR_LDAQ or TCG_BAR_STRL is used accordingly; otherwise a
> full SC barrier (TCG_BAR_SC) is emitted.
> 
> Special hint handling:
> - hint 0x700: LL/SC loop barrier, treated as a full barrier as recommended.
> - hint 0xf and 0x1f: reserved/no-op, treated as no operation
> 
> Signed-off-by: Song Gao <gaosong@loongson.cn>
> ---
>   target/loongarch/cpu.c                        |  4 +
>   .../tcg/insn_trans/trans_memory.c.inc         | 82 ++++++++++++++++++-
>   target/loongarch/tcg/translate.c              |  1 +
>   target/loongarch/translate.h                  |  3 +
>   4 files changed, 88 insertions(+), 2 deletions(-)
> 
> diff --git a/target/loongarch/cpu.c b/target/loongarch/cpu.c
> index e22568c84a..d8d106b07e 100644
> --- a/target/loongarch/cpu.c
> +++ b/target/loongarch/cpu.c
> @@ -455,6 +455,10 @@ static void loongarch_max_initfn(Object *obj)
>           data = FIELD_DP32(data, CPUCFG2, LLACQ_SCREL, 1);
>           data = FIELD_DP32(data, CPUCFG2, SCQ, 1);
>           cpu->env.cpucfg[2] = data;
> +
> +        data = cpu->env.cpucfg[3];
> +        data = FIELD_DP32(data, CPUCFG3, DBAR_HINTS, 1);
> +        cpu->env.cpucfg[3] = data;
>       }
>   }
>   
> diff --git a/target/loongarch/tcg/insn_trans/trans_memory.c.inc b/target/loongarch/tcg/insn_trans/trans_memory.c.inc
> index e287d46363..99bc486119 100644
> --- a/target/loongarch/tcg/insn_trans/trans_memory.c.inc
> +++ b/target/loongarch/tcg/insn_trans/trans_memory.c.inc
> @@ -137,11 +137,89 @@ static bool trans_preldx(DisasContext *ctx, arg_preldx * a)
>       return true;
>   }
>   
> +/*
> + * Decode dbar hint and emit appropriate TCG memory barrier.
> + *
> + * The hint is a 5-bit field (0-31) encoded in the instruction.
> + * For hint 0x700 (special LL/SC loop barrier), treat as full barrier.
> + *
> + * See LoongArch Reference Manual v1.10, Section 4.2.2 for details.
> + */
>   static bool trans_dbar(DisasContext *ctx, arg_dbar * a)
>   {
>       tcg_gen_mb(TCG_BAR_SC | TCG_MO_ALL);
I am not familiar with memory order. I only point out the possible 
problem purely from code.

Should tcg_gen_mb() be moved afterward? Otherwise tcg_gen_mb() may be
called twice.
> -    return true;
> -}
> +    int hint = a->imm;
> +    TCGBar bar_flags = 0;
> +
> +    /* Reserved/no-op hints: 0xf and 0x1f */
> +    if (hint == 0xf || hint == 0x1f) {
> +        return true;
> +    }
Ditto, should it move afterward?

Regards
Bibo Mao
> +
> +    /* If the CPU does not support fine-grained hints,or for the special LL/SC
> +     * loop barrier (0x700), emit a full barrier.
> +     */
> +    if (!avail_DBAR_HINT(ctx) || hint == 0x700) {
> +        tcg_gen_mb(TCG_MO_ALL | TCG_BAR_SC);
> +        return true;
> +    }
> +
> +    /*
> +     * Fine-grained hint decoding:
> +     * Bits 3-0 control which accesses must be ordered.
> +     *   bit3: barrier previous reads?   (0 = enforce, 1 = relax)
> +     *   bit2: barrier previous writes?  (0 = enforce, 1 = relax)
> +     *   bit1: barrier succeeding reads? (0 = enforce, 1 = relax)
> +     *   bit0: barrier succeeding writes?(0 = enforce, 1 = relax)
> +     *
> +     * For each combination, we set the corresponding TCG_MO_* flag if both
> +     * sides of the barrier require ordering.
> +     */
> +    bool prev_rd = !(hint & 0x08);   /* need barrier for previous reads */
> +    bool prev_wr = !(hint & 0x04);   /* need barrier for previous writes */
> +    bool succ_rd = !(hint & 0x02);   /* need barrier for succeeding reads */
> +    bool succ_wr = !(hint & 0x01);   /* need barrier for succeeding writes */
> +
> +    if (prev_rd && succ_rd) {
> +        bar_flags |= TCG_MO_LD_LD;
> +    }
> +    if (prev_wr && succ_rd) {
> +        bar_flags |= TCG_MO_ST_LD;
> +    }
> +    if (prev_rd && succ_wr) {
> +        bar_flags |= TCG_MO_LD_ST;
> +    }
> +    if (prev_wr && succ_wr) {
> +        bar_flags |= TCG_MO_ST_ST;
> +    }
> +
> +    /* If no flags were set, this is a no-op barrier */
> +    if (bar_flags == 0) {
> +        return true;
> +    }
> +
> +    /*
> +     * Use acquire/release semantics when possible to generate more efficient
> +     * code. Otherwise, fall back to a sequential consistency barrier.
> +     *
> +     * Acquire: order loads before loads/stores (LD_LD | LD_ST)
> +     * Release: order stores before stores/loads (ST_ST | ST_LD)
> +     */
> +    if ((bar_flags & (TCG_MO_LD_LD | TCG_MO_LD_ST)) &&
> +        !(bar_flags & (TCG_MO_ST_ST | TCG_MO_ST_LD))) {
> +        /* Only acquire flags present */
> +        tcg_gen_mb(bar_flags | TCG_BAR_LDAQ);
> +    } else if ((bar_flags & (TCG_MO_ST_ST | TCG_MO_ST_LD)) &&
> +               !(bar_flags & (TCG_MO_LD_LD | TCG_MO_LD_ST))) {
> +        /* Only release flags present */
> +        tcg_gen_mb(bar_flags | TCG_BAR_STRL);
> +    } else {
> +        /* Mixed or full barrier */
> +        tcg_gen_mb(bar_flags | TCG_BAR_SC);
> +    }
> +
> +     return true;
> + }
>   
>   static bool trans_ibar(DisasContext *ctx, arg_ibar *a)
>   {
> diff --git a/target/loongarch/tcg/translate.c b/target/loongarch/tcg/translate.c
> index b9ed13d19c..49280b1dd3 100644
> --- a/target/loongarch/tcg/translate.c
> +++ b/target/loongarch/tcg/translate.c
> @@ -149,6 +149,7 @@ static void loongarch_tr_init_disas_context(DisasContextBase *dcbase,
>   
>       ctx->cpucfg1 = env->cpucfg[1];
>       ctx->cpucfg2 = env->cpucfg[2];
> +    ctx->cpucfg3 = env->cpucfg[3];
>   }
>   
>   static void loongarch_tr_tb_start(DisasContextBase *dcbase, CPUState *cs)
> diff --git a/target/loongarch/translate.h b/target/loongarch/translate.h
> index ba1c89e57b..8aa8325dc6 100644
> --- a/target/loongarch/translate.h
> +++ b/target/loongarch/translate.h
> @@ -43,6 +43,8 @@
>   #define avail_LLACQ_SCREL(C)    (FIELD_EX32((C)->cpucfg2, CPUCFG2, LLACQ_SCREL))
>   #define avail_LLACQ_SCREL_64(C) (avail_64(C) && avail_LLACQ_SCREL(C))
>   
> +#define avail_DBAR_HINT(C) (FIELD_EX32((C)->cpucfg3, CPUCFG3, DBAR_HINTS))
> +
>   /*
>    * If an operation is being performed on less than TARGET_LONG_BITS,
>    * it may require the inputs to be sign- or zero-extended; which will
> @@ -66,6 +68,7 @@ typedef struct DisasContext {
>       bool va32; /* 32-bit virtual address */
>       uint32_t cpucfg1;
>       uint32_t cpucfg2;
> +    uint32_t cpucfg3;
>   } DisasContext;
>   
>   void generate_exception(DisasContext *ctx, int excp);
> 


Re: [PATCH] target/loongarch: Add support for dbar hint variants
Posted by gaosong 3 days, 8 hours ago
在 2026/3/30 下午3:57, Bibo Mao 写道:
>
>
> On 2026/3/27 上午8:32, Song Gao wrote:
>> LoongArch architecture (since LA664) introduces fine-grained dbar
>> hints that allow controlling which memory accesses are ordered by
>> the barrier. Previously, all dbar instructions were treated as a
>> full barrier (TCG_MO_ALL | TCG_BAR_SC).
>>
>> This patch adds support for decoding dbar hints and emitting the
>> appropriate TCG memory barrier flags. For CPUs that do not advertise
>> the DBAR_HINTS feature (cpucfg3.DBAR_HINTS = 0), all dbar hints
>> fall back to a full barrier, preserving compatibility.
>>
>> The hint encoding follows the LoongArch v1.10 specification:
>> - Bit4: 0 = completion barrier, 1 = ordering barrier
>>          (ignored by TCG as TCG only supports ordering barriers)
>> - Bit3: barrier for previous reads (0 = enforce, 1 = relax)
>> - Bit2: barrier for previous writes (0 = enforce, 1 = relax)
>> - Bit1: barrier for succeeding reads (0 = enforce, 1 = relax)
>> - Bit0: barrier for succeeding writes (0 = enforce, 1 = relax)
>>
>> The mapping to TCG memory order flags is as follows:
>> - TCG_MO_LD_LD is set if both previous and succeeding reads are ordered.
>> - TCG_MO_ST_LD is set if previous write and succeeding read are ordered.
>> - TCG_MO_LD_ST is set if previous read and succeeding write are ordered.
>> - TCG_MO_ST_ST is set if both previous and succeeding writes are 
>> ordered.
>>
>> If the resulting flags describe an acquire or release barrier,
>> TCG_BAR_LDAQ or TCG_BAR_STRL is used accordingly; otherwise a
>> full SC barrier (TCG_BAR_SC) is emitted.
>>
>> Special hint handling:
>> - hint 0x700: LL/SC loop barrier, treated as a full barrier as 
>> recommended.
>> - hint 0xf and 0x1f: reserved/no-op, treated as no operation
>>
>> Signed-off-by: Song Gao <gaosong@loongson.cn>
>> ---
>>   target/loongarch/cpu.c                        |  4 +
>>   .../tcg/insn_trans/trans_memory.c.inc         | 82 ++++++++++++++++++-
>>   target/loongarch/tcg/translate.c              |  1 +
>>   target/loongarch/translate.h                  |  3 +
>>   4 files changed, 88 insertions(+), 2 deletions(-)
>>
>> diff --git a/target/loongarch/cpu.c b/target/loongarch/cpu.c
>> index e22568c84a..d8d106b07e 100644
>> --- a/target/loongarch/cpu.c
>> +++ b/target/loongarch/cpu.c
>> @@ -455,6 +455,10 @@ static void loongarch_max_initfn(Object *obj)
>>           data = FIELD_DP32(data, CPUCFG2, LLACQ_SCREL, 1);
>>           data = FIELD_DP32(data, CPUCFG2, SCQ, 1);
>>           cpu->env.cpucfg[2] = data;
>> +
>> +        data = cpu->env.cpucfg[3];
>> +        data = FIELD_DP32(data, CPUCFG3, DBAR_HINTS, 1);
>> +        cpu->env.cpucfg[3] = data;
>>       }
>>   }
>>   diff --git a/target/loongarch/tcg/insn_trans/trans_memory.c.inc 
>> b/target/loongarch/tcg/insn_trans/trans_memory.c.inc
>> index e287d46363..99bc486119 100644
>> --- a/target/loongarch/tcg/insn_trans/trans_memory.c.inc
>> +++ b/target/loongarch/tcg/insn_trans/trans_memory.c.inc
>> @@ -137,11 +137,89 @@ static bool trans_preldx(DisasContext *ctx, 
>> arg_preldx * a)
>>       return true;
>>   }
>>   +/*
>> + * Decode dbar hint and emit appropriate TCG memory barrier.
>> + *
>> + * The hint is a 5-bit field (0-31) encoded in the instruction.
>> + * For hint 0x700 (special LL/SC loop barrier), treat as full barrier.
>> + *
>> + * See LoongArch Reference Manual v1.10, Section 4.2.2 for details.
>> + */
>>   static bool trans_dbar(DisasContext *ctx, arg_dbar * a)
>>   {
>>       tcg_gen_mb(TCG_BAR_SC | TCG_MO_ALL);
> I am not familiar with memory order. I only point out the possible 
> problem purely from code.
>
> Should tcg_gen_mb() be moved afterward, otherwise there may be two 
> times calling with tcg_gen_mb().
Yes, you are right. This tcg_gen_mb() should be removed.

>
>> -    return true;
>> -}
>> +    int hint = a->imm;
>> +    TCGBar bar_flags = 0;
>> +
>> +    /* Reserved/no-op hints: 0xf and 0x1f */
>> +    if (hint == 0xf || hint == 0x1f) {
>> +        return true;
>> +    }
> Ditto, should it move afterward?
Here, for a no-op hint, we just return true.

Thanks.
Song Gao
>
> Regards
> Bibo Mao
>> +
>> +    /* If the CPU does not support fine-grained hints,or for the 
>> special LL/SC
>> +     * loop barrier (0x700), emit a full barrier.
>> +     */
>> +    if (!avail_DBAR_HINT(ctx) || hint == 0x700) {
>> +        tcg_gen_mb(TCG_MO_ALL | TCG_BAR_SC);
>> +        return true;
>> +    }
>> +
>> +    /*
>> +     * Fine-grained hint decoding:
>> +     * Bits 3-0 control which accesses must be ordered.
>> +     *   bit3: barrier previous reads?   (0 = enforce, 1 = relax)
>> +     *   bit2: barrier previous writes?  (0 = enforce, 1 = relax)
>> +     *   bit1: barrier succeeding reads? (0 = enforce, 1 = relax)
>> +     *   bit0: barrier succeeding writes?(0 = enforce, 1 = relax)
>> +     *
>> +     * For each combination, we set the corresponding TCG_MO_* flag 
>> if both
>> +     * sides of the barrier require ordering.
>> +     */
>> +    bool prev_rd = !(hint & 0x08);   /* need barrier for previous 
>> reads */
>> +    bool prev_wr = !(hint & 0x04);   /* need barrier for previous 
>> writes */
>> +    bool succ_rd = !(hint & 0x02);   /* need barrier for succeeding 
>> reads */
>> +    bool succ_wr = !(hint & 0x01);   /* need barrier for succeeding 
>> writes */
>> +
>> +    if (prev_rd && succ_rd) {
>> +        bar_flags |= TCG_MO_LD_LD;
>> +    }
>> +    if (prev_wr && succ_rd) {
>> +        bar_flags |= TCG_MO_ST_LD;
>> +    }
>> +    if (prev_rd && succ_wr) {
>> +        bar_flags |= TCG_MO_LD_ST;
>> +    }
>> +    if (prev_wr && succ_wr) {
>> +        bar_flags |= TCG_MO_ST_ST;
>> +    }
>> +
>> +    /* If no flags were set, this is a no-op barrier */
>> +    if (bar_flags == 0) {
>> +        return true;
>> +    }
>> +
>> +    /*
>> +     * Use acquire/release semantics when possible to generate more 
>> efficient
>> +     * code. Otherwise, fall back to a sequential consistency barrier.
>> +     *
>> +     * Acquire: order loads before loads/stores (LD_LD | LD_ST)
>> +     * Release: order stores before stores/loads (ST_ST | ST_LD)
>> +     */
>> +    if ((bar_flags & (TCG_MO_LD_LD | TCG_MO_LD_ST)) &&
>> +        !(bar_flags & (TCG_MO_ST_ST | TCG_MO_ST_LD))) {
>> +        /* Only acquire flags present */
>> +        tcg_gen_mb(bar_flags | TCG_BAR_LDAQ);
>> +    } else if ((bar_flags & (TCG_MO_ST_ST | TCG_MO_ST_LD)) &&
>> +               !(bar_flags & (TCG_MO_LD_LD | TCG_MO_LD_ST))) {
>> +        /* Only release flags present */
>> +        tcg_gen_mb(bar_flags | TCG_BAR_STRL);
>> +    } else {
>> +        /* Mixed or full barrier */
>> +        tcg_gen_mb(bar_flags | TCG_BAR_SC);
>> +    }
>> +
>> +     return true;
>> + }
>>     static bool trans_ibar(DisasContext *ctx, arg_ibar *a)
>>   {
>> diff --git a/target/loongarch/tcg/translate.c 
>> b/target/loongarch/tcg/translate.c
>> index b9ed13d19c..49280b1dd3 100644
>> --- a/target/loongarch/tcg/translate.c
>> +++ b/target/loongarch/tcg/translate.c
>> @@ -149,6 +149,7 @@ static void 
>> loongarch_tr_init_disas_context(DisasContextBase *dcbase,
>>         ctx->cpucfg1 = env->cpucfg[1];
>>       ctx->cpucfg2 = env->cpucfg[2];
>> +    ctx->cpucfg3 = env->cpucfg[3];
>>   }
>>     static void loongarch_tr_tb_start(DisasContextBase *dcbase, 
>> CPUState *cs)
>> diff --git a/target/loongarch/translate.h b/target/loongarch/translate.h
>> index ba1c89e57b..8aa8325dc6 100644
>> --- a/target/loongarch/translate.h
>> +++ b/target/loongarch/translate.h
>> @@ -43,6 +43,8 @@
>>   #define avail_LLACQ_SCREL(C)    (FIELD_EX32((C)->cpucfg2, CPUCFG2, 
>> LLACQ_SCREL))
>>   #define avail_LLACQ_SCREL_64(C) (avail_64(C) && avail_LLACQ_SCREL(C))
>>   +#define avail_DBAR_HINT(C) (FIELD_EX32((C)->cpucfg3, CPUCFG3, 
>> DBAR_HINTS))
>> +
>>   /*
>>    * If an operation is being performed on less than TARGET_LONG_BITS,
>>    * it may require the inputs to be sign- or zero-extended; which will
>> @@ -66,6 +68,7 @@ typedef struct DisasContext {
>>       bool va32; /* 32-bit virtual address */
>>       uint32_t cpucfg1;
>>       uint32_t cpucfg2;
>> +    uint32_t cpucfg3;
>>   } DisasContext;
>>     void generate_exception(DisasContext *ctx, int excp);
>>