target/loongarch/cpu.c | 4 + .../tcg/insn_trans/trans_memory.c.inc | 82 ++++++++++++++++++- target/loongarch/tcg/translate.c | 1 + target/loongarch/translate.h | 3 + 4 files changed, 88 insertions(+), 2 deletions(-)
LoongArch architecture (since LA664) introduces fine-grained dbar
hints that allow controlling which memory accesses are ordered by
the barrier. Previously, all dbar instructions were treated as a
full barrier (TCG_MO_ALL | TCG_BAR_SC).
This patch adds support for decoding dbar hints and emitting the
appropriate TCG memory barrier flags. For CPUs that do not advertise
the DBAR_HINTS feature (cpucfg3.DBAR_HINTS = 0), all dbar hints
fall back to a full barrier, preserving compatibility.
The hint encoding follows the LoongArch v1.10 specification:
- Bit4: 0 = completion barrier, 1 = ordering barrier
(ignored by TCG as TCG only supports ordering barriers)
- Bit3: barrier for previous reads (0 = enforce, 1 = relax)
- Bit2: barrier for previous writes (0 = enforce, 1 = relax)
- Bit1: barrier for succeeding reads (0 = enforce, 1 = relax)
- Bit0: barrier for succeeding writes (0 = enforce, 1 = relax)
The mapping to TCG memory order flags is as follows:
- TCG_MO_LD_LD is set if both previous and succeeding reads are ordered.
- TCG_MO_ST_LD is set if previous write and succeeding read are ordered.
- TCG_MO_LD_ST is set if previous read and succeeding write are ordered.
- TCG_MO_ST_ST is set if both previous and succeeding writes are ordered.
If the resulting flags describe an acquire or release barrier,
TCG_BAR_LDAQ or TCG_BAR_STRL is used accordingly; otherwise a
full SC barrier (TCG_BAR_SC) is emitted.
Special hint handling:
- hint 0x700: LL/SC loop barrier, treated as a full barrier as recommended.
- hint 0xf and 0x1f: reserved/no-op, treated as no operation
Signed-off-by: Song Gao <gaosong@loongson.cn>
---
target/loongarch/cpu.c | 4 +
.../tcg/insn_trans/trans_memory.c.inc | 82 ++++++++++++++++++-
target/loongarch/tcg/translate.c | 1 +
target/loongarch/translate.h | 3 +
4 files changed, 88 insertions(+), 2 deletions(-)
diff --git a/target/loongarch/cpu.c b/target/loongarch/cpu.c
index e22568c84a..d8d106b07e 100644
--- a/target/loongarch/cpu.c
+++ b/target/loongarch/cpu.c
@@ -455,6 +455,10 @@ static void loongarch_max_initfn(Object *obj)
data = FIELD_DP32(data, CPUCFG2, LLACQ_SCREL, 1);
data = FIELD_DP32(data, CPUCFG2, SCQ, 1);
cpu->env.cpucfg[2] = data;
+
+ data = cpu->env.cpucfg[3];
+ data = FIELD_DP32(data, CPUCFG3, DBAR_HINTS, 1);
+ cpu->env.cpucfg[3] = data;
}
}
diff --git a/target/loongarch/tcg/insn_trans/trans_memory.c.inc b/target/loongarch/tcg/insn_trans/trans_memory.c.inc
index e287d46363..99bc486119 100644
--- a/target/loongarch/tcg/insn_trans/trans_memory.c.inc
+++ b/target/loongarch/tcg/insn_trans/trans_memory.c.inc
@@ -137,11 +137,89 @@ static bool trans_preldx(DisasContext *ctx, arg_preldx * a)
return true;
}
+/*
+ * Decode dbar hint and emit appropriate TCG memory barrier.
+ *
+ * The hint is a 15-bit immediate; bits 4-0 carry the fine-grained hint.
+ * For hint 0x700 (special LL/SC loop barrier), treat as full barrier.
+ *
+ * See LoongArch Reference Manual v1.10, Section 4.2.2 for details.
+ */
static bool trans_dbar(DisasContext *ctx, arg_dbar * a)
{
tcg_gen_mb(TCG_BAR_SC | TCG_MO_ALL);
- return true;
-}
+ int hint = a->imm;
+ TCGBar bar_flags = 0;
+
+ /* Reserved/no-op hints: 0xf and 0x1f */
+ if (hint == 0xf || hint == 0x1f) {
+ return true;
+ }
+
+ /* If the CPU does not support fine-grained hints, or for the special LL/SC
+ * loop barrier (0x700), emit a full barrier.
+ */
+ if (!avail_DBAR_HINT(ctx) || hint == 0x700) {
+ tcg_gen_mb(TCG_MO_ALL | TCG_BAR_SC);
+ return true;
+ }
+
+ /*
+ * Fine-grained hint decoding:
+ * Bits 3-0 control which accesses must be ordered.
+ * bit3: barrier previous reads? (0 = enforce, 1 = relax)
+ * bit2: barrier previous writes? (0 = enforce, 1 = relax)
+ * bit1: barrier succeeding reads? (0 = enforce, 1 = relax)
+ * bit0: barrier succeeding writes?(0 = enforce, 1 = relax)
+ *
+ * For each combination, we set the corresponding TCG_MO_* flag if both
+ * sides of the barrier require ordering.
+ */
+ bool prev_rd = !(hint & 0x08); /* need barrier for previous reads */
+ bool prev_wr = !(hint & 0x04); /* need barrier for previous writes */
+ bool succ_rd = !(hint & 0x02); /* need barrier for succeeding reads */
+ bool succ_wr = !(hint & 0x01); /* need barrier for succeeding writes */
+
+ if (prev_rd && succ_rd) {
+ bar_flags |= TCG_MO_LD_LD;
+ }
+ if (prev_wr && succ_rd) {
+ bar_flags |= TCG_MO_ST_LD;
+ }
+ if (prev_rd && succ_wr) {
+ bar_flags |= TCG_MO_LD_ST;
+ }
+ if (prev_wr && succ_wr) {
+ bar_flags |= TCG_MO_ST_ST;
+ }
+
+ /* If no flags were set, this is a no-op barrier */
+ if (bar_flags == 0) {
+ return true;
+ }
+
+ /*
+ * Use acquire/release semantics when possible to generate more efficient
+ * code. Otherwise, fall back to a sequential consistency barrier.
+ *
+ * Acquire: order loads before loads/stores (LD_LD | LD_ST)
+ * Release: order stores before stores/loads (ST_ST | ST_LD)
+ */
+ if ((bar_flags & (TCG_MO_LD_LD | TCG_MO_LD_ST)) &&
+ !(bar_flags & (TCG_MO_ST_ST | TCG_MO_ST_LD))) {
+ /* Only acquire flags present */
+ tcg_gen_mb(bar_flags | TCG_BAR_LDAQ);
+ } else if ((bar_flags & (TCG_MO_ST_ST | TCG_MO_ST_LD)) &&
+ !(bar_flags & (TCG_MO_LD_LD | TCG_MO_LD_ST))) {
+ /* Only release flags present */
+ tcg_gen_mb(bar_flags | TCG_BAR_STRL);
+ } else {
+ /* Mixed or full barrier */
+ tcg_gen_mb(bar_flags | TCG_BAR_SC);
+ }
+
+ return true;
+ }
static bool trans_ibar(DisasContext *ctx, arg_ibar *a)
{
diff --git a/target/loongarch/tcg/translate.c b/target/loongarch/tcg/translate.c
index b9ed13d19c..49280b1dd3 100644
--- a/target/loongarch/tcg/translate.c
+++ b/target/loongarch/tcg/translate.c
@@ -149,6 +149,7 @@ static void loongarch_tr_init_disas_context(DisasContextBase *dcbase,
ctx->cpucfg1 = env->cpucfg[1];
ctx->cpucfg2 = env->cpucfg[2];
+ ctx->cpucfg3 = env->cpucfg[3];
}
static void loongarch_tr_tb_start(DisasContextBase *dcbase, CPUState *cs)
diff --git a/target/loongarch/translate.h b/target/loongarch/translate.h
index ba1c89e57b..8aa8325dc6 100644
--- a/target/loongarch/translate.h
+++ b/target/loongarch/translate.h
@@ -43,6 +43,8 @@
#define avail_LLACQ_SCREL(C) (FIELD_EX32((C)->cpucfg2, CPUCFG2, LLACQ_SCREL))
#define avail_LLACQ_SCREL_64(C) (avail_64(C) && avail_LLACQ_SCREL(C))
+#define avail_DBAR_HINT(C) (FIELD_EX32((C)->cpucfg3, CPUCFG3, DBAR_HINTS))
+
/*
* If an operation is being performed on less than TARGET_LONG_BITS,
* it may require the inputs to be sign- or zero-extended; which will
@@ -66,6 +68,7 @@ typedef struct DisasContext {
bool va32; /* 32-bit virtual address */
uint32_t cpucfg1;
uint32_t cpucfg2;
+ uint32_t cpucfg3;
} DisasContext;
void generate_exception(DisasContext *ctx, int excp);
--
2.47.3
On 2026/3/27 上午8:32, Song Gao wrote:
> LoongArch architecture (since LA664) introduces fine-grained dbar
> hints that allow controlling which memory accesses are ordered by
> the barrier. Previously, all dbar instructions were treated as a
> full barrier (TCG_MO_ALL | TCG_BAR_SC).
>
> This patch adds support for decoding dbar hints and emitting the
> appropriate TCG memory barrier flags. For CPUs that do not advertise
> the DBAR_HINTS feature (cpucfg3.DBAR_HINTS = 0), all dbar hints
> fall back to a full barrier, preserving compatibility.
>
> The hint encoding follows the LoongArch v1.10 specification:
> - Bit4: 0 = completion barrier, 1 = ordering barrier
> (ignored by TCG as TCG only supports ordering barriers)
> - Bit3: barrier for previous reads (0 = enforce, 1 = relax)
> - Bit2: barrier for previous writes (0 = enforce, 1 = relax)
> - Bit1: barrier for succeeding reads (0 = enforce, 1 = relax)
> - Bit0: barrier for succeeding writes (0 = enforce, 1 = relax)
>
> The mapping to TCG memory order flags is as follows:
> - TCG_MO_LD_LD is set if both previous and succeeding reads are ordered.
> - TCG_MO_ST_LD is set if previous write and succeeding read are ordered.
> - TCG_MO_LD_ST is set if previous read and succeeding write are ordered.
> - TCG_MO_ST_ST is set if both previous and succeeding writes are ordered.
>
> If the resulting flags describe an acquire or release barrier,
> TCG_BAR_LDAQ or TCG_BAR_STRL is used accordingly; otherwise a
> full SC barrier (TCG_BAR_SC) is emitted.
>
> Special hint handling:
> - hint 0x700: LL/SC loop barrier, treated as a full barrier as recommended.
> - hint 0xf and 0x1f: reserved/no-op, treated as no operation
>
> Signed-off-by: Song Gao <gaosong@loongson.cn>
> ---
> target/loongarch/cpu.c | 4 +
> .../tcg/insn_trans/trans_memory.c.inc | 82 ++++++++++++++++++-
> target/loongarch/tcg/translate.c | 1 +
> target/loongarch/translate.h | 3 +
> 4 files changed, 88 insertions(+), 2 deletions(-)
>
> diff --git a/target/loongarch/cpu.c b/target/loongarch/cpu.c
> index e22568c84a..d8d106b07e 100644
> --- a/target/loongarch/cpu.c
> +++ b/target/loongarch/cpu.c
> @@ -455,6 +455,10 @@ static void loongarch_max_initfn(Object *obj)
> data = FIELD_DP32(data, CPUCFG2, LLACQ_SCREL, 1);
> data = FIELD_DP32(data, CPUCFG2, SCQ, 1);
> cpu->env.cpucfg[2] = data;
> +
> + data = cpu->env.cpucfg[3];
> + data = FIELD_DP32(data, CPUCFG3, DBAR_HINTS, 1);
> + cpu->env.cpucfg[3] = data;
> }
> }
>
> diff --git a/target/loongarch/tcg/insn_trans/trans_memory.c.inc b/target/loongarch/tcg/insn_trans/trans_memory.c.inc
> index e287d46363..99bc486119 100644
> --- a/target/loongarch/tcg/insn_trans/trans_memory.c.inc
> +++ b/target/loongarch/tcg/insn_trans/trans_memory.c.inc
> @@ -137,11 +137,89 @@ static bool trans_preldx(DisasContext *ctx, arg_preldx * a)
> return true;
> }
>
> +/*
> + * Decode dbar hint and emit appropriate TCG memory barrier.
> + *
> + * The hint is a 5-bit field (0-31) encoded in the instruction.
> + * For hint 0x700 (special LL/SC loop barrier), treat as full barrier.
> + *
> + * See LoongArch Reference Manual v1.10, Section 4.2.2 for details.
> + */
> static bool trans_dbar(DisasContext *ctx, arg_dbar * a)
> {
> tcg_gen_mb(TCG_BAR_SC | TCG_MO_ALL);
> - return true;
> -}
> + int hint = a->imm;
> + TCGBar bar_flags = 0;
> +
> + /* Reserved/no-op hints: 0xf and 0x1f */
> + if (hint == 0xf || hint == 0x1f) {
> + return true;
> + }
> +
> + /* If the CPU does not support fine-grained hints,or for the special LL/SC
> + * loop barrier (0x700), emit a full barrier.
> + */
> + if (!avail_DBAR_HINT(ctx) || hint == 0x700) {
> + tcg_gen_mb(TCG_MO_ALL | TCG_BAR_SC);
> + return true;
> + }
> +
> + /*
> + * Fine-grained hint decoding:
> + * Bits 3-0 control which accesses must be ordered.
> + * bit3: barrier previous reads? (0 = enforce, 1 = relax)
> + * bit2: barrier previous writes? (0 = enforce, 1 = relax)
> + * bit1: barrier succeeding reads? (0 = enforce, 1 = relax)
> + * bit0: barrier succeeding writes?(0 = enforce, 1 = relax)
> + *
> + * For each combination, we set the corresponding TCG_MO_* flag if both
> + * sides of the barrier require ordering.
> + */
> + bool prev_rd = !(hint & 0x08); /* need barrier for previous reads */
> + bool prev_wr = !(hint & 0x04); /* need barrier for previous writes */
> + bool succ_rd = !(hint & 0x02); /* need barrier for succeeding reads */
> + bool succ_wr = !(hint & 0x01); /* need barrier for succeeding writes */
> +
> + if (prev_rd && succ_rd) {
> + bar_flags |= TCG_MO_LD_LD;
> + }
> + if (prev_wr && succ_rd) {
> + bar_flags |= TCG_MO_ST_LD;
> + }
> + if (prev_rd && succ_wr) {
> + bar_flags |= TCG_MO_LD_ST;
> + }
> + if (prev_wr && succ_wr) {
> + bar_flags |= TCG_MO_ST_ST;
> + }
I am not sure about the memory ordering; however, this mapping seems to
differ from how other architectures translate the following barriers into
QEMU TCG code. I do not know which is right.
__smp_rmb() ---> tcg_gen_mb(TCG_BAR_SC | TCG_MO_LD_LD | TCG_MO_LD_ST)
__smp_wmb() ---> tcg_gen_mb(TCG_BAR_SC | TCG_MO_ST_ST)
__smp_mb() ---> tcg_gen_mb(TCG_BAR_SC | TCG_MO_ALL)
Regards
Bibo Mao
> +
> + /* If no flags were set, this is a no-op barrier */
> + if (bar_flags == 0) {
> + return true;
> + }
> +
> + /*
> + * Use acquire/release semantics when possible to generate more efficient
> + * code. Otherwise, fall back to a sequential consistency barrier.
> + *
> + * Acquire: order loads before loads/stores (LD_LD | LD_ST)
> + * Release: order stores before stores/loads (ST_ST | ST_LD)
> + */
> + if ((bar_flags & (TCG_MO_LD_LD | TCG_MO_LD_ST)) &&
> + !(bar_flags & (TCG_MO_ST_ST | TCG_MO_ST_LD))) {
> + /* Only acquire flags present */
> + tcg_gen_mb(bar_flags | TCG_BAR_LDAQ);
> + } else if ((bar_flags & (TCG_MO_ST_ST | TCG_MO_ST_LD)) &&
> + !(bar_flags & (TCG_MO_LD_LD | TCG_MO_LD_ST))) {
> + /* Only release flags present */
> + tcg_gen_mb(bar_flags | TCG_BAR_STRL);
> + } else {
> + /* Mixed or full barrier */
> + tcg_gen_mb(bar_flags | TCG_BAR_SC);
> + }
> +
> + return true;
> + }
>
> static bool trans_ibar(DisasContext *ctx, arg_ibar *a)
> {
> diff --git a/target/loongarch/tcg/translate.c b/target/loongarch/tcg/translate.c
> index b9ed13d19c..49280b1dd3 100644
> --- a/target/loongarch/tcg/translate.c
> +++ b/target/loongarch/tcg/translate.c
> @@ -149,6 +149,7 @@ static void loongarch_tr_init_disas_context(DisasContextBase *dcbase,
>
> ctx->cpucfg1 = env->cpucfg[1];
> ctx->cpucfg2 = env->cpucfg[2];
> + ctx->cpucfg3 = env->cpucfg[3];
> }
>
> static void loongarch_tr_tb_start(DisasContextBase *dcbase, CPUState *cs)
> diff --git a/target/loongarch/translate.h b/target/loongarch/translate.h
> index ba1c89e57b..8aa8325dc6 100644
> --- a/target/loongarch/translate.h
> +++ b/target/loongarch/translate.h
> @@ -43,6 +43,8 @@
> #define avail_LLACQ_SCREL(C) (FIELD_EX32((C)->cpucfg2, CPUCFG2, LLACQ_SCREL))
> #define avail_LLACQ_SCREL_64(C) (avail_64(C) && avail_LLACQ_SCREL(C))
>
> +#define avail_DBAR_HINT(C) (FIELD_EX32((C)->cpucfg3, CPUCFG3, DBAR_HINTS))
> +
> /*
> * If an operation is being performed on less than TARGET_LONG_BITS,
> * it may require the inputs to be sign- or zero-extended; which will
> @@ -66,6 +68,7 @@ typedef struct DisasContext {
> bool va32; /* 32-bit virtual address */
> uint32_t cpucfg1;
> uint32_t cpucfg2;
> + uint32_t cpucfg3;
> } DisasContext;
>
> void generate_exception(DisasContext *ctx, int excp);
>
在 2026/4/2 上午10:17, Bibo Mao 写道:
>
>
> On 2026/3/27 上午8:32, Song Gao wrote:
>> LoongArch architecture (since LA664) introduces fine-grained dbar
>> hints that allow controlling which memory accesses are ordered by
>> the barrier. Previously, all dbar instructions were treated as a
>> full barrier (TCG_MO_ALL | TCG_BAR_SC).
>>
>> This patch adds support for decoding dbar hints and emitting the
>> appropriate TCG memory barrier flags. For CPUs that do not advertise
>> the DBAR_HINTS feature (cpucfg3.DBAR_HINTS = 0), all dbar hints
>> fall back to a full barrier, preserving compatibility.
>>
>> The hint encoding follows the LoongArch v1.10 specification:
>> - Bit4: 0 = completion barrier, 1 = ordering barrier
>> (ignored by TCG as TCG only supports ordering barriers)
>> - Bit3: barrier for previous reads (0 = enforce, 1 = relax)
>> - Bit2: barrier for previous writes (0 = enforce, 1 = relax)
>> - Bit1: barrier for succeeding reads (0 = enforce, 1 = relax)
>> - Bit0: barrier for succeeding writes (0 = enforce, 1 = relax)
>>
>> The mapping to TCG memory order flags is as follows:
>> - TCG_MO_LD_LD is set if both previous and succeeding reads are ordered.
>> - TCG_MO_ST_LD is set if previous write and succeeding read are ordered.
>> - TCG_MO_LD_ST is set if previous read and succeeding write are ordered.
>> - TCG_MO_ST_ST is set if both previous and succeeding writes are
>> ordered.
>>
>> If the resulting flags describe an acquire or release barrier,
>> TCG_BAR_LDAQ or TCG_BAR_STRL is used accordingly; otherwise a
>> full SC barrier (TCG_BAR_SC) is emitted.
>>
>> Special hint handling:
>> - hint 0x700: LL/SC loop barrier, treated as a full barrier as
>> recommended.
>> - hint 0xf and 0x1f: reserved/no-op, treated as no operation
>>
>> Signed-off-by: Song Gao <gaosong@loongson.cn>
>> ---
>> target/loongarch/cpu.c | 4 +
>> .../tcg/insn_trans/trans_memory.c.inc | 82 ++++++++++++++++++-
>> target/loongarch/tcg/translate.c | 1 +
>> target/loongarch/translate.h | 3 +
>> 4 files changed, 88 insertions(+), 2 deletions(-)
>>
>> diff --git a/target/loongarch/cpu.c b/target/loongarch/cpu.c
>> index e22568c84a..d8d106b07e 100644
>> --- a/target/loongarch/cpu.c
>> +++ b/target/loongarch/cpu.c
>> @@ -455,6 +455,10 @@ static void loongarch_max_initfn(Object *obj)
>> data = FIELD_DP32(data, CPUCFG2, LLACQ_SCREL, 1);
>> data = FIELD_DP32(data, CPUCFG2, SCQ, 1);
>> cpu->env.cpucfg[2] = data;
>> +
>> + data = cpu->env.cpucfg[3];
>> + data = FIELD_DP32(data, CPUCFG3, DBAR_HINTS, 1);
>> + cpu->env.cpucfg[3] = data;
>> }
>> }
>> diff --git a/target/loongarch/tcg/insn_trans/trans_memory.c.inc
>> b/target/loongarch/tcg/insn_trans/trans_memory.c.inc
>> index e287d46363..99bc486119 100644
>> --- a/target/loongarch/tcg/insn_trans/trans_memory.c.inc
>> +++ b/target/loongarch/tcg/insn_trans/trans_memory.c.inc
>> @@ -137,11 +137,89 @@ static bool trans_preldx(DisasContext *ctx,
>> arg_preldx * a)
>> return true;
>> }
>> +/*
>> + * Decode dbar hint and emit appropriate TCG memory barrier.
>> + *
>> + * The hint is a 5-bit field (0-31) encoded in the instruction.
>> + * For hint 0x700 (special LL/SC loop barrier), treat as full barrier.
>> + *
>> + * See LoongArch Reference Manual v1.10, Section 4.2.2 for details.
>> + */
>> static bool trans_dbar(DisasContext *ctx, arg_dbar * a)
>> {
>> tcg_gen_mb(TCG_BAR_SC | TCG_MO_ALL);
>> - return true;
>> -}
>> + int hint = a->imm;
>> + TCGBar bar_flags = 0;
>> +
>> + /* Reserved/no-op hints: 0xf and 0x1f */
>> + if (hint == 0xf || hint == 0x1f) {
>> + return true;
>> + }
>> +
>> + /* If the CPU does not support fine-grained hints,or for the
>> special LL/SC
>> + * loop barrier (0x700), emit a full barrier.
>> + */
>> + if (!avail_DBAR_HINT(ctx) || hint == 0x700) {
>> + tcg_gen_mb(TCG_MO_ALL | TCG_BAR_SC);
>> + return true;
>> + }
>> +
>> + /*
>> + * Fine-grained hint decoding:
>> + * Bits 3-0 control which accesses must be ordered.
>> + * bit3: barrier previous reads? (0 = enforce, 1 = relax)
>> + * bit2: barrier previous writes? (0 = enforce, 1 = relax)
>> + * bit1: barrier succeeding reads? (0 = enforce, 1 = relax)
>> + * bit0: barrier succeeding writes?(0 = enforce, 1 = relax)
>> + *
>> + * For each combination, we set the corresponding TCG_MO_* flag
>> if both
>> + * sides of the barrier require ordering.
>> + */
>> + bool prev_rd = !(hint & 0x08); /* need barrier for previous
>> reads */
>> + bool prev_wr = !(hint & 0x04); /* need barrier for previous
>> writes */
>> + bool succ_rd = !(hint & 0x02); /* need barrier for succeeding
>> reads */
>> + bool succ_wr = !(hint & 0x01); /* need barrier for succeeding
>> writes */
>> +
>> + if (prev_rd && succ_rd) {
>> + bar_flags |= TCG_MO_LD_LD;
>> + }
>> + if (prev_wr && succ_rd) {
>> + bar_flags |= TCG_MO_ST_LD;
>> + }
>> + if (prev_rd && succ_wr) {
>> + bar_flags |= TCG_MO_LD_ST;
>> + }
>> + if (prev_wr && succ_wr) {
>> + bar_flags |= TCG_MO_ST_ST;
>> + }
> I do not know the memory order, however it seems that it is different
> with other architectures with following usage when it is translated to
> QEMU TCG code. I do now which is right.
>
> __smp_rmb() ---> tcg_gen_mb(TCG_BAR_SC | TCG_MO_LD_LD | TCG_MO_LD_ST)
> __smp_wmb() ---> tcg_gen_mb(TCG_BAR_SC | TCG_MO_ST_ST)
> __smp_mb() ---> tcg_gen_mb(TCG_BAR_SC | TCG_MO_ALL)
>
Do you mean that TCG_BAR_SC might be missing here?
It is actually handled later: for non-acquire/release cases we fall back to
tcg_gen_mb(bar_flags | TCG_BAR_SC).
Thanks.
Song Gao
> Regards
> Bibo Mao
>> +
>> + /* If no flags were set, this is a no-op barrier */
>> + if (bar_flags == 0) {
>> + return true;
>> + }
>> +
>> + /*
>> + * Use acquire/release semantics when possible to generate more
>> efficient
>> + * code. Otherwise, fall back to a sequential consistency barrier.
>> + *
>> + * Acquire: order loads before loads/stores (LD_LD | LD_ST)
>> + * Release: order stores before stores/loads (ST_ST | ST_LD)
>> + */
>> + if ((bar_flags & (TCG_MO_LD_LD | TCG_MO_LD_ST)) &&
>> + !(bar_flags & (TCG_MO_ST_ST | TCG_MO_ST_LD))) {
>> + /* Only acquire flags present */
>> + tcg_gen_mb(bar_flags | TCG_BAR_LDAQ);
>> + } else if ((bar_flags & (TCG_MO_ST_ST | TCG_MO_ST_LD)) &&
>> + !(bar_flags & (TCG_MO_LD_LD | TCG_MO_LD_ST))) {
>> + /* Only release flags present */
>> + tcg_gen_mb(bar_flags | TCG_BAR_STRL);
>> + } else {
>> + /* Mixed or full barrier */
>> + tcg_gen_mb(bar_flags | TCG_BAR_SC);
>> + }
>> +
>> + return true;
>> + }
>> static bool trans_ibar(DisasContext *ctx, arg_ibar *a)
>> {
>> diff --git a/target/loongarch/tcg/translate.c
>> b/target/loongarch/tcg/translate.c
>> index b9ed13d19c..49280b1dd3 100644
>> --- a/target/loongarch/tcg/translate.c
>> +++ b/target/loongarch/tcg/translate.c
>> @@ -149,6 +149,7 @@ static void
>> loongarch_tr_init_disas_context(DisasContextBase *dcbase,
>> ctx->cpucfg1 = env->cpucfg[1];
>> ctx->cpucfg2 = env->cpucfg[2];
>> + ctx->cpucfg3 = env->cpucfg[3];
>> }
>> static void loongarch_tr_tb_start(DisasContextBase *dcbase,
>> CPUState *cs)
>> diff --git a/target/loongarch/translate.h b/target/loongarch/translate.h
>> index ba1c89e57b..8aa8325dc6 100644
>> --- a/target/loongarch/translate.h
>> +++ b/target/loongarch/translate.h
>> @@ -43,6 +43,8 @@
>> #define avail_LLACQ_SCREL(C) (FIELD_EX32((C)->cpucfg2, CPUCFG2,
>> LLACQ_SCREL))
>> #define avail_LLACQ_SCREL_64(C) (avail_64(C) && avail_LLACQ_SCREL(C))
>> +#define avail_DBAR_HINT(C) (FIELD_EX32((C)->cpucfg3, CPUCFG3,
>> DBAR_HINTS))
>> +
>> /*
>> * If an operation is being performed on less than TARGET_LONG_BITS,
>> * it may require the inputs to be sign- or zero-extended; which will
>> @@ -66,6 +68,7 @@ typedef struct DisasContext {
>> bool va32; /* 32-bit virtual address */
>> uint32_t cpucfg1;
>> uint32_t cpucfg2;
>> + uint32_t cpucfg3;
>> } DisasContext;
>> void generate_exception(DisasContext *ctx, int excp);
>>
On 2026/3/27 上午8:32, Song Gao wrote:
> LoongArch architecture (since LA664) introduces fine-grained dbar
> hints that allow controlling which memory accesses are ordered by
> the barrier. Previously, all dbar instructions were treated as a
> full barrier (TCG_MO_ALL | TCG_BAR_SC).
>
> This patch adds support for decoding dbar hints and emitting the
> appropriate TCG memory barrier flags. For CPUs that do not advertise
> the DBAR_HINTS feature (cpucfg3.DBAR_HINTS = 0), all dbar hints
> fall back to a full barrier, preserving compatibility.
>
> The hint encoding follows the LoongArch v1.10 specification:
> - Bit4: 0 = completion barrier, 1 = ordering barrier
> (ignored by TCG as TCG only supports ordering barriers)
> - Bit3: barrier for previous reads (0 = enforce, 1 = relax)
> - Bit2: barrier for previous writes (0 = enforce, 1 = relax)
> - Bit1: barrier for succeeding reads (0 = enforce, 1 = relax)
> - Bit0: barrier for succeeding writes (0 = enforce, 1 = relax)
>
> The mapping to TCG memory order flags is as follows:
> - TCG_MO_LD_LD is set if both previous and succeeding reads are ordered.
> - TCG_MO_ST_LD is set if previous write and succeeding read are ordered.
> - TCG_MO_LD_ST is set if previous read and succeeding write are ordered.
> - TCG_MO_ST_ST is set if both previous and succeeding writes are ordered.
>
> If the resulting flags describe an acquire or release barrier,
> TCG_BAR_LDAQ or TCG_BAR_STRL is used accordingly; otherwise a
> full SC barrier (TCG_BAR_SC) is emitted.
>
> Special hint handling:
> - hint 0x700: LL/SC loop barrier, treated as a full barrier as recommended.
> - hint 0xf and 0x1f: reserved/no-op, treated as no operation
>
> Signed-off-by: Song Gao <gaosong@loongson.cn>
> ---
> target/loongarch/cpu.c | 4 +
> .../tcg/insn_trans/trans_memory.c.inc | 82 ++++++++++++++++++-
> target/loongarch/tcg/translate.c | 1 +
> target/loongarch/translate.h | 3 +
> 4 files changed, 88 insertions(+), 2 deletions(-)
>
> diff --git a/target/loongarch/cpu.c b/target/loongarch/cpu.c
> index e22568c84a..d8d106b07e 100644
> --- a/target/loongarch/cpu.c
> +++ b/target/loongarch/cpu.c
> @@ -455,6 +455,10 @@ static void loongarch_max_initfn(Object *obj)
> data = FIELD_DP32(data, CPUCFG2, LLACQ_SCREL, 1);
> data = FIELD_DP32(data, CPUCFG2, SCQ, 1);
> cpu->env.cpucfg[2] = data;
> +
> + data = cpu->env.cpucfg[3];
> + data = FIELD_DP32(data, CPUCFG3, DBAR_HINTS, 1);
> + cpu->env.cpucfg[3] = data;
> }
> }
>
> diff --git a/target/loongarch/tcg/insn_trans/trans_memory.c.inc b/target/loongarch/tcg/insn_trans/trans_memory.c.inc
> index e287d46363..99bc486119 100644
> --- a/target/loongarch/tcg/insn_trans/trans_memory.c.inc
> +++ b/target/loongarch/tcg/insn_trans/trans_memory.c.inc
> @@ -137,11 +137,89 @@ static bool trans_preldx(DisasContext *ctx, arg_preldx * a)
> return true;
> }
>
> +/*
> + * Decode dbar hint and emit appropriate TCG memory barrier.
> + *
> + * The hint is a 5-bit field (0-31) encoded in the instruction.
> + * For hint 0x700 (special LL/SC loop barrier), treat as full barrier.
> + *
> + * See LoongArch Reference Manual v1.10, Section 4.2.2 for details.
> + */
> static bool trans_dbar(DisasContext *ctx, arg_dbar * a)
> {
> tcg_gen_mb(TCG_BAR_SC | TCG_MO_ALL);
I am not familiar with memory ordering; I am only pointing out a possible
problem purely from reading the code.
Should this tcg_gen_mb() be moved further down? Otherwise tcg_gen_mb() may
be called twice.
> - return true;
> -}
> + int hint = a->imm;
> + TCGBar bar_flags = 0;
> +
> + /* Reserved/no-op hints: 0xf and 0x1f */
> + if (hint == 0xf || hint == 0x1f) {
> + return true;
> + }
Ditto: should this be moved further down as well?
Regards
Bibo Mao
> +
> + /* If the CPU does not support fine-grained hints,or for the special LL/SC
> + * loop barrier (0x700), emit a full barrier.
> + */
> + if (!avail_DBAR_HINT(ctx) || hint == 0x700) {
> + tcg_gen_mb(TCG_MO_ALL | TCG_BAR_SC);
> + return true;
> + }
> +
> + /*
> + * Fine-grained hint decoding:
> + * Bits 3-0 control which accesses must be ordered.
> + * bit3: barrier previous reads? (0 = enforce, 1 = relax)
> + * bit2: barrier previous writes? (0 = enforce, 1 = relax)
> + * bit1: barrier succeeding reads? (0 = enforce, 1 = relax)
> + * bit0: barrier succeeding writes?(0 = enforce, 1 = relax)
> + *
> + * For each combination, we set the corresponding TCG_MO_* flag if both
> + * sides of the barrier require ordering.
> + */
> + bool prev_rd = !(hint & 0x08); /* need barrier for previous reads */
> + bool prev_wr = !(hint & 0x04); /* need barrier for previous writes */
> + bool succ_rd = !(hint & 0x02); /* need barrier for succeeding reads */
> + bool succ_wr = !(hint & 0x01); /* need barrier for succeeding writes */
> +
> + if (prev_rd && succ_rd) {
> + bar_flags |= TCG_MO_LD_LD;
> + }
> + if (prev_wr && succ_rd) {
> + bar_flags |= TCG_MO_ST_LD;
> + }
> + if (prev_rd && succ_wr) {
> + bar_flags |= TCG_MO_LD_ST;
> + }
> + if (prev_wr && succ_wr) {
> + bar_flags |= TCG_MO_ST_ST;
> + }
> +
> + /* If no flags were set, this is a no-op barrier */
> + if (bar_flags == 0) {
> + return true;
> + }
> +
> + /*
> + * Use acquire/release semantics when possible to generate more efficient
> + * code. Otherwise, fall back to a sequential consistency barrier.
> + *
> + * Acquire: order loads before loads/stores (LD_LD | LD_ST)
> + * Release: order stores before stores/loads (ST_ST | ST_LD)
> + */
> + if ((bar_flags & (TCG_MO_LD_LD | TCG_MO_LD_ST)) &&
> + !(bar_flags & (TCG_MO_ST_ST | TCG_MO_ST_LD))) {
> + /* Only acquire flags present */
> + tcg_gen_mb(bar_flags | TCG_BAR_LDAQ);
> + } else if ((bar_flags & (TCG_MO_ST_ST | TCG_MO_ST_LD)) &&
> + !(bar_flags & (TCG_MO_LD_LD | TCG_MO_LD_ST))) {
> + /* Only release flags present */
> + tcg_gen_mb(bar_flags | TCG_BAR_STRL);
> + } else {
> + /* Mixed or full barrier */
> + tcg_gen_mb(bar_flags | TCG_BAR_SC);
> + }
> +
> + return true;
> + }
>
> static bool trans_ibar(DisasContext *ctx, arg_ibar *a)
> {
> diff --git a/target/loongarch/tcg/translate.c b/target/loongarch/tcg/translate.c
> index b9ed13d19c..49280b1dd3 100644
> --- a/target/loongarch/tcg/translate.c
> +++ b/target/loongarch/tcg/translate.c
> @@ -149,6 +149,7 @@ static void loongarch_tr_init_disas_context(DisasContextBase *dcbase,
>
> ctx->cpucfg1 = env->cpucfg[1];
> ctx->cpucfg2 = env->cpucfg[2];
> + ctx->cpucfg3 = env->cpucfg[3];
> }
>
> static void loongarch_tr_tb_start(DisasContextBase *dcbase, CPUState *cs)
> diff --git a/target/loongarch/translate.h b/target/loongarch/translate.h
> index ba1c89e57b..8aa8325dc6 100644
> --- a/target/loongarch/translate.h
> +++ b/target/loongarch/translate.h
> @@ -43,6 +43,8 @@
> #define avail_LLACQ_SCREL(C) (FIELD_EX32((C)->cpucfg2, CPUCFG2, LLACQ_SCREL))
> #define avail_LLACQ_SCREL_64(C) (avail_64(C) && avail_LLACQ_SCREL(C))
>
> +#define avail_DBAR_HINT(C) (FIELD_EX32((C)->cpucfg3, CPUCFG3, DBAR_HINTS))
> +
> /*
> * If an operation is being performed on less than TARGET_LONG_BITS,
> * it may require the inputs to be sign- or zero-extended; which will
> @@ -66,6 +68,7 @@ typedef struct DisasContext {
> bool va32; /* 32-bit virtual address */
> uint32_t cpucfg1;
> uint32_t cpucfg2;
> + uint32_t cpucfg3;
> } DisasContext;
>
> void generate_exception(DisasContext *ctx, int excp);
>
在 2026/3/30 下午3:57, Bibo Mao 写道:
>
>
> On 2026/3/27 上午8:32, Song Gao wrote:
>> LoongArch architecture (since LA664) introduces fine-grained dbar
>> hints that allow controlling which memory accesses are ordered by
>> the barrier. Previously, all dbar instructions were treated as a
>> full barrier (TCG_MO_ALL | TCG_BAR_SC).
>>
>> This patch adds support for decoding dbar hints and emitting the
>> appropriate TCG memory barrier flags. For CPUs that do not advertise
>> the DBAR_HINTS feature (cpucfg3.DBAR_HINTS = 0), all dbar hints
>> fall back to a full barrier, preserving compatibility.
>>
>> The hint encoding follows the LoongArch v1.10 specification:
>> - Bit4: 0 = completion barrier, 1 = ordering barrier
>> (ignored by TCG as TCG only supports ordering barriers)
>> - Bit3: barrier for previous reads (0 = enforce, 1 = relax)
>> - Bit2: barrier for previous writes (0 = enforce, 1 = relax)
>> - Bit1: barrier for succeeding reads (0 = enforce, 1 = relax)
>> - Bit0: barrier for succeeding writes (0 = enforce, 1 = relax)
>>
>> The mapping to TCG memory order flags is as follows:
>> - TCG_MO_LD_LD is set if both previous and succeeding reads are ordered.
>> - TCG_MO_ST_LD is set if previous write and succeeding read are ordered.
>> - TCG_MO_LD_ST is set if previous read and succeeding write are ordered.
>> - TCG_MO_ST_ST is set if both previous and succeeding writes are
>> ordered.
>>
>> If the resulting flags describe an acquire or release barrier,
>> TCG_BAR_LDAQ or TCG_BAR_STRL is used accordingly; otherwise a
>> full SC barrier (TCG_BAR_SC) is emitted.
>>
>> Special hint handling:
>> - hint 0x700: LL/SC loop barrier, treated as a full barrier as
>> recommended.
>> - hint 0xf and 0x1f: reserved/no-op, treated as no operation
>>
>> Signed-off-by: Song Gao <gaosong@loongson.cn>
>> ---
>> target/loongarch/cpu.c | 4 +
>> .../tcg/insn_trans/trans_memory.c.inc | 82 ++++++++++++++++++-
>> target/loongarch/tcg/translate.c | 1 +
>> target/loongarch/translate.h | 3 +
>> 4 files changed, 88 insertions(+), 2 deletions(-)
>>
>> diff --git a/target/loongarch/cpu.c b/target/loongarch/cpu.c
>> index e22568c84a..d8d106b07e 100644
>> --- a/target/loongarch/cpu.c
>> +++ b/target/loongarch/cpu.c
>> @@ -455,6 +455,10 @@ static void loongarch_max_initfn(Object *obj)
>> data = FIELD_DP32(data, CPUCFG2, LLACQ_SCREL, 1);
>> data = FIELD_DP32(data, CPUCFG2, SCQ, 1);
>> cpu->env.cpucfg[2] = data;
>> +
>> + data = cpu->env.cpucfg[3];
>> + data = FIELD_DP32(data, CPUCFG3, DBAR_HINTS, 1);
>> + cpu->env.cpucfg[3] = data;
>> }
>> }
>> diff --git a/target/loongarch/tcg/insn_trans/trans_memory.c.inc
>> b/target/loongarch/tcg/insn_trans/trans_memory.c.inc
>> index e287d46363..99bc486119 100644
>> --- a/target/loongarch/tcg/insn_trans/trans_memory.c.inc
>> +++ b/target/loongarch/tcg/insn_trans/trans_memory.c.inc
>> @@ -137,11 +137,89 @@ static bool trans_preldx(DisasContext *ctx,
>> arg_preldx * a)
>> return true;
>> }
>> +/*
>> + * Decode dbar hint and emit appropriate TCG memory barrier.
>> + *
>> + * The fine-grained hint occupies the low 5 bits (0-31) of the immediate.
>> + * For hint 0x700 (special LL/SC loop barrier), treat as full barrier.
>> + *
>> + * See LoongArch Reference Manual v1.10, Section 4.2.2 for details.
>> + */
>> static bool trans_dbar(DisasContext *ctx, arg_dbar * a)
>> {
>> tcg_gen_mb(TCG_BAR_SC | TCG_MO_ALL);
> I am not familiar with memory ordering; I am only pointing out a
> possible problem purely from reading the code.
>
> Should tcg_gen_mb() be moved afterward? Otherwise tcg_gen_mb() may be
> called twice.
Yes, you are right. This tcg_gen_mb() should be removed.
>
>> - return true;
>> -}
>> + int hint = a->imm;
>> + TCGBar bar_flags = 0;
>> +
>> + /* Reserved/no-op hints: 0xf and 0x1f */
>> + if (hint == 0xf || hint == 0x1f) {
>> + return true;
>> + }
> Ditto, should it move afterward?
Here, for a no-op hint, we just return true.
Thanks.
Song Gao
>
> Regards
> Bibo Mao
>> +
>> + /* If the CPU does not support fine-grained hints, or for the
>> special LL/SC
>> + * loop barrier (0x700), emit a full barrier.
>> + */
>> + if (!avail_DBAR_HINT(ctx) || hint == 0x700) {
>> + tcg_gen_mb(TCG_MO_ALL | TCG_BAR_SC);
>> + return true;
>> + }
>> +
>> + /*
>> + * Fine-grained hint decoding:
>> + * Bits 3-0 control which accesses must be ordered.
>> + * bit3: barrier previous reads? (0 = enforce, 1 = relax)
>> + * bit2: barrier previous writes? (0 = enforce, 1 = relax)
>> + * bit1: barrier succeeding reads? (0 = enforce, 1 = relax)
>> + * bit0: barrier succeeding writes?(0 = enforce, 1 = relax)
>> + *
>> + * For each combination, we set the corresponding TCG_MO_* flag
>> if both
>> + * sides of the barrier require ordering.
>> + */
>> + bool prev_rd = !(hint & 0x08); /* need barrier for previous
>> reads */
>> + bool prev_wr = !(hint & 0x04); /* need barrier for previous
>> writes */
>> + bool succ_rd = !(hint & 0x02); /* need barrier for succeeding
>> reads */
>> + bool succ_wr = !(hint & 0x01); /* need barrier for succeeding
>> writes */
>> +
>> + if (prev_rd && succ_rd) {
>> + bar_flags |= TCG_MO_LD_LD;
>> + }
>> + if (prev_wr && succ_rd) {
>> + bar_flags |= TCG_MO_ST_LD;
>> + }
>> + if (prev_rd && succ_wr) {
>> + bar_flags |= TCG_MO_LD_ST;
>> + }
>> + if (prev_wr && succ_wr) {
>> + bar_flags |= TCG_MO_ST_ST;
>> + }
>> +
>> + /* If no flags were set, this is a no-op barrier */
>> + if (bar_flags == 0) {
>> + return true;
>> + }
>> +
>> + /*
>> + * Use acquire/release semantics when possible to generate more
>> efficient
>> + * code. Otherwise, fall back to a sequential consistency barrier.
>> + *
>> + * Acquire: order loads before loads/stores (LD_LD | LD_ST)
>> + * Release: order stores before stores/loads (ST_ST | ST_LD)
>> + */
>> + if ((bar_flags & (TCG_MO_LD_LD | TCG_MO_LD_ST)) &&
>> + !(bar_flags & (TCG_MO_ST_ST | TCG_MO_ST_LD))) {
>> + /* Only acquire flags present */
>> + tcg_gen_mb(bar_flags | TCG_BAR_LDAQ);
>> + } else if ((bar_flags & (TCG_MO_ST_ST | TCG_MO_ST_LD)) &&
>> + !(bar_flags & (TCG_MO_LD_LD | TCG_MO_LD_ST))) {
>> + /* Only release flags present */
>> + tcg_gen_mb(bar_flags | TCG_BAR_STRL);
>> + } else {
>> + /* Mixed or full barrier */
>> + tcg_gen_mb(bar_flags | TCG_BAR_SC);
>> + }
>> +
>> + return true;
>> + }
>> static bool trans_ibar(DisasContext *ctx, arg_ibar *a)
>> {
>> diff --git a/target/loongarch/tcg/translate.c
>> b/target/loongarch/tcg/translate.c
>> index b9ed13d19c..49280b1dd3 100644
>> --- a/target/loongarch/tcg/translate.c
>> +++ b/target/loongarch/tcg/translate.c
>> @@ -149,6 +149,7 @@ static void
>> loongarch_tr_init_disas_context(DisasContextBase *dcbase,
>> ctx->cpucfg1 = env->cpucfg[1];
>> ctx->cpucfg2 = env->cpucfg[2];
>> + ctx->cpucfg3 = env->cpucfg[3];
>> }
>> static void loongarch_tr_tb_start(DisasContextBase *dcbase,
>> CPUState *cs)
>> diff --git a/target/loongarch/translate.h b/target/loongarch/translate.h
>> index ba1c89e57b..8aa8325dc6 100644
>> --- a/target/loongarch/translate.h
>> +++ b/target/loongarch/translate.h
>> @@ -43,6 +43,8 @@
>> #define avail_LLACQ_SCREL(C) (FIELD_EX32((C)->cpucfg2, CPUCFG2,
>> LLACQ_SCREL))
>> #define avail_LLACQ_SCREL_64(C) (avail_64(C) && avail_LLACQ_SCREL(C))
>> +#define avail_DBAR_HINT(C) (FIELD_EX32((C)->cpucfg3, CPUCFG3,
>> DBAR_HINTS))
>> +
>> /*
>> * If an operation is being performed on less than TARGET_LONG_BITS,
>> * it may require the inputs to be sign- or zero-extended; which will
>> @@ -66,6 +68,7 @@ typedef struct DisasContext {
>> bool va32; /* 32-bit virtual address */
>> uint32_t cpucfg1;
>> uint32_t cpucfg2;
>> + uint32_t cpucfg3;
>> } DisasContext;
>> void generate_exception(DisasContext *ctx, int excp);
>>
© 2016 - 2026 Red Hat, Inc.