Section 1.4 of the Power ISA v3.0B states that both of these
instructions (lq and lqarx) are single-copy atomic. As we cannot (yet)
issue 128-bit loads within TCG, use the generic atomic helpers provided.
Since TCG cannot (yet) return a 128-bit value, add a slot within
CPUPPCState for returning the high half of a 128-bit return value.
This solution is preferred to the helper assigning to architectural
registers directly, as it avoids clobbering all TCG live values.
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
target/ppc/cpu.h | 3 ++
target/ppc/helper.h | 5 +++
target/ppc/mem_helper.c | 20 ++++++++-
target/ppc/translate.c | 93 ++++++++++++++++++++++++++++++-----------
4 files changed, 95 insertions(+), 26 deletions(-)
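
For reference, the return convention used by the new helpers looks like
this from the caller's side (an illustrative sketch, not part of the
patch itself):

    /* Sketch only: a 128-bit load result comes back in two 64-bit
     * pieces -- the helper returns the low half as its i64 return
     * value and stashes the high half in env->retxh, which the
     * translator then reloads with tcg_gen_ld_i64(). */
    uint64_t lo = helper_lq_le_parallel(env, addr, opidx);
    uint64_t hi = env->retxh;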
diff --git a/target/ppc/cpu.h b/target/ppc/cpu.h
index c7f3fb6b73..973cf44cda 100644
--- a/target/ppc/cpu.h
+++ b/target/ppc/cpu.h
@@ -1015,6 +1015,9 @@ struct CPUPPCState {
/* Next instruction pointer */
target_ulong nip;
+ /* High part of 128-bit helper return. */
+ uint64_t retxh;
+
int access_type; /* when a memory exception occurs, the access
type is stored here */
diff --git a/target/ppc/helper.h b/target/ppc/helper.h
index d751f0e219..3f451a5d7e 100644
--- a/target/ppc/helper.h
+++ b/target/ppc/helper.h
@@ -799,3 +799,8 @@ DEF_HELPER_4(dscliq, void, env, fprp, fprp, i32)
DEF_HELPER_1(tbegin, void, env)
DEF_HELPER_FLAGS_1(fixup_thrm, TCG_CALL_NO_RWG, void, env)
+
+#if defined(TARGET_PPC64) && defined(CONFIG_ATOMIC128)
+DEF_HELPER_FLAGS_3(lq_le_parallel, TCG_CALL_NO_WG, i64, env, tl, i32)
+DEF_HELPER_FLAGS_3(lq_be_parallel, TCG_CALL_NO_WG, i64, env, tl, i32)
+#endif
diff --git a/target/ppc/mem_helper.c b/target/ppc/mem_helper.c
index a34e604db3..44a8f3445a 100644
--- a/target/ppc/mem_helper.c
+++ b/target/ppc/mem_helper.c
@@ -21,9 +21,9 @@
#include "exec/exec-all.h"
#include "qemu/host-utils.h"
#include "exec/helper-proto.h"
-
#include "helper_regs.h"
#include "exec/cpu_ldst.h"
+#include "tcg.h"
#include "internal.h"
//#define DEBUG_OP
@@ -215,6 +215,24 @@ target_ulong helper_lscbx(CPUPPCState *env, target_ulong addr, uint32_t reg,
return i;
}
+#if defined(TARGET_PPC64) && defined(CONFIG_ATOMIC128)
+uint64_t helper_lq_le_parallel(CPUPPCState *env, target_ulong addr,
+ uint32_t opidx)
+{
+ Int128 ret = helper_atomic_ldo_le_mmu(env, addr, opidx, GETPC());
+ env->retxh = int128_gethi(ret);
+ return int128_getlo(ret);
+}
+
+uint64_t helper_lq_be_parallel(CPUPPCState *env, target_ulong addr,
+ uint32_t opidx)
+{
+ Int128 ret = helper_atomic_ldo_be_mmu(env, addr, opidx, GETPC());
+ env->retxh = int128_gethi(ret);
+ return int128_getlo(ret);
+}
+#endif
+
/*****************************************************************************/
/* Altivec extension helpers */
#if defined(HOST_WORDS_BIGENDIAN)
diff --git a/target/ppc/translate.c b/target/ppc/translate.c
index 3a215a1dc6..0923cc24e3 100644
--- a/target/ppc/translate.c
+++ b/target/ppc/translate.c
@@ -2607,7 +2607,7 @@ static void gen_ld(DisasContext *ctx)
static void gen_lq(DisasContext *ctx)
{
int ra, rd;
- TCGv EA;
+ TCGv EA, hi, lo;
/* lq is a legal user mode instruction starting in ISA 2.07 */
bool legal_in_user_mode = (ctx->insns_flags2 & PPC2_LSQ_ISA207) != 0;
@@ -2633,16 +2633,35 @@ static void gen_lq(DisasContext *ctx)
EA = tcg_temp_new();
gen_addr_imm_index(ctx, EA, 0x0F);
- /* We only need to swap high and low halves. gen_qemu_ld64_i64 does
- necessary 64-bit byteswap already. */
- if (unlikely(ctx->le_mode)) {
- gen_qemu_ld64_i64(ctx, cpu_gpr[rd + 1], EA);
+ /* Note that the low part is always in RD+1, even in LE mode. */
+ lo = cpu_gpr[rd + 1];
+ hi = cpu_gpr[rd];
+
+ if (tb_cflags(ctx->base.tb) & CF_PARALLEL) {
+#ifdef CONFIG_ATOMIC128
+ TCGv_i32 oi = tcg_temp_new_i32();
+ if (ctx->le_mode) {
+ tcg_gen_movi_i32(oi, make_memop_idx(MO_LEQ, ctx->mem_idx));
+ gen_helper_lq_le_parallel(lo, cpu_env, EA, oi);
+ } else {
+ tcg_gen_movi_i32(oi, make_memop_idx(MO_BEQ, ctx->mem_idx));
+ gen_helper_lq_be_parallel(lo, cpu_env, EA, oi);
+ }
+ tcg_temp_free_i32(oi);
+ tcg_gen_ld_i64(hi, cpu_env, offsetof(CPUPPCState, retxh));
+#else
+ /* Restart with exclusive lock. */
+ gen_helper_exit_atomic(cpu_env);
+ ctx->base.is_jmp = DISAS_NORETURN;
+#endif
+ } else if (ctx->le_mode) {
+ tcg_gen_qemu_ld_i64(lo, EA, ctx->mem_idx, MO_LEQ);
gen_addr_add(ctx, EA, EA, 8);
- gen_qemu_ld64_i64(ctx, cpu_gpr[rd], EA);
+ tcg_gen_qemu_ld_i64(hi, EA, ctx->mem_idx, MO_LEQ);
} else {
- gen_qemu_ld64_i64(ctx, cpu_gpr[rd], EA);
+ tcg_gen_qemu_ld_i64(hi, EA, ctx->mem_idx, MO_BEQ);
gen_addr_add(ctx, EA, EA, 8);
- gen_qemu_ld64_i64(ctx, cpu_gpr[rd + 1], EA);
+ tcg_gen_qemu_ld_i64(lo, EA, ctx->mem_idx, MO_BEQ);
}
tcg_temp_free(EA);
}
@@ -3236,9 +3255,8 @@ STCX(stdcx_, DEF_MEMOP(MO_Q))
/* lqarx */
static void gen_lqarx(DisasContext *ctx)
{
- TCGv EA;
int rd = rD(ctx->opcode);
- TCGv gpr1, gpr2;
+ TCGv EA, hi, lo;
if (unlikely((rd & 1) || (rd == rA(ctx->opcode)) ||
(rd == rB(ctx->opcode)))) {
@@ -3247,24 +3265,49 @@ static void gen_lqarx(DisasContext *ctx)
}
gen_set_access_type(ctx, ACCESS_RES);
- EA = tcg_temp_local_new();
+ EA = tcg_temp_new();
gen_addr_reg_index(ctx, EA);
- gen_check_align(ctx, EA, 15);
- if (unlikely(ctx->le_mode)) {
- gpr1 = cpu_gpr[rd+1];
- gpr2 = cpu_gpr[rd];
- } else {
- gpr1 = cpu_gpr[rd];
- gpr2 = cpu_gpr[rd+1];
- }
- tcg_gen_qemu_ld_i64(gpr1, EA, ctx->mem_idx, DEF_MEMOP(MO_Q));
- tcg_gen_mov_tl(cpu_reserve, EA);
- gen_addr_add(ctx, EA, EA, 8);
- tcg_gen_qemu_ld_i64(gpr2, EA, ctx->mem_idx, DEF_MEMOP(MO_Q));
- tcg_gen_st_tl(gpr1, cpu_env, offsetof(CPUPPCState, reserve_val));
- tcg_gen_st_tl(gpr2, cpu_env, offsetof(CPUPPCState, reserve_val2));
+ /* Note that the low part is always in RD+1, even in LE mode. */
+ lo = cpu_gpr[rd + 1];
+ hi = cpu_gpr[rd];
+
+ if (tb_cflags(ctx->base.tb) & CF_PARALLEL) {
+#ifdef CONFIG_ATOMIC128
+ TCGv_i32 oi = tcg_temp_new_i32();
+ if (ctx->le_mode) {
+ tcg_gen_movi_i32(oi, make_memop_idx(MO_LEQ | MO_ALIGN_16,
+ ctx->mem_idx));
+ gen_helper_lq_le_parallel(lo, cpu_env, EA, oi);
+ } else {
+ tcg_gen_movi_i32(oi, make_memop_idx(MO_BEQ | MO_ALIGN_16,
+ ctx->mem_idx));
+ gen_helper_lq_be_parallel(lo, cpu_env, EA, oi);
+ }
+ tcg_temp_free_i32(oi);
+ tcg_gen_ld_i64(hi, cpu_env, offsetof(CPUPPCState, retxh));
+#else
+ /* Restart with exclusive lock. */
+ gen_helper_exit_atomic(cpu_env);
+ ctx->base.is_jmp = DISAS_NORETURN;
+ tcg_temp_free(EA);
+ return;
+#endif
+ } else if (ctx->le_mode) {
+ tcg_gen_qemu_ld_i64(lo, EA, ctx->mem_idx, MO_LEQ | MO_ALIGN_16);
+ tcg_gen_mov_tl(cpu_reserve, EA);
+ gen_addr_add(ctx, EA, EA, 8);
+ tcg_gen_qemu_ld_i64(hi, EA, ctx->mem_idx, MO_LEQ);
+ } else {
+ tcg_gen_qemu_ld_i64(hi, EA, ctx->mem_idx, MO_BEQ | MO_ALIGN_16);
+ tcg_gen_mov_tl(cpu_reserve, EA);
+ gen_addr_add(ctx, EA, EA, 8);
+ tcg_gen_qemu_ld_i64(lo, EA, ctx->mem_idx, MO_BEQ);
+ }
tcg_temp_free(EA);
+
+ tcg_gen_st_tl(hi, cpu_env, offsetof(CPUPPCState, reserve_val));
+ tcg_gen_st_tl(lo, cpu_env, offsetof(CPUPPCState, reserve_val2));
}
/* stqcx. */
--
2.17.1
On Tue, Jun 26, 2018 at 09:19:10AM -0700, Richard Henderson wrote:
> Section 1.4 of the Power ISA v3.0B states that both of these
> instructions are single-copy atomic. As we cannot (yet) issue
> 128-bit loads within TCG, use the generic helpers provided.
>
> Since TCG cannot (yet) return a 128-bit value, add a slot within
> CPUPPCState for returning the high half of a 128-bit return value.
> This solution is preferred to the helper assigning to architectural
> registers directly, as it avoids clobbering all TCG live values.
>
> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
> [...]
> @@ -1015,6 +1015,9 @@ struct CPUPPCState {
> /* Next instruction pointer */
> target_ulong nip;
>
> + /* High part of 128-bit helper return. */
> + uint64_t retxh;
> +
Adding a temporary here is kind of gross. I guess the helper
interface doesn't allow for 128-bit returns, but couldn't you pass a
register number into the helper and have it update the right GPR
without going through a temp?
> int access_type; /* when a memory exception occurs, the access
> type is stored here */
> [... remainder of patch snipped ...]
--
David Gibson | I'll have my music baroque, and my code
david AT gibson.dropbear.id.au | minimalist, thank you. NOT _the_ _other_
| _way_ _around_!
http://www.ozlabs.org/~dgibson
On 06/27/2018 08:49 PM, David Gibson wrote:
>> +    /* High part of 128-bit helper return. */
>> +    uint64_t retxh;
>> +
>
> Adding a temporary here is kind of gross.  I guess the helper
> interface doesn't allow for 128-bit returns, but couldn't you pass a
> register number into the helper and have it update the right GPR
> without going through a temp?

I could pass a pointer, but that would cause ...

>> +#if defined(TARGET_PPC64) && defined(CONFIG_ATOMIC128)
>> +DEF_HELPER_FLAGS_3(lq_le_parallel, TCG_CALL_NO_WG, i64, env, tl, i32)
>> +DEF_HELPER_FLAGS_3(lq_be_parallel, TCG_CALL_NO_WG, i64, env, tl, i32)

... the helper definitions to lose TCG_CALL_NO_WG, because they *would*
write to a global register.  Which would cause TCG to discard all of the
global guest registers cached within host registers.

I've used this secondary memory return before, in target/s390,
and to me it seems cleaner than pointers.
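
By way of illustration, that alternative would look something like the
following (hypothetical sketch, not what the patch does):

    /* Hypothetical pointer-based variant.  The helper writes the
     * destination GPR pair directly, so it can no longer be declared
     * TCG_CALL_NO_WG, and every call forces TCG to discard the guest
     * registers it has cached in host registers. */
    DEF_HELPER_4(lq_le_parallel, void, env, ptr, tl, i32)


r~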
On Thu, Jun 28, 2018 at 08:22:38AM -0700, Richard Henderson wrote:
> On 06/27/2018 08:49 PM, David Gibson wrote:
> > Adding a temporary here is kind of gross.  I guess the helper
> > interface doesn't allow for 128-bit returns, but couldn't you pass a
> > register number into the helper and have it update the right GPR
> > without going through a temp?
>
> I could pass a pointer, but that would cause ...
> [...]
> ... the helper definitions to lose TCG_CALL_NO_WG, because they *would*
> write to a global register.  Which would cause TCG to discard all of the
> global guest registers cached within host registers.
>
> I've used this secondary memory return before, in target/s390,
> and to me it seems cleaner than pointers.

Ok, sounds reasonable, applied to ppc-for-3.0.

-- 
David Gibson                    | I'll have my music baroque, and my code
david AT gibson.dropbear.id.au  | minimalist, thank you.  NOT _the_ _other_
                                | _way_ _around_!
http://www.ozlabs.org/~dgibson