Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
target/arm/helper-sve.h | 40 ++++++++++
target/arm/sve_helper.c | 156 +++++++++++++++++++++++++++++++++++++
target/arm/translate-sve.c | 69 ++++++++++++++++
target/arm/sve.decode | 6 ++
4 files changed, 271 insertions(+)
diff --git a/target/arm/helper-sve.h b/target/arm/helper-sve.h
index fcc9ba5f50..7338abbbcf 100644
--- a/target/arm/helper-sve.h
+++ b/target/arm/helper-sve.h
@@ -754,3 +754,43 @@ DEF_HELPER_FLAGS_4(sve_ld1hds_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32)

DEF_HELPER_FLAGS_4(sve_ld1sdu_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
DEF_HELPER_FLAGS_4(sve_ld1sds_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
+
+DEF_HELPER_FLAGS_4(sve_ldff1bb_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
+DEF_HELPER_FLAGS_4(sve_ldff1bhu_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
+DEF_HELPER_FLAGS_4(sve_ldff1bsu_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
+DEF_HELPER_FLAGS_4(sve_ldff1bdu_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
+DEF_HELPER_FLAGS_4(sve_ldff1bhs_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
+DEF_HELPER_FLAGS_4(sve_ldff1bss_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
+DEF_HELPER_FLAGS_4(sve_ldff1bds_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
+
+DEF_HELPER_FLAGS_4(sve_ldff1hh_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
+DEF_HELPER_FLAGS_4(sve_ldff1hsu_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
+DEF_HELPER_FLAGS_4(sve_ldff1hdu_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
+DEF_HELPER_FLAGS_4(sve_ldff1hss_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
+DEF_HELPER_FLAGS_4(sve_ldff1hds_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
+
+DEF_HELPER_FLAGS_4(sve_ldff1ss_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
+DEF_HELPER_FLAGS_4(sve_ldff1sdu_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
+DEF_HELPER_FLAGS_4(sve_ldff1sds_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
+
+DEF_HELPER_FLAGS_4(sve_ldff1dd_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
+
+DEF_HELPER_FLAGS_4(sve_ldnf1bb_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
+DEF_HELPER_FLAGS_4(sve_ldnf1bhu_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
+DEF_HELPER_FLAGS_4(sve_ldnf1bsu_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
+DEF_HELPER_FLAGS_4(sve_ldnf1bdu_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
+DEF_HELPER_FLAGS_4(sve_ldnf1bhs_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
+DEF_HELPER_FLAGS_4(sve_ldnf1bss_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
+DEF_HELPER_FLAGS_4(sve_ldnf1bds_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
+
+DEF_HELPER_FLAGS_4(sve_ldnf1hh_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
+DEF_HELPER_FLAGS_4(sve_ldnf1hsu_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
+DEF_HELPER_FLAGS_4(sve_ldnf1hdu_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
+DEF_HELPER_FLAGS_4(sve_ldnf1hss_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
+DEF_HELPER_FLAGS_4(sve_ldnf1hds_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
+
+DEF_HELPER_FLAGS_4(sve_ldnf1ss_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
+DEF_HELPER_FLAGS_4(sve_ldnf1sdu_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
+DEF_HELPER_FLAGS_4(sve_ldnf1sds_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
+
+DEF_HELPER_FLAGS_4(sve_ldnf1dd_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
diff --git a/target/arm/sve_helper.c b/target/arm/sve_helper.c
index 4e6ad282f9..6e1b539ce3 100644
--- a/target/arm/sve_helper.c
+++ b/target/arm/sve_helper.c
@@ -2963,3 +2963,159 @@ DO_LD4(sve_ld4dd_r, cpu_ldq_data_ra, uint64_t, uint64_t, )
#undef DO_LD2
#undef DO_LD3
#undef DO_LD4
+
+/*
+ * Load contiguous data, first-fault and no-fault.
+ */
+
+#ifdef CONFIG_USER_ONLY
+
+/* Fault on byte I. All bits in FFR from I are cleared. The vector
+ * result from I is CONSTRAINED UNPREDICTABLE; we choose the MERGE
+ * option, which leaves subsequent data unchanged.
+ */
+static void __attribute__((cold))
+record_fault(CPUARMState *env, intptr_t i, intptr_t oprsz)
+{
+ uint64_t *ffr = env->vfp.pregs[FFR_PRED_NUM].p;
+ if (i & 63) {
+ ffr[i / 64] &= MAKE_64BIT_MASK(0, (i & 63) - 1);
+ i = ROUND_UP(i, 64);
+ }
+ for (; i < oprsz; i += 64) {
+ ffr[i / 64] = 0;
+ }
+}
+
+/* Hold the mmap lock during the operation so that there is no race
+ * between page_check_range and the load operation. We expect the
+ * usual case to have no faults at all, so we check the whole range
+ * first and if successful defer to the normal load operation.
+ *
+ * TODO: Change mmap_lock to a rwlock so that multiple readers
+ * can run simultaneously. This will probably help other uses
+ * within QEMU as well.
+ */
+#define DO_LDFF1(PART, FN, TYPEE, TYPEM, H) \
+static void do_sve_ldff1##PART(CPUARMState *env, void *vd, void *vg, \
+ target_ulong addr, intptr_t oprsz, \
+ bool first, uintptr_t ra) \
+{ \
+ intptr_t i = 0; \
+ do { \
+ uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
+ do { \
+ TYPEM m = 0; \
+ if (pg & 1) { \
+ if (!first && \
+ page_check_range(addr, sizeof(TYPEM), PAGE_READ)) { \
+ record_fault(env, i, oprsz); \
+ return; \
+ } \
+ m = FN(env, addr, ra); \
+ first = false; \
+ } \
+ *(TYPEE *)(vd + H(i)) = m; \
+ i += sizeof(TYPEE), pg >>= sizeof(TYPEE); \
+ addr += sizeof(TYPEM); \
+ } while (i & 15); \
+ } while (i < oprsz); \
+} \
+void HELPER(sve_ldff1##PART)(CPUARMState *env, void *vg, \
+ target_ulong addr, uint32_t desc) \
+{ \
+ intptr_t oprsz = simd_oprsz(desc); \
+ unsigned rd = simd_data(desc); \
+ void *vd = &env->vfp.zregs[rd]; \
+ mmap_lock(); \
+ if (likely(page_check_range(addr, oprsz, PAGE_READ) == 0)) { \
+ do_sve_ld1##PART(env, vd, vg, addr, oprsz, GETPC()); \
+ } else { \
+ do_sve_ldff1##PART(env, vd, vg, addr, oprsz, true, GETPC()); \
+ } \
+ mmap_unlock(); \
+}
+
+/* No-fault loads are like first-fault loads without the
+ * first faulting special case.
+ */
+#define DO_LDNF1(PART) \
+void HELPER(sve_ldnf1##PART)(CPUARMState *env, void *vg, \
+ target_ulong addr, uint32_t desc) \
+{ \
+ intptr_t oprsz = simd_oprsz(desc); \
+ unsigned rd = simd_data(desc); \
+ void *vd = &env->vfp.zregs[rd]; \
+ mmap_lock(); \
+ if (likely(page_check_range(addr, oprsz, PAGE_READ) == 0)) { \
+ do_sve_ld1##PART(env, vd, vg, addr, oprsz, GETPC()); \
+ } else { \
+ do_sve_ldff1##PART(env, vd, vg, addr, oprsz, false, GETPC()); \
+ } \
+ mmap_unlock(); \
+}
+
+#else
+
+/* TODO: System mode is not yet supported.
+ * This would probably use tlb_vaddr_to_host.
+ */
+#define DO_LDFF1(PART, FN, TYPEE, TYPEM, H) \
+void HELPER(sve_ldff1##PART)(CPUARMState *env, void *vg, \
+ target_ulong addr, uint32_t desc) \
+{ \
+ g_assert_not_reached(); \
+}
+
+#define DO_LDNF1(PART) \
+void HELPER(sve_ldnf1##PART)(CPUARMState *env, void *vg, \
+ target_ulong addr, uint32_t desc) \
+{ \
+ g_assert_not_reached(); \
+}
+
+#endif
+
+DO_LDFF1(bb_r, cpu_ldub_data_ra, uint8_t, uint8_t, H1)
+DO_LDFF1(bhu_r, cpu_ldub_data_ra, uint16_t, uint8_t, H1_2)
+DO_LDFF1(bhs_r, cpu_ldsb_data_ra, uint16_t, int8_t, H1_2)
+DO_LDFF1(bsu_r, cpu_ldub_data_ra, uint32_t, uint8_t, H1_4)
+DO_LDFF1(bss_r, cpu_ldsb_data_ra, uint32_t, int8_t, H1_4)
+DO_LDFF1(bdu_r, cpu_ldub_data_ra, uint64_t, uint8_t, )
+DO_LDFF1(bds_r, cpu_ldsb_data_ra, uint64_t, int8_t, )
+
+DO_LDFF1(hh_r, cpu_lduw_data_ra, uint16_t, uint16_t, H1_2)
+DO_LDFF1(hsu_r, cpu_lduw_data_ra, uint32_t, uint16_t, H1_4)
+DO_LDFF1(hss_r, cpu_ldsw_data_ra, uint32_t, int16_t, H1_4)
+DO_LDFF1(hdu_r, cpu_lduw_data_ra, uint64_t, uint16_t, )
+DO_LDFF1(hds_r, cpu_ldsw_data_ra, uint64_t, int16_t, )
+
+DO_LDFF1(ss_r, cpu_ldl_data_ra, uint32_t, uint32_t, H1_4)
+DO_LDFF1(sdu_r, cpu_ldl_data_ra, uint64_t, uint32_t, )
+DO_LDFF1(sds_r, cpu_ldl_data_ra, uint64_t, int32_t, )
+
+DO_LDFF1(dd_r, cpu_ldq_data_ra, uint64_t, uint64_t, )
+
+#undef DO_LDFF1
+
+DO_LDNF1(bb_r)
+DO_LDNF1(bhu_r)
+DO_LDNF1(bhs_r)
+DO_LDNF1(bsu_r)
+DO_LDNF1(bss_r)
+DO_LDNF1(bdu_r)
+DO_LDNF1(bds_r)
+
+DO_LDNF1(hh_r)
+DO_LDNF1(hsu_r)
+DO_LDNF1(hss_r)
+DO_LDNF1(hdu_r)
+DO_LDNF1(hds_r)
+
+DO_LDNF1(ss_r)
+DO_LDNF1(sdu_r)
+DO_LDNF1(sds_r)
+
+DO_LDNF1(dd_r)
+
+#undef DO_LDNF1
diff --git a/target/arm/translate-sve.c b/target/arm/translate-sve.c
index 3543daff48..09f77b5405 100644
--- a/target/arm/translate-sve.c
+++ b/target/arm/translate-sve.c
@@ -3647,3 +3647,72 @@ static bool trans_LD_zpri(DisasContext *s, arg_rpri_load *a, uint32_t insn)
}
return true;
}
+
+static bool trans_LDFF1_zprr(DisasContext *s, arg_rprr_load *a, uint32_t insn)
+{
+ static gen_helper_gvec_mem * const fns[16] = {
+ gen_helper_sve_ldff1bb_r,
+ gen_helper_sve_ldff1bhu_r,
+ gen_helper_sve_ldff1bsu_r,
+ gen_helper_sve_ldff1bdu_r,
+
+ gen_helper_sve_ldff1sds_r,
+ gen_helper_sve_ldff1hh_r,
+ gen_helper_sve_ldff1hsu_r,
+ gen_helper_sve_ldff1hdu_r,
+
+ gen_helper_sve_ldff1hds_r,
+ gen_helper_sve_ldff1hss_r,
+ gen_helper_sve_ldff1ss_r,
+ gen_helper_sve_ldff1sdu_r,
+
+ gen_helper_sve_ldff1bds_r,
+ gen_helper_sve_ldff1bss_r,
+ gen_helper_sve_ldff1bhs_r,
+ gen_helper_sve_ldff1dd_r,
+ };
+
+ if (sve_access_check(s)) {
+ TCGv_i64 addr = new_tmp_a64(s);
+ tcg_gen_shli_i64(addr, cpu_reg(s, a->rm), dtype_msz(a->dtype));
+ tcg_gen_add_i64(addr, addr, cpu_reg_sp(s, a->rn));
+ do_mem_zpa(s, a->rd, a->pg, addr, fns[a->dtype]);
+ }
+ return true;
+}
+
+static bool trans_LDNF1_zpri(DisasContext *s, arg_rpri_load *a, uint32_t insn)
+{
+ static gen_helper_gvec_mem * const fns[16] = {
+ gen_helper_sve_ldnf1bb_r,
+ gen_helper_sve_ldnf1bhu_r,
+ gen_helper_sve_ldnf1bsu_r,
+ gen_helper_sve_ldnf1bdu_r,
+
+ gen_helper_sve_ldnf1sds_r,
+ gen_helper_sve_ldnf1hh_r,
+ gen_helper_sve_ldnf1hsu_r,
+ gen_helper_sve_ldnf1hdu_r,
+
+ gen_helper_sve_ldnf1hds_r,
+ gen_helper_sve_ldnf1hss_r,
+ gen_helper_sve_ldnf1ss_r,
+ gen_helper_sve_ldnf1sdu_r,
+
+ gen_helper_sve_ldnf1bds_r,
+ gen_helper_sve_ldnf1bss_r,
+ gen_helper_sve_ldnf1bhs_r,
+ gen_helper_sve_ldnf1dd_r,
+ };
+
+ if (sve_access_check(s)) {
+ int vsz = vec_full_reg_size(s);
+ int elements = vsz >> dtype_esz[a->dtype];
+ int off = (a->imm * elements) << dtype_msz(a->dtype);
+ TCGv_i64 addr = new_tmp_a64(s);
+
+ tcg_gen_addi_i64(addr, cpu_reg_sp(s, a->rn), off);
+ do_mem_zpa(s, a->rd, a->pg, addr, fns[a->dtype]);
+ }
+ return true;
+}
diff --git a/target/arm/sve.decode b/target/arm/sve.decode
index cfb12da639..afbed57de1 100644
--- a/target/arm/sve.decode
+++ b/target/arm/sve.decode
@@ -685,9 +685,15 @@ LDR_zri 10000101 10 ...... 010 ... ..... ..... @rd_rn_i9
# SVE contiguous load (scalar plus scalar)
LD_zprr 1010010 .... ..... 010 ... ..... ..... @rprr_load_dt nreg=0

+# SVE contiguous first-fault load (scalar plus scalar)
+LDFF1_zprr 1010010 .... ..... 011 ... ..... ..... @rprr_load_dt nreg=0
+
# SVE contiguous load (scalar plus immediate)
LD_zpri 1010010 .... 0.... 101 ... ..... ..... @rpri_load_dt nreg=0

+# SVE contiguous non-fault load (scalar plus immediate)
+LDNF1_zpri 1010010 .... 1.... 101 ... ..... ..... @rpri_load_dt nreg=0
+
# SVE contiguous non-temporal load (scalar plus scalar)
# LDNT1B, LDNT1H, LDNT1W, LDNT1D
# SVE load multiple structures (scalar plus scalar)
--
2.17.1
On 21 June 2018 at 02:53, Richard Henderson
<richard.henderson@linaro.org> wrote:
> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
> [...]
> +/* Fault on byte I. All bits in FFR from I are cleared. The vector
> + * result from I is CONSTRAINED UNPREDICTABLE; we choose the MERGE
> + * option, which leaves subsequent data unchanged.
> + */
> +static void __attribute__((cold))
attribute cold was first introduced in GCC 4.3. As of
commit fa54abb8c29 I think we still support gcc 4.1,
so we need to hide this behind a QEMU_COLD or something, eg:
#ifndef __has_attribute
#define __has_attribute(x) 0 /* compatibility with older gcc */
#endif
#if __has_attribute(cold) || QEMU_GNUC_PREREQ(4, 3)
#define QEMU_COLD __attribute__((cold))
#else
#define QEMU_COLD
#endif
(gcc added __has_attribute in gcc 5, which is nice.)
> +record_fault(CPUARMState *env, intptr_t i, intptr_t oprsz)
> +{
> + uint64_t *ffr = env->vfp.pregs[FFR_PRED_NUM].p;
> + if (i & 63) {
> + ffr[i / 64] &= MAKE_64BIT_MASK(0, (i & 63) - 1);
Should this really have a - 1 here? (i & 63) will
be anything between 1 and 63, so I would have expected
the set of masks to be anything from "1 bit set" to
"63 bits set", not "0 bits set" to "62 bits set".
> + i = ROUND_UP(i, 64);
> + }
> + for (; i < oprsz; i += 64) {
> + ffr[i / 64] = 0;
> + }
> +}
> +
Otherwise
Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
thanks
-- PMM
On 06/22/2018 09:04 AM, Peter Maydell wrote:
> On 21 June 2018 at 02:53, Richard Henderson
> <richard.henderson@linaro.org> wrote:
>> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
>> [...]
>> +/* Fault on byte I. All bits in FFR from I are cleared. The vector
>> + * result from I is CONSTRAINED UNPREDICTABLE; we choose the MERGE
>> + * option, which leaves subsequent data unchanged.
>> + */
>> +static void __attribute__((cold))
>
> attribute cold was first introduced in GCC 4.3. As of
> commit fa54abb8c29 I think we still support gcc 4.1,
> so we need to hide this behind a QEMU_COLD or something I think, eg
>
> #ifndef __has_attribute
> #define __has_attribute(x) 0 /* compatibility with older gcc */
> #endif
>
> #if __has_attribute(cold) || QEMU_GNUC_PREREQ(4, 3)
> #define QEMU_COLD __attribute__((cold))
> #else
> #define QEMU_COLD
> #endif
>
> (gcc added __has_attribute in gcc 5, which is nice.)
Ah, good archaeology.
But I think I'll just drop this. I put it in there as a hint that it won't be
called, but the x86_64 code generation for putting this into the .text.unlikely
section is really ugly.
>
>> +record_fault(CPUARMState *env, intptr_t i, intptr_t oprsz)
>> +{
>> + uint64_t *ffr = env->vfp.pregs[FFR_PRED_NUM].p;
>> + if (i & 63) {
>> + ffr[i / 64] &= MAKE_64BIT_MASK(0, (i & 63) - 1);
>
> Should this really have a - 1 here? (i & 63) will
> be anything between 1 and 63, so I would have expected
> the set of masks to be anything from "1 bit set" to
> "63 bits set", not "0 bits set" to "62 bits set".
We want to zero bits I to OPRSZ-1, which means retaining bits 0 to I-1.
But you're right that for e.g. I==65 this will produce ~0ULL >> 64.
I'll re-work this.
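Probably something like this (untested sketch):

    if (i & 63) {
        ffr[i / 64] &= MAKE_64BIT_MASK(0, i & 63);
        i = ROUND_UP(i, 64);
    }

which keeps exactly the low (i & 63) bits and is well-defined for
the whole 1..63 range.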
r~
Richard Henderson <richard.henderson@linaro.org> writes:
> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
> [...]
> +#define DO_LDFF1(PART, FN, TYPEE, TYPEM, H) \
> +static void do_sve_ldff1##PART(CPUARMState *env, void *vd, void *vg, \
> + target_ulong addr, intptr_t oprsz, \
> + bool first, uintptr_t ra) \
> +{ \
> + intptr_t i = 0; \
> + do { \
> + uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
> + do { \
> + TYPEM m = 0; \
> + if (pg & 1) { \
> + if (!first && \
> + page_check_range(addr, sizeof(TYPEM), PAGE_READ)) { \
> + record_fault(env, i, oprsz); \
> + return; \
> + } \
> + m = FN(env, addr, ra); \
> + first = false; \
> + } \
> + *(TYPEE *)(vd + H(i)) = m; \
> + i += sizeof(TYPEE), pg >>= sizeof(TYPEE); \
> + addr += sizeof(TYPEM); \
> + } while (i & 15); \
> + } while (i < oprsz); \
> +} \
So I noticed that the disassembly of these two functions is mostly
parameter pushing and popping. Is there a case to be made to use the
__flatten__ approach and see how the compiler unrolls it all?
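(I mean something like this -- just a sketch, and assuming the
loaders were actually visible to the compiler:

    static void __attribute__((flatten))
    do_sve_ldff1bb_r(CPUARMState *env, void *vd, void *vg,
                     target_ulong addr, intptr_t oprsz,
                     bool first, uintptr_t ra)

so FN and friends get inlined into the loop body instead of being
called per element.)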
> [...]
--
Alex Bennée
On 06/26/2018 05:52 AM, Alex Bennée wrote:
>> +#define DO_LDFF1(PART, FN, TYPEE, TYPEM, H) \
>> +static void do_sve_ldff1##PART(CPUARMState *env, void *vd, void *vg, \
>> + target_ulong addr, intptr_t oprsz, \
>> + bool first, uintptr_t ra) \
>> [...]
> So I noticed that the disassembly of these two functions is mostly
> parameter pushing and popping. Is there a case to be made to use the
> __flatten__ approach and see how the compiler unrolls it all?
Em... for the most part the functions being called are not inlinable,
being defined in accel/tcg/.
r~
Richard Henderson <richard.henderson@linaro.org> writes:
> On 06/26/2018 05:52 AM, Alex Bennée wrote:
>>> +#define DO_LDFF1(PART, FN, TYPEE, TYPEM, H) \
>>> [...]
>> So I noticed that the disassembly of these two functions is mostly
>> parameter pushing and popping. Is there a case to be made to use the
>> __flatten__ approach and see how the compiler unrolls it all?
>
> Em... for the most part the functions being called are not inlinable,
> being defined in accel/tcg/.
*sigh* I guess. It's a shame because the numbers get more disappointing:
12:13:48 [alex@zen:~/l/q/q/aarch64-linux-user] review/rth-sve-v5(+26/-1) + ./qemu-aarch64 ./tests/simd-memcpy libc intreg intpair simdreg simdpair sve
libc, 248298053, 4228 kb/s
intreg, 646085220, 1623 kb/s
intpair, 369350825, 2841 kb/s
simdreg, 1422096252, 737 kb/s
simdpair, 1369635566, 765 kb/s
sve, 2646179942, 396 kb/s
and the above example doesn't have the cost of page_check_range. I guess
this isn't something that could be improved until other architectures had a
similar predicated load solution we could use in generated code. Helpers
are always going to suck here :-/
Anyway my boy-racer disappointments aside:
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
--
Alex Bennée