From nobody Wed Apr 16 04:29:40 2025 Delivered-To: importer@patchew.org Received-SPF: pass (zoho.com: domain of gnu.org designates 208.118.235.17 as permitted sender) client-ip=208.118.235.17; envelope-from=qemu-devel-bounces+importer=patchew.org@nongnu.org; helo=lists.gnu.org; Authentication-Results: mx.zohomail.com; spf=pass (zoho.com: domain of gnu.org designates 208.118.235.17 as permitted sender) smtp.mailfrom=qemu-devel-bounces+importer=patchew.org@nongnu.org; dmarc=fail(p=none dis=none) header.from=linaro.org Return-Path: Received: from lists.gnu.org (lists.gnu.org [208.118.235.17]) by mx.zohomail.com with SMTPS id 1539007746701769.1268455092747; Mon, 8 Oct 2018 07:09:06 -0700 (PDT) Received: from localhost ([::1]:46395 helo=lists.gnu.org) by lists.gnu.org with esmtp (Exim 4.71) (envelope-from ) id 1g9WDZ-0006jA-F5 for importer@patchew.org; Mon, 08 Oct 2018 10:09:05 -0400 Received: from eggs.gnu.org ([2001:4830:134:3::10]:46002) by lists.gnu.org with esmtp (Exim 4.71) (envelope-from ) id 1g9W5L-00012n-BB for qemu-devel@nongnu.org; Mon, 08 Oct 2018 10:00:40 -0400 Received: from Debian-exim by eggs.gnu.org with spam-scanned (Exim 4.71) (envelope-from ) id 1g9W5B-0006xl-6l for qemu-devel@nongnu.org; Mon, 08 Oct 2018 10:00:35 -0400 Received: from orth.archaic.org.uk ([2001:8b0:1d0::2]:51692) by eggs.gnu.org with esmtps (TLS1.0:RSA_AES_256_CBC_SHA1:32) (Exim 4.71) (envelope-from ) id 1g9W57-0006uy-OT for qemu-devel@nongnu.org; Mon, 08 Oct 2018 10:00:23 -0400 Received: from pm215 by orth.archaic.org.uk with local (Exim 4.89) (envelope-from ) id 1g9W56-0003gb-48 for qemu-devel@nongnu.org; Mon, 08 Oct 2018 15:00:20 +0100 From: Peter Maydell To: qemu-devel@nongnu.org Date: Mon, 8 Oct 2018 14:59:43 +0100 Message-Id: <20181008140004.12612-13-peter.maydell@linaro.org> X-Mailer: git-send-email 2.19.0 In-Reply-To: <20181008140004.12612-1-peter.maydell@linaro.org> References: <20181008140004.12612-1-peter.maydell@linaro.org> MIME-Version: 1.0 Content-Transfer-Encoding: quoted-printable X-detected-operating-system: by eggs.gnu.org: Genre and OS details not recognized. X-Received-From: 2001:8b0:1d0::2 Subject: [Qemu-devel] [PULL 12/33] target/arm: Rewrite helper_sve_ld[234]*_r X-BeenThere: qemu-devel@nongnu.org X-Mailman-Version: 2.1.21 Precedence: list List-Id: List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , Errors-To: qemu-devel-bounces+importer=patchew.org@nongnu.org Sender: "Qemu-devel" X-ZohoMail: RDMRC_1 RSF_0 Z_629925259 SPT_0 Content-Type: text/plain; charset="utf-8" From: Richard Henderson Use the same *_tlb primitives as we use for ld1. For linux-user, this hoists the set of helper_retaddr. For softmmu, hoists the computation of the current mmu_idx outside the loop, fixes the endianness problem, and moves the main loop out of a macro and into an inlined function. Reviewed-by: Peter Maydell Tested-by: Laurent Desnogues Signed-off-by: Richard Henderson Message-id: 20181005175350.30752-9-richard.henderson@linaro.org Signed-off-by: Peter Maydell --- target/arm/sve_helper.c | 210 ++++++++++++++++++++++------------------ 1 file changed, 117 insertions(+), 93 deletions(-) diff --git a/target/arm/sve_helper.c b/target/arm/sve_helper.c index d628978431c..f712b382f8b 100644 --- a/target/arm/sve_helper.c +++ b/target/arm/sve_helper.c @@ -4285,109 +4285,133 @@ DO_LD1_2(ld1dd, 3, 3) #undef DO_LD1_1 #undef DO_LD1_2 =20 -#define DO_LD2(NAME, FN, TYPEE, TYPEM, H) \ -void HELPER(NAME)(CPUARMState *env, void *vg, \ - target_ulong addr, uint32_t desc) \ -{ \ - intptr_t i, oprsz =3D simd_oprsz(desc); \ - intptr_t ra =3D GETPC(); \ - unsigned rd =3D simd_data(desc); \ - void *d1 =3D &env->vfp.zregs[rd]; \ - void *d2 =3D &env->vfp.zregs[(rd + 1) & 31]; \ - for (i =3D 0; i < oprsz; ) { \ - uint16_t pg =3D *(uint16_t *)(vg + H1_2(i >> 3)); \ - do { \ - TYPEM m1 =3D 0, m2 =3D 0; \ - if (pg & 1) { \ - m1 =3D FN(env, addr, ra); \ - m2 =3D FN(env, addr + sizeof(TYPEM), ra); \ - } \ - *(TYPEE *)(d1 + H(i)) =3D m1; \ - *(TYPEE *)(d2 + H(i)) =3D m2; \ - i +=3D sizeof(TYPEE), pg >>=3D sizeof(TYPEE); \ - addr +=3D 2 * sizeof(TYPEM); \ - } while (i & 15); \ - } \ +/* + * Common helpers for all contiguous 2,3,4-register predicated loads. + */ +static void sve_ld2_r(CPUARMState *env, void *vg, target_ulong addr, + uint32_t desc, int size, uintptr_t ra, + sve_ld1_tlb_fn *tlb_fn) +{ + const int mmu_idx =3D cpu_mmu_index(env, false); + intptr_t i, oprsz =3D simd_oprsz(desc); + unsigned rd =3D simd_data(desc); + ARMVectorReg scratch[2] =3D { }; + + set_helper_retaddr(ra); + for (i =3D 0; i < oprsz; ) { + uint16_t pg =3D *(uint16_t *)(vg + H1_2(i >> 3)); + do { + if (pg & 1) { + tlb_fn(env, &scratch[0], i, addr, mmu_idx, ra); + tlb_fn(env, &scratch[1], i, addr + size, mmu_idx, ra); + } + i +=3D size, pg >>=3D size; + addr +=3D 2 * size; + } while (i & 15); + } + set_helper_retaddr(0); + + /* Wait until all exceptions have been raised to write back. */ + memcpy(&env->vfp.zregs[rd], &scratch[0], oprsz); + memcpy(&env->vfp.zregs[(rd + 1) & 31], &scratch[1], oprsz); } =20 -#define DO_LD3(NAME, FN, TYPEE, TYPEM, H) \ -void HELPER(NAME)(CPUARMState *env, void *vg, \ - target_ulong addr, uint32_t desc) \ -{ \ - intptr_t i, oprsz =3D simd_oprsz(desc); \ - intptr_t ra =3D GETPC(); \ - unsigned rd =3D simd_data(desc); \ - void *d1 =3D &env->vfp.zregs[rd]; \ - void *d2 =3D &env->vfp.zregs[(rd + 1) & 31]; \ - void *d3 =3D &env->vfp.zregs[(rd + 2) & 31]; \ - for (i =3D 0; i < oprsz; ) { \ - uint16_t pg =3D *(uint16_t *)(vg + H1_2(i >> 3)); \ - do { \ - TYPEM m1 =3D 0, m2 =3D 0, m3 =3D 0; \ - if (pg & 1) { \ - m1 =3D FN(env, addr, ra); \ - m2 =3D FN(env, addr + sizeof(TYPEM), ra); \ - m3 =3D FN(env, addr + 2 * sizeof(TYPEM), ra); \ - } \ - *(TYPEE *)(d1 + H(i)) =3D m1; \ - *(TYPEE *)(d2 + H(i)) =3D m2; \ - *(TYPEE *)(d3 + H(i)) =3D m3; \ - i +=3D sizeof(TYPEE), pg >>=3D sizeof(TYPEE); \ - addr +=3D 3 * sizeof(TYPEM); \ - } while (i & 15); \ - } \ +static void sve_ld3_r(CPUARMState *env, void *vg, target_ulong addr, + uint32_t desc, int size, uintptr_t ra, + sve_ld1_tlb_fn *tlb_fn) +{ + const int mmu_idx =3D cpu_mmu_index(env, false); + intptr_t i, oprsz =3D simd_oprsz(desc); + unsigned rd =3D simd_data(desc); + ARMVectorReg scratch[3] =3D { }; + + set_helper_retaddr(ra); + for (i =3D 0; i < oprsz; ) { + uint16_t pg =3D *(uint16_t *)(vg + H1_2(i >> 3)); + do { + if (pg & 1) { + tlb_fn(env, &scratch[0], i, addr, mmu_idx, ra); + tlb_fn(env, &scratch[1], i, addr + size, mmu_idx, ra); + tlb_fn(env, &scratch[2], i, addr + 2 * size, mmu_idx, ra); + } + i +=3D size, pg >>=3D size; + addr +=3D 3 * size; + } while (i & 15); + } + set_helper_retaddr(0); + + /* Wait until all exceptions have been raised to write back. */ + memcpy(&env->vfp.zregs[rd], &scratch[0], oprsz); + memcpy(&env->vfp.zregs[(rd + 1) & 31], &scratch[1], oprsz); + memcpy(&env->vfp.zregs[(rd + 2) & 31], &scratch[2], oprsz); } =20 -#define DO_LD4(NAME, FN, TYPEE, TYPEM, H) \ -void HELPER(NAME)(CPUARMState *env, void *vg, \ - target_ulong addr, uint32_t desc) \ -{ \ - intptr_t i, oprsz =3D simd_oprsz(desc); \ - intptr_t ra =3D GETPC(); \ - unsigned rd =3D simd_data(desc); \ - void *d1 =3D &env->vfp.zregs[rd]; \ - void *d2 =3D &env->vfp.zregs[(rd + 1) & 31]; \ - void *d3 =3D &env->vfp.zregs[(rd + 2) & 31]; \ - void *d4 =3D &env->vfp.zregs[(rd + 3) & 31]; \ - for (i =3D 0; i < oprsz; ) { \ - uint16_t pg =3D *(uint16_t *)(vg + H1_2(i >> 3)); \ - do { \ - TYPEM m1 =3D 0, m2 =3D 0, m3 =3D 0, m4 =3D 0; \ - if (pg & 1) { \ - m1 =3D FN(env, addr, ra); \ - m2 =3D FN(env, addr + sizeof(TYPEM), ra); \ - m3 =3D FN(env, addr + 2 * sizeof(TYPEM), ra); \ - m4 =3D FN(env, addr + 3 * sizeof(TYPEM), ra); \ - } \ - *(TYPEE *)(d1 + H(i)) =3D m1; \ - *(TYPEE *)(d2 + H(i)) =3D m2; \ - *(TYPEE *)(d3 + H(i)) =3D m3; \ - *(TYPEE *)(d4 + H(i)) =3D m4; \ - i +=3D sizeof(TYPEE), pg >>=3D sizeof(TYPEE); \ - addr +=3D 4 * sizeof(TYPEM); \ - } while (i & 15); \ - } \ +static void sve_ld4_r(CPUARMState *env, void *vg, target_ulong addr, + uint32_t desc, int size, uintptr_t ra, + sve_ld1_tlb_fn *tlb_fn) +{ + const int mmu_idx =3D cpu_mmu_index(env, false); + intptr_t i, oprsz =3D simd_oprsz(desc); + unsigned rd =3D simd_data(desc); + ARMVectorReg scratch[4] =3D { }; + + set_helper_retaddr(ra); + for (i =3D 0; i < oprsz; ) { + uint16_t pg =3D *(uint16_t *)(vg + H1_2(i >> 3)); + do { + if (pg & 1) { + tlb_fn(env, &scratch[0], i, addr, mmu_idx, ra); + tlb_fn(env, &scratch[1], i, addr + size, mmu_idx, ra); + tlb_fn(env, &scratch[2], i, addr + 2 * size, mmu_idx, ra); + tlb_fn(env, &scratch[3], i, addr + 3 * size, mmu_idx, ra); + } + i +=3D size, pg >>=3D size; + addr +=3D 4 * size; + } while (i & 15); + } + set_helper_retaddr(0); + + /* Wait until all exceptions have been raised to write back. */ + memcpy(&env->vfp.zregs[rd], &scratch[0], oprsz); + memcpy(&env->vfp.zregs[(rd + 1) & 31], &scratch[1], oprsz); + memcpy(&env->vfp.zregs[(rd + 2) & 31], &scratch[2], oprsz); + memcpy(&env->vfp.zregs[(rd + 3) & 31], &scratch[3], oprsz); } =20 -DO_LD2(sve_ld2bb_r, cpu_ldub_data_ra, uint8_t, uint8_t, H1) -DO_LD3(sve_ld3bb_r, cpu_ldub_data_ra, uint8_t, uint8_t, H1) -DO_LD4(sve_ld4bb_r, cpu_ldub_data_ra, uint8_t, uint8_t, H1) +#define DO_LDN_1(N) \ +void __attribute__((flatten)) HELPER(sve_ld##N##bb_r) \ + (CPUARMState *env, void *vg, target_ulong addr, uint32_t desc) \ +{ \ + sve_ld##N##_r(env, vg, addr, desc, 1, GETPC(), sve_ld1bb_tlb); \ +} =20 -DO_LD2(sve_ld2hh_r, cpu_lduw_data_ra, uint16_t, uint16_t, H1_2) -DO_LD3(sve_ld3hh_r, cpu_lduw_data_ra, uint16_t, uint16_t, H1_2) -DO_LD4(sve_ld4hh_r, cpu_lduw_data_ra, uint16_t, uint16_t, H1_2) +#define DO_LDN_2(N, SUFF, SIZE) \ +void __attribute__((flatten)) HELPER(sve_ld##N##SUFF##_r) \ + (CPUARMState *env, void *vg, target_ulong addr, uint32_t desc) \ +{ \ + sve_ld##N##_r(env, vg, addr, desc, SIZE, GETPC(), \ + arm_cpu_data_is_big_endian(env) \ + ? sve_ld1##SUFF##_be_tlb : sve_ld1##SUFF##_le_tlb); \ +} =20 -DO_LD2(sve_ld2ss_r, cpu_ldl_data_ra, uint32_t, uint32_t, H1_4) -DO_LD3(sve_ld3ss_r, cpu_ldl_data_ra, uint32_t, uint32_t, H1_4) -DO_LD4(sve_ld4ss_r, cpu_ldl_data_ra, uint32_t, uint32_t, H1_4) +DO_LDN_1(2) +DO_LDN_1(3) +DO_LDN_1(4) =20 -DO_LD2(sve_ld2dd_r, cpu_ldq_data_ra, uint64_t, uint64_t, ) -DO_LD3(sve_ld3dd_r, cpu_ldq_data_ra, uint64_t, uint64_t, ) -DO_LD4(sve_ld4dd_r, cpu_ldq_data_ra, uint64_t, uint64_t, ) +DO_LDN_2(2, hh, 2) +DO_LDN_2(3, hh, 2) +DO_LDN_2(4, hh, 2) =20 -#undef DO_LD2 -#undef DO_LD3 -#undef DO_LD4 +DO_LDN_2(2, ss, 4) +DO_LDN_2(3, ss, 4) +DO_LDN_2(4, ss, 4) + +DO_LDN_2(2, dd, 8) +DO_LDN_2(3, dd, 8) +DO_LDN_2(4, dd, 8) + +#undef DO_LDN_1 +#undef DO_LDN_2 =20 /* * Load contiguous data, first-fault and no-fault. --=20 2.19.0