From nobody Wed May  7 08:37:20 2025
Delivered-To: importer@patchew.org
Received-SPF: pass (zoho.com: domain of gnu.org designates 208.118.235.17 as
 permitted sender) client-ip=208.118.235.17;
 envelope-from=qemu-devel-bounces+importer=patchew.org@nongnu.org;
 helo=lists.gnu.org;
Authentication-Results: mx.zohomail.com;
	spf=pass (zoho.com: domain of gnu.org designates 208.118.235.17 as permitted
 sender)  smtp.mailfrom=qemu-devel-bounces+importer=patchew.org@nongnu.org;
	dmarc=fail(p=none dis=none)  header.from=linaro.org
Return-Path: <qemu-devel-bounces+importer=patchew.org@nongnu.org>
Received: from lists.gnu.org (lists.gnu.org [208.118.235.17]) by
 mx.zohomail.com
	with SMTPS id 1539970425879324.3527041991988;
 Fri, 19 Oct 2018 10:33:45 -0700 (PDT)
Received: from localhost ([::1]:51897 helo=lists.gnu.org)
	by lists.gnu.org with esmtp (Exim 4.71)
	(envelope-from <qemu-devel-bounces+importer=patchew.org@nongnu.org>)
	id 1gDYeb-0001G8-0D
	for importer@patchew.org; Fri, 19 Oct 2018 13:33:41 -0400
Received: from eggs.gnu.org ([2001:4830:134:3::10]:48613)
	by lists.gnu.org with esmtp (Exim 4.71)
	(envelope-from <pm215@archaic.org.uk>) id 1gDY7z-0003Lk-93
	for qemu-devel@nongnu.org; Fri, 19 Oct 2018 13:00:00 -0400
Received: from Debian-exim by eggs.gnu.org with spam-scanned (Exim 4.71)
	(envelope-from <pm215@archaic.org.uk>) id 1gDY7y-0004sF-53
	for qemu-devel@nongnu.org; Fri, 19 Oct 2018 12:59:59 -0400
Received: from orth.archaic.org.uk ([2001:8b0:1d0::2]:51980)
	by eggs.gnu.org with esmtps (TLS1.0:RSA_AES_256_CBC_SHA1:32)
	(Exim 4.71) (envelope-from <pm215@archaic.org.uk>)
	id 1gDY7x-00020Y-Pw
	for qemu-devel@nongnu.org; Fri, 19 Oct 2018 12:59:58 -0400
Received: from pm215 by orth.archaic.org.uk with local (Exim 4.89)
	(envelope-from <pm215@archaic.org.uk>) id 1gDY66-0006lC-RK
	for qemu-devel@nongnu.org; Fri, 19 Oct 2018 17:58:02 +0100
From: Peter Maydell <peter.maydell@linaro.org>
To: qemu-devel@nongnu.org
Date: Fri, 19 Oct 2018 17:57:15 +0100
Message-Id: <20181019165735.22511-26-peter.maydell@linaro.org>
X-Mailer: git-send-email 2.19.1
In-Reply-To: <20181019165735.22511-1-peter.maydell@linaro.org>
References: <20181019165735.22511-1-peter.maydell@linaro.org>
MIME-Version: 1.0
Content-Transfer-Encoding: quoted-printable
X-detected-operating-system: by eggs.gnu.org: Genre and OS details not
	recognized.
X-Received-From: 2001:8b0:1d0::2
Subject: [Qemu-devel] [PULL 25/45] target/arm: Promote consecutive memory
 ops for aa64
X-BeenThere: qemu-devel@nongnu.org
X-Mailman-Version: 2.1.21
Precedence: list
List-Id: <qemu-devel.nongnu.org>
List-Unsubscribe: <https://lists.nongnu.org/mailman/options/qemu-devel>,
	<mailto:qemu-devel-request@nongnu.org?subject=unsubscribe>
List-Archive: <http://lists.nongnu.org/archive/html/qemu-devel/>
List-Post: <mailto:qemu-devel@nongnu.org>
List-Help: <mailto:qemu-devel-request@nongnu.org?subject=help>
List-Subscribe: <https://lists.nongnu.org/mailman/listinfo/qemu-devel>,
	<mailto:qemu-devel-request@nongnu.org?subject=subscribe>
Errors-To: qemu-devel-bounces+importer=patchew.org@nongnu.org
Sender: "Qemu-devel" <qemu-devel-bounces+importer=patchew.org@nongnu.org>
X-ZohoMail: RDMRC_1  RSF_0  Z_629925259 SPT_0
Content-Type: text/plain; charset="utf-8"

From: Richard Henderson <richard.henderson@linaro.org>

For a sequence of loads or stores from a single register,
little-endian operations can be promoted to an 8-byte op.
This can reduce the number of operations by a factor of 8.

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
Message-id: 20181011205206.3552-5-richard.henderson@linaro.org
Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
---
 target/arm/translate-a64.c | 66 +++++++++++++++++++++++---------------
 1 file changed, 40 insertions(+), 26 deletions(-)

diff --git a/target/arm/translate-a64.c b/target/arm/translate-a64.c
index 39ac45c0080..f1bd9d7633a 100644
--- a/target/arm/translate-a64.c
+++ b/target/arm/translate-a64.c
@@ -1200,25 +1200,23 @@ static void write_vec_element_i32(DisasContext *s, =
TCGv_i32 tcg_src,
=20
 /* Store from vector register to memory */
 static void do_vec_st(DisasContext *s, int srcidx, int element,
-                      TCGv_i64 tcg_addr, int size)
+                      TCGv_i64 tcg_addr, int size, TCGMemOp endian)
 {
-    TCGMemOp memop =3D s->be_data + size;
     TCGv_i64 tcg_tmp =3D tcg_temp_new_i64();
=20
     read_vec_element(s, tcg_tmp, srcidx, element, size);
-    tcg_gen_qemu_st_i64(tcg_tmp, tcg_addr, get_mem_index(s), memop);
+    tcg_gen_qemu_st_i64(tcg_tmp, tcg_addr, get_mem_index(s), endian | size=
);
=20
     tcg_temp_free_i64(tcg_tmp);
 }
=20
 /* Load from memory to vector register */
 static void do_vec_ld(DisasContext *s, int destidx, int element,
-                      TCGv_i64 tcg_addr, int size)
+                      TCGv_i64 tcg_addr, int size, TCGMemOp endian)
 {
-    TCGMemOp memop =3D s->be_data + size;
     TCGv_i64 tcg_tmp =3D tcg_temp_new_i64();
=20
-    tcg_gen_qemu_ld_i64(tcg_tmp, tcg_addr, get_mem_index(s), memop);
+    tcg_gen_qemu_ld_i64(tcg_tmp, tcg_addr, get_mem_index(s), endian | size=
);
     write_vec_element(s, tcg_tmp, destidx, element, size);
=20
     tcg_temp_free_i64(tcg_tmp);
@@ -3013,9 +3011,10 @@ static void disas_ldst_multiple_struct(DisasContext =
*s, uint32_t insn)
     bool is_postidx =3D extract32(insn, 23, 1);
     bool is_q =3D extract32(insn, 30, 1);
     TCGv_i64 tcg_addr, tcg_rn, tcg_ebytes;
+    TCGMemOp endian =3D s->be_data;
=20
-    int ebytes =3D 1 << size;
-    int elements =3D (is_q ? 128 : 64) / (8 << size);
+    int ebytes;   /* bytes per element */
+    int elements; /* elements per vector */
     int rpt;    /* num iterations */
     int selem;  /* structure elements */
     int r;
@@ -3074,6 +3073,20 @@ static void disas_ldst_multiple_struct(DisasContext =
*s, uint32_t insn)
         gen_check_sp_alignment(s);
     }
=20
+    /* For our purposes, bytes are always little-endian.  */
+    if (size =3D=3D 0) {
+        endian =3D MO_LE;
+    }
+
+    /* Consecutive little-endian elements from a single register
+     * can be promoted to a larger little-endian operation.
+     */
+    if (selem =3D=3D 1 && endian =3D=3D MO_LE) {
+        size =3D 3;
+    }
+    ebytes =3D 1 << size;
+    elements =3D (is_q ? 16 : 8) / ebytes;
+
     tcg_rn =3D cpu_reg_sp(s, rn);
     tcg_addr =3D tcg_temp_new_i64();
     tcg_gen_mov_i64(tcg_addr, tcg_rn);
@@ -3082,32 +3095,33 @@ static void disas_ldst_multiple_struct(DisasContext=
 *s, uint32_t insn)
     for (r =3D 0; r < rpt; r++) {
         int e;
         for (e =3D 0; e < elements; e++) {
-            int tt =3D (rt + r) % 32;
             int xs;
             for (xs =3D 0; xs < selem; xs++) {
+                int tt =3D (rt + r + xs) % 32;
                 if (is_store) {
-                    do_vec_st(s, tt, e, tcg_addr, size);
+                    do_vec_st(s, tt, e, tcg_addr, size, endian);
                 } else {
-                    do_vec_ld(s, tt, e, tcg_addr, size);
-
-                    /* For non-quad operations, setting a slice of the low
-                     * 64 bits of the register clears the high 64 bits (in
-                     * the ARM ARM pseudocode this is implicit in the fact
-                     * that 'rval' is a 64 bit wide variable).
-                     * For quad operations, we might still need to zero the
-                     * high bits of SVE.  We optimize by noticing that we =
only
-                     * need to do this the first time we touch a register.
-                     */
-                    if (e =3D=3D 0 && (r =3D=3D 0 || xs =3D=3D selem - 1))=
 {
-                        clear_vec_high(s, is_q, tt);
-                    }
+                    do_vec_ld(s, tt, e, tcg_addr, size, endian);
                 }
                 tcg_gen_add_i64(tcg_addr, tcg_addr, tcg_ebytes);
-                tt =3D (tt + 1) % 32;
             }
         }
     }
=20
+    if (!is_store) {
+        /* For non-quad operations, setting a slice of the low
+         * 64 bits of the register clears the high 64 bits (in
+         * the ARM ARM pseudocode this is implicit in the fact
+         * that 'rval' is a 64 bit wide variable).
+         * For quad operations, we might still need to zero the
+         * high bits of SVE.
+         */
+        for (r =3D 0; r < rpt * selem; r++) {
+            int tt =3D (rt + r) % 32;
+            clear_vec_high(s, is_q, tt);
+        }
+    }
+
     if (is_postidx) {
         int rm =3D extract32(insn, 16, 5);
         if (rm =3D=3D 31) {
@@ -3228,9 +3242,9 @@ static void disas_ldst_single_struct(DisasContext *s,=
 uint32_t insn)
         } else {
             /* Load/store one element per register */
             if (is_load) {
-                do_vec_ld(s, rt, index, tcg_addr, scale);
+                do_vec_ld(s, rt, index, tcg_addr, scale, s->be_data);
             } else {
-                do_vec_st(s, rt, index, tcg_addr, scale);
+                do_vec_st(s, rt, index, tcg_addr, scale, s->be_data);
             }
         }
         tcg_gen_add_i64(tcg_addr, tcg_addr, tcg_ebytes);
--=20
2.19.1