Series comparison

-[PULL 0/5] target-arm queue
+[PULL 00/28] target-arm queue
-Just a few minor bugfixes, but we might as well get them in
+The following changes since commit 1ea06abceec61b6f3ab33dadb0510b6e09fb61e2:
 for rc0 tomorrow.
--- PMM
+  Merge remote-tracking branch 'remotes/berrange-gitlab/tags/misc-fixes-pull-request' into staging (2021-06-14 15:59:13 +0100)
 The following changes since commit 787f82407c5056a8b1097e39e53d01dd1abe406b:
   Merge remote-tracking branch 'remotes/cohuck/tags/s390x-20200323' into staging (2020-03-23 15:38:30 +0000)
 are available in the Git repository at:
-  https://git.linaro.org/people/pmaydell/qemu-arm.git tags/pull-target-arm-20200323
+  https://git.linaro.org/people/pmaydell/qemu-arm.git tags/pull-target-arm-20210615
-for you to fetch changes up to 550a04893c2bd4442211b353680b9a6408d94dba:
+for you to fetch changes up to c611c956c7fdce651e30687b1f5d19b4cab78b6a:
-  target/arm: Move computation of index in handle_simd_dupe (2020-03-23 17:22:30 +0000)
+  include/qemu/int128.h: Add function to create Int128 from int64_t (2021-06-15 16:18:50 +0100)
 ----------------------------------------------------------------
 target-arm queue:
- * target/arm: avoid undefined behaviour shift in watchpoint code
+ * hw/intc/arm_gicv3_cpuif: Tolerate spurious EOIR writes
- * target/arm: avoid undefined behaviour shift in handle_simd_dupe()
+ * handle some UNALLOCATED decode cases correctly rather
- * target/arm: add assert that immh != 0 in disas_simd_shift_imm()
+   than asserting
- * aspeed/smc: Fix DMA support for AST2600
+ * hw: virt: consider hw_compat_6_0
- * hw/arm/bcm283x: Correct the license text ('and' vs 'or')
+ * hw/arm: add quanta-gbs-bmc machine
  * hw/intc/armv7m_nvic: Remove stale comment
  * arm, acpi: Remove dependency on presence of 'virt' board
  * target/arm: Fix mte page crossing test
  * hw/arm: quanta-q71l add pca954x muxes
  * target/arm: First few parts of MVE support
 ----------------------------------------------------------------
-Cédric Le Goater (1):
+Heinrich Schuchardt (1):
-      aspeed/smc: Fix DMA support for AST2600
+      hw: virt: consider hw_compat_6_0
-Philippe Mathieu-Daudé (1):
+Jean-Philippe Brucker (1):
-      hw/arm/bcm283x: Correct the license text
+      hw/intc/arm_gicv3_cpuif: Tolerate spurious EOIR writes
-Richard Henderson (3):
+Patrick Venture (5):
-      target/arm: Rearrange disabled check for watchpoints
+      hw/arm: add quanta-gbs-bmc machine
-      target/arm: Assert immh != 0 in disas_simd_shift_imm
+      hw/arm: quanta-gbs-bmc add i2c comments
-      target/arm: Move computation of index in handle_simd_dupe
+      hw/arm: gsj add i2c comments
       hw/arm: gsj add pca9548
       hw/arm: quanta-q71l add pca954x muxes
- include/hw/arm/bcm2835_peripherals.h |  3 ++-
+Peter Maydell (17):
- include/hw/arm/bcm2836.h             |  3 ++-
+      hw/intc/armv7m_nvic: Remove stale comment
- include/hw/char/bcm2835_aux.h        |  3 ++-
+      hw/acpi: Provide stub version of acpi_ghes_record_errors()
- include/hw/display/bcm2835_fb.h      |  3 ++-
+      hw/acpi: Provide function acpi_ghes_present()
- include/hw/dma/bcm2835_dma.h         |  4 +++-
+      target/arm: Use acpi_ghes_present() to see if we report ACPI memory errors
- include/hw/intc/bcm2835_ic.h         |  4 +++-
+      target/arm: Provide and use H8 and H1_8 macros
- include/hw/intc/bcm2836_control.h    |  3 ++-
+      target/arm: Enable FPSCR.QC bit for MVE
- include/hw/misc/bcm2835_mbox.h       |  4 +++-
+      target/arm: Handle VPR semantics in existing code
- include/hw/misc/bcm2835_mbox_defs.h  |  4 +++-
+      target/arm: Add handling for PSR.ECI/ICI
- include/hw/misc/bcm2835_property.h   |  4 +++-
+      target/arm: Let vfp_access_check() handle late NOCP checks
- hw/arm/aspeed_ast2600.c              |  6 ++++++
+      target/arm: Implement MVE LCTP
- hw/arm/bcm2835_peripherals.c         |  3 ++-
+      target/arm: Implement MVE WLSTP insn
- hw/arm/bcm2836.c                     |  3 ++-
+      target/arm: Implement MVE DLSTP
- hw/arm/raspi.c                       |  3 ++-
+      target/arm: Implement MVE LETP insn
- hw/display/bcm2835_fb.c              |  1 -
+      target/arm: Add framework for MVE decode
- hw/dma/bcm2835_dma.c                 |  4 +++-
+      target/arm: Move expand_pred_b() data to vec_helper.c
- hw/intc/bcm2835_ic.c                 |  4 ++--
+      bitops.h: Provide hswap32(), hswap64(), wswap64() swapping operations
- hw/intc/bcm2836_control.c            |  4 +++-
+      include/qemu/int128.h: Add function to create Int128 from int64_t
  hw/misc/bcm2835_mbox.c               |  4 +++-
  hw/misc/bcm2835_property.c           |  4 +++-
  hw/ssi/aspeed_smc.c                  | 15 +++++++++++++--
  target/arm/helper.c                  | 11 ++++++-----
  target/arm/translate-a64.c           |  6 +++++-
  hw/ssi/trace-events                  |  1 +
 files changed, 76 insertions(+), 28 deletions(-)
+Richard Henderson (4):
+      target/arm: Diagnose UNALLOCATED in disas_simd_two_reg_misc_fp16
+      target/arm: Remove fprintf from disas_simd_mod_imm
+      target/arm: Diagnose UNALLOCATED in disas_simd_three_reg_same_fp16
+      target/arm: Fix mte page crossing test
+ include/hw/acpi/ghes.h            |   9 +
+ include/qemu/bitops.h             |  29 +++
+ include/qemu/int128.h             |  10 +
+ target/arm/translate-a32.h        |   2 +
+ target/arm/translate.h            |   9 +
+ target/arm/vec_internal.h         |   9 +
+ target/arm/mve.decode             |  20 ++
+ target/arm/t32.decode             |  15 +-
+ hw/acpi/ghes-stub.c               |  22 +++
+ hw/acpi/ghes.c                    |  17 ++
+ hw/arm/aspeed.c                   |  11 +-
+ hw/arm/npcm7xx_boards.c           | 107 ++++++++++-
+ hw/arm/virt.c                     |   2 +
+ hw/intc/arm_gicv3_cpuif.c         |   5 +-
+ hw/intc/armv7m_nvic.c             |   6 -
+ target/arm/kvm64.c                |   6 +-
+ target/arm/m_helper.c             |  54 +++++-
+ target/arm/mte_helper.c           |   2 +-
+ target/arm/sve_helper.c           | 381 +++++++++++++-------------------------
+ target/arm/translate-a64.c        |  87 +++++----
+ target/arm/translate-m-nocp.c     |  16 +-
+ target/arm/translate-mve.c        |  29 +++
+ target/arm/translate-vfp.c        |  65 +++++--
+ target/arm/translate.c            | 300 ++++++++++++++++++++++++++++--
+ target/arm/vec_helper.c           | 116 +++++++++++-
+ target/arm/vfp_helper.c           |   3 +-
+ tests/tcg/aarch64/mte-7.c         |  31 ++++
+ hw/acpi/meson.build               |   6 +-
+ hw/arm/Kconfig                    |   2 +
+ target/arm/meson.build            |   2 +
+ tests/tcg/aarch64/Makefile.target |   2 +-
+files changed, 1019 insertions(+), 356 deletions(-)
+ create mode 100644 target/arm/mve.decode
+ create mode 100644 hw/acpi/ghes-stub.c
+ create mode 100644 target/arm/translate-mve.c
+ create mode 100644 tests/tcg/aarch64/mte-7.c

-[PULL 1/5] hw/arm/bcm283x: Correct the license text
+[PULL 01/28] hw/intc/arm_gicv3_cpuif: Tolerate spurious EOIR writes
-From: Philippe Mathieu-Daudé <philmd@redhat.com>
+From: Jean-Philippe Brucker <jean-philippe@linaro.org>
-The license is the 'GNU General Public License v2.0 or later',
+Commit 382c7160d1cd ("hw/intc/arm_gicv3_cpuif: Fix EOIR write access
-not 'and':
+check logic") added an assert_not_reached() if the guest writes the EOIR
 register while no interrupt is active.
-  This program is free software; you can redistribute it and/or
+It turns out some software does this: EDK2, in
-  modify it under the terms of the GNU General Public License as
+GicV3ExitBootServicesEvent(), unconditionally writes EOIR for all
-  published by the Free Software Foundation; either version 2 of
+interrupts that it manages. This now causes QEMU to abort when running
-  the License, or (at your option) any later version.
+UEFI on a VM with GICv3. Although it is UNPREDICTABLE behavior and EDK2
 does need fixing, the punishment seems a little harsh, especially since
 icc_eoir_write() already tolerates writes of nonexistent interrupt
 numbers. Display a guest error and tolerate spurious EOIR writes.
-Fix the license comment.
+Fixes: 382c7160d1cd ("hw/intc/arm_gicv3_cpuif: Fix EOIR write access check logic")
+Signed-off-by: Jean-Philippe Brucker <jean-philippe@linaro.org>
-Signed-off-by: Philippe Mathieu-Daudé <philmd@redhat.com>
+Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
-Message-id: 20200312213455.15854-1-philmd@redhat.com
+Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
 Tested-by: Alex Bennée <alex.bennee@linaro.org>
 Message-id: 20210604130352.1887560-1-jean-philippe@linaro.org
 Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
 Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
 ---
- include/hw/arm/bcm2835_peripherals.h | 3 ++-
+ hw/intc/arm_gicv3_cpuif.c | 5 ++++-
- include/hw/arm/bcm2836.h             | 3 ++-
+file changed, 4 insertions(+), 1 deletion(-)
  include/hw/char/bcm2835_aux.h        | 3 ++-
  include/hw/display/bcm2835_fb.h      | 3 ++-
  include/hw/dma/bcm2835_dma.h         | 4 +++-
  include/hw/intc/bcm2835_ic.h         | 4 +++-
  include/hw/intc/bcm2836_control.h    | 3 ++-
  include/hw/misc/bcm2835_mbox.h       | 4 +++-
  include/hw/misc/bcm2835_mbox_defs.h  | 4 +++-
  include/hw/misc/bcm2835_property.h   | 4 +++-
  hw/arm/bcm2835_peripherals.c         | 3 ++-
  hw/arm/bcm2836.c                     | 3 ++-
  hw/arm/raspi.c                       | 3 ++-
  hw/display/bcm2835_fb.c              | 1 -
  hw/dma/bcm2835_dma.c                 | 4 +++-
  hw/intc/bcm2835_ic.c                 | 4 ++--
  hw/intc/bcm2836_control.c            | 4 +++-
  hw/misc/bcm2835_mbox.c               | 4 +++-
  hw/misc/bcm2835_property.c           | 4 +++-
 files changed, 45 insertions(+), 20 deletions(-)
-diff --git a/include/hw/arm/bcm2835_peripherals.h b/include/hw/arm/bcm2835_peripherals.h
+diff --git a/hw/intc/arm_gicv3_cpuif.c b/hw/intc/arm_gicv3_cpuif.c
 index XXXXXXX..XXXXXXX 100644
---- a/include/hw/arm/bcm2835_peripherals.h
+--- a/hw/intc/arm_gicv3_cpuif.c
-+++ b/include/hw/arm/bcm2835_peripherals.h
++++ b/hw/intc/arm_gicv3_cpuif.c
 @@ -XXX,XX +XXX,XX @@
-  * Rasperry Pi 2 emulation and refactoring Copyright (c) 2015, Microsoft
-  * Written by Andrew Baumann
-  *
-- * This code is licensed under the GNU GPLv2 and later.
-+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
-+ * See the COPYING file in the top-level directory.
-  */
- #ifndef BCM2835_PERIPHERALS_H
-diff --git a/include/hw/arm/bcm2836.h b/include/hw/arm/bcm2836.h
-index XXXXXXX..XXXXXXX 100644
---- a/include/hw/arm/bcm2836.h
-+++ b/include/hw/arm/bcm2836.h
-@@ -XXX,XX +XXX,XX @@
-  * Rasperry Pi 2 emulation and refactoring Copyright (c) 2015, Microsoft
-  * Written by Andrew Baumann
-  *
-- * This code is licensed under the GNU GPLv2 and later.
-+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
-+ * See the COPYING file in the top-level directory.
-  */
- #ifndef BCM2836_H
-diff --git a/include/hw/char/bcm2835_aux.h b/include/hw/char/bcm2835_aux.h
-index XXXXXXX..XXXXXXX 100644
---- a/include/hw/char/bcm2835_aux.h
-+++ b/include/hw/char/bcm2835_aux.h
-@@ -XXX,XX +XXX,XX @@
-  * Rasperry Pi 2 emulation and refactoring Copyright (c) 2015, Microsoft
-  * Written by Andrew Baumann
-  *
-- * This code is licensed under the GNU GPLv2 and later.
-+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
-+ * See the COPYING file in the top-level directory.
-  */
- #ifndef BCM2835_AUX_H
-diff --git a/include/hw/display/bcm2835_fb.h b/include/hw/display/bcm2835_fb.h
-index XXXXXXX..XXXXXXX 100644
---- a/include/hw/display/bcm2835_fb.h
-+++ b/include/hw/display/bcm2835_fb.h
-@@ -XXX,XX +XXX,XX @@
-  * Rasperry Pi 2 emulation and refactoring Copyright (c) 2015, Microsoft
-  * Written by Andrew Baumann
-  *
-- * This code is licensed under the GNU GPLv2 and later.
-+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
-+ * See the COPYING file in the top-level directory.
-  */
- #ifndef BCM2835_FB_H
-diff --git a/include/hw/dma/bcm2835_dma.h b/include/hw/dma/bcm2835_dma.h
-index XXXXXXX..XXXXXXX 100644
---- a/include/hw/dma/bcm2835_dma.h
-+++ b/include/hw/dma/bcm2835_dma.h
-@@ -XXX,XX +XXX,XX @@
- /*
-  * Raspberry Pi emulation (c) 2012 Gregory Estrade
-- * This code is licensed under the GNU GPLv2 and later.
-+ *
-+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
-+ * See the COPYING file in the top-level directory.
-  */
- #ifndef BCM2835_DMA_H
-diff --git a/include/hw/intc/bcm2835_ic.h b/include/hw/intc/bcm2835_ic.h
-index XXXXXXX..XXXXXXX 100644
---- a/include/hw/intc/bcm2835_ic.h
-+++ b/include/hw/intc/bcm2835_ic.h
-@@ -XXX,XX +XXX,XX @@
- /*
-  * Raspberry Pi emulation (c) 2012 Gregory Estrade
-- * This code is licensed under the GNU GPLv2 and later.
-+ *
-+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
-+ * See the COPYING file in the top-level directory.
-  */
- #ifndef BCM2835_IC_H
-diff --git a/include/hw/intc/bcm2836_control.h b/include/hw/intc/bcm2836_control.h
-index XXXXXXX..XXXXXXX 100644
---- a/include/hw/intc/bcm2836_control.h
-+++ b/include/hw/intc/bcm2836_control.h
-@@ -XXX,XX +XXX,XX @@
-  * ARM Local Timer IRQ Copyright (c) 2019. Zoltán Baldaszti
-  * Added basic IRQ_TIMER interrupt support
-  *
-- * This code is licensed under the GNU GPLv2 and later.
-+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
-+ * See the COPYING file in the top-level directory.
-  */
- #ifndef BCM2836_CONTROL_H
-diff --git a/include/hw/misc/bcm2835_mbox.h b/include/hw/misc/bcm2835_mbox.h
-index XXXXXXX..XXXXXXX 100644
---- a/include/hw/misc/bcm2835_mbox.h
-+++ b/include/hw/misc/bcm2835_mbox.h
-@@ -XXX,XX +XXX,XX @@
- /*
-  * Raspberry Pi emulation (c) 2012 Gregory Estrade
-- * This code is licensed under the GNU GPLv2 and later.
-+ *
-+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
-+ * See the COPYING file in the top-level directory.
-  */
- #ifndef BCM2835_MBOX_H
-diff --git a/include/hw/misc/bcm2835_mbox_defs.h b/include/hw/misc/bcm2835_mbox_defs.h
-index XXXXXXX..XXXXXXX 100644
---- a/include/hw/misc/bcm2835_mbox_defs.h
-+++ b/include/hw/misc/bcm2835_mbox_defs.h
-@@ -XXX,XX +XXX,XX @@
- /*
-  * Raspberry Pi emulation (c) 2012 Gregory Estrade
-- * This code is licensed under the GNU GPLv2 and later.
-+ *
-+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
-+ * See the COPYING file in the top-level directory.
-  */
- #ifndef BCM2835_MBOX_DEFS_H
-diff --git a/include/hw/misc/bcm2835_property.h b/include/hw/misc/bcm2835_property.h
-index XXXXXXX..XXXXXXX 100644
---- a/include/hw/misc/bcm2835_property.h
-+++ b/include/hw/misc/bcm2835_property.h
-@@ -XXX,XX +XXX,XX @@
- /*
-  * Raspberry Pi emulation (c) 2012 Gregory Estrade
-- * This code is licensed under the GNU GPLv2 and later.
-+ *
-+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
-+ * See the COPYING file in the top-level directory.
-  */
- #ifndef BCM2835_PROPERTY_H
-diff --git a/hw/arm/bcm2835_peripherals.c b/hw/arm/bcm2835_peripherals.c
-index XXXXXXX..XXXXXXX 100644
---- a/hw/arm/bcm2835_peripherals.c
-+++ b/hw/arm/bcm2835_peripherals.c
-@@ -XXX,XX +XXX,XX @@
-  * Rasperry Pi 2 emulation and refactoring Copyright (c) 2015, Microsoft
-  * Written by Andrew Baumann
-  *
-- * This code is licensed under the GNU GPLv2 and later.
-+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
-+ * See the COPYING file in the top-level directory.
-  */
  #include "qemu/osdep.h"
-diff --git a/hw/arm/bcm2836.c b/hw/arm/bcm2836.c
+ #include "qemu/bitops.h"
-index XXXXXXX..XXXXXXX 100644
++#include "qemu/log.h"
---- a/hw/arm/bcm2836.c
+ #include "qemu/main-loop.h"
-+++ b/hw/arm/bcm2836.c
+ #include "trace.h"
-@@ -XXX,XX +XXX,XX @@
+ #include "gicv3_internal.h"
-  * Rasperry Pi 2 emulation and refactoring Copyright (c) 2015, Microsoft
+@@ -XXX,XX +XXX,XX @@ static void icc_eoir_write(CPUARMState *env, const ARMCPRegInfo *ri,
-  * Written by Andrew Baumann
+         }
-  *
+         break;
-- * This code is licensed under the GNU GPLv2 and later.
+     default:
-+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+-        g_assert_not_reached();
-+ * See the COPYING file in the top-level directory.
++        qemu_log_mask(LOG_GUEST_ERROR,
-  */
++                      "%s: IRQ %d isn't active\n", __func__, irq);
++        return;
- #include "qemu/osdep.h"
+     }
-diff --git a/hw/arm/raspi.c b/hw/arm/raspi.c
-index XXXXXXX..XXXXXXX 100644
+     icc_drop_prio(cs, grp);
 --- a/hw/arm/raspi.c
 +++ b/hw/arm/raspi.c
@@ -XXX,XX +XXX,XX @@
   * Raspberry Pi 3 emulation Copyright (c) 2018 Zoltán Baldaszti
   * Upstream code cleanup (c) 2018 Pekka Enberg
   *
 - * This code is licensed under the GNU GPLv2 and later.
 + * This work is licensed under the terms of the GNU GPL, version 2 or later.
 + * See the COPYING file in the top-level directory.
   */
  #include "qemu/osdep.h"
 diff --git a/hw/display/bcm2835_fb.c b/hw/display/bcm2835_fb.c
 index XXXXXXX..XXXXXXX 100644
 --- a/hw/display/bcm2835_fb.c
 +++ b/hw/display/bcm2835_fb.c
@@ -XXX,XX +XXX,XX @@
  /*
   * Raspberry Pi emulation (c) 2012 Gregory Estrade
   * Refactoring for Pi2 Copyright (c) 2015, Microsoft. Written by Andrew Baumann.
 - * This code is licensed under the GNU GPLv2 and later.
   *
   * Heavily based on milkymist-vgafb.c, copyright terms below:
   *  QEMU model of the Milkymist VGA framebuffer.
 diff --git a/hw/dma/bcm2835_dma.c b/hw/dma/bcm2835_dma.c
 index XXXXXXX..XXXXXXX 100644
 --- a/hw/dma/bcm2835_dma.c
 +++ b/hw/dma/bcm2835_dma.c
@@ -XXX,XX +XXX,XX @@
  /*
   * Raspberry Pi emulation (c) 2012 Gregory Estrade
 - * This code is licensed under the GNU GPLv2 and later.
 + *
 + * This work is licensed under the terms of the GNU GPL, version 2 or later.
 + * See the COPYING file in the top-level directory.
   */
  #include "qemu/osdep.h"
 diff --git a/hw/intc/bcm2835_ic.c b/hw/intc/bcm2835_ic.c
 index XXXXXXX..XXXXXXX 100644
 --- a/hw/intc/bcm2835_ic.c
 +++ b/hw/intc/bcm2835_ic.c
@@ -XXX,XX +XXX,XX @@
  /*
   * Raspberry Pi emulation (c) 2012 Gregory Estrade
   * Refactoring for Pi2 Copyright (c) 2015, Microsoft. Written by Andrew Baumann.
 - * This code is licensed under the GNU GPLv2 and later.
   * Heavily based on pl190.c, copyright terms below:
   *
   * Arm PrimeCell PL190 Vector Interrupt Controller
@@ -XXX,XX +XXX,XX @@
   * Copyright (c) 2006 CodeSourcery.
   * Written by Paul Brook
   *
 - * This code is licensed under the GPL.
 + * This work is licensed under the terms of the GNU GPL, version 2 or later.
 + * See the COPYING file in the top-level directory.
   */
  #include "qemu/osdep.h"
 diff --git a/hw/intc/bcm2836_control.c b/hw/intc/bcm2836_control.c
 index XXXXXXX..XXXXXXX 100644
 --- a/hw/intc/bcm2836_control.c
 +++ b/hw/intc/bcm2836_control.c
@@ -XXX,XX +XXX,XX @@
   * Written by Andrew Baumann
   *
   * Based on bcm2835_ic.c (Raspberry Pi emulation) (c) 2012 Gregory Estrade
 - * This code is licensed under the GNU GPLv2 and later.
   *
   * At present, only implements interrupt routing, and mailboxes (i.e.,
   * not PMU interrupt, or AXI counters).
@@ -XXX,XX +XXX,XX @@
   *
   * Ref:
   * https://www.raspberrypi.org/documentation/hardware/raspberrypi/bcm2836/QA7_rev3.4.pdf
 + *
 + * This work is licensed under the terms of the GNU GPL, version 2 or later.
 + * See the COPYING file in the top-level directory.
   */
  #include "qemu/osdep.h"
 diff --git a/hw/misc/bcm2835_mbox.c b/hw/misc/bcm2835_mbox.c
 index XXXXXXX..XXXXXXX 100644
 --- a/hw/misc/bcm2835_mbox.c
 +++ b/hw/misc/bcm2835_mbox.c
@@ -XXX,XX +XXX,XX @@
  /*
   * Raspberry Pi emulation (c) 2012 Gregory Estrade
 - * This code is licensed under the GNU GPLv2 and later.
   *
   * This file models the system mailboxes, which are used for
   * communication with low-bandwidth GPU peripherals. Refs:
   *   https://github.com/raspberrypi/firmware/wiki/Mailboxes
   *   https://github.com/raspberrypi/firmware/wiki/Accessing-mailboxes
 + *
 + * This work is licensed under the terms of the GNU GPL, version 2 or later.
 + * See the COPYING file in the top-level directory.
   */
  #include "qemu/osdep.h"
 diff --git a/hw/misc/bcm2835_property.c b/hw/misc/bcm2835_property.c
 index XXXXXXX..XXXXXXX 100644
 --- a/hw/misc/bcm2835_property.c
 +++ b/hw/misc/bcm2835_property.c
@@ -XXX,XX +XXX,XX @@
  /*
   * Raspberry Pi emulation (c) 2012 Gregory Estrade
 - * This code is licensed under the GNU GPLv2 and later.
 + *
 + * This work is licensed under the terms of the GNU GPL, version 2 or later.
 + * See the COPYING file in the top-level directory.
   */
  #include "qemu/osdep.h"
 --
 .20.1

-New patch
+[PULL 02/28] target/arm: Diagnose UNALLOCATED in disas_simd_two_reg_misc_fp16
+From: Richard Henderson <richard.henderson@linaro.org>
+This fprintf+assert has been in place since the beginning.
+It is prior to the fp_access_check, so we're still good to
+raise sigill here.
+Resolves: https://gitlab.com/qemu-project/qemu/-/issues/381
+Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
+Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
+Message-id: 20210604183506.916654-2-richard.henderson@linaro.org
+Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
+Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
+---
+ target/arm/translate-a64.c | 4 ++--
+file changed, 2 insertions(+), 2 deletions(-)
+diff --git a/target/arm/translate-a64.c b/target/arm/translate-a64.c
+index XXXXXXX..XXXXXXX 100644
+--- a/target/arm/translate-a64.c
++++ b/target/arm/translate-a64.c
+@@ -XXX,XX +XXX,XX @@ static void disas_simd_two_reg_misc_fp16(DisasContext *s, uint32_t insn)
+     case 0x7f: /* FSQRT (vector) */
+         break;
+     default:
+-        fprintf(stderr, "%s: insn 0x%04x fpop 0x%2x\n", __func__, insn, fpop);
+-        g_assert_not_reached();
++        unallocated_encoding(s);
++        return;
+     }
+--
+.20.1

-[PULL 4/5] target/arm: Assert immh != 0 in disas_simd_shift_imm
+[PULL 03/28] target/arm: Remove fprintf from disas_simd_mod_imm
 From: Richard Henderson <richard.henderson@linaro.org>
-Coverity raised a shed-load of errors cascading from inferring
+The default of this switch is truly unreachable.
-that clz32(immh) might yield 32, from immh might be 0.
+The switch selector is 3 bits, and all 8 cases are present.
-While immh cannot be 0 from encoding, it is not obvious even to
-a human how we've checked that: via the filtering provided by
-data_proc_simd[].
-Reported-by: Coverity (CID 1421923, and more)
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
-Reviewed-by: Philippe Mathieu-Daudé <philmd@redhat.com>
+Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
-Message-id: 20200320160622.8040-3-richard.henderson@linaro.org
+Message-id: 20210604183506.916654-3-richard.henderson@linaro.org
 Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
 Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
 ---
- target/arm/translate-a64.c | 3 +++
+ target/arm/translate-a64.c | 1 -
-file changed, 3 insertions(+)
+file changed, 1 deletion(-)
 diff --git a/target/arm/translate-a64.c b/target/arm/translate-a64.c
 index XXXXXXX..XXXXXXX 100644
 --- a/target/arm/translate-a64.c
 +++ b/target/arm/translate-a64.c
-@@ -XXX,XX +XXX,XX @@ static void disas_simd_shift_imm(DisasContext *s, uint32_t insn)
+@@ -XXX,XX +XXX,XX @@ static void disas_simd_mod_imm(DisasContext *s, uint32_t insn)
-     bool is_u = extract32(insn, 29, 1);
+         }
-     bool is_q = extract32(insn, 30, 1);
+         break;
+     default:
-+    /* data_proc_simd[] has sent immh == 0 to disas_simd_mod_imm. */
+-        fprintf(stderr, "%s: cmode_3_1: %x\n", __func__, cmode_3_1);
-+    assert(immh != 0);
+         g_assert_not_reached();
-+
+     }
-     switch (opcode) {
      case 0x08: /* SRI */
          if (!is_u) {
 --
 .20.1

-[PULL 5/5] target/arm: Move computation of index in handle_simd_dupe
+[PULL 04/28] target/arm: Diagnose UNALLOCATED in disas_simd_three_reg_same_fp16
 From: Richard Henderson <richard.henderson@linaro.org>
-Coverity reports a BAD_SHIFT with ctz32(imm5), with imm5 == 0.
+This fprintf+assert has been in place since the beginning.
-This is an invalid encoding, but we diagnose that just below
+It is after the fp_access_check, so we need to move the
-by rejecting size > 3.  Avoid the warning by sinking the
+check up.  Fold that in to the pairwise filter.
 computation of index below the check.
-Reported-by: Coverity (CID 1421965)
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
-Reviewed-by: Philippe Mathieu-Daudé <philmd@redhat.com>
+Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
-Message-id: 20200320160622.8040-4-richard.henderson@linaro.org
+Message-id: 20210604183506.916654-4-richard.henderson@linaro.org
 Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
 Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
 ---
- target/arm/translate-a64.c | 3 ++-
+ target/arm/translate-a64.c | 82 +++++++++++++++++++++++---------------
-file changed, 2 insertions(+), 1 deletion(-)
+file changed, 50 insertions(+), 32 deletions(-)
 diff --git a/target/arm/translate-a64.c b/target/arm/translate-a64.c
 index XXXXXXX..XXXXXXX 100644
 --- a/target/arm/translate-a64.c
 +++ b/target/arm/translate-a64.c
-@@ -XXX,XX +XXX,XX @@ static void handle_simd_dupe(DisasContext *s, int is_q, int rd, int rn,
+@@ -XXX,XX +XXX,XX @@ static void disas_simd_three_reg_same(DisasContext *s, uint32_t insn)
-                              int imm5)
+  */
  static void disas_simd_three_reg_same_fp16(DisasContext *s, uint32_t insn)
  {
-     int size = ctz32(imm5);
+-    int opcode, fpopcode;
--    int index = imm5 >> (size + 1);
+-    int is_q, u, a, rm, rn, rd;
-+    int index;
+-    int datasize, elements;
+-    int pass;
-     if (size > 3 || (size == 3 && !is_q)) {
++    int opcode = extract32(insn, 11, 3);
 +    int u = extract32(insn, 29, 1);
 +    int a = extract32(insn, 23, 1);
 +    int is_q = extract32(insn, 30, 1);
 +    int rm = extract32(insn, 16, 5);
 +    int rn = extract32(insn, 5, 5);
 +    int rd = extract32(insn, 0, 5);
 +    /*
 +     * For these floating point ops, the U, a and opcode bits
 +     * together indicate the operation.
 +     */
 +    int fpopcode = opcode | (a << 3) | (u << 4);
 +    int datasize = is_q ? 128 : 64;
 +    int elements = datasize / 16;
 +    bool pairwise;
      TCGv_ptr fpst;
 -    bool pairwise = false;
 +    int pass;
 +
 +    switch (fpopcode) {
 +    case 0x0: /* FMAXNM */
 +    case 0x1: /* FMLA */
 +    case 0x2: /* FADD */
 +    case 0x3: /* FMULX */
 +    case 0x4: /* FCMEQ */
 +    case 0x6: /* FMAX */
 +    case 0x7: /* FRECPS */
 +    case 0x8: /* FMINNM */
 +    case 0x9: /* FMLS */
 +    case 0xa: /* FSUB */
 +    case 0xe: /* FMIN */
 +    case 0xf: /* FRSQRTS */
 +    case 0x13: /* FMUL */
 +    case 0x14: /* FCMGE */
 +    case 0x15: /* FACGE */
 +    case 0x17: /* FDIV */
 +    case 0x1a: /* FABD */
 +    case 0x1c: /* FCMGT */
 +    case 0x1d: /* FACGT */
 +        pairwise = false;
 +        break;
 +    case 0x10: /* FMAXNMP */
 +    case 0x12: /* FADDP */
 +    case 0x16: /* FMAXP */
 +    case 0x18: /* FMINNMP */
 +    case 0x1e: /* FMINP */
 +        pairwise = true;
 +        break;
 +    default:
 +        unallocated_encoding(s);
 +        return;
 +    }
      if (!dc_isar_feature(aa64_fp16, s)) {
          unallocated_encoding(s);
-@@ -XXX,XX +XXX,XX @@ static void handle_simd_dupe(DisasContext *s, int is_q, int rd, int rn,
+@@ -XXX,XX +XXX,XX @@ static void disas_simd_three_reg_same_fp16(DisasContext *s, uint32_t insn)
          return;
      }
-+    index = imm5 >> (size + 1);
+-    /* For these floating point ops, the U, a and opcode bits
-     tcg_gen_gvec_dup_mem(size, vec_full_reg_offset(s, rd),
+-     * together indicate the operation.
-                          vec_reg_offset(s, rn, index, size),
+-     */
-                          is_q ? 16 : 8, vec_full_reg_size(s));
+-    opcode = extract32(insn, 11, 3);
 -    u = extract32(insn, 29, 1);
 -    a = extract32(insn, 23, 1);
 -    is_q = extract32(insn, 30, 1);
 -    rm = extract32(insn, 16, 5);
 -    rn = extract32(insn, 5, 5);
 -    rd = extract32(insn, 0, 5);
 -
 -    fpopcode = opcode | (a << 3) |  (u << 4);
 -    datasize = is_q ? 128 : 64;
 -    elements = datasize / 16;
 -
 -    switch (fpopcode) {
 -    case 0x10: /* FMAXNMP */
 -    case 0x12: /* FADDP */
 -    case 0x16: /* FMAXP */
 -    case 0x18: /* FMINNMP */
 -    case 0x1e: /* FMINP */
 -        pairwise = true;
 -        break;
 -    }
 -
      fpst = fpstatus_ptr(FPST_FPCR_F16);
      if (pairwise) {
@@ -XXX,XX +XXX,XX @@ static void disas_simd_three_reg_same_fp16(DisasContext *s, uint32_t insn)
                  gen_helper_advsimd_acgt_f16(tcg_res, tcg_op1, tcg_op2, fpst);
                  break;
              default:
 -                fprintf(stderr, "%s: insn 0x%04x, fpop 0x%2x @ 0x%" PRIx64 "\n",
 -                        __func__, insn, fpopcode, s->pc_curr);
                  g_assert_not_reached();
              }
 --
 .20.1

-New patch
+[PULL 05/28] hw: virt: consider hw_compat_6_0
+From: Heinrich Schuchardt <xypron.glpk@gmx.de>
+virt-6.0 must consider hw_compat_6_0.
+Fixes: da7e13c00b59 ("hw: add compat machines for 6.1")
+Signed-off-by: Heinrich Schuchardt <xypron.glpk@gmx.de>
+Reviewed-by: Cornelia Huck <cohuck@redhat.com>
+Message-id: 20210610183500.54207-1-xypron.glpk@gmx.de
+Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
+---
+ hw/arm/virt.c | 2 ++
+file changed, 2 insertions(+)
+diff --git a/hw/arm/virt.c b/hw/arm/virt.c
+index XXXXXXX..XXXXXXX 100644
+--- a/hw/arm/virt.c
++++ b/hw/arm/virt.c
+@@ -XXX,XX +XXX,XX @@ DEFINE_VIRT_MACHINE_AS_LATEST(6, 1)
+ static void virt_machine_6_0_options(MachineClass *mc)
+ {
++    virt_machine_6_1_options(mc);
++    compat_props_add(mc->compat_props, hw_compat_6_0, hw_compat_6_0_len);
+ }
+ DEFINE_VIRT_MACHINE(6, 0)
+--
+.20.1

-New patch
+[PULL 06/28] hw/arm: add quanta-gbs-bmc machine
+From: Patrick Venture <venture@google.com>
+Adds initial quanta-gbs-bmc machine support.
+Tested: Boots to userspace.
+Signed-off-by: Patrick Venture <venture@google.com>
+Reviewed-by: Brandon Kim <brandonkim@google.com>
+Reviewed-by: Hao Wu <wuhaotsh@google.com>
+Message-id: 20210608193605.2611114-2-venture@google.com
+Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
+Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
+---
+ hw/arm/npcm7xx_boards.c | 33 +++++++++++++++++++++++++++++++++
+file changed, 33 insertions(+)
+diff --git a/hw/arm/npcm7xx_boards.c b/hw/arm/npcm7xx_boards.c
+index XXXXXXX..XXXXXXX 100644
+--- a/hw/arm/npcm7xx_boards.c
++++ b/hw/arm/npcm7xx_boards.c
+@@ -XXX,XX +XXX,XX @@
+ #define NPCM750_EVB_POWER_ON_STRAPS 0x00001ff7
+ #define QUANTA_GSJ_POWER_ON_STRAPS 0x00001fff
++#define QUANTA_GBS_POWER_ON_STRAPS 0x000017ff
+ static const char npcm7xx_default_bootrom[] = "npcm7xx_bootrom.bin";
+@@ -XXX,XX +XXX,XX @@ static void quanta_gsj_init(MachineState *machine)
+     npcm7xx_load_kernel(machine, soc);
+ }
++static void quanta_gbs_init(MachineState *machine)
++{
++    NPCM7xxState *soc;
++
++    soc = npcm7xx_create_soc(machine, QUANTA_GBS_POWER_ON_STRAPS);
++    npcm7xx_connect_dram(soc, machine->ram);
++    qdev_realize(DEVICE(soc), NULL, &error_fatal);
++
++    npcm7xx_load_bootrom(machine, soc);
++
++    npcm7xx_connect_flash(&soc->fiu[0], 0, "mx66u51235f",
++                          drive_get(IF_MTD, 0, 0));
++
++    npcm7xx_load_kernel(machine, soc);
++}
++
+ static void npcm7xx_set_soc_type(NPCM7xxMachineClass *nmc, const char *type)
+ {
+     NPCM7xxClass *sc = NPCM7XX_CLASS(object_class_by_name(type));
+@@ -XXX,XX +XXX,XX @@ static void gsj_machine_class_init(ObjectClass *oc, void *data)
+     mc->default_ram_size = 512 * MiB;
+ };
++static void gbs_bmc_machine_class_init(ObjectClass *oc, void *data)
++{
++    NPCM7xxMachineClass *nmc = NPCM7XX_MACHINE_CLASS(oc);
++    MachineClass *mc = MACHINE_CLASS(oc);
++
++    npcm7xx_set_soc_type(nmc, TYPE_NPCM730);
++
++    mc->desc = "Quanta GBS (Cortex-A9)";
++    mc->init = quanta_gbs_init;
++    mc->default_ram_size = 1 * GiB;
++}
++
+ static const TypeInfo npcm7xx_machine_types[] = {
+     {
+         .name           = TYPE_NPCM7XX_MACHINE,
+@@ -XXX,XX +XXX,XX @@ static const TypeInfo npcm7xx_machine_types[] = {
+         .name           = MACHINE_TYPE_NAME("quanta-gsj"),
+         .parent         = TYPE_NPCM7XX_MACHINE,
+         .class_init     = gsj_machine_class_init,
++    }, {
++        .name           = MACHINE_TYPE_NAME("quanta-gbs-bmc"),
++        .parent         = TYPE_NPCM7XX_MACHINE,
++        .class_init     = gbs_bmc_machine_class_init,
+     },
+ };
+--
+.20.1

-New patch
+[PULL 07/28] hw/arm: quanta-gbs-bmc add i2c comments
+From: Patrick Venture <venture@google.com>
+Add a comment and i2c method that describes the board layout.
+Tested: firmware booted to userspace.
+Signed-off-by: Patrick Venture <venture@google.com>
+Reviewed-by: Brandon Kim <brandonkim@google.com>
+Reviewed-by: Hao Wu <wuhaotsh@google.com>
+Message-id: 20210608193605.2611114-3-venture@google.com
+Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
+---
+ hw/arm/npcm7xx_boards.c | 60 +++++++++++++++++++++++++++++++++++++++++
+file changed, 60 insertions(+)
+diff --git a/hw/arm/npcm7xx_boards.c b/hw/arm/npcm7xx_boards.c
+index XXXXXXX..XXXXXXX 100644
+--- a/hw/arm/npcm7xx_boards.c
++++ b/hw/arm/npcm7xx_boards.c
+@@ -XXX,XX +XXX,XX @@ static void quanta_gsj_fan_init(NPCM7xxMachine *machine, NPCM7xxState *soc)
+     npcm7xx_connect_pwm_fan(soc, &splitter[2], 0x05, 1);
+ }
++static void quanta_gbs_i2c_init(NPCM7xxState *soc)
++{
++    /*
++     * i2c-0:
++     *     pca9546@71
++     *
++     * i2c-1:
++     *     pca9535@24
++     *     pca9535@20
++     *     pca9535@21
++     *     pca9535@22
++     *     pca9535@23
++     *     pca9535@25
++     *     pca9535@26
++     *
++     * i2c-2:
++     *     sbtsi@4c
++     *
++     * i2c-5:
++     *     atmel,24c64@50 mb_fru
++     *     pca9546@71
++     *         - channel 0: max31725@54
++     *         - channel 1: max31725@55
++     *         - channel 2: max31725@5d
++     *                      atmel,24c64@51 fan_fru
++     *         - channel 3: atmel,24c64@52 hsbp_fru
++     *
++     * i2c-6:
++     *     pca9545@73
++     *
++     * i2c-7:
++     *     pca9545@72
++     *
++     * i2c-8:
++     *     adi,adm1272@10
++     *
++     * i2c-9:
++     *     pca9546@71
++     *         - channel 0: isil,isl68137@60
++     *         - channel 1: isil,isl68137@61
++     *         - channel 2: isil,isl68137@63
++     *         - channel 3: isil,isl68137@45
++     *
++     * i2c-10:
++     *     pca9545@71
++     *
++     * i2c-11:
++     *     pca9545@76
++     *
++     * i2c-12:
++     *     maxim,max34451@4e
++     *     isil,isl68137@5d
++     *     isil,isl68137@5e
++     *
++     * i2c-14:
++     *     pca9545@70
++     */
++}
++
+ static void npcm750_evb_init(MachineState *machine)
+ {
+     NPCM7xxState *soc;
+@@ -XXX,XX +XXX,XX @@ static void quanta_gbs_init(MachineState *machine)
+     npcm7xx_connect_flash(&soc->fiu[0], 0, "mx66u51235f",
+                           drive_get(IF_MTD, 0, 0));
++    quanta_gbs_i2c_init(soc);
+     npcm7xx_load_kernel(machine, soc);
+ }
+--
+.20.1

-New patch
+[PULL 08/28] hw/intc/armv7m_nvic: Remove stale comment
+In commit da6d674e509f0939b we split the NVIC code out from the GIC.
+This allowed us to specify the NVIC's default value for the num-irq
+property (64) in the usual way in its property list, and we deleted
+the previous hack where we updated the value in the state struct in
+the instance init function.  Remove a stale comment about that hack
+which we forgot to delete at that time.
+Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
+Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
+Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
+Message-id: 20210614161243.14211-1-peter.maydell@linaro.org
+---
+ hw/intc/armv7m_nvic.c | 6 ------
+file changed, 6 deletions(-)
+diff --git a/hw/intc/armv7m_nvic.c b/hw/intc/armv7m_nvic.c
+index XXXXXXX..XXXXXXX 100644
+--- a/hw/intc/armv7m_nvic.c
++++ b/hw/intc/armv7m_nvic.c
+@@ -XXX,XX +XXX,XX @@ static void armv7m_nvic_realize(DeviceState *dev, Error **errp)
+ static void armv7m_nvic_instance_init(Object *obj)
+ {
+-    /* We have a different default value for the num-irq property
+-     * than our superclass. This function runs after qdev init
+-     * has set the defaults from the Property array and before
+-     * any user-specified property setting, so just modify the
+-     * value in the GICState struct.
+-     */
+     DeviceState *dev = DEVICE(obj);
+     NVICState *nvic = NVIC(obj);
+     SysBusDevice *sbd = SYS_BUS_DEVICE(obj);
+--
+.20.1

-New patch
+[PULL 09/28] hw/acpi: Provide stub version of acpi_ghes_record_errors()
+Generic code in target/arm wants to call acpi_ghes_record_errors();
+provide a stub version so that we don't fail to link when
+CONFIG_ACPI_APEI is not set. This requires us to add a new
+ghes-stub.c file to contain it and the meson.build mechanics
+to use it when appropriate.
+Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
+Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
+Reviewed-by: Dongjiu Geng <gengdongjiu1@gmail.com>
+Message-id: 20210603171259.27962-2-peter.maydell@linaro.org
+---
+ hw/acpi/ghes-stub.c | 17 +++++++++++++++++
+ hw/acpi/meson.build |  6 +++---
+files changed, 20 insertions(+), 3 deletions(-)
+ create mode 100644 hw/acpi/ghes-stub.c
+diff --git a/hw/acpi/ghes-stub.c b/hw/acpi/ghes-stub.c
+new file mode 100644
+index XXXXXXX..XXXXXXX
+--- /dev/null
++++ b/hw/acpi/ghes-stub.c
+@@ -XXX,XX +XXX,XX @@
++/*
++ * Support for generating APEI tables and recording CPER for Guests:
++ * stub functions.
++ *
++ * Copyright (c) 2021 Linaro, Ltd
++ *
++ * This work is licensed under the terms of the GNU GPL, version 2 or later.
++ * See the COPYING file in the top-level directory.
++ */
++
++#include "qemu/osdep.h"
++#include "hw/acpi/ghes.h"
++
++int acpi_ghes_record_errors(uint8_t source_id, uint64_t physical_address)
++{
++    return -1;
++}
+diff --git a/hw/acpi/meson.build b/hw/acpi/meson.build
+index XXXXXXX..XXXXXXX 100644
+--- a/hw/acpi/meson.build
++++ b/hw/acpi/meson.build
+@@ -XXX,XX +XXX,XX @@ acpi_ss.add(when: 'CONFIG_ACPI_PCI', if_true: files('pci.c'))
+ acpi_ss.add(when: 'CONFIG_ACPI_VMGENID', if_true: files('vmgenid.c'))
+ acpi_ss.add(when: 'CONFIG_ACPI_HW_REDUCED', if_true: files('generic_event_device.c'))
+ acpi_ss.add(when: 'CONFIG_ACPI_HMAT', if_true: files('hmat.c'))
+-acpi_ss.add(when: 'CONFIG_ACPI_APEI', if_true: files('ghes.c'))
++acpi_ss.add(when: 'CONFIG_ACPI_APEI', if_true: files('ghes.c'), if_false: files('ghes-stub.c'))
+ acpi_ss.add(when: 'CONFIG_ACPI_X86', if_true: files('core.c', 'piix4.c', 'pcihp.c'), if_false: files('acpi-stub.c'))
+ acpi_ss.add(when: 'CONFIG_ACPI_X86_ICH', if_true: files('ich9.c', 'tco.c'))
+ acpi_ss.add(when: 'CONFIG_IPMI', if_true: files('ipmi.c'), if_false: files('ipmi-stub.c'))
+ acpi_ss.add(when: 'CONFIG_PC', if_false: files('acpi-x86-stub.c'))
+ acpi_ss.add(when: 'CONFIG_TPM', if_true: files('tpm.c'))
+-softmmu_ss.add(when: 'CONFIG_ACPI', if_false: files('acpi-stub.c', 'aml-build-stub.c'))
++softmmu_ss.add(when: 'CONFIG_ACPI', if_false: files('acpi-stub.c', 'aml-build-stub.c', 'ghes-stub.c'))
+ softmmu_ss.add_all(when: 'CONFIG_ACPI', if_true: acpi_ss)
+ softmmu_ss.add(when: 'CONFIG_ALL', if_true: files('acpi-stub.c', 'aml-build-stub.c',
+-                                                  'acpi-x86-stub.c', 'ipmi-stub.c'))
++                                                  'acpi-x86-stub.c', 'ipmi-stub.c', 'ghes-stub.c'))
+--
+.20.1

-New patch
+[PULL 10/28] hw/acpi: Provide function acpi_ghes_present()
+Allow code elsewhere in the system to check whether the ACPI GHES
+table is present, so it can determine whether it is OK to try to
+record an error by calling acpi_ghes_record_errors().
+(We don't need to migrate the new 'present' field in AcpiGhesState,
+because it is set once at system initialization and doesn't change.)
+Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
+Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
+Reviewed-by: Dongjiu Geng <gengdongjiu1@gmail.com>
+Message-id: 20210603171259.27962-3-peter.maydell@linaro.org
+---
+ include/hw/acpi/ghes.h |  9 +++++++++
+ hw/acpi/ghes-stub.c    |  5 +++++
+ hw/acpi/ghes.c         | 17 +++++++++++++++++
+files changed, 31 insertions(+)
+diff --git a/include/hw/acpi/ghes.h b/include/hw/acpi/ghes.h
+index XXXXXXX..XXXXXXX 100644
+--- a/include/hw/acpi/ghes.h
++++ b/include/hw/acpi/ghes.h
+@@ -XXX,XX +XXX,XX @@ enum {
+ typedef struct AcpiGhesState {
+     uint64_t ghes_addr_le;
++    bool present; /* True if GHES is present at all on this board */
+ } AcpiGhesState;
+ void build_ghes_error_table(GArray *hardware_errors, BIOSLinker *linker);
+@@ -XXX,XX +XXX,XX @@ void acpi_build_hest(GArray *table_data, BIOSLinker *linker,
+ void acpi_ghes_add_fw_cfg(AcpiGhesState *vms, FWCfgState *s,
+                           GArray *hardware_errors);
+ int acpi_ghes_record_errors(uint8_t notify, uint64_t error_physical_addr);
++
++/**
++ * acpi_ghes_present: Report whether ACPI GHES table is present
++ *
++ * Returns: true if the system has an ACPI GHES table and it is
++ * safe to call acpi_ghes_record_errors() to record a memory error.
++ */
++bool acpi_ghes_present(void);
+ #endif
+diff --git a/hw/acpi/ghes-stub.c b/hw/acpi/ghes-stub.c
+index XXXXXXX..XXXXXXX 100644
+--- a/hw/acpi/ghes-stub.c
++++ b/hw/acpi/ghes-stub.c
+@@ -XXX,XX +XXX,XX @@ int acpi_ghes_record_errors(uint8_t source_id, uint64_t physical_address)
+ {
+     return -1;
+ }
++
++bool acpi_ghes_present(void)
++{
++    return false;
++}
+diff --git a/hw/acpi/ghes.c b/hw/acpi/ghes.c
+index XXXXXXX..XXXXXXX 100644
+--- a/hw/acpi/ghes.c
++++ b/hw/acpi/ghes.c
+@@ -XXX,XX +XXX,XX @@ void acpi_ghes_add_fw_cfg(AcpiGhesState *ags, FWCfgState *s,
+     /* Create a read-write fw_cfg file for Address */
+     fw_cfg_add_file_callback(s, ACPI_GHES_DATA_ADDR_FW_CFG_FILE, NULL, NULL,
+         NULL, &(ags->ghes_addr_le), sizeof(ags->ghes_addr_le), false);
++
++    ags->present = true;
+ }
+ int acpi_ghes_record_errors(uint8_t source_id, uint64_t physical_address)
+@@ -XXX,XX +XXX,XX @@ int acpi_ghes_record_errors(uint8_t source_id, uint64_t physical_address)
+     return ret;
+ }
++
++bool acpi_ghes_present(void)
++{
++    AcpiGedState *acpi_ged_state;
++    AcpiGhesState *ags;
++
++    acpi_ged_state = ACPI_GED(object_resolve_path_type("", TYPE_ACPI_GED,
++                                                       NULL));
++
++    if (!acpi_ged_state) {
++        return false;
++    }
++    ags = &acpi_ged_state->ghes_state;
++    return ags->present;
++}
+--
+.20.1

-New patch
+[PULL 11/28] target/arm: Use acpi_ghes_present() to see if we report ACPI memory errors
+The virt_is_acpi_enabled() function is specific to the virt board, as
+is the check for its 'ras' property.  Use the new acpi_ghes_present()
+function to check whether we should report memory errors via
+acpi_ghes_record_errors().
+This avoids a link error if QEMU was built without support for the
+virt board, and provides a mechanism that can be used by any future
+board models that want to add ACPI memory error reporting support
+(they only need to call acpi_ghes_add_fw_cfg()).
+Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
+Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
+Reviewed-by: Dongjiu Geng <gengdongjiu1@gmail.com>
+Message-id: 20210603171259.27962-4-peter.maydell@linaro.org
+---
+ target/arm/kvm64.c | 6 +-----
+file changed, 1 insertion(+), 5 deletions(-)
+diff --git a/target/arm/kvm64.c b/target/arm/kvm64.c
+index XXXXXXX..XXXXXXX 100644
+--- a/target/arm/kvm64.c
++++ b/target/arm/kvm64.c
+@@ -XXX,XX +XXX,XX @@ void kvm_arch_on_sigbus_vcpu(CPUState *c, int code, void *addr)
+ {
+     ram_addr_t ram_addr;
+     hwaddr paddr;
+-    Object *obj = qdev_get_machine();
+-    VirtMachineState *vms = VIRT_MACHINE(obj);
+-    bool acpi_enabled = virt_is_acpi_enabled(vms);
+     assert(code == BUS_MCEERR_AR || code == BUS_MCEERR_AO);
+-    if (acpi_enabled && addr &&
+-            object_property_get_bool(obj, "ras", NULL)) {
++    if (acpi_ghes_present() && addr) {
+         ram_addr = qemu_ram_addr_from_host(addr);
+         if (ram_addr != RAM_ADDR_INVALID &&
+             kvm_physical_memory_addr_from_host(c->kvm_state, addr, &paddr)) {
+--
+.20.1

-[PULL 3/5] target/arm: Rearrange disabled check for watchpoints
+[PULL 12/28] target/arm: Fix mte page crossing test
 From: Richard Henderson <richard.henderson@linaro.org>
-Coverity rightly notes that ctz32(bas) on 0 will return 32,
+The test was off-by-one, because tag_last points to the
-which makes the len calculation a BAD_SHIFT.
+last byte of the tag to check, thus tag_last - prev_page
 will equal TARGET_PAGE_SIZE when we use the first byte
 of the next page.
-A value of 0 in DBGWCR<n>_EL1.BAS is reserved.  Simply move
+Resolves: https://gitlab.com/qemu-project/qemu/-/issues/403
-the existing check we have for this case.
+Reported-by: Peter Collingbourne <pcc@google.com>
 Reported-by: Coverity (CID 1421964)
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
-Reviewed-by: Philippe Mathieu-Daudé <philmd@redhat.com>
+Message-id: 20210612195707.840217-1-richard.henderson@linaro.org
 Message-id: 20200320160622.8040-2-richard.henderson@linaro.org
 Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
 Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
 ---
- target/arm/helper.c | 11 ++++++-----
+ target/arm/mte_helper.c           |  2 +-
-file changed, 6 insertions(+), 5 deletions(-)
+ tests/tcg/aarch64/mte-7.c         | 31 +++++++++++++++++++++++++++++++
  tests/tcg/aarch64/Makefile.target |  2 +-
 files changed, 33 insertions(+), 2 deletions(-)
  create mode 100644 tests/tcg/aarch64/mte-7.c
-diff --git a/target/arm/helper.c b/target/arm/helper.c
+diff --git a/target/arm/mte_helper.c b/target/arm/mte_helper.c
 index XXXXXXX..XXXXXXX 100644
---- a/target/arm/helper.c
+--- a/target/arm/mte_helper.c
-+++ b/target/arm/helper.c
++++ b/target/arm/mte_helper.c
-@@ -XXX,XX +XXX,XX @@ void hw_watchpoint_update(ARMCPU *cpu, int n)
+@@ -XXX,XX +XXX,XX @@ static int mte_probe_int(CPUARMState *env, uint32_t desc, uint64_t ptr,
-         int bas = extract64(wcr, 5, 8);
+     prev_page = ptr & TARGET_PAGE_MASK;
-         int basstart;
+     next_page = prev_page + TARGET_PAGE_SIZE;
--        if (bas == 0) {
+-    if (likely(tag_last - prev_page <= TARGET_PAGE_SIZE)) {
--            /* This must act as if the watchpoint is disabled */
++    if (likely(tag_last - prev_page < TARGET_PAGE_SIZE)) {
--            return;
+         /* Memory access stays on one page. */
--        }
+         tag_size = ((tag_byte_last - tag_byte_first) / (2 * TAG_GRANULE)) + 1;
--
+         mem1 = allocation_tag_mem(env, mmu_idx, ptr, type, sizem1 + 1,
-         if (extract64(wvr, 2, 1)) {
+diff --git a/tests/tcg/aarch64/mte-7.c b/tests/tcg/aarch64/mte-7.c
-             /* Deprecated case of an only 4-aligned address. BAS[7:4] are
+new file mode 100644
-              * ignored, and BAS[3:0] define which bytes to watch.
+index XXXXXXX..XXXXXXX
-              */
+--- /dev/null
-             bas &= 0xf;
++++ b/tests/tcg/aarch64/mte-7.c
-         }
+@@ -XXX,XX +XXX,XX @@
 +/*
 + * Memory tagging, unaligned access crossing pages.
 + * https://gitlab.com/qemu-project/qemu/-/issues/403
 + *
 + * Copyright (c) 2021 Linaro Ltd
 + * SPDX-License-Identifier: GPL-2.0-or-later
 + */
 +
-+        if (bas == 0) {
++#include "mte.h"
 +            /* This must act as if the watchpoint is disabled */
 +            return;
 +        }
 +
-         /* The BAS bits are supposed to be programmed to indicate a contiguous
++int main(int ac, char **av)
-          * range of bytes. Otherwise it is CONSTRAINED UNPREDICTABLE whether
++{
-          * we fire for each byte in the word/doubleword addressed by the WVR.
++    void *p;
 +
 +    enable_mte(PR_MTE_TCF_SYNC);
 +    p = alloc_mte_mem(2 * 0x1000);
 +
 +    /* Tag the pointer. */
 +    p = (void *)((unsigned long)p | (1ul << 56));
 +
 +    /* Store tag in sequential granules. */
 +    asm("stg %0, [%0]" : : "r"(p + 0x0ff0));
 +    asm("stg %0, [%0]" : : "r"(p + 0x1000));
 +
 +    /*
 +     * Perform an unaligned store with tag 1 crossing the pages.
 +     * Failure dies with SIGSEGV.
 +     */
 +    asm("str %0, [%0]" : : "r"(p + 0x0ffc));
 +    return 0;
 +}
 diff --git a/tests/tcg/aarch64/Makefile.target b/tests/tcg/aarch64/Makefile.target
 index XXXXXXX..XXXXXXX 100644
 --- a/tests/tcg/aarch64/Makefile.target
 +++ b/tests/tcg/aarch64/Makefile.target
@@ -XXX,XX +XXX,XX @@ AARCH64_TESTS += bti-2
  # MTE Tests
  ifneq ($(DOCKER_IMAGE)$(CROSS_CC_HAS_ARMV8_MTE),)
 -AARCH64_TESTS += mte-1 mte-2 mte-3 mte-4 mte-5 mte-6
 +AARCH64_TESTS += mte-1 mte-2 mte-3 mte-4 mte-5 mte-6 mte-7
  mte-%: CFLAGS += -march=armv8.5-a+memtag
  endif
 --
 .20.1

-New patch
+[PULL 13/28] hw/arm: gsj add i2c comments
+From: Patrick Venture <venture@google.com>
+Adds comments to the board init to identify missing i2c devices.
+Signed-off-by: Patrick Venture <venture@google.com>
+Reviewed-by: Hao Wu <wuhaotsh@google.com>
+Reviewed-by: Joel Stanley <joel@jms.id.au>
+Message-id: 20210608202522.2677850-2-venture@google.com
+Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
+---
+ hw/arm/npcm7xx_boards.c | 16 +++++++++++++++-
+file changed, 15 insertions(+), 1 deletion(-)
+diff --git a/hw/arm/npcm7xx_boards.c b/hw/arm/npcm7xx_boards.c
+index XXXXXXX..XXXXXXX 100644
+--- a/hw/arm/npcm7xx_boards.c
++++ b/hw/arm/npcm7xx_boards.c
+@@ -XXX,XX +XXX,XX @@ static void quanta_gsj_i2c_init(NPCM7xxState *soc)
+     at24c_eeprom_init(soc, 9, 0x55, 8192);
+     at24c_eeprom_init(soc, 10, 0x55, 8192);
+-    /* TODO: Add additional i2c devices. */
++    /*
++     * i2c-11:
++     * - power-brick@36: delta,dps800
++     * - hotswap@15: ti,lm5066i
++     */
++
++    /*
++     * i2c-12:
++     * - ucd90160@6b
++     */
++
++    /*
++     * i2c-15:
++     * - pca9548@75
++     */
+ }
+ static void quanta_gsj_fan_init(NPCM7xxMachine *machine, NPCM7xxState *soc)
+--
+.20.1

-New patch
+[PULL 14/28] hw/arm: gsj add pca9548
+From: Patrick Venture <venture@google.com>
+Tested: Quanta-gsj firmware booted.
+i2c /dev entries driver
+I2C init bus 1 freq 100000
+I2C init bus 2 freq 100000
+I2C init bus 3 freq 100000
+I2C init bus 4 freq 100000
+I2C init bus 8 freq 100000
+I2C init bus 9 freq 100000
+at24 9-0055: 8192 byte 24c64 EEPROM, writable, 1 bytes/write
+I2C init bus 10 freq 100000
+at24 10-0055: 8192 byte 24c64 EEPROM, writable, 1 bytes/write
+I2C init bus 12 freq 100000
+I2C init bus 15 freq 100000
+i2c i2c-15: Added multiplexed i2c bus 16
+i2c i2c-15: Added multiplexed i2c bus 17
+i2c i2c-15: Added multiplexed i2c bus 18
+i2c i2c-15: Added multiplexed i2c bus 19
+i2c i2c-15: Added multiplexed i2c bus 20
+i2c i2c-15: Added multiplexed i2c bus 21
+i2c i2c-15: Added multiplexed i2c bus 22
+i2c i2c-15: Added multiplexed i2c bus 23
+pca954x 15-0075: registered 8 multiplexed busses for I2C switch pca9548
+Signed-off-by: Patrick Venture <venture@google.com>
+Reviewed-by: Hao Wu <wuhaotsh@google.com>
+Reviewed-by: Joel Stanley <joel@jms.id.au>
+Message-id: 20210608202522.2677850-3-venture@google.com
+Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
+---
+ hw/arm/npcm7xx_boards.c | 6 ++----
+ hw/arm/Kconfig          | 1 +
+files changed, 3 insertions(+), 4 deletions(-)
+diff --git a/hw/arm/npcm7xx_boards.c b/hw/arm/npcm7xx_boards.c
+index XXXXXXX..XXXXXXX 100644
+--- a/hw/arm/npcm7xx_boards.c
++++ b/hw/arm/npcm7xx_boards.c
+@@ -XXX,XX +XXX,XX @@
+ #include "hw/arm/npcm7xx.h"
+ #include "hw/core/cpu.h"
++#include "hw/i2c/i2c_mux_pca954x.h"
+ #include "hw/i2c/smbus_eeprom.h"
+ #include "hw/loader.h"
+ #include "hw/qdev-core.h"
+@@ -XXX,XX +XXX,XX @@ static void quanta_gsj_i2c_init(NPCM7xxState *soc)
+      * - ucd90160@6b
+      */
+-    /*
+-     * i2c-15:
+-     * - pca9548@75
+-     */
++    i2c_slave_create_simple(npcm7xx_i2c_get_bus(soc, 15), "pca9548", 0x75);
+ }
+ static void quanta_gsj_fan_init(NPCM7xxMachine *machine, NPCM7xxState *soc)
+diff --git a/hw/arm/Kconfig b/hw/arm/Kconfig
+index XXXXXXX..XXXXXXX 100644
+--- a/hw/arm/Kconfig
++++ b/hw/arm/Kconfig
+@@ -XXX,XX +XXX,XX @@ config NPCM7XX
+     select SERIAL
+     select SSI
+     select UNIMP
++    select PCA954X
+ config FSL_IMX25
+     bool
+--
+.20.1

-[PULL 2/5] aspeed/smc: Fix DMA support for AST2600
+[PULL 15/28] hw/arm: quanta-q71l add pca954x muxes
-From: Cédric Le Goater <clg@kaod.org>
+From: Patrick Venture <venture@google.com>
-Recent firmwares uses SPI DMA transfers in U-Boot to load the
+Adds the pca954x muxes expected.
 different images (kernel, initrd, dtb) in the SoC DRAM. The AST2600
 FMC model is missing the masks to be applied on the DMA registers
 which resulted in incorrect values. Fix that and wire the SPI
 controllers which have DMA support on the AST2600.
-Fixes: bcaa8ddd081c ("aspeed/smc: Add AST2600 support")
+Tested: Booted quanta-q71l image to userspace.
-Signed-off-by: Cédric Le Goater <clg@kaod.org>
+Signed-off-by: Patrick Venture <venture@google.com>
 Reviewed-by: Hao Wu <wuhaotsh@google.com>
 Reviewed-by: Joel Stanley <joel@jms.id.au>
-Message-id: 20200320053923.20565-1-clg@kaod.org
+Reviewed-by: Cédric Le Goater <clg@kaod.org>
 Message-id: 20210608202522.2677850-4-venture@google.com
 Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
 ---
- hw/arm/aspeed_ast2600.c |  6 ++++++
+ hw/arm/aspeed.c | 11 ++++++++---
- hw/ssi/aspeed_smc.c     | 15 +++++++++++++--
+ hw/arm/Kconfig  |  1 +
- hw/ssi/trace-events     |  1 +
+files changed, 9 insertions(+), 3 deletions(-)
 files changed, 20 insertions(+), 2 deletions(-)
-diff --git a/hw/arm/aspeed_ast2600.c b/hw/arm/aspeed_ast2600.c
+diff --git a/hw/arm/aspeed.c b/hw/arm/aspeed.c
 index XXXXXXX..XXXXXXX 100644
---- a/hw/arm/aspeed_ast2600.c
+--- a/hw/arm/aspeed.c
-+++ b/hw/arm/aspeed_ast2600.c
++++ b/hw/arm/aspeed.c
-@@ -XXX,XX +XXX,XX @@ static void aspeed_soc_ast2600_realize(DeviceState *dev, Error **errp)
+@@ -XXX,XX +XXX,XX @@
+ #include "hw/arm/boot.h"
-     /* SPI */
+ #include "hw/arm/aspeed.h"
-     for (i = 0; i < sc->spis_num; i++) {
+ #include "hw/arm/aspeed_soc.h"
-+        object_property_set_link(OBJECT(&s->spi[i]), OBJECT(s->dram_mr),
++#include "hw/i2c/i2c_mux_pca954x.h"
-+                                 "dram", &err);
+ #include "hw/i2c/smbus_eeprom.h"
-+        if (err) {
+ #include "hw/misc/pca9552.h"
-+            error_propagate(errp, err);
+ #include "hw/misc/tmp105.h"
-+            return;
+@@ -XXX,XX +XXX,XX @@ static void quanta_q71l_bmc_i2c_init(AspeedMachineState *bmc)
-+        }
+     /* TODO: i2c-1: Add Frontpanel FRU eeprom@57 24c64 */
-         object_property_set_int(OBJECT(&s->spi[i]), 1, "num-cs", &err);
+     /* TODO: Add Memory Riser i2c mux and eeproms. */
-         object_property_set_bool(OBJECT(&s->spi[i]), true, "realized",
-                                  &local_err);
+-    /* TODO: i2c-2: pca9546@74 */
-diff --git a/hw/ssi/aspeed_smc.c b/hw/ssi/aspeed_smc.c
+-    /* TODO: i2c-2: pca9548@77 */
 +    i2c_slave_create_simple(aspeed_i2c_get_bus(&soc->i2c, 2), "pca9546", 0x74);
 +    i2c_slave_create_simple(aspeed_i2c_get_bus(&soc->i2c, 2), "pca9548", 0x77);
 +
      /* TODO: i2c-3: Add BIOS FRU eeprom@56 24c64 */
 -    /* TODO: i2c-7: Add pca9546@70 */
 +
 +    /* i2c-7 */
 +    i2c_slave_create_simple(aspeed_i2c_get_bus(&soc->i2c, 7), "pca9546", 0x70);
      /*        - i2c@0: pmbus@59 */
      /*        - i2c@1: pmbus@58 */
      /*        - i2c@2: pmbus@58 */
      /*        - i2c@3: pmbus@59 */
 +
      /* TODO: i2c-7: Add PDB FRU eeprom@52 */
      /* TODO: i2c-8: Add BMC FRU eeprom@50 */
  }
 diff --git a/hw/arm/Kconfig b/hw/arm/Kconfig
 index XXXXXXX..XXXXXXX 100644
---- a/hw/ssi/aspeed_smc.c
+--- a/hw/arm/Kconfig
-+++ b/hw/ssi/aspeed_smc.c
++++ b/hw/arm/Kconfig
-@@ -XXX,XX +XXX,XX @@ static const AspeedSMCController controllers[] = {
+@@ -XXX,XX +XXX,XX @@ config ASPEED_SOC
-         .flash_window_base = ASPEED26_SOC_FMC_FLASH_BASE,
+     select PCA9552
-         .flash_window_size = 0x10000000,
+     select SERIAL
-         .has_dma           = true,
+     select SMBUS_EEPROM
-+        .dma_flash_mask    = 0x0FFFFFFC,
++    select PCA954X
-+        .dma_dram_mask     = 0x3FFFFFFC,
+     select SSI
-         .nregs             = ASPEED_SMC_R_MAX,
+     select SSI_M25P80
-         .segment_to_reg    = aspeed_2600_smc_segment_to_reg,
+     select TMP105
          .reg_to_segment    = aspeed_2600_smc_reg_to_segment,
@@ -XXX,XX +XXX,XX @@ static const AspeedSMCController controllers[] = {
          .segments          = aspeed_segments_ast2600_spi1,
          .flash_window_base = ASPEED26_SOC_SPI_FLASH_BASE,
          .flash_window_size = 0x10000000,
 -        .has_dma           = false,
 +        .has_dma           = true,
 +        .dma_flash_mask    = 0x0FFFFFFC,
 +        .dma_dram_mask     = 0x3FFFFFFC,
          .nregs             = ASPEED_SMC_R_MAX,
          .segment_to_reg    = aspeed_2600_smc_segment_to_reg,
          .reg_to_segment    = aspeed_2600_smc_reg_to_segment,
@@ -XXX,XX +XXX,XX @@ static const AspeedSMCController controllers[] = {
          .segments          = aspeed_segments_ast2600_spi2,
          .flash_window_base = ASPEED26_SOC_SPI2_FLASH_BASE,
          .flash_window_size = 0x10000000,
 -        .has_dma           = false,
 +        .has_dma           = true,
 +        .dma_flash_mask    = 0x0FFFFFFC,
 +        .dma_dram_mask     = 0x3FFFFFFC,
          .nregs             = ASPEED_SMC_R_MAX,
          .segment_to_reg    = aspeed_2600_smc_segment_to_reg,
          .reg_to_segment    = aspeed_2600_smc_reg_to_segment,
@@ -XXX,XX +XXX,XX @@ static void aspeed_smc_dma_rw(AspeedSMCState *s)
      MemTxResult result;
      uint32_t data;
 +    trace_aspeed_smc_dma_rw(s->regs[R_DMA_CTRL] & DMA_CTRL_WRITE ?
 +                            "write" : "read",
 +                            s->regs[R_DMA_FLASH_ADDR],
 +                            s->regs[R_DMA_DRAM_ADDR],
 +                            s->regs[R_DMA_LEN]);
      while (s->regs[R_DMA_LEN]) {
          if (s->regs[R_DMA_CTRL] & DMA_CTRL_WRITE) {
              data = address_space_ldl_le(&s->dram_as, s->regs[R_DMA_DRAM_ADDR],
 diff --git a/hw/ssi/trace-events b/hw/ssi/trace-events
 index XXXXXXX..XXXXXXX 100644
 --- a/hw/ssi/trace-events
 +++ b/hw/ssi/trace-events
@@ -XXX,XX +XXX,XX @@ aspeed_smc_do_snoop(int cs, int index, int dummies, int data) "CS%d index:0x%x d
  aspeed_smc_flash_write(int cs, uint64_t addr,  uint32_t size, uint64_t data, int mode) "CS%d @0x%" PRIx64 " size %u: 0x%" PRIx64" mode:%d"
  aspeed_smc_read(uint64_t addr,  uint32_t size, uint64_t data) "@0x%" PRIx64 " size %u: 0x%" PRIx64
  aspeed_smc_dma_checksum(uint32_t addr, uint32_t data) "0x%08x: 0x%08x"
 +aspeed_smc_dma_rw(const char *dir, uint32_t flash_addr, uint32_t dram_addr, uint32_t size) "%s flash:@0x%08x dram:@0x%08x size:0x%08x"
  aspeed_smc_write(uint64_t addr,  uint32_t size, uint64_t data) "@0x%" PRIx64 " size %u: 0x%" PRIx64
  aspeed_smc_flash_select(int cs, const char *prefix) "CS%d %sselect"
 --
 2.20.1

-New patch
+[PULL 16/28] target/arm: Provide and use H8 and H1_8 macros
+Currently we provide Hn and H1_n macros for accessing the correct
+data within arrays of vector elements of size 1, 2 and 4, accounting
+for host endianness.  We don't provide any macros for elements of
+size 8 because there the host endianness doesn't matter.  However,
+this does result in awkwardness where we need to pass empty arguments
+to macros, because checkpatch complains about them.  The empty
+argument is a little confusing for humans to read as well.
+Add H8() and H1_8() macros and use them where we were previously
+passing empty arguments to macros.
+Suggested-by: Richard Henderson <richard.henderson@linaro.org>
+Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
+Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
+Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
+Message-id: 20210614151007.4545-2-peter.maydell@linaro.org
+Message-id: 20210610132505.5827-1-peter.maydell@linaro.org
+---
+ target/arm/vec_internal.h |   8 +-
+ target/arm/sve_helper.c   | 258 +++++++++++++++++++-------------------
+ target/arm/vec_helper.c   |  14 +--
+ 3 files changed, 143 insertions(+), 137 deletions(-)
+diff --git a/target/arm/vec_internal.h b/target/arm/vec_internal.h
+index XXXXXXX..XXXXXXX 100644
+--- a/target/arm/vec_internal.h
++++ b/target/arm/vec_internal.h
+@@ -XXX,XX +XXX,XX @@
+ #define H2(x)   (x)
+ #define H4(x)   (x)
+ #endif
+-
++/*
++ * Access to 64-bit elements isn't host-endian dependent; we provide H8
++ * and H1_8 so that when a function is being generated from a macro we
++ * can pass these rather than an empty macro argument, for clarity.
++ */
++#define H8(x)   (x)
++#define H1_8(x) (x)
+ static inline void clear_tail(void *vd, uintptr_t opr_sz, uintptr_t max_sz)
+ {
+diff --git a/target/arm/sve_helper.c b/target/arm/sve_helper.c
+index XXXXXXX..XXXXXXX 100644
+--- a/target/arm/sve_helper.c
++++ b/target/arm/sve_helper.c
+@@ -XXX,XX +XXX,XX @@ void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg,               \
+ DO_ZPZZ_PAIR_FP(sve2_faddp_zpzz_h, float16, H1_2, float16_add)
+ DO_ZPZZ_PAIR_FP(sve2_faddp_zpzz_s, float32, H1_4, float32_add)
+-DO_ZPZZ_PAIR_FP(sve2_faddp_zpzz_d, float64,     , float64_add)
++DO_ZPZZ_PAIR_FP(sve2_faddp_zpzz_d, float64, H1_8, float64_add)
+ DO_ZPZZ_PAIR_FP(sve2_fmaxnmp_zpzz_h, float16, H1_2, float16_maxnum)
+ DO_ZPZZ_PAIR_FP(sve2_fmaxnmp_zpzz_s, float32, H1_4, float32_maxnum)
+-DO_ZPZZ_PAIR_FP(sve2_fmaxnmp_zpzz_d, float64,     , float64_maxnum)
++DO_ZPZZ_PAIR_FP(sve2_fmaxnmp_zpzz_d, float64, H1_8, float64_maxnum)
+ DO_ZPZZ_PAIR_FP(sve2_fminnmp_zpzz_h, float16, H1_2, float16_minnum)
+ DO_ZPZZ_PAIR_FP(sve2_fminnmp_zpzz_s, float32, H1_4, float32_minnum)
+-DO_ZPZZ_PAIR_FP(sve2_fminnmp_zpzz_d, float64,     , float64_minnum)
++DO_ZPZZ_PAIR_FP(sve2_fminnmp_zpzz_d, float64, H1_8, float64_minnum)
+ DO_ZPZZ_PAIR_FP(sve2_fmaxp_zpzz_h, float16, H1_2, float16_max)
+ DO_ZPZZ_PAIR_FP(sve2_fmaxp_zpzz_s, float32, H1_4, float32_max)
+-DO_ZPZZ_PAIR_FP(sve2_fmaxp_zpzz_d, float64,     , float64_max)
++DO_ZPZZ_PAIR_FP(sve2_fmaxp_zpzz_d, float64, H1_8, float64_max)
+ DO_ZPZZ_PAIR_FP(sve2_fminp_zpzz_h, float16, H1_2, float16_min)
+ DO_ZPZZ_PAIR_FP(sve2_fminp_zpzz_s, float32, H1_4, float32_min)
+-DO_ZPZZ_PAIR_FP(sve2_fminp_zpzz_d, float64,     , float64_min)
++DO_ZPZZ_PAIR_FP(sve2_fminp_zpzz_d, float64, H1_8, float64_min)
+ #undef DO_ZPZZ_PAIR_FP
+@@ -XXX,XX +XXX,XX @@ void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)          \
+ DO_ZZZ_TB(sve2_saddl_h, int16_t, int8_t, H1_2, H1, DO_ADD)
+ DO_ZZZ_TB(sve2_saddl_s, int32_t, int16_t, H1_4, H1_2, DO_ADD)
+-DO_ZZZ_TB(sve2_saddl_d, int64_t, int32_t,     , H1_4, DO_ADD)
++DO_ZZZ_TB(sve2_saddl_d, int64_t, int32_t, H1_8, H1_4, DO_ADD)
+ DO_ZZZ_TB(sve2_ssubl_h, int16_t, int8_t, H1_2, H1, DO_SUB)
+ DO_ZZZ_TB(sve2_ssubl_s, int32_t, int16_t, H1_4, H1_2, DO_SUB)
+-DO_ZZZ_TB(sve2_ssubl_d, int64_t, int32_t,     , H1_4, DO_SUB)
++DO_ZZZ_TB(sve2_ssubl_d, int64_t, int32_t, H1_8, H1_4, DO_SUB)
+ DO_ZZZ_TB(sve2_sabdl_h, int16_t, int8_t, H1_2, H1, DO_ABD)
+ DO_ZZZ_TB(sve2_sabdl_s, int32_t, int16_t, H1_4, H1_2, DO_ABD)
+-DO_ZZZ_TB(sve2_sabdl_d, int64_t, int32_t,     , H1_4, DO_ABD)
++DO_ZZZ_TB(sve2_sabdl_d, int64_t, int32_t, H1_8, H1_4, DO_ABD)
+ DO_ZZZ_TB(sve2_uaddl_h, uint16_t, uint8_t, H1_2, H1, DO_ADD)
+ DO_ZZZ_TB(sve2_uaddl_s, uint32_t, uint16_t, H1_4, H1_2, DO_ADD)
+-DO_ZZZ_TB(sve2_uaddl_d, uint64_t, uint32_t,     , H1_4, DO_ADD)
++DO_ZZZ_TB(sve2_uaddl_d, uint64_t, uint32_t, H1_8, H1_4, DO_ADD)
+ DO_ZZZ_TB(sve2_usubl_h, uint16_t, uint8_t, H1_2, H1, DO_SUB)
+ DO_ZZZ_TB(sve2_usubl_s, uint32_t, uint16_t, H1_4, H1_2, DO_SUB)
+-DO_ZZZ_TB(sve2_usubl_d, uint64_t, uint32_t,     , H1_4, DO_SUB)
++DO_ZZZ_TB(sve2_usubl_d, uint64_t, uint32_t, H1_8, H1_4, DO_SUB)
+ DO_ZZZ_TB(sve2_uabdl_h, uint16_t, uint8_t, H1_2, H1, DO_ABD)
+ DO_ZZZ_TB(sve2_uabdl_s, uint32_t, uint16_t, H1_4, H1_2, DO_ABD)
+-DO_ZZZ_TB(sve2_uabdl_d, uint64_t, uint32_t,     , H1_4, DO_ABD)
++DO_ZZZ_TB(sve2_uabdl_d, uint64_t, uint32_t, H1_8, H1_4, DO_ABD)
+ DO_ZZZ_TB(sve2_smull_zzz_h, int16_t, int8_t, H1_2, H1, DO_MUL)
+ DO_ZZZ_TB(sve2_smull_zzz_s, int32_t, int16_t, H1_4, H1_2, DO_MUL)
+-DO_ZZZ_TB(sve2_smull_zzz_d, int64_t, int32_t,     , H1_4, DO_MUL)
++DO_ZZZ_TB(sve2_smull_zzz_d, int64_t, int32_t, H1_8, H1_4, DO_MUL)
+ DO_ZZZ_TB(sve2_umull_zzz_h, uint16_t, uint8_t, H1_2, H1, DO_MUL)
+ DO_ZZZ_TB(sve2_umull_zzz_s, uint32_t, uint16_t, H1_4, H1_2, DO_MUL)
+-DO_ZZZ_TB(sve2_umull_zzz_d, uint64_t, uint32_t,     , H1_4, DO_MUL)
++DO_ZZZ_TB(sve2_umull_zzz_d, uint64_t, uint32_t, H1_8, H1_4, DO_MUL)
+ /* Note that the multiply cannot overflow, but the doubling can. */
+ static inline int16_t do_sqdmull_h(int16_t n, int16_t m)
+@@ -XXX,XX +XXX,XX @@ static inline int64_t do_sqdmull_d(int64_t n, int64_t m)
+ DO_ZZZ_TB(sve2_sqdmull_zzz_h, int16_t, int8_t, H1_2, H1, do_sqdmull_h)
+ DO_ZZZ_TB(sve2_sqdmull_zzz_s, int32_t, int16_t, H1_4, H1_2, do_sqdmull_s)
+-DO_ZZZ_TB(sve2_sqdmull_zzz_d, int64_t, int32_t,     , H1_4, do_sqdmull_d)
++DO_ZZZ_TB(sve2_sqdmull_zzz_d, int64_t, int32_t, H1_8, H1_4, do_sqdmull_d)
+ #undef DO_ZZZ_TB
+@@ -XXX,XX +XXX,XX @@ void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
+ DO_ZZZ_WTB(sve2_saddw_h, int16_t, int8_t, H1_2, H1, DO_ADD)
+ DO_ZZZ_WTB(sve2_saddw_s, int32_t, int16_t, H1_4, H1_2, DO_ADD)
+-DO_ZZZ_WTB(sve2_saddw_d, int64_t, int32_t,     , H1_4, DO_ADD)
++DO_ZZZ_WTB(sve2_saddw_d, int64_t, int32_t, H1_8, H1_4, DO_ADD)
+ DO_ZZZ_WTB(sve2_ssubw_h, int16_t, int8_t, H1_2, H1, DO_SUB)
+ DO_ZZZ_WTB(sve2_ssubw_s, int32_t, int16_t, H1_4, H1_2, DO_SUB)
+-DO_ZZZ_WTB(sve2_ssubw_d, int64_t, int32_t,     , H1_4, DO_SUB)
++DO_ZZZ_WTB(sve2_ssubw_d, int64_t, int32_t, H1_8, H1_4, DO_SUB)
+ DO_ZZZ_WTB(sve2_uaddw_h, uint16_t, uint8_t, H1_2, H1, DO_ADD)
+ DO_ZZZ_WTB(sve2_uaddw_s, uint32_t, uint16_t, H1_4, H1_2, DO_ADD)
+-DO_ZZZ_WTB(sve2_uaddw_d, uint64_t, uint32_t,     , H1_4, DO_ADD)
++DO_ZZZ_WTB(sve2_uaddw_d, uint64_t, uint32_t, H1_8, H1_4, DO_ADD)
+ DO_ZZZ_WTB(sve2_usubw_h, uint16_t, uint8_t, H1_2, H1, DO_SUB)
+ DO_ZZZ_WTB(sve2_usubw_s, uint32_t, uint16_t, H1_4, H1_2, DO_SUB)
+-DO_ZZZ_WTB(sve2_usubw_d, uint64_t, uint32_t,     , H1_4, DO_SUB)
++DO_ZZZ_WTB(sve2_usubw_d, uint64_t, uint32_t, H1_8, H1_4, DO_SUB)
+ #undef DO_ZZZ_WTB
+@@ -XXX,XX +XXX,XX @@ void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)          \
+ DO_ZZZ_NTB(sve2_eoril_b, uint8_t, H1, DO_EOR)
+ DO_ZZZ_NTB(sve2_eoril_h, uint16_t, H1_2, DO_EOR)
+ DO_ZZZ_NTB(sve2_eoril_s, uint32_t, H1_4, DO_EOR)
+-DO_ZZZ_NTB(sve2_eoril_d, uint64_t,     , DO_EOR)
++DO_ZZZ_NTB(sve2_eoril_d, uint64_t, H1_8, DO_EOR)
+ #undef DO_ZZZ_NTB
+@@ -XXX,XX +XXX,XX @@ void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
+ DO_ZZZW_ACC(sve2_sabal_h, int16_t, int8_t, H1_2, H1, DO_ABD)
+ DO_ZZZW_ACC(sve2_sabal_s, int32_t, int16_t, H1_4, H1_2, DO_ABD)
+-DO_ZZZW_ACC(sve2_sabal_d, int64_t, int32_t,     , H1_4, DO_ABD)
++DO_ZZZW_ACC(sve2_sabal_d, int64_t, int32_t, H1_8, H1_4, DO_ABD)
+ DO_ZZZW_ACC(sve2_uabal_h, uint16_t, uint8_t, H1_2, H1, DO_ABD)
+ DO_ZZZW_ACC(sve2_uabal_s, uint32_t, uint16_t, H1_4, H1_2, DO_ABD)
+-DO_ZZZW_ACC(sve2_uabal_d, uint64_t, uint32_t,     , H1_4, DO_ABD)
++DO_ZZZW_ACC(sve2_uabal_d, uint64_t, uint32_t, H1_8, H1_4, DO_ABD)
+ DO_ZZZW_ACC(sve2_smlal_zzzw_h, int16_t, int8_t, H1_2, H1, DO_MUL)
+ DO_ZZZW_ACC(sve2_smlal_zzzw_s, int32_t, int16_t, H1_4, H1_2, DO_MUL)
+-DO_ZZZW_ACC(sve2_smlal_zzzw_d, int64_t, int32_t,     , H1_4, DO_MUL)
++DO_ZZZW_ACC(sve2_smlal_zzzw_d, int64_t, int32_t, H1_8, H1_4, DO_MUL)
+ DO_ZZZW_ACC(sve2_umlal_zzzw_h, uint16_t, uint8_t, H1_2, H1, DO_MUL)
+ DO_ZZZW_ACC(sve2_umlal_zzzw_s, uint32_t, uint16_t, H1_4, H1_2, DO_MUL)
+-DO_ZZZW_ACC(sve2_umlal_zzzw_d, uint64_t, uint32_t,     , H1_4, DO_MUL)
++DO_ZZZW_ACC(sve2_umlal_zzzw_d, uint64_t, uint32_t, H1_8, H1_4, DO_MUL)
+ #define DO_NMUL(N, M)  -(N * M)
+ DO_ZZZW_ACC(sve2_smlsl_zzzw_h, int16_t, int8_t, H1_2, H1, DO_NMUL)
+ DO_ZZZW_ACC(sve2_smlsl_zzzw_s, int32_t, int16_t, H1_4, H1_2, DO_NMUL)
+-DO_ZZZW_ACC(sve2_smlsl_zzzw_d, int64_t, int32_t,     , H1_4, DO_NMUL)
++DO_ZZZW_ACC(sve2_smlsl_zzzw_d, int64_t, int32_t, H1_8, H1_4, DO_NMUL)
+ DO_ZZZW_ACC(sve2_umlsl_zzzw_h, uint16_t, uint8_t, H1_2, H1, DO_NMUL)
+ DO_ZZZW_ACC(sve2_umlsl_zzzw_s, uint32_t, uint16_t, H1_4, H1_2, DO_NMUL)
+-DO_ZZZW_ACC(sve2_umlsl_zzzw_d, uint64_t, uint32_t,     , H1_4, DO_NMUL)
++DO_ZZZW_ACC(sve2_umlsl_zzzw_d, uint64_t, uint32_t, H1_8, H1_4, DO_NMUL)
+ #undef DO_ZZZW_ACC
+@@ -XXX,XX +XXX,XX @@ DO_SQDMLAL(sve2_sqdmlal_zzzw_h, int16_t, int8_t, H1_2, H1,
+            do_sqdmull_h, DO_SQADD_H)
+ DO_SQDMLAL(sve2_sqdmlal_zzzw_s, int32_t, int16_t, H1_4, H1_2,
+            do_sqdmull_s, DO_SQADD_S)
+-DO_SQDMLAL(sve2_sqdmlal_zzzw_d, int64_t, int32_t,     , H1_4,
++DO_SQDMLAL(sve2_sqdmlal_zzzw_d, int64_t, int32_t, H1_8, H1_4,
+            do_sqdmull_d, do_sqadd_d)
+ DO_SQDMLAL(sve2_sqdmlsl_zzzw_h, int16_t, int8_t, H1_2, H1,
+            do_sqdmull_h, DO_SQSUB_H)
+ DO_SQDMLAL(sve2_sqdmlsl_zzzw_s, int32_t, int16_t, H1_4, H1_2,
+            do_sqdmull_s, DO_SQSUB_S)
+-DO_SQDMLAL(sve2_sqdmlsl_zzzw_d, int64_t, int32_t,     , H1_4,
++DO_SQDMLAL(sve2_sqdmlsl_zzzw_d, int64_t, int32_t, H1_8, H1_4,
+            do_sqdmull_d, do_sqsub_d)
+ #undef DO_SQDMLAL
+@@ -XXX,XX +XXX,XX @@ void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
+ DO_CMLA_FUNC(sve2_cmla_zzzz_b, uint8_t, H1, DO_CMLA)
+ DO_CMLA_FUNC(sve2_cmla_zzzz_h, uint16_t, H2, DO_CMLA)
+ DO_CMLA_FUNC(sve2_cmla_zzzz_s, uint32_t, H4, DO_CMLA)
+-DO_CMLA_FUNC(sve2_cmla_zzzz_d, uint64_t,   , DO_CMLA)
++DO_CMLA_FUNC(sve2_cmla_zzzz_d, uint64_t, H8, DO_CMLA)
+ #define DO_SQRDMLAH_B(N, M, A, S) \
+     do_sqrdmlah_b(N, M, A, S, true)
+@@ -XXX,XX +XXX,XX @@ DO_CMLA_FUNC(sve2_cmla_zzzz_d, uint64_t,   , DO_CMLA)
+ DO_CMLA_FUNC(sve2_sqrdcmlah_zzzz_b, int8_t, H1, DO_SQRDMLAH_B)
+ DO_CMLA_FUNC(sve2_sqrdcmlah_zzzz_h, int16_t, H2, DO_SQRDMLAH_H)
+ DO_CMLA_FUNC(sve2_sqrdcmlah_zzzz_s, int32_t, H4, DO_SQRDMLAH_S)
+-DO_CMLA_FUNC(sve2_sqrdcmlah_zzzz_d, int64_t,   , DO_SQRDMLAH_D)
++DO_CMLA_FUNC(sve2_sqrdcmlah_zzzz_d, int64_t, H8, DO_SQRDMLAH_D)
+ #define DO_CMLA_IDX_FUNC(NAME, TYPE, H, OP) \
+ void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc)    \
+@@ -XXX,XX +XXX,XX @@ void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
+ DO_ZZXZ(sve2_sqrdmlah_idx_h, int16_t, H2, DO_SQRDMLAH_H)
+ DO_ZZXZ(sve2_sqrdmlah_idx_s, int32_t, H4, DO_SQRDMLAH_S)
+-DO_ZZXZ(sve2_sqrdmlah_idx_d, int64_t,   , DO_SQRDMLAH_D)
++DO_ZZXZ(sve2_sqrdmlah_idx_d, int64_t, H8, DO_SQRDMLAH_D)
+ #define DO_SQRDMLSH_H(N, M, A) \
+     ({ uint32_t discard; do_sqrdmlah_h(N, M, A, true, true, &discard); })
+@@ -XXX,XX +XXX,XX @@ DO_ZZXZ(sve2_sqrdmlah_idx_d, int64_t,   , DO_SQRDMLAH_D)
+ DO_ZZXZ(sve2_sqrdmlsh_idx_h, int16_t, H2, DO_SQRDMLSH_H)
+ DO_ZZXZ(sve2_sqrdmlsh_idx_s, int32_t, H4, DO_SQRDMLSH_S)
+-DO_ZZXZ(sve2_sqrdmlsh_idx_d, int64_t,   , DO_SQRDMLSH_D)
++DO_ZZXZ(sve2_sqrdmlsh_idx_d, int64_t, H8, DO_SQRDMLSH_D)
+ #undef DO_ZZXZ
+@@ -XXX,XX +XXX,XX @@ void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc)  \
+ #define DO_MLA(N, M, A)  (A + N * M)
+ DO_ZZXW(sve2_smlal_idx_s, int32_t, int16_t, H1_4, H1_2, DO_MLA)
+-DO_ZZXW(sve2_smlal_idx_d, int64_t, int32_t,     , H1_4, DO_MLA)
++DO_ZZXW(sve2_smlal_idx_d, int64_t, int32_t, H1_8, H1_4, DO_MLA)
+ DO_ZZXW(sve2_umlal_idx_s, uint32_t, uint16_t, H1_4, H1_2, DO_MLA)
+-DO_ZZXW(sve2_umlal_idx_d, uint64_t, uint32_t,     , H1_4, DO_MLA)
++DO_ZZXW(sve2_umlal_idx_d, uint64_t, uint32_t, H1_8, H1_4, DO_MLA)
+ #define DO_MLS(N, M, A)  (A - N * M)
+ DO_ZZXW(sve2_smlsl_idx_s, int32_t, int16_t, H1_4, H1_2, DO_MLS)
+-DO_ZZXW(sve2_smlsl_idx_d, int64_t, int32_t,     , H1_4, DO_MLS)
++DO_ZZXW(sve2_smlsl_idx_d, int64_t, int32_t, H1_8, H1_4, DO_MLS)
+ DO_ZZXW(sve2_umlsl_idx_s, uint32_t, uint16_t, H1_4, H1_2, DO_MLS)
+-DO_ZZXW(sve2_umlsl_idx_d, uint64_t, uint32_t,     , H1_4, DO_MLS)
++DO_ZZXW(sve2_umlsl_idx_d, uint64_t, uint32_t, H1_8, H1_4, DO_MLS)
+ #define DO_SQDMLAL_S(N, M, A)  DO_SQADD_S(A, do_sqdmull_s(N, M))
+ #define DO_SQDMLAL_D(N, M, A)  do_sqadd_d(A, do_sqdmull_d(N, M))
+ DO_ZZXW(sve2_sqdmlal_idx_s, int32_t, int16_t, H1_4, H1_2, DO_SQDMLAL_S)
+-DO_ZZXW(sve2_sqdmlal_idx_d, int64_t, int32_t,     , H1_4, DO_SQDMLAL_D)
++DO_ZZXW(sve2_sqdmlal_idx_d, int64_t, int32_t, H1_8, H1_4, DO_SQDMLAL_D)
+ #define DO_SQDMLSL_S(N, M, A)  DO_SQSUB_S(A, do_sqdmull_s(N, M))
+ #define DO_SQDMLSL_D(N, M, A)  do_sqsub_d(A, do_sqdmull_d(N, M))
+ DO_ZZXW(sve2_sqdmlsl_idx_s, int32_t, int16_t, H1_4, H1_2, DO_SQDMLSL_S)
+-DO_ZZXW(sve2_sqdmlsl_idx_d, int64_t, int32_t,     , H1_4, DO_SQDMLSL_D)
++DO_ZZXW(sve2_sqdmlsl_idx_d, int64_t, int32_t, H1_8, H1_4, DO_SQDMLSL_D)
+ #undef DO_MLA
+ #undef DO_MLS
+@@ -XXX,XX +XXX,XX @@ void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)            \
+ }
+ DO_ZZX(sve2_sqdmull_idx_s, int32_t, int16_t, H1_4, H1_2, do_sqdmull_s)
+-DO_ZZX(sve2_sqdmull_idx_d, int64_t, int32_t,     , H1_4, do_sqdmull_d)
++DO_ZZX(sve2_sqdmull_idx_d, int64_t, int32_t, H1_8, H1_4, do_sqdmull_d)
+ DO_ZZX(sve2_smull_idx_s, int32_t, int16_t, H1_4, H1_2, DO_MUL)
+-DO_ZZX(sve2_smull_idx_d, int64_t, int32_t,     , H1_4, DO_MUL)
++DO_ZZX(sve2_smull_idx_d, int64_t, int32_t, H1_8, H1_4, DO_MUL)
+ DO_ZZX(sve2_umull_idx_s, uint32_t, uint16_t, H1_4, H1_2, DO_MUL)
+-DO_ZZX(sve2_umull_idx_d, uint64_t, uint32_t,     , H1_4, DO_MUL)
++DO_ZZX(sve2_umull_idx_d, uint64_t, uint32_t, H1_8, H1_4, DO_MUL)
+ #undef DO_ZZX
+@@ -XXX,XX +XXX,XX @@ void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)  \
+ DO_CADD(sve2_cadd_b, int8_t, H1, DO_ADD, DO_SUB)
+ DO_CADD(sve2_cadd_h, int16_t, H1_2, DO_ADD, DO_SUB)
+ DO_CADD(sve2_cadd_s, int32_t, H1_4, DO_ADD, DO_SUB)
+-DO_CADD(sve2_cadd_d, int64_t,     , DO_ADD, DO_SUB)
++DO_CADD(sve2_cadd_d, int64_t, H1_8, DO_ADD, DO_SUB)
+ DO_CADD(sve2_sqcadd_b, int8_t, H1, DO_SQADD_B, DO_SQSUB_B)
+ DO_CADD(sve2_sqcadd_h, int16_t, H1_2, DO_SQADD_H, DO_SQSUB_H)
+ DO_CADD(sve2_sqcadd_s, int32_t, H1_4, DO_SQADD_S, DO_SQSUB_S)
+-DO_CADD(sve2_sqcadd_d, int64_t,     , do_sqadd_d, do_sqsub_d)
++DO_CADD(sve2_sqcadd_d, int64_t, H1_8, do_sqadd_d, do_sqsub_d)
+ #undef DO_CADD
+@@ -XXX,XX +XXX,XX @@ void HELPER(NAME)(void *vd, void *vn, uint32_t desc)           \
+ DO_ZZI_SHLL(sve2_sshll_h, int16_t, int8_t, H1_2, H1)
+ DO_ZZI_SHLL(sve2_sshll_s, int32_t, int16_t, H1_4, H1_2)
+-DO_ZZI_SHLL(sve2_sshll_d, int64_t, int32_t,     , H1_4)
++DO_ZZI_SHLL(sve2_sshll_d, int64_t, int32_t, H1_8, H1_4)
+ DO_ZZI_SHLL(sve2_ushll_h, uint16_t, uint8_t, H1_2, H1)
+ DO_ZZI_SHLL(sve2_ushll_s, uint32_t, uint16_t, H1_4, H1_2)
+-DO_ZZI_SHLL(sve2_ushll_d, uint64_t, uint32_t,     , H1_4)
++DO_ZZI_SHLL(sve2_ushll_d, uint64_t, uint32_t, H1_8, H1_4)
+ #undef DO_ZZI_SHLL
+@@ -XXX,XX +XXX,XX @@ DO_SHRNB(sve2_shrnb_d, uint64_t, uint32_t, DO_SHR)
+ DO_SHRNT(sve2_shrnt_h, uint16_t, uint8_t, H1_2, H1, DO_SHR)
+ DO_SHRNT(sve2_shrnt_s, uint32_t, uint16_t, H1_4, H1_2, DO_SHR)
+-DO_SHRNT(sve2_shrnt_d, uint64_t, uint32_t,     , H1_4, DO_SHR)
++DO_SHRNT(sve2_shrnt_d, uint64_t, uint32_t, H1_8, H1_4, DO_SHR)
+ DO_SHRNB(sve2_rshrnb_h, uint16_t, uint8_t, do_urshr)
+ DO_SHRNB(sve2_rshrnb_s, uint32_t, uint16_t, do_urshr)
+@@ -XXX,XX +XXX,XX @@ DO_SHRNB(sve2_rshrnb_d, uint64_t, uint32_t, do_urshr)
+ DO_SHRNT(sve2_rshrnt_h, uint16_t, uint8_t, H1_2, H1, do_urshr)
+ DO_SHRNT(sve2_rshrnt_s, uint32_t, uint16_t, H1_4, H1_2, do_urshr)
+-DO_SHRNT(sve2_rshrnt_d, uint64_t, uint32_t,     , H1_4, do_urshr)
++DO_SHRNT(sve2_rshrnt_d, uint64_t, uint32_t, H1_8, H1_4, do_urshr)
+ #define DO_SQSHRUN_H(x, sh) do_sat_bhs((int64_t)(x) >> sh, 0, UINT8_MAX)
+ #define DO_SQSHRUN_S(x, sh) do_sat_bhs((int64_t)(x) >> sh, 0, UINT16_MAX)
+@@ -XXX,XX +XXX,XX @@ DO_SHRNB(sve2_sqshrunb_d, int64_t, uint32_t, DO_SQSHRUN_D)
+ DO_SHRNT(sve2_sqshrunt_h, int16_t, uint8_t, H1_2, H1, DO_SQSHRUN_H)
+ DO_SHRNT(sve2_sqshrunt_s, int32_t, uint16_t, H1_4, H1_2, DO_SQSHRUN_S)
+-DO_SHRNT(sve2_sqshrunt_d, int64_t, uint32_t,     , H1_4, DO_SQSHRUN_D)
++DO_SHRNT(sve2_sqshrunt_d, int64_t, uint32_t, H1_8, H1_4, DO_SQSHRUN_D)
+ #define DO_SQRSHRUN_H(x, sh) do_sat_bhs(do_srshr(x, sh), 0, UINT8_MAX)
+ #define DO_SQRSHRUN_S(x, sh) do_sat_bhs(do_srshr(x, sh), 0, UINT16_MAX)
+@@ -XXX,XX +XXX,XX @@ DO_SHRNB(sve2_sqrshrunb_d, int64_t, uint32_t, DO_SQRSHRUN_D)
+ DO_SHRNT(sve2_sqrshrunt_h, int16_t, uint8_t, H1_2, H1, DO_SQRSHRUN_H)
+ DO_SHRNT(sve2_sqrshrunt_s, int32_t, uint16_t, H1_4, H1_2, DO_SQRSHRUN_S)
+-DO_SHRNT(sve2_sqrshrunt_d, int64_t, uint32_t,     , H1_4, DO_SQRSHRUN_D)
++DO_SHRNT(sve2_sqrshrunt_d, int64_t, uint32_t, H1_8, H1_4, DO_SQRSHRUN_D)
+ #define DO_SQSHRN_H(x, sh) do_sat_bhs(x >> sh, INT8_MIN, INT8_MAX)
+ #define DO_SQSHRN_S(x, sh) do_sat_bhs(x >> sh, INT16_MIN, INT16_MAX)
+@@ -XXX,XX +XXX,XX @@ DO_SHRNB(sve2_sqshrnb_d, int64_t, uint32_t, DO_SQSHRN_D)
+ DO_SHRNT(sve2_sqshrnt_h, int16_t, uint8_t, H1_2, H1, DO_SQSHRN_H)
+ DO_SHRNT(sve2_sqshrnt_s, int32_t, uint16_t, H1_4, H1_2, DO_SQSHRN_S)
+-DO_SHRNT(sve2_sqshrnt_d, int64_t, uint32_t,     , H1_4, DO_SQSHRN_D)
++DO_SHRNT(sve2_sqshrnt_d, int64_t, uint32_t, H1_8, H1_4, DO_SQSHRN_D)
+ #define DO_SQRSHRN_H(x, sh) do_sat_bhs(do_srshr(x, sh), INT8_MIN, INT8_MAX)
+ #define DO_SQRSHRN_S(x, sh) do_sat_bhs(do_srshr(x, sh), INT16_MIN, INT16_MAX)
+@@ -XXX,XX +XXX,XX @@ DO_SHRNB(sve2_sqrshrnb_d, int64_t, uint32_t, DO_SQRSHRN_D)
+ DO_SHRNT(sve2_sqrshrnt_h, int16_t, uint8_t, H1_2, H1, DO_SQRSHRN_H)
+ DO_SHRNT(sve2_sqrshrnt_s, int32_t, uint16_t, H1_4, H1_2, DO_SQRSHRN_S)
+-DO_SHRNT(sve2_sqrshrnt_d, int64_t, uint32_t,     , H1_4, DO_SQRSHRN_D)
++DO_SHRNT(sve2_sqrshrnt_d, int64_t, uint32_t, H1_8, H1_4, DO_SQRSHRN_D)
+ #define DO_UQSHRN_H(x, sh) MIN(x >> sh, UINT8_MAX)
+ #define DO_UQSHRN_S(x, sh) MIN(x >> sh, UINT16_MAX)
+@@ -XXX,XX +XXX,XX @@ DO_SHRNB(sve2_uqshrnb_d, uint64_t, uint32_t, DO_UQSHRN_D)
+ DO_SHRNT(sve2_uqshrnt_h, uint16_t, uint8_t, H1_2, H1, DO_UQSHRN_H)
+ DO_SHRNT(sve2_uqshrnt_s, uint32_t, uint16_t, H1_4, H1_2, DO_UQSHRN_S)
+-DO_SHRNT(sve2_uqshrnt_d, uint64_t, uint32_t,     , H1_4, DO_UQSHRN_D)
++DO_SHRNT(sve2_uqshrnt_d, uint64_t, uint32_t, H1_8, H1_4, DO_UQSHRN_D)
+ #define DO_UQRSHRN_H(x, sh) MIN(do_urshr(x, sh), UINT8_MAX)
+ #define DO_UQRSHRN_S(x, sh) MIN(do_urshr(x, sh), UINT16_MAX)
+@@ -XXX,XX +XXX,XX @@ DO_SHRNB(sve2_uqrshrnb_d, uint64_t, uint32_t, DO_UQRSHRN_D)
+ DO_SHRNT(sve2_uqrshrnt_h, uint16_t, uint8_t, H1_2, H1, DO_UQRSHRN_H)
+ DO_SHRNT(sve2_uqrshrnt_s, uint32_t, uint16_t, H1_4, H1_2, DO_UQRSHRN_S)
+-DO_SHRNT(sve2_uqrshrnt_d, uint64_t, uint32_t,     , H1_4, DO_UQRSHRN_D)
++DO_SHRNT(sve2_uqrshrnt_d, uint64_t, uint32_t, H1_8, H1_4, DO_UQRSHRN_D)
+ #undef DO_SHRNB
+ #undef DO_SHRNT
+@@ -XXX,XX +XXX,XX @@ DO_BINOPNB(sve2_addhnb_d, uint64_t, uint32_t, 32, DO_ADDHN)
+ DO_BINOPNT(sve2_addhnt_h, uint16_t, uint8_t, 8, H1_2, H1, DO_ADDHN)
+ DO_BINOPNT(sve2_addhnt_s, uint32_t, uint16_t, 16, H1_4, H1_2, DO_ADDHN)
+-DO_BINOPNT(sve2_addhnt_d, uint64_t, uint32_t, 32,     , H1_4, DO_ADDHN)
++DO_BINOPNT(sve2_addhnt_d, uint64_t, uint32_t, 32, H1_8, H1_4, DO_ADDHN)
+ DO_BINOPNB(sve2_raddhnb_h, uint16_t, uint8_t, 8, DO_RADDHN)
+ DO_BINOPNB(sve2_raddhnb_s, uint32_t, uint16_t, 16, DO_RADDHN)
+@@ -XXX,XX +XXX,XX @@ DO_BINOPNB(sve2_raddhnb_d, uint64_t, uint32_t, 32, DO_RADDHN)
+ DO_BINOPNT(sve2_raddhnt_h, uint16_t, uint8_t, 8, H1_2, H1, DO_RADDHN)
+ DO_BINOPNT(sve2_raddhnt_s, uint32_t, uint16_t, 16, H1_4, H1_2, DO_RADDHN)
+-DO_BINOPNT(sve2_raddhnt_d, uint64_t, uint32_t, 32,     , H1_4, DO_RADDHN)
++DO_BINOPNT(sve2_raddhnt_d, uint64_t, uint32_t, 32, H1_8, H1_4, DO_RADDHN)
+ DO_BINOPNB(sve2_subhnb_h, uint16_t, uint8_t, 8, DO_SUBHN)
+ DO_BINOPNB(sve2_subhnb_s, uint32_t, uint16_t, 16, DO_SUBHN)
+@@ -XXX,XX +XXX,XX @@ DO_BINOPNB(sve2_subhnb_d, uint64_t, uint32_t, 32, DO_SUBHN)
+ DO_BINOPNT(sve2_subhnt_h, uint16_t, uint8_t, 8, H1_2, H1, DO_SUBHN)
+ DO_BINOPNT(sve2_subhnt_s, uint32_t, uint16_t, 16, H1_4, H1_2, DO_SUBHN)
+-DO_BINOPNT(sve2_subhnt_d, uint64_t, uint32_t, 32,     , H1_4, DO_SUBHN)
++DO_BINOPNT(sve2_subhnt_d, uint64_t, uint32_t, 32, H1_8, H1_4, DO_SUBHN)
+ DO_BINOPNB(sve2_rsubhnb_h, uint16_t, uint8_t, 8, DO_RSUBHN)
+ DO_BINOPNB(sve2_rsubhnb_s, uint32_t, uint16_t, 16, DO_RSUBHN)
+@@ -XXX,XX +XXX,XX @@ DO_BINOPNB(sve2_rsubhnb_d, uint64_t, uint32_t, 32, DO_RSUBHN)
+ DO_BINOPNT(sve2_rsubhnt_h, uint16_t, uint8_t, 8, H1_2, H1, DO_RSUBHN)
+ DO_BINOPNT(sve2_rsubhnt_s, uint32_t, uint16_t, 16, H1_4, H1_2, DO_RSUBHN)
+-DO_BINOPNT(sve2_rsubhnt_d, uint64_t, uint32_t, 32,     , H1_4, DO_RSUBHN)
++DO_BINOPNT(sve2_rsubhnt_d, uint64_t, uint32_t, 32, H1_8, H1_4, DO_RSUBHN)
+ #undef DO_RSUBHN
+ #undef DO_SUBHN
+@@ -XXX,XX +XXX,XX @@ void HELPER(NAME)(void *vd, void *vn, uint64_t val, uint32_t desc) \
+ DO_INSR(sve_insr_b, uint8_t, H1)
+ DO_INSR(sve_insr_h, uint16_t, H1_2)
+ DO_INSR(sve_insr_s, uint32_t, H1_4)
+-DO_INSR(sve_insr_d, uint64_t, )
++DO_INSR(sve_insr_d, uint64_t, H1_8)
+ #undef DO_INSR
+@@ -XXX,XX +XXX,XX @@ void HELPER(sve2_tbx_##SUFF)(void *vd, void *vn, void *vm, uint32_t desc) \
+ DO_TB(b, uint8_t, H1)
+ DO_TB(h, uint16_t, H2)
+ DO_TB(s, uint32_t, H4)
+-DO_TB(d, uint64_t,   )
++DO_TB(d, uint64_t, H8)
+ #undef DO_TB
+@@ -XXX,XX +XXX,XX @@ void HELPER(NAME)(void *vd, void *vn, uint32_t desc)           \
+ DO_UNPK(sve_sunpk_h, int16_t, int8_t, H2, H1)
+ DO_UNPK(sve_sunpk_s, int32_t, int16_t, H4, H2)
+-DO_UNPK(sve_sunpk_d, int64_t, int32_t, , H4)
++DO_UNPK(sve_sunpk_d, int64_t, int32_t, H8, H4)
+ DO_UNPK(sve_uunpk_h, uint16_t, uint8_t, H2, H1)
+ DO_UNPK(sve_uunpk_s, uint32_t, uint16_t, H4, H2)
+-DO_UNPK(sve_uunpk_d, uint64_t, uint32_t, , H4)
++DO_UNPK(sve_uunpk_d, uint64_t, uint32_t, H8, H4)
+ #undef DO_UNPK
+@@ -XXX,XX +XXX,XX @@ void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)       \
+ DO_ZIP(sve_zip_b, uint8_t, H1)
+ DO_ZIP(sve_zip_h, uint16_t, H1_2)
+ DO_ZIP(sve_zip_s, uint32_t, H1_4)
+-DO_ZIP(sve_zip_d, uint64_t, )
++DO_ZIP(sve_zip_d, uint64_t, H1_8)
+ DO_ZIP(sve2_zip_q, Int128, )
+ #define DO_UZP(NAME, TYPE, H) \
+@@ -XXX,XX +XXX,XX @@ void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)         \
+ DO_UZP(sve_uzp_b, uint8_t, H1)
+ DO_UZP(sve_uzp_h, uint16_t, H1_2)
+ DO_UZP(sve_uzp_s, uint32_t, H1_4)
+-DO_UZP(sve_uzp_d, uint64_t, )
++DO_UZP(sve_uzp_d, uint64_t, H1_8)
+ DO_UZP(sve2_uzp_q, Int128, )
+ #define DO_TRN(NAME, TYPE, H) \
+@@ -XXX,XX +XXX,XX @@ void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)         \
+ DO_TRN(sve_trn_b, uint8_t, H1)
+ DO_TRN(sve_trn_h, uint16_t, H1_2)
+ DO_TRN(sve_trn_s, uint32_t, H1_4)
+-DO_TRN(sve_trn_d, uint64_t, )
++DO_TRN(sve_trn_d, uint64_t, H1_8)
+ DO_TRN(sve2_trn_q, Int128, )
+ #undef DO_ZIP
+@@ -XXX,XX +XXX,XX @@ uint32_t HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
+ #define DO_CMP_PPZZ_S(NAME, TYPE, OP) \
+     DO_CMP_PPZZ(NAME, TYPE, OP, H1_4, 0x1111111111111111ull)
+ #define DO_CMP_PPZZ_D(NAME, TYPE, OP) \
+-    DO_CMP_PPZZ(NAME, TYPE, OP,     , 0x0101010101010101ull)
++    DO_CMP_PPZZ(NAME, TYPE, OP, H1_8, 0x0101010101010101ull)
+ DO_CMP_PPZZ_B(sve_cmpeq_ppzz_b, uint8_t,  ==)
+ DO_CMP_PPZZ_H(sve_cmpeq_ppzz_h, uint16_t, ==)
+@@ -XXX,XX +XXX,XX @@ uint32_t HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc)   \
+ #define DO_CMP_PPZI_S(NAME, TYPE, OP) \
+     DO_CMP_PPZI(NAME, TYPE, OP, H1_4, 0x1111111111111111ull)
+ #define DO_CMP_PPZI_D(NAME, TYPE, OP) \
+-    DO_CMP_PPZI(NAME, TYPE, OP,     , 0x0101010101010101ull)
++    DO_CMP_PPZI(NAME, TYPE, OP, H1_8, 0x0101010101010101ull)
+ DO_CMP_PPZI_B(sve_cmpeq_ppzi_b, uint8_t,  ==)
+ DO_CMP_PPZI_H(sve_cmpeq_ppzi_h, uint16_t, ==)
+@@ -XXX,XX +XXX,XX @@ uint64_t HELPER(NAME)(void *vn, void *vg, void *vs, uint32_t desc)    \
+ DO_REDUCE(sve_faddv_h, float16, H1_2, add, float16_zero)
+ DO_REDUCE(sve_faddv_s, float32, H1_4, add, float32_zero)
+-DO_REDUCE(sve_faddv_d, float64,     , add, float64_zero)
++DO_REDUCE(sve_faddv_d, float64, H1_8, add, float64_zero)
+ /* Identity is floatN_default_nan, without the function call.  */
+ DO_REDUCE(sve_fminnmv_h, float16, H1_2, minnum, 0x7E00)
+ DO_REDUCE(sve_fminnmv_s, float32, H1_4, minnum, 0x7FC00000)
+-DO_REDUCE(sve_fminnmv_d, float64,     , minnum, 0x7FF8000000000000ULL)
++DO_REDUCE(sve_fminnmv_d, float64, H1_8, minnum, 0x7FF8000000000000ULL)
+ DO_REDUCE(sve_fmaxnmv_h, float16, H1_2, maxnum, 0x7E00)
+ DO_REDUCE(sve_fmaxnmv_s, float32, H1_4, maxnum, 0x7FC00000)
+-DO_REDUCE(sve_fmaxnmv_d, float64,     , maxnum, 0x7FF8000000000000ULL)
++DO_REDUCE(sve_fmaxnmv_d, float64, H1_8, maxnum, 0x7FF8000000000000ULL)
+ DO_REDUCE(sve_fminv_h, float16, H1_2, min, float16_infinity)
+ DO_REDUCE(sve_fminv_s, float32, H1_4, min, float32_infinity)
+-DO_REDUCE(sve_fminv_d, float64,     , min, float64_infinity)
++DO_REDUCE(sve_fminv_d, float64, H1_8, min, float64_infinity)
+ DO_REDUCE(sve_fmaxv_h, float16, H1_2, max, float16_chs(float16_infinity))
+ DO_REDUCE(sve_fmaxv_s, float32, H1_4, max, float32_chs(float32_infinity))
+-DO_REDUCE(sve_fmaxv_d, float64,     , max, float64_chs(float64_infinity))
++DO_REDUCE(sve_fmaxv_d, float64, H1_8, max, float64_chs(float64_infinity))
+ #undef DO_REDUCE
+@@ -XXX,XX +XXX,XX @@ void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg,       \
+ DO_ZPZZ_FP(sve_fadd_h, uint16_t, H1_2, float16_add)
+ DO_ZPZZ_FP(sve_fadd_s, uint32_t, H1_4, float32_add)
+-DO_ZPZZ_FP(sve_fadd_d, uint64_t,     , float64_add)
++DO_ZPZZ_FP(sve_fadd_d, uint64_t, H1_8, float64_add)
+ DO_ZPZZ_FP(sve_fsub_h, uint16_t, H1_2, float16_sub)
+ DO_ZPZZ_FP(sve_fsub_s, uint32_t, H1_4, float32_sub)
+-DO_ZPZZ_FP(sve_fsub_d, uint64_t,     , float64_sub)
++DO_ZPZZ_FP(sve_fsub_d, uint64_t, H1_8, float64_sub)
+ DO_ZPZZ_FP(sve_fmul_h, uint16_t, H1_2, float16_mul)
+ DO_ZPZZ_FP(sve_fmul_s, uint32_t, H1_4, float32_mul)
+-DO_ZPZZ_FP(sve_fmul_d, uint64_t,     , float64_mul)
++DO_ZPZZ_FP(sve_fmul_d, uint64_t, H1_8, float64_mul)
+ DO_ZPZZ_FP(sve_fdiv_h, uint16_t, H1_2, float16_div)
+ DO_ZPZZ_FP(sve_fdiv_s, uint32_t, H1_4, float32_div)
+-DO_ZPZZ_FP(sve_fdiv_d, uint64_t,     , float64_div)
++DO_ZPZZ_FP(sve_fdiv_d, uint64_t, H1_8, float64_div)
+ DO_ZPZZ_FP(sve_fmin_h, uint16_t, H1_2, float16_min)
+ DO_ZPZZ_FP(sve_fmin_s, uint32_t, H1_4, float32_min)
+-DO_ZPZZ_FP(sve_fmin_d, uint64_t,     , float64_min)
++DO_ZPZZ_FP(sve_fmin_d, uint64_t, H1_8, float64_min)
+ DO_ZPZZ_FP(sve_fmax_h, uint16_t, H1_2, float16_max)
+ DO_ZPZZ_FP(sve_fmax_s, uint32_t, H1_4, float32_max)
+-DO_ZPZZ_FP(sve_fmax_d, uint64_t,     , float64_max)
++DO_ZPZZ_FP(sve_fmax_d, uint64_t, H1_8, float64_max)
+ DO_ZPZZ_FP(sve_fminnum_h, uint16_t, H1_2, float16_minnum)
+ DO_ZPZZ_FP(sve_fminnum_s, uint32_t, H1_4, float32_minnum)
+-DO_ZPZZ_FP(sve_fminnum_d, uint64_t,     , float64_minnum)
++DO_ZPZZ_FP(sve_fminnum_d, uint64_t, H1_8, float64_minnum)
+ DO_ZPZZ_FP(sve_fmaxnum_h, uint16_t, H1_2, float16_maxnum)
+ DO_ZPZZ_FP(sve_fmaxnum_s, uint32_t, H1_4, float32_maxnum)
+-DO_ZPZZ_FP(sve_fmaxnum_d, uint64_t,     , float64_maxnum)
++DO_ZPZZ_FP(sve_fmaxnum_d, uint64_t, H1_8, float64_maxnum)
+ static inline float16 abd_h(float16 a, float16 b, float_status *s)
+ {
+@@ -XXX,XX +XXX,XX @@ static inline float64 abd_d(float64 a, float64 b, float_status *s)
+ DO_ZPZZ_FP(sve_fabd_h, uint16_t, H1_2, abd_h)
+ DO_ZPZZ_FP(sve_fabd_s, uint32_t, H1_4, abd_s)
+-DO_ZPZZ_FP(sve_fabd_d, uint64_t,     , abd_d)
++DO_ZPZZ_FP(sve_fabd_d, uint64_t, H1_8, abd_d)
+ static inline float64 scalbn_d(float64 a, int64_t b, float_status *s)
+ {
+@@ -XXX,XX +XXX,XX @@ static inline float64 scalbn_d(float64 a, int64_t b, float_status *s)
+ DO_ZPZZ_FP(sve_fscalbn_h, int16_t, H1_2, float16_scalbn)
+ DO_ZPZZ_FP(sve_fscalbn_s, int32_t, H1_4, float32_scalbn)
+-DO_ZPZZ_FP(sve_fscalbn_d, int64_t,     , scalbn_d)
++DO_ZPZZ_FP(sve_fscalbn_d, int64_t, H1_8, scalbn_d)
+ DO_ZPZZ_FP(sve_fmulx_h, uint16_t, H1_2, helper_advsimd_mulxh)
+ DO_ZPZZ_FP(sve_fmulx_s, uint32_t, H1_4, helper_vfp_mulxs)
+-DO_ZPZZ_FP(sve_fmulx_d, uint64_t,     , helper_vfp_mulxd)
++DO_ZPZZ_FP(sve_fmulx_d, uint64_t, H1_8, helper_vfp_mulxd)
+ #undef DO_ZPZZ_FP
+@@ -XXX,XX +XXX,XX @@ void HELPER(NAME)(void *vd, void *vn, void *vg, uint64_t scalar,  \
+ DO_ZPZS_FP(sve_fadds_h, float16, H1_2, float16_add)
+ DO_ZPZS_FP(sve_fadds_s, float32, H1_4, float32_add)
+-DO_ZPZS_FP(sve_fadds_d, float64,     , float64_add)
++DO_ZPZS_FP(sve_fadds_d, float64, H1_8, float64_add)
+ DO_ZPZS_FP(sve_fsubs_h, float16, H1_2, float16_sub)
+ DO_ZPZS_FP(sve_fsubs_s, float32, H1_4, float32_sub)
+-DO_ZPZS_FP(sve_fsubs_d, float64,     , float64_sub)
++DO_ZPZS_FP(sve_fsubs_d, float64, H1_8, float64_sub)
+ DO_ZPZS_FP(sve_fmuls_h, float16, H1_2, float16_mul)
+ DO_ZPZS_FP(sve_fmuls_s, float32, H1_4, float32_mul)
+-DO_ZPZS_FP(sve_fmuls_d, float64,     , float64_mul)
++DO_ZPZS_FP(sve_fmuls_d, float64, H1_8, float64_mul)
+ static inline float16 subr_h(float16 a, float16 b, float_status *s)
+ {
+@@ -XXX,XX +XXX,XX @@ static inline float64 subr_d(float64 a, float64 b, float_status *s)
+ DO_ZPZS_FP(sve_fsubrs_h, float16, H1_2, subr_h)
+ DO_ZPZS_FP(sve_fsubrs_s, float32, H1_4, subr_s)
+-DO_ZPZS_FP(sve_fsubrs_d, float64,     , subr_d)
++DO_ZPZS_FP(sve_fsubrs_d, float64, H1_8, subr_d)
+ DO_ZPZS_FP(sve_fmaxnms_h, float16, H1_2, float16_maxnum)
+ DO_ZPZS_FP(sve_fmaxnms_s, float32, H1_4, float32_maxnum)
+-DO_ZPZS_FP(sve_fmaxnms_d, float64,     , float64_maxnum)
++DO_ZPZS_FP(sve_fmaxnms_d, float64, H1_8, float64_maxnum)
+ DO_ZPZS_FP(sve_fminnms_h, float16, H1_2, float16_minnum)
+ DO_ZPZS_FP(sve_fminnms_s, float32, H1_4, float32_minnum)
+-DO_ZPZS_FP(sve_fminnms_d, float64,     , float64_minnum)
++DO_ZPZS_FP(sve_fminnms_d, float64, H1_8, float64_minnum)
+ DO_ZPZS_FP(sve_fmaxs_h, float16, H1_2, float16_max)
+ DO_ZPZS_FP(sve_fmaxs_s, float32, H1_4, float32_max)
+-DO_ZPZS_FP(sve_fmaxs_d, float64,     , float64_max)
++DO_ZPZS_FP(sve_fmaxs_d, float64, H1_8, float64_max)
+ DO_ZPZS_FP(sve_fmins_h, float16, H1_2, float16_min)
+ DO_ZPZS_FP(sve_fmins_s, float32, H1_4, float32_min)
+-DO_ZPZS_FP(sve_fmins_d, float64,     , float64_min)
++DO_ZPZS_FP(sve_fmins_d, float64, H1_8, float64_min)
+ /* Fully general two-operand expander, controlled by a predicate,
+  * With the extra float_status parameter.
+@@ -XXX,XX +XXX,XX @@ static inline uint64_t vfp_float64_to_uint64_rtz(float64 f, float_status *s)
+ DO_ZPZ_FP(sve_fcvt_sh, uint32_t, H1_4, sve_f32_to_f16)
+ DO_ZPZ_FP(sve_fcvt_hs, uint32_t, H1_4, sve_f16_to_f32)
+ DO_ZPZ_FP(sve_bfcvt,   uint32_t, H1_4, float32_to_bfloat16)
+-DO_ZPZ_FP(sve_fcvt_dh, uint64_t,     , sve_f64_to_f16)
+-DO_ZPZ_FP(sve_fcvt_hd, uint64_t,     , sve_f16_to_f64)
+-DO_ZPZ_FP(sve_fcvt_ds, uint64_t,     , float64_to_float32)
+-DO_ZPZ_FP(sve_fcvt_sd, uint64_t,     , float32_to_float64)
++DO_ZPZ_FP(sve_fcvt_dh, uint64_t, H1_8, sve_f64_to_f16)
++DO_ZPZ_FP(sve_fcvt_hd, uint64_t, H1_8, sve_f16_to_f64)
++DO_ZPZ_FP(sve_fcvt_ds, uint64_t, H1_8, float64_to_float32)
++DO_ZPZ_FP(sve_fcvt_sd, uint64_t, H1_8, float32_to_float64)
+ DO_ZPZ_FP(sve_fcvtzs_hh, uint16_t, H1_2, vfp_float16_to_int16_rtz)
+ DO_ZPZ_FP(sve_fcvtzs_hs, uint32_t, H1_4, helper_vfp_tosizh)
+ DO_ZPZ_FP(sve_fcvtzs_ss, uint32_t, H1_4, helper_vfp_tosizs)
+-DO_ZPZ_FP(sve_fcvtzs_hd, uint64_t,     , vfp_float16_to_int64_rtz)
+-DO_ZPZ_FP(sve_fcvtzs_sd, uint64_t,     , vfp_float32_to_int64_rtz)
+-DO_ZPZ_FP(sve_fcvtzs_ds, uint64_t,     , helper_vfp_tosizd)
+-DO_ZPZ_FP(sve_fcvtzs_dd, uint64_t,     , vfp_float64_to_int64_rtz)
++DO_ZPZ_FP(sve_fcvtzs_hd, uint64_t, H1_8, vfp_float16_to_int64_rtz)
++DO_ZPZ_FP(sve_fcvtzs_sd, uint64_t, H1_8, vfp_float32_to_int64_rtz)
++DO_ZPZ_FP(sve_fcvtzs_ds, uint64_t, H1_8, helper_vfp_tosizd)
++DO_ZPZ_FP(sve_fcvtzs_dd, uint64_t, H1_8, vfp_float64_to_int64_rtz)
+ DO_ZPZ_FP(sve_fcvtzu_hh, uint16_t, H1_2, vfp_float16_to_uint16_rtz)
+ DO_ZPZ_FP(sve_fcvtzu_hs, uint32_t, H1_4, helper_vfp_touizh)
+ DO_ZPZ_FP(sve_fcvtzu_ss, uint32_t, H1_4, helper_vfp_touizs)
+-DO_ZPZ_FP(sve_fcvtzu_hd, uint64_t,     , vfp_float16_to_uint64_rtz)
+-DO_ZPZ_FP(sve_fcvtzu_sd, uint64_t,     , vfp_float32_to_uint64_rtz)
+-DO_ZPZ_FP(sve_fcvtzu_ds, uint64_t,     , helper_vfp_touizd)
+-DO_ZPZ_FP(sve_fcvtzu_dd, uint64_t,     , vfp_float64_to_uint64_rtz)
++DO_ZPZ_FP(sve_fcvtzu_hd, uint64_t, H1_8, vfp_float16_to_uint64_rtz)
++DO_ZPZ_FP(sve_fcvtzu_sd, uint64_t, H1_8, vfp_float32_to_uint64_rtz)
++DO_ZPZ_FP(sve_fcvtzu_ds, uint64_t, H1_8, helper_vfp_touizd)
++DO_ZPZ_FP(sve_fcvtzu_dd, uint64_t, H1_8, vfp_float64_to_uint64_rtz)
+ DO_ZPZ_FP(sve_frint_h, uint16_t, H1_2, helper_advsimd_rinth)
+ DO_ZPZ_FP(sve_frint_s, uint32_t, H1_4, helper_rints)
+-DO_ZPZ_FP(sve_frint_d, uint64_t,     , helper_rintd)
++DO_ZPZ_FP(sve_frint_d, uint64_t, H1_8, helper_rintd)
+ DO_ZPZ_FP(sve_frintx_h, uint16_t, H1_2, float16_round_to_int)
+ DO_ZPZ_FP(sve_frintx_s, uint32_t, H1_4, float32_round_to_int)
+-DO_ZPZ_FP(sve_frintx_d, uint64_t,     , float64_round_to_int)
++DO_ZPZ_FP(sve_frintx_d, uint64_t, H1_8, float64_round_to_int)
+ DO_ZPZ_FP(sve_frecpx_h, uint16_t, H1_2, helper_frecpx_f16)
+ DO_ZPZ_FP(sve_frecpx_s, uint32_t, H1_4, helper_frecpx_f32)
+-DO_ZPZ_FP(sve_frecpx_d, uint64_t,     , helper_frecpx_f64)
++DO_ZPZ_FP(sve_frecpx_d, uint64_t, H1_8, helper_frecpx_f64)
+ DO_ZPZ_FP(sve_fsqrt_h, uint16_t, H1_2, float16_sqrt)
+ DO_ZPZ_FP(sve_fsqrt_s, uint32_t, H1_4, float32_sqrt)
+-DO_ZPZ_FP(sve_fsqrt_d, uint64_t,     , float64_sqrt)
++DO_ZPZ_FP(sve_fsqrt_d, uint64_t, H1_8, float64_sqrt)
+ DO_ZPZ_FP(sve_scvt_hh, uint16_t, H1_2, int16_to_float16)
+ DO_ZPZ_FP(sve_scvt_sh, uint32_t, H1_4, int32_to_float16)
+ DO_ZPZ_FP(sve_scvt_ss, uint32_t, H1_4, int32_to_float32)
+-DO_ZPZ_FP(sve_scvt_sd, uint64_t,     , int32_to_float64)
+-DO_ZPZ_FP(sve_scvt_dh, uint64_t,     , int64_to_float16)
+-DO_ZPZ_FP(sve_scvt_ds, uint64_t,     , int64_to_float32)
+-DO_ZPZ_FP(sve_scvt_dd, uint64_t,     , int64_to_float64)
++DO_ZPZ_FP(sve_scvt_sd, uint64_t, H1_8, int32_to_float64)
++DO_ZPZ_FP(sve_scvt_dh, uint64_t, H1_8, int64_to_float16)
++DO_ZPZ_FP(sve_scvt_ds, uint64_t, H1_8, int64_to_float32)
++DO_ZPZ_FP(sve_scvt_dd, uint64_t, H1_8, int64_to_float64)
+ DO_ZPZ_FP(sve_ucvt_hh, uint16_t, H1_2, uint16_to_float16)
+ DO_ZPZ_FP(sve_ucvt_sh, uint32_t, H1_4, uint32_to_float16)
+ DO_ZPZ_FP(sve_ucvt_ss, uint32_t, H1_4, uint32_to_float32)
+-DO_ZPZ_FP(sve_ucvt_sd, uint64_t,     , uint32_to_float64)
+-DO_ZPZ_FP(sve_ucvt_dh, uint64_t,     , uint64_to_float16)
+-DO_ZPZ_FP(sve_ucvt_ds, uint64_t,     , uint64_to_float32)
+-DO_ZPZ_FP(sve_ucvt_dd, uint64_t,     , uint64_to_float64)
++DO_ZPZ_FP(sve_ucvt_sd, uint64_t, H1_8, uint32_to_float64)
++DO_ZPZ_FP(sve_ucvt_dh, uint64_t, H1_8, uint64_to_float16)
++DO_ZPZ_FP(sve_ucvt_ds, uint64_t, H1_8, uint64_to_float32)
++DO_ZPZ_FP(sve_ucvt_dd, uint64_t, H1_8, uint64_to_float64)
+ static int16_t do_float16_logb_as_int(float16 a, float_status *s)
+ {
+@@ -XXX,XX +XXX,XX @@ static int64_t do_float64_logb_as_int(float64 a, float_status *s)
+ DO_ZPZ_FP(flogb_h, float16, H1_2, do_float16_logb_as_int)
+ DO_ZPZ_FP(flogb_s, float32, H1_4, do_float32_logb_as_int)
+-DO_ZPZ_FP(flogb_d, float64,     , do_float64_logb_as_int)
++DO_ZPZ_FP(flogb_d, float64, H1_8, do_float64_logb_as_int)
+ #undef DO_ZPZ_FP
+@@ -XXX,XX +XXX,XX @@ void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg,               \
+ #define DO_FPCMP_PPZZ_S(NAME, OP) \
+     DO_FPCMP_PPZZ(NAME##_s, float32, H1_4, OP)
+ #define DO_FPCMP_PPZZ_D(NAME, OP) \
+-    DO_FPCMP_PPZZ(NAME##_d, float64,     , OP)
++    DO_FPCMP_PPZZ(NAME##_d, float64, H1_8, OP)
+ #define DO_FPCMP_PPZZ_ALL(NAME, OP) \
+     DO_FPCMP_PPZZ_H(NAME, OP)   \
+@@ -XXX,XX +XXX,XX @@ void HELPER(NAME)(void *vd, void *vn, void *vg,            \
+ #define DO_FPCMP_PPZ0_S(NAME, OP) \
+     DO_FPCMP_PPZ0(NAME##_s, float32, H1_4, OP)
+ #define DO_FPCMP_PPZ0_D(NAME, OP) \
+-    DO_FPCMP_PPZ0(NAME##_d, float64,     , OP)
++    DO_FPCMP_PPZ0(NAME##_d, float64, H1_8, OP)
+ #define DO_FPCMP_PPZ0_ALL(NAME, OP) \
+     DO_FPCMP_PPZ0_H(NAME, OP)   \
+@@ -XXX,XX +XXX,XX @@ DO_LD_PRIM_1(ld1bhu, H1_2, uint16_t, uint8_t)
+ DO_LD_PRIM_1(ld1bhs, H1_2, uint16_t,  int8_t)
+ DO_LD_PRIM_1(ld1bsu, H1_4, uint32_t, uint8_t)
+ DO_LD_PRIM_1(ld1bss, H1_4, uint32_t,  int8_t)
+-DO_LD_PRIM_1(ld1bdu,     , uint64_t, uint8_t)
+-DO_LD_PRIM_1(ld1bds,     , uint64_t,  int8_t)
++DO_LD_PRIM_1(ld1bdu, H1_8, uint64_t, uint8_t)
++DO_LD_PRIM_1(ld1bds, H1_8, uint64_t,  int8_t)
+ #define DO_ST_PRIM_1(NAME, H, TE, TM)                   \
+     DO_ST_HOST(st1##NAME, H, TE, TM, stb_p)             \
+@@ -XXX,XX +XXX,XX @@ DO_LD_PRIM_1(ld1bds,     , uint64_t,  int8_t)
+ DO_ST_PRIM_1(bb,   H1,  uint8_t, uint8_t)
+ DO_ST_PRIM_1(bh, H1_2, uint16_t, uint8_t)
+ DO_ST_PRIM_1(bs, H1_4, uint32_t, uint8_t)
+-DO_ST_PRIM_1(bd,     , uint64_t, uint8_t)
++DO_ST_PRIM_1(bd, H1_8, uint64_t, uint8_t)
+ #define DO_LD_PRIM_2(NAME, H, TE, TM, LD) \
+     DO_LD_HOST(ld1##NAME##_be, H, TE, TM, LD##_be_p)    \
+@@ -XXX,XX +XXX,XX @@ DO_ST_PRIM_1(bd,     , uint64_t, uint8_t)
+ DO_LD_PRIM_2(hh,  H1_2, uint16_t, uint16_t, lduw)
+ DO_LD_PRIM_2(hsu, H1_4, uint32_t, uint16_t, lduw)
+ DO_LD_PRIM_2(hss, H1_4, uint32_t,  int16_t, lduw)
+-DO_LD_PRIM_2(hdu,     , uint64_t, uint16_t, lduw)
+-DO_LD_PRIM_2(hds,     , uint64_t,  int16_t, lduw)
++DO_LD_PRIM_2(hdu, H1_8, uint64_t, uint16_t, lduw)
++DO_LD_PRIM_2(hds, H1_8, uint64_t,  int16_t, lduw)
+ DO_ST_PRIM_2(hh, H1_2, uint16_t, uint16_t, stw)
+ DO_ST_PRIM_2(hs, H1_4, uint32_t, uint16_t, stw)
+-DO_ST_PRIM_2(hd,     , uint64_t, uint16_t, stw)
++DO_ST_PRIM_2(hd, H1_8, uint64_t, uint16_t, stw)
+ DO_LD_PRIM_2(ss,  H1_4, uint32_t, uint32_t, ldl)
+-DO_LD_PRIM_2(sdu,     , uint64_t, uint32_t, ldl)
+-DO_LD_PRIM_2(sds,     , uint64_t,  int32_t, ldl)
++DO_LD_PRIM_2(sdu, H1_8, uint64_t, uint32_t, ldl)
++DO_LD_PRIM_2(sds, H1_8, uint64_t,  int32_t, ldl)
+ DO_ST_PRIM_2(ss, H1_4, uint32_t, uint32_t, stl)
+-DO_ST_PRIM_2(sd,     , uint64_t, uint32_t, stl)
++DO_ST_PRIM_2(sd, H1_8, uint64_t, uint32_t, stl)
+-DO_LD_PRIM_2(dd,     , uint64_t, uint64_t, ldq)
+-DO_ST_PRIM_2(dd,     , uint64_t, uint64_t, stq)
++DO_LD_PRIM_2(dd, H1_8, uint64_t, uint64_t, ldq)
++DO_ST_PRIM_2(dd, H1_8, uint64_t, uint64_t, stq)
+ #undef DO_LD_TLB
+ #undef DO_ST_TLB
+@@ -XXX,XX +XXX,XX @@ void HELPER(NAME)(void *vd, void *vn, void *vg, void *status, uint32_t desc)  \
+ DO_FCVTNT(sve_bfcvtnt,    uint32_t, uint16_t, H1_4, H1_2, float32_to_bfloat16)
+ DO_FCVTNT(sve2_fcvtnt_sh, uint32_t, uint16_t, H1_4, H1_2, sve_f32_to_f16)
+-DO_FCVTNT(sve2_fcvtnt_ds, uint64_t, uint32_t,     , H1_4, float64_to_float32)
++DO_FCVTNT(sve2_fcvtnt_ds, uint64_t, uint32_t, H1_8, H1_4, float64_to_float32)
+ #define DO_FCVTLT(NAME, TYPEW, TYPEN, HW, HN, OP)                             \
+ void HELPER(NAME)(void *vd, void *vn, void *vg, void *status, uint32_t desc)  \
+@@ -XXX,XX +XXX,XX @@ void HELPER(NAME)(void *vd, void *vn, void *vg, void *status, uint32_t desc)  \
+ }
+ DO_FCVTLT(sve2_fcvtlt_hs, uint32_t, uint16_t, H1_4, H1_2, sve_f16_to_f32)
+-DO_FCVTLT(sve2_fcvtlt_sd, uint64_t, uint32_t,     , H1_4, float32_to_float64)
++DO_FCVTLT(sve2_fcvtlt_sd, uint64_t, uint32_t, H1_8, H1_4, float32_to_float64)
+ #undef DO_FCVTLT
+ #undef DO_FCVTNT
+diff --git a/target/arm/vec_helper.c b/target/arm/vec_helper.c
+index XXXXXXX..XXXXXXX 100644
+--- a/target/arm/vec_helper.c
++++ b/target/arm/vec_helper.c
+@@ -XXX,XX +XXX,XX @@ DO_DOT_IDX(gvec_sdot_idx_b, int32_t, int8_t, int8_t, H4)
+ DO_DOT_IDX(gvec_udot_idx_b, uint32_t, uint8_t, uint8_t, H4)
+ DO_DOT_IDX(gvec_sudot_idx_b, int32_t, int8_t, uint8_t, H4)
+ DO_DOT_IDX(gvec_usdot_idx_b, int32_t, uint8_t, int8_t, H4)
+-DO_DOT_IDX(gvec_sdot_idx_h, int64_t, int16_t, int16_t, )
+-DO_DOT_IDX(gvec_udot_idx_h, uint64_t, uint16_t, uint16_t, )
++DO_DOT_IDX(gvec_sdot_idx_h, int64_t, int16_t, int16_t, H8)
++DO_DOT_IDX(gvec_udot_idx_h, uint64_t, uint16_t, uint16_t, H8)
+ void HELPER(gvec_fcaddh)(void *vd, void *vn, void *vm,
+                          void *vfpst, uint32_t desc)
+@@ -XXX,XX +XXX,XX @@ void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
+ DO_MUL_IDX(gvec_mul_idx_h, uint16_t, H2)
+ DO_MUL_IDX(gvec_mul_idx_s, uint32_t, H4)
+-DO_MUL_IDX(gvec_mul_idx_d, uint64_t, )
++DO_MUL_IDX(gvec_mul_idx_d, uint64_t, H8)
+ #undef DO_MUL_IDX
+@@ -XXX,XX +XXX,XX @@ void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc)   \
+ DO_MLA_IDX(gvec_mla_idx_h, uint16_t, +, H2)
+ DO_MLA_IDX(gvec_mla_idx_s, uint32_t, +, H4)
+-DO_MLA_IDX(gvec_mla_idx_d, uint64_t, +,   )
++DO_MLA_IDX(gvec_mla_idx_d, uint64_t, +, H8)
+ DO_MLA_IDX(gvec_mls_idx_h, uint16_t, -, H2)
+ DO_MLA_IDX(gvec_mls_idx_s, uint32_t, -, H4)
+-DO_MLA_IDX(gvec_mls_idx_d, uint64_t, -,   )
++DO_MLA_IDX(gvec_mls_idx_d, uint64_t, -, H8)
+ #undef DO_MLA_IDX
+@@ -XXX,XX +XXX,XX @@ void HELPER(NAME)(void *vd, void *vn, void *vm, void *stat, uint32_t desc) \
+ DO_FMUL_IDX(gvec_fmul_idx_h, nop, float16, H2)
+ DO_FMUL_IDX(gvec_fmul_idx_s, nop, float32, H4)
+-DO_FMUL_IDX(gvec_fmul_idx_d, nop, float64, )
++DO_FMUL_IDX(gvec_fmul_idx_d, nop, float64, H8)
+ /*
+  * Non-fused multiply-accumulate operations, for Neon. NB that unlike
+@@ -XXX,XX +XXX,XX @@ void HELPER(NAME)(void *vd, void *vn, void *vm, void *va,                  \
+ DO_FMLA_IDX(gvec_fmla_idx_h, float16, H2)
+ DO_FMLA_IDX(gvec_fmla_idx_s, float32, H4)
+-DO_FMLA_IDX(gvec_fmla_idx_d, float64, )
++DO_FMLA_IDX(gvec_fmla_idx_d, float64, H8)
+ #undef DO_FMLA_IDX
+--
+2.20.1

-New patch
+[PULL 17/28] target/arm: Enable FPSCR.QC bit for MVE
+MVE has an FPSCR.QC bit similar to the A-profile Neon one; when MVE
+is implemented make the bit writeable, both in the generic "load and
+store FPSCR" helper functions and in the code for handling the NZCVQC
+sysreg which we had previously left as "TODO when we implement MVE".
+Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
+Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
+Message-id: 20210614151007.4545-3-peter.maydell@linaro.org
+---
+ target/arm/translate-vfp.c | 30 +++++++++++++++++++++---------
+ target/arm/vfp_helper.c    |  3 ++-
+ 2 files changed, 23 insertions(+), 10 deletions(-)
+diff --git a/target/arm/translate-vfp.c b/target/arm/translate-vfp.c
+index XXXXXXX..XXXXXXX 100644
+--- a/target/arm/translate-vfp.c
++++ b/target/arm/translate-vfp.c
+@@ -XXX,XX +XXX,XX @@ static bool gen_M_fp_sysreg_write(DisasContext *s, int regno,
+     {
+         TCGv_i32 fpscr;
+         tmp = loadfn(s, opaque);
+-        /*
+-         * TODO: when we implement MVE, write the QC bit.
+-         * For non-MVE, QC is RES0.
+-         */
++        if (dc_isar_feature(aa32_mve, s)) {
++            /* QC is only present for MVE; otherwise RES0 */
++            TCGv_i32 qc = tcg_temp_new_i32();
++            tcg_gen_andi_i32(qc, tmp, FPCR_QC);
++            /*
++             * The 4 vfp.qc[] fields need only be "zero" vs "non-zero";
++             * here writing the same value into all elements is simplest.
++             */
++            tcg_gen_gvec_dup_i32(MO_32, offsetof(CPUARMState, vfp.qc),
++                                 16, 16, qc);
++        }
+         tcg_gen_andi_i32(tmp, tmp, FPCR_NZCV_MASK);
+         fpscr = load_cpu_field(vfp.xregs[ARM_VFP_FPSCR]);
+         tcg_gen_andi_i32(fpscr, fpscr, ~FPCR_NZCV_MASK);
+@@ -XXX,XX +XXX,XX @@ static bool gen_M_fp_sysreg_read(DisasContext *s, int regno,
+         break;
+     }
++    if (regno == ARM_VFP_FPSCR_NZCVQC && !dc_isar_feature(aa32_mve, s)) {
++        /* QC is RES0 without MVE, so NZCVQC simplifies to NZCV */
++        regno = QEMU_VFP_FPSCR_NZCV;
++    }
++
+     switch (regno) {
+     case ARM_VFP_FPSCR:
+         tmp = tcg_temp_new_i32();
+@@ -XXX,XX +XXX,XX @@ static bool gen_M_fp_sysreg_read(DisasContext *s, int regno,
+         storefn(s, opaque, tmp);
+         break;
+     case ARM_VFP_FPSCR_NZCVQC:
+-        /*
+-         * TODO: MVE has a QC bit, which we probably won't store
+-         * in the xregs[] field. For non-MVE, where QC is RES0,
+-         * we can just fall through to the FPSCR_NZCV case.
+-         */
++        tmp = tcg_temp_new_i32();
++        gen_helper_vfp_get_fpscr(tmp, cpu_env);
++        tcg_gen_andi_i32(tmp, tmp, FPCR_NZCVQC_MASK);
++        storefn(s, opaque, tmp);
++        break;
+     case QEMU_VFP_FPSCR_NZCV:
+         /*
+          * Read just NZCV; this is a special case to avoid the
+diff --git a/target/arm/vfp_helper.c b/target/arm/vfp_helper.c
+index XXXXXXX..XXXXXXX 100644
+--- a/target/arm/vfp_helper.c
++++ b/target/arm/vfp_helper.c
+@@ -XXX,XX +XXX,XX @@ void HELPER(vfp_set_fpscr)(CPUARMState *env, uint32_t val)
+                                      FPCR_LTPSIZE_LENGTH);
+     }
+-    if (arm_feature(env, ARM_FEATURE_NEON)) {
++    if (arm_feature(env, ARM_FEATURE_NEON) ||
++        cpu_isar_feature(aa32_mve, cpu)) {
+         /*
+          * The bit we set within fpscr_q is arbitrary; the register as a
+          * whole being zero/non-zero is what counts.
+--
+2.20.1

-New patch
+[PULL 18/28] target/arm: Handle VPR semantics in existing code
+When MVE is supported, the VPR register has a place on the exception
 stack frame in a previously reserved slot just above the FPSCR.
 It must also be zeroed in various situations when we invalidate
 FPU context.
 Update the code which handles the stack frames (exception entry and
 exit code, VLLDM, and VLSTM) to save/restore VPR.
 Update code which invalidates FP registers (mostly also exception
 entry and exit code, but also VSCCLRM and the code in
 full_vfp_access_check() that corresponds to the ExecuteFPCheck()
 pseudocode) to zero VPR.
 Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
 Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
 Message-id: 20210614151007.4545-4-peter.maydell@linaro.org
 ---
  target/arm/m_helper.c         | 54 +++++++++++++++++++++++++++++------
  target/arm/translate-m-nocp.c |  5 +++-
  target/arm/translate-vfp.c    |  9 ++++--
  3 files changed, 57 insertions(+), 11 deletions(-)
 diff --git a/target/arm/m_helper.c b/target/arm/m_helper.c
 index XXXXXXX..XXXXXXX 100644
 --- a/target/arm/m_helper.c
 +++ b/target/arm/m_helper.c
@@ -XXX,XX +XXX,XX @@ void HELPER(v7m_preserve_fp_state)(CPUARMState *env)
              uint32_t shi = extract64(dn, 32, 32);
              if (i >= 16) {
 -                faddr += 8; /* skip the slot for the FPSCR */
 +                faddr += 8; /* skip the slot for the FPSCR/VPR */
              }
              stacked_ok = stacked_ok &&
                  v7m_stack_write(cpu, faddr, slo, mmu_idx, STACK_LAZYFP) &&
@@ -XXX,XX +XXX,XX @@ void HELPER(v7m_preserve_fp_state)(CPUARMState *env)
          stacked_ok = stacked_ok &&
              v7m_stack_write(cpu, fpcar + 0x40,
                              vfp_get_fpscr(env), mmu_idx, STACK_LAZYFP);
 +        if (cpu_isar_feature(aa32_mve, cpu)) {
 +            stacked_ok = stacked_ok &&
 +                v7m_stack_write(cpu, fpcar + 0x44,
 +                                env->v7m.vpr, mmu_idx, STACK_LAZYFP);
 +        }
      }
      /*
@@ -XXX,XX +XXX,XX @@ void HELPER(v7m_preserve_fp_state)(CPUARMState *env)
      env->v7m.fpccr[is_secure] &= ~R_V7M_FPCCR_LSPACT_MASK;
      if (ts) {
 -        /* Clear s0 to s31 and the FPSCR */
 +        /* Clear s0 to s31 and the FPSCR and VPR */
          int i;
          for (i = 0; i < 32; i += 2) {
              *aa32_vfp_dreg(env, i / 2) = 0;
          }
          vfp_set_fpscr(env, 0);
 +        if (cpu_isar_feature(aa32_mve, cpu)) {
 +            env->v7m.vpr = 0;
 +        }
      }
      /*
 -     * Otherwise s0 to s15 and FPSCR are UNKNOWN; we choose to leave them
 +     * Otherwise s0 to s15, FPSCR and VPR are UNKNOWN; we choose to leave them
       * unchanged.
       */
  }
@@ -XXX,XX +XXX,XX @@ static void v7m_update_fpccr(CPUARMState *env, uint32_t frameptr,
  void HELPER(v7m_vlstm)(CPUARMState *env, uint32_t fptr)
  {
      /* fptr is the value of Rn, the frame pointer we store the FP regs to */
 +    ARMCPU *cpu = env_archcpu(env);
      bool s = env->v7m.fpccr[M_REG_S] & R_V7M_FPCCR_S_MASK;
      bool lspact = env->v7m.fpccr[s] & R_V7M_FPCCR_LSPACT_MASK;
      uintptr_t ra = GETPC();
@@ -XXX,XX +XXX,XX @@ void HELPER(v7m_vlstm)(CPUARMState *env, uint32_t fptr)
              cpu_stl_data_ra(env, faddr + 4, shi, ra);
          }
          cpu_stl_data_ra(env, fptr + 0x40, vfp_get_fpscr(env), ra);
 +        if (cpu_isar_feature(aa32_mve, cpu)) {
 +            cpu_stl_data_ra(env, fptr + 0x44, env->v7m.vpr, ra);
 +        }
          /*
 -         * If TS is 0 then s0 to s15 and FPSCR are UNKNOWN; we choose to
 +         * If TS is 0 then s0 to s15, FPSCR and VPR are UNKNOWN; we choose to
           * leave them unchanged, matching our choice in v7m_preserve_fp_state.
           */
          if (ts) {
@@ -XXX,XX +XXX,XX @@ void HELPER(v7m_vlstm)(CPUARMState *env, uint32_t fptr)
                  *aa32_vfp_dreg(env, i / 2) = 0;
              }
              vfp_set_fpscr(env, 0);
 +            if (cpu_isar_feature(aa32_mve, cpu)) {
 +                env->v7m.vpr = 0;
 +            }
          }
      } else {
          v7m_update_fpccr(env, fptr, false);
@@ -XXX,XX +XXX,XX @@ void HELPER(v7m_vlstm)(CPUARMState *env, uint32_t fptr)
  void HELPER(v7m_vlldm)(CPUARMState *env, uint32_t fptr)
  {
 +    ARMCPU *cpu = env_archcpu(env);
      uintptr_t ra = GETPC();
      /* fptr is the value of Rn, the frame pointer we load the FP regs from */
@@ -XXX,XX +XXX,XX @@ void HELPER(v7m_vlldm)(CPUARMState *env, uint32_t fptr)
              uint32_t faddr = fptr + 4 * i;
              if (i >= 16) {
 -                faddr += 8; /* skip the slot for the FPSCR */
 +                faddr += 8; /* skip the slot for the FPSCR and VPR */
              }
              slo = cpu_ldl_data_ra(env, faddr, ra);
@@ -XXX,XX +XXX,XX @@ void HELPER(v7m_vlldm)(CPUARMState *env, uint32_t fptr)
          }
          fpscr = cpu_ldl_data_ra(env, fptr + 0x40, ra);
          vfp_set_fpscr(env, fpscr);
 +        if (cpu_isar_feature(aa32_mve, cpu)) {
 +            env->v7m.vpr = cpu_ldl_data_ra(env, fptr + 0x44, ra);
 +        }
      }
      env->v7m.control[M_REG_S] |= R_V7M_CONTROL_FPCA_MASK;
@@ -XXX,XX +XXX,XX @@ static bool v7m_push_stack(ARMCPU *cpu)
                      uint32_t shi = extract64(dn, 32, 32);
                      if (i >= 16) {
 -                        faddr += 8; /* skip the slot for the FPSCR */
 +                        faddr += 8; /* skip the slot for the FPSCR and VPR */
                      }
                      stacked_ok = stacked_ok &&
                          v7m_stack_write(cpu, faddr, slo,
@@ -XXX,XX +XXX,XX @@ static bool v7m_push_stack(ARMCPU *cpu)
                  stacked_ok = stacked_ok &&
                      v7m_stack_write(cpu, frameptr + 0x60,
                                      vfp_get_fpscr(env), mmu_idx, STACK_NORMAL);
 +                if (cpu_isar_feature(aa32_mve, cpu)) {
 +                    stacked_ok = stacked_ok &&
 +                        v7m_stack_write(cpu, frameptr + 0x64,
 +                                        env->v7m.vpr, mmu_idx, STACK_NORMAL);
 +                }
                  if (cpacr_pass) {
                      for (i = 0; i < ((framesize == 0xa8) ? 32 : 16); i += 2) {
                          *aa32_vfp_dreg(env, i / 2) = 0;
                      }
                      vfp_set_fpscr(env, 0);
 +                    if (cpu_isar_feature(aa32_mve, cpu)) {
 +                        env->v7m.vpr = 0;
 +                    }
                  }
              } else {
                  /* Lazy stacking enabled, save necessary info to stack later */
@@ -XXX,XX +XXX,XX @@ static void do_v7m_exception_exit(ARMCPU *cpu)
                      v7m_exception_taken(cpu, excret, true, false);
                  }
              }
 -            /* Clear s0..s15 and FPSCR; TODO also VPR when MVE is implemented */
 +            /* Clear s0..s15, FPSCR and VPR */
              int i;
              for (i = 0; i < 16; i += 2) {
                  *aa32_vfp_dreg(env, i / 2) = 0;
              }
              vfp_set_fpscr(env, 0);
 +            if (cpu_isar_feature(aa32_mve, cpu)) {
 +                env->v7m.vpr = 0;
 +            }
          }
      }
@@ -XXX,XX +XXX,XX @@ static void do_v7m_exception_exit(ARMCPU *cpu)
                      uint32_t faddr = frameptr + 0x20 + 4 * i;
                      if (i >= 16) {
 -                        faddr += 8; /* Skip the slot for the FPSCR */
 +                        faddr += 8; /* Skip the slot for the FPSCR and VPR */
                      }
                      pop_ok = pop_ok &&
@@ -XXX,XX +XXX,XX @@ static void do_v7m_exception_exit(ARMCPU *cpu)
                  if (pop_ok) {
                      vfp_set_fpscr(env, fpscr);
                  }
 +                if (cpu_isar_feature(aa32_mve, cpu)) {
 +                    pop_ok = pop_ok &&
 +                        v7m_stack_read(cpu, &env->v7m.vpr,
 +                                       frameptr + 0x64, mmu_idx);
 +                }
                  if (!pop_ok) {
                      /*
                       * These regs are 0 if security extension present;
@@ -XXX,XX +XXX,XX @@ static void do_v7m_exception_exit(ARMCPU *cpu)
                          *aa32_vfp_dreg(env, i / 2) = 0;
                      }
                      vfp_set_fpscr(env, 0);
 +                    if (cpu_isar_feature(aa32_mve, cpu)) {
 +                        env->v7m.vpr = 0;
 +                    }
                  }
              }
          }
 diff --git a/target/arm/translate-m-nocp.c b/target/arm/translate-m-nocp.c
 index XXXXXXX..XXXXXXX 100644
 --- a/target/arm/translate-m-nocp.c
 +++ b/target/arm/translate-m-nocp.c
@@ -XXX,XX +XXX,XX @@ static bool trans_VSCCLRM(DisasContext *s, arg_VSCCLRM *a)
          btmreg++;
      }
      assert(btmreg == topreg + 1);
 -    /* TODO: when MVE is implemented, zero VPR here */
 +    if (dc_isar_feature(aa32_mve, s)) {
 +        TCGv_i32 z32 = tcg_const_i32(0);
 +        store_cpu_field(z32, v7m.vpr);
 +    }
      return true;
  }
 diff --git a/target/arm/translate-vfp.c b/target/arm/translate-vfp.c
 index XXXXXXX..XXXXXXX 100644
 --- a/target/arm/translate-vfp.c
 +++ b/target/arm/translate-vfp.c
@@ -XXX,XX +XXX,XX @@ static bool full_vfp_access_check(DisasContext *s, bool ignore_vfp_enabled)
          if (s->v7m_new_fp_ctxt_needed) {
              /*
 -             * Create new FP context by updating CONTROL.FPCA, CONTROL.SFPA
 -             * and the FPSCR.
 +             * Create new FP context by updating CONTROL.FPCA, CONTROL.SFPA,
 +             * the FPSCR, and VPR.
               */
              TCGv_i32 control, fpscr;
              uint32_t bits = R_V7M_CONTROL_FPCA_MASK;
@@ -XXX,XX +XXX,XX @@ static bool full_vfp_access_check(DisasContext *s, bool ignore_vfp_enabled)
              fpscr = load_cpu_field(v7m.fpdscr[s->v8m_secure]);
              gen_helper_vfp_set_fpscr(cpu_env, fpscr);
              tcg_temp_free_i32(fpscr);
 +            if (dc_isar_feature(aa32_mve, s)) {
 +                TCGv_i32 z32 = tcg_const_i32(0);
 +                store_cpu_field(z32, v7m.vpr);
 +            }
 +
              /*
               * We don't need to arrange to end the TB, because the only
               * parts of FPSCR which we cache in the TB flags are the VECLEN
 --
 2.20.1

-New patch
+[PULL 19/28] target/arm: Add handling for PSR.ECI/ICI
+On A-profile, PSR bits [15:10][26:25] are always the IT state bits.
 On M-profile, some of the reserved encodings of the IT state are used
 to instead indicate partial progress through instructions that were
 interrupted partway through by an exception and can be resumed.
 These resumable instructions fall into two categories:
 (1) load/store multiple instructions, where these bits are called
 "ICI" and specify the register in the ldm/stm list where execution
 should resume.  (Specifically: LDM, STM, VLDM, VSTM, VLLDM, VLSTM,
 CLRM, VSCCLRM.)
 (2) MVE instructions subject to beatwise execution, where these bits
 are called "ECI" and specify which beats in this and possibly also
 the following MVE insn have been executed.
 There are also a few insns (LE, LETP, and BKPT) which do not use the
 ICI/ECI bits but must leave them alone.
 Otherwise, we should raise an INVSTATE UsageFault for any attempt to
 execute an insn with non-zero ICI/ECI bits.
 So far we have been able to ignore ECI/ICI, because the architecture
 allows the IMPDEF choice of "always restart load/store multiple from
 the beginning regardless of ICI state", so the only thing we have
 been missing is that we don't raise the INVSTATE fault for bad guest
 code.  However, MVE requires that we honour ECI bits and do not
 re-execute beats of an insn that have already been executed.
 Add the support in the decoder for handling ECI/ICI:
  * identify the ECI/ICI case in the CONDEXEC TB flags
  * when a load/store multiple insn succeeds, it updates the ECI/ICI
    state (both in DisasContext and in the CPU state), and sets a flag
    to say that the ECI/ICI state was handled
  * if we find that the insn we just decoded did not handle the
    ECI/ICI state, we delete all the code that we just generated for
    it and instead emit the code to raise the INVSTATE UsageFault.  This allows
    us to avoid having to update every non-MVE non-LDM/STM insn to
    make it check for "is ECI/ICI set?".
 We continue with our existing IMPDEF choice of not caring about the
 ICI state for the load/store multiples and simply restarting them
 from the beginning.  Because we don't allow interrupts in the middle
 of an insn, the only way we would see this state is if the guest set
 ICI manually on return from an exception handler, so it's a corner
 case which doesn't merit optimisation.
 ICI update for LDM/STM is simple -- it always zeroes the state.  ECI
 update for MVE beatwise insns will be a little more complex, since
 the ECI state may include information for the following insn.
 Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
 Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
 Message-id: 20210614151007.4545-5-peter.maydell@linaro.org
 ---
  target/arm/translate-a32.h    |   1 +
  target/arm/translate.h        |   9 +++
  target/arm/translate-m-nocp.c |  11 ++++
  target/arm/translate-vfp.c    |   6 ++
  target/arm/translate.c        | 111 ++++++++++++++++++++++++++++++++--
 files changed, 133 insertions(+), 5 deletions(-)
 diff --git a/target/arm/translate-a32.h b/target/arm/translate-a32.h
 index XXXXXXX..XXXXXXX 100644
 --- a/target/arm/translate-a32.h
 +++ b/target/arm/translate-a32.h
@@ -XXX,XX +XXX,XX @@ long vfp_reg_offset(bool dp, unsigned reg);
  long neon_full_reg_offset(unsigned reg);
  long neon_element_offset(int reg, int element, MemOp memop);
  void gen_rev16(TCGv_i32 dest, TCGv_i32 var);
 +void clear_eci_state(DisasContext *s);
  static inline TCGv_i32 load_cpu_offset(int offset)
  {
 diff --git a/target/arm/translate.h b/target/arm/translate.h
 index XXXXXXX..XXXXXXX 100644
 --- a/target/arm/translate.h
 +++ b/target/arm/translate.h
@@ -XXX,XX +XXX,XX @@ typedef struct DisasContext {
      /* Thumb-2 conditional execution bits.  */
      int condexec_mask;
      int condexec_cond;
 +    /* M-profile ECI/ICI exception-continuable instruction state */
 +    int eci;
 +    /*
 +     * trans_ functions for insns which are continuable should set this true
 +     * after decode (ie after any UNDEF checks)
 +     */
 +    bool eci_handled;
 +    /* TCG op to rewind to if this turns out to be an invalid ECI state */
 +    TCGOp *insn_eci_rewind;
      int thumb;
      int sctlr_b;
      MemOp be_data;
 diff --git a/target/arm/translate-m-nocp.c b/target/arm/translate-m-nocp.c
 index XXXXXXX..XXXXXXX 100644
 --- a/target/arm/translate-m-nocp.c
 +++ b/target/arm/translate-m-nocp.c
@@ -XXX,XX +XXX,XX @@ static bool trans_VLLDM_VLSTM(DisasContext *s, arg_VLLDM_VLSTM *a)
          unallocated_encoding(s);
          return true;
      }
 +
 +    s->eci_handled = true;
 +
      /* If no fpu, NOP. */
      if (!dc_isar_feature(aa32_vfp, s)) {
 +        clear_eci_state(s);
          return true;
      }
@@ -XXX,XX +XXX,XX @@ static bool trans_VLLDM_VLSTM(DisasContext *s, arg_VLLDM_VLSTM *a)
      }
      tcg_temp_free_i32(fptr);
 +    clear_eci_state(s);
 +
      /* End the TB, because we have updated FP control bits */
      s->base.is_jmp = DISAS_UPDATE_EXIT;
      return true;
@@ -XXX,XX +XXX,XX @@ static bool trans_VSCCLRM(DisasContext *s, arg_VSCCLRM *a)
          return true;
      }
 +    s->eci_handled = true;
 +
      if (!dc_isar_feature(aa32_vfp_simd, s)) {
          /* NOP if we have neither FP nor MVE */
 +        clear_eci_state(s);
          return true;
      }
@@ -XXX,XX +XXX,XX @@ static bool trans_VSCCLRM(DisasContext *s, arg_VSCCLRM *a)
          TCGv_i32 z32 = tcg_const_i32(0);
          store_cpu_field(z32, v7m.vpr);
      }
 +
 +    clear_eci_state(s);
      return true;
  }
 diff --git a/target/arm/translate-vfp.c b/target/arm/translate-vfp.c
 index XXXXXXX..XXXXXXX 100644
 --- a/target/arm/translate-vfp.c
 +++ b/target/arm/translate-vfp.c
@@ -XXX,XX +XXX,XX @@ static bool trans_VLDM_VSTM_sp(DisasContext *s, arg_VLDM_VSTM_sp *a)
          return false;
      }
 +    s->eci_handled = true;
 +
      if (!vfp_access_check(s)) {
          return true;
      }
@@ -XXX,XX +XXX,XX @@ static bool trans_VLDM_VSTM_sp(DisasContext *s, arg_VLDM_VSTM_sp *a)
          tcg_temp_free_i32(addr);
      }
 +    clear_eci_state(s);
      return true;
  }
@@ -XXX,XX +XXX,XX @@ static bool trans_VLDM_VSTM_dp(DisasContext *s, arg_VLDM_VSTM_dp *a)
          return false;
      }
 +    s->eci_handled = true;
 +
      if (!vfp_access_check(s)) {
          return true;
      }
@@ -XXX,XX +XXX,XX @@ static bool trans_VLDM_VSTM_dp(DisasContext *s, arg_VLDM_VSTM_dp *a)
          tcg_temp_free_i32(addr);
      }
 +    clear_eci_state(s);
      return true;
  }
 diff --git a/target/arm/translate.c b/target/arm/translate.c
 index XXXXXXX..XXXXXXX 100644
 --- a/target/arm/translate.c
 +++ b/target/arm/translate.c
@@ -XXX,XX +XXX,XX @@ static inline bool is_singlestepping(DisasContext *s)
      return s->base.singlestep_enabled || s->ss_active;
  }
 +void clear_eci_state(DisasContext *s)
 +{
 +    /*
 +     * Clear any ECI/ICI state: used when a load multiple/store
 +     * multiple insn executes.
 +     */
 +    if (s->eci) {
 +        TCGv_i32 tmp = tcg_const_i32(0);
 +        store_cpu_field(tmp, condexec_bits);
 +        s->eci = 0;
 +    }
 +}
 +
  static void gen_smul_dual(TCGv_i32 a, TCGv_i32 b)
  {
      TCGv_i32 tmp1 = tcg_temp_new_i32();
@@ -XXX,XX +XXX,XX @@ static bool trans_BKPT(DisasContext *s, arg_BKPT *a)
      if (!ENABLE_ARCH_5) {
          return false;
      }
 +    /* BKPT is OK with ECI set and leaves it untouched */
 +    s->eci_handled = true;
      if (arm_dc_feature(s, ARM_FEATURE_M) &&
          semihosting_enabled() &&
  #ifndef CONFIG_USER_ONLY
@@ -XXX,XX +XXX,XX @@ static bool op_stm(DisasContext *s, arg_ldst_block *a, int min_n)
          return true;
      }
 +    s->eci_handled = true;
 +
      addr = op_addr_block_pre(s, a, n);
      mem_idx = get_mem_index(s);
@@ -XXX,XX +XXX,XX @@ static bool op_stm(DisasContext *s, arg_ldst_block *a, int min_n)
      }
      op_addr_block_post(s, a, addr, n);
 +    clear_eci_state(s);
      return true;
  }
@@ -XXX,XX +XXX,XX @@ static bool do_ldm(DisasContext *s, arg_ldst_block *a, int min_n)
          return true;
      }
 +    s->eci_handled = true;
 +
      addr = op_addr_block_pre(s, a, n);
      mem_idx = get_mem_index(s);
      loaded_base = false;
@@ -XXX,XX +XXX,XX @@ static bool do_ldm(DisasContext *s, arg_ldst_block *a, int min_n)
          /* Must exit loop to check un-masked IRQs */
          s->base.is_jmp = DISAS_EXIT;
      }
 +    clear_eci_state(s);
      return true;
  }
@@ -XXX,XX +XXX,XX @@ static bool trans_CLRM(DisasContext *s, arg_CLRM *a)
          return false;
      }
 +    s->eci_handled = true;
 +
      zero = tcg_const_i32(0);
      for (i = 0; i < 15; i++) {
          if (extract32(a->list, i, 1)) {
@@ -XXX,XX +XXX,XX @@ static bool trans_CLRM(DisasContext *s, arg_CLRM *a)
          tcg_temp_free_i32(maskreg);
      }
      tcg_temp_free_i32(zero);
 +    clear_eci_state(s);
      return true;
  }
@@ -XXX,XX +XXX,XX @@ static bool trans_LE(DisasContext *s, arg_LE *a)
          return false;
      }
 +    /* LE/LETP is OK with ECI set and leaves it untouched */
 +    s->eci_handled = true;
 +
      if (!a->f) {
          /* Not loop-forever. If LR <= 1 this is the last loop: do nothing. */
          arm_gen_condlabel(s);
@@ -XXX,XX +XXX,XX @@ static void arm_tr_init_disas_context(DisasContextBase *dcbase, CPUState *cs)
      dc->thumb = EX_TBFLAG_AM32(tb_flags, THUMB);
      dc->be_data = EX_TBFLAG_ANY(tb_flags, BE_DATA) ? MO_BE : MO_LE;
      condexec = EX_TBFLAG_AM32(tb_flags, CONDEXEC);
 -    dc->condexec_mask = (condexec & 0xf) << 1;
 -    dc->condexec_cond = condexec >> 4;
 +    /*
 +     * the CONDEXEC TB flags are CPSR bits [15:10][26:25]. On A-profile this
 +     * is always the IT bits. On M-profile, some of the reserved encodings
 +     * of IT are used instead to indicate either ICI or ECI, which
 +     * indicate partial progress of a restartable insn that was interrupted
 +     * partway through by an exception:
 +     *  * if CONDEXEC[3:0] != 0b0000 : CONDEXEC is IT bits
 +     *  * if CONDEXEC[3:0] == 0b0000 : CONDEXEC is ICI or ECI bits
 +     * In all cases CONDEXEC == 0 means "not in IT block or restartable
 +     * insn, behave normally".
 +     */
 +    dc->eci = dc->condexec_mask = dc->condexec_cond = 0;
 +    dc->eci_handled = false;
 +    dc->insn_eci_rewind = NULL;
 +    if (condexec & 0xf) {
 +        dc->condexec_mask = (condexec & 0xf) << 1;
 +        dc->condexec_cond = condexec >> 4;
 +    } else {
 +        if (arm_feature(env, ARM_FEATURE_M)) {
 +            dc->eci = condexec >> 4;
 +        }
 +    }
      core_mmu_idx = EX_TBFLAG_ANY(tb_flags, MMUIDX);
      dc->mmu_idx = core_to_arm_mmu_idx(env, core_mmu_idx);
@@ -XXX,XX +XXX,XX @@ static void arm_tr_tb_start(DisasContextBase *dcbase, CPUState *cpu)
  static void arm_tr_insn_start(DisasContextBase *dcbase, CPUState *cpu)
  {
      DisasContext *dc = container_of(dcbase, DisasContext, base);
 +    /*
 +     * The ECI/ICI bits share PSR bits with the IT bits, so we
 +     * need to reconstitute the bits from the split-out DisasContext
 +     * fields here.
 +     */
 +    uint32_t condexec_bits;
 -    tcg_gen_insn_start(dc->base.pc_next,
 -                       (dc->condexec_cond << 4) | (dc->condexec_mask >> 1),
 -                       0);
 +    if (dc->eci) {
 +        condexec_bits = dc->eci << 4;
 +    } else {
 +        condexec_bits = (dc->condexec_cond << 4) | (dc->condexec_mask >> 1);
 +    }
 +    tcg_gen_insn_start(dc->base.pc_next, condexec_bits, 0);
      dc->insn_start = tcg_last_op();
  }
@@ -XXX,XX +XXX,XX @@ static void thumb_tr_translate_insn(DisasContextBase *dcbase, CPUState *cpu)
      }
      dc->insn = insn;
 +    if (dc->eci) {
 +        /*
 +         * For M-profile continuable instructions, ECI/ICI handling
 +         * falls into these cases:
 +         *  - interrupt-continuable instructions
 +         *     These are the various load/store multiple insns (both
 +         *     integer and fp). The ICI bits indicate the register
 +         *     where the load/store can resume. We make the IMPDEF
 +         *     choice to always do "instruction restart", ie ignore
 +         *     the ICI value and always execute the ldm/stm from the
 +         *     start. So all we need to do is zero PSR.ICI if the
 +         *     insn executes.
 +         *  - MVE instructions subject to beat-wise execution
 +         *     Here the ECI bits indicate which beats have already been
 +         *     executed, and we must honour this. Each insn of this
 +         *     type will handle it correctly. We will update PSR.ECI
 +         *     in the helper function for the insn (some ECI values
 +         *     mean that the following insn also has been partially
 +         *     executed).
 +         *  - Special cases which don't advance ECI
 +         *     The insns LE, LETP and BKPT leave the ECI/ICI state
 +         *     bits untouched.
 +         *  - all other insns (the common case)
 +         *     Non-zero ECI/ICI means an INVSTATE UsageFault.
 +         *     We place a rewind-marker here. Insns in the previous
 +         *     three categories will set a flag in the DisasContext.
 +         *     If the flag isn't set after we call disas_thumb_insn()
 +         *     or disas_thumb2_insn() then we know we have a "some other
 +         *     insn" case. We will rewind to the marker (ie throwing away
 +         *     all the generated code) and instead emit "take exception".
 +         */
 +        dc->insn_eci_rewind = tcg_last_op();
 +    }
 +
      if (dc->condexec_mask && !thumb_insn_is_unconditional(dc, insn)) {
          uint32_t cond = dc->condexec_cond;
@@ -XXX,XX +XXX,XX @@ static void thumb_tr_translate_insn(DisasContextBase *dcbase, CPUState *cpu)
          }
      }
 +    if (dc->eci && !dc->eci_handled) {
 +        /*
 +         * Insn wasn't valid for ECI/ICI at all: undo what we
 +         * just generated and instead emit an exception
 +         */
 +        tcg_remove_ops_after(dc->insn_eci_rewind);
 +        dc->condjmp = 0;
 +        gen_exception_insn(dc, dc->pc_curr, EXCP_INVSTATE, syn_uncategorized(),
 +                           default_exception_el(dc));
 +    }
 +
      arm_post_translate_insn(dc);
      /* Thumb is a variable-length ISA.  Stop translation when the next insn
 --
 .20.1

-New patch
+[PULL 20/28] target/arm: Let vfp_access_check() handle late NOCP checks
+In commit a3494d4671797c we reworked the M-profile handling of its
+checks for when the NOCP exception should be raised because the FPU
+is disabled, so that (in line with the architecture) the NOCP check
+is done early over a large range of the encoding space, and takes
+precedence over UNDEF exceptions.  As part of this, we removed the
+code from full_vfp_access_check() which raised an exception there for
+M-profile with the FPU disabled, because it was no longer reachable.
+For MVE, some instructions which are outside the "coprocessor space"
+region of the encoding space must nonetheless do "is the FPU enabled"
+checks and possibly raise a NOCP exception.  (In particular this
+covers the MVE-specific low-overhead branch insns LCTP, DLSTP and
+WLSTP.) To support these insns, reinstate the code in
+full_vfp_access_check(), so that their trans functions can call
+vfp_access_check() and get the correct behaviour.
+Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
+Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
+Message-id: 20210614151007.4545-6-peter.maydell@linaro.org
+---
+ target/arm/translate-vfp.c | 20 +++++++++++++++-----
+ 1 file changed, 15 insertions(+), 5 deletions(-)
+diff --git a/target/arm/translate-vfp.c b/target/arm/translate-vfp.c
+index XXXXXXX..XXXXXXX 100644
+--- a/target/arm/translate-vfp.c
++++ b/target/arm/translate-vfp.c
+@@ -XXX,XX +XXX,XX @@ static void gen_preserve_fp_state(DisasContext *s)
+ static bool full_vfp_access_check(DisasContext *s, bool ignore_vfp_enabled)
+ {
+     if (s->fp_excp_el) {
+-        /* M-profile handled this earlier, in disas_m_nocp() */
+-        assert (!arm_dc_feature(s, ARM_FEATURE_M));
+-        gen_exception_insn(s, s->pc_curr, EXCP_UDEF,
+-                           syn_fp_access_trap(1, 0xe, false),
+-                           s->fp_excp_el);
++        if (arm_dc_feature(s, ARM_FEATURE_M)) {
++            /*
++             * M-profile mostly catches the "FPU disabled" case early, in
++             * disas_m_nocp(), but a few insns (eg LCTP, WLSTP, DLSTP)
++             * which do coprocessor-checks are outside the large ranges of
++             * the encoding space handled by the patterns in m-nocp.decode,
++             * and for them we may need to raise NOCP here.
++             */
++            gen_exception_insn(s, s->pc_curr, EXCP_NOCP,
++                               syn_uncategorized(), s->fp_excp_el);
++        } else {
++            gen_exception_insn(s, s->pc_curr, EXCP_UDEF,
++                               syn_fp_access_trap(1, 0xe, false),
++                               s->fp_excp_el);
++        }
+         return false;
+     }
+--
+.20.1

-New patch
+[PULL 21/28] target/arm: Implement MVE LCTP
+Implement the MVE LCTP instruction.
+We put its decode and implementation with the other
+low-overhead-branch insns because although it is only present if MVE
+is implemented it is logically in the same group as the other LOB
+insns.
+Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
+Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
+Message-id: 20210614151007.4545-7-peter.maydell@linaro.org
+---
+ target/arm/t32.decode  |  2 ++
+ target/arm/translate.c | 24 ++++++++++++++++++++++++
+ 2 files changed, 26 insertions(+)
+diff --git a/target/arm/t32.decode b/target/arm/t32.decode
+index XXXXXXX..XXXXXXX 100644
+--- a/target/arm/t32.decode
++++ b/target/arm/t32.decode
+@@ -XXX,XX +XXX,XX @@ BL               1111 0. .......... 11.1 ............         @branch24
+     DLS          1111 0 0000 100     rn:4 1110 0000 0000 0001
+     WLS          1111 0 0000 100     rn:4 1100 . .......... 1 imm=%lob_imm
+     LE           1111 0 0000 0 f:1 0 1111 1100 . .......... 1 imm=%lob_imm
++
++    LCTP         1111 0 0000 000     1111 1110 0000 0000 0001
+   ]
+ }
+diff --git a/target/arm/translate.c b/target/arm/translate.c
+index XXXXXXX..XXXXXXX 100644
+--- a/target/arm/translate.c
++++ b/target/arm/translate.c
+@@ -XXX,XX +XXX,XX @@ static bool trans_LE(DisasContext *s, arg_LE *a)
+     return true;
+ }
++static bool trans_LCTP(DisasContext *s, arg_LCTP *a)
++{
++    /*
++     * M-profile Loop Clear with Tail Predication. Since our implementation
++     * doesn't cache branch information, all we need to do is reset
++     * FPSCR.LTPSIZE to 4.
++     */
++    TCGv_i32 ltpsize;
++
++    if (!dc_isar_feature(aa32_lob, s) ||
++        !dc_isar_feature(aa32_mve, s)) {
++        return false;
++    }
++
++    if (!vfp_access_check(s)) {
++        return true;
++    }
++
++    ltpsize = tcg_const_i32(4);
++    store_cpu_field(ltpsize, v7m.ltpsize);
++    return true;
++}
++
++
+ static bool op_tbranch(DisasContext *s, arg_tbranch *a, bool half)
+ {
+     TCGv_i32 addr, tmp;
+--
+.20.1

-New patch
+[PULL 22/28] target/arm: Implement MVE WLSTP insn
+Implement the MVE WLSTP insn; this is like the existing WLS insn,
+except that it specifies a size value which is used to set
+FPSCR.LTPSIZE.
+Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
+Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
+Message-id: 20210614151007.4545-8-peter.maydell@linaro.org
+---
+ target/arm/t32.decode  |  8 ++++++--
+ target/arm/translate.c | 37 ++++++++++++++++++++++++++++++++++++-
+ 2 files changed, 42 insertions(+), 3 deletions(-)
+diff --git a/target/arm/t32.decode b/target/arm/t32.decode
+index XXXXXXX..XXXXXXX 100644
+--- a/target/arm/t32.decode
++++ b/target/arm/t32.decode
+@@ -XXX,XX +XXX,XX @@ BL               1111 0. .......... 11.1 ............         @branch24
+     %lob_imm 1:10 11:1 !function=times_2
+     DLS          1111 0 0000 100     rn:4 1110 0000 0000 0001
+-    WLS          1111 0 0000 100     rn:4 1100 . .......... 1 imm=%lob_imm
+-    LE           1111 0 0000 0 f:1 0 1111 1100 . .......... 1 imm=%lob_imm
++    WLS          1111 0 0000 100     rn:4 1100 . .......... 1 imm=%lob_imm size=4
++    {
++      LE         1111 0 0000 0 f:1 0 1111 1100 . .......... 1 imm=%lob_imm
++      # This is WLSTP
++      WLS        1111 0 0000 0 size:2 rn:4 1100 . .......... 1 imm=%lob_imm
++    }
+     LCTP         1111 0 0000 000     1111 1110 0000 0000 0001
+   ]
+diff --git a/target/arm/translate.c b/target/arm/translate.c
+index XXXXXXX..XXXXXXX 100644
+--- a/target/arm/translate.c
++++ b/target/arm/translate.c
+@@ -XXX,XX +XXX,XX @@ static bool trans_WLS(DisasContext *s, arg_WLS *a)
+         return false;
+     }
+     if (a->rn == 13 || a->rn == 15) {
+-        /* CONSTRAINED UNPREDICTABLE: we choose to UNDEF */
++        /*
++         * For WLSTP rn == 15 is a related encoding (LE); the
++         * other cases caught by this condition are all
++         * CONSTRAINED UNPREDICTABLE: we choose to UNDEF
++         */
+         return false;
+     }
+     if (s->condexec_mask) {
+@@ -XXX,XX +XXX,XX @@ static bool trans_WLS(DisasContext *s, arg_WLS *a)
+          */
+         return false;
+     }
++    if (a->size != 4) {
++        /* WLSTP */
++        if (!dc_isar_feature(aa32_mve, s)) {
++            return false;
++        }
++        /*
++         * We need to check that the FPU is enabled here, but mustn't
++         * call vfp_access_check() to do that because we don't want to
++         * do the lazy state preservation in the "loop count is zero" case.
++         * Do the check-and-raise-exception by hand.
++         */
++        if (s->fp_excp_el) {
++            gen_exception_insn(s, s->pc_curr, EXCP_NOCP,
++                               syn_uncategorized(), s->fp_excp_el);
++            return true;
++        }
++    }
++
+     nextlabel = gen_new_label();
+     tcg_gen_brcondi_i32(TCG_COND_EQ, cpu_R[a->rn], 0, nextlabel);
+     tmp = load_reg(s, a->rn);
+     store_reg(s, 14, tmp);
++    if (a->size != 4) {
++        /*
++         * WLSTP: set FPSCR.LTPSIZE. This requires that we do the
++         * lazy state preservation, new FP context creation, etc,
++         * that vfp_access_check() does. We know that the actual
++         * access check will succeed (ie it won't generate code that
++         * throws an exception) because we did that check by hand earlier.
++         */
++        bool ok = vfp_access_check(s);
++        assert(ok);
++        tmp = tcg_const_i32(a->size);
++        store_cpu_field(tmp, v7m.ltpsize);
++    }
+     gen_jmp_tb(s, s->base.pc_next, 1);
+     gen_set_label(nextlabel);
+--
+.20.1

-New patch
+[PULL 23/28] target/arm: Implement MVE DLSTP
+Implement the MVE DLSTP insn; this is like the existing DLS
+insn, except that it must do an FPU access check and it
+sets LTPSIZE to the value specified in the insn.
+Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
+Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
+Message-id: 20210614151007.4545-9-peter.maydell@linaro.org
+---
+ target/arm/t32.decode  |  9 ++++++---
+ target/arm/translate.c | 23 +++++++++++++++++++++--
+ 2 files changed, 27 insertions(+), 5 deletions(-)
+diff --git a/target/arm/t32.decode b/target/arm/t32.decode
+index XXXXXXX..XXXXXXX 100644
+--- a/target/arm/t32.decode
++++ b/target/arm/t32.decode
+@@ -XXX,XX +XXX,XX @@ BL               1111 0. .......... 11.1 ............         @branch24
+     # LE and WLS immediate
+     %lob_imm 1:10 11:1 !function=times_2
+-    DLS          1111 0 0000 100     rn:4 1110 0000 0000 0001
++    DLS          1111 0 0000 100     rn:4 1110 0000 0000 0001 size=4
+     WLS          1111 0 0000 100     rn:4 1100 . .......... 1 imm=%lob_imm size=4
+     {
+       LE         1111 0 0000 0 f:1 0 1111 1100 . .......... 1 imm=%lob_imm
+       # This is WLSTP
+       WLS        1111 0 0000 0 size:2 rn:4 1100 . .......... 1 imm=%lob_imm
+     }
+-
+-    LCTP         1111 0 0000 000     1111 1110 0000 0000 0001
++    {
++      LCTP       1111 0 0000 000     1111 1110 0000 0000 0001
++      # This is DLSTP
++      DLS        1111 0 0000 0 size:2 rn:4 1110 0000 0000 0001
++    }
+   ]
+ }
+diff --git a/target/arm/translate.c b/target/arm/translate.c
+index XXXXXXX..XXXXXXX 100644
+--- a/target/arm/translate.c
++++ b/target/arm/translate.c
+@@ -XXX,XX +XXX,XX @@ static bool trans_DLS(DisasContext *s, arg_DLS *a)
+         return false;
+     }
+     if (a->rn == 13 || a->rn == 15) {
+-        /* CONSTRAINED UNPREDICTABLE: we choose to UNDEF */
++        /*
++         * For DLSTP rn == 15 is a related encoding (LCTP); the
++         * other cases caught by this condition are all
++         * CONSTRAINED UNPREDICTABLE: we choose to UNDEF
++         */
+         return false;
+     }
+-    /* Not a while loop, no tail predication: just set LR to the count */
++    if (a->size != 4) {
++        /* DLSTP */
++        if (!dc_isar_feature(aa32_mve, s)) {
++            return false;
++        }
++        if (!vfp_access_check(s)) {
++            return true;
++        }
++    }
++
++    /* Not a while loop: set LR to the count, and set LTPSIZE for DLSTP */
+     tmp = load_reg(s, a->rn);
+     store_reg(s, 14, tmp);
++    if (a->size != 4) {
++        /* DLSTP: set FPSCR.LTPSIZE */
++        tmp = tcg_const_i32(a->size);
++        store_cpu_field(tmp, v7m.ltpsize);
++    }
+     return true;
+ }
+--
+.20.1

-New patch
+[PULL 24/28] target/arm: Implement MVE LETP insn
+Implement the MVE LETP insn.  This is like the existing LE loop-end
+insn, but it must perform an FPU-enabled check, and on loop-exit it
+resets LTPSIZE to 4.
+To accommodate the requirement to do something on loop-exit, we drop
+the use of condlabel and instead manage both the TB exits manually,
+in the same way we already do in trans_WLS().
+The other MVE-specific change to the LE insn is that we must raise an
+INVSTATE UsageFault exception if LTPSIZE is not 4.
+Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
+Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
+Message-id: 20210614151007.4545-10-peter.maydell@linaro.org
+---
+ target/arm/t32.decode  |   2 +-
+ target/arm/translate.c | 104 +++++++++++++++++++++++++++++++++++++----
+ 2 files changed, 97 insertions(+), 9 deletions(-)
+diff --git a/target/arm/t32.decode b/target/arm/t32.decode
+index XXXXXXX..XXXXXXX 100644
+--- a/target/arm/t32.decode
++++ b/target/arm/t32.decode
+@@ -XXX,XX +XXX,XX @@ BL               1111 0. .......... 11.1 ............         @branch24
+     DLS          1111 0 0000 100     rn:4 1110 0000 0000 0001 size=4
+     WLS          1111 0 0000 100     rn:4 1100 . .......... 1 imm=%lob_imm size=4
+     {
+-      LE         1111 0 0000 0 f:1 0 1111 1100 . .......... 1 imm=%lob_imm
++      LE         1111 0 0000 0 f:1 tp:1 1111 1100 . .......... 1 imm=%lob_imm
+       # This is WLSTP
+       WLS        1111 0 0000 0 size:2 rn:4 1100 . .......... 1 imm=%lob_imm
+     }
+diff --git a/target/arm/translate.c b/target/arm/translate.c
+index XXXXXXX..XXXXXXX 100644
+--- a/target/arm/translate.c
++++ b/target/arm/translate.c
+@@ -XXX,XX +XXX,XX @@ static bool trans_LE(DisasContext *s, arg_LE *a)
+      * any faster.
+      */
+     TCGv_i32 tmp;
++    TCGLabel *loopend;
++    bool fpu_active;
+     if (!dc_isar_feature(aa32_lob, s)) {
+         return false;
+     }
++    if (a->f && a->tp) {
++        return false;
++    }
++    if (s->condexec_mask) {
++        /*
++         * LE in an IT block is CONSTRAINED UNPREDICTABLE;
++         * we choose to UNDEF, because otherwise our use of
++         * gen_goto_tb(1) would clash with the use of TB exit 1
++         * in the dc->condjmp condition-failed codepath in
++         * arm_tr_tb_stop() and we'd get an assertion.
++         */
++        return false;
++    }
++    if (a->tp) {
++        /* LETP */
++        if (!dc_isar_feature(aa32_mve, s)) {
++            return false;
++        }
++        if (!vfp_access_check(s)) {
++            s->eci_handled = true;
++            return true;
++        }
++    }
+     /* LE/LETP is OK with ECI set and leaves it untouched */
+     s->eci_handled = true;
+-    if (!a->f) {
+-        /* Not loop-forever. If LR <= 1 this is the last loop: do nothing. */
+-        arm_gen_condlabel(s);
+-        tcg_gen_brcondi_i32(TCG_COND_LEU, cpu_R[14], 1, s->condlabel);
+-        /* Decrement LR */
+-        tmp = load_reg(s, 14);
+-        tcg_gen_addi_i32(tmp, tmp, -1);
+-        store_reg(s, 14, tmp);
++    /*
++     * With MVE, LTPSIZE might not be 4, and we must emit an INVSTATE
++     * UsageFault exception for the LE insn in that case. Note that we
++     * are not directly checking FPSCR.LTPSIZE but instead check the
++     * pseudocode LTPSIZE() function, which returns 4 if the FPU is
++     * not currently active (ie ActiveFPState() returns false). We
++     * can identify not-active purely from our TB state flags, as the
++     * FPU is active only if:
++     *  the FPU is enabled
++     *  AND lazy state preservation is not active
++     *  AND we do not need a new fp context (this is the ASPEN/FPCA check)
++     *
++     * Usually we don't need to care about this distinction between
++     * LTPSIZE and FPSCR.LTPSIZE, because the code in vfp_access_check()
++     * will either take an exception or clear the conditions that make
++     * the FPU not active. But LE is an unusual case of a non-FP insn
++     * that looks at LTPSIZE.
++     */
++    fpu_active = !s->fp_excp_el && !s->v7m_lspact && !s->v7m_new_fp_ctxt_needed;
++
++    if (!a->tp && dc_isar_feature(aa32_mve, s) && fpu_active) {
++        /* Need to do a runtime check for LTPSIZE != 4 */
++        TCGLabel *skipexc = gen_new_label();
++        tmp = load_cpu_field(v7m.ltpsize);
++        tcg_gen_brcondi_i32(TCG_COND_EQ, tmp, 4, skipexc);
++        tcg_temp_free_i32(tmp);
++        gen_exception_insn(s, s->pc_curr, EXCP_INVSTATE, syn_uncategorized(),
++                           default_exception_el(s));
++        gen_set_label(skipexc);
++    }
++
++    if (a->f) {
++        /* Loop-forever: just jump back to the loop start */
++        gen_jmp(s, read_pc(s) - a->imm);
++        return true;
++    }
++
++    /*
++     * Not loop-forever. If LR <= loop-decrement-value this is the last loop.
++     * For LE, we know at this point that LTPSIZE must be 4 and the
++     * loop decrement value is 1. For LETP we need to calculate the decrement
++     * value from LTPSIZE.
++     */
++    loopend = gen_new_label();
++    if (!a->tp) {
++        tcg_gen_brcondi_i32(TCG_COND_LEU, cpu_R[14], 1, loopend);
++        tcg_gen_addi_i32(cpu_R[14], cpu_R[14], -1);
++    } else {
++        /*
++         * Decrement by 1 << (4 - LTPSIZE). We need to use a TCG local
++         * so that decr stays live after the brcondi.
++         */
++        TCGv_i32 decr = tcg_temp_local_new_i32();
++        TCGv_i32 ltpsize = load_cpu_field(v7m.ltpsize);
++        tcg_gen_sub_i32(decr, tcg_constant_i32(4), ltpsize);
++        tcg_gen_shl_i32(decr, tcg_constant_i32(1), decr);
++        tcg_temp_free_i32(ltpsize);
++
++        tcg_gen_brcond_i32(TCG_COND_LEU, cpu_R[14], decr, loopend);
++
++        tcg_gen_sub_i32(cpu_R[14], cpu_R[14], decr);
++        tcg_temp_free_i32(decr);
+     }
+     /* Jump back to the loop start */
+     gen_jmp(s, read_pc(s) - a->imm);
++
++    gen_set_label(loopend);
++    if (a->tp) {
++        /* Exits from tail-pred loops must reset LTPSIZE to 4 */
++        tmp = tcg_const_i32(4);
++        store_cpu_field(tmp, v7m.ltpsize);
++    }
++    /* End TB, continuing to following insn */
++    gen_jmp_tb(s, s->base.pc_next, 1);
+     return true;
+ }
+--
+.20.1

-New patch
+[PULL 25/28] target/arm: Add framework for MVE decode
+Add the framework for decoding MVE insns, with the necessary new
+files and the meson.build rules, but no actual content yet.
+Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
+Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
+Message-id: 20210614151007.4545-11-peter.maydell@linaro.org
+---
+ target/arm/translate-a32.h |  1 +
+ target/arm/mve.decode      | 20 ++++++++++++++++++++
+ target/arm/translate-mve.c | 29 +++++++++++++++++++++++++++++
+ target/arm/translate.c     |  1 +
+ target/arm/meson.build     |  2 ++
+ 5 files changed, 53 insertions(+)
+ create mode 100644 target/arm/mve.decode
+ create mode 100644 target/arm/translate-mve.c
+diff --git a/target/arm/translate-a32.h b/target/arm/translate-a32.h
+index XXXXXXX..XXXXXXX 100644
+--- a/target/arm/translate-a32.h
++++ b/target/arm/translate-a32.h
+@@ -XXX,XX +XXX,XX @@
+ /* Prototypes for autogenerated disassembler functions */
+ bool disas_m_nocp(DisasContext *dc, uint32_t insn);
++bool disas_mve(DisasContext *dc, uint32_t insn);
+ bool disas_vfp(DisasContext *s, uint32_t insn);
+ bool disas_vfp_uncond(DisasContext *s, uint32_t insn);
+ bool disas_neon_dp(DisasContext *s, uint32_t insn);
+diff --git a/target/arm/mve.decode b/target/arm/mve.decode
+new file mode 100644
+index XXXXXXX..XXXXXXX
+--- /dev/null
++++ b/target/arm/mve.decode
+@@ -XXX,XX +XXX,XX @@
++# M-profile MVE instruction descriptions
++#
++#  Copyright (c) 2021 Linaro, Ltd
++#
++# This library is free software; you can redistribute it and/or
++# modify it under the terms of the GNU Lesser General Public
++# License as published by the Free Software Foundation; either
++# version 2.1 of the License, or (at your option) any later version.
++#
++# This library is distributed in the hope that it will be useful,
++# but WITHOUT ANY WARRANTY; without even the implied warranty of
++# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
++# Lesser General Public License for more details.
++#
++# You should have received a copy of the GNU Lesser General Public
++# License along with this library; if not, see <http://www.gnu.org/licenses/>.
++
++#
++# This file is processed by scripts/decodetree.py
++#
+diff --git a/target/arm/translate-mve.c b/target/arm/translate-mve.c
+new file mode 100644
+index XXXXXXX..XXXXXXX
+--- /dev/null
++++ b/target/arm/translate-mve.c
+@@ -XXX,XX +XXX,XX @@
++/*
++ *  ARM translation: M-profile MVE instructions
++ *
++ *  Copyright (c) 2021 Linaro, Ltd.
++ *
++ * This library is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU Lesser General Public
++ * License as published by the Free Software Foundation; either
++ * version 2.1 of the License, or (at your option) any later version.
++ *
++ * This library is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
++ * Lesser General Public License for more details.
++ *
++ * You should have received a copy of the GNU Lesser General Public
++ * License along with this library; if not, see <http://www.gnu.org/licenses/>.
++ */
++
++#include "qemu/osdep.h"
++#include "tcg/tcg-op.h"
++#include "tcg/tcg-op-gvec.h"
++#include "exec/exec-all.h"
++#include "exec/gen-icount.h"
++#include "translate.h"
++#include "translate-a32.h"
++
++/* Include the generated decoder */
++#include "decode-mve.c.inc"
+diff --git a/target/arm/translate.c b/target/arm/translate.c
+index XXXXXXX..XXXXXXX 100644
+--- a/target/arm/translate.c
++++ b/target/arm/translate.c
+@@ -XXX,XX +XXX,XX @@ static void disas_thumb2_insn(DisasContext *s, uint32_t insn)
+     if (disas_t32(s, insn) ||
+         disas_vfp_uncond(s, insn) ||
+         disas_neon_shared(s, insn) ||
++        disas_mve(s, insn) ||
+         ((insn >> 28) == 0xe && disas_vfp(s, insn))) {
+         return;
+     }
+diff --git a/target/arm/meson.build b/target/arm/meson.build
+index XXXXXXX..XXXXXXX 100644
+--- a/target/arm/meson.build
++++ b/target/arm/meson.build
+@@ -XXX,XX +XXX,XX @@ gen = [
+   decodetree.process('vfp.decode', extra_args: '--decode=disas_vfp'),
+   decodetree.process('vfp-uncond.decode', extra_args: '--decode=disas_vfp_uncond'),
+   decodetree.process('m-nocp.decode', extra_args: '--decode=disas_m_nocp'),
++  decodetree.process('mve.decode', extra_args: '--decode=disas_mve'),
+   decodetree.process('a32.decode', extra_args: '--static-decode=disas_a32'),
+   decodetree.process('a32-uncond.decode', extra_args: '--static-decode=disas_a32_uncond'),
+   decodetree.process('t32.decode', extra_args: '--static-decode=disas_t32'),
+@@ -XXX,XX +XXX,XX @@ arm_ss.add(files(
+   'tlb_helper.c',
+   'translate.c',
+   'translate-m-nocp.c',
++  'translate-mve.c',
+   'translate-neon.c',
+   'translate-vfp.c',
+   'vec_helper.c',
+--
+.20.1

-New patch
+[PULL 26/28] target/arm: Move expand_pred_b() data to vec_helper.c
+For MVE, we want to re-use the large data table from expand_pred_b().
 Move the data table to vec_helper.c so it is no longer in an SVE
 specific source file.
 Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
 Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
 Message-id: 20210614151007.4545-14-peter.maydell@linaro.org
 ---
  target/arm/vec_internal.h |   3 ++
  target/arm/sve_helper.c   | 103 ++------------------------------------
  target/arm/vec_helper.c   | 102 +++++++++++++++++++++++++++++++++++++
  3 files changed, 109 insertions(+), 99 deletions(-)
 diff --git a/target/arm/vec_internal.h b/target/arm/vec_internal.h
 index XXXXXXX..XXXXXXX 100644
 --- a/target/arm/vec_internal.h
 +++ b/target/arm/vec_internal.h
@@ -XXX,XX +XXX,XX @@
  #define H8(x)   (x)
  #define H1_8(x) (x)
 +/* Data for expanding active predicate bits to bytes, for byte elements. */
 +extern const uint64_t expand_pred_b_data[256];
 +
  static inline void clear_tail(void *vd, uintptr_t opr_sz, uintptr_t max_sz)
  {
      uint64_t *d = vd + opr_sz;
 diff --git a/target/arm/sve_helper.c b/target/arm/sve_helper.c
 index XXXXXXX..XXXXXXX 100644
 --- a/target/arm/sve_helper.c
 +++ b/target/arm/sve_helper.c
@@ -XXX,XX +XXX,XX @@ uint32_t HELPER(sve_predtest)(void *vd, void *vg, uint32_t words)
      return flags;
  }
 -/* Expand active predicate bits to bytes, for byte elements.
 - *  for (i = 0; i < 256; ++i) {
 - *      unsigned long m = 0;
 - *      for (j = 0; j < 8; j++) {
 - *          if ((i >> j) & 1) {
 - *              m |= 0xfful << (j << 3);
 - *          }
 - *      }
 - *      printf("0x%016lx,\n", m);
 - *  }
 +/*
 + * Expand active predicate bits to bytes, for byte elements.
 + * (The data table itself is in vec_helper.c as MVE also needs it.)
   */
  static inline uint64_t expand_pred_b(uint8_t byte)
  {
 -    static const uint64_t word[256] = {
 -        0x0000000000000000, 0x00000000000000ff, 0x000000000000ff00,
 -        0x000000000000ffff, 0x0000000000ff0000, 0x0000000000ff00ff,
 -        0x0000000000ffff00, 0x0000000000ffffff, 0x00000000ff000000,
 -        0x00000000ff0000ff, 0x00000000ff00ff00, 0x00000000ff00ffff,
 -        0x00000000ffff0000, 0x00000000ffff00ff, 0x00000000ffffff00,
 -        0x00000000ffffffff, 0x000000ff00000000, 0x000000ff000000ff,
 -        0x000000ff0000ff00, 0x000000ff0000ffff, 0x000000ff00ff0000,
 -        0x000000ff00ff00ff, 0x000000ff00ffff00, 0x000000ff00ffffff,
 -        0x000000ffff000000, 0x000000ffff0000ff, 0x000000ffff00ff00,
 -        0x000000ffff00ffff, 0x000000ffffff0000, 0x000000ffffff00ff,
 -        0x000000ffffffff00, 0x000000ffffffffff, 0x0000ff0000000000,
 -        0x0000ff00000000ff, 0x0000ff000000ff00, 0x0000ff000000ffff,
 -        0x0000ff0000ff0000, 0x0000ff0000ff00ff, 0x0000ff0000ffff00,
 -        0x0000ff0000ffffff, 0x0000ff00ff000000, 0x0000ff00ff0000ff,
 -        0x0000ff00ff00ff00, 0x0000ff00ff00ffff, 0x0000ff00ffff0000,
 -        0x0000ff00ffff00ff, 0x0000ff00ffffff00, 0x0000ff00ffffffff,
 -        0x0000ffff00000000, 0x0000ffff000000ff, 0x0000ffff0000ff00,
 -        0x0000ffff0000ffff, 0x0000ffff00ff0000, 0x0000ffff00ff00ff,
 -        0x0000ffff00ffff00, 0x0000ffff00ffffff, 0x0000ffffff000000,
 -        0x0000ffffff0000ff, 0x0000ffffff00ff00, 0x0000ffffff00ffff,
 -        0x0000ffffffff0000, 0x0000ffffffff00ff, 0x0000ffffffffff00,
 -        0x0000ffffffffffff, 0x00ff000000000000, 0x00ff0000000000ff,
 -        0x00ff00000000ff00, 0x00ff00000000ffff, 0x00ff000000ff0000,
 -        0x00ff000000ff00ff, 0x00ff000000ffff00, 0x00ff000000ffffff,
 -        0x00ff0000ff000000, 0x00ff0000ff0000ff, 0x00ff0000ff00ff00,
 -        0x00ff0000ff00ffff, 0x00ff0000ffff0000, 0x00ff0000ffff00ff,
 -        0x00ff0000ffffff00, 0x00ff0000ffffffff, 0x00ff00ff00000000,
 -        0x00ff00ff000000ff, 0x00ff00ff0000ff00, 0x00ff00ff0000ffff,
 -        0x00ff00ff00ff0000, 0x00ff00ff00ff00ff, 0x00ff00ff00ffff00,
 -        0x00ff00ff00ffffff, 0x00ff00ffff000000, 0x00ff00ffff0000ff,
 -        0x00ff00ffff00ff00, 0x00ff00ffff00ffff, 0x00ff00ffffff0000,
 -        0x00ff00ffffff00ff, 0x00ff00ffffffff00, 0x00ff00ffffffffff,
 -        0x00ffff0000000000, 0x00ffff00000000ff, 0x00ffff000000ff00,
 -        0x00ffff000000ffff, 0x00ffff0000ff0000, 0x00ffff0000ff00ff,
 -        0x00ffff0000ffff00, 0x00ffff0000ffffff, 0x00ffff00ff000000,
 -        0x00ffff00ff0000ff, 0x00ffff00ff00ff00, 0x00ffff00ff00ffff,
 -        0x00ffff00ffff0000, 0x00ffff00ffff00ff, 0x00ffff00ffffff00,
 -        0x00ffff00ffffffff, 0x00ffffff00000000, 0x00ffffff000000ff,
 -        0x00ffffff0000ff00, 0x00ffffff0000ffff, 0x00ffffff00ff0000,
 -        0x00ffffff00ff00ff, 0x00ffffff00ffff00, 0x00ffffff00ffffff,
 -        0x00ffffffff000000, 0x00ffffffff0000ff, 0x00ffffffff00ff00,
 -        0x00ffffffff00ffff, 0x00ffffffffff0000, 0x00ffffffffff00ff,
 -        0x00ffffffffffff00, 0x00ffffffffffffff, 0xff00000000000000,
 -        0xff000000000000ff, 0xff0000000000ff00, 0xff0000000000ffff,
 -        0xff00000000ff0000, 0xff00000000ff00ff, 0xff00000000ffff00,
 -        0xff00000000ffffff, 0xff000000ff000000, 0xff000000ff0000ff,
 -        0xff000000ff00ff00, 0xff000000ff00ffff, 0xff000000ffff0000,
 -        0xff000000ffff00ff, 0xff000000ffffff00, 0xff000000ffffffff,
 -        0xff0000ff00000000, 0xff0000ff000000ff, 0xff0000ff0000ff00,
 -        0xff0000ff0000ffff, 0xff0000ff00ff0000, 0xff0000ff00ff00ff,
 -        0xff0000ff00ffff00, 0xff0000ff00ffffff, 0xff0000ffff000000,
 -        0xff0000ffff0000ff, 0xff0000ffff00ff00, 0xff0000ffff00ffff,
 -        0xff0000ffffff0000, 0xff0000ffffff00ff, 0xff0000ffffffff00,
 -        0xff0000ffffffffff, 0xff00ff0000000000, 0xff00ff00000000ff,
 -        0xff00ff000000ff00, 0xff00ff000000ffff, 0xff00ff0000ff0000,
 -        0xff00ff0000ff00ff, 0xff00ff0000ffff00, 0xff00ff0000ffffff,
 -        0xff00ff00ff000000, 0xff00ff00ff0000ff, 0xff00ff00ff00ff00,
 -        0xff00ff00ff00ffff, 0xff00ff00ffff0000, 0xff00ff00ffff00ff,
 -        0xff00ff00ffffff00, 0xff00ff00ffffffff, 0xff00ffff00000000,
 -        0xff00ffff000000ff, 0xff00ffff0000ff00, 0xff00ffff0000ffff,
 -        0xff00ffff00ff0000, 0xff00ffff00ff00ff, 0xff00ffff00ffff00,
 -        0xff00ffff00ffffff, 0xff00ffffff000000, 0xff00ffffff0000ff,
 -        0xff00ffffff00ff00, 0xff00ffffff00ffff, 0xff00ffffffff0000,
 -        0xff00ffffffff00ff, 0xff00ffffffffff00, 0xff00ffffffffffff,
 -        0xffff000000000000, 0xffff0000000000ff, 0xffff00000000ff00,
 -        0xffff00000000ffff, 0xffff000000ff0000, 0xffff000000ff00ff,
 -        0xffff000000ffff00, 0xffff000000ffffff, 0xffff0000ff000000,
 -        0xffff0000ff0000ff, 0xffff0000ff00ff00, 0xffff0000ff00ffff,
 -        0xffff0000ffff0000, 0xffff0000ffff00ff, 0xffff0000ffffff00,
 -        0xffff0000ffffffff, 0xffff00ff00000000, 0xffff00ff000000ff,
 -        0xffff00ff0000ff00, 0xffff00ff0000ffff, 0xffff00ff00ff0000,
 -        0xffff00ff00ff00ff, 0xffff00ff00ffff00, 0xffff00ff00ffffff,
 -        0xffff00ffff000000, 0xffff00ffff0000ff, 0xffff00ffff00ff00,
 -        0xffff00ffff00ffff, 0xffff00ffffff0000, 0xffff00ffffff00ff,
 -        0xffff00ffffffff00, 0xffff00ffffffffff, 0xffffff0000000000,
 -        0xffffff00000000ff, 0xffffff000000ff00, 0xffffff000000ffff,
 -        0xffffff0000ff0000, 0xffffff0000ff00ff, 0xffffff0000ffff00,
 -        0xffffff0000ffffff, 0xffffff00ff000000, 0xffffff00ff0000ff,
 -        0xffffff00ff00ff00, 0xffffff00ff00ffff, 0xffffff00ffff0000,
 -        0xffffff00ffff00ff, 0xffffff00ffffff00, 0xffffff00ffffffff,
 -        0xffffffff00000000, 0xffffffff000000ff, 0xffffffff0000ff00,
 -        0xffffffff0000ffff, 0xffffffff00ff0000, 0xffffffff00ff00ff,
 -        0xffffffff00ffff00, 0xffffffff00ffffff, 0xffffffffff000000,
 -        0xffffffffff0000ff, 0xffffffffff00ff00, 0xffffffffff00ffff,
 -        0xffffffffffff0000, 0xffffffffffff00ff, 0xffffffffffffff00,
 -        0xffffffffffffffff,
 -    };
 -    return word[byte];
 +    return expand_pred_b_data[byte];
  }
  /* Similarly for half-word elements.
 diff --git a/target/arm/vec_helper.c b/target/arm/vec_helper.c
 index XXXXXXX..XXXXXXX 100644
 --- a/target/arm/vec_helper.c
 +++ b/target/arm/vec_helper.c
@@ -XXX,XX +XXX,XX @@
  #include "qemu/int128.h"
  #include "vec_internal.h"
 +/*
 + * Data for expanding active predicate bits to bytes, for byte elements.
 + *
 + *  for (i = 0; i < 256; ++i) {
 + *      unsigned long m = 0;
 + *      for (j = 0; j < 8; j++) {
 + *          if ((i >> j) & 1) {
 + *              m |= 0xfful << (j << 3);
 + *          }
 + *      }
 + *      printf("0x%016lx,\n", m);
 + *  }
 + */
 +const uint64_t expand_pred_b_data[256] = {
 +    0x0000000000000000, 0x00000000000000ff, 0x000000000000ff00,
 +    0x000000000000ffff, 0x0000000000ff0000, 0x0000000000ff00ff,
 +    0x0000000000ffff00, 0x0000000000ffffff, 0x00000000ff000000,
 +    0x00000000ff0000ff, 0x00000000ff00ff00, 0x00000000ff00ffff,
 +    0x00000000ffff0000, 0x00000000ffff00ff, 0x00000000ffffff00,
 +    0x00000000ffffffff, 0x000000ff00000000, 0x000000ff000000ff,
 +    0x000000ff0000ff00, 0x000000ff0000ffff, 0x000000ff00ff0000,
 +    0x000000ff00ff00ff, 0x000000ff00ffff00, 0x000000ff00ffffff,
 +    0x000000ffff000000, 0x000000ffff0000ff, 0x000000ffff00ff00,
 +    0x000000ffff00ffff, 0x000000ffffff0000, 0x000000ffffff00ff,
 +    0x000000ffffffff00, 0x000000ffffffffff, 0x0000ff0000000000,
 +    0x0000ff00000000ff, 0x0000ff000000ff00, 0x0000ff000000ffff,
 +    0x0000ff0000ff0000, 0x0000ff0000ff00ff, 0x0000ff0000ffff00,
 +    0x0000ff0000ffffff, 0x0000ff00ff000000, 0x0000ff00ff0000ff,
 +    0x0000ff00ff00ff00, 0x0000ff00ff00ffff, 0x0000ff00ffff0000,
 +    0x0000ff00ffff00ff, 0x0000ff00ffffff00, 0x0000ff00ffffffff,
 +    0x0000ffff00000000, 0x0000ffff000000ff, 0x0000ffff0000ff00,
 +    0x0000ffff0000ffff, 0x0000ffff00ff0000, 0x0000ffff00ff00ff,
 +    0x0000ffff00ffff00, 0x0000ffff00ffffff, 0x0000ffffff000000,
 +    0x0000ffffff0000ff, 0x0000ffffff00ff00, 0x0000ffffff00ffff,
 +    0x0000ffffffff0000, 0x0000ffffffff00ff, 0x0000ffffffffff00,
 +    0x0000ffffffffffff, 0x00ff000000000000, 0x00ff0000000000ff,
 +    0x00ff00000000ff00, 0x00ff00000000ffff, 0x00ff000000ff0000,
 +    0x00ff000000ff00ff, 0x00ff000000ffff00, 0x00ff000000ffffff,
 +    0x00ff0000ff000000, 0x00ff0000ff0000ff, 0x00ff0000ff00ff00,
 +    0x00ff0000ff00ffff, 0x00ff0000ffff0000, 0x00ff0000ffff00ff,
 +    0x00ff0000ffffff00, 0x00ff0000ffffffff, 0x00ff00ff00000000,
 +    0x00ff00ff000000ff, 0x00ff00ff0000ff00, 0x00ff00ff0000ffff,
 +    0x00ff00ff00ff0000, 0x00ff00ff00ff00ff, 0x00ff00ff00ffff00,
 +    0x00ff00ff00ffffff, 0x00ff00ffff000000, 0x00ff00ffff0000ff,
 +    0x00ff00ffff00ff00, 0x00ff00ffff00ffff, 0x00ff00ffffff0000,
 +    0x00ff00ffffff00ff, 0x00ff00ffffffff00, 0x00ff00ffffffffff,
 +    0x00ffff0000000000, 0x00ffff00000000ff, 0x00ffff000000ff00,
 +    0x00ffff000000ffff, 0x00ffff0000ff0000, 0x00ffff0000ff00ff,
 +    0x00ffff0000ffff00, 0x00ffff0000ffffff, 0x00ffff00ff000000,
 +    0x00ffff00ff0000ff, 0x00ffff00ff00ff00, 0x00ffff00ff00ffff,
 +    0x00ffff00ffff0000, 0x00ffff00ffff00ff, 0x00ffff00ffffff00,
 +    0x00ffff00ffffffff, 0x00ffffff00000000, 0x00ffffff000000ff,
 +    0x00ffffff0000ff00, 0x00ffffff0000ffff, 0x00ffffff00ff0000,
 +    0x00ffffff00ff00ff, 0x00ffffff00ffff00, 0x00ffffff00ffffff,
 +    0x00ffffffff000000, 0x00ffffffff0000ff, 0x00ffffffff00ff00,
 +    0x00ffffffff00ffff, 0x00ffffffffff0000, 0x00ffffffffff00ff,
 +    0x00ffffffffffff00, 0x00ffffffffffffff, 0xff00000000000000,
 +    0xff000000000000ff, 0xff0000000000ff00, 0xff0000000000ffff,
 +    0xff00000000ff0000, 0xff00000000ff00ff, 0xff00000000ffff00,
 +    0xff00000000ffffff, 0xff000000ff000000, 0xff000000ff0000ff,
 +    0xff000000ff00ff00, 0xff000000ff00ffff, 0xff000000ffff0000,
 +    0xff000000ffff00ff, 0xff000000ffffff00, 0xff000000ffffffff,
 +    0xff0000ff00000000, 0xff0000ff000000ff, 0xff0000ff0000ff00,
 +    0xff0000ff0000ffff, 0xff0000ff00ff0000, 0xff0000ff00ff00ff,
 +    0xff0000ff00ffff00, 0xff0000ff00ffffff, 0xff0000ffff000000,
 +    0xff0000ffff0000ff, 0xff0000ffff00ff00, 0xff0000ffff00ffff,
 +    0xff0000ffffff0000, 0xff0000ffffff00ff, 0xff0000ffffffff00,
 +    0xff0000ffffffffff, 0xff00ff0000000000, 0xff00ff00000000ff,
 +    0xff00ff000000ff00, 0xff00ff000000ffff, 0xff00ff0000ff0000,
 +    0xff00ff0000ff00ff, 0xff00ff0000ffff00, 0xff00ff0000ffffff,
 +    0xff00ff00ff000000, 0xff00ff00ff0000ff, 0xff00ff00ff00ff00,
 +    0xff00ff00ff00ffff, 0xff00ff00ffff0000, 0xff00ff00ffff00ff,
 +    0xff00ff00ffffff00, 0xff00ff00ffffffff, 0xff00ffff00000000,
 +    0xff00ffff000000ff, 0xff00ffff0000ff00, 0xff00ffff0000ffff,
 +    0xff00ffff00ff0000, 0xff00ffff00ff00ff, 0xff00ffff00ffff00,
 +    0xff00ffff00ffffff, 0xff00ffffff000000, 0xff00ffffff0000ff,
 +    0xff00ffffff00ff00, 0xff00ffffff00ffff, 0xff00ffffffff0000,
 +    0xff00ffffffff00ff, 0xff00ffffffffff00, 0xff00ffffffffffff,
 +    0xffff000000000000, 0xffff0000000000ff, 0xffff00000000ff00,
 +    0xffff00000000ffff, 0xffff000000ff0000, 0xffff000000ff00ff,
 +    0xffff000000ffff00, 0xffff000000ffffff, 0xffff0000ff000000,
 +    0xffff0000ff0000ff, 0xffff0000ff00ff00, 0xffff0000ff00ffff,
 +    0xffff0000ffff0000, 0xffff0000ffff00ff, 0xffff0000ffffff00,
 +    0xffff0000ffffffff, 0xffff00ff00000000, 0xffff00ff000000ff,
 +    0xffff00ff0000ff00, 0xffff00ff0000ffff, 0xffff00ff00ff0000,
 +    0xffff00ff00ff00ff, 0xffff00ff00ffff00, 0xffff00ff00ffffff,
 +    0xffff00ffff000000, 0xffff00ffff0000ff, 0xffff00ffff00ff00,
 +    0xffff00ffff00ffff, 0xffff00ffffff0000, 0xffff00ffffff00ff,
 +    0xffff00ffffffff00, 0xffff00ffffffffff, 0xffffff0000000000,
 +    0xffffff00000000ff, 0xffffff000000ff00, 0xffffff000000ffff,
 +    0xffffff0000ff0000, 0xffffff0000ff00ff, 0xffffff0000ffff00,
 +    0xffffff0000ffffff, 0xffffff00ff000000, 0xffffff00ff0000ff,
 +    0xffffff00ff00ff00, 0xffffff00ff00ffff, 0xffffff00ffff0000,
 +    0xffffff00ffff00ff, 0xffffff00ffffff00, 0xffffff00ffffffff,
 +    0xffffffff00000000, 0xffffffff000000ff, 0xffffffff0000ff00,
 +    0xffffffff0000ffff, 0xffffffff00ff0000, 0xffffffff00ff00ff,
 +    0xffffffff00ffff00, 0xffffffff00ffffff, 0xffffffffff000000,
 +    0xffffffffff0000ff, 0xffffffffff00ff00, 0xffffffffff00ffff,
 +    0xffffffffffff0000, 0xffffffffffff00ff, 0xffffffffffffff00,
 +    0xffffffffffffffff,
 +};
 +
  /* Signed saturating rounding doubling multiply-accumulate high half, 8-bit */
  int8_t do_sqrdmlah_b(int8_t src1, int8_t src2, int8_t src3,
                       bool neg, bool round)
 --
 .20.1

-New patch
+[PULL 27/28] bitops.h: Provide hswap32(), hswap64(), wswap64() swapping operations
+Currently the ARM SVE helper code defines locally some utility
+functions for swapping 16-bit halfwords within 32-bit or 64-bit
+values and for swapping 32-bit words within 64-bit values,
+parallel to the byte-swapping bswap16/32/64 functions.
+We want these also for the ARM MVE code, and they're potentially
+generally useful for other targets, so move them to bitops.h.
+(We don't put them in bswap.h with the bswap* functions because
+they are implemented in terms of the rotate operations also
+defined in bitops.h, and including bitops.h from bswap.h seems
+better avoided.)
+Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
+Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
+Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
+Message-id: 20210614151007.4545-17-peter.maydell@linaro.org
+---
+ include/qemu/bitops.h   | 29 +++++++++++++++++++++++++++++
+ target/arm/sve_helper.c | 20 --------------------
+files changed, 29 insertions(+), 20 deletions(-)
+diff --git a/include/qemu/bitops.h b/include/qemu/bitops.h
+index XXXXXXX..XXXXXXX 100644
+--- a/include/qemu/bitops.h
++++ b/include/qemu/bitops.h
+@@ -XXX,XX +XXX,XX @@ static inline uint64_t ror64(uint64_t word, unsigned int shift)
+     return (word >> shift) | (word << ((64 - shift) & 63));
+ }
++/**
++ * hswap32 - swap 16-bit halfwords within a 32-bit value
++ * @h: value to swap
++ */
++static inline uint32_t hswap32(uint32_t h)
++{
++    return rol32(h, 16);
++}
++
++/**
++ * hswap64 - swap 16-bit halfwords within a 64-bit value
++ * @h: value to swap
++ */
++static inline uint64_t hswap64(uint64_t h)
++{
++    uint64_t m = 0x0000ffff0000ffffull;
++    h = rol64(h, 32);
++    return ((h & m) << 16) | ((h >> 16) & m);
++}
++
++/**
++ * wswap64 - swap 32-bit words within a 64-bit value
++ * @h: value to swap
++ */
++static inline uint64_t wswap64(uint64_t h)
++{
++    return rol64(h, 32);
++}
++
+ /**
+  * extract32:
+  * @value: the value to extract the bit field from
+diff --git a/target/arm/sve_helper.c b/target/arm/sve_helper.c
+index XXXXXXX..XXXXXXX 100644
+--- a/target/arm/sve_helper.c
++++ b/target/arm/sve_helper.c
+@@ -XXX,XX +XXX,XX @@ static inline uint64_t expand_pred_s(uint8_t byte)
+     return word[byte & 0x11];
+ }
+-/* Swap 16-bit words within a 32-bit word.  */
+-static inline uint32_t hswap32(uint32_t h)
+-{
+-    return rol32(h, 16);
+-}
+-
+-/* Swap 16-bit words within a 64-bit word.  */
+-static inline uint64_t hswap64(uint64_t h)
+-{
+-    uint64_t m = 0x0000ffff0000ffffull;
+-    h = rol64(h, 32);
+-    return ((h & m) << 16) | ((h >> 16) & m);
+-}
+-
+-/* Swap 32-bit words within a 64-bit word.  */
+-static inline uint64_t wswap64(uint64_t h)
+-{
+-    return rol64(h, 32);
+-}
+-
+ #define LOGICAL_PPPP(NAME, FUNC) \
+ void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc)  \
+ {                                                                         \
+--
+.20.1

-New patch
+[PULL 28/28] include/qemu/int128.h: Add function to create Int128 from int64_t
+int128_make64() creates an Int128 from an unsigned 64 bit value; add
+a function int128_makes64() creating an Int128 from a signed 64 bit
+value.
+Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
+Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
+Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
+Message-id: 20210614151007.4545-34-peter.maydell@linaro.org
+---
+ include/qemu/int128.h | 10 ++++++++++
+file changed, 10 insertions(+)
+diff --git a/include/qemu/int128.h b/include/qemu/int128.h
+index XXXXXXX..XXXXXXX 100644
+--- a/include/qemu/int128.h
++++ b/include/qemu/int128.h
+@@ -XXX,XX +XXX,XX @@ static inline Int128 int128_make64(uint64_t a)
+     return a;
+ }
++static inline Int128 int128_makes64(int64_t a)
++{
++    return a;
++}
++
+ static inline Int128 int128_make128(uint64_t lo, uint64_t hi)
+ {
+     return (__uint128_t)hi << 64 | lo;
+@@ -XXX,XX +XXX,XX @@ static inline Int128 int128_make64(uint64_t a)
+     return (Int128) { a, 0 };
+ }
++static inline Int128 int128_makes64(int64_t a)
++{
++    return (Int128) { a, a >> 63 };
++}
++
+ static inline Int128 int128_make128(uint64_t lo, uint64_t hi)
+ {
+     return (Int128) { lo, hi };
+--
+.20.1

Just a few minor bugfixes, but we might as well get them in
for rc0 tomorrow.

-- PMM

The following changes since commit 787f82407c5056a8b1097e39e53d01dd1abe406b:

Merge remote-tracking branch 'remotes/cohuck/tags/s390x-20200323' into staging (2020-03-23 15:38:30 +0000)

are available in the Git repository at:

https://git.linaro.org/people/pmaydell/qemu-arm.git tags/pull-target-arm-20200323

for you to fetch changes up to 550a04893c2bd4442211b353680b9a6408d94dba:

target/arm: Move computation of index in handle_simd_dupe (2020-03-23 17:22:30 +0000)

----------------------------------------------------------------
target-arm queue:
 * target/arm: avoid undefined behaviour shift in watchpoint code
 * target/arm: avoid undefined behaviour shift in handle_simd_dupe()
 * target/arm: add assert that immh != 0 in disas_simd_shift_imm()
 * aspeed/smc: Fix DMA support for AST2600
 * hw/arm/bcm283x: Correct the license text ('and' vs 'or')

----------------------------------------------------------------
Cédric Le Goater (1):
      aspeed/smc: Fix DMA support for AST2600

Philippe Mathieu-Daudé (1):
      hw/arm/bcm283x: Correct the license text

Richard Henderson (3):
      target/arm: Rearrange disabled check for watchpoints
      target/arm: Assert immh != 0 in disas_simd_shift_imm
      target/arm: Move computation of index in handle_simd_dupe

From: Philippe Mathieu-Daudé <philmd@redhat.com>

The license is the 'GNU General Public License v2.0 or later',
not 'and':

This program is free software; you can redistribute it and/or
  modify it under the terms of the GNU General Public License as
  published by the Free Software Foundation; either version 2 of
  the License, or (at your option) any later version.

Fix the license comment.

Signed-off-by: Philippe Mathieu-Daudé <philmd@redhat.com>
Message-id: 20200312213455.15854-1-philmd@redhat.com
Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
---
 include/hw/arm/bcm2835_peripherals.h | 3 ++-
 include/hw/arm/bcm2836.h             | 3 ++-
 include/hw/char/bcm2835_aux.h        | 3 ++-
 include/hw/display/bcm2835_fb.h      | 3 ++-
 include/hw/dma/bcm2835_dma.h         | 4 +++-
 include/hw/intc/bcm2835_ic.h         | 4 +++-
 include/hw/intc/bcm2836_control.h    | 3 ++-
 include/hw/misc/bcm2835_mbox.h       | 4 +++-
 include/hw/misc/bcm2835_mbox_defs.h  | 4 +++-
 include/hw/misc/bcm2835_property.h   | 4 +++-
 hw/arm/bcm2835_peripherals.c         | 3 ++-
 hw/arm/bcm2836.c                     | 3 ++-
 hw/arm/raspi.c                       | 3 ++-
 hw/display/bcm2835_fb.c              | 1 -
 hw/dma/bcm2835_dma.c                 | 4 +++-
 hw/intc/bcm2835_ic.c                 | 4 ++--
 hw/intc/bcm2836_control.c            | 4 +++-
 hw/misc/bcm2835_mbox.c               | 4 +++-
 hw/misc/bcm2835_property.c           | 4 +++-
 19 files changed, 45 insertions(+), 20 deletions(-)

diff --git a/include/hw/arm/bcm2835_peripherals.h b/include/hw/arm/bcm2835_peripherals.h
index XXXXXXX..XXXXXXX 100644
--- a/include/hw/arm/bcm2835_peripherals.h
+++ b/include/hw/arm/bcm2835_peripherals.h
@@ -XXX,XX +XXX,XX @@
  * Rasperry Pi 2 emulation and refactoring Copyright (c) 2015, Microsoft
  * Written by Andrew Baumann
  *
- * This code is licensed under the GNU GPLv2 and later.
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
  */
 
 #ifndef BCM2835_PERIPHERALS_H
diff --git a/include/hw/arm/bcm2836.h b/include/hw/arm/bcm2836.h
index XXXXXXX..XXXXXXX 100644
--- a/include/hw/arm/bcm2836.h
+++ b/include/hw/arm/bcm2836.h
@@ -XXX,XX +XXX,XX @@
  * Rasperry Pi 2 emulation and refactoring Copyright (c) 2015, Microsoft
  * Written by Andrew Baumann
  *
- * This code is licensed under the GNU GPLv2 and later.
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
  */
 
 #ifndef BCM2836_H
diff --git a/include/hw/char/bcm2835_aux.h b/include/hw/char/bcm2835_aux.h
index XXXXXXX..XXXXXXX 100644
--- a/include/hw/char/bcm2835_aux.h
+++ b/include/hw/char/bcm2835_aux.h
@@ -XXX,XX +XXX,XX @@
  * Rasperry Pi 2 emulation and refactoring Copyright (c) 2015, Microsoft
  * Written by Andrew Baumann
  *
- * This code is licensed under the GNU GPLv2 and later.
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
  */
 
 #ifndef BCM2835_AUX_H
diff --git a/include/hw/display/bcm2835_fb.h b/include/hw/display/bcm2835_fb.h
index XXXXXXX..XXXXXXX 100644
--- a/include/hw/display/bcm2835_fb.h
+++ b/include/hw/display/bcm2835_fb.h
@@ -XXX,XX +XXX,XX @@
  * Rasperry Pi 2 emulation and refactoring Copyright (c) 2015, Microsoft
  * Written by Andrew Baumann
  *
- * This code is licensed under the GNU GPLv2 and later.
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
  */
 
 #ifndef BCM2835_FB_H
diff --git a/include/hw/dma/bcm2835_dma.h b/include/hw/dma/bcm2835_dma.h
index XXXXXXX..XXXXXXX 100644
--- a/include/hw/dma/bcm2835_dma.h
+++ b/include/hw/dma/bcm2835_dma.h
@@ -XXX,XX +XXX,XX @@
 /*
  * Raspberry Pi emulation (c) 2012 Gregory Estrade
- * This code is licensed under the GNU GPLv2 and later.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
  */
 
 #ifndef BCM2835_DMA_H
diff --git a/include/hw/intc/bcm2835_ic.h b/include/hw/intc/bcm2835_ic.h
index XXXXXXX..XXXXXXX 100644
--- a/include/hw/intc/bcm2835_ic.h
+++ b/include/hw/intc/bcm2835_ic.h
@@ -XXX,XX +XXX,XX @@
 /*
  * Raspberry Pi emulation (c) 2012 Gregory Estrade
- * This code is licensed under the GNU GPLv2 and later.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
  */
 
 #ifndef BCM2835_IC_H
diff --git a/include/hw/intc/bcm2836_control.h b/include/hw/intc/bcm2836_control.h
index XXXXXXX..XXXXXXX 100644
--- a/include/hw/intc/bcm2836_control.h
+++ b/include/hw/intc/bcm2836_control.h
@@ -XXX,XX +XXX,XX @@
  * ARM Local Timer IRQ Copyright (c) 2019. Zoltán Baldaszti
  * Added basic IRQ_TIMER interrupt support
  *
- * This code is licensed under the GNU GPLv2 and later.
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
  */
 
 #ifndef BCM2836_CONTROL_H
diff --git a/include/hw/misc/bcm2835_mbox.h b/include/hw/misc/bcm2835_mbox.h
index XXXXXXX..XXXXXXX 100644
--- a/include/hw/misc/bcm2835_mbox.h
+++ b/include/hw/misc/bcm2835_mbox.h
@@ -XXX,XX +XXX,XX @@
 /*
  * Raspberry Pi emulation (c) 2012 Gregory Estrade
- * This code is licensed under the GNU GPLv2 and later.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
  */
 
 #ifndef BCM2835_MBOX_H
diff --git a/include/hw/misc/bcm2835_mbox_defs.h b/include/hw/misc/bcm2835_mbox_defs.h
index XXXXXXX..XXXXXXX 100644
--- a/include/hw/misc/bcm2835_mbox_defs.h
+++ b/include/hw/misc/bcm2835_mbox_defs.h
@@ -XXX,XX +XXX,XX @@
 /*
  * Raspberry Pi emulation (c) 2012 Gregory Estrade
- * This code is licensed under the GNU GPLv2 and later.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
  */
 
 #ifndef BCM2835_MBOX_DEFS_H
diff --git a/include/hw/misc/bcm2835_property.h b/include/hw/misc/bcm2835_property.h
index XXXXXXX..XXXXXXX 100644
--- a/include/hw/misc/bcm2835_property.h
+++ b/include/hw/misc/bcm2835_property.h
@@ -XXX,XX +XXX,XX @@
 /*
  * Raspberry Pi emulation (c) 2012 Gregory Estrade
- * This code is licensed under the GNU GPLv2 and later.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
  */
 
 #ifndef BCM2835_PROPERTY_H
diff --git a/hw/arm/bcm2835_peripherals.c b/hw/arm/bcm2835_peripherals.c
index XXXXXXX..XXXXXXX 100644
--- a/hw/arm/bcm2835_peripherals.c
+++ b/hw/arm/bcm2835_peripherals.c
@@ -XXX,XX +XXX,XX @@
  * Rasperry Pi 2 emulation and refactoring Copyright (c) 2015, Microsoft
  * Written by Andrew Baumann
  *
- * This code is licensed under the GNU GPLv2 and later.
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
  */
 
 #include "qemu/osdep.h"
diff --git a/hw/arm/bcm2836.c b/hw/arm/bcm2836.c
index XXXXXXX..XXXXXXX 100644
--- a/hw/arm/bcm2836.c
+++ b/hw/arm/bcm2836.c
@@ -XXX,XX +XXX,XX @@
  * Rasperry Pi 2 emulation and refactoring Copyright (c) 2015, Microsoft
  * Written by Andrew Baumann
  *
- * This code is licensed under the GNU GPLv2 and later.
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
  */
 
 #include "qemu/osdep.h"
diff --git a/hw/arm/raspi.c b/hw/arm/raspi.c
index XXXXXXX..XXXXXXX 100644
--- a/hw/arm/raspi.c
+++ b/hw/arm/raspi.c
@@ -XXX,XX +XXX,XX @@
  * Raspberry Pi 3 emulation Copyright (c) 2018 Zoltán Baldaszti
  * Upstream code cleanup (c) 2018 Pekka Enberg
  *
- * This code is licensed under the GNU GPLv2 and later.
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
  */
 
 #include "qemu/osdep.h"
diff --git a/hw/display/bcm2835_fb.c b/hw/display/bcm2835_fb.c
index XXXXXXX..XXXXXXX 100644
--- a/hw/display/bcm2835_fb.c
+++ b/hw/display/bcm2835_fb.c
@@ -XXX,XX +XXX,XX @@
 /*
  * Raspberry Pi emulation (c) 2012 Gregory Estrade
  * Refactoring for Pi2 Copyright (c) 2015, Microsoft. Written by Andrew Baumann.
- * This code is licensed under the GNU GPLv2 and later.
  *
  * Heavily based on milkymist-vgafb.c, copyright terms below:
  *  QEMU model of the Milkymist VGA framebuffer.
diff --git a/hw/dma/bcm2835_dma.c b/hw/dma/bcm2835_dma.c
index XXXXXXX..XXXXXXX 100644
--- a/hw/dma/bcm2835_dma.c
+++ b/hw/dma/bcm2835_dma.c
@@ -XXX,XX +XXX,XX @@
 /*
  * Raspberry Pi emulation (c) 2012 Gregory Estrade
- * This code is licensed under the GNU GPLv2 and later.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
  */
 
 #include "qemu/osdep.h"
diff --git a/hw/intc/bcm2835_ic.c b/hw/intc/bcm2835_ic.c
index XXXXXXX..XXXXXXX 100644
--- a/hw/intc/bcm2835_ic.c
+++ b/hw/intc/bcm2835_ic.c
@@ -XXX,XX +XXX,XX @@
 /*
  * Raspberry Pi emulation (c) 2012 Gregory Estrade
  * Refactoring for Pi2 Copyright (c) 2015, Microsoft. Written by Andrew Baumann.
- * This code is licensed under the GNU GPLv2 and later.
  * Heavily based on pl190.c, copyright terms below:
  *
  * Arm PrimeCell PL190 Vector Interrupt Controller
@@ -XXX,XX +XXX,XX @@
  * Copyright (c) 2006 CodeSourcery.
  * Written by Paul Brook
  *
- * This code is licensed under the GPL.
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
  */
 
 #include "qemu/osdep.h"
diff --git a/hw/intc/bcm2836_control.c b/hw/intc/bcm2836_control.c
index XXXXXXX..XXXXXXX 100644
--- a/hw/intc/bcm2836_control.c
+++ b/hw/intc/bcm2836_control.c
@@ -XXX,XX +XXX,XX @@
  * Written by Andrew Baumann
  *
  * Based on bcm2835_ic.c (Raspberry Pi emulation) (c) 2012 Gregory Estrade
- * This code is licensed under the GNU GPLv2 and later.
  *
  * At present, only implements interrupt routing, and mailboxes (i.e.,
  * not PMU interrupt, or AXI counters).
@@ -XXX,XX +XXX,XX @@
  *
  * Ref:
  * https://www.raspberrypi.org/documentation/hardware/raspberrypi/bcm2836/QA7_rev3.4.pdf
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
  */
 
 #include "qemu/osdep.h"
diff --git a/hw/misc/bcm2835_mbox.c b/hw/misc/bcm2835_mbox.c
index XXXXXXX..XXXXXXX 100644
--- a/hw/misc/bcm2835_mbox.c
+++ b/hw/misc/bcm2835_mbox.c
@@ -XXX,XX +XXX,XX @@
 /*
  * Raspberry Pi emulation (c) 2012 Gregory Estrade
- * This code is licensed under the GNU GPLv2 and later.
  *
  * This file models the system mailboxes, which are used for
  * communication with low-bandwidth GPU peripherals. Refs:
  *   https://github.com/raspberrypi/firmware/wiki/Mailboxes
  *   https://github.com/raspberrypi/firmware/wiki/Accessing-mailboxes
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
  */
 
 #include "qemu/osdep.h"
diff --git a/hw/misc/bcm2835_property.c b/hw/misc/bcm2835_property.c
index XXXXXXX..XXXXXXX 100644
--- a/hw/misc/bcm2835_property.c
+++ b/hw/misc/bcm2835_property.c
@@ -XXX,XX +XXX,XX @@
 /*
  * Raspberry Pi emulation (c) 2012 Gregory Estrade
- * This code is licensed under the GNU GPLv2 and later.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
  */
 
 #include "qemu/osdep.h"
-- 
2.20.1

From: Cédric Le Goater <clg@kaod.org>

Recent firmware uses SPI DMA transfers in U-Boot to load the
different images (kernel, initrd, dtb) in the SoC DRAM. The AST2600
FMC model is missing the masks to be applied on the DMA registers
which resulted in incorrect values. Fix that and wire the SPI
controllers which have DMA support on the AST2600.

Fixes: bcaa8ddd081c ("aspeed/smc: Add AST2600 support")
Signed-off-by: Cédric Le Goater <clg@kaod.org>
Reviewed-by: Joel Stanley <joel@jms.id.au>
Message-id: 20200320053923.20565-1-clg@kaod.org
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
---
 hw/arm/aspeed_ast2600.c |  6 ++++++
 hw/ssi/aspeed_smc.c     | 15 +++++++++++++--
 hw/ssi/trace-events     |  1 +
 3 files changed, 20 insertions(+), 2 deletions(-)

diff --git a/hw/arm/aspeed_ast2600.c b/hw/arm/aspeed_ast2600.c
index XXXXXXX..XXXXXXX 100644
--- a/hw/arm/aspeed_ast2600.c
+++ b/hw/arm/aspeed_ast2600.c
@@ -XXX,XX +XXX,XX @@ static void aspeed_soc_ast2600_realize(DeviceState *dev, Error **errp)
 
     /* SPI */
     for (i = 0; i < sc->spis_num; i++) {
+        object_property_set_link(OBJECT(&s->spi[i]), OBJECT(s->dram_mr),
+                                 "dram", &err);
+        if (err) {
+            error_propagate(errp, err);
+            return;
+        }
         object_property_set_int(OBJECT(&s->spi[i]), 1, "num-cs", &err);
         object_property_set_bool(OBJECT(&s->spi[i]), true, "realized",
                                  &local_err);
diff --git a/hw/ssi/aspeed_smc.c b/hw/ssi/aspeed_smc.c
index XXXXXXX..XXXXXXX 100644
--- a/hw/ssi/aspeed_smc.c
+++ b/hw/ssi/aspeed_smc.c
@@ -XXX,XX +XXX,XX @@ static const AspeedSMCController controllers[] = {
         .flash_window_base = ASPEED26_SOC_FMC_FLASH_BASE,
         .flash_window_size = 0x10000000,
         .has_dma           = true,
+        .dma_flash_mask    = 0x0FFFFFFC,
+        .dma_dram_mask     = 0x3FFFFFFC,
         .nregs             = ASPEED_SMC_R_MAX,
         .segment_to_reg    = aspeed_2600_smc_segment_to_reg,
         .reg_to_segment    = aspeed_2600_smc_reg_to_segment,
@@ -XXX,XX +XXX,XX @@ static const AspeedSMCController controllers[] = {
         .segments          = aspeed_segments_ast2600_spi1,
         .flash_window_base = ASPEED26_SOC_SPI_FLASH_BASE,
         .flash_window_size = 0x10000000,
-        .has_dma           = false,
+        .has_dma           = true,
+        .dma_flash_mask    = 0x0FFFFFFC,
+        .dma_dram_mask     = 0x3FFFFFFC,
         .nregs             = ASPEED_SMC_R_MAX,
         .segment_to_reg    = aspeed_2600_smc_segment_to_reg,
         .reg_to_segment    = aspeed_2600_smc_reg_to_segment,
@@ -XXX,XX +XXX,XX @@ static const AspeedSMCController controllers[] = {
         .segments          = aspeed_segments_ast2600_spi2,
         .flash_window_base = ASPEED26_SOC_SPI2_FLASH_BASE,
         .flash_window_size = 0x10000000,
-        .has_dma           = false,
+        .has_dma           = true,
+        .dma_flash_mask    = 0x0FFFFFFC,
+        .dma_dram_mask     = 0x3FFFFFFC,
         .nregs             = ASPEED_SMC_R_MAX,
         .segment_to_reg    = aspeed_2600_smc_segment_to_reg,
         .reg_to_segment    = aspeed_2600_smc_reg_to_segment,
@@ -XXX,XX +XXX,XX @@ static void aspeed_smc_dma_rw(AspeedSMCState *s)
     MemTxResult result;
     uint32_t data;
 
+    trace_aspeed_smc_dma_rw(s->regs[R_DMA_CTRL] & DMA_CTRL_WRITE ?
+                            "write" : "read",
+                            s->regs[R_DMA_FLASH_ADDR],
+                            s->regs[R_DMA_DRAM_ADDR],
+                            s->regs[R_DMA_LEN]);
     while (s->regs[R_DMA_LEN]) {
         if (s->regs[R_DMA_CTRL] & DMA_CTRL_WRITE) {
             data = address_space_ldl_le(&s->dram_as, s->regs[R_DMA_DRAM_ADDR],
diff --git a/hw/ssi/trace-events b/hw/ssi/trace-events
index XXXXXXX..XXXXXXX 100644
--- a/hw/ssi/trace-events
+++ b/hw/ssi/trace-events
@@ -XXX,XX +XXX,XX @@ aspeed_smc_do_snoop(int cs, int index, int dummies, int data) "CS%d index:0x%x d
 aspeed_smc_flash_write(int cs, uint64_t addr,  uint32_t size, uint64_t data, int mode) "CS%d @0x%" PRIx64 " size %u: 0x%" PRIx64" mode:%d"
 aspeed_smc_read(uint64_t addr,  uint32_t size, uint64_t data) "@0x%" PRIx64 " size %u: 0x%" PRIx64
 aspeed_smc_dma_checksum(uint32_t addr, uint32_t data) "0x%08x: 0x%08x"
+aspeed_smc_dma_rw(const char *dir, uint32_t flash_addr, uint32_t dram_addr, uint32_t size) "%s flash:@0x%08x dram:@0x%08x size:0x%08x"
 aspeed_smc_write(uint64_t addr,  uint32_t size, uint64_t data) "@0x%" PRIx64 " size %u: 0x%" PRIx64
 aspeed_smc_flash_select(int cs, const char *prefix) "CS%d %sselect"
-- 
2.20.1

From: Richard Henderson <richard.henderson@linaro.org>

Coverity rightly notes that ctz32(bas) on 0 will return 32,
which makes the len calculation a BAD_SHIFT.

A value of 0 in DBGWCR<n>_EL1.BAS is reserved.  Simply move
the existing check we have for this case.

Reported-by: Coverity (CID 1421964)
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
Reviewed-by: Philippe Mathieu-Daudé <philmd@redhat.com>
Message-id: 20200320160622.8040-2-richard.henderson@linaro.org
Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
---
 target/arm/helper.c | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/target/arm/helper.c b/target/arm/helper.c
index XXXXXXX..XXXXXXX 100644
--- a/target/arm/helper.c
+++ b/target/arm/helper.c
@@ -XXX,XX +XXX,XX @@ void hw_watchpoint_update(ARMCPU *cpu, int n)
         int bas = extract64(wcr, 5, 8);
         int basstart;
 
-        if (bas == 0) {
-            /* This must act as if the watchpoint is disabled */
-            return;
-        }
-
         if (extract64(wvr, 2, 1)) {
             /* Deprecated case of an only 4-aligned address. BAS[7:4] are
              * ignored, and BAS[3:0] define which bytes to watch.
              */
             bas &= 0xf;
         }
+
+        if (bas == 0) {
+            /* This must act as if the watchpoint is disabled */
+            return;
+        }
+
         /* The BAS bits are supposed to be programmed to indicate a contiguous
          * range of bytes. Otherwise it is CONSTRAINED UNPREDICTABLE whether
          * we fire for each byte in the word/doubleword addressed by the WVR.
-- 
2.20.1

From: Richard Henderson <richard.henderson@linaro.org>

Coverity raised a shed-load of errors cascading from inferring
that clz32(immh) might yield 32, from immh might be 0.

While immh cannot be 0 from encoding, it is not obvious even to
a human how we've checked that: via the filtering provided by
data_proc_simd[].

Reported-by: Coverity (CID 1421923, and more)
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
Reviewed-by: Philippe Mathieu-Daudé <philmd@redhat.com>
Message-id: 20200320160622.8040-3-richard.henderson@linaro.org
Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
---
 target/arm/translate-a64.c | 3 +++
 1 file changed, 3 insertions(+)

From: Richard Henderson <richard.henderson@linaro.org>

Coverity reports a BAD_SHIFT with ctz32(imm5), with imm5 == 0.
This is an invalid encoding, but we diagnose that just below
by rejecting size > 3.  Avoid the warning by sinking the
computation of index below the check.

Reported-by: Coverity (CID 1421965)
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
Reviewed-by: Philippe Mathieu-Daudé <philmd@redhat.com>
Message-id: 20200320160622.8040-4-richard.henderson@linaro.org
Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
---
 target/arm/translate-a64.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/target/arm/translate-a64.c b/target/arm/translate-a64.c
index XXXXXXX..XXXXXXX 100644
--- a/target/arm/translate-a64.c
+++ b/target/arm/translate-a64.c
@@ -XXX,XX +XXX,XX @@ static void handle_simd_dupe(DisasContext *s, int is_q, int rd, int rn,
                              int imm5)
 {
     int size = ctz32(imm5);
-    int index = imm5 >> (size + 1);
+    int index;
 
     if (size > 3 || (size == 3 && !is_q)) {
         unallocated_encoding(s);
@@ -XXX,XX +XXX,XX @@ static void handle_simd_dupe(DisasContext *s, int is_q, int rd, int rn,
         return;
     }
 
+    index = imm5 >> (size + 1);
     tcg_gen_gvec_dup_mem(size, vec_full_reg_offset(s, rd),
                          vec_reg_offset(s, rn, index, size),
                          is_q ? 16 : 8, vec_full_reg_size(s));
-- 
2.20.1

The following changes since commit 1ea06abceec61b6f3ab33dadb0510b6e09fb61e2:

Merge remote-tracking branch 'remotes/berrange-gitlab/tags/misc-fixes-pull-request' into staging (2021-06-14 15:59:13 +0100)

are available in the Git repository at:

https://git.linaro.org/people/pmaydell/qemu-arm.git tags/pull-target-arm-20210615

for you to fetch changes up to c611c956c7fdce651e30687b1f5d19b4cab78b6a:

include/qemu/int128.h: Add function to create Int128 from int64_t (2021-06-15 16:18:50 +0100)

----------------------------------------------------------------
target-arm queue:
 * hw/intc/arm_gicv3_cpuif: Tolerate spurious EOIR writes
 * handle some UNALLOCATED decode cases correctly rather
   than asserting
 * hw: virt: consider hw_compat_6_0
 * hw/arm: add quanta-gbs-bmc machine
 * hw/intc/armv7m_nvic: Remove stale comment
 * arm, acpi: Remove dependency on presence of 'virt' board
 * target/arm: Fix mte page crossing test
 * hw/arm: quanta-q71l add pca954x muxes
 * target/arm: First few parts of MVE support

----------------------------------------------------------------
Heinrich Schuchardt (1):
      hw: virt: consider hw_compat_6_0

Jean-Philippe Brucker (1):
      hw/intc/arm_gicv3_cpuif: Tolerate spurious EOIR writes

Patrick Venture (5):
      hw/arm: add quanta-gbs-bmc machine
      hw/arm: quanta-gbs-bmc add i2c comments
      hw/arm: gsj add i2c comments
      hw/arm: gsj add pca9548
      hw/arm: quanta-q71l add pca954x muxes

Peter Maydell (17):
      hw/intc/armv7m_nvic: Remove stale comment
      hw/acpi: Provide stub version of acpi_ghes_record_errors()
      hw/acpi: Provide function acpi_ghes_present()
      target/arm: Use acpi_ghes_present() to see if we report ACPI memory errors
      target/arm: Provide and use H8 and H1_8 macros
      target/arm: Enable FPSCR.QC bit for MVE
      target/arm: Handle VPR semantics in existing code
      target/arm: Add handling for PSR.ECI/ICI
      target/arm: Let vfp_access_check() handle late NOCP checks
      target/arm: Implement MVE LCTP
      target/arm: Implement MVE WLSTP insn
      target/arm: Implement MVE DLSTP
      target/arm: Implement MVE LETP insn
      target/arm: Add framework for MVE decode
      target/arm: Move expand_pred_b() data to vec_helper.c
      bitops.h: Provide hswap32(), hswap64(), wswap64() swapping operations
      include/qemu/int128.h: Add function to create Int128 from int64_t

Richard Henderson (4):
      target/arm: Diagnose UNALLOCATED in disas_simd_two_reg_misc_fp16
      target/arm: Remove fprintf from disas_simd_mod_imm
      target/arm: Diagnose UNALLOCATED in disas_simd_three_reg_same_fp16
      target/arm: Fix mte page crossing test

From: Jean-Philippe Brucker <jean-philippe@linaro.org>

Commit 382c7160d1cd ("hw/intc/arm_gicv3_cpuif: Fix EOIR write access
check logic") added a g_assert_not_reached() if the guest writes the EOIR
register while no interrupt is active.

It turns out some software does this: EDK2, in
GicV3ExitBootServicesEvent(), unconditionally writes EOIR for all
interrupts that it manages. This now causes QEMU to abort when running
UEFI on a VM with GICv3. Although it is UNPREDICTABLE behavior and EDK2
does need fixing, the punishment seems a little harsh, especially since
icc_eoir_write() already tolerates writes of nonexistent interrupt
numbers. Display a guest error and tolerate spurious EOIR writes.

Fixes: 382c7160d1cd ("hw/intc/arm_gicv3_cpuif: Fix EOIR write access check logic")
Signed-off-by: Jean-Philippe Brucker <jean-philippe@linaro.org>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Tested-by: Alex Bennée <alex.bennee@linaro.org>
Message-id: 20210604130352.1887560-1-jean-philippe@linaro.org
Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
---
 hw/intc/arm_gicv3_cpuif.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/hw/intc/arm_gicv3_cpuif.c b/hw/intc/arm_gicv3_cpuif.c
index XXXXXXX..XXXXXXX 100644
--- a/hw/intc/arm_gicv3_cpuif.c
+++ b/hw/intc/arm_gicv3_cpuif.c
@@ -XXX,XX +XXX,XX @@
 
 #include "qemu/osdep.h"
 #include "qemu/bitops.h"
+#include "qemu/log.h"
 #include "qemu/main-loop.h"
 #include "trace.h"
 #include "gicv3_internal.h"
@@ -XXX,XX +XXX,XX @@ static void icc_eoir_write(CPUARMState *env, const ARMCPRegInfo *ri,
         }
         break;
     default:
-        g_assert_not_reached();
+        qemu_log_mask(LOG_GUEST_ERROR,
+                      "%s: IRQ %d isn't active\n", __func__, irq);
+        return;
     }
 
     icc_drop_prio(cs, grp);
-- 
2.20.1

From: Richard Henderson <richard.henderson@linaro.org>

This fprintf+assert has been in place since the beginning.
It is prior to the fp_access_check, so we're still good to
raise sigill here.

Resolves: https://gitlab.com/qemu-project/qemu/-/issues/381
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Message-id: 20210604183506.916654-2-richard.henderson@linaro.org
Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
---
 target/arm/translate-a64.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

From: Richard Henderson <richard.henderson@linaro.org>

This fprintf+assert has been in place since the beginning.
It is after the fp_access_check, so we need to move the
check up.  Fold that in to the pairwise filter.

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Message-id: 20210604183506.916654-4-richard.henderson@linaro.org
Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
---
 target/arm/translate-a64.c | 82 +++++++++++++++++++++++---------------
 1 file changed, 50 insertions(+), 32 deletions(-)

diff --git a/target/arm/translate-a64.c b/target/arm/translate-a64.c
index XXXXXXX..XXXXXXX 100644
--- a/target/arm/translate-a64.c
+++ b/target/arm/translate-a64.c
@@ -XXX,XX +XXX,XX @@ static void disas_simd_three_reg_same(DisasContext *s, uint32_t insn)
  */
 static void disas_simd_three_reg_same_fp16(DisasContext *s, uint32_t insn)
 {
-    int opcode, fpopcode;
-    int is_q, u, a, rm, rn, rd;
-    int datasize, elements;
-    int pass;
+    int opcode = extract32(insn, 11, 3);
+    int u = extract32(insn, 29, 1);
+    int a = extract32(insn, 23, 1);
+    int is_q = extract32(insn, 30, 1);
+    int rm = extract32(insn, 16, 5);
+    int rn = extract32(insn, 5, 5);
+    int rd = extract32(insn, 0, 5);
+    /*
+     * For these floating point ops, the U, a and opcode bits
+     * together indicate the operation.
+     */
+    int fpopcode = opcode | (a << 3) | (u << 4);
+    int datasize = is_q ? 128 : 64;
+    int elements = datasize / 16;
+    bool pairwise;
     TCGv_ptr fpst;
-    bool pairwise = false;
+    int pass;
+
+    switch (fpopcode) {
+    case 0x0: /* FMAXNM */
+    case 0x1: /* FMLA */
+    case 0x2: /* FADD */
+    case 0x3: /* FMULX */
+    case 0x4: /* FCMEQ */
+    case 0x6: /* FMAX */
+    case 0x7: /* FRECPS */
+    case 0x8: /* FMINNM */
+    case 0x9: /* FMLS */
+    case 0xa: /* FSUB */
+    case 0xe: /* FMIN */
+    case 0xf: /* FRSQRTS */
+    case 0x13: /* FMUL */
+    case 0x14: /* FCMGE */
+    case 0x15: /* FACGE */
+    case 0x17: /* FDIV */
+    case 0x1a: /* FABD */
+    case 0x1c: /* FCMGT */
+    case 0x1d: /* FACGT */
+        pairwise = false;
+        break;
+    case 0x10: /* FMAXNMP */
+    case 0x12: /* FADDP */
+    case 0x16: /* FMAXP */
+    case 0x18: /* FMINNMP */
+    case 0x1e: /* FMINP */
+        pairwise = true;
+        break;
+    default:
+        unallocated_encoding(s);
+        return;
+    }
 
     if (!dc_isar_feature(aa64_fp16, s)) {
         unallocated_encoding(s);
@@ -XXX,XX +XXX,XX @@ static void disas_simd_three_reg_same_fp16(DisasContext *s, uint32_t insn)
         return;
     }
 
-    /* For these floating point ops, the U, a and opcode bits
-     * together indicate the operation.
-     */
-    opcode = extract32(insn, 11, 3);
-    u = extract32(insn, 29, 1);
-    a = extract32(insn, 23, 1);
-    is_q = extract32(insn, 30, 1);
-    rm = extract32(insn, 16, 5);
-    rn = extract32(insn, 5, 5);
-    rd = extract32(insn, 0, 5);
-
-    fpopcode = opcode | (a << 3) |  (u << 4);
-    datasize = is_q ? 128 : 64;
-    elements = datasize / 16;
-
-    switch (fpopcode) {
-    case 0x10: /* FMAXNMP */
-    case 0x12: /* FADDP */
-    case 0x16: /* FMAXP */
-    case 0x18: /* FMINNMP */
-    case 0x1e: /* FMINP */
-        pairwise = true;
-        break;
-    }
-
     fpst = fpstatus_ptr(FPST_FPCR_F16);
 
     if (pairwise) {
@@ -XXX,XX +XXX,XX @@ static void disas_simd_three_reg_same_fp16(DisasContext *s, uint32_t insn)
                 gen_helper_advsimd_acgt_f16(tcg_res, tcg_op1, tcg_op2, fpst);
                 break;
             default:
-                fprintf(stderr, "%s: insn 0x%04x, fpop 0x%2x @ 0x%" PRIx64 "\n",
-                        __func__, insn, fpopcode, s->pc_curr);
                 g_assert_not_reached();
             }
 
-- 
2.20.1

From: Patrick Venture <venture@google.com>

Adds initial quanta-gbs-bmc machine support.

Tested: Boots to userspace.
Signed-off-by: Patrick Venture <venture@google.com>
Reviewed-by: Brandon Kim <brandonkim@google.com>
Reviewed-by: Hao Wu <wuhaotsh@google.com>
Message-id: 20210608193605.2611114-2-venture@google.com
Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
---
 hw/arm/npcm7xx_boards.c | 33 +++++++++++++++++++++++++++++++++
 1 file changed, 33 insertions(+)

diff --git a/hw/arm/npcm7xx_boards.c b/hw/arm/npcm7xx_boards.c
index XXXXXXX..XXXXXXX 100644
--- a/hw/arm/npcm7xx_boards.c
+++ b/hw/arm/npcm7xx_boards.c
@@ -XXX,XX +XXX,XX @@
 
 #define NPCM750_EVB_POWER_ON_STRAPS 0x00001ff7
 #define QUANTA_GSJ_POWER_ON_STRAPS 0x00001fff
+#define QUANTA_GBS_POWER_ON_STRAPS 0x000017ff
 
 static const char npcm7xx_default_bootrom[] = "npcm7xx_bootrom.bin";
 
@@ -XXX,XX +XXX,XX @@ static void quanta_gsj_init(MachineState *machine)
     npcm7xx_load_kernel(machine, soc);
 }
 
+static void quanta_gbs_init(MachineState *machine)
+{
+    NPCM7xxState *soc;
+
+    soc = npcm7xx_create_soc(machine, QUANTA_GBS_POWER_ON_STRAPS);
+    npcm7xx_connect_dram(soc, machine->ram);
+    qdev_realize(DEVICE(soc), NULL, &error_fatal);
+
+    npcm7xx_load_bootrom(machine, soc);
+
+    npcm7xx_connect_flash(&soc->fiu[0], 0, "mx66u51235f",
+                          drive_get(IF_MTD, 0, 0));
+
+    npcm7xx_load_kernel(machine, soc);
+}
+
 static void npcm7xx_set_soc_type(NPCM7xxMachineClass *nmc, const char *type)
 {
     NPCM7xxClass *sc = NPCM7XX_CLASS(object_class_by_name(type));
@@ -XXX,XX +XXX,XX @@ static void gsj_machine_class_init(ObjectClass *oc, void *data)
     mc->default_ram_size = 512 * MiB;
 };
 
+static void gbs_bmc_machine_class_init(ObjectClass *oc, void *data)
+{
+    NPCM7xxMachineClass *nmc = NPCM7XX_MACHINE_CLASS(oc);
+    MachineClass *mc = MACHINE_CLASS(oc);
+
+    npcm7xx_set_soc_type(nmc, TYPE_NPCM730);
+
+    mc->desc = "Quanta GBS (Cortex-A9)";
+    mc->init = quanta_gbs_init;
+    mc->default_ram_size = 1 * GiB;
+}
+
 static const TypeInfo npcm7xx_machine_types[] = {
     {
         .name           = TYPE_NPCM7XX_MACHINE,
@@ -XXX,XX +XXX,XX @@ static const TypeInfo npcm7xx_machine_types[] = {
         .name           = MACHINE_TYPE_NAME("quanta-gsj"),
         .parent         = TYPE_NPCM7XX_MACHINE,
         .class_init     = gsj_machine_class_init,
+    }, {
+        .name           = MACHINE_TYPE_NAME("quanta-gbs-bmc"),
+        .parent         = TYPE_NPCM7XX_MACHINE,
+        .class_init     = gbs_bmc_machine_class_init,
     },
 };
 
-- 
2.20.1

From: Patrick Venture <venture@google.com>

Add a comment and i2c method that describes the board layout.

Tested: firmware booted to userspace.
Signed-off-by: Patrick Venture <venture@google.com>
Reviewed-by: Brandon Kim <brandonkim@google.com>
Reviewed-by: Hao Wu <wuhaotsh@google.com>
Message-id: 20210608193605.2611114-3-venture@google.com
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
---
 hw/arm/npcm7xx_boards.c | 60 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 60 insertions(+)

diff --git a/hw/arm/npcm7xx_boards.c b/hw/arm/npcm7xx_boards.c
index XXXXXXX..XXXXXXX 100644
--- a/hw/arm/npcm7xx_boards.c
+++ b/hw/arm/npcm7xx_boards.c
@@ -XXX,XX +XXX,XX @@ static void quanta_gsj_fan_init(NPCM7xxMachine *machine, NPCM7xxState *soc)
     npcm7xx_connect_pwm_fan(soc, &splitter[2], 0x05, 1);
 }
 
+static void quanta_gbs_i2c_init(NPCM7xxState *soc)
+{
+    /*
+     * i2c-0:
+     *     pca9546@71
+     *
+     * i2c-1:
+     *     pca9535@24
+     *     pca9535@20
+     *     pca9535@21
+     *     pca9535@22
+     *     pca9535@23
+     *     pca9535@25
+     *     pca9535@26
+     *
+     * i2c-2:
+     *     sbtsi@4c
+     *
+     * i2c-5:
+     *     atmel,24c64@50 mb_fru
+     *     pca9546@71
+     *         - channel 0: max31725@54
+     *         - channel 1: max31725@55
+     *         - channel 2: max31725@5d
+     *                      atmel,24c64@51 fan_fru
+     *         - channel 3: atmel,24c64@52 hsbp_fru
+     *
+     * i2c-6:
+     *     pca9545@73
+     *
+     * i2c-7:
+     *     pca9545@72
+     *
+     * i2c-8:
+     *     adi,adm1272@10
+     *
+     * i2c-9:
+     *     pca9546@71
+     *         - channel 0: isil,isl68137@60
+     *         - channel 1: isil,isl68137@61
+     *         - channel 2: isil,isl68137@63
+     *         - channel 3: isil,isl68137@45
+     *
+     * i2c-10:
+     *     pca9545@71
+     *
+     * i2c-11:
+     *     pca9545@76
+     *
+     * i2c-12:
+     *     maxim,max34451@4e
+     *     isil,isl68137@5d
+     *     isil,isl68137@5e
+     *
+     * i2c-14:
+     *     pca9545@70
+     */
+}
+
 static void npcm750_evb_init(MachineState *machine)
 {
     NPCM7xxState *soc;
@@ -XXX,XX +XXX,XX @@ static void quanta_gbs_init(MachineState *machine)
     npcm7xx_connect_flash(&soc->fiu[0], 0, "mx66u51235f",
                           drive_get(IF_MTD, 0, 0));
 
+    quanta_gbs_i2c_init(soc);
     npcm7xx_load_kernel(machine, soc);
 }
 
-- 
2.20.1

In commit da6d674e509f0939b we split the NVIC code out from the GIC.
This allowed us to specify the NVIC's default value for the num-irq
property (64) in the usual way in its property list, and we deleted
the previous hack where we updated the value in the state struct in
the instance init function.  Remove a stale comment about that hack
which we forgot to delete at that time.

Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Message-id: 20210614161243.14211-1-peter.maydell@linaro.org
---
 hw/intc/armv7m_nvic.c | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/hw/intc/armv7m_nvic.c b/hw/intc/armv7m_nvic.c
index XXXXXXX..XXXXXXX 100644
--- a/hw/intc/armv7m_nvic.c
+++ b/hw/intc/armv7m_nvic.c
@@ -XXX,XX +XXX,XX @@ static void armv7m_nvic_realize(DeviceState *dev, Error **errp)
 
 static void armv7m_nvic_instance_init(Object *obj)
 {
-    /* We have a different default value for the num-irq property
-     * than our superclass. This function runs after qdev init
-     * has set the defaults from the Property array and before
-     * any user-specified property setting, so just modify the
-     * value in the GICState struct.
-     */
     DeviceState *dev = DEVICE(obj);
     NVICState *nvic = NVIC(obj);
     SysBusDevice *sbd = SYS_BUS_DEVICE(obj);
-- 
2.20.1

Generic code in target/arm wants to call acpi_ghes_record_errors();
provide a stub version so that we don't fail to link when
CONFIG_ACPI_APEI is not set. This requires us to add a new
ghes-stub.c file to contain it and the meson.build mechanics
to use it when appropriate.

Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Reviewed-by: Dongjiu Geng <gengdongjiu1@gmail.com>
Message-id: 20210603171259.27962-2-peter.maydell@linaro.org
---
 hw/acpi/ghes-stub.c | 17 +++++++++++++++++
 hw/acpi/meson.build |  6 +++---
 2 files changed, 20 insertions(+), 3 deletions(-)
 create mode 100644 hw/acpi/ghes-stub.c

diff --git a/hw/acpi/ghes-stub.c b/hw/acpi/ghes-stub.c
new file mode 100644
index XXXXXXX..XXXXXXX
--- /dev/null
+++ b/hw/acpi/ghes-stub.c
@@ -XXX,XX +XXX,XX @@
+/*
+ * Support for generating APEI tables and recording CPER for Guests:
+ * stub functions.
+ *
+ * Copyright (c) 2021 Linaro, Ltd
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ */
+
+#include "qemu/osdep.h"
+#include "hw/acpi/ghes.h"
+
+int acpi_ghes_record_errors(uint8_t source_id, uint64_t physical_address)
+{
+    return -1;
+}
diff --git a/hw/acpi/meson.build b/hw/acpi/meson.build
index XXXXXXX..XXXXXXX 100644
--- a/hw/acpi/meson.build
+++ b/hw/acpi/meson.build
@@ -XXX,XX +XXX,XX @@ acpi_ss.add(when: 'CONFIG_ACPI_PCI', if_true: files('pci.c'))
 acpi_ss.add(when: 'CONFIG_ACPI_VMGENID', if_true: files('vmgenid.c'))
 acpi_ss.add(when: 'CONFIG_ACPI_HW_REDUCED', if_true: files('generic_event_device.c'))
 acpi_ss.add(when: 'CONFIG_ACPI_HMAT', if_true: files('hmat.c'))
-acpi_ss.add(when: 'CONFIG_ACPI_APEI', if_true: files('ghes.c'))
+acpi_ss.add(when: 'CONFIG_ACPI_APEI', if_true: files('ghes.c'), if_false: files('ghes-stub.c'))
 acpi_ss.add(when: 'CONFIG_ACPI_X86', if_true: files('core.c', 'piix4.c', 'pcihp.c'), if_false: files('acpi-stub.c'))
 acpi_ss.add(when: 'CONFIG_ACPI_X86_ICH', if_true: files('ich9.c', 'tco.c'))
 acpi_ss.add(when: 'CONFIG_IPMI', if_true: files('ipmi.c'), if_false: files('ipmi-stub.c'))
 acpi_ss.add(when: 'CONFIG_PC', if_false: files('acpi-x86-stub.c'))
 acpi_ss.add(when: 'CONFIG_TPM', if_true: files('tpm.c'))
-softmmu_ss.add(when: 'CONFIG_ACPI', if_false: files('acpi-stub.c', 'aml-build-stub.c'))
+softmmu_ss.add(when: 'CONFIG_ACPI', if_false: files('acpi-stub.c', 'aml-build-stub.c', 'ghes-stub.c'))
 softmmu_ss.add_all(when: 'CONFIG_ACPI', if_true: acpi_ss)
 softmmu_ss.add(when: 'CONFIG_ALL', if_true: files('acpi-stub.c', 'aml-build-stub.c',
-                                                  'acpi-x86-stub.c', 'ipmi-stub.c'))
+                                                  'acpi-x86-stub.c', 'ipmi-stub.c', 'ghes-stub.c'))
-- 
2.20.1

Allow code elsewhere in the system to check whether the ACPI GHES
table is present, so it can determine whether it is OK to try to
record an error by calling acpi_ghes_record_errors().

(We don't need to migrate the new 'present' field in AcpiGhesState,
because it is set once at system initialization and doesn't change.)

Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Reviewed-by: Dongjiu Geng <gengdongjiu1@gmail.com>
Message-id: 20210603171259.27962-3-peter.maydell@linaro.org
---
 include/hw/acpi/ghes.h |  9 +++++++++
 hw/acpi/ghes-stub.c    |  5 +++++
 hw/acpi/ghes.c         | 17 +++++++++++++++++
 3 files changed, 31 insertions(+)

diff --git a/include/hw/acpi/ghes.h b/include/hw/acpi/ghes.h
index XXXXXXX..XXXXXXX 100644
--- a/include/hw/acpi/ghes.h
+++ b/include/hw/acpi/ghes.h
@@ -XXX,XX +XXX,XX @@ enum {
 
 typedef struct AcpiGhesState {
     uint64_t ghes_addr_le;
+    bool present; /* True if GHES is present at all on this board */
 } AcpiGhesState;
 
 void build_ghes_error_table(GArray *hardware_errors, BIOSLinker *linker);
@@ -XXX,XX +XXX,XX @@ void acpi_build_hest(GArray *table_data, BIOSLinker *linker,
 void acpi_ghes_add_fw_cfg(AcpiGhesState *vms, FWCfgState *s,
                           GArray *hardware_errors);
 int acpi_ghes_record_errors(uint8_t notify, uint64_t error_physical_addr);
+
+/**
+ * acpi_ghes_present: Report whether ACPI GHES table is present
+ *
+ * Returns: true if the system has an ACPI GHES table and it is
+ * safe to call acpi_ghes_record_errors() to record a memory error.
+ */
+bool acpi_ghes_present(void);
 #endif
diff --git a/hw/acpi/ghes-stub.c b/hw/acpi/ghes-stub.c
index XXXXXXX..XXXXXXX 100644
--- a/hw/acpi/ghes-stub.c
+++ b/hw/acpi/ghes-stub.c
@@ -XXX,XX +XXX,XX @@ int acpi_ghes_record_errors(uint8_t source_id, uint64_t physical_address)
 {
     return -1;
 }
+
+bool acpi_ghes_present(void)
+{
+    return false;
+}
diff --git a/hw/acpi/ghes.c b/hw/acpi/ghes.c
index XXXXXXX..XXXXXXX 100644
--- a/hw/acpi/ghes.c
+++ b/hw/acpi/ghes.c
@@ -XXX,XX +XXX,XX @@ void acpi_ghes_add_fw_cfg(AcpiGhesState *ags, FWCfgState *s,
     /* Create a read-write fw_cfg file for Address */
     fw_cfg_add_file_callback(s, ACPI_GHES_DATA_ADDR_FW_CFG_FILE, NULL, NULL,
         NULL, &(ags->ghes_addr_le), sizeof(ags->ghes_addr_le), false);
+
+    ags->present = true;
 }
 
 int acpi_ghes_record_errors(uint8_t source_id, uint64_t physical_address)
@@ -XXX,XX +XXX,XX @@ int acpi_ghes_record_errors(uint8_t source_id, uint64_t physical_address)
 
     return ret;
 }
+
+bool acpi_ghes_present(void)
+{
+    AcpiGedState *acpi_ged_state;
+    AcpiGhesState *ags;
+
+    acpi_ged_state = ACPI_GED(object_resolve_path_type("", TYPE_ACPI_GED,
+                                                       NULL));
+
+    if (!acpi_ged_state) {
+        return false;
+    }
+    ags = &acpi_ged_state->ghes_state;
+    return ags->present;
+}
-- 
2.20.1

The virt_is_acpi_enabled() function is specific to the virt board, as
is the check for its 'ras' property.  Use the new acpi_ghes_present()
function to check whether we should report memory errors via
acpi_ghes_record_errors().

This avoids a link error if QEMU was built without support for the
virt board, and provides a mechanism that can be used by any future
board models that want to add ACPI memory error reporting support
(they only need to call acpi_ghes_add_fw_cfg()).

Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Reviewed-by: Dongjiu Geng <gengdongjiu1@gmail.com>
Message-id: 20210603171259.27962-4-peter.maydell@linaro.org
---
 target/arm/kvm64.c | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/target/arm/kvm64.c b/target/arm/kvm64.c
index XXXXXXX..XXXXXXX 100644
--- a/target/arm/kvm64.c
+++ b/target/arm/kvm64.c
@@ -XXX,XX +XXX,XX @@ void kvm_arch_on_sigbus_vcpu(CPUState *c, int code, void *addr)
 {
     ram_addr_t ram_addr;
     hwaddr paddr;
-    Object *obj = qdev_get_machine();
-    VirtMachineState *vms = VIRT_MACHINE(obj);
-    bool acpi_enabled = virt_is_acpi_enabled(vms);
 
     assert(code == BUS_MCEERR_AR || code == BUS_MCEERR_AO);
 
-    if (acpi_enabled && addr &&
-            object_property_get_bool(obj, "ras", NULL)) {
+    if (acpi_ghes_present() && addr) {
         ram_addr = qemu_ram_addr_from_host(addr);
         if (ram_addr != RAM_ADDR_INVALID &&
             kvm_physical_memory_addr_from_host(c->kvm_state, addr, &paddr)) {
-- 
2.20.1

From: Richard Henderson <richard.henderson@linaro.org>

The test was off-by-one, because tag_last points to the
last byte of the tag to check, thus tag_last - prev_page
will equal TARGET_PAGE_SIZE when we use the first byte
of the next page.

Resolves: https://gitlab.com/qemu-project/qemu/-/issues/403
Reported-by: Peter Collingbourne <pcc@google.com>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
Message-id: 20210612195707.840217-1-richard.henderson@linaro.org
Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
---
 target/arm/mte_helper.c           |  2 +-
 tests/tcg/aarch64/mte-7.c         | 31 +++++++++++++++++++++++++++++++
 tests/tcg/aarch64/Makefile.target |  2 +-
 3 files changed, 33 insertions(+), 2 deletions(-)
 create mode 100644 tests/tcg/aarch64/mte-7.c

diff --git a/target/arm/mte_helper.c b/target/arm/mte_helper.c
index XXXXXXX..XXXXXXX 100644
--- a/target/arm/mte_helper.c
+++ b/target/arm/mte_helper.c
@@ -XXX,XX +XXX,XX @@ static int mte_probe_int(CPUARMState *env, uint32_t desc, uint64_t ptr,
     prev_page = ptr & TARGET_PAGE_MASK;
     next_page = prev_page + TARGET_PAGE_SIZE;
 
-    if (likely(tag_last - prev_page <= TARGET_PAGE_SIZE)) {
+    if (likely(tag_last - prev_page < TARGET_PAGE_SIZE)) {
         /* Memory access stays on one page. */
         tag_size = ((tag_byte_last - tag_byte_first) / (2 * TAG_GRANULE)) + 1;
         mem1 = allocation_tag_mem(env, mmu_idx, ptr, type, sizem1 + 1,
diff --git a/tests/tcg/aarch64/mte-7.c b/tests/tcg/aarch64/mte-7.c
new file mode 100644
index XXXXXXX..XXXXXXX
--- /dev/null
+++ b/tests/tcg/aarch64/mte-7.c
@@ -XXX,XX +XXX,XX @@
+/*
+ * Memory tagging, unaligned access crossing pages.
+ * https://gitlab.com/qemu-project/qemu/-/issues/403
+ *
+ * Copyright (c) 2021 Linaro Ltd
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ */
+
+#include "mte.h"
+
+int main(int ac, char **av)
+{
+    void *p;
+
+    enable_mte(PR_MTE_TCF_SYNC);
+    p = alloc_mte_mem(2 * 0x1000);
+
+    /* Tag the pointer. */
+    p = (void *)((unsigned long)p | (1ul << 56));
+
+    /* Store tag in sequential granules. */
+    asm("stg %0, [%0]" : : "r"(p + 0x0ff0));
+    asm("stg %0, [%0]" : : "r"(p + 0x1000));
+
+    /*
+     * Perform an unaligned store with tag 1 crossing the pages.
+     * Failure dies with SIGSEGV.
+     */
+    asm("str %0, [%0]" : : "r"(p + 0x0ffc));
+    return 0;
+}
diff --git a/tests/tcg/aarch64/Makefile.target b/tests/tcg/aarch64/Makefile.target
index XXXXXXX..XXXXXXX 100644
--- a/tests/tcg/aarch64/Makefile.target
+++ b/tests/tcg/aarch64/Makefile.target
@@ -XXX,XX +XXX,XX @@ AARCH64_TESTS += bti-2
 
 # MTE Tests
 ifneq ($(DOCKER_IMAGE)$(CROSS_CC_HAS_ARMV8_MTE),)
-AARCH64_TESTS += mte-1 mte-2 mte-3 mte-4 mte-5 mte-6
+AARCH64_TESTS += mte-1 mte-2 mte-3 mte-4 mte-5 mte-6 mte-7
 mte-%: CFLAGS += -march=armv8.5-a+memtag
 endif
 
-- 
2.20.1

From: Patrick Venture <venture@google.com>

Adds comments to the board init to identify missing i2c devices.

Signed-off-by: Patrick Venture <venture@google.com>
Reviewed-by: Hao Wu <wuhaotsh@google.com>
Reviewed-by: Joel Stanley <joel@jms.id.au>
Message-id: 20210608202522.2677850-2-venture@google.com
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
---
 hw/arm/npcm7xx_boards.c | 16 +++++++++++++++-
 1 file changed, 15 insertions(+), 1 deletion(-)

diff --git a/hw/arm/npcm7xx_boards.c b/hw/arm/npcm7xx_boards.c
index XXXXXXX..XXXXXXX 100644
--- a/hw/arm/npcm7xx_boards.c
+++ b/hw/arm/npcm7xx_boards.c
@@ -XXX,XX +XXX,XX @@ static void quanta_gsj_i2c_init(NPCM7xxState *soc)
     at24c_eeprom_init(soc, 9, 0x55, 8192);
     at24c_eeprom_init(soc, 10, 0x55, 8192);
 
-    /* TODO: Add additional i2c devices. */
+    /*
+     * i2c-11:
+     * - power-brick@36: delta,dps800
+     * - hotswap@15: ti,lm5066i
+     */
+
+    /*
+     * i2c-12:
+     * - ucd90160@6b
+     */
+
+    /*
+     * i2c-15:
+     * - pca9548@75
+     */
 }
 
 static void quanta_gsj_fan_init(NPCM7xxMachine *machine, NPCM7xxState *soc)
-- 
2.20.1

From: Patrick Venture <venture@google.com>

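Instantiates the pca9548 i2c mux at address 0x75 on i2c-15 of the
quanta-gsj board, replacing the placeholder comment.

Tested: Quanta-gsj firmware booted.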

i2c /dev entries driver
I2C init bus 1 freq 100000
I2C init bus 2 freq 100000
I2C init bus 3 freq 100000
I2C init bus 4 freq 100000
I2C init bus 8 freq 100000
I2C init bus 9 freq 100000
at24 9-0055: 8192 byte 24c64 EEPROM, writable, 1 bytes/write
I2C init bus 10 freq 100000
at24 10-0055: 8192 byte 24c64 EEPROM, writable, 1 bytes/write
I2C init bus 12 freq 100000
I2C init bus 15 freq 100000
i2c i2c-15: Added multiplexed i2c bus 16
i2c i2c-15: Added multiplexed i2c bus 17
i2c i2c-15: Added multiplexed i2c bus 18
i2c i2c-15: Added multiplexed i2c bus 19
i2c i2c-15: Added multiplexed i2c bus 20
i2c i2c-15: Added multiplexed i2c bus 21
i2c i2c-15: Added multiplexed i2c bus 22
i2c i2c-15: Added multiplexed i2c bus 23
pca954x 15-0075: registered 8 multiplexed busses for I2C switch pca9548

Signed-off-by: Patrick Venture <venture@google.com>
Reviewed-by: Hao Wu <wuhaotsh@google.com>
Reviewed-by: Joel Stanley <joel@jms.id.au>
Message-id: 20210608202522.2677850-3-venture@google.com
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
---
 hw/arm/npcm7xx_boards.c | 6 ++----
 hw/arm/Kconfig          | 1 +
 2 files changed, 3 insertions(+), 4 deletions(-)

diff --git a/hw/arm/npcm7xx_boards.c b/hw/arm/npcm7xx_boards.c
index XXXXXXX..XXXXXXX 100644
--- a/hw/arm/npcm7xx_boards.c
+++ b/hw/arm/npcm7xx_boards.c
@@ -XXX,XX +XXX,XX @@
 
 #include "hw/arm/npcm7xx.h"
 #include "hw/core/cpu.h"
+#include "hw/i2c/i2c_mux_pca954x.h"
 #include "hw/i2c/smbus_eeprom.h"
 #include "hw/loader.h"
 #include "hw/qdev-core.h"
@@ -XXX,XX +XXX,XX @@ static void quanta_gsj_i2c_init(NPCM7xxState *soc)
      * - ucd90160@6b
      */
 
-    /*
-     * i2c-15:
-     * - pca9548@75
-     */
+    i2c_slave_create_simple(npcm7xx_i2c_get_bus(soc, 15), "pca9548", 0x75);
 }
 
 static void quanta_gsj_fan_init(NPCM7xxMachine *machine, NPCM7xxState *soc)
diff --git a/hw/arm/Kconfig b/hw/arm/Kconfig
index XXXXXXX..XXXXXXX 100644
--- a/hw/arm/Kconfig
+++ b/hw/arm/Kconfig
@@ -XXX,XX +XXX,XX @@ config NPCM7XX
     select SERIAL
     select SSI
     select UNIMP
+    select PCA954X
 
 config FSL_IMX25
     bool
-- 
2.20.1

From: Patrick Venture <venture@google.com>

Adds the pca954x muxes that the quanta-q71l board expects on i2c-2
and i2c-7.

Tested: Booted quanta-q71l image to userspace.
Signed-off-by: Patrick Venture <venture@google.com>
Reviewed-by: Hao Wu <wuhaotsh@google.com>
Reviewed-by: Joel Stanley <joel@jms.id.au>
Reviewed-by: Cédric Le Goater <clg@kaod.org>
Message-id: 20210608202522.2677850-4-venture@google.com
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
---
 hw/arm/aspeed.c | 11 ++++++++---
 hw/arm/Kconfig  |  1 +
 2 files changed, 9 insertions(+), 3 deletions(-)

diff --git a/hw/arm/aspeed.c b/hw/arm/aspeed.c
index XXXXXXX..XXXXXXX 100644
--- a/hw/arm/aspeed.c
+++ b/hw/arm/aspeed.c
@@ -XXX,XX +XXX,XX @@
 #include "hw/arm/boot.h"
 #include "hw/arm/aspeed.h"
 #include "hw/arm/aspeed_soc.h"
+#include "hw/i2c/i2c_mux_pca954x.h"
 #include "hw/i2c/smbus_eeprom.h"
 #include "hw/misc/pca9552.h"
 #include "hw/misc/tmp105.h"
@@ -XXX,XX +XXX,XX @@ static void quanta_q71l_bmc_i2c_init(AspeedMachineState *bmc)
     /* TODO: i2c-1: Add Frontpanel FRU eeprom@57 24c64 */
     /* TODO: Add Memory Riser i2c mux and eeproms. */
 
-    /* TODO: i2c-2: pca9546@74 */
-    /* TODO: i2c-2: pca9548@77 */
+    i2c_slave_create_simple(aspeed_i2c_get_bus(&soc->i2c, 2), "pca9546", 0x74);
+    i2c_slave_create_simple(aspeed_i2c_get_bus(&soc->i2c, 2), "pca9548", 0x77);
+
     /* TODO: i2c-3: Add BIOS FRU eeprom@56 24c64 */
-    /* TODO: i2c-7: Add pca9546@70 */
+
+    /* i2c-7 */
+    i2c_slave_create_simple(aspeed_i2c_get_bus(&soc->i2c, 7), "pca9546", 0x70);
     /*        - i2c@0: pmbus@59 */
     /*        - i2c@1: pmbus@58 */
     /*        - i2c@2: pmbus@58 */
     /*        - i2c@3: pmbus@59 */
+
     /* TODO: i2c-7: Add PDB FRU eeprom@52 */
     /* TODO: i2c-8: Add BMC FRU eeprom@50 */
 }
diff --git a/hw/arm/Kconfig b/hw/arm/Kconfig
index XXXXXXX..XXXXXXX 100644
--- a/hw/arm/Kconfig
+++ b/hw/arm/Kconfig
@@ -XXX,XX +XXX,XX @@ config ASPEED_SOC
     select PCA9552
     select SERIAL
     select SMBUS_EEPROM
+    select PCA954X
     select SSI
     select SSI_M25P80
     select TMP105
-- 
2.20.1

Currently we provide Hn and H1_n macros for accessing the correct
data within arrays of vector elements of size 1, 2 and 4, accounting
for host endianness.  We don't provide any macros for elements of
size 8 because there the host endianness doesn't matter.  However,
this does result in awkwardness where we need to pass empty arguments
to macros, because checkpatch complains about them.  The empty
argument is a little confusing for humans to read as well.

Add H8() and H1_8() macros and use them where we were previously
passing empty arguments to macros.

Suggested-by: Richard Henderson <richard.henderson@linaro.org>
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Message-id: 20210614151007.4545-2-peter.maydell@linaro.org
Message-id: 20210610132505.5827-1-peter.maydell@linaro.org
---
 target/arm/vec_internal.h |   8 +-
 target/arm/sve_helper.c   | 258 +++++++++++++++++++-------------------
 target/arm/vec_helper.c   |  14 +--
 3 files changed, 143 insertions(+), 137 deletions(-)

diff --git a/target/arm/vec_internal.h b/target/arm/vec_internal.h
index XXXXXXX..XXXXXXX 100644
--- a/target/arm/vec_internal.h
+++ b/target/arm/vec_internal.h
@@ -XXX,XX +XXX,XX @@
 #define H2(x)   (x)
 #define H4(x)   (x)
 #endif
-
+/*
+ * Access to 64-bit elements isn't host-endian dependent; we provide H8
+ * and H1_8 so that when a function is being generated from a macro we
+ * can pass these rather than an empty macro argument, for clarity.
+ */
+#define H8(x)   (x)
+#define H1_8(x) (x)
 
 static inline void clear_tail(void *vd, uintptr_t opr_sz, uintptr_t max_sz)
 {
diff --git a/target/arm/sve_helper.c b/target/arm/sve_helper.c
index XXXXXXX..XXXXXXX 100644
--- a/target/arm/sve_helper.c
+++ b/target/arm/sve_helper.c
@@ -XXX,XX +XXX,XX @@ void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg,               \
 
 DO_ZPZZ_PAIR_FP(sve2_faddp_zpzz_h, float16, H1_2, float16_add)
 DO_ZPZZ_PAIR_FP(sve2_faddp_zpzz_s, float32, H1_4, float32_add)
-DO_ZPZZ_PAIR_FP(sve2_faddp_zpzz_d, float64,     , float64_add)
+DO_ZPZZ_PAIR_FP(sve2_faddp_zpzz_d, float64, H1_8, float64_add)
 
 DO_ZPZZ_PAIR_FP(sve2_fmaxnmp_zpzz_h, float16, H1_2, float16_maxnum)
 DO_ZPZZ_PAIR_FP(sve2_fmaxnmp_zpzz_s, float32, H1_4, float32_maxnum)
-DO_ZPZZ_PAIR_FP(sve2_fmaxnmp_zpzz_d, float64,     , float64_maxnum)
+DO_ZPZZ_PAIR_FP(sve2_fmaxnmp_zpzz_d, float64, H1_8, float64_maxnum)
 
 DO_ZPZZ_PAIR_FP(sve2_fminnmp_zpzz_h, float16, H1_2, float16_minnum)
 DO_ZPZZ_PAIR_FP(sve2_fminnmp_zpzz_s, float32, H1_4, float32_minnum)
-DO_ZPZZ_PAIR_FP(sve2_fminnmp_zpzz_d, float64,     , float64_minnum)
+DO_ZPZZ_PAIR_FP(sve2_fminnmp_zpzz_d, float64, H1_8, float64_minnum)
 
 DO_ZPZZ_PAIR_FP(sve2_fmaxp_zpzz_h, float16, H1_2, float16_max)
 DO_ZPZZ_PAIR_FP(sve2_fmaxp_zpzz_s, float32, H1_4, float32_max)
-DO_ZPZZ_PAIR_FP(sve2_fmaxp_zpzz_d, float64,     , float64_max)
+DO_ZPZZ_PAIR_FP(sve2_fmaxp_zpzz_d, float64, H1_8, float64_max)
 
 DO_ZPZZ_PAIR_FP(sve2_fminp_zpzz_h, float16, H1_2, float16_min)
 DO_ZPZZ_PAIR_FP(sve2_fminp_zpzz_s, float32, H1_4, float32_min)
-DO_ZPZZ_PAIR_FP(sve2_fminp_zpzz_d, float64,     , float64_min)
+DO_ZPZZ_PAIR_FP(sve2_fminp_zpzz_d, float64, H1_8, float64_min)
 
 #undef DO_ZPZZ_PAIR_FP
 
@@ -XXX,XX +XXX,XX @@ void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)          \
 
 DO_ZZZ_TB(sve2_saddl_h, int16_t, int8_t, H1_2, H1, DO_ADD)
 DO_ZZZ_TB(sve2_saddl_s, int32_t, int16_t, H1_4, H1_2, DO_ADD)
-DO_ZZZ_TB(sve2_saddl_d, int64_t, int32_t,     , H1_4, DO_ADD)
+DO_ZZZ_TB(sve2_saddl_d, int64_t, int32_t, H1_8, H1_4, DO_ADD)
 
 DO_ZZZ_TB(sve2_ssubl_h, int16_t, int8_t, H1_2, H1, DO_SUB)
 DO_ZZZ_TB(sve2_ssubl_s, int32_t, int16_t, H1_4, H1_2, DO_SUB)
-DO_ZZZ_TB(sve2_ssubl_d, int64_t, int32_t,     , H1_4, DO_SUB)
+DO_ZZZ_TB(sve2_ssubl_d, int64_t, int32_t, H1_8, H1_4, DO_SUB)
 
 DO_ZZZ_TB(sve2_sabdl_h, int16_t, int8_t, H1_2, H1, DO_ABD)
 DO_ZZZ_TB(sve2_sabdl_s, int32_t, int16_t, H1_4, H1_2, DO_ABD)
-DO_ZZZ_TB(sve2_sabdl_d, int64_t, int32_t,     , H1_4, DO_ABD)
+DO_ZZZ_TB(sve2_sabdl_d, int64_t, int32_t, H1_8, H1_4, DO_ABD)
 
 DO_ZZZ_TB(sve2_uaddl_h, uint16_t, uint8_t, H1_2, H1, DO_ADD)
 DO_ZZZ_TB(sve2_uaddl_s, uint32_t, uint16_t, H1_4, H1_2, DO_ADD)
-DO_ZZZ_TB(sve2_uaddl_d, uint64_t, uint32_t,     , H1_4, DO_ADD)
+DO_ZZZ_TB(sve2_uaddl_d, uint64_t, uint32_t, H1_8, H1_4, DO_ADD)
 
 DO_ZZZ_TB(sve2_usubl_h, uint16_t, uint8_t, H1_2, H1, DO_SUB)
 DO_ZZZ_TB(sve2_usubl_s, uint32_t, uint16_t, H1_4, H1_2, DO_SUB)
-DO_ZZZ_TB(sve2_usubl_d, uint64_t, uint32_t,     , H1_4, DO_SUB)
+DO_ZZZ_TB(sve2_usubl_d, uint64_t, uint32_t, H1_8, H1_4, DO_SUB)
 
 DO_ZZZ_TB(sve2_uabdl_h, uint16_t, uint8_t, H1_2, H1, DO_ABD)
 DO_ZZZ_TB(sve2_uabdl_s, uint32_t, uint16_t, H1_4, H1_2, DO_ABD)
-DO_ZZZ_TB(sve2_uabdl_d, uint64_t, uint32_t,     , H1_4, DO_ABD)
+DO_ZZZ_TB(sve2_uabdl_d, uint64_t, uint32_t, H1_8, H1_4, DO_ABD)
 
 DO_ZZZ_TB(sve2_smull_zzz_h, int16_t, int8_t, H1_2, H1, DO_MUL)
 DO_ZZZ_TB(sve2_smull_zzz_s, int32_t, int16_t, H1_4, H1_2, DO_MUL)
-DO_ZZZ_TB(sve2_smull_zzz_d, int64_t, int32_t,     , H1_4, DO_MUL)
+DO_ZZZ_TB(sve2_smull_zzz_d, int64_t, int32_t, H1_8, H1_4, DO_MUL)
 
 DO_ZZZ_TB(sve2_umull_zzz_h, uint16_t, uint8_t, H1_2, H1, DO_MUL)
 DO_ZZZ_TB(sve2_umull_zzz_s, uint32_t, uint16_t, H1_4, H1_2, DO_MUL)
-DO_ZZZ_TB(sve2_umull_zzz_d, uint64_t, uint32_t,     , H1_4, DO_MUL)
+DO_ZZZ_TB(sve2_umull_zzz_d, uint64_t, uint32_t, H1_8, H1_4, DO_MUL)
 
 /* Note that the multiply cannot overflow, but the doubling can. */
 static inline int16_t do_sqdmull_h(int16_t n, int16_t m)
@@ -XXX,XX +XXX,XX @@ static inline int64_t do_sqdmull_d(int64_t n, int64_t m)
 
 DO_ZZZ_TB(sve2_sqdmull_zzz_h, int16_t, int8_t, H1_2, H1, do_sqdmull_h)
 DO_ZZZ_TB(sve2_sqdmull_zzz_s, int32_t, int16_t, H1_4, H1_2, do_sqdmull_s)
-DO_ZZZ_TB(sve2_sqdmull_zzz_d, int64_t, int32_t,     , H1_4, do_sqdmull_d)
+DO_ZZZ_TB(sve2_sqdmull_zzz_d, int64_t, int32_t, H1_8, H1_4, do_sqdmull_d)
 
 #undef DO_ZZZ_TB
 
@@ -XXX,XX +XXX,XX @@ void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
 
 DO_ZZZ_WTB(sve2_saddw_h, int16_t, int8_t, H1_2, H1, DO_ADD)
 DO_ZZZ_WTB(sve2_saddw_s, int32_t, int16_t, H1_4, H1_2, DO_ADD)
-DO_ZZZ_WTB(sve2_saddw_d, int64_t, int32_t,     , H1_4, DO_ADD)
+DO_ZZZ_WTB(sve2_saddw_d, int64_t, int32_t, H1_8, H1_4, DO_ADD)
 
 DO_ZZZ_WTB(sve2_ssubw_h, int16_t, int8_t, H1_2, H1, DO_SUB)
 DO_ZZZ_WTB(sve2_ssubw_s, int32_t, int16_t, H1_4, H1_2, DO_SUB)
-DO_ZZZ_WTB(sve2_ssubw_d, int64_t, int32_t,     , H1_4, DO_SUB)
+DO_ZZZ_WTB(sve2_ssubw_d, int64_t, int32_t, H1_8, H1_4, DO_SUB)
 
 DO_ZZZ_WTB(sve2_uaddw_h, uint16_t, uint8_t, H1_2, H1, DO_ADD)
 DO_ZZZ_WTB(sve2_uaddw_s, uint32_t, uint16_t, H1_4, H1_2, DO_ADD)
-DO_ZZZ_WTB(sve2_uaddw_d, uint64_t, uint32_t,     , H1_4, DO_ADD)
+DO_ZZZ_WTB(sve2_uaddw_d, uint64_t, uint32_t, H1_8, H1_4, DO_ADD)
 
 DO_ZZZ_WTB(sve2_usubw_h, uint16_t, uint8_t, H1_2, H1, DO_SUB)
 DO_ZZZ_WTB(sve2_usubw_s, uint32_t, uint16_t, H1_4, H1_2, DO_SUB)
-DO_ZZZ_WTB(sve2_usubw_d, uint64_t, uint32_t,     , H1_4, DO_SUB)
+DO_ZZZ_WTB(sve2_usubw_d, uint64_t, uint32_t, H1_8, H1_4, DO_SUB)
 
 #undef DO_ZZZ_WTB
 
@@ -XXX,XX +XXX,XX @@ void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)          \
 DO_ZZZ_NTB(sve2_eoril_b, uint8_t, H1, DO_EOR)
 DO_ZZZ_NTB(sve2_eoril_h, uint16_t, H1_2, DO_EOR)
 DO_ZZZ_NTB(sve2_eoril_s, uint32_t, H1_4, DO_EOR)
-DO_ZZZ_NTB(sve2_eoril_d, uint64_t,     , DO_EOR)
+DO_ZZZ_NTB(sve2_eoril_d, uint64_t, H1_8, DO_EOR)
 
 #undef DO_ZZZ_NTB
 
@@ -XXX,XX +XXX,XX @@ void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
 
 DO_ZZZW_ACC(sve2_sabal_h, int16_t, int8_t, H1_2, H1, DO_ABD)
 DO_ZZZW_ACC(sve2_sabal_s, int32_t, int16_t, H1_4, H1_2, DO_ABD)
-DO_ZZZW_ACC(sve2_sabal_d, int64_t, int32_t,     , H1_4, DO_ABD)
+DO_ZZZW_ACC(sve2_sabal_d, int64_t, int32_t, H1_8, H1_4, DO_ABD)
 
 DO_ZZZW_ACC(sve2_uabal_h, uint16_t, uint8_t, H1_2, H1, DO_ABD)
 DO_ZZZW_ACC(sve2_uabal_s, uint32_t, uint16_t, H1_4, H1_2, DO_ABD)
-DO_ZZZW_ACC(sve2_uabal_d, uint64_t, uint32_t,     , H1_4, DO_ABD)
+DO_ZZZW_ACC(sve2_uabal_d, uint64_t, uint32_t, H1_8, H1_4, DO_ABD)
 
 DO_ZZZW_ACC(sve2_smlal_zzzw_h, int16_t, int8_t, H1_2, H1, DO_MUL)
 DO_ZZZW_ACC(sve2_smlal_zzzw_s, int32_t, int16_t, H1_4, H1_2, DO_MUL)
-DO_ZZZW_ACC(sve2_smlal_zzzw_d, int64_t, int32_t,     , H1_4, DO_MUL)
+DO_ZZZW_ACC(sve2_smlal_zzzw_d, int64_t, int32_t, H1_8, H1_4, DO_MUL)
 
 DO_ZZZW_ACC(sve2_umlal_zzzw_h, uint16_t, uint8_t, H1_2, H1, DO_MUL)
 DO_ZZZW_ACC(sve2_umlal_zzzw_s, uint32_t, uint16_t, H1_4, H1_2, DO_MUL)
-DO_ZZZW_ACC(sve2_umlal_zzzw_d, uint64_t, uint32_t,     , H1_4, DO_MUL)
+DO_ZZZW_ACC(sve2_umlal_zzzw_d, uint64_t, uint32_t, H1_8, H1_4, DO_MUL)
 
 #define DO_NMUL(N, M)  -(N * M)
 
 DO_ZZZW_ACC(sve2_smlsl_zzzw_h, int16_t, int8_t, H1_2, H1, DO_NMUL)
 DO_ZZZW_ACC(sve2_smlsl_zzzw_s, int32_t, int16_t, H1_4, H1_2, DO_NMUL)
-DO_ZZZW_ACC(sve2_smlsl_zzzw_d, int64_t, int32_t,     , H1_4, DO_NMUL)
+DO_ZZZW_ACC(sve2_smlsl_zzzw_d, int64_t, int32_t, H1_8, H1_4, DO_NMUL)
 
 DO_ZZZW_ACC(sve2_umlsl_zzzw_h, uint16_t, uint8_t, H1_2, H1, DO_NMUL)
 DO_ZZZW_ACC(sve2_umlsl_zzzw_s, uint32_t, uint16_t, H1_4, H1_2, DO_NMUL)
-DO_ZZZW_ACC(sve2_umlsl_zzzw_d, uint64_t, uint32_t,     , H1_4, DO_NMUL)
+DO_ZZZW_ACC(sve2_umlsl_zzzw_d, uint64_t, uint32_t, H1_8, H1_4, DO_NMUL)
 
 #undef DO_ZZZW_ACC
 
@@ -XXX,XX +XXX,XX @@ DO_SQDMLAL(sve2_sqdmlal_zzzw_h, int16_t, int8_t, H1_2, H1,
            do_sqdmull_h, DO_SQADD_H)
 DO_SQDMLAL(sve2_sqdmlal_zzzw_s, int32_t, int16_t, H1_4, H1_2,
            do_sqdmull_s, DO_SQADD_S)
-DO_SQDMLAL(sve2_sqdmlal_zzzw_d, int64_t, int32_t,     , H1_4,
+DO_SQDMLAL(sve2_sqdmlal_zzzw_d, int64_t, int32_t, H1_8, H1_4,
            do_sqdmull_d, do_sqadd_d)
 
 DO_SQDMLAL(sve2_sqdmlsl_zzzw_h, int16_t, int8_t, H1_2, H1,
            do_sqdmull_h, DO_SQSUB_H)
 DO_SQDMLAL(sve2_sqdmlsl_zzzw_s, int32_t, int16_t, H1_4, H1_2,
            do_sqdmull_s, DO_SQSUB_S)
-DO_SQDMLAL(sve2_sqdmlsl_zzzw_d, int64_t, int32_t,     , H1_4,
+DO_SQDMLAL(sve2_sqdmlsl_zzzw_d, int64_t, int32_t, H1_8, H1_4,
            do_sqdmull_d, do_sqsub_d)
 
 #undef DO_SQDMLAL
@@ -XXX,XX +XXX,XX @@ void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
 DO_CMLA_FUNC(sve2_cmla_zzzz_b, uint8_t, H1, DO_CMLA)
 DO_CMLA_FUNC(sve2_cmla_zzzz_h, uint16_t, H2, DO_CMLA)
 DO_CMLA_FUNC(sve2_cmla_zzzz_s, uint32_t, H4, DO_CMLA)
-DO_CMLA_FUNC(sve2_cmla_zzzz_d, uint64_t,   , DO_CMLA)
+DO_CMLA_FUNC(sve2_cmla_zzzz_d, uint64_t, H8, DO_CMLA)
 
 #define DO_SQRDMLAH_B(N, M, A, S) \
     do_sqrdmlah_b(N, M, A, S, true)
@@ -XXX,XX +XXX,XX @@ DO_CMLA_FUNC(sve2_cmla_zzzz_d, uint64_t,   , DO_CMLA)
 DO_CMLA_FUNC(sve2_sqrdcmlah_zzzz_b, int8_t, H1, DO_SQRDMLAH_B)
 DO_CMLA_FUNC(sve2_sqrdcmlah_zzzz_h, int16_t, H2, DO_SQRDMLAH_H)
 DO_CMLA_FUNC(sve2_sqrdcmlah_zzzz_s, int32_t, H4, DO_SQRDMLAH_S)
-DO_CMLA_FUNC(sve2_sqrdcmlah_zzzz_d, int64_t,   , DO_SQRDMLAH_D)
+DO_CMLA_FUNC(sve2_sqrdcmlah_zzzz_d, int64_t, H8, DO_SQRDMLAH_D)
 
 #define DO_CMLA_IDX_FUNC(NAME, TYPE, H, OP) \
 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc)    \
@@ -XXX,XX +XXX,XX @@ void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
 
 DO_ZZXZ(sve2_sqrdmlah_idx_h, int16_t, H2, DO_SQRDMLAH_H)
 DO_ZZXZ(sve2_sqrdmlah_idx_s, int32_t, H4, DO_SQRDMLAH_S)
-DO_ZZXZ(sve2_sqrdmlah_idx_d, int64_t,   , DO_SQRDMLAH_D)
+DO_ZZXZ(sve2_sqrdmlah_idx_d, int64_t, H8, DO_SQRDMLAH_D)
 
 #define DO_SQRDMLSH_H(N, M, A) \
     ({ uint32_t discard; do_sqrdmlah_h(N, M, A, true, true, &discard); })
@@ -XXX,XX +XXX,XX @@ DO_ZZXZ(sve2_sqrdmlah_idx_d, int64_t,   , DO_SQRDMLAH_D)
 
 DO_ZZXZ(sve2_sqrdmlsh_idx_h, int16_t, H2, DO_SQRDMLSH_H)
 DO_ZZXZ(sve2_sqrdmlsh_idx_s, int32_t, H4, DO_SQRDMLSH_S)
-DO_ZZXZ(sve2_sqrdmlsh_idx_d, int64_t,   , DO_SQRDMLSH_D)
+DO_ZZXZ(sve2_sqrdmlsh_idx_d, int64_t, H8, DO_SQRDMLSH_D)
 
 #undef DO_ZZXZ
 
@@ -XXX,XX +XXX,XX @@ void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc)  \
 #define DO_MLA(N, M, A)  (A + N * M)
 
 DO_ZZXW(sve2_smlal_idx_s, int32_t, int16_t, H1_4, H1_2, DO_MLA)
-DO_ZZXW(sve2_smlal_idx_d, int64_t, int32_t,     , H1_4, DO_MLA)
+DO_ZZXW(sve2_smlal_idx_d, int64_t, int32_t, H1_8, H1_4, DO_MLA)
 DO_ZZXW(sve2_umlal_idx_s, uint32_t, uint16_t, H1_4, H1_2, DO_MLA)
-DO_ZZXW(sve2_umlal_idx_d, uint64_t, uint32_t,     , H1_4, DO_MLA)
+DO_ZZXW(sve2_umlal_idx_d, uint64_t, uint32_t, H1_8, H1_4, DO_MLA)
 
 #define DO_MLS(N, M, A)  (A - N * M)
 
 DO_ZZXW(sve2_smlsl_idx_s, int32_t, int16_t, H1_4, H1_2, DO_MLS)
-DO_ZZXW(sve2_smlsl_idx_d, int64_t, int32_t,     , H1_4, DO_MLS)
+DO_ZZXW(sve2_smlsl_idx_d, int64_t, int32_t, H1_8, H1_4, DO_MLS)
 DO_ZZXW(sve2_umlsl_idx_s, uint32_t, uint16_t, H1_4, H1_2, DO_MLS)
-DO_ZZXW(sve2_umlsl_idx_d, uint64_t, uint32_t,     , H1_4, DO_MLS)
+DO_ZZXW(sve2_umlsl_idx_d, uint64_t, uint32_t, H1_8, H1_4, DO_MLS)
 
 #define DO_SQDMLAL_S(N, M, A)  DO_SQADD_S(A, do_sqdmull_s(N, M))
 #define DO_SQDMLAL_D(N, M, A)  do_sqadd_d(A, do_sqdmull_d(N, M))
 
 DO_ZZXW(sve2_sqdmlal_idx_s, int32_t, int16_t, H1_4, H1_2, DO_SQDMLAL_S)
-DO_ZZXW(sve2_sqdmlal_idx_d, int64_t, int32_t,     , H1_4, DO_SQDMLAL_D)
+DO_ZZXW(sve2_sqdmlal_idx_d, int64_t, int32_t, H1_8, H1_4, DO_SQDMLAL_D)
 
 #define DO_SQDMLSL_S(N, M, A)  DO_SQSUB_S(A, do_sqdmull_s(N, M))
 #define DO_SQDMLSL_D(N, M, A)  do_sqsub_d(A, do_sqdmull_d(N, M))
 
 DO_ZZXW(sve2_sqdmlsl_idx_s, int32_t, int16_t, H1_4, H1_2, DO_SQDMLSL_S)
-DO_ZZXW(sve2_sqdmlsl_idx_d, int64_t, int32_t,     , H1_4, DO_SQDMLSL_D)
+DO_ZZXW(sve2_sqdmlsl_idx_d, int64_t, int32_t, H1_8, H1_4, DO_SQDMLSL_D)
 
 #undef DO_MLA
 #undef DO_MLS
@@ -XXX,XX +XXX,XX @@ void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)            \
 }
 
 DO_ZZX(sve2_sqdmull_idx_s, int32_t, int16_t, H1_4, H1_2, do_sqdmull_s)
-DO_ZZX(sve2_sqdmull_idx_d, int64_t, int32_t,     , H1_4, do_sqdmull_d)
+DO_ZZX(sve2_sqdmull_idx_d, int64_t, int32_t, H1_8, H1_4, do_sqdmull_d)
 
 DO_ZZX(sve2_smull_idx_s, int32_t, int16_t, H1_4, H1_2, DO_MUL)
-DO_ZZX(sve2_smull_idx_d, int64_t, int32_t,     , H1_4, DO_MUL)
+DO_ZZX(sve2_smull_idx_d, int64_t, int32_t, H1_8, H1_4, DO_MUL)
 
 DO_ZZX(sve2_umull_idx_s, uint32_t, uint16_t, H1_4, H1_2, DO_MUL)
-DO_ZZX(sve2_umull_idx_d, uint64_t, uint32_t,     , H1_4, DO_MUL)
+DO_ZZX(sve2_umull_idx_d, uint64_t, uint32_t, H1_8, H1_4, DO_MUL)
 
 #undef DO_ZZX
 
@@ -XXX,XX +XXX,XX @@ void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)  \
 DO_CADD(sve2_cadd_b, int8_t, H1, DO_ADD, DO_SUB)
 DO_CADD(sve2_cadd_h, int16_t, H1_2, DO_ADD, DO_SUB)
 DO_CADD(sve2_cadd_s, int32_t, H1_4, DO_ADD, DO_SUB)
-DO_CADD(sve2_cadd_d, int64_t,     , DO_ADD, DO_SUB)
+DO_CADD(sve2_cadd_d, int64_t, H1_8, DO_ADD, DO_SUB)
 
 DO_CADD(sve2_sqcadd_b, int8_t, H1, DO_SQADD_B, DO_SQSUB_B)
 DO_CADD(sve2_sqcadd_h, int16_t, H1_2, DO_SQADD_H, DO_SQSUB_H)
 DO_CADD(sve2_sqcadd_s, int32_t, H1_4, DO_SQADD_S, DO_SQSUB_S)
-DO_CADD(sve2_sqcadd_d, int64_t,     , do_sqadd_d, do_sqsub_d)
+DO_CADD(sve2_sqcadd_d, int64_t, H1_8, do_sqadd_d, do_sqsub_d)
 
 #undef DO_CADD
 
@@ -XXX,XX +XXX,XX @@ void HELPER(NAME)(void *vd, void *vn, uint32_t desc)           \
 
 DO_ZZI_SHLL(sve2_sshll_h, int16_t, int8_t, H1_2, H1)
 DO_ZZI_SHLL(sve2_sshll_s, int32_t, int16_t, H1_4, H1_2)
-DO_ZZI_SHLL(sve2_sshll_d, int64_t, int32_t,     , H1_4)
+DO_ZZI_SHLL(sve2_sshll_d, int64_t, int32_t, H1_8, H1_4)
 
 DO_ZZI_SHLL(sve2_ushll_h, uint16_t, uint8_t, H1_2, H1)
 DO_ZZI_SHLL(sve2_ushll_s, uint32_t, uint16_t, H1_4, H1_2)
-DO_ZZI_SHLL(sve2_ushll_d, uint64_t, uint32_t,     , H1_4)
+DO_ZZI_SHLL(sve2_ushll_d, uint64_t, uint32_t, H1_8, H1_4)
 
 #undef DO_ZZI_SHLL
 
@@ -XXX,XX +XXX,XX @@ DO_SHRNB(sve2_shrnb_d, uint64_t, uint32_t, DO_SHR)
 
 DO_SHRNT(sve2_shrnt_h, uint16_t, uint8_t, H1_2, H1, DO_SHR)
 DO_SHRNT(sve2_shrnt_s, uint32_t, uint16_t, H1_4, H1_2, DO_SHR)
-DO_SHRNT(sve2_shrnt_d, uint64_t, uint32_t,     , H1_4, DO_SHR)
+DO_SHRNT(sve2_shrnt_d, uint64_t, uint32_t, H1_8, H1_4, DO_SHR)
 
 DO_SHRNB(sve2_rshrnb_h, uint16_t, uint8_t, do_urshr)
 DO_SHRNB(sve2_rshrnb_s, uint32_t, uint16_t, do_urshr)
@@ -XXX,XX +XXX,XX @@ DO_SHRNB(sve2_rshrnb_d, uint64_t, uint32_t, do_urshr)
 
 DO_SHRNT(sve2_rshrnt_h, uint16_t, uint8_t, H1_2, H1, do_urshr)
 DO_SHRNT(sve2_rshrnt_s, uint32_t, uint16_t, H1_4, H1_2, do_urshr)
-DO_SHRNT(sve2_rshrnt_d, uint64_t, uint32_t,     , H1_4, do_urshr)
+DO_SHRNT(sve2_rshrnt_d, uint64_t, uint32_t, H1_8, H1_4, do_urshr)
 
 #define DO_SQSHRUN_H(x, sh) do_sat_bhs((int64_t)(x) >> sh, 0, UINT8_MAX)
 #define DO_SQSHRUN_S(x, sh) do_sat_bhs((int64_t)(x) >> sh, 0, UINT16_MAX)
@@ -XXX,XX +XXX,XX @@ DO_SHRNB(sve2_sqshrunb_d, int64_t, uint32_t, DO_SQSHRUN_D)
 
 DO_SHRNT(sve2_sqshrunt_h, int16_t, uint8_t, H1_2, H1, DO_SQSHRUN_H)
 DO_SHRNT(sve2_sqshrunt_s, int32_t, uint16_t, H1_4, H1_2, DO_SQSHRUN_S)
-DO_SHRNT(sve2_sqshrunt_d, int64_t, uint32_t,     , H1_4, DO_SQSHRUN_D)
+DO_SHRNT(sve2_sqshrunt_d, int64_t, uint32_t, H1_8, H1_4, DO_SQSHRUN_D)
 
 #define DO_SQRSHRUN_H(x, sh) do_sat_bhs(do_srshr(x, sh), 0, UINT8_MAX)
 #define DO_SQRSHRUN_S(x, sh) do_sat_bhs(do_srshr(x, sh), 0, UINT16_MAX)
@@ -XXX,XX +XXX,XX @@ DO_SHRNB(sve2_sqrshrunb_d, int64_t, uint32_t, DO_SQRSHRUN_D)
 
 DO_SHRNT(sve2_sqrshrunt_h, int16_t, uint8_t, H1_2, H1, DO_SQRSHRUN_H)
 DO_SHRNT(sve2_sqrshrunt_s, int32_t, uint16_t, H1_4, H1_2, DO_SQRSHRUN_S)
-DO_SHRNT(sve2_sqrshrunt_d, int64_t, uint32_t,     , H1_4, DO_SQRSHRUN_D)
+DO_SHRNT(sve2_sqrshrunt_d, int64_t, uint32_t, H1_8, H1_4, DO_SQRSHRUN_D)
 
 #define DO_SQSHRN_H(x, sh) do_sat_bhs(x >> sh, INT8_MIN, INT8_MAX)
 #define DO_SQSHRN_S(x, sh) do_sat_bhs(x >> sh, INT16_MIN, INT16_MAX)
@@ -XXX,XX +XXX,XX @@ DO_SHRNB(sve2_sqshrnb_d, int64_t, uint32_t, DO_SQSHRN_D)
 
 DO_SHRNT(sve2_sqshrnt_h, int16_t, uint8_t, H1_2, H1, DO_SQSHRN_H)
 DO_SHRNT(sve2_sqshrnt_s, int32_t, uint16_t, H1_4, H1_2, DO_SQSHRN_S)
-DO_SHRNT(sve2_sqshrnt_d, int64_t, uint32_t,     , H1_4, DO_SQSHRN_D)
+DO_SHRNT(sve2_sqshrnt_d, int64_t, uint32_t, H1_8, H1_4, DO_SQSHRN_D)
 
 #define DO_SQRSHRN_H(x, sh) do_sat_bhs(do_srshr(x, sh), INT8_MIN, INT8_MAX)
 #define DO_SQRSHRN_S(x, sh) do_sat_bhs(do_srshr(x, sh), INT16_MIN, INT16_MAX)
@@ -XXX,XX +XXX,XX @@ DO_SHRNB(sve2_sqrshrnb_d, int64_t, uint32_t, DO_SQRSHRN_D)
 
 DO_SHRNT(sve2_sqrshrnt_h, int16_t, uint8_t, H1_2, H1, DO_SQRSHRN_H)
 DO_SHRNT(sve2_sqrshrnt_s, int32_t, uint16_t, H1_4, H1_2, DO_SQRSHRN_S)
-DO_SHRNT(sve2_sqrshrnt_d, int64_t, uint32_t,     , H1_4, DO_SQRSHRN_D)
+DO_SHRNT(sve2_sqrshrnt_d, int64_t, uint32_t, H1_8, H1_4, DO_SQRSHRN_D)
 
 #define DO_UQSHRN_H(x, sh) MIN(x >> sh, UINT8_MAX)
 #define DO_UQSHRN_S(x, sh) MIN(x >> sh, UINT16_MAX)
@@ -XXX,XX +XXX,XX @@ DO_SHRNB(sve2_uqshrnb_d, uint64_t, uint32_t, DO_UQSHRN_D)
 
 DO_SHRNT(sve2_uqshrnt_h, uint16_t, uint8_t, H1_2, H1, DO_UQSHRN_H)
 DO_SHRNT(sve2_uqshrnt_s, uint32_t, uint16_t, H1_4, H1_2, DO_UQSHRN_S)
-DO_SHRNT(sve2_uqshrnt_d, uint64_t, uint32_t,     , H1_4, DO_UQSHRN_D)
+DO_SHRNT(sve2_uqshrnt_d, uint64_t, uint32_t, H1_8, H1_4, DO_UQSHRN_D)
 
 #define DO_UQRSHRN_H(x, sh) MIN(do_urshr(x, sh), UINT8_MAX)
 #define DO_UQRSHRN_S(x, sh) MIN(do_urshr(x, sh), UINT16_MAX)
@@ -XXX,XX +XXX,XX @@ DO_SHRNB(sve2_uqrshrnb_d, uint64_t, uint32_t, DO_UQRSHRN_D)
 
 DO_SHRNT(sve2_uqrshrnt_h, uint16_t, uint8_t, H1_2, H1, DO_UQRSHRN_H)
 DO_SHRNT(sve2_uqrshrnt_s, uint32_t, uint16_t, H1_4, H1_2, DO_UQRSHRN_S)
-DO_SHRNT(sve2_uqrshrnt_d, uint64_t, uint32_t,     , H1_4, DO_UQRSHRN_D)
+DO_SHRNT(sve2_uqrshrnt_d, uint64_t, uint32_t, H1_8, H1_4, DO_UQRSHRN_D)
 
 #undef DO_SHRNB
 #undef DO_SHRNT
@@ -XXX,XX +XXX,XX @@ DO_BINOPNB(sve2_addhnb_d, uint64_t, uint32_t, 32, DO_ADDHN)
 
 DO_BINOPNT(sve2_addhnt_h, uint16_t, uint8_t, 8, H1_2, H1, DO_ADDHN)
 DO_BINOPNT(sve2_addhnt_s, uint32_t, uint16_t, 16, H1_4, H1_2, DO_ADDHN)
-DO_BINOPNT(sve2_addhnt_d, uint64_t, uint32_t, 32,     , H1_4, DO_ADDHN)
+DO_BINOPNT(sve2_addhnt_d, uint64_t, uint32_t, 32, H1_8, H1_4, DO_ADDHN)
 
 DO_BINOPNB(sve2_raddhnb_h, uint16_t, uint8_t, 8, DO_RADDHN)
 DO_BINOPNB(sve2_raddhnb_s, uint32_t, uint16_t, 16, DO_RADDHN)
@@ -XXX,XX +XXX,XX @@ DO_BINOPNB(sve2_raddhnb_d, uint64_t, uint32_t, 32, DO_RADDHN)
 
 DO_BINOPNT(sve2_raddhnt_h, uint16_t, uint8_t, 8, H1_2, H1, DO_RADDHN)
 DO_BINOPNT(sve2_raddhnt_s, uint32_t, uint16_t, 16, H1_4, H1_2, DO_RADDHN)
-DO_BINOPNT(sve2_raddhnt_d, uint64_t, uint32_t, 32,     , H1_4, DO_RADDHN)
+DO_BINOPNT(sve2_raddhnt_d, uint64_t, uint32_t, 32, H1_8, H1_4, DO_RADDHN)
 
 DO_BINOPNB(sve2_subhnb_h, uint16_t, uint8_t, 8, DO_SUBHN)
 DO_BINOPNB(sve2_subhnb_s, uint32_t, uint16_t, 16, DO_SUBHN)
@@ -XXX,XX +XXX,XX @@ DO_BINOPNB(sve2_subhnb_d, uint64_t, uint32_t, 32, DO_SUBHN)
 
 DO_BINOPNT(sve2_subhnt_h, uint16_t, uint8_t, 8, H1_2, H1, DO_SUBHN)
 DO_BINOPNT(sve2_subhnt_s, uint32_t, uint16_t, 16, H1_4, H1_2, DO_SUBHN)
-DO_BINOPNT(sve2_subhnt_d, uint64_t, uint32_t, 32,     , H1_4, DO_SUBHN)
+DO_BINOPNT(sve2_subhnt_d, uint64_t, uint32_t, 32, H1_8, H1_4, DO_SUBHN)
 
 DO_BINOPNB(sve2_rsubhnb_h, uint16_t, uint8_t, 8, DO_RSUBHN)
 DO_BINOPNB(sve2_rsubhnb_s, uint32_t, uint16_t, 16, DO_RSUBHN)
@@ -XXX,XX +XXX,XX @@ DO_BINOPNB(sve2_rsubhnb_d, uint64_t, uint32_t, 32, DO_RSUBHN)
 
 DO_BINOPNT(sve2_rsubhnt_h, uint16_t, uint8_t, 8, H1_2, H1, DO_RSUBHN)
 DO_BINOPNT(sve2_rsubhnt_s, uint32_t, uint16_t, 16, H1_4, H1_2, DO_RSUBHN)
-DO_BINOPNT(sve2_rsubhnt_d, uint64_t, uint32_t, 32,     , H1_4, DO_RSUBHN)
+DO_BINOPNT(sve2_rsubhnt_d, uint64_t, uint32_t, 32, H1_8, H1_4, DO_RSUBHN)
 
 #undef DO_RSUBHN
 #undef DO_SUBHN
@@ -XXX,XX +XXX,XX @@ void HELPER(NAME)(void *vd, void *vn, uint64_t val, uint32_t desc) \
 DO_INSR(sve_insr_b, uint8_t, H1)
 DO_INSR(sve_insr_h, uint16_t, H1_2)
 DO_INSR(sve_insr_s, uint32_t, H1_4)
-DO_INSR(sve_insr_d, uint64_t, )
+DO_INSR(sve_insr_d, uint64_t, H1_8)
 
 #undef DO_INSR
 
@@ -XXX,XX +XXX,XX @@ void HELPER(sve2_tbx_##SUFF)(void *vd, void *vn, void *vm, uint32_t desc) \
 DO_TB(b, uint8_t, H1)
 DO_TB(h, uint16_t, H2)
 DO_TB(s, uint32_t, H4)
-DO_TB(d, uint64_t,   )
+DO_TB(d, uint64_t, H8)
 
 #undef DO_TB
 
@@ -XXX,XX +XXX,XX @@ void HELPER(NAME)(void *vd, void *vn, uint32_t desc)           \
 
 DO_UNPK(sve_sunpk_h, int16_t, int8_t, H2, H1)
 DO_UNPK(sve_sunpk_s, int32_t, int16_t, H4, H2)
-DO_UNPK(sve_sunpk_d, int64_t, int32_t, , H4)
+DO_UNPK(sve_sunpk_d, int64_t, int32_t, H8, H4)
 
 DO_UNPK(sve_uunpk_h, uint16_t, uint8_t, H2, H1)
 DO_UNPK(sve_uunpk_s, uint32_t, uint16_t, H4, H2)
-DO_UNPK(sve_uunpk_d, uint64_t, uint32_t, , H4)
+DO_UNPK(sve_uunpk_d, uint64_t, uint32_t, H8, H4)
 
 #undef DO_UNPK
 
@@ -XXX,XX +XXX,XX @@ void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)       \
 DO_ZIP(sve_zip_b, uint8_t, H1)
 DO_ZIP(sve_zip_h, uint16_t, H1_2)
 DO_ZIP(sve_zip_s, uint32_t, H1_4)
-DO_ZIP(sve_zip_d, uint64_t, )
+DO_ZIP(sve_zip_d, uint64_t, H1_8)
 DO_ZIP(sve2_zip_q, Int128, )
 
 #define DO_UZP(NAME, TYPE, H) \
@@ -XXX,XX +XXX,XX @@ void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)         \
 DO_UZP(sve_uzp_b, uint8_t, H1)
 DO_UZP(sve_uzp_h, uint16_t, H1_2)
 DO_UZP(sve_uzp_s, uint32_t, H1_4)
-DO_UZP(sve_uzp_d, uint64_t, )
+DO_UZP(sve_uzp_d, uint64_t, H1_8)
 DO_UZP(sve2_uzp_q, Int128, )
 
 #define DO_TRN(NAME, TYPE, H) \
@@ -XXX,XX +XXX,XX @@ void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)         \
 DO_TRN(sve_trn_b, uint8_t, H1)
 DO_TRN(sve_trn_h, uint16_t, H1_2)
 DO_TRN(sve_trn_s, uint32_t, H1_4)
-DO_TRN(sve_trn_d, uint64_t, )
+DO_TRN(sve_trn_d, uint64_t, H1_8)
 DO_TRN(sve2_trn_q, Int128, )
 
 #undef DO_ZIP
@@ -XXX,XX +XXX,XX @@ uint32_t HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
 #define DO_CMP_PPZZ_S(NAME, TYPE, OP) \
     DO_CMP_PPZZ(NAME, TYPE, OP, H1_4, 0x1111111111111111ull)
 #define DO_CMP_PPZZ_D(NAME, TYPE, OP) \
-    DO_CMP_PPZZ(NAME, TYPE, OP,     , 0x0101010101010101ull)
+    DO_CMP_PPZZ(NAME, TYPE, OP, H1_8, 0x0101010101010101ull)
 
 DO_CMP_PPZZ_B(sve_cmpeq_ppzz_b, uint8_t,  ==)
 DO_CMP_PPZZ_H(sve_cmpeq_ppzz_h, uint16_t, ==)
@@ -XXX,XX +XXX,XX @@ uint32_t HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc)   \
 #define DO_CMP_PPZI_S(NAME, TYPE, OP) \
     DO_CMP_PPZI(NAME, TYPE, OP, H1_4, 0x1111111111111111ull)
 #define DO_CMP_PPZI_D(NAME, TYPE, OP) \
-    DO_CMP_PPZI(NAME, TYPE, OP,     , 0x0101010101010101ull)
+    DO_CMP_PPZI(NAME, TYPE, OP, H1_8, 0x0101010101010101ull)
 
 DO_CMP_PPZI_B(sve_cmpeq_ppzi_b, uint8_t,  ==)
 DO_CMP_PPZI_H(sve_cmpeq_ppzi_h, uint16_t, ==)
@@ -XXX,XX +XXX,XX @@ uint64_t HELPER(NAME)(void *vn, void *vg, void *vs, uint32_t desc)    \
 
 DO_REDUCE(sve_faddv_h, float16, H1_2, add, float16_zero)
 DO_REDUCE(sve_faddv_s, float32, H1_4, add, float32_zero)
-DO_REDUCE(sve_faddv_d, float64,     , add, float64_zero)
+DO_REDUCE(sve_faddv_d, float64, H1_8, add, float64_zero)
 
 /* Identity is floatN_default_nan, without the function call.  */
 DO_REDUCE(sve_fminnmv_h, float16, H1_2, minnum, 0x7E00)
 DO_REDUCE(sve_fminnmv_s, float32, H1_4, minnum, 0x7FC00000)
-DO_REDUCE(sve_fminnmv_d, float64,     , minnum, 0x7FF8000000000000ULL)
+DO_REDUCE(sve_fminnmv_d, float64, H1_8, minnum, 0x7FF8000000000000ULL)
 
 DO_REDUCE(sve_fmaxnmv_h, float16, H1_2, maxnum, 0x7E00)
 DO_REDUCE(sve_fmaxnmv_s, float32, H1_4, maxnum, 0x7FC00000)
-DO_REDUCE(sve_fmaxnmv_d, float64,     , maxnum, 0x7FF8000000000000ULL)
+DO_REDUCE(sve_fmaxnmv_d, float64, H1_8, maxnum, 0x7FF8000000000000ULL)
 
 DO_REDUCE(sve_fminv_h, float16, H1_2, min, float16_infinity)
 DO_REDUCE(sve_fminv_s, float32, H1_4, min, float32_infinity)
-DO_REDUCE(sve_fminv_d, float64,     , min, float64_infinity)
+DO_REDUCE(sve_fminv_d, float64, H1_8, min, float64_infinity)
 
 DO_REDUCE(sve_fmaxv_h, float16, H1_2, max, float16_chs(float16_infinity))
 DO_REDUCE(sve_fmaxv_s, float32, H1_4, max, float32_chs(float32_infinity))
-DO_REDUCE(sve_fmaxv_d, float64,     , max, float64_chs(float64_infinity))
+DO_REDUCE(sve_fmaxv_d, float64, H1_8, max, float64_chs(float64_infinity))
 
 #undef DO_REDUCE
 
@@ -XXX,XX +XXX,XX @@ void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg,       \
 
 DO_ZPZZ_FP(sve_fadd_h, uint16_t, H1_2, float16_add)
 DO_ZPZZ_FP(sve_fadd_s, uint32_t, H1_4, float32_add)
-DO_ZPZZ_FP(sve_fadd_d, uint64_t,     , float64_add)
+DO_ZPZZ_FP(sve_fadd_d, uint64_t, H1_8, float64_add)
 
 DO_ZPZZ_FP(sve_fsub_h, uint16_t, H1_2, float16_sub)
 DO_ZPZZ_FP(sve_fsub_s, uint32_t, H1_4, float32_sub)
-DO_ZPZZ_FP(sve_fsub_d, uint64_t,     , float64_sub)
+DO_ZPZZ_FP(sve_fsub_d, uint64_t, H1_8, float64_sub)
 
 DO_ZPZZ_FP(sve_fmul_h, uint16_t, H1_2, float16_mul)
 DO_ZPZZ_FP(sve_fmul_s, uint32_t, H1_4, float32_mul)
-DO_ZPZZ_FP(sve_fmul_d, uint64_t,     , float64_mul)
+DO_ZPZZ_FP(sve_fmul_d, uint64_t, H1_8, float64_mul)
 
 DO_ZPZZ_FP(sve_fdiv_h, uint16_t, H1_2, float16_div)
 DO_ZPZZ_FP(sve_fdiv_s, uint32_t, H1_4, float32_div)
-DO_ZPZZ_FP(sve_fdiv_d, uint64_t,     , float64_div)
+DO_ZPZZ_FP(sve_fdiv_d, uint64_t, H1_8, float64_div)
 
 DO_ZPZZ_FP(sve_fmin_h, uint16_t, H1_2, float16_min)
 DO_ZPZZ_FP(sve_fmin_s, uint32_t, H1_4, float32_min)
-DO_ZPZZ_FP(sve_fmin_d, uint64_t,     , float64_min)
+DO_ZPZZ_FP(sve_fmin_d, uint64_t, H1_8, float64_min)
 
 DO_ZPZZ_FP(sve_fmax_h, uint16_t, H1_2, float16_max)
 DO_ZPZZ_FP(sve_fmax_s, uint32_t, H1_4, float32_max)
-DO_ZPZZ_FP(sve_fmax_d, uint64_t,     , float64_max)
+DO_ZPZZ_FP(sve_fmax_d, uint64_t, H1_8, float64_max)
 
 DO_ZPZZ_FP(sve_fminnum_h, uint16_t, H1_2, float16_minnum)
 DO_ZPZZ_FP(sve_fminnum_s, uint32_t, H1_4, float32_minnum)
-DO_ZPZZ_FP(sve_fminnum_d, uint64_t,     , float64_minnum)
+DO_ZPZZ_FP(sve_fminnum_d, uint64_t, H1_8, float64_minnum)
 
 DO_ZPZZ_FP(sve_fmaxnum_h, uint16_t, H1_2, float16_maxnum)
 DO_ZPZZ_FP(sve_fmaxnum_s, uint32_t, H1_4, float32_maxnum)
-DO_ZPZZ_FP(sve_fmaxnum_d, uint64_t,     , float64_maxnum)
+DO_ZPZZ_FP(sve_fmaxnum_d, uint64_t, H1_8, float64_maxnum)
 
 static inline float16 abd_h(float16 a, float16 b, float_status *s)
 {
@@ -XXX,XX +XXX,XX @@ static inline float64 abd_d(float64 a, float64 b, float_status *s)
 
 DO_ZPZZ_FP(sve_fabd_h, uint16_t, H1_2, abd_h)
 DO_ZPZZ_FP(sve_fabd_s, uint32_t, H1_4, abd_s)
-DO_ZPZZ_FP(sve_fabd_d, uint64_t,     , abd_d)
+DO_ZPZZ_FP(sve_fabd_d, uint64_t, H1_8, abd_d)
 
 static inline float64 scalbn_d(float64 a, int64_t b, float_status *s)
 {
@@ -XXX,XX +XXX,XX @@ static inline float64 scalbn_d(float64 a, int64_t b, float_status *s)
 
 DO_ZPZZ_FP(sve_fscalbn_h, int16_t, H1_2, float16_scalbn)
 DO_ZPZZ_FP(sve_fscalbn_s, int32_t, H1_4, float32_scalbn)
-DO_ZPZZ_FP(sve_fscalbn_d, int64_t,     , scalbn_d)
+DO_ZPZZ_FP(sve_fscalbn_d, int64_t, H1_8, scalbn_d)
 
 DO_ZPZZ_FP(sve_fmulx_h, uint16_t, H1_2, helper_advsimd_mulxh)
 DO_ZPZZ_FP(sve_fmulx_s, uint32_t, H1_4, helper_vfp_mulxs)
-DO_ZPZZ_FP(sve_fmulx_d, uint64_t,     , helper_vfp_mulxd)
+DO_ZPZZ_FP(sve_fmulx_d, uint64_t, H1_8, helper_vfp_mulxd)
 
 #undef DO_ZPZZ_FP
 
@@ -XXX,XX +XXX,XX @@ void HELPER(NAME)(void *vd, void *vn, void *vg, uint64_t scalar,  \
 
 DO_ZPZS_FP(sve_fadds_h, float16, H1_2, float16_add)
 DO_ZPZS_FP(sve_fadds_s, float32, H1_4, float32_add)
-DO_ZPZS_FP(sve_fadds_d, float64,     , float64_add)
+DO_ZPZS_FP(sve_fadds_d, float64, H1_8, float64_add)
 
 DO_ZPZS_FP(sve_fsubs_h, float16, H1_2, float16_sub)
 DO_ZPZS_FP(sve_fsubs_s, float32, H1_4, float32_sub)
-DO_ZPZS_FP(sve_fsubs_d, float64,     , float64_sub)
+DO_ZPZS_FP(sve_fsubs_d, float64, H1_8, float64_sub)
 
 DO_ZPZS_FP(sve_fmuls_h, float16, H1_2, float16_mul)
 DO_ZPZS_FP(sve_fmuls_s, float32, H1_4, float32_mul)
-DO_ZPZS_FP(sve_fmuls_d, float64,     , float64_mul)
+DO_ZPZS_FP(sve_fmuls_d, float64, H1_8, float64_mul)
 
 static inline float16 subr_h(float16 a, float16 b, float_status *s)
 {
@@ -XXX,XX +XXX,XX @@ static inline float64 subr_d(float64 a, float64 b, float_status *s)
 
 DO_ZPZS_FP(sve_fsubrs_h, float16, H1_2, subr_h)
 DO_ZPZS_FP(sve_fsubrs_s, float32, H1_4, subr_s)
-DO_ZPZS_FP(sve_fsubrs_d, float64,     , subr_d)
+DO_ZPZS_FP(sve_fsubrs_d, float64, H1_8, subr_d)
 
 DO_ZPZS_FP(sve_fmaxnms_h, float16, H1_2, float16_maxnum)
 DO_ZPZS_FP(sve_fmaxnms_s, float32, H1_4, float32_maxnum)
-DO_ZPZS_FP(sve_fmaxnms_d, float64,     , float64_maxnum)
+DO_ZPZS_FP(sve_fmaxnms_d, float64, H1_8, float64_maxnum)
 
 DO_ZPZS_FP(sve_fminnms_h, float16, H1_2, float16_minnum)
 DO_ZPZS_FP(sve_fminnms_s, float32, H1_4, float32_minnum)
-DO_ZPZS_FP(sve_fminnms_d, float64,     , float64_minnum)
+DO_ZPZS_FP(sve_fminnms_d, float64, H1_8, float64_minnum)
 
 DO_ZPZS_FP(sve_fmaxs_h, float16, H1_2, float16_max)
 DO_ZPZS_FP(sve_fmaxs_s, float32, H1_4, float32_max)
-DO_ZPZS_FP(sve_fmaxs_d, float64,     , float64_max)
+DO_ZPZS_FP(sve_fmaxs_d, float64, H1_8, float64_max)
 
 DO_ZPZS_FP(sve_fmins_h, float16, H1_2, float16_min)
 DO_ZPZS_FP(sve_fmins_s, float32, H1_4, float32_min)
-DO_ZPZS_FP(sve_fmins_d, float64,     , float64_min)
+DO_ZPZS_FP(sve_fmins_d, float64, H1_8, float64_min)
 
 /* Fully general two-operand expander, controlled by a predicate,
  * With the extra float_status parameter.
@@ -XXX,XX +XXX,XX @@ static inline uint64_t vfp_float64_to_uint64_rtz(float64 f, float_status *s)
 DO_ZPZ_FP(sve_fcvt_sh, uint32_t, H1_4, sve_f32_to_f16)
 DO_ZPZ_FP(sve_fcvt_hs, uint32_t, H1_4, sve_f16_to_f32)
 DO_ZPZ_FP(sve_bfcvt,   uint32_t, H1_4, float32_to_bfloat16)
-DO_ZPZ_FP(sve_fcvt_dh, uint64_t,     , sve_f64_to_f16)
-DO_ZPZ_FP(sve_fcvt_hd, uint64_t,     , sve_f16_to_f64)
-DO_ZPZ_FP(sve_fcvt_ds, uint64_t,     , float64_to_float32)
-DO_ZPZ_FP(sve_fcvt_sd, uint64_t,     , float32_to_float64)
+DO_ZPZ_FP(sve_fcvt_dh, uint64_t, H1_8, sve_f64_to_f16)
+DO_ZPZ_FP(sve_fcvt_hd, uint64_t, H1_8, sve_f16_to_f64)
+DO_ZPZ_FP(sve_fcvt_ds, uint64_t, H1_8, float64_to_float32)
+DO_ZPZ_FP(sve_fcvt_sd, uint64_t, H1_8, float32_to_float64)
 
 DO_ZPZ_FP(sve_fcvtzs_hh, uint16_t, H1_2, vfp_float16_to_int16_rtz)
 DO_ZPZ_FP(sve_fcvtzs_hs, uint32_t, H1_4, helper_vfp_tosizh)
 DO_ZPZ_FP(sve_fcvtzs_ss, uint32_t, H1_4, helper_vfp_tosizs)
-DO_ZPZ_FP(sve_fcvtzs_hd, uint64_t,     , vfp_float16_to_int64_rtz)
-DO_ZPZ_FP(sve_fcvtzs_sd, uint64_t,     , vfp_float32_to_int64_rtz)
-DO_ZPZ_FP(sve_fcvtzs_ds, uint64_t,     , helper_vfp_tosizd)
-DO_ZPZ_FP(sve_fcvtzs_dd, uint64_t,     , vfp_float64_to_int64_rtz)
+DO_ZPZ_FP(sve_fcvtzs_hd, uint64_t, H1_8, vfp_float16_to_int64_rtz)
+DO_ZPZ_FP(sve_fcvtzs_sd, uint64_t, H1_8, vfp_float32_to_int64_rtz)
+DO_ZPZ_FP(sve_fcvtzs_ds, uint64_t, H1_8, helper_vfp_tosizd)
+DO_ZPZ_FP(sve_fcvtzs_dd, uint64_t, H1_8, vfp_float64_to_int64_rtz)
 
 DO_ZPZ_FP(sve_fcvtzu_hh, uint16_t, H1_2, vfp_float16_to_uint16_rtz)
 DO_ZPZ_FP(sve_fcvtzu_hs, uint32_t, H1_4, helper_vfp_touizh)
 DO_ZPZ_FP(sve_fcvtzu_ss, uint32_t, H1_4, helper_vfp_touizs)
-DO_ZPZ_FP(sve_fcvtzu_hd, uint64_t,     , vfp_float16_to_uint64_rtz)
-DO_ZPZ_FP(sve_fcvtzu_sd, uint64_t,     , vfp_float32_to_uint64_rtz)
-DO_ZPZ_FP(sve_fcvtzu_ds, uint64_t,     , helper_vfp_touizd)
-DO_ZPZ_FP(sve_fcvtzu_dd, uint64_t,     , vfp_float64_to_uint64_rtz)
+DO_ZPZ_FP(sve_fcvtzu_hd, uint64_t, H1_8, vfp_float16_to_uint64_rtz)
+DO_ZPZ_FP(sve_fcvtzu_sd, uint64_t, H1_8, vfp_float32_to_uint64_rtz)
+DO_ZPZ_FP(sve_fcvtzu_ds, uint64_t, H1_8, helper_vfp_touizd)
+DO_ZPZ_FP(sve_fcvtzu_dd, uint64_t, H1_8, vfp_float64_to_uint64_rtz)
 
 DO_ZPZ_FP(sve_frint_h, uint16_t, H1_2, helper_advsimd_rinth)
 DO_ZPZ_FP(sve_frint_s, uint32_t, H1_4, helper_rints)
-DO_ZPZ_FP(sve_frint_d, uint64_t,     , helper_rintd)
+DO_ZPZ_FP(sve_frint_d, uint64_t, H1_8, helper_rintd)
 
 DO_ZPZ_FP(sve_frintx_h, uint16_t, H1_2, float16_round_to_int)
 DO_ZPZ_FP(sve_frintx_s, uint32_t, H1_4, float32_round_to_int)
-DO_ZPZ_FP(sve_frintx_d, uint64_t,     , float64_round_to_int)
+DO_ZPZ_FP(sve_frintx_d, uint64_t, H1_8, float64_round_to_int)
 
 DO_ZPZ_FP(sve_frecpx_h, uint16_t, H1_2, helper_frecpx_f16)
 DO_ZPZ_FP(sve_frecpx_s, uint32_t, H1_4, helper_frecpx_f32)
-DO_ZPZ_FP(sve_frecpx_d, uint64_t,     , helper_frecpx_f64)
+DO_ZPZ_FP(sve_frecpx_d, uint64_t, H1_8, helper_frecpx_f64)
 
 DO_ZPZ_FP(sve_fsqrt_h, uint16_t, H1_2, float16_sqrt)
 DO_ZPZ_FP(sve_fsqrt_s, uint32_t, H1_4, float32_sqrt)
-DO_ZPZ_FP(sve_fsqrt_d, uint64_t,     , float64_sqrt)
+DO_ZPZ_FP(sve_fsqrt_d, uint64_t, H1_8, float64_sqrt)
 
 DO_ZPZ_FP(sve_scvt_hh, uint16_t, H1_2, int16_to_float16)
 DO_ZPZ_FP(sve_scvt_sh, uint32_t, H1_4, int32_to_float16)
 DO_ZPZ_FP(sve_scvt_ss, uint32_t, H1_4, int32_to_float32)
-DO_ZPZ_FP(sve_scvt_sd, uint64_t,     , int32_to_float64)
-DO_ZPZ_FP(sve_scvt_dh, uint64_t,     , int64_to_float16)
-DO_ZPZ_FP(sve_scvt_ds, uint64_t,     , int64_to_float32)
-DO_ZPZ_FP(sve_scvt_dd, uint64_t,     , int64_to_float64)
+DO_ZPZ_FP(sve_scvt_sd, uint64_t, H1_8, int32_to_float64)
+DO_ZPZ_FP(sve_scvt_dh, uint64_t, H1_8, int64_to_float16)
+DO_ZPZ_FP(sve_scvt_ds, uint64_t, H1_8, int64_to_float32)
+DO_ZPZ_FP(sve_scvt_dd, uint64_t, H1_8, int64_to_float64)
 
 DO_ZPZ_FP(sve_ucvt_hh, uint16_t, H1_2, uint16_to_float16)
 DO_ZPZ_FP(sve_ucvt_sh, uint32_t, H1_4, uint32_to_float16)
 DO_ZPZ_FP(sve_ucvt_ss, uint32_t, H1_4, uint32_to_float32)
-DO_ZPZ_FP(sve_ucvt_sd, uint64_t,     , uint32_to_float64)
-DO_ZPZ_FP(sve_ucvt_dh, uint64_t,     , uint64_to_float16)
-DO_ZPZ_FP(sve_ucvt_ds, uint64_t,     , uint64_to_float32)
-DO_ZPZ_FP(sve_ucvt_dd, uint64_t,     , uint64_to_float64)
+DO_ZPZ_FP(sve_ucvt_sd, uint64_t, H1_8, uint32_to_float64)
+DO_ZPZ_FP(sve_ucvt_dh, uint64_t, H1_8, uint64_to_float16)
+DO_ZPZ_FP(sve_ucvt_ds, uint64_t, H1_8, uint64_to_float32)
+DO_ZPZ_FP(sve_ucvt_dd, uint64_t, H1_8, uint64_to_float64)
 
 static int16_t do_float16_logb_as_int(float16 a, float_status *s)
 {
@@ -XXX,XX +XXX,XX @@ static int64_t do_float64_logb_as_int(float64 a, float_status *s)
 
 DO_ZPZ_FP(flogb_h, float16, H1_2, do_float16_logb_as_int)
 DO_ZPZ_FP(flogb_s, float32, H1_4, do_float32_logb_as_int)
-DO_ZPZ_FP(flogb_d, float64,     , do_float64_logb_as_int)
+DO_ZPZ_FP(flogb_d, float64, H1_8, do_float64_logb_as_int)
 
 #undef DO_ZPZ_FP
 
@@ -XXX,XX +XXX,XX @@ void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg,               \
 #define DO_FPCMP_PPZZ_S(NAME, OP) \
     DO_FPCMP_PPZZ(NAME##_s, float32, H1_4, OP)
 #define DO_FPCMP_PPZZ_D(NAME, OP) \
-    DO_FPCMP_PPZZ(NAME##_d, float64,     , OP)
+    DO_FPCMP_PPZZ(NAME##_d, float64, H1_8, OP)
 
 #define DO_FPCMP_PPZZ_ALL(NAME, OP) \
     DO_FPCMP_PPZZ_H(NAME, OP)   \
@@ -XXX,XX +XXX,XX @@ void HELPER(NAME)(void *vd, void *vn, void *vg,            \
 #define DO_FPCMP_PPZ0_S(NAME, OP) \
     DO_FPCMP_PPZ0(NAME##_s, float32, H1_4, OP)
 #define DO_FPCMP_PPZ0_D(NAME, OP) \
-    DO_FPCMP_PPZ0(NAME##_d, float64,     , OP)
+    DO_FPCMP_PPZ0(NAME##_d, float64, H1_8, OP)
 
 #define DO_FPCMP_PPZ0_ALL(NAME, OP) \
     DO_FPCMP_PPZ0_H(NAME, OP)   \
@@ -XXX,XX +XXX,XX @@ DO_LD_PRIM_1(ld1bhu, H1_2, uint16_t, uint8_t)
 DO_LD_PRIM_1(ld1bhs, H1_2, uint16_t,  int8_t)
 DO_LD_PRIM_1(ld1bsu, H1_4, uint32_t, uint8_t)
 DO_LD_PRIM_1(ld1bss, H1_4, uint32_t,  int8_t)
-DO_LD_PRIM_1(ld1bdu,     , uint64_t, uint8_t)
-DO_LD_PRIM_1(ld1bds,     , uint64_t,  int8_t)
+DO_LD_PRIM_1(ld1bdu, H1_8, uint64_t, uint8_t)
+DO_LD_PRIM_1(ld1bds, H1_8, uint64_t,  int8_t)
 
 #define DO_ST_PRIM_1(NAME, H, TE, TM)                   \
     DO_ST_HOST(st1##NAME, H, TE, TM, stb_p)             \
@@ -XXX,XX +XXX,XX @@ DO_LD_PRIM_1(ld1bds,     , uint64_t,  int8_t)
 DO_ST_PRIM_1(bb,   H1,  uint8_t, uint8_t)
 DO_ST_PRIM_1(bh, H1_2, uint16_t, uint8_t)
 DO_ST_PRIM_1(bs, H1_4, uint32_t, uint8_t)
-DO_ST_PRIM_1(bd,     , uint64_t, uint8_t)
+DO_ST_PRIM_1(bd, H1_8, uint64_t, uint8_t)
 
 #define DO_LD_PRIM_2(NAME, H, TE, TM, LD) \
     DO_LD_HOST(ld1##NAME##_be, H, TE, TM, LD##_be_p)    \
@@ -XXX,XX +XXX,XX @@ DO_ST_PRIM_1(bd,     , uint64_t, uint8_t)
 DO_LD_PRIM_2(hh,  H1_2, uint16_t, uint16_t, lduw)
 DO_LD_PRIM_2(hsu, H1_4, uint32_t, uint16_t, lduw)
 DO_LD_PRIM_2(hss, H1_4, uint32_t,  int16_t, lduw)
-DO_LD_PRIM_2(hdu,     , uint64_t, uint16_t, lduw)
-DO_LD_PRIM_2(hds,     , uint64_t,  int16_t, lduw)
+DO_LD_PRIM_2(hdu, H1_8, uint64_t, uint16_t, lduw)
+DO_LD_PRIM_2(hds, H1_8, uint64_t,  int16_t, lduw)
 
 DO_ST_PRIM_2(hh, H1_2, uint16_t, uint16_t, stw)
 DO_ST_PRIM_2(hs, H1_4, uint32_t, uint16_t, stw)
-DO_ST_PRIM_2(hd,     , uint64_t, uint16_t, stw)
+DO_ST_PRIM_2(hd, H1_8, uint64_t, uint16_t, stw)
 
 DO_LD_PRIM_2(ss,  H1_4, uint32_t, uint32_t, ldl)
-DO_LD_PRIM_2(sdu,     , uint64_t, uint32_t, ldl)
-DO_LD_PRIM_2(sds,     , uint64_t,  int32_t, ldl)
+DO_LD_PRIM_2(sdu, H1_8, uint64_t, uint32_t, ldl)
+DO_LD_PRIM_2(sds, H1_8, uint64_t,  int32_t, ldl)
 
 DO_ST_PRIM_2(ss, H1_4, uint32_t, uint32_t, stl)
-DO_ST_PRIM_2(sd,     , uint64_t, uint32_t, stl)
+DO_ST_PRIM_2(sd, H1_8, uint64_t, uint32_t, stl)
 
-DO_LD_PRIM_2(dd,     , uint64_t, uint64_t, ldq)
-DO_ST_PRIM_2(dd,     , uint64_t, uint64_t, stq)
+DO_LD_PRIM_2(dd, H1_8, uint64_t, uint64_t, ldq)
+DO_ST_PRIM_2(dd, H1_8, uint64_t, uint64_t, stq)
 
 #undef DO_LD_TLB
 #undef DO_ST_TLB
@@ -XXX,XX +XXX,XX @@ void HELPER(NAME)(void *vd, void *vn, void *vg, void *status, uint32_t desc)  \
 
 DO_FCVTNT(sve_bfcvtnt,    uint32_t, uint16_t, H1_4, H1_2, float32_to_bfloat16)
 DO_FCVTNT(sve2_fcvtnt_sh, uint32_t, uint16_t, H1_4, H1_2, sve_f32_to_f16)
-DO_FCVTNT(sve2_fcvtnt_ds, uint64_t, uint32_t,     , H1_4, float64_to_float32)
+DO_FCVTNT(sve2_fcvtnt_ds, uint64_t, uint32_t, H1_8, H1_4, float64_to_float32)
 
 #define DO_FCVTLT(NAME, TYPEW, TYPEN, HW, HN, OP)                             \
 void HELPER(NAME)(void *vd, void *vn, void *vg, void *status, uint32_t desc)  \
@@ -XXX,XX +XXX,XX @@ void HELPER(NAME)(void *vd, void *vn, void *vg, void *status, uint32_t desc)  \
 }
 
 DO_FCVTLT(sve2_fcvtlt_hs, uint32_t, uint16_t, H1_4, H1_2, sve_f16_to_f32)
-DO_FCVTLT(sve2_fcvtlt_sd, uint64_t, uint32_t,     , H1_4, float32_to_float64)
+DO_FCVTLT(sve2_fcvtlt_sd, uint64_t, uint32_t, H1_8, H1_4, float32_to_float64)
 
 #undef DO_FCVTLT
 #undef DO_FCVTNT
diff --git a/target/arm/vec_helper.c b/target/arm/vec_helper.c
index XXXXXXX..XXXXXXX 100644
--- a/target/arm/vec_helper.c
+++ b/target/arm/vec_helper.c
@@ -XXX,XX +XXX,XX @@ DO_DOT_IDX(gvec_sdot_idx_b, int32_t, int8_t, int8_t, H4)
 DO_DOT_IDX(gvec_udot_idx_b, uint32_t, uint8_t, uint8_t, H4)
 DO_DOT_IDX(gvec_sudot_idx_b, int32_t, int8_t, uint8_t, H4)
 DO_DOT_IDX(gvec_usdot_idx_b, int32_t, uint8_t, int8_t, H4)
-DO_DOT_IDX(gvec_sdot_idx_h, int64_t, int16_t, int16_t, )
-DO_DOT_IDX(gvec_udot_idx_h, uint64_t, uint16_t, uint16_t, )
+DO_DOT_IDX(gvec_sdot_idx_h, int64_t, int16_t, int16_t, H8)
+DO_DOT_IDX(gvec_udot_idx_h, uint64_t, uint16_t, uint16_t, H8)
 
 void HELPER(gvec_fcaddh)(void *vd, void *vn, void *vm,
                          void *vfpst, uint32_t desc)
@@ -XXX,XX +XXX,XX @@ void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
 
 DO_MUL_IDX(gvec_mul_idx_h, uint16_t, H2)
 DO_MUL_IDX(gvec_mul_idx_s, uint32_t, H4)
-DO_MUL_IDX(gvec_mul_idx_d, uint64_t, )
+DO_MUL_IDX(gvec_mul_idx_d, uint64_t, H8)
 
 #undef DO_MUL_IDX
 
@@ -XXX,XX +XXX,XX @@ void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc)   \
 
 DO_MLA_IDX(gvec_mla_idx_h, uint16_t, +, H2)
 DO_MLA_IDX(gvec_mla_idx_s, uint32_t, +, H4)
-DO_MLA_IDX(gvec_mla_idx_d, uint64_t, +,   )
+DO_MLA_IDX(gvec_mla_idx_d, uint64_t, +, H8)
 
 DO_MLA_IDX(gvec_mls_idx_h, uint16_t, -, H2)
 DO_MLA_IDX(gvec_mls_idx_s, uint32_t, -, H4)
-DO_MLA_IDX(gvec_mls_idx_d, uint64_t, -,   )
+DO_MLA_IDX(gvec_mls_idx_d, uint64_t, -, H8)
 
 #undef DO_MLA_IDX
 
@@ -XXX,XX +XXX,XX @@ void HELPER(NAME)(void *vd, void *vn, void *vm, void *stat, uint32_t desc) \
 
 DO_FMUL_IDX(gvec_fmul_idx_h, nop, float16, H2)
 DO_FMUL_IDX(gvec_fmul_idx_s, nop, float32, H4)
-DO_FMUL_IDX(gvec_fmul_idx_d, nop, float64, )
+DO_FMUL_IDX(gvec_fmul_idx_d, nop, float64, H8)
 
 /*
  * Non-fused multiply-accumulate operations, for Neon. NB that unlike
@@ -XXX,XX +XXX,XX @@ void HELPER(NAME)(void *vd, void *vn, void *vm, void *va,                  \
 
 DO_FMLA_IDX(gvec_fmla_idx_h, float16, H2)
 DO_FMLA_IDX(gvec_fmla_idx_s, float32, H4)
-DO_FMLA_IDX(gvec_fmla_idx_d, float64, )
+DO_FMLA_IDX(gvec_fmla_idx_d, float64, H8)
 
 #undef DO_FMLA_IDX
 
-- 
2.20.1

MVE has an FPSCR.QC bit similar to the A-profile Neon one; when MVE
is implemented, make the bit writeable, both in the generic "load and
store FPSCR" helper functions and in the code for handling the NZCVQC
sysreg, which we had previously left as "TODO when we implement MVE".

Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Message-id: 20210614151007.4545-3-peter.maydell@linaro.org
---
 target/arm/translate-vfp.c | 30 +++++++++++++++++++++---------
 target/arm/vfp_helper.c    |  3 ++-
 2 files changed, 23 insertions(+), 10 deletions(-)

diff --git a/target/arm/translate-vfp.c b/target/arm/translate-vfp.c
index XXXXXXX..XXXXXXX 100644
--- a/target/arm/translate-vfp.c
+++ b/target/arm/translate-vfp.c
@@ -XXX,XX +XXX,XX @@ static bool gen_M_fp_sysreg_write(DisasContext *s, int regno,
     {
         TCGv_i32 fpscr;
         tmp = loadfn(s, opaque);
-        /*
-         * TODO: when we implement MVE, write the QC bit.
-         * For non-MVE, QC is RES0.
-         */
+        if (dc_isar_feature(aa32_mve, s)) {
+            /* QC is only present for MVE; otherwise RES0 */
+            TCGv_i32 qc = tcg_temp_new_i32();
+            tcg_gen_andi_i32(qc, tmp, FPCR_QC);
+            /*
+             * The 4 vfp.qc[] fields need only be "zero" vs "non-zero";
+             * here writing the same value into all elements is simplest.
+             */
+            tcg_gen_gvec_dup_i32(MO_32, offsetof(CPUARMState, vfp.qc),
+                                 16, 16, qc);
+        }
         tcg_gen_andi_i32(tmp, tmp, FPCR_NZCV_MASK);
         fpscr = load_cpu_field(vfp.xregs[ARM_VFP_FPSCR]);
         tcg_gen_andi_i32(fpscr, fpscr, ~FPCR_NZCV_MASK);
@@ -XXX,XX +XXX,XX @@ static bool gen_M_fp_sysreg_read(DisasContext *s, int regno,
         break;
     }
 
+    if (regno == ARM_VFP_FPSCR_NZCVQC && !dc_isar_feature(aa32_mve, s)) {
+        /* QC is RES0 without MVE, so NZCVQC simplifies to NZCV */
+        regno = QEMU_VFP_FPSCR_NZCV;
+    }
+
     switch (regno) {
     case ARM_VFP_FPSCR:
         tmp = tcg_temp_new_i32();
@@ -XXX,XX +XXX,XX @@ static bool gen_M_fp_sysreg_read(DisasContext *s, int regno,
         storefn(s, opaque, tmp);
         break;
     case ARM_VFP_FPSCR_NZCVQC:
-        /*
-         * TODO: MVE has a QC bit, which we probably won't store
-         * in the xregs[] field. For non-MVE, where QC is RES0,
-         * we can just fall through to the FPSCR_NZCV case.
-         */
+        tmp = tcg_temp_new_i32();
+        gen_helper_vfp_get_fpscr(tmp, cpu_env);
+        tcg_gen_andi_i32(tmp, tmp, FPCR_NZCVQC_MASK);
+        storefn(s, opaque, tmp);
+        break;
     case QEMU_VFP_FPSCR_NZCV:
         /*
          * Read just NZCV; this is a special case to avoid the
diff --git a/target/arm/vfp_helper.c b/target/arm/vfp_helper.c
index XXXXXXX..XXXXXXX 100644
--- a/target/arm/vfp_helper.c
+++ b/target/arm/vfp_helper.c
@@ -XXX,XX +XXX,XX @@ void HELPER(vfp_set_fpscr)(CPUARMState *env, uint32_t val)
                                      FPCR_LTPSIZE_LENGTH);
     }
 
-    if (arm_feature(env, ARM_FEATURE_NEON)) {
+    if (arm_feature(env, ARM_FEATURE_NEON) ||
+        cpu_isar_feature(aa32_mve, cpu)) {
         /*
          * The bit we set within fpscr_q is arbitrary; the register as a
          * whole being zero/non-zero is what counts.
-- 
2.20.1

When MVE is supported, the VPR register has a place on the exception
stack frame in a previously reserved slot just above the FPSCR.
It must also be zeroed in various situations when we invalidate
FPU context.

Update the code which handles the stack frames (exception entry and
exit code, VLLDM, and VLSTM) to save/restore VPR.

Update code which invalidates FP registers (mostly also exception
entry and exit code, but also VSCCLRM and the code in
full_vfp_access_check() that corresponds to the ExecuteFPCheck()
pseudocode) to zero VPR.

Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Message-id: 20210614151007.4545-4-peter.maydell@linaro.org
---
 target/arm/m_helper.c         | 54 +++++++++++++++++++++++++++++------
 target/arm/translate-m-nocp.c |  5 +++-
 target/arm/translate-vfp.c    |  9 ++++--
 3 files changed, 57 insertions(+), 11 deletions(-)

diff --git a/target/arm/m_helper.c b/target/arm/m_helper.c
index XXXXXXX..XXXXXXX 100644
--- a/target/arm/m_helper.c
+++ b/target/arm/m_helper.c
@@ -XXX,XX +XXX,XX @@ void HELPER(v7m_preserve_fp_state)(CPUARMState *env)
             uint32_t shi = extract64(dn, 32, 32);
 
             if (i >= 16) {
-                faddr += 8; /* skip the slot for the FPSCR */
+                faddr += 8; /* skip the slot for the FPSCR/VPR */
             }
             stacked_ok = stacked_ok &&
                 v7m_stack_write(cpu, faddr, slo, mmu_idx, STACK_LAZYFP) &&
@@ -XXX,XX +XXX,XX @@ void HELPER(v7m_preserve_fp_state)(CPUARMState *env)
         stacked_ok = stacked_ok &&
             v7m_stack_write(cpu, fpcar + 0x40,
                             vfp_get_fpscr(env), mmu_idx, STACK_LAZYFP);
+        if (cpu_isar_feature(aa32_mve, cpu)) {
+            stacked_ok = stacked_ok &&
+                v7m_stack_write(cpu, fpcar + 0x44,
+                                env->v7m.vpr, mmu_idx, STACK_LAZYFP);
+        }
     }
 
     /*
@@ -XXX,XX +XXX,XX @@ void HELPER(v7m_preserve_fp_state)(CPUARMState *env)
     env->v7m.fpccr[is_secure] &= ~R_V7M_FPCCR_LSPACT_MASK;
 
     if (ts) {
-        /* Clear s0 to s31 and the FPSCR */
+        /* Clear s0 to s31 and the FPSCR and VPR */
         int i;
 
         for (i = 0; i < 32; i += 2) {
             *aa32_vfp_dreg(env, i / 2) = 0;
         }
         vfp_set_fpscr(env, 0);
+        if (cpu_isar_feature(aa32_mve, cpu)) {
+            env->v7m.vpr = 0;
+        }
     }
     /*
-     * Otherwise s0 to s15 and FPSCR are UNKNOWN; we choose to leave them
+     * Otherwise s0 to s15, FPSCR and VPR are UNKNOWN; we choose to leave them
      * unchanged.
      */
 }
@@ -XXX,XX +XXX,XX @@ static void v7m_update_fpccr(CPUARMState *env, uint32_t frameptr,
 void HELPER(v7m_vlstm)(CPUARMState *env, uint32_t fptr)
 {
     /* fptr is the value of Rn, the frame pointer we store the FP regs to */
+    ARMCPU *cpu = env_archcpu(env);
     bool s = env->v7m.fpccr[M_REG_S] & R_V7M_FPCCR_S_MASK;
     bool lspact = env->v7m.fpccr[s] & R_V7M_FPCCR_LSPACT_MASK;
     uintptr_t ra = GETPC();
@@ -XXX,XX +XXX,XX @@ void HELPER(v7m_vlstm)(CPUARMState *env, uint32_t fptr)
             cpu_stl_data_ra(env, faddr + 4, shi, ra);
         }
         cpu_stl_data_ra(env, fptr + 0x40, vfp_get_fpscr(env), ra);
+        if (cpu_isar_feature(aa32_mve, cpu)) {
+            cpu_stl_data_ra(env, fptr + 0x44, env->v7m.vpr, ra);
+        }
 
         /*
-         * If TS is 0 then s0 to s15 and FPSCR are UNKNOWN; we choose to
+         * If TS is 0 then s0 to s15, FPSCR and VPR are UNKNOWN; we choose to
          * leave them unchanged, matching our choice in v7m_preserve_fp_state.
          */
         if (ts) {
@@ -XXX,XX +XXX,XX @@ void HELPER(v7m_vlstm)(CPUARMState *env, uint32_t fptr)
                 *aa32_vfp_dreg(env, i / 2) = 0;
             }
             vfp_set_fpscr(env, 0);
+            if (cpu_isar_feature(aa32_mve, cpu)) {
+                env->v7m.vpr = 0;
+            }
         }
     } else {
         v7m_update_fpccr(env, fptr, false);
@@ -XXX,XX +XXX,XX @@ void HELPER(v7m_vlstm)(CPUARMState *env, uint32_t fptr)
 
 void HELPER(v7m_vlldm)(CPUARMState *env, uint32_t fptr)
 {
+    ARMCPU *cpu = env_archcpu(env);
     uintptr_t ra = GETPC();
 
     /* fptr is the value of Rn, the frame pointer we load the FP regs from */
@@ -XXX,XX +XXX,XX @@ void HELPER(v7m_vlldm)(CPUARMState *env, uint32_t fptr)
             uint32_t faddr = fptr + 4 * i;
 
             if (i >= 16) {
-                faddr += 8; /* skip the slot for the FPSCR */
+                faddr += 8; /* skip the slot for the FPSCR and VPR */
             }
 
             slo = cpu_ldl_data_ra(env, faddr, ra);
@@ -XXX,XX +XXX,XX @@ void HELPER(v7m_vlldm)(CPUARMState *env, uint32_t fptr)
         }
         fpscr = cpu_ldl_data_ra(env, fptr + 0x40, ra);
         vfp_set_fpscr(env, fpscr);
+        if (cpu_isar_feature(aa32_mve, cpu)) {
+            env->v7m.vpr = cpu_ldl_data_ra(env, fptr + 0x44, ra);
+        }
     }
 
     env->v7m.control[M_REG_S] |= R_V7M_CONTROL_FPCA_MASK;
@@ -XXX,XX +XXX,XX @@ static bool v7m_push_stack(ARMCPU *cpu)
                     uint32_t shi = extract64(dn, 32, 32);
 
                     if (i >= 16) {
-                        faddr += 8; /* skip the slot for the FPSCR */
+                        faddr += 8; /* skip the slot for the FPSCR and VPR */
                     }
                     stacked_ok = stacked_ok &&
                         v7m_stack_write(cpu, faddr, slo,
@@ -XXX,XX +XXX,XX @@ static bool v7m_push_stack(ARMCPU *cpu)
                 stacked_ok = stacked_ok &&
                     v7m_stack_write(cpu, frameptr + 0x60,
                                     vfp_get_fpscr(env), mmu_idx, STACK_NORMAL);
+                if (cpu_isar_feature(aa32_mve, cpu)) {
+                    stacked_ok = stacked_ok &&
+                        v7m_stack_write(cpu, frameptr + 0x64,
+                                        env->v7m.vpr, mmu_idx, STACK_NORMAL);
+                }
                 if (cpacr_pass) {
                     for (i = 0; i < ((framesize == 0xa8) ? 32 : 16); i += 2) {
                         *aa32_vfp_dreg(env, i / 2) = 0;
                     }
                     vfp_set_fpscr(env, 0);
+                    if (cpu_isar_feature(aa32_mve, cpu)) {
+                        env->v7m.vpr = 0;
+                    }
                 }
             } else {
                 /* Lazy stacking enabled, save necessary info to stack later */
@@ -XXX,XX +XXX,XX @@ static void do_v7m_exception_exit(ARMCPU *cpu)
                     v7m_exception_taken(cpu, excret, true, false);
                 }
             }
-            /* Clear s0..s15 and FPSCR; TODO also VPR when MVE is implemented */
+            /* Clear s0..s15, FPSCR and VPR */
             int i;
 
             for (i = 0; i < 16; i += 2) {
                 *aa32_vfp_dreg(env, i / 2) = 0;
             }
             vfp_set_fpscr(env, 0);
+            if (cpu_isar_feature(aa32_mve, cpu)) {
+                env->v7m.vpr = 0;
+            }
         }
     }
 
@@ -XXX,XX +XXX,XX @@ static void do_v7m_exception_exit(ARMCPU *cpu)
                     uint32_t faddr = frameptr + 0x20 + 4 * i;
 
                     if (i >= 16) {
-                        faddr += 8; /* Skip the slot for the FPSCR */
+                        faddr += 8; /* Skip the slot for the FPSCR and VPR */
                     }
 
                     pop_ok = pop_ok &&
@@ -XXX,XX +XXX,XX @@ static void do_v7m_exception_exit(ARMCPU *cpu)
                 if (pop_ok) {
                     vfp_set_fpscr(env, fpscr);
                 }
+                if (cpu_isar_feature(aa32_mve, cpu)) {
+                    pop_ok = pop_ok &&
+                        v7m_stack_read(cpu, &env->v7m.vpr,
+                                       frameptr + 0x64, mmu_idx);
+                }
                 if (!pop_ok) {
                     /*
                      * These regs are 0 if security extension present;
@@ -XXX,XX +XXX,XX @@ static void do_v7m_exception_exit(ARMCPU *cpu)
                         *aa32_vfp_dreg(env, i / 2) = 0;
                     }
                     vfp_set_fpscr(env, 0);
+                    if (cpu_isar_feature(aa32_mve, cpu)) {
+                        env->v7m.vpr = 0;
+                    }
                 }
             }
         }
diff --git a/target/arm/translate-m-nocp.c b/target/arm/translate-m-nocp.c
index XXXXXXX..XXXXXXX 100644
--- a/target/arm/translate-m-nocp.c
+++ b/target/arm/translate-m-nocp.c
@@ -XXX,XX +XXX,XX @@ static bool trans_VSCCLRM(DisasContext *s, arg_VSCCLRM *a)
         btmreg++;
     }
     assert(btmreg == topreg + 1);
-    /* TODO: when MVE is implemented, zero VPR here */
+    if (dc_isar_feature(aa32_mve, s)) {
+        TCGv_i32 z32 = tcg_const_i32(0);
+        store_cpu_field(z32, v7m.vpr);
+    }
     return true;
 }
 
diff --git a/target/arm/translate-vfp.c b/target/arm/translate-vfp.c
index XXXXXXX..XXXXXXX 100644
--- a/target/arm/translate-vfp.c
+++ b/target/arm/translate-vfp.c
@@ -XXX,XX +XXX,XX @@ static bool full_vfp_access_check(DisasContext *s, bool ignore_vfp_enabled)
 
         if (s->v7m_new_fp_ctxt_needed) {
             /*
-             * Create new FP context by updating CONTROL.FPCA, CONTROL.SFPA
-             * and the FPSCR.
+             * Create new FP context by updating CONTROL.FPCA, CONTROL.SFPA,
+             * the FPSCR, and VPR.
              */
             TCGv_i32 control, fpscr;
             uint32_t bits = R_V7M_CONTROL_FPCA_MASK;
@@ -XXX,XX +XXX,XX @@ static bool full_vfp_access_check(DisasContext *s, bool ignore_vfp_enabled)
             fpscr = load_cpu_field(v7m.fpdscr[s->v8m_secure]);
             gen_helper_vfp_set_fpscr(cpu_env, fpscr);
             tcg_temp_free_i32(fpscr);
+            if (dc_isar_feature(aa32_mve, s)) {
+                TCGv_i32 z32 = tcg_const_i32(0);
+                store_cpu_field(z32, v7m.vpr);
+            }
+
             /*
              * We don't need to arrange to end the TB, because the only
              * parts of FPSCR which we cache in the TB flags are the VECLEN
-- 
2.20.1

On A-profile, PSR bits [15:10][26:25] are always the IT state bits.
On M-profile, some of the reserved encodings of the IT state are used
to instead indicate partial progress through instructions that were
interrupted partway through by an exception and can be resumed.

These resumable instructions fall into two categories:

(1) load/store multiple instructions, where these bits are called
"ICI" and specify the register in the ldm/stm list where execution
should resume.  (Specifically: LDM, STM, VLDM, VSTM, VLLDM, VLSTM,
CLRM, VSCCLRM.)

(2) MVE instructions subject to beatwise execution, where these bits
are called "ECI" and specify which beats in this and possibly also
the following MVE insn have been executed.

There are also a few insns (LE, LETP, and BKPT) which do not use the
ICI/ECI bits but must leave them alone.

Otherwise, we should raise an INVSTATE UsageFault for any attempt to
execute an insn with non-zero ICI/ECI bits.

So far we have been able to ignore ECI/ICI, because the architecture
allows the IMPDEF choice of "always restart load/store multiple from
the beginning regardless of ICI state", so the only thing we have
been missing is that we don't raise the INVSTATE fault for bad guest
code.  However, MVE requires that we honour ECI bits and do not
re-execute beats of an insn that have already been executed.

Add the support in the decoder for handling ECI/ICI:
 * identify the ECI/ICI case in the CONDEXEC TB flags
 * when a load/store multiple insn succeeds, it updates the ECI/ICI
   state (both in DisasContext and in the CPU state), and sets a flag
   to say that the ECI/ICI state was handled
 * if we find that the insn we just decoded did not handle the
   ECI/ICI state, we delete all the code that we just generated for
   it and instead emit the code to raise the INVSTATE UsageFault.
   This allows us to avoid having to update every non-MVE
   non-LDM/STM insn to make it check for "is ECI/ICI set?".

We continue with our existing IMPDEF choice of not caring about the
ICI state for the load/store multiples and simply restarting them
from the beginning.  Because we don't allow interrupts in the middle
of an insn, the only way we would see this state is if the guest set
ICI manually on return from an exception handler, so it's a corner
case which doesn't merit optimisation.

ICI update for LDM/STM is simple -- it always zeroes the state.  ECI
update for MVE beatwise insns will be a little more complex, since
the ECI state may include information for the following insn.

Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Message-id: 20210614151007.4545-5-peter.maydell@linaro.org
---
 target/arm/translate-a32.h    |   1 +
 target/arm/translate.h        |   9 +++
 target/arm/translate-m-nocp.c |  11 ++++
 target/arm/translate-vfp.c    |   6 ++
 target/arm/translate.c        | 111 ++++++++++++++++++++++++++++++++--
 5 files changed, 133 insertions(+), 5 deletions(-)

diff --git a/target/arm/translate-a32.h b/target/arm/translate-a32.h
index XXXXXXX..XXXXXXX 100644
--- a/target/arm/translate-a32.h
+++ b/target/arm/translate-a32.h
@@ -XXX,XX +XXX,XX @@ long vfp_reg_offset(bool dp, unsigned reg);
 long neon_full_reg_offset(unsigned reg);
 long neon_element_offset(int reg, int element, MemOp memop);
 void gen_rev16(TCGv_i32 dest, TCGv_i32 var);
+void clear_eci_state(DisasContext *s);
 
 static inline TCGv_i32 load_cpu_offset(int offset)
 {
diff --git a/target/arm/translate.h b/target/arm/translate.h
index XXXXXXX..XXXXXXX 100644
--- a/target/arm/translate.h
+++ b/target/arm/translate.h
@@ -XXX,XX +XXX,XX @@ typedef struct DisasContext {
     /* Thumb-2 conditional execution bits.  */
     int condexec_mask;
     int condexec_cond;
+    /* M-profile ECI/ICI exception-continuable instruction state */
+    int eci;
+    /*
+     * trans_ functions for insns which are continuable should set this true
+     * after decode (ie after any UNDEF checks)
+     */
+    bool eci_handled;
+    /* TCG op to rewind to if this turns out to be an invalid ECI state */
+    TCGOp *insn_eci_rewind;
     int thumb;
     int sctlr_b;
     MemOp be_data;
diff --git a/target/arm/translate-m-nocp.c b/target/arm/translate-m-nocp.c
index XXXXXXX..XXXXXXX 100644
--- a/target/arm/translate-m-nocp.c
+++ b/target/arm/translate-m-nocp.c
@@ -XXX,XX +XXX,XX @@ static bool trans_VLLDM_VLSTM(DisasContext *s, arg_VLLDM_VLSTM *a)
         unallocated_encoding(s);
         return true;
     }
+
+    s->eci_handled = true;
+
     /* If no fpu, NOP. */
     if (!dc_isar_feature(aa32_vfp, s)) {
+        clear_eci_state(s);
         return true;
     }
 
@@ -XXX,XX +XXX,XX @@ static bool trans_VLLDM_VLSTM(DisasContext *s, arg_VLLDM_VLSTM *a)
     }
     tcg_temp_free_i32(fptr);
 
+    clear_eci_state(s);
+
     /* End the TB, because we have updated FP control bits */
     s->base.is_jmp = DISAS_UPDATE_EXIT;
     return true;
@@ -XXX,XX +XXX,XX @@ static bool trans_VSCCLRM(DisasContext *s, arg_VSCCLRM *a)
         return true;
     }
 
+    s->eci_handled = true;
+
     if (!dc_isar_feature(aa32_vfp_simd, s)) {
         /* NOP if we have neither FP nor MVE */
+        clear_eci_state(s);
         return true;
     }
 
@@ -XXX,XX +XXX,XX @@ static bool trans_VSCCLRM(DisasContext *s, arg_VSCCLRM *a)
         TCGv_i32 z32 = tcg_const_i32(0);
         store_cpu_field(z32, v7m.vpr);
     }
+
+    clear_eci_state(s);
     return true;
 }
 
diff --git a/target/arm/translate-vfp.c b/target/arm/translate-vfp.c
index XXXXXXX..XXXXXXX 100644
--- a/target/arm/translate-vfp.c
+++ b/target/arm/translate-vfp.c
@@ -XXX,XX +XXX,XX @@ static bool trans_VLDM_VSTM_sp(DisasContext *s, arg_VLDM_VSTM_sp *a)
         return false;
     }
 
+    s->eci_handled = true;
+
     if (!vfp_access_check(s)) {
         return true;
     }
@@ -XXX,XX +XXX,XX @@ static bool trans_VLDM_VSTM_sp(DisasContext *s, arg_VLDM_VSTM_sp *a)
         tcg_temp_free_i32(addr);
     }
 
+    clear_eci_state(s);
     return true;
 }
 
@@ -XXX,XX +XXX,XX @@ static bool trans_VLDM_VSTM_dp(DisasContext *s, arg_VLDM_VSTM_dp *a)
         return false;
     }
 
+    s->eci_handled = true;
+
     if (!vfp_access_check(s)) {
         return true;
     }
@@ -XXX,XX +XXX,XX @@ static bool trans_VLDM_VSTM_dp(DisasContext *s, arg_VLDM_VSTM_dp *a)
         tcg_temp_free_i32(addr);
     }
 
+    clear_eci_state(s);
     return true;
 }
 
diff --git a/target/arm/translate.c b/target/arm/translate.c
index XXXXXXX..XXXXXXX 100644
--- a/target/arm/translate.c
+++ b/target/arm/translate.c
@@ -XXX,XX +XXX,XX @@ static inline bool is_singlestepping(DisasContext *s)
     return s->base.singlestep_enabled || s->ss_active;
 }
 
+void clear_eci_state(DisasContext *s)
+{
+    /*
+     * Clear any ECI/ICI state: used when a load multiple/store
+     * multiple insn executes.
+     */
+    if (s->eci) {
+        TCGv_i32 tmp = tcg_const_i32(0);
+        store_cpu_field(tmp, condexec_bits);
+        s->eci = 0;
+    }
+}
+
 static void gen_smul_dual(TCGv_i32 a, TCGv_i32 b)
 {
     TCGv_i32 tmp1 = tcg_temp_new_i32();
@@ -XXX,XX +XXX,XX @@ static bool trans_BKPT(DisasContext *s, arg_BKPT *a)
     if (!ENABLE_ARCH_5) {
         return false;
     }
+    /* BKPT is OK with ECI set and leaves it untouched */
+    s->eci_handled = true;
     if (arm_dc_feature(s, ARM_FEATURE_M) &&
         semihosting_enabled() &&
 #ifndef CONFIG_USER_ONLY
@@ -XXX,XX +XXX,XX @@ static bool op_stm(DisasContext *s, arg_ldst_block *a, int min_n)
         return true;
     }
 
+    s->eci_handled = true;
+
     addr = op_addr_block_pre(s, a, n);
     mem_idx = get_mem_index(s);
 
@@ -XXX,XX +XXX,XX @@ static bool op_stm(DisasContext *s, arg_ldst_block *a, int min_n)
     }
 
     op_addr_block_post(s, a, addr, n);
+    clear_eci_state(s);
     return true;
 }
 
@@ -XXX,XX +XXX,XX @@ static bool do_ldm(DisasContext *s, arg_ldst_block *a, int min_n)
         return true;
     }
 
+    s->eci_handled = true;
+
     addr = op_addr_block_pre(s, a, n);
     mem_idx = get_mem_index(s);
     loaded_base = false;
@@ -XXX,XX +XXX,XX @@ static bool do_ldm(DisasContext *s, arg_ldst_block *a, int min_n)
         /* Must exit loop to check un-masked IRQs */
         s->base.is_jmp = DISAS_EXIT;
     }
+    clear_eci_state(s);
     return true;
 }
 
@@ -XXX,XX +XXX,XX @@ static bool trans_CLRM(DisasContext *s, arg_CLRM *a)
         return false;
     }
 
+    s->eci_handled = true;
+
     zero = tcg_const_i32(0);
     for (i = 0; i < 15; i++) {
         if (extract32(a->list, i, 1)) {
@@ -XXX,XX +XXX,XX @@ static bool trans_CLRM(DisasContext *s, arg_CLRM *a)
         tcg_temp_free_i32(maskreg);
     }
     tcg_temp_free_i32(zero);
+    clear_eci_state(s);
     return true;
 }
 
@@ -XXX,XX +XXX,XX @@ static bool trans_LE(DisasContext *s, arg_LE *a)
         return false;
     }
 
+    /* LE/LETP is OK with ECI set and leaves it untouched */
+    s->eci_handled = true;
+
     if (!a->f) {
         /* Not loop-forever. If LR <= 1 this is the last loop: do nothing. */
         arm_gen_condlabel(s);
@@ -XXX,XX +XXX,XX @@ static void arm_tr_init_disas_context(DisasContextBase *dcbase, CPUState *cs)
     dc->thumb = EX_TBFLAG_AM32(tb_flags, THUMB);
     dc->be_data = EX_TBFLAG_ANY(tb_flags, BE_DATA) ? MO_BE : MO_LE;
     condexec = EX_TBFLAG_AM32(tb_flags, CONDEXEC);
-    dc->condexec_mask = (condexec & 0xf) << 1;
-    dc->condexec_cond = condexec >> 4;
+    /*
+     * the CONDEXEC TB flags are CPSR bits [15:10][26:25]. On A-profile this
+     * is always the IT bits. On M-profile, some of the reserved encodings
+     * of IT are used instead to indicate either ICI or ECI, which
+     * indicate partial progress of a restartable insn that was interrupted
+     * partway through by an exception:
+     *  * if CONDEXEC[3:0] != 0b0000 : CONDEXEC is IT bits
+     *  * if CONDEXEC[3:0] == 0b0000 : CONDEXEC is ICI or ECI bits
+     * In all cases CONDEXEC == 0 means "not in IT block or restartable
+     * insn, behave normally".
+     */
+    dc->eci = dc->condexec_mask = dc->condexec_cond = 0;
+    dc->eci_handled = false;
+    dc->insn_eci_rewind = NULL;
+    if (condexec & 0xf) {
+        dc->condexec_mask = (condexec & 0xf) << 1;
+        dc->condexec_cond = condexec >> 4;
+    } else {
+        if (arm_feature(env, ARM_FEATURE_M)) {
+            dc->eci = condexec >> 4;
+        }
+    }
 
     core_mmu_idx = EX_TBFLAG_ANY(tb_flags, MMUIDX);
     dc->mmu_idx = core_to_arm_mmu_idx(env, core_mmu_idx);
@@ -XXX,XX +XXX,XX @@ static void arm_tr_tb_start(DisasContextBase *dcbase, CPUState *cpu)
 static void arm_tr_insn_start(DisasContextBase *dcbase, CPUState *cpu)
 {
     DisasContext *dc = container_of(dcbase, DisasContext, base);
+    /*
+     * The ECI/ICI bits share PSR bits with the IT bits, so we
+     * need to reconstitute the bits from the split-out DisasContext
+     * fields here.
+     */
+    uint32_t condexec_bits;
 
-    tcg_gen_insn_start(dc->base.pc_next,
-                       (dc->condexec_cond << 4) | (dc->condexec_mask >> 1),
-                       0);
+    if (dc->eci) {
+        condexec_bits = dc->eci << 4;
+    } else {
+        condexec_bits = (dc->condexec_cond << 4) | (dc->condexec_mask >> 1);
+    }
+    tcg_gen_insn_start(dc->base.pc_next, condexec_bits, 0);
     dc->insn_start = tcg_last_op();
 }
 
@@ -XXX,XX +XXX,XX @@ static void thumb_tr_translate_insn(DisasContextBase *dcbase, CPUState *cpu)
     }
     dc->insn = insn;
 
+    if (dc->eci) {
+        /*
+         * For M-profile continuable instructions, ECI/ICI handling
+         * falls into these cases:
+         *  - interrupt-continuable instructions
+         *     These are the various load/store multiple insns (both
+         *     integer and fp). The ICI bits indicate the register
+         *     where the load/store can resume. We make the IMPDEF
+         *     choice to always do "instruction restart", ie ignore
+         *     the ICI value and always execute the ldm/stm from the
+         *     start. So all we need to do is zero PSR.ICI if the
+         *     insn executes.
+         *  - MVE instructions subject to beat-wise execution
+         *     Here the ECI bits indicate which beats have already been
+         *     executed, and we must honour this. Each insn of this
+         *     type will handle it correctly. We will update PSR.ECI
+         *     in the helper function for the insn (some ECI values
+         *     mean that the following insn also has been partially
+         *     executed).
+         *  - Special cases which don't advance ECI
+         *     The insns LE, LETP and BKPT leave the ECI/ICI state
+         *     bits untouched.
+         *  - all other insns (the common case)
+         *     Non-zero ECI/ICI means an INVSTATE UsageFault.
+         *     We place a rewind-marker here. Insns in the previous
+         *     three categories will set a flag in the DisasContext.
+         *     If the flag isn't set after we call disas_thumb_insn()
+         *     or disas_thumb2_insn() then we know we have a "some other
+         *     insn" case. We will rewind to the marker (ie throwing away
+         *     all the generated code) and instead emit "take exception".
+         */
+        dc->insn_eci_rewind = tcg_last_op();
+    }
+
     if (dc->condexec_mask && !thumb_insn_is_unconditional(dc, insn)) {
         uint32_t cond = dc->condexec_cond;
 
@@ -XXX,XX +XXX,XX @@ static void thumb_tr_translate_insn(DisasContextBase *dcbase, CPUState *cpu)
         }
     }
 
+    if (dc->eci && !dc->eci_handled) {
+        /*
+         * Insn wasn't valid for ECI/ICI at all: undo what we
+         * just generated and instead emit an exception
+         */
+        tcg_remove_ops_after(dc->insn_eci_rewind);
+        dc->condjmp = 0;
+        gen_exception_insn(dc, dc->pc_curr, EXCP_INVSTATE, syn_uncategorized(),
+                           default_exception_el(dc));
+    }
+
     arm_post_translate_insn(dc);
 
     /* Thumb is a variable-length ISA.  Stop translation when the next insn
-- 
2.20.1

In commit a3494d4671797c we reworked the M-profile handling of its
checks for when the NOCP exception should be raised because the FPU
is disabled, so that (in line with the architecture) the NOCP check
is done early over a large range of the encoding space, and takes
precedence over UNDEF exceptions.  As part of this, we removed the
code from full_vfp_access_check() which raised an exception there for
M-profile with the FPU disabled, because it was no longer reachable.

For MVE, some instructions which are outside the "coprocessor space"
region of the encoding space must nonetheless do "is the FPU enabled"
checks and possibly raise a NOCP exception.  (In particular this
covers the MVE-specific low-overhead branch insns LCTP, DLSTP and
WLSTP.) To support these insns, reinstate the code in
full_vfp_access_check(), so that their trans functions can call
vfp_access_check() and get the correct behaviour.

Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Message-id: 20210614151007.4545-6-peter.maydell@linaro.org
---
 target/arm/translate-vfp.c | 20 +++++++++++++++-----
 1 file changed, 15 insertions(+), 5 deletions(-)

diff --git a/target/arm/translate-vfp.c b/target/arm/translate-vfp.c
index XXXXXXX..XXXXXXX 100644
--- a/target/arm/translate-vfp.c
+++ b/target/arm/translate-vfp.c
@@ -XXX,XX +XXX,XX @@ static void gen_preserve_fp_state(DisasContext *s)
 static bool full_vfp_access_check(DisasContext *s, bool ignore_vfp_enabled)
 {
     if (s->fp_excp_el) {
-        /* M-profile handled this earlier, in disas_m_nocp() */
-        assert (!arm_dc_feature(s, ARM_FEATURE_M));
-        gen_exception_insn(s, s->pc_curr, EXCP_UDEF,
-                           syn_fp_access_trap(1, 0xe, false),
-                           s->fp_excp_el);
+        if (arm_dc_feature(s, ARM_FEATURE_M)) {
+            /*
+             * M-profile mostly catches the "FPU disabled" case early, in
+             * disas_m_nocp(), but a few insns (eg LCTP, WLSTP, DLSTP)
+             * which do coprocessor-checks are outside the large ranges of
+             * the encoding space handled by the patterns in m-nocp.decode,
+             * and for them we may need to raise NOCP here.
+             */
+            gen_exception_insn(s, s->pc_curr, EXCP_NOCP,
+                               syn_uncategorized(), s->fp_excp_el);
+        } else {
+            gen_exception_insn(s, s->pc_curr, EXCP_UDEF,
+                               syn_fp_access_trap(1, 0xe, false),
+                               s->fp_excp_el);
+        }
         return false;
     }
 
-- 
2.20.1

Implement the MVE LCTP instruction.

We put its decode and implementation with the other
low-overhead-branch insns because, although it is only present if
MVE is implemented, it is logically in the same group as the other
LOB insns.

Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Message-id: 20210614151007.4545-7-peter.maydell@linaro.org
---
 target/arm/t32.decode  |  2 ++
 target/arm/translate.c | 24 ++++++++++++++++++++++++
 2 files changed, 26 insertions(+)

Implement the MVE WLSTP insn; this is like the existing WLS insn,
except that it specifies a size value which is used to set
FPSCR.LTPSIZE.

Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Message-id: 20210614151007.4545-8-peter.maydell@linaro.org
---
 target/arm/t32.decode  |  8 ++++++--
 target/arm/translate.c | 37 ++++++++++++++++++++++++++++++++++++-
 2 files changed, 42 insertions(+), 3 deletions(-)

Implement the MVE DLSTP insn; this is like the existing DLS
insn, except that it must do an FPU access check and it
sets LTPSIZE to the value specified in the insn.

Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Message-id: 20210614151007.4545-9-peter.maydell@linaro.org
---
 target/arm/t32.decode  |  9 ++++++---
 target/arm/translate.c | 23 +++++++++++++++++++++--
 2 files changed, 27 insertions(+), 5 deletions(-)

diff --git a/target/arm/t32.decode b/target/arm/t32.decode
index XXXXXXX..XXXXXXX 100644
--- a/target/arm/t32.decode
+++ b/target/arm/t32.decode
@@ -XXX,XX +XXX,XX @@ BL               1111 0. .......... 11.1 ............         @branch24
     # LE and WLS immediate
     %lob_imm 1:10 11:1 !function=times_2
 
-    DLS          1111 0 0000 100     rn:4 1110 0000 0000 0001
+    DLS          1111 0 0000 100     rn:4 1110 0000 0000 0001 size=4
     WLS          1111 0 0000 100     rn:4 1100 . .......... 1 imm=%lob_imm size=4
     {
       LE         1111 0 0000 0 f:1 0 1111 1100 . .......... 1 imm=%lob_imm
       # This is WLSTP
       WLS        1111 0 0000 0 size:2 rn:4 1100 . .......... 1 imm=%lob_imm
     }
-
-    LCTP         1111 0 0000 000     1111 1110 0000 0000 0001
+    {
+      LCTP       1111 0 0000 000     1111 1110 0000 0000 0001
+      # This is DLSTP
+      DLS        1111 0 0000 0 size:2 rn:4 1110 0000 0000 0001
+    }
   ]
 }
diff --git a/target/arm/translate.c b/target/arm/translate.c
index XXXXXXX..XXXXXXX 100644
--- a/target/arm/translate.c
+++ b/target/arm/translate.c
@@ -XXX,XX +XXX,XX @@ static bool trans_DLS(DisasContext *s, arg_DLS *a)
         return false;
     }
     if (a->rn == 13 || a->rn == 15) {
-        /* CONSTRAINED UNPREDICTABLE: we choose to UNDEF */
+        /*
+         * For DLSTP rn == 15 is a related encoding (LCTP); the
+         * other cases caught by this condition are all
+         * CONSTRAINED UNPREDICTABLE: we choose to UNDEF
+         */
         return false;
     }
 
-    /* Not a while loop, no tail predication: just set LR to the count */
+    if (a->size != 4) {
+        /* DLSTP */
+        if (!dc_isar_feature(aa32_mve, s)) {
+            return false;
+        }
+        if (!vfp_access_check(s)) {
+            return true;
+        }
+    }
+
+    /* Not a while loop: set LR to the count, and set LTPSIZE for DLSTP */
     tmp = load_reg(s, a->rn);
     store_reg(s, 14, tmp);
+    if (a->size != 4) {
+        /* DLSTP: set FPSCR.LTPSIZE */
+        tmp = tcg_const_i32(a->size);
+        store_cpu_field(tmp, v7m.ltpsize);
+    }
     return true;
 }
 
-- 
2.20.1

Implement the MVE LETP insn.  This is like the existing LE loop-end
insn, but it must perform an FPU-enabled check, and on loop-exit it
resets LTPSIZE to 4.

To accommodate the requirement to do something on loop-exit, we drop
the use of condlabel and instead manage both the TB exits manually,
in the same way we already do in trans_WLS().

The other MVE-specific change to the LE insn is that we must raise an
INVSTATE UsageFault if LTPSIZE is not 4.

Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Message-id: 20210614151007.4545-10-peter.maydell@linaro.org
---
 target/arm/t32.decode  |   2 +-
 target/arm/translate.c | 104 +++++++++++++++++++++++++++++++++++++----
 2 files changed, 97 insertions(+), 9 deletions(-)

diff --git a/target/arm/t32.decode b/target/arm/t32.decode
index XXXXXXX..XXXXXXX 100644
--- a/target/arm/t32.decode
+++ b/target/arm/t32.decode
@@ -XXX,XX +XXX,XX @@ BL               1111 0. .......... 11.1 ............         @branch24
     DLS          1111 0 0000 100     rn:4 1110 0000 0000 0001 size=4
     WLS          1111 0 0000 100     rn:4 1100 . .......... 1 imm=%lob_imm size=4
     {
-      LE         1111 0 0000 0 f:1 0 1111 1100 . .......... 1 imm=%lob_imm
+      LE         1111 0 0000 0 f:1 tp:1 1111 1100 . .......... 1 imm=%lob_imm
       # This is WLSTP
       WLS        1111 0 0000 0 size:2 rn:4 1100 . .......... 1 imm=%lob_imm
     }
diff --git a/target/arm/translate.c b/target/arm/translate.c
index XXXXXXX..XXXXXXX 100644
--- a/target/arm/translate.c
+++ b/target/arm/translate.c
@@ -XXX,XX +XXX,XX @@ static bool trans_LE(DisasContext *s, arg_LE *a)
      * any faster.
      */
     TCGv_i32 tmp;
+    TCGLabel *loopend;
+    bool fpu_active;
 
     if (!dc_isar_feature(aa32_lob, s)) {
         return false;
     }
+    if (a->f && a->tp) {
+        return false;
+    }
+    if (s->condexec_mask) {
+        /*
+         * LE in an IT block is CONSTRAINED UNPREDICTABLE;
+         * we choose to UNDEF, because otherwise our use of
+         * gen_goto_tb(1) would clash with the use of TB exit 1
+         * in the dc->condjmp condition-failed codepath in
+         * arm_tr_tb_stop() and we'd get an assertion.
+         */
+        return false;
+    }
+    if (a->tp) {
+        /* LETP */
+        if (!dc_isar_feature(aa32_mve, s)) {
+            return false;
+        }
+        if (!vfp_access_check(s)) {
+            s->eci_handled = true;
+            return true;
+        }
+    }
 
     /* LE/LETP is OK with ECI set and leaves it untouched */
     s->eci_handled = true;
 
-    if (!a->f) {
-        /* Not loop-forever. If LR <= 1 this is the last loop: do nothing. */
-        arm_gen_condlabel(s);
-        tcg_gen_brcondi_i32(TCG_COND_LEU, cpu_R[14], 1, s->condlabel);
-        /* Decrement LR */
-        tmp = load_reg(s, 14);
-        tcg_gen_addi_i32(tmp, tmp, -1);
-        store_reg(s, 14, tmp);
+    /*
+     * With MVE, LTPSIZE might not be 4, and we must emit an INVSTATE
+     * UsageFault exception for the LE insn in that case. Note that we
+     * are not directly checking FPSCR.LTPSIZE but instead check the
+     * pseudocode LTPSIZE() function, which returns 4 if the FPU is
+     * not currently active (ie ActiveFPState() returns false). We
+     * can identify not-active purely from our TB state flags, as the
+     * FPU is active only if:
+     *  the FPU is enabled
+     *  AND lazy state preservation is not active
+     *  AND we do not need a new fp context (this is the ASPEN/FPCA check)
+     *
+     * Usually we don't need to care about this distinction between
+     * LTPSIZE and FPSCR.LTPSIZE, because the code in vfp_access_check()
+     * will either take an exception or clear the conditions that make
+     * the FPU not active. But LE is an unusual case of a non-FP insn
+     * that looks at LTPSIZE.
+     */
+    fpu_active = !s->fp_excp_el && !s->v7m_lspact && !s->v7m_new_fp_ctxt_needed;
+
+    if (!a->tp && dc_isar_feature(aa32_mve, s) && fpu_active) {
+        /* Need to do a runtime check for LTPSIZE != 4 */
+        TCGLabel *skipexc = gen_new_label();
+        tmp = load_cpu_field(v7m.ltpsize);
+        tcg_gen_brcondi_i32(TCG_COND_EQ, tmp, 4, skipexc);
+        tcg_temp_free_i32(tmp);
+        gen_exception_insn(s, s->pc_curr, EXCP_INVSTATE, syn_uncategorized(),
+                           default_exception_el(s));
+        gen_set_label(skipexc);
+    }
+
+    if (a->f) {
+        /* Loop-forever: just jump back to the loop start */
+        gen_jmp(s, read_pc(s) - a->imm);
+        return true;
+    }
+
+    /*
+     * Not loop-forever. If LR <= loop-decrement-value this is the last loop.
+     * For LE, we know at this point that LTPSIZE must be 4 and the
+     * loop decrement value is 1. For LETP we need to calculate the decrement
+     * value from LTPSIZE.
+     */
+    loopend = gen_new_label();
+    if (!a->tp) {
+        tcg_gen_brcondi_i32(TCG_COND_LEU, cpu_R[14], 1, loopend);
+        tcg_gen_addi_i32(cpu_R[14], cpu_R[14], -1);
+    } else {
+        /*
+         * Decrement by 1 << (4 - LTPSIZE). We need to use a TCG local
+         * so that decr stays live after the brcondi.
+         */
+        TCGv_i32 decr = tcg_temp_local_new_i32();
+        TCGv_i32 ltpsize = load_cpu_field(v7m.ltpsize);
+        tcg_gen_sub_i32(decr, tcg_constant_i32(4), ltpsize);
+        tcg_gen_shl_i32(decr, tcg_constant_i32(1), decr);
+        tcg_temp_free_i32(ltpsize);
+
+        tcg_gen_brcond_i32(TCG_COND_LEU, cpu_R[14], decr, loopend);
+
+        tcg_gen_sub_i32(cpu_R[14], cpu_R[14], decr);
+        tcg_temp_free_i32(decr);
     }
     /* Jump back to the loop start */
     gen_jmp(s, read_pc(s) - a->imm);
+
+    gen_set_label(loopend);
+    if (a->tp) {
+        /* Exits from tail-pred loops must reset LTPSIZE to 4 */
+        tmp = tcg_const_i32(4);
+        store_cpu_field(tmp, v7m.ltpsize);
+    }
+    /* End TB, continuing to following insn */
+    gen_jmp_tb(s, s->base.pc_next, 1);
     return true;
 }
 
-- 
2.20.1

Add the framework for decoding MVE insns, with the necessary new
files and the meson.build rules, but no actual content yet.

Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Message-id: 20210614151007.4545-11-peter.maydell@linaro.org
---
 target/arm/translate-a32.h |  1 +
 target/arm/mve.decode      | 20 ++++++++++++++++++++
 target/arm/translate-mve.c | 29 +++++++++++++++++++++++++++++
 target/arm/translate.c     |  1 +
 target/arm/meson.build     |  2 ++
 5 files changed, 53 insertions(+)
 create mode 100644 target/arm/mve.decode
 create mode 100644 target/arm/translate-mve.c

diff --git a/target/arm/translate-a32.h b/target/arm/translate-a32.h
index XXXXXXX..XXXXXXX 100644
--- a/target/arm/translate-a32.h
+++ b/target/arm/translate-a32.h
@@ -XXX,XX +XXX,XX @@
 
 /* Prototypes for autogenerated disassembler functions */
 bool disas_m_nocp(DisasContext *dc, uint32_t insn);
+bool disas_mve(DisasContext *dc, uint32_t insn);
 bool disas_vfp(DisasContext *s, uint32_t insn);
 bool disas_vfp_uncond(DisasContext *s, uint32_t insn);
 bool disas_neon_dp(DisasContext *s, uint32_t insn);
diff --git a/target/arm/mve.decode b/target/arm/mve.decode
new file mode 100644
index XXXXXXX..XXXXXXX
--- /dev/null
+++ b/target/arm/mve.decode
@@ -XXX,XX +XXX,XX @@
+# M-profile MVE instruction descriptions
+#
+#  Copyright (c) 2021 Linaro, Ltd
+#
+# This library is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public
+# License as published by the Free Software Foundation; either
+# version 2.1 of the License, or (at your option) any later version.
+#
+# This library is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with this library; if not, see <http://www.gnu.org/licenses/>.
+
+#
+# This file is processed by scripts/decodetree.py
+#
diff --git a/target/arm/translate-mve.c b/target/arm/translate-mve.c
new file mode 100644
index XXXXXXX..XXXXXXX
--- /dev/null
+++ b/target/arm/translate-mve.c
@@ -XXX,XX +XXX,XX @@
+/*
+ *  ARM translation: M-profile MVE instructions
+ *
+ *  Copyright (c) 2021 Linaro, Ltd.
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "qemu/osdep.h"
+#include "tcg/tcg-op.h"
+#include "tcg/tcg-op-gvec.h"
+#include "exec/exec-all.h"
+#include "exec/gen-icount.h"
+#include "translate.h"
+#include "translate-a32.h"
+
+/* Include the generated decoder */
+#include "decode-mve.c.inc"
diff --git a/target/arm/translate.c b/target/arm/translate.c
index XXXXXXX..XXXXXXX 100644
--- a/target/arm/translate.c
+++ b/target/arm/translate.c
@@ -XXX,XX +XXX,XX @@ static void disas_thumb2_insn(DisasContext *s, uint32_t insn)
     if (disas_t32(s, insn) ||
         disas_vfp_uncond(s, insn) ||
         disas_neon_shared(s, insn) ||
+        disas_mve(s, insn) ||
         ((insn >> 28) == 0xe && disas_vfp(s, insn))) {
         return;
     }
diff --git a/target/arm/meson.build b/target/arm/meson.build
index XXXXXXX..XXXXXXX 100644
--- a/target/arm/meson.build
+++ b/target/arm/meson.build
@@ -XXX,XX +XXX,XX @@ gen = [
   decodetree.process('vfp.decode', extra_args: '--decode=disas_vfp'),
   decodetree.process('vfp-uncond.decode', extra_args: '--decode=disas_vfp_uncond'),
   decodetree.process('m-nocp.decode', extra_args: '--decode=disas_m_nocp'),
+  decodetree.process('mve.decode', extra_args: '--decode=disas_mve'),
   decodetree.process('a32.decode', extra_args: '--static-decode=disas_a32'),
   decodetree.process('a32-uncond.decode', extra_args: '--static-decode=disas_a32_uncond'),
   decodetree.process('t32.decode', extra_args: '--static-decode=disas_t32'),
@@ -XXX,XX +XXX,XX @@ arm_ss.add(files(
   'tlb_helper.c',
   'translate.c',
   'translate-m-nocp.c',
+  'translate-mve.c',
   'translate-neon.c',
   'translate-vfp.c',
   'vec_helper.c',
-- 
2.20.1

For MVE, we want to re-use the large data table from expand_pred_b().
Move the data table to vec_helper.c so it is no longer in an SVE
specific source file.

Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Message-id: 20210614151007.4545-14-peter.maydell@linaro.org
---
 target/arm/vec_internal.h |   3 ++
 target/arm/sve_helper.c   | 103 ++------------------------------------
 target/arm/vec_helper.c   | 102 +++++++++++++++++++++++++++++++++++++
 3 files changed, 109 insertions(+), 99 deletions(-)

diff --git a/target/arm/vec_internal.h b/target/arm/vec_internal.h
index XXXXXXX..XXXXXXX 100644
--- a/target/arm/vec_internal.h
+++ b/target/arm/vec_internal.h
@@ -XXX,XX +XXX,XX @@
 #define H8(x)   (x)
 #define H1_8(x) (x)
 
+/* Data for expanding active predicate bits to bytes, for byte elements. */
+extern const uint64_t expand_pred_b_data[256];
+
 static inline void clear_tail(void *vd, uintptr_t opr_sz, uintptr_t max_sz)
 {
     uint64_t *d = vd + opr_sz;
diff --git a/target/arm/sve_helper.c b/target/arm/sve_helper.c
index XXXXXXX..XXXXXXX 100644
--- a/target/arm/sve_helper.c
+++ b/target/arm/sve_helper.c
@@ -XXX,XX +XXX,XX @@ uint32_t HELPER(sve_predtest)(void *vd, void *vg, uint32_t words)
     return flags;
 }
 
-/* Expand active predicate bits to bytes, for byte elements.
- *  for (i = 0; i < 256; ++i) {
- *      unsigned long m = 0;
- *      for (j = 0; j < 8; j++) {
- *          if ((i >> j) & 1) {
- *              m |= 0xfful << (j << 3);
- *          }
- *      }
- *      printf("0x%016lx,\n", m);
- *  }
+/*
+ * Expand active predicate bits to bytes, for byte elements.
+ * (The data table itself is in vec_helper.c as MVE also needs it.)
  */
 static inline uint64_t expand_pred_b(uint8_t byte)
 {
-    static const uint64_t word[256] = {
-        0x0000000000000000, 0x00000000000000ff, 0x000000000000ff00,
-        0x000000000000ffff, 0x0000000000ff0000, 0x0000000000ff00ff,
-        0x0000000000ffff00, 0x0000000000ffffff, 0x00000000ff000000,
-        0x00000000ff0000ff, 0x00000000ff00ff00, 0x00000000ff00ffff,
-        0x00000000ffff0000, 0x00000000ffff00ff, 0x00000000ffffff00,
-        0x00000000ffffffff, 0x000000ff00000000, 0x000000ff000000ff,
-        0x000000ff0000ff00, 0x000000ff0000ffff, 0x000000ff00ff0000,
-        0x000000ff00ff00ff, 0x000000ff00ffff00, 0x000000ff00ffffff,
-        0x000000ffff000000, 0x000000ffff0000ff, 0x000000ffff00ff00,
-        0x000000ffff00ffff, 0x000000ffffff0000, 0x000000ffffff00ff,
-        0x000000ffffffff00, 0x000000ffffffffff, 0x0000ff0000000000,
-        0x0000ff00000000ff, 0x0000ff000000ff00, 0x0000ff000000ffff,
-        0x0000ff0000ff0000, 0x0000ff0000ff00ff, 0x0000ff0000ffff00,
-        0x0000ff0000ffffff, 0x0000ff00ff000000, 0x0000ff00ff0000ff,
-        0x0000ff00ff00ff00, 0x0000ff00ff00ffff, 0x0000ff00ffff0000,
-        0x0000ff00ffff00ff, 0x0000ff00ffffff00, 0x0000ff00ffffffff,
-        0x0000ffff00000000, 0x0000ffff000000ff, 0x0000ffff0000ff00,
-        0x0000ffff0000ffff, 0x0000ffff00ff0000, 0x0000ffff00ff00ff,
-        0x0000ffff00ffff00, 0x0000ffff00ffffff, 0x0000ffffff000000,
-        0x0000ffffff0000ff, 0x0000ffffff00ff00, 0x0000ffffff00ffff,
-        0x0000ffffffff0000, 0x0000ffffffff00ff, 0x0000ffffffffff00,
-        0x0000ffffffffffff, 0x00ff000000000000, 0x00ff0000000000ff,
-        0x00ff00000000ff00, 0x00ff00000000ffff, 0x00ff000000ff0000,
-        0x00ff000000ff00ff, 0x00ff000000ffff00, 0x00ff000000ffffff,
-        0x00ff0000ff000000, 0x00ff0000ff0000ff, 0x00ff0000ff00ff00,
-        0x00ff0000ff00ffff, 0x00ff0000ffff0000, 0x00ff0000ffff00ff,
-        0x00ff0000ffffff00, 0x00ff0000ffffffff, 0x00ff00ff00000000,
-        0x00ff00ff000000ff, 0x00ff00ff0000ff00, 0x00ff00ff0000ffff,
-        0x00ff00ff00ff0000, 0x00ff00ff00ff00ff, 0x00ff00ff00ffff00,
-        0x00ff00ff00ffffff, 0x00ff00ffff000000, 0x00ff00ffff0000ff,
-        0x00ff00ffff00ff00, 0x00ff00ffff00ffff, 0x00ff00ffffff0000,
-        0x00ff00ffffff00ff, 0x00ff00ffffffff00, 0x00ff00ffffffffff,
-        0x00ffff0000000000, 0x00ffff00000000ff, 0x00ffff000000ff00,
-        0x00ffff000000ffff, 0x00ffff0000ff0000, 0x00ffff0000ff00ff,
-        0x00ffff0000ffff00, 0x00ffff0000ffffff, 0x00ffff00ff000000,
-        0x00ffff00ff0000ff, 0x00ffff00ff00ff00, 0x00ffff00ff00ffff,
-        0x00ffff00ffff0000, 0x00ffff00ffff00ff, 0x00ffff00ffffff00,
-        0x00ffff00ffffffff, 0x00ffffff00000000, 0x00ffffff000000ff,
-        0x00ffffff0000ff00, 0x00ffffff0000ffff, 0x00ffffff00ff0000,
-        0x00ffffff00ff00ff, 0x00ffffff00ffff00, 0x00ffffff00ffffff,
-        0x00ffffffff000000, 0x00ffffffff0000ff, 0x00ffffffff00ff00,
-        0x00ffffffff00ffff, 0x00ffffffffff0000, 0x00ffffffffff00ff,
-        0x00ffffffffffff00, 0x00ffffffffffffff, 0xff00000000000000,
-        0xff000000000000ff, 0xff0000000000ff00, 0xff0000000000ffff,
-        0xff00000000ff0000, 0xff00000000ff00ff, 0xff00000000ffff00,
-        0xff00000000ffffff, 0xff000000ff000000, 0xff000000ff0000ff,
-        0xff000000ff00ff00, 0xff000000ff00ffff, 0xff000000ffff0000,
-        0xff000000ffff00ff, 0xff000000ffffff00, 0xff000000ffffffff,
-        0xff0000ff00000000, 0xff0000ff000000ff, 0xff0000ff0000ff00,
-        0xff0000ff0000ffff, 0xff0000ff00ff0000, 0xff0000ff00ff00ff,
-        0xff0000ff00ffff00, 0xff0000ff00ffffff, 0xff0000ffff000000,
-        0xff0000ffff0000ff, 0xff0000ffff00ff00, 0xff0000ffff00ffff,
-        0xff0000ffffff0000, 0xff0000ffffff00ff, 0xff0000ffffffff00,
-        0xff0000ffffffffff, 0xff00ff0000000000, 0xff00ff00000000ff,
-        0xff00ff000000ff00, 0xff00ff000000ffff, 0xff00ff0000ff0000,
-        0xff00ff0000ff00ff, 0xff00ff0000ffff00, 0xff00ff0000ffffff,
-        0xff00ff00ff000000, 0xff00ff00ff0000ff, 0xff00ff00ff00ff00,
-        0xff00ff00ff00ffff, 0xff00ff00ffff0000, 0xff00ff00ffff00ff,
-        0xff00ff00ffffff00, 0xff00ff00ffffffff, 0xff00ffff00000000,
-        0xff00ffff000000ff, 0xff00ffff0000ff00, 0xff00ffff0000ffff,
-        0xff00ffff00ff0000, 0xff00ffff00ff00ff, 0xff00ffff00ffff00,
-        0xff00ffff00ffffff, 0xff00ffffff000000, 0xff00ffffff0000ff,
-        0xff00ffffff00ff00, 0xff00ffffff00ffff, 0xff00ffffffff0000,
-        0xff00ffffffff00ff, 0xff00ffffffffff00, 0xff00ffffffffffff,
-        0xffff000000000000, 0xffff0000000000ff, 0xffff00000000ff00,
-        0xffff00000000ffff, 0xffff000000ff0000, 0xffff000000ff00ff,
-        0xffff000000ffff00, 0xffff000000ffffff, 0xffff0000ff000000,
-        0xffff0000ff0000ff, 0xffff0000ff00ff00, 0xffff0000ff00ffff,
-        0xffff0000ffff0000, 0xffff0000ffff00ff, 0xffff0000ffffff00,
-        0xffff0000ffffffff, 0xffff00ff00000000, 0xffff00ff000000ff,
-        0xffff00ff0000ff00, 0xffff00ff0000ffff, 0xffff00ff00ff0000,
-        0xffff00ff00ff00ff, 0xffff00ff00ffff00, 0xffff00ff00ffffff,
-        0xffff00ffff000000, 0xffff00ffff0000ff, 0xffff00ffff00ff00,
-        0xffff00ffff00ffff, 0xffff00ffffff0000, 0xffff00ffffff00ff,
-        0xffff00ffffffff00, 0xffff00ffffffffff, 0xffffff0000000000,
-        0xffffff00000000ff, 0xffffff000000ff00, 0xffffff000000ffff,
-        0xffffff0000ff0000, 0xffffff0000ff00ff, 0xffffff0000ffff00,
-        0xffffff0000ffffff, 0xffffff00ff000000, 0xffffff00ff0000ff,
-        0xffffff00ff00ff00, 0xffffff00ff00ffff, 0xffffff00ffff0000,
-        0xffffff00ffff00ff, 0xffffff00ffffff00, 0xffffff00ffffffff,
-        0xffffffff00000000, 0xffffffff000000ff, 0xffffffff0000ff00,
-        0xffffffff0000ffff, 0xffffffff00ff0000, 0xffffffff00ff00ff,
-        0xffffffff00ffff00, 0xffffffff00ffffff, 0xffffffffff000000,
-        0xffffffffff0000ff, 0xffffffffff00ff00, 0xffffffffff00ffff,
-        0xffffffffffff0000, 0xffffffffffff00ff, 0xffffffffffffff00,
-        0xffffffffffffffff,
-    };
-    return word[byte];
+    return expand_pred_b_data[byte];
 }
 
 /* Similarly for half-word elements.
diff --git a/target/arm/vec_helper.c b/target/arm/vec_helper.c
index XXXXXXX..XXXXXXX 100644
--- a/target/arm/vec_helper.c
+++ b/target/arm/vec_helper.c
@@ -XXX,XX +XXX,XX @@
 #include "qemu/int128.h"
 #include "vec_internal.h"
 
+/*
+ * Data for expanding active predicate bits to bytes, for byte elements.
+ *
+ *  for (i = 0; i < 256; ++i) {
+ *      uint64_t m = 0;
+ *      for (j = 0; j < 8; j++) {
+ *          if ((i >> j) & 1) {
+ *              m |= (uint64_t)0xff << (j << 3);
+ *          }
+ *      }
+ *      printf("0x%016" PRIx64 ",\n", m);
+ *  }
+ */
+const uint64_t expand_pred_b_data[256] = {
+    0x0000000000000000, 0x00000000000000ff, 0x000000000000ff00,
+    0x000000000000ffff, 0x0000000000ff0000, 0x0000000000ff00ff,
+    0x0000000000ffff00, 0x0000000000ffffff, 0x00000000ff000000,
+    0x00000000ff0000ff, 0x00000000ff00ff00, 0x00000000ff00ffff,
+    0x00000000ffff0000, 0x00000000ffff00ff, 0x00000000ffffff00,
+    0x00000000ffffffff, 0x000000ff00000000, 0x000000ff000000ff,
+    0x000000ff0000ff00, 0x000000ff0000ffff, 0x000000ff00ff0000,
+    0x000000ff00ff00ff, 0x000000ff00ffff00, 0x000000ff00ffffff,
+    0x000000ffff000000, 0x000000ffff0000ff, 0x000000ffff00ff00,
+    0x000000ffff00ffff, 0x000000ffffff0000, 0x000000ffffff00ff,
+    0x000000ffffffff00, 0x000000ffffffffff, 0x0000ff0000000000,
+    0x0000ff00000000ff, 0x0000ff000000ff00, 0x0000ff000000ffff,
+    0x0000ff0000ff0000, 0x0000ff0000ff00ff, 0x0000ff0000ffff00,
+    0x0000ff0000ffffff, 0x0000ff00ff000000, 0x0000ff00ff0000ff,
+    0x0000ff00ff00ff00, 0x0000ff00ff00ffff, 0x0000ff00ffff0000,
+    0x0000ff00ffff00ff, 0x0000ff00ffffff00, 0x0000ff00ffffffff,
+    0x0000ffff00000000, 0x0000ffff000000ff, 0x0000ffff0000ff00,
+    0x0000ffff0000ffff, 0x0000ffff00ff0000, 0x0000ffff00ff00ff,
+    0x0000ffff00ffff00, 0x0000ffff00ffffff, 0x0000ffffff000000,
+    0x0000ffffff0000ff, 0x0000ffffff00ff00, 0x0000ffffff00ffff,
+    0x0000ffffffff0000, 0x0000ffffffff00ff, 0x0000ffffffffff00,
+    0x0000ffffffffffff, 0x00ff000000000000, 0x00ff0000000000ff,
+    0x00ff00000000ff00, 0x00ff00000000ffff, 0x00ff000000ff0000,
+    0x00ff000000ff00ff, 0x00ff000000ffff00, 0x00ff000000ffffff,
+    0x00ff0000ff000000, 0x00ff0000ff0000ff, 0x00ff0000ff00ff00,
+    0x00ff0000ff00ffff, 0x00ff0000ffff0000, 0x00ff0000ffff00ff,
+    0x00ff0000ffffff00, 0x00ff0000ffffffff, 0x00ff00ff00000000,
+    0x00ff00ff000000ff, 0x00ff00ff0000ff00, 0x00ff00ff0000ffff,
+    0x00ff00ff00ff0000, 0x00ff00ff00ff00ff, 0x00ff00ff00ffff00,
+    0x00ff00ff00ffffff, 0x00ff00ffff000000, 0x00ff00ffff0000ff,
+    0x00ff00ffff00ff00, 0x00ff00ffff00ffff, 0x00ff00ffffff0000,
+    0x00ff00ffffff00ff, 0x00ff00ffffffff00, 0x00ff00ffffffffff,
+    0x00ffff0000000000, 0x00ffff00000000ff, 0x00ffff000000ff00,
+    0x00ffff000000ffff, 0x00ffff0000ff0000, 0x00ffff0000ff00ff,
+    0x00ffff0000ffff00, 0x00ffff0000ffffff, 0x00ffff00ff000000,
+    0x00ffff00ff0000ff, 0x00ffff00ff00ff00, 0x00ffff00ff00ffff,
+    0x00ffff00ffff0000, 0x00ffff00ffff00ff, 0x00ffff00ffffff00,
+    0x00ffff00ffffffff, 0x00ffffff00000000, 0x00ffffff000000ff,
+    0x00ffffff0000ff00, 0x00ffffff0000ffff, 0x00ffffff00ff0000,
+    0x00ffffff00ff00ff, 0x00ffffff00ffff00, 0x00ffffff00ffffff,
+    0x00ffffffff000000, 0x00ffffffff0000ff, 0x00ffffffff00ff00,
+    0x00ffffffff00ffff, 0x00ffffffffff0000, 0x00ffffffffff00ff,
+    0x00ffffffffffff00, 0x00ffffffffffffff, 0xff00000000000000,
+    0xff000000000000ff, 0xff0000000000ff00, 0xff0000000000ffff,
+    0xff00000000ff0000, 0xff00000000ff00ff, 0xff00000000ffff00,
+    0xff00000000ffffff, 0xff000000ff000000, 0xff000000ff0000ff,
+    0xff000000ff00ff00, 0xff000000ff00ffff, 0xff000000ffff0000,
+    0xff000000ffff00ff, 0xff000000ffffff00, 0xff000000ffffffff,
+    0xff0000ff00000000, 0xff0000ff000000ff, 0xff0000ff0000ff00,
+    0xff0000ff0000ffff, 0xff0000ff00ff0000, 0xff0000ff00ff00ff,
+    0xff0000ff00ffff00, 0xff0000ff00ffffff, 0xff0000ffff000000,
+    0xff0000ffff0000ff, 0xff0000ffff00ff00, 0xff0000ffff00ffff,
+    0xff0000ffffff0000, 0xff0000ffffff00ff, 0xff0000ffffffff00,
+    0xff0000ffffffffff, 0xff00ff0000000000, 0xff00ff00000000ff,
+    0xff00ff000000ff00, 0xff00ff000000ffff, 0xff00ff0000ff0000,
+    0xff00ff0000ff00ff, 0xff00ff0000ffff00, 0xff00ff0000ffffff,
+    0xff00ff00ff000000, 0xff00ff00ff0000ff, 0xff00ff00ff00ff00,
+    0xff00ff00ff00ffff, 0xff00ff00ffff0000, 0xff00ff00ffff00ff,
+    0xff00ff00ffffff00, 0xff00ff00ffffffff, 0xff00ffff00000000,
+    0xff00ffff000000ff, 0xff00ffff0000ff00, 0xff00ffff0000ffff,
+    0xff00ffff00ff0000, 0xff00ffff00ff00ff, 0xff00ffff00ffff00,
+    0xff00ffff00ffffff, 0xff00ffffff000000, 0xff00ffffff0000ff,
+    0xff00ffffff00ff00, 0xff00ffffff00ffff, 0xff00ffffffff0000,
+    0xff00ffffffff00ff, 0xff00ffffffffff00, 0xff00ffffffffffff,
+    0xffff000000000000, 0xffff0000000000ff, 0xffff00000000ff00,
+    0xffff00000000ffff, 0xffff000000ff0000, 0xffff000000ff00ff,
+    0xffff000000ffff00, 0xffff000000ffffff, 0xffff0000ff000000,
+    0xffff0000ff0000ff, 0xffff0000ff00ff00, 0xffff0000ff00ffff,
+    0xffff0000ffff0000, 0xffff0000ffff00ff, 0xffff0000ffffff00,
+    0xffff0000ffffffff, 0xffff00ff00000000, 0xffff00ff000000ff,
+    0xffff00ff0000ff00, 0xffff00ff0000ffff, 0xffff00ff00ff0000,
+    0xffff00ff00ff00ff, 0xffff00ff00ffff00, 0xffff00ff00ffffff,
+    0xffff00ffff000000, 0xffff00ffff0000ff, 0xffff00ffff00ff00,
+    0xffff00ffff00ffff, 0xffff00ffffff0000, 0xffff00ffffff00ff,
+    0xffff00ffffffff00, 0xffff00ffffffffff, 0xffffff0000000000,
+    0xffffff00000000ff, 0xffffff000000ff00, 0xffffff000000ffff,
+    0xffffff0000ff0000, 0xffffff0000ff00ff, 0xffffff0000ffff00,
+    0xffffff0000ffffff, 0xffffff00ff000000, 0xffffff00ff0000ff,
+    0xffffff00ff00ff00, 0xffffff00ff00ffff, 0xffffff00ffff0000,
+    0xffffff00ffff00ff, 0xffffff00ffffff00, 0xffffff00ffffffff,
+    0xffffffff00000000, 0xffffffff000000ff, 0xffffffff0000ff00,
+    0xffffffff0000ffff, 0xffffffff00ff0000, 0xffffffff00ff00ff,
+    0xffffffff00ffff00, 0xffffffff00ffffff, 0xffffffffff000000,
+    0xffffffffff0000ff, 0xffffffffff00ff00, 0xffffffffff00ffff,
+    0xffffffffffff0000, 0xffffffffffff00ff, 0xffffffffffffff00,
+    0xffffffffffffffff,
+};
+
 /* Signed saturating rounding doubling multiply-accumulate high half, 8-bit */
 int8_t do_sqrdmlah_b(int8_t src1, int8_t src2, int8_t src3,
                      bool neg, bool round)
-- 
2.20.1

Currently the ARM SVE helper code locally defines some utility
functions for swapping 16-bit halfwords within 32-bit or 64-bit
values and for swapping 32-bit words within 64-bit values,
parallel to the byte-swapping bswap16/32/64 functions.

We want these also for the ARM MVE code, and they're potentially
generally useful for other targets, so move them to bitops.h.
(We don't put them in bswap.h with the bswap* functions because
they are implemented in terms of the rotate operations also
defined in bitops.h, and including bitops.h from bswap.h seems
better avoided.)

Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Message-id: 20210614151007.4545-17-peter.maydell@linaro.org
---
 include/qemu/bitops.h   | 29 +++++++++++++++++++++++++++++
 target/arm/sve_helper.c | 20 --------------------
 2 files changed, 29 insertions(+), 20 deletions(-)

diff --git a/include/qemu/bitops.h b/include/qemu/bitops.h
index XXXXXXX..XXXXXXX 100644
--- a/include/qemu/bitops.h
+++ b/include/qemu/bitops.h
@@ -XXX,XX +XXX,XX @@ static inline uint64_t ror64(uint64_t word, unsigned int shift)
     return (word >> shift) | (word << ((64 - shift) & 63));
 }
 
+/**
+ * hswap32 - swap 16-bit halfwords within a 32-bit value
+ * @h: value to swap
+ */
+static inline uint32_t hswap32(uint32_t h)
+{
+    return rol32(h, 16);
+}
+
+/**
+ * hswap64 - reverse the order of the 16-bit halfwords in a 64-bit value
+ * @h: value to swap
+ */
+static inline uint64_t hswap64(uint64_t h)
+{
+    uint64_t m = 0x0000ffff0000ffffull;
+    h = rol64(h, 32);
+    return ((h & m) << 16) | ((h >> 16) & m);
+}
+
+/**
+ * wswap64 - swap 32-bit words within a 64-bit value
+ * @h: value to swap
+ */
+static inline uint64_t wswap64(uint64_t h)
+{
+    return rol64(h, 32);
+}
+
 /**
  * extract32:
  * @value: the value to extract the bit field from
diff --git a/target/arm/sve_helper.c b/target/arm/sve_helper.c
index XXXXXXX..XXXXXXX 100644
--- a/target/arm/sve_helper.c
+++ b/target/arm/sve_helper.c
@@ -XXX,XX +XXX,XX @@ static inline uint64_t expand_pred_s(uint8_t byte)
     return word[byte & 0x11];
 }
 
-/* Swap 16-bit words within a 32-bit word.  */
-static inline uint32_t hswap32(uint32_t h)
-{
-    return rol32(h, 16);
-}
-
-/* Swap 16-bit words within a 64-bit word.  */
-static inline uint64_t hswap64(uint64_t h)
-{
-    uint64_t m = 0x0000ffff0000ffffull;
-    h = rol64(h, 32);
-    return ((h & m) << 16) | ((h >> 16) & m);
-}
-
-/* Swap 32-bit words within a 64-bit word.  */
-static inline uint64_t wswap64(uint64_t h)
-{
-    return rol64(h, 32);
-}
-
 #define LOGICAL_PPPP(NAME, FUNC) \
 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc)  \
 {                                                                         \
-- 
2.20.1

int128_make64() creates an Int128 from an unsigned 64 bit value; add
a function int128_makes64() creating an Int128 from a signed 64 bit
value.

Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Message-id: 20210614151007.4545-34-peter.maydell@linaro.org
---
 include/qemu/int128.h | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/include/qemu/int128.h b/include/qemu/int128.h
index XXXXXXX..XXXXXXX 100644
--- a/include/qemu/int128.h
+++ b/include/qemu/int128.h
@@ -XXX,XX +XXX,XX @@ static inline Int128 int128_make64(uint64_t a)
     return a;
 }
 
+static inline Int128 int128_makes64(int64_t a)
+{
+    return a;
+}
+
 static inline Int128 int128_make128(uint64_t lo, uint64_t hi)
 {
     return (__uint128_t)hi << 64 | lo;
@@ -XXX,XX +XXX,XX @@ static inline Int128 int128_make64(uint64_t a)
     return (Int128) { a, 0 };
 }
 
+static inline Int128 int128_makes64(int64_t a)
+{
+    return (Int128) { a, a >> 63 };
+}
+
 static inline Int128 int128_make128(uint64_t lo, uint64_t hi)
 {
     return (Int128) { lo, hi };
-- 
2.20.1