The following changes since commit 5a67d7735d4162630769ef495cf813244fc850df:

  Merge remote-tracking branch 'remotes/berrange-gitlab/tags/tls-deps-pull-request' into staging (2021-07-02 08:22:39 +0100)

are available in the Git repository at:

  https://git.linaro.org/people/pmaydell/qemu-arm.git tags/pull-target-arm-20210702

for you to fetch changes up to 04ea4d3cfd0a21b248ece8eb7a9436a3d9898dd8:

  target/arm: Implement MVE shifts by register (2021-07-02 11:48:38 +0100)

----------------------------------------------------------------
target-arm queue:
 * more MVE instructions
 * hw/gpio/gpio_pwr: use shutdown function for reboot
 * target/arm: Check NaN mode before silencing NaN
 * tests: Boot and halt a Linux guest on the Raspberry Pi 2 machine
 * hw/arm: Add basic power management to raspi.
 * docs/system/arm: Add quanta-gbs-bmc, quanta-q7l1-bmc

----------------------------------------------------------------
Joe Komlodi (1):
      target/arm: Check NaN mode before silencing NaN

Maxim Uvarov (1):
      hw/gpio/gpio_pwr: use shutdown function for reboot

Nolan Leake (1):
      hw/arm: Add basic power management to raspi.

Patrick Venture (2):
      docs/system/arm: Add quanta-q7l1-bmc reference
      docs/system/arm: Add quanta-gbs-bmc reference

Peter Maydell (18):
      target/arm: Fix MVE widening/narrowing VLDR/VSTR offset calculation
      target/arm: Fix bugs in MVE VRMLALDAVH, VRMLSLDAVH
      target/arm: Make asimd_imm_const() public
      target/arm: Use asimd_imm_const for A64 decode
      target/arm: Use dup_const() instead of bitfield_replicate()
      target/arm: Implement MVE logical immediate insns
      target/arm: Implement MVE vector shift left by immediate insns
      target/arm: Implement MVE vector shift right by immediate insns
      target/arm: Implement MVE VSHLL
      target/arm: Implement MVE VSRI, VSLI
      target/arm: Implement MVE VSHRN, VRSHRN
      target/arm: Implement MVE saturating narrowing shifts
      target/arm: Implement MVE VSHLC
      target/arm: Implement MVE VADDLV
      target/arm: Implement MVE long shifts by immediate
      target/arm: Implement MVE long shifts by register
      target/arm: Implement MVE shifts by immediate
      target/arm: Implement MVE shifts by register

Philippe Mathieu-Daudé (1):
      tests: Boot and halt a Linux guest on the Raspberry Pi 2 machine

 docs/system/arm/aspeed.rst             |   1 +
 docs/system/arm/nuvoton.rst            |   5 +-
 include/hw/arm/bcm2835_peripherals.h   |   3 +-
 include/hw/misc/bcm2835_powermgt.h     |  29 ++
 target/arm/helper-mve.h                | 108 +++++++
 target/arm/translate.h                 |  41 +++
 target/arm/mve.decode                  | 177 ++++++++++-
 target/arm/t32.decode                  |  71 ++++-
 hw/arm/bcm2835_peripherals.c           |  13 +-
 hw/gpio/gpio_pwr.c                     |   2 +-
 hw/misc/bcm2835_powermgt.c             | 160 ++++++++++
 target/arm/helper-a64.c                |  12 +-
 target/arm/mve_helper.c                | 524 +++++++++++++++++++++++++++++++--
 target/arm/translate-a64.c             |  86 +-----
 target/arm/translate-mve.c             | 261 +++++++++++++++-
 target/arm/translate-neon.c            |  81 -----
 target/arm/translate.c                 | 327 +++++++++++++++++++-
 target/arm/vfp_helper.c                |  24 +-
 hw/misc/meson.build                    |   1 +
 tests/acceptance/boot_linux_console.py |  43 +++
 20 files changed, 1760 insertions(+), 209 deletions(-)
 create mode 100644 include/hw/misc/bcm2835_powermgt.h
 create mode 100644 hw/misc/bcm2835_powermgt.c
From: Patrick Venture <venture@google.com>

Adds a line-item reference to the supported quanta-q71l-bmc aspeed
entry.

Signed-off-by: Patrick Venture <venture@google.com>
Reviewed-by: Cédric Le Goater <clg@kaod.org>
Message-id: 20210615192848.1065297-2-venture@google.com
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
---
 docs/system/arm/aspeed.rst | 1 +
 1 file changed, 1 insertion(+)

diff --git a/docs/system/arm/aspeed.rst b/docs/system/arm/aspeed.rst
index XXXXXXX..XXXXXXX 100644
--- a/docs/system/arm/aspeed.rst
+++ b/docs/system/arm/aspeed.rst
@@ -XXX,XX +XXX,XX @@ etc.
 AST2400 SoC based machines :

 - ``palmetto-bmc``         OpenPOWER Palmetto POWER8 BMC
+- ``quanta-q71l-bmc``      OpenBMC Quanta BMC

 AST2500 SoC based machines :
--
2.20.1
From: Patrick Venture <venture@google.com>

Add line item reference to quanta-gbs-bmc machine.

Signed-off-by: Patrick Venture <venture@google.com>
Reviewed-by: Cédric Le Goater <clg@kaod.org>
Message-id: 20210615192848.1065297-3-venture@google.com
[PMM: fixed underline Sphinx warning]
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
---
 docs/system/arm/nuvoton.rst | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/docs/system/arm/nuvoton.rst b/docs/system/arm/nuvoton.rst
index XXXXXXX..XXXXXXX 100644
--- a/docs/system/arm/nuvoton.rst
+++ b/docs/system/arm/nuvoton.rst
@@ -XXX,XX +XXX,XX @@
-Nuvoton iBMC boards (``npcm750-evb``, ``quanta-gsj``)
-=====================================================
+Nuvoton iBMC boards (``*-bmc``, ``npcm750-evb``, ``quanta-gsj``)
+================================================================

 The `Nuvoton iBMC`_ chips (NPCM7xx) are a family of ARM-based SoCs that are
 designed to be used as Baseboard Management Controllers (BMCs) in various
@@ -XXX,XX +XXX,XX @@ segment. The following machines are based on this chip :
 The NPCM730 SoC has two Cortex-A9 cores and is targeted for Data Center and
 Hyperscale applications. The following machines are based on this chip :

+- ``quanta-gbs-bmc``    Quanta GBS server BMC
 - ``quanta-gsj``        Quanta GSJ server BMC

 There are also two more SoCs, NPCM710 and NPCM705, which are single-core
--
2.20.1
From: Nolan Leake <nolan@sigbus.net>

This is just enough to make reboot and poweroff work. Works for
linux, u-boot, and the arm trusted firmware. Not tested, but should
work for plan9, and bare-metal/hobby OSes, since they seem to generally
do what linux does for reset.

The watchdog timer functionality is not yet implemented.

Resolves: https://gitlab.com/qemu-project/qemu/-/issues/64
Signed-off-by: Nolan Leake <nolan@sigbus.net>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Tested-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Message-id: 20210625210209.1870217-1-nolan@sigbus.net
[PMM: tweaked commit title; fixed region size to 0x200;
 moved header file to include/]
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
---
 include/hw/arm/bcm2835_peripherals.h |   3 +-
 include/hw/misc/bcm2835_powermgt.h   |  29 +++++
 hw/arm/bcm2835_peripherals.c         |  13 ++-
 hw/misc/bcm2835_powermgt.c           | 160 +++++++++++++++++++++++++++
 hw/misc/meson.build                  |   1 +
 5 files changed, 204 insertions(+), 2 deletions(-)
 create mode 100644 include/hw/misc/bcm2835_powermgt.h
 create mode 100644 hw/misc/bcm2835_powermgt.c

diff --git a/include/hw/arm/bcm2835_peripherals.h b/include/hw/arm/bcm2835_peripherals.h
index XXXXXXX..XXXXXXX 100644
--- a/include/hw/arm/bcm2835_peripherals.h
+++ b/include/hw/arm/bcm2835_peripherals.h
@@ -XXX,XX +XXX,XX @@
 #include "hw/misc/bcm2835_mphi.h"
 #include "hw/misc/bcm2835_thermal.h"
 #include "hw/misc/bcm2835_cprman.h"
+#include "hw/misc/bcm2835_powermgt.h"
 #include "hw/sd/sdhci.h"
 #include "hw/sd/bcm2835_sdhost.h"
 #include "hw/gpio/bcm2835_gpio.h"
@@ -XXX,XX +XXX,XX @@ struct BCM2835PeripheralState {
     BCM2835MphiState mphi;
     UnimplementedDeviceState txp;
     UnimplementedDeviceState armtmr;
-    UnimplementedDeviceState powermgt;
+    BCM2835PowerMgtState powermgt;
     BCM2835CprmanState cprman;
     PL011State uart0;
     BCM2835AuxState aux;
diff --git a/include/hw/misc/bcm2835_powermgt.h b/include/hw/misc/bcm2835_powermgt.h
new file mode 100644
index XXXXXXX..XXXXXXX
--- /dev/null
+++ b/include/hw/misc/bcm2835_powermgt.h
@@ -XXX,XX +XXX,XX @@
+/*
+ * BCM2835 Power Management emulation
+ *
+ * Copyright (C) 2017 Marcin Chojnacki <marcinch7@gmail.com>
+ * Copyright (C) 2021 Nolan Leake <nolan@sigbus.net>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ */
+
+#ifndef BCM2835_POWERMGT_H
+#define BCM2835_POWERMGT_H
+
+#include "hw/sysbus.h"
+#include "qom/object.h"
+
+#define TYPE_BCM2835_POWERMGT "bcm2835-powermgt"
+OBJECT_DECLARE_SIMPLE_TYPE(BCM2835PowerMgtState, BCM2835_POWERMGT)
+
+struct BCM2835PowerMgtState {
+    SysBusDevice busdev;
+    MemoryRegion iomem;
+
+    uint32_t rstc;
+    uint32_t rsts;
+    uint32_t wdog;
+};
+
+#endif
diff --git a/hw/arm/bcm2835_peripherals.c b/hw/arm/bcm2835_peripherals.c
index XXXXXXX..XXXXXXX 100644
--- a/hw/arm/bcm2835_peripherals.c
+++ b/hw/arm/bcm2835_peripherals.c
@@ -XXX,XX +XXX,XX @@ static void bcm2835_peripherals_init(Object *obj)

     object_property_add_const_link(OBJECT(&s->dwc2), "dma-mr",
                                    OBJECT(&s->gpu_bus_mr));
+
+    /* Power Management */
+    object_initialize_child(obj, "powermgt", &s->powermgt,
+                            TYPE_BCM2835_POWERMGT);
 }

 static void bcm2835_peripherals_realize(DeviceState *dev, Error **errp)
@@ -XXX,XX +XXX,XX @@ static void bcm2835_peripherals_realize(DeviceState *dev, Error **errp)
                 qdev_get_gpio_in_named(DEVICE(&s->ic), BCM2835_IC_GPU_IRQ,
                                        INTERRUPT_USB));

+    /* Power Management */
+    if (!sysbus_realize(SYS_BUS_DEVICE(&s->powermgt), errp)) {
+        return;
+    }
+
+    memory_region_add_subregion(&s->peri_mr, PM_OFFSET,
+                sysbus_mmio_get_region(SYS_BUS_DEVICE(&s->powermgt), 0));
+
     create_unimp(s, &s->txp, "bcm2835-txp", TXP_OFFSET, 0x1000);
     create_unimp(s, &s->armtmr, "bcm2835-sp804", ARMCTRL_TIMER0_1_OFFSET, 0x40);
-    create_unimp(s, &s->powermgt, "bcm2835-powermgt", PM_OFFSET, 0x114);
     create_unimp(s, &s->i2s, "bcm2835-i2s", I2S_OFFSET, 0x100);
     create_unimp(s, &s->smi, "bcm2835-smi", SMI_OFFSET, 0x100);
     create_unimp(s, &s->spi[0], "bcm2835-spi0", SPI0_OFFSET, 0x20);
diff --git a/hw/misc/bcm2835_powermgt.c b/hw/misc/bcm2835_powermgt.c
new file mode 100644
index XXXXXXX..XXXXXXX
--- /dev/null
+++ b/hw/misc/bcm2835_powermgt.c
@@ -XXX,XX +XXX,XX @@
+/*
+ * BCM2835 Power Management emulation
+ *
+ * Copyright (C) 2017 Marcin Chojnacki <marcinch7@gmail.com>
+ * Copyright (C) 2021 Nolan Leake <nolan@sigbus.net>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ */
+
+#include "qemu/osdep.h"
+#include "qemu/log.h"
+#include "qemu/module.h"
+#include "hw/misc/bcm2835_powermgt.h"
+#include "migration/vmstate.h"
+#include "sysemu/runstate.h"
+
+#define PASSWORD 0x5a000000
+#define PASSWORD_MASK 0xff000000
+
+#define R_RSTC 0x1c
+#define V_RSTC_RESET 0x20
+#define R_RSTS 0x20
+#define V_RSTS_POWEROFF 0x555 /* Linux uses partition 63 to indicate halt. */
+#define R_WDOG 0x24
+
+static uint64_t bcm2835_powermgt_read(void *opaque, hwaddr offset,
+                                      unsigned size)
+{
+    BCM2835PowerMgtState *s = (BCM2835PowerMgtState *)opaque;
+    uint32_t res = 0;
+
+    switch (offset) {
+    case R_RSTC:
+        res = s->rstc;
+        break;
+    case R_RSTS:
+        res = s->rsts;
+        break;
+    case R_WDOG:
+        res = s->wdog;
+        break;
+
+    default:
+        qemu_log_mask(LOG_UNIMP,
+                      "bcm2835_powermgt_read: Unknown offset 0x%08"HWADDR_PRIx
+                      "\n", offset);
+        res = 0;
+        break;
+    }
+
+    return res;
+}
+
+static void bcm2835_powermgt_write(void *opaque, hwaddr offset,
+                                   uint64_t value, unsigned size)
+{
+    BCM2835PowerMgtState *s = (BCM2835PowerMgtState *)opaque;
+
+    if ((value & PASSWORD_MASK) != PASSWORD) {
+        qemu_log_mask(LOG_GUEST_ERROR,
+                      "bcm2835_powermgt_write: Bad password 0x%"PRIx64
+                      " at offset 0x%08"HWADDR_PRIx"\n",
+                      value, offset);
+        return;
+    }
+
+    value = value & ~PASSWORD_MASK;
+
+    switch (offset) {
+    case R_RSTC:
+        s->rstc = value;
+        if (value & V_RSTC_RESET) {
+            if ((s->rsts & 0xfff) == V_RSTS_POWEROFF) {
+                qemu_system_shutdown_request(SHUTDOWN_CAUSE_GUEST_SHUTDOWN);
+            } else {
+                qemu_system_reset_request(SHUTDOWN_CAUSE_GUEST_RESET);
+            }
+        }
+        break;
+    case R_RSTS:
+        qemu_log_mask(LOG_UNIMP,
+                      "bcm2835_powermgt_write: RSTS\n");
+        s->rsts = value;
+        break;
+    case R_WDOG:
+        qemu_log_mask(LOG_UNIMP,
+                      "bcm2835_powermgt_write: WDOG\n");
+        s->wdog = value;
+        break;
+
+    default:
+        qemu_log_mask(LOG_UNIMP,
+                      "bcm2835_powermgt_write: Unknown offset 0x%08"HWADDR_PRIx
+                      "\n", offset);
+        break;
+    }
+}
+
+static const MemoryRegionOps bcm2835_powermgt_ops = {
+    .read = bcm2835_powermgt_read,
+    .write = bcm2835_powermgt_write,
+    .endianness = DEVICE_NATIVE_ENDIAN,
+    .impl.min_access_size = 4,
+    .impl.max_access_size = 4,
+};
+
+static const VMStateDescription vmstate_bcm2835_powermgt = {
+    .name = TYPE_BCM2835_POWERMGT,
+    .version_id = 1,
+    .minimum_version_id = 1,
+    .fields = (VMStateField[]) {
+        VMSTATE_UINT32(rstc, BCM2835PowerMgtState),
+        VMSTATE_UINT32(rsts, BCM2835PowerMgtState),
+        VMSTATE_UINT32(wdog, BCM2835PowerMgtState),
+        VMSTATE_END_OF_LIST()
+    }
+};
+
+static void bcm2835_powermgt_init(Object *obj)
+{
+    BCM2835PowerMgtState *s = BCM2835_POWERMGT(obj);
+
+    memory_region_init_io(&s->iomem, obj, &bcm2835_powermgt_ops, s,
+                          TYPE_BCM2835_POWERMGT, 0x200);
+    sysbus_init_mmio(SYS_BUS_DEVICE(s), &s->iomem);
+}
+
+static void bcm2835_powermgt_reset(DeviceState *dev)
+{
+    BCM2835PowerMgtState *s = BCM2835_POWERMGT(dev);
+
+    /* https://elinux.org/BCM2835_registers#PM */
+    s->rstc = 0x00000102;
+    s->rsts = 0x00001000;
+    s->wdog = 0x00000000;
+}
+
+static void bcm2835_powermgt_class_init(ObjectClass *klass, void *data)
+{
+    DeviceClass *dc = DEVICE_CLASS(klass);
+
+    dc->reset = bcm2835_powermgt_reset;
+    dc->vmsd = &vmstate_bcm2835_powermgt;
+}
+
+static TypeInfo bcm2835_powermgt_info = {
+    .name = TYPE_BCM2835_POWERMGT,
+    .parent = TYPE_SYS_BUS_DEVICE,
+    .instance_size = sizeof(BCM2835PowerMgtState),
+    .class_init = bcm2835_powermgt_class_init,
+    .instance_init = bcm2835_powermgt_init,
+};
+
+static void bcm2835_powermgt_register_types(void)
+{
+    type_register_static(&bcm2835_powermgt_info);
+}
+
+type_init(bcm2835_powermgt_register_types)
diff --git a/hw/misc/meson.build b/hw/misc/meson.build
index XXXXXXX..XXXXXXX 100644
--- a/hw/misc/meson.build
+++ b/hw/misc/meson.build
@@ -XXX,XX +XXX,XX @@ softmmu_ss.add(when: 'CONFIG_RASPI', if_true: files(
   'bcm2835_rng.c',
   'bcm2835_thermal.c',
   'bcm2835_cprman.c',
+  'bcm2835_powermgt.c',
 ))
 softmmu_ss.add(when: 'CONFIG_SLAVIO', if_true: files('slavio_misc.c'))
 softmmu_ss.add(when: 'CONFIG_ZYNQ', if_true: files('zynq_slcr.c', 'zynq-xadc.c'))
--
2.20.1
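[Note on the patch above: a rough sketch of the guest-side protocol the new
device models. This is my illustration, not code from the patch; the register
offsets and magic values come from the device above, and the 0x3f100000 base
is an assumption taken from the raspi2 /proc/iomem listing in the test patch
later in this series:

    volatile uint32_t *pm = (volatile uint32_t *)0x3f100000;

    /* Reboot: magic password in the top byte, plus the reset bit, to RSTC */
    pm[0x1c / 4] = 0x5a000000 | 0x20;

    /* Poweroff: Linux first stores "partition 63" (0x555) in RSTS, then
     * triggers the same reset; the device turns that combination into a
     * qemu_system_shutdown_request() instead of a system reset.
     */
    pm[0x20 / 4] = 0x5a000000 | 0x555;
    pm[0x1c / 4] = 0x5a000000 | 0x20;
]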
From: Philippe Mathieu-Daudé <f4bug@amsat.org>

Add a test that boots and quickly shuts down a raspi2 machine,
to test the power management model:

(1/1) tests/acceptance/boot_linux_console.py:BootLinuxConsole.test_arm_raspi2_initrd:
console: [    0.000000] Booting Linux on physical CPU 0xf00
console: [    0.000000] Linux version 4.14.98-v7+ (dom@dom-XPS-13-9370) (gcc version 4.9.3 (crosstool-NG crosstool-ng-1.22.0-88-g8460611)) #1200 SMP Tue Feb 12 20:27:48 GMT 2019
console: [    0.000000] CPU: ARMv7 Processor [410fc075] revision 5 (ARMv7), cr=10c5387d
console: [    0.000000] CPU: div instructions available: patching division code
console: [    0.000000] CPU: PIPT / VIPT nonaliasing data cache, VIPT aliasing instruction cache
console: [    0.000000] OF: fdt: Machine model: Raspberry Pi 2 Model B
...
console: Boot successful.
console: cat /proc/cpuinfo
console: / # cat /proc/cpuinfo
...
console: processor       : 3
console: model name      : ARMv7 Processor rev 5 (v7l)
console: BogoMIPS        : 125.00
console: Features        : half thumb fastmult vfp edsp neon vfpv3 tls vfpv4 idiva idivt vfpd32 lpae evtstrm
console: CPU implementer : 0x41
console: CPU architecture: 7
console: CPU variant     : 0x0
console: CPU part        : 0xc07
console: CPU revision    : 5
console: Hardware        : BCM2835
console: Revision        : 0000
console: Serial          : 0000000000000000
console: cat /proc/iomem
console: / # cat /proc/iomem
console: 00000000-3bffffff : System RAM
console: 00008000-00afffff : Kernel code
console: 00c00000-00d468ef : Kernel data
console: 3f006000-3f006fff : dwc_otg
console: 3f007000-3f007eff : /soc/dma@7e007000
console: 3f00b880-3f00b8bf : /soc/mailbox@7e00b880
console: 3f100000-3f100027 : /soc/watchdog@7e100000
console: 3f101000-3f102fff : /soc/cprman@7e101000
console: 3f200000-3f2000b3 : /soc/gpio@7e200000
PASS (24.59 s)
RESULTS    : PASS 1 | ERROR 0 | FAIL 0 | SKIP 0 | WARN 0 | INTERRUPT 0 | CANCEL 0
JOB TIME   : 25.02 s

Signed-off-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Reviewed-by: Wainer dos Santos Moschetta <wainersm@redhat.com>
Message-id: 20210531113837.1689775-1-f4bug@amsat.org
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
---
 tests/acceptance/boot_linux_console.py | 43 ++++++++++++++++++++++++++
 1 file changed, 43 insertions(+)

diff --git a/tests/acceptance/boot_linux_console.py b/tests/acceptance/boot_linux_console.py
index XXXXXXX..XXXXXXX 100644
--- a/tests/acceptance/boot_linux_console.py
+++ b/tests/acceptance/boot_linux_console.py
@@ -XXX,XX +XXX,XX @@
 from avocado import skip
 from avocado import skipUnless
 from avocado_qemu import Test
+from avocado_qemu import exec_command
 from avocado_qemu import exec_command_and_wait_for_pattern
 from avocado_qemu import interrupt_interactive_console_until_pattern
 from avocado_qemu import wait_for_console_pattern
@@ -XXX,XX +XXX,XX @@ def test_arm_raspi2_uart0(self):
         """
         self.do_test_arm_raspi2(0)

+    def test_arm_raspi2_initrd(self):
+        """
+        :avocado: tags=arch:arm
+        :avocado: tags=machine:raspi2
+        """
+        deb_url = ('http://archive.raspberrypi.org/debian/'
+                   'pool/main/r/raspberrypi-firmware/'
+                   'raspberrypi-kernel_1.20190215-1_armhf.deb')
+        deb_hash = 'cd284220b32128c5084037553db3c482426f3972'
+        deb_path = self.fetch_asset(deb_url, asset_hash=deb_hash)
+        kernel_path = self.extract_from_deb(deb_path, '/boot/kernel7.img')
+        dtb_path = self.extract_from_deb(deb_path, '/boot/bcm2709-rpi-2-b.dtb')
+
+        initrd_url = ('https://github.com/groeck/linux-build-test/raw/'
+                      '2eb0a73b5d5a28df3170c546ddaaa9757e1e0848/rootfs/'
+                      'arm/rootfs-armv7a.cpio.gz')
+        initrd_hash = '604b2e45cdf35045846b8bbfbf2129b1891bdc9c'
+        initrd_path_gz = self.fetch_asset(initrd_url, asset_hash=initrd_hash)
+        initrd_path = os.path.join(self.workdir, 'rootfs.cpio')
+        archive.gzip_uncompress(initrd_path_gz, initrd_path)
+
+        self.vm.set_console()
+        kernel_command_line = (self.KERNEL_COMMON_COMMAND_LINE +
+                               'earlycon=pl011,0x3f201000 console=ttyAMA0 '
+                               'panic=-1 noreboot ' +
+                               'dwc_otg.fiq_fsm_enable=0')
+        self.vm.add_args('-kernel', kernel_path,
+                         '-dtb', dtb_path,
+                         '-initrd', initrd_path,
+                         '-append', kernel_command_line,
+                         '-no-reboot')
+        self.vm.launch()
+        self.wait_for_console_pattern('Boot successful.')
+
+        exec_command_and_wait_for_pattern(self, 'cat /proc/cpuinfo',
+                                                'BCM2835')
+        exec_command_and_wait_for_pattern(self, 'cat /proc/iomem',
+                                                '/soc/cprman@7e101000')
+        exec_command(self, 'halt')
+        # Wait for VM to shut down gracefully
+        self.vm.wait()
+
     def test_arm_exynos4210_initrd(self):
         """
         :avocado: tags=arch:arm
--
2.20.1
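[Usage note on the test above, not part of the patch: assuming the standard
avocado setup used by QEMU's acceptance tests, it should be runnable on its
own with something like "avocado --show=console run
tests/acceptance/boot_linux_console.py:BootLinuxConsole.test_arm_raspi2_initrd";
the exact invocation depends on the local avocado installation.]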
From: Joe Komlodi <joe.komlodi@xilinx.com>

If the CPU is running in default NaN mode (FPCR.DN == 1) and we execute
FRSQRTE, FRECPE, or FRECPX with a signaling NaN, parts_silence_nan_frac() will
assert due to fpst->default_nan_mode being set.

To avoid this, we check to see what NaN mode we're running in before we call
floatxx_silence_nan().

Signed-off-by: Joe Komlodi <joe.komlodi@xilinx.com>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Message-id: 1624662174-175828-2-git-send-email-joe.komlodi@xilinx.com
Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
---
target/arm/helper-a64.c | 11 -----------
16
target/arm/helper-a64.c | 12 +++++++++---
10
target/arm/translate-sve.c | 4 ----
17
target/arm/vfp_helper.c | 24 ++++++++++++++++++------
11
target/arm/vfp_helper.c | 4 ----
18
2 files changed, 27 insertions(+), 9 deletions(-)
12
3 files changed, 19 deletions(-)
13
19
14
diff --git a/target/arm/helper-a64.c b/target/arm/helper-a64.c
20
diff --git a/target/arm/helper-a64.c b/target/arm/helper-a64.c
15
index XXXXXXX..XXXXXXX 100644
21
index XXXXXXX..XXXXXXX 100644
16
--- a/target/arm/helper-a64.c
22
--- a/target/arm/helper-a64.c
17
+++ b/target/arm/helper-a64.c
23
+++ b/target/arm/helper-a64.c
18
@@ -XXX,XX +XXX,XX @@ uint64_t HELPER(neon_cgt_f64)(float64 a, float64 b, void *fpstp)
24
@@ -XXX,XX +XXX,XX @@ uint32_t HELPER(frecpx_f16)(uint32_t a, void *fpstp)
19
* versions, these do a fully fused multiply-add or
25
float16 nan = a;
20
* multiply-add-and-halve.
26
if (float16_is_signaling_nan(a, fpst)) {
21
*/
27
float_raise(float_flag_invalid, fpst);
22
-#define float16_two make_float16(0x4000)
28
- nan = float16_silence_nan(a, fpst);
23
-#define float16_three make_float16(0x4200)
29
+ if (!fpst->default_nan_mode) {
24
-#define float16_one_point_five make_float16(0x3e00)
30
+ nan = float16_silence_nan(a, fpst);
25
-
31
+ }
26
-#define float32_two make_float32(0x40000000)
32
}
27
-#define float32_three make_float32(0x40400000)
33
if (fpst->default_nan_mode) {
28
-#define float32_one_point_five make_float32(0x3fc00000)
34
nan = float16_default_nan(fpst);
29
-
35
@@ -XXX,XX +XXX,XX @@ float32 HELPER(frecpx_f32)(float32 a, void *fpstp)
30
-#define float64_two make_float64(0x4000000000000000ULL)
36
float32 nan = a;
31
-#define float64_three make_float64(0x4008000000000000ULL)
37
if (float32_is_signaling_nan(a, fpst)) {
32
-#define float64_one_point_five make_float64(0x3FF8000000000000ULL)
38
float_raise(float_flag_invalid, fpst);
33
39
- nan = float32_silence_nan(a, fpst);
34
uint32_t HELPER(recpsf_f16)(uint32_t a, uint32_t b, void *fpstp)
40
+ if (!fpst->default_nan_mode) {
35
{
41
+ nan = float32_silence_nan(a, fpst);
36
diff --git a/target/arm/translate-sve.c b/target/arm/translate-sve.c
42
+ }
37
index XXXXXXX..XXXXXXX 100644
43
}
38
--- a/target/arm/translate-sve.c
44
if (fpst->default_nan_mode) {
39
+++ b/target/arm/translate-sve.c
45
nan = float32_default_nan(fpst);
40
@@ -XXX,XX +XXX,XX @@ static bool trans_##NAME##_zpzi(DisasContext *s, arg_rpri_esz *a) \
46
@@ -XXX,XX +XXX,XX @@ float64 HELPER(frecpx_f64)(float64 a, void *fpstp)
41
return true; \
47
float64 nan = a;
42
}
48
if (float64_is_signaling_nan(a, fpst)) {
43
49
float_raise(float_flag_invalid, fpst);
44
-#define float16_two make_float16(0x4000)
50
- nan = float64_silence_nan(a, fpst);
45
-#define float32_two make_float32(0x40000000)
51
+ if (!fpst->default_nan_mode) {
46
-#define float64_two make_float64(0x4000000000000000ULL)
52
+ nan = float64_silence_nan(a, fpst);
47
-
53
+ }
48
DO_FP_IMM(FADD, fadds, half, one)
54
}
49
DO_FP_IMM(FSUB, fsubs, half, one)
55
if (fpst->default_nan_mode) {
50
DO_FP_IMM(FMUL, fmuls, half, two)
56
nan = float64_default_nan(fpst);
51
diff --git a/target/arm/vfp_helper.c b/target/arm/vfp_helper.c
57
diff --git a/target/arm/vfp_helper.c b/target/arm/vfp_helper.c
52
index XXXXXXX..XXXXXXX 100644
58
index XXXXXXX..XXXXXXX 100644
53
--- a/target/arm/vfp_helper.c
59
--- a/target/arm/vfp_helper.c
54
+++ b/target/arm/vfp_helper.c
60
+++ b/target/arm/vfp_helper.c
55
@@ -XXX,XX +XXX,XX @@ uint32_t HELPER(vfp_fcvt_f64_to_f16)(float64 a, void *fpstp, uint32_t ahp_mode)
61
@@ -XXX,XX +XXX,XX @@ uint32_t HELPER(recpe_f16)(uint32_t input, void *fpstp)
56
return r;
62
float16 nan = f16;
57
}
63
if (float16_is_signaling_nan(f16, fpst)) {
58
64
float_raise(float_flag_invalid, fpst);
59
-#define float32_two make_float32(0x40000000)
65
- nan = float16_silence_nan(f16, fpst);
60
-#define float32_three make_float32(0x40400000)
66
+ if (!fpst->default_nan_mode) {
61
-#define float32_one_point_five make_float32(0x3fc00000)
67
+ nan = float16_silence_nan(f16, fpst);
62
-
68
+ }
63
float32 HELPER(recps_f32)(CPUARMState *env, float32 a, float32 b)
69
}
64
{
70
if (fpst->default_nan_mode) {
65
float_status *s = &env->vfp.standard_fp_status;
71
nan = float16_default_nan(fpst);
72
@@ -XXX,XX +XXX,XX @@ float32 HELPER(recpe_f32)(float32 input, void *fpstp)
73
float32 nan = f32;
74
if (float32_is_signaling_nan(f32, fpst)) {
75
float_raise(float_flag_invalid, fpst);
76
- nan = float32_silence_nan(f32, fpst);
77
+ if (!fpst->default_nan_mode) {
78
+ nan = float32_silence_nan(f32, fpst);
79
+ }
80
}
81
if (fpst->default_nan_mode) {
82
nan = float32_default_nan(fpst);
83
@@ -XXX,XX +XXX,XX @@ float64 HELPER(recpe_f64)(float64 input, void *fpstp)
84
float64 nan = f64;
85
if (float64_is_signaling_nan(f64, fpst)) {
86
float_raise(float_flag_invalid, fpst);
87
- nan = float64_silence_nan(f64, fpst);
88
+ if (!fpst->default_nan_mode) {
89
+ nan = float64_silence_nan(f64, fpst);
90
+ }
91
}
92
if (fpst->default_nan_mode) {
93
nan = float64_default_nan(fpst);
94
@@ -XXX,XX +XXX,XX @@ uint32_t HELPER(rsqrte_f16)(uint32_t input, void *fpstp)
95
float16 nan = f16;
96
if (float16_is_signaling_nan(f16, s)) {
97
float_raise(float_flag_invalid, s);
98
- nan = float16_silence_nan(f16, s);
99
+ if (!s->default_nan_mode) {
100
+ nan = float16_silence_nan(f16, fpstp);
101
+ }
102
}
103
if (s->default_nan_mode) {
104
nan = float16_default_nan(s);
105
@@ -XXX,XX +XXX,XX @@ float32 HELPER(rsqrte_f32)(float32 input, void *fpstp)
106
float32 nan = f32;
107
if (float32_is_signaling_nan(f32, s)) {
108
float_raise(float_flag_invalid, s);
109
- nan = float32_silence_nan(f32, s);
110
+ if (!s->default_nan_mode) {
111
+ nan = float32_silence_nan(f32, fpstp);
112
+ }
113
}
114
if (s->default_nan_mode) {
115
nan = float32_default_nan(s);
116
@@ -XXX,XX +XXX,XX @@ float64 HELPER(rsqrte_f64)(float64 input, void *fpstp)
117
float64 nan = f64;
118
if (float64_is_signaling_nan(f64, s)) {
119
float_raise(float_flag_invalid, s);
120
- nan = float64_silence_nan(f64, s);
121
+ if (!s->default_nan_mode) {
122
+ nan = float64_silence_nan(f64, fpstp);
123
+ }
124
}
125
if (s->default_nan_mode) {
126
nan = float64_default_nan(s);
66
--
127
--
67
2.20.1
128
2.20.1
68
129
69
130
From: Maxim Uvarov <maxim.uvarov@linaro.org>

qemu has 2 types of functions: shutdown and reboot. The shutdown
function has to be used for machine shutdown. Otherwise we cause
a reset with a bogus "cause" value, when we intended a shutdown.

Signed-off-by: Maxim Uvarov <maxim.uvarov@linaro.org>
Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
Message-id: 20210625111842.3790-3-maxim.uvarov@linaro.org
[PMM: tweaked commit message]
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
---
 hw/gpio/gpio_pwr.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/hw/gpio/gpio_pwr.c b/hw/gpio/gpio_pwr.c
index XXXXXXX..XXXXXXX 100644
--- a/hw/gpio/gpio_pwr.c
+++ b/hw/gpio/gpio_pwr.c
@@ -XXX,XX +XXX,XX @@ static void gpio_pwr_reset(void *opaque, int n, int level)
 static void gpio_pwr_shutdown(void *opaque, int n, int level)
 {
     if (level) {
-        qemu_system_reset_request(SHUTDOWN_CAUSE_GUEST_SHUTDOWN);
+        qemu_system_shutdown_request(SHUTDOWN_CAUSE_GUEST_SHUTDOWN);
     }
 }
--
2.20.1
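[For context on the gpio_pwr fix above, a minimal sketch (mine) of the two
runstate requests involved; both take a ShutdownCause, as in the patch:

    /* What the handler did: a full system reset, merely tagged with a
     * shutdown cause, so the guest comes back up instead of powering off.
     */
    qemu_system_reset_request(SHUTDOWN_CAUSE_GUEST_SHUTDOWN);

    /* What a poweroff GPIO should trigger: an orderly shutdown request */
    qemu_system_shutdown_request(SHUTDOWN_CAUSE_GUEST_SHUTDOWN);
]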
In do_ldst(), the calculation of the offset needs to be based on the
size of the memory access, not the size of the elements in the
vector.  This meant we were getting it wrong for the widening and
narrowing variants of the various VLDR and VSTR insns.

Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Message-id: 20210628135835.6690-2-peter.maydell@linaro.org
---
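[A worked example of the bug, my illustration rather than patch code: for
VLDRH.S32, a widening load of 16-bit memory elements into 32-bit vector
elements, the immediate must be scaled by the memory access size (MO_16),
not by the 32-bit element size:

    offset = a->imm << a->size;   /* old: imm=1 scales to 4 bytes (element) */
    offset = a->imm << msize;     /* new: imm=1 scales to 2 bytes (access)  */
]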
8
target/arm/helper.h | 5 +++++
10
target/arm/translate-mve.c | 17 +++++++++--------
9
target/arm/vec_helper.c | 20 +++++++++++++++++++
11
1 file changed, 9 insertions(+), 8 deletions(-)
10
target/arm/translate-neon.c.inc | 35 +++++++++++++++++----------------
11
3 files changed, 43 insertions(+), 17 deletions(-)
12
12
13
diff --git a/target/arm/helper.h b/target/arm/helper.h
13
diff --git a/target/arm/translate-mve.c b/target/arm/translate-mve.c
14
index XXXXXXX..XXXXXXX 100644
14
index XXXXXXX..XXXXXXX 100644
15
--- a/target/arm/helper.h
15
--- a/target/arm/translate-mve.c
16
+++ b/target/arm/helper.h
16
+++ b/target/arm/translate-mve.c
17
@@ -XXX,XX +XXX,XX @@ DEF_HELPER_FLAGS_4(gvec_tosizs, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
17
@@ -XXX,XX +XXX,XX @@ static bool mve_skip_first_beat(DisasContext *s)
18
DEF_HELPER_FLAGS_4(gvec_touszh, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
18
}
19
DEF_HELPER_FLAGS_4(gvec_touizs, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
20
21
+DEF_HELPER_FLAGS_4(gvec_vcvt_sf, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
22
+DEF_HELPER_FLAGS_4(gvec_vcvt_uf, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
23
+DEF_HELPER_FLAGS_4(gvec_vcvt_fs, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
24
+DEF_HELPER_FLAGS_4(gvec_vcvt_fu, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
25
+
26
DEF_HELPER_FLAGS_4(gvec_frecpe_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
27
DEF_HELPER_FLAGS_4(gvec_frecpe_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
28
 DEF_HELPER_FLAGS_4(gvec_frecpe_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
diff --git a/target/arm/vec_helper.c b/target/arm/vec_helper.c
index XXXXXXX..XXXXXXX 100644
--- a/target/arm/vec_helper.c
+++ b/target/arm/vec_helper.c
@@ -XXX,XX +XXX,XX @@ DO_NEON_PAIRWISE(neon_pmax, max)
 DO_NEON_PAIRWISE(neon_pmin, min)
 
 #undef DO_NEON_PAIRWISE
+
+#define DO_VCVT_FIXED(NAME, FUNC, TYPE) \
+    void HELPER(NAME)(void *vd, void *vn, void *stat, uint32_t desc) \
+    { \
+        intptr_t i, oprsz = simd_oprsz(desc); \
+        int shift = simd_data(desc); \
+        TYPE *d = vd, *n = vn; \
+        float_status *fpst = stat; \
+        for (i = 0; i < oprsz / sizeof(TYPE); i++) { \
+            d[i] = FUNC(n[i], shift, fpst); \
+        } \
+        clear_tail(d, oprsz, simd_maxsz(desc)); \
+    }
+
+DO_VCVT_FIXED(gvec_vcvt_sf, helper_vfp_sltos, uint32_t)
+DO_VCVT_FIXED(gvec_vcvt_uf, helper_vfp_ultos, uint32_t)
+DO_VCVT_FIXED(gvec_vcvt_fs, helper_vfp_tosls_round_to_zero, uint32_t)
+DO_VCVT_FIXED(gvec_vcvt_fu, helper_vfp_touls_round_to_zero, uint32_t)
+
+#undef DO_VCVT_FIXED
diff --git a/target/arm/translate-neon.c.inc b/target/arm/translate-neon.c.inc
index XXXXXXX..XXXXXXX 100644
--- a/target/arm/translate-neon.c.inc
+++ b/target/arm/translate-neon.c.inc
@@ -XXX,XX +XXX,XX @@ static bool trans_VSHLL_U_2sh(DisasContext *s, arg_2reg_shift *a)
 }
 
 static bool do_fp_2sh(DisasContext *s, arg_2reg_shift *a,
-                      NeonGenTwoSingleOpFn *fn)
+                      gen_helper_gvec_2_ptr *fn)
 {
     /* FP operations in 2-reg-and-shift group */
-    TCGv_i32 tmp, shiftv;
-    TCGv_ptr fpstatus;
-    int pass;
+    int vec_size = a->q ? 16 : 8;
+    int rd_ofs = neon_reg_offset(a->vd, 0);
+    int rm_ofs = neon_reg_offset(a->vm, 0);
+    TCGv_ptr fpst;
 
     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
         return false;
     }
 
+    if (a->size != 0) {
+        if (!dc_isar_feature(aa32_fp16_arith, s)) {
+            return false;
+        }
+    }
+
     /* UNDEF accesses to D16-D31 if they don't exist. */
     if (!dc_isar_feature(aa32_simd_r32, s) &&
         ((a->vd | a->vm) & 0x10)) {
@@ -XXX,XX +XXX,XX @@ static bool do_fp_2sh(DisasContext *s, arg_2reg_shift *a,
         return true;
     }
 
-    fpstatus = fpstatus_ptr(FPST_STD);
-    shiftv = tcg_const_i32(a->shift);
-    for (pass = 0; pass < (a->q ? 4 : 2); pass++) {
-        tmp = neon_load_reg(a->vm, pass);
-        fn(tmp, tmp, shiftv, fpstatus);
-        neon_store_reg(a->vd, pass, tmp);
-    }
-    tcg_temp_free_ptr(fpstatus);
-    tcg_temp_free_i32(shiftv);
+    fpst = fpstatus_ptr(a->size ? FPST_STD_F16 : FPST_STD);
+    tcg_gen_gvec_2_ptr(rd_ofs, rm_ofs, fpst, vec_size, vec_size, a->shift, fn);
+    tcg_temp_free_ptr(fpst);
     return true;
 }
 
@@ -XXX,XX +XXX,XX @@ static bool do_fp_2sh(DisasContext *s, arg_2reg_shift *a,
         return do_fp_2sh(s, a, FUNC); \
     }
 
-DO_FP_2SH(VCVT_SF, gen_helper_vfp_sltos)
-DO_FP_2SH(VCVT_UF, gen_helper_vfp_ultos)
-DO_FP_2SH(VCVT_FS, gen_helper_vfp_tosls_round_to_zero)
-DO_FP_2SH(VCVT_FU, gen_helper_vfp_touls_round_to_zero)
+DO_FP_2SH(VCVT_SF, gen_helper_gvec_vcvt_sf)
+DO_FP_2SH(VCVT_UF, gen_helper_gvec_vcvt_uf)
+DO_FP_2SH(VCVT_FS, gen_helper_gvec_vcvt_fs)
+DO_FP_2SH(VCVT_FU, gen_helper_gvec_vcvt_fu)
 
 static uint64_t asimd_imm_const(uint32_t imm, int cmode, int op)
 {
-- 
2.20.1

 }
 
-static bool do_ldst(DisasContext *s, arg_VLDR_VSTR *a, MVEGenLdStFn *fn)
+static bool do_ldst(DisasContext *s, arg_VLDR_VSTR *a, MVEGenLdStFn *fn,
+                    unsigned msize)
 {
     TCGv_i32 addr;
     uint32_t offset;
@@ -XXX,XX +XXX,XX @@ static bool do_ldst(DisasContext *s, arg_VLDR_VSTR *a, MVEGenLdStFn *fn)
         return true;
     }
 
-    offset = a->imm << a->size;
+    offset = a->imm << msize;
     if (!a->a) {
         offset = -offset;
     }
@@ -XXX,XX +XXX,XX @@ static bool trans_VLDR_VSTR(DisasContext *s, arg_VLDR_VSTR *a)
         { gen_helper_mve_vstrw, gen_helper_mve_vldrw },
         { NULL, NULL }
     };
-    return do_ldst(s, a, ldstfns[a->size][a->l]);
+    return do_ldst(s, a, ldstfns[a->size][a->l], a->size);
 }
 
-#define DO_VLDST_WIDE_NARROW(OP, SLD, ULD, ST) \
+#define DO_VLDST_WIDE_NARROW(OP, SLD, ULD, ST, MSIZE) \
     static bool trans_##OP(DisasContext *s, arg_VLDR_VSTR *a) \
     { \
         static MVEGenLdStFn * const ldstfns[2][2] = { \
             { gen_helper_mve_##ST, gen_helper_mve_##SLD }, \
             { NULL, gen_helper_mve_##ULD }, \
         }; \
-        return do_ldst(s, a, ldstfns[a->u][a->l]); \
+        return do_ldst(s, a, ldstfns[a->u][a->l], MSIZE); \
     }
 
-DO_VLDST_WIDE_NARROW(VLDSTB_H, vldrb_sh, vldrb_uh, vstrb_h)
-DO_VLDST_WIDE_NARROW(VLDSTB_W, vldrb_sw, vldrb_uw, vstrb_w)
-DO_VLDST_WIDE_NARROW(VLDSTH_W, vldrh_sw, vldrh_uw, vstrh_w)
+DO_VLDST_WIDE_NARROW(VLDSTB_H, vldrb_sh, vldrb_uh, vstrb_h, MO_8)
+DO_VLDST_WIDE_NARROW(VLDSTB_W, vldrb_sw, vldrb_uw, vstrb_w, MO_8)
+DO_VLDST_WIDE_NARROW(VLDSTH_W, vldrh_sw, vldrh_uw, vstrh_w, MO_16)
 
 static bool trans_VDUP(DisasContext *s, arg_VDUP *a)
 {
-- 
2.20.1
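The point of threading msize through do_ldst() above is that for the
widening loads and narrowing stores the immediate offset scales with the
size of the memory element, not the (wider) register element. A standalone
sketch in plain C of just that scaling (illustrative values only, not QEMU
code):

#include <stdio.h>

/* msize encoded like QEMU's MO_8/MO_16/MO_32 (log2 of the byte size) */
static int ldst_offset(int imm, int msize)
{
    return imm << msize;   /* what the patched do_ldst() computes */
}

int main(void)
{
    /* VLDRB.S16 (bytes widened to halfwords): scale by MO_8, not MO_16 */
    printf("VLDRB.S16 imm=4 -> offset %d (was wrongly %d)\n",
           ldst_offset(4, 0), ldst_offset(4, 1));
    /* VLDRH.S32 (halfwords widened to words): scale by MO_16, not MO_32 */
    printf("VLDRH.S32 imm=4 -> offset %d (was wrongly %d)\n",
           ldst_offset(4, 1), ldst_offset(4, 2));
    return 0;
}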
Convert the Neon floating-point VMAX and VMIN insns over to using
a gvec helper, and use this to implement the fp16 case.

Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Message-id: 20200828183354.27913-29-peter.maydell@linaro.org
---
 target/arm/helper.h             | 6 ++++++
 target/arm/vec_helper.c         | 6 ++++++
 target/arm/translate-neon.c.inc | 5 ++---
 3 files changed, 14 insertions(+), 3 deletions(-)

diff --git a/target/arm/helper.h b/target/arm/helper.h
index XXXXXXX..XXXXXXX 100644
--- a/target/arm/helper.h
+++ b/target/arm/helper.h
@@ -XXX,XX +XXX,XX @@ DEF_HELPER_FLAGS_5(gvec_facge_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
 DEF_HELPER_FLAGS_5(gvec_facgt_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
 DEF_HELPER_FLAGS_5(gvec_facgt_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
 
+DEF_HELPER_FLAGS_5(gvec_fmax_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(gvec_fmax_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_5(gvec_fmin_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(gvec_fmin_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
+
 DEF_HELPER_FLAGS_5(gvec_ftsmul_h, TCG_CALL_NO_RWG,
                    void, ptr, ptr, ptr, ptr, i32)
 DEF_HELPER_FLAGS_5(gvec_ftsmul_s, TCG_CALL_NO_RWG,
diff --git a/target/arm/vec_helper.c b/target/arm/vec_helper.c
index XXXXXXX..XXXXXXX 100644
--- a/target/arm/vec_helper.c
+++ b/target/arm/vec_helper.c
@@ -XXX,XX +XXX,XX @@ DO_3OP(gvec_facge_s, float32_acge, float32)
 DO_3OP(gvec_facgt_h, float16_acgt, float16)
 DO_3OP(gvec_facgt_s, float32_acgt, float32)
 
+DO_3OP(gvec_fmax_h, float16_max, float16)
+DO_3OP(gvec_fmax_s, float32_max, float32)
+
+DO_3OP(gvec_fmin_h, float16_min, float16)
+DO_3OP(gvec_fmin_s, float32_min, float32)
+
 #ifdef TARGET_AARCH64
 
 DO_3OP(gvec_recps_h, helper_recpsf_f16, float16)
diff --git a/target/arm/translate-neon.c.inc b/target/arm/translate-neon.c.inc
index XXXXXXX..XXXXXXX 100644
--- a/target/arm/translate-neon.c.inc
+++ b/target/arm/translate-neon.c.inc
@@ -XXX,XX +XXX,XX @@ DO_3S_FP_GVEC(VCGE, gen_helper_gvec_fcge_s, gen_helper_gvec_fcge_h)
 DO_3S_FP_GVEC(VCGT, gen_helper_gvec_fcgt_s, gen_helper_gvec_fcgt_h)
 DO_3S_FP_GVEC(VACGE, gen_helper_gvec_facge_s, gen_helper_gvec_facge_h)
 DO_3S_FP_GVEC(VACGT, gen_helper_gvec_facgt_s, gen_helper_gvec_facgt_h)
+DO_3S_FP_GVEC(VMAX, gen_helper_gvec_fmax_s, gen_helper_gvec_fmax_h)
+DO_3S_FP_GVEC(VMIN, gen_helper_gvec_fmin_s, gen_helper_gvec_fmin_h)
 
 /*
  * For all the functions using this macro, size == 1 means fp16,
@@ -XXX,XX +XXX,XX @@ DO_3S_FP_GVEC(VACGT, gen_helper_gvec_facgt_s, gen_helper_gvec_facgt_h)
         return do_3same_fp(s, a, FUNC, READS_VD); \
     }
 
-DO_3S_FP(VMAX, gen_helper_vfp_maxs, false)
-DO_3S_FP(VMIN, gen_helper_vfp_mins, false)
-
 static void gen_VMLA_fp_3s(TCGv_i32 vd, TCGv_i32 vn, TCGv_i32 vm,
                            TCGv_ptr fpstatus)
 {
-- 
2.20.1

The initial implementation of the MVE VRMLALDAVH and VRMLSLDAVH
insns had some bugs:
 * the 32x32 multiply of elements was being done as 32x32->32,
   not 32x32->64
 * we were incorrectly maintaining the accumulator in its full
   72-bit form across all 4 beats of the insn; in the pseudocode
   it is squashed back into the 64 bits of the RdaHi:RdaLo
   registers after each beat

In particular, fixing the second of these allows us to recast
the implementation to avoid 128-bit arithmetic entirely.

Since the element size here is always 4, we can also drop the
parameterization of ESIZE to make the code a little more readable.

Suggested-by: Richard Henderson <richard.henderson@linaro.org>
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Message-id: 20210628135835.6690-3-peter.maydell@linaro.org
---
 target/arm/mve_helper.c | 38 +++++++++++++++++++++-----------------
 1 file changed, 21 insertions(+), 17 deletions(-)

diff --git a/target/arm/mve_helper.c b/target/arm/mve_helper.c
index XXXXXXX..XXXXXXX 100644
--- a/target/arm/mve_helper.c
+++ b/target/arm/mve_helper.c
@@ -XXX,XX +XXX,XX @@
  */
 
 #include "qemu/osdep.h"
-#include "qemu/int128.h"
 #include "cpu.h"
 #include "internals.h"
 #include "vec_internal.h"
@@ -XXX,XX +XXX,XX @@ DO_LDAV(vmlsldavsw, 4, int32_t, false, +=, -=)
 DO_LDAV(vmlsldavxsw, 4, int32_t, true, +=, -=)
 
 /*
- * Rounding multiply add long dual accumulate high: we must keep
- * a 72-bit internal accumulator value and return the top 64 bits.
+ * Rounding multiply add long dual accumulate high. In the pseudocode
+ * this is implemented with a 72-bit internal accumulator value of which
+ * the top 64 bits are returned. We optimize this to avoid having to
+ * use 128-bit arithmetic -- we can do this because the 72-bit accumulator
+ * is squashed back into 64 bits after each beat.
  */
-#define DO_LDAVH(OP, ESIZE, TYPE, XCHG, EVENACC, ODDACC, TO128) \
+#define DO_LDAVH(OP, TYPE, LTYPE, XCHG, SUB) \
     uint64_t HELPER(glue(mve_, OP))(CPUARMState *env, void *vn, \
                                     void *vm, uint64_t a) \
     { \
         uint16_t mask = mve_element_mask(env); \
         unsigned e; \
         TYPE *n = vn, *m = vm; \
-        Int128 acc = int128_lshift(TO128(a), 8); \
-        for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) { \
+        for (e = 0; e < 16 / 4; e++, mask >>= 4) { \
             if (mask & 1) { \
+                LTYPE mul; \
                 if (e & 1) { \
-                    acc = ODDACC(acc, TO128(n[H##ESIZE(e - 1 * XCHG)] * \
-                                            m[H##ESIZE(e)])); \
+                    mul = (LTYPE)n[H4(e - 1 * XCHG)] * m[H4(e)]; \
+                    if (SUB) { \
+                        mul = -mul; \
+                    } \
                 } else { \
-                    acc = EVENACC(acc, TO128(n[H##ESIZE(e + 1 * XCHG)] * \
-                                             m[H##ESIZE(e)])); \
+                    mul = (LTYPE)n[H4(e + 1 * XCHG)] * m[H4(e)]; \
                 } \
-                acc = int128_add(acc, int128_make64(1 << 7)); \
+                mul = (mul >> 8) + ((mul >> 7) & 1); \
+                a += mul; \
             } \
         } \
         mve_advance_vpt(env); \
-        return int128_getlo(int128_rshift(acc, 8)); \
+        return a; \
     }
 
-DO_LDAVH(vrmlaldavhsw, 4, int32_t, false, int128_add, int128_add, int128_makes64)
-DO_LDAVH(vrmlaldavhxsw, 4, int32_t, true, int128_add, int128_add, int128_makes64)
+DO_LDAVH(vrmlaldavhsw, int32_t, int64_t, false, false)
+DO_LDAVH(vrmlaldavhxsw, int32_t, int64_t, true, false)
 
-DO_LDAVH(vrmlaldavhuw, 4, uint32_t, false, int128_add, int128_add, int128_make64)
+DO_LDAVH(vrmlaldavhuw, uint32_t, uint64_t, false, false)
 
-DO_LDAVH(vrmlsldavhsw, 4, int32_t, false, int128_add, int128_sub, int128_makes64)
-DO_LDAVH(vrmlsldavhxsw, 4, int32_t, true, int128_add, int128_sub, int128_makes64)
+DO_LDAVH(vrmlsldavhsw, int32_t, int64_t, false, true)
+DO_LDAVH(vrmlsldavhxsw, int32_t, int64_t, true, true)
 
 /* Vector add across vector */
 #define DO_VADDV(OP, ESIZE, TYPE) \
-- 
2.20.1
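The per-beat rounding step above can be checked in isolation: per beat,
(mul >> 8) + ((mul >> 7) & 1) has to agree with the old formulation of
adding 1 << 7 to a wide accumulator and then shifting right by 8. A
standalone check in plain C (assumes a GCC/Clang-style __int128 and
arithmetic right shift of negative values; not part of the patch):

#include <stdio.h>
#include <stdint.h>

int main(void)
{
    int64_t tests[] = { 0, 1, 127, 128, -1, -127, -128, -129,
                        INT64_MAX, INT64_MIN, 0x123456789abcdef0LL };
    for (unsigned i = 0; i < sizeof(tests) / sizeof(tests[0]); i++) {
        int64_t mul = tests[i];
        /* old approach: widen, add the rounding constant, shift */
        int64_t wide = (int64_t)(((__int128)mul + (1 << 7)) >> 8);
        /* new approach: shift, then add the rounding bit */
        int64_t narrow = (mul >> 8) + ((mul >> 7) & 1);
        printf("%lld: %s\n", (long long)mul,
               wide == narrow ? "ok" : "MISMATCH");
    }
    return 0;
}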
Macroify the uses of do_vfp_2op_sp() and do_vfp_2op_dp(); this will
make it easier to add the half-precision support.

Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Message-id: 20200828183354.27913-8-peter.maydell@linaro.org
---
 target/arm/translate-vfp.c.inc | 49 ++++++++++------------------------
 1 file changed, 14 insertions(+), 35 deletions(-)

diff --git a/target/arm/translate-vfp.c.inc b/target/arm/translate-vfp.c.inc
index XXXXXXX..XXXXXXX 100644
--- a/target/arm/translate-vfp.c.inc
+++ b/target/arm/translate-vfp.c.inc
@@ -XXX,XX +XXX,XX @@ static bool trans_VMOV_imm_dp(DisasContext *s, arg_VMOV_imm_dp *a)
     return true;
 }
 
-static bool trans_VMOV_reg_sp(DisasContext *s, arg_VMOV_reg_sp *a)
-{
-    return do_vfp_2op_sp(s, tcg_gen_mov_i32, a->vd, a->vm);
-}
+#define DO_VFP_2OP(INSN, PREC, FN) \
+    static bool trans_##INSN##_##PREC(DisasContext *s, \
+                                      arg_##INSN##_##PREC *a) \
+    { \
+        return do_vfp_2op_##PREC(s, FN, a->vd, a->vm); \
+    }
 
-static bool trans_VMOV_reg_dp(DisasContext *s, arg_VMOV_reg_dp *a)
-{
-    return do_vfp_2op_dp(s, tcg_gen_mov_i64, a->vd, a->vm);
-}
+DO_VFP_2OP(VMOV_reg, sp, tcg_gen_mov_i32)
+DO_VFP_2OP(VMOV_reg, dp, tcg_gen_mov_i64)
 
-static bool trans_VABS_sp(DisasContext *s, arg_VABS_sp *a)
-{
-    return do_vfp_2op_sp(s, gen_helper_vfp_abss, a->vd, a->vm);
-}
+DO_VFP_2OP(VABS, sp, gen_helper_vfp_abss)
+DO_VFP_2OP(VABS, dp, gen_helper_vfp_absd)
 
-static bool trans_VABS_dp(DisasContext *s, arg_VABS_dp *a)
-{
-    return do_vfp_2op_dp(s, gen_helper_vfp_absd, a->vd, a->vm);
-}
-
-static bool trans_VNEG_sp(DisasContext *s, arg_VNEG_sp *a)
-{
-    return do_vfp_2op_sp(s, gen_helper_vfp_negs, a->vd, a->vm);
-}
-
-static bool trans_VNEG_dp(DisasContext *s, arg_VNEG_dp *a)
-{
-    return do_vfp_2op_dp(s, gen_helper_vfp_negd, a->vd, a->vm);
-}
+DO_VFP_2OP(VNEG, sp, gen_helper_vfp_negs)
+DO_VFP_2OP(VNEG, dp, gen_helper_vfp_negd)
 
 static void gen_VSQRT_sp(TCGv_i32 vd, TCGv_i32 vm)
 {
     gen_helper_vfp_sqrts(vd, vm, cpu_env);
 }
 
-static bool trans_VSQRT_sp(DisasContext *s, arg_VSQRT_sp *a)
-{
-    return do_vfp_2op_sp(s, gen_VSQRT_sp, a->vd, a->vm);
-}
-
 static void gen_VSQRT_dp(TCGv_i64 vd, TCGv_i64 vm)
 {
     gen_helper_vfp_sqrtd(vd, vm, cpu_env);
 }
 
-static bool trans_VSQRT_dp(DisasContext *s, arg_VSQRT_dp *a)
-{
-    return do_vfp_2op_dp(s, gen_VSQRT_dp, a->vd, a->vm);
-}
+DO_VFP_2OP(VSQRT, sp, gen_VSQRT_sp)
+DO_VFP_2OP(VSQRT, dp, gen_VSQRT_dp)
 
 static bool trans_VCMP_sp(DisasContext *s, arg_VCMP_sp *a)
 {
-- 
2.20.1

The function asimd_imm_const() in translate-neon.c is an
implementation of the pseudocode AdvSIMDExpandImm(), which we will
also want for MVE. Move the implementation to translate.c, with a
prototype in translate.h.

Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Message-id: 20210628135835.6690-4-peter.maydell@linaro.org
---
 target/arm/translate.h      | 16 ++++++++++
 target/arm/translate-neon.c | 63 -------------------------------------
 target/arm/translate.c      | 57 +++++++++++++++++++++++++++++++++
 3 files changed, 73 insertions(+), 63 deletions(-)

diff --git a/target/arm/translate.h b/target/arm/translate.h
index XXXXXXX..XXXXXXX 100644
--- a/target/arm/translate.h
+++ b/target/arm/translate.h
@@ -XXX,XX +XXX,XX @@ static inline MemOp finalize_memop(DisasContext *s, MemOp opc)
     return opc | s->be_data;
 }
 
+/**
+ * asimd_imm_const: Expand an encoded SIMD constant value
+ *
+ * Expand a SIMD constant value. This is essentially the pseudocode
+ * AdvSIMDExpandImm, except that we also perform the boolean NOT needed for
+ * VMVN and VBIC (when cmode < 14 && op == 1).
+ *
+ * The combination cmode == 15 op == 1 is a reserved encoding for AArch32;
+ * callers must catch this.
+ *
+ * cmode = 2,3,4,5,6,7,10,11,12,13 imm=0 was UNPREDICTABLE in v7A but
+ * is either not unpredictable or merely CONSTRAINED UNPREDICTABLE in v8A;
+ * we produce an immediate constant value of 0 in these cases.
+ */
+uint64_t asimd_imm_const(uint32_t imm, int cmode, int op);
+
 #endif /* TARGET_ARM_TRANSLATE_H */
diff --git a/target/arm/translate-neon.c b/target/arm/translate-neon.c
index XXXXXXX..XXXXXXX 100644
--- a/target/arm/translate-neon.c
+++ b/target/arm/translate-neon.c
@@ -XXX,XX +XXX,XX @@ DO_FP_2SH(VCVT_UH, gen_helper_gvec_vcvt_uh)
 DO_FP_2SH(VCVT_HS, gen_helper_gvec_vcvt_hs)
 DO_FP_2SH(VCVT_HU, gen_helper_gvec_vcvt_hu)
 
-static uint64_t asimd_imm_const(uint32_t imm, int cmode, int op)
-{
-    /*
-     * Expand the encoded constant.
-     * Note that cmode = 2,3,4,5,6,7,10,11,12,13 imm=0 is UNPREDICTABLE.
-     * We choose to not special-case this and will behave as if a
-     * valid constant encoding of 0 had been given.
-     * cmode = 15 op = 1 must UNDEF; we assume decode has handled that.
-     */
-    switch (cmode) {
-    case 0: case 1:
-        /* no-op */
-        break;
-    case 2: case 3:
-        imm <<= 8;
-        break;
-    case 4: case 5:
-        imm <<= 16;
-        break;
-    case 6: case 7:
-        imm <<= 24;
-        break;
-    case 8: case 9:
-        imm |= imm << 16;
-        break;
-    case 10: case 11:
-        imm = (imm << 8) | (imm << 24);
-        break;
-    case 12:
-        imm = (imm << 8) | 0xff;
-        break;
-    case 13:
-        imm = (imm << 16) | 0xffff;
-        break;
-    case 14:
-        if (op) {
-            /*
-             * This is the only case where the top and bottom 32 bits
-             * of the encoded constant differ.
-             */
-            uint64_t imm64 = 0;
-            int n;
-
-            for (n = 0; n < 8; n++) {
-                if (imm & (1 << n)) {
-                    imm64 |= (0xffULL << (n * 8));
-                }
-            }
-            return imm64;
-        }
-        imm |= (imm << 8) | (imm << 16) | (imm << 24);
-        break;
-    case 15:
-        imm = ((imm & 0x80) << 24) | ((imm & 0x3f) << 19)
-            | ((imm & 0x40) ? (0x1f << 25) : (1 << 30));
-        break;
-    }
-    if (op) {
-        imm = ~imm;
-    }
-    return dup_const(MO_32, imm);
-}
-
 static bool do_1reg_imm(DisasContext *s, arg_1reg_imm *a,
                         GVecGen2iFn *fn)
 {
diff --git a/target/arm/translate.c b/target/arm/translate.c
index XXXXXXX..XXXXXXX 100644
--- a/target/arm/translate.c
+++ b/target/arm/translate.c
@@ -XXX,XX +XXX,XX @@ void arm_translate_init(void)
     a64_translate_init();
 }
 
+uint64_t asimd_imm_const(uint32_t imm, int cmode, int op)
+{
+    /* Expand the encoded constant as per AdvSIMDExpandImm pseudocode */
+    switch (cmode) {
+    case 0: case 1:
+        /* no-op */
+        break;
+    case 2: case 3:
+        imm <<= 8;
+        break;
+    case 4: case 5:
+        imm <<= 16;
+        break;
+    case 6: case 7:
+        imm <<= 24;
+        break;
+    case 8: case 9:
+        imm |= imm << 16;
+        break;
+    case 10: case 11:
+        imm = (imm << 8) | (imm << 24);
+        break;
+    case 12:
+        imm = (imm << 8) | 0xff;
+        break;
+    case 13:
+        imm = (imm << 16) | 0xffff;
+        break;
+    case 14:
+        if (op) {
+            /*
+             * This is the only case where the top and bottom 32 bits
+             * of the encoded constant differ.
+             */
+            uint64_t imm64 = 0;
+            int n;
+
+            for (n = 0; n < 8; n++) {
+                if (imm & (1 << n)) {
+                    imm64 |= (0xffULL << (n * 8));
+                }
+            }
+            return imm64;
+        }
+        imm |= (imm << 8) | (imm << 16) | (imm << 24);
+        break;
+    case 15:
+        imm = ((imm & 0x80) << 24) | ((imm & 0x3f) << 19)
+            | ((imm & 0x40) ? (0x1f << 25) : (1 << 30));
+        break;
+    }
+    if (op) {
+        imm = ~imm;
+    }
+    return dup_const(MO_32, imm);
+}
+
 /* Generate a label used for skipping this instruction */
 void arm_gen_condlabel(DisasContext *s)
 {
-- 
2.20.1
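For readers unfamiliar with this style of macroification: the pattern is
simply to stamp out a family of identical trampoline trans functions that
differ only in name and callee. A self-contained analogue in plain C
(illustrative names only, not the QEMU types):

#include <stdio.h>

static void op_abs(int *x) { if (*x < 0) { *x = -*x; } }
static void op_neg(int *x) { *x = -*x; }

/* one macro generates one trampoline per (INSN, FN) pair */
#define DO_2OP(INSN, FN) \
    static void trans_##INSN(int *x) \
    { \
        FN(x); \
    }

DO_2OP(VABS, op_abs)
DO_2OP(VNEG, op_neg)

int main(void)
{
    int a = -5;
    trans_VABS(&a);
    printf("VABS -> %d\n", a);   /* 5 */
    trans_VNEG(&a);
    printf("VNEG -> %d\n", a);   /* -5 */
    return 0;
}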
Convert the Neon VCVT with-specified-rounding-mode instructions
to gvec, and use this to implement fp16 support for them.

Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Message-id: 20200828183354.27913-40-peter.maydell@linaro.org
---
 target/arm/helper.h             |   5 ++
 target/arm/vec_helper.c         |  23 +++++++
 target/arm/translate-neon.c.inc | 105 ++++++++++++--------------------
 3 files changed, 66 insertions(+), 67 deletions(-)

diff --git a/target/arm/helper.h b/target/arm/helper.h
index XXXXXXX..XXXXXXX 100644
--- a/target/arm/helper.h
+++ b/target/arm/helper.h
@@ -XXX,XX +XXX,XX @@ DEF_HELPER_FLAGS_4(gvec_vcvt_uh, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
 DEF_HELPER_FLAGS_4(gvec_vcvt_hs, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
 DEF_HELPER_FLAGS_4(gvec_vcvt_hu, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
 
+DEF_HELPER_FLAGS_4(gvec_vcvt_rm_ss, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(gvec_vcvt_rm_us, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(gvec_vcvt_rm_sh, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(gvec_vcvt_rm_uh, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+
 DEF_HELPER_FLAGS_4(gvec_frecpe_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
 DEF_HELPER_FLAGS_4(gvec_frecpe_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
 DEF_HELPER_FLAGS_4(gvec_frecpe_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
diff --git a/target/arm/vec_helper.c b/target/arm/vec_helper.c
index XXXXXXX..XXXXXXX 100644
--- a/target/arm/vec_helper.c
+++ b/target/arm/vec_helper.c
@@ -XXX,XX +XXX,XX @@ DO_VCVT_FIXED(gvec_vcvt_hs, helper_vfp_toshh_round_to_zero, uint16_t)
 DO_VCVT_FIXED(gvec_vcvt_hu, helper_vfp_touhh_round_to_zero, uint16_t)
 
 #undef DO_VCVT_FIXED
+
+#define DO_VCVT_RMODE(NAME, FUNC, TYPE) \
+    void HELPER(NAME)(void *vd, void *vn, void *stat, uint32_t desc) \
+    { \
+        float_status *fpst = stat; \
+        intptr_t i, oprsz = simd_oprsz(desc); \
+        uint32_t rmode = simd_data(desc); \
+        uint32_t prev_rmode = get_float_rounding_mode(fpst); \
+        TYPE *d = vd, *n = vn; \
+        set_float_rounding_mode(rmode, fpst); \
+        for (i = 0; i < oprsz / sizeof(TYPE); i++) { \
+            d[i] = FUNC(n[i], 0, fpst); \
+        } \
+        set_float_rounding_mode(prev_rmode, fpst); \
+        clear_tail(d, oprsz, simd_maxsz(desc)); \
+    }
+
+DO_VCVT_RMODE(gvec_vcvt_rm_ss, helper_vfp_tosls, uint32_t)
+DO_VCVT_RMODE(gvec_vcvt_rm_us, helper_vfp_touls, uint32_t)
+DO_VCVT_RMODE(gvec_vcvt_rm_sh, helper_vfp_toshh, uint16_t)
+DO_VCVT_RMODE(gvec_vcvt_rm_uh, helper_vfp_touhh, uint16_t)
+
+#undef DO_VCVT_RMODE
diff --git a/target/arm/translate-neon.c.inc b/target/arm/translate-neon.c.inc
index XXXXXXX..XXXXXXX 100644
--- a/target/arm/translate-neon.c.inc
+++ b/target/arm/translate-neon.c.inc
@@ -XXX,XX +XXX,XX @@ DO_VRINT(VRINTZ, FPROUNDING_ZERO)
 DO_VRINT(VRINTM, FPROUNDING_NEGINF)
 DO_VRINT(VRINTP, FPROUNDING_POSINF)
 
-static bool do_vcvt(DisasContext *s, arg_2misc *a, int rmode, bool is_signed)
-{
-    /*
-     * Handle a VCVT* operation by iterating 32 bits at a time,
-     * with a specified rounding mode in operation.
-     */
-    int pass;
-    TCGv_ptr fpst;
-    TCGv_i32 tcg_rmode, tcg_shift;
-
-    if (!arm_dc_feature(s, ARM_FEATURE_NEON) ||
-        !arm_dc_feature(s, ARM_FEATURE_V8)) {
-        return false;
+#define DO_VEC_RMODE(INSN, RMODE, OP) \
+    static void gen_##INSN(unsigned vece, uint32_t rd_ofs, \
+                           uint32_t rm_ofs, \
+                           uint32_t oprsz, uint32_t maxsz) \
+    { \
+        static gen_helper_gvec_2_ptr * const fns[4] = { \
+            NULL, \
+            gen_helper_gvec_##OP##h, \
+            gen_helper_gvec_##OP##s, \
+            NULL, \
+        }; \
+        TCGv_ptr fpst; \
+        fpst = fpstatus_ptr(vece == 1 ? FPST_STD_F16 : FPST_STD); \
+        tcg_gen_gvec_2_ptr(rd_ofs, rm_ofs, fpst, oprsz, maxsz, \
+                           arm_rmode_to_sf(RMODE), fns[vece]); \
+        tcg_temp_free_ptr(fpst); \
+    } \
+    static bool trans_##INSN(DisasContext *s, arg_2misc *a) \
+    { \
+        if (!arm_dc_feature(s, ARM_FEATURE_V8)) { \
+            return false; \
+        } \
+        if (a->size == MO_16) { \
+            if (!dc_isar_feature(aa32_fp16_arith, s)) { \
+                return false; \
+            } \
+        } else if (a->size != MO_32) { \
+            return false; \
+        } \
+        return do_2misc_vec(s, a, gen_##INSN); \
     }
 
-    /* UNDEF accesses to D16-D31 if they don't exist. */
-    if (!dc_isar_feature(aa32_simd_r32, s) &&
-        ((a->vd | a->vm) & 0x10)) {
-        return false;
-    }
-
-    if (a->size != 2) {
-        /* TODO: FP16 will be the size == 1 case */
-        return false;
-    }
-
-    if ((a->vd | a->vm) & a->q) {
-        return false;
-    }
-
-    if (!vfp_access_check(s)) {
-        return true;
-    }
-
-    fpst = fpstatus_ptr(FPST_STD);
-    tcg_shift = tcg_const_i32(0);
-    tcg_rmode = tcg_const_i32(arm_rmode_to_sf(rmode));
-    gen_helper_set_neon_rmode(tcg_rmode, tcg_rmode, cpu_env);
-    for (pass = 0; pass < (a->q ? 4 : 2); pass++) {
-        TCGv_i32 tmp = neon_load_reg(a->vm, pass);
-        if (is_signed) {
-            gen_helper_vfp_tosls(tmp, tmp, tcg_shift, fpst);
-        } else {
-            gen_helper_vfp_touls(tmp, tmp, tcg_shift, fpst);
-        }
-        neon_store_reg(a->vd, pass, tmp);
-    }
-    gen_helper_set_neon_rmode(tcg_rmode, tcg_rmode, cpu_env);
-    tcg_temp_free_i32(tcg_rmode);
-    tcg_temp_free_i32(tcg_shift);
-    tcg_temp_free_ptr(fpst);
-
-    return true;
-}
-
-#define DO_VCVT(INSN, RMODE, SIGNED) \
-    static bool trans_##INSN(DisasContext *s, arg_2misc *a) \
-    { \
-        return do_vcvt(s, a, RMODE, SIGNED); \
-    }
-
-DO_VCVT(VCVTAU, FPROUNDING_TIEAWAY, false)
-DO_VCVT(VCVTAS, FPROUNDING_TIEAWAY, true)
-DO_VCVT(VCVTNU, FPROUNDING_TIEEVEN, false)
-DO_VCVT(VCVTNS, FPROUNDING_TIEEVEN, true)
-DO_VCVT(VCVTPU, FPROUNDING_POSINF, false)
-DO_VCVT(VCVTPS, FPROUNDING_POSINF, true)
-DO_VCVT(VCVTMU, FPROUNDING_NEGINF, false)
-DO_VCVT(VCVTMS, FPROUNDING_NEGINF, true)
+DO_VEC_RMODE(VCVTAU, FPROUNDING_TIEAWAY, vcvt_rm_u)
+DO_VEC_RMODE(VCVTAS, FPROUNDING_TIEAWAY, vcvt_rm_s)
+DO_VEC_RMODE(VCVTNU, FPROUNDING_TIEEVEN, vcvt_rm_u)
+DO_VEC_RMODE(VCVTNS, FPROUNDING_TIEEVEN, vcvt_rm_s)
+DO_VEC_RMODE(VCVTPU, FPROUNDING_POSINF, vcvt_rm_u)
+DO_VEC_RMODE(VCVTPS, FPROUNDING_POSINF, vcvt_rm_s)
+DO_VEC_RMODE(VCVTMU, FPROUNDING_NEGINF, vcvt_rm_u)
+DO_VEC_RMODE(VCVTMS, FPROUNDING_NEGINF, vcvt_rm_s)
 
 static bool trans_VSWP(DisasContext *s, arg_2misc *a)
 {
-- 
2.20.1

The A64 AdvSIMD modified-immediate grouping uses almost the same
constant encoding that A32 Neon does; reuse asimd_imm_const() (to
which we add the AArch64-specific case for cmode 15 op 1) instead of
reimplementing it all.

Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Message-id: 20210628135835.6690-5-peter.maydell@linaro.org
---
 target/arm/translate.h     |  3 +-
 target/arm/translate-a64.c | 86 ++++----------------------------------
 target/arm/translate.c     | 17 +++++++-
 3 files changed, 24 insertions(+), 82 deletions(-)

diff --git a/target/arm/translate.h b/target/arm/translate.h
index XXXXXXX..XXXXXXX 100644
--- a/target/arm/translate.h
+++ b/target/arm/translate.h
@@ -XXX,XX +XXX,XX @@ static inline MemOp finalize_memop(DisasContext *s, MemOp opc)
  * VMVN and VBIC (when cmode < 14 && op == 1).
  *
  * The combination cmode == 15 op == 1 is a reserved encoding for AArch32;
- * callers must catch this.
+ * callers must catch this; we return the 64-bit constant value defined
+ * for AArch64.
  *
  * cmode = 2,3,4,5,6,7,10,11,12,13 imm=0 was UNPREDICTABLE in v7A but
  * is either not unpredictable or merely CONSTRAINED UNPREDICTABLE in v8A;
diff --git a/target/arm/translate-a64.c b/target/arm/translate-a64.c
index XXXXXXX..XXXXXXX 100644
--- a/target/arm/translate-a64.c
+++ b/target/arm/translate-a64.c
@@ -XXX,XX +XXX,XX @@ static void disas_simd_mod_imm(DisasContext *s, uint32_t insn)
 {
     int rd = extract32(insn, 0, 5);
     int cmode = extract32(insn, 12, 4);
-    int cmode_3_1 = extract32(cmode, 1, 3);
-    int cmode_0 = extract32(cmode, 0, 1);
     int o2 = extract32(insn, 11, 1);
     uint64_t abcdefgh = extract32(insn, 5, 5) | (extract32(insn, 16, 3) << 5);
     bool is_neg = extract32(insn, 29, 1);
@@ -XXX,XX +XXX,XX @@ static void disas_simd_mod_imm(DisasContext *s, uint32_t insn)
         return;
     }
 
-    /* See AdvSIMDExpandImm() in ARM ARM */
-    switch (cmode_3_1) {
-    case 0: /* Replicate(Zeros(24):imm8, 2) */
-    case 1: /* Replicate(Zeros(16):imm8:Zeros(8), 2) */
-    case 2: /* Replicate(Zeros(8):imm8:Zeros(16), 2) */
-    case 3: /* Replicate(imm8:Zeros(24), 2) */
-    {
-        int shift = cmode_3_1 * 8;
-        imm = bitfield_replicate(abcdefgh << shift, 32);
-        break;
-    }
-    case 4: /* Replicate(Zeros(8):imm8, 4) */
-    case 5: /* Replicate(imm8:Zeros(8), 4) */
-    {
-        int shift = (cmode_3_1 & 0x1) * 8;
-        imm = bitfield_replicate(abcdefgh << shift, 16);
-        break;
-    }
-    case 6:
-        if (cmode_0) {
-            /* Replicate(Zeros(8):imm8:Ones(16), 2) */
-            imm = (abcdefgh << 16) | 0xffff;
-        } else {
-            /* Replicate(Zeros(16):imm8:Ones(8), 2) */
-            imm = (abcdefgh << 8) | 0xff;
-        }
-        imm = bitfield_replicate(imm, 32);
-        break;
-    case 7:
-        if (!cmode_0 && !is_neg) {
-            imm = bitfield_replicate(abcdefgh, 8);
-        } else if (!cmode_0 && is_neg) {
-            int i;
-            imm = 0;
-            for (i = 0; i < 8; i++) {
-                if ((abcdefgh) & (1 << i)) {
-                    imm |= 0xffULL << (i * 8);
-                }
-            }
-        } else if (cmode_0) {
-            if (is_neg) {
-                imm = (abcdefgh & 0x3f) << 48;
-                if (abcdefgh & 0x80) {
-                    imm |= 0x8000000000000000ULL;
-                }
-                if (abcdefgh & 0x40) {
-                    imm |= 0x3fc0000000000000ULL;
-                } else {
-                    imm |= 0x4000000000000000ULL;
-                }
-            } else {
-                if (o2) {
-                    /* FMOV (vector, immediate) - half-precision */
-                    imm = vfp_expand_imm(MO_16, abcdefgh);
-                    /* now duplicate across the lanes */
-                    imm = bitfield_replicate(imm, 16);
-                } else {
-                    imm = (abcdefgh & 0x3f) << 19;
-                    if (abcdefgh & 0x80) {
-                        imm |= 0x80000000;
-                    }
-                    if (abcdefgh & 0x40) {
-                        imm |= 0x3e000000;
-                    } else {
-                        imm |= 0x40000000;
-                    }
-                    imm |= (imm << 32);
-                }
-            }
-        }
-        break;
-    default:
-        g_assert_not_reached();
-    }
-
-    if (cmode_3_1 != 7 && is_neg) {
-        imm = ~imm;
+    if (cmode == 15 && o2 && !is_neg) {
+        /* FMOV (vector, immediate) - half-precision */
+        imm = vfp_expand_imm(MO_16, abcdefgh);
+        /* now duplicate across the lanes */
+        imm = bitfield_replicate(imm, 16);
+    } else {
+        imm = asimd_imm_const(abcdefgh, cmode, is_neg);
     }
 
     if (!((cmode & 0x9) == 0x1 || (cmode & 0xd) == 0x9)) {
diff --git a/target/arm/translate.c b/target/arm/translate.c
index XXXXXXX..XXXXXXX 100644
--- a/target/arm/translate.c
+++ b/target/arm/translate.c
@@ -XXX,XX +XXX,XX @@ uint64_t asimd_imm_const(uint32_t imm, int cmode, int op)
     case 14:
         if (op) {
             /*
-             * This is the only case where the top and bottom 32 bits
-             * of the encoded constant differ.
+             * This and cmode == 15 op == 1 are the only cases where
+             * the top and bottom 32 bits of the encoded constant differ.
              */
             uint64_t imm64 = 0;
             int n;
@@ -XXX,XX +XXX,XX @@ uint64_t asimd_imm_const(uint32_t imm, int cmode, int op)
         imm |= (imm << 8) | (imm << 16) | (imm << 24);
         break;
     case 15:
+        if (op) {
+            /* Reserved encoding for AArch32; valid for AArch64 */
+            uint64_t imm64 = (uint64_t)(imm & 0x3f) << 48;
+            if (imm & 0x80) {
+                imm64 |= 0x8000000000000000ULL;
+            }
+            if (imm & 0x40) {
+                imm64 |= 0x3fc0000000000000ULL;
+            } else {
+                imm64 |= 0x4000000000000000ULL;
+            }
+            return imm64;
+        }
         imm = ((imm & 0x80) << 24) | ((imm & 0x3f) << 19)
             | ((imm & 0x40) ? (0x1f << 25) : (1 << 30));
         break;
-- 
2.20.1
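As a cross-check of the new cmode 15 op 1 arm of asimd_imm_const(): it
builds the AArch64 FMOV (vector, immediate) double-precision bit pattern
from the 8-bit immediate. A standalone sketch of just that arm, with a
couple of sample expansions (plain C, outside QEMU):

#include <stdio.h>
#include <stdint.h>

/* the cmode 15, op 1 expansion added to asimd_imm_const() above */
static uint64_t expand_fmov64(uint32_t imm)   /* imm is abcdefgh */
{
    uint64_t imm64 = (uint64_t)(imm & 0x3f) << 48;
    if (imm & 0x80) {
        imm64 |= 0x8000000000000000ULL;
    }
    if (imm & 0x40) {
        imm64 |= 0x3fc0000000000000ULL;
    } else {
        imm64 |= 0x4000000000000000ULL;
    }
    return imm64;
}

int main(void)
{
    /* 0x70 expands to 0x3ff0000000000000, i.e. double 1.0 */
    printf("imm8=0x70 -> 0x%016llx\n", (unsigned long long)expand_fmov64(0x70));
    /* 0x00 expands to 0x4000000000000000, i.e. double 2.0 */
    printf("imm8=0x00 -> 0x%016llx\n", (unsigned long long)expand_fmov64(0x00));
    return 0;
}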
Convert the Neon floating-point VMUL, VMLA and VMLS to use gvec,
and use this to implement fp16 support.

Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Message-id: 20200828183354.27913-45-peter.maydell@linaro.org
---
 target/arm/translate-neon.c.inc | 114 ++++++++++++++++----------------
 1 file changed, 57 insertions(+), 57 deletions(-)

diff --git a/target/arm/translate-neon.c.inc b/target/arm/translate-neon.c.inc
index XXXXXXX..XXXXXXX 100644
--- a/target/arm/translate-neon.c.inc
+++ b/target/arm/translate-neon.c.inc
@@ -XXX,XX +XXX,XX @@ static bool trans_VMLS_2sc(DisasContext *s, arg_2scalar *a)
     return do_2scalar(s, a, opfn[a->size], accfn[a->size]);
 }
 
-/*
- * Rather than have a float-specific version of do_2scalar just for
- * three insns, we wrap a NeonGenTwoSingleOpFn to turn it into
- * a NeonGenTwoOpFn.
- */
-#define WRAP_FP_FN(WRAPNAME, FUNC) \
-    static void WRAPNAME(TCGv_i32 rd, TCGv_i32 rn, TCGv_i32 rm) \
-    { \
-        TCGv_ptr fpstatus = fpstatus_ptr(FPST_STD); \
-        FUNC(rd, rn, rm, fpstatus); \
-        tcg_temp_free_ptr(fpstatus); \
+static bool do_2scalar_fp_vec(DisasContext *s, arg_2scalar *a,
+                              gen_helper_gvec_3_ptr *fn)
+{
+    /* Two registers and a scalar, using gvec */
+    int vec_size = a->q ? 16 : 8;
+    int rd_ofs = neon_reg_offset(a->vd, 0);
+    int rn_ofs = neon_reg_offset(a->vn, 0);
+    int rm_ofs;
+    int idx;
+    TCGv_ptr fpstatus;
+
+    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
+        return false;
     }
 
-WRAP_FP_FN(gen_VMUL_F_mul, gen_helper_vfp_muls)
-WRAP_FP_FN(gen_VMUL_F_add, gen_helper_vfp_adds)
-WRAP_FP_FN(gen_VMUL_F_sub, gen_helper_vfp_subs)
+    /* UNDEF accesses to D16-D31 if they don't exist. */
+    if (!dc_isar_feature(aa32_simd_r32, s) &&
+        ((a->vd | a->vn | a->vm) & 0x10)) {
+        return false;
+    }
 
-static bool trans_VMUL_F_2sc(DisasContext *s, arg_2scalar *a)
-{
-    static NeonGenTwoOpFn * const opfn[] = {
-        NULL,
-        NULL, /* TODO: fp16 support */
-        gen_VMUL_F_mul,
-        NULL,
-    };
+    if (!fn) {
+        /* Bad size (including size == 3, which is a different insn group) */
+        return false;
+    }
 
-    return do_2scalar(s, a, opfn[a->size], NULL);
+    if (a->q && ((a->vd | a->vn) & 1)) {
+        return false;
+    }
+
+    if (!vfp_access_check(s)) {
+        return true;
+    }
+
+    /* a->vm is M:Vm, which encodes both register and index */
+    idx = extract32(a->vm, a->size + 2, 2);
+    a->vm = extract32(a->vm, 0, a->size + 2);
+    rm_ofs = neon_reg_offset(a->vm, 0);
+
+    fpstatus = fpstatus_ptr(a->size == 1 ? FPST_STD_F16 : FPST_STD);
+    tcg_gen_gvec_3_ptr(rd_ofs, rn_ofs, rm_ofs, fpstatus,
+                       vec_size, vec_size, idx, fn);
+    tcg_temp_free_ptr(fpstatus);
+    return true;
 }
 
-static bool trans_VMLA_F_2sc(DisasContext *s, arg_2scalar *a)
-{
-    static NeonGenTwoOpFn * const opfn[] = {
-        NULL,
-        NULL, /* TODO: fp16 support */
-        gen_VMUL_F_mul,
-        NULL,
-    };
-    static NeonGenTwoOpFn * const accfn[] = {
-        NULL,
-        NULL, /* TODO: fp16 support */
-        gen_VMUL_F_add,
-        NULL,
-    };
+#define DO_VMUL_F_2sc(NAME, FUNC) \
+    static bool trans_##NAME##_F_2sc(DisasContext *s, arg_2scalar *a) \
+    { \
+        static gen_helper_gvec_3_ptr * const opfn[] = { \
+            NULL, \
+            gen_helper_##FUNC##_h, \
+            gen_helper_##FUNC##_s, \
+            NULL, \
+        }; \
+        if (a->size == MO_16 && !dc_isar_feature(aa32_fp16_arith, s)) { \
+            return false; \
+        } \
+        return do_2scalar_fp_vec(s, a, opfn[a->size]); \
+    }
 
-    return do_2scalar(s, a, opfn[a->size], accfn[a->size]);
-}
-
-static bool trans_VMLS_F_2sc(DisasContext *s, arg_2scalar *a)
-{
-    static NeonGenTwoOpFn * const opfn[] = {
-        NULL,
-        NULL, /* TODO: fp16 support */
-        gen_VMUL_F_mul,
-        NULL,
-    };
-    static NeonGenTwoOpFn * const accfn[] = {
-        NULL,
-        NULL, /* TODO: fp16 support */
-        gen_VMUL_F_sub,
-        NULL,
-    };
-
-    return do_2scalar(s, a, opfn[a->size], accfn[a->size]);
-}
+DO_VMUL_F_2sc(VMUL, gvec_fmul_idx)
+DO_VMUL_F_2sc(VMLA, gvec_fmla_nf_idx)
+DO_VMUL_F_2sc(VMLS, gvec_fmls_nf_idx)
 
 WRAP_ENV_FN(gen_VQDMULH_16, gen_helper_neon_qdmulh_s16)
 WRAP_ENV_FN(gen_VQDMULH_32, gen_helper_neon_qdmulh_s32)
-- 
2.20.1

Use dup_const() instead of bitfield_replicate() in
disas_simd_mod_imm().

(We can't replace the other use of bitfield_replicate() in this file,
in logic_imm_decode_wmask(), because that location needs to handle 2
and 4 bit elements, which dup_const() cannot.)

Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Message-id: 20210628135835.6690-6-peter.maydell@linaro.org
---
 target/arm/translate-a64.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/target/arm/translate-a64.c b/target/arm/translate-a64.c
index XXXXXXX..XXXXXXX 100644
--- a/target/arm/translate-a64.c
+++ b/target/arm/translate-a64.c
@@ -XXX,XX +XXX,XX @@ static void disas_simd_mod_imm(DisasContext *s, uint32_t insn)
         /* FMOV (vector, immediate) - half-precision */
         imm = vfp_expand_imm(MO_16, abcdefgh);
         /* now duplicate across the lanes */
-        imm = bitfield_replicate(imm, 16);
+        imm = dup_const(MO_16, imm);
     } else {
         imm = asimd_imm_const(abcdefgh, cmode, is_neg);
     }
-- 
2.20.1
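dup_const(MO_16, imm) replicates a 16-bit value into every 16-bit lane of
a 64-bit constant. The multiply-by-0x0001000100010001 trick below is one
way to picture what it computes (a sketch in plain C, not necessarily how
QEMU implements it):

#include <stdio.h>
#include <stdint.h>

static uint64_t dup16(uint64_t x)
{
    /* each set bit of the multiplier copies the low 16 bits up one lane */
    return (x & 0xffff) * 0x0001000100010001ULL;
}

int main(void)
{
    /* 0x3c00 is fp16 1.0; expect it in all four lanes */
    printf("0x%016llx\n", (unsigned long long)dup16(0x3c00));
    return 0;
}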
Implement the fp16 versions of the VFP VCVT instruction forms which
convert between floating point and integer.

Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Message-id: 20200828183354.27913-13-peter.maydell@linaro.org
---
 target/arm/vfp.decode          |  4 +++
 target/arm/translate-vfp.c.inc | 65 ++++++++++++++++++++++++++++++++++
 2 files changed, 69 insertions(+)

diff --git a/target/arm/vfp.decode b/target/arm/vfp.decode
index XXXXXXX..XXXXXXX 100644
--- a/target/arm/vfp.decode
+++ b/target/arm/vfp.decode
@@ -XXX,XX +XXX,XX @@ VCVT_sp ---- 1110 1.11 0111 .... 1010 11.0 .... @vfp_dm_ds
 VCVT_dp ---- 1110 1.11 0111 .... 1011 11.0 .... @vfp_dm_sd
 
 # VCVT from integer to floating point: Vm always single; Vd depends on size
+VCVT_int_hp ---- 1110 1.11 1000 .... 1001 s:1 1.0 .... \
+    vd=%vd_sp vm=%vm_sp
 VCVT_int_sp ---- 1110 1.11 1000 .... 1010 s:1 1.0 .... \
     vd=%vd_sp vm=%vm_sp
 VCVT_int_dp ---- 1110 1.11 1000 .... 1011 s:1 1.0 .... \
@@ -XXX,XX +XXX,XX @@ VCVT_fix_dp ---- 1110 1.11 1.1. .... 1011 .1.0 .... \
     vd=%vd_dp imm=%vm_sp opc=%vcvt_fix_op
 
 # VCVT float to integer (VCVT and VCVTR): Vd always single; Vm depends on size
+VCVT_hp_int ---- 1110 1.11 110 s:1 .... 1001 rz:1 1.0 .... \
+    vd=%vd_sp vm=%vm_sp
 VCVT_sp_int ---- 1110 1.11 110 s:1 .... 1010 rz:1 1.0 .... \
     vd=%vd_sp vm=%vm_sp
 VCVT_dp_int ---- 1110 1.11 110 s:1 .... 1011 rz:1 1.0 .... \
diff --git a/target/arm/translate-vfp.c.inc b/target/arm/translate-vfp.c.inc
index XXXXXXX..XXXXXXX 100644
--- a/target/arm/translate-vfp.c.inc
+++ b/target/arm/translate-vfp.c.inc
@@ -XXX,XX +XXX,XX @@ static bool trans_VCVT_dp(DisasContext *s, arg_VCVT_dp *a)
     return true;
 }
 
+static bool trans_VCVT_int_hp(DisasContext *s, arg_VCVT_int_sp *a)
+{
+    TCGv_i32 vm;
+    TCGv_ptr fpst;
+
+    if (!dc_isar_feature(aa32_fp16_arith, s)) {
+        return false;
+    }
+
+    if (!vfp_access_check(s)) {
+        return true;
+    }
+
+    vm = tcg_temp_new_i32();
+    neon_load_reg32(vm, a->vm);
+    fpst = fpstatus_ptr(FPST_FPCR_F16);
+    if (a->s) {
+        /* i32 -> f16 */
+        gen_helper_vfp_sitoh(vm, vm, fpst);
+    } else {
+        /* u32 -> f16 */
+        gen_helper_vfp_uitoh(vm, vm, fpst);
+    }
+    neon_store_reg32(vm, a->vd);
+    tcg_temp_free_i32(vm);
+    tcg_temp_free_ptr(fpst);
+    return true;
+}
+
 static bool trans_VCVT_int_sp(DisasContext *s, arg_VCVT_int_sp *a)
 {
     TCGv_i32 vm;
@@ -XXX,XX +XXX,XX @@ static bool trans_VCVT_fix_dp(DisasContext *s, arg_VCVT_fix_dp *a)
     return true;
 }
 
+static bool trans_VCVT_hp_int(DisasContext *s, arg_VCVT_sp_int *a)
+{
+    TCGv_i32 vm;
+    TCGv_ptr fpst;
+
+    if (!dc_isar_feature(aa32_fp16_arith, s)) {
+        return false;
+    }
+
+    if (!vfp_access_check(s)) {
+        return true;
+    }
+
+    fpst = fpstatus_ptr(FPST_FPCR_F16);
+    vm = tcg_temp_new_i32();
+    neon_load_reg32(vm, a->vm);
+
+    if (a->s) {
+        if (a->rz) {
+            gen_helper_vfp_tosizh(vm, vm, fpst);
+        } else {
+            gen_helper_vfp_tosih(vm, vm, fpst);
+        }
+    } else {
+        if (a->rz) {
+            gen_helper_vfp_touizh(vm, vm, fpst);
+        } else {
+            gen_helper_vfp_touih(vm, vm, fpst);
+        }
+    }
+    neon_store_reg32(vm, a->vd);
+    tcg_temp_free_i32(vm);
+    tcg_temp_free_ptr(fpst);
+    return true;
+}
+
 static bool trans_VCVT_sp_int(DisasContext *s, arg_VCVT_sp_int *a)
 {
     TCGv_i32 vm;
-- 
2.20.1

Implement the MVE logical-immediate insns (VMOV, VMVN,
VORR and VBIC). These have essentially the same encoding
as their Neon equivalents, and we implement the decode
in the same way.

Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Message-id: 20210628135835.6690-7-peter.maydell@linaro.org
---
 target/arm/helper-mve.h    |  4 +++
 target/arm/mve.decode      | 17 +++++++++++++
 target/arm/mve_helper.c    | 24 ++++++++++++++++++
 target/arm/translate-mve.c | 50 ++++++++++++++++++++++++++++++++++++++
 4 files changed, 95 insertions(+)

diff --git a/target/arm/helper-mve.h b/target/arm/helper-mve.h
index XXXXXXX..XXXXXXX 100644
--- a/target/arm/helper-mve.h
+++ b/target/arm/helper-mve.h
@@ -XXX,XX +XXX,XX @@ DEF_HELPER_FLAGS_3(mve_vaddvsh, TCG_CALL_NO_WG, i32, env, ptr, i32)
 DEF_HELPER_FLAGS_3(mve_vaddvuh, TCG_CALL_NO_WG, i32, env, ptr, i32)
 DEF_HELPER_FLAGS_3(mve_vaddvsw, TCG_CALL_NO_WG, i32, env, ptr, i32)
 DEF_HELPER_FLAGS_3(mve_vaddvuw, TCG_CALL_NO_WG, i32, env, ptr, i32)
+
+DEF_HELPER_FLAGS_3(mve_vmovi, TCG_CALL_NO_WG, void, env, ptr, i64)
+DEF_HELPER_FLAGS_3(mve_vandi, TCG_CALL_NO_WG, void, env, ptr, i64)
+DEF_HELPER_FLAGS_3(mve_vorri, TCG_CALL_NO_WG, void, env, ptr, i64)
diff --git a/target/arm/mve.decode b/target/arm/mve.decode
index XXXXXXX..XXXXXXX 100644
--- a/target/arm/mve.decode
+++ b/target/arm/mve.decode
@@ -XXX,XX +XXX,XX @@
 # VQDMULL has size in bit 28: 0 for 16 bit, 1 for 32 bit
 %size_28 28:1 !function=plus_1
 
+# 1imm format immediate
+%imm_28_16_0 28:1 16:3 0:4
+
 &vldr_vstr rn qd imm p a w size l u
 &1op qd qm size
 &2op qd qm qn size
 &2scalar qd qn rm size
+&1imm qd imm cmode op
 
 @vldr_vstr ....... . . . . l:1 rn:4 ... ...... imm:7 &vldr_vstr qd=%qd u=0
 # Note that both Rn and Qd are 3 bits only (no D bit)
@@ -XXX,XX +XXX,XX @@
 @2op_nosz .... .... .... .... .... .... .... .... &2op qd=%qd qm=%qm qn=%qn size=0
 @2op_sz28 .... .... .... .... .... .... .... .... &2op qd=%qd qm=%qm qn=%qn \
      size=%size_28
+@1imm .... .... .... .... .... cmode:4 .. op:1 . .... &1imm qd=%qd imm=%imm_28_16_0
 
 # The _rev suffix indicates that Vn and Vm are reversed. This is
 # the case for shifts. In the Arm ARM these insns are documented
@@ -XXX,XX +XXX,XX @@ VADDV 111 u:1 1110 1111 size:2 01 ... 0 1111 0 0 a:1 0 qm:3 0 rda=%rd
 # Predicate operations
 %mask_22_13 22:1 13:3
 VPST 1111 1110 0 . 11 000 1 ... 0 1111 0100 1101 mask=%mask_22_13
+
+# Logical immediate operations (1 reg and modified-immediate)
+
+# The cmode/op bits here decode VORR/VBIC/VMOV/VMVN, but
+# not in a way we can conveniently represent in decodetree without
+# a lot of repetition:
+# VORR: op=0, (cmode & 1) && cmode < 12
+# VBIC: op=1, (cmode & 1) && cmode < 12
+# VMOV: everything else
+# So we have a single decode line and check the cmode/op in the
+# trans function.
+Vimm_1r 111 . 1111 1 . 00 0 ... ... 0 .... 0 1 . 1 .... @1imm
diff --git a/target/arm/mve_helper.c b/target/arm/mve_helper.c
index XXXXXXX..XXXXXXX 100644
--- a/target/arm/mve_helper.c
+++ b/target/arm/mve_helper.c
@@ -XXX,XX +XXX,XX @@ DO_1OP(vnegw, 4, int32_t, DO_NEG)
 DO_1OP(vfnegh, 8, uint64_t, DO_FNEGH)
 DO_1OP(vfnegs, 8, uint64_t, DO_FNEGS)
 
+/*
+ * 1 operand immediates: Vda is destination and possibly also one source.
+ * All these insns work at 64-bit widths.
+ */
+#define DO_1OP_IMM(OP, FN) \
+    void HELPER(mve_##OP)(CPUARMState *env, void *vda, uint64_t imm) \
+    { \
+        uint64_t *da = vda; \
+        uint16_t mask = mve_element_mask(env); \
+        unsigned e; \
+        for (e = 0; e < 16 / 8; e++, mask >>= 8) { \
+            mergemask(&da[H8(e)], FN(da[H8(e)], imm), mask); \
+        } \
+        mve_advance_vpt(env); \
+    }
+
+#define DO_MOVI(N, I) (I)
+#define DO_ANDI(N, I) ((N) & (I))
+#define DO_ORRI(N, I) ((N) | (I))
+
+DO_1OP_IMM(vmovi, DO_MOVI)
+DO_1OP_IMM(vandi, DO_ANDI)
+DO_1OP_IMM(vorri, DO_ORRI)
+
 #define DO_2OP(OP, ESIZE, TYPE, FN) \
     void HELPER(glue(mve_, OP))(CPUARMState *env, \
                                 void *vd, void *vn, void *vm) \
diff --git a/target/arm/translate-mve.c b/target/arm/translate-mve.c
index XXXXXXX..XXXXXXX 100644
--- a/target/arm/translate-mve.c
+++ b/target/arm/translate-mve.c
@@ -XXX,XX +XXX,XX @@ typedef void MVEGenTwoOpFn(TCGv_ptr, TCGv_ptr, TCGv_ptr, TCGv_ptr);
 typedef void MVEGenTwoOpScalarFn(TCGv_ptr, TCGv_ptr, TCGv_ptr, TCGv_i32);
 typedef void MVEGenDualAccOpFn(TCGv_i64, TCGv_ptr, TCGv_ptr, TCGv_ptr, TCGv_i64);
 typedef void MVEGenVADDVFn(TCGv_i32, TCGv_ptr, TCGv_ptr, TCGv_i32);
+typedef void MVEGenOneOpImmFn(TCGv_ptr, TCGv_ptr, TCGv_i64);
 
 /* Return the offset of a Qn register (same semantics as aa32_vfp_qreg()) */
 static inline long mve_qreg_offset(unsigned reg)
@@ -XXX,XX +XXX,XX @@ static bool trans_VADDV(DisasContext *s, arg_VADDV *a)
     mve_update_eci(s);
     return true;
 }
+
+static bool do_1imm(DisasContext *s, arg_1imm *a, MVEGenOneOpImmFn *fn)
+{
+    TCGv_ptr qd;
+    uint64_t imm;
+
+    if (!dc_isar_feature(aa32_mve, s) ||
+        !mve_check_qreg_bank(s, a->qd) ||
+        !fn) {
+        return false;
+    }
+    if (!mve_eci_check(s) || !vfp_access_check(s)) {
+        return true;
+    }
+
+    imm = asimd_imm_const(a->imm, a->cmode, a->op);
+
+    qd = mve_qreg_ptr(a->qd);
+    fn(cpu_env, qd, tcg_constant_i64(imm));
+    tcg_temp_free_ptr(qd);
+    mve_update_eci(s);
+    return true;
+}
+
+static bool trans_Vimm_1r(DisasContext *s, arg_1imm *a)
+{
+    /* Handle decode of cmode/op here between VORR/VBIC/VMOV */
+    MVEGenOneOpImmFn *fn;
+
+    if ((a->cmode & 1) && a->cmode < 12) {
+        if (a->op) {
+            /*
+             * For op=1, the immediate will be inverted by asimd_imm_const(),
+             * so the VBIC becomes a logical AND operation.
+             */
+            fn = gen_helper_mve_vandi;
+        } else {
+            fn = gen_helper_mve_vorri;
+        }
+    } else {
+        /* There is one unallocated cmode/op combination in this space */
+        if (a->cmode == 15 && a->op == 1) {
+            return false;
+        }
+        /* asimd_imm_const() sorts out VMVNI vs VMOVI for us */
+        fn = gen_helper_mve_vmovi;
+    }
+    return do_1imm(s, a, fn);
+}
-- 
2.20.1
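The cmode/op decode rules quoted in the mve.decode comment above are
compact enough to table out. A standalone sketch in plain C that
enumerates the whole space (illustrative only, not the QEMU decoder):

#include <stdio.h>

static const char *mve_1imm_kind(int cmode, int op)
{
    if ((cmode & 1) && cmode < 12) {
        return op ? "VBIC (AND with inverted imm)" : "VORR";
    }
    if (cmode == 15 && op == 1) {
        return "UNALLOCATED";
    }
    return "VMOV/VMVN (asimd_imm_const sorts out which)";
}

int main(void)
{
    for (int cmode = 0; cmode < 16; cmode++) {
        for (int op = 0; op < 2; op++) {
            printf("cmode=%2d op=%d: %s\n", cmode, op,
                   mve_1imm_kind(cmode, op));
        }
    }
    return 0;
}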
Implement the fp16 versions of the VFP VLDR/VSTR (immediate).

Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Message-id: 20200828183354.27913-12-peter.maydell@linaro.org
---
 target/arm/vfp.decode          |  3 +--
 target/arm/translate-vfp.c.inc | 35 ++++++++++++++++++++++++++++++++++
 2 files changed, 36 insertions(+), 2 deletions(-)

diff --git a/target/arm/vfp.decode b/target/arm/vfp.decode
index XXXXXXX..XXXXXXX 100644
--- a/target/arm/vfp.decode
+++ b/target/arm/vfp.decode
@@ -XXX,XX +XXX,XX @@ VMOV_single ---- 1110 000 l:1 .... rt:4 1010 . 001 0000 vn=%vn_sp
 VMOV_64_sp ---- 1100 010 op:1 rt2:4 rt:4 1010 00.1 .... vm=%vm_sp
 VMOV_64_dp ---- 1100 010 op:1 rt2:4 rt:4 1011 00.1 .... vm=%vm_dp
 
-# Note that the half-precision variants of VLDR and VSTR are
-# not part of this decodetree at all because they have bits [9:8] == 0b01
+VLDR_VSTR_hp ---- 1101 u:1 .0 l:1 rn:4 .... 1001 imm:8 vd=%vd_sp
 VLDR_VSTR_sp ---- 1101 u:1 .0 l:1 rn:4 .... 1010 imm:8 vd=%vd_sp
 VLDR_VSTR_dp ---- 1101 u:1 .0 l:1 rn:4 .... 1011 imm:8 vd=%vd_dp
 
diff --git a/target/arm/translate-vfp.c.inc b/target/arm/translate-vfp.c.inc
index XXXXXXX..XXXXXXX 100644
--- a/target/arm/translate-vfp.c.inc
+++ b/target/arm/translate-vfp.c.inc
@@ -XXX,XX +XXX,XX @@ static bool trans_VMOV_64_dp(DisasContext *s, arg_VMOV_64_dp *a)
     return true;
 }
 
+static bool trans_VLDR_VSTR_hp(DisasContext *s, arg_VLDR_VSTR_sp *a)
+{
+    uint32_t offset;
+    TCGv_i32 addr, tmp;
+
+    if (!dc_isar_feature(aa32_fp16_arith, s)) {
+        return false;
+    }
+
+    if (!vfp_access_check(s)) {
+        return true;
+    }
+
+    /* imm8 field is offset/2 for fp16, unlike fp32 and fp64 */
+    offset = a->imm << 1;
+    if (!a->u) {
+        offset = -offset;
+    }
+
+    /* For thumb, use of PC is UNPREDICTABLE. */
+    addr = add_reg_for_lit(s, a->rn, offset);
+    tmp = tcg_temp_new_i32();
+    if (a->l) {
+        gen_aa32_ld16u(s, tmp, addr, get_mem_index(s));
+        neon_store_reg32(tmp, a->vd);
+    } else {
+        neon_load_reg32(tmp, a->vd);
+        gen_aa32_st16(s, tmp, addr, get_mem_index(s));
+    }
+    tcg_temp_free_i32(tmp);
+    tcg_temp_free_i32(addr);
+
+    return true;
+}
+
 static bool trans_VLDR_VSTR_sp(DisasContext *s, arg_VLDR_VSTR_sp *a)
 {
     uint32_t offset;
-- 
2.20.1

Implement the MVE shift-vector-left-by-immediate insns VSHL, VQSHL
and VQSHLU.

The size-and-immediate encoding here is the same as Neon, and we
handle it the same way neon-dp.decode does.

Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Message-id: 20210628135835.6690-8-peter.maydell@linaro.org
---
 target/arm/helper-mve.h    | 16 +++++++++++
 target/arm/mve.decode      | 23 +++++++++++++++
 target/arm/mve_helper.c    | 57 ++++++++++++++++++++++++++++++++++++++
 target/arm/translate-mve.c | 51 ++++++++++++++++++++++++++++++++++
 4 files changed, 147 insertions(+)

diff --git a/target/arm/helper-mve.h b/target/arm/helper-mve.h
index XXXXXXX..XXXXXXX 100644
--- a/target/arm/helper-mve.h
+++ b/target/arm/helper-mve.h
@@ -XXX,XX +XXX,XX @@ DEF_HELPER_FLAGS_3(mve_vaddvuw, TCG_CALL_NO_WG, i32, env, ptr, i32)
 DEF_HELPER_FLAGS_3(mve_vmovi, TCG_CALL_NO_WG, void, env, ptr, i64)
 DEF_HELPER_FLAGS_3(mve_vandi, TCG_CALL_NO_WG, void, env, ptr, i64)
 DEF_HELPER_FLAGS_3(mve_vorri, TCG_CALL_NO_WG, void, env, ptr, i64)
+
+DEF_HELPER_FLAGS_4(mve_vshli_ub, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(mve_vshli_uh, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(mve_vshli_uw, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_4(mve_vqshli_sb, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(mve_vqshli_sh, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(mve_vqshli_sw, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_4(mve_vqshli_ub, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(mve_vqshli_uh, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(mve_vqshli_uw, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_4(mve_vqshlui_sb, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(mve_vqshlui_sh, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(mve_vqshlui_sw, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
diff --git a/target/arm/mve.decode b/target/arm/mve.decode
index XXXXXXX..XXXXXXX 100644
--- a/target/arm/mve.decode
+++ b/target/arm/mve.decode
@@ -XXX,XX +XXX,XX @@
 &2op qd qm qn size
 &2scalar qd qn rm size
 &1imm qd imm cmode op
+&2shift qd qm shift size
 
 @vldr_vstr ....... . . . . l:1 rn:4 ... ...... imm:7 &vldr_vstr qd=%qd u=0
 # Note that both Rn and Qd are 3 bits only (no D bit)
@@ -XXX,XX +XXX,XX @@
 @2scalar .... .... .. size:2 .... .... .... .... rm:4 &2scalar qd=%qd qn=%qn
 @2scalar_nosz .... .... .... .... .... .... .... rm:4 &2scalar qd=%qd qn=%qn
 
+@2_shl_b .... .... .. 001 shift:3 .... .... .... .... &2shift qd=%qd qm=%qm size=0
+@2_shl_h .... .... .. 01 shift:4 .... .... .... .... &2shift qd=%qd qm=%qm size=1
+@2_shl_w .... .... .. 1 shift:5 .... .... .... .... &2shift qd=%qd qm=%qm size=2
+
 # Vector loads and stores
 
 # Widening loads and narrowing stores:
@@ -XXX,XX +XXX,XX @@ VPST 1111 1110 0 . 11 000 1 ... 0 1111 0100 1101 mask=%mask_22_13
 # So we have a single decode line and check the cmode/op in the
 # trans function.
 Vimm_1r 111 . 1111 1 . 00 0 ... ... 0 .... 0 1 . 1 .... @1imm
+
+# Shifts by immediate
+
+VSHLI 111 0 1111 1 . ... ... ... 0 0101 0 1 . 1 ... 0 @2_shl_b
+VSHLI 111 0 1111 1 . ... ... ... 0 0101 0 1 . 1 ... 0 @2_shl_h
+VSHLI 111 0 1111 1 . ... ... ... 0 0101 0 1 . 1 ... 0 @2_shl_w
+
+VQSHLI_S 111 0 1111 1 . ... ... ... 0 0111 0 1 . 1 ... 0 @2_shl_b
+VQSHLI_S 111 0 1111 1 . ... ... ... 0 0111 0 1 . 1 ... 0 @2_shl_h
+VQSHLI_S 111 0 1111 1 . ... ... ... 0 0111 0 1 . 1 ... 0 @2_shl_w
+
+VQSHLI_U 111 1 1111 1 . ... ... ... 0 0111 0 1 . 1 ... 0 @2_shl_b
+VQSHLI_U 111 1 1111 1 . ... ... ... 0 0111 0 1 . 1 ... 0 @2_shl_h
+VQSHLI_U 111 1 1111 1 . ... ... ... 0 0111 0 1 . 1 ... 0 @2_shl_w
+
+VQSHLUI 111 1 1111 1 . ... ... ... 0 0110 0 1 . 1 ... 0 @2_shl_b
+VQSHLUI 111 1 1111 1 . ... ... ... 0 0110 0 1 . 1 ... 0 @2_shl_h
+VQSHLUI 111 1 1111 1 . ... ... ... 0 0110 0 1 . 1 ... 0 @2_shl_w
diff --git a/target/arm/mve_helper.c b/target/arm/mve_helper.c
index XXXXXXX..XXXXXXX 100644
--- a/target/arm/mve_helper.c
+++ b/target/arm/mve_helper.c
@@ -XXX,XX +XXX,XX @@ DO_2OP_SAT(vqsubsw, 4, int32_t, DO_SQSUB_W)
     WRAP_QRSHL_HELPER(do_sqrshl_bhs, N, M, true, satp)
 #define DO_UQRSHL_OP(N, M, satp) \
     WRAP_QRSHL_HELPER(do_uqrshl_bhs, N, M, true, satp)
+#define DO_SUQSHL_OP(N, M, satp) \
+    WRAP_QRSHL_HELPER(do_suqrshl_bhs, N, M, false, satp)
 
 DO_2OP_SAT_S(vqshls, DO_SQSHL_OP)
 DO_2OP_SAT_U(vqshlu, DO_UQSHL_OP)
@@ -XXX,XX +XXX,XX @@ DO_VADDV(vaddvsw, 4, uint32_t)
 DO_VADDV(vaddvub, 1, uint8_t)
 DO_VADDV(vaddvuh, 2, uint16_t)
 DO_VADDV(vaddvuw, 4, uint32_t)
+
+/* Shifts by immediate */
+#define DO_2SHIFT(OP, ESIZE, TYPE, FN) \
+    void HELPER(glue(mve_, OP))(CPUARMState *env, void *vd, \
+                                void *vm, uint32_t shift) \
+    { \
+        TYPE *d = vd, *m = vm; \
+        uint16_t mask = mve_element_mask(env); \
+        unsigned e; \
+        for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) { \
+            mergemask(&d[H##ESIZE(e)], \
+                      FN(m[H##ESIZE(e)], shift), mask); \
+        } \
+        mve_advance_vpt(env); \
+    }
+
+#define DO_2SHIFT_SAT(OP, ESIZE, TYPE, FN) \
+    void HELPER(glue(mve_, OP))(CPUARMState *env, void *vd, \
+                                void *vm, uint32_t shift) \
+    { \
+        TYPE *d = vd, *m = vm; \
+        uint16_t mask = mve_element_mask(env); \
+        unsigned e; \
+        bool qc = false; \
+        for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) { \
+            bool sat = false; \
+            mergemask(&d[H##ESIZE(e)], \
+                      FN(m[H##ESIZE(e)], shift, &sat), mask); \
+            qc |= sat & mask & 1; \
+        } \
+        if (qc) { \
+            env->vfp.qc[0] = qc; \
+        } \
+        mve_advance_vpt(env); \
+    }
+
+/* provide unsigned 2-op shift helpers for all sizes */
+#define DO_2SHIFT_U(OP, FN) \
+    DO_2SHIFT(OP##b, 1, uint8_t, FN) \
+    DO_2SHIFT(OP##h, 2, uint16_t, FN) \
+    DO_2SHIFT(OP##w, 4, uint32_t, FN)
+
+#define DO_2SHIFT_SAT_U(OP, FN) \
+    DO_2SHIFT_SAT(OP##b, 1, uint8_t, FN) \
+    DO_2SHIFT_SAT(OP##h, 2, uint16_t, FN) \
+    DO_2SHIFT_SAT(OP##w, 4, uint32_t, FN)
+#define DO_2SHIFT_SAT_S(OP, FN) \
+    DO_2SHIFT_SAT(OP##b, 1, int8_t, FN) \
+    DO_2SHIFT_SAT(OP##h, 2, int16_t, FN) \
+    DO_2SHIFT_SAT(OP##w, 4, int32_t, FN)
+
+DO_2SHIFT_U(vshli_u, DO_VSHLU)
+DO_2SHIFT_SAT_U(vqshli_u, DO_UQSHL_OP)
+DO_2SHIFT_SAT_S(vqshli_s, DO_SQSHL_OP)
+DO_2SHIFT_SAT_S(vqshlui_s, DO_SUQSHL_OP)
diff --git a/target/arm/translate-mve.c b/target/arm/translate-mve.c
index XXXXXXX..XXXXXXX 100644
--- a/target/arm/translate-mve.c
+++ b/target/arm/translate-mve.c
@@ -XXX,XX +XXX,XX @@ typedef void MVEGenLdStFn(TCGv_ptr, TCGv_ptr, TCGv_i32);
 typedef void MVEGenOneOpFn(TCGv_ptr, TCGv_ptr, TCGv_ptr);
 typedef void MVEGenTwoOpFn(TCGv_ptr, TCGv_ptr, TCGv_ptr, TCGv_ptr);
 typedef void MVEGenTwoOpScalarFn(TCGv_ptr, TCGv_ptr, TCGv_ptr, TCGv_i32);
+typedef void MVEGenTwoOpShiftFn(TCGv_ptr, TCGv_ptr, TCGv_ptr, TCGv_i32);
 typedef void MVEGenDualAccOpFn(TCGv_i64, TCGv_ptr, TCGv_ptr, TCGv_ptr, TCGv_i64);
 typedef void MVEGenVADDVFn(TCGv_i32, TCGv_ptr, TCGv_ptr, TCGv_i32);
 typedef void MVEGenOneOpImmFn(TCGv_ptr, TCGv_ptr, TCGv_i64);
@@ -XXX,XX +XXX,XX @@ static bool trans_Vimm_1r(DisasContext *s, arg_1imm *a)
     }
     return do_1imm(s, a, fn);
 }
+
+static bool do_2shift(DisasContext *s, arg_2shift *a, MVEGenTwoOpShiftFn fn,
+                      bool negateshift)
+{
+    TCGv_ptr qd, qm;
+    int shift = a->shift;
+
+    if (!dc_isar_feature(aa32_mve, s) ||
+        !mve_check_qreg_bank(s, a->qd | a->qm) ||
+        !fn) {
+        return false;
+    }
+    if (!mve_eci_check(s) || !vfp_access_check(s)) {
+        return true;
+    }
+
+    /*
+     * When we handle a right shift insn using a left-shift helper
+     * which permits a negative shift count to indicate a right-shift,
+     * we must negate the shift count.
+     */
+    if (negateshift) {
+        shift = -shift;
+    }
+
+    qd = mve_qreg_ptr(a->qd);
+    qm = mve_qreg_ptr(a->qm);
+    fn(cpu_env, qd, qm, tcg_constant_i32(shift));
+    tcg_temp_free_ptr(qd);
+    tcg_temp_free_ptr(qm);
+    mve_update_eci(s);
+    return true;
+}
+
+#define DO_2SHIFT(INSN, FN, NEGATESHIFT) \
+    static bool trans_##INSN(DisasContext *s, arg_2shift *a) \
+    { \
+        static MVEGenTwoOpShiftFn * const fns[] = { \
+            gen_helper_mve_##FN##b, \
+            gen_helper_mve_##FN##h, \
+            gen_helper_mve_##FN##w, \
+            NULL, \
+        }; \
+        return do_2shift(s, a, fns[a->size], NEGATESHIFT); \
+    }
+
+DO_2SHIFT(VSHLI, vshli_u, false)
+DO_2SHIFT(VQSHLI_S, vqshli_s, false)
+DO_2SHIFT(VQSHLI_U, vqshli_u, false)
+DO_2SHIFT(VQSHLUI, vqshlui_s, false)
-- 
2.20.1
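The three @2_shl_* formats above implement the usual Neon-style trick of
packing element size and shift count into one field: the position of the
leading 1 selects the size and the bits below it are the shift count. A
standalone sketch in plain C (illustrative only, not decodetree output):

#include <stdio.h>

static void decode_shl(unsigned field6)   /* the 6 bits holding size+shift */
{
    if (field6 & 0x20) {
        printf("32-bit elements, shift %u\n", field6 & 0x1f);
    } else if (field6 & 0x10) {
        printf("16-bit elements, shift %u\n", field6 & 0xf);
    } else if (field6 & 0x08) {
        printf("8-bit elements, shift %u\n", field6 & 0x7);
    } else {
        printf("not a shift-left encoding\n");
    }
}

int main(void)
{
    decode_shl(0x2a);   /* 1 01010 -> 32-bit, shift 10 */
    decode_shl(0x13);   /* 01 0011 -> 16-bit, shift 3 */
    decode_shl(0x0d);   /* 001 101 -> 8-bit, shift 5 */
    return 0;
}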
1
Convert the neon floating-point vector operations VFMA and VFMS
1
Implement the MVE vector shift right by immediate insns VSHRI and
2
to use a gvec helper, and use this to implement the fp16 case.
2
VRSHRI. As with Neon, we implement these by using helper functions
3
3
which perform left shifts but allow negative shift counts to indicate
4
This is the last use of do_3same_fp() so we can now delete
4
right shifts.
5
that function.
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Message-id: 20200828183354.27913-32-peter.maydell@linaro.org
---
 target/arm/helper.h             |  6 +++
 target/arm/vec_helper.c         | 33 +++++++++++-
 target/arm/translate-neon.c.inc | 92 +--------------------------------
 3 files changed, 40 insertions(+), 91 deletions(-)

diff --git a/target/arm/helper.h b/target/arm/helper.h
index XXXXXXX..XXXXXXX 100644
--- a/target/arm/helper.h
+++ b/target/arm/helper.h
@@ -XXX,XX +XXX,XX @@ DEF_HELPER_FLAGS_5(gvec_fmla_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
 DEF_HELPER_FLAGS_5(gvec_fmls_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
 DEF_HELPER_FLAGS_5(gvec_fmls_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
 
+DEF_HELPER_FLAGS_5(gvec_vfma_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(gvec_vfma_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_5(gvec_vfms_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(gvec_vfms_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
+
 DEF_HELPER_FLAGS_5(gvec_ftsmul_h, TCG_CALL_NO_RWG,
                    void, ptr, ptr, ptr, ptr, i32)
 DEF_HELPER_FLAGS_5(gvec_ftsmul_s, TCG_CALL_NO_RWG,
diff --git a/target/arm/vec_helper.c b/target/arm/vec_helper.c
index XXXXXXX..XXXXXXX 100644
--- a/target/arm/vec_helper.c
+++ b/target/arm/vec_helper.c
@@ -XXX,XX +XXX,XX @@ static float32 float32_mulsub_nf(float32 dest, float32 op1, float32 op2,
     return float32_sub(dest, float32_mul(op1, op2, stat), stat);
 }
 
-#define DO_MULADD(NAME, FUNC, TYPE) \
+/* Fused versions; these have the semantics Neon VFMA/VFMS want */
+static float16 float16_muladd_f(float16 dest, float16 op1, float16 op2,
+                                float_status *stat)
+{
+    return float16_muladd(op1, op2, dest, 0, stat);
+}
+
+static float32 float32_muladd_f(float32 dest, float32 op1, float32 op2,
+                                float_status *stat)
+{
+    return float32_muladd(op1, op2, dest, 0, stat);
+}
+
+static float16 float16_mulsub_f(float16 dest, float16 op1, float16 op2,
+                                float_status *stat)
+{
+    return float16_muladd(float16_chs(op1), op2, dest, 0, stat);
+}
+
+static float32 float32_mulsub_f(float32 dest, float32 op1, float32 op2,
+                                float_status *stat)
+{
+    return float32_muladd(float32_chs(op1), op2, dest, 0, stat);
+}
+
+#define DO_MULADD(NAME, FUNC, TYPE) \
 void HELPER(NAME)(void *vd, void *vn, void *vm, void *stat, uint32_t desc) \
 { \
     intptr_t i, oprsz = simd_oprsz(desc); \
@@ -XXX,XX +XXX,XX @@ DO_MULADD(gvec_fmla_s, float32_muladd_nf, float32)
 DO_MULADD(gvec_fmls_h, float16_mulsub_nf, float16)
 DO_MULADD(gvec_fmls_s, float32_mulsub_nf, float32)
 
+DO_MULADD(gvec_vfma_h, float16_muladd_f, float16)
+DO_MULADD(gvec_vfma_s, float32_muladd_f, float32)
+
+DO_MULADD(gvec_vfms_h, float16_mulsub_f, float16)
+DO_MULADD(gvec_vfms_s, float32_mulsub_f, float32)
+
 /* For the indexed ops, SVE applies the index per 128-bit vector segment.
  * For AdvSIMD, there is of course only one such vector segment.
  */
diff --git a/target/arm/translate-neon.c.inc b/target/arm/translate-neon.c.inc
index XXXXXXX..XXXXXXX 100644
--- a/target/arm/translate-neon.c.inc
+++ b/target/arm/translate-neon.c.inc
@@ -XXX,XX +XXX,XX @@ DO_3SAME_PAIR(VPADD, padd_u)
 DO_3SAME_VQDMULH(VQDMULH, qdmulh)
 DO_3SAME_VQDMULH(VQRDMULH, qrdmulh)
 
-static bool do_3same_fp(DisasContext *s, arg_3same *a, VFPGen3OpSPFn *fn,
-                        bool reads_vd)
-{
-    /*
-     * FP operations handled elementwise 32 bits at a time.
-     * If reads_vd is true then the old value of Vd will be
-     * loaded before calling the callback function. This is
-     * used for multiply-accumulate type operations.
-     */
-    TCGv_i32 tmp, tmp2;
-    int pass;
-
-    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
-        return false;
-    }
-
-    /* UNDEF accesses to D16-D31 if they don't exist. */
-    if (!dc_isar_feature(aa32_simd_r32, s) &&
-        ((a->vd | a->vn | a->vm) & 0x10)) {
-        return false;
-    }
-
-    if ((a->vn | a->vm | a->vd) & a->q) {
-        return false;
-    }
-
-    if (!vfp_access_check(s)) {
-        return true;
-    }
-
-    TCGv_ptr fpstatus = fpstatus_ptr(FPST_STD);
-    for (pass = 0; pass < (a->q ? 4 : 2); pass++) {
-        tmp = neon_load_reg(a->vn, pass);
-        tmp2 = neon_load_reg(a->vm, pass);
-        if (reads_vd) {
-            TCGv_i32 tmp_rd = neon_load_reg(a->vd, pass);
-            fn(tmp_rd, tmp, tmp2, fpstatus);
-            neon_store_reg(a->vd, pass, tmp_rd);
-            tcg_temp_free_i32(tmp);
-        } else {
-            fn(tmp, tmp, tmp2, fpstatus);
-            neon_store_reg(a->vd, pass, tmp);
-        }
-        tcg_temp_free_i32(tmp2);
-    }
-    tcg_temp_free_ptr(fpstatus);
-    return true;
-}
-
 #define WRAP_FP_GVEC(WRAPNAME, FPST, FUNC) \
     static void WRAPNAME(unsigned vece, uint32_t rd_ofs, \
                          uint32_t rn_ofs, uint32_t rm_ofs, \
@@ -XXX,XX +XXX,XX @@ DO_3S_FP_GVEC(VMAX, gen_helper_gvec_fmax_s, gen_helper_gvec_fmax_h)
 DO_3S_FP_GVEC(VMIN, gen_helper_gvec_fmin_s, gen_helper_gvec_fmin_h)
 DO_3S_FP_GVEC(VMLA, gen_helper_gvec_fmla_s, gen_helper_gvec_fmla_h)
 DO_3S_FP_GVEC(VMLS, gen_helper_gvec_fmls_s, gen_helper_gvec_fmls_h)
+DO_3S_FP_GVEC(VFMA, gen_helper_gvec_vfma_s, gen_helper_gvec_vfma_h)
+DO_3S_FP_GVEC(VFMS, gen_helper_gvec_vfms_s, gen_helper_gvec_vfms_h)
 
 WRAP_FP_GVEC(gen_VMAXNM_fp32_3s, FPST_STD, gen_helper_gvec_fmaxnum_s)
 WRAP_FP_GVEC(gen_VMAXNM_fp16_3s, FPST_STD_F16, gen_helper_gvec_fmaxnum_h)
@@ -XXX,XX +XXX,XX @@ static bool trans_VRSQRTS_fp_3s(DisasContext *s, arg_3same *a)
     return do_3same(s, a, gen_VRSQRTS_fp_3s);
 }
 
-static void gen_VFMA_fp_3s(TCGv_i32 vd, TCGv_i32 vn, TCGv_i32 vm,
-                           TCGv_ptr fpstatus)
-{
-    gen_helper_vfp_muladds(vd, vn, vm, vd, fpstatus);
-}
-
-static bool trans_VFMA_fp_3s(DisasContext *s, arg_3same *a)
-{
-    if (!dc_isar_feature(aa32_simdfmac, s)) {
-        return false;
-    }
-
-    if (a->size != 0) {
-        /* TODO fp16 support */
-        return false;
-    }
-
-    return do_3same_fp(s, a, gen_VFMA_fp_3s, true);
-}
-
-static void gen_VFMS_fp_3s(TCGv_i32 vd, TCGv_i32 vn, TCGv_i32 vm,
-                           TCGv_ptr fpstatus)
-{
-    gen_helper_vfp_negs(vn, vn);
-    gen_helper_vfp_muladds(vd, vn, vm, vd, fpstatus);
-}
-
-static bool trans_VFMS_fp_3s(DisasContext *s, arg_3same *a)
-{
-    if (!dc_isar_feature(aa32_simdfmac, s)) {
-        return false;
-    }
-
-    if (a->size != 0) {
-        /* TODO fp16 support */
-        return false;
-    }
-
-    return do_3same_fp(s, a, gen_VFMS_fp_3s, true);
-}
-
 static bool do_3same_fp_pair(DisasContext *s, arg_3same *a, VFPGen3OpSPFn *fn)
 {
     /* FP operations handled pairwise 32 bits at a time */
--
2.20.1


Implement the MVE vector shift right by immediate insns VSHRI and
VRSHRI. As with Neon, we implement these by using helper functions
which perform left shifts but allow negative shift counts to indicate
right shifts.
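As a standalone sketch of the trick (plain C for illustration, not the
patch's macro-generated helpers; model_shl/model_rshl are made-up
names): a single left-shift primitive that accepts a negative count can
serve both directions, and the rounding variant adds back the last bit
shifted out.

  #include <stdint.h>
  #include <stdio.h>

  /* Model: left shift that treats a negative count as a right shift */
  static int32_t model_shl(int32_t n, int shift)
  {
      if (shift < 0) {
          return n >> -shift;   /* arithmetic right shift */
      }
      return n << shift;
  }

  /* Rounding variant: add 1 if the last bit shifted out was set */
  static int32_t model_rshl(int32_t n, int shift)
  {
      if (shift < 0) {
          return (n >> -shift) + ((n >> (-shift - 1)) & 1);
      }
      return n << shift;
  }

  int main(void)
  {
      /* a VSHRI #2 on -7 is handled as model_shl(-7, -2) == -2 */
      printf("%d %d\n", model_shl(-7, -2), model_rshl(-7, -2));
      return 0;
  }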
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Message-id: 20210628135835.6690-9-peter.maydell@linaro.org
---
 target/arm/helper-mve.h     | 12 ++++++++++++
 target/arm/translate.h      | 20 ++++++++++++++++++++
 target/arm/mve.decode       | 28 ++++++++++++++++++++++++++++
 target/arm/mve_helper.c     |  7 +++++++
 target/arm/translate-mve.c  |  5 +++++
 target/arm/translate-neon.c | 18 ------------------
 6 files changed, 72 insertions(+), 18 deletions(-)

diff --git a/target/arm/helper-mve.h b/target/arm/helper-mve.h
index XXXXXXX..XXXXXXX 100644
--- a/target/arm/helper-mve.h
+++ b/target/arm/helper-mve.h
@@ -XXX,XX +XXX,XX @@ DEF_HELPER_FLAGS_3(mve_vmovi, TCG_CALL_NO_WG, void, env, ptr, i64)
 DEF_HELPER_FLAGS_3(mve_vandi, TCG_CALL_NO_WG, void, env, ptr, i64)
 DEF_HELPER_FLAGS_3(mve_vorri, TCG_CALL_NO_WG, void, env, ptr, i64)
 
+DEF_HELPER_FLAGS_4(mve_vshli_sb, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(mve_vshli_sh, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(mve_vshli_sw, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
+
 DEF_HELPER_FLAGS_4(mve_vshli_ub, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
 DEF_HELPER_FLAGS_4(mve_vshli_uh, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
 DEF_HELPER_FLAGS_4(mve_vshli_uw, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
@@ -XXX,XX +XXX,XX @@ DEF_HELPER_FLAGS_4(mve_vqshli_uw, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
 DEF_HELPER_FLAGS_4(mve_vqshlui_sb, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
 DEF_HELPER_FLAGS_4(mve_vqshlui_sh, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
 DEF_HELPER_FLAGS_4(mve_vqshlui_sw, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_4(mve_vrshli_sb, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(mve_vrshli_sh, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(mve_vrshli_sw, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_4(mve_vrshli_ub, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(mve_vrshli_uh, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(mve_vrshli_uw, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
diff --git a/target/arm/translate.h b/target/arm/translate.h
index XXXXXXX..XXXXXXX 100644
--- a/target/arm/translate.h
+++ b/target/arm/translate.h
@@ -XXX,XX +XXX,XX @@ static inline int times_2_plus_1(DisasContext *s, int x)
     return x * 2 + 1;
 }
 
+static inline int rsub_64(DisasContext *s, int x)
+{
+    return 64 - x;
+}
+
+static inline int rsub_32(DisasContext *s, int x)
+{
+    return 32 - x;
+}
+
+static inline int rsub_16(DisasContext *s, int x)
+{
+    return 16 - x;
+}
+
+static inline int rsub_8(DisasContext *s, int x)
+{
+    return 8 - x;
+}
+
 static inline int arm_dc_feature(DisasContext *dc, int feature)
 {
     return (dc->features & (1ULL << feature)) != 0;
diff --git a/target/arm/mve.decode b/target/arm/mve.decode
index XXXXXXX..XXXXXXX 100644
--- a/target/arm/mve.decode
+++ b/target/arm/mve.decode
@@ -XXX,XX +XXX,XX @@
 @2_shl_h .... .... .. 01 shift:4 .... .... .... .... &2shift qd=%qd qm=%qm size=1
 @2_shl_w .... .... .. 1 shift:5 .... .... .... .... &2shift qd=%qd qm=%qm size=2
 
+# Right shifts are encoded as N - shift, where N is the element size in bits.
+%rshift_i5 16:5 !function=rsub_32
+%rshift_i4 16:4 !function=rsub_16
+%rshift_i3 16:3 !function=rsub_8
+
+@2_shr_b .... .... .. 001 ... .... .... .... .... &2shift qd=%qd qm=%qm \
+         size=0 shift=%rshift_i3
+@2_shr_h .... .... .. 01 .... .... .... .... .... &2shift qd=%qd qm=%qm \
+         size=1 shift=%rshift_i4
+@2_shr_w .... .... .. 1 ..... .... .... .... .... &2shift qd=%qd qm=%qm \
+         size=2 shift=%rshift_i5
+
 # Vector loads and stores
 
 # Widening loads and narrowing stores:
@@ -XXX,XX +XXX,XX @@ VQSHLI_U 111 1 1111 1 . ... ... ... 0 0111 0 1 . 1 ... 0 @2_shl_w
 VQSHLUI 111 1 1111 1 . ... ... ... 0 0110 0 1 . 1 ... 0 @2_shl_b
 VQSHLUI 111 1 1111 1 . ... ... ... 0 0110 0 1 . 1 ... 0 @2_shl_h
 VQSHLUI 111 1 1111 1 . ... ... ... 0 0110 0 1 . 1 ... 0 @2_shl_w
+
+VSHRI_S 111 0 1111 1 . ... ... ... 0 0000 0 1 . 1 ... 0 @2_shr_b
+VSHRI_S 111 0 1111 1 . ... ... ... 0 0000 0 1 . 1 ... 0 @2_shr_h
+VSHRI_S 111 0 1111 1 . ... ... ... 0 0000 0 1 . 1 ... 0 @2_shr_w
+
+VSHRI_U 111 1 1111 1 . ... ... ... 0 0000 0 1 . 1 ... 0 @2_shr_b
+VSHRI_U 111 1 1111 1 . ... ... ... 0 0000 0 1 . 1 ... 0 @2_shr_h
+VSHRI_U 111 1 1111 1 . ... ... ... 0 0000 0 1 . 1 ... 0 @2_shr_w
+
+VRSHRI_S 111 0 1111 1 . ... ... ... 0 0010 0 1 . 1 ... 0 @2_shr_b
+VRSHRI_S 111 0 1111 1 . ... ... ... 0 0010 0 1 . 1 ... 0 @2_shr_h
+VRSHRI_S 111 0 1111 1 . ... ... ... 0 0010 0 1 . 1 ... 0 @2_shr_w
+
+VRSHRI_U 111 1 1111 1 . ... ... ... 0 0010 0 1 . 1 ... 0 @2_shr_b
+VRSHRI_U 111 1 1111 1 . ... ... ... 0 0010 0 1 . 1 ... 0 @2_shr_h
+VRSHRI_U 111 1 1111 1 . ... ... ... 0 0010 0 1 . 1 ... 0 @2_shr_w
diff --git a/target/arm/mve_helper.c b/target/arm/mve_helper.c
index XXXXXXX..XXXXXXX 100644
--- a/target/arm/mve_helper.c
+++ b/target/arm/mve_helper.c
@@ -XXX,XX +XXX,XX @@ DO_VADDV(vaddvuw, 4, uint32_t)
     DO_2SHIFT(OP##b, 1, uint8_t, FN)            \
     DO_2SHIFT(OP##h, 2, uint16_t, FN)           \
     DO_2SHIFT(OP##w, 4, uint32_t, FN)
+#define DO_2SHIFT_S(OP, FN)                     \
+    DO_2SHIFT(OP##b, 1, int8_t, FN)             \
+    DO_2SHIFT(OP##h, 2, int16_t, FN)            \
+    DO_2SHIFT(OP##w, 4, int32_t, FN)
 
 #define DO_2SHIFT_SAT_U(OP, FN)                 \
     DO_2SHIFT_SAT(OP##b, 1, uint8_t, FN)        \
@@ -XXX,XX +XXX,XX @@ DO_VADDV(vaddvuw, 4, uint32_t)
     DO_2SHIFT_SAT(OP##w, 4, int32_t, FN)
 
 DO_2SHIFT_U(vshli_u, DO_VSHLU)
+DO_2SHIFT_S(vshli_s, DO_VSHLS)
 DO_2SHIFT_SAT_U(vqshli_u, DO_UQSHL_OP)
 DO_2SHIFT_SAT_S(vqshli_s, DO_SQSHL_OP)
 DO_2SHIFT_SAT_S(vqshlui_s, DO_SUQSHL_OP)
+DO_2SHIFT_U(vrshli_u, DO_VRSHLU)
+DO_2SHIFT_S(vrshli_s, DO_VRSHLS)
diff --git a/target/arm/translate-mve.c b/target/arm/translate-mve.c
index XXXXXXX..XXXXXXX 100644
--- a/target/arm/translate-mve.c
+++ b/target/arm/translate-mve.c
@@ -XXX,XX +XXX,XX @@ DO_2SHIFT(VSHLI, vshli_u, false)
 DO_2SHIFT(VQSHLI_S, vqshli_s, false)
 DO_2SHIFT(VQSHLI_U, vqshli_u, false)
 DO_2SHIFT(VQSHLUI, vqshlui_s, false)
+/* These right shifts use a left-shift helper with negated shift count */
+DO_2SHIFT(VSHRI_S, vshli_s, true)
+DO_2SHIFT(VSHRI_U, vshli_u, true)
+DO_2SHIFT(VRSHRI_S, vrshli_s, true)
+DO_2SHIFT(VRSHRI_U, vrshli_u, true)
diff --git a/target/arm/translate-neon.c b/target/arm/translate-neon.c
index XXXXXXX..XXXXXXX 100644
--- a/target/arm/translate-neon.c
+++ b/target/arm/translate-neon.c
@@ -XXX,XX +XXX,XX @@ static inline int plus1(DisasContext *s, int x)
     return x + 1;
 }
 
-static inline int rsub_64(DisasContext *s, int x)
-{
-    return 64 - x;
-}
-
-static inline int rsub_32(DisasContext *s, int x)
-{
-    return 32 - x;
-}
-static inline int rsub_16(DisasContext *s, int x)
-{
-    return 16 - x;
-}
-static inline int rsub_8(DisasContext *s, int x)
-{
-    return 8 - x;
-}
-
 static inline int neon_3same_fp_size(DisasContext *s, int x)
 {
     /* Convert 0==fp32, 1==fp16 into a MO_* value */
--
2.20.1

Implement VFP fp16 for VABS, VNEG and VSQRT. This is all
the fp16 insns that use the DO_VFP_2OP macro, because there
is no fp16 version of VMOV_reg.

Notes:
 * the gen_helper_vfp_negh already exists as we needed to create
   it for the fp16 multiply-add insns
 * as usual we need to use the f16 version of the fp_status;
   this is only relevant for VSQRT
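(VABS and VNEG need no float_status at all: they are pure sign-bit
manipulations, which is why only VSQRT cares which fp_status is in
use. A bit-level illustration in plain C, with no softfloat involved
-- a sketch only, not the patch's helpers:)

  #include <stdint.h>
  #include <stdio.h>

  /* A float16 is just a uint16_t bit pattern; bit 15 is the sign */
  static uint16_t f16_abs(uint16_t a) { return a & 0x7fff; }
  static uint16_t f16_chs(uint16_t a) { return a ^ 0x8000; }

  int main(void)
  {
      uint16_t minus_one = 0xbc00;   /* -1.0 in IEEE binary16 */
      /* both print 3c00, i.e. +1.0 */
      printf("%04x %04x\n", f16_abs(minus_one), f16_chs(minus_one));
      return 0;
  }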
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Message-id: 20200828183354.27913-9-peter.maydell@linaro.org
---
 target/arm/helper.h            |  2 ++
 target/arm/vfp.decode          |  3 +++
 target/arm/vfp_helper.c        | 10 +++++++++
 target/arm/translate-vfp.c.inc | 40 ++++++++++++++++++++++++++++++++++
 4 files changed, 55 insertions(+)

diff --git a/target/arm/helper.h b/target/arm/helper.h
index XXXXXXX..XXXXXXX 100644
--- a/target/arm/helper.h
+++ b/target/arm/helper.h
@@ -XXX,XX +XXX,XX @@ DEF_HELPER_3(vfp_minnumd, f64, f64, f64, ptr)
 DEF_HELPER_1(vfp_negh, f16, f16)
 DEF_HELPER_1(vfp_negs, f32, f32)
 DEF_HELPER_1(vfp_negd, f64, f64)
+DEF_HELPER_1(vfp_absh, f16, f16)
 DEF_HELPER_1(vfp_abss, f32, f32)
 DEF_HELPER_1(vfp_absd, f64, f64)
+DEF_HELPER_2(vfp_sqrth, f16, f16, env)
 DEF_HELPER_2(vfp_sqrts, f32, f32, env)
 DEF_HELPER_2(vfp_sqrtd, f64, f64, env)
 DEF_HELPER_3(vfp_cmps, void, f32, f32, env)
diff --git a/target/arm/vfp.decode b/target/arm/vfp.decode
index XXXXXXX..XXXXXXX 100644
--- a/target/arm/vfp.decode
+++ b/target/arm/vfp.decode
@@ -XXX,XX +XXX,XX @@ VMOV_imm_dp ---- 1110 1.11 .... .... 1011 0000 .... \
 VMOV_reg_sp ---- 1110 1.11 0000 .... 1010 01.0 .... @vfp_dm_ss
 VMOV_reg_dp ---- 1110 1.11 0000 .... 1011 01.0 .... @vfp_dm_dd
 
+VABS_hp  ---- 1110 1.11 0000 .... 1001 11.0 .... @vfp_dm_ss
 VABS_sp  ---- 1110 1.11 0000 .... 1010 11.0 .... @vfp_dm_ss
 VABS_dp  ---- 1110 1.11 0000 .... 1011 11.0 .... @vfp_dm_dd
 
+VNEG_hp  ---- 1110 1.11 0001 .... 1001 01.0 .... @vfp_dm_ss
 VNEG_sp  ---- 1110 1.11 0001 .... 1010 01.0 .... @vfp_dm_ss
 VNEG_dp  ---- 1110 1.11 0001 .... 1011 01.0 .... @vfp_dm_dd
 
+VSQRT_hp ---- 1110 1.11 0001 .... 1001 11.0 .... @vfp_dm_ss
 VSQRT_sp ---- 1110 1.11 0001 .... 1010 11.0 .... @vfp_dm_ss
 VSQRT_dp ---- 1110 1.11 0001 .... 1011 11.0 .... @vfp_dm_dd
 
diff --git a/target/arm/vfp_helper.c b/target/arm/vfp_helper.c
index XXXXXXX..XXXXXXX 100644
--- a/target/arm/vfp_helper.c
+++ b/target/arm/vfp_helper.c
@@ -XXX,XX +XXX,XX @@ float64 VFP_HELPER(neg, d)(float64 a)
     return float64_chs(a);
 }
 
+dh_ctype_f16 VFP_HELPER(abs, h)(dh_ctype_f16 a)
+{
+    return float16_abs(a);
+}
+
 float32 VFP_HELPER(abs, s)(float32 a)
 {
     return float32_abs(a);
@@ -XXX,XX +XXX,XX @@ float64 VFP_HELPER(abs, d)(float64 a)
     return float64_abs(a);
 }
 
+dh_ctype_f16 VFP_HELPER(sqrt, h)(dh_ctype_f16 a, CPUARMState *env)
+{
+    return float16_sqrt(a, &env->vfp.fp_status_f16);
+}
+
 float32 VFP_HELPER(sqrt, s)(float32 a, CPUARMState *env)
 {
     return float32_sqrt(a, &env->vfp.fp_status);
diff --git a/target/arm/translate-vfp.c.inc b/target/arm/translate-vfp.c.inc
index XXXXXXX..XXXXXXX 100644
--- a/target/arm/translate-vfp.c.inc
+++ b/target/arm/translate-vfp.c.inc
@@ -XXX,XX +XXX,XX @@ static bool do_vfp_2op_sp(DisasContext *s, VFPGen2OpSPFn *fn, int vd, int vm)
     return true;
 }
 
+static bool do_vfp_2op_hp(DisasContext *s, VFPGen2OpSPFn *fn, int vd, int vm)
+{
+    /*
+     * Do a half-precision operation. Functionally this is
+     * the same as do_vfp_2op_sp(), except:
+     *  - it doesn't need the VFP vector handling (fp16 is a
+     *    v8 feature, and in v8 VFP vectors don't exist)
+     *  - it does the aa32_fp16_arith feature test
+     */
+    TCGv_i32 f0;
+
+    if (!dc_isar_feature(aa32_fp16_arith, s)) {
+        return false;
+    }
+
+    if (s->vec_len != 0 || s->vec_stride != 0) {
+        return false;
+    }
+
+    if (!vfp_access_check(s)) {
+        return true;
+    }
+
+    f0 = tcg_temp_new_i32();
+    neon_load_reg32(f0, vm);
+    fn(f0, f0);
+    neon_store_reg32(f0, vd);
+    tcg_temp_free_i32(f0);
+
+    return true;
+}
+
 static bool do_vfp_2op_dp(DisasContext *s, VFPGen2OpDPFn *fn, int vd, int vm)
 {
     uint32_t delta_m = 0;
@@ -XXX,XX +XXX,XX @@ static bool trans_VMOV_imm_dp(DisasContext *s, arg_VMOV_imm_dp *a)
 DO_VFP_2OP(VMOV_reg, sp, tcg_gen_mov_i32)
 DO_VFP_2OP(VMOV_reg, dp, tcg_gen_mov_i64)
 
+DO_VFP_2OP(VABS, hp, gen_helper_vfp_absh)
 DO_VFP_2OP(VABS, sp, gen_helper_vfp_abss)
 DO_VFP_2OP(VABS, dp, gen_helper_vfp_absd)
 
+DO_VFP_2OP(VNEG, hp, gen_helper_vfp_negh)
 DO_VFP_2OP(VNEG, sp, gen_helper_vfp_negs)
 DO_VFP_2OP(VNEG, dp, gen_helper_vfp_negd)
 
+static void gen_VSQRT_hp(TCGv_i32 vd, TCGv_i32 vm)
+{
+    gen_helper_vfp_sqrth(vd, vm, cpu_env);
+}
+
 static void gen_VSQRT_sp(TCGv_i32 vd, TCGv_i32 vm)
 {
     gen_helper_vfp_sqrts(vd, vm, cpu_env);
@@ -XXX,XX +XXX,XX @@ static void gen_VSQRT_dp(TCGv_i64 vd, TCGv_i64 vm)
     gen_helper_vfp_sqrtd(vd, vm, cpu_env);
 }
 
+DO_VFP_2OP(VSQRT, hp, gen_VSQRT_hp)
 DO_VFP_2OP(VSQRT, sp, gen_VSQRT_sp)
 DO_VFP_2OP(VSQRT, dp, gen_VSQRT_dp)
 
--
2.20.1


Implement the MVE VSHLL (vector shift left long) insn. This has two
encodings: the T1 encoding is the usual shift-by-immediate format,
and the T2 encoding is a special case where the shift count is always
equal to the element size.
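A standalone model of the per-element operation (plain C sketch, not
the patch's macro-generated helpers; vshll_ub is a made-up name): each
input element is widened to double size before the left shift, and the
T2 form fixes the shift count at the element size.

  #include <stdint.h>
  #include <stdio.h>

  /* Widen an unsigned byte, then shift; the result cannot overflow
   * 16 bits because the encoding guarantees shift <= 8. */
  static uint16_t vshll_ub(uint8_t m, unsigned shift)
  {
      return (uint16_t)m << shift;
  }

  int main(void)
  {
      /* T2 encoding: shift == esize == 8, so 0xab widens to 0xab00 */
      printf("%04x\n", vshll_ub(0xab, 8));
      return 0;
  }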
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Message-id: 20210628135835.6690-10-peter.maydell@linaro.org
---
 target/arm/helper-mve.h    |  9 +++++++
 target/arm/mve.decode      | 53 +++++++++++++++++++++++++++++++++++---
 target/arm/mve_helper.c    | 32 +++++++++++++++++++++++
 target/arm/translate-mve.c | 15 +++++++++++
 4 files changed, 105 insertions(+), 4 deletions(-)

diff --git a/target/arm/helper-mve.h b/target/arm/helper-mve.h
index XXXXXXX..XXXXXXX 100644
--- a/target/arm/helper-mve.h
+++ b/target/arm/helper-mve.h
@@ -XXX,XX +XXX,XX @@ DEF_HELPER_FLAGS_4(mve_vrshli_sw, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
 DEF_HELPER_FLAGS_4(mve_vrshli_ub, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
 DEF_HELPER_FLAGS_4(mve_vrshli_uh, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
 DEF_HELPER_FLAGS_4(mve_vrshli_uw, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_4(mve_vshllbsb, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(mve_vshllbsh, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(mve_vshllbub, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(mve_vshllbuh, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(mve_vshlltsb, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(mve_vshlltsh, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(mve_vshlltub, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(mve_vshlltuh, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
diff --git a/target/arm/mve.decode b/target/arm/mve.decode
index XXXXXXX..XXXXXXX 100644
--- a/target/arm/mve.decode
+++ b/target/arm/mve.decode
@@ -XXX,XX +XXX,XX @@
 @2_shl_h .... .... .. 01 shift:4 .... .... .... .... &2shift qd=%qd qm=%qm size=1
 @2_shl_w .... .... .. 1 shift:5 .... .... .... .... &2shift qd=%qd qm=%qm size=2
 
+@2_shll_b .... .... ... 01 shift:3 .... .... .... .... &2shift qd=%qd qm=%qm size=0
+@2_shll_h .... .... ... 1 shift:4 .... .... .... .... &2shift qd=%qd qm=%qm size=1
+# VSHLL encoding T2 where shift == esize
+@2_shll_esize_b .... .... .... 00 .. .... .... .... .... &2shift \
+                qd=%qd qm=%qm size=0 shift=8
+@2_shll_esize_h .... .... .... 01 .. .... .... .... .... &2shift \
+                qd=%qd qm=%qm size=1 shift=16
+
 # Right shifts are encoded as N - shift, where N is the element size in bits.
 %rshift_i5 16:5 !function=rsub_32
 %rshift_i4 16:4 !function=rsub_16
@@ -XXX,XX +XXX,XX @@ VADD 1110 1111 0 . .. ... 0 ... 0 1000 . 1 . 0 ... 0 @2op
 VSUB 1111 1111 0 . .. ... 0 ... 0 1000 . 1 . 0 ... 0 @2op
 VMUL 1110 1111 0 . .. ... 0 ... 0 1001 . 1 . 1 ... 0 @2op
 
-VMULH_S 111 0 1110 0 . .. ...1 ... 0 1110 . 0 . 0 ... 1 @2op
-VMULH_U 111 1 1110 0 . .. ...1 ... 0 1110 . 0 . 0 ... 1 @2op
+# The VSHLL T2 encoding is not a @2op pattern, but is here because it
+# overlaps what would be size=0b11 VMULH/VRMULH
+{
+  VSHLL_BS 111 0 1110 0 . 11 .. 01 ... 0 1110 0 0 . 0 ... 1 @2_shll_esize_b
+  VSHLL_BS 111 0 1110 0 . 11 .. 01 ... 0 1110 0 0 . 0 ... 1 @2_shll_esize_h
 
-VRMULH_S 111 0 1110 0 . .. ...1 ... 1 1110 . 0 . 0 ... 1 @2op
-VRMULH_U 111 1 1110 0 . .. ...1 ... 1 1110 . 0 . 0 ... 1 @2op
+  VMULH_S 111 0 1110 0 . .. ...1 ... 0 1110 . 0 . 0 ... 1 @2op
+}
+
+{
+  VSHLL_BU 111 1 1110 0 . 11 .. 01 ... 0 1110 0 0 . 0 ... 1 @2_shll_esize_b
+  VSHLL_BU 111 1 1110 0 . 11 .. 01 ... 0 1110 0 0 . 0 ... 1 @2_shll_esize_h
+
+  VMULH_U 111 1 1110 0 . .. ...1 ... 0 1110 . 0 . 0 ... 1 @2op
+}
+
+{
+  VSHLL_TS 111 0 1110 0 . 11 .. 01 ... 1 1110 0 0 . 0 ... 1 @2_shll_esize_b
+  VSHLL_TS 111 0 1110 0 . 11 .. 01 ... 1 1110 0 0 . 0 ... 1 @2_shll_esize_h
+
+  VRMULH_S 111 0 1110 0 . .. ...1 ... 1 1110 . 0 . 0 ... 1 @2op
+}
+
+{
+  VSHLL_TU 111 1 1110 0 . 11 .. 01 ... 1 1110 0 0 . 0 ... 1 @2_shll_esize_b
+  VSHLL_TU 111 1 1110 0 . 11 .. 01 ... 1 1110 0 0 . 0 ... 1 @2_shll_esize_h
+
+  VRMULH_U 111 1 1110 0 . .. ...1 ... 1 1110 . 0 . 0 ... 1 @2op
+}
 
 VMAX_S 111 0 1111 0 . .. ... 0 ... 0 0110 . 1 . 0 ... 0 @2op
 VMAX_U 111 1 1111 0 . .. ... 0 ... 0 0110 . 1 . 0 ... 0 @2op
@@ -XXX,XX +XXX,XX @@ VRSHRI_S 111 0 1111 1 . ... ... ... 0 0010 0 1 . 1 ... 0 @2_shr_w
 VRSHRI_U 111 1 1111 1 . ... ... ... 0 0010 0 1 . 1 ... 0 @2_shr_b
 VRSHRI_U 111 1 1111 1 . ... ... ... 0 0010 0 1 . 1 ... 0 @2_shr_h
 VRSHRI_U 111 1 1111 1 . ... ... ... 0 0010 0 1 . 1 ... 0 @2_shr_w
+
+# VSHLL T1 encoding; the T2 VSHLL encoding is elsewhere in this file
+VSHLL_BS 111 0 1110 1 . 1 .. ... ... 0 1111 0 1 . 0 ... 0 @2_shll_b
+VSHLL_BS 111 0 1110 1 . 1 .. ... ... 0 1111 0 1 . 0 ... 0 @2_shll_h
+
+VSHLL_BU 111 1 1110 1 . 1 .. ... ... 0 1111 0 1 . 0 ... 0 @2_shll_b
+VSHLL_BU 111 1 1110 1 . 1 .. ... ... 0 1111 0 1 . 0 ... 0 @2_shll_h
+
+VSHLL_TS 111 0 1110 1 . 1 .. ... ... 1 1111 0 1 . 0 ... 0 @2_shll_b
+VSHLL_TS 111 0 1110 1 . 1 .. ... ... 1 1111 0 1 . 0 ... 0 @2_shll_h
+
+VSHLL_TU 111 1 1110 1 . 1 .. ... ... 1 1111 0 1 . 0 ... 0 @2_shll_b
+VSHLL_TU 111 1 1110 1 . 1 .. ... ... 1 1111 0 1 . 0 ... 0 @2_shll_h
diff --git a/target/arm/mve_helper.c b/target/arm/mve_helper.c
index XXXXXXX..XXXXXXX 100644
--- a/target/arm/mve_helper.c
+++ b/target/arm/mve_helper.c
@@ -XXX,XX +XXX,XX @@ DO_2SHIFT_SAT_S(vqshli_s, DO_SQSHL_OP)
 DO_2SHIFT_SAT_S(vqshlui_s, DO_SUQSHL_OP)
 DO_2SHIFT_U(vrshli_u, DO_VRSHLU)
 DO_2SHIFT_S(vrshli_s, DO_VRSHLS)
+
+/*
+ * Long shifts taking half-sized inputs from top or bottom of the input
+ * vector and producing a double-width result. ESIZE, TYPE are for
+ * the input, and LESIZE, LTYPE for the output.
+ * Unlike the normal shift helpers, we do not handle negative shift counts,
+ * because the long shift is strictly left-only.
+ */
+#define DO_VSHLL(OP, TOP, ESIZE, TYPE, LESIZE, LTYPE)                   \
+    void HELPER(glue(mve_, OP))(CPUARMState *env, void *vd,             \
+                                void *vm, uint32_t shift)               \
+    {                                                                   \
+        LTYPE *d = vd;                                                  \
+        TYPE *m = vm;                                                   \
+        uint16_t mask = mve_element_mask(env);                          \
+        unsigned le;                                                    \
+        assert(shift <= 16);                                            \
+        for (le = 0; le < 16 / LESIZE; le++, mask >>= LESIZE) {         \
+            LTYPE r = (LTYPE)m[H##ESIZE(le * 2 + TOP)] << shift;        \
+            mergemask(&d[H##LESIZE(le)], r, mask);                      \
+        }                                                               \
+        mve_advance_vpt(env);                                           \
+    }
+
+#define DO_VSHLL_ALL(OP, TOP)                                \
+    DO_VSHLL(OP##sb, TOP, 1, int8_t, 2, int16_t)             \
+    DO_VSHLL(OP##ub, TOP, 1, uint8_t, 2, uint16_t)           \
+    DO_VSHLL(OP##sh, TOP, 2, int16_t, 4, int32_t)            \
+    DO_VSHLL(OP##uh, TOP, 2, uint16_t, 4, uint32_t)          \
+
+DO_VSHLL_ALL(vshllb, false)
+DO_VSHLL_ALL(vshllt, true)
diff --git a/target/arm/translate-mve.c b/target/arm/translate-mve.c
index XXXXXXX..XXXXXXX 100644
--- a/target/arm/translate-mve.c
+++ b/target/arm/translate-mve.c
@@ -XXX,XX +XXX,XX @@ DO_2SHIFT(VSHRI_S, vshli_s, true)
 DO_2SHIFT(VSHRI_U, vshli_u, true)
 DO_2SHIFT(VRSHRI_S, vrshli_s, true)
 DO_2SHIFT(VRSHRI_U, vrshli_u, true)
+
+#define DO_VSHLL(INSN, FN)                                      \
+    static bool trans_##INSN(DisasContext *s, arg_2shift *a)    \
+    {                                                           \
+        static MVEGenTwoOpShiftFn * const fns[] = {             \
+            gen_helper_mve_##FN##b,                             \
+            gen_helper_mve_##FN##h,                             \
+        };                                                      \
+        return do_2shift(s, a, fns[a->size], false);            \
+    }
+
+DO_VSHLL(VSHLL_BS, vshllbs)
+DO_VSHLL(VSHLL_BU, vshllbu)
+DO_VSHLL(VSHLL_TS, vshllts)
+DO_VSHLL(VSHLL_TU, vshlltu)
--
2.20.1

Convert the Neon VRINT-with-specified-rounding-mode insns to gvec,
and use this to implement the fp16 versions.
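The new gvec helper keeps the usual "swap rounding mode, operate, swap
back" shape. As a plain-C illustration using the host FP environment
rather than softfloat (a sketch only, not this patch's code):

  #include <fenv.h>
  #include <math.h>
  #include <stdio.h>

  /* Round each element with a temporarily-set rounding mode */
  static void rint_all(double *d, const double *n, int len, int rmode)
  {
      int prev = fegetround();
      fesetround(rmode);        /* like set_float_rounding_mode(rmode) */
      for (int i = 0; i < len; i++) {
          d[i] = nearbyint(n[i]);
      }
      fesetround(prev);         /* always restore the previous mode */
  }

  int main(void)
  {
      double in[2] = { 2.5, -2.5 }, out[2];
      rint_all(out, in, 2, FE_DOWNWARD);   /* VRINTM-style rounding */
      printf("%g %g\n", out[0], out[1]);   /* prints 2 -3 */
      return 0;
  }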
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Message-id: 20200828183354.27913-41-peter.maydell@linaro.org
---
 target/arm/helper.h             |  4 +-
 target/arm/vec_helper.c         | 21 +++++++++++
 target/arm/vfp_helper.c         | 17 ---------
 target/arm/translate-neon.c.inc | 67 +++------------------------------
 4 files changed, 30 insertions(+), 79 deletions(-)

diff --git a/target/arm/helper.h b/target/arm/helper.h
index XXXXXXX..XXXXXXX 100644
--- a/target/arm/helper.h
+++ b/target/arm/helper.h
@@ -XXX,XX +XXX,XX @@ DEF_HELPER_3(vfp_sqtoh, f16, i64, i32, ptr)
 DEF_HELPER_3(vfp_uqtoh, f16, i64, i32, ptr)
 
 DEF_HELPER_FLAGS_2(set_rmode, TCG_CALL_NO_RWG, i32, i32, ptr)
-DEF_HELPER_FLAGS_2(set_neon_rmode, TCG_CALL_NO_RWG, i32, i32, env)
 
 DEF_HELPER_FLAGS_3(vfp_fcvt_f16_to_f32, TCG_CALL_NO_RWG, f32, f16, ptr, i32)
 DEF_HELPER_FLAGS_3(vfp_fcvt_f32_to_f16, TCG_CALL_NO_RWG, f16, f32, ptr, i32)
@@ -XXX,XX +XXX,XX @@ DEF_HELPER_FLAGS_4(gvec_vcvt_rm_us, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
 DEF_HELPER_FLAGS_4(gvec_vcvt_rm_sh, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
 DEF_HELPER_FLAGS_4(gvec_vcvt_rm_uh, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
 
+DEF_HELPER_FLAGS_4(gvec_vrint_rm_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(gvec_vrint_rm_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+
 DEF_HELPER_FLAGS_4(gvec_frecpe_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
 DEF_HELPER_FLAGS_4(gvec_frecpe_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
 DEF_HELPER_FLAGS_4(gvec_frecpe_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
diff --git a/target/arm/vec_helper.c b/target/arm/vec_helper.c
index XXXXXXX..XXXXXXX 100644
--- a/target/arm/vec_helper.c
+++ b/target/arm/vec_helper.c
@@ -XXX,XX +XXX,XX @@ DO_VCVT_RMODE(gvec_vcvt_rm_sh, helper_vfp_toshh, uint16_t)
 DO_VCVT_RMODE(gvec_vcvt_rm_uh, helper_vfp_touhh, uint16_t)
 
 #undef DO_VCVT_RMODE
+
+#define DO_VRINT_RMODE(NAME, FUNC, TYPE)                                \
+    void HELPER(NAME)(void *vd, void *vn, void *stat, uint32_t desc)    \
+    {                                                                   \
+        float_status *fpst = stat;                                      \
+        intptr_t i, oprsz = simd_oprsz(desc);                           \
+        uint32_t rmode = simd_data(desc);                               \
+        uint32_t prev_rmode = get_float_rounding_mode(fpst);            \
+        TYPE *d = vd, *n = vn;                                          \
+        set_float_rounding_mode(rmode, fpst);                           \
+        for (i = 0; i < oprsz / sizeof(TYPE); i++) {                    \
+            d[i] = FUNC(n[i], fpst);                                    \
+        }                                                               \
+        set_float_rounding_mode(prev_rmode, fpst);                      \
+        clear_tail(d, oprsz, simd_maxsz(desc));                         \
+    }
+
+DO_VRINT_RMODE(gvec_vrint_rm_h, helper_rinth, uint16_t)
+DO_VRINT_RMODE(gvec_vrint_rm_s, helper_rints, uint32_t)
+
+#undef DO_VRINT_RMODE
diff --git a/target/arm/vfp_helper.c b/target/arm/vfp_helper.c
index XXXXXXX..XXXXXXX 100644
--- a/target/arm/vfp_helper.c
+++ b/target/arm/vfp_helper.c
@@ -XXX,XX +XXX,XX @@ uint32_t HELPER(set_rmode)(uint32_t rmode, void *fpstp)
     return prev_rmode;
 }
 
-/* Set the current fp rounding mode in the standard fp status and return
- * the old one. This is for NEON instructions that need to change the
- * rounding mode but wish to use the standard FPSCR values for everything
- * else. Always set the rounding mode back to the correct value after
- * modifying it.
- * The argument is a softfloat float_round_ value.
- */
-uint32_t HELPER(set_neon_rmode)(uint32_t rmode, CPUARMState *env)
-{
-    float_status *fp_status = &env->vfp.standard_fp_status;
-
-    uint32_t prev_rmode = get_float_rounding_mode(fp_status);
-    set_float_rounding_mode(rmode, fp_status);
-
-    return prev_rmode;
-}
-
 /* Half precision conversions. */
 float32 HELPER(vfp_fcvt_f16_to_f32)(uint32_t a, void *fpstp, uint32_t ahp_mode)
 {
diff --git a/target/arm/translate-neon.c.inc b/target/arm/translate-neon.c.inc
index XXXXXXX..XXXXXXX 100644
--- a/target/arm/translate-neon.c.inc
+++ b/target/arm/translate-neon.c.inc
@@ -XXX,XX +XXX,XX @@ static bool trans_VRINTX(DisasContext *s, arg_2misc *a)
     return do_2misc_fp(s, a, gen_helper_rints_exact);
 }
 
-static bool do_vrint(DisasContext *s, arg_2misc *a, int rmode)
-{
-    /*
-     * Handle a VRINT* operation by iterating 32 bits at a time,
-     * with a specified rounding mode in operation.
-     */
-    int pass;
-    TCGv_ptr fpst;
-    TCGv_i32 tcg_rmode;
-
-    if (!arm_dc_feature(s, ARM_FEATURE_NEON) ||
-        !arm_dc_feature(s, ARM_FEATURE_V8)) {
-        return false;
-    }
-
-    /* UNDEF accesses to D16-D31 if they don't exist. */
-    if (!dc_isar_feature(aa32_simd_r32, s) &&
-        ((a->vd | a->vm) & 0x10)) {
-        return false;
-    }
-
-    if (a->size != 2) {
-        /* TODO: FP16 will be the size == 1 case */
-        return false;
-    }
-
-    if ((a->vd | a->vm) & a->q) {
-        return false;
-    }
-
-    if (!vfp_access_check(s)) {
-        return true;
-    }
-
-    fpst = fpstatus_ptr(FPST_STD);
-    tcg_rmode = tcg_const_i32(arm_rmode_to_sf(rmode));
-    gen_helper_set_neon_rmode(tcg_rmode, tcg_rmode, cpu_env);
-    for (pass = 0; pass < (a->q ? 4 : 2); pass++) {
-        TCGv_i32 tmp = neon_load_reg(a->vm, pass);
-        gen_helper_rints(tmp, tmp, fpst);
-        neon_store_reg(a->vd, pass, tmp);
-    }
-    gen_helper_set_neon_rmode(tcg_rmode, tcg_rmode, cpu_env);
-    tcg_temp_free_i32(tcg_rmode);
-    tcg_temp_free_ptr(fpst);
-
-    return true;
-}
-
-#define DO_VRINT(INSN, RMODE)                                  \
-    static bool trans_##INSN(DisasContext *s, arg_2misc *a)    \
-    {                                                          \
-        return do_vrint(s, a, RMODE);                          \
-    }
-
-DO_VRINT(VRINTN, FPROUNDING_TIEEVEN)
-DO_VRINT(VRINTA, FPROUNDING_TIEAWAY)
-DO_VRINT(VRINTZ, FPROUNDING_ZERO)
-DO_VRINT(VRINTM, FPROUNDING_NEGINF)
-DO_VRINT(VRINTP, FPROUNDING_POSINF)
-
 #define DO_VEC_RMODE(INSN, RMODE, OP)                                  \
     static void gen_##INSN(unsigned vece, uint32_t rd_ofs,             \
                            uint32_t rm_ofs,                            \
@@ -XXX,XX +XXX,XX @@ DO_VEC_RMODE(VCVTPS, FPROUNDING_POSINF, vcvt_rm_s)
 DO_VEC_RMODE(VCVTMU, FPROUNDING_NEGINF, vcvt_rm_u)
 DO_VEC_RMODE(VCVTMS, FPROUNDING_NEGINF, vcvt_rm_s)
 
+DO_VEC_RMODE(VRINTN, FPROUNDING_TIEEVEN, vrint_rm_)
+DO_VEC_RMODE(VRINTA, FPROUNDING_TIEAWAY, vrint_rm_)
+DO_VEC_RMODE(VRINTZ, FPROUNDING_ZERO, vrint_rm_)
+DO_VEC_RMODE(VRINTM, FPROUNDING_NEGINF, vrint_rm_)
+DO_VEC_RMODE(VRINTP, FPROUNDING_POSINF, vrint_rm_)
+
 static bool trans_VSWP(DisasContext *s, arg_2misc *a)
 {
     TCGv_i64 rm, rd;
--
2.20.1


Implement the MVE VSRI and VSLI insns, which perform a
shift-and-insert operation.
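The operation keeps whichever destination bits the shifted value does
not cover. A minimal per-element model (plain C sketch, not the
patch's 64-bit-at-a-time helper; vsri8/vsli8 are made-up names):

  #include <stdint.h>
  #include <stdio.h>

  /* VSRI.8: shift m right, insert into d under the shifted-in mask */
  static uint8_t vsri8(uint8_t d, uint8_t m, unsigned shift)
  {
      uint8_t mask = 0xff >> shift;       /* bits written by the shift */
      return (uint8_t)((m >> shift) & mask) | (d & ~mask);
  }

  /* VSLI.8 is the mirror image */
  static uint8_t vsli8(uint8_t d, uint8_t m, unsigned shift)
  {
      uint8_t mask = 0xff << shift;
      return (uint8_t)((m << shift) & mask) | (d & ~mask);
  }

  int main(void)
  {
      /* prints f0 ff: VSRI keeps d's top bits, VSLI keeps d's low bits */
      printf("%02x %02x\n", vsri8(0xff, 0x0f, 4), vsli8(0xff, 0x0f, 4));
      return 0;
  }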
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Message-id: 20210628135835.6690-11-peter.maydell@linaro.org
---
 target/arm/helper-mve.h    |  8 ++++++++
 target/arm/mve.decode      |  9 ++++++++
 target/arm/mve_helper.c    | 42 ++++++++++++++++++++++++++++++++++++++
 target/arm/translate-mve.c |  3 +++
 4 files changed, 62 insertions(+)

diff --git a/target/arm/helper-mve.h b/target/arm/helper-mve.h
index XXXXXXX..XXXXXXX 100644
--- a/target/arm/helper-mve.h
+++ b/target/arm/helper-mve.h
@@ -XXX,XX +XXX,XX @@ DEF_HELPER_FLAGS_4(mve_vshlltsb, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
 DEF_HELPER_FLAGS_4(mve_vshlltsh, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
 DEF_HELPER_FLAGS_4(mve_vshlltub, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
 DEF_HELPER_FLAGS_4(mve_vshlltuh, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_4(mve_vsrib, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(mve_vsrih, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(mve_vsriw, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_4(mve_vslib, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(mve_vslih, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(mve_vsliw, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
diff --git a/target/arm/mve.decode b/target/arm/mve.decode
index XXXXXXX..XXXXXXX 100644
--- a/target/arm/mve.decode
+++ b/target/arm/mve.decode
@@ -XXX,XX +XXX,XX @@ VSHLL_TS 111 0 1110 1 . 1 .. ... ... 1 1111 0 1 . 0 ... 0 @2_shll_h
 
 VSHLL_TU 111 1 1110 1 . 1 .. ... ... 1 1111 0 1 . 0 ... 0 @2_shll_b
 VSHLL_TU 111 1 1110 1 . 1 .. ... ... 1 1111 0 1 . 0 ... 0 @2_shll_h
+
+# Shift-and-insert
+VSRI 111 1 1111 1 . ... ... ... 0 0100 0 1 . 1 ... 0 @2_shr_b
+VSRI 111 1 1111 1 . ... ... ... 0 0100 0 1 . 1 ... 0 @2_shr_h
+VSRI 111 1 1111 1 . ... ... ... 0 0100 0 1 . 1 ... 0 @2_shr_w
+
+VSLI 111 1 1111 1 . ... ... ... 0 0101 0 1 . 1 ... 0 @2_shl_b
+VSLI 111 1 1111 1 . ... ... ... 0 0101 0 1 . 1 ... 0 @2_shl_h
+VSLI 111 1 1111 1 . ... ... ... 0 0101 0 1 . 1 ... 0 @2_shl_w
diff --git a/target/arm/mve_helper.c b/target/arm/mve_helper.c
index XXXXXXX..XXXXXXX 100644
--- a/target/arm/mve_helper.c
+++ b/target/arm/mve_helper.c
@@ -XXX,XX +XXX,XX @@ DO_2SHIFT_SAT_S(vqshlui_s, DO_SUQSHL_OP)
 DO_2SHIFT_U(vrshli_u, DO_VRSHLU)
 DO_2SHIFT_S(vrshli_s, DO_VRSHLS)
 
+/* Shift-and-insert; we always work with 64 bits at a time */
+#define DO_2SHIFT_INSERT(OP, ESIZE, SHIFTFN, MASKFN)                    \
+    void HELPER(glue(mve_, OP))(CPUARMState *env, void *vd,             \
+                                void *vm, uint32_t shift)               \
+    {                                                                   \
+        uint64_t *d = vd, *m = vm;                                      \
+        uint16_t mask;                                                  \
+        uint64_t shiftmask;                                             \
+        unsigned e;                                                     \
+        if (shift == 0 || shift == ESIZE * 8) {                         \
+            /*                                                          \
+             * Only VSLI can shift by 0; only VSRI can shift by <dt>.   \
+             * The generic logic would give the right answer for 0 but  \
+             * fails for <dt>.                                          \
+             */                                                         \
+            goto done;                                                  \
+        }                                                               \
+        assert(shift < ESIZE * 8);                                      \
+        mask = mve_element_mask(env);                                   \
+        /* ESIZE / 2 gives the MO_* value if ESIZE is in [1,2,4] */     \
+        shiftmask = dup_const(ESIZE / 2, MASKFN(ESIZE * 8, shift));     \
+        for (e = 0; e < 16 / 8; e++, mask >>= 8) {                      \
+            uint64_t r = (SHIFTFN(m[H8(e)], shift) & shiftmask) |       \
+                         (d[H8(e)] & ~shiftmask);                       \
+            mergemask(&d[H8(e)], r, mask);                              \
+        }                                                               \
+done:                                                                   \
+        mve_advance_vpt(env);                                           \
+    }
+
+#define DO_SHL(N, SHIFT) ((N) << (SHIFT))
+#define DO_SHR(N, SHIFT) ((N) >> (SHIFT))
+#define SHL_MASK(EBITS, SHIFT) MAKE_64BIT_MASK((SHIFT), (EBITS) - (SHIFT))
+#define SHR_MASK(EBITS, SHIFT) MAKE_64BIT_MASK(0, (EBITS) - (SHIFT))
+
+DO_2SHIFT_INSERT(vsrib, 1, DO_SHR, SHR_MASK)
+DO_2SHIFT_INSERT(vsrih, 2, DO_SHR, SHR_MASK)
+DO_2SHIFT_INSERT(vsriw, 4, DO_SHR, SHR_MASK)
+DO_2SHIFT_INSERT(vslib, 1, DO_SHL, SHL_MASK)
+DO_2SHIFT_INSERT(vslih, 2, DO_SHL, SHL_MASK)
+DO_2SHIFT_INSERT(vsliw, 4, DO_SHL, SHL_MASK)
+
 /*
  * Long shifts taking half-sized inputs from top or bottom of the input
  * vector and producing a double-width result. ESIZE, TYPE are for
diff --git a/target/arm/translate-mve.c b/target/arm/translate-mve.c
index XXXXXXX..XXXXXXX 100644
--- a/target/arm/translate-mve.c
+++ b/target/arm/translate-mve.c
@@ -XXX,XX +XXX,XX @@ DO_2SHIFT(VSHRI_U, vshli_u, true)
 DO_2SHIFT(VRSHRI_S, vrshli_s, true)
 DO_2SHIFT(VRSHRI_U, vrshli_u, true)
 
+DO_2SHIFT(VSRI, vsri, false)
+DO_2SHIFT(VSLI, vsli, false)
--
2.20.1

Convert the neon floating-point vector compare-vs-0 insns VCEQ0,
VCGT0, VCLE0, VCGE0 and VCLT0 to use a gvec helper, and use this to
implement the fp16 case.
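The only wrinkle is that VCLT0 and VCLE0 reuse the greater-than and
greater-or-equal primitives with the operands swapped. A plain-C model
of the FWD/REV wrapping (a sketch only, not the softfloat-based
helpers below):

  #include <stdint.h>
  #include <stdio.h>

  /* Vector compares produce all-ones on true, all-zeroes on false */
  static uint32_t cgt(float a, float b) { return a > b ? -1 : 0; }

  static uint32_t vcgt0(float op) { return cgt(op, 0.0f); }  /* FWD */
  static uint32_t vclt0(float op) { return cgt(0.0f, op); }  /* REV */

  int main(void)
  {
      /* prints 00000000 ffffffff for a negative input */
      printf("%08x %08x\n", vcgt0(-1.5f), vclt0(-1.5f));
      return 0;
  }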
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Message-id: 20200828183354.27913-33-peter.maydell@linaro.org
---
 target/arm/helper.h             | 15 +++++++++++++++
 target/arm/vec_helper.c         | 25 +++++++++++++++++++++++++
 target/arm/translate-neon.c.inc | 33 +++++----------------------------
 3 files changed, 45 insertions(+), 28 deletions(-)

diff --git a/target/arm/helper.h b/target/arm/helper.h
index XXXXXXX..XXXXXXX 100644
--- a/target/arm/helper.h
+++ b/target/arm/helper.h
@@ -XXX,XX +XXX,XX @@ DEF_HELPER_FLAGS_4(gvec_frsqrte_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
 DEF_HELPER_FLAGS_4(gvec_frsqrte_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
 DEF_HELPER_FLAGS_4(gvec_frsqrte_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
 
+DEF_HELPER_FLAGS_4(gvec_fcgt0_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(gvec_fcgt0_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_4(gvec_fcge0_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(gvec_fcge0_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_4(gvec_fceq0_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(gvec_fceq0_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_4(gvec_fcle0_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(gvec_fcle0_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_4(gvec_fclt0_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(gvec_fclt0_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+
 DEF_HELPER_FLAGS_5(gvec_fadd_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
 DEF_HELPER_FLAGS_5(gvec_fadd_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
 DEF_HELPER_FLAGS_5(gvec_fadd_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
diff --git a/target/arm/vec_helper.c b/target/arm/vec_helper.c
index XXXXXXX..XXXXXXX 100644
--- a/target/arm/vec_helper.c
+++ b/target/arm/vec_helper.c
@@ -XXX,XX +XXX,XX @@ DO_2OP(gvec_frsqrte_h, helper_rsqrte_f16, float16)
 DO_2OP(gvec_frsqrte_s, helper_rsqrte_f32, float32)
 DO_2OP(gvec_frsqrte_d, helper_rsqrte_f64, float64)
 
+#define WRAP_CMP0_FWD(FN, CMPOP, TYPE)                          \
+    static TYPE TYPE##_##FN##0(TYPE op, float_status *stat)     \
+    {                                                           \
+        return TYPE##_##CMPOP(op, TYPE##_zero, stat);           \
+    }
+
+#define WRAP_CMP0_REV(FN, CMPOP, TYPE)                          \
+    static TYPE TYPE##_##FN##0(TYPE op, float_status *stat)     \
+    {                                                           \
+        return TYPE##_##CMPOP(TYPE##_zero, op, stat);           \
+    }
+
+#define DO_2OP_CMP0(FN, CMPOP, DIRN)                    \
+    WRAP_CMP0_##DIRN(FN, CMPOP, float16)                \
+    WRAP_CMP0_##DIRN(FN, CMPOP, float32)                \
+    DO_2OP(gvec_f##FN##0_h, float16_##FN##0, float16)   \
+    DO_2OP(gvec_f##FN##0_s, float32_##FN##0, float32)
+
+DO_2OP_CMP0(cgt, cgt, FWD)
+DO_2OP_CMP0(cge, cge, FWD)
+DO_2OP_CMP0(ceq, ceq, FWD)
+DO_2OP_CMP0(clt, cgt, REV)
+DO_2OP_CMP0(cle, cge, REV)
+
 #undef DO_2OP
+#undef DO_2OP_CMP0
 
 /* Floating-point trigonometric starting value.
  * See the ARM ARM pseudocode function FPTrigSMul.
diff --git a/target/arm/translate-neon.c.inc b/target/arm/translate-neon.c.inc
index XXXXXXX..XXXXXXX 100644
--- a/target/arm/translate-neon.c.inc
+++ b/target/arm/translate-neon.c.inc
@@ -XXX,XX +XXX,XX @@ DO_2MISC_FP(VCVT_UF, gen_helper_vfp_touizs)
 
 DO_2MISC_FP_VEC(VRECPE_F, gen_helper_gvec_frecpe_h, gen_helper_gvec_frecpe_s)
 DO_2MISC_FP_VEC(VRSQRTE_F, gen_helper_gvec_frsqrte_h, gen_helper_gvec_frsqrte_s)
+DO_2MISC_FP_VEC(VCGT0_F, gen_helper_gvec_fcgt0_h, gen_helper_gvec_fcgt0_s)
+DO_2MISC_FP_VEC(VCGE0_F, gen_helper_gvec_fcge0_h, gen_helper_gvec_fcge0_s)
+DO_2MISC_FP_VEC(VCEQ0_F, gen_helper_gvec_fceq0_h, gen_helper_gvec_fceq0_s)
+DO_2MISC_FP_VEC(VCLT0_F, gen_helper_gvec_fclt0_h, gen_helper_gvec_fclt0_s)
+DO_2MISC_FP_VEC(VCLE0_F, gen_helper_gvec_fcle0_h, gen_helper_gvec_fcle0_s)
 
 static bool trans_VRINTX(DisasContext *s, arg_2misc *a)
 {
@@ -XXX,XX +XXX,XX @@ static bool trans_VRINTX(DisasContext *s, arg_2misc *a)
     return do_2misc_fp(s, a, gen_helper_rints_exact);
 }
 
-#define WRAP_FP_CMP0_FWD(WRAPNAME, FUNC)                        \
-    static void WRAPNAME(TCGv_i32 d, TCGv_i32 m, TCGv_ptr fpst) \
-    {                                                           \
-        TCGv_i32 zero = tcg_const_i32(0);                       \
-        FUNC(d, m, zero, fpst);                                 \
-        tcg_temp_free_i32(zero);                                \
-    }
-#define WRAP_FP_CMP0_REV(WRAPNAME, FUNC)                        \
-    static void WRAPNAME(TCGv_i32 d, TCGv_i32 m, TCGv_ptr fpst) \
-    {                                                           \
-        TCGv_i32 zero = tcg_const_i32(0);                       \
-        FUNC(d, zero, m, fpst);                                 \
-        tcg_temp_free_i32(zero);                                \
-    }
-
-#define DO_FP_CMP0(INSN, FUNC, REV)                             \
-    WRAP_FP_CMP0_##REV(gen_##INSN, FUNC)                        \
-    static bool trans_##INSN(DisasContext *s, arg_2misc *a)     \
-    {                                                           \
-        return do_2misc_fp(s, a, gen_##INSN);                   \
-    }
-
-DO_FP_CMP0(VCGT0_F, gen_helper_neon_cgt_f32, FWD)
-DO_FP_CMP0(VCGE0_F, gen_helper_neon_cge_f32, FWD)
-DO_FP_CMP0(VCEQ0_F, gen_helper_neon_ceq_f32, FWD)
-DO_FP_CMP0(VCLE0_F, gen_helper_neon_cge_f32, REV)
-DO_FP_CMP0(VCLT0_F, gen_helper_neon_cgt_f32, REV)
-
 static bool do_vrint(DisasContext *s, arg_2misc *a, int rmode)
 {
     /*
--
2.20.1


Implement the MVE shift-right-and-narrow insns VSHRN and VRSHRN.

do_urshr() is borrowed from sve_helper.c.
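The rounding in do_urshr() adds back half of the last bit shifted out,
i.e. it computes (x + (1 << (sh - 1))) >> sh without the addition ever
overflowing. A standalone check of that identity (plain C sketch,
reusing the function from the patch body below):

  #include <assert.h>
  #include <stdint.h>
  #include <stdio.h>

  static uint64_t do_urshr(uint64_t x, unsigned sh)
  {
      return (x >> sh) + ((x >> (sh - 1)) & 1);
  }

  int main(void)
  {
      /* 7 >> 1 rounds 3.5 up to 4; 5 >> 1 rounds 2.5 up to 3 */
      assert(do_urshr(7, 1) == 4);
      assert(do_urshr(5, 1) == 3);
      /* overflow-safe where the naive add would wrap */
      assert(do_urshr(UINT64_MAX, 1) == UINT64_MAX / 2 + 1);
      printf("ok\n");
      return 0;
  }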
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Message-id: 20210628135835.6690-12-peter.maydell@linaro.org
---
 target/arm/helper-mve.h    | 10 ++++++++++
 target/arm/mve.decode      | 11 +++++++++++
 target/arm/mve_helper.c    | 40 ++++++++++++++++++++++++++++++++++++++
 target/arm/translate-mve.c | 15 ++++++++++++++
 4 files changed, 76 insertions(+)

diff --git a/target/arm/helper-mve.h b/target/arm/helper-mve.h
index XXXXXXX..XXXXXXX 100644
--- a/target/arm/helper-mve.h
+++ b/target/arm/helper-mve.h
@@ -XXX,XX +XXX,XX @@ DEF_HELPER_FLAGS_4(mve_vsriw, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
 DEF_HELPER_FLAGS_4(mve_vslib, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
 DEF_HELPER_FLAGS_4(mve_vslih, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
 DEF_HELPER_FLAGS_4(mve_vsliw, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_4(mve_vshrnbb, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(mve_vshrnbh, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(mve_vshrntb, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(mve_vshrnth, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_4(mve_vrshrnbb, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(mve_vrshrnbh, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(mve_vrshrntb, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(mve_vrshrnth, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
diff --git a/target/arm/mve.decode b/target/arm/mve.decode
index XXXXXXX..XXXXXXX 100644
--- a/target/arm/mve.decode
+++ b/target/arm/mve.decode
@@ -XXX,XX +XXX,XX @@ VSRI 111 1 1111 1 . ... ... ... 0 0100 0 1 . 1 ... 0 @2_shr_w
 VSLI 111 1 1111 1 . ... ... ... 0 0101 0 1 . 1 ... 0 @2_shl_b
 VSLI 111 1 1111 1 . ... ... ... 0 0101 0 1 . 1 ... 0 @2_shl_h
 VSLI 111 1 1111 1 . ... ... ... 0 0101 0 1 . 1 ... 0 @2_shl_w
+
+# Narrowing shifts (which only support b and h sizes)
+VSHRNB 111 0 1110 1 . ... ... ... 0 1111 1 1 . 0 ... 1 @2_shr_b
+VSHRNB 111 0 1110 1 . ... ... ... 0 1111 1 1 . 0 ... 1 @2_shr_h
+VSHRNT 111 0 1110 1 . ... ... ... 1 1111 1 1 . 0 ... 1 @2_shr_b
+VSHRNT 111 0 1110 1 . ... ... ... 1 1111 1 1 . 0 ... 1 @2_shr_h
+
+VRSHRNB 111 1 1110 1 . ... ... ... 0 1111 1 1 . 0 ... 1 @2_shr_b
+VRSHRNB 111 1 1110 1 . ... ... ... 0 1111 1 1 . 0 ... 1 @2_shr_h
+VRSHRNT 111 1 1110 1 . ... ... ... 1 1111 1 1 . 0 ... 1 @2_shr_b
+VRSHRNT 111 1 1110 1 . ... ... ... 1 1111 1 1 . 0 ... 1 @2_shr_h
diff --git a/target/arm/mve_helper.c b/target/arm/mve_helper.c
index XXXXXXX..XXXXXXX 100644
--- a/target/arm/mve_helper.c
+++ b/target/arm/mve_helper.c
@@ -XXX,XX +XXX,XX @@ DO_2SHIFT_INSERT(vsliw, 4, DO_SHL, SHL_MASK)
 
 DO_VSHLL_ALL(vshllb, false)
 DO_VSHLL_ALL(vshllt, true)
+
+/*
+ * Narrowing right shifts, taking a double sized input, shifting it
+ * and putting the result in either the top or bottom half of the output.
+ * ESIZE, TYPE are the output, and LESIZE, LTYPE the input.
+ */
+#define DO_VSHRN(OP, TOP, ESIZE, TYPE, LESIZE, LTYPE, FN)       \
+    void HELPER(glue(mve_, OP))(CPUARMState *env, void *vd,     \
+                                void *vm, uint32_t shift)       \
+    {                                                           \
+        LTYPE *m = vm;                                          \
+        TYPE *d = vd;                                           \
+        uint16_t mask = mve_element_mask(env);                  \
+        unsigned le;                                            \
+        for (le = 0; le < 16 / LESIZE; le++, mask >>= LESIZE) { \
+            TYPE r = FN(m[H##LESIZE(le)], shift);               \
+            mergemask(&d[H##ESIZE(le * 2 + TOP)], r, mask);     \
+        }                                                       \
+        mve_advance_vpt(env);                                   \
+    }
+
+#define DO_VSHRN_ALL(OP, FN)                                    \
+    DO_VSHRN(OP##bb, false, 1, uint8_t, 2, uint16_t, FN)        \
+    DO_VSHRN(OP##bh, false, 2, uint16_t, 4, uint32_t, FN)       \
+    DO_VSHRN(OP##tb, true, 1, uint8_t, 2, uint16_t, FN)         \
+    DO_VSHRN(OP##th, true, 2, uint16_t, 4, uint32_t, FN)
+
+static inline uint64_t do_urshr(uint64_t x, unsigned sh)
+{
+    if (likely(sh < 64)) {
+        return (x >> sh) + ((x >> (sh - 1)) & 1);
+    } else if (sh == 64) {
+        return x >> 63;
+    } else {
+        return 0;
+    }
+}
+
+DO_VSHRN_ALL(vshrn, DO_SHR)
+DO_VSHRN_ALL(vrshrn, do_urshr)
diff --git a/target/arm/translate-mve.c b/target/arm/translate-mve.c
index XXXXXXX..XXXXXXX 100644
--- a/target/arm/translate-mve.c
+++ b/target/arm/translate-mve.c
@@ -XXX,XX +XXX,XX @@ DO_VSHLL(VSHLL_BS, vshllbs)
 DO_VSHLL(VSHLL_BU, vshllbu)
 DO_VSHLL(VSHLL_TS, vshllts)
 DO_VSHLL(VSHLL_TU, vshlltu)
+
+#define DO_2SHIFT_N(INSN, FN)                                   \
+    static bool trans_##INSN(DisasContext *s, arg_2shift *a)    \
+    {                                                           \
+        static MVEGenTwoOpShiftFn * const fns[] = {             \
+            gen_helper_mve_##FN##b,                             \
+            gen_helper_mve_##FN##h,                             \
+        };                                                      \
+        return do_2shift(s, a, fns[a->size], false);            \
+    }
+
+DO_2SHIFT_N(VSHRNB, vshrnb)
+DO_2SHIFT_N(VSHRNT, vshrnt)
+DO_2SHIFT_N(VRSHRNB, vrshrnb)
+DO_2SHIFT_N(VRSHRNT, vrshrnt)
--
2.20.1

Convert the Neon float-integer VCVT insns to gvec, and use this
to implement fp16 support for them.

Note that unlike the VFP int<->fp16 VCVT insns we converted
earlier and which convert to/from a 32-bit integer, these
Neon insns convert to/from 16-bit integers. So we can use
the existing vfp conversion helpers for the f32<->u32/i32
case but need to provide our own for f16<->u16/i16.
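The reason the f16<->u16/i16 case needs new helpers is visible in the
patch body: vfp_tosszh() and vfp_touszh() must squash NaN inputs to
zero by hand. A plain-C illustration of the requirement (a sketch
using the host FPU and ignoring out-of-range inputs; the real helpers
use softfloat):

  #include <math.h>
  #include <stdint.h>
  #include <stdio.h>

  /* Arm float-to-int conversions return 0 for NaN, not a host value */
  static int16_t toss_arm(float x)
  {
      if (isnan(x)) {
          return 0;            /* the real helper also raises Invalid Op */
      }
      return (int16_t)truncf(x);   /* round towards zero */
  }

  int main(void)
  {
      /* prints -2 0 */
      printf("%d %d\n", toss_arm(-2.7f), toss_arm(nanf("")));
      return 0;
  }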
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Message-id: 20200828183354.27913-37-peter.maydell@linaro.org
---
 target/arm/helper.h             |  9 +++++++++
 target/arm/vec_helper.c         | 29 +++++++++++++++++++++++++++++
 target/arm/translate-neon.c.inc | 15 ++++-----------
 3 files changed, 42 insertions(+), 11 deletions(-)

diff --git a/target/arm/helper.h b/target/arm/helper.h
index XXXXXXX..XXXXXXX 100644
--- a/target/arm/helper.h
+++ b/target/arm/helper.h
@@ -XXX,XX +XXX,XX @@ DEF_HELPER_FLAGS_5(neon_padds, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
 DEF_HELPER_FLAGS_5(neon_pmaxs, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
 DEF_HELPER_FLAGS_5(neon_pmins, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
 
+DEF_HELPER_FLAGS_4(gvec_sstoh, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(gvec_sitos, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(gvec_ustoh, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(gvec_uitos, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(gvec_tosszh, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(gvec_tosizs, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(gvec_touszh, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(gvec_touizs, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+
 DEF_HELPER_FLAGS_4(gvec_frecpe_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
 DEF_HELPER_FLAGS_4(gvec_frecpe_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
 DEF_HELPER_FLAGS_4(gvec_frecpe_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
diff --git a/target/arm/vec_helper.c b/target/arm/vec_helper.c
index XXXXXXX..XXXXXXX 100644
--- a/target/arm/vec_helper.c
+++ b/target/arm/vec_helper.c
@@ -XXX,XX +XXX,XX @@ static uint32_t float32_acgt(float32 op1, float32 op2, float_status *stat)
     return -float32_lt(float32_abs(op2), float32_abs(op1), stat);
 }
 
+static int16_t vfp_tosszh(float16 x, void *fpstp)
+{
+    float_status *fpst = fpstp;
+    if (float16_is_any_nan(x)) {
+        float_raise(float_flag_invalid, fpst);
+        return 0;
+    }
+    return float16_to_int16_round_to_zero(x, fpst);
+}
+
+static uint16_t vfp_touszh(float16 x, void *fpstp)
+{
+    float_status *fpst = fpstp;
+    if (float16_is_any_nan(x)) {
+        float_raise(float_flag_invalid, fpst);
+        return 0;
+    }
+    return float16_to_uint16_round_to_zero(x, fpst);
+}
+
 #define DO_2OP(NAME, FUNC, TYPE) \
 void HELPER(NAME)(void *vd, void *vn, void *stat, uint32_t desc) \
 { \
@@ -XXX,XX +XXX,XX @@ DO_2OP(gvec_frsqrte_h, helper_rsqrte_f16, float16)
 DO_2OP(gvec_frsqrte_s, helper_rsqrte_f32, float32)
 DO_2OP(gvec_frsqrte_d, helper_rsqrte_f64, float64)
 
+DO_2OP(gvec_sitos, helper_vfp_sitos, int32_t)
+DO_2OP(gvec_uitos, helper_vfp_uitos, uint32_t)
+DO_2OP(gvec_tosizs, helper_vfp_tosizs, float32)
+DO_2OP(gvec_touizs, helper_vfp_touizs, float32)
+DO_2OP(gvec_sstoh, int16_to_float16, int16_t)
+DO_2OP(gvec_ustoh, uint16_to_float16, uint16_t)
+DO_2OP(gvec_tosszh, vfp_tosszh, float16)
+DO_2OP(gvec_touszh, vfp_touszh, float16)
+
 #define WRAP_CMP0_FWD(FN, CMPOP, TYPE) \
     static TYPE TYPE##_##FN##0(TYPE op, float_status *stat) \
     { \
diff --git a/target/arm/translate-neon.c.inc b/target/arm/translate-neon.c.inc
index XXXXXXX..XXXXXXX 100644
--- a/target/arm/translate-neon.c.inc
+++ b/target/arm/translate-neon.c.inc
@@ -XXX,XX +XXX,XX @@ static bool do_2misc_fp(DisasContext *s, arg_2misc *a,


Implement the MVE saturating shift-right-and-narrow insns
VQSHRN, VQSHRUN, VQRSHRN and VQRSHRUN.

do_srshr() is borrowed from sve_helper.c.
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Message-id: 20210628135835.6690-13-peter.maydell@linaro.org
---
 target/arm/helper-mve.h    |  30 +++++++++++
 target/arm/mve.decode      |  28 ++++++++++
 target/arm/mve_helper.c    | 104 +++++++++++++++++++++++++++++++++++++
 target/arm/translate-mve.c |  12 +++++
 4 files changed, 174 insertions(+)

diff --git a/target/arm/helper-mve.h b/target/arm/helper-mve.h
index XXXXXXX..XXXXXXX 100644
--- a/target/arm/helper-mve.h
+++ b/target/arm/helper-mve.h
@@ -XXX,XX +XXX,XX @@ DEF_HELPER_FLAGS_4(mve_vrshrnbb, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
 DEF_HELPER_FLAGS_4(mve_vrshrnbh, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
 DEF_HELPER_FLAGS_4(mve_vrshrntb, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
 DEF_HELPER_FLAGS_4(mve_vrshrnth, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_4(mve_vqshrnb_sb, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(mve_vqshrnb_sh, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(mve_vqshrnt_sb, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(mve_vqshrnt_sh, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_4(mve_vqshrnb_ub, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(mve_vqshrnb_uh, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(mve_vqshrnt_ub, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(mve_vqshrnt_uh, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_4(mve_vqshrunbb, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(mve_vqshrunbh, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(mve_vqshruntb, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(mve_vqshrunth, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_4(mve_vqrshrnb_sb, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(mve_vqrshrnb_sh, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(mve_vqrshrnt_sb, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(mve_vqrshrnt_sh, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_4(mve_vqrshrnb_ub, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(mve_vqrshrnb_uh, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(mve_vqrshrnt_ub, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(mve_vqrshrnt_uh, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_4(mve_vqrshrunbb, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(mve_vqrshrunbh, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(mve_vqrshruntb, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(mve_vqrshrunth, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
diff --git a/target/arm/mve.decode b/target/arm/mve.decode
index XXXXXXX..XXXXXXX 100644
--- a/target/arm/mve.decode
+++ b/target/arm/mve.decode
@@ -XXX,XX +XXX,XX @@ VRSHRNB 111 1 1110 1 . ... ... ... 0 1111 1 1 . 0 ... 1 @2_shr_b
 VRSHRNB 111 1 1110 1 . ... ... ... 0 1111 1 1 . 0 ... 1 @2_shr_h
 VRSHRNT 111 1 1110 1 . ... ... ... 1 1111 1 1 . 0 ... 1 @2_shr_b
 VRSHRNT 111 1 1110 1 . ... ... ... 1 1111 1 1 . 0 ... 1 @2_shr_h
+
+VQSHRNB_S 111 0 1110 1 . ... ... ... 0 1111 0 1 . 0 ... 0 @2_shr_b
+VQSHRNB_S 111 0 1110 1 . ... ... ... 0 1111 0 1 . 0 ... 0 @2_shr_h
+VQSHRNT_S 111 0 1110 1 . ... ... ... 1 1111 0 1 . 0 ... 0 @2_shr_b
+VQSHRNT_S 111 0 1110 1 . ... ... ... 1 1111 0 1 . 0 ... 0 @2_shr_h
+VQSHRNB_U 111 1 1110 1 . ... ... ... 0 1111 0 1 . 0 ... 0 @2_shr_b
+VQSHRNB_U 111 1 1110 1 . ... ... ... 0 1111 0 1 . 0 ... 0 @2_shr_h
+VQSHRNT_U 111 1 1110 1 . ... ... ... 1 1111 0 1 . 0 ... 0 @2_shr_b
+VQSHRNT_U 111 1 1110 1 . ... ... ... 1 1111 0 1 . 0 ... 0 @2_shr_h
+
+VQSHRUNB 111 0 1110 1 . ... ... ... 0 1111 1 1 . 0 ... 0 @2_shr_b
+VQSHRUNB 111 0 1110 1 . ... ... ... 0 1111 1 1 . 0 ... 0 @2_shr_h
+VQSHRUNT 111 0 1110 1 . ... ... ... 1 1111 1 1 . 0 ... 0 @2_shr_b
+VQSHRUNT 111 0 1110 1 . ... ... ... 1 1111 1 1 . 0 ... 0 @2_shr_h
+
+VQRSHRNB_S 111 0 1110 1 . ... ... ... 0 1111 0 1 . 0 ... 1 @2_shr_b
+VQRSHRNB_S 111 0 1110 1 . ... ... ... 0 1111 0 1 . 0 ... 1 @2_shr_h
+VQRSHRNT_S 111 0 1110 1 . ... ... ... 1 1111 0 1 . 0 ... 1 @2_shr_b
+VQRSHRNT_S 111 0 1110 1 . ... ... ... 1 1111 0 1 . 0 ... 1 @2_shr_h
+VQRSHRNB_U 111 1 1110 1 . ... ... ... 0 1111 0 1 . 0 ... 1 @2_shr_b
+VQRSHRNB_U 111 1 1110 1 . ... ... ... 0 1111 0 1 . 0 ... 1 @2_shr_h
+VQRSHRNT_U 111 1 1110 1 . ... ... ... 1 1111 0 1 . 0 ... 1 @2_shr_b
+VQRSHRNT_U 111 1 1110 1 . ... ... ... 1 1111 0 1 . 0 ... 1 @2_shr_h
+
+VQRSHRUNB 111 1 1110 1 . ... ... ... 0 1111 1 1 . 0 ... 0 @2_shr_b
+VQRSHRUNB 111 1 1110 1 . ... ... ... 0 1111 1 1 . 0 ... 0 @2_shr_h
+VQRSHRUNT 111 1 1110 1 . ... ... ... 1 1111 1 1 . 0 ... 0 @2_shr_b
+VQRSHRUNT 111 1 1110 1 . ... ... ... 1 1111 1 1 . 0 ... 0 @2_shr_h
diff --git a/target/arm/mve_helper.c b/target/arm/mve_helper.c
index XXXXXXX..XXXXXXX 100644
--- a/target/arm/mve_helper.c
+++ b/target/arm/mve_helper.c
@@ -XXX,XX +XXX,XX @@ static inline uint64_t do_urshr(uint64_t x, unsigned sh)
     }
 }
 
+static inline int64_t do_srshr(int64_t x, unsigned sh)
+{
+    if (likely(sh < 64)) {
+        return (x >> sh) + ((x >> (sh - 1)) & 1);
+    } else {
+        /* Rounding the sign bit always produces 0. */
+        return 0;
+    }
+}
+
 DO_VSHRN_ALL(vshrn, DO_SHR)
 DO_VSHRN_ALL(vrshrn, do_urshr)
+
+static inline int32_t do_sat_bhs(int64_t val, int64_t min, int64_t max,
+                                 bool *satp)
+{
+    if (val > max) {
+        *satp = true;
+        return max;
+    } else if (val < min) {
+        *satp = true;
+        return min;
+    } else {
+        return val;
+    }
+}
+
+/* Saturating narrowing right shifts */
+#define DO_VSHRN_SAT(OP, TOP, ESIZE, TYPE, LESIZE, LTYPE, FN)   \
+    void HELPER(glue(mve_, OP))(CPUARMState *env, void *vd,     \
+                                void *vm, uint32_t shift)       \
+    {                                                           \
+        LTYPE *m = vm;                                          \
+        TYPE *d = vd;                                           \
+        uint16_t mask = mve_element_mask(env);                  \
+        bool qc = false;                                        \
+        unsigned le;                                            \
+        for (le = 0; le < 16 / LESIZE; le++, mask >>= LESIZE) { \
+            bool sat = false;                                   \
+            TYPE r = FN(m[H##LESIZE(le)], shift, &sat);         \
+            mergemask(&d[H##ESIZE(le * 2 + TOP)], r, mask);     \
+            qc |= sat && (mask & 1 << (TOP * ESIZE));           \
+        }                                                       \
+        if (qc) {                                               \
+            env->vfp.qc[0] = qc;                                \
+        }                                                       \
+        mve_advance_vpt(env);                                   \
+    }
+
+#define DO_VSHRN_SAT_UB(BOP, TOP, FN) \
147
+#define DO_VSHRN_SAT_UB(BOP, TOP, FN) \
90
@@ -XXX,XX +XXX,XX @@ static bool do_2misc_fp(DisasContext *s, arg_2misc *a,
148
+ DO_VSHRN_SAT(BOP, false, 1, uint8_t, 2, uint16_t, FN) \
91
return true;
149
+ DO_VSHRN_SAT(TOP, true, 1, uint8_t, 2, uint16_t, FN)
92
}
150
+
93
151
+#define DO_VSHRN_SAT_UH(BOP, TOP, FN) \
94
-#define DO_2MISC_FP(INSN, FUNC) \
152
+ DO_VSHRN_SAT(BOP, false, 2, uint16_t, 4, uint32_t, FN) \
95
- static bool trans_##INSN(DisasContext *s, arg_2misc *a) \
153
+ DO_VSHRN_SAT(TOP, true, 2, uint16_t, 4, uint32_t, FN)
96
- { \
154
+
97
- return do_2misc_fp(s, a, FUNC); \
155
+#define DO_VSHRN_SAT_SB(BOP, TOP, FN) \
98
- }
156
+ DO_VSHRN_SAT(BOP, false, 1, int8_t, 2, int16_t, FN) \
99
-
157
+ DO_VSHRN_SAT(TOP, true, 1, int8_t, 2, int16_t, FN)
100
-DO_2MISC_FP(VCVT_FS, gen_helper_vfp_sitos)
158
+
101
-DO_2MISC_FP(VCVT_FU, gen_helper_vfp_uitos)
159
+#define DO_VSHRN_SAT_SH(BOP, TOP, FN) \
102
-DO_2MISC_FP(VCVT_SF, gen_helper_vfp_tosizs)
160
+ DO_VSHRN_SAT(BOP, false, 2, int16_t, 4, int32_t, FN) \
103
-DO_2MISC_FP(VCVT_UF, gen_helper_vfp_touizs)
161
+ DO_VSHRN_SAT(TOP, true, 2, int16_t, 4, int32_t, FN)
104
-
162
+
105
#define DO_2MISC_FP_VEC(INSN, HFUNC, SFUNC) \
163
+#define DO_SHRN_SB(N, M, SATP) \
106
static void gen_##INSN(unsigned vece, uint32_t rd_ofs, \
164
+ do_sat_bhs((int64_t)(N) >> (M), INT8_MIN, INT8_MAX, SATP)
107
uint32_t rm_ofs, \
165
+#define DO_SHRN_UB(N, M, SATP) \
108
@@ -XXX,XX +XXX,XX @@ DO_2MISC_FP_VEC(VCGE0_F, gen_helper_gvec_fcge0_h, gen_helper_gvec_fcge0_s)
166
+ do_sat_bhs((uint64_t)(N) >> (M), 0, UINT8_MAX, SATP)
109
DO_2MISC_FP_VEC(VCEQ0_F, gen_helper_gvec_fceq0_h, gen_helper_gvec_fceq0_s)
167
+#define DO_SHRUN_B(N, M, SATP) \
110
DO_2MISC_FP_VEC(VCLT0_F, gen_helper_gvec_fclt0_h, gen_helper_gvec_fclt0_s)
168
+ do_sat_bhs((int64_t)(N) >> (M), 0, UINT8_MAX, SATP)
111
DO_2MISC_FP_VEC(VCLE0_F, gen_helper_gvec_fcle0_h, gen_helper_gvec_fcle0_s)
169
+
112
+DO_2MISC_FP_VEC(VCVT_FS, gen_helper_gvec_sstoh, gen_helper_gvec_sitos)
170
+#define DO_SHRN_SH(N, M, SATP) \
113
+DO_2MISC_FP_VEC(VCVT_FU, gen_helper_gvec_ustoh, gen_helper_gvec_uitos)
171
+ do_sat_bhs((int64_t)(N) >> (M), INT16_MIN, INT16_MAX, SATP)
114
+DO_2MISC_FP_VEC(VCVT_SF, gen_helper_gvec_tosszh, gen_helper_gvec_tosizs)
172
+#define DO_SHRN_UH(N, M, SATP) \
115
+DO_2MISC_FP_VEC(VCVT_UF, gen_helper_gvec_touszh, gen_helper_gvec_touizs)
173
+ do_sat_bhs((uint64_t)(N) >> (M), 0, UINT16_MAX, SATP)
116
174
+#define DO_SHRUN_H(N, M, SATP) \
117
static bool trans_VRINTX(DisasContext *s, arg_2misc *a)
175
+ do_sat_bhs((int64_t)(N) >> (M), 0, UINT16_MAX, SATP)
118
{
176
+
177
+#define DO_RSHRN_SB(N, M, SATP) \
178
+ do_sat_bhs(do_srshr(N, M), INT8_MIN, INT8_MAX, SATP)
179
+#define DO_RSHRN_UB(N, M, SATP) \
180
+ do_sat_bhs(do_urshr(N, M), 0, UINT8_MAX, SATP)
181
+#define DO_RSHRUN_B(N, M, SATP) \
182
+ do_sat_bhs(do_srshr(N, M), 0, UINT8_MAX, SATP)
183
+
184
+#define DO_RSHRN_SH(N, M, SATP) \
185
+ do_sat_bhs(do_srshr(N, M), INT16_MIN, INT16_MAX, SATP)
186
+#define DO_RSHRN_UH(N, M, SATP) \
187
+ do_sat_bhs(do_urshr(N, M), 0, UINT16_MAX, SATP)
188
+#define DO_RSHRUN_H(N, M, SATP) \
189
+ do_sat_bhs(do_srshr(N, M), 0, UINT16_MAX, SATP)
190
+
191
+DO_VSHRN_SAT_SB(vqshrnb_sb, vqshrnt_sb, DO_SHRN_SB)
192
+DO_VSHRN_SAT_SH(vqshrnb_sh, vqshrnt_sh, DO_SHRN_SH)
193
+DO_VSHRN_SAT_UB(vqshrnb_ub, vqshrnt_ub, DO_SHRN_UB)
194
+DO_VSHRN_SAT_UH(vqshrnb_uh, vqshrnt_uh, DO_SHRN_UH)
195
+DO_VSHRN_SAT_SB(vqshrunbb, vqshruntb, DO_SHRUN_B)
196
+DO_VSHRN_SAT_SH(vqshrunbh, vqshrunth, DO_SHRUN_H)
197
+
198
+DO_VSHRN_SAT_SB(vqrshrnb_sb, vqrshrnt_sb, DO_RSHRN_SB)
199
+DO_VSHRN_SAT_SH(vqrshrnb_sh, vqrshrnt_sh, DO_RSHRN_SH)
200
+DO_VSHRN_SAT_UB(vqrshrnb_ub, vqrshrnt_ub, DO_RSHRN_UB)
201
+DO_VSHRN_SAT_UH(vqrshrnb_uh, vqrshrnt_uh, DO_RSHRN_UH)
202
+DO_VSHRN_SAT_SB(vqrshrunbb, vqrshruntb, DO_RSHRUN_B)
203
+DO_VSHRN_SAT_SH(vqrshrunbh, vqrshrunth, DO_RSHRUN_H)
204
diff --git a/target/arm/translate-mve.c b/target/arm/translate-mve.c
205
index XXXXXXX..XXXXXXX 100644
206
--- a/target/arm/translate-mve.c
207
+++ b/target/arm/translate-mve.c
208
@@ -XXX,XX +XXX,XX @@ DO_2SHIFT_N(VSHRNB, vshrnb)
209
DO_2SHIFT_N(VSHRNT, vshrnt)
210
DO_2SHIFT_N(VRSHRNB, vrshrnb)
211
DO_2SHIFT_N(VRSHRNT, vrshrnt)
212
+DO_2SHIFT_N(VQSHRNB_S, vqshrnb_s)
213
+DO_2SHIFT_N(VQSHRNT_S, vqshrnt_s)
214
+DO_2SHIFT_N(VQSHRNB_U, vqshrnb_u)
215
+DO_2SHIFT_N(VQSHRNT_U, vqshrnt_u)
216
+DO_2SHIFT_N(VQSHRUNB, vqshrunb)
217
+DO_2SHIFT_N(VQSHRUNT, vqshrunt)
218
+DO_2SHIFT_N(VQRSHRNB_S, vqrshrnb_s)
219
+DO_2SHIFT_N(VQRSHRNT_S, vqrshrnt_s)
220
+DO_2SHIFT_N(VQRSHRNB_U, vqrshrnb_u)
221
+DO_2SHIFT_N(VQRSHRNT_U, vqrshrnt_u)
222
+DO_2SHIFT_N(VQRSHRUNB, vqrshrunb)
223
+DO_2SHIFT_N(VQRSHRUNT, vqrshrunt)
119
--
224
--
120
2.20.1
225
2.20.1
121
226
122
227
diff view generated by jsdifflib
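As an aside for reviewers: the lane-level behaviour of the VQRSHRN helpers
above is just "rounding right shift, then saturate to the narrow type".
A minimal standalone C sketch of one int32 -> int16 lane follows; the
names rshr_round() and sat_s16() are illustrative only, not QEMU APIs,
and the shift count is assumed to be in the 1..16 range the insn allows:

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    /* Rounding right shift: add back the last bit shifted out (sh >= 1) */
    static int64_t rshr_round(int64_t x, unsigned sh)
    {
        return (x >> sh) + ((x >> (sh - 1)) & 1);
    }

    /* Saturate to int16_t range, recording whether we clamped */
    static int16_t sat_s16(int64_t v, bool *sat)
    {
        if (v > INT16_MAX) { *sat = true; return INT16_MAX; }
        if (v < INT16_MIN) { *sat = true; return INT16_MIN; }
        return v;
    }

    int main(void)
    {
        bool sat = false;
        /* 0x123456 >> 4 is 0x12345, which exceeds INT16_MAX, so we clamp */
        int16_t r = sat_s16(rshr_round(0x123456, 4), &sat);
        printf("result 0x%x sat %d\n", (uint16_t)r, sat); /* 0x7fff, 1 */
        return 0;
    }

In the real helper the per-lane sat flag only feeds FPSCR.QC when the
predicate mask bit for that lane is set, which the sketch does not model.
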
Implement the MVE VSHLC insn, which performs a shift left of the
entire vector with carry in bits provided from a general purpose
register and carry out bits written back to that register.

Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Message-id: 20210628135835.6690-14-peter.maydell@linaro.org
---
 target/arm/helper-mve.h    |  2 ++
 target/arm/mve.decode      |  2 ++
 target/arm/mve_helper.c    | 38 ++++++++++++++++++++++++++++++++++++++
 target/arm/translate-mve.c | 30 ++++++++++++++++++++++++++++++
 4 files changed, 72 insertions(+)

diff --git a/target/arm/helper-mve.h b/target/arm/helper-mve.h
index XXXXXXX..XXXXXXX 100644
--- a/target/arm/helper-mve.h
+++ b/target/arm/helper-mve.h
@@ -XXX,XX +XXX,XX @@ DEF_HELPER_FLAGS_4(mve_vqrshrunbb, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
 DEF_HELPER_FLAGS_4(mve_vqrshrunbh, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
 DEF_HELPER_FLAGS_4(mve_vqrshruntb, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
 DEF_HELPER_FLAGS_4(mve_vqrshrunth, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_4(mve_vshlc, TCG_CALL_NO_WG, i32, env, ptr, i32, i32)
diff --git a/target/arm/mve.decode b/target/arm/mve.decode
index XXXXXXX..XXXXXXX 100644
--- a/target/arm/mve.decode
+++ b/target/arm/mve.decode
@@ -XXX,XX +XXX,XX @@ VQRSHRUNB 111 1 1110 1 . ... ... ... 0 1111 1 1 . 0 ... 0 @2_shr_b
 VQRSHRUNB 111 1 1110 1 . ... ... ... 0 1111 1 1 . 0 ... 0 @2_shr_h
 VQRSHRUNT 111 1 1110 1 . ... ... ... 1 1111 1 1 . 0 ... 0 @2_shr_b
 VQRSHRUNT 111 1 1110 1 . ... ... ... 1 1111 1 1 . 0 ... 0 @2_shr_h
+
+VSHLC 111 0 1110 1 . 1 imm:5 ... 0 1111 1100 rdm:4 qd=%qd
diff --git a/target/arm/mve_helper.c b/target/arm/mve_helper.c
index XXXXXXX..XXXXXXX 100644
--- a/target/arm/mve_helper.c
+++ b/target/arm/mve_helper.c
@@ -XXX,XX +XXX,XX @@ DO_VSHRN_SAT_UB(vqrshrnb_ub, vqrshrnt_ub, DO_RSHRN_UB)
 DO_VSHRN_SAT_UH(vqrshrnb_uh, vqrshrnt_uh, DO_RSHRN_UH)
 DO_VSHRN_SAT_SB(vqrshrunbb, vqrshruntb, DO_RSHRUN_B)
 DO_VSHRN_SAT_SH(vqrshrunbh, vqrshrunth, DO_RSHRUN_H)
+
+uint32_t HELPER(mve_vshlc)(CPUARMState *env, void *vd, uint32_t rdm,
+                           uint32_t shift)
+{
+    uint32_t *d = vd;
+    uint16_t mask = mve_element_mask(env);
+    unsigned e;
+    uint32_t r;
+
+    /*
+     * For each 32-bit element, we shift it left, bringing in the
+     * low 'shift' bits of rdm at the bottom. Bits shifted out at
+     * the top become the new rdm, if the predicate mask permits.
+     * The final rdm value is returned to update the register.
+     * shift == 0 here means "shift by 32 bits".
+     */
+    if (shift == 0) {
+        for (e = 0; e < 16 / 4; e++, mask >>= 4) {
+            r = rdm;
+            if (mask & 1) {
+                rdm = d[H4(e)];
+            }
+            mergemask(&d[H4(e)], r, mask);
+        }
+    } else {
+        uint32_t shiftmask = MAKE_64BIT_MASK(0, shift);
+
+        for (e = 0; e < 16 / 4; e++, mask >>= 4) {
+            r = (d[H4(e)] << shift) | (rdm & shiftmask);
+            if (mask & 1) {
+                rdm = d[H4(e)] >> (32 - shift);
+            }
+            mergemask(&d[H4(e)], r, mask);
+        }
+    }
+    mve_advance_vpt(env);
+    return rdm;
+}
diff --git a/target/arm/translate-mve.c b/target/arm/translate-mve.c
index XXXXXXX..XXXXXXX 100644
--- a/target/arm/translate-mve.c
+++ b/target/arm/translate-mve.c
@@ -XXX,XX +XXX,XX @@ DO_2SHIFT_N(VQRSHRNB_U, vqrshrnb_u)
 DO_2SHIFT_N(VQRSHRNT_U, vqrshrnt_u)
 DO_2SHIFT_N(VQRSHRUNB, vqrshrunb)
 DO_2SHIFT_N(VQRSHRUNT, vqrshrunt)
+
+static bool trans_VSHLC(DisasContext *s, arg_VSHLC *a)
+{
+    /*
+     * Whole Vector Left Shift with Carry. The carry is taken
+     * from a general purpose register and written back there.
+     * An imm of 0 means "shift by 32".
+     */
+    TCGv_ptr qd;
+    TCGv_i32 rdm;
+
+    if (!dc_isar_feature(aa32_mve, s) || !mve_check_qreg_bank(s, a->qd)) {
+        return false;
+    }
+    if (a->rdm == 13 || a->rdm == 15) {
+        /* CONSTRAINED UNPREDICTABLE: we UNDEF */
+        return false;
+    }
+    if (!mve_eci_check(s) || !vfp_access_check(s)) {
+        return true;
+    }
+
+    qd = mve_qreg_ptr(a->qd);
+    rdm = load_reg(s, a->rdm);
+    gen_helper_mve_vshlc(rdm, cpu_env, qd, rdm, tcg_constant_i32(a->imm));
+    store_reg(s, a->rdm, rdm);
+    tcg_temp_free_ptr(qd);
+    mve_update_eci(s);
+    return true;
+}
--
2.20.1

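For anyone wanting to sanity-check the carry chain, here is a minimal
standalone C model of one unpredicated VSHLC beat for shift counts of
1..31 (imm 0, meaning "shift by 32", is special-cased in the real
helper above, as is per-element predication). vshlc() is an
illustrative name, not a QEMU function:

    #include <stdint.h>
    #include <stdio.h>

    static uint32_t vshlc(uint32_t q[4], uint32_t rdm, unsigned shift)
    {
        for (unsigned e = 0; e < 4; e++) {
            /* Bits leaving the top of this element ... */
            uint32_t carry_out = q[e] >> (32 - shift);
            /* ... and bits entering at the bottom, from the carry register */
            q[e] = (q[e] << shift) | (rdm & ((1u << shift) - 1));
            rdm = carry_out;
        }
        return rdm; /* final carry-out, written back to the GP register */
    }

    int main(void)
    {
        uint32_t q[4] = { 0x80000001, 0, 0, 0 };
        uint32_t rdm = vshlc(q, 0x1, 1);
        /* prints q0=00000003 q1=00000001 rdm=00000000 */
        printf("q0=%08x q1=%08x rdm=%08x\n", q[0], q[1], rdm);
        return 0;
    }

The element-by-element loop makes it clear why the helper can keep the
carry in a single uint32_t: each element's carry-out is exactly the
next element's carry-in.
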
Implement the MVE VADDLV insn; this is similar to VADDV, except
that it accumulates 32-bit elements into a 64-bit accumulator
stored in a pair of general-purpose registers.

Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Message-id: 20210628135835.6690-15-peter.maydell@linaro.org
---
 target/arm/helper-mve.h    |  3 ++
 target/arm/mve.decode      |  6 +++-
 target/arm/mve_helper.c    | 19 ++++++++++++
 target/arm/translate-mve.c | 63 ++++++++++++++++++++++++++++++++++++++
 4 files changed, 90 insertions(+), 1 deletion(-)

diff --git a/target/arm/helper-mve.h b/target/arm/helper-mve.h
index XXXXXXX..XXXXXXX 100644
--- a/target/arm/helper-mve.h
+++ b/target/arm/helper-mve.h
@@ -XXX,XX +XXX,XX @@ DEF_HELPER_FLAGS_3(mve_vaddvuh, TCG_CALL_NO_WG, i32, env, ptr, i32)
 DEF_HELPER_FLAGS_3(mve_vaddvsw, TCG_CALL_NO_WG, i32, env, ptr, i32)
 DEF_HELPER_FLAGS_3(mve_vaddvuw, TCG_CALL_NO_WG, i32, env, ptr, i32)
 
+DEF_HELPER_FLAGS_3(mve_vaddlv_s, TCG_CALL_NO_WG, i64, env, ptr, i64)
+DEF_HELPER_FLAGS_3(mve_vaddlv_u, TCG_CALL_NO_WG, i64, env, ptr, i64)
+
 DEF_HELPER_FLAGS_3(mve_vmovi, TCG_CALL_NO_WG, void, env, ptr, i64)
 DEF_HELPER_FLAGS_3(mve_vandi, TCG_CALL_NO_WG, void, env, ptr, i64)
 DEF_HELPER_FLAGS_3(mve_vorri, TCG_CALL_NO_WG, void, env, ptr, i64)
diff --git a/target/arm/mve.decode b/target/arm/mve.decode
index XXXXXXX..XXXXXXX 100644
--- a/target/arm/mve.decode
+++ b/target/arm/mve.decode
@@ -XXX,XX +XXX,XX @@ VQDMULH_scalar 1110 1110 0 . .. ... 1 ... 0 1110 . 110 .... @2scalar
 VQRDMULH_scalar 1111 1110 0 . .. ... 1 ... 0 1110 . 110 .... @2scalar
 
 # Vector add across vector
-VADDV 111 u:1 1110 1111 size:2 01 ... 0 1111 0 0 a:1 0 qm:3 0 rda=%rdalo
+{
+  VADDV 111 u:1 1110 1111 size:2 01 ... 0 1111 0 0 a:1 0 qm:3 0 rda=%rdalo
+  VADDLV 111 u:1 1110 1 ... 1001 ... 0 1111 00 a:1 0 qm:3 0 \
+         rdahi=%rdahi rdalo=%rdalo
+}
 
 # Predicate operations
 %mask_22_13 22:1 13:3
diff --git a/target/arm/mve_helper.c b/target/arm/mve_helper.c
index XXXXXXX..XXXXXXX 100644
--- a/target/arm/mve_helper.c
+++ b/target/arm/mve_helper.c
@@ -XXX,XX +XXX,XX @@ DO_VADDV(vaddvub, 1, uint8_t)
 DO_VADDV(vaddvuh, 2, uint16_t)
 DO_VADDV(vaddvuw, 4, uint32_t)
 
+#define DO_VADDLV(OP, TYPE, LTYPE)                              \
+    uint64_t HELPER(glue(mve_, OP))(CPUARMState *env, void *vm, \
+                                    uint64_t ra)                \
+    {                                                           \
+        uint16_t mask = mve_element_mask(env);                  \
+        unsigned e;                                             \
+        TYPE *m = vm;                                           \
+        for (e = 0; e < 16 / 4; e++, mask >>= 4) {              \
+            if (mask & 1) {                                     \
+                ra += (LTYPE)m[H4(e)];                          \
+            }                                                   \
+        }                                                       \
+        mve_advance_vpt(env);                                   \
+        return ra;                                              \
+    }                                                           \
+
+DO_VADDLV(vaddlv_s, int32_t, int64_t)
+DO_VADDLV(vaddlv_u, uint32_t, uint64_t)
+
 /* Shifts by immediate */
 #define DO_2SHIFT(OP, ESIZE, TYPE, FN) \
     void HELPER(glue(mve_, OP))(CPUARMState *env, void *vd, \
diff --git a/target/arm/translate-mve.c b/target/arm/translate-mve.c
index XXXXXXX..XXXXXXX 100644
--- a/target/arm/translate-mve.c
+++ b/target/arm/translate-mve.c
@@ -XXX,XX +XXX,XX @@ static bool trans_VADDV(DisasContext *s, arg_VADDV *a)
     return true;
 }
 
+static bool trans_VADDLV(DisasContext *s, arg_VADDLV *a)
+{
+    /*
+     * Vector Add Long Across Vector: accumulate the 32-bit
+     * elements of the vector into a 64-bit result stored in
+     * a pair of general-purpose registers.
+     * No need to check Qm's bank: it is only 3 bits in decode.
+     */
+    TCGv_ptr qm;
+    TCGv_i64 rda;
+    TCGv_i32 rdalo, rdahi;
+
+    if (!dc_isar_feature(aa32_mve, s)) {
+        return false;
+    }
+    /*
+     * rdahi == 13 is UNPREDICTABLE; rdahi == 15 is a related
+     * encoding; rdalo always has bit 0 clear so cannot be 13 or 15.
+     */
+    if (a->rdahi == 13 || a->rdahi == 15) {
+        return false;
+    }
+    if (!mve_eci_check(s) || !vfp_access_check(s)) {
+        return true;
+    }
+
+    /*
+     * This insn is subject to beat-wise execution. Partial execution
+     * of an A=0 (no-accumulate) insn which does not execute the first
+     * beat must start with the current value of RdaHi:RdaLo, not zero.
+     */
+    if (a->a || mve_skip_first_beat(s)) {
+        /* Accumulate input from RdaHi:RdaLo */
+        rda = tcg_temp_new_i64();
+        rdalo = load_reg(s, a->rdalo);
+        rdahi = load_reg(s, a->rdahi);
+        tcg_gen_concat_i32_i64(rda, rdalo, rdahi);
+        tcg_temp_free_i32(rdalo);
+        tcg_temp_free_i32(rdahi);
+    } else {
+        /* Accumulate starting at zero */
+        rda = tcg_const_i64(0);
+    }
+
+    qm = mve_qreg_ptr(a->qm);
+    if (a->u) {
+        gen_helper_mve_vaddlv_u(rda, cpu_env, qm, rda);
+    } else {
+        gen_helper_mve_vaddlv_s(rda, cpu_env, qm, rda);
+    }
+    tcg_temp_free_ptr(qm);
+
+    rdalo = tcg_temp_new_i32();
+    rdahi = tcg_temp_new_i32();
+    tcg_gen_extrl_i64_i32(rdalo, rda);
+    tcg_gen_extrh_i64_i32(rdahi, rda);
+    store_reg(s, a->rdalo, rdalo);
+    store_reg(s, a->rdahi, rdahi);
+    tcg_temp_free_i64(rda);
+    mve_update_eci(s);
+    return true;
+}
+
 static bool do_1imm(DisasContext *s, arg_1imm *a, MVEGenOneOpImmFn *fn)
 {
     TCGv_ptr qd;
--
2.20.1

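The accumulator plumbing is easier to see outside TCG. Here is a
standalone sketch of the signed, accumulating (A=1) form with no
predication: each element is widened before the add, and the 64-bit
result is split back into the RdaHi:RdaLo pair. This is an
illustration of the documented semantics, not QEMU code:

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        int32_t q[4] = { INT32_MAX, INT32_MAX, 1, 0 };
        uint32_t rdalo = 0, rdahi = 0;

        /* A=1 starts from the current RdaHi:RdaLo value */
        int64_t acc = (int64_t)(((uint64_t)rdahi << 32) | rdalo);
        for (int e = 0; e < 4; e++) {
            acc += (int64_t)q[e]; /* widen each element before adding */
        }
        rdalo = (uint32_t)acc;
        rdahi = (uint32_t)((uint64_t)acc >> 32);
        /* prints 00000000:ffffffff, i.e. 2*INT32_MAX + 1 = 2^32 - 1 */
        printf("rdahi:rdalo = %08x:%08x\n", rdahi, rdalo);
        return 0;
    }

Note how the sum of two INT32_MAX values plus one is representable in
the long accumulator even though it overflows 32 bits; that is the
whole point of the "long" variant.
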
The MVE extension to v8.1M includes some new shift instructions which
sit entirely within the non-coprocessor part of the encoding space
and which operate only on general-purpose registers. They take up
the space which was previously UNPREDICTABLE MOVS and ORRS encodings
with Rm == 13 or 15.

Implement the long shifts by immediate, which perform shifts on a
pair of general-purpose registers treated as a 64-bit quantity, with
an immediate shift count between 1 and 32.

Awkwardly, because the MOVS and ORRS trans functions do not UNDEF for
the Rm==13,15 case, we need to explicitly emit code to UNDEF for the
cases where v8.1M now requires that. (Trying to change MOVS and ORRS
is too difficult, because the functions that generate the code are
shared between a dozen different kinds of arithmetic or logical
instruction for all A32, T16 and T32 encodings, and for some insns
and some encodings Rm==13,15 are valid.)

We make the helper functions we need for UQSHLL and SQSHLL take
a 32-bit value which the helper casts to int8_t because we'll need
these helpers also for the shift-by-register insns, where the shift
count might be < 0 or > 32.

Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Message-id: 20210628135835.6690-16-peter.maydell@linaro.org
---
 target/arm/helper-mve.h |  3 ++
 target/arm/translate.h  |  1 +
 target/arm/t32.decode   | 28 +++++++++++++
 target/arm/mve_helper.c | 10 +++++
 target/arm/translate.c  | 90 +++++++++++++++++++++++++++++++++++++++++
 5 files changed, 132 insertions(+)

diff --git a/target/arm/helper-mve.h b/target/arm/helper-mve.h
index XXXXXXX..XXXXXXX 100644
--- a/target/arm/helper-mve.h
+++ b/target/arm/helper-mve.h
@@ -XXX,XX +XXX,XX @@ DEF_HELPER_FLAGS_4(mve_vqrshruntb, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
 DEF_HELPER_FLAGS_4(mve_vqrshrunth, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
 
 DEF_HELPER_FLAGS_4(mve_vshlc, TCG_CALL_NO_WG, i32, env, ptr, i32, i32)
+
+DEF_HELPER_FLAGS_3(mve_sqshll, TCG_CALL_NO_RWG, i64, env, i64, i32)
+DEF_HELPER_FLAGS_3(mve_uqshll, TCG_CALL_NO_RWG, i64, env, i64, i32)
diff --git a/target/arm/translate.h b/target/arm/translate.h
index XXXXXXX..XXXXXXX 100644
--- a/target/arm/translate.h
+++ b/target/arm/translate.h
@@ -XXX,XX +XXX,XX @@ typedef void CryptoTwoOpFn(TCGv_ptr, TCGv_ptr);
 typedef void CryptoThreeOpIntFn(TCGv_ptr, TCGv_ptr, TCGv_i32);
 typedef void CryptoThreeOpFn(TCGv_ptr, TCGv_ptr, TCGv_ptr);
 typedef void AtomicThreeOpFn(TCGv_i64, TCGv_i64, TCGv_i64, TCGArg, MemOp);
+typedef void WideShiftImmFn(TCGv_i64, TCGv_i64, int64_t shift);
 
 /**
  * arm_tbflags_from_tb:
diff --git a/target/arm/t32.decode b/target/arm/t32.decode
index XXXXXXX..XXXXXXX 100644
--- a/target/arm/t32.decode
+++ b/target/arm/t32.decode
@@ -XXX,XX +XXX,XX @@
 &mcr !extern cp opc1 crn crm opc2 rt
 &mcrr !extern cp opc1 crm rt rt2
 
+&mve_shl_ri rdalo rdahi shim
+
+# rdahi: bits [3:1] from insn, bit 0 is 1
+# rdalo: bits [3:1] from insn, bit 0 is 0
+%rdahi_9 9:3 !function=times_2_plus_1
+%rdalo_17 17:3 !function=times_2
+
 # Data-processing (register)
 
 %imm5_12_6 12:3 6:2
@@ -XXX,XX +XXX,XX @@
 @S_xrr_shi ....... .... . rn:4 .... .... .. shty:2 rm:4 \
            &s_rrr_shi shim=%imm5_12_6 s=1 rd=0
 
+@mve_shl_ri ....... .... . ... . . ... ... . .. .. .... \
+            &mve_shl_ri shim=%imm5_12_6 rdalo=%rdalo_17 rdahi=%rdahi_9
+
 {
   TST_xrri 1110101 0000 1 .... 0 ... 1111 .... .... @S_xrr_shi
   AND_rrri 1110101 0000 . .... 0 ... .... .... .... @s_rrr_shi
 }
 BIC_rrri 1110101 0001 . .... 0 ... .... .... .... @s_rrr_shi
 {
+  # The v8.1M MVE shift insns overlap in encoding with MOVS/ORRS
+  # and are distinguished by having Rm==13 or 15. Those are UNPREDICTABLE
+  # cases for MOVS/ORRS. We decode the MVE cases first, ensuring that
+  # they explicitly call unallocated_encoding() for cases that must UNDEF
+  # (eg "using a new shift insn on a v8.1M CPU without MVE"), and letting
+  # the rest fall through (where ORR_rrri and MOV_rxri will end up
+  # handling them as r13 and r15 accesses with the same semantics as A32).
+  [
+    LSLL_ri 1110101 0010 1 ... 0 0 ... ... 1 .. 00 1111 @mve_shl_ri
+    LSRL_ri 1110101 0010 1 ... 0 0 ... ... 1 .. 01 1111 @mve_shl_ri
+    ASRL_ri 1110101 0010 1 ... 0 0 ... ... 1 .. 10 1111 @mve_shl_ri
+
+    UQSHLL_ri 1110101 0010 1 ... 1 0 ... ... 1 .. 00 1111 @mve_shl_ri
+    URSHRL_ri 1110101 0010 1 ... 1 0 ... ... 1 .. 01 1111 @mve_shl_ri
+    SRSHRL_ri 1110101 0010 1 ... 1 0 ... ... 1 .. 10 1111 @mve_shl_ri
+    SQSHLL_ri 1110101 0010 1 ... 1 0 ... ... 1 .. 11 1111 @mve_shl_ri
+  ]
+
   MOV_rxri 1110101 0010 . 1111 0 ... .... .... .... @s_rxr_shi
   ORR_rrri 1110101 0010 . .... 0 ... .... .... .... @s_rrr_shi
 }
diff --git a/target/arm/mve_helper.c b/target/arm/mve_helper.c
index XXXXXXX..XXXXXXX 100644
--- a/target/arm/mve_helper.c
+++ b/target/arm/mve_helper.c
@@ -XXX,XX +XXX,XX @@ uint32_t HELPER(mve_vshlc)(CPUARMState *env, void *vd, uint32_t rdm,
     mve_advance_vpt(env);
     return rdm;
 }
+
+uint64_t HELPER(mve_sqshll)(CPUARMState *env, uint64_t n, uint32_t shift)
+{
+    return do_sqrshl_d(n, (int8_t)shift, false, &env->QF);
+}
+
+uint64_t HELPER(mve_uqshll)(CPUARMState *env, uint64_t n, uint32_t shift)
+{
+    return do_uqrshl_d(n, (int8_t)shift, false, &env->QF);
+}
diff --git a/target/arm/translate.c b/target/arm/translate.c
index XXXXXXX..XXXXXXX 100644
--- a/target/arm/translate.c
+++ b/target/arm/translate.c
@@ -XXX,XX +XXX,XX @@ static bool trans_MOVT(DisasContext *s, arg_MOVW *a)
     return true;
 }
 
+/*
+ * v8.1M MVE wide-shifts
+ */
+static bool do_mve_shl_ri(DisasContext *s, arg_mve_shl_ri *a,
+                          WideShiftImmFn *fn)
+{
+    TCGv_i64 rda;
+    TCGv_i32 rdalo, rdahi;
+
+    if (!arm_dc_feature(s, ARM_FEATURE_V8_1M)) {
+        /* Decode falls through to ORR/MOV UNPREDICTABLE handling */
+        return false;
+    }
+    if (a->rdahi == 15) {
+        /* These are a different encoding (SQSHL/SRSHR/UQSHL/URSHR) */
+        return false;
+    }
+    if (!dc_isar_feature(aa32_mve, s) ||
+        !arm_dc_feature(s, ARM_FEATURE_M_MAIN) ||
+        a->rdahi == 13) {
+        /* RdaHi == 13 is UNPREDICTABLE; we choose to UNDEF */
+        unallocated_encoding(s);
+        return true;
+    }
+
+    if (a->shim == 0) {
+        a->shim = 32;
+    }
+
+    rda = tcg_temp_new_i64();
+    rdalo = load_reg(s, a->rdalo);
+    rdahi = load_reg(s, a->rdahi);
+    tcg_gen_concat_i32_i64(rda, rdalo, rdahi);
+
+    fn(rda, rda, a->shim);
+
+    tcg_gen_extrl_i64_i32(rdalo, rda);
+    tcg_gen_extrh_i64_i32(rdahi, rda);
+    store_reg(s, a->rdalo, rdalo);
+    store_reg(s, a->rdahi, rdahi);
+    tcg_temp_free_i64(rda);
+
+    return true;
+}
+
+static bool trans_ASRL_ri(DisasContext *s, arg_mve_shl_ri *a)
+{
+    return do_mve_shl_ri(s, a, tcg_gen_sari_i64);
+}
+
+static bool trans_LSLL_ri(DisasContext *s, arg_mve_shl_ri *a)
+{
+    return do_mve_shl_ri(s, a, tcg_gen_shli_i64);
+}
+
+static bool trans_LSRL_ri(DisasContext *s, arg_mve_shl_ri *a)
+{
+    return do_mve_shl_ri(s, a, tcg_gen_shri_i64);
+}
+
+static void gen_mve_sqshll(TCGv_i64 r, TCGv_i64 n, int64_t shift)
+{
+    gen_helper_mve_sqshll(r, cpu_env, n, tcg_constant_i32(shift));
+}
+
+static bool trans_SQSHLL_ri(DisasContext *s, arg_mve_shl_ri *a)
+{
+    return do_mve_shl_ri(s, a, gen_mve_sqshll);
+}
+
+static void gen_mve_uqshll(TCGv_i64 r, TCGv_i64 n, int64_t shift)
+{
+    gen_helper_mve_uqshll(r, cpu_env, n, tcg_constant_i32(shift));
+}
+
+static bool trans_UQSHLL_ri(DisasContext *s, arg_mve_shl_ri *a)
+{
+    return do_mve_shl_ri(s, a, gen_mve_uqshll);
+}
+
+static bool trans_SRSHRL_ri(DisasContext *s, arg_mve_shl_ri *a)
+{
+    return do_mve_shl_ri(s, a, gen_srshr64_i64);
+}
+
+static bool trans_URSHRL_ri(DisasContext *s, arg_mve_shl_ri *a)
+{
+    return do_mve_shl_ri(s, a, gen_urshr64_i64);
+}
+
 /*
  * Multiply and multiply accumulate
  */
--
2.20.1

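The saturating left shift that the UQSHLL helper delegates to
do_uqrshl_d() can be modelled compactly: shift, then check whether
shifting back recovers the input. A standalone sketch for shift
counts of 1..32 follows; uqshll() is an illustrative name, and the
real helper additionally sets FPSCR.QC (via env->QF) on saturation:

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    static uint64_t uqshll(uint64_t n, unsigned shift, bool *sat)
    {
        uint64_t r = n << shift;
        if (shift && (r >> shift) != n) {
            /* Bits were shifted out of the top: saturate to all-ones */
            *sat = true;
            return UINT64_MAX;
        }
        return r;
    }

    int main(void)
    {
        bool sat = false;
        uint64_t r = uqshll(0x8000000000000000ull, 1, &sat);
        /* prints r=ffffffffffffffff sat=1 */
        printf("r=%016llx sat=%d\n", (unsigned long long)r, sat);
        return 0;
    }

The shift-back comparison is one common way to detect lost bits
without a wider intermediate type; the signed SQSHLL case is the same
idea but clamps to INT64_MIN/INT64_MAX depending on the sign of n.
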
Implement the MVE long shifts by register, which perform shifts on a
pair of general-purpose registers treated as a 64-bit quantity, with
the shift count in another general-purpose register, which might be
either positive or negative.

Like the long-shifts-by-immediate, these encodings sit in the space
that was previously the UNPREDICTABLE MOVS/ORRS with Rm==13,15.
Because LSLL_rr and ASRL_rr overlap with both MOV_rxri/ORR_rrri and
also with CSEL (as one of the previously-UNPREDICTABLE Rm==13 cases),
we have to move the CSEL pattern into the same decodetree group.

Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Message-id: 20210628135835.6690-17-peter.maydell@linaro.org
---
 target/arm/helper-mve.h |  6 +++
 target/arm/translate.h  |  1 +
 target/arm/t32.decode   | 16 +++++--
 target/arm/mve_helper.c | 93 +++++++++++++++++++++++++++++++++++++++++
 target/arm/translate.c  | 69 ++++++++++++++++++++++++++++++
 5 files changed, 182 insertions(+), 3 deletions(-)

diff --git a/target/arm/helper-mve.h b/target/arm/helper-mve.h
index XXXXXXX..XXXXXXX 100644
--- a/target/arm/helper-mve.h
+++ b/target/arm/helper-mve.h
@@ -XXX,XX +XXX,XX @@ DEF_HELPER_FLAGS_4(mve_vqrshrunth, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
 
 DEF_HELPER_FLAGS_4(mve_vshlc, TCG_CALL_NO_WG, i32, env, ptr, i32, i32)
 
+DEF_HELPER_FLAGS_3(mve_sshrl, TCG_CALL_NO_RWG, i64, env, i64, i32)
+DEF_HELPER_FLAGS_3(mve_ushll, TCG_CALL_NO_RWG, i64, env, i64, i32)
 DEF_HELPER_FLAGS_3(mve_sqshll, TCG_CALL_NO_RWG, i64, env, i64, i32)
 DEF_HELPER_FLAGS_3(mve_uqshll, TCG_CALL_NO_RWG, i64, env, i64, i32)
+DEF_HELPER_FLAGS_3(mve_sqrshrl, TCG_CALL_NO_RWG, i64, env, i64, i32)
+DEF_HELPER_FLAGS_3(mve_uqrshll, TCG_CALL_NO_RWG, i64, env, i64, i32)
+DEF_HELPER_FLAGS_3(mve_sqrshrl48, TCG_CALL_NO_RWG, i64, env, i64, i32)
+DEF_HELPER_FLAGS_3(mve_uqrshll48, TCG_CALL_NO_RWG, i64, env, i64, i32)
diff --git a/target/arm/translate.h b/target/arm/translate.h
index XXXXXXX..XXXXXXX 100644
--- a/target/arm/translate.h
+++ b/target/arm/translate.h
@@ -XXX,XX +XXX,XX @@ typedef void CryptoThreeOpIntFn(TCGv_ptr, TCGv_ptr, TCGv_i32);
 typedef void CryptoThreeOpFn(TCGv_ptr, TCGv_ptr, TCGv_ptr);
 typedef void AtomicThreeOpFn(TCGv_i64, TCGv_i64, TCGv_i64, TCGArg, MemOp);
 typedef void WideShiftImmFn(TCGv_i64, TCGv_i64, int64_t shift);
+typedef void WideShiftFn(TCGv_i64, TCGv_ptr, TCGv_i64, TCGv_i32);
 
 /**
  * arm_tbflags_from_tb:
diff --git a/target/arm/t32.decode b/target/arm/t32.decode
index XXXXXXX..XXXXXXX 100644
--- a/target/arm/t32.decode
+++ b/target/arm/t32.decode
@@ -XXX,XX +XXX,XX @@
 &mcrr !extern cp opc1 crm rt rt2
 
 &mve_shl_ri rdalo rdahi shim
+&mve_shl_rr rdalo rdahi rm
 
 # rdahi: bits [3:1] from insn, bit 0 is 1
 # rdalo: bits [3:1] from insn, bit 0 is 0
@@ -XXX,XX +XXX,XX @@
 
 @mve_shl_ri ....... .... . ... . . ... ... . .. .. .... \
             &mve_shl_ri shim=%imm5_12_6 rdalo=%rdalo_17 rdahi=%rdahi_9
+@mve_shl_rr ....... .... . ... . rm:4 ... . .. .. .... \
+            &mve_shl_rr rdalo=%rdalo_17 rdahi=%rdahi_9
 
 {
   TST_xrri 1110101 0000 1 .... 0 ... 1111 .... .... @S_xrr_shi
@@ -XXX,XX +XXX,XX @@ BIC_rrri 1110101 0001 . .... 0 ... .... .... .... @s_rrr_shi
     URSHRL_ri 1110101 0010 1 ... 1 0 ... ... 1 .. 01 1111 @mve_shl_ri
     SRSHRL_ri 1110101 0010 1 ... 1 0 ... ... 1 .. 10 1111 @mve_shl_ri
     SQSHLL_ri 1110101 0010 1 ... 1 0 ... ... 1 .. 11 1111 @mve_shl_ri
+
+    LSLL_rr 1110101 0010 1 ... 0 .... ... 1 0000 1101 @mve_shl_rr
+    ASRL_rr 1110101 0010 1 ... 0 .... ... 1 0010 1101 @mve_shl_rr
+    UQRSHLL64_rr 1110101 0010 1 ... 1 .... ... 1 0000 1101 @mve_shl_rr
+    SQRSHRL64_rr 1110101 0010 1 ... 1 .... ... 1 0010 1101 @mve_shl_rr
+    UQRSHLL48_rr 1110101 0010 1 ... 1 .... ... 1 1000 1101 @mve_shl_rr
+    SQRSHRL48_rr 1110101 0010 1 ... 1 .... ... 1 1010 1101 @mve_shl_rr
   ]
 
   MOV_rxri 1110101 0010 . 1111 0 ... .... .... .... @s_rxr_shi
   ORR_rrri 1110101 0010 . .... 0 ... .... .... .... @s_rrr_shi
+
+  # v8.1M CSEL and friends
+  CSEL 1110101 0010 1 rn:4 10 op:2 rd:4 fcond:4 rm:4
 }
 {
   MVN_rxri 1110101 0011 . 1111 0 ... .... .... .... @s_rxr_shi
@@ -XXX,XX +XXX,XX @@ SBC_rrri 1110101 1011 . .... 0 ... .... .... .... @s_rrr_shi
 }
 RSB_rrri 1110101 1110 . .... 0 ... .... .... .... @s_rrr_shi
 
-# v8.1M CSEL and friends
-CSEL 1110101 0010 1 rn:4 10 op:2 rd:4 fcond:4 rm:4
-
 # Data-processing (register-shifted register)
 
 MOV_rxrr 1111 1010 0 shty:2 s:1 rm:4 1111 rd:4 0000 rs:4 \
diff --git a/target/arm/mve_helper.c b/target/arm/mve_helper.c
index XXXXXXX..XXXXXXX 100644
--- a/target/arm/mve_helper.c
+++ b/target/arm/mve_helper.c
@@ -XXX,XX +XXX,XX @@ uint32_t HELPER(mve_vshlc)(CPUARMState *env, void *vd, uint32_t rdm,
     return rdm;
 }
 
+uint64_t HELPER(mve_sshrl)(CPUARMState *env, uint64_t n, uint32_t shift)
+{
+    return do_sqrshl_d(n, -(int8_t)shift, false, NULL);
+}
+
+uint64_t HELPER(mve_ushll)(CPUARMState *env, uint64_t n, uint32_t shift)
+{
+    return do_uqrshl_d(n, (int8_t)shift, false, NULL);
+}
+
 uint64_t HELPER(mve_sqshll)(CPUARMState *env, uint64_t n, uint32_t shift)
 {
     return do_sqrshl_d(n, (int8_t)shift, false, &env->QF);
@@ -XXX,XX +XXX,XX @@ uint64_t HELPER(mve_uqshll)(CPUARMState *env, uint64_t n, uint32_t shift)
 {
     return do_uqrshl_d(n, (int8_t)shift, false, &env->QF);
 }
+
+uint64_t HELPER(mve_sqrshrl)(CPUARMState *env, uint64_t n, uint32_t shift)
+{
+    return do_sqrshl_d(n, -(int8_t)shift, true, &env->QF);
+}
+
+uint64_t HELPER(mve_uqrshll)(CPUARMState *env, uint64_t n, uint32_t shift)
+{
+    return do_uqrshl_d(n, (int8_t)shift, true, &env->QF);
+}
+
+/* Operate on 64-bit values, but saturate at 48 bits */
+static inline int64_t do_sqrshl48_d(int64_t src, int64_t shift,
+                                    bool round, uint32_t *sat)
+{
+    if (shift <= -48) {
+        /* Rounding the sign bit always produces 0. */
+        if (round) {
+            return 0;
+        }
+        return src >> 63;
+    } else if (shift < 0) {
+        if (round) {
+            src >>= -shift - 1;
+            return (src >> 1) + (src & 1);
+        }
+        return src >> -shift;
+    } else if (shift < 48) {
+        int64_t val = src << shift;
+        int64_t extval = sextract64(val, 0, 48);
+        if (!sat || val == extval) {
+            return extval;
+        }
+    } else if (!sat || src == 0) {
+        return 0;
+    }
+
+    *sat = 1;
+    return (1ULL << 47) - (src >= 0);
165
+ *sat = 1;
166
+ return (1ULL << 47) - (src >= 0);
167
+}
168
+
169
+/* Operate on 64-bit values, but saturate at 48 bits */
170
+static inline uint64_t do_uqrshl48_d(uint64_t src, int64_t shift,
171
+ bool round, uint32_t *sat)
172
+{
173
+ uint64_t val, extval;
174
+
175
+ if (shift <= -(48 + round)) {
176
+ return 0;
177
+ } else if (shift < 0) {
178
+ if (round) {
179
+ val = src >> (-shift - 1);
180
+ val = (val >> 1) + (val & 1);
181
+ } else {
182
+ val = src >> -shift;
183
+ }
184
+ extval = extract64(val, 0, 48);
185
+ if (!sat || val == extval) {
186
+ return extval;
187
+ }
188
+ } else if (shift < 48) {
189
+ uint64_t val = src << shift;
190
+ uint64_t extval = extract64(val, 0, 48);
191
+ if (!sat || val == extval) {
192
+ return extval;
193
+ }
194
+ } else if (!sat || src == 0) {
195
+ return 0;
196
+ }
197
+
198
+ *sat = 1;
199
+ return MAKE_64BIT_MASK(0, 48);
200
+}
201
+
202
+uint64_t HELPER(mve_sqrshrl48)(CPUARMState *env, uint64_t n, uint32_t shift)
203
+{
204
+ return do_sqrshl48_d(n, -(int8_t)shift, true, &env->QF);
205
+}
206
+
207
+uint64_t HELPER(mve_uqrshll48)(CPUARMState *env, uint64_t n, uint32_t shift)
208
+{
209
+ return do_uqrshl48_d(n, (int8_t)shift, true, &env->QF);
210
+}
211
diff --git a/target/arm/translate.c b/target/arm/translate.c
212
index XXXXXXX..XXXXXXX 100644
213
--- a/target/arm/translate.c
214
+++ b/target/arm/translate.c
215
@@ -XXX,XX +XXX,XX @@ static bool trans_URSHRL_ri(DisasContext *s, arg_mve_shl_ri *a)
216
return do_mve_shl_ri(s, a, gen_urshr64_i64);
217
}
218
219
+static bool do_mve_shl_rr(DisasContext *s, arg_mve_shl_rr *a, WideShiftFn *fn)
220
+{
221
+ TCGv_i64 rda;
222
+ TCGv_i32 rdalo, rdahi;
223
+
224
+ if (!arm_dc_feature(s, ARM_FEATURE_V8_1M)) {
225
+ /* Decode falls through to ORR/MOV UNPREDICTABLE handling */
86
+ return false;
226
+ return false;
87
+ }
227
+ }
88
+
228
+ if (a->rdahi == 15) {
89
+ /* Vm/M bits must be zero for the Z variant */
229
+ /* These are a different encoding (SQSHL/SRSHR/UQSHL/URSHR) */
90
+ if (a->z && a->vm != 0) {
91
+ return false;
230
+ return false;
92
+ }
231
+ }
93
+
232
+ if (!dc_isar_feature(aa32_mve, s) ||
94
+ if (!vfp_access_check(s)) {
233
+ !arm_dc_feature(s, ARM_FEATURE_M_MAIN) ||
234
+ a->rdahi == 13 || a->rm == 13 || a->rm == 15 ||
235
+ a->rm == a->rdahi || a->rm == a->rdalo) {
236
+ /* These rdahi/rdalo/rm cases are UNPREDICTABLE; we choose to UNDEF */
237
+ unallocated_encoding(s);
95
+ return true;
238
+ return true;
96
+ }
239
+ }
97
+
240
+
98
+ vd = tcg_temp_new_i32();
241
+ rda = tcg_temp_new_i64();
99
+ vm = tcg_temp_new_i32();
242
+ rdalo = load_reg(s, a->rdalo);
100
+
243
+ rdahi = load_reg(s, a->rdahi);
101
+ neon_load_reg32(vd, a->vd);
244
+ tcg_gen_concat_i32_i64(rda, rdalo, rdahi);
102
+ if (a->z) {
245
+
103
+ tcg_gen_movi_i32(vm, 0);
246
+ /* The helper takes care of the sign-extension of the low 8 bits of Rm */
104
+ } else {
247
+ fn(rda, cpu_env, rda, cpu_R[a->rm]);
105
+ neon_load_reg32(vm, a->vm);
248
+
106
+ }
249
+ tcg_gen_extrl_i64_i32(rdalo, rda);
107
+
250
+ tcg_gen_extrh_i64_i32(rdahi, rda);
108
+ if (a->e) {
251
+ store_reg(s, a->rdalo, rdalo);
109
+ gen_helper_vfp_cmpeh(vd, vm, cpu_env);
252
+ store_reg(s, a->rdahi, rdahi);
110
+ } else {
253
+ tcg_temp_free_i64(rda);
111
+ gen_helper_vfp_cmph(vd, vm, cpu_env);
112
+ }
113
+
114
+ tcg_temp_free_i32(vd);
115
+ tcg_temp_free_i32(vm);
116
+
254
+
117
+ return true;
255
+ return true;
118
+}
256
+}
119
+
257
+
120
static bool trans_VCMP_sp(DisasContext *s, arg_VCMP_sp *a)
258
+static bool trans_LSLL_rr(DisasContext *s, arg_mve_shl_rr *a)
121
{
259
+{
122
TCGv_i32 vd, vm;
260
+ return do_mve_shl_rr(s, a, gen_helper_mve_ushll);
261
+}
262
+
263
+static bool trans_ASRL_rr(DisasContext *s, arg_mve_shl_rr *a)
264
+{
265
+ return do_mve_shl_rr(s, a, gen_helper_mve_sshrl);
266
+}
267
+
268
+static bool trans_UQRSHLL64_rr(DisasContext *s, arg_mve_shl_rr *a)
269
+{
270
+ return do_mve_shl_rr(s, a, gen_helper_mve_uqrshll);
271
+}
272
+
273
+static bool trans_SQRSHRL64_rr(DisasContext *s, arg_mve_shl_rr *a)
274
+{
275
+ return do_mve_shl_rr(s, a, gen_helper_mve_sqrshrl);
276
+}
277
+
278
+static bool trans_UQRSHLL48_rr(DisasContext *s, arg_mve_shl_rr *a)
279
+{
280
+ return do_mve_shl_rr(s, a, gen_helper_mve_uqrshll48);
281
+}
282
+
283
+static bool trans_SQRSHRL48_rr(DisasContext *s, arg_mve_shl_rr *a)
284
+{
285
+ return do_mve_shl_rr(s, a, gen_helper_mve_sqrshrl48);
286
+}
287
+
288
/*
289
* Multiply and multiply accumulate
290
*/
123
--
291
--
124
2.20.1
292
2.20.1
125
293
126
294
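For readers without the QEMU tree handy, the unsigned 48-bit helper above boils
down to the following standalone C. This is a sketch for exposition only, not
QEMU code: the name uqrshl48() is invented, extract64()/MAKE_64BIT_MASK() are
expanded inline, and the NULL-sat shortcut of the original is dropped.

#include <stdint.h>
#include <stdbool.h>

/* Positive shift is a left shift; negative is an (optionally rounding)
 * right shift.  Results saturate to 48 bits; *sat is set on saturation.
 */
static uint64_t uqrshl48(uint64_t src, int64_t shift, bool round, uint32_t *sat)
{
    uint64_t val, extval;

    if (shift <= -(48 + round)) {
        return 0;
    } else if (shift < 0) {
        if (round) {
            val = src >> (-shift - 1);
            val = (val >> 1) + (val & 1);   /* round to nearest, ties up */
        } else {
            val = src >> -shift;
        }
        extval = val & ((1ULL << 48) - 1);  /* low 48 bits */
        if (val == extval) {
            return extval;
        }
    } else if (shift < 48) {
        val = src << shift;
        extval = val & ((1ULL << 48) - 1);
        if (val == extval) {
            return extval;
        }
    } else if (src == 0) {
        return 0;
    }

    *sat = 1;                               /* sticky saturation flag */
    return (1ULL << 48) - 1;                /* unsigned maximum, 2^48 - 1 */
}

For example, uqrshl48(6, -1, true, &qf) is 3, while uqrshl48(1, 48, false, &qf)
saturates to 0xffffffffffff and sets qf.
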
Implement VFP fp16 support for fused multiply-add insns
VFNMA, VFNMS, VFMA, VFMS.

Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Message-id: 20200828183354.27913-7-peter.maydell@linaro.org
---
target/arm/helper.h | 1 +
target/arm/vfp.decode | 5 +++
target/arm/vfp_helper.c | 7 ++++
target/arm/translate-vfp.c.inc | 64 ++++++++++++++++++++++++++++++++++
4 files changed, 77 insertions(+)

diff --git a/target/arm/helper.h b/target/arm/helper.h
index XXXXXXX..XXXXXXX 100644
--- a/target/arm/helper.h
+++ b/target/arm/helper.h
@@ -XXX,XX +XXX,XX @@ DEF_HELPER_FLAGS_3(vfp_fcvt_f64_to_f16, TCG_CALL_NO_RWG, f16, f64, ptr, i32)

DEF_HELPER_4(vfp_muladdd, f64, f64, f64, f64, ptr)
DEF_HELPER_4(vfp_muladds, f32, f32, f32, f32, ptr)
+DEF_HELPER_4(vfp_muladdh, f16, f16, f16, f16, ptr)

DEF_HELPER_3(recps_f32, f32, env, f32, f32)
DEF_HELPER_3(rsqrts_f32, f32, env, f32, f32)
diff --git a/target/arm/vfp.decode b/target/arm/vfp.decode
index XXXXXXX..XXXXXXX 100644
--- a/target/arm/vfp.decode
+++ b/target/arm/vfp.decode
@@ -XXX,XX +XXX,XX @@ VDIV_hp ---- 1110 1.00 .... .... 1001 .0.0 .... @vfp_dnm_s
VDIV_sp ---- 1110 1.00 .... .... 1010 .0.0 .... @vfp_dnm_s
VDIV_dp ---- 1110 1.00 .... .... 1011 .0.0 .... @vfp_dnm_d

+VFMA_hp ---- 1110 1.10 .... .... 1001 .0. 0 .... @vfp_dnm_s
+VFMS_hp ---- 1110 1.10 .... .... 1001 .1. 0 .... @vfp_dnm_s
+VFNMA_hp ---- 1110 1.01 .... .... 1001 .0. 0 .... @vfp_dnm_s
+VFNMS_hp ---- 1110 1.01 .... .... 1001 .1. 0 .... @vfp_dnm_s
+
VFMA_sp ---- 1110 1.10 .... .... 1010 .0. 0 .... @vfp_dnm_s
VFMS_sp ---- 1110 1.10 .... .... 1010 .1. 0 .... @vfp_dnm_s
VFNMA_sp ---- 1110 1.01 .... .... 1010 .0. 0 .... @vfp_dnm_s
diff --git a/target/arm/vfp_helper.c b/target/arm/vfp_helper.c
index XXXXXXX..XXXXXXX 100644
--- a/target/arm/vfp_helper.c
+++ b/target/arm/vfp_helper.c
@@ -XXX,XX +XXX,XX @@ uint32_t HELPER(rsqrte_u32)(uint32_t a)
}

/* VFPv4 fused multiply-accumulate */
+dh_ctype_f16 VFP_HELPER(muladd, h)(dh_ctype_f16 a, dh_ctype_f16 b,
+ dh_ctype_f16 c, void *fpstp)
+{
+ float_status *fpst = fpstp;
+ return float16_muladd(a, b, c, 0, fpst);
+}
+
float32 VFP_HELPER(muladd, s)(float32 a, float32 b, float32 c, void *fpstp)
{
 float_status *fpst = fpstp;
diff --git a/target/arm/translate-vfp.c.inc b/target/arm/translate-vfp.c.inc
index XXXXXXX..XXXXXXX 100644
--- a/target/arm/translate-vfp.c.inc
+++ b/target/arm/translate-vfp.c.inc
@@ -XXX,XX +XXX,XX @@ static bool trans_VMAXNM_dp(DisasContext *s, arg_VMAXNM_dp *a)
 a->vd, a->vn, a->vm, false);
}

+static bool do_vfm_hp(DisasContext *s, arg_VFMA_sp *a, bool neg_n, bool neg_d)
+{
+ /*
+ * VFNMA : fd = muladd(-fd, fn, fm)
+ * VFNMS : fd = muladd(-fd, -fn, fm)
+ * VFMA : fd = muladd( fd, fn, fm)
+ * VFMS : fd = muladd( fd, -fn, fm)
+ *
+ * These are fused multiply-add, and must be done as one floating
+ * point operation with no rounding between the multiplication and
+ * addition steps. NB that doing the negations here as separate
+ * steps is correct : an input NaN should come out with its sign
+ * bit flipped if it is a negated-input.
+ */
+ TCGv_ptr fpst;
+ TCGv_i32 vn, vm, vd;
+
+ /*
+ * Present in VFPv4 only, and only with the FP16 extension.
+ * Note that we can't rely on the SIMDFMAC check alone, because
+ * in a Neon-no-VFP core that ID register field will be non-zero.
+ */
+ if (!dc_isar_feature(aa32_fp16_arith, s) ||
+ !dc_isar_feature(aa32_simdfmac, s) ||
+ !dc_isar_feature(aa32_fpsp_v2, s)) {
+ return false;
+ }
+
+ if (s->vec_len != 0 || s->vec_stride != 0) {
+ return false;
+ }
+
+ if (!vfp_access_check(s)) {
+ return true;
+ }
+
+ vn = tcg_temp_new_i32();
+ vm = tcg_temp_new_i32();
+ vd = tcg_temp_new_i32();
+
+ neon_load_reg32(vn, a->vn);
+ neon_load_reg32(vm, a->vm);
+ if (neg_n) {
+ /* VFNMS, VFMS */
+ gen_helper_vfp_negh(vn, vn);
+ }
+ neon_load_reg32(vd, a->vd);
+ if (neg_d) {
+ /* VFNMA, VFNMS */
+ gen_helper_vfp_negh(vd, vd);
+ }
+ fpst = fpstatus_ptr(FPST_FPCR_F16);
+ gen_helper_vfp_muladdh(vd, vn, vm, vd, fpst);
+ neon_store_reg32(vd, a->vd);
+
+ tcg_temp_free_ptr(fpst);
+ tcg_temp_free_i32(vn);
+ tcg_temp_free_i32(vm);
+ tcg_temp_free_i32(vd);
+
+ return true;
+}
+
static bool do_vfm_sp(DisasContext *s, arg_VFMA_sp *a, bool neg_n, bool neg_d)
{
 /*
@@ -XXX,XX +XXX,XX @@ static bool do_vfm_dp(DisasContext *s, arg_VFMA_dp *a, bool neg_n, bool neg_d)
 MAKE_ONE_VFM_TRANS_FN(VFNMA, PREC, false, true) \
 MAKE_ONE_VFM_TRANS_FN(VFNMS, PREC, true, true)

+MAKE_VFM_TRANS_FNS(hp)
MAKE_VFM_TRANS_FNS(sp)
MAKE_VFM_TRANS_FNS(dp)

--
2.20.1


Implement the MVE shifts by immediate, which perform shifts
on a single general-purpose register.

These patterns overlap with the long-shift-by-immediates,
so we have to rearrange the grouping a little here.

Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Message-id: 20210628135835.6690-18-peter.maydell@linaro.org
---
target/arm/helper-mve.h | 3 ++
target/arm/translate.h | 1 +
target/arm/t32.decode | 31 ++++++++++++++-----
target/arm/mve_helper.c | 10 ++++++
target/arm/translate.c | 68 +++++++++++++++++++++++++++++++++++++++--
5 files changed, 104 insertions(+), 9 deletions(-)

diff --git a/target/arm/helper-mve.h b/target/arm/helper-mve.h
index XXXXXXX..XXXXXXX 100644
--- a/target/arm/helper-mve.h
+++ b/target/arm/helper-mve.h
@@ -XXX,XX +XXX,XX @@ DEF_HELPER_FLAGS_3(mve_sqrshrl, TCG_CALL_NO_RWG, i64, env, i64, i32)
DEF_HELPER_FLAGS_3(mve_uqrshll, TCG_CALL_NO_RWG, i64, env, i64, i32)
DEF_HELPER_FLAGS_3(mve_sqrshrl48, TCG_CALL_NO_RWG, i64, env, i64, i32)
DEF_HELPER_FLAGS_3(mve_uqrshll48, TCG_CALL_NO_RWG, i64, env, i64, i32)
+
+DEF_HELPER_FLAGS_3(mve_uqshl, TCG_CALL_NO_RWG, i32, env, i32, i32)
+DEF_HELPER_FLAGS_3(mve_sqshl, TCG_CALL_NO_RWG, i32, env, i32, i32)
diff --git a/target/arm/translate.h b/target/arm/translate.h
index XXXXXXX..XXXXXXX 100644
--- a/target/arm/translate.h
+++ b/target/arm/translate.h
@@ -XXX,XX +XXX,XX @@ typedef void CryptoThreeOpFn(TCGv_ptr, TCGv_ptr, TCGv_ptr);
typedef void AtomicThreeOpFn(TCGv_i64, TCGv_i64, TCGv_i64, TCGArg, MemOp);
typedef void WideShiftImmFn(TCGv_i64, TCGv_i64, int64_t shift);
typedef void WideShiftFn(TCGv_i64, TCGv_ptr, TCGv_i64, TCGv_i32);
+typedef void ShiftImmFn(TCGv_i32, TCGv_i32, int32_t shift);

/**
 * arm_tbflags_from_tb:
diff --git a/target/arm/t32.decode b/target/arm/t32.decode
index XXXXXXX..XXXXXXX 100644
--- a/target/arm/t32.decode
+++ b/target/arm/t32.decode
@@ -XXX,XX +XXX,XX @@

&mve_shl_ri rdalo rdahi shim
&mve_shl_rr rdalo rdahi rm
+&mve_sh_ri rda shim

# rdahi: bits [3:1] from insn, bit 0 is 1
# rdalo: bits [3:1] from insn, bit 0 is 0
@@ -XXX,XX +XXX,XX @@
 &mve_shl_ri shim=%imm5_12_6 rdalo=%rdalo_17 rdahi=%rdahi_9
@mve_shl_rr ....... .... . ... . rm:4 ... . .. .. .... \
 &mve_shl_rr rdalo=%rdalo_17 rdahi=%rdahi_9
+@mve_sh_ri ....... .... . rda:4 . ... ... . .. .. .... \
+ &mve_sh_ri shim=%imm5_12_6

{
 TST_xrri 1110101 0000 1 .... 0 ... 1111 .... .... @S_xrr_shi
@@ -XXX,XX +XXX,XX @@ BIC_rrri 1110101 0001 . .... 0 ... .... .... .... @s_rrr_shi
 # the rest fall through (where ORR_rrri and MOV_rxri will end up
 # handling them as r13 and r15 accesses with the same semantics as A32).
 [
- LSLL_ri 1110101 0010 1 ... 0 0 ... ... 1 .. 00 1111 @mve_shl_ri
- LSRL_ri 1110101 0010 1 ... 0 0 ... ... 1 .. 01 1111 @mve_shl_ri
- ASRL_ri 1110101 0010 1 ... 0 0 ... ... 1 .. 10 1111 @mve_shl_ri
+ {
+ UQSHL_ri 1110101 0010 1 .... 0 ... 1111 .. 00 1111 @mve_sh_ri
+ LSLL_ri 1110101 0010 1 ... 0 0 ... ... 1 .. 00 1111 @mve_shl_ri
+ UQSHLL_ri 1110101 0010 1 ... 1 0 ... ... 1 .. 00 1111 @mve_shl_ri
+ }

- UQSHLL_ri 1110101 0010 1 ... 1 0 ... ... 1 .. 00 1111 @mve_shl_ri
- URSHRL_ri 1110101 0010 1 ... 1 0 ... ... 1 .. 01 1111 @mve_shl_ri
- SRSHRL_ri 1110101 0010 1 ... 1 0 ... ... 1 .. 10 1111 @mve_shl_ri
- SQSHLL_ri 1110101 0010 1 ... 1 0 ... ... 1 .. 11 1111 @mve_shl_ri
+ {
+ URSHR_ri 1110101 0010 1 .... 0 ... 1111 .. 01 1111 @mve_sh_ri
+ LSRL_ri 1110101 0010 1 ... 0 0 ... ... 1 .. 01 1111 @mve_shl_ri
+ URSHRL_ri 1110101 0010 1 ... 1 0 ... ... 1 .. 01 1111 @mve_shl_ri
+ }
+
+ {
+ SRSHR_ri 1110101 0010 1 .... 0 ... 1111 .. 10 1111 @mve_sh_ri
+ ASRL_ri 1110101 0010 1 ... 0 0 ... ... 1 .. 10 1111 @mve_shl_ri
+ SRSHRL_ri 1110101 0010 1 ... 1 0 ... ... 1 .. 10 1111 @mve_shl_ri
+ }
+
+ {
+ SQSHL_ri 1110101 0010 1 .... 0 ... 1111 .. 11 1111 @mve_sh_ri
+ SQSHLL_ri 1110101 0010 1 ... 1 0 ... ... 1 .. 11 1111 @mve_shl_ri
+ }

 LSLL_rr 1110101 0010 1 ... 0 .... ... 1 0000 1101 @mve_shl_rr
 ASRL_rr 1110101 0010 1 ... 0 .... ... 1 0010 1101 @mve_shl_rr
diff --git a/target/arm/mve_helper.c b/target/arm/mve_helper.c
index XXXXXXX..XXXXXXX 100644
--- a/target/arm/mve_helper.c
+++ b/target/arm/mve_helper.c
@@ -XXX,XX +XXX,XX @@ uint64_t HELPER(mve_uqrshll48)(CPUARMState *env, uint64_t n, uint32_t shift)
{
 return do_uqrshl48_d(n, (int8_t)shift, true, &env->QF);
}
+
+uint32_t HELPER(mve_uqshl)(CPUARMState *env, uint32_t n, uint32_t shift)
+{
+ return do_uqrshl_bhs(n, (int8_t)shift, 32, false, &env->QF);
+}
+
+uint32_t HELPER(mve_sqshl)(CPUARMState *env, uint32_t n, uint32_t shift)
+{
+ return do_sqrshl_bhs(n, (int8_t)shift, 32, false, &env->QF);
+}
diff --git a/target/arm/translate.c b/target/arm/translate.c
index XXXXXXX..XXXXXXX 100644
--- a/target/arm/translate.c
+++ b/target/arm/translate.c
@@ -XXX,XX +XXX,XX @@ static void gen_srshr16_i64(TCGv_i64 d, TCGv_i64 a, int64_t sh)

static void gen_srshr32_i32(TCGv_i32 d, TCGv_i32 a, int32_t sh)
{
- TCGv_i32 t = tcg_temp_new_i32();
+ TCGv_i32 t;

+ /* Handle shift by the input size for the benefit of trans_SRSHR_ri */
+ if (sh == 32) {
+ tcg_gen_movi_i32(d, 0);
+ return;
+ }
+ t = tcg_temp_new_i32();
 tcg_gen_extract_i32(t, a, sh - 1, 1);
 tcg_gen_sari_i32(d, a, sh);
 tcg_gen_add_i32(d, d, t);
@@ -XXX,XX +XXX,XX @@ static void gen_urshr16_i64(TCGv_i64 d, TCGv_i64 a, int64_t sh)

static void gen_urshr32_i32(TCGv_i32 d, TCGv_i32 a, int32_t sh)
{
- TCGv_i32 t = tcg_temp_new_i32();
+ TCGv_i32 t;

+ /* Handle shift by the input size for the benefit of trans_URSHR_ri */
+ if (sh == 32) {
+ tcg_gen_extract_i32(d, a, sh - 1, 1);
+ return;
+ }
+ t = tcg_temp_new_i32();
 tcg_gen_extract_i32(t, a, sh - 1, 1);
 tcg_gen_shri_i32(d, a, sh);
 tcg_gen_add_i32(d, d, t);
@@ -XXX,XX +XXX,XX @@ static bool trans_SQRSHRL48_rr(DisasContext *s, arg_mve_shl_rr *a)
 return do_mve_shl_rr(s, a, gen_helper_mve_sqrshrl48);
}

+static bool do_mve_sh_ri(DisasContext *s, arg_mve_sh_ri *a, ShiftImmFn *fn)
+{
+ if (!arm_dc_feature(s, ARM_FEATURE_V8_1M)) {
+ /* Decode falls through to ORR/MOV UNPREDICTABLE handling */
+ return false;
+ }
+ if (!dc_isar_feature(aa32_mve, s) ||
+ !arm_dc_feature(s, ARM_FEATURE_M_MAIN) ||
+ a->rda == 13 || a->rda == 15) {
+ /* These rda cases are UNPREDICTABLE; we choose to UNDEF */
+ unallocated_encoding(s);
+ return true;
+ }
+
+ if (a->shim == 0) {
+ a->shim = 32;
+ }
+ fn(cpu_R[a->rda], cpu_R[a->rda], a->shim);
+
+ return true;
+}
+
+static bool trans_URSHR_ri(DisasContext *s, arg_mve_sh_ri *a)
+{
+ return do_mve_sh_ri(s, a, gen_urshr32_i32);
+}
+
+static bool trans_SRSHR_ri(DisasContext *s, arg_mve_sh_ri *a)
+{
+ return do_mve_sh_ri(s, a, gen_srshr32_i32);
+}
+
+static void gen_mve_sqshl(TCGv_i32 r, TCGv_i32 n, int32_t shift)
+{
+ gen_helper_mve_sqshl(r, cpu_env, n, tcg_constant_i32(shift));
+}
+
+static bool trans_SQSHL_ri(DisasContext *s, arg_mve_sh_ri *a)
+{
+ return do_mve_sh_ri(s, a, gen_mve_sqshl);
+}
+
+static void gen_mve_uqshl(TCGv_i32 r, TCGv_i32 n, int32_t shift)
+{
+ gen_helper_mve_uqshl(r, cpu_env, n, tcg_constant_i32(shift));
+}
+
+static bool trans_UQSHL_ri(DisasContext *s, arg_mve_sh_ri *a)
+{
+ return do_mve_sh_ri(s, a, gen_mve_uqshl);
+}
+
/*
 * Multiply and multiply accumulate
 */
--
2.20.1

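As an illustration of the rounding right shifts the patch above wires up, here
is what the two fixed generators compute, in plain C rather than TCG. This is
a sketch only (the names are invented, and signed right shift is assumed to be
arithmetic, as it is on the hosts QEMU supports); an encoded shift count of 0
means "shift by 32", which is the special case the patch adds.

#include <stdint.h>

static uint32_t urshr32(uint32_t x, unsigned sh)    /* 1 <= sh <= 32 */
{
    if (sh == 32) {
        return x >> 31;          /* only the rounding bit can survive */
    }
    return (x >> sh) + ((x >> (sh - 1)) & 1);
}

static int32_t srshr32(int32_t x, unsigned sh)      /* 1 <= sh <= 32 */
{
    if (sh == 32) {
        return 0;                /* rounding the sign bit gives 0 */
    }
    return (x >> sh) + ((x >> (sh - 1)) & 1);
}

For example, urshr32(7, 1) is 4: the bit shifted out (bit sh-1) is added back
in, which is exactly the extract-and-add sequence the TCG code emits.
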
Deleted patch
Implement VFP fp16 support for the VMOV immediate insn.

Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Message-id: 20200828183354.27913-10-peter.maydell@linaro.org
---
target/arm/vfp.decode | 2 ++
target/arm/translate-vfp.c.inc | 22 ++++++++++++++++++++++
2 files changed, 24 insertions(+)

diff --git a/target/arm/vfp.decode b/target/arm/vfp.decode
index XXXXXXX..XXXXXXX 100644
--- a/target/arm/vfp.decode
+++ b/target/arm/vfp.decode
@@ -XXX,XX +XXX,XX @@ VFMS_dp ---- 1110 1.10 .... .... 1011 .1.0 .... @vfp_dnm_d
VFNMA_dp ---- 1110 1.01 .... .... 1011 .0.0 .... @vfp_dnm_d
VFNMS_dp ---- 1110 1.01 .... .... 1011 .1.0 .... @vfp_dnm_d

+VMOV_imm_hp ---- 1110 1.11 .... .... 1001 0000 .... \
+ vd=%vd_sp imm=%vmov_imm
VMOV_imm_sp ---- 1110 1.11 .... .... 1010 0000 .... \
 vd=%vd_sp imm=%vmov_imm
VMOV_imm_dp ---- 1110 1.11 .... .... 1011 0000 .... \
diff --git a/target/arm/translate-vfp.c.inc b/target/arm/translate-vfp.c.inc
index XXXXXXX..XXXXXXX 100644
--- a/target/arm/translate-vfp.c.inc
+++ b/target/arm/translate-vfp.c.inc
@@ -XXX,XX +XXX,XX @@ MAKE_VFM_TRANS_FNS(hp)
MAKE_VFM_TRANS_FNS(sp)
MAKE_VFM_TRANS_FNS(dp)

+static bool trans_VMOV_imm_hp(DisasContext *s, arg_VMOV_imm_sp *a)
+{
+ TCGv_i32 fd;
+
+ if (!dc_isar_feature(aa32_fp16_arith, s)) {
+ return false;
+ }
+
+ if (s->vec_len != 0 || s->vec_stride != 0) {
+ return false;
+ }
+
+ if (!vfp_access_check(s)) {
+ return true;
+ }
+
+ fd = tcg_const_i32(vfp_expand_imm(MO_16, a->imm));
+ neon_store_reg32(fd, a->vd);
+ tcg_temp_free_i32(fd);
+ return true;
+}
+
static bool trans_VMOV_imm_sp(DisasContext *s, arg_VMOV_imm_sp *a)
{
 uint32_t delta_d = 0;
--
2.20.1

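For context on vfp_expand_imm(MO_16, imm8) in the patch above: going by my
reading of the Arm ARM's VFPExpandImm() pseudocode, the 8-bit immediate
abcdefgh becomes sign a, exponent NOT(b):b:b:cd and fraction efgh0000000000
for a 16-bit float. A hedged standalone sketch, not the QEMU implementation:

#include <stdint.h>

static uint16_t fp16_expand_imm8(uint8_t imm8)
{
    uint16_t sign = (imm8 >> 7) & 1;
    uint16_t b    = (imm8 >> 6) & 1;
    /* 5 exponent bits: NOT(b), b, b, imm8<5>, imm8<4> */
    uint16_t exp  = (uint16_t)(((!b) << 4) | (b << 3) | (b << 2)
                               | ((imm8 >> 4) & 3));
    /* 10 fraction bits: imm8<3:0> followed by six zeroes */
    uint16_t frac = (uint16_t)((imm8 & 0xf) << 6);
    return (uint16_t)((sign << 15) | (exp << 10) | frac);
}

Sanity check: fp16_expand_imm8(0x70) is 0x3c00, i.e. 1.0 in IEEE binary16.
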
Deleted patch
Currently the VFP_CONV_FIX macros take a single fsz argument for the
size of the float type, which is used both to select the name of
the functions to call (eg float32_is_any_nan()) and also for the
type to use for the float inputs and outputs (eg float32).

Separate these into fsz and ftype arguments, so that we can use them
for fp16, which uses 'float16' in the function names but is still
passing inputs and outputs in a 32-bit sized type.

Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Message-id: 20200828183354.27913-14-peter.maydell@linaro.org
---
target/arm/vfp_helper.c | 46 ++++++++++++++++++++---------------------
1 file changed, 23 insertions(+), 23 deletions(-)

diff --git a/target/arm/vfp_helper.c b/target/arm/vfp_helper.c
index XXXXXXX..XXXXXXX 100644
--- a/target/arm/vfp_helper.c
+++ b/target/arm/vfp_helper.c
@@ -XXX,XX +XXX,XX @@ float32 VFP_HELPER(fcvts, d)(float64 x, CPUARMState *env)
}

/* VFP3 fixed point conversion. */
-#define VFP_CONV_FIX_FLOAT(name, p, fsz, isz, itype) \
-float##fsz HELPER(vfp_##name##to##p)(uint##isz##_t x, uint32_t shift, \
+#define VFP_CONV_FIX_FLOAT(name, p, fsz, ftype, isz, itype) \
+ftype HELPER(vfp_##name##to##p)(uint##isz##_t x, uint32_t shift, \
 void *fpstp) \
{ return itype##_to_##float##fsz##_scalbn(x, -shift, fpstp); }

-#define VFP_CONV_FLOAT_FIX_ROUND(name, p, fsz, isz, itype, ROUND, suff) \
-uint##isz##_t HELPER(vfp_to##name##p##suff)(float##fsz x, uint32_t shift, \
+#define VFP_CONV_FLOAT_FIX_ROUND(name, p, fsz, ftype, isz, itype, ROUND, suff) \
+uint##isz##_t HELPER(vfp_to##name##p##suff)(ftype x, uint32_t shift, \
 void *fpst) \
{ \
 if (unlikely(float##fsz##_is_any_nan(x))) { \
@@ -XXX,XX +XXX,XX @@ uint##isz##_t HELPER(vfp_to##name##p##suff)(float##fsz x, uint32_t shift, \
 return float##fsz##_to_##itype##_scalbn(x, ROUND, shift, fpst); \
}

-#define VFP_CONV_FIX(name, p, fsz, isz, itype) \
-VFP_CONV_FIX_FLOAT(name, p, fsz, isz, itype) \
-VFP_CONV_FLOAT_FIX_ROUND(name, p, fsz, isz, itype, \
+#define VFP_CONV_FIX(name, p, fsz, ftype, isz, itype) \
+VFP_CONV_FIX_FLOAT(name, p, fsz, ftype, isz, itype) \
+VFP_CONV_FLOAT_FIX_ROUND(name, p, fsz, ftype, isz, itype, \
 float_round_to_zero, _round_to_zero) \
-VFP_CONV_FLOAT_FIX_ROUND(name, p, fsz, isz, itype, \
+VFP_CONV_FLOAT_FIX_ROUND(name, p, fsz, ftype, isz, itype, \
 get_float_rounding_mode(fpst), )

-#define VFP_CONV_FIX_A64(name, p, fsz, isz, itype) \
-VFP_CONV_FIX_FLOAT(name, p, fsz, isz, itype) \
-VFP_CONV_FLOAT_FIX_ROUND(name, p, fsz, isz, itype, \
+#define VFP_CONV_FIX_A64(name, p, fsz, ftype, isz, itype) \
+VFP_CONV_FIX_FLOAT(name, p, fsz, ftype, isz, itype) \
+VFP_CONV_FLOAT_FIX_ROUND(name, p, fsz, ftype, isz, itype, \
 get_float_rounding_mode(fpst), )

-VFP_CONV_FIX(sh, d, 64, 64, int16)
-VFP_CONV_FIX(sl, d, 64, 64, int32)
-VFP_CONV_FIX_A64(sq, d, 64, 64, int64)
-VFP_CONV_FIX(uh, d, 64, 64, uint16)
-VFP_CONV_FIX(ul, d, 64, 64, uint32)
-VFP_CONV_FIX_A64(uq, d, 64, 64, uint64)
-VFP_CONV_FIX(sh, s, 32, 32, int16)
-VFP_CONV_FIX(sl, s, 32, 32, int32)
-VFP_CONV_FIX_A64(sq, s, 32, 64, int64)
-VFP_CONV_FIX(uh, s, 32, 32, uint16)
-VFP_CONV_FIX(ul, s, 32, 32, uint32)
-VFP_CONV_FIX_A64(uq, s, 32, 64, uint64)
+VFP_CONV_FIX(sh, d, 64, float64, 64, int16)
+VFP_CONV_FIX(sl, d, 64, float64, 64, int32)
+VFP_CONV_FIX_A64(sq, d, 64, float64, 64, int64)
+VFP_CONV_FIX(uh, d, 64, float64, 64, uint16)
+VFP_CONV_FIX(ul, d, 64, float64, 64, uint32)
+VFP_CONV_FIX_A64(uq, d, 64, float64, 64, uint64)
+VFP_CONV_FIX(sh, s, 32, float32, 32, int16)
+VFP_CONV_FIX(sl, s, 32, float32, 32, int32)
+VFP_CONV_FIX_A64(sq, s, 32, float32, 64, int64)
+VFP_CONV_FIX(uh, s, 32, float32, 32, uint16)
+VFP_CONV_FIX(ul, s, 32, float32, 32, uint32)
+VFP_CONV_FIX_A64(uq, s, 32, float32, 64, uint64)

#undef VFP_CONV_FIX
#undef VFP_CONV_FIX_FLOAT
--
2.20.1

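To see why fsz and ftype must differ for fp16, expand the reworked macro with
the instantiation the next patch adds, VFP_CONV_FIX_FLOAT(sl, h, 16,
dh_ctype_f16, 32, int32). Whitespace is mine and the HELPER() wrapper is left
unexpanded, but the token pasting gives:

dh_ctype_f16 HELPER(vfp_sltoh)(uint32_t x, uint32_t shift, void *fpstp)
{ return int32_to_float16_scalbn(x, -shift, fpstp); }

The function name is built from fsz = 16 ("float16"), while the value travels
in dh_ctype_f16 and uint32_t, both 32-bit-sized types; one argument could not
express both facts.
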
Deleted patch
Now the VFP_CONV_FIX macros can handle fp16's distinction between the
width of the operation and the width of the type used to pass operands,
use the macros rather than the open-coded functions.

This creates an extra six helper functions, all of which we are going
to need for the AArch32 VFP fp16 instructions.

Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Message-id: 20200828183354.27913-15-peter.maydell@linaro.org
---
target/arm/helper.h | 6 +++
target/arm/vfp_helper.c | 86 +++--------------------------------------
2 files changed, 12 insertions(+), 80 deletions(-)

diff --git a/target/arm/helper.h b/target/arm/helper.h
index XXXXXXX..XXXXXXX 100644
--- a/target/arm/helper.h
+++ b/target/arm/helper.h
@@ -XXX,XX +XXX,XX @@ DEF_HELPER_2(vfp_tosizh, s32, f16, ptr)
DEF_HELPER_2(vfp_tosizs, s32, f32, ptr)
DEF_HELPER_2(vfp_tosizd, s32, f64, ptr)

+DEF_HELPER_3(vfp_toshh_round_to_zero, i32, f16, i32, ptr)
+DEF_HELPER_3(vfp_toslh_round_to_zero, i32, f16, i32, ptr)
+DEF_HELPER_3(vfp_touhh_round_to_zero, i32, f16, i32, ptr)
+DEF_HELPER_3(vfp_toulh_round_to_zero, i32, f16, i32, ptr)
DEF_HELPER_3(vfp_toshs_round_to_zero, i32, f32, i32, ptr)
DEF_HELPER_3(vfp_tosls_round_to_zero, i32, f32, i32, ptr)
DEF_HELPER_3(vfp_touhs_round_to_zero, i32, f32, i32, ptr)
@@ -XXX,XX +XXX,XX @@ DEF_HELPER_3(vfp_sqtod, f64, i64, i32, ptr)
DEF_HELPER_3(vfp_uhtod, f64, i64, i32, ptr)
DEF_HELPER_3(vfp_ultod, f64, i64, i32, ptr)
DEF_HELPER_3(vfp_uqtod, f64, i64, i32, ptr)
+DEF_HELPER_3(vfp_shtoh, f16, i32, i32, ptr)
+DEF_HELPER_3(vfp_uhtoh, f16, i32, i32, ptr)
DEF_HELPER_3(vfp_sltoh, f16, i32, i32, ptr)
DEF_HELPER_3(vfp_ultoh, f16, i32, i32, ptr)
DEF_HELPER_3(vfp_sqtoh, f16, i64, i32, ptr)
diff --git a/target/arm/vfp_helper.c b/target/arm/vfp_helper.c
index XXXXXXX..XXXXXXX 100644
--- a/target/arm/vfp_helper.c
+++ b/target/arm/vfp_helper.c
@@ -XXX,XX +XXX,XX @@ VFP_CONV_FIX_A64(sq, s, 32, float32, 64, int64)
VFP_CONV_FIX(uh, s, 32, float32, 32, uint16)
VFP_CONV_FIX(ul, s, 32, float32, 32, uint32)
VFP_CONV_FIX_A64(uq, s, 32, float32, 64, uint64)
+VFP_CONV_FIX(sh, h, 16, dh_ctype_f16, 32, int16)
+VFP_CONV_FIX(sl, h, 16, dh_ctype_f16, 32, int32)
+VFP_CONV_FIX_A64(sq, h, 16, dh_ctype_f16, 64, int64)
+VFP_CONV_FIX(uh, h, 16, dh_ctype_f16, 32, uint16)
+VFP_CONV_FIX(ul, h, 16, dh_ctype_f16, 32, uint32)
+VFP_CONV_FIX_A64(uq, h, 16, dh_ctype_f16, 64, uint64)

#undef VFP_CONV_FIX
#undef VFP_CONV_FIX_FLOAT
#undef VFP_CONV_FLOAT_FIX_ROUND
#undef VFP_CONV_FIX_A64

-uint32_t HELPER(vfp_sltoh)(uint32_t x, uint32_t shift, void *fpst)
-{
- return int32_to_float16_scalbn(x, -shift, fpst);
-}
-
-uint32_t HELPER(vfp_ultoh)(uint32_t x, uint32_t shift, void *fpst)
-{
- return uint32_to_float16_scalbn(x, -shift, fpst);
-}
-
-uint32_t HELPER(vfp_sqtoh)(uint64_t x, uint32_t shift, void *fpst)
-{
- return int64_to_float16_scalbn(x, -shift, fpst);
-}
-
-uint32_t HELPER(vfp_uqtoh)(uint64_t x, uint32_t shift, void *fpst)
-{
- return uint64_to_float16_scalbn(x, -shift, fpst);
-}
-
-uint32_t HELPER(vfp_toshh)(uint32_t x, uint32_t shift, void *fpst)
-{
- if (unlikely(float16_is_any_nan(x))) {
- float_raise(float_flag_invalid, fpst);
- return 0;
- }
- return float16_to_int16_scalbn(x, get_float_rounding_mode(fpst),
- shift, fpst);
-}
-
-uint32_t HELPER(vfp_touhh)(uint32_t x, uint32_t shift, void *fpst)
-{
- if (unlikely(float16_is_any_nan(x))) {
- float_raise(float_flag_invalid, fpst);
- return 0;
- }
- return float16_to_uint16_scalbn(x, get_float_rounding_mode(fpst),
- shift, fpst);
-}
-
-uint32_t HELPER(vfp_toslh)(uint32_t x, uint32_t shift, void *fpst)
-{
- if (unlikely(float16_is_any_nan(x))) {
- float_raise(float_flag_invalid, fpst);
- return 0;
- }
- return float16_to_int32_scalbn(x, get_float_rounding_mode(fpst),
- shift, fpst);
-}
-
-uint32_t HELPER(vfp_toulh)(uint32_t x, uint32_t shift, void *fpst)
-{
- if (unlikely(float16_is_any_nan(x))) {
- float_raise(float_flag_invalid, fpst);
- return 0;
- }
- return float16_to_uint32_scalbn(x, get_float_rounding_mode(fpst),
- shift, fpst);
-}
-
-uint64_t HELPER(vfp_tosqh)(uint32_t x, uint32_t shift, void *fpst)
-{
- if (unlikely(float16_is_any_nan(x))) {
- float_raise(float_flag_invalid, fpst);
- return 0;
- }
- return float16_to_int64_scalbn(x, get_float_rounding_mode(fpst),
- shift, fpst);
-}
-
-uint64_t HELPER(vfp_touqh)(uint32_t x, uint32_t shift, void *fpst)
-{
- if (unlikely(float16_is_any_nan(x))) {
- float_raise(float_flag_invalid, fpst);
- return 0;
- }
- return float16_to_uint64_scalbn(x, get_float_rounding_mode(fpst),
- shift, fpst);
-}
-
/* Set the current fp rounding mode and return the old one.
 * The argument is a softfloat float_round_ value.
 */
--
2.20.1

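One of the six new helpers this patch generates, for the curious: expanding
VFP_CONV_FLOAT_FIX_ROUND(uh, h, 16, dh_ctype_f16, 32, uint16,
float_round_to_zero, _round_to_zero) from VFP_CONV_FIX(uh, h, ...) should
yield, if I'm reading the token pasting right (whitespace mine):

uint32_t HELPER(vfp_touhh_round_to_zero)(dh_ctype_f16 x, uint32_t shift,
                                         void *fpst)
{
    if (unlikely(float16_is_any_nan(x))) {
        float_raise(float_flag_invalid, fpst);
        return 0;
    }
    return float16_to_uint16_scalbn(x, float_round_to_zero, shift, fpst);
}

which matches the DEF_HELPER_3(vfp_touhh_round_to_zero, i32, f16, i32, ptr)
declaration added to helper.h above.
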
Deleted patch
Implement the fp16 versions of the VFP VCVT instruction forms which
convert between floating point and fixed-point.

Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Message-id: 20200828183354.27913-16-peter.maydell@linaro.org
---
target/arm/vfp.decode | 2 ++
target/arm/translate-vfp.c.inc | 59 ++++++++++++++++++++++++++++++++++
2 files changed, 61 insertions(+)

diff --git a/target/arm/vfp.decode b/target/arm/vfp.decode
index XXXXXXX..XXXXXXX 100644
--- a/target/arm/vfp.decode
+++ b/target/arm/vfp.decode
@@ -XXX,XX +XXX,XX @@ VJCVT ---- 1110 1.11 1001 .... 1011 11.0 .... @vfp_dm_sd
# We assemble bits 18 (op), 16 (u) and 7 (sx) into a single opc field
# for the convenience of the trans_VCVT_fix functions.
%vcvt_fix_op 18:1 16:1 7:1
+VCVT_fix_hp ---- 1110 1.11 1.1. .... 1001 .1.0 .... \
+ vd=%vd_sp imm=%vm_sp opc=%vcvt_fix_op
VCVT_fix_sp ---- 1110 1.11 1.1. .... 1010 .1.0 .... \
 vd=%vd_sp imm=%vm_sp opc=%vcvt_fix_op
VCVT_fix_dp ---- 1110 1.11 1.1. .... 1011 .1.0 .... \
diff --git a/target/arm/translate-vfp.c.inc b/target/arm/translate-vfp.c.inc
index XXXXXXX..XXXXXXX 100644
--- a/target/arm/translate-vfp.c.inc
+++ b/target/arm/translate-vfp.c.inc
@@ -XXX,XX +XXX,XX @@ static bool trans_VJCVT(DisasContext *s, arg_VJCVT *a)
 return true;
}

+static bool trans_VCVT_fix_hp(DisasContext *s, arg_VCVT_fix_sp *a)
+{
+ TCGv_i32 vd, shift;
+ TCGv_ptr fpst;
+ int frac_bits;
+
+ if (!dc_isar_feature(aa32_fp16_arith, s)) {
+ return false;
+ }
+
+ if (!vfp_access_check(s)) {
+ return true;
+ }
+
+ frac_bits = (a->opc & 1) ? (32 - a->imm) : (16 - a->imm);
+
+ vd = tcg_temp_new_i32();
+ neon_load_reg32(vd, a->vd);
+
+ fpst = fpstatus_ptr(FPST_FPCR_F16);
+ shift = tcg_const_i32(frac_bits);
+
+ /* Switch on op:U:sx bits */
+ switch (a->opc) {
+ case 0:
+ gen_helper_vfp_shtoh(vd, vd, shift, fpst);
+ break;
+ case 1:
+ gen_helper_vfp_sltoh(vd, vd, shift, fpst);
+ break;
+ case 2:
+ gen_helper_vfp_uhtoh(vd, vd, shift, fpst);
+ break;
+ case 3:
+ gen_helper_vfp_ultoh(vd, vd, shift, fpst);
+ break;
+ case 4:
+ gen_helper_vfp_toshh_round_to_zero(vd, vd, shift, fpst);
+ break;
+ case 5:
+ gen_helper_vfp_toslh_round_to_zero(vd, vd, shift, fpst);
+ break;
+ case 6:
+ gen_helper_vfp_touhh_round_to_zero(vd, vd, shift, fpst);
+ break;
+ case 7:
+ gen_helper_vfp_toulh_round_to_zero(vd, vd, shift, fpst);
+ break;
+ default:
+ g_assert_not_reached();
+ }
+
+ neon_store_reg32(vd, a->vd);
+ tcg_temp_free_i32(vd);
+ tcg_temp_free_i32(shift);
+ tcg_temp_free_ptr(fpst);
+ return true;
+}
+
static bool trans_VCVT_fix_sp(DisasContext *s, arg_VCVT_fix_sp *a)
{
 TCGv_i32 vd, shift;
--
2.20.1

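In case the frac_bits arithmetic above looks opaque: the encoded imm field
holds size-minus-fbits, and the conversion itself is just a scale by
2^frac_bits folded into the softfloat scalbn conversion. A rough host-side
model of the float-to-fixed, round-to-zero direction (no NaN handling or
saturation, names invented):

#include <math.h>
#include <stdint.h>

static int32_t f32_to_fixed_rz(float f, int frac_bits)
{
    /* multiply by 2^frac_bits, then let the C cast truncate toward zero */
    return (int32_t)ldexpf(f, frac_bits);
}

For example f32_to_fixed_rz(1.5f, 8) is 0x180, i.e. 1.5 in a Q-format with
eight fraction bits.
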
Deleted patch
Implement the fp16 versions of the VFP VCVT instruction forms
which convert between floating point and integer with a specified
rounding mode.

Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Message-id: 20200828183354.27913-17-peter.maydell@linaro.org
---
target/arm/vfp-uncond.decode | 6 ++++--
target/arm/translate-vfp.c.inc | 32 ++++++++++++++++++++++++--------
2 files changed, 28 insertions(+), 10 deletions(-)

diff --git a/target/arm/vfp-uncond.decode b/target/arm/vfp-uncond.decode
index XXXXXXX..XXXXXXX 100644
--- a/target/arm/vfp-uncond.decode
+++ b/target/arm/vfp-uncond.decode
@@ -XXX,XX +XXX,XX @@ VRINT 1111 1110 1.11 10 rm:2 .... 1011 01.0 .... \
 vm=%vm_dp vd=%vd_dp dp=1

# VCVT float to int with specified rounding mode; Vd is always single-precision
+VCVT 1111 1110 1.11 11 rm:2 .... 1001 op:1 1.0 .... \
+ vm=%vm_sp vd=%vd_sp sz=1
VCVT 1111 1110 1.11 11 rm:2 .... 1010 op:1 1.0 .... \
- vm=%vm_sp vd=%vd_sp dp=0
+ vm=%vm_sp vd=%vd_sp sz=2
VCVT 1111 1110 1.11 11 rm:2 .... 1011 op:1 1.0 .... \
- vm=%vm_dp vd=%vd_sp dp=1
+ vm=%vm_dp vd=%vd_sp sz=3
diff --git a/target/arm/translate-vfp.c.inc b/target/arm/translate-vfp.c.inc
index XXXXXXX..XXXXXXX 100644
--- a/target/arm/translate-vfp.c.inc
+++ b/target/arm/translate-vfp.c.inc
@@ -XXX,XX +XXX,XX @@ static bool trans_VRINT(DisasContext *s, arg_VRINT *a)
static bool trans_VCVT(DisasContext *s, arg_VCVT *a)
{
 uint32_t rd, rm;
- bool dp = a->dp;
+ int sz = a->sz;
 TCGv_ptr fpst;
 TCGv_i32 tcg_rmode, tcg_shift;
 int rounding = fp_decode_rm[a->rm];
@@ -XXX,XX +XXX,XX @@ static bool trans_VCVT(DisasContext *s, arg_VCVT *a)
 return false;
 }

- if (dp && !dc_isar_feature(aa32_fpdp_v2, s)) {
+ if (sz == 3 && !dc_isar_feature(aa32_fpdp_v2, s)) {
+ return false;
+ }
+
+ if (sz == 1 && !dc_isar_feature(aa32_fp16_arith, s)) {
 return false;
 }

 /* UNDEF accesses to D16-D31 if they don't exist */
- if (dp && !dc_isar_feature(aa32_simd_r32, s) && (a->vm & 0x10)) {
+ if (sz == 3 && !dc_isar_feature(aa32_simd_r32, s) && (a->vm & 0x10)) {
 return false;
 }

@@ -XXX,XX +XXX,XX @@ static bool trans_VCVT(DisasContext *s, arg_VCVT *a)
 return true;
 }

- fpst = fpstatus_ptr(FPST_FPCR);
+ if (sz == 1) {
+ fpst = fpstatus_ptr(FPST_FPCR_F16);
+ } else {
+ fpst = fpstatus_ptr(FPST_FPCR);
+ }

 tcg_shift = tcg_const_i32(0);

 tcg_rmode = tcg_const_i32(arm_rmode_to_sf(rounding));
 gen_helper_set_rmode(tcg_rmode, tcg_rmode, fpst);

- if (dp) {
+ if (sz == 3) {
 TCGv_i64 tcg_double, tcg_res;
 TCGv_i32 tcg_tmp;
 tcg_double = tcg_temp_new_i64();
@@ -XXX,XX +XXX,XX @@ static bool trans_VCVT(DisasContext *s, arg_VCVT *a)
 tcg_single = tcg_temp_new_i32();
 tcg_res = tcg_temp_new_i32();
 neon_load_reg32(tcg_single, rm);
- if (is_signed) {
- gen_helper_vfp_tosls(tcg_res, tcg_single, tcg_shift, fpst);
+ if (sz == 1) {
+ if (is_signed) {
+ gen_helper_vfp_toslh(tcg_res, tcg_single, tcg_shift, fpst);
+ } else {
+ gen_helper_vfp_toulh(tcg_res, tcg_single, tcg_shift, fpst);
+ }
 } else {
- gen_helper_vfp_touls(tcg_res, tcg_single, tcg_shift, fpst);
+ if (is_signed) {
+ gen_helper_vfp_tosls(tcg_res, tcg_single, tcg_shift, fpst);
+ } else {
+ gen_helper_vfp_touls(tcg_res, tcg_single, tcg_shift, fpst);
+ }
 }
 neon_store_reg32(tcg_res, rd);
 tcg_temp_free_i32(tcg_res);
--
2.20.1

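The set-rmode / convert / restore dance the generated code performs has a
loose host-side analogue in C99 fenv, which may help readers unfamiliar with
softfloat. This is only an illustration of the pattern, not QEMU code:

#include <fenv.h>
#include <math.h>

static long cvt_with_rmode(double x, int rmode)
{
    int old = fegetround();
    fesetround(rmode);          /* e.g. FE_UPWARD for a VCVTP-like insn */
    long r = lrint(x);          /* converts in the current rounding mode */
    fesetround(old);
    return r;
}

QEMU instead passes the softfloat rounding-mode value through
gen_helper_set_rmode() around the vfp_to{s,u}l{h,s} helper call, with the
float_status object playing the role of the host fenv.
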
Deleted patch
Implement the fp16 versions of the VFP VSEL instruction.

Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Message-id: 20200828183354.27913-18-peter.maydell@linaro.org
---
target/arm/vfp-uncond.decode | 6 ++++--
target/arm/translate-vfp.c.inc | 16 ++++++++++++----
2 files changed, 16 insertions(+), 6 deletions(-)

diff --git a/target/arm/vfp-uncond.decode b/target/arm/vfp-uncond.decode
index XXXXXXX..XXXXXXX 100644
--- a/target/arm/vfp-uncond.decode
+++ b/target/arm/vfp-uncond.decode
@@ -XXX,XX +XXX,XX @@
@vfp_dnm_s ................................ vm=%vm_sp vn=%vn_sp vd=%vd_sp
@vfp_dnm_d ................................ vm=%vm_dp vn=%vn_dp vd=%vd_dp

+VSEL 1111 1110 0. cc:2 .... .... 1001 .0.0 .... \
+ vm=%vm_sp vn=%vn_sp vd=%vd_sp sz=1
VSEL 1111 1110 0. cc:2 .... .... 1010 .0.0 .... \
- vm=%vm_sp vn=%vn_sp vd=%vd_sp dp=0
+ vm=%vm_sp vn=%vn_sp vd=%vd_sp sz=2
VSEL 1111 1110 0. cc:2 .... .... 1011 .0.0 .... \
- vm=%vm_dp vn=%vn_dp vd=%vd_dp dp=1
+ vm=%vm_dp vn=%vn_dp vd=%vd_dp sz=3

VMAXNM_hp 1111 1110 1.00 .... .... 1001 .0.0 .... @vfp_dnm_s
VMINNM_hp 1111 1110 1.00 .... .... 1001 .1.0 .... @vfp_dnm_s
diff --git a/target/arm/translate-vfp.c.inc b/target/arm/translate-vfp.c.inc
index XXXXXXX..XXXXXXX 100644
--- a/target/arm/translate-vfp.c.inc
+++ b/target/arm/translate-vfp.c.inc
@@ -XXX,XX +XXX,XX @@ static bool vfp_access_check(DisasContext *s)
static bool trans_VSEL(DisasContext *s, arg_VSEL *a)
{
 uint32_t rd, rn, rm;
- bool dp = a->dp;
+ int sz = a->sz;

 if (!dc_isar_feature(aa32_vsel, s)) {
 return false;
 }

- if (dp && !dc_isar_feature(aa32_fpdp_v2, s)) {
+ if (sz == 3 && !dc_isar_feature(aa32_fpdp_v2, s)) {
+ return false;
+ }
+
+ if (sz == 1 && !dc_isar_feature(aa32_fp16_arith, s)) {
 return false;
 }

 /* UNDEF accesses to D16-D31 if they don't exist */
- if (dp && !dc_isar_feature(aa32_simd_r32, s) &&
+ if (sz == 3 && !dc_isar_feature(aa32_simd_r32, s) &&
 ((a->vm | a->vn | a->vd) & 0x10)) {
 return false;
 }
@@ -XXX,XX +XXX,XX @@ static bool trans_VSEL(DisasContext *s, arg_VSEL *a)
 return true;
 }

- if (dp) {
+ if (sz == 3) {
 TCGv_i64 frn, frm, dest;
 TCGv_i64 tmp, zero, zf, nf, vf;

@@ -XXX,XX +XXX,XX @@ static bool trans_VSEL(DisasContext *s, arg_VSEL *a)
 tcg_temp_free_i32(tmp);
 break;
 }
+ /* For fp16 the top half is always zeroes */
+ if (sz == 1) {
+ tcg_gen_andi_i32(dest, dest, 0xffff);
+ }
 neon_store_reg32(dest, rd);
 tcg_temp_free_i32(frn);
 tcg_temp_free_i32(frm);
--
2.20.1

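A behavioural sketch of VSEL itself, for readers who don't have the Arm ARM
to hand. As I recall the encoding, the cc field selects one of the four
conditions EQ, VS, GE, GT, and the insn picks Sn or Sm based on the NZCV
flags (the zf/nf/vf temporaries in the diff above). Illustrative C only:

#include <stdint.h>
#include <stdbool.h>

static uint32_t vsel(unsigned cc, bool n, bool z, bool v,
                     uint32_t sn, uint32_t sm)
{
    bool take;
    switch (cc & 3) {
    case 0:  take = z;               break;  /* VSELEQ */
    case 1:  take = v;               break;  /* VSELVS */
    case 2:  take = (n == v);        break;  /* VSELGE */
    default: take = (n == v) && !z;  break;  /* VSELGT */
    }
    return take ? sn : sm;
}
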
Implement the fp16 version of the VFP VRINT* insns.

Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Message-id: 20200828183354.27913-19-peter.maydell@linaro.org
---
target/arm/helper.h | 2 +
target/arm/vfp-uncond.decode | 6 ++-
target/arm/vfp.decode | 3 ++
target/arm/vfp_helper.c | 21 ++++++++
target/arm/translate-vfp.c.inc | 98 +++++++++++++++++++++++++++++++---
5 files changed, 122 insertions(+), 8 deletions(-)

diff --git a/target/arm/helper.h b/target/arm/helper.h
index XXXXXXX..XXXXXXX 100644
--- a/target/arm/helper.h
+++ b/target/arm/helper.h
@@ -XXX,XX +XXX,XX @@ DEF_HELPER_3(shr_cc, i32, env, i32, i32)
DEF_HELPER_3(sar_cc, i32, env, i32, i32)
DEF_HELPER_3(ror_cc, i32, env, i32, i32)

+DEF_HELPER_FLAGS_2(rinth_exact, TCG_CALL_NO_RWG, f16, f16, ptr)
DEF_HELPER_FLAGS_2(rints_exact, TCG_CALL_NO_RWG, f32, f32, ptr)
DEF_HELPER_FLAGS_2(rintd_exact, TCG_CALL_NO_RWG, f64, f64, ptr)
+DEF_HELPER_FLAGS_2(rinth, TCG_CALL_NO_RWG, f16, f16, ptr)
DEF_HELPER_FLAGS_2(rints, TCG_CALL_NO_RWG, f32, f32, ptr)
DEF_HELPER_FLAGS_2(rintd, TCG_CALL_NO_RWG, f64, f64, ptr)

diff --git a/target/arm/vfp-uncond.decode b/target/arm/vfp-uncond.decode
index XXXXXXX..XXXXXXX 100644
--- a/target/arm/vfp-uncond.decode
+++ b/target/arm/vfp-uncond.decode
@@ -XXX,XX +XXX,XX @@ VMINNM_sp 1111 1110 1.00 .... .... 1010 .1.0 .... @vfp_dnm_s
VMAXNM_dp 1111 1110 1.00 .... .... 1011 .0.0 .... @vfp_dnm_d
VMINNM_dp 1111 1110 1.00 .... .... 1011 .1.0 .... @vfp_dnm_d

+VRINT 1111 1110 1.11 10 rm:2 .... 1001 01.0 .... \
+ vm=%vm_sp vd=%vd_sp sz=1
VRINT 1111 1110 1.11 10 rm:2 .... 1010 01.0 .... \
- vm=%vm_sp vd=%vd_sp dp=0
+ vm=%vm_sp vd=%vd_sp sz=2
VRINT 1111 1110 1.11 10 rm:2 .... 1011 01.0 .... \
- vm=%vm_dp vd=%vd_dp dp=1
+ vm=%vm_dp vd=%vd_dp sz=3

# VCVT float to int with specified rounding mode; Vd is always single-precision
VCVT 1111 1110 1.11 11 rm:2 .... 1001 op:1 1.0 .... \
diff --git a/target/arm/vfp.decode b/target/arm/vfp.decode
index XXXXXXX..XXXXXXX 100644
--- a/target/arm/vfp.decode
+++ b/target/arm/vfp.decode
@@ -XXX,XX +XXX,XX @@ VCVT_f16_f32 ---- 1110 1.11 0011 .... 1010 t:1 1.0 .... \
VCVT_f16_f64 ---- 1110 1.11 0011 .... 1011 t:1 1.0 .... \
 vd=%vd_sp vm=%vm_dp

+VRINTR_hp ---- 1110 1.11 0110 .... 1001 01.0 .... @vfp_dm_ss
VRINTR_sp ---- 1110 1.11 0110 .... 1010 01.0 .... @vfp_dm_ss
VRINTR_dp ---- 1110 1.11 0110 .... 1011 01.0 .... @vfp_dm_dd

+VRINTZ_hp ---- 1110 1.11 0110 .... 1001 11.0 .... @vfp_dm_ss
VRINTZ_sp ---- 1110 1.11 0110 .... 1010 11.0 .... @vfp_dm_ss
VRINTZ_dp ---- 1110 1.11 0110 .... 1011 11.0 .... @vfp_dm_dd

+VRINTX_hp ---- 1110 1.11 0111 .... 1001 01.0 .... @vfp_dm_ss
VRINTX_sp ---- 1110 1.11 0111 .... 1010 01.0 .... @vfp_dm_ss
VRINTX_dp ---- 1110 1.11 0111 .... 1011 01.0 .... @vfp_dm_dd

diff --git a/target/arm/vfp_helper.c b/target/arm/vfp_helper.c
index XXXXXXX..XXXXXXX 100644
--- a/target/arm/vfp_helper.c
+++ b/target/arm/vfp_helper.c
@@ -XXX,XX +XXX,XX @@ float64 VFP_HELPER(muladd, d)(float64 a, float64 b, float64 c, void *fpstp)
}

/* ARMv8 round to integral */
+dh_ctype_f16 HELPER(rinth_exact)(dh_ctype_f16 x, void *fp_status)
+{
+ return float16_round_to_int(x, fp_status);
+}
+
float32 HELPER(rints_exact)(float32 x, void *fp_status)
{
 return float32_round_to_int(x, fp_status);
@@ -XXX,XX +XXX,XX @@ float64 HELPER(rintd_exact)(float64 x, void *fp_status)
 return float64_round_to_int(x, fp_status);
}

+dh_ctype_f16 HELPER(rinth)(dh_ctype_f16 x, void *fp_status)
+{
+ int old_flags = get_float_exception_flags(fp_status), new_flags;
+ float16 ret;
+
+ ret = float16_round_to_int(x, fp_status);
+
+ /* Suppress any inexact exceptions the conversion produced */
+ if (!(old_flags & float_flag_inexact)) {
+ new_flags = get_float_exception_flags(fp_status);
+ set_float_exception_flags(new_flags & ~float_flag_inexact, fp_status);
+ }
+
+ return ret;
+}
+
float32 HELPER(rints)(float32 x, void *fp_status)
{
 int old_flags = get_float_exception_flags(fp_status), new_flags;
diff --git a/target/arm/translate-vfp.c.inc b/target/arm/translate-vfp.c.inc
index XXXXXXX..XXXXXXX 100644
--- a/target/arm/translate-vfp.c.inc
+++ b/target/arm/translate-vfp.c.inc
@@ -XXX,XX +XXX,XX @@ static const uint8_t fp_decode_rm[] = {
static bool trans_VRINT(DisasContext *s, arg_VRINT *a)
{
 uint32_t rd, rm;
- bool dp = a->dp;
+ int sz = a->sz;
 TCGv_ptr fpst;
 TCGv_i32 tcg_rmode;
 int rounding = fp_decode_rm[a->rm];
@@ -XXX,XX +XXX,XX @@ static bool trans_VRINT(DisasContext *s, arg_VRINT *a)
 return false;
 }

- if (dp && !dc_isar_feature(aa32_fpdp_v2, s)) {
+ if (sz == 3 && !dc_isar_feature(aa32_fpdp_v2, s)) {
+ return false;
+ }
+
+ if (sz == 1 && !dc_isar_feature(aa32_fp16_arith, s)) {
 return false;
 }

 /* UNDEF accesses to D16-D31 if they don't exist */
- if (dp && !dc_isar_feature(aa32_simd_r32, s) &&
+ if (sz == 3 && !dc_isar_feature(aa32_simd_r32, s) &&
 ((a->vm | a->vd) & 0x10)) {
 return false;
 }
@@ -XXX,XX +XXX,XX @@ static bool trans_VRINT(DisasContext *s, arg_VRINT *a)
 return true;
 }

- fpst = fpstatus_ptr(FPST_FPCR);
+ if (sz == 1) {
+ fpst = fpstatus_ptr(FPST_FPCR_F16);
+ } else {
+ fpst = fpstatus_ptr(FPST_FPCR);
+ }

 tcg_rmode = tcg_const_i32(arm_rmode_to_sf(rounding));
 gen_helper_set_rmode(tcg_rmode, tcg_rmode, fpst);

- if (dp) {
+ if (sz == 3) {
 TCGv_i64 tcg_op;
 TCGv_i64 tcg_res;
 tcg_op = tcg_temp_new_i64();
@@ -XXX,XX +XXX,XX @@ static bool trans_VRINT(DisasContext *s, arg_VRINT *a)
 tcg_op = tcg_temp_new_i32();
 tcg_res = tcg_temp_new_i32();
 neon_load_reg32(tcg_op, rm);
- gen_helper_rints(tcg_res, tcg_op, fpst);
+ if (sz == 1) {
+ gen_helper_rinth(tcg_res, tcg_op, fpst);
+ } else {
+ gen_helper_rints(tcg_res, tcg_op, fpst);
+ }
 neon_store_reg32(tcg_res, rd);
 tcg_temp_free_i32(tcg_op);
 tcg_temp_free_i32(tcg_res);
@@ -XXX,XX +XXX,XX @@ static bool trans_VCVT_f16_f64(DisasContext *s, arg_VCVT_f16_f64 *a)
 return true;
}

+static bool trans_VRINTR_hp(DisasContext *s, arg_VRINTR_sp *a)
+{
+ TCGv_ptr fpst;
+ TCGv_i32 tmp;
+
+ if (!dc_isar_feature(aa32_fp16_arith, s)) {
+ return false;
+ }
+
+ if (!vfp_access_check(s)) {
+ return true;
+ }
+
+ tmp = tcg_temp_new_i32();
+ neon_load_reg32(tmp, a->vm);
+ fpst = fpstatus_ptr(FPST_FPCR_F16);
+ gen_helper_rinth(tmp, tmp, fpst);
+ neon_store_reg32(tmp, a->vd);
+ tcg_temp_free_ptr(fpst);
+ tcg_temp_free_i32(tmp);
+ return true;
+}
+
static bool trans_VRINTR_sp(DisasContext *s, arg_VRINTR_sp *a)
{
 TCGv_ptr fpst;
@@ -XXX,XX +XXX,XX @@ static bool trans_VRINTR_dp(DisasContext *s, arg_VRINTR_dp *a)
 return true;
}

+static bool trans_VRINTZ_hp(DisasContext *s, arg_VRINTZ_sp *a)
+{
+ TCGv_ptr fpst;
+ TCGv_i32 tmp;
+ TCGv_i32 tcg_rmode;
+
+ if (!dc_isar_feature(aa32_fp16_arith, s)) {
+ return false;
+ }
+
+ if (!vfp_access_check(s)) {
+ return true;
+ }
+
+ tmp = tcg_temp_new_i32();
+ neon_load_reg32(tmp, a->vm);
+ fpst = fpstatus_ptr(FPST_FPCR_F16);
+ tcg_rmode = tcg_const_i32(float_round_to_zero);
+ gen_helper_set_rmode(tcg_rmode, tcg_rmode, fpst);
+ gen_helper_rinth(tmp, tmp, fpst);
+ gen_helper_set_rmode(tcg_rmode, tcg_rmode, fpst);
+ neon_store_reg32(tmp, a->vd);
+ tcg_temp_free_ptr(fpst);
+ tcg_temp_free_i32(tcg_rmode);
+ tcg_temp_free_i32(tmp);
+ return true;
+}
+
static bool trans_VRINTZ_sp(DisasContext *s, arg_VRINTZ_sp *a)
{
 TCGv_ptr fpst;
@@ -XXX,XX +XXX,XX @@ static bool trans_VRINTZ_dp(DisasContext *s, arg_VRINTZ_dp *a)
 return true;
}

+static bool trans_VRINTX_hp(DisasContext *s, arg_VRINTX_sp *a)
+{
+ TCGv_ptr fpst;
+ TCGv_i32 tmp;
+
+ if (!dc_isar_feature(aa32_fp16_arith, s)) {
+ return false;
+ }
+
+ if (!vfp_access_check(s)) {
+ return true;
+ }
+
+ tmp = tcg_temp_new_i32();
+ neon_load_reg32(tmp, a->vm);
+ fpst = fpstatus_ptr(FPST_FPCR_F16);
+ gen_helper_rinth_exact(tmp, tmp, fpst);
+ neon_store_reg32(tmp, a->vd);
+ tcg_temp_free_ptr(fpst);
+ tcg_temp_free_i32(tmp);
+ return true;
+}
+
static bool trans_VRINTX_sp(DisasContext *s, arg_VRINTX_sp *a)
{
 TCGv_ptr fpst;
--
2.20.1


Implement the MVE shifts by register, which perform
shifts on a single general-purpose register.

Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Message-id: 20210628135835.6690-19-peter.maydell@linaro.org
---
target/arm/helper-mve.h | 2 ++
target/arm/translate.h | 1 +
target/arm/t32.decode | 18 ++++++++++++++----
target/arm/mve_helper.c | 10 ++++++++++
target/arm/translate.c | 30 ++++++++++++++++++++++++++++++
5 files changed, 57 insertions(+), 4 deletions(-)

diff --git a/target/arm/helper-mve.h b/target/arm/helper-mve.h
index XXXXXXX..XXXXXXX 100644
--- a/target/arm/helper-mve.h
+++ b/target/arm/helper-mve.h
@@ -XXX,XX +XXX,XX @@ DEF_HELPER_FLAGS_3(mve_uqrshll48, TCG_CALL_NO_RWG, i64, env, i64, i32)

DEF_HELPER_FLAGS_3(mve_uqshl, TCG_CALL_NO_RWG, i32, env, i32, i32)
DEF_HELPER_FLAGS_3(mve_sqshl, TCG_CALL_NO_RWG, i32, env, i32, i32)
+DEF_HELPER_FLAGS_3(mve_uqrshl, TCG_CALL_NO_RWG, i32, env, i32, i32)
+DEF_HELPER_FLAGS_3(mve_sqrshr, TCG_CALL_NO_RWG, i32, env, i32, i32)
diff --git a/target/arm/translate.h b/target/arm/translate.h
index XXXXXXX..XXXXXXX 100644
--- a/target/arm/translate.h
+++ b/target/arm/translate.h
@@ -XXX,XX +XXX,XX @@ typedef void AtomicThreeOpFn(TCGv_i64, TCGv_i64, TCGv_i64, TCGArg, MemOp);
typedef void WideShiftImmFn(TCGv_i64, TCGv_i64, int64_t shift);
typedef void WideShiftFn(TCGv_i64, TCGv_ptr, TCGv_i64, TCGv_i32);
typedef void ShiftImmFn(TCGv_i32, TCGv_i32, int32_t shift);
+typedef void ShiftFn(TCGv_i32, TCGv_ptr, TCGv_i32, TCGv_i32);

/**
 * arm_tbflags_from_tb:
diff --git a/target/arm/t32.decode b/target/arm/t32.decode
index XXXXXXX..XXXXXXX 100644
--- a/target/arm/t32.decode
+++ b/target/arm/t32.decode
@@ -XXX,XX +XXX,XX @@
&mve_shl_ri rdalo rdahi shim
&mve_shl_rr rdalo rdahi rm
&mve_sh_ri rda shim
+&mve_sh_rr rda rm

# rdahi: bits [3:1] from insn, bit 0 is 1
# rdalo: bits [3:1] from insn, bit 0 is 0
@@ -XXX,XX +XXX,XX @@
 &mve_shl_rr rdalo=%rdalo_17 rdahi=%rdahi_9
@mve_sh_ri ....... .... . rda:4 . ... ... . .. .. .... \
 &mve_sh_ri shim=%imm5_12_6
+@mve_sh_rr ....... .... . rda:4 rm:4 .... .... .... &mve_sh_rr

{
 TST_xrri 1110101 0000 1 .... 0 ... 1111 .... .... @S_xrr_shi
@@ -XXX,XX +XXX,XX @@ BIC_rrri 1110101 0001 . .... 0 ... .... .... .... @s_rrr_shi
 SQSHLL_ri 1110101 0010 1 ... 1 0 ... ... 1 .. 11 1111 @mve_shl_ri
 }

- LSLL_rr 1110101 0010 1 ... 0 .... ... 1 0000 1101 @mve_shl_rr
- ASRL_rr 1110101 0010 1 ... 0 .... ... 1 0010 1101 @mve_shl_rr
- UQRSHLL64_rr 1110101 0010 1 ... 1 .... ... 1 0000 1101 @mve_shl_rr
- SQRSHRL64_rr 1110101 0010 1 ... 1 .... ... 1 0010 1101 @mve_shl_rr
+ {
+ UQRSHL_rr 1110101 0010 1 .... .... 1111 0000 1101 @mve_sh_rr
+ LSLL_rr 1110101 0010 1 ... 0 .... ... 1 0000 1101 @mve_shl_rr
+ UQRSHLL64_rr 1110101 0010 1 ... 1 .... ... 1 0000 1101 @mve_shl_rr
+ }
+
+ {
+ SQRSHR_rr 1110101 0010 1 .... .... 1111 0010 1101 @mve_sh_rr
+ ASRL_rr 1110101 0010 1 ... 0 .... ... 1 0010 1101 @mve_shl_rr
+ SQRSHRL64_rr 1110101 0010 1 ... 1 .... ... 1 0010 1101 @mve_shl_rr
+ }
+
 UQRSHLL48_rr 1110101 0010 1 ... 1 .... ... 1 1000 1101 @mve_shl_rr
 SQRSHRL48_rr 1110101 0010 1 ... 1 .... ... 1 1010 1101 @mve_shl_rr
 ]
diff --git a/target/arm/mve_helper.c b/target/arm/mve_helper.c
index XXXXXXX..XXXXXXX 100644
--- a/target/arm/mve_helper.c
+++ b/target/arm/mve_helper.c
@@ -XXX,XX +XXX,XX @@ uint32_t HELPER(mve_sqshl)(CPUARMState *env, uint32_t n, uint32_t shift)
{
 return do_sqrshl_bhs(n, (int8_t)shift, 32, false, &env->QF);
}
+
+uint32_t HELPER(mve_uqrshl)(CPUARMState *env, uint32_t n, uint32_t shift)
+{
+ return do_uqrshl_bhs(n, (int8_t)shift, 32, true, &env->QF);
+}
+
+uint32_t HELPER(mve_sqrshr)(CPUARMState *env, uint32_t n, uint32_t shift)
+{
+ return do_sqrshl_bhs(n, -(int8_t)shift, 32, true, &env->QF);
+}
diff --git a/target/arm/translate.c b/target/arm/translate.c
index XXXXXXX..XXXXXXX 100644
--- a/target/arm/translate.c
+++ b/target/arm/translate.c
@@ -XXX,XX +XXX,XX @@ static bool trans_UQSHL_ri(DisasContext *s, arg_mve_sh_ri *a)
 return do_mve_sh_ri(s, a, gen_mve_uqshl);
}

+static bool do_mve_sh_rr(DisasContext *s, arg_mve_sh_rr *a, ShiftFn *fn)
+{
+ if (!arm_dc_feature(s, ARM_FEATURE_V8_1M)) {
+ /* Decode falls through to ORR/MOV UNPREDICTABLE handling */
+ return false;
+ }
+ if (!dc_isar_feature(aa32_mve, s) ||
+ !arm_dc_feature(s, ARM_FEATURE_M_MAIN) ||
+ a->rda == 13 || a->rda == 15 || a->rm == 13 || a->rm == 15 ||
+ a->rm == a->rda) {
+ /* These rda/rm cases are UNPREDICTABLE; we choose to UNDEF */
+ unallocated_encoding(s);
+ return true;
+ }
+
+ /* The helper takes care of the sign-extension of the low 8 bits of Rm */
+ fn(cpu_R[a->rda], cpu_env, cpu_R[a->rda], cpu_R[a->rm]);
+ return true;
+}
+
+static bool trans_SQRSHR_rr(DisasContext *s, arg_mve_sh_rr *a)
+{
+ return do_mve_sh_rr(s, a, gen_helper_mve_sqrshr);
+}
+
+static bool trans_UQRSHL_rr(DisasContext *s, arg_mve_sh_rr *a)
+{
+ return do_mve_sh_rr(s, a, gen_helper_mve_uqrshl);
+}
+
/*
 * Multiply and multiply accumulate
 */
--
2.20.1

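A note on the SQRSHR semantics above, with a tiny C model. The helper call is
do_sqrshl_bhs(n, -(int8_t)shift, 32, true, &env->QF): the low byte of Rm is
sign-extended and negated, so a positive Rm means a rounding arithmetic right
shift. A rough model for the common 1 <= sh <= 31 case (no saturation or QF
update, and do_sqrshl_bhs itself is not shown in this series extract):

#include <stdint.h>

static int32_t sqrshr_core(int32_t rda, int sh)    /* 1 <= sh <= 31 */
{
    /* same round-to-nearest-ties-up trick as do_sqrshl48_d() earlier:
     * shift by sh-1, then add back the bit that falls off */
    return (rda >> sh) + ((rda >> (sh - 1)) & 1);
}

A negative count in Rm instead shifts left (saturating), which is why the
helper can serve both directions through a single signed shift argument.
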
Deleted patch
The fp16 extension includes a new instruction VMOVX, which copies the
upper 16 bits of a 32-bit source VFP register into the lower 16
bits of the destination and zeroes the high half of the destination.
Implement it.

Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Message-id: 20200828183354.27913-21-peter.maydell@linaro.org
---
target/arm/vfp-uncond.decode | 3 +++
target/arm/translate-vfp.c.inc | 25 +++++++++++++++++++++++++
2 files changed, 28 insertions(+)

diff --git a/target/arm/vfp-uncond.decode b/target/arm/vfp-uncond.decode
index XXXXXXX..XXXXXXX 100644
--- a/target/arm/vfp-uncond.decode
+++ b/target/arm/vfp-uncond.decode
@@ -XXX,XX +XXX,XX @@ VCVT 1111 1110 1.11 11 rm:2 .... 1010 op:1 1.0 .... \
VCVT 1111 1110 1.11 11 rm:2 .... 1011 op:1 1.0 .... \
 vm=%vm_dp vd=%vd_sp sz=3

+VMOVX 1111 1110 1.11 0000 .... 1010 01 . 0 .... \
+ vd=%vd_sp vm=%vm_sp
+
VINS 1111 1110 1.11 0000 .... 1010 11 . 0 .... \
 vd=%vd_sp vm=%vm_sp
diff --git a/target/arm/translate-vfp.c.inc b/target/arm/translate-vfp.c.inc
index XXXXXXX..XXXXXXX 100644
--- a/target/arm/translate-vfp.c.inc
+++ b/target/arm/translate-vfp.c.inc
@@ -XXX,XX +XXX,XX @@ static bool trans_VINS(DisasContext *s, arg_VINS *a)
 tcg_temp_free_i32(rd);
 return true;
}
+
+static bool trans_VMOVX(DisasContext *s, arg_VINS *a)
+{
+ TCGv_i32 rm;
+
+ if (!dc_isar_feature(aa32_fp16_arith, s)) {
+ return false;
+ }
+
+ if (s->vec_len != 0 || s->vec_stride != 0) {
+ return false;
+ }
+
+ if (!vfp_access_check(s)) {
+ return true;
+ }
+
+ /* Set Vd to high half of Vm */
+ rm = tcg_temp_new_i32();
+ neon_load_reg32(rm, a->vm);
+ tcg_gen_shri_i32(rm, rm, 16);
+ neon_store_reg32(rm, a->vd);
+ tcg_temp_free_i32(rm);
+ return true;
+}
--
2.20.1

diff view generated by jsdifflib
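Viewed as plain bit manipulation, VMOVX reduces to a single logical right shift of the 32-bit register: the shift both moves the top half down and leaves the destination's top half zero, which is why one tcg_gen_shri_i32() suffices. A standalone sketch of the semantics:

    #include <stdint.h>

    /* VMOVX: Vd = zero_extend(Vm[31:16]) */
    static uint32_t vmovx(uint32_t vm)
    {
        return vm >> 16;   /* logical shift: high half becomes zero */
    }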
Deleted patch
Implement FP16 support for the Neon insns which use the DO_3S_FP_GVEC
macro: VADD, VSUB, VABD, VMUL.

For VABD this requires us to implement a new gvec_fabd_h helper
using the machinery we have already for the other helpers.

Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Message-id: 20200828183354.27913-24-peter.maydell@linaro.org
---
 target/arm/helper.h             |  1 +
 target/arm/vec_helper.c         |  6 ++++++
 target/arm/translate-neon.c.inc | 36 +++++++++++++++++----------------
 3 files changed, 26 insertions(+), 17 deletions(-)

diff --git a/target/arm/helper.h b/target/arm/helper.h
index XXXXXXX..XXXXXXX 100644
--- a/target/arm/helper.h
+++ b/target/arm/helper.h
@@ -XXX,XX +XXX,XX @@ DEF_HELPER_FLAGS_5(gvec_fmul_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
 DEF_HELPER_FLAGS_5(gvec_fmul_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
 DEF_HELPER_FLAGS_5(gvec_fmul_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)

+DEF_HELPER_FLAGS_5(gvec_fabd_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
 DEF_HELPER_FLAGS_5(gvec_fabd_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)

 DEF_HELPER_FLAGS_5(gvec_ftsmul_h, TCG_CALL_NO_RWG,
diff --git a/target/arm/vec_helper.c b/target/arm/vec_helper.c
index XXXXXXX..XXXXXXX 100644
--- a/target/arm/vec_helper.c
+++ b/target/arm/vec_helper.c
@@ -XXX,XX +XXX,XX @@ static float64 float64_ftsmul(float64 op1, uint64_t op2, float_status *stat)
     return result;
 }

+static float16 float16_abd(float16 op1, float16 op2, float_status *stat)
+{
+    return float16_abs(float16_sub(op1, op2, stat));
+}
+
 static float32 float32_abd(float32 op1, float32 op2, float_status *stat)
 {
     return float32_abs(float32_sub(op1, op2, stat));
@@ -XXX,XX +XXX,XX @@ DO_3OP(gvec_ftsmul_h, float16_ftsmul, float16)
 DO_3OP(gvec_ftsmul_s, float32_ftsmul, float32)
 DO_3OP(gvec_ftsmul_d, float64_ftsmul, float64)

+DO_3OP(gvec_fabd_h, float16_abd, float16)
 DO_3OP(gvec_fabd_s, float32_abd, float32)

 #ifdef TARGET_AARCH64
diff --git a/target/arm/translate-neon.c.inc b/target/arm/translate-neon.c.inc
index XXXXXXX..XXXXXXX 100644
--- a/target/arm/translate-neon.c.inc
+++ b/target/arm/translate-neon.c.inc
@@ -XXX,XX +XXX,XX @@ static bool do_3same_fp(DisasContext *s, arg_3same *a, VFPGen3OpSPFn *fn,
     return true;
 }

-/*
- * For all the functions using this macro, size == 1 means fp16,
- * which is an architecture extension we don't implement yet.
- */
-#define DO_3S_FP_GVEC(INSN,FUNC) \
-    static void gen_##INSN##_3s(unsigned vece, uint32_t rd_ofs, \
-                                uint32_t rn_ofs, uint32_t rm_ofs, \
-                                uint32_t oprsz, uint32_t maxsz) \
+#define WRAP_FP_GVEC(WRAPNAME, FPST, FUNC) \
+    static void WRAPNAME(unsigned vece, uint32_t rd_ofs, \
+                         uint32_t rn_ofs, uint32_t rm_ofs, \
+                         uint32_t oprsz, uint32_t maxsz) \
     { \
-        TCGv_ptr fpst = fpstatus_ptr(FPST_STD); \
+        TCGv_ptr fpst = fpstatus_ptr(FPST); \
         tcg_gen_gvec_3_ptr(rd_ofs, rn_ofs, rm_ofs, fpst, \
                            oprsz, maxsz, 0, FUNC); \
         tcg_temp_free_ptr(fpst); \
-    } \
+    }
+
+#define DO_3S_FP_GVEC(INSN,SFUNC,HFUNC) \
+    WRAP_FP_GVEC(gen_##INSN##_fp32_3s, FPST_STD, SFUNC) \
+    WRAP_FP_GVEC(gen_##INSN##_fp16_3s, FPST_STD_F16, HFUNC) \
     static bool trans_##INSN##_fp_3s(DisasContext *s, arg_3same *a) \
     { \
         if (a->size != 0) { \
-            /* TODO fp16 support */ \
-            return false; \
+            if (!dc_isar_feature(aa32_fp16_arith, s)) { \
+                return false; \
+            } \
+            return do_3same(s, a, gen_##INSN##_fp16_3s); \
         } \
-        return do_3same(s, a, gen_##INSN##_3s); \
+        return do_3same(s, a, gen_##INSN##_fp32_3s); \
     }


-DO_3S_FP_GVEC(VADD, gen_helper_gvec_fadd_s)
-DO_3S_FP_GVEC(VSUB, gen_helper_gvec_fsub_s)
-DO_3S_FP_GVEC(VABD, gen_helper_gvec_fabd_s)
-DO_3S_FP_GVEC(VMUL, gen_helper_gvec_fmul_s)
+DO_3S_FP_GVEC(VADD, gen_helper_gvec_fadd_s, gen_helper_gvec_fadd_h)
+DO_3S_FP_GVEC(VSUB, gen_helper_gvec_fsub_s, gen_helper_gvec_fsub_h)
+DO_3S_FP_GVEC(VABD, gen_helper_gvec_fabd_s, gen_helper_gvec_fabd_h)
+DO_3S_FP_GVEC(VMUL, gen_helper_gvec_fmul_s, gen_helper_gvec_fmul_h)

 /*
  * For all the functions using this macro, size == 1 means fp16,
--
2.20.1
Deleted patch
We already have gvec helpers for floating point VRECPE and
VRSQRTE, so convert the Neon decoder to use them and
add fp16 support.

Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Message-id: 20200828183354.27913-25-peter.maydell@linaro.org
---
 target/arm/translate-neon.c.inc | 31 +++++++++++++++++++++++++++++--
 1 file changed, 29 insertions(+), 2 deletions(-)

diff --git a/target/arm/translate-neon.c.inc b/target/arm/translate-neon.c.inc
index XXXXXXX..XXXXXXX 100644
--- a/target/arm/translate-neon.c.inc
+++ b/target/arm/translate-neon.c.inc
@@ -XXX,XX +XXX,XX @@ static bool do_2misc_fp(DisasContext *s, arg_2misc *a,
         return do_2misc_fp(s, a, FUNC); \
     }

-DO_2MISC_FP(VRECPE_F, gen_helper_recpe_f32)
-DO_2MISC_FP(VRSQRTE_F, gen_helper_rsqrte_f32)
 DO_2MISC_FP(VCVT_FS, gen_helper_vfp_sitos)
 DO_2MISC_FP(VCVT_FU, gen_helper_vfp_uitos)
 DO_2MISC_FP(VCVT_SF, gen_helper_vfp_tosizs)
 DO_2MISC_FP(VCVT_UF, gen_helper_vfp_touizs)

+#define DO_2MISC_FP_VEC(INSN, HFUNC, SFUNC) \
+    static void gen_##INSN(unsigned vece, uint32_t rd_ofs, \
+                           uint32_t rm_ofs, \
+                           uint32_t oprsz, uint32_t maxsz) \
+    { \
+        static gen_helper_gvec_2_ptr * const fns[4] = { \
+            NULL, HFUNC, SFUNC, NULL, \
+        }; \
+        TCGv_ptr fpst; \
+        fpst = fpstatus_ptr(vece == MO_16 ? FPST_STD_F16 : FPST_STD); \
+        tcg_gen_gvec_2_ptr(rd_ofs, rm_ofs, fpst, oprsz, maxsz, 0, \
+                           fns[vece]); \
+        tcg_temp_free_ptr(fpst); \
+    } \
+    static bool trans_##INSN(DisasContext *s, arg_2misc *a) \
+    { \
+        if (a->size == MO_16) { \
+            if (!dc_isar_feature(aa32_fp16_arith, s)) { \
+                return false; \
+            } \
+        } else if (a->size != MO_32) { \
+            return false; \
+        } \
+        return do_2misc_vec(s, a, gen_##INSN); \
+    }
+
+DO_2MISC_FP_VEC(VRECPE_F, gen_helper_gvec_frecpe_h, gen_helper_gvec_frecpe_s)
+DO_2MISC_FP_VEC(VRSQRTE_F, gen_helper_gvec_frsqrte_h, gen_helper_gvec_frsqrte_s)
+
 static bool trans_VRINTX(DisasContext *s, arg_2misc *a)
 {
     if (!arm_dc_feature(s, ARM_FEATURE_V8)) {
--
2.20.1
Deleted patch
Rewrite Neon VABS/VNEG of floats to use gvec logical AND and XOR, so
that we can implement the fp16 version of the insns.

Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Message-id: 20200828183354.27913-26-peter.maydell@linaro.org
---
 target/arm/translate-neon.c.inc | 34 +++++++++++++++++++++++++++------
 1 file changed, 28 insertions(+), 6 deletions(-)

diff --git a/target/arm/translate-neon.c.inc b/target/arm/translate-neon.c.inc
index XXXXXXX..XXXXXXX 100644
--- a/target/arm/translate-neon.c.inc
+++ b/target/arm/translate-neon.c.inc
@@ -XXX,XX +XXX,XX @@ static bool trans_VCNT(DisasContext *s, arg_2misc *a)
     return do_2misc(s, a, gen_helper_neon_cnt_u8);
 }

+static void gen_VABS_F(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs,
+                       uint32_t oprsz, uint32_t maxsz)
+{
+    tcg_gen_gvec_andi(vece, rd_ofs, rm_ofs,
+                      vece == MO_16 ? 0x7fff : 0x7fffffff,
+                      oprsz, maxsz);
+}
+
 static bool trans_VABS_F(DisasContext *s, arg_2misc *a)
 {
-    if (a->size != 2) {
+    if (a->size == MO_16) {
+        if (!dc_isar_feature(aa32_fp16_arith, s)) {
+            return false;
+        }
+    } else if (a->size != MO_32) {
         return false;
     }
-    /* TODO: FP16 : size == 1 */
-    return do_2misc(s, a, gen_helper_vfp_abss);
+    return do_2misc_vec(s, a, gen_VABS_F);
+}
+
+static void gen_VNEG_F(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs,
+                       uint32_t oprsz, uint32_t maxsz)
+{
+    tcg_gen_gvec_xori(vece, rd_ofs, rm_ofs,
+                      vece == MO_16 ? 0x8000 : 0x80000000,
+                      oprsz, maxsz);
 }

 static bool trans_VNEG_F(DisasContext *s, arg_2misc *a)
 {
-    if (a->size != 2) {
+    if (a->size == MO_16) {
+        if (!dc_isar_feature(aa32_fp16_arith, s)) {
+            return false;
+        }
+    } else if (a->size != MO_32) {
         return false;
     }
-    /* TODO: FP16 : size == 1 */
-    return do_2misc(s, a, gen_helper_vfp_negs);
+    return do_2misc_vec(s, a, gen_VNEG_F);
 }

 static bool trans_VRECPE(DisasContext *s, arg_2misc *a)
--
2.20.1
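The AND/XOR rewrite is sound because IEEE abs and negate are defined to touch only the sign bit, so they are exact as pure integer operations on the raw encoding, NaNs included. In scalar form (fp16 shown; fp32 uses masks 0x7fffffff and 0x80000000):

    #include <stdint.h>

    static uint16_t f16_abs(uint16_t h) { return h & 0x7fff; } /* clear sign */
    static uint16_t f16_neg(uint16_t h) { return h ^ 0x8000; } /* flip sign  */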
Deleted patch
Convert the Neon floating-point vector comparison ops VCEQ,
VCGE and VCGT over to using a gvec helper and use this to
implement the fp16 case.

(We put the float16_ceq() etc functions above the DO_2OP()
macro definition because later when we convert the
compare-against-zero instructions we'll want their
definitions to be visible at that point in the source file.)

Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Message-id: 20200828183354.27913-27-peter.maydell@linaro.org
---
 target/arm/helper.h             |  9 +++++++
 target/arm/vec_helper.c         | 44 +++++++++++++++++++++++++++++++++
 target/arm/translate-neon.c.inc |  6 ++---
 3 files changed, 56 insertions(+), 3 deletions(-)

diff --git a/target/arm/helper.h b/target/arm/helper.h
index XXXXXXX..XXXXXXX 100644
--- a/target/arm/helper.h
+++ b/target/arm/helper.h
@@ -XXX,XX +XXX,XX @@ DEF_HELPER_FLAGS_5(gvec_fmul_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
 DEF_HELPER_FLAGS_5(gvec_fabd_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
 DEF_HELPER_FLAGS_5(gvec_fabd_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)

+DEF_HELPER_FLAGS_5(gvec_fceq_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(gvec_fceq_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_5(gvec_fcge_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(gvec_fcge_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_5(gvec_fcgt_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(gvec_fcgt_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
+
 DEF_HELPER_FLAGS_5(gvec_ftsmul_h, TCG_CALL_NO_RWG,
                    void, ptr, ptr, ptr, ptr, i32)
 DEF_HELPER_FLAGS_5(gvec_ftsmul_s, TCG_CALL_NO_RWG,
diff --git a/target/arm/vec_helper.c b/target/arm/vec_helper.c
index XXXXXXX..XXXXXXX 100644
--- a/target/arm/vec_helper.c
+++ b/target/arm/vec_helper.c
@@ -XXX,XX +XXX,XX @@ void HELPER(gvec_fcmlad)(void *vd, void *vn, void *vm,
     clear_tail(d, opr_sz, simd_maxsz(desc));
 }

+/*
+ * Floating point comparisons producing an integer result (all 1s or all 0s).
+ * Note that EQ doesn't signal InvalidOp for QNaNs but GE and GT do.
+ * Softfloat routines return 0/1, which we convert to the 0/-1 Neon requires.
+ */
+static uint16_t float16_ceq(float16 op1, float16 op2, float_status *stat)
+{
+    return -float16_eq_quiet(op1, op2, stat);
+}
+
+static uint32_t float32_ceq(float32 op1, float32 op2, float_status *stat)
+{
+    return -float32_eq_quiet(op1, op2, stat);
+}
+
+static uint16_t float16_cge(float16 op1, float16 op2, float_status *stat)
+{
+    return -float16_le(op2, op1, stat);
+}
+
+static uint32_t float32_cge(float32 op1, float32 op2, float_status *stat)
+{
+    return -float32_le(op2, op1, stat);
+}
+
+static uint16_t float16_cgt(float16 op1, float16 op2, float_status *stat)
+{
+    return -float16_lt(op2, op1, stat);
+}
+
+static uint32_t float32_cgt(float32 op1, float32 op2, float_status *stat)
+{
+    return -float32_lt(op2, op1, stat);
+}
+
 #define DO_2OP(NAME, FUNC, TYPE) \
 void HELPER(NAME)(void *vd, void *vn, void *stat, uint32_t desc) \
 { \
@@ -XXX,XX +XXX,XX @@ DO_3OP(gvec_ftsmul_d, float64_ftsmul, float64)
 DO_3OP(gvec_fabd_h, float16_abd, float16)
 DO_3OP(gvec_fabd_s, float32_abd, float32)

+DO_3OP(gvec_fceq_h, float16_ceq, float16)
+DO_3OP(gvec_fceq_s, float32_ceq, float32)
+
+DO_3OP(gvec_fcge_h, float16_cge, float16)
+DO_3OP(gvec_fcge_s, float32_cge, float32)
+
+DO_3OP(gvec_fcgt_h, float16_cgt, float16)
+DO_3OP(gvec_fcgt_s, float32_cgt, float32)
+
 #ifdef TARGET_AARCH64

 DO_3OP(gvec_recps_h, helper_recpsf_f16, float16)
diff --git a/target/arm/translate-neon.c.inc b/target/arm/translate-neon.c.inc
index XXXXXXX..XXXXXXX 100644
--- a/target/arm/translate-neon.c.inc
+++ b/target/arm/translate-neon.c.inc
@@ -XXX,XX +XXX,XX @@ DO_3S_FP_GVEC(VADD, gen_helper_gvec_fadd_s, gen_helper_gvec_fadd_h)
 DO_3S_FP_GVEC(VSUB, gen_helper_gvec_fsub_s, gen_helper_gvec_fsub_h)
 DO_3S_FP_GVEC(VABD, gen_helper_gvec_fabd_s, gen_helper_gvec_fabd_h)
 DO_3S_FP_GVEC(VMUL, gen_helper_gvec_fmul_s, gen_helper_gvec_fmul_h)
+DO_3S_FP_GVEC(VCEQ, gen_helper_gvec_fceq_s, gen_helper_gvec_fceq_h)
+DO_3S_FP_GVEC(VCGE, gen_helper_gvec_fcge_s, gen_helper_gvec_fcge_h)
+DO_3S_FP_GVEC(VCGT, gen_helper_gvec_fcgt_s, gen_helper_gvec_fcgt_h)

 /*
  * For all the functions using this macro, size == 1 means fp16,
@@ -XXX,XX +XXX,XX @@ DO_3S_FP_GVEC(VMUL, gen_helper_gvec_fmul_s, gen_helper_gvec_fmul_h)
     return do_3same_fp(s, a, FUNC, READS_VD); \
 }

-DO_3S_FP(VCEQ, gen_helper_neon_ceq_f32, false)
-DO_3S_FP(VCGE, gen_helper_neon_cge_f32, false)
-DO_3S_FP(VCGT, gen_helper_neon_cgt_f32, false)
 DO_3S_FP(VACGE, gen_helper_neon_acge_f32, false)
 DO_3S_FP(VACGT, gen_helper_neon_acgt_f32, false)
 DO_3S_FP(VMAX, gen_helper_vfp_maxs, false)
--
2.20.1
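The unary minus in float16_ceq() and friends is mask generation: the softfloat predicates return exactly 0 or 1, and negating in an unsigned result type maps 1 to all-ones. A self-contained illustration:

    #include <assert.h>
    #include <stdint.h>

    int main(void)
    {
        uint16_t lane_true = -(uint16_t)1;    /* 0xffff: all bits set   */
        uint16_t lane_false = -(uint16_t)0;   /* 0x0000: all bits clear */

        assert(lane_true == 0xffff);
        assert(lane_false == 0x0000);
        return 0;
    }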
Deleted patch
Convert the Neon floating-point VMAXNM and VMINNM insns to
use a gvec helper and use this to implement the fp16 case.

Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Message-id: 20200828183354.27913-30-peter.maydell@linaro.org
---
 target/arm/helper.h             |  6 ++++++
 target/arm/vec_helper.c         |  6 ++++++
 target/arm/translate-neon.c.inc | 23 +++++++++++++++--------
 3 files changed, 27 insertions(+), 8 deletions(-)

diff --git a/target/arm/helper.h b/target/arm/helper.h
index XXXXXXX..XXXXXXX 100644
--- a/target/arm/helper.h
+++ b/target/arm/helper.h
@@ -XXX,XX +XXX,XX @@ DEF_HELPER_FLAGS_5(gvec_fmax_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
 DEF_HELPER_FLAGS_5(gvec_fmin_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
 DEF_HELPER_FLAGS_5(gvec_fmin_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)

+DEF_HELPER_FLAGS_5(gvec_fmaxnum_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(gvec_fmaxnum_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_5(gvec_fminnum_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(gvec_fminnum_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
+
 DEF_HELPER_FLAGS_5(gvec_ftsmul_h, TCG_CALL_NO_RWG,
                    void, ptr, ptr, ptr, ptr, i32)
 DEF_HELPER_FLAGS_5(gvec_ftsmul_s, TCG_CALL_NO_RWG,
diff --git a/target/arm/vec_helper.c b/target/arm/vec_helper.c
index XXXXXXX..XXXXXXX 100644
--- a/target/arm/vec_helper.c
+++ b/target/arm/vec_helper.c
@@ -XXX,XX +XXX,XX @@ DO_3OP(gvec_fmax_s, float32_max, float32)
 DO_3OP(gvec_fmin_h, float16_min, float16)
 DO_3OP(gvec_fmin_s, float32_min, float32)

+DO_3OP(gvec_fmaxnum_h, float16_maxnum, float16)
+DO_3OP(gvec_fmaxnum_s, float32_maxnum, float32)
+
+DO_3OP(gvec_fminnum_h, float16_minnum, float16)
+DO_3OP(gvec_fminnum_s, float32_minnum, float32)
+
 #ifdef TARGET_AARCH64

 DO_3OP(gvec_recps_h, helper_recpsf_f16, float16)
diff --git a/target/arm/translate-neon.c.inc b/target/arm/translate-neon.c.inc
index XXXXXXX..XXXXXXX 100644
--- a/target/arm/translate-neon.c.inc
+++ b/target/arm/translate-neon.c.inc
@@ -XXX,XX +XXX,XX @@ static void gen_VMLS_fp_3s(TCGv_i32 vd, TCGv_i32 vn, TCGv_i32 vm,
 DO_3S_FP(VMLA, gen_VMLA_fp_3s, true)
 DO_3S_FP(VMLS, gen_VMLS_fp_3s, true)

+WRAP_FP_GVEC(gen_VMAXNM_fp32_3s, FPST_STD, gen_helper_gvec_fmaxnum_s)
+WRAP_FP_GVEC(gen_VMAXNM_fp16_3s, FPST_STD_F16, gen_helper_gvec_fmaxnum_h)
+WRAP_FP_GVEC(gen_VMINNM_fp32_3s, FPST_STD, gen_helper_gvec_fminnum_s)
+WRAP_FP_GVEC(gen_VMINNM_fp16_3s, FPST_STD_F16, gen_helper_gvec_fminnum_h)
+
 static bool trans_VMAXNM_fp_3s(DisasContext *s, arg_3same *a)
 {
     if (!arm_dc_feature(s, ARM_FEATURE_V8)) {
@@ -XXX,XX +XXX,XX @@ static bool trans_VMAXNM_fp_3s(DisasContext *s, arg_3same *a)
     }

     if (a->size != 0) {
-        /* TODO fp16 support */
-        return false;
+        if (!dc_isar_feature(aa32_fp16_arith, s)) {
+            return false;
+        }
+        return do_3same(s, a, gen_VMAXNM_fp16_3s);
     }
-
-    return do_3same_fp(s, a, gen_helper_vfp_maxnums, false);
+    return do_3same(s, a, gen_VMAXNM_fp32_3s);
 }

 static bool trans_VMINNM_fp_3s(DisasContext *s, arg_3same *a)
@@ -XXX,XX +XXX,XX @@ static bool trans_VMINNM_fp_3s(DisasContext *s, arg_3same *a)
     }

     if (a->size != 0) {
-        /* TODO fp16 support */
-        return false;
+        if (!dc_isar_feature(aa32_fp16_arith, s)) {
+            return false;
+        }
+        return do_3same(s, a, gen_VMINNM_fp16_3s);
     }
-
-    return do_3same_fp(s, a, gen_helper_vfp_minnums, false);
+    return do_3same(s, a, gen_VMINNM_fp32_3s);
 }

 WRAP_ENV_FN(gen_VRECPS_tramp, gen_helper_recps_f32)
--
2.20.1
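The reason VMAXNM/VMINNM get their own maxnum/minnum helpers rather than reusing float32_max(): VMAX/VMIN propagate NaNs, while the NM variants implement IEEE 754-2008 maxNum/minNum, where a quiet NaN loses to a number. C's fmax() happens to have the same maxNum behaviour, which gives a quick way to see the difference:

    #include <math.h>
    #include <stdio.h>

    int main(void)
    {
        /* maxNum semantics: the quiet NaN is discarded, the number wins */
        printf("%g\n", fmax(NAN, 1.5));   /* prints 1.5 */
        return 0;
    }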
Deleted patch
Convert the Neon floating-point VMLA and VMLS insns over to using a
gvec helper, and use this to implement the fp16 case.

Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Message-id: 20200828183354.27913-31-peter.maydell@linaro.org
---
 target/arm/helper.h             |  6 +++++
 target/arm/vec_helper.c         | 42 +++++++++++++++++++++++++++++++++
 target/arm/translate-neon.c.inc | 33 ++------------------------
 3 files changed, 50 insertions(+), 31 deletions(-)

diff --git a/target/arm/helper.h b/target/arm/helper.h
index XXXXXXX..XXXXXXX 100644
--- a/target/arm/helper.h
+++ b/target/arm/helper.h
@@ -XXX,XX +XXX,XX @@ DEF_HELPER_FLAGS_5(gvec_fmaxnum_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i3
 DEF_HELPER_FLAGS_5(gvec_fminnum_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
 DEF_HELPER_FLAGS_5(gvec_fminnum_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)

+DEF_HELPER_FLAGS_5(gvec_fmla_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(gvec_fmla_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_5(gvec_fmls_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(gvec_fmls_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
+
 DEF_HELPER_FLAGS_5(gvec_ftsmul_h, TCG_CALL_NO_RWG,
                    void, ptr, ptr, ptr, ptr, i32)
 DEF_HELPER_FLAGS_5(gvec_ftsmul_s, TCG_CALL_NO_RWG,
diff --git a/target/arm/vec_helper.c b/target/arm/vec_helper.c
index XXXXXXX..XXXXXXX 100644
--- a/target/arm/vec_helper.c
+++ b/target/arm/vec_helper.c
@@ -XXX,XX +XXX,XX @@ DO_3OP(gvec_rsqrts_d, helper_rsqrtsf_f64, float64)
 #endif
 #undef DO_3OP

+/* Non-fused multiply-add (unlike float16_muladd etc, which are fused) */
+static float16 float16_muladd_nf(float16 dest, float16 op1, float16 op2,
+                                 float_status *stat)
+{
+    return float16_add(dest, float16_mul(op1, op2, stat), stat);
+}
+
+static float32 float32_muladd_nf(float32 dest, float32 op1, float32 op2,
+                                 float_status *stat)
+{
+    return float32_add(dest, float32_mul(op1, op2, stat), stat);
+}
+
+static float16 float16_mulsub_nf(float16 dest, float16 op1, float16 op2,
+                                 float_status *stat)
+{
+    return float16_sub(dest, float16_mul(op1, op2, stat), stat);
+}
+
+static float32 float32_mulsub_nf(float32 dest, float32 op1, float32 op2,
+                                 float_status *stat)
+{
+    return float32_sub(dest, float32_mul(op1, op2, stat), stat);
+}
+
+#define DO_MULADD(NAME, FUNC, TYPE) \
+void HELPER(NAME)(void *vd, void *vn, void *vm, void *stat, uint32_t desc) \
+{ \
+    intptr_t i, oprsz = simd_oprsz(desc); \
+    TYPE *d = vd, *n = vn, *m = vm; \
+    for (i = 0; i < oprsz / sizeof(TYPE); i++) { \
+        d[i] = FUNC(d[i], n[i], m[i], stat); \
+    } \
+    clear_tail(d, oprsz, simd_maxsz(desc)); \
+}
+
+DO_MULADD(gvec_fmla_h, float16_muladd_nf, float16)
+DO_MULADD(gvec_fmla_s, float32_muladd_nf, float32)
+
+DO_MULADD(gvec_fmls_h, float16_mulsub_nf, float16)
+DO_MULADD(gvec_fmls_s, float32_mulsub_nf, float32)
+
 /* For the indexed ops, SVE applies the index per 128-bit vector segment.
  * For AdvSIMD, there is of course only one such vector segment.
  */
diff --git a/target/arm/translate-neon.c.inc b/target/arm/translate-neon.c.inc
index XXXXXXX..XXXXXXX 100644
--- a/target/arm/translate-neon.c.inc
+++ b/target/arm/translate-neon.c.inc
@@ -XXX,XX +XXX,XX @@ DO_3S_FP_GVEC(VACGE, gen_helper_gvec_facge_s, gen_helper_gvec_facge_h)
 DO_3S_FP_GVEC(VACGT, gen_helper_gvec_facgt_s, gen_helper_gvec_facgt_h)
 DO_3S_FP_GVEC(VMAX, gen_helper_gvec_fmax_s, gen_helper_gvec_fmax_h)
 DO_3S_FP_GVEC(VMIN, gen_helper_gvec_fmin_s, gen_helper_gvec_fmin_h)
-
-/*
- * For all the functions using this macro, size == 1 means fp16,
- * which is an architecture extension we don't implement yet.
- */
-#define DO_3S_FP(INSN,FUNC,READS_VD) \
-    static bool trans_##INSN##_fp_3s(DisasContext *s, arg_3same *a) \
-    { \
-        if (a->size != 0) { \
-            /* TODO fp16 support */ \
-            return false; \
-        } \
-        return do_3same_fp(s, a, FUNC, READS_VD); \
-    }
-
-static void gen_VMLA_fp_3s(TCGv_i32 vd, TCGv_i32 vn, TCGv_i32 vm,
-                           TCGv_ptr fpstatus)
-{
-    gen_helper_vfp_muls(vn, vn, vm, fpstatus);
-    gen_helper_vfp_adds(vd, vd, vn, fpstatus);
-}
-
-static void gen_VMLS_fp_3s(TCGv_i32 vd, TCGv_i32 vn, TCGv_i32 vm,
-                           TCGv_ptr fpstatus)
-{
-    gen_helper_vfp_muls(vn, vn, vm, fpstatus);
-    gen_helper_vfp_subs(vd, vd, vn, fpstatus);
-}
-
-DO_3S_FP(VMLA, gen_VMLA_fp_3s, true)
-DO_3S_FP(VMLS, gen_VMLS_fp_3s, true)
+DO_3S_FP_GVEC(VMLA, gen_helper_gvec_fmla_s, gen_helper_gvec_fmla_h)
+DO_3S_FP_GVEC(VMLS, gen_helper_gvec_fmls_s, gen_helper_gvec_fmls_h)

 WRAP_FP_GVEC(gen_VMAXNM_fp32_3s, FPST_STD, gen_helper_gvec_fmaxnum_s)
 WRAP_FP_GVEC(gen_VMAXNM_fp16_3s, FPST_STD_F16, gen_helper_gvec_fmaxnum_h)
--
2.20.1
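The "non-fused" in float16_muladd_nf() is observable behaviour, not just an implementation detail: the product is rounded to the element type before the add, whereas a fused multiply-add rounds once at the end. A small demonstration in plain C floats, with values picked so the intermediate rounding matters:

    #include <math.h>
    #include <stdio.h>

    int main(void)
    {
        float a = 1.0f + 0x1.0p-12f;        /* 1 + 2^-12 */
        float nonfused = a * a - 1.0f;      /* product rounded first */
        float fused = fmaf(a, a, -1.0f);    /* one rounding at the end */

        /* nonfused is 0x1p-11; fused keeps the extra 2^-24 term */
        printf("non-fused %a, fused %a\n", (double)nonfused, (double)fused);
        return 0;
    }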
Deleted patch
Convert the Neon VRECPS insn to using a gvec helper, and
use this to implement the fp16 case.

The phrasing of the new float32_recps_nf() is slightly different from
the old recps_f32() so that it parallels the f16 version; for f16 we
can't assume that flush-to-zero is always enabled.

Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Message-id: 20200828183354.27913-34-peter.maydell@linaro.org
---
 target/arm/helper.h             |  4 +++-
 target/arm/vec_helper.c         | 31 +++++++++++++++++++++++++++++++
 target/arm/vfp_helper.c         | 13 -------------
 target/arm/translate-neon.c.inc | 21 +--------------------
 4 files changed, 35 insertions(+), 34 deletions(-)

diff --git a/target/arm/helper.h b/target/arm/helper.h
index XXXXXXX..XXXXXXX 100644
--- a/target/arm/helper.h
+++ b/target/arm/helper.h
@@ -XXX,XX +XXX,XX @@ DEF_HELPER_4(vfp_muladdd, f64, f64, f64, f64, ptr)
 DEF_HELPER_4(vfp_muladds, f32, f32, f32, f32, ptr)
 DEF_HELPER_4(vfp_muladdh, f16, f16, f16, f16, ptr)

-DEF_HELPER_3(recps_f32, f32, env, f32, f32)
 DEF_HELPER_3(rsqrts_f32, f32, env, f32, f32)
 DEF_HELPER_FLAGS_2(recpe_f16, TCG_CALL_NO_RWG, f16, f16, ptr)
 DEF_HELPER_FLAGS_2(recpe_f32, TCG_CALL_NO_RWG, f32, f32, ptr)
@@ -XXX,XX +XXX,XX @@ DEF_HELPER_FLAGS_5(gvec_fmaxnum_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i3
 DEF_HELPER_FLAGS_5(gvec_fminnum_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
 DEF_HELPER_FLAGS_5(gvec_fminnum_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)

+DEF_HELPER_FLAGS_5(gvec_recps_nf_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(gvec_recps_nf_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
+
 DEF_HELPER_FLAGS_5(gvec_fmla_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
 DEF_HELPER_FLAGS_5(gvec_fmla_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)

diff --git a/target/arm/vec_helper.c b/target/arm/vec_helper.c
index XXXXXXX..XXXXXXX 100644
--- a/target/arm/vec_helper.c
+++ b/target/arm/vec_helper.c
@@ -XXX,XX +XXX,XX @@ static float32 float32_abd(float32 op1, float32 op2, float_status *stat)
     return float32_abs(float32_sub(op1, op2, stat));
 }

+/*
+ * Reciprocal step. These are the AArch32 version which uses a
+ * non-fused multiply-and-subtract.
+ */
+static float16 float16_recps_nf(float16 op1, float16 op2, float_status *stat)
+{
+    op1 = float16_squash_input_denormal(op1, stat);
+    op2 = float16_squash_input_denormal(op2, stat);
+
+    if ((float16_is_infinity(op1) && float16_is_zero(op2)) ||
+        (float16_is_infinity(op2) && float16_is_zero(op1))) {
+        return float16_two;
+    }
+    return float16_sub(float16_two, float16_mul(op1, op2, stat), stat);
+}
+
+static float32 float32_recps_nf(float32 op1, float32 op2, float_status *stat)
+{
+    op1 = float32_squash_input_denormal(op1, stat);
+    op2 = float32_squash_input_denormal(op2, stat);
+
+    if ((float32_is_infinity(op1) && float32_is_zero(op2)) ||
+        (float32_is_infinity(op2) && float32_is_zero(op1))) {
+        return float32_two;
+    }
+    return float32_sub(float32_two, float32_mul(op1, op2, stat), stat);
+}
+
 #define DO_3OP(NAME, FUNC, TYPE) \
 void HELPER(NAME)(void *vd, void *vn, void *vm, void *stat, uint32_t desc) \
 { \
@@ -XXX,XX +XXX,XX @@ DO_3OP(gvec_fmaxnum_s, float32_maxnum, float32)
 DO_3OP(gvec_fminnum_h, float16_minnum, float16)
 DO_3OP(gvec_fminnum_s, float32_minnum, float32)

+DO_3OP(gvec_recps_nf_h, float16_recps_nf, float16)
+DO_3OP(gvec_recps_nf_s, float32_recps_nf, float32)
+
 #ifdef TARGET_AARCH64

 DO_3OP(gvec_recps_h, helper_recpsf_f16, float16)
diff --git a/target/arm/vfp_helper.c b/target/arm/vfp_helper.c
index XXXXXXX..XXXXXXX 100644
--- a/target/arm/vfp_helper.c
+++ b/target/arm/vfp_helper.c
@@ -XXX,XX +XXX,XX @@ uint32_t HELPER(vfp_fcvt_f64_to_f16)(float64 a, void *fpstp, uint32_t ahp_mode)
     return r;
 }

-float32 HELPER(recps_f32)(CPUARMState *env, float32 a, float32 b)
-{
-    float_status *s = &env->vfp.standard_fp_status;
-    if ((float32_is_infinity(a) && float32_is_zero_or_denormal(b)) ||
-        (float32_is_infinity(b) && float32_is_zero_or_denormal(a))) {
-        if (!(float32_is_zero(a) || float32_is_zero(b))) {
-            float_raise(float_flag_input_denormal, s);
-        }
-        return float32_two;
-    }
-    return float32_sub(float32_two, float32_mul(a, b, s), s);
-}
-
 float32 HELPER(rsqrts_f32)(CPUARMState *env, float32 a, float32 b)
 {
     float_status *s = &env->vfp.standard_fp_status;
diff --git a/target/arm/translate-neon.c.inc b/target/arm/translate-neon.c.inc
index XXXXXXX..XXXXXXX 100644
--- a/target/arm/translate-neon.c.inc
+++ b/target/arm/translate-neon.c.inc
@@ -XXX,XX +XXX,XX @@ DO_3S_FP_GVEC(VMLA, gen_helper_gvec_fmla_s, gen_helper_gvec_fmla_h)
 DO_3S_FP_GVEC(VMLS, gen_helper_gvec_fmls_s, gen_helper_gvec_fmls_h)
 DO_3S_FP_GVEC(VFMA, gen_helper_gvec_vfma_s, gen_helper_gvec_vfma_h)
 DO_3S_FP_GVEC(VFMS, gen_helper_gvec_vfms_s, gen_helper_gvec_vfms_h)
+DO_3S_FP_GVEC(VRECPS, gen_helper_gvec_recps_nf_s, gen_helper_gvec_recps_nf_h)

 WRAP_FP_GVEC(gen_VMAXNM_fp32_3s, FPST_STD, gen_helper_gvec_fmaxnum_s)
 WRAP_FP_GVEC(gen_VMAXNM_fp16_3s, FPST_STD_F16, gen_helper_gvec_fmaxnum_h)
@@ -XXX,XX +XXX,XX @@ static bool trans_VMINNM_fp_3s(DisasContext *s, arg_3same *a)
     return do_3same(s, a, gen_VMINNM_fp32_3s);
 }

-WRAP_ENV_FN(gen_VRECPS_tramp, gen_helper_recps_f32)
-
-static void gen_VRECPS_fp_3s(unsigned vece, uint32_t rd_ofs,
-                             uint32_t rn_ofs, uint32_t rm_ofs,
-                             uint32_t oprsz, uint32_t maxsz)
-{
-    static const GVecGen3 ops = { .fni4 = gen_VRECPS_tramp };
-    tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, oprsz, maxsz, &ops);
-}
-
-static bool trans_VRECPS_fp_3s(DisasContext *s, arg_3same *a)
-{
-    if (a->size != 0) {
-        /* TODO fp16 support */
-        return false;
-    }
-
-    return do_3same(s, a, gen_VRECPS_fp_3s);
-}
-
 WRAP_ENV_FN(gen_VRSQRTS_tramp, gen_helper_rsqrts_f32)

 static void gen_VRSQRTS_fp_3s(unsigned vece, uint32_t rd_ofs,
--
2.20.1
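For context on the 2 - a*b formula: VRECPS exists to supply the Newton-Raphson step that refines VRECPE's rough reciprocal estimate, x' = x * (2 - a*x). A sketch in plain doubles (not the softfloat types, and ignoring the infinity/zero special case) showing the convergence:

    #include <stdio.h>

    static double recps(double a, double x)
    {
        return 2.0 - a * x;           /* the VRECPS step value */
    }

    int main(void)
    {
        double a = 3.0, x = 0.3;      /* crude first estimate of 1/3 */
        int i;

        for (i = 0; i < 4; i++) {
            x = x * recps(a, x);      /* x' = x * (2 - a*x) */
        }
        printf("%.12f\n", x);         /* converges on 0.333333333333 */
        return 0;
    }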
Deleted patch
Convert the Neon VRSQRTS insn to using a gvec helper,
and use this to implement the fp16 case.

As with VRECPS, we adjust the phrasing of the new implementation
slightly so that the fp32 version parallels the fp16 one.

Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Message-id: 20200828183354.27913-35-peter.maydell@linaro.org
---
 target/arm/helper.h             |  4 +++-
 target/arm/vec_helper.c         | 30 ++++++++++++++++++++++++++++++
 target/arm/vfp_helper.c         | 15 ---------------
 target/arm/translate-neon.c.inc | 21 +--------------------
 4 files changed, 34 insertions(+), 36 deletions(-)

diff --git a/target/arm/helper.h b/target/arm/helper.h
index XXXXXXX..XXXXXXX 100644
--- a/target/arm/helper.h
+++ b/target/arm/helper.h
@@ -XXX,XX +XXX,XX @@ DEF_HELPER_4(vfp_muladdd, f64, f64, f64, f64, ptr)
 DEF_HELPER_4(vfp_muladds, f32, f32, f32, f32, ptr)
 DEF_HELPER_4(vfp_muladdh, f16, f16, f16, f16, ptr)

-DEF_HELPER_3(rsqrts_f32, f32, env, f32, f32)
 DEF_HELPER_FLAGS_2(recpe_f16, TCG_CALL_NO_RWG, f16, f16, ptr)
 DEF_HELPER_FLAGS_2(recpe_f32, TCG_CALL_NO_RWG, f32, f32, ptr)
 DEF_HELPER_FLAGS_2(recpe_f64, TCG_CALL_NO_RWG, f64, f64, ptr)
@@ -XXX,XX +XXX,XX @@ DEF_HELPER_FLAGS_5(gvec_fminnum_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i3
 DEF_HELPER_FLAGS_5(gvec_recps_nf_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
 DEF_HELPER_FLAGS_5(gvec_recps_nf_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)

+DEF_HELPER_FLAGS_5(gvec_rsqrts_nf_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(gvec_rsqrts_nf_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
+
 DEF_HELPER_FLAGS_5(gvec_fmla_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
 DEF_HELPER_FLAGS_5(gvec_fmla_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)

diff --git a/target/arm/vec_helper.c b/target/arm/vec_helper.c
index XXXXXXX..XXXXXXX 100644
--- a/target/arm/vec_helper.c
+++ b/target/arm/vec_helper.c
@@ -XXX,XX +XXX,XX @@ static float32 float32_recps_nf(float32 op1, float32 op2, float_status *stat)
     return float32_sub(float32_two, float32_mul(op1, op2, stat), stat);
 }

+/* Reciprocal square-root step. AArch32 non-fused semantics. */
+static float16 float16_rsqrts_nf(float16 op1, float16 op2, float_status *stat)
+{
+    op1 = float16_squash_input_denormal(op1, stat);
+    op2 = float16_squash_input_denormal(op2, stat);
+
+    if ((float16_is_infinity(op1) && float16_is_zero(op2)) ||
+        (float16_is_infinity(op2) && float16_is_zero(op1))) {
+        return float16_one_point_five;
+    }
+    op1 = float16_sub(float16_three, float16_mul(op1, op2, stat), stat);
+    return float16_div(op1, float16_two, stat);
+}
+
+static float32 float32_rsqrts_nf(float32 op1, float32 op2, float_status *stat)
+{
+    op1 = float32_squash_input_denormal(op1, stat);
+    op2 = float32_squash_input_denormal(op2, stat);
+
+    if ((float32_is_infinity(op1) && float32_is_zero(op2)) ||
+        (float32_is_infinity(op2) && float32_is_zero(op1))) {
+        return float32_one_point_five;
+    }
+    op1 = float32_sub(float32_three, float32_mul(op1, op2, stat), stat);
+    return float32_div(op1, float32_two, stat);
+}
+
 #define DO_3OP(NAME, FUNC, TYPE) \
 void HELPER(NAME)(void *vd, void *vn, void *vm, void *stat, uint32_t desc) \
 { \
@@ -XXX,XX +XXX,XX @@ DO_3OP(gvec_fminnum_s, float32_minnum, float32)
 DO_3OP(gvec_recps_nf_h, float16_recps_nf, float16)
 DO_3OP(gvec_recps_nf_s, float32_recps_nf, float32)

+DO_3OP(gvec_rsqrts_nf_h, float16_rsqrts_nf, float16)
+DO_3OP(gvec_rsqrts_nf_s, float32_rsqrts_nf, float32)
+
 #ifdef TARGET_AARCH64

 DO_3OP(gvec_recps_h, helper_recpsf_f16, float16)
diff --git a/target/arm/vfp_helper.c b/target/arm/vfp_helper.c
index XXXXXXX..XXXXXXX 100644
--- a/target/arm/vfp_helper.c
+++ b/target/arm/vfp_helper.c
@@ -XXX,XX +XXX,XX @@ uint32_t HELPER(vfp_fcvt_f64_to_f16)(float64 a, void *fpstp, uint32_t ahp_mode)
     return r;
 }

-float32 HELPER(rsqrts_f32)(CPUARMState *env, float32 a, float32 b)
-{
-    float_status *s = &env->vfp.standard_fp_status;
-    float32 product;
-    if ((float32_is_infinity(a) && float32_is_zero_or_denormal(b)) ||
-        (float32_is_infinity(b) && float32_is_zero_or_denormal(a))) {
-        if (!(float32_is_zero(a) || float32_is_zero(b))) {
-            float_raise(float_flag_input_denormal, s);
-        }
-        return float32_one_point_five;
-    }
-    product = float32_mul(a, b, s);
-    return float32_div(float32_sub(float32_three, product, s), float32_two, s);
-}
-
 /* NEON helpers. */

 /* Constants 256 and 512 are used in some helpers; we avoid relying on
diff --git a/target/arm/translate-neon.c.inc b/target/arm/translate-neon.c.inc
index XXXXXXX..XXXXXXX 100644
--- a/target/arm/translate-neon.c.inc
+++ b/target/arm/translate-neon.c.inc
@@ -XXX,XX +XXX,XX @@ DO_3S_FP_GVEC(VMLS, gen_helper_gvec_fmls_s, gen_helper_gvec_fmls_h)
 DO_3S_FP_GVEC(VFMA, gen_helper_gvec_vfma_s, gen_helper_gvec_vfma_h)
 DO_3S_FP_GVEC(VFMS, gen_helper_gvec_vfms_s, gen_helper_gvec_vfms_h)
 DO_3S_FP_GVEC(VRECPS, gen_helper_gvec_recps_nf_s, gen_helper_gvec_recps_nf_h)
+DO_3S_FP_GVEC(VRSQRTS, gen_helper_gvec_rsqrts_nf_s, gen_helper_gvec_rsqrts_nf_h)

 WRAP_FP_GVEC(gen_VMAXNM_fp32_3s, FPST_STD, gen_helper_gvec_fmaxnum_s)
 WRAP_FP_GVEC(gen_VMAXNM_fp16_3s, FPST_STD_F16, gen_helper_gvec_fmaxnum_h)
@@ -XXX,XX +XXX,XX @@ static bool trans_VMINNM_fp_3s(DisasContext *s, arg_3same *a)
     return do_3same(s, a, gen_VMINNM_fp32_3s);
 }

-WRAP_ENV_FN(gen_VRSQRTS_tramp, gen_helper_rsqrts_f32)
-
-static void gen_VRSQRTS_fp_3s(unsigned vece, uint32_t rd_ofs,
-                              uint32_t rn_ofs, uint32_t rm_ofs,
-                              uint32_t oprsz, uint32_t maxsz)
-{
-    static const GVecGen3 ops = { .fni4 = gen_VRSQRTS_tramp };
-    tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, oprsz, maxsz, &ops);
-}
-
-static bool trans_VRSQRTS_fp_3s(DisasContext *s, arg_3same *a)
-{
-    if (a->size != 0) {
-        /* TODO fp16 support */
-        return false;
-    }
-
-    return do_3same(s, a, gen_VRSQRTS_fp_3s);
-}
-
 static bool do_3same_fp_pair(DisasContext *s, arg_3same *a, VFPGen3OpSPFn *fn)
 {
     /* FP operations handled pairwise 32 bits at a time */
--
2.20.1
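Likewise for the (3 - a*b) / 2 formula here: VRSQRTS is the Newton-Raphson step for reciprocal square root, x' = x * (3 - d*x*x) / 2, with the caller passing d*x as the first operand. Again in plain doubles:

    #include <stdio.h>

    static double rsqrts(double op1, double op2)
    {
        return (3.0 - op1 * op2) / 2.0;   /* the VRSQRTS step value */
    }

    int main(void)
    {
        double d = 2.0, x = 0.7;          /* crude estimate of 1/sqrt(2) */
        int i;

        for (i = 0; i < 4; i++) {
            x = x * rsqrts(d * x, x);     /* x' = x * (3 - d*x*x) / 2 */
        }
        printf("%.12f\n", x);             /* converges on 0.707106781187 */
        return 0;
    }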
Deleted patch
Convert the Neon pairwise fp ops to use a single gvec-style
helper to do the full operation instead of one helper call
for each 32-bit part. This allows us to use the same
framework to implement the fp16 case.

Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Message-id: 20200828183354.27913-36-peter.maydell@linaro.org
---
 target/arm/helper.h             |  7 +++++
 target/arm/vec_helper.c         | 45 +++++++++++++++++++++++++++++++++
 target/arm/translate-neon.c.inc | 42 ++++++++++++------------------
 3 files changed, 68 insertions(+), 26 deletions(-)

diff --git a/target/arm/helper.h b/target/arm/helper.h
index XXXXXXX..XXXXXXX 100644
--- a/target/arm/helper.h
+++ b/target/arm/helper.h
@@ -XXX,XX +XXX,XX @@ DEF_HELPER_FLAGS_5(gvec_fcmlas_idx, TCG_CALL_NO_RWG,
 DEF_HELPER_FLAGS_5(gvec_fcmlad, TCG_CALL_NO_RWG,
                    void, ptr, ptr, ptr, ptr, i32)

+DEF_HELPER_FLAGS_5(neon_paddh, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(neon_pmaxh, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(neon_pminh, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(neon_padds, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(neon_pmaxs, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(neon_pmins, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
+
 DEF_HELPER_FLAGS_4(gvec_frecpe_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
 DEF_HELPER_FLAGS_4(gvec_frecpe_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
 DEF_HELPER_FLAGS_4(gvec_frecpe_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
diff --git a/target/arm/vec_helper.c b/target/arm/vec_helper.c
index XXXXXXX..XXXXXXX 100644
--- a/target/arm/vec_helper.c
+++ b/target/arm/vec_helper.c
@@ -XXX,XX +XXX,XX @@ DO_ABA(gvec_uaba_s, uint32_t)
 DO_ABA(gvec_uaba_d, uint64_t)

 #undef DO_ABA
+
+#define DO_NEON_PAIRWISE(NAME, OP) \
+    void HELPER(NAME##s)(void *vd, void *vn, void *vm, \
+                         void *stat, uint32_t oprsz) \
+    { \
+        float_status *fpst = stat; \
+        float32 *d = vd; \
+        float32 *n = vn; \
+        float32 *m = vm; \
+        float32 r0, r1; \
+ \
+        /* Read all inputs before writing outputs in case vm == vd */ \
+        r0 = float32_##OP(n[H4(0)], n[H4(1)], fpst); \
+        r1 = float32_##OP(m[H4(0)], m[H4(1)], fpst); \
+ \
+        d[H4(0)] = r0; \
+        d[H4(1)] = r1; \
+    } \
+ \
+    void HELPER(NAME##h)(void *vd, void *vn, void *vm, \
+                         void *stat, uint32_t oprsz) \
+    { \
+        float_status *fpst = stat; \
+        float16 *d = vd; \
+        float16 *n = vn; \
+        float16 *m = vm; \
+        float16 r0, r1, r2, r3; \
+ \
+        /* Read all inputs before writing outputs in case vm == vd */ \
+        r0 = float16_##OP(n[H2(0)], n[H2(1)], fpst); \
+        r1 = float16_##OP(n[H2(2)], n[H2(3)], fpst); \
+        r2 = float16_##OP(m[H2(0)], m[H2(1)], fpst); \
+        r3 = float16_##OP(m[H2(2)], m[H2(3)], fpst); \
+ \
+        d[H2(0)] = r0; \
+        d[H2(1)] = r1; \
+        d[H2(2)] = r2; \
+        d[H2(3)] = r3; \
+    }
+
+DO_NEON_PAIRWISE(neon_padd, add)
+DO_NEON_PAIRWISE(neon_pmax, max)
+DO_NEON_PAIRWISE(neon_pmin, min)
+
+#undef DO_NEON_PAIRWISE
diff --git a/target/arm/translate-neon.c.inc b/target/arm/translate-neon.c.inc
index XXXXXXX..XXXXXXX 100644
--- a/target/arm/translate-neon.c.inc
+++ b/target/arm/translate-neon.c.inc
@@ -XXX,XX +XXX,XX @@ static bool trans_VMINNM_fp_3s(DisasContext *s, arg_3same *a)
     return do_3same(s, a, gen_VMINNM_fp32_3s);
 }

-static bool do_3same_fp_pair(DisasContext *s, arg_3same *a, VFPGen3OpSPFn *fn)
+static bool do_3same_fp_pair(DisasContext *s, arg_3same *a,
+                             gen_helper_gvec_3_ptr *fn)
 {
-    /* FP operations handled pairwise 32 bits at a time */
-    TCGv_i32 tmp, tmp2, tmp3;
+    /* FP pairwise operations */
     TCGv_ptr fpstatus;

     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
@@ -XXX,XX +XXX,XX @@ static bool do_3same_fp_pair(DisasContext *s, arg_3same *a, VFPGen3OpSPFn *fn)

     assert(a->q == 0); /* enforced by decode patterns */

-    /*
-     * Note that we have to be careful not to clobber the source operands
-     * in the "vm == vd" case by storing the result of the first pass too
-     * early. Since Q is 0 there are always just two passes, so instead
-     * of a complicated loop over each pass we just unroll.
-     */
-    fpstatus = fpstatus_ptr(FPST_STD);
-    tmp = neon_load_reg(a->vn, 0);
-    tmp2 = neon_load_reg(a->vn, 1);
-    fn(tmp, tmp, tmp2, fpstatus);
-    tcg_temp_free_i32(tmp2);

-    tmp3 = neon_load_reg(a->vm, 0);
-    tmp2 = neon_load_reg(a->vm, 1);
-    fn(tmp3, tmp3, tmp2, fpstatus);
-    tcg_temp_free_i32(tmp2);
+    fpstatus = fpstatus_ptr(a->size != 0 ? FPST_STD_F16 : FPST_STD);
+    tcg_gen_gvec_3_ptr(vfp_reg_offset(1, a->vd),
+                       vfp_reg_offset(1, a->vn),
+                       vfp_reg_offset(1, a->vm),
+                       fpstatus, 8, 8, 0, fn);
     tcg_temp_free_ptr(fpstatus);

-    neon_store_reg(a->vd, 0, tmp);
-    neon_store_reg(a->vd, 1, tmp3);
     return true;
 }

@@ -XXX,XX +XXX,XX @@ static bool do_3same_fp_pair(DisasContext *s, arg_3same *a, VFPGen3OpSPFn *fn)
     static bool trans_##INSN##_fp_3s(DisasContext *s, arg_3same *a) \
     { \
         if (a->size != 0) { \
-            /* TODO fp16 support */ \
-            return false; \
+            if (!dc_isar_feature(aa32_fp16_arith, s)) { \
+                return false; \
+            } \
+            return do_3same_fp_pair(s, a, FUNC##h); \
         } \
-        return do_3same_fp_pair(s, a, FUNC); \
+        return do_3same_fp_pair(s, a, FUNC##s); \
     }

-DO_3S_FP_PAIR(VPADD, gen_helper_vfp_adds)
-DO_3S_FP_PAIR(VPMAX, gen_helper_vfp_maxs)
-DO_3S_FP_PAIR(VPMIN, gen_helper_vfp_mins)
+DO_3S_FP_PAIR(VPADD, gen_helper_neon_padd)
+DO_3S_FP_PAIR(VPMAX, gen_helper_neon_pmax)
+DO_3S_FP_PAIR(VPMIN, gen_helper_neon_pmin)

 static bool do_vector_2sh(DisasContext *s, arg_2reg_shift *a, GVecGen2iFn *fn)
 {
--
2.20.1
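The "read all inputs before writing outputs" comment in the new helper is load-bearing: result lane 0 comes from Vn and lane 1 from Vm, and nothing stops a guest from using the same register for Vm and Vd. A scalar sketch of why the ordering matters:

    /* Pairwise add over two 2-lane inputs; d may alias n or m.
     * If d[0] were stored before m is read, the m == d case would
     * compute r1 from an already-overwritten lane.
     */
    static void pairwise_add_2lane(float *d, const float *n, const float *m)
    {
        float r0 = n[0] + n[1];
        float r1 = m[0] + m[1];   /* read everything first... */

        d[0] = r0;                /* ...then write */
        d[1] = r1;
    }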
Deleted patch
Implement fp16 for the Neon VCVT insns which convert between
float and fixed-point.

Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Message-id: 20200828183354.27913-39-peter.maydell@linaro.org
---
 target/arm/helper.h             | 5 +++++
 target/arm/neon-dp.decode       | 8 +++++++-
 target/arm/vec_helper.c         | 4 ++++
 target/arm/translate-neon.c.inc | 5 +++++
 4 files changed, 21 insertions(+), 1 deletion(-)

diff --git a/target/arm/helper.h b/target/arm/helper.h
index XXXXXXX..XXXXXXX 100644
--- a/target/arm/helper.h
+++ b/target/arm/helper.h
@@ -XXX,XX +XXX,XX @@ DEF_HELPER_FLAGS_4(gvec_vcvt_uf, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
 DEF_HELPER_FLAGS_4(gvec_vcvt_fs, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
 DEF_HELPER_FLAGS_4(gvec_vcvt_fu, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)

+DEF_HELPER_FLAGS_4(gvec_vcvt_sh, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(gvec_vcvt_uh, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(gvec_vcvt_hs, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(gvec_vcvt_hu, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+
 DEF_HELPER_FLAGS_4(gvec_frecpe_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
 DEF_HELPER_FLAGS_4(gvec_frecpe_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
 DEF_HELPER_FLAGS_4(gvec_frecpe_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
diff --git a/target/arm/neon-dp.decode b/target/arm/neon-dp.decode
index XXXXXXX..XXXXXXX 100644
--- a/target/arm/neon-dp.decode
+++ b/target/arm/neon-dp.decode
@@ -XXX,XX +XXX,XX @@ VMINNM_fp_3s 1111 001 1 0 . 1 . .... .... 1111 ... 1 .... @3same_fp
 # We use size=0 for fp32 and size=1 for fp16 to match the 3-same encodings.
 @2reg_vcvt .... ... . . . 1 ..... .... .... . q:1 . . .... \
            &2reg_shift vm=%vm_dp vd=%vd_dp size=0 shift=%neon_rshift_i5
+@2reg_vcvt_f16 .... ... . . . 11 .... .... .... . q:1 . . .... \
+           &2reg_shift vm=%vm_dp vd=%vd_dp size=1 shift=%neon_rshift_i4

 VSHR_S_2sh 1111 001 0 1 . ...... .... 0000 . . . 1 .... @2reg_shr_d
 VSHR_S_2sh 1111 001 0 1 . ...... .... 0000 . . . 1 .... @2reg_shr_s
@@ -XXX,XX +XXX,XX @@ VSHLL_U_2sh 1111 001 1 1 . ...... .... 1010 . 0 . 1 .... @2reg_shll_h
 VSHLL_U_2sh 1111 001 1 1 . ...... .... 1010 . 0 . 1 .... @2reg_shll_b

 # VCVT fixed<->float conversions
-# TODO: FP16 fixed<->float conversions are opc==0b1100 and 0b1101
+VCVT_SH_2sh 1111 001 0 1 . ...... .... 1100 0 . . 1 .... @2reg_vcvt_f16
+VCVT_UH_2sh 1111 001 1 1 . ...... .... 1100 0 . . 1 .... @2reg_vcvt_f16
+VCVT_HS_2sh 1111 001 0 1 . ...... .... 1101 0 . . 1 .... @2reg_vcvt_f16
+VCVT_HU_2sh 1111 001 1 1 . ...... .... 1101 0 . . 1 .... @2reg_vcvt_f16
+
 VCVT_SF_2sh 1111 001 0 1 . ...... .... 1110 0 . . 1 .... @2reg_vcvt
 VCVT_UF_2sh 1111 001 1 1 . ...... .... 1110 0 . . 1 .... @2reg_vcvt
 VCVT_FS_2sh 1111 001 0 1 . ...... .... 1111 0 . . 1 .... @2reg_vcvt
diff --git a/target/arm/vec_helper.c b/target/arm/vec_helper.c
index XXXXXXX..XXXXXXX 100644
--- a/target/arm/vec_helper.c
+++ b/target/arm/vec_helper.c
@@ -XXX,XX +XXX,XX @@ DO_VCVT_FIXED(gvec_vcvt_sf, helper_vfp_sltos, uint32_t)
 DO_VCVT_FIXED(gvec_vcvt_uf, helper_vfp_ultos, uint32_t)
 DO_VCVT_FIXED(gvec_vcvt_fs, helper_vfp_tosls_round_to_zero, uint32_t)
 DO_VCVT_FIXED(gvec_vcvt_fu, helper_vfp_touls_round_to_zero, uint32_t)
+DO_VCVT_FIXED(gvec_vcvt_sh, helper_vfp_shtoh, uint16_t)
+DO_VCVT_FIXED(gvec_vcvt_uh, helper_vfp_uhtoh, uint16_t)
+DO_VCVT_FIXED(gvec_vcvt_hs, helper_vfp_toshh_round_to_zero, uint16_t)
+DO_VCVT_FIXED(gvec_vcvt_hu, helper_vfp_touhh_round_to_zero, uint16_t)

 #undef DO_VCVT_FIXED
diff --git a/target/arm/translate-neon.c.inc b/target/arm/translate-neon.c.inc
index XXXXXXX..XXXXXXX 100644
--- a/target/arm/translate-neon.c.inc
+++ b/target/arm/translate-neon.c.inc
@@ -XXX,XX +XXX,XX @@ DO_FP_2SH(VCVT_UF, gen_helper_gvec_vcvt_uf)
 DO_FP_2SH(VCVT_FS, gen_helper_gvec_vcvt_fs)
 DO_FP_2SH(VCVT_FU, gen_helper_gvec_vcvt_fu)

+DO_FP_2SH(VCVT_SH, gen_helper_gvec_vcvt_sh)
+DO_FP_2SH(VCVT_UH, gen_helper_gvec_vcvt_uh)
+DO_FP_2SH(VCVT_HS, gen_helper_gvec_vcvt_hs)
+DO_FP_2SH(VCVT_HU, gen_helper_gvec_vcvt_hu)
+
 static uint64_t asimd_imm_const(uint32_t imm, int cmode, int op)
 {
     /*
--
2.20.1
Deleted patch
Convert the Neon VRINTX insn to use gvec, and use this to implement
fp16 support for it.

Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Message-id: 20200828183354.27913-42-peter.maydell@linaro.org
---
 target/arm/helper.h             |  3 +++
 target/arm/vec_helper.c         |  3 +++
 target/arm/translate-neon.c.inc | 45 +++------------------------------
 3 files changed, 9 insertions(+), 42 deletions(-)

diff --git a/target/arm/helper.h b/target/arm/helper.h
index XXXXXXX..XXXXXXX 100644
--- a/target/arm/helper.h
+++ b/target/arm/helper.h
@@ -XXX,XX +XXX,XX @@ DEF_HELPER_FLAGS_4(gvec_vcvt_rm_uh, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
 DEF_HELPER_FLAGS_4(gvec_vrint_rm_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
 DEF_HELPER_FLAGS_4(gvec_vrint_rm_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)

+DEF_HELPER_FLAGS_4(gvec_vrintx_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(gvec_vrintx_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+
 DEF_HELPER_FLAGS_4(gvec_frecpe_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
 DEF_HELPER_FLAGS_4(gvec_frecpe_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
 DEF_HELPER_FLAGS_4(gvec_frecpe_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
diff --git a/target/arm/vec_helper.c b/target/arm/vec_helper.c
index XXXXXXX..XXXXXXX 100644
--- a/target/arm/vec_helper.c
+++ b/target/arm/vec_helper.c
@@ -XXX,XX +XXX,XX @@ DO_2OP(gvec_frsqrte_h, helper_rsqrte_f16, float16)
 DO_2OP(gvec_frsqrte_s, helper_rsqrte_f32, float32)
 DO_2OP(gvec_frsqrte_d, helper_rsqrte_f64, float64)

+DO_2OP(gvec_vrintx_h, float16_round_to_int, float16)
+DO_2OP(gvec_vrintx_s, float32_round_to_int, float32)
+
 DO_2OP(gvec_sitos, helper_vfp_sitos, int32_t)
 DO_2OP(gvec_uitos, helper_vfp_uitos, uint32_t)
 DO_2OP(gvec_tosizs, helper_vfp_tosizs, float32)
diff --git a/target/arm/translate-neon.c.inc b/target/arm/translate-neon.c.inc
index XXXXXXX..XXXXXXX 100644
--- a/target/arm/translate-neon.c.inc
+++ b/target/arm/translate-neon.c.inc
@@ -XXX,XX +XXX,XX @@ static bool trans_VQNEG(DisasContext *s, arg_2misc *a)
     return do_2misc(s, a, fn[a->size]);
 }

-static bool do_2misc_fp(DisasContext *s, arg_2misc *a,
-                        NeonGenOneSingleOpFn *fn)
-{
-    int pass;
-    TCGv_ptr fpst;
-
-    /* Handle a 2-reg-misc operation by iterating 32 bits at a time */
-    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
-        return false;
-    }
-
-    /* UNDEF accesses to D16-D31 if they don't exist. */
-    if (!dc_isar_feature(aa32_simd_r32, s) &&
-        ((a->vd | a->vm) & 0x10)) {
-        return false;
-    }
-
-    if (a->size != 2) {
-        /* TODO: FP16 will be the size == 1 case */
-        return false;
-    }
-
-    if ((a->vd | a->vm) & a->q) {
-        return false;
-    }
-
-    if (!vfp_access_check(s)) {
-        return true;
-    }
-
-    fpst = fpstatus_ptr(FPST_STD);
-    for (pass = 0; pass < (a->q ? 4 : 2); pass++) {
-        TCGv_i32 tmp = neon_load_reg(a->vm, pass);
-        fn(tmp, tmp, fpst);
-        neon_store_reg(a->vd, pass, tmp);
-    }
-    tcg_temp_free_ptr(fpst);
-
-    return true;
-}
-
 #define DO_2MISC_FP_VEC(INSN, HFUNC, SFUNC) \
     static void gen_##INSN(unsigned vece, uint32_t rd_ofs, \
                            uint32_t rm_ofs, \
@@ -XXX,XX +XXX,XX @@ DO_2MISC_FP_VEC(VCVT_FU, gen_helper_gvec_ustoh, gen_helper_gvec_uitos)
 DO_2MISC_FP_VEC(VCVT_SF, gen_helper_gvec_tosszh, gen_helper_gvec_tosizs)
 DO_2MISC_FP_VEC(VCVT_UF, gen_helper_gvec_touszh, gen_helper_gvec_touizs)

+DO_2MISC_FP_VEC(VRINTX_impl, gen_helper_gvec_vrintx_h, gen_helper_gvec_vrintx_s)
+
 static bool trans_VRINTX(DisasContext *s, arg_2misc *a)
 {
     if (!arm_dc_feature(s, ARM_FEATURE_V8)) {
         return false;
     }
-    return do_2misc_fp(s, a, gen_helper_rints_exact);
+    return trans_VRINTX_impl(s, a);
 }

 #define DO_VEC_RMODE(INSN, RMODE, OP) \
--
2.20.1
Deleted patch
In the gvec helper functions for indexed operations, for AArch32
Neon the oprsz (total size of the vector) can be less than 16 bytes
if the operation is on a D reg. Since the inner loop in these
helpers always goes from 0 to segment, we must clamp it based
on oprsz to avoid processing a full 16-byte segment when asked to
handle an 8-byte-wide vector.

Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Message-id: 20200828183354.27913-43-peter.maydell@linaro.org
---
 target/arm/vec_helper.c | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/target/arm/vec_helper.c b/target/arm/vec_helper.c
index XXXXXXX..XXXXXXX 100644
--- a/target/arm/vec_helper.c
+++ b/target/arm/vec_helper.c
@@ -XXX,XX +XXX,XX @@ DO_MULADD(gvec_vfms_s, float32_mulsub_f, float32)
 #define DO_MUL_IDX(NAME, TYPE, H) \
 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
 { \
-    intptr_t i, j, oprsz = simd_oprsz(desc), segment = 16 / sizeof(TYPE); \
+    intptr_t i, j, oprsz = simd_oprsz(desc); \
+    intptr_t segment = MIN(16, oprsz) / sizeof(TYPE); \
     intptr_t idx = simd_data(desc); \
     TYPE *d = vd, *n = vn, *m = vm; \
     for (i = 0; i < oprsz / sizeof(TYPE); i += segment) { \
@@ -XXX,XX +XXX,XX @@ DO_MUL_IDX(gvec_mul_idx_d, uint64_t, )
 #define DO_MLA_IDX(NAME, TYPE, OP, H) \
 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
 { \
-    intptr_t i, j, oprsz = simd_oprsz(desc), segment = 16 / sizeof(TYPE); \
+    intptr_t i, j, oprsz = simd_oprsz(desc); \
+    intptr_t segment = MIN(16, oprsz) / sizeof(TYPE); \
     intptr_t idx = simd_data(desc); \
     TYPE *d = vd, *n = vn, *m = vm, *a = va; \
     for (i = 0; i < oprsz / sizeof(TYPE); i += segment) { \
@@ -XXX,XX +XXX,XX @@ DO_MLA_IDX(gvec_mls_idx_d, uint64_t, -, )
 #define DO_FMUL_IDX(NAME, TYPE, H) \
 void HELPER(NAME)(void *vd, void *vn, void *vm, void *stat, uint32_t desc) \
 { \
-    intptr_t i, j, oprsz = simd_oprsz(desc), segment = 16 / sizeof(TYPE); \
+    intptr_t i, j, oprsz = simd_oprsz(desc); \
+    intptr_t segment = MIN(16, oprsz) / sizeof(TYPE); \
     intptr_t idx = simd_data(desc); \
     TYPE *d = vd, *n = vn, *m = vm; \
     for (i = 0; i < oprsz / sizeof(TYPE); i += segment) { \
@@ -XXX,XX +XXX,XX @@ DO_FMUL_IDX(gvec_fmul_idx_d, float64, )
 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, \
                   void *stat, uint32_t desc) \
 { \
-    intptr_t i, j, oprsz = simd_oprsz(desc), segment = 16 / sizeof(TYPE); \
+    intptr_t i, j, oprsz = simd_oprsz(desc); \
+    intptr_t segment = MIN(16, oprsz) / sizeof(TYPE); \
     TYPE op1_neg = extract32(desc, SIMD_DATA_SHIFT, 1); \
     intptr_t idx = desc >> (SIMD_DATA_SHIFT + 1); \
     TYPE *d = vd, *n = vn, *m = vm, *a = va; \
--
2.20.1
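To put numbers on the clamp: a Neon D-reg operation has oprsz == 8, so for fp16 elements the old 16 / sizeof(TYPE) gave a segment of 8 elements when only 4 were asked for, while MIN(16, oprsz) / sizeof(TYPE) yields the correct 4. A standalone check of the arithmetic:

    #include <stdint.h>
    #include <stdio.h>

    #define MIN(a, b) ((a) < (b) ? (a) : (b))

    int main(void)
    {
        size_t esize = sizeof(uint16_t);   /* fp16 element size */
        size_t oprsz;

        for (oprsz = 8; oprsz <= 16; oprsz += 8) {
            printf("oprsz=%zu: unclamped=%zu clamped=%zu\n", oprsz,
                   (size_t)16 / esize, MIN(16, oprsz) / esize);
        }
        return 0;   /* D reg: 8 vs 4; Q reg: 8 vs 8 */
    }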
Deleted patch
Add gvec helpers for doing Neon-style indexed non-fused fp
multiply-and-accumulate operations.

Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Message-id: 20200828183354.27913-44-peter.maydell@linaro.org
---
 target/arm/helper.h     | 10 ++++++++++
 target/arm/vec_helper.c | 27 ++++++++++++++++++++++-----
 2 files changed, 32 insertions(+), 5 deletions(-)

diff --git a/target/arm/helper.h b/target/arm/helper.h
index XXXXXXX..XXXXXXX 100644
--- a/target/arm/helper.h
+++ b/target/arm/helper.h
@@ -XXX,XX +XXX,XX @@ DEF_HELPER_FLAGS_5(gvec_fmul_idx_s, TCG_CALL_NO_RWG,
 DEF_HELPER_FLAGS_5(gvec_fmul_idx_d, TCG_CALL_NO_RWG,
                    void, ptr, ptr, ptr, ptr, i32)

+DEF_HELPER_FLAGS_5(gvec_fmla_nf_idx_h, TCG_CALL_NO_RWG,
+                   void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(gvec_fmla_nf_idx_s, TCG_CALL_NO_RWG,
+                   void, ptr, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_5(gvec_fmls_nf_idx_h, TCG_CALL_NO_RWG,
+                   void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(gvec_fmls_nf_idx_s, TCG_CALL_NO_RWG,
+                   void, ptr, ptr, ptr, ptr, i32)
+
 DEF_HELPER_FLAGS_6(gvec_fmla_idx_h, TCG_CALL_NO_RWG,
                    void, ptr, ptr, ptr, ptr, ptr, i32)
 DEF_HELPER_FLAGS_6(gvec_fmla_idx_s, TCG_CALL_NO_RWG,
diff --git a/target/arm/vec_helper.c b/target/arm/vec_helper.c
index XXXXXXX..XXXXXXX 100644
--- a/target/arm/vec_helper.c
+++ b/target/arm/vec_helper.c
@@ -XXX,XX +XXX,XX @@ DO_MLA_IDX(gvec_mls_idx_d, uint64_t, -, )

 #undef DO_MLA_IDX

-#define DO_FMUL_IDX(NAME, TYPE, H) \
+#define DO_FMUL_IDX(NAME, ADD, TYPE, H) \
 void HELPER(NAME)(void *vd, void *vn, void *vm, void *stat, uint32_t desc) \
 { \
     intptr_t i, j, oprsz = simd_oprsz(desc); \
@@ -XXX,XX +XXX,XX @@ void HELPER(NAME)(void *vd, void *vn, void *vm, void *stat, uint32_t desc) \
     for (i = 0; i < oprsz / sizeof(TYPE); i += segment) { \
         TYPE mm = m[H(i + idx)]; \
         for (j = 0; j < segment; j++) { \
-            d[i + j] = TYPE##_mul(n[i + j], mm, stat); \
+            d[i + j] = TYPE##_##ADD(d[i + j], \
+                                    TYPE##_mul(n[i + j], mm, stat), stat); \
         } \
     } \
     clear_tail(d, oprsz, simd_maxsz(desc)); \
 }

-DO_FMUL_IDX(gvec_fmul_idx_h, float16, H2)
-DO_FMUL_IDX(gvec_fmul_idx_s, float32, H4)
-DO_FMUL_IDX(gvec_fmul_idx_d, float64, )
+#define float16_nop(N, M, S) (M)
+#define float32_nop(N, M, S) (M)
+#define float64_nop(N, M, S) (M)

+DO_FMUL_IDX(gvec_fmul_idx_h, nop, float16, H2)
+DO_FMUL_IDX(gvec_fmul_idx_s, nop, float32, H4)
+DO_FMUL_IDX(gvec_fmul_idx_d, nop, float64, )
+
+/*
+ * Non-fused multiply-accumulate operations, for Neon. NB that unlike
+ * the fused ops below they accumulate both from and into Vd.
+ */
+DO_FMUL_IDX(gvec_fmla_nf_idx_h, add, float16, H2)
+DO_FMUL_IDX(gvec_fmla_nf_idx_s, add, float32, H4)
+DO_FMUL_IDX(gvec_fmls_nf_idx_h, sub, float16, H2)
+DO_FMUL_IDX(gvec_fmls_nf_idx_s, sub, float32, H4)
+
+#undef float16_nop
+#undef float32_nop
+#undef float64_nop
 #undef DO_FMUL_IDX

 #define DO_FMLA_IDX(NAME, TYPE, H) \
--
2.20.1
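The float16_nop() trick is worth a second look: with ADD=nop the accumulate step collapses to the bare product, so a single DO_FMUL_IDX body now generates the plain multiply as well as the mla/mls variants. A miniature standalone version of the same macro pattern:

    #include <stdio.h>

    #define f_nop(d, p) (p)           /* discard accumulator: plain mul */
    #define f_add(d, p) ((d) + (p))   /* multiply-accumulate */
    #define f_sub(d, p) ((d) - (p))   /* multiply-subtract */

    #define MAKE_OP(NAME, ACC) \
        static double NAME(double d, double n, double m) \
        { return ACC(d, n * m); }

    MAKE_OP(mul_only, f_nop)
    MAKE_OP(mla, f_add)
    MAKE_OP(mls, f_sub)

    int main(void)
    {
        printf("%g %g %g\n", mul_only(10, 2, 3), mla(10, 2, 3), mls(10, 2, 3));
        return 0;   /* prints: 6 16 4 */
    }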