Add HVX IEEE bfloat16 (bf16) instructions:
Arithmetic operations:
- V6_vadd_sf_bf, V6_vsub_sf_bf: add/sub bf16 widening to sf output
- V6_vmpy_sf_bf: multiply bf16 widening to sf output
- V6_vmpy_sf_bf_acc: multiply-accumulate bf16 widening to sf output
Min/Max operations:
- V6_vmin_bf, V6_vmax_bf: bf16 min/max
Comparison operations:
- V6_vgtbf: greater-than compare
- V6_vgtbf_and, V6_vgtbf_or, V6_vgtbf_xor: predicate variants
Conversion operations:
- V6_vcvt_bf_sf: convert sf to bf16
Signed-off-by: Matheus Tavares Bernardino <matheus.bernardino@oss.qualcomm.com>
---
target/hexagon/mmvec/kvx_ieee.h | 36 +++++++++++
target/hexagon/mmvec/macros.h | 5 ++
target/hexagon/mmvec/mmvec.h | 1 +
target/hexagon/mmvec/kvx_ieee.c | 3 +
target/hexagon/imported/mmvec/encode_ext.def | 15 +++++
target/hexagon/imported/mmvec/ext.idef | 64 ++++++++++++++++++++
6 files changed, 124 insertions(+)
diff --git a/target/hexagon/mmvec/kvx_ieee.h b/target/hexagon/mmvec/kvx_ieee.h
index 8a6816f6b3..eb670d4ec3 100644
--- a/target/hexagon/mmvec/kvx_ieee.h
+++ b/target/hexagon/mmvec/kvx_ieee.h
@@ -80,4 +80,40 @@ int16_t conv_hf_h(int16_t a, float_status *fp_status);
int32_t conv_w_sf(uint32_t a, float_status *fp_status);
int16_t conv_h_hf(uint16_t a, float_status *fp_status);
+/* IEEE BFloat instructions */
+
+/*
+ * A bf16 value is widened to sf by placing its 16 bits in the upper half
+ * of a 32-bit word. A plain shift is used on purpose: softfloat's
+ * bfloat16_to_float32() handles NaNs in a way that does not match Hexagon.
+ */
+#define bf_to_sf(A) (((uint32_t)(A)) << 16)
+
+/*
+ * Widening bf16 arithmetic built on the existing sf helpers.
+ * These macros expect `env` to be in scope at the point of use.
+ */
+#define fp_mult_sf_bf(A, B) \
+    fp_mult_sf_sf(bf_to_sf(A), bf_to_sf(B), &env->fp_status)
+#define fp_add_sf_bf(A, B) \
+    fp_add_sf_sf(bf_to_sf(A), bf_to_sf(B), &env->fp_status)
+#define fp_sub_sf_bf(A, B) \
+    fp_sub_sf_sf(bf_to_sf(A), bf_to_sf(B), &env->fp_status)
+
+/*
+ * bf16 multiply with sf accumulate; op1/op2 are bf16 bit patterns and acc
+ * is an sf bit pattern. See DEF_FP_INSN_3(mult_sf_bf_acc, ...) in
+ * kvx_ieee.c.
+ */
+uint32_t fp_mult_sf_bf_acc(uint16_t op1, uint16_t op2, uint32_t acc,
+                           float_status *fp_status);
+
+/*
+ * bf16 min/max: widen both operands with bf_to_sf(), run the sf helper,
+ * then keep only the upper 16 bits of the sf result (no rounding).
+ * These macros expect `env` to be in scope at the point of use.
+ */
+#define fp_min_bf(A, B) ({ \
+    uint32_t _sf_min = fp_min_sf(bf_to_sf(A), bf_to_sf(B), \
+                                 &env->fp_status); \
+    (uint16_t)(_sf_min >> 16); \
+})
+
+#define fp_max_bf(A, B) ({ \
+    uint32_t _sf_max = fp_max_sf(bf_to_sf(A), bf_to_sf(B), \
+                                 &env->fp_status); \
+    (uint16_t)(_sf_max >> 16); \
+})
+
+/*
+ * Convert an sf bit pattern to bf16 with round-to-nearest-even.
+ *
+ * Any NaN input yields the upper half of FP32_DEF_NAN. Otherwise the low
+ * 16 bits are discarded after rounding: bit 15 is the rounding bit, and an
+ * exact tie (low 16 bits == 0x8000) rounds up only when the kept LSB
+ * (bit 16) is odd.
+ */
+static inline uint16_t sf_to_bf(int32_t A)
+{
+    uint32_t bits = A;
+    const uint32_t low17 = bits & 0x1FFFF;
+
+    if (float32_is_any_nan(A)) {
+        return (uint32_t)FP32_DEF_NAN >> 16;
+    }
+    /* 0x08000 in the low 17 bits means: exact tie and already even */
+    if ((low17 & 0x8000) && low17 != 0x08000) {
+        bits += 0x8000;
+    }
+    return bits >> 16;
+}
+
#endif
diff --git a/target/hexagon/mmvec/macros.h b/target/hexagon/mmvec/macros.h
index c342507d1a..b70996578e 100644
--- a/target/hexagon/mmvec/macros.h
+++ b/target/hexagon/mmvec/macros.h
@@ -25,6 +25,9 @@
#include "accel/tcg/probe.h"
#include "mmvec/kvx_ieee.h"
+/* Markers used by instruction semantics in ext.idef; expand to nothing. */
+#define fBFLOAT()
+#define fCVI_VX_NO_TMP_LD()
+
#ifndef QEMU_GENERATE
#define VdV (*(MMVector *restrict)(VdV_void))
#define VsV (*(MMVector *restrict)(VsV_void))
@@ -366,4 +369,6 @@
(int16_t)(A) > (int16_t)(B) : \
float16_compare((A), (B), &env->fp_status) == float_relation_greater)
+/*
+ * bf16 greater-than: widen both operands to sf bit patterns and reuse
+ * fCMPGT_SF. The shift is done on uint32_t (bf_to_sf) so a set bf16 sign
+ * bit is never shifted into a signed int, and the arguments are fully
+ * parenthesized. The int32_t cast keeps the operands signed 32-bit, as
+ * they were before.
+ */
+#define fCMPGT_BF(A, B) \
+    fCMPGT_SF((int32_t)bf_to_sf(A), (int32_t)bf_to_sf(B))
+
#endif
diff --git a/target/hexagon/mmvec/mmvec.h b/target/hexagon/mmvec/mmvec.h
index eaedfe0d6d..9d8d57c7c6 100644
--- a/target/hexagon/mmvec/mmvec.h
+++ b/target/hexagon/mmvec/mmvec.h
@@ -40,6 +40,7 @@ typedef union {
int8_t b[MAX_VEC_SIZE_BYTES / 1];
int32_t sf[MAX_VEC_SIZE_BYTES / 4]; /* single float (32-bit) */
int16_t hf[MAX_VEC_SIZE_BYTES / 2]; /* half float (16-bit) */
+ uint16_t bf[MAX_VEC_SIZE_BYTES / 2]; /* bfloat16 */
} MMVector;
typedef union {
diff --git a/target/hexagon/mmvec/kvx_ieee.c b/target/hexagon/mmvec/kvx_ieee.c
index bbeec09707..b5c434ad6d 100644
--- a/target/hexagon/mmvec/kvx_ieee.c
+++ b/target/hexagon/mmvec/kvx_ieee.c
@@ -229,3 +229,6 @@ int16_t conv_h_hf(uint16_t a, float_status *fp_status)
}
return float16_to_int16_round_to_zero(f1, fp_status);
}
+
+/*
+ * bf16 multiply-accumulate: the two bf16 operands are widened with
+ * bf_to_sf() and combined with the third operand via float32_muladd().
+ * NOTE(review): f1/f2/f3 are presumably the operand names supplied by
+ * DEF_FP_INSN_3 -- confirm against the macro definition.
+ */
+DEF_FP_INSN_3(mult_sf_bf_acc, 32, 16, 16, 32,
+ float32_muladd(bf_to_sf(f1), bf_to_sf(f2), f3, 0, fp_status))
diff --git a/target/hexagon/imported/mmvec/encode_ext.def b/target/hexagon/imported/mmvec/encode_ext.def
index 3f84a1691b..352a8ec14b 100644
--- a/target/hexagon/imported/mmvec/encode_ext.def
+++ b/target/hexagon/imported/mmvec/encode_ext.def
@@ -869,4 +869,19 @@ DEF_ENC(V6_vgthf_or,"00011100100vvvvvPP1uuuuu001101xx")
DEF_ENC(V6_vgtsf_xor,"00011100100vvvvvPP1uuuuu111010xx")
DEF_ENC(V6_vgthf_xor,"00011100100vvvvvPP1uuuuu111011xx")
+/* BFLOAT instructions */
+DEF_ENC(V6_vmpy_sf_bf,"00011101010vvvvvPP1uuuuu100ddddd")
+DEF_ENC(V6_vmpy_sf_bf_acc,"00011101000vvvvvPP1uuuuu000xxxxx")
+DEF_ENC(V6_vadd_sf_bf,"00011101010vvvvvPP1uuuuu110ddddd")
+DEF_ENC(V6_vsub_sf_bf,"00011101010vvvvvPP1uuuuu101ddddd")
+DEF_ENC(V6_vmax_bf,"00011101010vvvvvPP1uuuuu111ddddd")
+DEF_ENC(V6_vmin_bf,"00011101010vvvvvPP1uuuuu000ddddd")
+DEF_ENC(V6_vcvt_bf_sf,"00011101010vvvvvPP1uuuuu011ddddd")
+
+/* BFLOAT compare instructions */
+DEF_ENC(V6_vgtbf,"00011100100vvvvvPP1uuuuu011110dd")
+DEF_ENC(V6_vgtbf_and,"00011100100vvvvvPP1uuuuu110100xx")
+DEF_ENC(V6_vgtbf_or,"00011100100vvvvvPP1uuuuu001110xx")
+DEF_ENC(V6_vgtbf_xor,"00011100100vvvvvPP1uuuuu111100xx")
+
#endif /* NO MMVEC */
diff --git a/target/hexagon/imported/mmvec/ext.idef b/target/hexagon/imported/mmvec/ext.idef
index 304c4966d8..afe9de3716 100644
--- a/target/hexagon/imported/mmvec/ext.idef
+++ b/target/hexagon/imported/mmvec/ext.idef
@@ -3149,6 +3149,15 @@ ITERATOR_INSN_SHIFT_SLOT_FLT(16, vconv_hf_h,"Vd32.hf=Vu32.h",
} \
}
+/*
+ * Element-wise bf16 greater-than compare of VuV and VvV into predicate DEST.
+ * Walks the vector WIDTH bytes at a time; each element yields MASK when
+ * Vu > Vv (per fCMPGT_BF) and 0 otherwise, and the value stored through
+ * fSETQBITS is `ASRC ASRCOP VAL`.
+ * CMP and N are accepted but not used in the body.
+ */
+#define VCMPGT_BF(DEST, ASRC, ASRCOP, CMP, N, SRC, MASK, WIDTH) \
+{ \
+ fBFLOAT(); \
+ for (fHIDE(int) i = 0; i < fVBYTES(); i += WIDTH) { \
+ fHIDE(int) VAL = fCMPGT_BF(VuV.SRC[i/WIDTH],VvV.SRC[i/WIDTH]) ? MASK : 0; \
+ fSETQBITS(DEST,WIDTH,MASK,i,ASRC ASRCOP VAL); \
+ } \
+}
+
/* Vector SF compare */
#define MMVEC_CMPGT_SF(TYPE,TYPE2,DESCR,N,MASK,WIDTH,SRC) \
EXTINSN(V6_vgt##TYPE##_and, "Qx4&=vcmp.gt(Vu32." TYPE2 ",Vv32." TYPE2 ")", \
@@ -3187,8 +3196,63 @@ ITERATOR_INSN_SHIFT_SLOT_FLT(16, vconv_hf_h,"Vd32.hf=Vu32.h",
DESCR" greater than", \
VCMPGT_HF(QdV, , , ">", N, SRC, MASK, WIDTH))
+/*
+ * Vector BF compare.
+ * Expands to four instructions built on VCMPGT_BF: V6_vgt<TYPE>_and,
+ * V6_vgt<TYPE>_xor and V6_vgt<TYPE>_or combine the compare result with the
+ * current QxV bits, while V6_vgt<TYPE> writes the result to QdV directly.
+ */
+#define MMVEC_CMPGT_BF(TYPE,TYPE2,DESCR,N,MASK,WIDTH,SRC) \
+ EXTINSN(V6_vgt##TYPE##_and, "Qx4&=vcmp.gt(Vu32." TYPE2 ",Vv32." TYPE2 ")",\
+ ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VA,A_CVI_VA_2SRC,A_HVX_FLT), \
+ DESCR" greater than with predicate-and", \
+ VCMPGT_BF(QxV, fGETQBITS(QxV,WIDTH,MASK,i), &, ">", N, SRC, MASK, WIDTH)) \
+ EXTINSN(V6_vgt##TYPE##_xor, "Qx4^=vcmp.gt(Vu32." TYPE2 ",Vv32." TYPE2 ")", \
+ ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VA,A_CVI_VA_2SRC,A_HVX_FLT), \
+ DESCR" greater than with predicate-xor", \
+ VCMPGT_BF(QxV, fGETQBITS(QxV,WIDTH,MASK,i), ^, ">", N, SRC, MASK, WIDTH)) \
+ EXTINSN(V6_vgt##TYPE##_or, "Qx4|=vcmp.gt(Vu32." TYPE2 ",Vv32." TYPE2 ")", \
+ ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VA,A_CVI_VA_2SRC,A_HVX_FLT), \
+ DESCR" greater than with predicate-or", \
+ VCMPGT_BF(QxV, fGETQBITS(QxV,WIDTH,MASK,i), |, ">", N, SRC, MASK, WIDTH)) \
+ EXTINSN(V6_vgt##TYPE, "Qd4=vcmp.gt(Vu32." TYPE2 ",Vv32." TYPE2 ")", \
+ ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VA,A_CVI_VA_2SRC,A_HVX_FLT), \
+ DESCR" greater than", \
+ VCMPGT_BF(QdV, , , ">", N, SRC, MASK, WIDTH))
+
MMVEC_CMPGT_SF(sf,"sf","Vector sf Compare ", fVELEM(32), 0xF, 4, sf)
MMVEC_CMPGT_HF(hf,"hf","Vector hf Compare ", fVELEM(16), 0x3, 2, hf)
+MMVEC_CMPGT_BF(bf,"bf","Vector bf Compare ", fVELEM(16), 0x3, 2, bf)
+
+/******************************************************************************
+ BFloat arithmetic and max/min instructions
+ ******************************************************************************/
+
+ITERATOR_INSN_IEEE_FP_DOUBLE_32(32, vadd_sf_bf,
+ "Vdd32.sf=vadd(Vu32.bf,Vv32.bf)", "Vector IEEE add: bf widen to sf",
+ VddV.v[0].sf[i] = fp_add_sf_bf(VuV.bf[2*i], VvV.bf[2*i]);
+ VddV.v[1].sf[i] = fp_add_sf_bf(VuV.bf[2*i+1], VvV.bf[2*i+1]); fBFLOAT())
+ITERATOR_INSN_IEEE_FP_DOUBLE_32(32, vsub_sf_bf,
+ "Vdd32.sf=vsub(Vu32.bf,Vv32.bf)", "Vector IEEE sub: bf widen to sf",
+ VddV.v[0].sf[i] = fp_sub_sf_bf(VuV.bf[2*i], VvV.bf[2*i]);
+ VddV.v[1].sf[i] = fp_sub_sf_bf(VuV.bf[2*i+1], VvV.bf[2*i+1]); fBFLOAT())
+/* Widening multiply: even bf lanes go to Vdd.v[0], odd lanes to Vdd.v[1]. */
+ITERATOR_INSN_IEEE_FP_DOUBLE_32(32, vmpy_sf_bf,
+ "Vdd32.sf=vmpy(Vu32.bf,Vv32.bf)", "Vector IEEE mul: bf widen to sf",
+ VddV.v[0].sf[i] = fp_mult_sf_bf(VuV.bf[2*i], VvV.bf[2*i]);
+ VddV.v[1].sf[i] = fp_mult_sf_bf(VuV.bf[2*i+1], VvV.bf[2*i+1]); fBFLOAT())
+/*
+ * Multiply bf lanes and accumulate into the sf pair Vxx: even lanes update
+ * Vxx.v[0], odd lanes update Vxx.v[1].
+ */
+ITERATOR_INSN_IEEE_FP_DOUBLE_32(32, vmpy_sf_bf_acc,
+ "Vxx32.sf+=vmpy(Vu32.bf,Vv32.bf)", "Vector IEEE fma: bf widen to sf",
+ VxxV.v[0].sf[i] = fp_mult_sf_bf_acc(VuV.bf[2*i], VvV.bf[2*i],
+ VxxV.v[0].sf[i], &env->fp_status);
+ VxxV.v[1].sf[i] = fp_mult_sf_bf_acc(VuV.bf[2*i+1], VvV.bf[2*i+1],
+ VxxV.v[1].sf[i], &env->fp_status);
+ fCVI_VX_NO_TMP_LD(); fBFLOAT())
+ITERATOR_INSN_IEEE_FP_16(32, vcvt_bf_sf,
+ "Vd32.bf=vcvt(Vu32.sf,Vv32.sf)", "Vector IEEE cvt: sf to bf",
+ VdV.bf[2*i] = sf_to_bf(VuV.sf[i]);
+ VdV.bf[2*i+1] = sf_to_bf(VvV.sf[i]); fBFLOAT())
+
+ITERATOR_INSN_IEEE_FP_16_32_LATE(16, vmax_bf, "Vd32.bf=vmax(Vu32.bf,Vv32.bf)",
+ "Vector IEEE max: bf", VdV.bf[i] = fp_max_bf(VuV.bf[i], VvV.bf[i]);
+ fBFLOAT())
+/* Lane-wise bf16 minimum of Vu and Vv. */
+ITERATOR_INSN_IEEE_FP_16_32_LATE(16, vmin_bf, "Vd32.bf=vmin(Vu32.bf,Vv32.bf)",
+ "Vector IEEE min: bf", VdV.bf[i] = fp_min_bf(VuV.bf[i], VvV.bf[i]);
+ fBFLOAT())
/******************************************************************************
DEBUG Vector/Register Printing
--
2.37.2
On Mon, Mar 23, 2026 at 7:16 AM Matheus Tavares Bernardino <
matheus.bernardino@oss.qualcomm.com> wrote:
> Add HVX IEEE bfloat16 (bf16) instructions:
>
> Arithmetic operations:
> - V6_vadd_sf_bf, V6_vsub_sf_bf: add/sub bf16 widening to sf output
> - V6_vmpy_sf_bf: multiply bf16 widening to sf output
> - V6_vmpy_sf_bf_acc: multiply-accumulate bf16 widening to sf output
>
> Min/Max operations:
> - V6_vmin_bf, V6_vmax_bf: bf16 min/max
>
> Comparison operations:
> - V6_vgtbf: greater-than compare
> - V6_vgtbf_and, V6_vgtbf_or, V6_vgtbf_xor: predicate variants
>
> Conversion operations:
> - V6_vcvt_bf_sf: convert sf to bf16
>
> Signed-off-by: Matheus Tavares Bernardino <
> matheus.bernardino@oss.qualcomm.com>
> ---
> target/hexagon/mmvec/kvx_ieee.h | 36 +++++++++++
> target/hexagon/mmvec/macros.h | 5 ++
> target/hexagon/mmvec/mmvec.h | 1 +
> target/hexagon/mmvec/kvx_ieee.c | 3 +
> target/hexagon/imported/mmvec/encode_ext.def | 15 +++++
> target/hexagon/imported/mmvec/ext.idef | 64 ++++++++++++++++++++
> 6 files changed, 124 insertions(+)
>
> diff --git a/target/hexagon/mmvec/kvx_ieee.h
> b/target/hexagon/mmvec/kvx_ieee.h
> index 8a6816f6b3..eb670d4ec3 100644
> --- a/target/hexagon/mmvec/kvx_ieee.h
> +++ b/target/hexagon/mmvec/kvx_ieee.h
> @@ -80,4 +80,40 @@ int16_t conv_hf_h(int16_t a, float_status *fp_status);
> int32_t conv_w_sf(uint32_t a, float_status *fp_status);
> int16_t conv_h_hf(uint16_t a, float_status *fp_status);
>
> +/* IEEE BFloat instructions */
> +
> +#define fp_mult_sf_bf(A, B) \
> + fp_mult_sf_sf(((uint32_t)(A)) << 16, ((uint32_t)(B)) << 16,
> &env->fp_status)
> +#define fp_add_sf_bf(A, B) \
> + fp_add_sf_sf(((uint32_t)(A)) << 16, ((uint32_t)(B)) << 16,
> &env->fp_status)
> +#define fp_sub_sf_bf(A, B) \
> + fp_sub_sf_sf(((uint32_t)(A)) << 16, ((uint32_t)(B)) << 16,
> &env->fp_status)
>
Can we use softfloat routine bfloat16_to_float32 instead of shifting by 16?
> +
> +uint32_t fp_mult_sf_bf_acc(uint16_t op1, uint16_t op2, uint32_t acc,
> + float_status *fp_status);
> +
> +#define bf_to_sf(A) (((uint32_t)(A)) << 16)
>
Ditto
> +
> +#define fp_min_bf(A, B) ({ \
> + uint32_t _bf_res = fp_min_sf(bf_to_sf(A), bf_to_sf(B),
> &env->fp_status); \
> + (uint16_t)((_bf_res >> 16) & 0xffff); \
>
float32_to_bfloat16
> +})
> +
> +#define fp_max_bf(A, B) ({ \
> + uint32_t _bf_res = fp_max_sf(bf_to_sf(A), bf_to_sf(B),
> &env->fp_status); \
> + (uint16_t)((_bf_res >> 16) & 0xffff); \
>
Ditto
> +})
> +
> +static inline uint16_t sf_to_bf(int32_t A)
> +{
> + uint32_t rslt = A;
> + if ((rslt & 0x1FFFF) == 0x08000) {
> + /* do not round up if exactly .5 and even already */
> + } else if ((rslt & 0x8000) == 0x8000) {
> + rslt += 0x8000; /* rounding to nearest number */
> + }
> + rslt = float32_is_any_nan(A) ? FP32_DEF_NAN : rslt;
> + return rslt >> 16;
> +}
>
float32_to_bfloat16
> +
> #endif
> diff --git a/target/hexagon/mmvec/macros.h b/target/hexagon/mmvec/macros.h
> index c342507d1a..b70996578e 100644
> --- a/target/hexagon/mmvec/macros.h
> +++ b/target/hexagon/mmvec/macros.h
> @@ -25,6 +25,9 @@
> #include "accel/tcg/probe.h"
> #include "mmvec/kvx_ieee.h"
>
> +#define fBFLOAT()
> +#define fCVI_VX_NO_TMP_LD()
> +
> #ifndef QEMU_GENERATE
> #define VdV (*(MMVector *restrict)(VdV_void))
> #define VsV (*(MMVector *restrict)(VsV_void))
> @@ -366,4 +369,6 @@
> (int16_t)(A) > (int16_t)(B) : \
> float16_compare((A), (B), &env->fp_status) == float_relation_greater)
>
> +#define fCMPGT_BF(A, B) fCMPGT_SF(((int)A) << 16, ((int)B) << 16)
>
bfloat16_to_float32
> +
> #endif
> diff --git a/target/hexagon/mmvec/mmvec.h b/target/hexagon/mmvec/mmvec.h
> index eaedfe0d6d..9d8d57c7c6 100644
> --- a/target/hexagon/mmvec/mmvec.h
> +++ b/target/hexagon/mmvec/mmvec.h
> @@ -40,6 +40,7 @@ typedef union {
> int8_t b[MAX_VEC_SIZE_BYTES / 1];
> int32_t sf[MAX_VEC_SIZE_BYTES / 4]; /* single float (32-bit) */
> int16_t hf[MAX_VEC_SIZE_BYTES / 2]; /* half float (16-bit) */
> + uint16_t bf[MAX_VEC_SIZE_BYTES / 2]; /* bfloat16 */
>
Consider using bfloat16
Also float32 for sf and float16 for hf.
> } MMVector;
Thanks,
Taylor
On Mon, Mar 23, 2026 at 7:03 PM Taylor Simpson <ltaylorsimpson@gmail.com> wrote: > > On Mon, Mar 23, 2026 at 7:16 AM Matheus Tavares Bernardino <matheus.bernardino@oss.qualcomm.com> wrote: >> >> diff --git a/target/hexagon/mmvec/macros.h b/target/hexagon/mmvec/macros.h >> index c342507d1a..b70996578e 100644 >> --- a/target/hexagon/mmvec/macros.h >> +++ b/target/hexagon/mmvec/macros.h >> @@ -25,6 +25,9 @@ >> #include "accel/tcg/probe.h" >> #include "mmvec/kvx_ieee.h" >> >> +#define fBFLOAT() >> +#define fCVI_VX_NO_TMP_LD() >> + >> #ifndef QEMU_GENERATE >> #define VdV (*(MMVector *restrict)(VdV_void)) >> #define VsV (*(MMVector *restrict)(VsV_void)) >> @@ -366,4 +369,6 @@ >> (int16_t)(A) > (int16_t)(B) : \ >> float16_compare((A), (B), &env->fp_status) == float_relation_greater) >> >> +#define fCMPGT_BF(A, B) fCMPGT_SF(((int)A) << 16, ((int)B) << 16) > > > bfloat16_to_float32 I've been experimenting with this, but it looks like softfloat's bfloat16_to_float32 conversion will handle NaNs in a way that is incompatible with Hexagon. I think we will have to stick with the 16-bit shift.
On 4/1/26 08:03, Matheus Bernardino wrote: > On Mon, Mar 23, 2026 at 7:03 PM Taylor Simpson <ltaylorsimpson@gmail.com> wrote: >> >> On Mon, Mar 23, 2026 at 7:16 AM Matheus Tavares Bernardino <matheus.bernardino@oss.qualcomm.com> wrote: >>> >>> diff --git a/target/hexagon/mmvec/macros.h b/target/hexagon/mmvec/macros.h >>> index c342507d1a..b70996578e 100644 >>> --- a/target/hexagon/mmvec/macros.h >>> +++ b/target/hexagon/mmvec/macros.h >>> @@ -25,6 +25,9 @@ >>> #include "accel/tcg/probe.h" >>> #include "mmvec/kvx_ieee.h" >>> >>> +#define fBFLOAT() >>> +#define fCVI_VX_NO_TMP_LD() >>> + >>> #ifndef QEMU_GENERATE >>> #define VdV (*(MMVector *restrict)(VdV_void)) >>> #define VsV (*(MMVector *restrict)(VsV_void)) >>> @@ -366,4 +369,6 @@ >>> (int16_t)(A) > (int16_t)(B) : \ >>> float16_compare((A), (B), &env->fp_status) == float_relation_greater) >>> >>> +#define fCMPGT_BF(A, B) fCMPGT_SF(((int)A) << 16, ((int)B) << 16) >> >> >> bfloat16_to_float32 > > I've been experimenting with this, but it looks like softfloat's > bfloat16_to_float32 conversion will handle NaNs in a way that is > incompatible with Hexagon. I think we will have to stick with the > 16-bit shift. Yes indeed. Many Arm bfloat16 inputs require the same shift. It's only the "real" conversion insns that want bfloat16_to_float32. r~
© 2016 - 2026 Red Hat, Inc.