This commit provides the conversion operations for OFP8 (e4m3, e5m2) with
the following implementation-defined behaviors required by the RISC-V
Zvfofp8min extension:
- The canonical NaN of OFP8 e4m3 and e5m2 is 0x7f (ocpfp8_same_cnan)
- All of the NaNs of OFP8 are quiet NaNs (ocpfp8_only_qnan)
According to the OFP8 specification, the result of converting an infinity
from a wider format depends on the saturation mode.
Signed-off-by: Max Chou <max.chou@sifive.com>
---
fpu/softfloat-parts.c.inc | 77 +++++++++++-
fpu/softfloat.c | 241 ++++++++++++++++++++++++++++++++++++++
include/fpu/softfloat.h | 11 ++
3 files changed, 323 insertions(+), 6 deletions(-)
diff --git a/fpu/softfloat-parts.c.inc b/fpu/softfloat-parts.c.inc
index 5e0438fc0b..d9ec3ca8ae 100644
--- a/fpu/softfloat-parts.c.inc
+++ b/fpu/softfloat-parts.c.inc
@@ -227,7 +227,8 @@ static void partsN(canonicalize)(FloatPartsN *p, float_status *status,
p->exp = fmt->frac_shift - fmt->exp_bias
- shift + !has_pseudo_denormals;
}
- } else if (likely(p->exp < fmt->exp_max) || fmt->arm_althp) {
+ } else if (likely(p->exp < fmt->exp_max) || fmt->arm_althp ||
+ ocpfp_is_normal(p, fmt, false)) {
p->cls = float_class_normal;
p->exp -= fmt->exp_bias;
frac_shl(p, fmt->frac_shift);
@@ -236,8 +237,12 @@ static void partsN(canonicalize)(FloatPartsN *p, float_status *status,
p->cls = float_class_inf;
} else {
frac_shl(p, fmt->frac_shift);
- p->cls = (parts_is_snan_frac(p->frac_hi, status)
- ? float_class_snan : float_class_qnan);
+ if (parts_is_snan_frac(p->frac_hi, status) == false ||
+ (fmt->ocpfp && status->ocp_fp8e5m2_no_signal_nan)) {
+ p->cls = float_class_qnan;
+ } else {
+ p->cls = float_class_snan;
+ }
}
}
@@ -313,8 +318,40 @@ static void partsN(uncanon_normal)(FloatPartsN *p, float_status *s,
}
p->frac_lo &= ~round_mask;
}
+ p->exp = exp;
- if (fmt->arm_althp) {
+ if (fmt->ocpfp) {
+ if (ocpfp_is_normal(p, fmt, true) == false) {
+ flags |= float_flag_overflow;
+ flags |= float_flag_inexact;
+ if (fmt->exp_size == 4) {
+ if (fmt->ocpfp_sat || overflow_norm) {
+ /* S.1111.110 */
+ exp = exp_max;
+ frac_clear(p);
+ frac_addi(p, p, 0b110);
+ frac_shl(p, frac_shift);
+ } else {
+ /* S.1111.111 NaN */
+ p->cls = float_class_qnan;
+ p->sign = (s->ocp_fp8_same_canonical_nan ? 0 : p->sign);
+ exp = exp_max;
+ frac_allones(p);
+ }
+ } else if (fmt->exp_size == 5) {
+ if (fmt->ocpfp_sat || overflow_norm) {
+ /* S.11110.11 */
+ exp = exp_max - 1;
+ frac_allones(p);
+ } else {
+ /* S.11111.00 Inf */
+ p->cls = float_class_inf;
+ exp = exp_max;
+ frac_clear(p);
+ }
+ }
+ }
+ } else if (fmt->arm_althp) {
/* ARM Alt HP eschews Inf and NaN for a wider exponent. */
if (unlikely(exp > exp_max)) {
/* Overflow. Return the maximum normal. */
@@ -435,8 +472,36 @@ static void partsN(uncanon)(FloatPartsN *p, float_status *s,
return;
case float_class_inf:
g_assert(!fmt->arm_althp);
- p->exp = fmt->exp_max;
- frac_clear(p);
+ if (fmt->ocpfp) {
+ if (fmt->ocpfp_sat) {
+ if (fmt->exp_size == 4) {
+ /* S.1111.110 */
+ p->exp = fmt->exp_max;
+ frac_clear(p);
+ frac_addi(p, p, 0x6);
+ } else {
+ /* S.11110.11 */
+ p->exp = fmt->exp_max - 1;
+ frac_allones(p);
+ }
+ } else {
+ if (fmt->exp_size == 4) {
+ /* S.1111.111 NaN */
+ p->cls = float_class_qnan;
+ p->sign = (s->ocp_fp8_same_canonical_nan ? 0 : p->sign);
+ p->exp = fmt->exp_max;
+ frac_allones(p);
+ } else {
+ /* S.11111.00 Inf */
+ p->cls = float_class_inf;
+ p->exp = fmt->exp_max;
+ frac_clear(p);
+ }
+ }
+ } else {
+ p->exp = fmt->exp_max;
+ frac_clear(p);
+ }
return;
case float_class_qnan:
case float_class_snan:
diff --git a/fpu/softfloat.c b/fpu/softfloat.c
index 8094358c2e..0c7f052ec0 100644
--- a/fpu/softfloat.c
+++ b/fpu/softfloat.c
@@ -544,6 +544,8 @@ typedef struct {
int frac_shift;
bool arm_althp;
bool has_explicit_bit;
+ bool ocpfp;
+ bool ocpfp_sat;
uint64_t round_mask;
} FloatFmt;
@@ -560,6 +562,28 @@ typedef struct {
.frac_shift = (-F - 1) & 63, \
.round_mask = (1ull << ((-F - 1) & 63)) - 1
+static const FloatFmt float8_e4m3_params = {
+ FLOAT_PARAMS(4, 3),
+ .ocpfp = true
+};
+
+static const FloatFmt float8_e4m3_params_sat = {
+ FLOAT_PARAMS(4, 3),
+ .ocpfp = true,
+ .ocpfp_sat = true
+};
+
+static const FloatFmt float8_e5m2_params = {
+ FLOAT_PARAMS(5, 2),
+ .ocpfp = true
+};
+
+static const FloatFmt float8_e5m2_params_sat = {
+ FLOAT_PARAMS(5, 2),
+ .ocpfp = true,
+ .ocpfp_sat = true
+};
+
static const FloatFmt float16_params = {
FLOAT_PARAMS(5, 10)
};
@@ -614,6 +638,16 @@ static void unpack_raw64(FloatParts64 *r, const FloatFmt *fmt, uint64_t raw)
};
}
+static void QEMU_FLATTEN float8_e4m3_unpack_raw(FloatParts64 *p, float8_e4m3 f)
+{
+ unpack_raw64(p, &float8_e4m3_params, f);
+}
+
+static void QEMU_FLATTEN float8_e5m2_unpack_raw(FloatParts64 *p, float8_e5m2 f)
+{
+ unpack_raw64(p, &float8_e5m2_params, f);
+}
+
static void QEMU_FLATTEN float16_unpack_raw(FloatParts64 *p, float16 f)
{
unpack_raw64(p, &float16_params, f);
@@ -671,6 +705,16 @@ static uint64_t pack_raw64(const FloatParts64 *p, const FloatFmt *fmt)
return ret;
}
+static float8_e4m3 QEMU_FLATTEN float8_e4m3_pack_raw(const FloatParts64 *p)
+{
+ return make_float8_e4m3(pack_raw64(p, &float8_e4m3_params));
+}
+
+static float8_e5m2 QEMU_FLATTEN float8_e5m2_pack_raw(const FloatParts64 *p)
+{
+ return make_float8_e5m2(pack_raw64(p, &float8_e5m2_params));
+}
+
static float16 QEMU_FLATTEN float16_pack_raw(const FloatParts64 *p)
{
return make_float16(pack_raw64(p, &float16_params));
@@ -1604,6 +1648,91 @@ static void frac128_widen(FloatParts256 *r, FloatParts128 *a)
#define frac_widen(A, B) FRAC_GENERIC_64_128(widen, B)(A, B)
+#define OCPFP_GENERIC_64_128(NAME, P) \
+ _Generic((P), FloatParts64 *: ocpfp64_##NAME, \
+ FloatParts128 *: ocpfp128_##NAME)
+
+static bool ocpfp64_is_normal(const FloatParts64 *a, const FloatFmt *fmt,
+ bool is_normalized)
+{
+ FloatParts64 input;
+ input.exp = a->exp;
+ input.frac = a->frac;
+ if (!is_normalized) {
+ frac64_shl(&input, fmt->frac_shift);
+ input.frac_hi |= DECOMPOSED_IMPLICIT_BIT;
+ }
+
+ if (fmt->ocpfp) {
+ if (fmt->exp_size == 4 && fmt->frac_size == 3) {
+ /*
+ * The OCP E4M3 format uses only two bit patterns for NaN (a
+ * single mantissa-exponent bit pattern with the sign bit) in
+ * order to increase emax to 8 and thus to increase the dynamic
+ * range by one binade.
+ */
+ FloatParts64 tmp;
+ frac64_clear(&tmp);
+ tmp.frac_lo = 0b110;
+ frac64_shl(&tmp, fmt->frac_shift);
+ tmp.frac_hi |= DECOMPOSED_IMPLICIT_BIT;
+ if (!(input.exp > fmt->exp_max ||
+ (input.exp == fmt->exp_max &&
+ frac64_cmp(&input, &tmp) == float_relation_greater))) {
+ return true;
+ }
+ } else if (fmt->exp_size == 5 && fmt->frac_size == 2) {
+ if (input.exp < fmt->exp_max) {
+ return true;
+ }
+ }
+ }
+
+ return false;
+}
+
+static bool ocpfp128_is_normal(const FloatParts128 *a, const FloatFmt *fmt,
+ bool is_normalized)
+{
+ FloatParts128 input;
+ input.exp = a->exp;
+ input.frac_hi = a->frac_hi;
+ input.frac_lo = a->frac_lo;
+ if (!is_normalized) {
+ frac128_shl(&input, fmt->frac_shift);
+ input.frac_hi |= DECOMPOSED_IMPLICIT_BIT;
+ }
+
+ if (fmt->ocpfp) {
+ if (fmt->exp_size == 4 && fmt->frac_size == 3) {
+ /*
+ * The OCP E4M3 format uses only two bit patterns for NaN (a
+ * single mantissa-exponent bit pattern with the sign bit) in
+ * order to increase emax to 8 and thus to increase the dynamic
+ * range by one binade.
+ */
+ FloatParts128 tmp;
+ frac128_clear(&tmp);
+ tmp.frac_lo = 0b110;
+ frac128_shl(&tmp, fmt->frac_shift);
+ tmp.frac_hi |= DECOMPOSED_IMPLICIT_BIT;
+ if (!(input.exp > fmt->exp_max ||
+ (input.exp == fmt->exp_max &&
+ frac128_cmp(&input, &tmp) == float_relation_greater))) {
+ return true;
+ }
+ } else if (fmt->exp_size == 5 && fmt->frac_size == 2) {
+ if (input.exp < fmt->exp_max) {
+ return true;
+ }
+ }
+ }
+
+ return false;
+}
+
+#define ocpfp_is_normal(A, F, N) OCPFP_GENERIC_64_128(is_normal, A)(A, F, N)
+
/*
* Reciprocal sqrt table. 1 bit of exponent, 6-bits of mantessa.
* From https://git.musl-libc.org/cgit/musl/tree/src/math/sqrt_data.c
@@ -1662,6 +1791,20 @@ static const uint16_t rsqrt_tab[128] = {
* Pack/unpack routines with a specific FloatFmt.
*/
+static void float8_e4m3_unpack_canonical(FloatParts64 *p, float8_e4m3 f,
+ float_status *s)
+{
+ float8_e4m3_unpack_raw(p, f);
+ parts_canonicalize(p, s, &float8_e4m3_params);
+}
+
+static void float8_e5m2_unpack_canonical(FloatParts64 *p, float8_e5m2 f,
+ float_status *s)
+{
+ float8_e5m2_unpack_raw(p, f);
+ parts_canonicalize(p, s, &float8_e5m2_params);
+}
+
static void float16a_unpack_canonical(FloatParts64 *p, float16 f,
float_status *s, const FloatFmt *params)
{
@@ -1682,6 +1825,22 @@ static void bfloat16_unpack_canonical(FloatParts64 *p, bfloat16 f,
parts_canonicalize(p, s, &bfloat16_params);
}
+static float8_e4m3 float8_e4m3_round_pack_canonical(FloatParts64 *p,
+ float_status *status,
+ const FloatFmt *params)
+{
+ parts_uncanon(p, status, params);
+ return float8_e4m3_pack_raw(p);
+}
+
+static float8_e5m2 float8_e5m2_round_pack_canonical(FloatParts64 *p,
+ float_status *status,
+ const FloatFmt *params)
+{
+ parts_uncanon(p, status, params);
+ return float8_e5m2_pack_raw(p);
+}
+
static float16 float16a_round_pack_canonical(FloatParts64 *p,
float_status *s,
const FloatFmt *params)
@@ -2759,6 +2918,23 @@ static void parts_float_to_ahp(FloatParts64 *a, float_status *s)
}
}
+static void parts_float_to_ofp8(FloatParts64 *a, float_status *s,
+ const FloatFmt *fmt)
+{
+ if (is_nan(a->cls)) {
+ if (s->ocp_fp8_same_canonical_nan) {
+ if (a->cls == float_class_snan) {
+ float_raise(float_flag_invalid | float_flag_invalid_snan, s);
+ }
+ a->sign = 0;
+ a->exp = fmt->exp_max;
+ frac_allones(a);
+ } else {
+ parts_return_nan(a, s);
+ }
+ }
+}
+
static void parts64_float_to_float(FloatParts64 *a, float_status *s)
{
if (is_nan(a->cls)) {
@@ -2823,6 +2999,71 @@ static void parts_float_to_float_widen(FloatParts128 *a, FloatParts64 *b,
}
}
+
+bfloat16 float8_e4m3_to_bfloat16(float8_e4m3 a, float_status *s)
+{
+ FloatParts64 p;
+
+ float8_e4m3_unpack_canonical(&p, a, s);
+ parts_float_to_float(&p, s);
+
+ return bfloat16_round_pack_canonical(&p, s);
+}
+
+bfloat16 float8_e5m2_to_bfloat16(float8_e5m2 a, float_status *s)
+{
+ FloatParts64 p;
+
+ float8_e5m2_unpack_canonical(&p, a, s);
+ parts_float_to_float(&p, s);
+
+ return bfloat16_round_pack_canonical(&p, s);
+}
+
+float8_e4m3 bfloat16_to_float8_e4m3(bfloat16 a, bool saturate, float_status *s)
+{
+ const FloatFmt *fmt = saturate ? &float8_e4m3_params_sat
+ : &float8_e4m3_params;
+ FloatParts64 p;
+
+ bfloat16_unpack_canonical(&p, a, s);
+ parts_float_to_ofp8(&p, s, fmt);
+ return float8_e4m3_round_pack_canonical(&p, s, fmt);
+}
+
+float8_e5m2 bfloat16_to_float8_e5m2(bfloat16 a, bool saturate, float_status *s)
+{
+ const FloatFmt *fmt = saturate ? &float8_e5m2_params_sat
+ : &float8_e5m2_params;
+ FloatParts64 p;
+
+ bfloat16_unpack_canonical(&p, a, s);
+ parts_float_to_ofp8(&p, s, fmt);
+ return float8_e5m2_round_pack_canonical(&p, s, fmt);
+}
+
+float8_e4m3 float32_to_float8_e4m3(float32 a, bool saturate, float_status *s)
+{
+ const FloatFmt *fmt = saturate ? &float8_e4m3_params_sat
+ : &float8_e4m3_params;
+ FloatParts64 p;
+
+ float32_unpack_canonical(&p, a, s);
+ parts_float_to_ofp8(&p, s, fmt);
+ return float8_e4m3_round_pack_canonical(&p, s, fmt);
+}
+
+float8_e5m2 float32_to_float8_e5m2(float32 a, bool saturate, float_status *s)
+{
+ const FloatFmt *fmt = saturate ? &float8_e5m2_params_sat
+ : &float8_e5m2_params;
+ FloatParts64 p;
+
+ float32_unpack_canonical(&p, a, s);
+ parts_float_to_ofp8(&p, s, fmt);
+ return float8_e5m2_round_pack_canonical(&p, s, fmt);
+}
+
float32 float16_to_float32(float16 a, bool ieee, float_status *s)
{
const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
diff --git a/include/fpu/softfloat.h b/include/fpu/softfloat.h
index 6f7259f9dd..7ab585bfc8 100644
--- a/include/fpu/softfloat.h
+++ b/include/fpu/softfloat.h
@@ -189,6 +189,17 @@ float128 int128_to_float128(Int128, float_status *status);
float128 uint64_to_float128(uint64_t, float_status *status);
float128 uint128_to_float128(Int128, float_status *status);
+/*----------------------------------------------------------------------------
+| Software OCP FP8 conversion routines.
+*----------------------------------------------------------------------------*/
+
+bfloat16 float8_e4m3_to_bfloat16(float8_e4m3, float_status *status);
+bfloat16 float8_e5m2_to_bfloat16(float8_e5m2, float_status *status);
+float8_e4m3 bfloat16_to_float8_e4m3(bfloat16, bool saturate, float_status *status);
+float8_e5m2 bfloat16_to_float8_e5m2(bfloat16, bool saturate, float_status *status);
+float8_e4m3 float32_to_float8_e4m3(float32, bool saturate, float_status *status);
+float8_e5m2 float32_to_float8_e5m2(float32, bool saturate, float_status *status);
+
/*----------------------------------------------------------------------------
| Software OCP FP8 operations.
*----------------------------------------------------------------------------*/
--
2.43.7
On 1/9/26 02:16, Max Chou wrote:
> diff --git a/fpu/softfloat.c b/fpu/softfloat.c
> index 8094358c2e..0c7f052ec0 100644
> --- a/fpu/softfloat.c
> +++ b/fpu/softfloat.c
> @@ -544,6 +544,8 @@ typedef struct {
> int frac_shift;
> bool arm_althp;
> bool has_explicit_bit;
> + bool ocpfp;
> + bool ocpfp_sat;
> uint64_t round_mask;
> } FloatFmt;
>
> @@ -560,6 +562,28 @@ typedef struct {
> .frac_shift = (-F - 1) & 63, \
> .round_mask = (1ull << ((-F - 1) & 63)) - 1
>
> +static const FloatFmt float8_e4m3_params = {
> + FLOAT_PARAMS(4, 3),
> + .ocpfp = true
> +};
> +
> +static const FloatFmt float8_e4m3_params_sat = {
> + FLOAT_PARAMS(4, 3),
> + .ocpfp = true,
> + .ocpfp_sat = true
> +};
> +
> +static const FloatFmt float8_e5m2_params = {
> + FLOAT_PARAMS(5, 2),
> + .ocpfp = true
> +};
> +
> +static const FloatFmt float8_e5m2_params_sat = {
> + FLOAT_PARAMS(5, 2),
> + .ocpfp = true,
> + .ocpfp_sat = true
> +};
Saturation is not part of the format, it's part of the conversion operation.
I suggest you pass that as a bool parameter to bfloat16_to_float8_e4m3 etc.
This would then be handled as part of round-and-pack, maybe a separate step, maybe via
float_round_nearest_even_max.
I'm not sure what to do with arm_althp vs ocpfp. It seems like they have a couple of
things in common. Perhaps we should decompose these to separate behavior flags.
r~
On 2026-01-10 14:20, Richard Henderson wrote:
> I suggest you pass that as a bool parameter to bfloat16_to_float8_e4m3 etc.
> This would then be handled as part of round-and-pack, maybe a separate step,
> maybe via float_round_nearest_even_max.
>
Thanks for the suggestion!
I will replace the format saturate flag with a bool parameter of the conversion functions in v2.
> I'm not sure what to do with arm_althp vs ocpfp. It seems like they have a
> couple of things in common. Perhaps we should decompose these to separate
> behavior flags.
>
>
> r~
I agree that we should decompose these into separate behavior flags.
I will try to introduce some behavior flags (like no infinity, maximum normal pattern, etc.) in v2.
rnax
© 2016 - 2026 Red Hat, Inc.