[v1] Add OCP FP8/FP4 and RISC-V Zvfofp8min/Zvfofp4min extension support

[PATCH 03/18] fpu/softfloat: Add convert operations(bf16, fp32) for OFP8 data types

Posted by Max Chou 1 month ago

This commit provides the convert operations for OFP8 (e4m3, e5m2) with the
following implementation-defined behaviors required by the RISC-V Zvfofp8min
extension:
- The canonical NaN of OFP8 e4m3 and e5m2 is 0x7f (ocpfp8_same_cnan)
- All of the NaNs of OFP8 are quiet NaNs (ocpfp8_only_qnan)

According to the definition in OFP8 spec, the conversion from a wider
format infinity depends on the saturation mode defined in the spec.

Signed-off-by: Max Chou <max.chou@sifive.com>
---
 fpu/softfloat-parts.c.inc |  77 +++++++++++-
 fpu/softfloat.c           | 241 ++++++++++++++++++++++++++++++++++++++
 include/fpu/softfloat.h   |  11 ++
 3 files changed, 323 insertions(+), 6 deletions(-)

diff --git a/fpu/softfloat-parts.c.inc b/fpu/softfloat-parts.c.inc
index 5e0438fc0b..d9ec3ca8ae 100644
--- a/fpu/softfloat-parts.c.inc
+++ b/fpu/softfloat-parts.c.inc
@@ -227,7 +227,8 @@ static void partsN(canonicalize)(FloatPartsN *p, float_status *status,
             p->exp = fmt->frac_shift - fmt->exp_bias
                    - shift + !has_pseudo_denormals;
         }
-    } else if (likely(p->exp < fmt->exp_max) || fmt->arm_althp) {
+    } else if (likely(p->exp < fmt->exp_max) || fmt->arm_althp ||
+               ocpfp_is_normal(p, fmt, false)) {
         p->cls = float_class_normal;
         p->exp -= fmt->exp_bias;
         frac_shl(p, fmt->frac_shift);
@@ -236,8 +237,12 @@ static void partsN(canonicalize)(FloatPartsN *p, float_status *status,
         p->cls = float_class_inf;
     } else {
         frac_shl(p, fmt->frac_shift);
-        p->cls = (parts_is_snan_frac(p->frac_hi, status)
-                  ? float_class_snan : float_class_qnan);
+        if (parts_is_snan_frac(p->frac_hi, status) == false ||
+            (fmt->ocpfp && status->ocp_fp8e5m2_no_signal_nan)) {
+            p->cls = float_class_qnan;
+        } else {
+            p->cls = float_class_snan;
+        }
     }
 }
 
@@ -313,8 +318,40 @@ static void partsN(uncanon_normal)(FloatPartsN *p, float_status *s,
             }
             p->frac_lo &= ~round_mask;
         }
+        p->exp = exp;
 
-        if (fmt->arm_althp) {
+        if (fmt->ocpfp) {
+            if (ocpfp_is_normal(p, fmt, true) == false) {
+                flags |= float_flag_overflow;
+                flags |= float_flag_inexact;
+                if (fmt->exp_size == 4) {
+                    if (fmt->ocpfp_sat || overflow_norm) {
+                        /* S.1111.110 */
+                        exp = exp_max;
+                        frac_clear(p);
+                        frac_addi(p, p, 0b110);
+                        frac_shl(p, frac_shift);
+                    } else {
+                        /* S.1111.111 NaN */
+                        p->cls = float_class_qnan;
+                        p->sign = (s->ocp_fp8_same_canonical_nan ? 0 : p->sign);
+                        exp = exp_max;
+                        frac_allones(p);
+                    }
+                } else if (fmt->exp_size == 5) {
+                    if (fmt->ocpfp_sat || overflow_norm) {
+                        /* S.11110.11 */
+                        exp = exp_max - 1;
+                        frac_allones(p);
+                    } else {
+                        /* S.11111.00 Inf */
+                        p->cls = float_class_inf;
+                        exp = exp_max;
+                        frac_clear(p);
+                    }
+                }
+            }
+        } else if (fmt->arm_althp) {
             /* ARM Alt HP eschews Inf and NaN for a wider exponent.  */
             if (unlikely(exp > exp_max)) {
                 /* Overflow.  Return the maximum normal.  */
@@ -435,8 +472,36 @@ static void partsN(uncanon)(FloatPartsN *p, float_status *s,
             return;
         case float_class_inf:
             g_assert(!fmt->arm_althp);
-            p->exp = fmt->exp_max;
-            frac_clear(p);
+            if (fmt->ocpfp) {
+                if (fmt->ocpfp_sat) {
+                    if (fmt->exp_size == 4) {
+                        /* S.1111.110 */
+                        p->exp = fmt->exp_max;
+                        frac_clear(p);
+                        frac_addi(p, p, 0x6);
+                    } else {
+                        /* S.11110.11 */
+                        p->exp = fmt->exp_max - 1;
+                        frac_allones(p);
+                    }
+                } else {
+                    if (fmt->exp_size == 4) {
+                        /* S.1111.111 NaN */
+                        p->cls = float_class_qnan;
+                        p->sign = (s->ocp_fp8_same_canonical_nan ? 0 : p->sign);
+                        p->exp = fmt->exp_max;
+                        frac_allones(p);
+                    } else {
+                        /* S.11111.00 Inf */
+                        p->cls = float_class_inf;
+                        p->exp = fmt->exp_max;
+                        frac_clear(p);
+                    }
+                }
+            } else {
+                p->exp = fmt->exp_max;
+                frac_clear(p);
+            }
             return;
         case float_class_qnan:
         case float_class_snan:
diff --git a/fpu/softfloat.c b/fpu/softfloat.c
index 8094358c2e..0c7f052ec0 100644
--- a/fpu/softfloat.c
+++ b/fpu/softfloat.c
@@ -544,6 +544,8 @@ typedef struct {
     int frac_shift;
     bool arm_althp;
     bool has_explicit_bit;
+    bool ocpfp;
+    bool ocpfp_sat;
     uint64_t round_mask;
 } FloatFmt;
 
@@ -560,6 +562,28 @@ typedef struct {
     .frac_shift     = (-F - 1) & 63,                    \
     .round_mask     = (1ull << ((-F - 1) & 63)) - 1
 
+static const FloatFmt float8_e4m3_params = {
+    FLOAT_PARAMS(4, 3),
+    .ocpfp = true
+};
+
+static const FloatFmt float8_e4m3_params_sat = {
+    FLOAT_PARAMS(4, 3),
+    .ocpfp = true,
+    .ocpfp_sat = true
+};
+
+static const FloatFmt float8_e5m2_params = {
+    FLOAT_PARAMS(5, 2),
+    .ocpfp = true
+};
+
+static const FloatFmt float8_e5m2_params_sat = {
+    FLOAT_PARAMS(5, 2),
+    .ocpfp = true,
+    .ocpfp_sat = true
+};
+
 static const FloatFmt float16_params = {
     FLOAT_PARAMS(5, 10)
 };
@@ -614,6 +638,16 @@ static void unpack_raw64(FloatParts64 *r, const FloatFmt *fmt, uint64_t raw)
     };
 }
 
+static void QEMU_FLATTEN float8_e4m3_unpack_raw(FloatParts64 *p, float8_e4m3 f)
+{
+    unpack_raw64(p, &float8_e4m3_params, f);
+}
+
+static void QEMU_FLATTEN float8_e5m2_unpack_raw(FloatParts64 *p, float8_e5m2 f)
+{
+    unpack_raw64(p, &float8_e5m2_params, f);
+}
+
 static void QEMU_FLATTEN float16_unpack_raw(FloatParts64 *p, float16 f)
 {
     unpack_raw64(p, &float16_params, f);
@@ -671,6 +705,16 @@ static uint64_t pack_raw64(const FloatParts64 *p, const FloatFmt *fmt)
     return ret;
 }
 
+static float8_e4m3 QEMU_FLATTEN float8_e4m3_pack_raw(const FloatParts64 *p)
+{
+    return make_float8_e4m3(pack_raw64(p, &float8_e4m3_params));
+}
+
+static float8_e5m2 QEMU_FLATTEN float8_e5m2_pack_raw(const FloatParts64 *p)
+{
+    return make_float8_e5m2(pack_raw64(p, &float8_e5m2_params));
+}
+
 static float16 QEMU_FLATTEN float16_pack_raw(const FloatParts64 *p)
 {
     return make_float16(pack_raw64(p, &float16_params));
@@ -1604,6 +1648,91 @@ static void frac128_widen(FloatParts256 *r, FloatParts128 *a)
 
 #define frac_widen(A, B)  FRAC_GENERIC_64_128(widen, B)(A, B)
 
+#define OCPFP_GENERIC_64_128(NAME, P) \
+    _Generic((P), FloatParts64 *: ocpfp64_##NAME, \
+                  FloatParts128 *: ocpfp128_##NAME)
+
+static bool ocpfp64_is_normal(const FloatParts64 *a, const FloatFmt *fmt,
+                              bool is_normalized)
+{
+    FloatParts64 input;
+    input.exp = a->exp;
+    input.frac = a->frac;
+    if (!is_normalized) {
+        frac64_shl(&input, fmt->frac_shift);
+        input.frac_hi |= DECOMPOSED_IMPLICIT_BIT;
+    }
+
+    if (fmt->ocpfp) {
+        if (fmt->exp_size == 4 && fmt->frac_size == 3) {
+            /*
+             * The OCP E4M3 format uses only two bit patterns for NaN (a
+             * single mantissa-exponent bit pattern with the sign bit) in
+             * order to increase emax to 8 and thus to increase the dynamic
+             * range by one binade.
+             */
+            FloatParts64 tmp;
+            frac64_clear(&tmp);
+            tmp.frac_lo = 0b110;
+            frac64_shl(&tmp, fmt->frac_shift);
+            tmp.frac_hi |= DECOMPOSED_IMPLICIT_BIT;
+            if (!(input.exp > fmt->exp_max ||
+                  (input.exp == fmt->exp_max &&
+                   frac64_cmp(&input, &tmp) == float_relation_greater))) {
+                return true;
+            }
+        } else if (fmt->exp_size == 5 && fmt->frac_size == 2) {
+            if (input.exp < fmt->exp_max) {
+                return true;
+            }
+        }
+    }
+
+    return false;
+}
+
+static bool ocpfp128_is_normal(const FloatParts128 *a, const FloatFmt *fmt,
+                              bool is_normalized)
+{
+    FloatParts128 input;
+    input.exp = a->exp;
+    input.frac_hi = a->frac_hi;
+    input.frac_lo = a->frac_lo;
+    if (!is_normalized) {
+        frac128_shl(&input, fmt->frac_shift);
+        input.frac_hi |= DECOMPOSED_IMPLICIT_BIT;
+    }
+
+    if (fmt->ocpfp) {
+        if (fmt->exp_size == 4 && fmt->frac_size == 3) {
+            /*
+             * The OCP E4M3 format uses only two bit patterns for NaN (a
+             * single mantissa-exponent bit pattern with the sign bit) in
+             * order to increase emax to 8 and thus to increase the dynamic
+             * range by one binade.
+             */
+            FloatParts128 tmp;
+            frac128_clear(&tmp);
+            tmp.frac_lo = 0b110;
+            frac128_shl(&tmp, fmt->frac_shift);
+            tmp.frac_hi |= DECOMPOSED_IMPLICIT_BIT;
+            if (!(input.exp > fmt->exp_max ||
+                  (input.exp == fmt->exp_max &&
+                   frac128_cmp(&input, &tmp) == float_relation_greater))) {
+                return true;
+            }
+        } else if (fmt->exp_size == 5 && fmt->frac_size == 2) {
+            if (input.exp < fmt->exp_max) {
+                return true;
+            }
+        }
+    }
+
+    return false;
+}
+
+#define ocpfp_is_normal(A, F, N)  OCPFP_GENERIC_64_128(is_normal, A)(A, F, N)
+
 /*
  * Reciprocal sqrt table.  1 bit of exponent, 6-bits of mantessa.
  * From https://git.musl-libc.org/cgit/musl/tree/src/math/sqrt_data.c
@@ -1662,6 +1791,20 @@ static const uint16_t rsqrt_tab[128] = {
  * Pack/unpack routines with a specific FloatFmt.
  */
 
+static void float8_e4m3_unpack_canonical(FloatParts64 *p, float8_e4m3 f,
+                                         float_status *s)
+{
+    float8_e4m3_unpack_raw(p, f);
+    parts_canonicalize(p, s, &float8_e4m3_params);
+}
+
+static void float8_e5m2_unpack_canonical(FloatParts64 *p, float8_e5m2 f,
+                                         float_status *s)
+{
+    float8_e5m2_unpack_raw(p, f);
+    parts_canonicalize(p, s, &float8_e5m2_params);
+}
+
 static void float16a_unpack_canonical(FloatParts64 *p, float16 f,
                                       float_status *s, const FloatFmt *params)
 {
@@ -1682,6 +1825,22 @@ static void bfloat16_unpack_canonical(FloatParts64 *p, bfloat16 f,
     parts_canonicalize(p, s, &bfloat16_params);
 }
 
+static float8_e4m3 float8_e4m3_round_pack_canonical(FloatParts64 *p,
+                                                    float_status *status,
+                                                    const FloatFmt *params)
+{
+    parts_uncanon(p, status, params);
+    return float8_e4m3_pack_raw(p);
+}
+
+static float8_e5m2 float8_e5m2_round_pack_canonical(FloatParts64 *p,
+                                                    float_status *status,
+                                                    const FloatFmt *params)
+{
+    parts_uncanon(p, status, params);
+    return float8_e5m2_pack_raw(p);
+}
+
 static float16 float16a_round_pack_canonical(FloatParts64 *p,
                                              float_status *s,
                                              const FloatFmt *params)
@@ -2759,6 +2918,23 @@ static void parts_float_to_ahp(FloatParts64 *a, float_status *s)
     }
 }
 
+static void parts_float_to_ofp8(FloatParts64 *a, float_status *s,
+                                const FloatFmt *fmt)
+{
+    if (is_nan(a->cls)) {
+        if (s->ocp_fp8_same_canonical_nan) {
+            if (a->cls == float_class_snan) {
+                float_raise(float_flag_invalid | float_flag_invalid_snan, s);
+            }
+            a->sign = 0;
+            a->exp = fmt->exp_max;
+            frac_allones(a);
+        } else {
+            parts_return_nan(a, s);
+        }
+    }
+}
+
 static void parts64_float_to_float(FloatParts64 *a, float_status *s)
 {
     if (is_nan(a->cls)) {
@@ -2823,6 +2999,71 @@ static void parts_float_to_float_widen(FloatParts128 *a, FloatParts64 *b,
     }
 }
 
+
+bfloat16 float8_e4m3_to_bfloat16(float8_e4m3 a, float_status *s)
+{
+    FloatParts64 p;
+
+    float8_e4m3_unpack_canonical(&p, a, s);
+    parts_float_to_float(&p, s);
+
+    return bfloat16_round_pack_canonical(&p, s);
+}
+
+bfloat16 float8_e5m2_to_bfloat16(float8_e5m2 a, float_status *s)
+{
+    FloatParts64 p;
+
+    float8_e5m2_unpack_canonical(&p, a, s);
+    parts_float_to_float(&p, s);
+
+    return bfloat16_round_pack_canonical(&p, s);
+}
+
+float8_e4m3 bfloat16_to_float8_e4m3(bfloat16 a, bool saturate, float_status *s)
+{
+    const FloatFmt *fmt = saturate ? &float8_e4m3_params_sat
+                                   : &float8_e4m3_params;
+    FloatParts64 p;
+
+    bfloat16_unpack_canonical(&p, a, s);
+    parts_float_to_ofp8(&p, s, fmt);
+    return float8_e4m3_round_pack_canonical(&p, s, fmt);
+}
+
+float8_e5m2 bfloat16_to_float8_e5m2(bfloat16 a, bool saturate, float_status *s)
+{
+    const FloatFmt *fmt = saturate ? &float8_e5m2_params_sat
+                                   : &float8_e5m2_params;
+    FloatParts64 p;
+
+    bfloat16_unpack_canonical(&p, a, s);
+    parts_float_to_ofp8(&p, s, fmt);
+    return float8_e5m2_round_pack_canonical(&p, s, fmt);
+}
+
+float8_e4m3 float32_to_float8_e4m3(float32 a, bool saturate, float_status *s)
+{
+    const FloatFmt *fmt = saturate ? &float8_e4m3_params_sat
+                                   : &float8_e4m3_params;
+    FloatParts64 p;
+
+    float32_unpack_canonical(&p, a, s);
+    parts_float_to_ofp8(&p, s, fmt);
+    return float8_e4m3_round_pack_canonical(&p, s, fmt);
+}
+
+float8_e5m2 float32_to_float8_e5m2(float32 a, bool saturate, float_status *s)
+{
+    const FloatFmt *fmt = saturate ? &float8_e5m2_params_sat
+                                   : &float8_e5m2_params;
+    FloatParts64 p;
+
+    float32_unpack_canonical(&p, a, s);
+    parts_float_to_ofp8(&p, s, fmt);
+    return float8_e5m2_round_pack_canonical(&p, s, fmt);
+}
+
 float32 float16_to_float32(float16 a, bool ieee, float_status *s)
 {
     const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
diff --git a/include/fpu/softfloat.h b/include/fpu/softfloat.h
index 6f7259f9dd..7ab585bfc8 100644
--- a/include/fpu/softfloat.h
+++ b/include/fpu/softfloat.h
@@ -189,6 +189,17 @@ float128 int128_to_float128(Int128, float_status *status);
 float128 uint64_to_float128(uint64_t, float_status *status);
 float128 uint128_to_float128(Int128, float_status *status);
 
+/*----------------------------------------------------------------------------
+| Software OCP FP8 conversion routines.
+*----------------------------------------------------------------------------*/
+
+bfloat16 float8_e4m3_to_bfloat16(float8_e4m3, float_status *status);
+bfloat16 float8_e5m2_to_bfloat16(float8_e5m2, float_status *status);
+float8_e4m3 bfloat16_to_float8_e4m3(bfloat16, bool saturate, float_status *status);
+float8_e5m2 bfloat16_to_float8_e5m2(bfloat16, bool saturate, float_status *status);
+float8_e4m3 float32_to_float8_e4m3(float32, bool saturate, float_status *status);
+float8_e5m2 float32_to_float8_e5m2(float32, bool saturate, float_status *status);
+
 /*----------------------------------------------------------------------------
 | Software OCP FP8 operations.
 *----------------------------------------------------------------------------*/
-- 
2.43.7

Re: [PATCH 03/18] fpu/softfloat: Add convert operations(bf16, fp32) for OFP8 data types

Posted by Richard Henderson 1 month ago

On 1/9/26 02:16, Max Chou wrote:
> diff --git a/fpu/softfloat.c b/fpu/softfloat.c
> index 8094358c2e..0c7f052ec0 100644
> --- a/fpu/softfloat.c
> +++ b/fpu/softfloat.c
> @@ -544,6 +544,8 @@ typedef struct {
>       int frac_shift;
>       bool arm_althp;
>       bool has_explicit_bit;
> +    bool ocpfp;
> +    bool ocpfp_sat;
>       uint64_t round_mask;
>   } FloatFmt;
>   
> @@ -560,6 +562,28 @@ typedef struct {
>       .frac_shift     = (-F - 1) & 63,                    \
>       .round_mask     = (1ull << ((-F - 1) & 63)) - 1
>   
> +static const FloatFmt float8_e4m3_params = {
> +    FLOAT_PARAMS(4, 3),
> +    .ocpfp = true
> +};
> +
> +static const FloatFmt float8_e4m3_params_sat = {
> +    FLOAT_PARAMS(4, 3),
> +    .ocpfp = true,
> +    .ocpfp_sat = true
> +};
> +
> +static const FloatFmt float8_e5m2_params = {
> +    FLOAT_PARAMS(5, 2),
> +    .ocpfp = true
> +};
> +
> +static const FloatFmt float8_e5m2_params_sat = {
> +    FLOAT_PARAMS(5, 2),
> +    .ocpfp = true,
> +    .ocpfp_sat = true
> +};

Saturation is not part of the format, it's part of the conversion operation.

I suggest you pass that as a bool parameter to bfloat16_to_float8_e4m3 etc.
This would then be handled as part of round-and-pack, maybe a separate step, maybe via 
float_round_nearest_even_max.

I'm not sure what to do with arm_althp vs ocpfp.  It seems like they have a couple of 
things in common.  Perhaps we should decompose these to separate behavior flags.


r~

Re: [PATCH 03/18] fpu/softfloat: Add convert operations(bf16, fp32) for OFP8 data types

Posted by Max Chou 3 weeks, 4 days ago

On 2026-01-10 14:20, Richard Henderson wrote:
> I suggest you pass that as a bool parameter to bfloat16_to_float8_e4m3 etc.
> This would then be handled as part of round-and-pack, maybe a separate step,
> maybe via float_round_nearest_even_max.
> 

Thanks for the suggestion!
I will replace the format's saturate flag with a bool parameter on the
convert functions in v2.

> I'm not sure what to do with arm_althp vs ocpfp.  It seems like they have a
> couple of things in common.  Perhaps we should decompose these to separate
> behavior flags.
> 
> 
> r~

I agree that we should decompose these into separate behavior flags. I
will try to introduce some behavior flags (like no infinity, maximum
normal pattern, etc.) in v2.

rnax