This commit provides the implementation defined behavior flags and the basic
operation support for the OCP float8 data types(E4M3 & E5M2).
According to the definition in OFP8 spec, the conversion from a wider
format infinity depends on the saturation mode defined in the spec.
Signed-off-by: Max Chou <max.chou@sifive.com>
---
fpu/softfloat-parts.c.inc | 159 +++++++++++++++++++++------
fpu/softfloat-specialize.c.inc | 62 +++++++++++
fpu/softfloat.c | 191 +++++++++++++++++++++++++++++++--
include/fpu/softfloat-types.h | 12 +++
include/fpu/softfloat.h | 81 ++++++++++++++
5 files changed, 467 insertions(+), 38 deletions(-)
diff --git a/fpu/softfloat-parts.c.inc b/fpu/softfloat-parts.c.inc
index 5e0438fc0b..eee7daae4d 100644
--- a/fpu/softfloat-parts.c.inc
+++ b/fpu/softfloat-parts.c.inc
@@ -227,11 +227,28 @@ static void partsN(canonicalize)(FloatPartsN *p, float_status *status,
p->exp = fmt->frac_shift - fmt->exp_bias
- shift + !has_pseudo_denormals;
}
- } else if (likely(p->exp < fmt->exp_max) || fmt->arm_althp) {
+ } else if (likely(p->exp < fmt->exp_max)) {
p->cls = float_class_normal;
p->exp -= fmt->exp_bias;
frac_shl(p, fmt->frac_shift);
p->frac_hi |= DECOMPOSED_IMPLICIT_BIT;
+ } else if (fmt->limited_nan) {
+ /*
+ * Formats with limited NaN encodings (E4M3, E2M1, ARM Alt HP).
+ */
+ frac_shl(p, fmt->frac_shift);
+ p->frac_hi |= DECOMPOSED_IMPLICIT_BIT;
+ if (fmt->normal_frac_max == NORMAL_FRAC_MAX_ALL ||
+ p->frac_hi <= fmt->normal_frac_max) {
+ p->cls = float_class_normal;
+ p->exp -= fmt->exp_bias;
+ } else {
+ if (parts_is_snan_frac(p->frac_hi, status)) {
+ p->cls = float_class_snan;
+ } else {
+ p->cls = float_class_qnan;
+ }
+ }
} else if (likely(frac_eqz(p))) {
p->cls = float_class_inf;
} else {
@@ -241,14 +258,39 @@ static void partsN(canonicalize)(FloatPartsN *p, float_status *status,
}
}
+/*
+ * Set FloatPartsN to the maximum normal value for the given format.
+ * - IEEE formats (!no_infinity): exp = exp_max - 1, frac = all ones
+ * - Limited NaN formats (E4M3): exp = exp_max, frac = normal_frac_max
+ * - No NaN/InF formats (E2M1, ARM AHP): exp = exp_max, frac = all ones
+ */
+static void partsN(set_max_normal)(FloatPartsN *p, const FloatFmt *fmt)
+{
+ if (!fmt->no_infinity) {
+ p->exp = fmt->exp_max - 1;
+ frac_allones(p);
+ } else if (fmt->normal_frac_max != NORMAL_FRAC_MAX_ALL) {
+ p->exp = fmt->exp_max;
+ frac_clear(p);
+ p->frac_hi = fmt->normal_frac_max;
+ } else {
+ p->exp = fmt->exp_max;
+ frac_allones(p);
+ }
+}
+
/*
* Round and uncanonicalize a floating-point number by parts. There
* are FRAC_SHIFT bits that may require rounding at the bottom of the
* fraction; these bits will be removed. The exponent will be biased
* by EXP_BIAS and must be bounded by [EXP_MAX-1, 0].
+ *
+ * The saturate parameter controls saturation behavior for formats that
+ * support it (OCP FP8 E4M3/E5M2). When true, overflow produces max normal
+ * instead of infinity (E5M2) or NaN (E4M3).
*/
static void partsN(uncanon_normal)(FloatPartsN *p, float_status *s,
- const FloatFmt *fmt)
+ const FloatFmt *fmt, bool saturate)
{
const int exp_max = fmt->exp_max;
const int frac_shift = fmt->frac_shift;
@@ -256,8 +298,8 @@ static void partsN(uncanon_normal)(FloatPartsN *p, float_status *s,
const uint64_t frac_lsb = round_mask + 1;
const uint64_t frac_lsbm1 = round_mask ^ (round_mask >> 1);
const uint64_t roundeven_mask = round_mask | frac_lsb;
+ bool overflow_norm = saturate;
uint64_t inc;
- bool overflow_norm = false;
int exp, flags = 0;
switch (s->float_rounding_mode) {
@@ -313,30 +355,64 @@ static void partsN(uncanon_normal)(FloatPartsN *p, float_status *s,
}
p->frac_lo &= ~round_mask;
}
+ p->exp = exp;
- if (fmt->arm_althp) {
- /* ARM Alt HP eschews Inf and NaN for a wider exponent. */
- if (unlikely(exp > exp_max)) {
- /* Overflow. Return the maximum normal. */
- flags = float_flag_invalid;
- exp = exp_max;
- frac_allones(p);
- p->frac_lo &= ~round_mask;
+ /*
+ * Unified overflow handling based on format capabilities.
+ * 1. Format has infinity -> overflow to infinity (or saturate)
+ * 2. Format has NaN but no infinity -> overflow to NaN (or saturate)
+ * 3. Format has neither -> always saturate
+ */
+ if (!fmt->no_infinity) {
+ if (unlikely(exp >= exp_max)) {
+ flags |= float_flag_overflow;
+ if (s->rebias_overflow) {
+ exp -= fmt->exp_re_bias;
+ } else if (overflow_norm) {
+ flags |= float_flag_inexact;
+ parts_set_max_normal(p, fmt);
+ exp = p->exp;
+ p->frac_lo &= ~round_mask;
+ } else {
+ flags |= float_flag_inexact;
+ p->cls = float_class_inf;
+ exp = exp_max;
+ frac_clear(p);
+ }
}
- } else if (unlikely(exp >= exp_max)) {
- flags |= float_flag_overflow;
- if (s->rebias_overflow) {
- exp -= fmt->exp_re_bias;
- } else if (overflow_norm) {
+ } else if (fmt_has_nan_encoding(fmt)) {
+ bool is_overflow = (exp > exp_max) ||
+ (exp == exp_max &&
+ p->frac_hi > fmt->normal_frac_max);
+
+ if (unlikely(is_overflow)) {
+ flags |= float_flag_overflow;
flags |= float_flag_inexact;
- exp = exp_max - 1;
- frac_allones(p);
+
+ if (overflow_norm) {
+ parts_set_max_normal(p, fmt);
+ exp = p->exp;
+ } else {
+ uint8_t dnan = s->default_nan_pattern;
+ p->cls = float_class_qnan;
+ p->sign = dnan >> 7;
+ exp = exp_max;
+ frac_allones(p);
+ }
+ }
+ } else {
+ if (unlikely(exp > exp_max)) {
+ if (fmt->overflow_raises_invalid) {
+ /* ARM Alt HP: raise Invalid, not Overflow */
+ flags = float_flag_invalid;
+ } else {
+ flags |= float_flag_overflow;
+ flags |= float_flag_inexact;
+ }
+
+ parts_set_max_normal(p, fmt);
+ exp = p->exp;
p->frac_lo &= ~round_mask;
- } else {
- flags |= float_flag_inexact;
- p->cls = float_class_inf;
- exp = exp_max;
- frac_clear(p);
}
}
frac_shr(p, frac_shift);
@@ -422,11 +498,11 @@ static void partsN(uncanon_normal)(FloatPartsN *p, float_status *s,
float_raise(flags, s);
}
-static void partsN(uncanon)(FloatPartsN *p, float_status *s,
- const FloatFmt *fmt)
+static void partsN(uncanon_sat)(FloatPartsN *p, float_status *s,
+ const FloatFmt *fmt, bool saturate)
{
if (likely(is_anynorm(p->cls))) {
- parts_uncanon_normal(p, s, fmt);
+ parts_uncanon_normal(p, s, fmt, saturate);
} else {
switch (p->cls) {
case float_class_zero:
@@ -434,13 +510,30 @@ static void partsN(uncanon)(FloatPartsN *p, float_status *s,
frac_clear(p);
return;
case float_class_inf:
- g_assert(!fmt->arm_althp);
- p->exp = fmt->exp_max;
- frac_clear(p);
+ /*
+ * Unified infinity handling using format capabilities.
+ * Formats with no_infinity must convert infinity to something else
+ */
+ if (!fmt->no_infinity) {
+ p->exp = fmt->exp_max;
+ frac_clear(p);
+ } else if (fmt_has_nan_encoding(fmt)) {
+ if (saturate) {
+ parts_set_max_normal(p, fmt);
+ } else {
+ uint8_t dnan = s->default_nan_pattern;
+ p->cls = float_class_qnan;
+ p->sign = dnan >> 7;
+ p->exp = fmt->exp_max;
+ frac_allones(p);
+ }
+ } else {
+ parts_set_max_normal(p, fmt);
+ }
return;
case float_class_qnan:
case float_class_snan:
- g_assert(!fmt->arm_althp);
+ g_assert(fmt_has_nan_encoding(fmt));
p->exp = fmt->exp_max;
frac_shr(p, fmt->frac_shift);
return;
@@ -451,6 +544,12 @@ static void partsN(uncanon)(FloatPartsN *p, float_status *s,
}
}
+static void partsN(uncanon)(FloatPartsN *p, float_status *s,
+ const FloatFmt *fmt)
+{
+ partsN(uncanon_sat)(p, s, fmt, false);
+}
+
/*
* Returns the result of adding or subtracting the values of the
* floating-point values `a' and `b'. The operation is performed
diff --git a/fpu/softfloat-specialize.c.inc b/fpu/softfloat-specialize.c.inc
index 9ed968c79b..40c574283f 100644
--- a/fpu/softfloat-specialize.c.inc
+++ b/fpu/softfloat-specialize.c.inc
@@ -226,6 +226,68 @@ floatx80 floatx80_default_inf(bool zSign, float_status *status)
return packFloatx80(zSign, 0x7fff, z ? 0 : (1ULL << 63));
}
+/*----------------------------------------------------------------------------
+| Determine if a OCP FP8 E4M3 NaN is signaling NaN.
+| E4M3 has only one NaN encoding, so classification is policy-based.
+*----------------------------------------------------------------------------*/
+
+static bool float8_e4m3_nan_is_snan(float8_e4m3 a, float_status *status)
+{
+ if (no_signaling_nans(status)) {
+ return false;
+ }
+ return snan_bit_is_one(status);
+}
+
+/*----------------------------------------------------------------------------
+| Returns 1 if the OCP FP8 E4M3 value `a' is a quiet NaN; otherwise returns 0.
+*----------------------------------------------------------------------------*/
+
+bool float8_e4m3_is_quiet_nan(float8_e4m3 a_, float_status *status)
+{
+ return float8_e4m3_is_any_nan(a_) && !float8_e4m3_nan_is_snan(a_, status);
+}
+
+/*----------------------------------------------------------------------------
+| Returns 1 if the OCP FP8 E4M3 value `a' is a signaling NaN; otherwise 0.
+*----------------------------------------------------------------------------*/
+
+bool float8_e4m3_is_signaling_nan(float8_e4m3 a_, float_status *status)
+{
+ return float8_e4m3_is_any_nan(a_) && float8_e4m3_nan_is_snan(a_, status);
+}
+
+/*----------------------------------------------------------------------------
+| Determine if a OCP FP8 E5M2 NaN is signaling NaN.
+*----------------------------------------------------------------------------*/
+
+static bool float8_e5m2_nan_is_snan(float8_e5m2 a, float_status *status)
+{
+ if (no_signaling_nans(status)) {
+ return false;
+ }
+ bool frac_msb_is_one = (a >> 1) & 1;
+ return frac_msb_is_one == snan_bit_is_one(status);
+}
+
+/*----------------------------------------------------------------------------
+| Returns 1 if the OCP FP8 E5M2 value `a' is a quiet NaN; otherwise returns 0.
+*----------------------------------------------------------------------------*/
+
+bool float8_e5m2_is_quiet_nan(float8_e5m2 a_, float_status *status)
+{
+ return float8_e5m2_is_any_nan(a_) && !float8_e5m2_nan_is_snan(a_, status);
+}
+
+/*----------------------------------------------------------------------------
+| Returns 1 if the OCP FP8 E5M2 value `a' is a signaling NaN; otherwise 0.
+*----------------------------------------------------------------------------*/
+
+bool float8_e5m2_is_signaling_nan(float8_e5m2 a_, float_status *status)
+{
+ return float8_e5m2_is_any_nan(a_) && float8_e5m2_nan_is_snan(a_, status);
+}
+
/*----------------------------------------------------------------------------
| Determine if a float16 NaN is signaling NaN.
*----------------------------------------------------------------------------*/
diff --git a/fpu/softfloat.c b/fpu/softfloat.c
index 8094358c2e..533f96dcda 100644
--- a/fpu/softfloat.c
+++ b/fpu/softfloat.c
@@ -522,6 +522,13 @@ typedef struct {
#define DECOMPOSED_BINARY_POINT 63
#define DECOMPOSED_IMPLICIT_BIT (1ull << DECOMPOSED_BINARY_POINT)
+/*
+ * Sentinel value for normal_frac_max indicating "all fraction values at
+ * exp_max are normal" (i.e., the format has no NaN encoding at exp_max).
+ * Used by E2M1 and ARM Alternative Half Precision formats.
+ */
+#define NORMAL_FRAC_MAX_ALL 0
+
/* Structure holding all of the relevant parameters for a format.
* exp_size: the size of the exponent field
* exp_bias: the offset applied to the exponent field
@@ -542,11 +549,39 @@ typedef struct {
int exp_max;
int frac_size;
int frac_shift;
- bool arm_althp;
bool has_explicit_bit;
uint64_t round_mask;
+ /*
+ * Format capability flags:
+ * no_infinity: Format has no infinity encoding. When true, exp=exp_max
+ * with frac=0 is NOT infinity - it's either NaN or max normal.
+ *
+ * limited_nan: Format has limited or no NaN patterns. When combined
+ * with normal_frac_max, determines NaN encoding capability:
+ * - limited_nan=false: Standard IEEE NaN (exp=exp_max, frac!=0)
+ * - limited_nan=true && normal_frac_max!=0: Limited NaN (E4M3)
+ * - limited_nan=true && normal_frac_max==0: No NaN encoding (AHP, E2M1)
+ *
+ * overflow_raises_invalid: Raise Invalid (not Overflow) exception.
+ * ARM Alt HP uses this to signal overflow as an invalid operation.
+ *
+ * normal_frac_max: For formats with limited_nan, the maximum fraction
+ * value (after normalization shift, including implicit bit) that is
+ * still considered normal at exp=exp_max.
+ * Use NORMAL_FRAC_MAX_ALL (0) to indicate all frac values at exp_max
+ * are normal (E2M1, ARM Alt HP), which also implies no NaN encoding.
+ */
+ bool no_infinity;
+ bool limited_nan;
+ bool overflow_raises_invalid;
+ uint64_t normal_frac_max;
} FloatFmt;
+static inline bool fmt_has_nan_encoding(const FloatFmt *fmt)
+{
+ return !fmt->limited_nan || fmt->normal_frac_max != NORMAL_FRAC_MAX_ALL;
+}
+
/* Expand fields based on the size of exponent and fraction */
#define FLOAT_PARAMS_(E) \
.exp_size = E, \
@@ -560,13 +595,27 @@ typedef struct {
.frac_shift = (-F - 1) & 63, \
.round_mask = (1ull << ((-F - 1) & 63)) - 1
+static const FloatFmt float8_e4m3_params = {
+ FLOAT_PARAMS(4, 3),
+ .no_infinity = true,
+ .limited_nan = true,
+ .normal_frac_max = 0xE000000000000000ULL,
+};
+
+static const FloatFmt float8_e5m2_params = {
+ FLOAT_PARAMS(5, 2),
+};
+
static const FloatFmt float16_params = {
FLOAT_PARAMS(5, 10)
};
static const FloatFmt float16_params_ahp = {
FLOAT_PARAMS(5, 10),
- .arm_althp = true
+ .no_infinity = true,
+ .limited_nan = true,
+ .overflow_raises_invalid = true,
+ .normal_frac_max = NORMAL_FRAC_MAX_ALL,
};
static const FloatFmt bfloat16_params = {
@@ -614,6 +663,16 @@ static void unpack_raw64(FloatParts64 *r, const FloatFmt *fmt, uint64_t raw)
};
}
+static void QEMU_FLATTEN float8_e4m3_unpack_raw(FloatParts64 *p, float8_e4m3 f)
+{
+ unpack_raw64(p, &float8_e4m3_params, f);
+}
+
+static void QEMU_FLATTEN float8_e5m2_unpack_raw(FloatParts64 *p, float8_e5m2 f)
+{
+ unpack_raw64(p, &float8_e5m2_params, f);
+}
+
static void QEMU_FLATTEN float16_unpack_raw(FloatParts64 *p, float16 f)
{
unpack_raw64(p, &float16_params, f);
@@ -671,6 +730,16 @@ static uint64_t pack_raw64(const FloatParts64 *p, const FloatFmt *fmt)
return ret;
}
+static float8_e4m3 QEMU_FLATTEN float8_e4m3_pack_raw(const FloatParts64 *p)
+{
+ return make_float8_e4m3(pack_raw64(p, &float8_e4m3_params));
+}
+
+static float8_e5m2 QEMU_FLATTEN float8_e5m2_pack_raw(const FloatParts64 *p)
+{
+ return make_float8_e5m2(pack_raw64(p, &float8_e5m2_params));
+}
+
static float16 QEMU_FLATTEN float16_pack_raw(const FloatParts64 *p)
{
return make_float16(pack_raw64(p, &float16_params));
@@ -758,12 +827,26 @@ static void parts128_canonicalize(FloatParts128 *p, float_status *status,
PARTS_GENERIC_64_128(canonicalize, A)(A, S, F)
static void parts64_uncanon_normal(FloatParts64 *p, float_status *status,
- const FloatFmt *fmt);
+ const FloatFmt *fmt, bool saturate);
static void parts128_uncanon_normal(FloatParts128 *p, float_status *status,
- const FloatFmt *fmt);
+ const FloatFmt *fmt, bool saturate);
+
+#define parts_uncanon_normal(A, S, F, SAT) \
+ PARTS_GENERIC_64_128(uncanon_normal, A)(A, S, F, SAT)
-#define parts_uncanon_normal(A, S, F) \
- PARTS_GENERIC_64_128(uncanon_normal, A)(A, S, F)
+static void parts64_uncanon_sat(FloatParts64 *p, float_status *status,
+ const FloatFmt *fmt, bool saturate);
+static void parts128_uncanon_sat(FloatParts128 *p, float_status *status,
+ const FloatFmt *fmt, bool saturate);
+
+#define parts_uncanon_sat(A, S, F, SAT) \
+ PARTS_GENERIC_64_128(uncanon_sat, A)(A, S, F, SAT)
+
+static void parts64_set_max_normal(FloatParts64 *p, const FloatFmt *fmt);
+static void parts128_set_max_normal(FloatParts128 *p, const FloatFmt *fmt);
+
+#define parts_set_max_normal(P, F) \
+ PARTS_GENERIC_64_128(set_max_normal, P)(P, F)
static void parts64_uncanon(FloatParts64 *p, float_status *status,
const FloatFmt *fmt);
@@ -1662,6 +1745,20 @@ static const uint16_t rsqrt_tab[128] = {
* Pack/unpack routines with a specific FloatFmt.
*/
+static void float8_e4m3_unpack_canonical(FloatParts64 *p, float8_e4m3 f,
+ float_status *s)
+{
+ float8_e4m3_unpack_raw(p, f);
+ parts_canonicalize(p, s, &float8_e4m3_params);
+}
+
+static void float8_e5m2_unpack_canonical(FloatParts64 *p, float8_e5m2 f,
+ float_status *s)
+{
+ float8_e5m2_unpack_raw(p, f);
+ parts_canonicalize(p, s, &float8_e5m2_params);
+}
+
static void float16a_unpack_canonical(FloatParts64 *p, float16 f,
float_status *s, const FloatFmt *params)
{
@@ -1682,6 +1779,24 @@ static void bfloat16_unpack_canonical(FloatParts64 *p, bfloat16 f,
parts_canonicalize(p, s, &bfloat16_params);
}
+static float8_e4m3 float8_e4m3_round_pack_canonical(FloatParts64 *p,
+ float_status *status,
+ const FloatFmt *params,
+ const bool saturate)
+{
+ parts_uncanon_sat(p, status, params, saturate);
+ return float8_e4m3_pack_raw(p);
+}
+
+static float8_e5m2 float8_e5m2_round_pack_canonical(FloatParts64 *p,
+ float_status *status,
+ const FloatFmt *params,
+ const bool saturate)
+{
+ parts_uncanon_sat(p, status, params, saturate);
+ return float8_e5m2_pack_raw(p);
+}
+
static float16 float16a_round_pack_canonical(FloatParts64 *p,
float_status *s,
const FloatFmt *params)
@@ -1838,7 +1953,7 @@ static floatx80 floatx80_round_pack_canonical(FloatParts128 *p,
case float_class_normal:
case float_class_denormal:
if (s->floatx80_rounding_precision == floatx80_precision_x) {
- parts_uncanon_normal(p, s, fmt);
+ parts_uncanon_normal(p, s, fmt, false);
frac = p->frac_hi;
exp = p->exp;
} else {
@@ -1847,7 +1962,7 @@ static floatx80 floatx80_round_pack_canonical(FloatParts128 *p,
p64.sign = p->sign;
p64.exp = p->exp;
frac_truncjam(&p64, p);
- parts_uncanon_normal(&p64, s, fmt);
+ parts_uncanon_normal(&p64, s, fmt, false);
frac = p64.frac;
exp = p64.exp;
}
@@ -2823,6 +2938,66 @@ static void parts_float_to_float_widen(FloatParts128 *a, FloatParts64 *b,
}
}
+bfloat16 float8_e4m3_to_bfloat16(float8_e4m3 a, float_status *s)
+{
+ FloatParts64 p;
+
+ float8_e4m3_unpack_canonical(&p, a, s);
+ parts_float_to_float(&p, s);
+
+ return bfloat16_round_pack_canonical(&p, s);
+}
+
+bfloat16 float8_e5m2_to_bfloat16(float8_e5m2 a, float_status *s)
+{
+ FloatParts64 p;
+
+ float8_e5m2_unpack_canonical(&p, a, s);
+ parts_float_to_float(&p, s);
+
+ return bfloat16_round_pack_canonical(&p, s);
+}
+
+float8_e4m3 bfloat16_to_float8_e4m3(bfloat16 a, bool saturate, float_status *s)
+{
+ FloatParts64 p;
+
+ bfloat16_unpack_canonical(&p, a, s);
+ parts_float_to_float(&p, s);
+ return float8_e4m3_round_pack_canonical(&p, s, &float8_e4m3_params,
+ saturate);
+}
+
+float8_e5m2 bfloat16_to_float8_e5m2(bfloat16 a, bool saturate, float_status *s)
+{
+ FloatParts64 p;
+
+ bfloat16_unpack_canonical(&p, a, s);
+ parts_float_to_float(&p, s);
+ return float8_e5m2_round_pack_canonical(&p, s, &float8_e5m2_params,
+ saturate);
+}
+
+float8_e4m3 float32_to_float8_e4m3(float32 a, bool saturate, float_status *s)
+{
+ FloatParts64 p;
+
+ float32_unpack_canonical(&p, a, s);
+ parts_float_to_float(&p, s);
+ return float8_e4m3_round_pack_canonical(&p, s, &float8_e4m3_params,
+ saturate);
+}
+
+float8_e5m2 float32_to_float8_e5m2(float32 a, bool saturate, float_status *s)
+{
+ FloatParts64 p;
+
+ float32_unpack_canonical(&p, a, s);
+ parts_float_to_float(&p, s);
+ return float8_e5m2_round_pack_canonical(&p, s, &float8_e5m2_params,
+ saturate);
+}
+
float32 float16_to_float32(float16 a, bool ieee, float_status *s)
{
const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
diff --git a/include/fpu/softfloat-types.h b/include/fpu/softfloat-types.h
index 8f82fdfc97..b781bf10b7 100644
--- a/include/fpu/softfloat-types.h
+++ b/include/fpu/softfloat-types.h
@@ -119,6 +119,18 @@ typedef struct {
*/
typedef uint16_t bfloat16;
+/*
+ * Software OCP(Open Compute Project) floating point types
+ */
+typedef uint8_t float8_e4m3;
+typedef uint8_t float8_e5m2;
+#define float8_e4m3_val(x) (x)
+#define float8_e5m2_val(x) (x)
+#define make_float8_e4m3(x) (x)
+#define make_float8_e5m2(x) (x)
+#define const_float8_e4m3(x) (x)
+#define const_float8_e5m2(x) (x)
+
/*
* Software IEC/IEEE floating-point underflow tininess-detection mode.
*/
diff --git a/include/fpu/softfloat.h b/include/fpu/softfloat.h
index ac6a392375..7abbf92b7e 100644
--- a/include/fpu/softfloat.h
+++ b/include/fpu/softfloat.h
@@ -189,6 +189,87 @@ float128 int128_to_float128(Int128, float_status *status);
float128 uint64_to_float128(uint64_t, float_status *status);
float128 uint128_to_float128(Int128, float_status *status);
+/*----------------------------------------------------------------------------
+| Software OCP conversion routines.
+*----------------------------------------------------------------------------*/
+
+bfloat16 float8_e4m3_to_bfloat16(float8_e4m3, float_status *status);
+bfloat16 float8_e5m2_to_bfloat16(float8_e5m2, float_status *status);
+float8_e4m3 bfloat16_to_float8_e4m3(bfloat16, bool saturate, float_status *status);
+float8_e5m2 bfloat16_to_float8_e5m2(bfloat16, bool saturate, float_status *status);
+float8_e4m3 float32_to_float8_e4m3(float32, bool saturate, float_status *status);
+float8_e5m2 float32_to_float8_e5m2(float32, bool saturate, float_status *status);
+
+/*----------------------------------------------------------------------------
+| Software OCP operations.
+*----------------------------------------------------------------------------*/
+
+bool float8_e4m3_is_quiet_nan(float8_e4m3, float_status *status);
+bool float8_e4m3_is_signaling_nan(float8_e4m3, float_status *status);
+bool float8_e5m2_is_quiet_nan(float8_e5m2, float_status *status);
+bool float8_e5m2_is_signaling_nan(float8_e5m2, float_status *status);
+
+static inline bool float8_e4m3_is_any_nan(float8_e4m3 a)
+{
+ return ((float8_e4m3_val(a) & ~0x80) == 0x7f);
+}
+
+static inline bool float8_e5m2_is_any_nan(float8_e5m2 a)
+{
+ return ((float8_e5m2_val(a) & ~0x80) > 0x7c);
+}
+
+static inline bool float8_e4m3_is_neg(float8_e4m3 a)
+{
+ return float8_e4m3_val(a) >> 7;
+}
+
+static inline bool float8_e5m2_is_neg(float8_e5m2 a)
+{
+ return float8_e5m2_val(a) >> 7;
+}
+
+static inline bool float8_e4m3_is_infinity(float8_e4m3 a)
+{
+ return false;
+}
+
+static inline bool float8_e5m2_is_infinity(float8_e5m2 a)
+{
+ return (float8_e5m2_val(a) & 0x7f) == 0x7c;
+}
+
+static inline bool float8_e4m3_is_zero(float8_e4m3 a)
+{
+ return (float8_e4m3_val(a) & 0x7f) == 0;
+}
+
+static inline bool float8_e5m2_is_zero(float8_e5m2 a)
+{
+ return (float8_e5m2_val(a) & 0x7f) == 0;
+}
+
+static inline bool float8_e4m3_is_zero_or_denormal(float8_e4m3 a)
+{
+ return (float8_e4m3_val(a) & 0x78) == 0;
+}
+
+static inline bool float8_e5m2_is_zero_or_denormal(float8_e5m2 a)
+{
+ return (float8_e5m2_val(a) & 0x7c) == 0;
+}
+
+static inline bool float8_e4m3_is_normal(float8_e4m3 a)
+{
+ uint8_t em = float8_e4m3_val(a) & 0x7f;
+ return em >= 0x8 && em <= 0x7e;
+}
+
+static inline bool float8_e5m2_is_normal(float8_e5m2 a)
+{
+ return (((float8_e5m2_val(a) >> 2) + 1) & 0x1f) >= 2;
+}
+
/*----------------------------------------------------------------------------
| Software half-precision conversion routines.
*----------------------------------------------------------------------------*/
--
2.52.0
Hi Max,
I've been testing the OCP FP8 implementation by writing
a simple test suite in tests/fp/ that covers various boundary cases for E4M3,
E5M2, E2M1, and BFloat16 formats. During testing, I found some issues in the
float_class_inf handling in partsN(uncanon_sat).
On Wed, Feb 04, 2026 at 01:17:41PM +0800, Max Chou wrote:
> This commit provides the implementation defined behavior flags and the basic
> operation support for the OCP float8 data types(E4M3 & E5M2).
>
> According to the definition in OFP8 spec, the conversion from a wider
> format infinity depends on the saturation mode defined in the spec.
>
> Signed-off-by: Max Chou <max.chou@sifive.com>
> ---
> fpu/softfloat-parts.c.inc | 159 +++++++++++++++++++++------
> fpu/softfloat-specialize.c.inc | 62 +++++++++++
> fpu/softfloat.c | 191 +++++++++++++++++++++++++++++++--
> include/fpu/softfloat-types.h | 12 +++
> include/fpu/softfloat.h | 81 ++++++++++++++
> 5 files changed, 467 insertions(+), 38 deletions(-)
>
> diff --git a/fpu/softfloat-parts.c.inc b/fpu/softfloat-parts.c.inc
> index 5e0438fc0b..eee7daae4d 100644
> --- a/fpu/softfloat-parts.c.inc
> +++ b/fpu/softfloat-parts.c.inc
[...]
> -static void partsN(uncanon)(FloatPartsN *p, float_status *s,
> - const FloatFmt *fmt)
> +static void partsN(uncanon_sat)(FloatPartsN *p, float_status *s,
> + const FloatFmt *fmt, bool saturate)
> {
> if (likely(is_anynorm(p->cls))) {
> - parts_uncanon_normal(p, s, fmt);
> + parts_uncanon_normal(p, s, fmt, saturate);
> } else {
> switch (p->cls) {
> case float_class_zero:
> @@ -434,13 +510,30 @@ static void partsN(uncanon)(FloatPartsN *p, float_status *s,
> frac_clear(p);
> return;
> case float_class_inf:
> - g_assert(!fmt->arm_althp);
> - p->exp = fmt->exp_max;
> - frac_clear(p);
> + /*
> + * Unified infinity handling using format capabilities.
> + * Formats with no_infinity must convert infinity to something else
> + */
> + if (!fmt->no_infinity) {
> + p->exp = fmt->exp_max;
> + frac_clear(p);
The saturate flag is not checked here. For IEEE-like formats such as
E5M2 that have infinity encoding, when saturate=true, the result should be
the maximum normal value, not infinity.
Per OCP FP8 specification Section 4.2 "Saturation", when saturation mode is
enabled, infinity should be converted to the maximum finite value even for
formats that support infinity representation.
My case:
bfloat16_to_float8_e5m2(BF16_INF_POS, true, &status)
Expected: 0x7b (max normal)
Actual: 0x7c (infinity)
Suggested fix:
if (!fmt->no_infinity && !saturate) {
p->exp = fmt->exp_max;
frac_clear(p);
} else if (!fmt->no_infinity && saturate) {
/* Saturate infinity to max normal for IEEE-like formats */
p->exp = fmt->exp_max - 1;
frac_allones(p);
frac_shr(p, fmt->frac_shift);
} else if ...
> + } else if (fmt_has_nan_encoding(fmt)) {
> + if (saturate) {
> + parts_set_max_normal(p, fmt);
Missing frac_shr() call after parts_set_max_normal().
The parts_set_max_normal() function sets frac_hi to the normalized
fraction value (with implicit bit at position 63). Before packing into
the final format, the fraction must be shifted right by frac_shift to
position it correctly.
Compare with the float_class_qnan/snan case below which correctly calls
frac_shr(p, fmt->frac_shift) before returning.
My case:
bfloat16_to_float8_e4m3(BF16_INF_POS, true, &status)
Expected: 0x7e (max normal, exp=15, frac=6)
Actual: 0x78 (exp=15, frac=0 - incorrect due to missing shift)
Suggested fix:
if (saturate) {
parts_set_max_normal(p, fmt);
frac_shr(p, fmt->frac_shift);
> + } else {
> + uint8_t dnan = s->default_nan_pattern;
> + p->cls = float_class_qnan;
> + p->sign = dnan >> 7;
> + p->exp = fmt->exp_max;
> + frac_allones(p);
Same issue - missing frac_shr() call after frac_allones().
> + }
> + } else {
> + parts_set_max_normal(p, fmt);
Same issue - missing frac_shr() call after parts_set_max_normal().
PS: This path is taken for formats without NaN encoding (like E2M1).
> + }
> return;
> case float_class_qnan:
> case float_class_snan:
> - g_assert(!fmt->arm_althp);
> + g_assert(fmt_has_nan_encoding(fmt));
> p->exp = fmt->exp_max;
> frac_shr(p, fmt->frac_shift); /* <-- This is correct */
> return;
The qnan/snan case correctly calls frac_shr(), which is good, but the inf case above does not.
---
I've prepared a fix patch and a test suite (tests/fp/fp-test-ocp.c) with 97
test cases covering:
- Classification functions for E4M3, E5M2, E2M1, BFloat16
- Format conversions with and without saturation
- Rounding mode handling
- Canonical NaN generation per Zvfofp8min specification
git repo:
https://github.com/zevorn/qemu/tree/riscv-zvfofp8min-zvfofp4min-v3
command:
cd $QEMU_SRC_PATH/build && ninja tests/fp/fp-test-ocp
./pyvenv/bin/meson test --suite softfloat-ocp -v
With the fix applied, all saturation tests pass:
PASS: BF16 +inf -> E4M3 max normal (with saturation), got 0x7e
PASS: BF16 +inf -> E5M2 max normal (with saturation), got 0x7b
PASS: F32 +inf -> E4M3 max normal (with saturation), got 0x7e
PASS: F32 +inf -> E5M2 max normal (with saturation), got 0x7b
---
@Richard: I noticed that the current tests/fp/ directory doesn't have test
coverage for BFloat16, OCP FP8 (E4M3/E5M2), or FP4 (E2M1) formats.
The existing fp-test relies on Berkeley TestFloat which doesn't support
these newer formats. Would it be useful if I clean up and submit my test
suite (fp-test-ocp.c) as a separate patch to provide basic test coverage
for these OCP floating-point formats? It could help catch similar issues
in future softfloat changes.
Thanks,
Chao
On 2026-02-05 21:21, Chao Liu wrote:
> Hi Max,
>
> I've been testing the OCP FP8 implementation by writing
> a simple test suite in tests/fp/ that covers various boundary cases for E4M3,
> E5M2, E2M1, and BFloat16 formats. During testing, I found some issues in the
> float_class_inf handling in partsN(uncanon_sat).
>
Hi Liu,
Oops, it looks like my random test cases have missed the cases where the input is Inf since v2.
Thanks for pointing out this issue.
Richard has provided a v4 for the softfloat part, which is better and clearer than v3.
I intend to address this issue based on that one.
I'll also separate the riscv isa part into another v4 based on the softfloat v4.
Thanks,
rnax
On 2/4/26 15:17, Max Chou wrote:
> This commit provides the implementation defined behavior flags and the basic
> operation support for the OCP float8 data types(E4M3 & E5M2).
I'd really like to see this split into parts. Beginning with
> @@ -542,11 +549,39 @@ typedef struct {
> int exp_max;
> int frac_size;
> int frac_shift;
> - bool arm_althp;
> bool has_explicit_bit;
> uint64_t round_mask;
> + /*
> + * Format capability flags:
> + * no_infinity: Format has no infinity encoding. When true, exp=exp_max
> + * with frac=0 is NOT infinity - it's either NaN or max normal.
> + *
> + * limited_nan: Format has limited or no NaN patterns. When combined
> + * with normal_frac_max, determines NaN encoding capability:
> + * - limited_nan=false: Standard IEEE NaN (exp=exp_max, frac!=0)
> + * - limited_nan=true && normal_frac_max!=0: Limited NaN (E4M3)
> + * - limited_nan=true && normal_frac_max==0: No NaN encoding (AHP, E2M1)
> + *
> + * overflow_raises_invalid: Raise Invalid (not Overflow) exception.
> + * ARM Alt HP uses this to signal overflow as an invalid operation.
> + *
> + * normal_frac_max: For formats with limited_nan, the maximum fraction
> + * value (after normalization shift, including implicit bit) that is
> + * still considered normal at exp=exp_max.
> + * Use NORMAL_FRAC_MAX_ALL (0) to indicate all frac values at exp_max
> + * are normal (E2M1, ARM Alt HP), which also implies no NaN encoding.
> + */
> + bool no_infinity;
> + bool limited_nan;
> + bool overflow_raises_invalid;
> + uint64_t normal_frac_max;
> } FloatFmt;
... this. I wanted to say something about this vs previous revisions, but I hadn't had
anything coherent to say besides "meh".
In particular, I think separating "no_infinity" and "limited_nan" leads to confusing
checks, such as the one in parts_canonicalize where you test "limited_nan" in a context
that is really testing for overflow to infinity.
Further, normal_frac_max is defined oddly, such that you have to test it twice, once vs
frac_hi and once vs NORMAL_FRAC_MAX_ALL. Since this is used for exactly one format, this
is perhaps trying to be overly general.
I think better might be:
typedef enum {
/* exp==max, frac==0 ? infinity : nan; this is ieee standard. */
float_maxexp_ieee,
/* exp==max is a normal number; no infinity or nan representation. */
float_maxexp_normal,
/* exp==max, frac==max ? nan : normal; no infinity. */
float_maxexp_e4m3,
} FloatFmtMaxExp;
We can stage in this behaviour without also including either FP8 format.
Just changing Arm althp in a separate patch is large enough.
r~
On 2026-02-05 14:36, Richard Henderson wrote:
> In particular, I think separating "no_infinity" and "limited_nan" leads to
> confusing checks, such as the one in parts_canonicalize where you test
> "limited_nan" in a context that is really testing for overflow to infinity.
>
> Further, normal_frac_max is defined oddly, such that you have to test it
> twice, once vs frac_hi and once vs NORMAL_FRAC_MAX_ALL. Since this is used
> for exactly one format, this is perhaps trying to be overly general.
>
> I think better might be:
>
> typedef enum {
> /* exp==max, frac==0 ? infinity : nan; this is ieee standard. */
> float_maxexp_ieee,
> /* exp==max is a normal number; no infinity or nan representation. */
> float_maxexp_normal,
> /* exp==max, frac==max ? nan : normal; no infinity. */
> float_maxexp_e4m3,
> } FloatFmtMaxExp;
>
> We can stage in this behaviour without also including either FP8 format.
> Just changing Arm althp in a separate patch is large enough.
>
>
> r~
Hi Richard,
Thank you for the suggestions and v4 for softfloat part.
I agree that the original patch should be separated and the solution you
suggested is better.
I'll separate the riscv isa part into another v4 patch based on the
softfloat v4.
I will also test the softfloat v4 you provided and fix some
saturation issues on top of it.
Thanks a lot,
rnax
© 2016 - 2026 Red Hat, Inc.