DIVIDE TO INTEGER computes a floating-point remainder and is used by
LuaJIT, so add it to QEMU.
The instruction comes in two flavors, for floats and for doubles, which
are very similar. Since it is also quite complex, copy-pasting the
implementation would result in barely maintainable code. Mitigate that
by using macros. An alternative would be an .inc file, but that looks
like overkill.
Signed-off-by: Ilya Leoshkevich <iii@linux.ibm.com>
---
target/s390x/helper.h | 2 +
target/s390x/tcg/fpu_helper.c | 199 +++++++++++++++++++++++++++++++
target/s390x/tcg/insn-data.h.inc | 5 +-
target/s390x/tcg/translate.c | 26 ++++
4 files changed, 231 insertions(+), 1 deletion(-)
diff --git a/target/s390x/helper.h b/target/s390x/helper.h
index 1a8a76abb98..f2b24c65a88 100644
--- a/target/s390x/helper.h
+++ b/target/s390x/helper.h
@@ -46,6 +46,8 @@ DEF_HELPER_FLAGS_3(sxb, TCG_CALL_NO_WG, i128, env, i128, i128)
DEF_HELPER_FLAGS_3(deb, TCG_CALL_NO_WG, i64, env, i64, i64)
DEF_HELPER_FLAGS_3(ddb, TCG_CALL_NO_WG, i64, env, i64, i64)
DEF_HELPER_FLAGS_3(dxb, TCG_CALL_NO_WG, i128, env, i128, i128)
+DEF_HELPER_5(didb, void, env, i32, i32, i32, i32)
+DEF_HELPER_5(dieb, void, env, i32, i32, i32, i32)
DEF_HELPER_FLAGS_3(meeb, TCG_CALL_NO_WG, i64, env, i64, i64)
DEF_HELPER_FLAGS_3(mdeb, TCG_CALL_NO_WG, i64, env, i64, i64)
DEF_HELPER_FLAGS_3(mdb, TCG_CALL_NO_WG, i64, env, i64, i64)
diff --git a/target/s390x/tcg/fpu_helper.c b/target/s390x/tcg/fpu_helper.c
index 1ba43715ac1..f524c4257fb 100644
--- a/target/s390x/tcg/fpu_helper.c
+++ b/target/s390x/tcg/fpu_helper.c
@@ -286,6 +286,205 @@ Int128 HELPER(dxb)(CPUS390XState *env, Int128 a, Int128 b)
return RET128(ret);
}
+/*
+ * Truncate the significand of a float128 to float32 precision.
+ *
+ * The high word carries 48 fraction bits; clearing its lowest 25 bits and
+ * the whole low word leaves the 23 most significant fraction bits.  Sign
+ * and exponent are not touched, and no rounding is performed.
+ */
+static float128 float128_precision_round_to_float32(float128 x)
+{
+    const int dropped_high_bits = 25;
+
+    x.high = deposit64(x.high, 0, dropped_high_bits, 0);
+    x.low = 0;
+
+    return x;
+}
+
+/*
+ * Truncate the significand of a float128 to float64 precision.
+ *
+ * Only the lowest 60 bits of the low word are cleared, so the 48 fraction
+ * bits of the high word plus the top 4 bits of the low word survive
+ * (52 fraction bits in total).  No rounding is performed.
+ */
+static float128 float128_precision_round_to_float64(float128 x)
+{
+    const int dropped_low_bits = 60;
+
+    x.low = deposit64(x.low, 0, dropped_low_bits, 0);
+
+    return x;
+}
+
+/*
+ * Return the unbiased exponent of x: the 15-bit field at bits 48..62 of
+ * the high word, minus the bias 16383.
+ */
+static int float128_get_exp(float128 x)
+{
+    uint64_t biased_exp = extract64(x.high, 48, 15);
+
+    return biased_exp - 16383;
+}
+
+/*
+ * Return x with its exponent field replaced by the unbiased exponent exp.
+ * The bias 16383 is added before the value is stored in bits 48..62 of
+ * the high word; sign and fraction are preserved.
+ */
+static float128 float128_set_exp(float128 x, int exp)
+{
+    uint64_t biased_exp = exp + 16383;
+
+    x.high = deposit64(x.high, 48, 15, biased_exp);
+
+    return x;
+}
+
+/*
+ * Return x with delta added to its exponent, i.e. x scaled by 2^delta as
+ * long as the resulting exponent still fits the exponent field.
+ */
+static float128 float128_adjust_exp(float128 x, int delta)
+{
+    int new_exp = float128_get_exp(x) + delta;
+
+    return float128_set_exp(x, new_exp);
+}
+
+/*
+ * Return true if x holds an integral value.
+ *
+ * A float128 keeps 112 fraction bits (48 in the high word, 64 in the low
+ * word) below an implicit leading one.  For an unbiased exponent e, the
+ * lowest (112 - e) fraction bits form the fractional part of the value and
+ * must all be zero.  Merely checking that the whole fraction is zero, as
+ * was done before, only detects powers of two.
+ *
+ * Zero counts as an integer; subnormals, infinities and NaNs do not.
+ */
+static bool float128_is_int(float128 x)
+{
+    int biased_exp = extract64(x.high, 48, 15);
+    int frac_bits;
+
+    if (biased_exp == 0x7fff) {
+        /* Infinity or NaN */
+        return false;
+    }
+    if (biased_exp == 0) {
+        /* Zero is an integer, subnormals are not */
+        return extract64(x.high, 0, 48) == 0 && x.low == 0;
+    }
+    if (biased_exp < 16383) {
+        /* 0 < |x| < 1 */
+        return false;
+    }
+
+    /* Number of significand bits to the right of the binary point */
+    frac_bits = 112 - (biased_exp - 16383);
+    if (frac_bits <= 0) {
+        /* Every significand bit has integer weight */
+        return true;
+    }
+    if (frac_bits < 64) {
+        return extract64(x.low, 0, frac_bits) == 0;
+    }
+    if (x.low != 0) {
+        return false;
+    }
+    /* frac_bits - 64 is in 0..48 here; extract64() needs a non-zero length */
+    return frac_bits == 64 || extract64(x.high, 0, frac_bits - 64) == 0;
+}
+
+/*
+ * Read a float32 operand from register r: the value lives in the upper
+ * 32 bits of element 0 of env->vregs[r].
+ */
+static float32 extract_float32(CPUS390XState *env, uint32_t r)
+{
+    return extract64(env->vregs[r][0], 32, 32);
+}
+
+/*
+ * Store float32 x into register r: only the upper 32 bits of element 0 of
+ * env->vregs[r] are replaced, the lower 32 bits keep their old contents.
+ */
+static void deposit_float32(CPUS390XState *env, uint32_t r, float32 x)
+{
+    const uint64_t kept_low_half = env->vregs[r][0] & 0xffffffffULL;
+
+    env->vregs[r][0] = kept_low_half | ((uint64_t)x << 32);
+}
+
+/*
+ * Read a float64 operand from register r: the value occupies all 64 bits
+ * of element 0 of env->vregs[r].
+ */
+static float64 extract_float64(CPUS390XState *env, uint32_t r)
+{
+    return extract64(env->vregs[r][0], 0, 64);
+}
+
+/*
+ * Store float64 x into register r, overwriting all 64 bits of element 0
+ * of env->vregs[r].
+ */
+static void deposit_float64(CPUS390XState *env, uint32_t r, float64 x)
+{
+    env->vregs[r][0] = deposit64(env->vregs[r][0], 0, 64, x);
+}
+
+/*
+ * Generate the TCG helper for one flavor of DIVIDE TO INTEGER.
+ *
+ * Macro parameters:
+ *   name      helper name (dieb / didb)
+ *   floatN    softfloat type of the operands (float32 / float64)
+ *   p         a quotient whose unbiased exponent is below p is treated as
+ *             the "final" case
+ *   exp_max   largest unbiased exponent of floatN; also used, negated and
+ *             off by one, as the underflow threshold for the remainder
+ *   exp_bias  exponent adjustment applied to the remainder on a trapped
+ *             underflow and (negated) to an overflowing partial quotient
+ *
+ * Helper arguments: r1, r2 and r3 are register numbers, m4 is the rounding
+ * mode used for a final quotient.  The dividend is read from r1 and the
+ * divisor from r2.  Unless an invalid-operation exception is both
+ * recognized and enabled in the FPC mask, the remainder is written to r1,
+ * the integer quotient to r3 and the condition code to env->cc_op.
+ *
+ * NaN, infinity and zero-divisor inputs are handled up front.  Otherwise
+ * the quotient is computed in float128, rounded to an integer (with m4 in
+ * the final case, toward zero in the partial case), truncated to floatN
+ * precision, and the remainder is derived as a - b * n.
+ *
+ * Exceptions: when dxc stays -1 the collected softfloat flags are handed
+ * to handle_exceptions(); otherwise a data exception with that DXC is
+ * raised directly.  Softfloat flags raised before the final subtraction
+ * are deliberately discarded.
+ *
+ * NOTE(review): is_final depends on float128_is_int(); review feedback
+ * questioned whether this matches the POp wording about the two integers
+ * closest to the precise quotient being representable "in the precision
+ * of the quotient" - to be confirmed against the POp tables.
+ */
+#define DIVIDE_TO_INTEGER(name, floatN, p, exp_max, exp_bias) \
+void HELPER(name)(CPUS390XState *env, uint32_t r1, uint32_t r2, \
+ uint32_t r3, uint32_t m4) \
+{ \
+ int float_exception_flags = 0; \
+ floatN a, b, n, r; \
+ int dxc = -1; \
+ uint32_t cc; \
+ \
+ a = extract_ ## floatN(env, r1); \
+ b = extract_ ## floatN(env, r2); \
+ \
+ /* POp table "Results: DIVIDE TO INTEGER (Part 1 of 2)" */ \
+ if (floatN ## _is_signaling_nan(a, &env->fpu_status)) { \
+ r = n = floatN ## _silence_nan(a, &env->fpu_status); \
+ cc = 1; \
+ float_exception_flags |= float_flag_invalid; \
+ } else if (floatN ## _is_signaling_nan(b, &env->fpu_status)) { \
+ r = n = floatN ## _silence_nan(b, &env->fpu_status); \
+ cc = 1; \
+ float_exception_flags |= float_flag_invalid; \
+ } else if (floatN ## _is_quiet_nan(a, &env->fpu_status)) { \
+ r = n = a; \
+ cc = 1; \
+ } else if (floatN ## _is_quiet_nan(b, &env->fpu_status)) { \
+ r = n = b; \
+ cc = 1; \
+ } else if (floatN ## _is_infinity(a) || floatN ## _is_zero(b)) { \
+ r = n = floatN ## _default_nan(&env->fpu_status); \
+ cc = 1; \
+ float_exception_flags |= float_flag_invalid; \
+ } else if (floatN ## _is_infinity(b)) { \
+ r = a; \
+ n = floatN ## _set_sign(floatN ## _zero, \
+ floatN ## _is_neg(a) != floatN ## _is_neg(b)); \
+ cc = 0; \
+ } else { \
+ float128 a128, b128, m128, n128, q128, r128; \
+ bool is_final, is_q128_smallish; \
+ int old_mode, r128_exp; \
+ uint32_t r_flags; \
+ \
+ /* Compute precise quotient */ \
+ a128 = floatN ## _to_float128(a, &env->fpu_status); \
+ b128 = floatN ## _to_float128(b, &env->fpu_status); \
+ q128 = float128_div(a128, b128, &env->fpu_status); \
+ \
+ /* Final or partial case? */ \
+ is_q128_smallish = float128_get_exp(q128) < p; \
+ is_final = is_q128_smallish || float128_is_int(q128); \
+ \
+ /* \
+ * Final quotient is rounded using M4, \
+ * partial quotient is rounded toward zero. \
+ */ \
+ old_mode = s390_swap_bfp_rounding_mode(env, is_final ? m4 : 5); \
+ n128 = float128_round_to_int(q128, &env->fpu_status); \
+ s390_restore_bfp_rounding_mode(env, old_mode); \
+ \
+ /* \
+ * Intermediate values are precision-rounded, \
+ * see "Intermediate Values" in POp. \
+ */ \
+ n128 = float128_precision_round_to_ ## floatN(n128); \
+ \
+ /* Compute remainder */ \
+ m128 = float128_mul(b128, n128, &env->fpu_status); \
+ env->fpu_status.float_exception_flags = 0; \
+ r128 = float128_sub(a128, m128, &env->fpu_status); \
+ r128_exp = float128_get_exp(r128); \
+ r = float128_to_## floatN(r128, &env->fpu_status); \
+ r_flags = env->fpu_status.float_exception_flags; \
+ \
+ /* POp table "Results: DIVIDE TO INTEGER (Part 2 of 2)" */ \
+ if (is_q128_smallish) { \
+ cc = 0; \
+ if (!floatN ## _is_zero(r)) { \
+ if (r128_exp < -(exp_max - 1)) { \
+ if ((env->fpc >> 24) & S390_IEEE_MASK_UNDERFLOW) { \
+ float_exception_flags |= float_flag_underflow; \
+ dxc = 0x10; \
+ r128 = float128_adjust_exp(r128, exp_bias); \
+ r = float128_to_## floatN(r128, &env->fpu_status); \
+ } \
+ } else if (r_flags & float_flag_inexact) { \
+ float_exception_flags |= float_flag_inexact; \
+ if ((env->fpc >> 24) & S390_IEEE_MASK_INEXACT) { \
+ /* \
+ * Check whether remainder was truncated (rounded \
+ * toward zero) or incremented. \
+ */ \
+ if (float128_lt( \
+ floatN ## _to_float128(floatN ## _abs(r), \
+ &env->fpu_status), \
+ float128_abs(r128), &env->fpu_status)) { \
+ dxc = 0x8; \
+ } else { \
+ dxc = 0xc; \
+ } \
+ } \
+ } \
+ } \
+ } else if (float128_get_exp(n128) > exp_max) { \
+ n128 = float128_adjust_exp(n128, -exp_bias); \
+ cc = floatN ## _is_zero(r) ? 1 : 3; \
+ } else { \
+ cc = floatN ## _is_zero(r) ? 0 : 2; \
+ } \
+ \
+ /* Adjust sign of zero */ \
+ if (floatN ## _is_zero(r)) { \
+ r = floatN ## _set_sign(r, float128_is_neg(a128)); \
+ } \
+ n = float128_to_ ## floatN(n128, &env->fpu_status); \
+ if (floatN ## _is_zero(n)) { \
+ n = floatN ## _set_sign(n, \
+ float128_is_neg(a128) != \
+ float128_is_neg(b128)); \
+ } \
+ } \
+ \
+ /* Flush the results if needed */ \
+ if ((float_exception_flags & float_flag_invalid) && \
+ ((env->fpc >> 24) & S390_IEEE_MASK_INVALID)) { \
+ /* The action for invalid operation is "Suppress" */ \
+ } else { \
+ /* The action for other exceptions is "Complete" */ \
+ deposit_ ## floatN(env, r1, r); \
+ deposit_ ## floatN(env, r3, n); \
+ env->cc_op = cc; \
+ } \
+ \
+ /* Raise an exception if needed */ \
+ if (dxc == -1) { \
+ env->fpu_status.float_exception_flags = float_exception_flags; \
+ handle_exceptions(env, false, GETPC()); \
+ } else { \
+ env->fpu_status.float_exception_flags = 0; \
+ tcg_s390_data_exception(env, dxc, GETPC()); \
+ } \
+}
+
+DIVIDE_TO_INTEGER(dieb, float32, 24, 127, 192)
+DIVIDE_TO_INTEGER(didb, float64, 53, 1023, 1536)
+
/* 32-bit FP multiplication */
uint64_t HELPER(meeb)(CPUS390XState *env, uint64_t f1, uint64_t f2)
{
diff --git a/target/s390x/tcg/insn-data.h.inc b/target/s390x/tcg/insn-data.h.inc
index baaafe922e9..0d5392eac54 100644
--- a/target/s390x/tcg/insn-data.h.inc
+++ b/target/s390x/tcg/insn-data.h.inc
@@ -9,7 +9,7 @@
* OPC = (op << 8) | op2 where op is the major, op2 the minor opcode
* NAME = name of the opcode, used internally
* FMT = format of the opcode (defined in insn-format.h.inc)
- * FAC = facility the opcode is available in (defined in DisasFacility)
+ * FAC = facility the opcode is available in (defined in translate.c)
* I1 = func in1_xx fills o->in1
* I2 = func in2_xx fills o->in2
* P = func prep_xx initializes o->*out*
@@ -361,6 +361,9 @@
C(0xb91d, DSGFR, RRE, Z, r1p1, r2_32s, r1_P, 0, divs64, 0)
C(0xe30d, DSG, RXY_a, Z, r1p1, m2_64, r1_P, 0, divs64, 0)
C(0xe31d, DSGF, RXY_a, Z, r1p1, m2_32s, r1_P, 0, divs64, 0)
+/* DIVIDE TO INTEGER */
+ D(0xb35b, DIDBR, RRF_b, Z, 0, 0, 0, 0, dib, 0, 64)
+ D(0xb353, DIEBR, RRF_b, Z, 0, 0, 0, 0, dib, 0, 32)
/* EXCLUSIVE OR */
C(0x1700, XR, RR_a, Z, r1, r2, new, r1_32, xor, nz32)
diff --git a/target/s390x/tcg/translate.c b/target/s390x/tcg/translate.c
index 540c5a569c0..a3b753bc829 100644
--- a/target/s390x/tcg/translate.c
+++ b/target/s390x/tcg/translate.c
@@ -2283,6 +2283,32 @@ static DisasJumpType op_dxb(DisasContext *s, DisasOps *o)
return DISAS_NEXT;
}
+/*
+ * Translate DIVIDE TO INTEGER (DIEBR / DIDBR).
+ *
+ * A specification exception is generated at translation time when any two
+ * of R1, R2 and R3 name the same register, or when M4 is not an accepted
+ * rounding mode (2, anything above 7, or 3 without the floating-point
+ * extension facility).  Otherwise the 32-bit or 64-bit helper is called,
+ * selected by the insn table's data field, and the condition code is
+ * marked as static because the helper stores it itself.
+ */
+static DisasJumpType op_dib(DisasContext *s, DisasOps *o)
+{
+    const bool fpe = s390_has_feat(S390_FEAT_FLOATING_POINT_EXT);
+    const int r1 = get_field(s, r1);
+    const int r2 = get_field(s, r2);
+    const int r3 = get_field(s, r3);
+    const uint8_t m4 = get_field(s, m4);
+    const bool regs_overlap = r1 == r2 || r1 == r3 || r2 == r3;
+    const bool bad_m4 = m4 == 2 || (!fpe && m4 == 3) || m4 > 7;
+
+    if (regs_overlap || bad_m4) {
+        gen_program_exception(s, PGM_SPECIFICATION);
+        return DISAS_NORETURN;
+    }
+
+    if (s->insn->data == 32) {
+        gen_helper_dieb(tcg_env, tcg_constant_i32(r1), tcg_constant_i32(r2),
+                        tcg_constant_i32(r3), tcg_constant_i32(m4));
+    } else {
+        gen_helper_didb(tcg_env, tcg_constant_i32(r1), tcg_constant_i32(r2),
+                        tcg_constant_i32(r3), tcg_constant_i32(m4));
+    }
+    set_cc_static(s);
+
+    return DISAS_NEXT;
+}
+
static DisasJumpType op_ear(DisasContext *s, DisasOps *o)
{
int r2 = get_field(s, r2);
--
2.52.0
On 1/22/26 09:12, Ilya Leoshkevich wrote:
> +static bool float128_is_int(float128 x)
> +{
> + return extract64(x.high, 0, 48) == 0 && x.low == 0;
> +}
This isn't testing for integer, it's testing for 1.0eNN,
i.e. a power of two.
> + /* Compute precise quotient */ \
> + a128 = floatN ## _to_float128(a, &env->fpu_status); \
> + b128 = floatN ## _to_float128(b, &env->fpu_status); \
> + q128 = float128_div(a128, b128, &env->fpu_status); \
> + \
> + /* Final or partial case? */ \
> + is_q128_smallish = float128_get_exp(q128) < p; \
> + is_final = is_q128_smallish || float128_is_int(q128); \
The language from the manual,
# If the precise quotient is not an integer and the two
# integers closest to this precise quotient cannot both
# be represented exactly in the precision of the quotient ...
does not appear to be what you are computing here.
Certainly none of this relates to "precision of the quotient".
I would imagine that all of this would be easier to accomplish if you did this in fpu/
with FloatParts instead of continually swapping in and out of float128.
r~
On 1/22/26 02:04, Richard Henderson wrote:
> On 1/22/26 09:12, Ilya Leoshkevich wrote:
>> +static bool float128_is_int(float128 x)
>> +{
>> + return extract64(x.high, 0, 48) == 0 && x.low == 0;
>> +}
>
> This isn't testing for integer, it's testing for 1.0eNN,
> i.e. a power of two.
Whoops. Not sure what I was thinking here.
>> + /* Compute precise quotient */ \
>> + a128 = floatN ## _to_float128(a, &env->fpu_status); \
>> + b128 = floatN ## _to_float128(b, &env->fpu_status); \
>> + q128 = float128_div(a128, b128, &env->fpu_status); \
>> + \
>> + /* Final or partial case? */ \
>> + is_q128_smallish = float128_get_exp(q128) < p; \
>> + is_final = is_q128_smallish || float128_is_int(q128); \
>
> The language from the manual,
>
> # If the precise quotient is not an integer and the two
> # integers closest to this precise quotient cannot both
> # be represented exactly in the precision of the quotient ...
>
> does not appear to be what you are computing here.
> Certainly none of this relates to "precision of the quotient".
I was rather following the tables, I think they are more precise w.r.t.
what needs to be checked.
float128_is_int(q128) was supposed to replace the r=0 check, but it's
probably unnecessary here altogether, because if the division result is
precise, it doesn't matter which way we round. And I'm explicitly
checking for r=0 in other places.
> I would imagine that all of this would be easier to accomplish if you
> did this in fpu/ with FloatParts instead of continually swapping in
> and out of float128.
That sounds good, I think I will also be able to reuse pick_nan and
simplify exponent manipulations then. I will give it a try.
> r~
© 2016 - 2026 Red Hat, Inc.