From: "Lucas Mateus Castro (alqotel)" <lucas.araujo@eldorado.org.br>
Based on already existing QEMU implementation, created an unsigned 256
bit by 128 bit division needed to implement the vector divide extended
unsigned instruction from PowerISA3.1
Signed-off-by: Lucas Mateus Castro (alqotel) <lucas.araujo@eldorado.org.br>
---
include/qemu/host-utils.h | 15 +++++
include/qemu/int128.h | 20 ++++++
util/host-utils.c | 128 ++++++++++++++++++++++++++++++++++++++
3 files changed, 163 insertions(+)
diff --git a/include/qemu/host-utils.h b/include/qemu/host-utils.h
index ca979dc6cc..6da6a93f69 100644
--- a/include/qemu/host-utils.h
+++ b/include/qemu/host-utils.h
@@ -32,6 +32,7 @@
#include "qemu/compiler.h"
#include "qemu/bswap.h"
+#include "qemu/int128.h"
#ifdef CONFIG_INT128
static inline void mulu64(uint64_t *plow, uint64_t *phigh,
@@ -153,6 +154,19 @@ static inline int clo64(uint64_t val)
return clz64(~val);
}
+/*
+ * clz128 - count leading zeros in a 128-bit value.
+ * @val: The value to search
+ */
+static inline int clz128(Int128 a)
+{
+ if (int128_gethi(a)) {
+ return clz64(int128_gethi(a));
+ } else {
+ return clz64(int128_getlo(a)) + 64;
+ }
+}
+
/**
* ctz32 - count trailing zeros in a 32-bit value.
* @val: The value to search
@@ -849,4 +863,5 @@ static inline uint64_t udiv_qrnnd(uint64_t *r, uint64_t n1,
#endif
}
+Int128 divu256(Int128 *plow, Int128 *phigh, Int128 divisor);
#endif
diff --git a/include/qemu/int128.h b/include/qemu/int128.h
index 3af01f38cd..2a9ee956aa 100644
--- a/include/qemu/int128.h
+++ b/include/qemu/int128.h
@@ -128,11 +128,21 @@ static inline bool int128_ge(Int128 a, Int128 b)
return a >= b;
}
+static inline bool int128_uge(Int128 a, Int128 b)
+{
+ return ((__uint128_t)a) >= ((__uint128_t)b);
+}
+
static inline bool int128_lt(Int128 a, Int128 b)
{
return a < b;
}
+static inline bool int128_ult(Int128 a, Int128 b)
+{
+ return (__uint128_t)a < (__uint128_t)b;
+}
+
static inline bool int128_le(Int128 a, Int128 b)
{
return a <= b;
@@ -373,11 +383,21 @@ static inline bool int128_ge(Int128 a, Int128 b)
return a.hi > b.hi || (a.hi == b.hi && a.lo >= b.lo);
}
+static inline bool int128_uge(Int128 a, Int128 b)
+{
+ return (uint64_t)a.hi > (uint64_t)b.hi || (a.hi == b.hi && a.lo >= b.lo);
+}
+
static inline bool int128_lt(Int128 a, Int128 b)
{
return !int128_ge(a, b);
}
+static inline bool int128_ult(Int128 a, Int128 b)
+{
+ return !int128_uge(a, b);
+}
+
static inline bool int128_le(Int128 a, Int128 b)
{
return int128_ge(b, a);
diff --git a/util/host-utils.c b/util/host-utils.c
index bcc772b8ec..c6a01638c7 100644
--- a/util/host-utils.c
+++ b/util/host-utils.c
@@ -266,3 +266,131 @@ void ulshift(uint64_t *plow, uint64_t *phigh, int32_t shift, bool *overflow)
*plow = *plow << shift;
}
}
+/*
+ * Unsigned 256-by-128 division.
+ * Returns the remainder via r.
+ * Returns lower 128 bit of quotient.
+ * Needs a normalized divisor (most significant bit set to 1).
+ *
+ * Adapted from include/qemu/host-utils.h udiv_qrnnd,
+ * from the GNU Multi Precision Library - longlong.h __udiv_qrnnd
+ * (https://gmplib.org/repo/gmp/file/tip/longlong.h)
+ *
+ * Licensed under the GPLv2/LGPLv3
+ */
+static Int128 udiv256_qrnnd(Int128 *r, Int128 n1, Int128 n0, Int128 d)
+{
+ Int128 d0, d1, q0, q1, r1, r0, m;
+ uint64_t mp0, mp1;
+
+ d0 = int128_make64(int128_getlo(d));
+ d1 = int128_make64(int128_gethi(d));
+
+ r1 = int128_remu(n1, d1);
+ q1 = int128_divu(n1, d1);
+ mp0 = int128_getlo(q1);
+ mp1 = int128_gethi(q1);
+ mulu128(&mp0, &mp1, int128_getlo(d0));
+ m = int128_make128(mp0, mp1);
+ r1 = int128_make128(int128_gethi(n0), int128_getlo(r1));
+ if (int128_ult(r1, m)) {
+ q1 = int128_sub(q1, int128_one());
+ r1 = int128_add(r1, d);
+ if (int128_uge(r1, d)) {
+ if (int128_ult(r1, m)) {
+ q1 = int128_sub(q1, int128_one());
+ r1 = int128_add(r1, d);
+ }
+ }
+ }
+ r1 = int128_sub(r1, m);
+
+ r0 = int128_remu(r1, d1);
+ q0 = int128_divu(r1, d1);
+ mp0 = int128_getlo(q0);
+ mp1 = int128_gethi(q0);
+ mulu128(&mp0, &mp1, int128_getlo(d0));
+ m = int128_make128(mp0, mp1);
+ r0 = int128_make128(int128_getlo(n0), int128_getlo(r0));
+ if (int128_ult(r0, m)) {
+ q0 = int128_sub(q0, int128_one());
+ r0 = int128_add(r0, d);
+ if (int128_uge(r0, d)) {
+ if (int128_ult(r0, m)) {
+ q0 = int128_sub(q0, int128_one());
+ r0 = int128_add(r0, d);
+ }
+ }
+ }
+ r0 = int128_sub(r0, m);
+
+ *r = r0;
+ return int128_or(int128_lshift(q1, 64), q0);
+}
+
+/*
+ * Unsigned 256-by-128 division.
+ * Returns the remainder.
+ * Returns quotient via plow and phigh.
+ * Also returns the remainder via the function return value.
+ */
+Int128 divu256(Int128 *plow, Int128 *phigh, Int128 divisor)
+{
+ Int128 dhi = *phigh;
+ Int128 dlo = *plow;
+ Int128 rem, dhighest;
+ int sh;
+
+ if (!int128_nz(divisor) || !int128_nz(dhi)) {
+ *plow = int128_divu(dlo, divisor);
+ *phigh = int128_zero();
+ return int128_remu(dlo, divisor);
+ } else {
+ sh = clz128(divisor);
+
+ if (int128_ult(dhi, divisor)) {
+ if (sh != 0) {
+ /* normalize the divisor, shifting the dividend accordingly */
+ divisor = int128_lshift(divisor, sh);
+ dhi = int128_or(int128_lshift(dhi, sh),
+ int128_urshift(dlo, (128 - sh)));
+ dlo = int128_lshift(dlo, sh);
+ }
+
+ *phigh = int128_zero();
+ *plow = udiv256_qrnnd(&rem, dhi, dlo, divisor);
+ } else {
+ if (sh != 0) {
+ /* normalize the divisor, shifting the dividend accordingly */
+ divisor = int128_lshift(divisor, sh);
+ dhighest = int128_rshift(dhi, (128 - sh));
+ dhi = int128_or(int128_lshift(dhi, sh),
+ int128_urshift(dlo, (128 - sh)));
+ dlo = int128_lshift(dlo, sh);
+
+ *phigh = udiv256_qrnnd(&dhi, dhighest, dhi, divisor);
+ } else {
+ /*
+ * dhi >= divisor
+ * Since the MSB of divisor is set (sh == 0),
+ * (dhi - divisor) < divisor
+ *
+ * Thus, the high part of the quotient is 1, and we can
+ * calculate the low part with a single call to udiv_qrnnd
+ * after subtracting divisor from dhi
+ */
+ dhi = int128_sub(dhi, divisor);
+ *phigh = int128_one();
+ }
+
+ *plow = udiv256_qrnnd(&rem, dhi, dlo, divisor);
+ }
+
+ /*
+ * since the dividend/divisor might have been normalized,
+ * the remainder might also have to be shifted back
+ */
+ rem = int128_urshift(rem, sh);
+ return rem;
+ }
+}
--
2.31.1
On 4/5/22 12:55, Lucas Mateus Castro(alqotel) wrote:
> From: "Lucas Mateus Castro (alqotel)" <lucas.araujo@eldorado.org.br>
>
> Based on already existing QEMU implementation, created an unsigned 256
> bit by 128 bit division needed to implement the vector divide extended
> unsigned instruction from PowerISA3.1
>
> Signed-off-by: Lucas Mateus Castro (alqotel) <lucas.araujo@eldorado.org.br>
> ---
> include/qemu/host-utils.h | 15 +++++
> include/qemu/int128.h | 20 ++++++
> util/host-utils.c | 128 ++++++++++++++++++++++++++++++++++++++
> 3 files changed, 163 insertions(+)
>
> diff --git a/include/qemu/host-utils.h b/include/qemu/host-utils.h
> index ca979dc6cc..6da6a93f69 100644
> --- a/include/qemu/host-utils.h
> +++ b/include/qemu/host-utils.h
> @@ -32,6 +32,7 @@
>
> #include "qemu/compiler.h"
> #include "qemu/bswap.h"
> +#include "qemu/int128.h"
>
> #ifdef CONFIG_INT128
> static inline void mulu64(uint64_t *plow, uint64_t *phigh,
> @@ -153,6 +154,19 @@ static inline int clo64(uint64_t val)
> return clz64(~val);
> }
>
> +/*
> + * clz128 - count leading zeros in a 128-bit value.
> + * @val: The value to search
> + */
> +static inline int clz128(Int128 a)
> +{
> + if (int128_gethi(a)) {
> + return clz64(int128_gethi(a));
> + } else {
> + return clz64(int128_getlo(a)) + 64;
> + }
> +}
Should be in int128.h, like bswap128.
> diff --git a/util/host-utils.c b/util/host-utils.c
> index bcc772b8ec..c6a01638c7 100644
> --- a/util/host-utils.c
> +++ b/util/host-utils.c
> @@ -266,3 +266,131 @@ void ulshift(uint64_t *plow, uint64_t *phigh, int32_t shift, bool *overflow)
> *plow = *plow << shift;
> }
> }
> +/*
Watch your spacing.
> + * Unsigned 256-by-128 division.
> + * Returns the remainder via r.
> + * Returns lower 128 bit of quotient.
> + * Needs a normalized divisor (most significant bit set to 1).
> + *
> + * Adapted from include/qemu/host-utils.h udiv_qrnnd,
> + * from the GNU Multi Precision Library - longlong.h __udiv_qrnnd
> + * (https://gmplib.org/repo/gmp/file/tip/longlong.h)
> + *
> + * Licensed under the GPLv2/LGPLv3
> + */
> +static Int128 udiv256_qrnnd(Int128 *r, Int128 n1, Int128 n0, Int128 d)
> +{
> + Int128 d0, d1, q0, q1, r1, r0, m;
> + uint64_t mp0, mp1;
> +
> + d0 = int128_make64(int128_getlo(d));
> + d1 = int128_make64(int128_gethi(d));
> +
> + r1 = int128_remu(n1, d1);
> + q1 = int128_divu(n1, d1);
> + mp0 = int128_getlo(q1);
> + mp1 = int128_gethi(q1);
> + mulu128(&mp0, &mp1, int128_getlo(d0));
> + m = int128_make128(mp0, mp1);
> + r1 = int128_make128(int128_gethi(n0), int128_getlo(r1));
> + if (int128_ult(r1, m)) {
> + q1 = int128_sub(q1, int128_one());
> + r1 = int128_add(r1, d);
> + if (int128_uge(r1, d)) {
> + if (int128_ult(r1, m)) {
> + q1 = int128_sub(q1, int128_one());
> + r1 = int128_add(r1, d);
> + }
> + }
> + }
> + r1 = int128_sub(r1, m);
> +
> + r0 = int128_remu(r1, d1);
> + q0 = int128_divu(r1, d1);
> + mp0 = int128_getlo(q0);
> + mp1 = int128_gethi(q0);
> + mulu128(&mp0, &mp1, int128_getlo(d0));
> + m = int128_make128(mp0, mp1);
> + r0 = int128_make128(int128_getlo(n0), int128_getlo(r0));
> + if (int128_ult(r0, m)) {
> + q0 = int128_sub(q0, int128_one());
> + r0 = int128_add(r0, d);
> + if (int128_uge(r0, d)) {
> + if (int128_ult(r0, m)) {
> + q0 = int128_sub(q0, int128_one());
> + r0 = int128_add(r0, d);
> + }
> + }
> + }
> + r0 = int128_sub(r0, m);
> +
> + *r = r0;
> + return int128_or(int128_lshift(q1, 64), q0);
> +}
> +
> +/*
> + * Unsigned 256-by-128 division.
> + * Returns the remainder.
> + * Returns quotient via plow and phigh.
> + * Also returns the remainder via the function return value.
> + */
> +Int128 divu256(Int128 *plow, Int128 *phigh, Int128 divisor)
> +{
> + Int128 dhi = *phigh;
> + Int128 dlo = *plow;
> + Int128 rem, dhighest;
> + int sh;
> +
> + if (!int128_nz(divisor) || !int128_nz(dhi)) {
> + *plow = int128_divu(dlo, divisor);
> + *phigh = int128_zero();
> + return int128_remu(dlo, divisor);
> + } else {
> + sh = clz128(divisor);
> +
> + if (int128_ult(dhi, divisor)) {
> + if (sh != 0) {
> + /* normalize the divisor, shifting the dividend accordingly */
> + divisor = int128_lshift(divisor, sh);
> + dhi = int128_or(int128_lshift(dhi, sh),
> + int128_urshift(dlo, (128 - sh)));
> + dlo = int128_lshift(dlo, sh);
> + }
> +
> + *phigh = int128_zero();
> + *plow = udiv256_qrnnd(&rem, dhi, dlo, divisor);
> + } else {
> + if (sh != 0) {
> + /* normalize the divisor, shifting the dividend accordingly */
> + divisor = int128_lshift(divisor, sh);
> + dhighest = int128_rshift(dhi, (128 - sh));
> + dhi = int128_or(int128_lshift(dhi, sh),
> + int128_urshift(dlo, (128 - sh)));
> + dlo = int128_lshift(dlo, sh);
> +
> + *phigh = udiv256_qrnnd(&dhi, dhighest, dhi, divisor);
> + } else {
> + /*
> + * dhi >= divisor
> + * Since the MSB of divisor is set (sh == 0),
> + * (dhi - divisor) < divisor
> + *
> + * Thus, the high part of the quotient is 1, and we can
> + * calculate the low part with a single call to udiv_qrnnd
> + * after subtracting divisor from dhi
> + */
> + dhi = int128_sub(dhi, divisor);
> + *phigh = int128_one();
> + }
> +
> + *plow = udiv256_qrnnd(&rem, dhi, dlo, divisor);
> + }
> +
> + /*
> + * since the dividend/divisor might have been normalized,
> + * the remainder might also have to be shifted back
> + */
> + rem = int128_urshift(rem, sh);
> + return rem;
> + }
> +}
I guess this works. I'm starting to wonder if we shouldn't use libgmp, instead of rolling
our own. In this case, mpn_tdiv_qr.
Anyway, modulo placement of clz128,
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
r~
© 2016 - 2026 Red Hat, Inc.