Apparently some versions of clang can't handle inline assembly with
__int128 parameters, especially on s390. Instead of hand-coding the
s390 divide, provide a generic fallback for anything that provides
__int128-capable maths.
Signed-off-by: Alex Bennée <alex.bennee@linaro.org>
Cc: Thomas Huth <thuth@redhat.com>
---
include/fpu/softfloat-macros.h | 10 ++++------
1 file changed, 4 insertions(+), 6 deletions(-)
diff --git a/include/fpu/softfloat-macros.h b/include/fpu/softfloat-macros.h
index b1d772e6d4..1a43609eef 100644
--- a/include/fpu/softfloat-macros.h
+++ b/include/fpu/softfloat-macros.h
@@ -641,12 +641,6 @@ static inline uint64_t udiv_qrnnd(uint64_t *r, uint64_t n1,
uint64_t q;
asm("divq %4" : "=a"(q), "=d"(*r) : "0"(n0), "1"(n1), "rm"(d));
return q;
-#elif defined(__s390x__)
- /* Need to use a TImode type to get an even register pair for DLGR. */
- unsigned __int128 n = (unsigned __int128)n1 << 64 | n0;
- asm("dlgr %0, %1" : "+r"(n) : "r"(d));
- *r = n >> 64;
- return n;
#elif defined(_ARCH_PPC64) && defined(_ARCH_PWR7)
/* From Power ISA 2.06, programming note for divdeu. */
uint64_t q1, q2, Q, r1, r2, R;
@@ -663,6 +657,10 @@ static inline uint64_t udiv_qrnnd(uint64_t *r, uint64_t n1,
}
*r = R;
return Q;
+#elif defined(CONFIG_INT128)
+ unsigned __int128 n = (unsigned __int128)n1 << 64 | n0;
+ *r = n % d;
+ return n / d;
#else
uint64_t d0, d1, q0, q1, r1, r0, m;
--
2.17.1
On 1/17/19 7:23 AM, Alex Bennée wrote:
> Apparently some versions of clang can't handle inline assembly with
> __int128 parameters, especially on s390. Instead of hand-coding the
> s390 divide provide a generic fallback for anything that provides
> __int128 capable maths.
>
> Signed-off-by: Alex Bennée <alex.bennee@linaro.org>
> Cc: Thomas Huth <thuth@redhat.com>
> ---
> include/fpu/softfloat-macros.h | 10 ++++------
> 1 file changed, 4 insertions(+), 6 deletions(-)
>
> diff --git a/include/fpu/softfloat-macros.h b/include/fpu/softfloat-macros.h
> index b1d772e6d4..1a43609eef 100644
> --- a/include/fpu/softfloat-macros.h
> +++ b/include/fpu/softfloat-macros.h
> @@ -641,12 +641,6 @@ static inline uint64_t udiv_qrnnd(uint64_t *r, uint64_t n1,
> uint64_t q;
> asm("divq %4" : "=a"(q), "=d"(*r) : "0"(n0), "1"(n1), "rm"(d));
> return q;
> -#elif defined(__s390x__)
> - /* Need to use a TImode type to get an even register pair for DLGR. */
> - unsigned __int128 n = (unsigned __int128)n1 << 64 | n0;
> - asm("dlgr %0, %1" : "+r"(n) : "r"(d));
> - *r = n >> 64;
> - return n;
> #elif defined(_ARCH_PPC64) && defined(_ARCH_PWR7)
> /* From Power ISA 2.06, programming note for divdeu. */
> uint64_t q1, q2, Q, r1, r2, R;
> @@ -663,6 +657,10 @@ static inline uint64_t udiv_qrnnd(uint64_t *r, uint64_t n1,
> }
> *r = R;
> return Q;
> +#elif defined(CONFIG_INT128)
> + unsigned __int128 n = (unsigned __int128)n1 << 64 | n0;
> + *r = n % d;
> + return n / d;
> #else
I thought that we'd shown that, at least at present, no compiler is taking
advantage of hardware insns for this, and is promoting this to a full 128-bit
divide. And further that the version using 64-bit arithmetic was competitive
with the hardware insn.
I'd rather not include this hunk for now.
r~
Richard Henderson <richard.henderson@linaro.org> writes:
> On 1/17/19 7:23 AM, Alex Bennée wrote:
>> Apparently some versions of clang can't handle inline assembly with
>> __int128 parameters, especially on s390. Instead of hand-coding the
>> s390 divide provide a generic fallback for anything that provides
>> __int128 capable maths.
>>
>> Signed-off-by: Alex Bennée <alex.bennee@linaro.org>
>> Cc: Thomas Huth <thuth@redhat.com>
>> ---
>> include/fpu/softfloat-macros.h | 10 ++++------
>> 1 file changed, 4 insertions(+), 6 deletions(-)
>>
>> diff --git a/include/fpu/softfloat-macros.h b/include/fpu/softfloat-macros.h
>> index b1d772e6d4..1a43609eef 100644
>> --- a/include/fpu/softfloat-macros.h
>> +++ b/include/fpu/softfloat-macros.h
>> @@ -641,12 +641,6 @@ static inline uint64_t udiv_qrnnd(uint64_t *r, uint64_t n1,
>> uint64_t q;
>> asm("divq %4" : "=a"(q), "=d"(*r) : "0"(n0), "1"(n1), "rm"(d));
>> return q;
>> -#elif defined(__s390x__)
>> - /* Need to use a TImode type to get an even register pair for DLGR. */
>> - unsigned __int128 n = (unsigned __int128)n1 << 64 | n0;
>> - asm("dlgr %0, %1" : "+r"(n) : "r"(d));
>> - *r = n >> 64;
>> - return n;
>> #elif defined(_ARCH_PPC64) && defined(_ARCH_PWR7)
>> /* From Power ISA 2.06, programming note for divdeu. */
>> uint64_t q1, q2, Q, r1, r2, R;
>> @@ -663,6 +657,10 @@ static inline uint64_t udiv_qrnnd(uint64_t *r, uint64_t n1,
>> }
>> *r = R;
>> return Q;
>> +#elif defined(CONFIG_INT128)
>> + unsigned __int128 n = (unsigned __int128)n1 << 64 | n0;
>> + *r = n % d;
>> + return n / d;
>> #else
>
> I thought that we'd shown that, at least at present, no compiler is taking
> advantage of hardware insns for this, and is promoting this to a full 128-bit
> divide. And further that the version using 64-bit arithmetic was competitive
> with the hardware insn.
Yeah, it seems so. While Thomas' numbers weren't convincing, the
CONFIG_INT128 fallback did trigger on my SynQuacer and knocked off about
2 MFlops of its admittedly slow performance. Amusingly, of course, it's
faster under translation because of the hardware fallback:
07:44:44 [alex@idun:~/l/q/t/fp] (8973c1e5…) + ./fp-bench -o div -p double
13.28 MFlops
07:44:49 [alex@idun:~/l/q/t/fp] (8973c1e5…) + ./fp-bench -o div -p double -t host
498.20 MFlops
07:44:53 [alex@idun:~/l/q/t/fp] (8973c1e5…) + ../../aarch64-linux-user/qemu-aarch64 ./fp-bench -o div -p double -t host
52.71 MFlops
I'll drop this and use Thomas' #elif defined(__s390x__) &&
!defined(__clang__) version in the pull-request.
--
Alex Bennée
On 2019-01-16 21:23, Alex Bennée wrote:
> Apparently some versions of clang can't handle inline assembly with
> __int128 parameters, especially on s390. Instead of hand-coding the
> s390 divide provide a generic fallback for anything that provides
> __int128 capable maths.
>
> Signed-off-by: Alex Bennée <alex.bennee@linaro.org>
> Cc: Thomas Huth <thuth@redhat.com>
> ---
> include/fpu/softfloat-macros.h | 10 ++++------
> 1 file changed, 4 insertions(+), 6 deletions(-)
>
> diff --git a/include/fpu/softfloat-macros.h b/include/fpu/softfloat-macros.h
> index b1d772e6d4..1a43609eef 100644
> --- a/include/fpu/softfloat-macros.h
> +++ b/include/fpu/softfloat-macros.h
> @@ -641,12 +641,6 @@ static inline uint64_t udiv_qrnnd(uint64_t *r, uint64_t n1,
> uint64_t q;
> asm("divq %4" : "=a"(q), "=d"(*r) : "0"(n0), "1"(n1), "rm"(d));
> return q;
> -#elif defined(__s390x__)
> - /* Need to use a TImode type to get an even register pair for DLGR. */
> - unsigned __int128 n = (unsigned __int128)n1 << 64 | n0;
> - asm("dlgr %0, %1" : "+r"(n) : "r"(d));
> - *r = n >> 64;
> - return n;
> #elif defined(_ARCH_PPC64) && defined(_ARCH_PWR7)
> /* From Power ISA 2.06, programming note for divdeu. */
> uint64_t q1, q2, Q, r1, r2, R;
> @@ -663,6 +657,10 @@ static inline uint64_t udiv_qrnnd(uint64_t *r, uint64_t n1,
> }
> *r = R;
> return Q;
> +#elif defined(CONFIG_INT128)
> + unsigned __int128 n = (unsigned __int128)n1 << 64 | n0;
> + *r = n % d;
> + return n / d;
> #else
> uint64_t d0, d1, q0, q1, r1, r0, m;
No, please don't. Use my !defined(__clang__) patch instead, please.
Thomas
© 2016 - 2026 Red Hat, Inc.