include/fpu/softfloat-macros.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
Clang v7.0.1 does not like the __int128 variable type for inline
assembly on s390x:
In file included from fpu/softfloat.c:97:
include/fpu/softfloat-macros.h:647:9: error: inline asm error:
This value type register class is not natively supported!
asm("dlgr %0, %1" : "+r"(n) : "r"(d));
^
Disable this code part there now when compiling with Clang, so that
the generic code gets used instead.
Signed-off-by: Thomas Huth <thuth@redhat.com>
---
include/fpu/softfloat-macros.h | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/include/fpu/softfloat-macros.h b/include/fpu/softfloat-macros.h
index b1d772e..bd5b641 100644
--- a/include/fpu/softfloat-macros.h
+++ b/include/fpu/softfloat-macros.h
@@ -641,7 +641,7 @@ static inline uint64_t udiv_qrnnd(uint64_t *r, uint64_t n1,
uint64_t q;
asm("divq %4" : "=a"(q), "=d"(*r) : "0"(n0), "1"(n1), "rm"(d));
return q;
-#elif defined(__s390x__)
+#elif defined(__s390x__) && !defined(__clang__)
/* Need to use a TImode type to get an even register pair for DLGR. */
unsigned __int128 n = (unsigned __int128)n1 << 64 | n0;
asm("dlgr %0, %1" : "+r"(n) : "r"(d));
--
1.8.3.1
On 1/14/19 1:12 PM, Thomas Huth wrote:
> Clang v7.0.1 does not like the __int128 variable type for inline
> assembly on s390x:
>
> In file included from fpu/softfloat.c:97:
> include/fpu/softfloat-macros.h:647:9: error: inline asm error:
> This value type register class is not natively supported!
> asm("dlgr %0, %1" : "+r"(n) : "r"(d));
> ^
>
> Disable this code part there now when compiling with Clang, so that
> the generic code gets used instead.
>
> Signed-off-by: Thomas Huth <thuth@redhat.com>
> ---
> include/fpu/softfloat-macros.h | 2 +-
> 1 file changed, 1 insertion(+), 1 deletion(-)
>
> diff --git a/include/fpu/softfloat-macros.h b/include/fpu/softfloat-macros.h
> index b1d772e..bd5b641 100644
> --- a/include/fpu/softfloat-macros.h
> +++ b/include/fpu/softfloat-macros.h
> @@ -641,7 +641,7 @@ static inline uint64_t udiv_qrnnd(uint64_t *r, uint64_t n1,
> uint64_t q;
> asm("divq %4" : "=a"(q), "=d"(*r) : "0"(n0), "1"(n1), "rm"(d));
> return q;
> -#elif defined(__s390x__)
> +#elif defined(__s390x__) && !defined(__clang__)
Can we rather check if __int128 is natively supported? So this part get
compiled once Clang do support it, else we'll never use it...
> /* Need to use a TImode type to get an even register pair for DLGR. */
> unsigned __int128 n = (unsigned __int128)n1 << 64 | n0;
> asm("dlgr %0, %1" : "+r"(n) : "r"(d));
>
Philippe Mathieu-Daudé <philmd@redhat.com> writes:
> On 1/14/19 1:12 PM, Thomas Huth wrote:
>> Clang v7.0.1 does not like the __int128 variable type for inline
>> assembly on s390x:
>>
>> In file included from fpu/softfloat.c:97:
>> include/fpu/softfloat-macros.h:647:9: error: inline asm error:
>> This value type register class is not natively supported!
>> asm("dlgr %0, %1" : "+r"(n) : "r"(d));
>> ^
>>
>> Disable this code part there now when compiling with Clang, so that
>> the generic code gets used instead.
>>
>> Signed-off-by: Thomas Huth <thuth@redhat.com>
>> ---
>> include/fpu/softfloat-macros.h | 2 +-
>> 1 file changed, 1 insertion(+), 1 deletion(-)
>>
>> diff --git a/include/fpu/softfloat-macros.h b/include/fpu/softfloat-macros.h
>> index b1d772e..bd5b641 100644
>> --- a/include/fpu/softfloat-macros.h
>> +++ b/include/fpu/softfloat-macros.h
>> @@ -641,7 +641,7 @@ static inline uint64_t udiv_qrnnd(uint64_t *r, uint64_t n1,
>> uint64_t q;
>> asm("divq %4" : "=a"(q), "=d"(*r) : "0"(n0), "1"(n1), "rm"(d));
>> return q;
>> -#elif defined(__s390x__)
>> +#elif defined(__s390x__) && !defined(__clang__)
>
> Can we rather check if __int128 is natively supported? So this part get
> compiled once Clang do support it, else we'll never use it...
We already define CONFIG_INT128 so you could just use that.
Thomas does the s390 clang leave CONFIG_INT128=y in config-host.mak?
>
>> /* Need to use a TImode type to get an even register pair for DLGR. */
>> unsigned __int128 n = (unsigned __int128)n1 << 64 | n0;
>> asm("dlgr %0, %1" : "+r"(n) : "r"(d));
>>
--
Alex Bennée
On 2019-01-14 17:37, Alex Bennée wrote:
>
> Philippe Mathieu-Daudé <philmd@redhat.com> writes:
>
>> On 1/14/19 1:12 PM, Thomas Huth wrote:
>>> Clang v7.0.1 does not like the __int128 variable type for inline
>>> assembly on s390x:
>>>
>>> In file included from fpu/softfloat.c:97:
>>> include/fpu/softfloat-macros.h:647:9: error: inline asm error:
>>> This value type register class is not natively supported!
>>> asm("dlgr %0, %1" : "+r"(n) : "r"(d));
>>> ^
>>>
>>> Disable this code part there now when compiling with Clang, so that
>>> the generic code gets used instead.
>>>
>>> Signed-off-by: Thomas Huth <thuth@redhat.com>
>>> ---
>>> include/fpu/softfloat-macros.h | 2 +-
>>> 1 file changed, 1 insertion(+), 1 deletion(-)
>>>
>>> diff --git a/include/fpu/softfloat-macros.h b/include/fpu/softfloat-macros.h
>>> index b1d772e..bd5b641 100644
>>> --- a/include/fpu/softfloat-macros.h
>>> +++ b/include/fpu/softfloat-macros.h
>>> @@ -641,7 +641,7 @@ static inline uint64_t udiv_qrnnd(uint64_t *r, uint64_t n1,
>>> uint64_t q;
>>> asm("divq %4" : "=a"(q), "=d"(*r) : "0"(n0), "1"(n1), "rm"(d));
>>> return q;
>>> -#elif defined(__s390x__)
>>> +#elif defined(__s390x__) && !defined(__clang__)
>>
>> Can we rather check if __int128 is natively supported? So this part get
>> compiled once Clang do support it, else we'll never use it...
>
> We already define CONFIG_INT128 so you could just use that.
>
> Thomas does the s390 clang leave CONFIG_INT128=y in config-host.mak?
Yes, CONFIG_INT128=y is also set with Clang on s390x. It's really just
that it does not like __int128 to be passed as parameters for inline
assembly...
Thomas
Thomas Huth <thuth@redhat.com> writes:
> On 2019-01-14 17:37, Alex Bennée wrote:
>>
>> Philippe Mathieu-Daudé <philmd@redhat.com> writes:
>>
>>> On 1/14/19 1:12 PM, Thomas Huth wrote:
>>>> Clang v7.0.1 does not like the __int128 variable type for inline
>>>> assembly on s390x:
>>>>
>>>> In file included from fpu/softfloat.c:97:
>>>> include/fpu/softfloat-macros.h:647:9: error: inline asm error:
>>>> This value type register class is not natively supported!
>>>> asm("dlgr %0, %1" : "+r"(n) : "r"(d));
>>>> ^
>>>>
>>>> Disable this code part there now when compiling with Clang, so that
>>>> the generic code gets used instead.
>>>>
>>>> Signed-off-by: Thomas Huth <thuth@redhat.com>
>>>> ---
>>>> include/fpu/softfloat-macros.h | 2 +-
>>>> 1 file changed, 1 insertion(+), 1 deletion(-)
>>>>
>>>> diff --git a/include/fpu/softfloat-macros.h b/include/fpu/softfloat-macros.h
>>>> index b1d772e..bd5b641 100644
>>>> --- a/include/fpu/softfloat-macros.h
>>>> +++ b/include/fpu/softfloat-macros.h
>>>> @@ -641,7 +641,7 @@ static inline uint64_t udiv_qrnnd(uint64_t *r, uint64_t n1,
>>>> uint64_t q;
>>>> asm("divq %4" : "=a"(q), "=d"(*r) : "0"(n0), "1"(n1), "rm"(d));
>>>> return q;
>>>> -#elif defined(__s390x__)
>>>> +#elif defined(__s390x__) && !defined(__clang__)
>>>
>>> Can we rather check if __int128 is natively supported? So this part get
>>> compiled once Clang do support it, else we'll never use it...
>>
>> We already define CONFIG_INT128 so you could just use that.
>>
>> Thomas does the s390 clang leave CONFIG_INT128=y in config-host.mak?
>
> Yes, CONFIG_INT128=y is also set with Clang on s390x. It's really just
> that it does not like __int128 to be passed as parameters for inline
> assembly...
What about something like this:
modified include/fpu/softfloat-macros.h
@@ -641,12 +641,6 @@ static inline uint64_t udiv_qrnnd(uint64_t *r, uint64_t n1,
uint64_t q;
asm("divq %4" : "=a"(q), "=d"(*r) : "0"(n0), "1"(n1), "rm"(d));
return q;
-#elif defined(__s390x__)
- /* Need to use a TImode type to get an even register pair for DLGR. */
- unsigned __int128 n = (unsigned __int128)n1 << 64 | n0;
- asm("dlgr %0, %1" : "+r"(n) : "r"(d));
- *r = n >> 64;
- return n;
#elif defined(_ARCH_PPC64) && defined(_ARCH_PWR7)
/* From Power ISA 2.06, programming note for divdeu. */
uint64_t q1, q2, Q, r1, r2, R;
@@ -663,6 +657,11 @@ static inline uint64_t udiv_qrnnd(uint64_t *r, uint64_t n1,
}
*r = R;
return Q;
+#elif defined(CONFIG_INT128)
+ unsigned __int128 n = (unsigned __int128)n1 << 64 | n0;
+ unsigned __int128 q = n / d;
+ *r = q >> 64;
+ return q;
#else
uint64_t d0, d1, q0, q1, r1, r0, m;
--
Alex Bennée
On 1/15/19 5:58 AM, Alex Bennée wrote:
>
> Thomas Huth <thuth@redhat.com> writes:
>
>> On 2019-01-14 17:37, Alex Bennée wrote:
>>>
>>> Philippe Mathieu-Daudé <philmd@redhat.com> writes:
>>>
>>>> On 1/14/19 1:12 PM, Thomas Huth wrote:
>>>>> Clang v7.0.1 does not like the __int128 variable type for inline
>>>>> assembly on s390x:
>>>>>
>>>>> In file included from fpu/softfloat.c:97:
>>>>> include/fpu/softfloat-macros.h:647:9: error: inline asm error:
>>>>> This value type register class is not natively supported!
>>>>> asm("dlgr %0, %1" : "+r"(n) : "r"(d));
>>>>> ^
>>>>>
>>>>> Disable this code part there now when compiling with Clang, so that
>>>>> the generic code gets used instead.
>>>>>
>>>>> Signed-off-by: Thomas Huth <thuth@redhat.com>
>>>>> ---
>>>>> include/fpu/softfloat-macros.h | 2 +-
>>>>> 1 file changed, 1 insertion(+), 1 deletion(-)
>>>>>
>>>>> diff --git a/include/fpu/softfloat-macros.h b/include/fpu/softfloat-macros.h
>>>>> index b1d772e..bd5b641 100644
>>>>> --- a/include/fpu/softfloat-macros.h
>>>>> +++ b/include/fpu/softfloat-macros.h
>>>>> @@ -641,7 +641,7 @@ static inline uint64_t udiv_qrnnd(uint64_t *r, uint64_t n1,
>>>>> uint64_t q;
>>>>> asm("divq %4" : "=a"(q), "=d"(*r) : "0"(n0), "1"(n1), "rm"(d));
>>>>> return q;
>>>>> -#elif defined(__s390x__)
>>>>> +#elif defined(__s390x__) && !defined(__clang__)
>>>>
>>>> Can we rather check if __int128 is natively supported? So this part get
>>>> compiled once Clang do support it, else we'll never use it...
>>>
>>> We already define CONFIG_INT128 so you could just use that.
>>>
>>> Thomas does the s390 clang leave CONFIG_INT128=y in config-host.mak?
>>
>> Yes, CONFIG_INT128=y is also set with Clang on s390x. It's really just
>> that it does not like __int128 to be passed as parameters for inline
>> assembly...
>
> What about something like this:
>
> modified include/fpu/softfloat-macros.h
> @@ -641,12 +641,6 @@ static inline uint64_t udiv_qrnnd(uint64_t *r, uint64_t n1,
> uint64_t q;
> asm("divq %4" : "=a"(q), "=d"(*r) : "0"(n0), "1"(n1), "rm"(d));
> return q;
> -#elif defined(__s390x__)
> - /* Need to use a TImode type to get an even register pair for DLGR. */
> - unsigned __int128 n = (unsigned __int128)n1 << 64 | n0;
> - asm("dlgr %0, %1" : "+r"(n) : "r"(d));
> - *r = n >> 64;
> - return n;
> #elif defined(_ARCH_PPC64) && defined(_ARCH_PWR7)
> /* From Power ISA 2.06, programming note for divdeu. */
> uint64_t q1, q2, Q, r1, r2, R;
> @@ -663,6 +657,11 @@ static inline uint64_t udiv_qrnnd(uint64_t *r, uint64_t n1,
> }
> *r = R;
> return Q;
> +#elif defined(CONFIG_INT128)
> + unsigned __int128 n = (unsigned __int128)n1 << 64 | n0;
> + unsigned __int128 q = n / d;
> + *r = q >> 64;
> + return q;
Because that is not what the assembly does, for one.
But perhaps
unsigned __int128 n = (unsigned __int128)n1 << 64 | n0;
*r = n % d;
return n / d;
will allow the compiler to do what the assembly does for some 64-bit hosts.
r~
Richard Henderson <rth@twiddle.net> writes:
> On 1/15/19 5:58 AM, Alex Bennée wrote:
>>
>> Thomas Huth <thuth@redhat.com> writes:
>>
>>> On 2019-01-14 17:37, Alex Bennée wrote:
>>>>
>>>> Philippe Mathieu-Daudé <philmd@redhat.com> writes:
>>>>
>>>>> On 1/14/19 1:12 PM, Thomas Huth wrote:
>>>>>> Clang v7.0.1 does not like the __int128 variable type for inline
>>>>>> assembly on s390x:
>>>>>>
>>>>>> In file included from fpu/softfloat.c:97:
>>>>>> include/fpu/softfloat-macros.h:647:9: error: inline asm error:
>>>>>> This value type register class is not natively supported!
>>>>>> asm("dlgr %0, %1" : "+r"(n) : "r"(d));
>>>>>> ^
>>>>>>
>>>>>> Disable this code part there now when compiling with Clang, so that
>>>>>> the generic code gets used instead.
>>>>>>
>>>>>> Signed-off-by: Thomas Huth <thuth@redhat.com>
>>>>>> ---
>>>>>> include/fpu/softfloat-macros.h | 2 +-
>>>>>> 1 file changed, 1 insertion(+), 1 deletion(-)
>>>>>>
>>>>>> diff --git a/include/fpu/softfloat-macros.h b/include/fpu/softfloat-macros.h
>>>>>> index b1d772e..bd5b641 100644
>>>>>> --- a/include/fpu/softfloat-macros.h
>>>>>> +++ b/include/fpu/softfloat-macros.h
>>>>>> @@ -641,7 +641,7 @@ static inline uint64_t udiv_qrnnd(uint64_t *r, uint64_t n1,
>>>>>> uint64_t q;
>>>>>> asm("divq %4" : "=a"(q), "=d"(*r) : "0"(n0), "1"(n1), "rm"(d));
>>>>>> return q;
>>>>>> -#elif defined(__s390x__)
>>>>>> +#elif defined(__s390x__) && !defined(__clang__)
>>>>>
>>>>> Can we rather check if __int128 is natively supported? So this part get
>>>>> compiled once Clang do support it, else we'll never use it...
>>>>
>>>> We already define CONFIG_INT128 so you could just use that.
>>>>
>>>> Thomas does the s390 clang leave CONFIG_INT128=y in config-host.mak?
>>>
>>> Yes, CONFIG_INT128=y is also set with Clang on s390x. It's really just
>>> that it does not like __int128 to be passed as parameters for inline
>>> assembly...
>>
>> What about something like this:
>>
>> modified include/fpu/softfloat-macros.h
>> @@ -641,12 +641,6 @@ static inline uint64_t udiv_qrnnd(uint64_t *r, uint64_t n1,
>> uint64_t q;
>> asm("divq %4" : "=a"(q), "=d"(*r) : "0"(n0), "1"(n1), "rm"(d));
>> return q;
>> -#elif defined(__s390x__)
>> - /* Need to use a TImode type to get an even register pair for DLGR. */
>> - unsigned __int128 n = (unsigned __int128)n1 << 64 | n0;
>> - asm("dlgr %0, %1" : "+r"(n) : "r"(d));
>> - *r = n >> 64;
>> - return n;
>> #elif defined(_ARCH_PPC64) && defined(_ARCH_PWR7)
>> /* From Power ISA 2.06, programming note for divdeu. */
>> uint64_t q1, q2, Q, r1, r2, R;
>> @@ -663,6 +657,11 @@ static inline uint64_t udiv_qrnnd(uint64_t *r, uint64_t n1,
>> }
>> *r = R;
>> return Q;
>> +#elif defined(CONFIG_INT128)
>> + unsigned __int128 n = (unsigned __int128)n1 << 64 | n0;
>> + unsigned __int128 q = n / d;
>> + *r = q >> 64;
>> + return q;
>
> Because that is not what the assembly does, for one.
Doh...
> But perhaps
>
> unsigned __int128 n = (unsigned __int128)n1 << 64 | n0;
> *r = n % d;
> return n / d;
>
> will allow the compiler to do what the assembly does for some 64-bit
> hosts.
I wonder how much cost is incurred by the jumping to the (libgcc?) div
helper? Anyone got an s390x about so we can benchmark the two
approaches?
If it's in the noise then it would be nice to avoid getting too #ifdef
happy.
--
Alex Bennée
On Mon, 14 Jan 2019 at 22:48, Alex Bennée <alex.bennee@linaro.org> wrote: > > > Richard Henderson <rth@twiddle.net> writes: > > But perhaps > > > > unsigned __int128 n = (unsigned __int128)n1 << 64 | n0; > > *r = n % d; > > return n / d; > > > > will allow the compiler to do what the assembly does for some 64-bit > > hosts. > > I wonder how much cost is incurred by the jumping to the (libgcc?) div > helper? Anyone got an s390x about so we can benchmark the two > approaches? The project has an s390x system available; however it's usually running merge build tests so not so useful for benchmarking. (I can set up accounts on it but that requires me to faff about figuring out how to create new accounts :-)) thanks -- PMM
Peter Maydell <peter.maydell@linaro.org> writes: > On Mon, 14 Jan 2019 at 22:48, Alex Bennée <alex.bennee@linaro.org> wrote: >> >> >> Richard Henderson <rth@twiddle.net> writes: >> > But perhaps >> > >> > unsigned __int128 n = (unsigned __int128)n1 << 64 | n0; >> > *r = n % d; >> > return n / d; >> > >> > will allow the compiler to do what the assembly does for some 64-bit >> > hosts. >> >> I wonder how much cost is incurred by the jumping to the (libgcc?) div >> helper? Anyone got an s390x about so we can benchmark the two >> approaches? > > The project has an s390x system available; however it's usually > running merge build tests so not so useful for benchmarking. > (I can set up accounts on it but that requires me to faff about > figuring out how to create new accounts :-)) I'm happy to leave this up to those who care about s390x host performance (Thomas, Cornelia?). I'm just keen to avoid the divide helper getting too #ifdefy. I'll include a CONFIG_INT128 patch in my next patch queue review once I've double checked and tested under linux-user ;-) -- Alex Bennée
On 2019-01-15 15:46, Alex Bennée wrote:
>
> Peter Maydell <peter.maydell@linaro.org> writes:
>
>> On Mon, 14 Jan 2019 at 22:48, Alex Bennée <alex.bennee@linaro.org> wrote:
>>>
>>>
>>> Richard Henderson <rth@twiddle.net> writes:
>>>> But perhaps
>>>>
>>>> unsigned __int128 n = (unsigned __int128)n1 << 64 | n0;
>>>> *r = n % d;
>>>> return n / d;
>>>>
>>>> will allow the compiler to do what the assembly does for some 64-bit
>>>> hosts.
>>>
>>> I wonder how much cost is incurred by the jumping to the (libgcc?) div
>>> helper? Anyone got an s390x about so we can benchmark the two
>>> approaches?
>>
>> The project has an s390x system available; however it's usually
>> running merge build tests so not so useful for benchmarking.
>> (I can set up accounts on it but that requires me to faff about
>> figuring out how to create new accounts :-))
>
> I'm happy to leave this up to those who care about s390x host
> performance (Thomas, Cornelia?). I'm just keen to avoid the divide
> helper getting too #ifdefy.
Ok, I just did a quick'n'dirty "benchmark" on the s390x that I've got available:
#include <stdio.h>
#include <time.h>
#include <stdint.h>
uint64_t udiv_qrnnd1(uint64_t *r, uint64_t n1, uint64_t n0, uint64_t d)
{
unsigned __int128 n = (unsigned __int128)n1 << 64 | n0;
asm("dlgr %0, %1" : "+r"(n) : "r"(d));
*r = n >> 64;
return n;
}
uint64_t udiv_qrnnd2(uint64_t *r, uint64_t n1, uint64_t n0, uint64_t d)
{
unsigned __int128 n = (unsigned __int128)n1 << 64 | n0;
*r = n % d;
return n / d;
}
uint64_t udiv_qrnnd3(uint64_t *r, uint64_t n1, uint64_t n0, uint64_t d)
{
uint64_t d0, d1, q0, q1, r1, r0, m;
d0 = (uint32_t)d;
d1 = d >> 32;
r1 = n1 % d1;
q1 = n1 / d1;
m = q1 * d0;
r1 = (r1 << 32) | (n0 >> 32);
if (r1 < m) {
q1 -= 1;
r1 += d;
if (r1 >= d) {
if (r1 < m) {
q1 -= 1;
r1 += d;
}
}
}
r1 -= m;
r0 = r1 % d1;
q0 = r1 / d1;
m = q0 * d0;
r0 = (r0 << 32) | (uint32_t)n0;
if (r0 < m) {
q0 -= 1;
r0 += d;
if (r0 >= d) {
if (r0 < m) {
q0 -= 1;
r0 += d;
}
}
}
r0 -= m;
*r = r0;
return (q1 << 32) | q0;
}
int main()
{
uint64_t r = 0, n1 = 0, n0 = 0, d = 0;
uint64_t rs = 0, rn = 0;
clock_t start, end;
long i;
start = clock();
for (i=0; i<200000000L; i++) {
n1 += 3;
n0 += 987654321;
d += 0x123456789;
rs += udiv_qrnnd1(&r, n1, n0, d);
rn += r;
}
end = clock();
printf("test 1: time=%li\t, rs=%li , rn = %li\n", (end-start)/1000, rs, rn);
r = n1 = n0 = d = rs = rn = 0;
start = clock();
for (i=0; i<200000000L; i++) {
n1 += 3;
n0 += 987654321;
d += 0x123456789;
rs += udiv_qrnnd2(&r, n1, n0, d);
rn += r;
}
end = clock();
printf("test 2: time=%li\t, rs=%li , rn = %li\n", (end-start)/1000, rs, rn);
r = n1 = n0 = d = rs = rn = 0;
start = clock();
for (i=0; i<200000000L; i++) {
n1 += 3;
n0 += 987654321;
d += 0x123456789;
rs += udiv_qrnnd3(&r, n1, n0, d);
rn += r;
}
end = clock();
printf("test 3: time=%li\t, rs=%li , rn = %li\n", (end-start)/1000, rs, rn);
return 0;
}
... and results with GCC v8.2.1 are (using -O2):
test 1: time=609 , rs=2264924160200000000 , rn = 6136218997527160832
test 2: time=10127 , rs=2264924160200000000 , rn = 6136218997527160832
test 3: time=2350 , rs=2264924183048928865 , rn = 4842822048162311089
Thus the int128 version is the slowest!
... but at least it gives the same results as the DLGR instruction. The 64-bit
version gives different results - do we have a bug here?
Results with Clang v7.0.1 (using -O2, too) are these:
test 2: time=5035 , rs=2264924160200000000 , rn = 6136218997527160832
test 3: time=1970 , rs=2264924183048928865 , rn = 4842822048162311089
Thomas
Thomas Huth <thuth@redhat.com> writes:
> On 2019-01-15 15:46, Alex Bennée wrote:
>>
>> Peter Maydell <peter.maydell@linaro.org> writes:
>>
>>> On Mon, 14 Jan 2019 at 22:48, Alex Bennée <alex.bennee@linaro.org> wrote:
>>>>
>>>>
>>>> Richard Henderson <rth@twiddle.net> writes:
>>>>> But perhaps
>>>>>
>>>>> unsigned __int128 n = (unsigned __int128)n1 << 64 | n0;
>>>>> *r = n % d;
>>>>> return n / d;
>>>>>
>>>>> will allow the compiler to do what the assembly does for some 64-bit
>>>>> hosts.
>>>>
>>>> I wonder how much cost is incurred by the jumping to the (libgcc?) div
>>>> helper? Anyone got an s390x about so we can benchmark the two
>>>> approaches?
>>>
>>> The project has an s390x system available; however it's usually
>>> running merge build tests so not so useful for benchmarking.
>>> (I can set up accounts on it but that requires me to faff about
>>> figuring out how to create new accounts :-))
>>
>> I'm happy to leave this up to those who care about s390x host
>> performance (Thomas, Cornelia?). I'm just keen to avoid the divide
>> helper getting too #ifdefy.
>
> Ok, I just did a quick'n'dirty "benchmark" on the s390x that I've got
> available:
Ahh I should have mentioned we already have the technology for this ;-)
If you build the fpu/next tree on a s390x you can then run:
./tests/fp/fp-bench f64_div
with and without the CONFIG_128 path. To get an idea of the real world
impact you can compile a foreign binary and run it on a s390x system
with:
$QEMU ./tests/fp/fp-bench f64_div -t host
And that will give you the peak performance assuming your program is
doing nothing but f64_div operations. If the two QEMU's are basically in
the same ballpark then it doesn't make enough difference. That said:
> #include <stdio.h>
> #include <time.h>
> #include <stdint.h>
>
> uint64_t udiv_qrnnd1(uint64_t *r, uint64_t n1, uint64_t n0, uint64_t d)
> {
> unsigned __int128 n = (unsigned __int128)n1 << 64 | n0;
> asm("dlgr %0, %1" : "+r"(n) : "r"(d));
> *r = n >> 64;
> return n;
> }
>
> uint64_t udiv_qrnnd2(uint64_t *r, uint64_t n1, uint64_t n0, uint64_t d)
> {
> unsigned __int128 n = (unsigned __int128)n1 << 64 | n0;
> *r = n % d;
> return n / d;
> }
>
<snip>
>
> int main()
> {
> uint64_t r = 0, n1 = 0, n0 = 0, d = 0;
> uint64_t rs = 0, rn = 0;
> clock_t start, end;
> long i;
>
> start = clock();
> for (i=0; i<200000000L; i++) {
> n1 += 3;
> n0 += 987654321;
> d += 0x123456789;
> rs += udiv_qrnnd1(&r, n1, n0, d);
> rn += r;
> }
> end = clock();
> printf("test 1: time=%li\t, rs=%li , rn = %li\n", (end-start)/1000, rs, rn);
>
> r = n1 = n0 = d = rs = rn = 0;
>
> start = clock();
> for (i=0; i<200000000L; i++) {
> n1 += 3;
> n0 += 987654321;
> d += 0x123456789;
> rs += udiv_qrnnd2(&r, n1, n0, d);
> rn += r;
> }
> end = clock();
> printf("test 2: time=%li\t, rs=%li , rn = %li\n", (end-start)/1000, rs, rn);
>
> r = n1 = n0 = d = rs = rn = 0;
>
> start = clock();
> for (i=0; i<200000000L; i++) {
> n1 += 3;
> n0 += 987654321;
> d += 0x123456789;
> rs += udiv_qrnnd3(&r, n1, n0, d);
> rn += r;
> }
> end = clock();
> printf("test 3: time=%li\t, rs=%li , rn = %li\n", (end-start)/1000, rs, rn);
>
> return 0;
> }
>
> ... and results with GCC v8.2.1 are (using -O2):
>
> test 1: time=609 , rs=2264924160200000000 , rn = 6136218997527160832
> test 2: time=10127 , rs=2264924160200000000 , rn = 6136218997527160832
> test 3: time=2350 , rs=2264924183048928865 , rn = 4842822048162311089
>
> Thus the int128 version is the slowest!
I'd expect a little slow down due to the indirection into libgcc.. but
that seems pretty high.
>
> ... but at least it gives the same results as the DLGR instruction. The 64-bit
> version gives different results - do we have a bug here?
>
> Results with Clang v7.0.1 (using -O2, too) are these:
>
> test 2: time=5035 , rs=2264924160200000000 , rn = 6136218997527160832
> test 3: time=1970 , rs=2264924183048928865 , rn =
> 4842822048162311089
You can run:
./tests/fp/fp-test f64_div -l 2 -r all
For a proper comprehensive test.
--
Alex Bennée
On Tue, Jan 15, 2019 at 16:01:32 +0000, Alex Bennée wrote: > Ahh I should have mentioned we already have the technology for this ;-) > > If you build the fpu/next tree on a s390x you can then run: > > ./tests/fp/fp-bench f64_div > > with and without the CONFIG_128 path. To get an idea of the real world > impact you can compile a foreign binary and run it on a s390x system > with: > > $QEMU ./tests/fp/fp-bench f64_div -t host > > And that will give you the peak performance assuming your program is > doing nothing but f64_div operations. If the two QEMU's are basically in > the same ballpark then it doesn't make enough difference. That said: I think you mean here `tests/fp/fp-bench -o div -p double', otherwise you'll get the default op (-o add). Thanks, E.
On 2019-01-15 21:05, Emilio G. Cota wrote: > On Tue, Jan 15, 2019 at 16:01:32 +0000, Alex Bennée wrote: >> Ahh I should have mentioned we already have the technology for this ;-) >> >> If you build the fpu/next tree on a s390x you can then run: >> >> ./tests/fp/fp-bench f64_div >> >> with and without the CONFIG_128 path. To get an idea of the real world >> impact you can compile a foreign binary and run it on a s390x system >> with: >> >> $QEMU ./tests/fp/fp-bench f64_div -t host >> >> And that will give you the peak performance assuming your program is >> doing nothing but f64_div operations. If the two QEMU's are basically in >> the same ballpark then it doesn't make enough difference. That said: > > I think you mean here `tests/fp/fp-bench -o div -p double', otherwise > you'll get the default op (-o add). I tried that now, too, and -o div -p double does not really seem to exercise this function at all. Here are my results (disclaimer: that system is likely not really usable for benchmarks since it's CPUs are shared with other LPARs, but I ran all the tests at least twice and got similar results): With the DGLR inline assembly: time ./fp-test f64_div -l 2 -r all real 6m43,648s user 6m43,362s sys 0m0,160s time ./fp-bench -o div -p double 204.98 MFlops real 0m1,002s user 0m1,001s sys 0m0,001s With the "#else" default 64-bit code: time ./fp-test f64_div -l 2 -r all real 6m44,910s user 6m44,616s sys 0m0,165s time ./fp-bench -o div -p double 205.41 MFlops real 0m1,002s user 0m1,001s sys 0m0,001s With the new CONFIG_INT128 code: time ./fp-test f64_div -l 2 -r all real 6m58,371s user 6m58,078s sys 0m0,164s time ./fp-bench -o div -p double 205.17 MFlops real 0m1,002s user 0m1,000s sys 0m0,001s ==> The new CONFIG_INT128 code is really worse than the 64-bit code, so I don't think we should include this yet (unless we know a system where the compiler can create optimized assembly code without libgcc here). Thomas
Thomas Huth <thuth@redhat.com> writes: > On 2019-01-15 21:05, Emilio G. Cota wrote: >> On Tue, Jan 15, 2019 at 16:01:32 +0000, Alex Bennée wrote: >>> Ahh I should have mentioned we already have the technology for this ;-) >>> >>> If you build the fpu/next tree on a s390x you can then run: >>> >>> ./tests/fp/fp-bench f64_div >>> >>> with and without the CONFIG_128 path. To get an idea of the real world >>> impact you can compile a foreign binary and run it on a s390x system >>> with: >>> >>> $QEMU ./tests/fp/fp-bench f64_div -t host >>> >>> And that will give you the peak performance assuming your program is >>> doing nothing but f64_div operations. If the two QEMU's are basically in >>> the same ballpark then it doesn't make enough difference. That said: >> >> I think you mean here `tests/fp/fp-bench -o div -p double', otherwise >> you'll get the default op (-o add). > > I tried that now, too, and -o div -p double does not really seem to > exercise this function at all. How do you mean? It should do because by default it should be calling the softfloat implementations. > Here are my results (disclaimer: that system is likely not really usable > for benchmarks since it's CPUs are shared with other LPARs, but I ran > all the tests at least twice and got similar results): > > > With the DGLR inline assembly: > <snip> > time ./fp-bench -o div -p double > 204.98 MFlops <snip> > With the "#else" default 64-bit code: > <snip> > time ./fp-bench -o div -p double > 205.41 MFlops <snip> > With the new CONFIG_INT128 code: > <snip> > time ./fp-bench -o div -p double > 205.17 MFlops <snip> > > > ==> The new CONFIG_INT128 code is really worse than the 64-bit code, so > I don't think we should include this yet (unless we know a system where > the compiler can create optimized assembly code without libgcc here). I mean to me that looks like it is easily in the noise range and that the dglr instruction didn't actually beat the unrolled 64 bit code - which is just weird. -- Alex Bennée
On 2019-01-16 18:08, Alex Bennée wrote:
>
> Thomas Huth <thuth@redhat.com> writes:
>
>> On 2019-01-15 21:05, Emilio G. Cota wrote:
>>> On Tue, Jan 15, 2019 at 16:01:32 +0000, Alex Bennée wrote:
>>>> Ahh I should have mentioned we already have the technology for this ;-)
>>>>
>>>> If you build the fpu/next tree on a s390x you can then run:
>>>>
>>>> ./tests/fp/fp-bench f64_div
>>>>
>>>> with and without the CONFIG_128 path. To get an idea of the real world
>>>> impact you can compile a foreign binary and run it on a s390x system
>>>> with:
>>>>
>>>> $QEMU ./tests/fp/fp-bench f64_div -t host
>>>>
>>>> And that will give you the peak performance assuming your program is
>>>> doing nothing but f64_div operations. If the two QEMU's are basically in
>>>> the same ballpark then it doesn't make enough difference. That said:
>>>
>>> I think you mean here `tests/fp/fp-bench -o div -p double', otherwise
>>> you'll get the default op (-o add).
>>
>> I tried that now, too, and -o div -p double does not really seem to
>> exercise this function at all.
>
> How do you mean? It should do because by default it should be calling
> the softfloat implementations.
I've added a puts("hello") into the udiv_qrnd() function. When I then
run "fp-bench -o div -p double", it only prints out "hello" a single
time, so the function is only called once during the whole test.
Thomas
Thomas Huth <thuth@redhat.com> writes:
> On 2019-01-16 18:08, Alex Bennée wrote:
>>
>> Thomas Huth <thuth@redhat.com> writes:
>>
>>> On 2019-01-15 21:05, Emilio G. Cota wrote:
>>>> On Tue, Jan 15, 2019 at 16:01:32 +0000, Alex Bennée wrote:
>>>>> Ahh I should have mentioned we already have the technology for this ;-)
>>>>>
>>>>> If you build the fpu/next tree on a s390x you can then run:
>>>>>
>>>>> ./tests/fp/fp-bench f64_div
>>>>>
>>>>> with and without the CONFIG_128 path. To get an idea of the real world
>>>>> impact you can compile a foreign binary and run it on a s390x system
>>>>> with:
>>>>>
>>>>> $QEMU ./tests/fp/fp-bench f64_div -t host
>>>>>
>>>>> And that will give you the peak performance assuming your program is
>>>>> doing nothing but f64_div operations. If the two QEMU's are basically in
>>>>> the same ballpark then it doesn't make enough difference. That said:
>>>>
>>>> I think you mean here `tests/fp/fp-bench -o div -p double', otherwise
>>>> you'll get the default op (-o add).
>>>
>>> I tried that now, too, and -o div -p double does not really seem to
>>> exercise this function at all.
>>
>> How do you mean? It should do because by default it should be calling
>> the softfloat implementations.
>
> I've added a puts("hello") into the udiv_qrnd() function. When I then
> run "fp-bench -o div -p double", it only prints out "hello" a single
> time, so the function is only called once during the whole test.
That's very odd. With the following on my aarch64 box:
modified include/fpu/softfloat-macros.h
@@ -637,6 +637,8 @@ static inline uint64_t estimateDiv128To64(uint64_t a0, uint64_t a1, uint64_t b)
static inline uint64_t udiv_qrnnd(uint64_t *r, uint64_t n1,
uint64_t n0, uint64_t d)
{
+ static int iter = 0;
+ fprintf(stderr, "%s: %d\n", __func__, iter++);
#if defined(__x86_64__)
uint64_t q;
asm("divq %4" : "=a"(q), "=d"(*r) : "0"(n0), "1"(n1), "rm"(d));
I get:
udiv_qrnnd: 0
udiv_qrnnd: 1
..
..<all the way to>
udiv_qrnnd: 99998
udiv_qrnnd: 99999
So I'm not sure what is different on s390
--
Alex Bennée
On Wed, Jan 16, 2019 at 07:33:28 +0100, Thomas Huth wrote: > On 2019-01-15 21:05, Emilio G. Cota wrote: > > On Tue, Jan 15, 2019 at 16:01:32 +0000, Alex Bennée wrote: > >> Ahh I should have mentioned we already have the technology for this ;-) > >> > >> If you build the fpu/next tree on a s390x you can then run: > >> > >> ./tests/fp/fp-bench f64_div > >> > >> with and without the CONFIG_128 path. To get an idea of the real world > >> impact you can compile a foreign binary and run it on a s390x system > >> with: > >> > >> $QEMU ./tests/fp/fp-bench f64_div -t host > >> > >> And that will give you the peak performance assuming your program is > >> doing nothing but f64_div operations. If the two QEMU's are basically in > >> the same ballpark then it doesn't make enough difference. That said: > > > > I think you mean here `tests/fp/fp-bench -o div -p double', otherwise > > you'll get the default op (-o add). > > I tried that now, too, and -o div -p double does not really seem to > exercise this function at all. You can check what is being called then with perf record/report. E.
On 1/16/19 2:29 AM, Thomas Huth wrote: > ... but at least it gives the same results as the DLGR instruction. The 64-bit > version gives different results - do we have a bug here? Yes, on your inputs. udiv_qrnnd3 requires that D be "normalized", i.e. have the most siginificant bit set. (And thus N1:N0 shifted left to match, and the returned remainder shifted right to match.) r~
On 1/14/19 11:12 PM, Thomas Huth wrote:
> Clang v7.0.1 does not like the __int128 variable type for inline
> assembly on s390x:
>
> In file included from fpu/softfloat.c:97:
> include/fpu/softfloat-macros.h:647:9: error: inline asm error:
> This value type register class is not natively supported!
> asm("dlgr %0, %1" : "+r"(n) : "r"(d));
> ^
>
> Disable this code part there now when compiling with Clang, so that
> the generic code gets used instead.
>
> Signed-off-by: Thomas Huth <thuth@redhat.com>
> ---
> include/fpu/softfloat-macros.h | 2 +-
> 1 file changed, 1 insertion(+), 1 deletion(-)
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Regardless of whatever follow-up we might do wrt CONFIG_INT128.
r~
On Mon, 14 Jan 2019 13:12:35 +0100
Thomas Huth <thuth@redhat.com> wrote:
> Clang v7.0.1 does not like the __int128 variable type for inline
> assembly on s390x:
>
> In file included from fpu/softfloat.c:97:
> include/fpu/softfloat-macros.h:647:9: error: inline asm error:
> This value type register class is not natively supported!
> asm("dlgr %0, %1" : "+r"(n) : "r"(d));
> ^
>
> Disable this code part there now when compiling with Clang, so that
> the generic code gets used instead.
>
> Signed-off-by: Thomas Huth <thuth@redhat.com>
> ---
> include/fpu/softfloat-macros.h | 2 +-
> 1 file changed, 1 insertion(+), 1 deletion(-)
>
> diff --git a/include/fpu/softfloat-macros.h b/include/fpu/softfloat-macros.h
> index b1d772e..bd5b641 100644
> --- a/include/fpu/softfloat-macros.h
> +++ b/include/fpu/softfloat-macros.h
> @@ -641,7 +641,7 @@ static inline uint64_t udiv_qrnnd(uint64_t *r, uint64_t n1,
> uint64_t q;
> asm("divq %4" : "=a"(q), "=d"(*r) : "0"(n0), "1"(n1), "rm"(d));
> return q;
> -#elif defined(__s390x__)
> +#elif defined(__s390x__) && !defined(__clang__)
> /* Need to use a TImode type to get an even register pair for DLGR. */
> unsigned __int128 n = (unsigned __int128)n1 << 64 | n0;
> asm("dlgr %0, %1" : "+r"(n) : "r"(d));
Ok, so what's the deal with this patch now? Fix compilation now,
optimize later?
If yes, should I pick it as an s390x build fix (I plan to send a pull
request later this week), or will the fpu maintainers pick it?
Cornelia Huck <cohuck@redhat.com> writes:
> On Mon, 14 Jan 2019 13:12:35 +0100
> Thomas Huth <thuth@redhat.com> wrote:
>
<snip>
>>
>> diff --git a/include/fpu/softfloat-macros.h b/include/fpu/softfloat-macros.h
>> index b1d772e..bd5b641 100644
>> --- a/include/fpu/softfloat-macros.h
>> +++ b/include/fpu/softfloat-macros.h
>> @@ -641,7 +641,7 @@ static inline uint64_t udiv_qrnnd(uint64_t *r, uint64_t n1,
>> uint64_t q;
>> asm("divq %4" : "=a"(q), "=d"(*r) : "0"(n0), "1"(n1), "rm"(d));
>> return q;
>> -#elif defined(__s390x__)
>> +#elif defined(__s390x__) && !defined(__clang__)
>> /* Need to use a TImode type to get an even register pair for DLGR. */
>> unsigned __int128 n = (unsigned __int128)n1 << 64 | n0;
>> asm("dlgr %0, %1" : "+r"(n) : "r"(d));
>
> Ok, so what's the deal with this patch now? Fix compilation now,
> optimize later?
>
> If yes, should I pick it as an s390x build fix (I plan to send a pull
> request later this week), or will the fpu maintainers pick it?
I'm planning to send a FPU PR tomorrow and I'll happily include either
version.
I'm personally minded to go with the patch that makes s390 (and others)
fall back to the generic CONFIG_INT128 code. The numbers Thomas gathered
didn't look like it was much difference either way.
Unless you *really* care about milking that last bit of performance out of
the s390 TCG back-end?
--
Alex Bennée
On 2019-01-16 18:16, Alex Bennée wrote:
>
> Cornelia Huck <cohuck@redhat.com> writes:
>
>> On Mon, 14 Jan 2019 13:12:35 +0100
>> Thomas Huth <thuth@redhat.com> wrote:
>>
> <snip>
>>>
>>> diff --git a/include/fpu/softfloat-macros.h b/include/fpu/softfloat-macros.h
>>> index b1d772e..bd5b641 100644
>>> --- a/include/fpu/softfloat-macros.h
>>> +++ b/include/fpu/softfloat-macros.h
>>> @@ -641,7 +641,7 @@ static inline uint64_t udiv_qrnnd(uint64_t *r, uint64_t n1,
>>> uint64_t q;
>>> asm("divq %4" : "=a"(q), "=d"(*r) : "0"(n0), "1"(n1), "rm"(d));
>>> return q;
>>> -#elif defined(__s390x__)
>>> +#elif defined(__s390x__) && !defined(__clang__)
>>> /* Need to use a TImode type to get an even register pair for DLGR. */
>>> unsigned __int128 n = (unsigned __int128)n1 << 64 | n0;
>>> asm("dlgr %0, %1" : "+r"(n) : "r"(d));
>>
>> Ok, so what's the deal with this patch now? Fix compilation now,
>> optimize later?
>>
>> If yes, should I pick it as an s390x build fix (I plan to send a pull
>> request later this week), or will the fpu maintainers pick it?
>
> I'm planning to send a FPU PR tomorrow and I'll happily include either
> version.
>
> I'm personally minded to go with the patch that makes s390 (and others)
> fall back to the generic CONFIG_INT128 code. The numbers Thomas gathered
> didn't look like it was much difference either way.
>
> Unless you *really* care about milking that last bit of performance out of
> the s390 TCG back-end?
I don't think that anybody buys a mainframe for this special case ;-)
I'd go with the !defined(__clang__) patch above.
Thomas
On Mon, 14 Jan 2019 13:12:35 +0100
Thomas Huth <thuth@redhat.com> wrote:
> Clang v7.0.1 does not like the __int128 variable type for inline
> assembly on s390x:
>
> In file included from fpu/softfloat.c:97:
> include/fpu/softfloat-macros.h:647:9: error: inline asm error:
> This value type register class is not natively supported!
> asm("dlgr %0, %1" : "+r"(n) : "r"(d));
> ^
>
> Disable this code part there now when compiling with Clang, so that
> the generic code gets used instead.
>
> Signed-off-by: Thomas Huth <thuth@redhat.com>
> ---
> include/fpu/softfloat-macros.h | 2 +-
> 1 file changed, 1 insertion(+), 1 deletion(-)
>
> diff --git a/include/fpu/softfloat-macros.h b/include/fpu/softfloat-macros.h
> index b1d772e..bd5b641 100644
> --- a/include/fpu/softfloat-macros.h
> +++ b/include/fpu/softfloat-macros.h
> @@ -641,7 +641,7 @@ static inline uint64_t udiv_qrnnd(uint64_t *r, uint64_t n1,
> uint64_t q;
> asm("divq %4" : "=a"(q), "=d"(*r) : "0"(n0), "1"(n1), "rm"(d));
> return q;
> -#elif defined(__s390x__)
> +#elif defined(__s390x__) && !defined(__clang__)
> /* Need to use a TImode type to get an even register pair for DLGR. */
> unsigned __int128 n = (unsigned __int128)n1 << 64 | n0;
> asm("dlgr %0, %1" : "+r"(n) : "r"(d));
Acked-by: Cornelia Huck <cohuck@redhat.com>
...as this fixed that s390x clang problem for me.
© 2016 - 2025 Red Hat, Inc.