[PATCH 2/2] util/bufferiszero: Add simd acceleration for loongarch64

Bibo Mao posted 2 patches 5 months, 3 weeks ago
Maintainers: Richard Henderson <richard.henderson@linaro.org>, Paolo Bonzini <pbonzini@redhat.com>, "Marc-André Lureau" <marcandre.lureau@redhat.com>, "Daniel P. Berrangé" <berrange@redhat.com>, Thomas Huth <thuth@redhat.com>, "Philippe Mathieu-Daudé" <philmd@linaro.org>
[PATCH 2/2] util/bufferiszero: Add simd acceleration for loongarch64
Posted by Bibo Mao 5 months, 3 weeks ago
Different gcc versions have different features; macros CONFIG_LSX_OPT
and CONFIG_LASX_OPT are added here to detect whether gcc supports the
built-in lsx/lasx intrinsics.

Function buffer_zero_lsx() is added for the 128-bit SIMD FPU optimization,
and function buffer_zero_lasx() is added for the 256-bit SIMD FPU optimization.

The LoongArch gcc built-in lsx/lasx intrinsics can be used only when the
compiler options -mlsx/-mlasx are given, and there is no separate
per-function compiler option. So this optimization takes effect only when
qemu is compiled with --extra-cflags="-mlasx"

Signed-off-by: Bibo Mao <maobibo@loongson.cn>
---
 meson.build         |  11 +++++
 util/bufferiszero.c | 103 ++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 114 insertions(+)

diff --git a/meson.build b/meson.build
index 6386607144..29bc362d7a 100644
--- a/meson.build
+++ b/meson.build
@@ -2855,6 +2855,17 @@ config_host_data.set('CONFIG_ARM_AES_BUILTIN', cc.compiles('''
     void foo(uint8x16_t *p) { *p = vaesmcq_u8(*p); }
   '''))
 
+# For Loongarch64, detect if LSX/LASX are available.
+config_host_data.set('CONFIG_LSX_OPT', cc.compiles('''
+    #include "lsxintrin.h"
+    int foo(__m128i v) { return __lsx_bz_v(v); }
+  '''))
+
+config_host_data.set('CONFIG_LASX_OPT', cc.compiles('''
+    #include "lasxintrin.h"
+    int foo(__m256i v) { return __lasx_xbz_v(v); }
+  '''))
+
 if get_option('membarrier').disabled()
   have_membarrier = false
 elif host_os == 'windows'
diff --git a/util/bufferiszero.c b/util/bufferiszero.c
index 74864f7b78..751e81dbb3 100644
--- a/util/bufferiszero.c
+++ b/util/bufferiszero.c
@@ -265,6 +265,109 @@ static biz_accel_fn const accel_table[] = {
     buffer_is_zero_int_ge256,
     buffer_is_zero_simd,
 };
+#elif defined(__loongarch__)
+#ifdef CONFIG_LSX_OPT
+#include "lsxintrin.h"
+static bool buffer_zero_lsx(const void *buf, size_t len)
+{
+    /* Unaligned loads at head/tail.  */
+    __m128i v = *(__m128i *)(buf);
+    __m128i w = *(__m128i *)(buf + len - 16);
+    /* Align head/tail to 16-byte boundaries.  */
+    const __m128i *p = QEMU_ALIGN_PTR_DOWN(buf + 16, 16);
+    const __m128i *e = QEMU_ALIGN_PTR_DOWN(buf + len - 1, 16);
+
+    /* Collect a partial block at tail end.  */
+    v |= e[-1]; w |= e[-2];
+    v |= e[-3]; w |= e[-4];
+    v |= e[-5]; w |= e[-6];
+    v |= e[-7]; v |= w;
+
+    /*
+     * Loop over complete 128-byte blocks.
+     * With the head and tail removed, e - p >= 14, so the loop
+     * must iterate at least once.
+     */
+    do {
+        if (!__lsx_bz_v(v)) {
+            return false;
+        }
+        v = p[0];  w = p[1];
+        v |= p[2]; w |= p[3];
+        v |= p[4]; w |= p[5];
+        v |= p[6]; w |= p[7];
+        v |= w;
+        p += 8;
+    } while (p < e - 7);
+
+    return __lsx_bz_v(v);
+}
+#endif
+
+#ifdef CONFIG_LASX_OPT
+#include "lasxintrin.h"
+static bool buffer_zero_lasx(const void *buf, size_t len)
+{
+    /* Unaligned loads at head/tail.  */
+    __m256i v = *(__m256i *)(buf);
+    __m256i w = *(__m256i *)(buf + len - 32);
+    /* Align head/tail to 32-byte boundaries.  */
+    const __m256i *p = QEMU_ALIGN_PTR_DOWN(buf + 32, 32);
+    const __m256i *e = QEMU_ALIGN_PTR_DOWN(buf + len - 1, 32);
+
+    /* Collect a partial block at tail end.  */
+    v |= e[-1]; w |= e[-2];
+    v |= e[-3]; w |= e[-4];
+    v |= e[-5]; w |= e[-6];
+    v |= e[-7]; v |= w;
+
+    /* Loop over complete 256-byte blocks.  */
+    for (; p < e - 7; p += 8) {
+        /* PTEST is not profitable here.  */
+        if (!__lasx_xbz_v(v)) {
+            return false;
+        }
+
+        v = p[0];  w = p[1];
+        v |= p[2]; w |= p[3];
+        v |= p[4]; w |= p[5];
+        v |= p[6]; w |= p[7];
+        v |= w;
+    }
+
+    return __lasx_xbz_v(v);
+}
+#endif
+
+static biz_accel_fn const accel_table[] = {
+    buffer_is_zero_int_ge256,
+#ifdef CONFIG_LSX_OPT
+    buffer_zero_lsx,
+#endif
+#ifdef CONFIG_LASX_OPT
+    buffer_zero_lasx,
+#endif
+};
+
+static unsigned best_accel(void)
+{
+    unsigned info = cpuinfo_init();
+
+    /* CONFIG_LSX_OPT must be enabled if CONFIG_LASX_OPT is enabled */
+#ifdef CONFIG_LASX_OPT
+    if (info & CPUINFO_LASX) {
+        return 2;
+    }
+#endif
+
+#ifdef CONFIG_LSX_OPT
+    if (info & CPUINFO_LSX) {
+        return 1;
+    }
+#endif
+
+    return 0;
+}
 #else
 #define best_accel() 0
 static biz_accel_fn const accel_table[1] = {
-- 
2.39.3
Re: [PATCH 2/2] util/bufferiszero: Add simd acceleration for loongarch64
Posted by Richard Henderson 5 months, 3 weeks ago
On 6/5/24 02:32, Bibo Mao wrote:
> Different gcc versions have different features, macro CONFIG_LSX_OPT
> and CONFIG_LASX_OPT is added here to detect whether gcc supports
> built-in lsx/lasx macro.
> 
> Function buffer_zero_lsx() is added for 128bit simd fpu optimization,
> and function buffer_zero_lasx() is for 256bit simd fpu optimization.
> 
> Loongarch gcc built-in lsx/lasx macro can be used only when compiler
> option -mlsx/-mlasx is added, and there is no separate compiler option
> for function only. So it is only in effect when qemu is compiled with
> parameter --extra-cflags="-mlasx"
> 
> Signed-off-by: Bibo Mao <maobibo@loongson.cn>
> ---
>   meson.build         |  11 +++++
>   util/bufferiszero.c | 103 ++++++++++++++++++++++++++++++++++++++++++++
>   2 files changed, 114 insertions(+)
> 
> diff --git a/meson.build b/meson.build
> index 6386607144..29bc362d7a 100644
> --- a/meson.build
> +++ b/meson.build
> @@ -2855,6 +2855,17 @@ config_host_data.set('CONFIG_ARM_AES_BUILTIN', cc.compiles('''
>       void foo(uint8x16_t *p) { *p = vaesmcq_u8(*p); }
>     '''))
>   
> +# For Loongarch64, detect if LSX/LASX are available.
> + config_host_data.set('CONFIG_LSX_OPT', cc.compiles('''
> +    #include "lsxintrin.h"
> +    int foo(__m128i v) { return __lsx_bz_v(v); }
> +  '''))
> +
> +config_host_data.set('CONFIG_LASX_OPT', cc.compiles('''
> +    #include "lasxintrin.h"
> +    int foo(__m256i v) { return __lasx_xbz_v(v); }
> +  '''))

Both of these are introduced by gcc 14 and llvm 18, so I'm not certain of the utility of 
separate tests.  We might simplify this with

   config_host_data.set('CONFIG_LSX_LASX_INTRIN_H',
     cc.has_header('lsxintrin.h') && cc.has_header('lasxintrin.h'))


As you say, these headers require vector instructions to be enabled at compile-time rather 
than detecting them at runtime.  This is a point where the compilers could be improved to 
support __attribute__((target("xyz"))) and the builtins with that.  The i386 port does 
this, for instance.

In the meantime, it means that you don't need a runtime test.  Similar to aarch64 and the 
use of __ARM_NEON as a compile-time test for simd support.  Perhaps

#elif defined(CONFIG_LSX_LASX_INTRIN_H) && \
       (defined(__loongarch_sx) || defined(__loongarch_asx))
# ifdef __loongarch_sx
   ...
# endif
# ifdef __loongarch_asx
   ...
# endif


The actual code is perfectly fine, of course, since it follows the pattern from the 
others.  How much improvement do you see from bufferiszero-bench?


r~
Re: [PATCH 2/2] util/bufferiszero: Add simd acceleration for loongarch64
Posted by maobibo 5 months, 3 weeks ago

On 2024/6/6 上午7:51, Richard Henderson wrote:
> On 6/5/24 02:32, Bibo Mao wrote:
>> Different gcc versions have different features, macro CONFIG_LSX_OPT
>> and CONFIG_LASX_OPT is added here to detect whether gcc supports
>> built-in lsx/lasx macro.
>>
>> Function buffer_zero_lsx() is added for 128bit simd fpu optimization,
>> and function buffer_zero_lasx() is for 256bit simd fpu optimization.
>>
>> Loongarch gcc built-in lsx/lasx macro can be used only when compiler
>> option -mlsx/-mlasx is added, and there is no separate compiler option
>> for function only. So it is only in effect when qemu is compiled with
>> parameter --extra-cflags="-mlasx"
>>
>> Signed-off-by: Bibo Mao <maobibo@loongson.cn>
>> ---
>>   meson.build         |  11 +++++
>>   util/bufferiszero.c | 103 ++++++++++++++++++++++++++++++++++++++++++++
>>   2 files changed, 114 insertions(+)
>>
>> diff --git a/meson.build b/meson.build
>> index 6386607144..29bc362d7a 100644
>> --- a/meson.build
>> +++ b/meson.build
>> @@ -2855,6 +2855,17 @@ config_host_data.set('CONFIG_ARM_AES_BUILTIN', 
>> cc.compiles('''
>>       void foo(uint8x16_t *p) { *p = vaesmcq_u8(*p); }
>>     '''))
>> +# For Loongarch64, detect if LSX/LASX are available.
>> + config_host_data.set('CONFIG_LSX_OPT', cc.compiles('''
>> +    #include "lsxintrin.h"
>> +    int foo(__m128i v) { return __lsx_bz_v(v); }
>> +  '''))
>> +
>> +config_host_data.set('CONFIG_LASX_OPT', cc.compiles('''
>> +    #include "lasxintrin.h"
>> +    int foo(__m256i v) { return __lasx_xbz_v(v); }
>> +  '''))
> 
> Both of these are introduced by gcc 14 and llvm 18, so I'm not certain 
> of the utility of separate tests.  We might simplify this with
> 
>    config_host_data.set('CONFIG_LSX_LASX_INTRIN_H',
>      cc.has_header('lsxintrin.h') && cc.has_header('lasxintrin.h'))
> 
> 
> As you say, these headers require vector instructions to be enabled at 
> compile-time rather than detecting them at runtime.  This is a point 
> where the compilers could be improved to support 
> __attribute__((target("xyz"))) and the builtins with that.  The i386 
> port does this, for instance.
> 
> In the meantime, it means that you don't need a runtime test.  Similar 
> to aarch64 and the use of __ARM_NEON as a compile-time test for simd 
> support.  Perhaps
> 
> #elif defined(CONFIG_LSX_LASX_INTRIN_H) && \
>        (defined(__loongarch_sx) || defined(__loongarch_asx))
> # ifdef __loongarch_sx
>    ...
> # endif
> # ifdef __loongarch_asx
>    ...
> # endif
Sure, will do in this way.
And also there is runtime check coming from hwcap, such this:

unsigned info = cpuinfo_init();
   if (info & CPUINFO_LASX)

> 
> 
> The actual code is perfectly fine, of course, since it follows the 
> pattern from the others.  How much improvement do you see from 
> bufferiszero-bench?
yes, it is much easier to follow others, it is not new things.

Here is the benchmark result: no obvious improvement with 1K
buffer size; 200% improvement with LASX and 100% improvement with LSX
with 16K page size.

# /root/src/qemu/b/tests/bench/bufferiszero-bench --tap -k
# Start of cutils tests
# Start of bufferiszero tests
# buffer_is_zero #0:  1KB    13460 MB/sec
# buffer_is_zero #0:  4KB    36857 MB/sec
# buffer_is_zero #0: 16KB    69884 MB/sec
# buffer_is_zero #0: 64KB    80863 MB/sec
#
# buffer_is_zero #1:  1KB    11180 MB/sec
# buffer_is_zero #1:  4KB    27972 MB/sec
# buffer_is_zero #1: 16KB    42951 MB/sec
# buffer_is_zero #1: 64KB    43293 MB/sec
#
# buffer_is_zero #2:  1KB    10026 MB/sec
# buffer_is_zero #2:  4KB    18373 MB/sec
# buffer_is_zero #2: 16KB    23933 MB/sec
# buffer_is_zero #2: 64KB    25180 MB/sec

Regards
Bibo Mao

> 
> 
> r~


Re: [PATCH 2/2] util/bufferiszero: Add simd acceleration for loongarch64
Posted by Richard Henderson 5 months, 3 weeks ago
On 6/5/24 19:30, maobibo wrote:
> 
> 
> On 2024/6/6 上午7:51, Richard Henderson wrote:
>> On 6/5/24 02:32, Bibo Mao wrote:
>>> Different gcc versions have different features, macro CONFIG_LSX_OPT
>>> and CONFIG_LASX_OPT is added here to detect whether gcc supports
>>> built-in lsx/lasx macro.
>>>
>>> Function buffer_zero_lsx() is added for 128bit simd fpu optimization,
>>> and function buffer_zero_lasx() is for 256bit simd fpu optimization.
>>>
>>> Loongarch gcc built-in lsx/lasx macro can be used only when compiler
>>> option -mlsx/-mlasx is added, and there is no separate compiler option
>>> for function only. So it is only in effect when qemu is compiled with
>>> parameter --extra-cflags="-mlasx"
>>>
>>> Signed-off-by: Bibo Mao <maobibo@loongson.cn>
>>> ---
>>>   meson.build         |  11 +++++
>>>   util/bufferiszero.c | 103 ++++++++++++++++++++++++++++++++++++++++++++
>>>   2 files changed, 114 insertions(+)
>>>
>>> diff --git a/meson.build b/meson.build
>>> index 6386607144..29bc362d7a 100644
>>> --- a/meson.build
>>> +++ b/meson.build
>>> @@ -2855,6 +2855,17 @@ config_host_data.set('CONFIG_ARM_AES_BUILTIN', cc.compiles('''
>>>       void foo(uint8x16_t *p) { *p = vaesmcq_u8(*p); }
>>>     '''))
>>> +# For Loongarch64, detect if LSX/LASX are available.
>>> + config_host_data.set('CONFIG_LSX_OPT', cc.compiles('''
>>> +    #include "lsxintrin.h"
>>> +    int foo(__m128i v) { return __lsx_bz_v(v); }
>>> +  '''))
>>> +
>>> +config_host_data.set('CONFIG_LASX_OPT', cc.compiles('''
>>> +    #include "lasxintrin.h"
>>> +    int foo(__m256i v) { return __lasx_xbz_v(v); }
>>> +  '''))
>>
>> Both of these are introduced by gcc 14 and llvm 18, so I'm not certain of the utility of 
>> separate tests.  We might simplify this with
>>
>>    config_host_data.set('CONFIG_LSX_LASX_INTRIN_H',
>>      cc.has_header('lsxintrin.h') && cc.has_header('lasxintrin.h'))
>>
>>
>> As you say, these headers require vector instructions to be enabled at compile-time 
>> rather than detecting them at runtime.  This is a point where the compilers could be 
>> improved to support __attribute__((target("xyz"))) and the builtins with that.  The i386 
>> port does this, for instance.
>>
>> In the meantime, it means that you don't need a runtime test.  Similar to aarch64 and 
>> the use of __ARM_NEON as a compile-time test for simd support.  Perhaps
>>
>> #elif defined(CONFIG_LSX_LASX_INTRIN_H) && \
>>        (defined(__loongarch_sx) || defined(__loongarch_asx))
>> # ifdef __loongarch_sx
>>    ...
>> # endif
>> # ifdef __loongarch_asx
>>    ...
>> # endif
> Sure, will do in this way.
> And also there is runtime check coming from hwcap, such this:
> 
> unsigned info = cpuinfo_init();
>    if (info & CPUINFO_LASX)

static biz_accel_fn const accel_table[] = {
     buffer_is_zero_int_ge256,
#ifdef __loongarch_sx
     buffer_is_zero_lsx,
#endif
#ifdef __loongarch_asx
     buffer_is_zero_lasx,
#endif
};

static unsigned best_accel(void)
{
#ifdef __loongarch_asx
     /* lasx may be index 1 or 2, but always last */
     return ARRAY_SIZE(accel_table) - 1;
#else
     /* lsx is always index 1 */
     return 1;
#endif
}


r~

Re: [PATCH 2/2] util/bufferiszero: Add simd acceleration for loongarch64
Posted by Richard Henderson 5 months, 3 weeks ago
On 6/5/24 20:18, Richard Henderson wrote:
> On 6/5/24 19:30, maobibo wrote:
>>
>>
>> On 2024/6/6 上午7:51, Richard Henderson wrote:
>>> On 6/5/24 02:32, Bibo Mao wrote:
>>>> Different gcc versions have different features, macro CONFIG_LSX_OPT
>>>> and CONFIG_LASX_OPT is added here to detect whether gcc supports
>>>> built-in lsx/lasx macro.
>>>>
>>>> Function buffer_zero_lsx() is added for 128bit simd fpu optimization,
>>>> and function buffer_zero_lasx() is for 256bit simd fpu optimization.
>>>>
>>>> Loongarch gcc built-in lsx/lasx macro can be used only when compiler
>>>> option -mlsx/-mlasx is added, and there is no separate compiler option
>>>> for function only. So it is only in effect when qemu is compiled with
>>>> parameter --extra-cflags="-mlasx"
>>>>
>>>> Signed-off-by: Bibo Mao <maobibo@loongson.cn>
>>>> ---
>>>>   meson.build         |  11 +++++
>>>>   util/bufferiszero.c | 103 ++++++++++++++++++++++++++++++++++++++++++++
>>>>   2 files changed, 114 insertions(+)
>>>>
>>>> diff --git a/meson.build b/meson.build
>>>> index 6386607144..29bc362d7a 100644
>>>> --- a/meson.build
>>>> +++ b/meson.build
>>>> @@ -2855,6 +2855,17 @@ config_host_data.set('CONFIG_ARM_AES_BUILTIN', cc.compiles('''
>>>>       void foo(uint8x16_t *p) { *p = vaesmcq_u8(*p); }
>>>>     '''))
>>>> +# For Loongarch64, detect if LSX/LASX are available.
>>>> + config_host_data.set('CONFIG_LSX_OPT', cc.compiles('''
>>>> +    #include "lsxintrin.h"
>>>> +    int foo(__m128i v) { return __lsx_bz_v(v); }
>>>> +  '''))
>>>> +
>>>> +config_host_data.set('CONFIG_LASX_OPT', cc.compiles('''
>>>> +    #include "lasxintrin.h"
>>>> +    int foo(__m256i v) { return __lasx_xbz_v(v); }
>>>> +  '''))
>>>
>>> Both of these are introduced by gcc 14 and llvm 18, so I'm not certain of the utility 
>>> of separate tests.  We might simplify this with
>>>
>>>    config_host_data.set('CONFIG_LSX_LASX_INTRIN_H',
>>>      cc.has_header('lsxintrin.h') && cc.has_header('lasxintrin.h'))
>>>
>>>
>>> As you say, these headers require vector instructions to be enabled at compile-time 
>>> rather than detecting them at runtime.  This is a point where the compilers could be 
>>> improved to support __attribute__((target("xyz"))) and the builtins with that.  The 
>>> i386 port does this, for instance.
>>>
>>> In the meantime, it means that you don't need a runtime test.  Similar to aarch64 and 
>>> the use of __ARM_NEON as a compile-time test for simd support.  Perhaps
>>>
>>> #elif defined(CONFIG_LSX_LASX_INTRIN_H) && \
>>>        (defined(__loongarch_sx) || defined(__loongarch_asx))
>>> # ifdef __loongarch_sx
>>>    ...
>>> # endif
>>> # ifdef __loongarch_asx
>>>    ...
>>> # endif
>> Sure, will do in this way.
>> And also there is runtime check coming from hwcap, such this:
>>
>> unsigned info = cpuinfo_init();
>>    if (info & CPUINFO_LASX)
> 
> static biz_accel_fn const accel_table[] = {
>      buffer_is_zero_int_ge256,
> #ifdef __loongarch_sx
>      buffer_is_zero_lsx,
> #endif
> #ifdef __loongarch_asx
>      buffer_is_zero_lasx,
> #endif
> };
> 
> static unsigned best_accel(void)
> {
> #ifdef __loongarch_asx
>      /* lasx may be index 1 or 2, but always last */
>      return ARRAY_SIZE(accel_table) - 1;
> #else
>      /* lsx is always index 1 */
>      return 1;
> #endif
> }

It occurs to me that by accumulating host specific sections to this file, we should split 
it like the atomics.  Put each portion in host/include/*/host/bufferiszero.h.inc.

I'll send a patch set handling the existing two hosts.


r~


Re: [PATCH 2/2] util/bufferiszero: Add simd acceleration for loongarch64
Posted by maobibo 5 months, 3 weeks ago

On 2024/6/6 上午11:27, Richard Henderson wrote:
> On 6/5/24 20:18, Richard Henderson wrote:
>> On 6/5/24 19:30, maobibo wrote:
>>>
>>>
>>> On 2024/6/6 上午7:51, Richard Henderson wrote:
>>>> On 6/5/24 02:32, Bibo Mao wrote:
>>>>> Different gcc versions have different features, macro CONFIG_LSX_OPT
>>>>> and CONFIG_LASX_OPT is added here to detect whether gcc supports
>>>>> built-in lsx/lasx macro.
>>>>>
>>>>> Function buffer_zero_lsx() is added for 128bit simd fpu optimization,
>>>>> and function buffer_zero_lasx() is for 256bit simd fpu optimization.
>>>>>
>>>>> Loongarch gcc built-in lsx/lasx macro can be used only when compiler
>>>>> option -mlsx/-mlasx is added, and there is no separate compiler option
>>>>> for function only. So it is only in effect when qemu is compiled with
>>>>> parameter --extra-cflags="-mlasx"
>>>>>
>>>>> Signed-off-by: Bibo Mao <maobibo@loongson.cn>
>>>>> ---
>>>>>   meson.build         |  11 +++++
>>>>>   util/bufferiszero.c | 103 
>>>>> ++++++++++++++++++++++++++++++++++++++++++++
>>>>>   2 files changed, 114 insertions(+)
>>>>>
>>>>> diff --git a/meson.build b/meson.build
>>>>> index 6386607144..29bc362d7a 100644
>>>>> --- a/meson.build
>>>>> +++ b/meson.build
>>>>> @@ -2855,6 +2855,17 @@ 
>>>>> config_host_data.set('CONFIG_ARM_AES_BUILTIN', cc.compiles('''
>>>>>       void foo(uint8x16_t *p) { *p = vaesmcq_u8(*p); }
>>>>>     '''))
>>>>> +# For Loongarch64, detect if LSX/LASX are available.
>>>>> + config_host_data.set('CONFIG_LSX_OPT', cc.compiles('''
>>>>> +    #include "lsxintrin.h"
>>>>> +    int foo(__m128i v) { return __lsx_bz_v(v); }
>>>>> +  '''))
>>>>> +
>>>>> +config_host_data.set('CONFIG_LASX_OPT', cc.compiles('''
>>>>> +    #include "lasxintrin.h"
>>>>> +    int foo(__m256i v) { return __lasx_xbz_v(v); }
>>>>> +  '''))
>>>>
>>>> Both of these are introduced by gcc 14 and llvm 18, so I'm not 
>>>> certain of the utility of separate tests.  We might simplify this with
>>>>
>>>>    config_host_data.set('CONFIG_LSX_LASX_INTRIN_H',
>>>>      cc.has_header('lsxintrin.h') && cc.has_header('lasxintrin.h'))
>>>>
>>>>
>>>> As you say, these headers require vector instructions to be enabled 
>>>> at compile-time rather than detecting them at runtime.  This is a 
>>>> point where the compilers could be improved to support 
>>>> __attribute__((target("xyz"))) and the builtins with that.  The i386 
>>>> port does this, for instance.
>>>>
>>>> In the meantime, it means that you don't need a runtime test.  
>>>> Similar to aarch64 and the use of __ARM_NEON as a compile-time test 
>>>> for simd support.  Perhaps
>>>>
>>>> #elif defined(CONFIG_LSX_LASX_INTRIN_H) && \
>>>>        (defined(__loongarch_sx) || defined(__loongarch_asx))
>>>> # ifdef __loongarch_sx
>>>>    ...
>>>> # endif
>>>> # ifdef __loongarch_asx
>>>>    ...
>>>> # endif
>>> Sure, will do in this way.
>>> And also there is runtime check coming from hwcap, such this:
>>>
>>> unsigned info = cpuinfo_init();
>>>    if (info & CPUINFO_LASX)
>>
>> static biz_accel_fn const accel_table[] = {
>>      buffer_is_zero_int_ge256,
>> #ifdef __loongarch_sx
>>      buffer_is_zero_lsx,
>> #endif
>> #ifdef __loongarch_asx
>>      buffer_is_zero_lasx,
>> #endif
>> };
>>
>> static unsigned best_accel(void)
>> {
>> #ifdef __loongarch_asx
>>      /* lasx may be index 1 or 2, but always last */
>>      return ARRAY_SIZE(accel_table) - 1;
>> #else
>>      /* lsx is always index 1 */
>>      return 1;
>> #endif
>> }
size of accel_table is decided at compile-time, will it be better if 
runtime checking is added also?  something like this:

  unsigned info = cpuinfo_init();

  #ifdef __loongarch_asx
  if (info & CPUINFO_LASX) {
       /* lasx may be index 1 or 2, but always last */
       return ARRAY_SIZE(accel_table) - 1;
  }
  #endif

> 
> It occurs to me that by accumulating host specific sections to this 
> file, we should split it like the atomics.  Put each portion in 
> host/include/*/host/bufferiszero.h.inc.
sure, will do.

> 
> I'll send a patch set handling the existing two hosts.
> 
> 
> r~
> 


Re: [PATCH 2/2] util/bufferiszero: Add simd acceleration for loongarch64
Posted by Richard Henderson 5 months, 3 weeks ago
On 6/5/24 20:36, maobibo wrote:
>>> static biz_accel_fn const accel_table[] = {
>>>      buffer_is_zero_int_ge256,
>>> #ifdef __loongarch_sx
>>>      buffer_is_zero_lsx,
>>> #endif
>>> #ifdef __loongarch_asx
>>>      buffer_is_zero_lasx,
>>> #endif
>>> };
>>>
>>> static unsigned best_accel(void)
>>> {
>>> #ifdef __loongarch_asx
>>>      /* lasx may be index 1 or 2, but always last */
>>>      return ARRAY_SIZE(accel_table) - 1;
>>> #else
>>>      /* lsx is always index 1 */
>>>      return 1;
>>> #endif
>>> }
> size of accel_table is decided at compile-time, will it be better if runtime checking is 
> added also?  something like this:
> 
>   unsigned info = cpuinfo_init();
> 
>   #ifdef __loongarch_asx
>   if (info & CPUINFO_LASX) {
>        /* lasx may be index 1 or 2, but always last */
>        return ARRAY_SIZE(accel_table) - 1;
>   }
>   #endif

No, because the ifdef checks that the *compiler* is prepared to use LASX/LSX instructions 
itself without further checks.  There's no point in qemu checking further.


r~

Re: [PATCH 2/2] util/bufferiszero: Add simd acceleration for loongarch64
Posted by maobibo 5 months, 3 weeks ago

On 2024/6/6 上午11:42, Richard Henderson wrote:
> On 6/5/24 20:36, maobibo wrote:
>>>> static biz_accel_fn const accel_table[] = {
>>>>      buffer_is_zero_int_ge256,
>>>> #ifdef __loongarch_sx
>>>>      buffer_is_zero_lsx,
>>>> #endif
>>>> #ifdef __loongarch_asx
>>>>      buffer_is_zero_lasx,
>>>> #endif
>>>> };
>>>>
>>>> static unsigned best_accel(void)
>>>> {
>>>> #ifdef __loongarch_asx
>>>>      /* lasx may be index 1 or 2, but always last */
>>>>      return ARRAY_SIZE(accel_table) - 1;
>>>> #else
>>>>      /* lsx is always index 1 */
>>>>      return 1;
>>>> #endif
>>>> }
>> size of accel_table is decided at compile-time, will it be better if 
>> runtime checking is added also?  something like this:
>>
>>   unsigned info = cpuinfo_init();
>>
>>   #ifdef __loongarch_asx
>>   if (info & CPUINFO_LASX) {
>>        /* lasx may be index 1 or 2, but always last */
>>        return ARRAY_SIZE(accel_table) - 1;
>>   }
>>   #endif
> 
> No, because the ifdef checks that the *compiler* is prepared to use 
> LASX/LSX instructions itself without further checks.  There's no point 
> in qemu checking further.
By my understanding, currently compiler option is the same with all 
files, there is no separate compiler option with single file or file 
function.

So if compiler is prepared to use LASX/LSX instructions itself, host 
hardware must support LASX/LSX instructions, else there will be problem.

My main concern is that there is one hw machine which supports LSX, but 
no LASX, no KVM neither.  QEMU binary maybe fails to run on such hw 
machine if it is compiled with LASX option.

Regards
Bibo Mao
> 
> 
> r~


Re: [PATCH 2/2] util/bufferiszero: Add simd acceleration for loongarch64
Posted by Richard Henderson 5 months, 3 weeks ago
On 6/5/24 21:00, maobibo wrote:
>> No, because the ifdef checks that the *compiler* is prepared to use LASX/LSX 
>> instructions itself without further checks.  There's no point in qemu checking further.
> By my understanding, currently compiler option is the same with all files, there is no 
> separate compiler option with single file or file function.
> 
> So if compiler is prepared to use LASX/LSX instructions itself, host hardware must support 
> LASX/LSX instructions, else there will be problem.

Correct.


> My main concern is that there is one hw machine which supports LSX, but no LASX, no KVM 
> neither.  QEMU binary maybe fails to run on such hw machine if it is compiled with LASX 
> option.

Yes, that would be a problem for packaging qemu for distribution.

An alternative is to write these functions in assembly.  While it's worth prioritizing 
implementation of __attribute__((target())) in the compilers, the very earliest that could 
happen is gcc 15.  Which is far away from being reliable for qemu.  It would also allow 
this optimization to happen with gcc 13, which doesn't support the builtins either.

I just sent a patch set along these lines.


r~