Different gcc versions have different features; the macros
CONFIG_LSX_OPT and CONFIG_LASX_OPT are added here to detect whether gcc
supports the built-in LSX/LASX intrinsics.

Function buffer_zero_lsx() is added for 128-bit SIMD optimization, and
function buffer_zero_lasx() for 256-bit SIMD optimization.

The LoongArch gcc built-in LSX/LASX intrinsics can be used only when
the compiler option -mlsx/-mlasx is enabled, and there is no separate
per-function compiler option.  So this only takes effect when QEMU is
compiled with the configure parameter --extra-cflags="-mlasx".
Signed-off-by: Bibo Mao <maobibo@loongson.cn>
---
meson.build | 11 +++++
util/bufferiszero.c | 103 ++++++++++++++++++++++++++++++++++++++++++++
2 files changed, 114 insertions(+)
diff --git a/meson.build b/meson.build
index 6386607144..29bc362d7a 100644
--- a/meson.build
+++ b/meson.build
@@ -2855,6 +2855,17 @@ config_host_data.set('CONFIG_ARM_AES_BUILTIN', cc.compiles('''
void foo(uint8x16_t *p) { *p = vaesmcq_u8(*p); }
'''))
+# For Loongarch64, detect if LSX/LASX are available.
+config_host_data.set('CONFIG_LSX_OPT', cc.compiles('''
+  #include "lsxintrin.h"
+  int foo(__m128i v) { return __lsx_bz_v(v); }
+'''))
+
+config_host_data.set('CONFIG_LASX_OPT', cc.compiles('''
+  #include "lasxintrin.h"
+  int foo(__m256i v) { return __lasx_xbz_v(v); }
+'''))
+
if get_option('membarrier').disabled()
have_membarrier = false
elif host_os == 'windows'
diff --git a/util/bufferiszero.c b/util/bufferiszero.c
index 74864f7b78..751e81dbb3 100644
--- a/util/bufferiszero.c
+++ b/util/bufferiszero.c
@@ -265,6 +265,109 @@ static biz_accel_fn const accel_table[] = {
buffer_is_zero_int_ge256,
buffer_is_zero_simd,
};
+#elif defined(__loongarch__)
+#ifdef CONFIG_LSX_OPT
+#include "lsxintrin.h"
+static bool buffer_zero_lsx(const void *buf, size_t len)
+{
+ /* Unaligned loads at head/tail. */
+ __m128i v = *(__m128i *)(buf);
+ __m128i w = *(__m128i *)(buf + len - 16);
+ /* Align head/tail to 16-byte boundaries. */
+ const __m128i *p = QEMU_ALIGN_PTR_DOWN(buf + 16, 16);
+ const __m128i *e = QEMU_ALIGN_PTR_DOWN(buf + len - 1, 16);
+
+ /* Collect a partial block at tail end. */
+ v |= e[-1]; w |= e[-2];
+ v |= e[-3]; w |= e[-4];
+ v |= e[-5]; w |= e[-6];
+ v |= e[-7]; v |= w;
+
+ /*
+ * Loop over complete 128-byte blocks.
+ * With the head and tail removed, e - p >= 14, so the loop
+ * must iterate at least once.
+ */
+ do {
+ if (!__lsx_bz_v(v)) {
+ return false;
+ }
+ v = p[0]; w = p[1];
+ v |= p[2]; w |= p[3];
+ v |= p[4]; w |= p[5];
+ v |= p[6]; w |= p[7];
+ v |= w;
+ p += 8;
+ } while (p < e - 7);
+
+ return __lsx_bz_v(v);
+}
+#endif
+
+#ifdef CONFIG_LASX_OPT
+#include "lasxintrin.h"
+static bool buffer_zero_lasx(const void *buf, size_t len)
+{
+ /* Unaligned loads at head/tail. */
+ __m256i v = *(__m256i *)(buf);
+ __m256i w = *(__m256i *)(buf + len - 32);
+ /* Align head/tail to 32-byte boundaries. */
+ const __m256i *p = QEMU_ALIGN_PTR_DOWN(buf + 32, 32);
+ const __m256i *e = QEMU_ALIGN_PTR_DOWN(buf + len - 1, 32);
+
+ /* Collect a partial block at tail end. */
+ v |= e[-1]; w |= e[-2];
+ v |= e[-3]; w |= e[-4];
+ v |= e[-5]; w |= e[-6];
+ v |= e[-7]; v |= w;
+
+ /* Loop over complete 256-byte blocks. */
+ for (; p < e - 7; p += 8) {
+ if (!__lasx_xbz_v(v)) {
+ return false;
+ }
+
+ v = p[0]; w = p[1];
+ v |= p[2]; w |= p[3];
+ v |= p[4]; w |= p[5];
+ v |= p[6]; w |= p[7];
+ v |= w;
+ }
+
+ return __lasx_xbz_v(v);
+}
+#endif
+
+static biz_accel_fn const accel_table[] = {
+ buffer_is_zero_int_ge256,
+#ifdef CONFIG_LSX_OPT
+ buffer_zero_lsx,
+#endif
+#ifdef CONFIG_LASX_OPT
+ buffer_zero_lasx,
+#endif
+};
+
+static unsigned best_accel(void)
+{
+ unsigned info = cpuinfo_init();
+
+ /* CONFIG_LSX_OPT must be enabled if CONFIG_LASX_OPT is enabled */
+#ifdef CONFIG_LASX_OPT
+ if (info & CPUINFO_LASX) {
+ return 2;
+ }
+#endif
+
+#ifdef CONFIG_LSX_OPT
+ if (info & CPUINFO_LSX) {
+ return 1;
+ }
+#endif
+
+ return 0;
+}
#else
#define best_accel() 0
static biz_accel_fn const accel_table[1] = {
--
2.39.3
On 6/5/24 02:32, Bibo Mao wrote:
> [...]
> +# For Loongarch64, detect if LSX/LASX are available.
> +config_host_data.set('CONFIG_LSX_OPT', cc.compiles('''
> +  #include "lsxintrin.h"
> +  int foo(__m128i v) { return __lsx_bz_v(v); }
> +'''))
> +
> +config_host_data.set('CONFIG_LASX_OPT', cc.compiles('''
> +  #include "lasxintrin.h"
> +  int foo(__m256i v) { return __lasx_xbz_v(v); }
> +'''))

Both of these are introduced by gcc 14 and llvm 18, so I'm not certain
of the utility of separate tests.  We might simplify this with

  config_host_data.set('CONFIG_LSX_LASX_INTRIN_H',
    cc.has_header('lsxintrin.h') && cc.has_header('lasxintrin.h'))

As you say, these headers require vector instructions to be enabled at
compile time rather than detecting them at runtime.  This is a point
where the compilers could be improved to support
__attribute__((target("xyz"))) and the builtins with that.  The i386
port does this, for instance.

In the meantime, it means that you don't need a runtime test.  Similar
to aarch64 and the use of __ARM_NEON as a compile-time test for simd
support.  Perhaps

  #elif defined(CONFIG_LSX_LASX_INTRIN_H) && \
      (defined(__loongarch_sx) || defined(__loongarch_asx))
  # ifdef __loongarch_sx
    ...
  # endif
  # ifdef __loongarch_asx
    ...
  # endif

The actual code is perfectly fine, of course, since it follows the
pattern from the others.  How much improvement do you see from
bufferiszero-bench?


r~
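[For reference, the i386-style per-function target attribute mentioned
above looks roughly like this; a minimal sketch with illustrative
names, not the actual QEMU code:

    #include <immintrin.h>
    #include <stdbool.h>
    #include <stddef.h>

    /*
     * Only this function is compiled with AVX2 enabled; the rest of
     * the file keeps the baseline ISA, so a runtime cpuid check can
     * safely guard the call.
     */
    __attribute__((target("avx2")))
    static bool buffer_is_zero_avx2_sketch(const void *buf, size_t len)
    {
        const unsigned char *b = buf;
        __m256i acc = _mm256_setzero_si256();
        size_t i;

        /* OR together all 32-byte chunks. */
        for (i = 0; i + 32 <= len; i += 32) {
            acc = _mm256_or_si256(acc,
                    _mm256_loadu_si256((const __m256i *)(b + i)));
        }
        /* Tail bytes handled with scalar code for brevity. */
        for (; i < len; i++) {
            if (b[i]) {
                return false;
            }
        }
        /* testz returns 1 iff acc is all-zero. */
        return _mm256_testz_si256(acc, acc);
    }

This is the shape that would let LoongArch keep hwcap-based runtime
dispatch in a generic build, once the compilers support the LSX/LASX
builtins under a target attribute.]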
On 2024/6/6 上午7:51, Richard Henderson wrote:
> On 6/5/24 02:32, Bibo Mao wrote:
>> [...]
>
> Both of these are introduced by gcc 14 and llvm 18, so I'm not certain
> of the utility of separate tests.  We might simplify this with
>
>   config_host_data.set('CONFIG_LSX_LASX_INTRIN_H',
>     cc.has_header('lsxintrin.h') && cc.has_header('lasxintrin.h'))
>
> As you say, these headers require vector instructions to be enabled at
> compile time rather than detecting them at runtime.  This is a point
> where the compilers could be improved to support
> __attribute__((target("xyz"))) and the builtins with that.  The i386
> port does this, for instance.
>
> In the meantime, it means that you don't need a runtime test.  Similar
> to aarch64 and the use of __ARM_NEON as a compile-time test for simd
> support.  Perhaps
>
>   #elif defined(CONFIG_LSX_LASX_INTRIN_H) && \
>       (defined(__loongarch_sx) || defined(__loongarch_asx))
>   # ifdef __loongarch_sx
>     ...
>   # endif
>   # ifdef __loongarch_asx
>     ...
>   # endif

Sure, will do it this way.

There is also a runtime check coming from hwcap, such as this:

  unsigned info = cpuinfo_init();
  if (info & CPUINFO_LASX)

> The actual code is perfectly fine, of course, since it follows the
> pattern from the others.  How much improvement do you see from
> bufferiszero-bench?

Yes, it is much easier to follow the others; it is nothing new.

Here is the benchmark result: no obvious improvement with a 1KB buffer,
about 200% improvement with LASX and 100% with LSX at 16KB buffer size.

# /root/src/qemu/b/tests/bench/bufferiszero-bench --tap -k
# Start of cutils tests
# Start of bufferiszero tests
# buffer_is_zero #0: 1KB 13460 MB/sec
# buffer_is_zero #0: 4KB 36857 MB/sec
# buffer_is_zero #0: 16KB 69884 MB/sec
# buffer_is_zero #0: 64KB 80863 MB/sec
#
# buffer_is_zero #1: 1KB 11180 MB/sec
# buffer_is_zero #1: 4KB 27972 MB/sec
# buffer_is_zero #1: 16KB 42951 MB/sec
# buffer_is_zero #1: 64KB 43293 MB/sec
#
# buffer_is_zero #2: 1KB 10026 MB/sec
# buffer_is_zero #2: 4KB 18373 MB/sec
# buffer_is_zero #2: 16KB 23933 MB/sec
# buffer_is_zero #2: 64KB 25180 MB/sec

Regards
Bibo Mao
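[For context, the cpuinfo_init()/CPUINFO_LASX check referenced above
boils down to a getauxval() query on Linux.  A minimal sketch, assuming
the HWCAP bit values of the Linux uapi asm/hwcap.h for LoongArch;
verify against current kernel headers:

    #include <stdbool.h>
    #include <sys/auxv.h>

    /* Assumed to match arch/loongarch/include/uapi/asm/hwcap.h. */
    #define HWCAP_LOONGARCH_LSX   (1 << 4)
    #define HWCAP_LOONGARCH_LASX  (1 << 5)

    static bool host_has_lsx(void)
    {
        return (getauxval(AT_HWCAP) & HWCAP_LOONGARCH_LSX) != 0;
    }

    static bool host_has_lasx(void)
    {
        return (getauxval(AT_HWCAP) & HWCAP_LOONGARCH_LASX) != 0;
    }

QEMU wraps this kind of query in cpuinfo_init() and exposes the result
through the CPUINFO_* flags.]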
On 6/5/24 19:30, maobibo wrote:
> On 2024/6/6 上午7:51, Richard Henderson wrote:
>> [...]
>> In the meantime, it means that you don't need a runtime test.
>> Similar to aarch64 and the use of __ARM_NEON as a compile-time test
>> for simd support.
>
> Sure, will do it this way.
>
> There is also a runtime check coming from hwcap, such as this:
>
>   unsigned info = cpuinfo_init();
>   if (info & CPUINFO_LASX)

static biz_accel_fn const accel_table[] = {
    buffer_is_zero_int_ge256,
#ifdef __loongarch_sx
    buffer_is_zero_lsx,
#endif
#ifdef __loongarch_asx
    buffer_is_zero_lasx,
#endif
};

static unsigned best_accel(void)
{
#ifdef __loongarch_asx
    /* lasx may be index 1 or 2, but always last */
    return ARRAY_SIZE(accel_table) - 1;
#else
    /* lsx is always index 1 */
    return 1;
#endif
}


r~
On 6/5/24 20:18, Richard Henderson wrote:
> [...]
> static unsigned best_accel(void)
> {
> #ifdef __loongarch_asx
>     /* lasx may be index 1 or 2, but always last */
>     return ARRAY_SIZE(accel_table) - 1;
> #else
>     /* lsx is always index 1 */
>     return 1;
> #endif
> }

It occurs to me that by accumulating host specific sections to this
file, we should split it like the atomics.  Put each portion in
host/include/*/host/bufferiszero.h.inc.

I'll send a patch set handling the existing two hosts.


r~
On 2024/6/6 上午11:27, Richard Henderson wrote:
> On 6/5/24 20:18, Richard Henderson wrote:
>> [...]
>> static unsigned best_accel(void)
>> {
>> #ifdef __loongarch_asx
>>     /* lasx may be index 1 or 2, but always last */
>>     return ARRAY_SIZE(accel_table) - 1;
>> #else
>>     /* lsx is always index 1 */
>>     return 1;
>> #endif
>> }

The size of accel_table is decided at compile time; will it be better
if runtime checking is added also?  Something like this:

unsigned info = cpuinfo_init();

#ifdef __loongarch_asx
if (info & CPUINFO_LASX) {
    /* lasx may be index 1 or 2, but always last */
    return ARRAY_SIZE(accel_table) - 1;
}
#endif

> It occurs to me that by accumulating host specific sections to this
> file, we should split it like the atomics.  Put each portion in
> host/include/*/host/bufferiszero.h.inc.

Sure, will do.

> I'll send a patch set handling the existing two hosts.
On 6/5/24 20:36, maobibo wrote:
> [...]
> The size of accel_table is decided at compile time; will it be better
> if runtime checking is added also?  Something like this:
>
> unsigned info = cpuinfo_init();
>
> #ifdef __loongarch_asx
> if (info & CPUINFO_LASX) {
>     /* lasx may be index 1 or 2, but always last */
>     return ARRAY_SIZE(accel_table) - 1;
> }
> #endif

No, because the ifdef checks that the *compiler* is prepared to use
LASX/LSX instructions itself without further checks.  There's no point
in qemu checking further.


r~
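[To illustrate the point: once -mlasx applies to the whole translation
unit, the compiler may auto-vectorize any loop it likes, so a runtime
guard around the accel table alone cannot make the binary safe on a
non-LASX host.  A hypothetical example:

    /*
     * Built with -mlasx, gcc is free to emit LASX loads/stores for
     * this loop even though the source never mentions vectors, and
     * no runtime check in bufferiszero.c would prevent that.
     */
    void zero_fill(long *p, long n)
    {
        for (long i = 0; i < n; i++) {
            p[i] = 0;
        }
    }
]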
On 2024/6/6 上午11:42, Richard Henderson wrote:
> [...]
> No, because the ifdef checks that the *compiler* is prepared to use
> LASX/LSX instructions itself without further checks.  There's no
> point in qemu checking further.

By my understanding, currently the compiler options are the same for
all files; there is no separate compiler option for a single file or
function.  So if the compiler is prepared to use LASX/LSX instructions
itself, the host hardware must support LASX/LSX instructions, else
there will be a problem.

My main concern is a hw machine which supports LSX but not LASX, and
no KVM either.  A QEMU binary may fail to run on such a hw machine if
it is compiled with the LASX option.

Regards
Bibo Mao
On 6/5/24 21:00, maobibo wrote:
> [...]
> So if the compiler is prepared to use LASX/LSX instructions itself,
> the host hardware must support LASX/LSX instructions, else there
> will be a problem.

Correct.

> My main concern is a hw machine which supports LSX but not LASX, and
> no KVM either.  A QEMU binary may fail to run on such a hw machine
> if it is compiled with the LASX option.

Yes, that would be a problem for packaging qemu for distribution.  An
alternative is to write these functions in assembly, which would also
allow this optimization to happen with gcc 13, which doesn't support
the builtins either.

While it's worth prioritizing implementation of
__attribute__((target())) in the compilers, the very earliest that
could happen is gcc 15, which is far away from being reliable for
qemu.

I just sent a patch set along these lines.


r~