[PATCH] x86/crc32: use builtins to improve code generation
Posted by Bill Wendling 9 months, 3 weeks ago
For both gcc and clang, crc32 builtins generate better code than the
inline asm. GCC improves, removing unneeded "mov" instructions. Clang
does the same and unrolls the loops. GCC has no changes on i386, but
Clang's code generation is vastly improved, due to Clang's "rm"
constraint issue.

The number of cycles improved by ~0.1% for GCC and ~1% for Clang, which
is expected because of the "rm" issue. However, Clang's performance is
better than GCC's by ~1.5%, most likely due to loop unrolling.

Link: https://github.com/llvm/llvm-project/issues/20571#issuecomment-2649330009
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: x86@kernel.org
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Eric Biggers <ebiggers@kernel.org>
Cc: Ard Biesheuvel <ardb@kernel.org>
Cc: Nathan Chancellor <nathan@kernel.org>
Cc: Nick Desaulniers <nick.desaulniers+lkml@gmail.com>
Cc: Justin Stitt <justinstitt@google.com>
Cc: linux-kernel@vger.kernel.org
Cc: linux-crypto@vger.kernel.org
Cc: llvm@lists.linux.dev
Signed-off-by: Bill Wendling <morbo@google.com>
---
 arch/x86/Makefile         | 3 +++
 arch/x86/lib/crc32-glue.c | 8 ++++----
 2 files changed, 7 insertions(+), 4 deletions(-)

diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index 5b773b34768d..241436da1473 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -114,6 +114,9 @@ else
 KBUILD_CFLAGS += $(call cc-option,-fcf-protection=none)
 endif

+# Enables the use of CRC32 builtins.
+KBUILD_CFLAGS += -mcrc32
+
 ifeq ($(CONFIG_X86_32),y)
         BITS := 32
         UTS_MACHINE := i386
diff --git a/arch/x86/lib/crc32-glue.c b/arch/x86/lib/crc32-glue.c
index 2dd18a886ded..fdb94bff25f4 100644
--- a/arch/x86/lib/crc32-glue.c
+++ b/arch/x86/lib/crc32-glue.c
@@ -48,9 +48,9 @@ u32 crc32_le_arch(u32 crc, const u8 *p, size_t len)
 EXPORT_SYMBOL(crc32_le_arch);

 #ifdef CONFIG_X86_64
-#define CRC32_INST "crc32q %1, %q0"
+#define CRC32_INST __builtin_ia32_crc32di
 #else
-#define CRC32_INST "crc32l %1, %0"
+#define CRC32_INST __builtin_ia32_crc32si
 #endif

 /*
@@ -78,10 +78,10 @@ u32 crc32c_le_arch(u32 crc, const u8 *p, size_t len)

        for (num_longs = len / sizeof(unsigned long);
             num_longs != 0; num_longs--, p += sizeof(unsigned long))
-               asm(CRC32_INST : "+r" (crc) : "rm" (*(unsigned long *)p));
+               crc = CRC32_INST(crc,  *(unsigned long *)p);

        for (len %= sizeof(unsigned long); len; len--, p++)
-               asm("crc32b %1, %0" : "+r" (crc) : "rm" (*p));
+               crc = __builtin_ia32_crc32qi(crc, *p);

        return crc;
 }
-- 
2.48.1.711.g2feabab25a-goog
Re: [PATCH] x86/crc32: use builtins to improve code generation
Posted by Dave Hansen 9 months, 3 weeks ago
On 2/26/25 22:12, Bill Wendling wrote:
>  #ifdef CONFIG_X86_64
> -#define CRC32_INST "crc32q %1, %q0"
> +#define CRC32_INST __builtin_ia32_crc32di
>  #else
> -#define CRC32_INST "crc32l %1, %0"
> +#define CRC32_INST __builtin_ia32_crc32si
>  #endif
> 
>  /*
> @@ -78,10 +78,10 @@ u32 crc32c_le_arch(u32 crc, const u8 *p, size_t len)
> 
>         for (num_longs = len / sizeof(unsigned long);
>              num_longs != 0; num_longs--, p += sizeof(unsigned long))
> -               asm(CRC32_INST : "+r" (crc) : "rm" (*(unsigned long *)p));
> +               crc = CRC32_INST(crc,  *(unsigned long *)p);

Could we get rid of the macros, please?

unsigned long crc32_ul(unsigned long crc, unsigned long data)
{
	if (IS_ENABLED(CONFIG_X86_64))
		return __builtin_ia32_crc32di(crc, data);
	else
		return __builtin_ia32_crc32si(crc, data);
}

I guess it could also do some check like:

	if (sizeof(int) == sizeof(long))

instead of CONFIG_X86_64, but the CONFIG_X86_64 will make it more
obvious when someone comes through to rip out 32-bit support some day.
Re: [PATCH] x86/crc32: use builtins to improve code generation
Posted by Bill Wendling 9 months, 3 weeks ago
On Thu, Feb 27, 2025 at 8:26 AM Dave Hansen <dave.hansen@intel.com> wrote:
> On 2/26/25 22:12, Bill Wendling wrote:
> >  #ifdef CONFIG_X86_64
> > -#define CRC32_INST "crc32q %1, %q0"
> > +#define CRC32_INST __builtin_ia32_crc32di
> >  #else
> > -#define CRC32_INST "crc32l %1, %0"
> > +#define CRC32_INST __builtin_ia32_crc32si
> >  #endif
> >
> >  /*
> > @@ -78,10 +78,10 @@ u32 crc32c_le_arch(u32 crc, const u8 *p, size_t len)
> >
> >         for (num_longs = len / sizeof(unsigned long);
> >              num_longs != 0; num_longs--, p += sizeof(unsigned long))
> > -               asm(CRC32_INST : "+r" (crc) : "rm" (*(unsigned long *)p));
> > +               crc = CRC32_INST(crc,  *(unsigned long *)p);
>
> Could we get rid of the macros, please?
>
> unsigned long crc32_ul(unsigned long crc, unsigned long data)
> {
>         if (IS_ENABLED(CONFIG_X86_64))
>                 return __builtin_ia32_crc32di(crc, data);
>         else
>                 return __builtin_ia32_crc32si(crc, data);
> }
>
> I guess it could also do some check like:
>
>         if (sizeof(int) == sizeof(long))
>
> instead of CONFIG_X86_64, but the CONFIG_X86_64 will make it more
> obvious when someone comes through to rip out 32-bit support some day.

I vastly prefer the first way if made "static __always_inline".
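
Something along these lines, say (untested sketch):

static __always_inline u32 crc32_ul(u32 crc, unsigned long data)
{
	if (IS_ENABLED(CONFIG_X86_64))
		return __builtin_ia32_crc32di(crc, data);
	return __builtin_ia32_crc32si(crc, data);
}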

-bw
Re: [PATCH] x86/crc32: use builtins to improve code generation
Posted by Dave Hansen 9 months, 3 weeks ago
On 2/27/25 12:57, Bill Wendling wrote:
> I vastly prefer the first way if made "static __always_inline".

'static', for sure. But I'd leave the explicit inlining out unless the
compiler is actively being stupid.
Re: [PATCH] x86/crc32: use builtins to improve code generation
Posted by Eric Biggers 9 months, 3 weeks ago
On Wed, Feb 26, 2025 at 10:12:47PM -0800, Bill Wendling wrote:
> For both gcc and clang, crc32 builtins generate better code than the
> inline asm. GCC improves, removing unneeded "mov" instructions. Clang
> does the same and unrolls the loops. GCC has no changes on i386, but
> Clang's code generation is vastly improved, due to Clang's "rm"
> constraint issue.
> 
> The number of cycles improved by ~0.1% for GCC and ~1% for Clang, which
> is expected because of the "rm" issue. However, Clang's performance is
> better than GCC's by ~1.5%, most likely due to loop unrolling.
> 
> Link: https://github.com/llvm/llvm-project/issues/20571#issuecomment-2649330009
> Cc: Thomas Gleixner <tglx@linutronix.de>
> Cc: Ingo Molnar <mingo@redhat.com>
> Cc: Borislav Petkov <bp@alien8.de>
> Cc: Dave Hansen <dave.hansen@linux.intel.com>
> Cc: x86@kernel.org
> Cc: "H. Peter Anvin" <hpa@zytor.com>
> Cc: Eric Biggers <ebiggers@kernel.org>
> Cc: Ard Biesheuvel <ardb@kernel.org>
> Cc: Nathan Chancellor <nathan@kernel.org>
> Cc: Nick Desaulniers <nick.desaulniers+lkml@gmail.com>
> Cc: Justin Stitt <justinstitt@google.com>
> Cc: linux-kernel@vger.kernel.org
> Cc: linux-crypto@vger.kernel.org
> Cc: llvm@lists.linux.dev
> Signed-off-by: Bill Wendling <morbo@google.com>
> ---
>  arch/x86/Makefile         | 3 +++
>  arch/x86/lib/crc32-glue.c | 8 ++++----
>  2 files changed, 7 insertions(+), 4 deletions(-)

Thanks!  A couple concerns, though:

> diff --git a/arch/x86/Makefile b/arch/x86/Makefile
> index 5b773b34768d..241436da1473 100644
> --- a/arch/x86/Makefile
> +++ b/arch/x86/Makefile
> @@ -114,6 +114,9 @@ else
>  KBUILD_CFLAGS += $(call cc-option,-fcf-protection=none)
>  endif
> 
> +# Enables the use of CRC32 builtins.
> +KBUILD_CFLAGS += -mcrc32

Doesn't this technically allow the compiler to insert CRC32 instructions
anywhere in arch/x86/ without the needed runtime CPU feature check?  Normally
when using intrinsics it's necessary to limit the scope of the feature
enablement to match the runtime CPU feature check that is done, e.g. by using
the target function attribute.
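
For example, only the annotated function would get the ISA extension enabled
(untested sketch, crc32c_u64() is just a made-up name):

	static __attribute__((target("crc32"))) u64 crc32c_u64(u32 crc, u64 data)
	{
		/* The CRC32 ISA is enabled for this function only; the caller
		 * still has to do the runtime CPU feature check. */
		return __builtin_ia32_crc32di(crc, data);
	}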

> diff --git a/arch/x86/lib/crc32-glue.c b/arch/x86/lib/crc32-glue.c
> index 2dd18a886ded..fdb94bff25f4 100644
> --- a/arch/x86/lib/crc32-glue.c
> +++ b/arch/x86/lib/crc32-glue.c
> @@ -48,9 +48,9 @@ u32 crc32_le_arch(u32 crc, const u8 *p, size_t len)
>  EXPORT_SYMBOL(crc32_le_arch);
> 
>  #ifdef CONFIG_X86_64
> -#define CRC32_INST "crc32q %1, %q0"
> +#define CRC32_INST __builtin_ia32_crc32di
>  #else
> -#define CRC32_INST "crc32l %1, %0"
> +#define CRC32_INST __builtin_ia32_crc32si
>  #endif

Do both gcc and clang consider these builtins to be a stable API, or do they
only guarantee the stability of _mm_crc32_*() from immintrin.h?  At least for
the rest of the SSE and AVX stuff, I thought that only the immintrin.h functions
are actually considered stable.

- Eric
Re: [PATCH] x86/crc32: use builtins to improve code generation
Posted by H. Peter Anvin 9 months, 3 weeks ago
On February 26, 2025 10:28:59 PM PST, Eric Biggers <ebiggers@kernel.org> wrote:
>On Wed, Feb 26, 2025 at 10:12:47PM -0800, Bill Wendling wrote:
>> For both gcc and clang, crc32 builtins generate better code than the
>> inline asm. GCC improves, removing unneeded "mov" instructions. Clang
>> does the same and unrolls the loops. GCC has no changes on i386, but
>> Clang's code generation is vastly improved, due to Clang's "rm"
>> constraint issue.
>> 
>> The number of cycles improved by ~0.1% for GCC and ~1% for Clang, which
>> is expected because of the "rm" issue. However, Clang's performance is
>> better than GCC's by ~1.5%, most likely due to loop unrolling.
>> 
>> Link: https://github.com/llvm/llvm-project/issues/20571#issuecomment-2649330009
>> Cc: Thomas Gleixner <tglx@linutronix.de>
>> Cc: Ingo Molnar <mingo@redhat.com>
>> Cc: Borislav Petkov <bp@alien8.de>
>> Cc: Dave Hansen <dave.hansen@linux.intel.com>
>> Cc: x86@kernel.org
>> Cc: "H. Peter Anvin" <hpa@zytor.com>
>> Cc: Eric Biggers <ebiggers@kernel.org>
>> Cc: Ard Biesheuvel <ardb@kernel.org>
>> Cc: Nathan Chancellor <nathan@kernel.org>
>> Cc: Nick Desaulniers <nick.desaulniers+lkml@gmail.com>
>> Cc: Justin Stitt <justinstitt@google.com>
>> Cc: linux-kernel@vger.kernel.org
>> Cc: linux-crypto@vger.kernel.org
>> Cc: llvm@lists.linux.dev
>> Signed-off-by: Bill Wendling <morbo@google.com>
>> ---
>>  arch/x86/Makefile         | 3 +++
>>  arch/x86/lib/crc32-glue.c | 8 ++++----
>>  2 files changed, 7 insertions(+), 4 deletions(-)
>
>Thanks!  A couple concerns, though:
>
>> diff --git a/arch/x86/Makefile b/arch/x86/Makefile
>> index 5b773b34768d..241436da1473 100644
>> --- a/arch/x86/Makefile
>> +++ b/arch/x86/Makefile
>> @@ -114,6 +114,9 @@ else
>>  KBUILD_CFLAGS += $(call cc-option,-fcf-protection=none)
>>  endif
>> 
>> +# Enables the use of CRC32 builtins.
>> +KBUILD_CFLAGS += -mcrc32
>
>Doesn't this technically allow the compiler to insert CRC32 instructions
>anywhere in arch/x86/ without the needed runtime CPU feature check?  Normally
>when using intrinsics it's necessary to limit the scope of the feature
>enablement to match the runtime CPU feature check that is done, e.g. by using
>the target function attribute.
>
>> diff --git a/arch/x86/lib/crc32-glue.c b/arch/x86/lib/crc32-glue.c
>> index 2dd18a886ded..fdb94bff25f4 100644
>> --- a/arch/x86/lib/crc32-glue.c
>> +++ b/arch/x86/lib/crc32-glue.c
>> @@ -48,9 +48,9 @@ u32 crc32_le_arch(u32 crc, const u8 *p, size_t len)
>>  EXPORT_SYMBOL(crc32_le_arch);
>> 
>>  #ifdef CONFIG_X86_64
>> -#define CRC32_INST "crc32q %1, %q0"
>> +#define CRC32_INST __builtin_ia32_crc32di
>>  #else
>> -#define CRC32_INST "crc32l %1, %0"
>> +#define CRC32_INST __builtin_ia32_crc32si
>>  #endif
>
>Do both gcc and clang consider these builtins to be a stable API, or do they
>only guarantee the stability of _mm_crc32_*() from immintrin.h?  At least for
>the rest of the SSE and AVX stuff, I thought that only the immintrin.h functions
>are actually considered stable.
>
>- Eric

There is that... also are there compiler versions that we support that do not have -mcrc32 support? 
Re: [PATCH] x86/crc32: use builtins to improve code generation
Posted by Bill Wendling 9 months, 3 weeks ago
On Thu, Feb 27, 2025 at 2:53 AM H. Peter Anvin <hpa@zytor.com> wrote:
> On February 26, 2025 10:28:59 PM PST, Eric Biggers <ebiggers@kernel.org> wrote:
> >On Wed, Feb 26, 2025 at 10:12:47PM -0800, Bill Wendling wrote:
> >> For both gcc and clang, crc32 builtins generate better code than the
> >> inline asm. GCC improves, removing unneeded "mov" instructions. Clang
> >> does the same and unrolls the loops. GCC has no changes on i386, but
> >> Clang's code generation is vastly improved, due to Clang's "rm"
> >> constraint issue.
> >>
> >> The number of cycles improved by ~0.1% for GCC and ~1% for Clang, which
> >> is expected because of the "rm" issue. However, Clang's performance is
> >> better than GCC's by ~1.5%, most likely due to loop unrolling.
> >>
> >> Link: https://github.com/llvm/llvm-project/issues/20571#issuecomment-2649330009
> >> Cc: Thomas Gleixner <tglx@linutronix.de>
> >> Cc: Ingo Molnar <mingo@redhat.com>
> >> Cc: Borislav Petkov <bp@alien8.de>
> >> Cc: Dave Hansen <dave.hansen@linux.intel.com>
> >> Cc: x86@kernel.org
> >> Cc: "H. Peter Anvin" <hpa@zytor.com>
> >> Cc: Eric Biggers <ebiggers@kernel.org>
> >> Cc: Ard Biesheuvel <ardb@kernel.org>
> >> Cc: Nathan Chancellor <nathan@kernel.org>
> >> Cc: Nick Desaulniers <nick.desaulniers+lkml@gmail.com>
> >> Cc: Justin Stitt <justinstitt@google.com>
> >> Cc: linux-kernel@vger.kernel.org
> >> Cc: linux-crypto@vger.kernel.org
> >> Cc: llvm@lists.linux.dev
> >> Signed-off-by: Bill Wendling <morbo@google.com>
> >> ---
> >>  arch/x86/Makefile         | 3 +++
> >>  arch/x86/lib/crc32-glue.c | 8 ++++----
> >>  2 files changed, 7 insertions(+), 4 deletions(-)
> >
> >Thanks!  A couple concerns, though:
> >
> >> diff --git a/arch/x86/Makefile b/arch/x86/Makefile
> >> index 5b773b34768d..241436da1473 100644
> >> --- a/arch/x86/Makefile
> >> +++ b/arch/x86/Makefile
> >> @@ -114,6 +114,9 @@ else
> >>  KBUILD_CFLAGS += $(call cc-option,-fcf-protection=none)
> >>  endif
> >>
> >> +# Enables the use of CRC32 builtins.
> >> +KBUILD_CFLAGS += -mcrc32
> >
> >Doesn't this technically allow the compiler to insert CRC32 instructions
> >anywhere in arch/x86/ without the needed runtime CPU feature check?  Normally
> >when using intrinsics it's necessary to limit the scope of the feature
> >enablement to match the runtime CPU feature check that is done, e.g. by using
> >the target function attribute.
> >
> >> diff --git a/arch/x86/lib/crc32-glue.c b/arch/x86/lib/crc32-glue.c
> >> index 2dd18a886ded..fdb94bff25f4 100644
> >> --- a/arch/x86/lib/crc32-glue.c
> >> +++ b/arch/x86/lib/crc32-glue.c
> >> @@ -48,9 +48,9 @@ u32 crc32_le_arch(u32 crc, const u8 *p, size_t len)
> >>  EXPORT_SYMBOL(crc32_le_arch);
> >>
> >>  #ifdef CONFIG_X86_64
> >> -#define CRC32_INST "crc32q %1, %q0"
> >> +#define CRC32_INST __builtin_ia32_crc32di
> >>  #else
> >> -#define CRC32_INST "crc32l %1, %0"
> >> +#define CRC32_INST __builtin_ia32_crc32si
> >>  #endif
> >
> >Do both gcc and clang consider these builtins to be a stable API, or do they
> >only guarantee the stability of _mm_crc32_*() from immintrin.h?  At least for
> >the rest of the SSE and AVX stuff, I thought that only the immintrin.h functions
> >are actually considered stable.
> >
> >- Eric
>
> There is that... also are there compiler versions that we support that do not have -mcrc32 support?
>
Checking GCC 5.1.0 and Clang 13.0.1, it seems that both support '-mcrc32'.

-bw
Re: [PATCH] x86/crc32: use builtins to improve code generation
Posted by Bill Wendling 9 months, 3 weeks ago
On Thu, Feb 27, 2025 at 4:17 AM Bill Wendling <morbo@google.com> wrote:
> On Thu, Feb 27, 2025 at 2:53 AM H. Peter Anvin <hpa@zytor.com> wrote:
> > On February 26, 2025 10:28:59 PM PST, Eric Biggers <ebiggers@kernel.org> wrote:
> > >On Wed, Feb 26, 2025 at 10:12:47PM -0800, Bill Wendling wrote:
> > >> For both gcc and clang, crc32 builtins generate better code than the
> > >> inline asm. GCC improves, removing unneeded "mov" instructions. Clang
> > >> does the same and unrolls the loops. GCC has no changes on i386, but
> > >> Clang's code generation is vastly improved, due to Clang's "rm"
> > >> constraint issue.
> > >>
> > >> The number of cycles improved by ~0.1% for GCC and ~1% for Clang, which
> > >> is expected because of the "rm" issue. However, Clang's performance is
> > >> better than GCC's by ~1.5%, most likely due to loop unrolling.
> > >>
> > >> Link: https://github.com/llvm/llvm-project/issues/20571#issuecomment-2649330009
> > >> Cc: Thomas Gleixner <tglx@linutronix.de>
> > >> Cc: Ingo Molnar <mingo@redhat.com>
> > >> Cc: Borislav Petkov <bp@alien8.de>
> > >> Cc: Dave Hansen <dave.hansen@linux.intel.com>
> > >> Cc: x86@kernel.org
> > >> Cc: "H. Peter Anvin" <hpa@zytor.com>
> > >> Cc: Eric Biggers <ebiggers@kernel.org>
> > >> Cc: Ard Biesheuvel <ardb@kernel.org>
> > >> Cc: Nathan Chancellor <nathan@kernel.org>
> > >> Cc: Nick Desaulniers <nick.desaulniers+lkml@gmail.com>
> > >> Cc: Justin Stitt <justinstitt@google.com>
> > >> Cc: linux-kernel@vger.kernel.org
> > >> Cc: linux-crypto@vger.kernel.org
> > >> Cc: llvm@lists.linux.dev
> > >> Signed-off-by: Bill Wendling <morbo@google.com>
> > >> ---
> > >>  arch/x86/Makefile         | 3 +++
> > >>  arch/x86/lib/crc32-glue.c | 8 ++++----
> > >>  2 files changed, 7 insertions(+), 4 deletions(-)
> > >
> > >Thanks!  A couple concerns, though:
> > >
> > >> diff --git a/arch/x86/Makefile b/arch/x86/Makefile
> > >> index 5b773b34768d..241436da1473 100644
> > >> --- a/arch/x86/Makefile
> > >> +++ b/arch/x86/Makefile
> > >> @@ -114,6 +114,9 @@ else
> > >>  KBUILD_CFLAGS += $(call cc-option,-fcf-protection=none)
> > >>  endif
> > >>
> > >> +# Enables the use of CRC32 builtins.
> > >> +KBUILD_CFLAGS += -mcrc32
> > >
> > >Doesn't this technically allow the compiler to insert CRC32 instructions
> > >anywhere in arch/x86/ without the needed runtime CPU feature check?  Normally
> > >when using intrinsics it's necessary to limit the scope of the feature
> > >enablement to match the runtime CPU feature check that is done, e.g. by using
> > >the target function attribute.
> > >
> > >> diff --git a/arch/x86/lib/crc32-glue.c b/arch/x86/lib/crc32-glue.c
> > >> index 2dd18a886ded..fdb94bff25f4 100644
> > >> --- a/arch/x86/lib/crc32-glue.c
> > >> +++ b/arch/x86/lib/crc32-glue.c
> > >> @@ -48,9 +48,9 @@ u32 crc32_le_arch(u32 crc, const u8 *p, size_t len)
> > >>  EXPORT_SYMBOL(crc32_le_arch);
> > >>
> > >>  #ifdef CONFIG_X86_64
> > >> -#define CRC32_INST "crc32q %1, %q0"
> > >> +#define CRC32_INST __builtin_ia32_crc32di
> > >>  #else
> > >> -#define CRC32_INST "crc32l %1, %0"
> > >> +#define CRC32_INST __builtin_ia32_crc32si
> > >>  #endif
> > >
> > >Do both gcc and clang consider these builtins to be a stable API, or do they
> > >only guarantee the stability of _mm_crc32_*() from immintrin.h?  At least for
> > >the rest of the SSE and AVX stuff, I thought that only the immintrin.h functions
> > >are actually considered stable.
> > >
> > >- Eric
> >
> > There is that... also are there compiler versions that we support that do not have -mcrc32 support?
> >
> Checking GCC 5.1.0 and Clang 13.0.1, it seems that both support '-mcrc32'.
>
I just checked, and GCC 5.1.0 doesn't appear to be able to compile the
kernel anymore, at least not with "defconfig". For one, it doesn't have
retpoline support, and it also can't compile lib/zstd:

lib/zstd/decompress/zstd_decompress_block.c: In function
‘ZSTD_decompressSequences_default’:
lib/zstd/decompress/zstd_decompress_block.c:1539:1: error: inlining
failed in call to always_inline ‘ZSTD_decompressSequences_body’:
optimization level attribute mismatch
 ZSTD_decompressSequences_body(ZSTD_DCtx* dctx,
 ^
lib/zstd/decompress/zstd_decompress_block.c:1633:12: error: called from here
     return ZSTD_decompressSequences_body(dctx, dst, maxDstSize,
seqStart, seqSize, nbSeq, isLongOffset, frame);
            ^

GCC 6.1.0 gets further, but also doesn't have retpoline support. Maybe
the minimum version should be changed?

Anyway, GCC 5.1.0 doesn't support
__attribute__((__target__("crc32"))), so I'd have to use the flag. I
know I can conditionally add the flag with:

CFLAGS_crc32-glue.o := -mcrc32

Like I said, though, the file is compiled twice (why?), but only one of
those builds goes through arch/x86/lib/Makefile. If anyone has any
suggestions on how to solve this, please let me know.

-bw
Re: [PATCH] x86/crc32: use builtins to improve code generation
Posted by Bill Wendling 9 months, 3 weeks ago
On Wed, Feb 26, 2025 at 10:29 PM Eric Biggers <ebiggers@kernel.org> wrote:
>
> On Wed, Feb 26, 2025 at 10:12:47PM -0800, Bill Wendling wrote:
> > For both gcc and clang, crc32 builtins generate better code than the
> > inline asm. GCC improves, removing unneeded "mov" instructions. Clang
> > does the same and unrolls the loops. GCC has no changes on i386, but
> > Clang's code generation is vastly improved, due to Clang's "rm"
> > constraint issue.
> >
> > The number of cycles improved by ~0.1% for GCC and ~1% for Clang, which
> > is expected because of the "rm" issue. However, Clang's performance is
> > better than GCC's by ~1.5%, most likely due to loop unrolling.
> >
> > Link: https://github.com/llvm/llvm-project/issues/20571#issuecomment-2649330009
> > Cc: Thomas Gleixner <tglx@linutronix.de>
> > Cc: Ingo Molnar <mingo@redhat.com>
> > Cc: Borislav Petkov <bp@alien8.de>
> > Cc: Dave Hansen <dave.hansen@linux.intel.com>
> > Cc: x86@kernel.org
> > Cc: "H. Peter Anvin" <hpa@zytor.com>
> > Cc: Eric Biggers <ebiggers@kernel.org>
> > Cc: Ard Biesheuvel <ardb@kernel.org>
> > Cc: Nathan Chancellor <nathan@kernel.org>
> > Cc: Nick Desaulniers <nick.desaulniers+lkml@gmail.com>
> > Cc: Justin Stitt <justinstitt@google.com>
> > Cc: linux-kernel@vger.kernel.org
> > Cc: linux-crypto@vger.kernel.org
> > Cc: llvm@lists.linux.dev
> > Signed-off-by: Bill Wendling <morbo@google.com>
> > ---
> >  arch/x86/Makefile         | 3 +++
> >  arch/x86/lib/crc32-glue.c | 8 ++++----
> >  2 files changed, 7 insertions(+), 4 deletions(-)
>
> Thanks!  A couple concerns, though:
>
> > diff --git a/arch/x86/Makefile b/arch/x86/Makefile
> > index 5b773b34768d..241436da1473 100644
> > --- a/arch/x86/Makefile
> > +++ b/arch/x86/Makefile
> > @@ -114,6 +114,9 @@ else
> >  KBUILD_CFLAGS += $(call cc-option,-fcf-protection=none)
> >  endif
> >
> > +# Enables the use of CRC32 builtins.
> > +KBUILD_CFLAGS += -mcrc32
>
> Doesn't this technically allow the compiler to insert CRC32 instructions
> anywhere in arch/x86/ without the needed runtime CPU feature check?  Normally
> when using intrinsics it's necessary to limit the scope of the feature
> enablement to match the runtime CPU feature check that is done, e.g. by using
> the target function attribute.
>
I'm not sure if CRC32 instructions will automatically be inserted when
not explicitly called, especially since the other vector features are
disabled. I wanted to limit enabling this flag for only crc32-glue.c,
but my Makefile-fu failed me. The file appears to be compiled twice.
But adding __attribute__((target("crc32"))) to the function would be
much better.

> > diff --git a/arch/x86/lib/crc32-glue.c b/arch/x86/lib/crc32-glue.c
> > index 2dd18a886ded..fdb94bff25f4 100644
> > --- a/arch/x86/lib/crc32-glue.c
> > +++ b/arch/x86/lib/crc32-glue.c
> > @@ -48,9 +48,9 @@ u32 crc32_le_arch(u32 crc, const u8 *p, size_t len)
> >  EXPORT_SYMBOL(crc32_le_arch);
> >
> >  #ifdef CONFIG_X86_64
> > -#define CRC32_INST "crc32q %1, %q0"
> > +#define CRC32_INST __builtin_ia32_crc32di
> >  #else
> > -#define CRC32_INST "crc32l %1, %0"
> > +#define CRC32_INST __builtin_ia32_crc32si
> >  #endif
>
> Do both gcc and clang consider these builtins to be a stable API, or do they
> only guarantee the stability of _mm_crc32_*() from immintrin.h?  At least for
> the rest of the SSE and AVX stuff, I thought that only the immintrin.h functions
> are actually considered stable.
>
I don't know the answer for this. In general, once we (Clang) create a
__builtin_* function it's not going away, because it will break anyone
who uses them. (I assume the same is true for GCC.) There's a note in
Documentation/arch/x86/x86_64/fsgs.rst in regards to using
_{read,write}fsbase_u64() from immintrin.h (see below). I don't know
if that's analogous to what I'm doing here, but maybe we should do
something similar for crc32intr.h?

FSGSBASE instructions compiler support
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

GCC version 4.6.4 and newer provide intrinsics for the FSGSBASE
instructions. Clang 5 supports them as well.

  =================== ===========================
  _readfsbase_u64()   Read the FS base register
  _readgsbase_u64()   Read the GS base register
  _writefsbase_u64()  Write the FS base register
  _writegsbase_u64()  Write the GS base register
  =================== ===========================

To utilize these intrinsics <immintrin.h> must be included in the source
code and the compiler option -mfsgsbase has to be added.


-bw
Re: [PATCH] x86/crc32: use builtins to improve code generation
Posted by Eric Biggers 9 months, 3 weeks ago
On Wed, Feb 26, 2025 at 11:08:22PM -0800, Bill Wendling wrote:
> > Doesn't this technically allow the compiler to insert CRC32 instructions
> > anywhere in arch/x86/ without the needed runtime CPU feature check?  Normally
> > when using intrinsics it's necessary to limit the scope of the feature
> > enablement to match the runtime CPU feature check that is done, e.g. by using
> > the target function attribute.
> >
> I'm not sure if CRC32 instructions will automatically be inserted when
> not explicitly called, especially since the other vector features are
> disabled. I wanted to limit enabling this flag for only crc32-glue.c,
> but my Makefile-fu failed me. The file appears to be compiled twice.
> But adding __attribute__((target("crc32"))) to the function would be
> much better.

Technically, limiting it to crc32-glue.c still isn't enough, as much of the code
in that file is executed before the crc32 instruction support is checked for.

I also noticed that -mcrc32 support wasn't added to clang until clang 14, by
https://github.com/llvm/llvm-project/commit/12fa608af44a80de8b655a8a984cd095908e7e80
But according to https://docs.kernel.org/process/changes.html the minimum clang
version to build Linux is 13.0.1.  So there's a missing check for support.
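
That check could be a cc-option test in the Makefile, or a version guard in
the code itself, roughly (untested, CRC32_BUILTINS_USABLE is a made-up name):

	/* clang didn't accept -mcrc32 until clang 14 */
	#if defined(__clang__) && __clang_major__ < 14
	#define CRC32_BUILTINS_USABLE	0
	#else
	#define CRC32_BUILTINS_USABLE	1
	#endif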

> > Do both gcc and clang consider these builtins to be a stable API, or do they
> > only guarantee the stability of _mm_crc32_*() from immintrin.h?  At least for
> > the rest of the SSE and AVX stuff, I thought that only the immintrin.h functions
> > are actually considered stable.
> >
> I don't know the answer for this. In general, once we (Clang) create a
> __builtin_* function it's not going away, because it will break anyone
> who uses them. (I assume the same is true for GCC.)

Here are examples of LLVM commits that removed x86 builtins:

* https://github.com/llvm/llvm-project/commit/09857a4bd166ca62a9610629731dfbf8f62cd955
* https://github.com/llvm/llvm-project/commit/9a14c369c422b244db78f1a9f947a891a75d912f
* https://github.com/llvm/llvm-project/commit/ec6024d0811b3116e0a29481b01179d5081a3b92
* https://github.com/llvm/llvm-project/commit/e4074432d5bf5c295f96eeed27c5b693f5b3bf16
* https://github.com/llvm/llvm-project/commit/9fddc3fd00b3ad5df5a3988e5cc4708254976173

So no, they do not appear to be considered stable.

(The equivalents in immintrin.h are stable, but good luck including immintrin.h
in the Linux kernel, since it depends on stdlib.h.)

Of course, if we really wanted this we could go with "it works in practice"
anyway.  But, given the small benefit of this patch vs. the potential risk I
don't think we should bother with it, unless it's acked by the gcc and clang
folks on the following points:

* The crc32 builtins are stable.

* gcc and clang will never generate crc32 instructions without explicitly using
  the builtins.  (BTW, keep in mind this ongoing work:
  https://gcc.gnu.org/wiki/cauldron2023talks?action=AttachFile&do=get&target=GCC+CRC+optimization.pdf)

Also note that crc32c_arch() already calls into the assembly code in
arch/x86/lib/crc32c-3way.S to handle lengths >= 512 bytes, and for handling the
tail data that assembly function already has a nice qword-at-a-time loop which
is exactly what we are trying to generate here.  A more promising approach might
be to reorganize things a bit so that we can reuse that assembly code.

- Eric
[PATCH v2] x86/crc32: use builtins to improve code generation
Posted by Bill Wendling 9 months, 3 weeks ago
For both gcc and clang, crc32 builtins generate better code than the
inline asm. GCC improves, removing unneeded "mov" instructions. Clang
does the same and unrolls the loops. GCC has no changes on i386, but
Clang's code generation is vastly improved, due to Clang's "rm"
constraint issue.

The number of cycles improved by ~0.1% for GCC and ~1% for Clang, which
is expected because of the "rm" issue. However, Clang's performance is
better than GCC's by ~1.5%, most likely due to loop unrolling.

Link: https://github.com/llvm/llvm-project/issues/20571#issuecomment-2649330009
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: x86@kernel.org
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Eric Biggers <ebiggers@kernel.org>
Cc: Ard Biesheuvel <ardb@kernel.org>
Cc: Nathan Chancellor <nathan@kernel.org>
Cc: Nick Desaulniers <nick.desaulniers+lkml@gmail.com>
Cc: Justin Stitt <justinstitt@google.com>
Cc: linux-kernel@vger.kernel.org
Cc: linux-crypto@vger.kernel.org
Cc: llvm@lists.linux.dev
Signed-off-by: Bill Wendling <morbo@google.com>
---
v2 - Limited range of '-mcrc32' usage to single file.
   - Use a function instead of macros.
---
 arch/x86/lib/Makefile     |  2 ++
 arch/x86/lib/crc32-glue.c | 15 ++++++++-------
 2 files changed, 10 insertions(+), 7 deletions(-)

diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile
index 8a59c61624c2..1251f611ce3d 100644
--- a/arch/x86/lib/Makefile
+++ b/arch/x86/lib/Makefile
@@ -14,6 +14,8 @@ ifdef CONFIG_KCSAN
 CFLAGS_REMOVE_delay.o = $(CC_FLAGS_FTRACE)
 endif

+CFLAGS_crc32-glue.o := -mcrc32
+
 inat_tables_script = $(srctree)/arch/x86/tools/gen-insn-attr-x86.awk
 inat_tables_maps = $(srctree)/arch/x86/lib/x86-opcode-map.txt
 quiet_cmd_inat_tables = GEN     $@
diff --git a/arch/x86/lib/crc32-glue.c b/arch/x86/lib/crc32-glue.c
index 2dd18a886ded..fc70462ae2c1 100644
--- a/arch/x86/lib/crc32-glue.c
+++ b/arch/x86/lib/crc32-glue.c
@@ -47,11 +47,12 @@ u32 crc32_le_arch(u32 crc, const u8 *p, size_t len)
 }
 EXPORT_SYMBOL(crc32_le_arch);

-#ifdef CONFIG_X86_64
-#define CRC32_INST "crc32q %1, %q0"
-#else
-#define CRC32_INST "crc32l %1, %0"
-#endif
+static unsigned long crc32_ul(u32 crc, unsigned long p)
+{
+       if (IS_ENABLED(CONFIG_X86_64))
+               return __builtin_ia32_crc32di(crc, p);
+       return __builtin_ia32_crc32si(crc, p);
+}

 /*
  * Use carryless multiply version of crc32c when buffer size is >= 512 to
@@ -78,10 +79,10 @@ u32 crc32c_le_arch(u32 crc, const u8 *p, size_t len)

        for (num_longs = len / sizeof(unsigned long);
             num_longs != 0; num_longs--, p += sizeof(unsigned long))
-               asm(CRC32_INST : "+r" (crc) : "rm" (*(unsigned long *)p));
+               crc = crc32_ul(crc,  *(unsigned long *)p);

        for (len %= sizeof(unsigned long); len; len--, p++)
-               asm("crc32b %1, %0" : "+r" (crc) : "rm" (*p));
+               crc = __builtin_ia32_crc32qi(crc, *p);

        return crc;
 }
-- 
2.48.1.711.g2feabab25a-goog
Re: [PATCH v2] x86/crc32: use builtins to improve code generation
Posted by David Laight 9 months, 3 weeks ago
On Thu, 27 Feb 2025 15:47:03 -0800
Bill Wendling <morbo@google.com> wrote:

> For both gcc and clang, crc32 builtins generate better code than the
> inline asm. GCC improves, removing unneeded "mov" instructions. Clang
> does the same and unrolls the loops. GCC has no changes on i386, but
> Clang's code generation is vastly improved, due to Clang's "rm"
> constraint issue.
> 
> The number of cycles improved by ~0.1% for GCC and ~1% for Clang, which
> is expected because of the "rm" issue. However, Clang's performance is
> better than GCC's by ~1.5%, most likely due to loop unrolling.

How much does it unroll?
How much you need depends on the latency of the crc32 instruction.
The copy of Agner's tables I have gives it a latency of 3 on
pretty much everything.
If you can only do one chained crc instruction every three clocks
it is hard to see how unrolling the loop will help.
Intel cpu (since sandy bridge) will run a two clock loop.
With three clocks to play with it should be easy (even for a compiler)
to generate a loop with no extra clock stalls.

Clearly if Clang decides to copy arguments to the stack an extra time
that will kill things. But in this case you want the "m" constraint
to directly read from the buffer (with a (reg,reg,8) addressing mode).

	David
Re: [PATCH v2] x86/crc32: use builtins to improve code generation
Posted by Bill Wendling 9 months, 3 weeks ago
On Mon, Mar 3, 2025 at 12:15 PM David Laight
<david.laight.linux@gmail.com> wrote:
> On Thu, 27 Feb 2025 15:47:03 -0800
> Bill Wendling <morbo@google.com> wrote:
>
> > For both gcc and clang, crc32 builtins generate better code than the
> > inline asm. GCC improves, removing unneeded "mov" instructions. Clang
> > does the same and unrolls the loops. GCC has no changes on i386, but
> > Clang's code generation is vastly improved, due to Clang's "rm"
> > constraint issue.
> >
> > The number of cycles improved by ~0.1% for GCC and ~1% for Clang, which
> > is expected because of the "rm" issue. However, Clang's performance is
> > better than GCC's by ~1.5%, most likely due to loop unrolling.
>
> How much does it unroll?
> How much you need depends on the latency of the crc32 instruction.
> The copy of Agner's tables I have gives it a latency of 3 on
> pretty much everything.
> If you can only do one chained crc instruction every three clocks
> it is hard to see how unrolling the loop will help.
> Intel cpu (since sandy bridge) will run a two clock loop.
> With three clocks to play with it should be easy (even for a compiler)
> to generate a loop with no extra clock stalls.
>
> Clearly if Clang decides to copy arguments to the stack an extra time
> that will kill things. But in this case you want the "m" constraint
> to directly read from the buffer (with a (reg,reg,8) addressing mode).
>
Below is what Clang generates with the builtins. From what Eric said,
this code is only run for sizes <= 512 bytes? So maybe it's not super
important to micro-optimize this. I apologize, but my ability to
measure clock loops for x86 code isn't great. (I'm sure I lack the
requisite benchmarks, etc.)

-bw

.LBB1_9:                                # =>This Inner Loop Header: Depth=1
        movl    %ebx, %ebx
        crc32q  (%rcx), %rbx
        addq    $8, %rcx
        incq    %rdi
        cmpq    %rdi, %rsi
        jne     .LBB1_9
# %bb.10:
        subq    %rdi, %rax
        jmp     .LBB1_11
.LBB1_7:
        movq    %r14, %rcx
.LBB1_11:
        movq    %r15, %rsi
        andq    $-8, %rsi
        cmpq    $7, %rdx
        jb      .LBB1_14
# %bb.12:
        xorl    %edx, %edx
.LBB1_13:                               # =>This Inner Loop Header: Depth=1
        movl    %ebx, %ebx
        crc32q  (%rcx,%rdx,8), %rbx
        crc32q  8(%rcx,%rdx,8), %rbx
        crc32q  16(%rcx,%rdx,8), %rbx
        crc32q  24(%rcx,%rdx,8), %rbx
        crc32q  32(%rcx,%rdx,8), %rbx
        crc32q  40(%rcx,%rdx,8), %rbx
        crc32q  48(%rcx,%rdx,8), %rbx
        crc32q  56(%rcx,%rdx,8), %rbx
        addq    $8, %rdx
        cmpq    %rdx, %rax
        jne     .LBB1_13
.LBB1_14:
        addq    %rsi, %r14
.LBB1_15:
        andq    $7, %r15
        je      .LBB1_23
# %bb.16:
        crc32b  (%r14), %ebx
        cmpl    $1, %r15d
        je      .LBB1_23
# %bb.17:
        crc32b  1(%r14), %ebx
        cmpl    $2, %r15d
        je      .LBB1_23
# %bb.18:
        crc32b  2(%r14), %ebx
        cmpl    $3, %r15d
        je      .LBB1_23
# %bb.19:
        crc32b  3(%r14), %ebx
        cmpl    $4, %r15d
        je      .LBB1_23
# %bb.20:
        crc32b  4(%r14), %ebx
        cmpl    $5, %r15d
        je      .LBB1_23
# %bb.21:
        crc32b  5(%r14), %ebx
        cmpl    $6, %r15d
        je      .LBB1_23
# %bb.22:
        crc32b  6(%r14), %ebx
.LBB1_23:
        movl    %ebx, %eax
.LBB1_24:
Re: [PATCH v2] x86/crc32: use builtins to improve code generation
Posted by David Laight 9 months, 2 weeks ago
On Mon, 3 Mar 2025 12:27:21 -0800
Bill Wendling <morbo@google.com> wrote:

> On Mon, Mar 3, 2025 at 12:15 PM David Laight
> <david.laight.linux@gmail.com> wrote:
> > On Thu, 27 Feb 2025 15:47:03 -0800
> > Bill Wendling <morbo@google.com> wrote:
> >  
> > > For both gcc and clang, crc32 builtins generate better code than the
> > > inline asm. GCC improves, removing unneeded "mov" instructions. Clang
> > > does the same and unrolls the loops. GCC has no changes on i386, but
> > > Clang's code generation is vastly improved, due to Clang's "rm"
> > > constraint issue.
> > >
> > > The number of cycles improved by ~0.1% for GCC and ~1% for Clang, which
> > > is expected because of the "rm" issue. However, Clang's performance is
> > > better than GCC's by ~1.5%, most likely due to loop unrolling.  
> >
> > How much does it unroll?
> > How much you need depends on the latency of the crc32 instruction.
> > The copy of Agner's tables I have gives it a latency of 3 on
> > pretty much everything.
> > If you can only do one chained crc instruction every three clocks
> > it is hard to see how unrolling the loop will help.
> > Intel cpu (since sandy bridge) will run a two clock loop.
> > With three clocks to play with it should be easy (even for a compiler)
> > to generate a loop with no extra clock stalls.
> >
> > Clearly if Clang decides to copy arguments to the stack an extra time
> > that will kill things. But in this case you want the "m" constraint
> > to directly read from the buffer (with a (reg,reg,8) addressing mode).
> >  
> Below is what Clang generates with the builtins. From what Eric said,
> this code is only run for sizes <= 512 bytes? So maybe it's not super
> important to micro-optimize this. I apologize, but my ability to
> measure clock loops for x86 code isn't great. (I'm sure I lack the
> requisite benchmarks, etc.)

Jeepers - that is trashing the I-cache.
Not to mention all the conditional branches at the bottom.
Consider the basic loop:
1:	crc32q	(%rcx), %rbx
	addq	$8, %rcx
	cmp	%rcx, %rdx
	jne	1b
The crc32 has latency 3 so it must take at least 3 clocks.
Even naively the addq can be issued in the same clock as the crc32
and the cmp and jne in the following ones.
Since the jne is predicted taken, the addq can be assumed to execute
in the same clock as the jne.
(The cmp+jne might also get merged into a single u-op)
(I've done this with adc (for IP checksum), with two adc the loop takes
two clocks even with the extra memory reads.)

So that loop is likely to run limited by the three clock latency of crc32.
Even the memory reads will happen with all the crc32 just waiting for the
previous crc32 to finish.
You can take an instruction out of the loop:
1:	crc32q	(%rcx,%rdx), %rbx
	addq	$8, %rdx
	jne	1b
but that may not be necessary, and (IIRC) gcc doesn't like letting you
generate it.

For buffers that aren't multiples of 8 bytes 'remember' that the crc of
a byte depends on how far it is from the end of the buffer, and that initial
zero bytes have no effect.
So (provided the buffer is 8+ bytes long) read the first 8 bytes, shift
right by the number of bytes needed to make the rest of the buffer a multiple
of 8 bytes (the same as reading from across the start of the buffer and masking
the low bytes) then treat exactly the same as a buffer that is a multiple
of 8 bytes long.
Don't worry about misaligned reads, you lose less than one clock per cache
line (that is with adc doing a read every clock).

Actually measuring the performance is hard.
You can't just use rdtsc because the clock speed will change when the cpu gets busy.
There is a 'performance counter' that is actual clocks.
While you can use the library functions to set it up, you need to just read the
register - the library overhead is too big.
You also need the odd lfence.
Having done that, and provided the buffer is in the L1 d-cache you can measure
the loop time in clocks and compare against the expected value.
Once you've got 3 clocks per crc32 instruction it won't get any better,
which is why the 'fast' code for big buffers does crc of 3+ buffer sections
in parallel.

	David

> 
> -bw
> 
> .LBB1_9:                                # =>This Inner Loop Header: Depth=1
>         movl    %ebx, %ebx
>         crc32q  (%rcx), %rbx
>         addq    $8, %rcx
>         incq    %rdi
>         cmpq    %rdi, %rsi
>         jne     .LBB1_9
> # %bb.10:
>         subq    %rdi, %rax
>         jmp     .LBB1_11
> .LBB1_7:
>         movq    %r14, %rcx
> .LBB1_11:
>         movq    %r15, %rsi
>         andq    $-8, %rsi
>         cmpq    $7, %rdx
>         jb      .LBB1_14
> # %bb.12:
>         xorl    %edx, %edx
> .LBB1_13:                               # =>This Inner Loop Header: Depth=1
>         movl    %ebx, %ebx
>         crc32q  (%rcx,%rdx,8), %rbx
>         crc32q  8(%rcx,%rdx,8), %rbx
>         crc32q  16(%rcx,%rdx,8), %rbx
>         crc32q  24(%rcx,%rdx,8), %rbx
>         crc32q  32(%rcx,%rdx,8), %rbx
>         crc32q  40(%rcx,%rdx,8), %rbx
>         crc32q  48(%rcx,%rdx,8), %rbx
>         crc32q  56(%rcx,%rdx,8), %rbx
>         addq    $8, %rdx
>         cmpq    %rdx, %rax
>         jne     .LBB1_13
> .LBB1_14:
>         addq    %rsi, %r14
> .LBB1_15:
>         andq    $7, %r15
>         je      .LBB1_23
> # %bb.16:
>         crc32b  (%r14), %ebx
>         cmpl    $1, %r15d
>         je      .LBB1_23
> # %bb.17:
>         crc32b  1(%r14), %ebx
>         cmpl    $2, %r15d
>         je      .LBB1_23
> # %bb.18:
>         crc32b  2(%r14), %ebx
>         cmpl    $3, %r15d
>         je      .LBB1_23
> # %bb.19:
>         crc32b  3(%r14), %ebx
>         cmpl    $4, %r15d
>         je      .LBB1_23
> # %bb.20:
>         crc32b  4(%r14), %ebx
>         cmpl    $5, %r15d
>         je      .LBB1_23
> # %bb.21:
>         crc32b  5(%r14), %ebx
>         cmpl    $6, %r15d
>         je      .LBB1_23
> # %bb.22:
>         crc32b  6(%r14), %ebx
> .LBB1_23:
>         movl    %ebx, %eax
> .LBB1_24:
Re: [PATCH v2] x86/crc32: use builtins to improve code generation
Posted by H. Peter Anvin 9 months, 2 weeks ago
On March 3, 2025 2:42:16 PM PST, David Laight <david.laight.linux@gmail.com> wrote:
>On Mon, 3 Mar 2025 12:27:21 -0800
>Bill Wendling <morbo@google.com> wrote:
>
>> On Mon, Mar 3, 2025 at 12:15 PM David Laight
>> <david.laight.linux@gmail.com> wrote:
>> > On Thu, 27 Feb 2025 15:47:03 -0800
>> > Bill Wendling <morbo@google.com> wrote:
>> >  
>> > > For both gcc and clang, crc32 builtins generate better code than the
>> > > inline asm. GCC improves, removing unneeded "mov" instructions. Clang
>> > > does the same and unrolls the loops. GCC has no changes on i386, but
>> > > Clang's code generation is vastly improved, due to Clang's "rm"
>> > > constraint issue.
>> > >
>> > > The number of cycles improved by ~0.1% for GCC and ~1% for Clang, which
>> > > is expected because of the "rm" issue. However, Clang's performance is
>> > > better than GCC's by ~1.5%, most likely due to loop unrolling.  
>> >
>> > How much does it unroll?
>> > How much you need depends on the latency of the crc32 instruction.
>> > The copy of Agner's tables I have gives it a latency of 3 on
>> > pretty much everything.
>> > If you can only do one chained crc instruction every three clocks
>> > it is hard to see how unrolling the loop will help.
>> > Intel cpu (since sandy bridge) will run a two clock loop.
>> > With three clocks to play with it should be easy (even for a compiler)
>> > to generate a loop with no extra clock stalls.
>> >
>> > Clearly if Clang decides to copy arguments to the stack an extra time
>> > that will kill things. But in this case you want the "m" constraint
>> > to directly read from the buffer (with a (reg,reg,8) addressing mode).
>> >  
>> Below is what Clang generates with the builtins. From what Eric said,
>> this code is only run for sizes <= 512 bytes? So maybe it's not super
>> important to micro-optimize this. I apologize, but my ability to
>> measure clock loops for x86 code isn't great. (I'm sure I lack the
>> requisite benchmarks, etc.)
>
>Jeepers - that is trashing the I-cache.
>Not to mention all the conditional branches at the bottom.
>Consider the basic loop:
>1:	crc32q	(%rcx), %rbx
>	addq	$8, %rcx
>	cmp	%rcx, %rdx
>	jne	1b
>The crc32 has latency 3 so it must take at least 3 clocks.
>Even naively the addq can be issued in the same clock as the crc32
>and the cmp and jne in the following ones.
>Since the jne is predicted taken, the addq can be assumed to execute
>in the same clock as the jne.
>(The cmp+jne might also get merged into a single u-op)
>(I've done this with adc (for IP checksum), with two adc the loop takes
>two clocks even with the extra memory reads.)
>
>So that loop is likely to run limited by the three clock latency of crc32.
>Even the memory reads will happen with all the crc32 just waiting for the
>previous crc32 to finish.
>You can take an instruction out of the loop:
>1:	crc32q	(%rcx,%rdx), %rbx
>	addq	$8, %rdx
>	jne	1b
>but that may not be necessary, and (IIRC) gcc doesn't like letting you
>generate it.
>
>For buffers that aren't multiples of 8 bytes 'remember' that the crc of
>a byte depends on how far it is from the end of the buffer, and that initial
>zero bytes have no effect.
>So (provided the buffer is 8+ bytes long) read the first 8 bytes, shift
>right by the number of bytes needed to make the rest of the buffer a multiple
>or 8 bytes (the same as reading from across the start of the buffer and masking
>the low bytes) then treat exactly the same as a buffer that is a multiple
>of 8 bytes long.
>Don't worry about misaligned reads, you lose less than one clock per cache
>line (that is with adc doing a read every clock).
>
>Actually measuring the performance is hard.
>You can use rdtsc because the clock speed will change when the cpu gets busy.
>There is a 'performance counter' that is actual clocks.
>While you can use the library functions to set it up, you need to just read the
>register - the library overhead it too big.
>You also need the odd lfence.
>Having done that, and provided the buffer is in the L1 d-cache you can measure
>the loop time in clocks and compare against the expected value.
>Once you've got 3 clocks per crc32 instruction it won't get any better,
>which is why the 'fast' code for big buffers does crc of 3+ buffers sections
>in parallel.
>
>	David
>
>> 
>> -bw
>> 
>> .LBB1_9:                                # =>This Inner Loop Header: Depth=1
>>         movl    %ebx, %ebx
>>         crc32q  (%rcx), %rbx
>>         addq    $8, %rcx
>>         incq    %rdi
>>         cmpq    %rdi, %rsi
>>         jne     .LBB1_9
>> # %bb.10:
>>         subq    %rdi, %rax
>>         jmp     .LBB1_11
>> .LBB1_7:
>>         movq    %r14, %rcx
>> .LBB1_11:
>>         movq    %r15, %rsi
>>         andq    $-8, %rsi
>>         cmpq    $7, %rdx
>>         jb      .LBB1_14
>> # %bb.12:
>>         xorl    %edx, %edx
>> .LBB1_13:                               # =>This Inner Loop Header: Depth=1
>>         movl    %ebx, %ebx
>>         crc32q  (%rcx,%rdx,8), %rbx
>>         crc32q  8(%rcx,%rdx,8), %rbx
>>         crc32q  16(%rcx,%rdx,8), %rbx
>>         crc32q  24(%rcx,%rdx,8), %rbx
>>         crc32q  32(%rcx,%rdx,8), %rbx
>>         crc32q  40(%rcx,%rdx,8), %rbx
>>         crc32q  48(%rcx,%rdx,8), %rbx
>>         crc32q  56(%rcx,%rdx,8), %rbx
>>         addq    $8, %rdx
>>         cmpq    %rdx, %rax
>>         jne     .LBB1_13
>> .LBB1_14:
>>         addq    %rsi, %r14
>> .LBB1_15:
>>         andq    $7, %r15
>>         je      .LBB1_23
>> # %bb.16:
>>         crc32b  (%r14), %ebx
>>         cmpl    $1, %r15d
>>         je      .LBB1_23
>> # %bb.17:
>>         crc32b  1(%r14), %ebx
>>         cmpl    $2, %r15d
>>         je      .LBB1_23
>> # %bb.18:
>>         crc32b  2(%r14), %ebx
>>         cmpl    $3, %r15d
>>         je      .LBB1_23
>> # %bb.19:
>>         crc32b  3(%r14), %ebx
>>         cmpl    $4, %r15d
>>         je      .LBB1_23
>> # %bb.20:
>>         crc32b  4(%r14), %ebx
>>         cmpl    $5, %r15d
>>         je      .LBB1_23
>> # %bb.21:
>>         crc32b  5(%r14), %ebx
>>         cmpl    $6, %r15d
>>         je      .LBB1_23
>> # %bb.22:
>>         crc32b  6(%r14), %ebx
>> .LBB1_23:
>>         movl    %ebx, %eax
>> .LBB1_24:
>
>

The tail is *weird*. Wouldn't it be better to do a 4-2-1 stepdown?
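
I.e. something like this for the 0..7 leftover bytes, with p/len already past
the qword loop (untested, just to show the shape):

	if (len & 4) {
		crc = __builtin_ia32_crc32si(crc, *(const u32 *)p);
		p += 4;
	}
	if (len & 2) {
		crc = __builtin_ia32_crc32hi(crc, *(const u16 *)p);
		p += 2;
	}
	if (len & 1)
		crc = __builtin_ia32_crc32qi(crc, *p);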
Re: [PATCH v2] x86/crc32: use builtins to improve code generation
Posted by Bill Wendling 9 months, 2 weeks ago
On Mon, Mar 3, 2025 at 3:58 PM H. Peter Anvin <hpa@zytor.com> wrote:
> On March 3, 2025 2:42:16 PM PST, David Laight <david.laight.linux@gmail.com> wrote:
> >On Mon, 3 Mar 2025 12:27:21 -0800
> >Bill Wendling <morbo@google.com> wrote:
> >
> >> On Mon, Mar 3, 2025 at 12:15 PM David Laight
> >> <david.laight.linux@gmail.com> wrote:
> >> > On Thu, 27 Feb 2025 15:47:03 -0800
> >> > Bill Wendling <morbo@google.com> wrote:
> >> >
> >> > > For both gcc and clang, crc32 builtins generate better code than the
> >> > > inline asm. GCC improves, removing unneeded "mov" instructions. Clang
> >> > > does the same and unrolls the loops. GCC has no changes on i386, but
> >> > > Clang's code generation is vastly improved, due to Clang's "rm"
> >> > > constraint issue.
> >> > >
> >> > > The number of cycles improved by ~0.1% for GCC and ~1% for Clang, which
> >> > > is expected because of the "rm" issue. However, Clang's performance is
> >> > > better than GCC's by ~1.5%, most likely due to loop unrolling.
> >> >
> >> > How much does it unroll?
> >> > How much you need depends on the latency of the crc32 instruction.
> >> > The copy of Agner's tables I have gives it a latency of 3 on
> >> > pretty much everything.
> >> > If you can only do one chained crc instruction every three clocks
> >> > it is hard to see how unrolling the loop will help.
> >> > Intel cpu (since sandy bridge) will run a two clock loop.
> >> > With three clocks to play with it should be easy (even for a compiler)
> >> > to generate a loop with no extra clock stalls.
> >> >
> >> > Clearly if Clang decides to copy arguments to the stack an extra time
> >> > that will kill things. But in this case you want the "m" constraint
> >> > to directly read from the buffer (with a (reg,reg,8) addressing mode).
> >> >
> >> Below is what Clang generates with the builtins. From what Eric said,
> >> this code is only run for sizes <= 512 bytes? So maybe it's not super
> >> important to micro-optimize this. I apologize, but my ability to
> >> measure clock loops for x86 code isn't great. (I'm sure I lack the
> >> requisite benchmarks, etc.)
> >
> >Jeepers - that is trashing the I-cache.
> >Not to mention all the conditional branches at the bottom.
> >Consider the basic loop:
> >1:     crc32q  (%rcx), %rbx
> >       addq    $8, %rcx
> >       cmp     %rcx, %rdx
> >       jne     1b
> >The crc32 has latency 3 so it must take at least 3 clocks.
> >Even naively the addq can be issued in the same clock as the crc32
> >and the cmp and jne in the following ones.
> >Since the jne is predicted taken, the addq can be assumed to execute
> >in the same clock as the jne.
> >(The cmp+jne might also get merged into a single u-op)
> >(I've done this with adc (for IP checksum), with two adc the loop takes
> >two clocks even with the extra memory reads.)
> >
> >So that loop is likely to run limited by the three clock latency of crc32.
> >Even the memory reads will happen with all the crc32 just waiting for the
> >previous crc32 to finish.
> >You can take an instruction out of the loop:
> >1:     crc32q  (%rcx,%rdx), %rbx
> >       addq    $8, %rdx
> >       jne     1b
> >but that may not be necessary, and (IIRC) gcc doesn't like letting you
> >generate it.
> >
> >For buffers that aren't multiples of 8 bytes 'remember' that the crc of
> >a byte depends on how far it is from the end of the buffer, and that initial
> >zero bytes have no effect.
> >So (provided the buffer is 8+ bytes long) read the first 8 bytes, shift
> >right by the number of bytes needed to make the rest of the buffer a multiple
> >or 8 bytes (the same as reading from across the start of the buffer and masking
> >the low bytes) then treat exactly the same as a buffer that is a multiple
> >of 8 bytes long.
> >Don't worry about misaligned reads, you lose less than one clock per cache
> >line (that is with adc doing a read every clock).
> >
For reference, GCC does much better with code gen, but only with the builtin:

.L39:
        crc32q  (%rax), %rbx    # MEM[(long unsigned int *)p_40], tmp120
        addq    $8, %rax        #, p
        cmpq    %rcx, %rax      # _37, p
        jne     .L39    #,
        leaq    (%rsi,%rdi,8), %rsi     #, p
.L38:
        andl    $7, %edx        #, len
        je      .L41    #,
        addq    %rsi, %rdx      # p, _11
        movl    %ebx, %eax      # crc, <retval>
        .p2align 4
.L40:
        crc32b  (%rsi), %eax    # MEM[(const u8 *)p_45], <retval>
        addq    $1, %rsi        #, p
        cmpq    %rsi, %rdx      # p, _11
        jne     .L40    #,

> >Actually measuring the performance is hard.
> >You can use rdtsc because the clock speed will change when the cpu gets busy.
> >There is a 'performance counter' that is actual clocks.
> >While you can use the library functions to set it up, you need to just read the
> >register - the library overhead it too big.
> >You also need the odd lfence.
> >Having done that, and provided the buffer is in the L1 d-cache you can measure
> >the loop time in clocks and compare against the expected value.
> >Once you've got 3 clocks per crc32 instruction it won't get any better,
> >which is why the 'fast' code for big buffers does crc of 3+ buffers sections
> >in parallel.
> >
Thanks for the info! It'll help a lot the next time I need to delve
deeply into performance.

I tried using rdtsc and another programmatic way of measuring timing.
Also tried making the task have high priority, restricting to one CPU,
etc. But the numbers weren't as consistent as I wanted them to be. The
times I reported were based on the fastest times / clocks /
whatever from several runs for each build.

> >       David
> >
> >>
> >> -bw
> >>
> >> .LBB1_9:                                # =>This Inner Loop Header: Depth=1
> >>         movl    %ebx, %ebx
> >>         crc32q  (%rcx), %rbx
> >>         addq    $8, %rcx
> >>         incq    %rdi
> >>         cmpq    %rdi, %rsi
> >>         jne     .LBB1_9
> >> # %bb.10:
> >>         subq    %rdi, %rax
> >>         jmp     .LBB1_11
> >> .LBB1_7:
> >>         movq    %r14, %rcx
> >> .LBB1_11:
> >>         movq    %r15, %rsi
> >>         andq    $-8, %rsi
> >>         cmpq    $7, %rdx
> >>         jb      .LBB1_14
> >> # %bb.12:
> >>         xorl    %edx, %edx
> >> .LBB1_13:                               # =>This Inner Loop Header: Depth=1
> >>         movl    %ebx, %ebx
> >>         crc32q  (%rcx,%rdx,8), %rbx
> >>         crc32q  8(%rcx,%rdx,8), %rbx
> >>         crc32q  16(%rcx,%rdx,8), %rbx
> >>         crc32q  24(%rcx,%rdx,8), %rbx
> >>         crc32q  32(%rcx,%rdx,8), %rbx
> >>         crc32q  40(%rcx,%rdx,8), %rbx
> >>         crc32q  48(%rcx,%rdx,8), %rbx
> >>         crc32q  56(%rcx,%rdx,8), %rbx
> >>         addq    $8, %rdx
> >>         cmpq    %rdx, %rax
> >>         jne     .LBB1_13
> >> .LBB1_14:
> >>         addq    %rsi, %r14
> >> .LBB1_15:
> >>         andq    $7, %r15
> >>         je      .LBB1_23
> >> # %bb.16:
> >>         crc32b  (%r14), %ebx
> >>         cmpl    $1, %r15d
> >>         je      .LBB1_23
> >> # %bb.17:
> >>         crc32b  1(%r14), %ebx
> >>         cmpl    $2, %r15d
> >>         je      .LBB1_23
> >> # %bb.18:
> >>         crc32b  2(%r14), %ebx
> >>         cmpl    $3, %r15d
> >>         je      .LBB1_23
> >> # %bb.19:
> >>         crc32b  3(%r14), %ebx
> >>         cmpl    $4, %r15d
> >>         je      .LBB1_23
> >> # %bb.20:
> >>         crc32b  4(%r14), %ebx
> >>         cmpl    $5, %r15d
> >>         je      .LBB1_23
> >> # %bb.21:
> >>         crc32b  5(%r14), %ebx
> >>         cmpl    $6, %r15d
> >>         je      .LBB1_23
> >> # %bb.22:
> >>         crc32b  6(%r14), %ebx
> >> .LBB1_23:
> >>         movl    %ebx, %eax
> >> .LBB1_24:
> >
> >
>
> The tail is *weird*. Wouldn't it be better to do a 4-2-1 stepdown?

Definitely on the weird side! I considered hard-coding something like
that, but thought it might be a bit convoluted, though certainly less
convoluted than what we generate now. A simple loop is probably all
that's needed, because it should only need to be done at most seven
times.

-bw
Re: [PATCH v2] x86/crc32: use builtins to improve code generation
Posted by David Laight 9 months, 2 weeks ago
On Mon, 3 Mar 2025 16:16:43 -0800
Bill Wendling <morbo@google.com> wrote:

> On Mon, Mar 3, 2025 at 3:58 PM H. Peter Anvin <hpa@zytor.com> wrote:
> > On March 3, 2025 2:42:16 PM PST, David Laight <david.laight.linux@gmail.com> wrote:  
> > >On Mon, 3 Mar 2025 12:27:21 -0800
> > >Bill Wendling <morbo@google.com> wrote:
> > >  
> > >> On Mon, Mar 3, 2025 at 12:15 PM David Laight
> > >> <david.laight.linux@gmail.com> wrote:  
> > >> > On Thu, 27 Feb 2025 15:47:03 -0800
> > >> > Bill Wendling <morbo@google.com> wrote:
> > >> >  
> > >> > > For both gcc and clang, crc32 builtins generate better code than the
> > >> > > inline asm. GCC improves, removing unneeded "mov" instructions. Clang
> > >> > > does the same and unrolls the loops. GCC has no changes on i386, but
> > >> > > Clang's code generation is vastly improved, due to Clang's "rm"
> > >> > > constraint issue.
> > >> > >
> > >> > > The number of cycles improved by ~0.1% for GCC and ~1% for Clang, which
> > >> > > is expected because of the "rm" issue. However, Clang's performance is
> > >> > > better than GCC's by ~1.5%, most likely due to loop unrolling.  
> > >> >
> > >> > How much does it unroll?
> > >> > How much you need depends on the latency of the crc32 instruction.
> > >> > The copy of Agner's tables I have gives it a latency of 3 on
> > >> > pretty much everything.
> > >> > If you can only do one chained crc instruction every three clocks
> > >> > it is hard to see how unrolling the loop will help.
> > >> > Intel cpu (since sandy bridge) will run a two clock loop.
> > >> > With three clocks to play with it should be easy (even for a compiler)
> > >> > to generate a loop with no extra clock stalls.
> > >> >
> > >> > Clearly if Clang decides to copy arguments to the stack an extra time
> > >> > that will kill things. But in this case you want the "m" constraint
> > >> > to directly read from the buffer (with a (reg,reg,8) addressing mode).
> > >> >  
> > >> Below is what Clang generates with the builtins. From what Eric said,
> > >> this code is only run for sizes <= 512 bytes? So maybe it's not super
> > >> important to micro-optimize this. I apologize, but my ability to
> > >> measure clock loops for x86 code isn't great. (I'm sure I lack the
> > >> requisite benchmarks, etc.)  
> > >
> > >Jeepers - that is trashing the I-cache.
> > >Not to mention all the conditional branches at the bottom.
> > >Consider the basic loop:
> > >1:     crc32q  (%rcx), %rbx
> > >       addq    $8, %rcx
> > >       cmp     %rcx, %rdx
> > >       jne     1b
> > >The crc32 has latency 3 so it must take at least 3 clocks.
> > >Even naively the addq can be issued in the same clock as the crc32
> > >and the cmp and jne in the following ones.
> > >Since the jne is predicted taken, the addq can be assumed to execute
> > >in the same clock as the jne.
> > >(The cmp+jne might also get merged into a single u-op)
> > >(I've done this with adc (for IP checksum), with two adc the loop takes
> > >two clocks even with the extra memory reads.)
> > >
> > >So that loop is likely to run limited by the three clock latency of crc32.
> > >Even the memory reads will happen with all the crc32 just waiting for the
> > >previous crc32 to finish.
> > >You can take an instruction out of the loop:
> > >1:     crc32q  (%rcx,%rdx), %rbx
> > >       addq    $8, %rdx
> > >       jne     1b
> > >but that may not be necessary, and (IIRC) gcc doesn't like letting you
> > >generate it.
> > >
> > >For buffers that aren't multiples of 8 bytes 'remember' that the crc of
> > >a byte depends on how far it is from the end of the buffer, and that initial
> > >zero bytes have no effect.
> > >So (provided the buffer is 8+ bytes long) read the first 8 bytes, shift
> > >right by the number of bytes needed to make the rest of the buffer a multiple
> > >of 8 bytes (the same as reading from across the start of the buffer and masking
> > >the low bytes) then treat exactly the same as a buffer that is a multiple
> > >of 8 bytes long.
> > >Don't worry about misaligned reads, you lose less than one clock per cache
> > >line (that is with adc doing a read every clock).
> > >  
> For reference, GCC does much better with code gen, but only with the builtin:
> 
> .L39:
>         crc32q  (%rax), %rbx    # MEM[(long unsigned int *)p_40], tmp120
>         addq    $8, %rax        #, p
>         cmpq    %rcx, %rax      # _37, p
>         jne     .L39    #,

That looks reasonable, if Clang's 8 unrolled crc32q is faster per byte
then you either need to unroll once (no point doing any more) or use
the loop that does negative offsets from the end.
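
One way to write that negative-offset loop in C is sketched below
(hand-written and untested, using the crc32 builtin and kernel types;
whether the compiler actually keeps the (reg,reg) addressing mode is
another matter):

static u32 crc32c_by_8(u32 crc, const u8 *p, size_t len)
{
        const u8 *end = p + len;
        long i;

        /* len must be a multiple of 8; the index update supplies the
         * loop condition, so no separate cmp is needed */
        for (i = -(long)len; i != 0; i += 8)
                crc = __builtin_ia32_crc32di(crc, *(const u64 *)(end + i));
        return crc;
}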

>         leaq    (%rsi,%rdi,8), %rsi     #, p

That is gcc being brain-dead again.
It pretty much refuses to use a loop-updated pointer (%rax above)
and recalculates it from the count.
At least it is a single instruction here, and the extra register
doesn't cause a spill to the stack.

> .L38:
>         andl    $7, %edx        #, len
>         je      .L41    #,
>         addq    %rsi, %rdx      # p, _11
>         movl    %ebx, %eax      # crc, <retval>
>         .p2align 4
> .L40:
>         crc32b  (%rsi), %eax    # MEM[(const u8 *)p_45], <retval>
>         addq    $1, %rsi        #, p
>         cmpq    %rsi, %rdx      # p, _11
>         jne     .L40    #,
> 
> > >Actually measuring the performance is hard.
> > >You can't use rdtsc because the clock speed will change when the cpu gets busy.
> > >There is a 'performance counter' that is actual clocks.
> > >While you can use the library functions to set it up, you need to just read the
> > >register - the library overhead is too big.
> > >You also need the odd lfence.
> > >Having done that, and provided the buffer is in the L1 d-cache you can measure
> > >the loop time in clocks and compare against the expected value.
> > >Once you've got 3 clocks per crc32 instruction it won't get any better,
> > >which is why the 'fast' code for big buffers does crc of 3+ buffer sections
> > >in parallel.
> > >  
> Thanks for the info! It'll help a lot the next time I need to delve
> deeply into performance.
> 
> I tried using rdtsc and another programmatic way of measuring timing.
> Also tried making the task have high priority, restricting to one CPU,
> etc. But the numbers weren't as consistent as I wanted them to be. The
> times I reported were based on the fastest times / clocks /
> whatever from several runs for each build.

I'll find the code loop I use - machine isn't powered on at the moment.

> 
> > >       David
> > >  
> > >>
> > >> -bw
> > >>
> > >> .LBB1_9:                                # =>This Inner Loop Header: Depth=1
> > >>         movl    %ebx, %ebx
> > >>         crc32q  (%rcx), %rbx
> > >>         addq    $8, %rcx
> > >>         incq    %rdi
> > >>         cmpq    %rdi, %rsi
> > >>         jne     .LBB1_9
> > >> # %bb.10:
> > >>         subq    %rdi, %rax
> > >>         jmp     .LBB1_11
> > >> .LBB1_7:
> > >>         movq    %r14, %rcx
> > >> .LBB1_11:
> > >>         movq    %r15, %rsi
> > >>         andq    $-8, %rsi
> > >>         cmpq    $7, %rdx
> > >>         jb      .LBB1_14
> > >> # %bb.12:
> > >>         xorl    %edx, %edx
> > >> .LBB1_13:                               # =>This Inner Loop Header: Depth=1
> > >>         movl    %ebx, %ebx
> > >>         crc32q  (%rcx,%rdx,8), %rbx
> > >>         crc32q  8(%rcx,%rdx,8), %rbx
> > >>         crc32q  16(%rcx,%rdx,8), %rbx
> > >>         crc32q  24(%rcx,%rdx,8), %rbx
> > >>         crc32q  32(%rcx,%rdx,8), %rbx
> > >>         crc32q  40(%rcx,%rdx,8), %rbx
> > >>         crc32q  48(%rcx,%rdx,8), %rbx
> > >>         crc32q  56(%rcx,%rdx,8), %rbx
> > >>         addq    $8, %rdx
> > >>         cmpq    %rdx, %rax
> > >>         jne     .LBB1_13
> > >> .LBB1_14:
> > >>         addq    %rsi, %r14
> > >> .LBB1_15:
> > >>         andq    $7, %r15
> > >>         je      .LBB1_23
> > >> # %bb.16:
> > >>         crc32b  (%r14), %ebx
> > >>         cmpl    $1, %r15d
> > >>         je      .LBB1_23
> > >> # %bb.17:
> > >>         crc32b  1(%r14), %ebx
> > >>         cmpl    $2, %r15d
> > >>         je      .LBB1_23
> > >> # %bb.18:
> > >>         crc32b  2(%r14), %ebx
> > >>         cmpl    $3, %r15d
> > >>         je      .LBB1_23
> > >> # %bb.19:
> > >>         crc32b  3(%r14), %ebx
> > >>         cmpl    $4, %r15d
> > >>         je      .LBB1_23
> > >> # %bb.20:
> > >>         crc32b  4(%r14), %ebx
> > >>         cmpl    $5, %r15d
> > >>         je      .LBB1_23
> > >> # %bb.21:
> > >>         crc32b  5(%r14), %ebx
> > >>         cmpl    $6, %r15d
> > >>         je      .LBB1_23
> > >> # %bb.22:
> > >>         crc32b  6(%r14), %ebx
> > >> .LBB1_23:
> > >>         movl    %ebx, %eax
> > >> .LBB1_24:  
> > >
> > >  
> >
> > The tail is *weird*. Wouldn't it be better to do a 4-2-1 stepdown?

Well, provided the branches aren't mispredicted it'll be limited by
the crc32b - so three clocks per byte, max 27.
The 4-2-1 stepdown needs the extra address update, but that may not cost
anything, and it is then max 9 clocks. Also a lot less I-cache.
The code logic may not matter unless the buffer is short.
I think the cpu will be executing the tail instructions while many
of the crc32 from the main loop are still queued waiting for results
from earlier instructions (especially if you get a loop that would
run in two clocks with (say) addq instead of crc32q).
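
For reference, a 4-2-1 tail in C would look something like the untested
sketch below (builtins used just to show the shape; an inline-asm version
would be the same three steps):

static u32 crc32c_tail(u32 crc, const u8 *p, size_t len)
{
        /* 0..7 leftover bytes: at most three crc32 instructions and
         * three easily predicted branches, no byte loop */
        if (len & 4) {
                crc = __builtin_ia32_crc32si(crc, *(const u32 *)p);
                p += 4;
        }
        if (len & 2) {
                crc = __builtin_ia32_crc32hi(crc, *(const u16 *)p);
                p += 2;
        }
        if (len & 1)
                crc = __builtin_ia32_crc32qi(crc, *p);
        return crc;
}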

> Definitely on the weird side! I considered hard-coding something like
> that, but thought it might be a bit convoluted, though certainly less
> convoluted than what we generate now. A simple loop is probably all
> that's needed, because it should only need to be done at most seven
> times.

The byte loop should be limited by the crc32b. So probably as fast
as that unrolled mess, although it will always have a mispredicted
branch (or two) - I suspect all loops do.

	David
Re: [PATCH v2] x86/crc32: use builtins to improve code generation
Posted by David Laight 9 months, 2 weeks ago
On Tue, 4 Mar 2025 04:32:23 +0000
David Laight <david.laight.linux@gmail.com> wrote:

....
> > For reference, GCC does much better with code gen, but only with the builtin:
> > 
> > .L39:
> >         crc32q  (%rax), %rbx    # MEM[(long unsigned int *)p_40], tmp120
> >         addq    $8, %rax        #, p
> >         cmpq    %rcx, %rax      # _37, p
> >         jne     .L39    #,  
> 
> That looks reasonable, if Clang's 8 unrolled crc32q is faster per byte
> then you either need to unroll once (no point doing any more) or use
> the loop that does negative offsets from the end.

Thinking about it while properly awake, the 1% difference isn't going to be
the difference between the above and Clang's unrolled loop.
Clang's loop will do 8 bytes every three clocks; if the above is slower
it'll be doing 8 bytes in 4 clocks (ok, you can get 3.5 - but unlikely),
which would be either 25% or 33% depending on which way you measure it.
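
(Spelling out the arithmetic: at 3 clocks per 8 bytes versus 4 clocks per
8 bytes, the slower loop takes 4/3 - 1, i.e. about 33% more clocks, or the
faster one takes 1 - 3/4 = 25% fewer, depending on which you use as the
baseline.)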

...
> I'll find the code loop I use - machine isn't powered on at the moment.

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

#include <linux/perf_event.h>
#include <sys/mman.h>
#include <sys/syscall.h>

static int pmc_id;
static void init_pmc(void)
{
        static struct perf_event_attr perf_attr = {
                .type = PERF_TYPE_HARDWARE,
                .config = PERF_COUNT_HW_CPU_CYCLES,
                .pinned = 1,
        };
        struct perf_event_mmap_page *pc;

        int perf_fd;
        perf_fd = syscall(__NR_perf_event_open, &perf_attr, 0, -1, -1, 0);
        if (perf_fd < 0) {
                fprintf(stderr, "perf_event_open failed: errno %d\n", errno);
                exit(1);
        }
        pc = mmap(NULL, 4096, PROT_READ, MAP_SHARED, perf_fd, 0);
        if (pc == MAP_FAILED) {
                fprintf(stderr, "perf_event mmap() failed: errno %d\n", errno);
                exit(1);
        }
        pmc_id = pc->index - 1;
}

static inline unsigned int rdpmc(int id)
{
        unsigned int low, high;

// You need something to force the instruction pipeline to finish.
// lfence might be enough.
#ifndef NOFENCE
        asm volatile("mfence");
#endif
        asm volatile("rdpmc" : "=a" (low), "=d" (high) : "c" (id));
#ifndef NOFENCE
        asm volatile("mfence");
#endif

        // return low bits; the counter might be 32 or 40 bits wide.
        return low;
}

The test code is then something like:
#define PASSES 10
        unsigned int ticks[PASSES];
        unsigned int tick;
        unsigned int i;

        for (i = 0; i < PASSES; i++) {
                tick = rdpmc(pmc_id);
                test_fn(buf, len);
                ticks[i] = rdpmc(pmc_id) - tick;
        }

        for (i = 0; i < PASSES; i++)
                printf(" %5d", ticks[i]);

Make sure the data is in the l1-cache (or that dominates).
The values output for passes 2-10 are likely to be the same to within
a clock or two.
I probably tried to subtract an offset for an empty test_fn().
But you can easily work out the 'clocks per loop iteration'
(which is what you are trying to measure) by measuring two separate
loop lengths.
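
Putting the pieces together, a minimal harness dropped into the same file
as the functions above could look like the sketch below (illustration
only; test_fn, sink and BUF_LEN are placeholder names, test_fn is a
stand-in 8-bytes-per-crc32q loop, and the build needs a compiler option
that enables the crc32 builtins, e.g. -msse4.2 or -mcrc32):

#define PASSES 10
#define BUF_LEN 512

static unsigned char buf[BUF_LEN];
static volatile unsigned int sink;      /* keeps test_fn from being optimised away */

static unsigned int test_fn(const unsigned char *p, size_t len)
{
        unsigned int crc = ~0u;

        for (; len >= 8; len -= 8, p += 8)
                crc = __builtin_ia32_crc32di(crc, *(const unsigned long long *)p);
        return crc;
}

int main(void)
{
        unsigned int ticks[PASSES];
        unsigned int tick;
        unsigned int i;

        init_pmc();
        sink = test_fn(buf, BUF_LEN);   /* warm the L1 d-cache */

        for (i = 0; i < PASSES; i++) {
                tick = rdpmc(pmc_id);
                sink = test_fn(buf, BUF_LEN);
                ticks[i] = rdpmc(pmc_id) - tick;
        }

        for (i = 0; i < PASSES; i++)
                printf(" %5u", ticks[i]);
        printf("\n");
        return 0;
}

The clocks-per-iteration figure then falls out of two runs with different
lengths, e.g. (ticks(512) - ticks(256)) / 32, which also cancels the fixed
call and fence overhead.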

I did find that sometimes running the program gave slow results.
But it is usually very consistent.
Needs to be run as root.
Clearly a hardware interrupt will generate a very big number.
But they don't happen.

The copy I found was used for measuring ip checksum algorithms.
Seems to output:
$ sudo ./ipcsum 
                0     0   160   160   160   160   160   160   160   160   160   160  overhead
 3637b4f0b942c3c4  682f   316    25    26    26    26    26    26    26    26    26  csum_partial
 3637b4f0b942c3c4  682f   124    79    43    25    25    25    24    26    25    24  csum_partial_1
 3637b4f0b942c3c4  682f   166    43    25    25    24    24    24    24    24    24  csum_new adc pair
 3637b4f0b942c3c4  682f   115    21    21    21    21    21    21    21    21    21  adc_dec_2
 3637b4f0b942c3c4  682f    97    34    31    23    24    24    24    24    24    23  adc_dec_4
 3637b4f0b942c3c4  682f    39    33    34    21    21    21    21    21    21    21  adc_dec_8
 3637b4f0b942c3c4  682f    81    52    49    52    49    26    25    27    25    26  adc_jcxz_2
 3637b4f0b942c3c4  682f    62    46    24    24    24    24    24    24    24    24  adc_jcxz_4
 3637b4f0b942c3c4  682f   224    40    21    21    23    23    23    23    23    23  adc_2_pair
 3637b4f0b942c3c4  682f    42    36    37    22    22    22    22    22    22    22  adc_4_pair_old
 3637b4f0b942c3c4  682f    42    37    34    41    23    23    23    23    23    23  adc_4_pair
 3637b4f0b942c3c4  682f   122    19    20    19    18    19    18    19    18    19  adcx_adox
        bef7a78a9  682f   104    51    30    30    30    30    30    30    30    30  add_c_16
        bef7a78a9  682f   143    50    50    27    27    27    27    27    27    27  add_c_32
        6ef7a78ae  682f   103    91    45    34    34    34    35    34    34    34  add_c_high

I don't think the current one is in there - IIRC it is as fast as the adcx_adox one
but more portable.


	David
Re: [PATCH v2] x86/crc32: use builtins to improve code generation
Posted by Eric Biggers 9 months, 2 weeks ago
On Tue, Mar 04, 2025 at 08:52:52PM +0000, David Laight wrote:
> On Tue, 4 Mar 2025 04:32:23 +0000
> David Laight <david.laight.linux@gmail.com> wrote:
> 
> ....
> > > For reference, GCC does much better with code gen, but only with the builtin:
> > > 
> > > .L39:
> > >         crc32q  (%rax), %rbx    # MEM[(long unsigned int *)p_40], tmp120
> > >         addq    $8, %rax        #, p
> > >         cmpq    %rcx, %rax      # _37, p
> > >         jne     .L39    #,  
> > 
> > That looks reasonable, if Clang's 8 unrolled crc32q is faster per byte
> > then you either need to unroll once (no point doing any more) or use
> > the loop that does negative offsets from the end.
> 
> Thinking about it while properly awake, the 1% difference isn't going to be
> the difference between the above and Clang's unrolled loop.
> Clang's loop will do 8 bytes every three clocks; if the above is slower
> it'll be doing 8 bytes in 4 clocks (ok, you can get 3.5 - but unlikely),
> which would be either 25% or 33% depending on which way you measure it.
> 
> ...
> > I'll find the code loop I use - machine isn't powered on at the moment.
> 
> #include <linux/perf_event.h>
> #include <sys/mman.h>
> #include <sys/syscall.h>
> 
> static int pmc_id;
> static void init_pmc(void)
> {
>         static struct perf_event_attr perf_attr = {
>                 .type = PERF_TYPE_HARDWARE,
>                 .config = PERF_COUNT_HW_CPU_CYCLES,
>                 .pinned = 1,
>         };
>         struct perf_event_mmap_page *pc;
> 
>         int perf_fd;
>         perf_fd = syscall(__NR_perf_event_open, &perf_attr, 0, -1, -1, 0);
>         if (perf_fd < 0) {
>                 fprintf(stderr, "perf_event_open failed: errno %d\n", errno);
>                 exit(1);
>         }
>         pc = mmap(NULL, 4096, PROT_READ, MAP_SHARED, perf_fd, 0);
>         if (pc == MAP_FAILED) {
>                 fprintf(stderr, "perf_event mmap() failed: errno %d\n", errno);
>                 exit(1);
>         }
>         pmc_id = pc->index - 1;
> }
> 
> static inline unsigned int rdpmc(int id)
> {
>         unsigned int low, high;
> 
> // You need something to force the instruction pipeline to finish.
> // lfence might be enough.
> #ifndef NOFENCE
>         asm volatile("mfence");
> #endif
>         asm volatile("rdpmc" : "=a" (low), "=d" (high) : "c" (id));
> #ifndef NOFENCE
>         asm volatile("mfence");
> #endif
> 
>         // return low bits; the counter might be 32 or 40 bits wide.
>         return low;
> }
> 
> The test code is then something like:
> #define PASSES 10
>         unsigned int ticks[PASSES];
>         unsigned int tick;
>         unsigned int i;
> 
>         for (i = 0; i < PASSES; i++) {
>                 tick = rdpmc(pmc_id);
>                 test_fn(buf, len);
>                 ticks[i] = rdpmc(pmc_id) - tick;
>         }
> 
>         for (i = 0; i < PASSES; i++)
>                 printf(" %5d", ticks[i]);
> 
> Make sure the data is in the l1-cache (or that dominates).
> The values output for passes 2-10 are likely to be the same to within
> a clock or two.
> I probably tried to subtract an offset for an empty test_fn().
> But you can easily work out the 'clocks per loop iteration'
> (which is what you are trying to measure) by measuring two separate
> loop lengths.
> 
> I did find that sometimes running the program gave slow results.
> But it is usually very consistent.
> Needs to be run as root.
> Clearly a hardware interrupt will generate a very big number.
> But they don't happen.
> 
> The copy I found was used for measuring ip checksum algorithms.
> Seems to output:
> $ sudo ./ipcsum 
>                 0     0   160   160   160   160   160   160   160   160   160   160  overhead
>  3637b4f0b942c3c4  682f   316    25    26    26    26    26    26    26    26    26  csum_partial
>  3637b4f0b942c3c4  682f   124    79    43    25    25    25    24    26    25    24  csum_partial_1
>  3637b4f0b942c3c4  682f   166    43    25    25    24    24    24    24    24    24  csum_new adc pair
>  3637b4f0b942c3c4  682f   115    21    21    21    21    21    21    21    21    21  adc_dec_2
>  3637b4f0b942c3c4  682f    97    34    31    23    24    24    24    24    24    23  adc_dec_4
>  3637b4f0b942c3c4  682f    39    33    34    21    21    21    21    21    21    21  adc_dec_8
>  3637b4f0b942c3c4  682f    81    52    49    52    49    26    25    27    25    26  adc_jcxz_2
>  3637b4f0b942c3c4  682f    62    46    24    24    24    24    24    24    24    24  adc_jcxz_4
>  3637b4f0b942c3c4  682f   224    40    21    21    23    23    23    23    23    23  adc_2_pair
>  3637b4f0b942c3c4  682f    42    36    37    22    22    22    22    22    22    22  adc_4_pair_old
>  3637b4f0b942c3c4  682f    42    37    34    41    23    23    23    23    23    23  adc_4_pair
>  3637b4f0b942c3c4  682f   122    19    20    19    18    19    18    19    18    19  adcx_adox
>         bef7a78a9  682f   104    51    30    30    30    30    30    30    30    30  add_c_16
>         bef7a78a9  682f   143    50    50    27    27    27    27    27    27    27  add_c_32
>         6ef7a78ae  682f   103    91    45    34    34    34    35    34    34    34  add_c_high
> 
> I don't think the current one is in there - IIRC it is as fast as the adcx_adox one
> but more portable.

I guess this thread has turned into one where everyone has to weigh in :-)

Just to summarize my thoughts on the whole thread:

- IMO we should not use the crc32 intrinsics yet, as there are too many issues
  including no stability guarantee for the builtins (or else having to figure
  out how to include immintrin.h in the kernel to get the stable functions),
  having to set the crc32 target with the correct scope, dealing with old
  compiler versions that don't support crc32, and unhelpful loop unrolling.

- https://lore.kernel.org/r/20250210210741.471725-1-ebiggers@kernel.org already
  fixed the spilling to the stack with clang.  It does result in a separate mov
  from memory instead of taking advantage of the mem operand support.  But that
  should not make much of a difference.

- crc_kunit already includes a benchmark.  I recommend using that for
  benchmarking the kernel's CRC code.  Sure, one can do a more precise analysis
  with performance counters, but IMO it's generally unnecessary.

- The 4-2-1 step-down is a good idea, and in fact crc32c-3way.S (which handles
  lengths >= 512 bytes) already does exactly that for tail handling.  I sent out
  https://lore.kernel.org/r/20250304213216.108925-1-ebiggers@kernel.org which
  adds it to the C code (which handles lengths < 512 bytes) too.

- Moving all this to assembly is still attractive, especially considering that
  lengths >= 512 bytes are already handled in assembly in crc32c-3way.S, and
  essentially the exact code we want is already in that file (it's used to
  handle anything left over from the 3-way processing).  But, I think we'll keep
  the C (with inline asm for just the crc32 instructions) version too for now.
  It's a bit more approachable, and it's nice to avoid an extra function call to
  a .S file.
Re: [PATCH v2] x86/crc32: use builtins to improve code generation
Posted by H. Peter Anvin 9 months, 2 weeks ago
On March 3, 2025 4:16:43 PM PST, Bill Wendling <morbo@google.com> wrote:
>On Mon, Mar 3, 2025 at 3:58 PM H. Peter Anvin <hpa@zytor.com> wrote:
>> On March 3, 2025 2:42:16 PM PST, David Laight <david.laight.linux@gmail.com> wrote:
>> >On Mon, 3 Mar 2025 12:27:21 -0800
>> >Bill Wendling <morbo@google.com> wrote:
>> >
>> >> On Mon, Mar 3, 2025 at 12:15 PM David Laight
>> >> <david.laight.linux@gmail.com> wrote:
>> >> > On Thu, 27 Feb 2025 15:47:03 -0800
>> >> > Bill Wendling <morbo@google.com> wrote:
>> >> >
>> >> > > For both gcc and clang, crc32 builtins generate better code than the
>> >> > > inline asm. GCC improves, removing unneeded "mov" instructions. Clang
>> >> > > does the same and unrolls the loops. GCC has no changes on i386, but
>> >> > > Clang's code generation is vastly improved, due to Clang's "rm"
>> >> > > constraint issue.
>> >> > >
>> >> > > The number of cycles improved by ~0.1% for GCC and ~1% for Clang, which
>> >> > > is expected because of the "rm" issue. However, Clang's performance is
>> >> > > better than GCC's by ~1.5%, most likely due to loop unrolling.
>> >> >
>> >> > How much does it unroll?
>> >> > How much you need depends on the latency of the crc32 instruction.
>> >> > The copy of Agner's tables I have gives it a latency of 3 on
>> >> > pretty much everything.
>> >> > If you can only do one chained crc instruction every three clocks
>> >> > it is hard to see how unrolling the loop will help.
>> >> > Intel cpu (since sandy bridge) will run a two clock loop.
>> >> > With three clocks to play with it should be easy (even for a compiler)
>> >> > to generate a loop with no extra clock stalls.
>> >> >
>> >> > Clearly if Clang decides to copy arguments to the stack an extra time
>> >> > that will kill things. But in this case you want the "m" constraint
>> >> > to directly read from the buffer (with a (reg,reg,8) addressing mode).
>> >> >
>> >> Below is what Clang generates with the builtins. From what Eric said,
>> >> this code is only run for sizes <= 512 bytes? So maybe it's not super
>> >> important to micro-optimize this. I apologize, but my ability to
>> >> measure clock loops for x86 code isn't great. (I'm sure I lack the
>> >> requisite benchmarks, etc.)
>> >
>> >Jeepers - that is trashing the I-cache.
>> >Not to mention all the conditional branches at the bottom.
>> >Consider the basic loop:
>> >1:     crc32q  (%rcx), %rbx
>> >       addq    $8, %rcx
>> >       cmp     %rcx, %rdx
>> >       jne     1b
>> >The crc32 has latency 3 so it must take at least 3 clocks.
>> >Even naively the addq can be issued in the same clock as the crc32
>> >and the cmp and jne in the following ones.
>> >Since the jne is predicted taken, the addq can be assumed to execute
>> >in the same clock as the jne.
>> >(The cmp+jne might also get merged into a single u-op)
>> >(I've done this with adc (for IP checksum), with two adc the loop takes
>> >two clocks even with the extra memory reads.)
>> >
>> >So that loop is likely to run limited by the three clock latency of crc32.
>> >Even the memory reads will happen with all the crc32 just waiting for the
>> >previous crc32 to finish.
>> >You can take an instruction out of the loop:
>> >1:     crc32q  (%rcx,%rdx), %rbx
>> >       addq    $8, %rdx
>> >       jne     1b
>> >but that may not be necessary, and (IIRC) gcc doesn't like letting you
>> >generate it.
>> >
>> >For buffers that aren't multiples of 8 bytes 'remember' that the crc of
>> >a byte depends on how far it is from the end of the buffer, and that initial
>> >zero bytes have no effect.
>> >So (provided the buffer is 8+ bytes long) read the first 8 bytes, shift
>> >right by the number of bytes needed to make the rest of the buffer a multiple
>> >of 8 bytes (the same as reading from across the start of the buffer and masking
>> >the low bytes) then treat exactly the same as a buffer that is a multiple
>> >of 8 bytes long.
>> >Don't worry about misaligned reads, you lose less than one clock per cache
>> >line (that is with adc doing a read every clock).
>> >
>For reference, GCC does much better with code gen, but only with the builtin:
>
>.L39:
>        crc32q  (%rax), %rbx    # MEM[(long unsigned int *)p_40], tmp120
>        addq    $8, %rax        #, p
>        cmpq    %rcx, %rax      # _37, p
>        jne     .L39    #,
>        leaq    (%rsi,%rdi,8), %rsi     #, p
>.L38:
>        andl    $7, %edx        #, len
>        je      .L41    #,
>        addq    %rsi, %rdx      # p, _11
>        movl    %ebx, %eax      # crc, <retval>
>        .p2align 4
>.L40:
>        crc32b  (%rsi), %eax    # MEM[(const u8 *)p_45], <retval>
>        addq    $1, %rsi        #, p
>        cmpq    %rsi, %rdx      # p, _11
>        jne     .L40    #,
>
>> >Actually measuring the performance is hard.
>> >You can't use rdtsc because the clock speed will change when the cpu gets busy.
>> >There is a 'performance counter' that is actual clocks.
>> >While you can use the library functions to set it up, you need to just read the
>> >register - the library overhead is too big.
>> >You also need the odd lfence.
>> >Having done that, and provided the buffer is in the L1 d-cache you can measure
>> >the loop time in clocks and compare against the expected value.
>> >Once you've got 3 clocks per crc32 instruction it won't get any better,
>> >which is why the 'fast' code for big buffers does crc of 3+ buffer sections
>> >in parallel.
>> >
>Thanks for the info! It'll help a lot the next time I need to delve
>deeply into performance.
>
>I tried using rdtsc and another programmatic way of measuring timing.
>Also tried making the task have high priority, restricting to one CPU,
>etc. But the numbers weren't as consistent as I wanted them to be. The
>times I reported were based on the fastest times / clocks /
>whatever from several runs for each build.
>
>> >       David
>> >
>> >>
>> >> -bw
>> >>
>> >> .LBB1_9:                                # =>This Inner Loop Header: Depth=1
>> >>         movl    %ebx, %ebx
>> >>         crc32q  (%rcx), %rbx
>> >>         addq    $8, %rcx
>> >>         incq    %rdi
>> >>         cmpq    %rdi, %rsi
>> >>         jne     .LBB1_9
>> >> # %bb.10:
>> >>         subq    %rdi, %rax
>> >>         jmp     .LBB1_11
>> >> .LBB1_7:
>> >>         movq    %r14, %rcx
>> >> .LBB1_11:
>> >>         movq    %r15, %rsi
>> >>         andq    $-8, %rsi
>> >>         cmpq    $7, %rdx
>> >>         jb      .LBB1_14
>> >> # %bb.12:
>> >>         xorl    %edx, %edx
>> >> .LBB1_13:                               # =>This Inner Loop Header: Depth=1
>> >>         movl    %ebx, %ebx
>> >>         crc32q  (%rcx,%rdx,8), %rbx
>> >>         crc32q  8(%rcx,%rdx,8), %rbx
>> >>         crc32q  16(%rcx,%rdx,8), %rbx
>> >>         crc32q  24(%rcx,%rdx,8), %rbx
>> >>         crc32q  32(%rcx,%rdx,8), %rbx
>> >>         crc32q  40(%rcx,%rdx,8), %rbx
>> >>         crc32q  48(%rcx,%rdx,8), %rbx
>> >>         crc32q  56(%rcx,%rdx,8), %rbx
>> >>         addq    $8, %rdx
>> >>         cmpq    %rdx, %rax
>> >>         jne     .LBB1_13
>> >> .LBB1_14:
>> >>         addq    %rsi, %r14
>> >> .LBB1_15:
>> >>         andq    $7, %r15
>> >>         je      .LBB1_23
>> >> # %bb.16:
>> >>         crc32b  (%r14), %ebx
>> >>         cmpl    $1, %r15d
>> >>         je      .LBB1_23
>> >> # %bb.17:
>> >>         crc32b  1(%r14), %ebx
>> >>         cmpl    $2, %r15d
>> >>         je      .LBB1_23
>> >> # %bb.18:
>> >>         crc32b  2(%r14), %ebx
>> >>         cmpl    $3, %r15d
>> >>         je      .LBB1_23
>> >> # %bb.19:
>> >>         crc32b  3(%r14), %ebx
>> >>         cmpl    $4, %r15d
>> >>         je      .LBB1_23
>> >> # %bb.20:
>> >>         crc32b  4(%r14), %ebx
>> >>         cmpl    $5, %r15d
>> >>         je      .LBB1_23
>> >> # %bb.21:
>> >>         crc32b  5(%r14), %ebx
>> >>         cmpl    $6, %r15d
>> >>         je      .LBB1_23
>> >> # %bb.22:
>> >>         crc32b  6(%r14), %ebx
>> >> .LBB1_23:
>> >>         movl    %ebx, %eax
>> >> .LBB1_24:
>> >
>> >
>>
>> The tail is *weird*. Wouldn't it be better to do a 4-2-1 stepdown?
>
>Definitely on the weird side! I considered hard-coding something like
>that, but thought it might be a bit convoluted, though certainly less
>convoluted than what we generate now. A simple loop is probably all
>that's needed, because it should only need to be done at most seven
>times.
>
>-bw
>

4-2-1 makes more sense probably (4 bytes, then 2 bytes, then 1 byte, depending on which bits are set).
Re: [PATCH v2] x86/crc32: use builtins to improve code generation
Posted by Eric Biggers 9 months, 3 weeks ago
On Thu, Feb 27, 2025 at 03:47:03PM -0800, Bill Wendling wrote:
> For both gcc and clang, crc32 builtins generate better code than the
> inline asm. GCC improves, removing unneeded "mov" instructions. Clang
> does the same and unrolls the loops. GCC has no changes on i386, but
> Clang's code generation is vastly improved, due to Clang's "rm"
> constraint issue.
> 
> The number of cycles improved by ~0.1% for GCC and ~1% for Clang, which
> is expected because of the "rm" issue. However, Clang's performance is
> better than GCC's by ~1.5%, most likely due to loop unrolling.

Also note that the patch
https://lore.kernel.org/r/20250210210741.471725-1-ebiggers@kernel.org/ (which is
already enqueued in the crc tree for 6.15) changes "rm" to "r" when the compiler
is clang, to improve clang's code generation.  The numbers you quote are against
the original version, right?

- Eric
Re: [PATCH v2] x86/crc32: use builtins to improve code generation
Posted by Bill Wendling 9 months, 3 weeks ago
On Fri, Feb 28, 2025 at 1:20 PM Eric Biggers <ebiggers@kernel.org> wrote:
>
> On Thu, Feb 27, 2025 at 03:47:03PM -0800, Bill Wendling wrote:
> > For both gcc and clang, crc32 builtins generate better code than the
> > inline asm. GCC improves, removing unneeded "mov" instructions. Clang
> > does the same and unrolls the loops. GCC has no changes on i386, but
> > Clang's code generation is vastly improved, due to Clang's "rm"
> > constraint issue.
> >
> > The number of cycles improved by ~0.1% for GCC and ~1% for Clang, which
> > is expected because of the "rm" issue. However, Clang's performance is
> > better than GCC's by ~1.5%, most likely due to loop unrolling.
>
> Also note that the patch
> https://lore.kernel.org/r/20250210210741.471725-1-ebiggers@kernel.org/ (which is
> already enqueued in the crc tree for 6.15) changes "rm" to "r" when the compiler
> is clang, to improve clang's code generation.  The numbers you quote are against
> the original version, right?
>
Yeah, they were against top-of-tree.

-bw