Both LLVM/GCC support a __builtin_parity function which is functionally
equivalent to the custom bitwise_xor_bits() one. Let's simplify the code by
relying on the built-in. No functional changes.
Signed-off-by: Nikolay Borisov <nik.borisov@suse.com>
---
Changes since v1:
* Reworded the commit message
drivers/ras/amd/atl/umc.c | 22 +++++-----------------
1 file changed, 5 insertions(+), 17 deletions(-)
diff --git a/drivers/ras/amd/atl/umc.c b/drivers/ras/amd/atl/umc.c
index 6e072b7667e9..7ff4a5a1c5da 100644
--- a/drivers/ras/amd/atl/umc.c
+++ b/drivers/ras/amd/atl/umc.c
@@ -49,18 +49,6 @@ static u8 get_coh_st_inst_id_mi300(struct atl_err *err)
 	return i;
 }
 
-/* XOR the bits in @val. */
-static u16 bitwise_xor_bits(u16 val)
-{
-	u16 tmp = 0;
-	u8 i;
-
-	for (i = 0; i < 16; i++)
-		tmp ^= (val >> i) & 0x1;
-
-	return tmp;
-}
-
 struct xor_bits {
 	bool xor_enable;
 	u16 col_xor;
@@ -250,17 +238,17 @@ static unsigned long convert_dram_to_norm_addr_mi300(unsigned long addr)
 		if (!addr_hash.bank[i].xor_enable)
 			continue;
 
-		temp = bitwise_xor_bits(col & addr_hash.bank[i].col_xor);
-		temp ^= bitwise_xor_bits(row & addr_hash.bank[i].row_xor);
+		temp = (u16)__builtin_parity(col & addr_hash.bank[i].col_xor);
+		temp ^= (u16)__builtin_parity(row & addr_hash.bank[i].row_xor);
 		bank ^= temp << i;
 	}
 
 	/* Calculate hash for PC bit. */
 	if (addr_hash.pc.xor_enable) {
-		temp = bitwise_xor_bits(col & addr_hash.pc.col_xor);
-		temp ^= bitwise_xor_bits(row & addr_hash.pc.row_xor);
+		temp = (u16)__builtin_parity(col & addr_hash.pc.col_xor);
+		temp ^= (u16)__builtin_parity(row & addr_hash.pc.row_xor);
 		/* Bits SID[1:0] act as Bank[5:4] for PC hash, so apply them here. */
-		temp ^= bitwise_xor_bits((bank | sid << NUM_BANK_BITS) & addr_hash.bank_xor);
+		temp ^= (u16)__builtin_parity((bank | sid << NUM_BANK_BITS) & addr_hash.bank_xor);
 		pc ^= temp;
 	}
 
--
2.52.0
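For reference, the claimed equivalence is easy to check outside the kernel.
Below is a standalone user-space sketch (not part of the patch) that compares
the removed helper against the builtin over the whole 16-bit range:

/* Standalone user-space check; compile with gcc or clang. */
#include <assert.h>
#include <stdint.h>

/* Same body as the helper removed by the patch. */
static uint16_t bitwise_xor_bits(uint16_t val)
{
	uint16_t tmp = 0;
	uint8_t i;

	for (i = 0; i < 16; i++)
		tmp ^= (val >> i) & 0x1;

	return tmp;
}

int main(void)
{
	uint32_t v;

	/* XOR-reduction of the bits equals the parity of the set-bit count. */
	for (v = 0; v <= 0xffff; v++)
		assert(bitwise_xor_bits(v) == (uint16_t)__builtin_parity(v));

	return 0;
}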
On Mon, 24 Nov 2025 10:40:11 +0200
Nikolay Borisov <nik.borisov@suse.com> wrote:

> Both LLVM/GCC support a __builtin_parity function which is functionally
> equivalent to the custom bitwise_xor_bits() one. Let's simplify the code by
> relying on the built-in. No functional changes.
>

While you've got this code out on the operating table:

- Change all the locals/parameters from u8/u16 to 'unsigned int'.
  It will generate better code. Using u8/u16 only makes any sense if
  you are trying to reduce the size of a structure.

- Both col_xor and row_xor are masks (for the parity code).
  So the names are wrong.
  In fact I think all the 'xor' and 'XOR' are incorrectly named.

- How often is 'xor_enable' aka 'mask_enable' set?
  If set most of the time (or the code rarely runs) then if the hardware
  register says 'don't include these values' then just set the row/col
  mask values to zero and let the rest of the code just run through.

	David
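To make the suggestions above concrete, here is a rough sketch of the shape
being described. The struct, the *_mask field names, the helper and its
parameters are illustrative only (not taken from the driver), and it assumes
that entries with hashing disabled get their masks zeroed at init time so the
loop needs no per-entry enable check:

/*
 * Illustrative only: unsigned int locals/parameters, "mask" naming, and
 * no per-entry enable test. Entries with hashing disabled are assumed to
 * have been initialized with zero masks, so their contribution is 0
 * (the parity of x & 0 is always 0).
 */
struct bank_hash_masks {
	unsigned int col_mask;
	unsigned int row_mask;
};

static unsigned int hash_bank_bits(unsigned int col, unsigned int row,
				   const struct bank_hash_masks *masks,
				   unsigned int num_banks)
{
	unsigned int bank = 0, temp, i;

	for (i = 0; i < num_banks; i++) {
		temp  = __builtin_parity(col & masks[i].col_mask);
		temp ^= __builtin_parity(row & masks[i].row_mask);
		bank ^= temp << i;
	}

	return bank;
}

The parity expression here simply follows the patch under review; the rest of
the thread below ends up preferring hweight16() & 1 instead.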
Hi Nikolay,
On Mon, Nov 24, 2025 at 10:40:11AM +0200, Nikolay Borisov wrote:
> Both LLVM/GCC support a __builtin_parity function which is functionally
> equivalent to the custom bitwise_xor_bits() one. Let's simplify the code by
> relying on the built-in. No functional changes.
IIRC in some cases, if the compiler decides not to inline
__builtin_parity(), it generates a libgcc function call like
__paritysi2(). Since the kernel currently lacks this symbol, this could
lead to a build failure at link time. Although the compiler inlines it
in most cases, I am not sure if using __builtin_parity() here is a good
idea.
Regards,
Kuan-Wei
>
> Signed-off-by: Nikolay Borisov <nik.borisov@suse.com>
> ---
>
> Changes since v1:
>
> * Reworded the commit message
>
> drivers/ras/amd/atl/umc.c | 22 +++++-----------------
> 1 file changed, 5 insertions(+), 17 deletions(-)
>
> diff --git a/drivers/ras/amd/atl/umc.c b/drivers/ras/amd/atl/umc.c
> index 6e072b7667e9..7ff4a5a1c5da 100644
> --- a/drivers/ras/amd/atl/umc.c
> +++ b/drivers/ras/amd/atl/umc.c
> @@ -49,18 +49,6 @@ static u8 get_coh_st_inst_id_mi300(struct atl_err *err)
>  	return i;
>  }
> 
> -/* XOR the bits in @val. */
> -static u16 bitwise_xor_bits(u16 val)
> -{
> -	u16 tmp = 0;
> -	u8 i;
> -
> -	for (i = 0; i < 16; i++)
> -		tmp ^= (val >> i) & 0x1;
> -
> -	return tmp;
> -}
> -
>  struct xor_bits {
>  	bool xor_enable;
>  	u16 col_xor;
> @@ -250,17 +238,17 @@ static unsigned long convert_dram_to_norm_addr_mi300(unsigned long addr)
>  		if (!addr_hash.bank[i].xor_enable)
>  			continue;
> 
> -		temp = bitwise_xor_bits(col & addr_hash.bank[i].col_xor);
> -		temp ^= bitwise_xor_bits(row & addr_hash.bank[i].row_xor);
> +		temp = (u16)__builtin_parity(col & addr_hash.bank[i].col_xor);
> +		temp ^= (u16)__builtin_parity(row & addr_hash.bank[i].row_xor);
>  		bank ^= temp << i;
>  	}
> 
>  	/* Calculate hash for PC bit. */
>  	if (addr_hash.pc.xor_enable) {
> -		temp = bitwise_xor_bits(col & addr_hash.pc.col_xor);
> -		temp ^= bitwise_xor_bits(row & addr_hash.pc.row_xor);
> +		temp = (u16)__builtin_parity(col & addr_hash.pc.col_xor);
> +		temp ^= (u16)__builtin_parity(row & addr_hash.pc.row_xor);
>  		/* Bits SID[1:0] act as Bank[5:4] for PC hash, so apply them here. */
> -		temp ^= bitwise_xor_bits((bank | sid << NUM_BANK_BITS) & addr_hash.bank_xor);
> +		temp ^= (u16)__builtin_parity((bank | sid << NUM_BANK_BITS) & addr_hash.bank_xor);
>  		pc ^= temp;
>  	}
>
> --
> 2.52.0
>
>
On Mon, Nov 24, 2025 at 04:57:51PM +0800, Kuan-Wei Chiu wrote:
> > Both LLVM/GCC support a __builtin_parity function which is functionally
> > equivalent to the custom bitwise_xor_bits() one. Let's simplify the code by
> > relying on the built-in. No functional changes.
>
> IIRC in some cases,
Which are those cases?
Do you have a trigger scenario?
--
Regards/Gruss,
Boris.
https://people.kernel.org/tglx/notes-about-netiquette
Hi Borislav,

On Mon, Nov 24, 2025 at 12:05:26PM +0100, Borislav Petkov wrote:
> On Mon, Nov 24, 2025 at 04:57:51PM +0800, Kuan-Wei Chiu wrote:
> > > Both LLVM/GCC support a __builtin_parity function which is functionally
> > > equivalent to the custom bitwise_xor_bits() one. Let's simplify the code by
> > > relying on the built-in. No functional changes.
> >
> > IIRC in some cases,
>
> Which are those cases?
>
> Do you have a trigger scenario?
>
I did a quick search, and I believe it was this kernel test robot
report [1] that reminded me of this compiler behavior.

[1]: https://lore.kernel.org/oe-kbuild-all/202501312159.l6jNRaYy-lkp@intel.com/

Regards,
Kuan-Wei
On Mon, Nov 24, 2025 at 08:03:02PM +0800, Kuan-Wei Chiu wrote:
> On Mon, Nov 24, 2025 at 12:05:26PM +0100, Borislav Petkov wrote:
> > On Mon, Nov 24, 2025 at 04:57:51PM +0800, Kuan-Wei Chiu wrote:
> > > > Both LLVM/GCC support a __builtin_parity function which is functionally
> > > > equivalent to the custom bitwise_xor_bits() one. Let's simplify the code by
> > > > relying on the built-in. No functional changes.
> > >
> > > IIRC in some cases,
> >
> > Which are those cases?
> >
> > Do you have a trigger scenario?
> >
> I did a quick search, and I believe it was this kernel test robot
> report [1] that reminded me of this compiler behavior.
>
> [1]: https://lore.kernel.org/oe-kbuild-all/202501312159.l6jNRaYy-lkp@intel.com/
Interesting, thanks for the pointer.
@Nik, just use hweight16() but pls do check what asm the compiler generates
before and after so that at least there's some palpable improvement or gcc is
smart enough to replace the unrolled XORing with something slick.
Also put in the commit message why we're not using the builtin parity thing
and quote the link above.
Thx.
--
Regards/Gruss,
Boris.
https://people.kernel.org/tglx/notes-about-netiquette
On 11/24/25 14:52, Borislav Petkov wrote:
> On Mon, Nov 24, 2025 at 08:03:02PM +0800, Kuan-Wei Chiu wrote:
>> On Mon, Nov 24, 2025 at 12:05:26PM +0100, Borislav Petkov wrote:
>>> On Mon, Nov 24, 2025 at 04:57:51PM +0800, Kuan-Wei Chiu wrote:
>>>>> Both LLVM/GCC support a __builtin_parity function which is functionally
>>>>> equivalent to the custom bitwise_xor_bits() one. Let's simplify the code by
>>>>> relying on the built-in. No functional changes.
>>>>
>>>> IIRC in some cases,
>>>
>>> Which are those cases?
>>>
>>> Do you have a trigger scenario?
>>>
>> I did a quick search, and I believe it was this kernel test robot
>> report [1] that reminded me of this compiler behavior.
>>
>> [1]: https://lore.kernel.org/oe-kbuild-all/202501312159.l6jNRaYy-lkp@intel.com/
>
> Interesting, thanks for the pointer.
>
> @Nik, just use hweight16() but pls do check what asm the compiler generates
> before and after so that at least there's some palpable improvement or gcc is
> smart enough to replace the unrolled XORing with something slick.
>
> Also put in the commit message why we're not using the builtin parity thing
> and quote the link above.
>
> Thx.
>
So the custom function results in the following asm being inlined, i.e. 4 copies in total:
# drivers/ras/amd/atl/umc.c:59: tmp ^= (val >> i) & 0x1;
xorl %ecx, %ecx # ivtmp.157
# drivers/ras/amd/atl/umc.c:55: u16 tmp = 0;
xorl %esi, %esi # tmp
# drivers/ras/amd/atl/umc.c:253: temp = bitwise_xor_bits(col & addr_hash.bank[i].col_xor);
andl %r13d, %edx # col, tmp249
# drivers/ras/amd/atl/umc.c:59: tmp ^= (val >> i) & 0x1;
movzwl %dx, %edx # tmp249, _387
.L3:
movl %edx, %eax # _387, tmp250
sarl %cl, %eax # ivtmp.157, tmp250
# drivers/ras/amd/atl/umc.c:58: for (i = 0; i < 16; i++)
addl $1, %ecx #, ivtmp.157
# drivers/ras/amd/atl/umc.c:59: tmp ^= (val >> i) & 0x1;
andl $1, %eax #, tmp251
# drivers/ras/amd/atl/umc.c:59: tmp ^= (val >> i) & 0x1;
xorl %eax, %esi # tmp251, tmp
# drivers/ras/amd/atl/umc.c:58: for (i = 0; i < 16; i++)
cmpl $16, %ecx #, ivtmp.157
jne .L3 #,
Whilst with hweight, if the CPU has POPCNT it will be 2 instructions - a popcnt and an andl:
# drivers/ras/amd/atl/umc.c:243: temp ^= hweight16(row & addr_hash.bank[i].row_xor) & 1;
andl %ebp, %edi # _321, tmp221
# ./arch/x86/include/asm/arch_hweight.h:19: asm_inline (ALTERNATIVE("call __sw_hweight32",
#APP
# 19 "./arch/x86/include/asm/arch_hweight.h" 1
# ALT: oldinstr
771:
call __sw_hweight32
772:
# ALT: padding
.skip -(((775f-774f)-(772b-771b)) > 0) * ((775f-774f)-(772b-771b)),0x90
773:
.pushsection .altinstructions,"a"
.long 771b - .
.long 774f - .
.4byte ( 4*32+23)
.byte 773b-771b
.byte 775f-774f
.popsection
.pushsection .altinstr_replacement, "ax"
# ALT: replacement
774:
popcntl %edi, %eax # tmp221, res
775:
.popsection
# 0 "" 2
# drivers/ras/amd/atl/umc.c:243: temp ^= hweight16(row & addr_hash.bank[i].row_xor) & 1;
#NO_APP
xorl %eax, %edx # res, tmp222
andl $1, %edx #, temp
So yeah, much better IMHO, unless there is some hidden latency in the popcnt...
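Spelled out in C, the idiom behind the asm comments above is hweight16()
masked down to its low bit. A minimal sketch follows; the parity16() wrapper
name is hypothetical, and the actual change open-codes the expression at each
call site:

#include <linux/bitops.h>
#include <linux/types.h>

/*
 * Parity via population count: hweight16() becomes a single POPCNT on
 * CPUs that have it (with the __sw_hweight32 fallback via ALTERNATIVE
 * seen in the asm above), and "& 1" reduces the bit count to its parity,
 * i.e. the same value the removed bitwise_xor_bits() computed.
 */
static inline u16 parity16(u16 val)
{
	return hweight16(val) & 1;
}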
On Mon, Nov 24, 2025 at 03:24:40PM +0200, Nikolay Borisov wrote:
> So yeah, much better IMHO, unless there is some hidden latency in the popcnt...
Yap, exactly. And every CPU we care about supports POPCNT. And this is soo not
perf-sensitive so let's whack that function.
Thx.
--
Regards/Gruss,
Boris.
https://people.kernel.org/tglx/notes-about-netiquette