[PATCH] ext4: improve str2hashbuf by processing 4-byte chunks

Guan-Chun Wu posted 1 patch 2 months, 3 weeks ago
fs/ext4/hash.c | 48 ++++++++++++++++++++++++++++++++----------------
1 file changed, 32 insertions(+), 16 deletions(-)
[PATCH] ext4: improve str2hashbuf by processing 4-byte chunks
Posted by Guan-Chun Wu 2 months, 3 weeks ago
The original byte-by-byte implementation with modulo checks is less
efficient. Refactor str2hashbuf_unsigned() and str2hashbuf_signed()
to process input in explicit 4-byte chunks instead of using a
modulus-based loop to emit words byte by byte.

This change removes per-byte modulo checks and reduces loop iterations,
improving efficiency.

Performance test (x86_64, Intel Core i7-10700 @ 2.90GHz, average over 10000
runs, using kernel module for testing):

    len | orig_s | new_s | orig_u | new_u
    ----+--------+-------+--------+-------
      1 |   70   |   71  |   63   |   63
      8 |   68   |   64  |   64   |   62
     32 |   75   |   70  |   75   |   63
     64 |   96   |   71  |  100   |   68
    255 |  192   |  108  |  187   |   84

Signed-off-by: Guan-Chun Wu <409411716@gms.tku.edu.tw>
---
 fs/ext4/hash.c | 48 ++++++++++++++++++++++++++++++++----------------
 1 file changed, 32 insertions(+), 16 deletions(-)

diff --git a/fs/ext4/hash.c b/fs/ext4/hash.c
index 33cd5b6b02d5..75105828e8b4 100644
--- a/fs/ext4/hash.c
+++ b/fs/ext4/hash.c
@@ -141,21 +141,29 @@ static void str2hashbuf_signed(const char *msg, int len, __u32 *buf, int num)
 	pad = (__u32)len | ((__u32)len << 8);
 	pad |= pad << 16;
 
-	val = pad;
 	if (len > num*4)
 		len = num * 4;
-	for (i = 0; i < len; i++) {
-		val = ((int) scp[i]) + (val << 8);
-		if ((i % 4) == 3) {
-			*buf++ = val;
-			val = pad;
-			num--;
-		}
+
+	while (len >= 4) {
+		val = ((int)scp[0] << 24) + ((int)scp[1] << 16) +
+				((int)scp[2] << 8) + (int)scp[3];
+		*buf++ = val;
+		scp += 4;
+		len -= 4;
+		num--;
 	}
+
+	val = pad;
+
+	for (i = 0; i < len; i++)
+		val = (int)scp[i] + (val << 8);
+
 	if (--num >= 0)
 		*buf++ = val;
+
 	while (--num >= 0)
 		*buf++ = pad;
+
 }
 
 static void str2hashbuf_unsigned(const char *msg, int len, __u32 *buf, int num)
@@ -167,21 +175,29 @@ static void str2hashbuf_unsigned(const char *msg, int len, __u32 *buf, int num)
 	pad = (__u32)len | ((__u32)len << 8);
 	pad |= pad << 16;
 
-	val = pad;
 	if (len > num*4)
 		len = num * 4;
-	for (i = 0; i < len; i++) {
-		val = ((int) ucp[i]) + (val << 8);
-		if ((i % 4) == 3) {
-			*buf++ = val;
-			val = pad;
-			num--;
-		}
+
+	while (len >= 4) {
+		val = ((int)ucp[0] << 24) | ((int)ucp[1] << 16) |
+				((int)ucp[2] << 8) | (int)ucp[3];
+		*buf++ = val;
+		ucp += 4;
+		len -= 4;
+		num--;
 	}
+
+	val = pad;
+
+	for (i = 0; i < len; i++)
+		val = (int)ucp[i] + (val << 8);
+
 	if (--num >= 0)
 		*buf++ = val;
+
 	while (--num >= 0)
 		*buf++ = pad;
+
 }
 
 /*
-- 
2.34.1
Re: [PATCH] ext4: improve str2hashbuf by processing 4-byte chunks
Posted by David Laight 2 months, 3 weeks ago
On Sun, 16 Nov 2025 21:01:05 +0800
Guan-Chun Wu <409411716@gms.tku.edu.tw> wrote:

> The original byte-by-byte implementation with modulo checks is less
> efficient. Refactor str2hashbuf_unsigned() and str2hashbuf_signed()
> to process input in explicit 4-byte chunks instead of using a
> modulus-based loop to emit words byte by byte.

There are much bigger gains to be made - the current code is horrid.
Not least due to the costs of the indirect calls.
It is better to use conditionals than indirect calls. 


> 
> This change removes per-byte modulo checks and reduces loop iterations,
> improving efficiency.
> 
> Performance test (x86_64, Intel Core i7-10700 @ 2.90GHz, average over 10000
> runs, using kernel module for testing):
> 
>     len | orig_s | new_s | orig_u | new_u
>     ----+--------+-------+--------+-------
>       1 |   70   |   71  |   63   |   63
>       8 |   68   |   64  |   64   |   62
>      32 |   75   |   70  |   75   |   63
>      64 |   96   |   71  |  100   |   68
>     255 |  192   |  108  |  187   |   84
> 
> Signed-off-by: Guan-Chun Wu <409411716@gms.tku.edu.tw>
> ---
>  fs/ext4/hash.c | 48 ++++++++++++++++++++++++++++++++----------------
>  1 file changed, 32 insertions(+), 16 deletions(-)
> 
> diff --git a/fs/ext4/hash.c b/fs/ext4/hash.c
> index 33cd5b6b02d5..75105828e8b4 100644
> --- a/fs/ext4/hash.c
> +++ b/fs/ext4/hash.c
> @@ -141,21 +141,29 @@ static void str2hashbuf_signed(const char *msg, int len, __u32 *buf, int num)
>  	pad = (__u32)len | ((__u32)len << 8);
>  	pad |= pad << 16;
>  
> -	val = pad;
>  	if (len > num*4)
>  		len = num * 4;
> -	for (i = 0; i < len; i++) {
> -		val = ((int) scp[i]) + (val << 8);
> -		if ((i % 4) == 3) {
> -			*buf++ = val;
> -			val = pad;
> -			num--;
> -		}
> +
> +	while (len >= 4) {
> +		val = ((int)scp[0] << 24) + ((int)scp[1] << 16) +
> +				((int)scp[2] << 8) + (int)scp[3];

The (int) casts are unnecessary (throughout), 'char' is always promoted to
'signed int' before any arithmetic.

> +		*buf++ = val;
> +		scp += 4;
> +		len -= 4;
> +		num--;
>  	}
> +
> +	val = pad;
> +
> +	for (i = 0; i < len; i++)
> +		val = (int)scp[i] + (val << 8);
> +
>  	if (--num >= 0)
>  		*buf++ = val;
> +
>  	while (--num >= 0)
>  		*buf++ = pad;
> +
>  }
>  
>  static void str2hashbuf_unsigned(const char *msg, int len, __u32 *buf, int num)
> @@ -167,21 +175,29 @@ static void str2hashbuf_unsigned(const char *msg, int len, __u32 *buf, int num)
>  	pad = (__u32)len | ((__u32)len << 8);
>  	pad |= pad << 16;
>  
> -	val = pad;
>  	if (len > num*4)
>  		len = num * 4;
> -	for (i = 0; i < len; i++) {
> -		val = ((int) ucp[i]) + (val << 8);
> -		if ((i % 4) == 3) {
> -			*buf++ = val;
> -			val = pad;
> -			num--;
> -		}
> +
> +	while (len >= 4) {
> +		val = ((int)ucp[0] << 24) | ((int)ucp[1] << 16) |
> +				((int)ucp[2] << 8) | (int)ucp[3];

Isn't that get_misaligned_be32() ?

	David 

> +		*buf++ = val;
> +		ucp += 4;
> +		len -= 4;
> +		num--;
>  	}
> +
> +	val = pad;
> +
> +	for (i = 0; i < len; i++)
> +		val = (int)ucp[i] + (val << 8);
> +
>  	if (--num >= 0)
>  		*buf++ = val;
> +
>  	while (--num >= 0)
>  		*buf++ = pad;
> +
>  }
>  
>  /*
Re: [PATCH] ext4: improve str2hashbuf by processing 4-byte chunks
Posted by Theodore Tso 2 months, 2 weeks ago
On Sun, Nov 16, 2025 at 07:35:13PM +0000, David Laight wrote:
> 
> The (int) casts are unnecessary (throughout), 'char' is always promoted to
> 'signed int' before any arithmetic.

nit: in this case the casts aren't necessary, but your comment is not
correct in general, so I just wanted to make sure it's corrected in
case someone later looks at the mail archive.

"char" is not always signed.  It can be signed or unsigned; the C
specification allows either.  In this particular case, scp is a
"signed char", not "char".

Secondly, it's not that a promotion happens before "any" arithmetic.
If we add two 8-bit values together, promotion doesn't happen.  In
this case, we are adding a signed char to an int, so the promotion
will happen.

Cheers,

							- Ted
Re: [PATCH] ext4: improve str2hashbuf by processing 4-byte chunks
Posted by Andreas Schwab 2 months, 2 weeks ago
On Nov 20 2025, Theodore Tso wrote:

> Secondly, it's not that a promotion happens before "any" arithmetic.
> If we add two 8-bit values together, promotion doesn't happen.

That is not true.  Integer promotion is applied individually to each
operand independent of context.

-- 
Andreas Schwab, schwab@linux-m68k.org
GPG Key fingerprint = 7578 EB47 D4E5 4D69 2510  2552 DF73 E780 A9DA AEC1
"And now for something completely different."
Re: [PATCH] ext4: improve str2hashbuf by processing 4-byte chunks
Posted by David Laight 2 months, 2 weeks ago
On Fri, 21 Nov 2025 11:55:26 +0100
Andreas Schwab <schwab@linux-m68k.org> wrote:

> On Nov 20 2025, Theodore Tso wrote:
> 
> > Secondly, it's not that a promotion happens before "any" arithmetic.
> > If we add two 8-bit values together, promotion doesn't happen.  
> 
> That is not true.  Integer promotion is applied individually to each
> operand independent of context.

True.

You can think of the promotion happening every time a 'variable' is
read into a cpu register for any operation (including an assignment).

Similarly every write to a variable discards the high bits.

	David
Re: [PATCH] ext4: improve str2hashbuf by processing 4-byte chunks
Posted by Geert Uytterhoeven 2 months, 2 weeks ago
On Thu, 20 Nov 2025 at 17:11, Theodore Tso <tytso@mit.edu> wrote:
> "char" is not always signed.  It can be signed or unsigned; the C
> specification allows either.  In this particular case, scp is a
> "signed char", not "char".

This is Linux, so commit 3bc753c06dd02a35 ("kbuild: treat char as
always unsigned") in v6.2 and later applies to plain "char".

Gr{oetje,eeting}s,

                        Geert

-- 
Geert Uytterhoeven -- There's lots of Linux beyond ia32 -- geert@linux-m68k.org

In personal conversations with technical people, I call myself a hacker. But
when I'm talking to journalists I just say "programmer" or something like that.
                                -- Linus Torvalds
Re: [PATCH] ext4: improve str2hashbuf by processing 4-byte chunks
Posted by Kuan-Wei Chiu 2 months, 2 weeks ago
Hi Ted,

On Thu, Nov 20, 2025 at 10:58:16AM -0500, Theodore Tso wrote:
> On Sun, Nov 16, 2025 at 07:35:13PM +0000, David Laight wrote:
> > 
> > The (int) casts are unnecessary (throughout), 'char' is always promoted to
> > 'signed int' before any arithmetic.
> 
> nit: in this case the casts aren't necessary, but your comment is not
> correct in general, so I just wanted to make sure it's corrected in
> case someone later looks at the mail archive.
> 
> "char" is not always signed.  It can be signed or unsigned; the C
> specification allows either.  In this particular case, scp is a
> "signed char", not "char".
> 
> Secondly, it's not that a promotion happens before "any" arithmetic.
> If we add two 8-bit values together, promotion doesn't happen.  In
> this case, we are adding a signed char to an int, so the promotion
> will happen.
> 
I believe David was referring to the C11 spec 6.3.1.1:

If an int can represent all values of the original type (as restricted
by the width, for a bit-field), the value is converted to an int;
otherwise, it is converted to an unsigned int. These are called the
integer promotions. All other types are unchanged by the integer
promotions.

The spec explicitly mentions char + char in 5.1.2.3 example:

EXAMPLE 2 In executing the fragment
char c1, c2;
/* ... */
c1 = c1 + c2;
the ‘‘integer promotions’’ require that the abstract machine promote
the value of each variable to int size and then add the two ints and
truncate the sum. Provided the addition of two chars can be done
without overflow, or with overflow wrapping silently to produce the
correct result, the actual execution need only produce the same result,
possibly omitting the promotions.

So IIUC conceptually the promotion happens, even if the compiler
optimizes it out in the actual execution.

Link: https://www.open-std.org/jtc1/sc22/wg14/www/docs/n1570.pdf

Regards,
Kuan-Wei
Re: [PATCH] ext4: improve str2hashbuf by processing 4-byte chunks
Posted by David Laight 2 months, 2 weeks ago
On Fri, 21 Nov 2025 00:58:23 +0800
Kuan-Wei Chiu <visitorckw@gmail.com> wrote:

> Hi Ted,
> 
> On Thu, Nov 20, 2025 at 10:58:16AM -0500, Theodore Tso wrote:
> > On Sun, Nov 16, 2025 at 07:35:13PM +0000, David Laight wrote:  
> > > 
> > > The (int) casts are unnecessary (throughout), 'char' is always promoted to
> > > 'signed int' before any arithmetic.  
> > 
> > nit: in this case the casts aren't necessary, but your comment is not
> > correct in general, so I just wanted to make sure it's corrected in
> > case someone later looks at the mail archive.
> > 
> > "char" is not always signed.  It can be signed or unsigned; the C
> > specification allows either.  In this particular case, scp is a
> > "signed char", not "char".

It doesn't matter - as pointed out below.
Both 'signed char' and 'unsigned char' are promoted to 'signed int'
before ANY operation.
Well unless sizeof(char) == sizeof(int) when 'unsigned char' is
promoted to 'unsigned int' - which is technically valid and was
true for the C compiler for an old DSP (everything was 32bits).

This is one difference between K&R C and ANSI C - K&R promoted
'unsigned char' to 'unsigned int'.
So there was always the chance that compiling in ANSI mode would
break working code.

> > 
> > Secondly, it's not that a promotion happens before "any" arithmetic.
> > If we add two 8-bit values together, promotion doesn't happen.  In
> > this case, we are adding a signed char to an int, so the promotion
> > will happen.
> >   
> I believe David was referring to the C11 spec 6.3.1.1:
> 
> If an int can represent all values of the original type (as restricted
> by the width, for a bit-field), the value is converted to an int;
> otherwise, it is converted to an unsigned int. These are called the
> integer promotions. All other types are unchanged by the integer
> promotions.
> 
> The spec explicitly mentions char + char in 5.1.2.3 example:
> 
> EXAMPLE 2 In executing the fragment
> char c1, c2;
> /* ... */
> c1 = c1 + c2;
> the ‘‘integer promotions’’ require that the abstract machine promote
> the value of each variable to int size and then add the two ints and
> truncate the sum. Provided the addition of two chars can be done
> without overflow, or with overflow wrapping silently to produce the
> correct result, the actual execution need only produce the same result,
> possibly omitting the promotions.

So with:
	char c1, c2;
	int i1, i2, i3;
	...
	i1 = c1 + c2;
	i2 = (int)c1 + (int)c2;
	i3 = (unsigned int)c1 + (unsigned int)c2;
the values of i1, i2 and i3 are all the same (on a 2s compliment cpu for i3)
regardless of whether char is signed or unsigned (they do depend on
the signedness of char).

> 
> So IIUC conceptually the promotion happens, even if the compiler
> optimizes it out in the actual execution.

Any it is pretty much only x86 and m68k that have instructions for
byte arithmetic.
So for everything else if you assign the result of an arithmetic
operation to a char/short local variable (which is hopefully in
a register rather than on stack) the compiler has to add extra
instructions to mask the value back to 8 (or 16) bits and likely
keep sign extending it as well.

People also forget that the type of 'cond ? c1 : c2' is also 'int'.

Part of it is historic, the pdp11 is a 16bit cpu with byte-addressable
memory and sign-extending byte memory reads (which is probably why char
defaults to signed).

	David


> 
> Link: https://www.open-std.org/jtc1/sc22/wg14/www/docs/n1570.pdf
> 
> Regards,
> Kuan-Wei
Re: [PATCH] ext4: improve str2hashbuf by processing 4-byte chunks
Posted by Guan-Chun Wu 2 months, 3 weeks ago
Hi David,

On Sun, Nov 16, 2025 at 07:35:13PM +0000, David Laight wrote:
> On Sun, 16 Nov 2025 21:01:05 +0800
> Guan-Chun Wu <409411716@gms.tku.edu.tw> wrote:
> 
> > The original byte-by-byte implementation with modulo checks is less
> > efficient. Refactor str2hashbuf_unsigned() and str2hashbuf_signed()
> > to process input in explicit 4-byte chunks instead of using a
> > modulus-based loop to emit words byte by byte.
> 
> There are much bigger gains to be made - the current code is horrid.
> Not least due to the costs of the indirect calls.
> It is better to use conditionals than indirect calls. 
>

Thanks for the feedback. I'll remove the redundant casts, and for the
unsigned version I'll switch to using get_unaligned_be32() to avoid
duplicating the implementation. If this approach looks reasonable, I
can send a v2 that replaces the indirect calls with conditionals.

Best regards,
Guan-Chun

> 
> > 
> > This change removes per-byte modulo checks and reduces loop iterations,
> > improving efficiency.
> > 
> > Performance test (x86_64, Intel Core i7-10700 @ 2.90GHz, average over 10000
> > runs, using kernel module for testing):
> > 
> >     len | orig_s | new_s | orig_u | new_u
> >     ----+--------+-------+--------+-------
> >       1 |   70   |   71  |   63   |   63
> >       8 |   68   |   64  |   64   |   62
> >      32 |   75   |   70  |   75   |   63
> >      64 |   96   |   71  |  100   |   68
> >     255 |  192   |  108  |  187   |   84
> > 
> > Signed-off-by: Guan-Chun Wu <409411716@gms.tku.edu.tw>
> > ---
> >  fs/ext4/hash.c | 48 ++++++++++++++++++++++++++++++++----------------
> >  1 file changed, 32 insertions(+), 16 deletions(-)
> > 
> > diff --git a/fs/ext4/hash.c b/fs/ext4/hash.c
> > index 33cd5b6b02d5..75105828e8b4 100644
> > --- a/fs/ext4/hash.c
> > +++ b/fs/ext4/hash.c
> > @@ -141,21 +141,29 @@ static void str2hashbuf_signed(const char *msg, int len, __u32 *buf, int num)
> >  	pad = (__u32)len | ((__u32)len << 8);
> >  	pad |= pad << 16;
> >  
> > -	val = pad;
> >  	if (len > num*4)
> >  		len = num * 4;
> > -	for (i = 0; i < len; i++) {
> > -		val = ((int) scp[i]) + (val << 8);
> > -		if ((i % 4) == 3) {
> > -			*buf++ = val;
> > -			val = pad;
> > -			num--;
> > -		}
> > +
> > +	while (len >= 4) {
> > +		val = ((int)scp[0] << 24) + ((int)scp[1] << 16) +
> > +				((int)scp[2] << 8) + (int)scp[3];
> 
> The (int) casts are unnecessary (throughout), 'char' is always promoted to
> 'signed int' before any arithmetic.
> 
> > +		*buf++ = val;
> > +		scp += 4;
> > +		len -= 4;
> > +		num--;
> >  	}
> > +
> > +	val = pad;
> > +
> > +	for (i = 0; i < len; i++)
> > +		val = (int)scp[i] + (val << 8);
> > +
> >  	if (--num >= 0)
> >  		*buf++ = val;
> > +
> >  	while (--num >= 0)
> >  		*buf++ = pad;
> > +
> >  }
> >  
> >  static void str2hashbuf_unsigned(const char *msg, int len, __u32 *buf, int num)
> > @@ -167,21 +175,29 @@ static void str2hashbuf_unsigned(const char *msg, int len, __u32 *buf, int num)
> >  	pad = (__u32)len | ((__u32)len << 8);
> >  	pad |= pad << 16;
> >  
> > -	val = pad;
> >  	if (len > num*4)
> >  		len = num * 4;
> > -	for (i = 0; i < len; i++) {
> > -		val = ((int) ucp[i]) + (val << 8);
> > -		if ((i % 4) == 3) {
> > -			*buf++ = val;
> > -			val = pad;
> > -			num--;
> > -		}
> > +
> > +	while (len >= 4) {
> > +		val = ((int)ucp[0] << 24) | ((int)ucp[1] << 16) |
> > +				((int)ucp[2] << 8) | (int)ucp[3];
> 
> Isn't that get_misaligned_be32() ?
> 
> 	David 
>
> > +		*buf++ = val;
> > +		ucp += 4;
> > +		len -= 4;
> > +		num--;
> >  	}
> > +
> > +	val = pad;
> > +
> > +	for (i = 0; i < len; i++)
> > +		val = (int)ucp[i] + (val << 8);
> > +
> >  	if (--num >= 0)
> >  		*buf++ = val;
> > +
> >  	while (--num >= 0)
> >  		*buf++ = pad;
> > +
> >  }
> >  
> >  /*
>