arch/x86/include/asm/string_32.h | 18 ------- arch/x86/lib/string_32.c | 89 -------------------------------- 2 files changed, 107 deletions(-)
From: David Laight <david.laight.linux@gmail.com>
The fixed overhead of all the 'rep xxx' instructions is rather more
than one might expect.
While 'rep movs' is getting better on more recent CPUs, the same is
not true for 'rep scasb'.
On my Zen-5 it has a fixed overhead of 150 clocks and then takes 3
clocks for each byte.
I've not measured any Intel CPU, but the cost might be 'only' 40 + 2n.
Remove the asm versions of strcat(), strncat(), strlen(), memchr()
and memscan(); the generic C versions will be faster.
It is quite likely that all these functions are slower than the generic
code on pretty much all CPUs since the 486.
Signed-off-by: David Laight <david.laight.linux@gmail.com>
---
arch/x86/include/asm/string_32.h | 18 -------
arch/x86/lib/string_32.c | 89 --------------------------------
2 files changed, 107 deletions(-)
diff --git a/arch/x86/include/asm/string_32.h b/arch/x86/include/asm/string_32.h
index e9cce169bb4c..b245db5d7f3c 100644
--- a/arch/x86/include/asm/string_32.h
+++ b/arch/x86/include/asm/string_32.h
@@ -12,12 +12,6 @@ extern char *strcpy(char *dest, const char *src);
#define __HAVE_ARCH_STRNCPY
extern char *strncpy(char *dest, const char *src, size_t count);
-#define __HAVE_ARCH_STRCAT
-extern char *strcat(char *dest, const char *src);
-
-#define __HAVE_ARCH_STRNCAT
-extern char *strncat(char *dest, const char *src, size_t count);
-
#define __HAVE_ARCH_STRCMP
extern int strcmp(const char *cs, const char *ct);
@@ -27,9 +21,6 @@ extern int strncmp(const char *cs, const char *ct, size_t count);
#define __HAVE_ARCH_STRCHR
extern char *strchr(const char *s, int c);
-#define __HAVE_ARCH_STRLEN
-extern size_t strlen(const char *s);
-
static __always_inline void *__memcpy(void *to, const void *from, size_t n)
{
int d0, d1, d2;
@@ -159,9 +150,6 @@ extern int memcmp(const void *, const void *, size_t);
#define memcmp __builtin_memcmp
#endif
-#define __HAVE_ARCH_MEMCHR
-extern void *memchr(const void *cs, int c, size_t count);
-
static inline void *__memset_generic(void *s, char c, size_t count)
{
int d0, d1;
@@ -216,12 +204,6 @@ static inline void *memset32(uint32_t *s, uint32_t v, size_t n)
return s;
}
-/*
- * find the first occurrence of byte 'c', or 1 past the area if none
- */
-#define __HAVE_ARCH_MEMSCAN
-extern void *memscan(void *addr, int c, size_t size);
-
#endif /* __KERNEL__ */
#endif /* _ASM_X86_STRING_32_H */
diff --git a/arch/x86/lib/string_32.c b/arch/x86/lib/string_32.c
index f87ec24fa579..3602e808b584 100644
--- a/arch/x86/lib/string_32.c
+++ b/arch/x86/lib/string_32.c
@@ -49,46 +49,6 @@ char *strncpy(char *dest, const char *src, size_t count)
EXPORT_SYMBOL(strncpy);
#endif
-#ifdef __HAVE_ARCH_STRCAT
-char *strcat(char *dest, const char *src)
-{
- int d0, d1, d2, d3;
- asm volatile("repne scasb\n\t"
- "decl %1\n"
- "1:\tlodsb\n\t"
- "stosb\n\t"
- "testb %%al,%%al\n\t"
- "jne 1b"
- : "=&S" (d0), "=&D" (d1), "=&a" (d2), "=&c" (d3)
- : "0" (src), "1" (dest), "2" (0), "3" (0xffffffffu) : "memory");
- return dest;
-}
-EXPORT_SYMBOL(strcat);
-#endif
-
-#ifdef __HAVE_ARCH_STRNCAT
-char *strncat(char *dest, const char *src, size_t count)
-{
- int d0, d1, d2, d3;
- asm volatile("repne scasb\n\t"
- "decl %1\n\t"
- "movl %8,%3\n"
- "1:\tdecl %3\n\t"
- "js 2f\n\t"
- "lodsb\n\t"
- "stosb\n\t"
- "testb %%al,%%al\n\t"
- "jne 1b\n"
- "2:\txorl %2,%2\n\t"
- "stosb"
- : "=&S" (d0), "=&D" (d1), "=&a" (d2), "=&c" (d3)
- : "0" (src), "1" (dest), "2" (0), "3" (0xffffffffu), "g" (count)
- : "memory");
- return dest;
-}
-EXPORT_SYMBOL(strncat);
-#endif
-
#ifdef __HAVE_ARCH_STRCMP
int strcmp(const char *cs, const char *ct)
{
@@ -159,55 +119,6 @@ char *strchr(const char *s, int c)
EXPORT_SYMBOL(strchr);
#endif
-#ifdef __HAVE_ARCH_STRLEN
-size_t strlen(const char *s)
-{
- int d0;
- size_t res;
- asm volatile("repne scasb"
- : "=c" (res), "=&D" (d0)
- : "1" (s), "a" (0), "0" (0xffffffffu)
- : "memory");
- return ~res - 1;
-}
-EXPORT_SYMBOL(strlen);
-#endif
-
-#ifdef __HAVE_ARCH_MEMCHR
-void *memchr(const void *cs, int c, size_t count)
-{
- int d0;
- void *res;
- if (!count)
- return NULL;
- asm volatile("repne scasb\n\t"
- "je 1f\n\t"
- "movl $1,%0\n"
- "1:\tdecl %0"
- : "=D" (res), "=&c" (d0)
- : "a" (c), "0" (cs), "1" (count)
- : "memory");
- return res;
-}
-EXPORT_SYMBOL(memchr);
-#endif
-
-#ifdef __HAVE_ARCH_MEMSCAN
-void *memscan(void *addr, int c, size_t size)
-{
- if (!size)
- return addr;
- asm volatile("repnz scasb\n\t"
- "jnz 1f\n\t"
- "dec %%edi\n"
- "1:"
- : "=D" (addr), "=c" (size)
- : "0" (addr), "1" (size), "a" (c)
- : "memory");
- return addr;
-}
-EXPORT_SYMBOL(memscan);
-#endif
#ifdef __HAVE_ARCH_STRNLEN
size_t strnlen(const char *s, size_t count)
--
2.39.5
On 3/27/26 12:57, david.laight.linux@gmail.com wrote:
> The fixed overhead of all the 'rep xxx' instructions is rather more
> that might expect. While 'rep movs' is getting better on more recent
> CPU, the same is not true for 'rep scasb'. On my Zen-5 it has a
> fixed overhead of 150 clocks and then takes 3 clocks for each byte.
> I've not measured any Intel CPU, but the cost might be 'only' 40 +
> 2n.
One measurement on a modern 64-bit CPU isn't super convincing to me.
> Remove the asm versions of strcat() strncat() strlen() memchr()
> and memscan(), the generic C versions will be faster.
>
> It is quite likely that all these functions are slower than the generic
> code on pretty much all CPU since the 486.
This is rather handwavy for my taste.
There seem to be two valid paths here:
1. We continue the "nobody cares about 32-bit" refrain. This removes a
bunch of 32-bit-only code and complexity. If it causes a performance
regression, we do not care much.
2. Someone makes _some_ kind of effort to test this on at least *one*
32-bit-only CPU to see if it does any harm.
In other words, I'm not opposed to the patch, but the justification
doesn't really work for me as written.
On Mon, Mar 30, 2026 at 7:58 PM Dave Hansen <dave.hansen@intel.com> wrote: > > On 3/27/26 12:57, david.laight.linux@gmail.com wrote: > > The fixed overhead of all the 'rep xxx' instructions is rather more > > that might expect. While 'rep movs' is getting better on more recent > > CPU, the same is not true for 'rep scasb'. On my Zen-5 it has a > > fixed overhead of 150 clocks and then takes 3 clocks for each byte. > > I've not measured any Intel CPU, but the cost might be 'only' 40 + > > 2n. > > One measurement on a modern 64-bit CPU isn't super convincing to me. > > > Remove the asm versions of strcat() strncat() strlen() memchr() > > and memscan(), the generic C versions will be faster. > > > > It is quite likely that all these functions are slower than the generic > > code on pretty much all CPU since the 486. > > This is rather handwavy for my taste. > > There seem to be two valid paths here: > > 1. We continue the "nobody cares about 32-bit" refrain. This removes a > bunch of 32-bit-only code and complexity. If it causes a performance > regression, we do not care much. > 2. Someone makes _some_ kind of effort to test this on at least *one* > 32-bit-only CPU to see if it does any harm. > > In other words, I'm not opposed to the patch, but the justification > doesn't really work for me as written. I have Intel Quark at hand to test. But I need to know the step-by-step instructions on what to do. -- With Best Regards, Andy Shevchenko
On Mon, 30 Mar 2026 20:21:41 +0300 Andy Shevchenko <andy.shevchenko@gmail.com> wrote: > On Mon, Mar 30, 2026 at 7:58 PM Dave Hansen <dave.hansen@intel.com> wrote: > > > > On 3/27/26 12:57, david.laight.linux@gmail.com wrote: > > > The fixed overhead of all the 'rep xxx' instructions is rather more > > > that might expect. While 'rep movs' is getting better on more recent > > > CPU, the same is not true for 'rep scasb'. On my Zen-5 it has a > > > fixed overhead of 150 clocks and then takes 3 clocks for each byte. > > > I've not measured any Intel CPU, but the cost might be 'only' 40 + > > > 2n. > > > > One measurement on a modern 64-bit CPU isn't super convincing to me. > > > > > Remove the asm versions of strcat() strncat() strlen() memchr() > > > and memscan(), the generic C versions will be faster. > > > > > > It is quite likely that all these functions are slower than the generic > > > code on pretty much all CPU since the 486. > > > > This is rather handwavy for my taste. > > > > There seem to be two valid paths here: > > > > 1. We continue the "nobody cares about 32-bit" refrain. This removes a > > bunch of 32-bit-only code and complexity. If it causes a performance > > regression, we do not care much. > > 2. Someone makes _some_ kind of effort to test this on at least *one* > > 32-bit-only CPU to see if it does any harm. > > > > In other words, I'm not opposed to the patch, but the justification > > doesn't really work for me as written. > > I have Intel Quark at hand to test. But I need to know the > step-by-step instructions on what to do. > I can run my test on a few 'older' systems, but I don't have anything Intel before Sandy bridge and only an AMD 'Excavator' (or similar). I do remember (a long time ago) getting my Athlon 700 to run a copy loop as fast as 'rep movl' - but the setup time was a lot worse. So I suspect that generation of cpu didn't have a large overhead. If I've read Agner's tables he gives a 40 clock setup to P-II onwards. 
I can give you the source of the test I've been using. David
On 3/30/26 12:20, David Laight wrote:
> I have Intel Quark at hand to test. But I need to know the
> step-by-step instructions on what to do.

I'll take it if it's all that we have, but Quark is really weird. It's
probably Intel's last sold 32-bit-only CPU, but it wasn't used for
anything remotely performance sensitive, it's more like a 1995 CPU than
a 2010 CPU, and Intel probably sold like twenty of them. ;)

But, seriously, we don't need to go digging in the junk heap for
performance numbers. If nobody has one handy, it's just extra
justification for "we don't care".

But let's just say *THAT* instead of doing some kind of performance
theater where we pretend that like every cycle on CPUs from 2003 matters
on a 2026 kernel, and that we even cared enough to measure it.
On Mon, 30 Mar 2026, Dave Hansen wrote: > > I have Intel Quark at hand to test. But I need to know the > > step-by-step instructions on what to do. > > I'll take it if it's all that we have, but Quark is really weird. It's > probably Intel's last sold 32-bit-only CPU, but it wasn't used for > anything remotely performance sensitive, it's more like a 1995 CPU than > a 2010 CPU, and Intel probably sold like twenty of them. ;) > > But, seriously, we don't need to go digging in the junk heap for > performance numbers. If nobody has one handy, it's just extra > justification for "we don't care". > > But let's just say *THAT* instead of doing some kind of performance > theater where we pretend that like every cycle on CPUs from 2003 matters > on a 2026 kernel, and that we even cared enough to measure it. FWIW I can benchmark on a genuine i486 or Pentium MMX system right away, but I'm more concerned about support being dropped altogether rather than squeezing out any extra cycles from these boxes at this point. If anyone runs such equipment for performance nowadays, they must clearly be mad or have missed something. Maciej
On Tue, Mar 31, 2026 at 3:27 AM Maciej W. Rozycki <macro@orcam.me.uk> wrote: > On Mon, 30 Mar 2026, Dave Hansen wrote: > > > > I have Intel Quark at hand to test. But I need to know the > > > step-by-step instructions on what to do. > > > > I'll take it if it's all that we have, but Quark is really weird. It's > > probably Intel's last sold 32-bit-only CPU, but it wasn't used for > > anything remotely performance sensitive, it's more like a 1995 CPU than > > a 2010 CPU, and Intel probably sold like twenty of them. ;) > > > > But, seriously, we don't need to go digging in the junk heap for > > performance numbers. If nobody has one handy, it's just extra > > justification for "we don't care". > > > > But let's just say *THAT* instead of doing some kind of performance > > theater where we pretend that like every cycle on CPUs from 2003 matters > > on a 2026 kernel, and that we even cared enough to measure it. > > FWIW I can benchmark on a genuine i486 or Pentium MMX system right away, > but I'm more concerned about support being dropped altogether rather than > squeezing out any extra cycles from these boxes at this point. If anyone > runs such equipment for performance nowadays, they must clearly be mad or > have missed something. It makes sense for people who want a tiny x86 core running something as fast as they can with all the benefits from that small core. Intel Quark was designed for power and efficiency for the embedded world, having slightly better performance is not a bad idea. -- With Best Regards, Andy Shevchenko
© 2016 - 2026 Red Hat, Inc.