Eric Blake <ebb9 <at> byu.net> writes:
> > Maybe you could show a few timing measurements
from realistic testcases at
> > this point in the discussion? I haven't felt
comfortable trying to directly
> > infer from instruction counts to real-time clock
cycles any time in the past
> > two decades myself, there are so many confounding
factors these days that
> > I'm not even confident of being able to validly
reason about it any more.
>
> Here's my test program:
I modified my test program to also check strlen numbers (in
mystrchr, replace
the !c branch with "return (char *) str + strlen
(str);"). With strlen's
current assembly implementation of 'repnz scasb' as the
inner loop, modern x86
machines have HIDEOUS performance:
$ time ./foo 1000000 1 0 1000 0 1
real 0m2.859s
user 0m2.874s
sys 0m0.015s
$ time ./foo 1000000 1 1 1000 0 1
real 0m2.875s
user 0m2.874s
sys 0m0.030s
3x worse than current strchr on unaligned data, but at least
alignment didn't
matter.
Finally, I implemented my assembly changes (renamed as
strchr1 in my testing,
so I could still compare to the original strchr), and got
much more promising
results:
$ time ./foo 1000000 1 0 1000 2 2
real 0m0.578s
user 0m0.593s
sys 0m0.015s
$ time ./foo 1000000 1 1 1000 2 2
real 0m0.562s
user 0m0.577s
sys 0m0.015s
All right - aligned searches are not noticeably worse than
the original
implementation, and unaligned searches are now comparable in
time to aligned
searches, rather than 40% slower.
$ time ./foo 1000000 1 0 1000 0 0
real 0m0.329s
user 0m0.327s
sys 0m0.015s
$ time ./foo 1000000 1 1 1000 0 0
real 0m0.328s
user 0m0.343s
sys 0m0.015s
And the special case of searching for NUL is 40% faster!
Code size difference:
original strchr: 111 bytes
my strchr: 250 bytes
So, OK to apply this patch?
2008-05-21 Eric Blake <ebb9 byu.net>
Optimize strchr for x86.
* libc/machine/i386/strchr.S (strchr): Pre-align so
unaligned
searches aren't penalized. Special-case searching for 0.
Index: libc/machine/i386/strchr.S
============================================================
=======
RCS file: /cvs/src/src/newlib/libc/machine/i386/strchr.S,v
retrieving revision 1.3
diff -u -p -r1.3 strchr.S
--- libc/machine/i386/strchr.S 20 Dec 2002 21:31:19
-0000 1.3
+++ libc/machine/i386/strchr.S 21 May 2008 17:46:22 -0000
 -1,6
+1,6 
/*
* ====================================================
- * Copyright (C) 1998, 2002 by Red Hat Inc. All rights
reserved.
+ * Copyright (C) 1998, 2002, 2008 by Red Hat Inc. All
rights reserved.
*
* Permission to use, copy, modify, and distribute this
* software is freely granted, provided that this notice
 -9,7
+9,7 
*/
#include "i386mach.h"
-
+
.global SYM (strchr)
SOTYPE_FUNCTION(strchr)
 -21,14
+21,45  SYM (strchr):
pushl ebx
xorl ebx,ebx
movl 8(ebp),edi
- movb 12(ebp),bl
+ addb 12(ebp),bl
+
+#ifndef __OPTIMIZE_SIZE__
+/* Special case strchr(p,0). */
+ je L25
-#ifndef __OPTIMIZE_SIZE__
-/* check if string is aligned, if not do check one byte at
a time */
+/* Do byte-wise checks until string is aligned. */
test $3,edi
- jne L9
+ je L5
+ movl edi,eax
+ movb (eax),cl
+ testb cl,cl
+ je L14
+ cmpb bl,cl
+ je L19
+ incl edi
+
+ test $3,edi
+ je L5
+ movl edi,eax
+ movb (eax),cl
+ testb cl,cl
+ je L14
+ cmpb bl,cl
+ je L19
+ incl edi
+
+ test $3,edi
+ je L5
+ movl edi,eax
+ movb (eax),cl
+ testb cl,cl
+ je L14
+ cmpb bl,cl
+ je L19
+ incl edi
/* create 4 byte mask which is just the desired byte
repeated 4 times */
+L5:
movl ebx,ecx
sall $8,ebx
subl $4,edi
 -49,15
+80,14  L10:
testl $-2139062144,edx
jne L9
- movl ebx,eax
- xorl ecx,eax
- leal -16843009(eax),edx
- notl eax
- andl eax,edx
+ xorl ebx,ecx
+ leal -16843009(ecx),edx
+ notl ecx
+ andl ecx,edx
testl $-2139062144,edx
je L10
#endif /* not __OPTIMIZE_SIZE__ */
-
+
/* loop while (*s && *s++ != c) */
L9:
leal -1(edi),eax
 -69,7
+99,7  L15:
je L14
cmpb bl,dl
jne L15
-
+
L14:
/* if (*s == c) return address otherwise return NULL */
cmpb bl,(eax)
 -83,3
+113,60  L19:
leave
ret
+#ifndef __OPTIMIZE_SIZE__
+/* Special case strchr(p,0). */
+#if 0
+ /* Hideous performance on modern machines. */
+L25:
+ cld
+ movl $-1,ecx
+ xor eax,eax
+ repnz
+ scasb
+ leal -1(edi),eax
+ jmp L19
+#endif
+L25:
+/* Do byte-wise checks until string is aligned. */
+ test $3,edi
+ je L26
+ movl edi,eax
+ movb (eax),cl
+ testb cl,cl
+ je L19
+ incl edi
+
+ test $3,edi
+ je L26
+ movl edi,eax
+ movb (eax),cl
+ testb cl,cl
+ je L19
+ incl edi
+
+ test $3,edi
+ je L26
+ movl edi,eax
+ movb (eax),cl
+ testb cl,cl
+ je L19
+ incl edi
+
+L26:
+ subl $4,edi
+
+/* loop performing 4 byte mask checking for desired 0 byte
*/
+ .p2align 4,,7
+L27:
+ addl $4,edi
+ movl (edi),ecx
+ leal -16843009(ecx),edx
+ movl ecx,eax
+ notl eax
+ andl eax,edx
+ testl $-2139062144,edx
+ je L27
+
+ jmp L9
+
+#endif /* !__OPTIMIZE_SIZE__ */
|