List Info

Thread: Re: rawmemchr




Re: rawmemchr
country flaguser name
United States
2008-05-21 12:50:15
Eric Blake <ebb9 <at> byu.net> writes:

> >   Maybe you could show a few timing measurements
from realistic testcases at
> > this point in the discussion?  I haven't felt
comfortable trying to directly
> > infer from instruction counts to real-time clock
cycles any time in the past
> > two decades myself, there are so many confounding
factors these days that
> > I'm not even confident of being able to validly
reason about it any more.
> 
> Here's my test program:

I modified my test program to also check strlen numbers (in
mystrchr, replace 
the !c branch with "return (char *) str + strlen
(str);").  With strlen's 
current assembly implementation of 'repnz scasb' as the
inner loop, modern x86 
machines have HIDEOUS performance:

$ time ./foo 1000000 1 0 1000 0 1

real	0m2.859s
user	0m2.874s
sys	0m0.015s

$ time ./foo 1000000 1 1 1000 0 1

real	0m2.875s
user	0m2.874s
sys	0m0.030s

3x worse than current strchr on unaligned data, but at least
alignment didn't 
matter.

Finally, I implemented my assembly changes (renamed as
strchr1 in my testing, 
so I could still compare to the original strchr), and got
much more promising 
results:


$ time ./foo 1000000 1 0 1000 2 2

real	0m0.578s
user	0m0.593s
sys	0m0.015s

$ time ./foo 1000000 1 1 1000 2 2

real	0m0.562s
user	0m0.577s
sys	0m0.015s

All right - aligned searches are not noticeably worse than
the original 
implementation, and unaligned searches are now comparable in
time to aligned 
searches, rather than 40% slower.

$ time ./foo 1000000 1 0 1000 0 0

real	0m0.329s
user	0m0.327s
sys	0m0.015s

$ time ./foo 1000000 1 1 1000 0 0

real	0m0.328s
user	0m0.343s
sys	0m0.015s

And the special case of searching for NUL is 40% faster!

Code size difference:

original strchr: 111 bytes
my strchr: 250 bytes

So, OK to apply this patch?

2008-05-21  Eric Blake  <ebb9byu.net>

	Optimize strchr for x86.
	* libc/machine/i386/strchr.S (strchr): Pre-align so
unaligned
	searches aren't penalized.  Special-case searching for 0.


Index: libc/machine/i386/strchr.S
============================================================
=======
RCS file: /cvs/src/src/newlib/libc/machine/i386/strchr.S,v
retrieving revision 1.3
diff -u -p -r1.3 strchr.S
--- libc/machine/i386/strchr.S	20 Dec 2002 21:31:19
-0000	1.3
+++ libc/machine/i386/strchr.S	21 May 2008 17:46:22 -0000
 -1,6
+1,6 
 /*
  * ====================================================
- * Copyright (C) 1998, 2002 by Red Hat Inc. All rights
reserved.
+ * Copyright (C) 1998, 2002, 2008 by Red Hat Inc. All
rights reserved.
  *
  * Permission to use, copy, modify, and distribute this
  * software is freely granted, provided that this notice
 -9,7
+9,7 
  */
 
 	#include "i386mach.h"
-	
+
 	.global SYM (strchr)
        SOTYPE_FUNCTION(strchr)
 
 -21,14
+21,45  SYM (strchr):
 	pushl ebx
 	xorl ebx,ebx
 	movl 8(ebp),edi
-	movb 12(ebp),bl
+	addb 12(ebp),bl
+
+#ifndef __OPTIMIZE_SIZE__
+/* Special case strchr(p,0).  */
+	je L25
 
-#ifndef __OPTIMIZE_SIZE__	
-/* check if string is aligned, if not do check one byte at
a time */
+/* Do byte-wise checks until string is aligned.  */
 	test $3,edi
-	jne L9
+	je L5
+	movl edi,eax
+	movb (eax),cl
+	testb cl,cl
+	je L14
+	cmpb bl,cl
+	je L19
+	incl edi
+
+	test $3,edi
+	je L5
+	movl edi,eax
+	movb (eax),cl
+	testb cl,cl
+	je L14
+	cmpb bl,cl
+	je L19
+	incl edi
+
+	test $3,edi
+	je L5
+	movl edi,eax
+	movb (eax),cl
+	testb cl,cl
+	je L14
+	cmpb bl,cl
+	je L19
+	incl edi
 
 /* create 4 byte mask which is just the desired byte
repeated 4 times */
+L5:
 	movl ebx,ecx
 	sall $8,ebx
 	subl $4,edi
 -49,15
+80,14  L10:
 	testl $-2139062144,edx
 	jne L9
 
-	movl ebx,eax
-	xorl ecx,eax
-	leal -16843009(eax),edx
-	notl eax
-	andl eax,edx
+	xorl ebx,ecx
+	leal -16843009(ecx),edx
+	notl ecx
+	andl ecx,edx
 	testl $-2139062144,edx
 	je L10
 #endif /* not __OPTIMIZE_SIZE__ */
-		
+
 /* loop while (*s && *s++ != c) */
 L9:
 	leal -1(edi),eax
 -69,7
+99,7  L15:
 	je L14
 	cmpb bl,dl
 	jne L15
-	
+
 L14:
 /*  if (*s == c) return address otherwise return NULL */
 	cmpb bl,(eax)
 -83,3
+113,60  L19:
 	leave
 	ret
 
+#ifndef __OPTIMIZE_SIZE__
+/* Special case strchr(p,0).  */
+#if 0
+	/* Hideous performance on modern machines.  */
+L25:
+	cld
+	movl $-1,ecx
+	xor eax,eax
+	repnz
+	scasb
+	leal -1(edi),eax
+	jmp L19
+#endif
+L25:
+/* Do byte-wise checks until string is aligned.  */
+	test $3,edi
+	je L26
+	movl edi,eax
+	movb (eax),cl
+	testb cl,cl
+	je L19
+	incl edi
+
+	test $3,edi
+	je L26
+	movl edi,eax
+	movb (eax),cl
+	testb cl,cl
+	je L19
+	incl edi
+
+	test $3,edi
+	je L26
+	movl edi,eax
+	movb (eax),cl
+	testb cl,cl
+	je L19
+	incl edi
+
+L26:
+	subl $4,edi
+
+/* loop performing 4 byte mask checking for desired 0 byte
*/
+	.p2align 4,,7
+L27:
+	addl $4,edi
+	movl (edi),ecx
+	leal -16843009(ecx),edx
+	movl ecx,eax
+	notl eax
+	andl eax,edx
+	testl $-2139062144,edx
+	je L27
+
+	jmp L9
+
+#endif /* !__OPTIMIZE_SIZE__ */



[1]

about | contact  Other archives ( Real Estate discussion Medical topics )