List Info

Thread: OpenSSL: openssl/ Configure openssl/crypto/sha/ Makefile openssl...




OpenSSL: openssl/ Configure openssl/crypto/sha/ Makefile openssl...
user name
2006-04-16 14:42:55
  OpenSSL CVS Repository
  http://cvs.openssl.org/
 
____________________________________________________________
________________

  Server: cvs.openssl.org                  Name:   Andy Polyakov
  Root:   /v/openssl/cvs                   Email:  appro@openssl.org
  Module: openssl                          Date:   16-Apr-2006 16:42:55
  Branch: HEAD                             Handle: 2006041615425400

  Added files:
    openssl/crypto/sha/asm  sha1-x86_64.pl
  Modified files:
    openssl                 Configure
    openssl/crypto/sha      Makefile sha_locl.h

  Log:
    SHA-1 for x86_64.

  Summary:
    Revision    Changes     Path
    1.532       +1  -1      openssl/Configure
    1.8         +2  -0      openssl/crypto/sha/Makefile
    1.1         +239 -0     openssl/crypto/sha/asm/sha1-x86_64.pl
    1.25        +2  -1      openssl/crypto/sha/sha_locl.h
 
____________________________________________________________
________________

  patch -p0 <<' .'
  Index: openssl/Configure
 
============================================================
================
  $ cvs diff -u -r1.531 -r1.532 Configure
  --- openssl/Configure	11 Apr 2006 21:34:12 -0000	1.531
  +++ openssl/Configure	16 Apr 2006 14:42:54 -0000	1.532
  @@ -118,7 +118,7 @@
   my $x86_coff_asm="x86cpuid-cof.o:bn86-cof.o co86-cof.o mo86-cof.o:dx86-cof.o yx86-cof.o:ax86-cof.o:bx86-cof.o:mx86-cof.o:sx86-cof.o s512sse2-cof.o:cx86-cof.o:rx86-cof.o:rm86-cof.o:r586-cof.o:wp_block.o w86mmx-cof.o";
   my $x86_out_asm="x86cpuid-out.o:bn86-out.o co86-out.o mo86-out.o:dx86-out.o yx86-out.o:ax86-out.o:bx86-out.o:mx86-out.o:sx86-out.o s512sse2-out.o:cx86-out.o:rx86-out.o:rm86-out.o:r586-out.o:wp_block.o w86mmx-out.o";
   
  -my $x86_64_asm="x86_64cpuid.o:x86_64-gcc.o x86_64-mont.o::aes-x86_64.o::md5-x86_64.o:sha256-x86_64.o sha512-x86_64.o::rc4-x86_64.o:::wp-x86_64.o";
  +my $x86_64_asm="x86_64cpuid.o:x86_64-gcc.o x86_64-mont.o::aes-x86_64.o::md5-x86_64.o:sha1-x86_64.o sha256-x86_64.o sha512-x86_64.o::rc4-x86_64.o:::wp-x86_64.o";
   my $ia64_asm="ia64cpuid.o:bn-ia64.o::aes_core.o aes_cbc.o aes-ia64.o::md5-ia64.o:sha1-ia64.o sha256-ia64.o sha512-ia64.o::rc4-ia64.o:::";
   my $sparcv9_asm="sparcv9cap.o sparccpuid.o:bn-sparcv9.o sparcv9-mont.o sparcv9a-mont.o:des_enc-sparc.o fcrypt_b.o:aes_core.o aes_cbc.o aes-sparcv9.o::md5-sparcv9.o::::::";
   my $no_asm=":::::::::::";
   .
  patch -p0 <<' .'
  Index: openssl/crypto/sha/Makefile
 
============================================================
================
  $ cvs diff -u -r1.7 -r1.8 Makefile
  --- openssl/crypto/sha/Makefile	4 Feb 2006 01:45:51 -0000	1.7
  +++ openssl/crypto/sha/Makefile	16 Apr 2006 14:42:54 -0000	1.8
  @@ -64,6 +64,8 @@
   	(cd asm; $(PERL) sha512-ia64.pl ../$@ $(CFLAGS))
   sha512-ia64.s: asm/sha512-ia64.pl
   	(cd asm; $(PERL) sha512-ia64.pl ../$@ $(CFLAGS))
  +sha1-x86_64.s: asm/sha1-x86_64.pl
  +	$(PERL) asm/sha1-x86_64.pl $@
   sha256-x86_64.s: asm/sha512-x86_64.pl
   	$(PERL) asm/sha512-x86_64.pl $@
   sha512-x86_64.s: asm/sha512-x86_64.pl
   .
  patch -p0 <<' .'
  Index: openssl/crypto/sha/asm/sha1-x86_64.pl
 
============================================================
================
  $ cvs diff -u -r0 -r1.1 sha1-x86_64.pl
  --- /dev/null	2006-04-16 16:41:30 +0200
  +++ sha1-x86_64.pl	2006-04-16 16:42:55 +0200
   -0,0 +1,239 
  +#!/usr/bin/env perl
  +#
  +# ====================================================================
  +# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
  +# project. Rights for redistribution and usage in source and binary
  +# forms are granted according to the OpenSSL license.
  +# ====================================================================
  +#
  +# sha1_block procedure for x86_64.
  +#
  +# It was brought to my attention that on EM64T compiler-generated code
  +# was far behind 32-bit assembler implementation. This is unlike on
  +# Opteron where compiler-generated code was only 15% behind 32-bit
  +# assembler, which originally made it hard to motivate the effort.
  +# There was a suggestion to mechanically translate 32-bit code, but I
  +# dismissed it, reasoning that x86_64 offers enough register bank
  +# capacity to fully utilize SHA-1 parallelism. Therefore this fresh
  +# implementation. However! While 64-bit code does perform better
  +# on Opteron, I failed to beat 32-bit assembler on EM64T core. Well,
  +# x86_64 does offer larger *addressable* bank, but out-of-order core
  +# reaches for even more registers through dynamic aliasing, and EM64T
  +# core must have managed to run-time optimize even 32-bit code just as
  +# good as 64-bit one. Performance improvement is summarized in the
  +# following table:
  +#
  +#		gcc 3.4		32-bit asm	cycles/byte
  +# Opteron	+45%		+20%		6.8
  +# Xeon		+65%		+0%		9.9
  +
  +# The first command-line argument is handed to x86_64-xlate.pl; everything
  +# this script prints on STDOUT is piped through that translator.
  +$output=shift;
  +open STDOUT,"| $^X ../perlasm/x86_64-xlate.pl $output";
  +
  +$ctx="%rdi";	# 1st arg
  +$inp="%rsi";	# 2nd arg
  +$num="%rdx";	# 3rd arg
  +
  +# reassign arguments in order to produce more compact code
  +$ctx="%r8";
  +$inp="%r9";
  +$num="%r10";
  +
  +# 32-bit registers used by the round bodies: $xi carries the current
  +# message word, $t0/$t1 are scratch, the rest hold the working variables.
  +$xi="%eax";
  +$t0="%ebx";
  +$t1="%ecx";
  +$A="%edx";
  +$B="%esi";
  +$C="%edi";
  +$D="%ebp";
  +$E="%r11d";
  +$T="%r12d";
  +
  +# Register list passed to every BODY_* call; it is rotated by one
  +# position after each round instead of moving data between registers.
  +@V=($A,$B,$C,$D,$E,$T);
  +
  +# Append the entry sequence of function $func to $code: the symbol, type
  +# and alignment directives, pushes of %rbx, %rbp and %r12, copies of the
  +# three incoming arguments into $ctx/$inp/$num, a 64-byte-aligned stack
  +# frame of 8+16*4 bytes with the post-push %rsp saved at 16*4(%rsp), and
  +# loads of the five state words at 0..16($ctx) into $A..$E.
  +# The backtick spans are left in place; they are evaluated when the whole
  +# of $code is post-processed at the end of the script.
  +sub PROLOGUE {
  +my $func=shift;
  +# "\@" yields a literal '@' in the interpolating heredoc; a bare
  +# "\function" would be read as a form-feed escape followed by "unction".
  +$code.=<<___;
  +.globl	$func
  +.type	$func,\@function,3
  +.align	16
  +$func:
  +	push	%rbx
  +	push	%rbp
  +	push	%r12
  +	mov	%rsp,%rax
  +	mov	%rdi,$ctx	# reassigned argument
  +	sub	\$`8+16*4`,%rsp
  +	mov	%rsi,$inp	# reassigned argument
  +	and	\$-64,%rsp
  +	mov	%rdx,$num	# reassigned argument
  +	mov	%rax,`16*4`(%rsp)
  +
  +	mov	0($ctx),$A
  +	mov	4($ctx),$B
  +	mov	8($ctx),$C
  +	mov	12($ctx),$D
  +	mov	16($ctx),$E
  +___
  +}
  +
  +# Append the exit sequence of the function named by the argument to
  +# $code: reload %rsp from the slot at 16*4(%rsp), pop %r12, %rbp and
  +# %rbx, return, and close the symbol with a .size directive.
  +sub EPILOGUE {
  +my ($name)=@_;
  +my @tail=(
  +	"\tmov\t`16*4`(%rsp),%rsp",
  +	"\tpop\t%r12",
  +	"\tpop\t%rbp",
  +	"\tpop\t%rbx",
  +	"\tret",
  +	".size\t$name,.-$name",
  +);
  +# one instruction per line, each terminated by a newline
  +$code.=join("\n",@tail)."\n";
  +}
  +
  +# Append one SHA-1 round with K=0x5a827999 and F=((c^d)&b)^d to $code.
  +#   $i              round number
  +#   $a,$b,$c,$d,$e  registers holding the five working variables
  +#   $f              register that receives K+X[i]+e+rol(a,5)+F
  +#   $host           when passed, the bswap of input words is left out
  +# Round 0 first loads word 0 from $inp and stores it on the stack.
  +# Rounds 0..14 fetch input word $i+1 and store it at 4*($i+1)(%rsp).
  +# From round 15 on the next word is instead computed from the 16-word
  +# window on the stack (slots j, j+2, j+8, j+13 modulo 16, xored and
  +# rotated left by one). The .Lshortcut label is emitted before round 15.
  +sub BODY_00_19 {
  +my ($i,$a,$b,$c,$d,$e,$f,$host)=@_;
  +my $j=$i+1;
  +$code.=<<___ if ($i==0);
  +	mov	`4*$i`($inp),$xi	
  +	`"bswap	$xi"	if(!defined($host))`
  +	mov	$xi,`4*$i`(%rsp)
  +___
  +$code.=<<___ if ($i<15);
  +	lea	0x5a827999($xi,$e),$f
  +	mov	$c,$t0
  +	mov	`4*$j`($inp),$xi
  +	mov	$a,$e
  +	xor	$d,$t0
  +	`"bswap	$xi"	if(!defined($host))`	
  +	rol	\$5,$e
  +	and	$b,$t0
  +	mov	$xi,`4*$j`(%rsp)
  +	add	$e,$f
  +	xor	$d,$t0
  +	rol	\$30,$b
  +	add	$t0,$f
  +___
  +$code.=".Lshortcut:\n" if ($i==15);
  +$code.=<<___ if ($i>=15);
  +	lea	0x5a827999($xi,$e),$f
  +	mov	`4*($j%16)`(%rsp),$xi
  +	mov	$c,$t0
  +	mov	$a,$e
  +	xor	`4*(($j+2)%16)`(%rsp),$xi
  +	xor	$d,$t0
  +	rol	\$5,$e
  +	xor	`4*(($j+8)%16)`(%rsp),$xi
  +	and	$b,$t0
  +	add	$e,$f
  +	xor	`4*(($j+13)%16)`(%rsp),$xi
  +	xor	$d,$t0
  +	rol	\$30,$b
  +	add	$t0,$f
  +	rol	\$1,$xi
  +	mov	$xi,`4*($j%16)`(%rsp)
  +___
  +}
  +
  +# Append one SHA-1 round with F=b^c^d to $code.
  +#   $i              round number; K is 0x6ed9eba1 when $i<40 and
  +#                   0xca62c1d6 otherwise
  +#   $a,$b,$c,$d,$e  registers holding the five working variables
  +#   $f              register that receives K+X[i]+e+rol(a,5)+F
  +# For $i<79 the next message word is also computed from the 16-word
  +# window on the stack (slots j, j+2, j+8, j+13 modulo 16, xored and
  +# rotated left by one) and written back; round 79 omits that update.
  +sub BODY_20_39 {
  +my ($i,$a,$b,$c,$d,$e,$f)=@_;
  +my $j=$i+1;
  +my $K=($i<40)?0x6ed9eba1:0xca62c1d6;
  +$code.=<<___ if ($i<79);
  +	lea	$K($xi,$e),$f
  +	mov	`4*($j%16)`(%rsp),$xi
  +	mov	$c,$t0
  +	mov	$a,$e
  +	xor	`4*(($j+2)%16)`(%rsp),$xi
  +	xor	$b,$t0
  +	rol	\$5,$e
  +	xor	`4*(($j+8)%16)`(%rsp),$xi
  +	xor	$d,$t0
  +	add	$e,$f
  +	xor	`4*(($j+13)%16)`(%rsp),$xi
  +	rol	\$30,$b
  +	add	$t0,$f
  +	rol	\$1,$xi
  +	mov	$xi,`4*($j%16)`(%rsp)
  +___
  +$code.=<<___ if ($i==79);
  +	lea	$K($xi,$e),$f
  +	mov	$c,$t0
  +	mov	$a,$e
  +	xor	$b,$t0
  +	rol	\$5,$e
  +	xor	$d,$t0
  +	add	$e,$f
  +	rol	\$30,$b
  +	add	$t0,$f
  +___
  +}
  +
  +# Append one SHA-1 round with K=0x8f1bbcdc and F=(b&c)|((b|c)&d) to $code.
  +#   $i              round number
  +#   $a,$b,$c,$d,$e  registers holding the five working variables
  +#   $f              register that receives K+X[i]+e+rol(a,5)+F
  +# The next message word is computed from the 16-word window on the stack
  +# (slots j, j+2, j+8, j+13 modulo 16, xored and rotated left by one) and
  +# written back. Both scratch registers $t0 and $t1 are used.
  +sub BODY_40_59 {
  +my ($i,$a,$b,$c,$d,$e,$f)=@_;
  +my $j=$i+1;
  +$code.=<<___;
  +	lea	0x8f1bbcdc($xi,$e),$f
  +	mov	`4*($j%16)`(%rsp),$xi
  +	mov	$b,$t0
  +	mov	$b,$t1
  +	xor	`4*(($j+2)%16)`(%rsp),$xi
  +	mov	$a,$e
  +	and	$c,$t0
  +	xor	`4*(($j+8)%16)`(%rsp),$xi
  +	or	$c,$t1
  +	rol	\$5,$e
  +	xor	`4*(($j+13)%16)`(%rsp),$xi
  +	and	$d,$t1
  +	add	$e,$f
  +	rol	\$1,$xi
  +	or	$t1,$t0
  +	rol	\$30,$b
  +	mov	$xi,`4*($j%16)`(%rsp)
  +	add	$t0,$f
  +___
  +}
  +
  +$code=".text\n";
  +
  +# sha1_block_asm_data_order: 80 unrolled rounds per 64-byte block.
  +# @V is rotated right by one after every round, so each BODY_* call sees
  +# the registers in their new roles without any register-to-register moves.
  +&PROLOGUE("sha1_block_asm_data_order");
  +$code.=".align	4\n.Lloop:\n";
  +for($i=0;$i<20;$i++)	{ &BODY_00_19($i,@V); unshift(@V,pop(@V)); }
  +for(;$i<40;$i++)	{ &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
  +for(;$i<60;$i++)	{ &BODY_40_59($i,@V); unshift(@V,pop(@V)); }
  +for(;$i<80;$i++)	{ &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
  +# After 80 rotations of the six-entry list the working variables a..e sit
  +# in $E,$T,$A,$B,$C: add the saved state to them, store the result, then
  +# shuffle the values back into $A..$E for the next block.
  +$code.=<<___;
  +	add	0($ctx),$E
  +	add	4($ctx),$T
  +	add	8($ctx),$A
  +	add	12($ctx),$B
  +	add	16($ctx),$C
  +	mov	$E,0($ctx)
  +	mov	$T,4($ctx)
  +	mov	$A,8($ctx)
  +	mov	$B,12($ctx)
  +	mov	$C,16($ctx)
  +
  +	xchg	$E,$A	# mov	$E,$A
  +	xchg	$T,$B	# mov	$T,$B
  +	xchg	$E,$C	# mov	$A,$C
  +	xchg	$T,$D	# mov	$B,$D
  +			# mov	$C,$E
  +	lea	`16*4`($inp),$inp
  +	sub	\$1,$num
  +	jnz	.Lloop
  +___
  +&EPILOGUE("sha1_block_asm_data_order");
  +
  +####################################################################
  +
  +# sha1_block_asm_host_order: only rounds 0..14 are emitted here, with the
  +# extra argument that suppresses bswap; the code then jumps to .Lshortcut
  +# inside the data_order routine for the remaining rounds.
  +@V=($A,$B,$C,$D,$E,$T);
  +
  +&PROLOGUE("sha1_block_asm_host_order");
  +for($i=0;$i<15;$i++)	{ &BODY_00_19($i,@V,1); unshift(@V,pop(@V)); }
  +$code.=<<___;
  +	jmp	.Lshortcut
  +.size	sha1_block_asm_host_order,.-sha1_block_asm_host_order
  +___
  +
  +# Replace every `...` span in the generated text with the value of the
  +# enclosed Perl expression, then emit the result through the STDOUT pipe.
  +$code =~ s/\`([^\`]*)\`/eval $1/gem;
  +print $code;
  +close STDOUT;
   .
  patch -p0 <<' .'
  Index: openssl/crypto/sha/sha_locl.h
 
============================================================
================
  $ cvs diff -u -r1.24 -r1.25 sha_locl.h
  --- openssl/crypto/sha/sha_locl.h	25 Oct 2005 15:55:06
-0000	1.24
  +++ openssl/crypto/sha/sha_locl.h	16 Apr 2006 14:42:54
-0000	1.25
  @@ -115,7 +115,8 @@
   # endif
   
   # ifdef SHA1_ASM
  -#  if defined(__i386) || defined(__i386__) || defined(_M_IX86) || defined(__INTEL__)
  +#  if defined(__i386) || defined(__i386__) || defined(_M_IX86) || defined(__INTEL__) \
  +   || defined(__x86_64) || defined(__x86_64__) || defined(_M_AMD64) || defined(_M_X64)
   #   define sha1_block_host_order		sha1_block_asm_host_order
   #   define DONT_IMPLEMENT_BLOCK_HOST_ORDER
   #   define sha1_block_data_order		sha1_block_asm_data_order
   .
______________________________________________________________________
OpenSSL Project                                 http://www.openssl.org
CVS Repository Commit List                     openssl-cvs@openssl.org
Automated List Manager                           majordomo@openssl.org
[1]

about | contact  Other archives ( Real Estate discussion Medical topics )