List Info

Thread: OpenSSL: openssl/crypto/bn/asm/ x86-mont.pl




OpenSSL: openssl/crypto/bn/asm/ x86-mont.pl
user name
2006-11-27 14:59:36
  OpenSSL CVS Repository
  http://cvs.openssl.org/
 
____________________________________________________________
________________

  Server: cvs.openssl.org                  Name:   Andy
Polyakov
  Root:   /v/openssl/cvs                   Email:  approopenssl.org
  Module: openssl                          Date:  
27-Nov-2006 15:59:35
  Branch: HEAD                             Handle:
2006112714593500

  Modified files:
    openssl/crypto/bn/asm   x86-mont.pl

  Log:
    Non-SSE2 path to bn_mul_mont. But it's disabled, because
it currently
    doesn't give performance improvement.

  Summary:
    Revision    Changes     Path
    1.6         +187 -41   
openssl/crypto/bn/asm/x86-mont.pl
 
____________________________________________________________
________________

  patch -p0 <<' .'
  Index: openssl/crypto/bn/asm/x86-mont.pl
 
============================================================
================
  $ cvs diff -u -r1.5 -r1.6 x86-mont.pl
  --- openssl/crypto/bn/asm/x86-mont.pl	22 Oct 2005 17:57:18
-0000	1.5
  +++ openssl/crypto/bn/asm/x86-mont.pl	27 Nov 2006 14:59:35
-0000	1.6
  @@ -2,8 +2,9 @@
   
   #
============================================================
========
   # Written by Andy Polyakov <approfy.chalmers.se> for the OpenSSL
  -# project. Rights for redistribution and usage in source
and binary
  -# forms are granted according to the OpenSSL license.
  +# project. The module is, however, dual licensed under
OpenSSL and
  +# CRYPTOGAMS licenses depending on where you obtain it.
For further
  +# details see http://www.openssl.org/~appro/cryptogams/.
   #
============================================================
========
   
   # October 2005
  @@ -31,12 +32,12 @@
   
  
&function_begin("bn_mul_mont",$sse2?"EXTRN\t_OPENSSL_ia32cap_P:DWORD":"");
   
  -$i="ebx";
  +$i="edx";
   $j="ecx";
   $ap="esi";
   $rp="edi";	$bp="edi";		# overlapping
variables!!!
  -$np="edx";
  -$num="ebp";
  +$np="ebp";
  +$num="ebx";
   
   $_rp=&DWP(4*0,"esp");			# stack top layout
   $_ap=&DWP(4*1,"esp");
  @@ -45,21 +46,13 @@
   $_n0=&DWP(4*4,"esp");
   $_num=&DWP(4*5,"esp");
   $_sp=&DWP(4*6,"esp");
  +$_bpend=&DWP(4*7,"esp");
   $frame=32;				# size of above frame rounded up to 16n
   
  -$acc0="mm0";				# mmx register bank layout
  -$acc1="mm1";
  -$car0="mm2";
  -$car1="mm3";
  -$mul0="mm4";
  -$mul1="mm5";
  -$temp="mm6";
  -$mask="mm7";
  -
  -if($sse2) {
 
-	&picmeup("eax","OPENSSL_ia32cap_P"
);
  -	&bt	(&DWP(0,"eax"),26);
  -	&jnc	(&label("non_sse2"));
  +	&xor	("eax","eax");
  +	&mov	("edi",&wparam(5));	# int num
  +	&cmp	("edi",3);
  +	&jb	(&label("just_leave"));
   
   	################################# load argument block...
   	&mov	("eax",&wparam(0));	# BN_ULONG
*rp
  @@ -67,16 +60,14 @@
   	&mov	("ecx",&wparam(2));	# const
BN_ULONG *bp
   	&mov	("edx",&wparam(3));	# const
BN_ULONG *np
   	&mov	("esi",&wparam(4));	# const
BN_ULONG *n0
  -	&mov	($num,&wparam(5));	# int num
  +	#&mov	("edi",&wparam(5));	# int num
   
  -	&mov	("edi","esp");		# saved
stack pointer!
  -	&add	($num,1);		# extra word on top of tp
  -	&neg	($num);
 
-	&lea	("esp",&DWP(-$frame,"esp"
,$num,4));	# alloca($frame+8*($num+1))
  -	&neg	($num);
  -	&and	("esp",-1024);		# minimize TLB
utilization
  -	&sub	($num,1);		# num is restored to its original
value
  -					# and will remain constant from now...
  +	&mov	("ebp","esp");		# saved
stack pointer!
  +	&add	("edi",2);		# extra two words on top
of tp
  +	&neg	("edi");
 
+	&lea	("esp",&DWP(-$frame,"esp"
,"edi",4));	# alloca($frame+4*(num+2))
  +	&neg	("edi");
  +	&and	("esp",-4096);		# minimize TLB
utilization
   
  
	&mov	("esi",&DWP(0,"esi"));	#
pull n0[0]
   	&mov	($_rp,"eax");		# ... save a copy of
argument block
  @@ -84,8 +75,23 @@
   	&mov	($_bp,"ecx");
   	&mov	($_np,"edx");
   	&mov	($_n0,"esi");
  -	#&mov	($_num,$num);		# redundant in sse2 context
  -	&mov	($_sp,"edi");		# saved stack pointer!
  +	&lea	($num,&DWP(-2,"edi"));	# num is
restored to its original value
  +	#&mov	($_num,$num);		# redundant as $num is not
reused
  +	&mov	($_sp,"ebp");		# saved stack pointer!
  +
  +if($sse2) {
  +$acc0="mm0";	# mmx register bank layout
  +$acc1="mm1";
  +$car0="mm2";
  +$car1="mm3";
  +$mul0="mm4";
  +$mul1="mm5";
  +$temp="mm6";
  +$mask="mm7";
  +
 
+	&picmeup("eax","OPENSSL_ia32cap_P"
);
  +	&bt	(&DWP(0,"eax"),26);
  +	&jnc	(&label("non_sse2"));
   
   	&mov	("eax",-1);
   	&movd	($mask,"eax");		# mask 32 lower bits
  @@ -195,7 +201,153 @@
   	&jl	(&label("outer"));
   
   	&emms	();				# done with mmx bank
  +	&jmp	(&label("common_tail"));
  +
  +&set_label("non_sse2",16);
  +}
  +
  +if (1) {
  +	&mov	("esp",$_sp);
  +	&xor	("eax","eax");	# signal
"not fast enough [yet]"
  +	&jmp	(&label("just_leave"));
  +	# The code below gives ~15% improvement on 512-bit
benchmark
  +	# *only*:-( On all other key lengths it's slower for up
to 20%.
  +	# This is because the original code path holds down the
overall
  +	# amount of multiplications by ~25% by deploying
bn_sqr_words.
  +	# In other words, for the code below to be competitive,
  +	# dedicated squaring procedure is a must...
  +} else {
  +$inp="esi";	# integer path uses these registers
differently
  +$word="edi";
  +$carry="ebp";
  +
  +	&sub	($num,1);		# non-SSE2 path uses num-1
  +
  +	&mov	($inp,$_ap);
  +	&mov	($word,$_bp);
  +	&lea	("eax",&DWP(4,$word,$num,4));		#
&bp[num]
  +	&mov	($word,&DWP(0,$word));			# bp[0]
  +	&mov	($_bpend,"eax");
  +	&xor	($j,$j);
  +	&xor	("edx","edx");
  +
  +&set_label("mull",16);
  +	&mov	("eax",&DWP(0,$inp,$j,4));		#
ap[j]
  +	&mov	($carry,"edx");
  +	&mul	($word);				# ap[j]*bp[0]
  +	&lea	($j,&DWP(1,$j));
  +	&add	("eax",$carry);
  +	&adc	("edx",0);
 
+	&mov	(&DWP($frame-4,"esp",$j,4),"ea
x");	# tp[j]=
  +	&cmp	($j,$num);
  +	&jb	(&label("mull"));
  +
  +	&mov	("eax",&DWP(0,$inp,$num,4));		#
ap[num-1]
  +	&mov	($carry,"edx");
  +	&mul	($word);				# ap[num-1]*bp[0]
  +	&add	("eax",$carry);
  +	&adc	("edx",0);
  +
  +	&mov	($word,$_n0);
  +	&mov	($inp,$_np);
  +	&imul	($word,&DWP($frame,"esp"));		#
n0*tp[0]
  +
 
+	&mov	(&DWP($frame,"esp",$num,4),"ea
x");	# tp[num-1]=
  +	&xor	($j,$j);
 
+	&mov	(&DWP($frame+4,"esp",$num,4),"
edx");	# tp[num]=
 
+	&mov	(&DWP($frame+8,"esp",$num,4),$j);	#
tp[num+1]=
  +
  +	&mov	("eax",&DWP(0,$inp));			# np[0]
  +	&mul	($word);				# np[0]*m
 
+	&add	("eax",&DWP($frame,"esp")
);		# +=tp[0]
  +	&adc	("edx",0);
  +	&mov	($j,1);
  +
  +	&jmp	(&label("2ndmadd"));
  +
  +&set_label("1stmadd",16);
  +	&mov	("eax",&DWP(0,$inp,$j,4));		#
ap[j]
  +	&mov	($carry,"edx");
  +	&mul	($word);				# ap[j]*bp[i]
  +	&lea	($j,&DWP(1,$j));
 
+	&add	("eax",&DWP($frame-4,"esp",$j,4));	# +=tp[j]
  +	&adc	("edx",0);
  +	&add	("eax",$carry);
  +	&adc	("edx",0);
 
+	&mov	(&DWP($frame-4,"esp",$j,4),"ea
x");	# tp[j]=
  +	&cmp	($j,$num);
  +	&jb	(&label("1stmadd"));
  +
  +	&mov	("eax",&DWP(0,$inp,$num,4));		#
ap[num-1]
  +	&mov	($carry,"edx");
  +	&mul	($word);				# ap[num-1]*bp[i]
 
+	&add	("eax",&DWP($frame,"esp",
$num,4));	# +=tp[num-1]
  +	&adc	("edx",0);
  +	&add	("eax",$carry);
  +	&adc	("edx",0);
  +
  +	&mov	($word,$_n0);
  +	&mov	($inp,$_np);
  +	&imul	($word,&DWP($frame,"esp"));		#
n0*tp[0]
  +
  +	&xor	($j,$j);
 
+	&add	("edx",&DWP($frame+4,"esp",$num,4));	# carry+=tp[num]
 
+	&mov	(&DWP($frame,"esp",$num,4),"ea
x");	# tp[num-1]=
  +	&adc	($j,0);
 
+	&mov	(&DWP($frame+4,"esp",$num,4),"
edx");	# tp[num]=
 
+	&mov	(&DWP($frame+8,"esp",$num,4),$j);	#
tp[num+1]=
  +
  +	&mov	("eax",&DWP(0,$inp));			# np[0]
  +	&mul	($word);				# np[0]*m
 
+	&add	("eax",&DWP($frame,"esp")
);		# +=tp[0]
  +	&adc	("edx",0);
  +	&mov	($j,1);
  +
  +&set_label("2ndmadd",16);
  +	&mov	("eax",&DWP(0,$inp,$j,4));		#
np[j]
  +	&mov	($carry,"edx");
  +	&mul	($word);				# np[j]*m
  +	&lea	($j,&DWP(1,$j));
 
+	&add	("eax",&DWP($frame-4,"esp",$j,4));	# +=tp[j]
  +	&adc	("edx",0);
  +	&add	("eax",$carry);
  +	&adc	("edx",0);
 
+	&mov	(&DWP($frame-8,"esp",$j,4),"ea
x");	# tp[j-1]=
  +	&cmp	($j,$num);
  +	&jb	(&label("2ndmadd"));
   
  +	&mov	("eax",&DWP(0,$inp,$num,4));		#
np[num-1]
  +	&mov	($carry,"edx");
  +	&mul	($word);				# np[num-1]*m
 
+	&add	("eax",&DWP($frame,"esp",
$num,4));	# +=tp[num-1]
  +	&adc	("edx",0);
  +	&add	("eax",$carry);
  +	&adc	("edx",0);
 
+	&mov	(&DWP($frame-4,"esp",$num,4),"
eax");	# tp[num-2]=
  +
  +	&xor	("eax","eax");
 
+	&add	("edx",&DWP($frame+4,"esp",$num,4));	# carry+=tp[num]
 
+	&adc	("eax",&DWP($frame+8,"esp",$num,4));	# +=tp[num+1]
 
+	&mov	(&DWP($frame,"esp",$num,4),"ed
x");	# tp[num-1]=
 
+	&mov	(&DWP($frame+4,"esp",$num,4),"
eax");	# tp[num]=
  +
  +	&mov	($carry,$_bp);				# &bp[i]
  +	&add	($carry,4);
  +	&cmp	($carry,$_bpend);
  +	&je	(&label("x86done"));
  +	&mov	($word,&DWP(0,$carry));			# bp[i]
  +	&mov	($inp,$_ap);
  +	&mov	($_bp,$carry);				# &bp[++i]
  +	&xor	($j,$j);
  +	&xor	("edx","edx");
  +	&jmp	(&label("1stmadd"));
  +
  +&set_label("x86done",16);
  +	&mov	($np,$_np);	# make adjustments for tail
processing
  +	&add	($num,1);
  +}
  +
  +&set_label("common_tail",16);
  
	&mov	("esi",&DWP($frame,"esp",$
num,4));# load upmost overflow bit
   	&mov	($rp,$_rp);			# load result pointer
   						# [$ap and $bp are zapped]
  @@ -206,15 +358,15 @@
  
	&mov	("eax",&DWP($frame,"esp",$
j,4));
   	&cmp	("eax",&DWP(0,$np,$j,4));	#
tp[num-1]-np[num-1]?
   	&jae	(&label("sub"));		# if taken CF
is cleared
  -&set_label("copy");
  +&set_label("copy",16);
  
	&mov	("eax",&DWP($frame,"esp",$
j,4));
   	&mov	(&DWP(0,$rp,$j,4),"eax");	#
rp[i]=tp[i]
   	&mov	(&DWP($frame,"esp",$j,4),$j);	#
zap temporary vector
   	&dec	($j);
   	&jge	(&label("copy"));
  -	&jmp	(&label("exit_sse2"));
  +	&jmp	(&label("exit"));
   
  -&set_label("sub",4);
  +&set_label("sub",16);
  
	&mov	("eax",&DWP($frame,"esp",$
i,4));
   	&sbb	("eax",&DWP(0,$np,$i,4));
   	&mov	(&DWP(0,$rp,$i,4),"eax");	#
rp[i]=tp[i]-np[i]
  @@ -224,21 +376,15 @@
   	&lea	($j,&DWP(-1,$num));		# j=num-1
   	&sbb	("esi",0);			# esi holds upmost
overflow bit
   	&jc	(&label("copy"));
  -&set_label("zap");
  +&set_label("zap",16);
   	&mov	(&DWP($frame,"esp",$j,4),$i);	#
zap temporary vector
   	&dec	($j);
   	&jge	(&label("zap"));
   
  -&set_label("exit_sse2");
  +&set_label("exit",4);
   	&mov	("esp",$_sp);		# pull saved stack
pointer
   	&mov	("eax",1);
  -	&jmp	(&label("leave"));
  -&set_label("non_sse2");
  -}
  -
  -	&xor	("eax","eax");	# zero
signals "not implemented [yet]"
  -
  -&set_label("leave");
  +&set_label("just_leave");
   &function_end("bn_mul_mont");
   
   &asm_finish();
   .
____________________________________________________________
__________
OpenSSL Project                                 http://www.openssl.org
CVS Repository Commit List                    
openssl-cvsopenssl.org
Automated List Manager                          
majordomoopenssl.org
[1]

about | contact  Other archives ( Real Estate discussion Medical topics )