aboutsummaryrefslogtreecommitdiffstats
path: root/crypto/bn
diff options
context:
space:
mode:
Diffstat (limited to 'crypto/bn')
-rwxr-xr-xcrypto/bn/asm/x86-mont.pl41
-rw-r--r--crypto/bn/asm/x86_64-gcc.c2
-rwxr-xr-xcrypto/bn/asm/x86_64-mont.pl185
-rwxr-xr-xcrypto/bn/asm/x86_64-mont5.pl227
-rw-r--r--crypto/bn/bn.h2
-rw-r--r--crypto/bn/bn_div.c4
-rw-r--r--crypto/bn/bn_lib.c2
-rw-r--r--crypto/bn/bn_print.c35
-rw-r--r--crypto/bn/bn_rand.c23
-rw-r--r--crypto/bn/bn_word.c22
-rw-r--r--crypto/bn/bntest.c8
11 files changed, 341 insertions, 210 deletions
diff --git a/crypto/bn/asm/x86-mont.pl b/crypto/bn/asm/x86-mont.pl
index 89f4de61e896..1c4003efc20a 100755
--- a/crypto/bn/asm/x86-mont.pl
+++ b/crypto/bn/asm/x86-mont.pl
@@ -63,27 +63,26 @@ $frame=32; # size of above frame rounded up to 16n
&lea ("esi",&wparam(0)); # put aside pointer to argument block
&lea ("edx",&wparam(1)); # load ap
- &mov ("ebp","esp"); # saved stack pointer!
&add ("edi",2); # extra two words on top of tp
&neg ("edi");
- &lea ("esp",&DWP(-$frame,"esp","edi",4)); # alloca($frame+4*(num+2))
+ &lea ("ebp",&DWP(-$frame,"esp","edi",4)); # future alloca($frame+4*(num+2))
&neg ("edi");
# minimize cache contention by arraning 2K window between stack
# pointer and ap argument [np is also position sensitive vector,
# but it's assumed to be near ap, as it's allocated at ~same
# time].
- &mov ("eax","esp");
+ &mov ("eax","ebp");
&sub ("eax","edx");
&and ("eax",2047);
- &sub ("esp","eax"); # this aligns sp and ap modulo 2048
+ &sub ("ebp","eax"); # this aligns sp and ap modulo 2048
- &xor ("edx","esp");
+ &xor ("edx","ebp");
&and ("edx",2048);
&xor ("edx",2048);
- &sub ("esp","edx"); # this splits them apart modulo 4096
+ &sub ("ebp","edx"); # this splits them apart modulo 4096
- &and ("esp",-64); # align to cache line
+ &and ("ebp",-64); # align to cache line
# Some OSes, *cough*-dows, insist on stack being "wired" to
# physical memory in strictly sequential manner, i.e. if stack
@@ -91,20 +90,28 @@ $frame=32; # size of above frame rounded up to 16n
# be punishable by SEGV. But page walking can do good even on
# other OSes, because it guarantees that villain thread hits
# the guard page before it can make damage to innocent one...
- &mov ("eax","ebp");
- &sub ("eax","esp");
+ &mov ("eax","esp");
+ &sub ("eax","ebp");
&and ("eax",-4096);
-&set_label("page_walk");
- &mov ("edx",&DWP(0,"esp","eax"));
- &sub ("eax",4096);
- &data_byte(0x2e);
- &jnc (&label("page_walk"));
+ &mov ("edx","esp"); # saved stack pointer!
+ &lea ("esp",&DWP(0,"ebp","eax"));
+ &mov ("eax",&DWP(0,"esp"));
+ &cmp ("esp","ebp");
+ &ja (&label("page_walk"));
+ &jmp (&label("page_walk_done"));
+
+&set_label("page_walk",16);
+ &lea ("esp",&DWP(-4096,"esp"));
+ &mov ("eax",&DWP(0,"esp"));
+ &cmp ("esp","ebp");
+ &ja (&label("page_walk"));
+&set_label("page_walk_done");
################################# load argument block...
&mov ("eax",&DWP(0*4,"esi"));# BN_ULONG *rp
&mov ("ebx",&DWP(1*4,"esi"));# const BN_ULONG *ap
&mov ("ecx",&DWP(2*4,"esi"));# const BN_ULONG *bp
- &mov ("edx",&DWP(3*4,"esi"));# const BN_ULONG *np
+ &mov ("ebp",&DWP(3*4,"esi"));# const BN_ULONG *np
&mov ("esi",&DWP(4*4,"esi"));# const BN_ULONG *n0
#&mov ("edi",&DWP(5*4,"esi"));# int num
@@ -112,11 +119,11 @@ $frame=32; # size of above frame rounded up to 16n
&mov ($_rp,"eax"); # ... save a copy of argument block
&mov ($_ap,"ebx");
&mov ($_bp,"ecx");
- &mov ($_np,"edx");
+ &mov ($_np,"ebp");
&mov ($_n0,"esi");
&lea ($num,&DWP(-3,"edi")); # num=num-1 to assist modulo-scheduling
#&mov ($_num,$num); # redundant as $num is not reused
- &mov ($_sp,"ebp"); # saved stack pointer!
+ &mov ($_sp,"edx"); # saved stack pointer!
if($sse2) {
$acc0="mm0"; # mmx register bank layout
diff --git a/crypto/bn/asm/x86_64-gcc.c b/crypto/bn/asm/x86_64-gcc.c
index d77dc433d405..1729b479d43e 100644
--- a/crypto/bn/asm/x86_64-gcc.c
+++ b/crypto/bn/asm/x86_64-gcc.c
@@ -194,7 +194,7 @@ BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d)
BN_ULONG ret, waste;
asm("divq %4":"=a"(ret), "=d"(waste)
- : "a"(l), "d"(h), "g"(d)
+ : "a"(l), "d"(h), "r"(d)
: "cc");
return ret;
diff --git a/crypto/bn/asm/x86_64-mont.pl b/crypto/bn/asm/x86_64-mont.pl
index 8fb6c994e1ef..044fd7ecc0fd 100755
--- a/crypto/bn/asm/x86_64-mont.pl
+++ b/crypto/bn/asm/x86_64-mont.pl
@@ -97,6 +97,8 @@ $code=<<___;
.type bn_mul_mont,\@function,6
.align 16
bn_mul_mont:
+ mov ${num}d,${num}d
+ mov %rsp,%rax
test \$3,${num}d
jnz .Lmul_enter
cmp \$8,${num}d
@@ -121,29 +123,36 @@ $code.=<<___;
push %r14
push %r15
- mov ${num}d,${num}d
- lea 2($num),%r10
+ neg $num
mov %rsp,%r11
- neg %r10
- lea (%rsp,%r10,8),%rsp # tp=alloca(8*(num+2))
- and \$-1024,%rsp # minimize TLB usage
+ lea -16(%rsp,$num,8),%r10 # future alloca(8*(num+2))
+ neg $num # restore $num
+ and \$-1024,%r10 # minimize TLB usage
- mov %r11,8(%rsp,$num,8) # tp[num+1]=%rsp
-.Lmul_body:
# Some OSes, *cough*-dows, insist on stack being "wired" to
# physical memory in strictly sequential manner, i.e. if stack
# allocation spans two pages, then reference to farmost one can
# be punishable by SEGV. But page walking can do good even on
# other OSes, because it guarantees that villain thread hits
# the guard page before it can make damage to innocent one...
- sub %rsp,%r11
+ sub %r10,%r11
and \$-4096,%r11
+ lea (%r10,%r11),%rsp
+ mov (%rsp),%r11
+ cmp %r10,%rsp
+ ja .Lmul_page_walk
+ jmp .Lmul_page_walk_done
+
+.align 16
.Lmul_page_walk:
- mov (%rsp,%r11),%r10
- sub \$4096,%r11
- .byte 0x66,0x2e # predict non-taken
- jnc .Lmul_page_walk
+ lea -4096(%rsp),%rsp
+ mov (%rsp),%r11
+ cmp %r10,%rsp
+ ja .Lmul_page_walk
+.Lmul_page_walk_done:
+ mov %rax,8(%rsp,$num,8) # tp[num+1]=%rsp
+.Lmul_body:
mov $bp,%r12 # reassign $bp
___
$bp="%r12";
@@ -314,13 +323,13 @@ $code.=<<___;
mov 8(%rsp,$num,8),%rsi # restore %rsp
mov \$1,%rax
- mov (%rsi),%r15
- mov 8(%rsi),%r14
- mov 16(%rsi),%r13
- mov 24(%rsi),%r12
- mov 32(%rsi),%rbp
- mov 40(%rsi),%rbx
- lea 48(%rsi),%rsp
+ mov -48(%rsi),%r15
+ mov -40(%rsi),%r14
+ mov -32(%rsi),%r13
+ mov -24(%rsi),%r12
+ mov -16(%rsi),%rbp
+ mov -8(%rsi),%rbx
+ lea (%rsi),%rsp
.Lmul_epilogue:
ret
.size bn_mul_mont,.-bn_mul_mont
@@ -332,6 +341,8 @@ $code.=<<___;
.type bn_mul4x_mont,\@function,6
.align 16
bn_mul4x_mont:
+ mov ${num}d,${num}d
+ mov %rsp,%rax
.Lmul4x_enter:
___
$code.=<<___ if ($addx);
@@ -347,23 +358,29 @@ $code.=<<___;
push %r14
push %r15
- mov ${num}d,${num}d
- lea 4($num),%r10
+ neg $num
mov %rsp,%r11
- neg %r10
- lea (%rsp,%r10,8),%rsp # tp=alloca(8*(num+4))
- and \$-1024,%rsp # minimize TLB usage
+ lea -32(%rsp,$num,8),%r10 # future alloca(8*(num+4))
+ neg $num # restore
+ and \$-1024,%r10 # minimize TLB usage
- mov %r11,8(%rsp,$num,8) # tp[num+1]=%rsp
-.Lmul4x_body:
- sub %rsp,%r11
+ sub %r10,%r11
and \$-4096,%r11
+ lea (%r10,%r11),%rsp
+ mov (%rsp),%r11
+ cmp %r10,%rsp
+ ja .Lmul4x_page_walk
+ jmp .Lmul4x_page_walk_done
+
.Lmul4x_page_walk:
- mov (%rsp,%r11),%r10
- sub \$4096,%r11
- .byte 0x2e # predict non-taken
- jnc .Lmul4x_page_walk
+ lea -4096(%rsp),%rsp
+ mov (%rsp),%r11
+ cmp %r10,%rsp
+ ja .Lmul4x_page_walk
+.Lmul4x_page_walk_done:
+ mov %rax,8(%rsp,$num,8) # tp[num+1]=%rsp
+.Lmul4x_body:
mov $rp,16(%rsp,$num,8) # tp[num+2]=$rp
mov %rdx,%r12 # reassign $bp
___
@@ -742,13 +759,13 @@ ___
$code.=<<___;
mov 8(%rsp,$num,8),%rsi # restore %rsp
mov \$1,%rax
- mov (%rsi),%r15
- mov 8(%rsi),%r14
- mov 16(%rsi),%r13
- mov 24(%rsi),%r12
- mov 32(%rsi),%rbp
- mov 40(%rsi),%rbx
- lea 48(%rsi),%rsp
+ mov -48(%rsi),%r15
+ mov -40(%rsi),%r14
+ mov -32(%rsi),%r13
+ mov -24(%rsi),%r12
+ mov -16(%rsi),%rbp
+ mov -8(%rsi),%rbx
+ lea (%rsi),%rsp
.Lmul4x_epilogue:
ret
.size bn_mul4x_mont,.-bn_mul4x_mont
@@ -778,14 +795,15 @@ $code.=<<___;
.type bn_sqr8x_mont,\@function,6
.align 32
bn_sqr8x_mont:
-.Lsqr8x_enter:
mov %rsp,%rax
+.Lsqr8x_enter:
push %rbx
push %rbp
push %r12
push %r13
push %r14
push %r15
+.Lsqr8x_prologue:
mov ${num}d,%r10d
shl \$3,${num}d # convert $num to bytes
@@ -798,33 +816,42 @@ bn_sqr8x_mont:
# do its job.
#
lea -64(%rsp,$num,2),%r11
+ mov %rsp,%rbp
mov ($n0),$n0 # *n0
sub $aptr,%r11
and \$4095,%r11
cmp %r11,%r10
jb .Lsqr8x_sp_alt
- sub %r11,%rsp # align with $aptr
- lea -64(%rsp,$num,2),%rsp # alloca(frame+2*$num)
+ sub %r11,%rbp # align with $aptr
+ lea -64(%rbp,$num,2),%rbp # future alloca(frame+2*$num)
jmp .Lsqr8x_sp_done
.align 32
.Lsqr8x_sp_alt:
lea 4096-64(,$num,2),%r10 # 4096-frame-2*$num
- lea -64(%rsp,$num,2),%rsp # alloca(frame+2*$num)
+ lea -64(%rbp,$num,2),%rbp # future alloca(frame+2*$num)
sub %r10,%r11
mov \$0,%r10
cmovc %r10,%r11
- sub %r11,%rsp
+ sub %r11,%rbp
.Lsqr8x_sp_done:
- and \$-64,%rsp
- mov %rax,%r11
- sub %rsp,%r11
+ and \$-64,%rbp
+ mov %rsp,%r11
+ sub %rbp,%r11
and \$-4096,%r11
+ lea (%rbp,%r11),%rsp
+ mov (%rsp),%r10
+ cmp %rbp,%rsp
+ ja .Lsqr8x_page_walk
+ jmp .Lsqr8x_page_walk_done
+
+.align 16
.Lsqr8x_page_walk:
- mov (%rsp,%r11),%r10
- sub \$4096,%r11
- .byte 0x2e # predict non-taken
- jnc .Lsqr8x_page_walk
+ lea -4096(%rsp),%rsp
+ mov (%rsp),%r10
+ cmp %rbp,%rsp
+ ja .Lsqr8x_page_walk
+.Lsqr8x_page_walk_done:
mov $num,%r10
neg $num
@@ -948,30 +975,38 @@ $code.=<<___;
.type bn_mulx4x_mont,\@function,6
.align 32
bn_mulx4x_mont:
-.Lmulx4x_enter:
mov %rsp,%rax
+.Lmulx4x_enter:
push %rbx
push %rbp
push %r12
push %r13
push %r14
push %r15
+.Lmulx4x_prologue:
shl \$3,${num}d # convert $num to bytes
- .byte 0x67
xor %r10,%r10
sub $num,%r10 # -$num
mov ($n0),$n0 # *n0
- lea -72(%rsp,%r10),%rsp # alloca(frame+$num+8)
- and \$-128,%rsp
- mov %rax,%r11
- sub %rsp,%r11
+ lea -72(%rsp,%r10),%rbp # future alloca(frame+$num+8)
+ and \$-128,%rbp
+ mov %rsp,%r11
+ sub %rbp,%r11
and \$-4096,%r11
+ lea (%rbp,%r11),%rsp
+ mov (%rsp),%r10
+ cmp %rbp,%rsp
+ ja .Lmulx4x_page_walk
+ jmp .Lmulx4x_page_walk_done
+
+.align 16
.Lmulx4x_page_walk:
- mov (%rsp,%r11),%r10
- sub \$4096,%r11
- .byte 0x66,0x2e # predict non-taken
- jnc .Lmulx4x_page_walk
+ lea -4096(%rsp),%rsp
+ mov (%rsp),%r10
+ cmp %rbp,%rsp
+ ja .Lmulx4x_page_walk
+.Lmulx4x_page_walk_done:
lea ($bp,$num),%r10
##############################################################
@@ -1332,22 +1367,8 @@ mul_handler:
mov 192($context),%r10 # pull $num
mov 8(%rax,%r10,8),%rax # pull saved stack pointer
- lea 48(%rax),%rax
-
- mov -8(%rax),%rbx
- mov -16(%rax),%rbp
- mov -24(%rax),%r12
- mov -32(%rax),%r13
- mov -40(%rax),%r14
- mov -48(%rax),%r15
- mov %rbx,144($context) # restore context->Rbx
- mov %rbp,160($context) # restore context->Rbp
- mov %r12,216($context) # restore context->R12
- mov %r13,224($context) # restore context->R13
- mov %r14,232($context) # restore context->R14
- mov %r15,240($context) # restore context->R15
- jmp .Lcommon_seh_tail
+ jmp .Lcommon_pop_regs
.size mul_handler,.-mul_handler
.type sqr_handler,\@abi-omnipotent
@@ -1375,15 +1396,21 @@ sqr_handler:
cmp %r10,%rbx # context->Rip<.Lsqr_body
jb .Lcommon_seh_tail
+ mov 4(%r11),%r10d # HandlerData[1]
+ lea (%rsi,%r10),%r10 # body label
+ cmp %r10,%rbx # context->Rip>=.Lsqr_epilogue
+ jb .Lcommon_pop_regs
+
mov 152($context),%rax # pull context->Rsp
- mov 4(%r11),%r10d # HandlerData[1]
+ mov 8(%r11),%r10d # HandlerData[2]
lea (%rsi,%r10),%r10 # epilogue label
cmp %r10,%rbx # context->Rip>=.Lsqr_epilogue
jae .Lcommon_seh_tail
mov 40(%rax),%rax # pull saved stack pointer
+.Lcommon_pop_regs:
mov -8(%rax),%rbx
mov -16(%rax),%rbp
mov -24(%rax),%r12
@@ -1470,13 +1497,15 @@ $code.=<<___;
.LSEH_info_bn_sqr8x_mont:
.byte 9,0,0,0
.rva sqr_handler
- .rva .Lsqr8x_body,.Lsqr8x_epilogue # HandlerData[]
+ .rva .Lsqr8x_prologue,.Lsqr8x_body,.Lsqr8x_epilogue # HandlerData[]
+.align 8
___
$code.=<<___ if ($addx);
.LSEH_info_bn_mulx4x_mont:
.byte 9,0,0,0
.rva sqr_handler
- .rva .Lmulx4x_body,.Lmulx4x_epilogue # HandlerData[]
+ .rva .Lmulx4x_prologue,.Lmulx4x_body,.Lmulx4x_epilogue # HandlerData[]
+.align 8
___
}
diff --git a/crypto/bn/asm/x86_64-mont5.pl b/crypto/bn/asm/x86_64-mont5.pl
index 938e17081803..f1fbb45b532b 100755
--- a/crypto/bn/asm/x86_64-mont5.pl
+++ b/crypto/bn/asm/x86_64-mont5.pl
@@ -86,6 +86,8 @@ $code=<<___;
.type bn_mul_mont_gather5,\@function,6
.align 64
bn_mul_mont_gather5:
+ mov ${num}d,${num}d
+ mov %rsp,%rax
test \$7,${num}d
jnz .Lmul_enter
___
@@ -97,10 +99,7 @@ $code.=<<___;
.align 16
.Lmul_enter:
- mov ${num}d,${num}d
- mov %rsp,%rax
movd `($win64?56:8)`(%rsp),%xmm5 # load 7th argument
- lea .Linc(%rip),%r10
push %rbx
push %rbp
push %r12
@@ -108,26 +107,36 @@ $code.=<<___;
push %r14
push %r15
- lea 2($num),%r11
- neg %r11
- lea -264(%rsp,%r11,8),%rsp # tp=alloca(8*(num+2)+256+8)
- and \$-1024,%rsp # minimize TLB usage
+ neg $num
+ mov %rsp,%r11
+ lea -280(%rsp,$num,8),%r10 # future alloca(8*(num+2)+256+8)
+ neg $num # restore $num
+ and \$-1024,%r10 # minimize TLB usage
- mov %rax,8(%rsp,$num,8) # tp[num+1]=%rsp
-.Lmul_body:
# Some OSes, *cough*-dows, insist on stack being "wired" to
# physical memory in strictly sequential manner, i.e. if stack
# allocation spans two pages, then reference to farmost one can
# be punishable by SEGV. But page walking can do good even on
# other OSes, because it guarantees that villain thread hits
# the guard page before it can make damage to innocent one...
- sub %rsp,%rax
- and \$-4096,%rax
+ sub %r10,%r11
+ and \$-4096,%r11
+ lea (%r10,%r11),%rsp
+ mov (%rsp),%r11
+ cmp %r10,%rsp
+ ja .Lmul_page_walk
+ jmp .Lmul_page_walk_done
+
.Lmul_page_walk:
- mov (%rsp,%rax),%r11
- sub \$4096,%rax
- .byte 0x2e # predict non-taken
- jnc .Lmul_page_walk
+ lea -4096(%rsp),%rsp
+ mov (%rsp),%r11
+ cmp %r10,%rsp
+ ja .Lmul_page_walk
+.Lmul_page_walk_done:
+
+ lea .Linc(%rip),%r10
+ mov %rax,8(%rsp,$num,8) # tp[num+1]=%rsp
+.Lmul_body:
lea 128($bp),%r12 # reassign $bp (+size optimization)
___
@@ -433,6 +442,8 @@ $code.=<<___;
.type bn_mul4x_mont_gather5,\@function,6
.align 32
bn_mul4x_mont_gather5:
+ .byte 0x67
+ mov %rsp,%rax
.Lmul4x_enter:
___
$code.=<<___ if ($addx);
@@ -441,14 +452,13 @@ $code.=<<___ if ($addx);
je .Lmulx4x_enter
___
$code.=<<___;
- .byte 0x67
- mov %rsp,%rax
push %rbx
push %rbp
push %r12
push %r13
push %r14
push %r15
+.Lmul4x_prologue:
.byte 0x67
shl \$3,${num}d # convert $num to bytes
@@ -465,32 +475,40 @@ $code.=<<___;
# calculated from 7th argument, the index.]
#
lea -320(%rsp,$num,2),%r11
+ mov %rsp,%rbp
sub $rp,%r11
and \$4095,%r11
cmp %r11,%r10
jb .Lmul4xsp_alt
- sub %r11,%rsp # align with $rp
- lea -320(%rsp,$num,2),%rsp # alloca(frame+2*num*8+256)
+ sub %r11,%rbp # align with $rp
+ lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*num*8+256)
jmp .Lmul4xsp_done
.align 32
.Lmul4xsp_alt:
lea 4096-320(,$num,2),%r10
- lea -320(%rsp,$num,2),%rsp # alloca(frame+2*num*8+256)
+ lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*num*8+256)
sub %r10,%r11
mov \$0,%r10
cmovc %r10,%r11
- sub %r11,%rsp
+ sub %r11,%rbp
.Lmul4xsp_done:
- and \$-64,%rsp
- mov %rax,%r11
- sub %rsp,%r11
+ and \$-64,%rbp
+ mov %rsp,%r11
+ sub %rbp,%r11
and \$-4096,%r11
+ lea (%rbp,%r11),%rsp
+ mov (%rsp),%r10
+ cmp %rbp,%rsp
+ ja .Lmul4x_page_walk
+ jmp .Lmul4x_page_walk_done
+
.Lmul4x_page_walk:
- mov (%rsp,%r11),%r10
- sub \$4096,%r11
- .byte 0x2e # predict non-taken
- jnc .Lmul4x_page_walk
+ lea -4096(%rsp),%rsp
+ mov (%rsp),%r10
+ cmp %rbp,%rsp
+ ja .Lmul4x_page_walk
+.Lmul4x_page_walk_done:
neg $num
@@ -1034,6 +1052,7 @@ $code.=<<___;
.type bn_power5,\@function,6
.align 32
bn_power5:
+ mov %rsp,%rax
___
$code.=<<___ if ($addx);
mov OPENSSL_ia32cap_P+8(%rip),%r11d
@@ -1042,13 +1061,13 @@ $code.=<<___ if ($addx);
je .Lpowerx5_enter
___
$code.=<<___;
- mov %rsp,%rax
push %rbx
push %rbp
push %r12
push %r13
push %r14
push %r15
+.Lpower5_prologue:
shl \$3,${num}d # convert $num to bytes
lea ($num,$num,2),%r10d # 3*$num
@@ -1063,32 +1082,40 @@ $code.=<<___;
# calculated from 7th argument, the index.]
#
lea -320(%rsp,$num,2),%r11
+ mov %rsp,%rbp
sub $rptr,%r11
and \$4095,%r11
cmp %r11,%r10
jb .Lpwr_sp_alt
- sub %r11,%rsp # align with $aptr
- lea -320(%rsp,$num,2),%rsp # alloca(frame+2*num*8+256)
+ sub %r11,%rbp # align with $aptr
+ lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*num*8+256)
jmp .Lpwr_sp_done
.align 32
.Lpwr_sp_alt:
lea 4096-320(,$num,2),%r10
- lea -320(%rsp,$num,2),%rsp # alloca(frame+2*num*8+256)
+ lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*num*8+256)
sub %r10,%r11
mov \$0,%r10
cmovc %r10,%r11
- sub %r11,%rsp
+ sub %r11,%rbp
.Lpwr_sp_done:
- and \$-64,%rsp
- mov %rax,%r11
- sub %rsp,%r11
+ and \$-64,%rbp
+ mov %rsp,%r11
+ sub %rbp,%r11
and \$-4096,%r11
+ lea (%rbp,%r11),%rsp
+ mov (%rsp),%r10
+ cmp %rbp,%rsp
+ ja .Lpwr_page_walk
+ jmp .Lpwr_page_walk_done
+
.Lpwr_page_walk:
- mov (%rsp,%r11),%r10
- sub \$4096,%r11
- .byte 0x2e # predict non-taken
- jnc .Lpwr_page_walk
+ lea -4096(%rsp),%rsp
+ mov (%rsp),%r10
+ cmp %rbp,%rsp
+ ja .Lpwr_page_walk
+.Lpwr_page_walk_done:
mov $num,%r10
neg $num
@@ -2028,6 +2055,7 @@ bn_from_mont8x:
push %r13
push %r14
push %r15
+.Lfrom_prologue:
shl \$3,${num}d # convert $num to bytes
lea ($num,$num,2),%r10 # 3*$num in bytes
@@ -2042,32 +2070,40 @@ bn_from_mont8x:
# last operation, we use the opportunity to cleanse it.
#
lea -320(%rsp,$num,2),%r11
+ mov %rsp,%rbp
sub $rptr,%r11
and \$4095,%r11
cmp %r11,%r10
jb .Lfrom_sp_alt
- sub %r11,%rsp # align with $aptr
- lea -320(%rsp,$num,2),%rsp # alloca(frame+2*$num*8+256)
+ sub %r11,%rbp # align with $aptr
+ lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*$num*8+256)
jmp .Lfrom_sp_done
.align 32
.Lfrom_sp_alt:
lea 4096-320(,$num,2),%r10
- lea -320(%rsp,$num,2),%rsp # alloca(frame+2*$num*8+256)
+ lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*$num*8+256)
sub %r10,%r11
mov \$0,%r10
cmovc %r10,%r11
- sub %r11,%rsp
+ sub %r11,%rbp
.Lfrom_sp_done:
- and \$-64,%rsp
- mov %rax,%r11
- sub %rsp,%r11
+ and \$-64,%rbp
+ mov %rsp,%r11
+ sub %rbp,%r11
and \$-4096,%r11
+ lea (%rbp,%r11),%rsp
+ mov (%rsp),%r10
+ cmp %rbp,%rsp
+ ja .Lfrom_page_walk
+ jmp .Lfrom_page_walk_done
+
.Lfrom_page_walk:
- mov (%rsp,%r11),%r10
- sub \$4096,%r11
- .byte 0x2e # predict non-taken
- jnc .Lfrom_page_walk
+ lea -4096(%rsp),%rsp
+ mov (%rsp),%r10
+ cmp %rbp,%rsp
+ ja .Lfrom_page_walk
+.Lfrom_page_walk_done:
mov $num,%r10
neg $num
@@ -2173,14 +2209,15 @@ $code.=<<___;
.type bn_mulx4x_mont_gather5,\@function,6
.align 32
bn_mulx4x_mont_gather5:
-.Lmulx4x_enter:
mov %rsp,%rax
+.Lmulx4x_enter:
push %rbx
push %rbp
push %r12
push %r13
push %r14
push %r15
+.Lmulx4x_prologue:
shl \$3,${num}d # convert $num to bytes
lea ($num,$num,2),%r10 # 3*$num in bytes
@@ -2197,31 +2234,39 @@ bn_mulx4x_mont_gather5:
# calculated from 7th argument, the index.]
#
lea -320(%rsp,$num,2),%r11
+ mov %rsp,%rbp
sub $rp,%r11
and \$4095,%r11
cmp %r11,%r10
jb .Lmulx4xsp_alt
- sub %r11,%rsp # align with $aptr
- lea -320(%rsp,$num,2),%rsp # alloca(frame+2*$num*8+256)
+ sub %r11,%rbp # align with $aptr
+ lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*$num*8+256)
jmp .Lmulx4xsp_done
.Lmulx4xsp_alt:
lea 4096-320(,$num,2),%r10
- lea -320(%rsp,$num,2),%rsp # alloca(frame+2*$num*8+256)
+ lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*$num*8+256)
sub %r10,%r11
mov \$0,%r10
cmovc %r10,%r11
- sub %r11,%rsp
+ sub %r11,%rbp
.Lmulx4xsp_done:
- and \$-64,%rsp # ensure alignment
- mov %rax,%r11
- sub %rsp,%r11
+ and \$-64,%rbp # ensure alignment
+ mov %rsp,%r11
+ sub %rbp,%r11
and \$-4096,%r11
+ lea (%rbp,%r11),%rsp
+ mov (%rsp),%r10
+ cmp %rbp,%rsp
+ ja .Lmulx4x_page_walk
+ jmp .Lmulx4x_page_walk_done
+
.Lmulx4x_page_walk:
- mov (%rsp,%r11),%r10
- sub \$4096,%r11
- .byte 0x2e # predict non-taken
- jnc .Lmulx4x_page_walk
+ lea -4096(%rsp),%rsp
+ mov (%rsp),%r10
+ cmp %rbp,%rsp
+ ja .Lmulx4x_page_walk
+.Lmulx4x_page_walk_done:
##############################################################
# Stack layout
@@ -2629,14 +2674,15 @@ $code.=<<___;
.type bn_powerx5,\@function,6
.align 32
bn_powerx5:
-.Lpowerx5_enter:
mov %rsp,%rax
+.Lpowerx5_enter:
push %rbx
push %rbp
push %r12
push %r13
push %r14
push %r15
+.Lpowerx5_prologue:
shl \$3,${num}d # convert $num to bytes
lea ($num,$num,2),%r10 # 3*$num in bytes
@@ -2651,32 +2697,40 @@ bn_powerx5:
# calculated from 7th argument, the index.]
#
lea -320(%rsp,$num,2),%r11
+ mov %rsp,%rbp
sub $rptr,%r11
and \$4095,%r11
cmp %r11,%r10
jb .Lpwrx_sp_alt
- sub %r11,%rsp # align with $aptr
- lea -320(%rsp,$num,2),%rsp # alloca(frame+2*$num*8+256)
+ sub %r11,%rbp # align with $aptr
+ lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*$num*8+256)
jmp .Lpwrx_sp_done
.align 32
.Lpwrx_sp_alt:
lea 4096-320(,$num,2),%r10
- lea -320(%rsp,$num,2),%rsp # alloca(frame+2*$num*8+256)
+ lea -320(%rbp,$num,2),%rbp # alloca(frame+2*$num*8+256)
sub %r10,%r11
mov \$0,%r10
cmovc %r10,%r11
- sub %r11,%rsp
+ sub %r11,%rbp
.Lpwrx_sp_done:
- and \$-64,%rsp
- mov %rax,%r11
- sub %rsp,%r11
+ and \$-64,%rbp
+ mov %rsp,%r11
+ sub %rbp,%r11
and \$-4096,%r11
+ lea (%rbp,%r11),%rsp
+ mov (%rsp),%r10
+ cmp %rbp,%rsp
+ ja .Lpwrx_page_walk
+ jmp .Lpwrx_page_walk_done
+
.Lpwrx_page_walk:
- mov (%rsp,%r11),%r10
- sub \$4096,%r11
- .byte 0x2e # predict non-taken
- jnc .Lpwrx_page_walk
+ lea -4096(%rsp),%rsp
+ mov (%rsp),%r10
+ cmp %rbp,%rsp
+ ja .Lpwrx_page_walk
+.Lpwrx_page_walk_done:
mov $num,%r10
neg $num
@@ -3607,9 +3661,14 @@ mul_handler:
cmp %r10,%rbx # context->Rip<end of prologue label
jb .Lcommon_seh_tail
+ mov 4(%r11),%r10d # HandlerData[1]
+ lea (%rsi,%r10),%r10 # epilogue label
+ cmp %r10,%rbx # context->Rip>=epilogue label
+ jb .Lcommon_pop_regs
+
mov 152($context),%rax # pull context->Rsp
- mov 4(%r11),%r10d # HandlerData[1]
+ mov 8(%r11),%r10d # HandlerData[2]
lea (%rsi,%r10),%r10 # epilogue label
cmp %r10,%rbx # context->Rip>=epilogue label
jae .Lcommon_seh_tail
@@ -3621,11 +3680,11 @@ mul_handler:
mov 192($context),%r10 # pull $num
mov 8(%rax,%r10,8),%rax # pull saved stack pointer
- jmp .Lbody_proceed
+ jmp .Lcommon_pop_regs
.Lbody_40:
mov 40(%rax),%rax # pull saved stack pointer
-.Lbody_proceed:
+.Lcommon_pop_regs:
mov -8(%rax),%rbx
mov -16(%rax),%rbp
mov -24(%rax),%r12
@@ -3716,34 +3775,34 @@ $code.=<<___;
.LSEH_info_bn_mul_mont_gather5:
.byte 9,0,0,0
.rva mul_handler
- .rva .Lmul_body,.Lmul_epilogue # HandlerData[]
+ .rva .Lmul_body,.Lmul_body,.Lmul_epilogue # HandlerData[]
.align 8
.LSEH_info_bn_mul4x_mont_gather5:
.byte 9,0,0,0
.rva mul_handler
- .rva .Lmul4x_body,.Lmul4x_epilogue # HandlerData[]
+ .rva .Lmul4x_prologue,.Lmul4x_body,.Lmul4x_epilogue # HandlerData[]
.align 8
.LSEH_info_bn_power5:
.byte 9,0,0,0
.rva mul_handler
- .rva .Lpower5_body,.Lpower5_epilogue # HandlerData[]
+ .rva .Lpower5_prologue,.Lpower5_body,.Lpower5_epilogue # HandlerData[]
.align 8
.LSEH_info_bn_from_mont8x:
.byte 9,0,0,0
.rva mul_handler
- .rva .Lfrom_body,.Lfrom_epilogue # HandlerData[]
+ .rva .Lfrom_prologue,.Lfrom_body,.Lfrom_epilogue # HandlerData[]
___
$code.=<<___ if ($addx);
.align 8
.LSEH_info_bn_mulx4x_mont_gather5:
.byte 9,0,0,0
.rva mul_handler
- .rva .Lmulx4x_body,.Lmulx4x_epilogue # HandlerData[]
+ .rva .Lmulx4x_prologue,.Lmulx4x_body,.Lmulx4x_epilogue # HandlerData[]
.align 8
.LSEH_info_bn_powerx5:
.byte 9,0,0,0
.rva mul_handler
- .rva .Lpowerx5_body,.Lpowerx5_epilogue # HandlerData[]
+ .rva .Lpowerx5_prologue,.Lpowerx5_body,.Lpowerx5_epilogue # HandlerData[]
___
$code.=<<___;
.align 8
diff --git a/crypto/bn/bn.h b/crypto/bn/bn.h
index 86264ae6315f..633d1b1f6013 100644
--- a/crypto/bn/bn.h
+++ b/crypto/bn/bn.h
@@ -842,6 +842,8 @@ int RAND_pseudo_bytes(unsigned char *buf, int num);
if (*(ftl--)) break; \
(a)->top = tmp_top; \
} \
+ if ((a)->top == 0) \
+ (a)->neg = 0; \
bn_pollute(a); \
}
diff --git a/crypto/bn/bn_div.c b/crypto/bn/bn_div.c
index 72e6ce3f74c0..bc37671cf138 100644
--- a/crypto/bn/bn_div.c
+++ b/crypto/bn/bn_div.c
@@ -155,7 +155,7 @@ int BN_div(BIGNUM *dv, BIGNUM *rem, const BIGNUM *m, const BIGNUM *d,
({ asm volatile ( \
"divl %4" \
: "=a"(q), "=d"(rem) \
- : "a"(n1), "d"(n0), "g"(d0) \
+ : "a"(n1), "d"(n0), "r"(d0) \
: "cc"); \
q; \
})
@@ -170,7 +170,7 @@ int BN_div(BIGNUM *dv, BIGNUM *rem, const BIGNUM *m, const BIGNUM *d,
({ asm volatile ( \
"divq %4" \
: "=a"(q), "=d"(rem) \
- : "a"(n1), "d"(n0), "g"(d0) \
+ : "a"(n1), "d"(n0), "r"(d0) \
: "cc"); \
q; \
})
diff --git a/crypto/bn/bn_lib.c b/crypto/bn/bn_lib.c
index 80105fff410c..10b78f512607 100644
--- a/crypto/bn/bn_lib.c
+++ b/crypto/bn/bn_lib.c
@@ -569,7 +569,7 @@ void BN_clear(BIGNUM *a)
{
bn_check_top(a);
if (a->d != NULL)
- memset(a->d, 0, a->dmax * sizeof(a->d[0]));
+ OPENSSL_cleanse(a->d, a->dmax * sizeof(a->d[0]));
a->top = 0;
a->neg = 0;
}
diff --git a/crypto/bn/bn_print.c b/crypto/bn/bn_print.c
index bfa31efc5621..f121fb6e9a08 100644
--- a/crypto/bn/bn_print.c
+++ b/crypto/bn/bn_print.c
@@ -72,12 +72,9 @@ char *BN_bn2hex(const BIGNUM *a)
char *buf;
char *p;
- if (a->neg && BN_is_zero(a)) {
- /* "-0" == 3 bytes including NULL terminator */
- buf = OPENSSL_malloc(3);
- } else {
- buf = OPENSSL_malloc(a->top * BN_BYTES * 2 + 2);
- }
+ if (BN_is_zero(a))
+ return OPENSSL_strdup("0");
+ buf = OPENSSL_malloc(a->top * BN_BYTES * 2 + 2);
if (buf == NULL) {
BNerr(BN_F_BN_BN2HEX, ERR_R_MALLOC_FAILURE);
goto err;
@@ -111,6 +108,7 @@ char *BN_bn2dec(const BIGNUM *a)
char *p;
BIGNUM *t = NULL;
BN_ULONG *bn_data = NULL, *lp;
+ int bn_data_num;
/*-
* get an upper bound for the length of the decimal integer
@@ -120,9 +118,9 @@ char *BN_bn2dec(const BIGNUM *a)
*/
i = BN_num_bits(a) * 3;
num = (i / 10 + i / 1000 + 1) + 1;
- bn_data =
- (BN_ULONG *)OPENSSL_malloc((num / BN_DEC_NUM + 1) * sizeof(BN_ULONG));
- buf = (char *)OPENSSL_malloc(num + 3);
+ bn_data_num = num / BN_DEC_NUM + 1;
+ bn_data = OPENSSL_malloc(bn_data_num * sizeof(BN_ULONG));
+ buf = OPENSSL_malloc(num + 3);
if ((buf == NULL) || (bn_data == NULL)) {
BNerr(BN_F_BN_BN2DEC, ERR_R_MALLOC_FAILURE);
goto err;
@@ -140,9 +138,12 @@ char *BN_bn2dec(const BIGNUM *a)
if (BN_is_negative(t))
*p++ = '-';
- i = 0;
while (!BN_is_zero(t)) {
+ if (lp - bn_data >= bn_data_num)
+ goto err;
*lp = BN_div_word(t, BN_DEC_CONV);
+ if (*lp == (BN_ULONG)-1)
+ goto err;
lp++;
}
lp--;
@@ -240,10 +241,12 @@ int BN_hex2bn(BIGNUM **bn, const char *a)
}
ret->top = h;
bn_correct_top(ret);
- ret->neg = neg;
*bn = ret;
bn_check_top(ret);
+ /* Don't set the negative flag if it's zero. */
+ if (ret->top != 0)
+ ret->neg = neg;
return (num);
err:
if (*bn == NULL)
@@ -295,7 +298,7 @@ int BN_dec2bn(BIGNUM **bn, const char *a)
if (j == BN_DEC_NUM)
j = 0;
l = 0;
- while (*a) {
+ while (--i >= 0) {
l *= 10;
l += *a - '0';
a++;
@@ -306,11 +309,13 @@ int BN_dec2bn(BIGNUM **bn, const char *a)
j = 0;
}
}
- ret->neg = neg;
bn_correct_top(ret);
*bn = ret;
bn_check_top(ret);
+ /* Don't set the negative flag if it's zero. */
+ if (ret->top != 0)
+ ret->neg = neg;
return (num);
err:
if (*bn == NULL)
@@ -321,6 +326,7 @@ int BN_dec2bn(BIGNUM **bn, const char *a)
int BN_asc2bn(BIGNUM **bn, const char *a)
{
const char *p = a;
+
if (*p == '-')
p++;
@@ -331,7 +337,8 @@ int BN_asc2bn(BIGNUM **bn, const char *a)
if (!BN_dec2bn(bn, p))
return 0;
}
- if (*a == '-')
+ /* Don't set the negative flag if it's zero. */
+ if (*a == '-' && (*bn)->top != 0)
(*bn)->neg = 1;
return 1;
}
diff --git a/crypto/bn/bn_rand.c b/crypto/bn/bn_rand.c
index f9fb2e9e45e0..60d3f2260ba1 100644
--- a/crypto/bn/bn_rand.c
+++ b/crypto/bn/bn_rand.c
@@ -121,15 +121,14 @@ static int bnrand(int pseudorand, BIGNUM *rnd, int bits, int top, int bottom)
int ret = 0, bit, bytes, mask;
time_t tim;
- if (bits < 0 || (bits == 1 && top > 0)) {
- BNerr(BN_F_BNRAND, BN_R_BITS_TOO_SMALL);
- return 0;
- }
-
if (bits == 0) {
+ if (top != -1 || bottom != 0)
+ goto toosmall;
BN_zero(rnd);
return 1;
}
+ if (bits < 0 || (bits == 1 && top > 0))
+ goto toosmall;
bytes = (bits + 7) / 8;
bit = (bits - 1) % 8;
@@ -145,13 +144,9 @@ static int bnrand(int pseudorand, BIGNUM *rnd, int bits, int top, int bottom)
time(&tim);
RAND_add(&tim, sizeof(tim), 0.0);
- if (pseudorand) {
- if (RAND_pseudo_bytes(buf, bytes) == -1)
- goto err;
- } else {
- if (RAND_bytes(buf, bytes) <= 0)
- goto err;
- }
+ /* We ignore the value of pseudorand and always call RAND_bytes */
+ if (RAND_bytes(buf, bytes) <= 0)
+ goto err;
#if 1
if (pseudorand == 2) {
@@ -199,6 +194,10 @@ static int bnrand(int pseudorand, BIGNUM *rnd, int bits, int top, int bottom)
}
bn_check_top(rnd);
return (ret);
+
+toosmall:
+ BNerr(BN_F_BNRAND, BN_R_BITS_TOO_SMALL);
+ return 0;
}
int BN_rand(BIGNUM *rnd, int bits, int top, int bottom)
diff --git a/crypto/bn/bn_word.c b/crypto/bn/bn_word.c
index b031a60b5bf8..9b5f9cb98c3a 100644
--- a/crypto/bn/bn_word.c
+++ b/crypto/bn/bn_word.c
@@ -72,10 +72,32 @@ BN_ULONG BN_mod_word(const BIGNUM *a, BN_ULONG w)
if (w == 0)
return (BN_ULONG)-1;
+#ifndef BN_LLONG
+ /*
+ * If |w| is too long and we don't have BN_ULLONG then we need to fall
+ * back to using BN_div_word
+ */
+ if (w > ((BN_ULONG)1 << BN_BITS4)) {
+ BIGNUM *tmp = BN_dup(a);
+ if (tmp == NULL)
+ return (BN_ULONG)-1;
+
+ ret = BN_div_word(tmp, w);
+ BN_free(tmp);
+
+ return ret;
+ }
+#endif
+
bn_check_top(a);
w &= BN_MASK2;
for (i = a->top - 1; i >= 0; i--) {
#ifndef BN_LLONG
+ /*
+ * We can assume here that | w <= ((BN_ULONG)1 << BN_BITS4) | and so
+ * | ret < ((BN_ULONG)1 << BN_BITS4) | and therefore the shifts here are
+ * safe and will not overflow
+ */
ret = ((ret << BN_BITS4) | ((a->d[i] >> BN_BITS4) & BN_MASK2l)) % w;
ret = ((ret << BN_BITS4) | (a->d[i] & BN_MASK2l)) % w;
#else
diff --git a/crypto/bn/bntest.c b/crypto/bn/bntest.c
index 1e35988022bb..a327b1a647b2 100644
--- a/crypto/bn/bntest.c
+++ b/crypto/bn/bntest.c
@@ -514,7 +514,7 @@ static void print_word(BIO *bp, BN_ULONG w)
int test_div_word(BIO *bp)
{
BIGNUM a, b;
- BN_ULONG r, s;
+ BN_ULONG r, rmod, s;
int i;
BN_init(&a);
@@ -528,8 +528,14 @@ int test_div_word(BIO *bp)
s = b.d[0];
BN_copy(&b, &a);
+ rmod = BN_mod_word(&b, s);
r = BN_div_word(&b, s);
+ if (rmod != r) {
+ fprintf(stderr, "Mod (word) test failed!\n");
+ return 0;
+ }
+
if (bp != NULL) {
if (!results) {
BN_print(bp, &a);