aboutsummaryrefslogtreecommitdiffstats
path: root/crypto/bn/asm/x86_64-mont5.pl
diff options
context:
space:
mode:
Diffstat (limited to 'crypto/bn/asm/x86_64-mont5.pl')
-rwxr-xr-xcrypto/bn/asm/x86_64-mont5.pl1276
1 files changed, 719 insertions, 557 deletions
diff --git a/crypto/bn/asm/x86_64-mont5.pl b/crypto/bn/asm/x86_64-mont5.pl
index 292409c4ffb8..2e8c9db32cbc 100755
--- a/crypto/bn/asm/x86_64-mont5.pl
+++ b/crypto/bn/asm/x86_64-mont5.pl
@@ -99,58 +99,111 @@ $code.=<<___;
.Lmul_enter:
mov ${num}d,${num}d
mov %rsp,%rax
- mov `($win64?56:8)`(%rsp),%r10d # load 7th argument
+ movd `($win64?56:8)`(%rsp),%xmm5 # load 7th argument
+ lea .Linc(%rip),%r10
push %rbx
push %rbp
push %r12
push %r13
push %r14
push %r15
-___
-$code.=<<___ if ($win64);
- lea -0x28(%rsp),%rsp
- movaps %xmm6,(%rsp)
- movaps %xmm7,0x10(%rsp)
-___
-$code.=<<___;
+
lea 2($num),%r11
neg %r11
- lea (%rsp,%r11,8),%rsp # tp=alloca(8*(num+2))
+ lea -264(%rsp,%r11,8),%rsp # tp=alloca(8*(num+2)+256+8)
and \$-1024,%rsp # minimize TLB usage
mov %rax,8(%rsp,$num,8) # tp[num+1]=%rsp
.Lmul_body:
- mov $bp,%r12 # reassign $bp
+ lea 128($bp),%r12 # reassign $bp (+size optimization)
___
$bp="%r12";
$STRIDE=2**5*8; # 5 is "window size"
$N=$STRIDE/4; # should match cache line size
$code.=<<___;
- mov %r10,%r11
- shr \$`log($N/8)/log(2)`,%r10
- and \$`$N/8-1`,%r11
- not %r10
- lea .Lmagic_masks(%rip),%rax
- and \$`2**5/($N/8)-1`,%r10 # 5 is "window size"
- lea 96($bp,%r11,8),$bp # pointer within 1st cache line
- movq 0(%rax,%r10,8),%xmm4 # set of masks denoting which
- movq 8(%rax,%r10,8),%xmm5 # cache line contains element
- movq 16(%rax,%r10,8),%xmm6 # denoted by 7th argument
- movq 24(%rax,%r10,8),%xmm7
-
- movq `0*$STRIDE/4-96`($bp),%xmm0
- movq `1*$STRIDE/4-96`($bp),%xmm1
- pand %xmm4,%xmm0
- movq `2*$STRIDE/4-96`($bp),%xmm2
- pand %xmm5,%xmm1
- movq `3*$STRIDE/4-96`($bp),%xmm3
- pand %xmm6,%xmm2
- por %xmm1,%xmm0
- pand %xmm7,%xmm3
+ movdqa 0(%r10),%xmm0 # 00000001000000010000000000000000
+ movdqa 16(%r10),%xmm1 # 00000002000000020000000200000002
+ lea 24-112(%rsp,$num,8),%r10# place the mask after tp[num+3] (+ICache optimization)
+ and \$-16,%r10
+
+ pshufd \$0,%xmm5,%xmm5 # broadcast index
+ movdqa %xmm1,%xmm4
+ movdqa %xmm1,%xmm2
+___
+########################################################################
+# calculate mask by comparing 0..31 to index and save result to stack
+#
+$code.=<<___;
+ paddd %xmm0,%xmm1
+ pcmpeqd %xmm5,%xmm0 # compare to 1,0
+ .byte 0x67
+ movdqa %xmm4,%xmm3
+___
+for($k=0;$k<$STRIDE/16-4;$k+=4) {
+$code.=<<___;
+ paddd %xmm1,%xmm2
+ pcmpeqd %xmm5,%xmm1 # compare to 3,2
+ movdqa %xmm0,`16*($k+0)+112`(%r10)
+ movdqa %xmm4,%xmm0
+
+ paddd %xmm2,%xmm3
+ pcmpeqd %xmm5,%xmm2 # compare to 5,4
+ movdqa %xmm1,`16*($k+1)+112`(%r10)
+ movdqa %xmm4,%xmm1
+
+ paddd %xmm3,%xmm0
+ pcmpeqd %xmm5,%xmm3 # compare to 7,6
+ movdqa %xmm2,`16*($k+2)+112`(%r10)
+ movdqa %xmm4,%xmm2
+
+ paddd %xmm0,%xmm1
+ pcmpeqd %xmm5,%xmm0
+ movdqa %xmm3,`16*($k+3)+112`(%r10)
+ movdqa %xmm4,%xmm3
+___
+}
+$code.=<<___; # last iteration can be optimized
+ paddd %xmm1,%xmm2
+ pcmpeqd %xmm5,%xmm1
+ movdqa %xmm0,`16*($k+0)+112`(%r10)
+
+ paddd %xmm2,%xmm3
+ .byte 0x67
+ pcmpeqd %xmm5,%xmm2
+ movdqa %xmm1,`16*($k+1)+112`(%r10)
+
+ pcmpeqd %xmm5,%xmm3
+ movdqa %xmm2,`16*($k+2)+112`(%r10)
+ pand `16*($k+0)-128`($bp),%xmm0 # while it's still in register
+
+ pand `16*($k+1)-128`($bp),%xmm1
+ pand `16*($k+2)-128`($bp),%xmm2
+ movdqa %xmm3,`16*($k+3)+112`(%r10)
+ pand `16*($k+3)-128`($bp),%xmm3
por %xmm2,%xmm0
+ por %xmm3,%xmm1
+___
+for($k=0;$k<$STRIDE/16-4;$k+=4) {
+$code.=<<___;
+ movdqa `16*($k+0)-128`($bp),%xmm4
+ movdqa `16*($k+1)-128`($bp),%xmm5
+ movdqa `16*($k+2)-128`($bp),%xmm2
+ pand `16*($k+0)+112`(%r10),%xmm4
+ movdqa `16*($k+3)-128`($bp),%xmm3
+ pand `16*($k+1)+112`(%r10),%xmm5
+ por %xmm4,%xmm0
+ pand `16*($k+2)+112`(%r10),%xmm2
+ por %xmm5,%xmm1
+ pand `16*($k+3)+112`(%r10),%xmm3
+ por %xmm2,%xmm0
+ por %xmm3,%xmm1
+___
+}
+$code.=<<___;
+ por %xmm1,%xmm0
+ pshufd \$0x4e,%xmm0,%xmm1
+ por %xmm1,%xmm0
lea $STRIDE($bp),$bp
- por %xmm3,%xmm0
-
movq %xmm0,$m0 # m0=bp[0]
mov ($n0),$n0 # pull n0[0] value
@@ -159,29 +212,14 @@ $code.=<<___;
xor $i,$i # i=0
xor $j,$j # j=0
- movq `0*$STRIDE/4-96`($bp),%xmm0
- movq `1*$STRIDE/4-96`($bp),%xmm1
- pand %xmm4,%xmm0
- movq `2*$STRIDE/4-96`($bp),%xmm2
- pand %xmm5,%xmm1
-
mov $n0,$m1
mulq $m0 # ap[0]*bp[0]
mov %rax,$lo0
mov ($np),%rax
- movq `3*$STRIDE/4-96`($bp),%xmm3
- pand %xmm6,%xmm2
- por %xmm1,%xmm0
- pand %xmm7,%xmm3
-
imulq $lo0,$m1 # "tp[0]"*n0
mov %rdx,$hi0
- por %xmm2,%xmm0
- lea $STRIDE($bp),$bp
- por %xmm3,%xmm0
-
mulq $m1 # np[0]*m1
add %rax,$lo0 # discarded
mov 8($ap),%rax
@@ -212,16 +250,14 @@ $code.=<<___;
mulq $m1 # np[j]*m1
cmp $num,$j
- jne .L1st
-
- movq %xmm0,$m0 # bp[1]
+ jne .L1st # note that upon exit $j==$num, so
+ # they can be used interchangeably
add %rax,$hi1
- mov ($ap),%rax # ap[0]
adc \$0,%rdx
add $hi0,$hi1 # np[j]*m1+ap[j]*bp[0]
adc \$0,%rdx
- mov $hi1,-16(%rsp,$j,8) # tp[j-1]
+ mov $hi1,-16(%rsp,$num,8) # tp[num-1]
mov %rdx,$hi1
mov $lo0,$hi0
@@ -235,33 +271,48 @@ $code.=<<___;
jmp .Louter
.align 16
.Louter:
+ lea 24+128(%rsp,$num,8),%rdx # where 256-byte mask is (+size optimization)
+ and \$-16,%rdx
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
+___
+for($k=0;$k<$STRIDE/16;$k+=4) {
+$code.=<<___;
+ movdqa `16*($k+0)-128`($bp),%xmm0
+ movdqa `16*($k+1)-128`($bp),%xmm1
+ movdqa `16*($k+2)-128`($bp),%xmm2
+ movdqa `16*($k+3)-128`($bp),%xmm3
+ pand `16*($k+0)-128`(%rdx),%xmm0
+ pand `16*($k+1)-128`(%rdx),%xmm1
+ por %xmm0,%xmm4
+ pand `16*($k+2)-128`(%rdx),%xmm2
+ por %xmm1,%xmm5
+ pand `16*($k+3)-128`(%rdx),%xmm3
+ por %xmm2,%xmm4
+ por %xmm3,%xmm5
+___
+}
+$code.=<<___;
+ por %xmm5,%xmm4
+ pshufd \$0x4e,%xmm4,%xmm0
+ por %xmm4,%xmm0
+ lea $STRIDE($bp),$bp
+
+ mov ($ap),%rax # ap[0]
+ movq %xmm0,$m0 # m0=bp[i]
+
xor $j,$j # j=0
mov $n0,$m1
mov (%rsp),$lo0
- movq `0*$STRIDE/4-96`($bp),%xmm0
- movq `1*$STRIDE/4-96`($bp),%xmm1
- pand %xmm4,%xmm0
- movq `2*$STRIDE/4-96`($bp),%xmm2
- pand %xmm5,%xmm1
-
mulq $m0 # ap[0]*bp[i]
add %rax,$lo0 # ap[0]*bp[i]+tp[0]
mov ($np),%rax
adc \$0,%rdx
- movq `3*$STRIDE/4-96`($bp),%xmm3
- pand %xmm6,%xmm2
- por %xmm1,%xmm0
- pand %xmm7,%xmm3
-
imulq $lo0,$m1 # tp[0]*n0
mov %rdx,$hi0
- por %xmm2,%xmm0
- lea $STRIDE($bp),$bp
- por %xmm3,%xmm0
-
mulq $m1 # np[0]*m1
add %rax,$lo0 # discarded
mov 8($ap),%rax
@@ -295,17 +346,14 @@ $code.=<<___;
mulq $m1 # np[j]*m1
cmp $num,$j
- jne .Linner
-
- movq %xmm0,$m0 # bp[i+1]
-
+ jne .Linner # note that upon exit $j==$num, so
+ # they can be used interchangeably
add %rax,$hi1
- mov ($ap),%rax # ap[0]
adc \$0,%rdx
add $lo0,$hi1 # np[j]*m1+ap[j]*bp[i]+tp[j]
- mov (%rsp,$j,8),$lo0
+ mov (%rsp,$num,8),$lo0
adc \$0,%rdx
- mov $hi1,-16(%rsp,$j,8) # tp[j-1]
+ mov $hi1,-16(%rsp,$num,8) # tp[num-1]
mov %rdx,$hi1
xor %rdx,%rdx
@@ -352,12 +400,7 @@ $code.=<<___;
mov 8(%rsp,$num,8),%rsi # restore %rsp
mov \$1,%rax
-___
-$code.=<<___ if ($win64);
- movaps -88(%rsi),%xmm6
- movaps -72(%rsi),%xmm7
-___
-$code.=<<___;
+
mov -48(%rsi),%r15
mov -40(%rsi),%r14
mov -32(%rsi),%r13
@@ -379,8 +422,8 @@ bn_mul4x_mont_gather5:
.Lmul4x_enter:
___
$code.=<<___ if ($addx);
- and \$0x80100,%r11d
- cmp \$0x80100,%r11d
+ and \$0x80108,%r11d
+ cmp \$0x80108,%r11d # check for AD*X+BMI2+BMI1
je .Lmulx4x_enter
___
$code.=<<___;
@@ -392,39 +435,34 @@ $code.=<<___;
push %r13
push %r14
push %r15
-___
-$code.=<<___ if ($win64);
- lea -0x28(%rsp),%rsp
- movaps %xmm6,(%rsp)
- movaps %xmm7,0x10(%rsp)
-___
-$code.=<<___;
+
.byte 0x67
- mov ${num}d,%r10d
- shl \$3,${num}d
- shl \$3+2,%r10d # 4*$num
+ shl \$3,${num}d # convert $num to bytes
+ lea ($num,$num,2),%r10 # 3*$num in bytes
neg $num # -$num
##############################################################
- # ensure that stack frame doesn't alias with $aptr+4*$num
- # modulo 4096, which covers ret[num], am[num] and n[2*num]
- # (see bn_exp.c). this is done to allow memory disambiguation
- # logic do its magic. [excessive frame is allocated in order
- # to allow bn_from_mont8x to clear it.]
+ # Ensure that stack frame doesn't alias with $rptr+3*$num
+ # modulo 4096, which covers ret[num], am[num] and n[num]
+ # (see bn_exp.c). This is done to allow memory disambiguation
+ # logic do its magic. [Extra [num] is allocated in order
+ # to align with bn_power5's frame, which is cleansed after
+ # completing exponentiation. Extra 256 bytes is for power mask
+ # calculated from 7th argument, the index.]
#
- lea -64(%rsp,$num,2),%r11
- sub $ap,%r11
+ lea -320(%rsp,$num,2),%r11
+ sub $rp,%r11
and \$4095,%r11
cmp %r11,%r10
jb .Lmul4xsp_alt
- sub %r11,%rsp # align with $ap
- lea -64(%rsp,$num,2),%rsp # alloca(128+num*8)
+ sub %r11,%rsp # align with $rp
+ lea -320(%rsp,$num,2),%rsp # alloca(frame+2*num*8+256)
jmp .Lmul4xsp_done
.align 32
.Lmul4xsp_alt:
- lea 4096-64(,$num,2),%r10
- lea -64(%rsp,$num,2),%rsp # alloca(128+num*8)
+ lea 4096-320(,$num,2),%r10
+ lea -320(%rsp,$num,2),%rsp # alloca(frame+2*num*8+256)
sub %r10,%r11
mov \$0,%r10
cmovc %r10,%r11
@@ -440,12 +478,7 @@ $code.=<<___;
mov 40(%rsp),%rsi # restore %rsp
mov \$1,%rax
-___
-$code.=<<___ if ($win64);
- movaps -88(%rsi),%xmm6
- movaps -72(%rsi),%xmm7
-___
-$code.=<<___;
+
mov -48(%rsi),%r15
mov -40(%rsi),%r14
mov -32(%rsi),%r13
@@ -460,9 +493,10 @@ $code.=<<___;
.type mul4x_internal,\@abi-omnipotent
.align 32
mul4x_internal:
- shl \$5,$num
- mov `($win64?56:8)`(%rax),%r10d # load 7th argument
- lea 256(%rdx,$num),%r13
+ shl \$5,$num # $num was in bytes
+ movd `($win64?56:8)`(%rax),%xmm5 # load 7th argument, index
+ lea .Linc(%rip),%rax
+ lea 128(%rdx,$num),%r13 # end of powers table (+size optimization)
shr \$5,$num # restore $num
___
$bp="%r12";
@@ -470,44 +504,92 @@ ___
$N=$STRIDE/4; # should match cache line size
$tp=$i;
$code.=<<___;
- mov %r10,%r11
- shr \$`log($N/8)/log(2)`,%r10
- and \$`$N/8-1`,%r11
- not %r10
- lea .Lmagic_masks(%rip),%rax
- and \$`2**5/($N/8)-1`,%r10 # 5 is "window size"
- lea 96(%rdx,%r11,8),$bp # pointer within 1st cache line
- movq 0(%rax,%r10,8),%xmm4 # set of masks denoting which
- movq 8(%rax,%r10,8),%xmm5 # cache line contains element
- add \$7,%r11
- movq 16(%rax,%r10,8),%xmm6 # denoted by 7th argument
- movq 24(%rax,%r10,8),%xmm7
- and \$7,%r11
-
- movq `0*$STRIDE/4-96`($bp),%xmm0
- lea $STRIDE($bp),$tp # borrow $tp
- movq `1*$STRIDE/4-96`($bp),%xmm1
- pand %xmm4,%xmm0
- movq `2*$STRIDE/4-96`($bp),%xmm2
- pand %xmm5,%xmm1
- movq `3*$STRIDE/4-96`($bp),%xmm3
- pand %xmm6,%xmm2
- .byte 0x67
- por %xmm1,%xmm0
- movq `0*$STRIDE/4-96`($tp),%xmm1
- .byte 0x67
- pand %xmm7,%xmm3
- .byte 0x67
- por %xmm2,%xmm0
- movq `1*$STRIDE/4-96`($tp),%xmm2
+ movdqa 0(%rax),%xmm0 # 00000001000000010000000000000000
+ movdqa 16(%rax),%xmm1 # 00000002000000020000000200000002
+ lea 88-112(%rsp,$num),%r10 # place the mask after tp[num+1] (+ICache optimization)
+ lea 128(%rdx),$bp # size optimization
+
+ pshufd \$0,%xmm5,%xmm5 # broadcast index
+ movdqa %xmm1,%xmm4
+ .byte 0x67,0x67
+ movdqa %xmm1,%xmm2
+___
+########################################################################
+# calculate mask by comparing 0..31 to index and save result to stack
+#
+$code.=<<___;
+ paddd %xmm0,%xmm1
+ pcmpeqd %xmm5,%xmm0 # compare to 1,0
.byte 0x67
- pand %xmm4,%xmm1
+ movdqa %xmm4,%xmm3
+___
+for($i=0;$i<$STRIDE/16-4;$i+=4) {
+$code.=<<___;
+ paddd %xmm1,%xmm2
+ pcmpeqd %xmm5,%xmm1 # compare to 3,2
+ movdqa %xmm0,`16*($i+0)+112`(%r10)
+ movdqa %xmm4,%xmm0
+
+ paddd %xmm2,%xmm3
+ pcmpeqd %xmm5,%xmm2 # compare to 5,4
+ movdqa %xmm1,`16*($i+1)+112`(%r10)
+ movdqa %xmm4,%xmm1
+
+ paddd %xmm3,%xmm0
+ pcmpeqd %xmm5,%xmm3 # compare to 7,6
+ movdqa %xmm2,`16*($i+2)+112`(%r10)
+ movdqa %xmm4,%xmm2
+
+ paddd %xmm0,%xmm1
+ pcmpeqd %xmm5,%xmm0
+ movdqa %xmm3,`16*($i+3)+112`(%r10)
+ movdqa %xmm4,%xmm3
+___
+}
+$code.=<<___; # last iteration can be optimized
+ paddd %xmm1,%xmm2
+ pcmpeqd %xmm5,%xmm1
+ movdqa %xmm0,`16*($i+0)+112`(%r10)
+
+ paddd %xmm2,%xmm3
.byte 0x67
- por %xmm3,%xmm0
- movq `2*$STRIDE/4-96`($tp),%xmm3
+ pcmpeqd %xmm5,%xmm2
+ movdqa %xmm1,`16*($i+1)+112`(%r10)
+ pcmpeqd %xmm5,%xmm3
+ movdqa %xmm2,`16*($i+2)+112`(%r10)
+ pand `16*($i+0)-128`($bp),%xmm0 # while it's still in register
+
+ pand `16*($i+1)-128`($bp),%xmm1
+ pand `16*($i+2)-128`($bp),%xmm2
+ movdqa %xmm3,`16*($i+3)+112`(%r10)
+ pand `16*($i+3)-128`($bp),%xmm3
+ por %xmm2,%xmm0
+ por %xmm3,%xmm1
+___
+for($i=0;$i<$STRIDE/16-4;$i+=4) {
+$code.=<<___;
+ movdqa `16*($i+0)-128`($bp),%xmm4
+ movdqa `16*($i+1)-128`($bp),%xmm5
+ movdqa `16*($i+2)-128`($bp),%xmm2
+ pand `16*($i+0)+112`(%r10),%xmm4
+ movdqa `16*($i+3)-128`($bp),%xmm3
+ pand `16*($i+1)+112`(%r10),%xmm5
+ por %xmm4,%xmm0
+ pand `16*($i+2)+112`(%r10),%xmm2
+ por %xmm5,%xmm1
+ pand `16*($i+3)+112`(%r10),%xmm3
+ por %xmm2,%xmm0
+ por %xmm3,%xmm1
+___
+}
+$code.=<<___;
+ por %xmm1,%xmm0
+ pshufd \$0x4e,%xmm0,%xmm1
+ por %xmm1,%xmm0
+ lea $STRIDE($bp),$bp
movq %xmm0,$m0 # m0=bp[0]
- movq `3*$STRIDE/4-96`($tp),%xmm0
+
mov %r13,16+8(%rsp) # save end of b[num]
mov $rp, 56+8(%rsp) # save $rp
@@ -521,26 +603,10 @@ $code.=<<___;
mov %rax,$A[0]
mov ($np),%rax
- pand %xmm5,%xmm2
- pand %xmm6,%xmm3
- por %xmm2,%xmm1
-
imulq $A[0],$m1 # "tp[0]"*n0
- ##############################################################
- # $tp is chosen so that writing to top-most element of the
- # vector occurs just "above" references to powers table,
- # "above" modulo cache-line size, which effectively precludes
- # possibility of memory disambiguation logic failure when
- # accessing the table.
- #
- lea 64+8(%rsp,%r11,8),$tp
+ lea 64+8(%rsp),$tp
mov %rdx,$A[1]
- pand %xmm7,%xmm0
- por %xmm3,%xmm1
- lea 2*$STRIDE($bp),$bp
- por %xmm1,%xmm0
-
mulq $m1 # np[0]*m1
add %rax,$A[0] # discarded
mov 8($ap,$num),%rax
@@ -549,7 +615,7 @@ $code.=<<___;
mulq $m0
add %rax,$A[1]
- mov 16*1($np),%rax # interleaved with 0, therefore 16*n
+ mov 8*1($np),%rax
adc \$0,%rdx
mov %rdx,$A[0]
@@ -559,7 +625,7 @@ $code.=<<___;
adc \$0,%rdx
add $A[1],$N[1]
lea 4*8($num),$j # j=4
- lea 16*4($np),$np
+ lea 8*4($np),$np
adc \$0,%rdx
mov $N[1],($tp)
mov %rdx,$N[0]
@@ -569,7 +635,7 @@ $code.=<<___;
.L1st4x:
mulq $m0 # ap[j]*bp[0]
add %rax,$A[0]
- mov -16*2($np),%rax
+ mov -8*2($np),%rax
lea 32($tp),$tp
adc \$0,%rdx
mov %rdx,$A[1]
@@ -585,7 +651,7 @@ $code.=<<___;
mulq $m0 # ap[j]*bp[0]
add %rax,$A[1]
- mov -16*1($np),%rax
+ mov -8*1($np),%rax
adc \$0,%rdx
mov %rdx,$A[0]
@@ -600,7 +666,7 @@ $code.=<<___;
mulq $m0 # ap[j]*bp[0]
add %rax,$A[0]
- mov 16*0($np),%rax
+ mov 8*0($np),%rax
adc \$0,%rdx
mov %rdx,$A[1]
@@ -615,7 +681,7 @@ $code.=<<___;
mulq $m0 # ap[j]*bp[0]
add %rax,$A[1]
- mov 16*1($np),%rax
+ mov 8*1($np),%rax
adc \$0,%rdx
mov %rdx,$A[0]
@@ -624,7 +690,7 @@ $code.=<<___;
mov 16($ap,$j),%rax
adc \$0,%rdx
add $A[1],$N[1] # np[j]*m1+ap[j]*bp[0]
- lea 16*4($np),$np
+ lea 8*4($np),$np
adc \$0,%rdx
mov $N[1],($tp) # tp[j-1]
mov %rdx,$N[0]
@@ -634,7 +700,7 @@ $code.=<<___;
mulq $m0 # ap[j]*bp[0]
add %rax,$A[0]
- mov -16*2($np),%rax
+ mov -8*2($np),%rax
lea 32($tp),$tp
adc \$0,%rdx
mov %rdx,$A[1]
@@ -650,7 +716,7 @@ $code.=<<___;
mulq $m0 # ap[j]*bp[0]
add %rax,$A[1]
- mov -16*1($np),%rax
+ mov -8*1($np),%rax
adc \$0,%rdx
mov %rdx,$A[0]
@@ -663,8 +729,7 @@ $code.=<<___;
mov $N[1],-16($tp) # tp[j-1]
mov %rdx,$N[0]
- movq %xmm0,$m0 # bp[1]
- lea ($np,$num,2),$np # rewind $np
+ lea ($np,$num),$np # rewind $np
xor $N[1],$N[1]
add $A[0],$N[0]
@@ -675,6 +740,33 @@ $code.=<<___;
.align 32
.Louter4x:
+ lea 16+128($tp),%rdx # where 256-byte mask is (+size optimization)
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
+___
+for($i=0;$i<$STRIDE/16;$i+=4) {
+$code.=<<___;
+ movdqa `16*($i+0)-128`($bp),%xmm0
+ movdqa `16*($i+1)-128`($bp),%xmm1
+ movdqa `16*($i+2)-128`($bp),%xmm2
+ movdqa `16*($i+3)-128`($bp),%xmm3
+ pand `16*($i+0)-128`(%rdx),%xmm0
+ pand `16*($i+1)-128`(%rdx),%xmm1
+ por %xmm0,%xmm4
+ pand `16*($i+2)-128`(%rdx),%xmm2
+ por %xmm1,%xmm5
+ pand `16*($i+3)-128`(%rdx),%xmm3
+ por %xmm2,%xmm4
+ por %xmm3,%xmm5
+___
+}
+$code.=<<___;
+ por %xmm5,%xmm4
+ pshufd \$0x4e,%xmm4,%xmm0
+ por %xmm4,%xmm0
+ lea $STRIDE($bp),$bp
+ movq %xmm0,$m0 # m0=bp[i]
+
mov ($tp,$num),$A[0]
mov $n0,$m1
mulq $m0 # ap[0]*bp[i]
@@ -682,25 +774,11 @@ $code.=<<___;
mov ($np),%rax
adc \$0,%rdx
- movq `0*$STRIDE/4-96`($bp),%xmm0
- movq `1*$STRIDE/4-96`($bp),%xmm1
- pand %xmm4,%xmm0
- movq `2*$STRIDE/4-96`($bp),%xmm2
- pand %xmm5,%xmm1
- movq `3*$STRIDE/4-96`($bp),%xmm3
-
imulq $A[0],$m1 # tp[0]*n0
- .byte 0x67
mov %rdx,$A[1]
mov $N[1],($tp) # store upmost overflow bit
- pand %xmm6,%xmm2
- por %xmm1,%xmm0
- pand %xmm7,%xmm3
- por %xmm2,%xmm0
lea ($tp,$num),$tp # rewind $tp
- lea $STRIDE($bp),$bp
- por %xmm3,%xmm0
mulq $m1 # np[0]*m1
add %rax,$A[0] # "$N[0]", discarded
@@ -710,7 +788,7 @@ $code.=<<___;
mulq $m0 # ap[j]*bp[i]
add %rax,$A[1]
- mov 16*1($np),%rax # interleaved with 0, therefore 16*n
+ mov 8*1($np),%rax
adc \$0,%rdx
add 8($tp),$A[1] # +tp[1]
adc \$0,%rdx
@@ -722,7 +800,7 @@ $code.=<<___;
adc \$0,%rdx
add $A[1],$N[1] # np[j]*m1+ap[j]*bp[i]+tp[j]
lea 4*8($num),$j # j=4
- lea 16*4($np),$np
+ lea 8*4($np),$np
adc \$0,%rdx
mov %rdx,$N[0]
jmp .Linner4x
@@ -731,7 +809,7 @@ $code.=<<___;
.Linner4x:
mulq $m0 # ap[j]*bp[i]
add %rax,$A[0]
- mov -16*2($np),%rax
+ mov -8*2($np),%rax
adc \$0,%rdx
add 16($tp),$A[0] # ap[j]*bp[i]+tp[j]
lea 32($tp),$tp
@@ -749,7 +827,7 @@ $code.=<<___;
mulq $m0 # ap[j]*bp[i]
add %rax,$A[1]
- mov -16*1($np),%rax
+ mov -8*1($np),%rax
adc \$0,%rdx
add -8($tp),$A[1]
adc \$0,%rdx
@@ -766,7 +844,7 @@ $code.=<<___;
mulq $m0 # ap[j]*bp[i]
add %rax,$A[0]
- mov 16*0($np),%rax
+ mov 8*0($np),%rax
adc \$0,%rdx
add ($tp),$A[0] # ap[j]*bp[i]+tp[j]
adc \$0,%rdx
@@ -783,7 +861,7 @@ $code.=<<___;
mulq $m0 # ap[j]*bp[i]
add %rax,$A[1]
- mov 16*1($np),%rax
+ mov 8*1($np),%rax
adc \$0,%rdx
add 8($tp),$A[1]
adc \$0,%rdx
@@ -794,7 +872,7 @@ $code.=<<___;
mov 16($ap,$j),%rax
adc \$0,%rdx
add $A[1],$N[1]
- lea 16*4($np),$np
+ lea 8*4($np),$np
adc \$0,%rdx
mov $N[0],-8($tp) # tp[j-1]
mov %rdx,$N[0]
@@ -804,7 +882,7 @@ $code.=<<___;
mulq $m0 # ap[j]*bp[i]
add %rax,$A[0]
- mov -16*2($np),%rax
+ mov -8*2($np),%rax
adc \$0,%rdx
add 16($tp),$A[0] # ap[j]*bp[i]+tp[j]
lea 32($tp),$tp
@@ -823,7 +901,7 @@ $code.=<<___;
mulq $m0 # ap[j]*bp[i]
add %rax,$A[1]
mov $m1,%rax
- mov -16*1($np),$m1
+ mov -8*1($np),$m1
adc \$0,%rdx
add -8($tp),$A[1]
adc \$0,%rdx
@@ -838,9 +916,8 @@ $code.=<<___;
mov $N[0],-24($tp) # tp[j-1]
mov %rdx,$N[0]
- movq %xmm0,$m0 # bp[i+1]
mov $N[1],-16($tp) # tp[j-1]
- lea ($np,$num,2),$np # rewind $np
+ lea ($np,$num),$np # rewind $np
xor $N[1],$N[1]
add $A[0],$N[0]
@@ -854,16 +931,23 @@ $code.=<<___;
___
if (1) {
$code.=<<___;
+ xor %rax,%rax
sub $N[0],$m1 # compare top-most words
adc $j,$j # $j is zero
or $j,$N[1]
- xor \$1,$N[1]
+ sub $N[1],%rax # %rax=-$N[1]
lea ($tp,$num),%rbx # tptr in .sqr4x_sub
- lea ($np,$N[1],8),%rbp # nptr in .sqr4x_sub
+ mov ($np),%r12
+ lea ($np),%rbp # nptr in .sqr4x_sub
mov %r9,%rcx
- sar \$3+2,%rcx # cf=0
+ sar \$3+2,%rcx
mov 56+8(%rsp),%rdi # rptr in .sqr4x_sub
- jmp .Lsqr4x_sub
+ dec %r12 # so that after 'not' we get -n[0]
+ xor %r10,%r10
+ mov 8*1(%rbp),%r13
+ mov 8*2(%rbp),%r14
+ mov 8*3(%rbp),%r15
+ jmp .Lsqr4x_sub_entry
___
} else {
my @ri=("%rax",$bp,$m0,$m1);
@@ -930,8 +1014,8 @@ bn_power5:
___
$code.=<<___ if ($addx);
mov OPENSSL_ia32cap_P+8(%rip),%r11d
- and \$0x80100,%r11d
- cmp \$0x80100,%r11d
+ and \$0x80108,%r11d
+ cmp \$0x80108,%r11d # check for AD*X+BMI2+BMI1
je .Lpowerx5_enter
___
$code.=<<___;
@@ -942,38 +1026,32 @@ $code.=<<___;
push %r13
push %r14
push %r15
-___
-$code.=<<___ if ($win64);
- lea -0x28(%rsp),%rsp
- movaps %xmm6,(%rsp)
- movaps %xmm7,0x10(%rsp)
-___
-$code.=<<___;
- mov ${num}d,%r10d
+
shl \$3,${num}d # convert $num to bytes
- shl \$3+2,%r10d # 4*$num
+ lea ($num,$num,2),%r10d # 3*$num
neg $num
mov ($n0),$n0 # *n0
##############################################################
- # ensure that stack frame doesn't alias with $aptr+4*$num
- # modulo 4096, which covers ret[num], am[num] and n[2*num]
- # (see bn_exp.c). this is done to allow memory disambiguation
- # logic do its magic.
+ # Ensure that stack frame doesn't alias with $rptr+3*$num
+ # modulo 4096, which covers ret[num], am[num] and n[num]
+ # (see bn_exp.c). This is done to allow memory disambiguation
+ # logic do its magic. [Extra 256 bytes is for power mask
+ # calculated from 7th argument, the index.]
#
- lea -64(%rsp,$num,2),%r11
- sub $aptr,%r11
+ lea -320(%rsp,$num,2),%r11
+ sub $rptr,%r11
and \$4095,%r11
cmp %r11,%r10
jb .Lpwr_sp_alt
sub %r11,%rsp # align with $aptr
- lea -64(%rsp,$num,2),%rsp # alloca(frame+2*$num)
+ lea -320(%rsp,$num,2),%rsp # alloca(frame+2*num*8+256)
jmp .Lpwr_sp_done
.align 32
.Lpwr_sp_alt:
- lea 4096-64(,$num,2),%r10 # 4096-frame-2*$num
- lea -64(%rsp,$num,2),%rsp # alloca(frame+2*$num)
+ lea 4096-320(,$num,2),%r10
+ lea -320(%rsp,$num,2),%rsp # alloca(frame+2*num*8+256)
sub %r10,%r11
mov \$0,%r10
cmovc %r10,%r11
@@ -995,16 +1073,21 @@ $code.=<<___;
mov $n0, 32(%rsp)
mov %rax, 40(%rsp) # save original %rsp
.Lpower5_body:
- movq $rptr,%xmm1 # save $rptr
+ movq $rptr,%xmm1 # save $rptr, used in sqr8x
movq $nptr,%xmm2 # save $nptr
- movq %r10, %xmm3 # -$num
+ movq %r10, %xmm3 # -$num, used in sqr8x
movq $bptr,%xmm4
call __bn_sqr8x_internal
+ call __bn_post4x_internal
call __bn_sqr8x_internal
+ call __bn_post4x_internal
call __bn_sqr8x_internal
+ call __bn_post4x_internal
call __bn_sqr8x_internal
+ call __bn_post4x_internal
call __bn_sqr8x_internal
+ call __bn_post4x_internal
movq %xmm2,$nptr
movq %xmm4,$bptr
@@ -1565,9 +1648,9 @@ my ($nptr,$tptr,$carry,$m0)=("%rbp","%rdi","%rsi","%rbx");
$code.=<<___;
movq %xmm2,$nptr
-sqr8x_reduction:
+__bn_sqr8x_reduction:
xor %rax,%rax
- lea ($nptr,$num,2),%rcx # end of n[]
+ lea ($nptr,$num),%rcx # end of n[]
lea 48+8(%rsp,$num,2),%rdx # end of t[] buffer
mov %rcx,0+8(%rsp)
lea 48+8(%rsp,$num),$tptr # end of initial t[] window
@@ -1593,21 +1676,21 @@ sqr8x_reduction:
.byte 0x67
mov $m0,%r8
imulq 32+8(%rsp),$m0 # n0*a[0]
- mov 16*0($nptr),%rax # n[0]
+ mov 8*0($nptr),%rax # n[0]
mov \$8,%ecx
jmp .L8x_reduce
.align 32
.L8x_reduce:
mulq $m0
- mov 16*1($nptr),%rax # n[1]
+ mov 8*1($nptr),%rax # n[1]
neg %r8
mov %rdx,%r8
adc \$0,%r8
mulq $m0
add %rax,%r9
- mov 16*2($nptr),%rax
+ mov 8*2($nptr),%rax
adc \$0,%rdx
add %r9,%r8
mov $m0,48-8+8(%rsp,%rcx,8) # put aside n0*a[i]
@@ -1616,7 +1699,7 @@ sqr8x_reduction:
mulq $m0
add %rax,%r10
- mov 16*3($nptr),%rax
+ mov 8*3($nptr),%rax
adc \$0,%rdx
add %r10,%r9
mov 32+8(%rsp),$carry # pull n0, borrow $carry
@@ -1625,7 +1708,7 @@ sqr8x_reduction:
mulq $m0
add %rax,%r11
- mov 16*4($nptr),%rax
+ mov 8*4($nptr),%rax
adc \$0,%rdx
imulq %r8,$carry # modulo-scheduled
add %r11,%r10
@@ -1634,7 +1717,7 @@ sqr8x_reduction:
mulq $m0
add %rax,%r12
- mov 16*5($nptr),%rax
+ mov 8*5($nptr),%rax
adc \$0,%rdx
add %r12,%r11
mov %rdx,%r12
@@ -1642,7 +1725,7 @@ sqr8x_reduction:
mulq $m0
add %rax,%r13
- mov 16*6($nptr),%rax
+ mov 8*6($nptr),%rax
adc \$0,%rdx
add %r13,%r12
mov %rdx,%r13
@@ -1650,7 +1733,7 @@ sqr8x_reduction:
mulq $m0
add %rax,%r14
- mov 16*7($nptr),%rax
+ mov 8*7($nptr),%rax
adc \$0,%rdx
add %r14,%r13
mov %rdx,%r14
@@ -1659,7 +1742,7 @@ sqr8x_reduction:
mulq $m0
mov $carry,$m0 # n0*a[i]
add %rax,%r15
- mov 16*0($nptr),%rax # n[0]
+ mov 8*0($nptr),%rax # n[0]
adc \$0,%rdx
add %r15,%r14
mov %rdx,%r15
@@ -1668,7 +1751,7 @@ sqr8x_reduction:
dec %ecx
jnz .L8x_reduce
- lea 16*8($nptr),$nptr
+ lea 8*8($nptr),$nptr
xor %rax,%rax
mov 8+8(%rsp),%rdx # pull end of t[]
cmp 0+8(%rsp),$nptr # end of n[]?
@@ -1687,21 +1770,21 @@ sqr8x_reduction:
mov 48+56+8(%rsp),$m0 # pull n0*a[0]
mov \$8,%ecx
- mov 16*0($nptr),%rax
+ mov 8*0($nptr),%rax
jmp .L8x_tail
.align 32
.L8x_tail:
mulq $m0
add %rax,%r8
- mov 16*1($nptr),%rax
+ mov 8*1($nptr),%rax
mov %r8,($tptr) # save result
mov %rdx,%r8
adc \$0,%r8
mulq $m0
add %rax,%r9
- mov 16*2($nptr),%rax
+ mov 8*2($nptr),%rax
adc \$0,%rdx
add %r9,%r8
lea 8($tptr),$tptr # $tptr++
@@ -1710,7 +1793,7 @@ sqr8x_reduction:
mulq $m0
add %rax,%r10
- mov 16*3($nptr),%rax
+ mov 8*3($nptr),%rax
adc \$0,%rdx
add %r10,%r9
mov %rdx,%r10
@@ -1718,7 +1801,7 @@ sqr8x_reduction:
mulq $m0
add %rax,%r11
- mov 16*4($nptr),%rax
+ mov 8*4($nptr),%rax
adc \$0,%rdx
add %r11,%r10
mov %rdx,%r11
@@ -1726,7 +1809,7 @@ sqr8x_reduction:
mulq $m0
add %rax,%r12
- mov 16*5($nptr),%rax
+ mov 8*5($nptr),%rax
adc \$0,%rdx
add %r12,%r11
mov %rdx,%r12
@@ -1734,7 +1817,7 @@ sqr8x_reduction:
mulq $m0
add %rax,%r13
- mov 16*6($nptr),%rax
+ mov 8*6($nptr),%rax
adc \$0,%rdx
add %r13,%r12
mov %rdx,%r13
@@ -1742,7 +1825,7 @@ sqr8x_reduction:
mulq $m0
add %rax,%r14
- mov 16*7($nptr),%rax
+ mov 8*7($nptr),%rax
adc \$0,%rdx
add %r14,%r13
mov %rdx,%r14
@@ -1753,14 +1836,14 @@ sqr8x_reduction:
add %rax,%r15
adc \$0,%rdx
add %r15,%r14
- mov 16*0($nptr),%rax # pull n[0]
+ mov 8*0($nptr),%rax # pull n[0]
mov %rdx,%r15
adc \$0,%r15
dec %ecx
jnz .L8x_tail
- lea 16*8($nptr),$nptr
+ lea 8*8($nptr),$nptr
mov 8+8(%rsp),%rdx # pull end of t[]
cmp 0+8(%rsp),$nptr # end of n[]?
jae .L8x_tail_done # break out of loop
@@ -1806,7 +1889,7 @@ sqr8x_reduction:
adc 8*6($tptr),%r14
adc 8*7($tptr),%r15
adc \$0,%rax # top-most carry
- mov -16($nptr),%rcx # np[num-1]
+ mov -8($nptr),%rcx # np[num-1]
xor $carry,$carry
movq %xmm2,$nptr # restore $nptr
@@ -1824,6 +1907,8 @@ sqr8x_reduction:
cmp %rdx,$tptr # end of t[]?
jb .L8x_reduction_loop
+ ret
+.size bn_sqr8x_internal,.-bn_sqr8x_internal
___
}
##############################################################
@@ -1832,48 +1917,62 @@ ___
{
my ($tptr,$nptr)=("%rbx","%rbp");
$code.=<<___;
- #xor %rsi,%rsi # %rsi was $carry above
- sub %r15,%rcx # compare top-most words
+.type __bn_post4x_internal,\@abi-omnipotent
+.align 32
+__bn_post4x_internal:
+ mov 8*0($nptr),%r12
lea (%rdi,$num),$tptr # %rdi was $tptr above
- adc %rsi,%rsi
mov $num,%rcx
- or %rsi,%rax
movq %xmm1,$rptr # restore $rptr
- xor \$1,%rax
+ neg %rax
movq %xmm1,$aptr # prepare for back-to-back call
- lea ($nptr,%rax,8),$nptr
- sar \$3+2,%rcx # cf=0
- jmp .Lsqr4x_sub
+ sar \$3+2,%rcx
+ dec %r12 # so that after 'not' we get -n[0]
+ xor %r10,%r10
+ mov 8*1($nptr),%r13
+ mov 8*2($nptr),%r14
+ mov 8*3($nptr),%r15
+ jmp .Lsqr4x_sub_entry
-.align 32
+.align 16
.Lsqr4x_sub:
- .byte 0x66
- mov 8*0($tptr),%r12
- mov 8*1($tptr),%r13
- sbb 16*0($nptr),%r12
- mov 8*2($tptr),%r14
- sbb 16*1($nptr),%r13
- mov 8*3($tptr),%r15
- lea 8*4($tptr),$tptr
- sbb 16*2($nptr),%r14
+ mov 8*0($nptr),%r12
+ mov 8*1($nptr),%r13
+ mov 8*2($nptr),%r14
+ mov 8*3($nptr),%r15
+.Lsqr4x_sub_entry:
+ lea 8*4($nptr),$nptr
+ not %r12
+ not %r13
+ not %r14
+ not %r15
+ and %rax,%r12
+ and %rax,%r13
+ and %rax,%r14
+ and %rax,%r15
+
+ neg %r10 # mov %r10,%cf
+ adc 8*0($tptr),%r12
+ adc 8*1($tptr),%r13
+ adc 8*2($tptr),%r14
+ adc 8*3($tptr),%r15
mov %r12,8*0($rptr)
- sbb 16*3($nptr),%r15
- lea 16*4($nptr),$nptr
+ lea 8*4($tptr),$tptr
mov %r13,8*1($rptr)
+ sbb %r10,%r10 # mov %cf,%r10
mov %r14,8*2($rptr)
mov %r15,8*3($rptr)
lea 8*4($rptr),$rptr
inc %rcx # pass %cf
jnz .Lsqr4x_sub
-___
-}
-$code.=<<___;
+
mov $num,%r10 # prepare for back-to-back call
neg $num # restore $num
ret
-.size bn_sqr8x_internal,.-bn_sqr8x_internal
+.size __bn_post4x_internal,.-__bn_post4x_internal
___
+}
{
$code.=<<___;
.globl bn_from_montgomery
@@ -1897,39 +1996,32 @@ bn_from_mont8x:
push %r13
push %r14
push %r15
-___
-$code.=<<___ if ($win64);
- lea -0x28(%rsp),%rsp
- movaps %xmm6,(%rsp)
- movaps %xmm7,0x10(%rsp)
-___
-$code.=<<___;
- .byte 0x67
- mov ${num}d,%r10d
+
shl \$3,${num}d # convert $num to bytes
- shl \$3+2,%r10d # 4*$num
+ lea ($num,$num,2),%r10 # 3*$num in bytes
neg $num
mov ($n0),$n0 # *n0
##############################################################
- # ensure that stack frame doesn't alias with $aptr+4*$num
- # modulo 4096, which covers ret[num], am[num] and n[2*num]
- # (see bn_exp.c). this is done to allow memory disambiguation
- # logic do its magic.
+ # Ensure that stack frame doesn't alias with $rptr+3*$num
+ # modulo 4096, which covers ret[num], am[num] and n[num]
+ # (see bn_exp.c). The stack is allocated to aligned with
+ # bn_power5's frame, and as bn_from_montgomery happens to be
+ # last operation, we use the opportunity to cleanse it.
#
- lea -64(%rsp,$num,2),%r11
- sub $aptr,%r11
+ lea -320(%rsp,$num,2),%r11
+ sub $rptr,%r11
and \$4095,%r11
cmp %r11,%r10
jb .Lfrom_sp_alt
sub %r11,%rsp # align with $aptr
- lea -64(%rsp,$num,2),%rsp # alloca(frame+2*$num)
+ lea -320(%rsp,$num,2),%rsp # alloca(frame+2*$num*8+256)
jmp .Lfrom_sp_done
.align 32
.Lfrom_sp_alt:
- lea 4096-64(,$num,2),%r10 # 4096-frame-2*$num
- lea -64(%rsp,$num,2),%rsp # alloca(frame+2*$num)
+ lea 4096-320(,$num,2),%r10
+ lea -320(%rsp,$num,2),%rsp # alloca(frame+2*$num*8+256)
sub %r10,%r11
mov \$0,%r10
cmovc %r10,%r11
@@ -1983,12 +2075,13 @@ $code.=<<___;
___
$code.=<<___ if ($addx);
mov OPENSSL_ia32cap_P+8(%rip),%r11d
- and \$0x80100,%r11d
- cmp \$0x80100,%r11d
+ and \$0x80108,%r11d
+ cmp \$0x80108,%r11d # check for AD*X+BMI2+BMI1
jne .Lfrom_mont_nox
lea (%rax,$num),$rptr
- call sqrx8x_reduction
+ call __bn_sqrx8x_reduction
+ call __bn_postx4x_internal
pxor %xmm0,%xmm0
lea 48(%rsp),%rax
@@ -1999,7 +2092,8 @@ $code.=<<___ if ($addx);
.Lfrom_mont_nox:
___
$code.=<<___;
- call sqr8x_reduction
+ call __bn_sqr8x_reduction
+ call __bn_post4x_internal
pxor %xmm0,%xmm0
lea 48(%rsp),%rax
@@ -2039,7 +2133,6 @@ $code.=<<___;
.align 32
bn_mulx4x_mont_gather5:
.Lmulx4x_enter:
- .byte 0x67
mov %rsp,%rax
push %rbx
push %rbp
@@ -2047,40 +2140,33 @@ bn_mulx4x_mont_gather5:
push %r13
push %r14
push %r15
-___
-$code.=<<___ if ($win64);
- lea -0x28(%rsp),%rsp
- movaps %xmm6,(%rsp)
- movaps %xmm7,0x10(%rsp)
-___
-$code.=<<___;
- .byte 0x67
- mov ${num}d,%r10d
+
shl \$3,${num}d # convert $num to bytes
- shl \$3+2,%r10d # 4*$num
+ lea ($num,$num,2),%r10 # 3*$num in bytes
neg $num # -$num
mov ($n0),$n0 # *n0
##############################################################
- # ensure that stack frame doesn't alias with $aptr+4*$num
- # modulo 4096, which covers a[num], ret[num] and n[2*num]
- # (see bn_exp.c). this is done to allow memory disambiguation
- # logic do its magic. [excessive frame is allocated in order
- # to allow bn_from_mont8x to clear it.]
+ # Ensure that stack frame doesn't alias with $rptr+3*$num
+ # modulo 4096, which covers ret[num], am[num] and n[num]
+ # (see bn_exp.c). This is done to allow memory disambiguation
+ # logic do its magic. [Extra [num] is allocated in order
+ # to align with bn_power5's frame, which is cleansed after
+ # completing exponentiation. Extra 256 bytes is for power mask
+ # calculated from 7th argument, the index.]
#
- lea -64(%rsp,$num,2),%r11
- sub $ap,%r11
+ lea -320(%rsp,$num,2),%r11
+ sub $rp,%r11
and \$4095,%r11
cmp %r11,%r10
jb .Lmulx4xsp_alt
sub %r11,%rsp # align with $aptr
- lea -64(%rsp,$num,2),%rsp # alloca(frame+$num)
+ lea -320(%rsp,$num,2),%rsp # alloca(frame+2*$num*8+256)
jmp .Lmulx4xsp_done
-.align 32
.Lmulx4xsp_alt:
- lea 4096-64(,$num,2),%r10 # 4096-frame-$num
- lea -64(%rsp,$num,2),%rsp # alloca(frame+$num)
+ lea 4096-320(,$num,2),%r10
+ lea -320(%rsp,$num,2),%rsp # alloca(frame+2*$num*8+256)
sub %r10,%r11
mov \$0,%r10
cmovc %r10,%r11
@@ -2106,12 +2192,7 @@ $code.=<<___;
mov 40(%rsp),%rsi # restore %rsp
mov \$1,%rax
-___
-$code.=<<___ if ($win64);
- movaps -88(%rsi),%xmm6
- movaps -72(%rsi),%xmm7
-___
-$code.=<<___;
+
mov -48(%rsi),%r15
mov -40(%rsi),%r14
mov -32(%rsi),%r13
@@ -2126,14 +2207,16 @@ $code.=<<___;
.type mulx4x_internal,\@abi-omnipotent
.align 32
mulx4x_internal:
- .byte 0x4c,0x89,0x8c,0x24,0x08,0x00,0x00,0x00 # mov $num,8(%rsp) # save -$num
- .byte 0x67
+ mov $num,8(%rsp) # save -$num (it was in bytes)
+ mov $num,%r10
neg $num # restore $num
shl \$5,$num
- lea 256($bp,$num),%r13
+ neg %r10 # restore $num
+ lea 128($bp,$num),%r13 # end of powers table (+size optimization)
shr \$5+5,$num
- mov `($win64?56:8)`(%rax),%r10d # load 7th argument
+ movd `($win64?56:8)`(%rax),%xmm5 # load 7th argument
sub \$1,$num
+ lea .Linc(%rip),%rax
mov %r13,16+8(%rsp) # end of b[num]
mov $num,24+8(%rsp) # inner counter
mov $rp, 56+8(%rsp) # save $rp
@@ -2144,52 +2227,92 @@ my $rptr=$bptr;
my $STRIDE=2**5*8; # 5 is "window size"
my $N=$STRIDE/4; # should match cache line size
$code.=<<___;
- mov %r10,%r11
- shr \$`log($N/8)/log(2)`,%r10
- and \$`$N/8-1`,%r11
- not %r10
- lea .Lmagic_masks(%rip),%rax
- and \$`2**5/($N/8)-1`,%r10 # 5 is "window size"
- lea 96($bp,%r11,8),$bptr # pointer within 1st cache line
- movq 0(%rax,%r10,8),%xmm4 # set of masks denoting which
- movq 8(%rax,%r10,8),%xmm5 # cache line contains element
- add \$7,%r11
- movq 16(%rax,%r10,8),%xmm6 # denoted by 7th argument
- movq 24(%rax,%r10,8),%xmm7
- and \$7,%r11
-
- movq `0*$STRIDE/4-96`($bptr),%xmm0
- lea $STRIDE($bptr),$tptr # borrow $tptr
- movq `1*$STRIDE/4-96`($bptr),%xmm1
- pand %xmm4,%xmm0
- movq `2*$STRIDE/4-96`($bptr),%xmm2
- pand %xmm5,%xmm1
- movq `3*$STRIDE/4-96`($bptr),%xmm3
- pand %xmm6,%xmm2
- por %xmm1,%xmm0
- movq `0*$STRIDE/4-96`($tptr),%xmm1
- pand %xmm7,%xmm3
- por %xmm2,%xmm0
- movq `1*$STRIDE/4-96`($tptr),%xmm2
- por %xmm3,%xmm0
- .byte 0x67,0x67
- pand %xmm4,%xmm1
- movq `2*$STRIDE/4-96`($tptr),%xmm3
+ movdqa 0(%rax),%xmm0 # 00000001000000010000000000000000
+ movdqa 16(%rax),%xmm1 # 00000002000000020000000200000002
+ lea 88-112(%rsp,%r10),%r10 # place the mask after tp[num+1] (+ICache optimizaton)
+ lea 128($bp),$bptr # size optimization
+ pshufd \$0,%xmm5,%xmm5 # broadcast index
+ movdqa %xmm1,%xmm4
+ .byte 0x67
+ movdqa %xmm1,%xmm2
+___
+########################################################################
+# calculate mask by comparing 0..31 to index and save result to stack
+#
+$code.=<<___;
+ .byte 0x67
+ paddd %xmm0,%xmm1
+ pcmpeqd %xmm5,%xmm0 # compare to 1,0
+ movdqa %xmm4,%xmm3
+___
+for($i=0;$i<$STRIDE/16-4;$i+=4) {
+$code.=<<___;
+ paddd %xmm1,%xmm2
+ pcmpeqd %xmm5,%xmm1 # compare to 3,2
+ movdqa %xmm0,`16*($i+0)+112`(%r10)
+ movdqa %xmm4,%xmm0
+
+ paddd %xmm2,%xmm3
+ pcmpeqd %xmm5,%xmm2 # compare to 5,4
+ movdqa %xmm1,`16*($i+1)+112`(%r10)
+ movdqa %xmm4,%xmm1
+
+ paddd %xmm3,%xmm0
+ pcmpeqd %xmm5,%xmm3 # compare to 7,6
+ movdqa %xmm2,`16*($i+2)+112`(%r10)
+ movdqa %xmm4,%xmm2
+
+ paddd %xmm0,%xmm1
+ pcmpeqd %xmm5,%xmm0
+ movdqa %xmm3,`16*($i+3)+112`(%r10)
+ movdqa %xmm4,%xmm3
+___
+}
+$code.=<<___; # last iteration can be optimized
+ .byte 0x67
+ paddd %xmm1,%xmm2
+ pcmpeqd %xmm5,%xmm1
+ movdqa %xmm0,`16*($i+0)+112`(%r10)
+
+ paddd %xmm2,%xmm3
+ pcmpeqd %xmm5,%xmm2
+ movdqa %xmm1,`16*($i+1)+112`(%r10)
+
+ pcmpeqd %xmm5,%xmm3
+ movdqa %xmm2,`16*($i+2)+112`(%r10)
+
+ pand `16*($i+0)-128`($bptr),%xmm0 # while it's still in register
+ pand `16*($i+1)-128`($bptr),%xmm1
+ pand `16*($i+2)-128`($bptr),%xmm2
+ movdqa %xmm3,`16*($i+3)+112`(%r10)
+ pand `16*($i+3)-128`($bptr),%xmm3
+ por %xmm2,%xmm0
+ por %xmm3,%xmm1
+___
+for($i=0;$i<$STRIDE/16-4;$i+=4) {
+$code.=<<___;
+ movdqa `16*($i+0)-128`($bptr),%xmm4
+ movdqa `16*($i+1)-128`($bptr),%xmm5
+ movdqa `16*($i+2)-128`($bptr),%xmm2
+ pand `16*($i+0)+112`(%r10),%xmm4
+ movdqa `16*($i+3)-128`($bptr),%xmm3
+ pand `16*($i+1)+112`(%r10),%xmm5
+ por %xmm4,%xmm0
+ pand `16*($i+2)+112`(%r10),%xmm2
+ por %xmm5,%xmm1
+ pand `16*($i+3)+112`(%r10),%xmm3
+ por %xmm2,%xmm0
+ por %xmm3,%xmm1
+___
+}
+$code.=<<___;
+ pxor %xmm1,%xmm0
+ pshufd \$0x4e,%xmm0,%xmm1
+ por %xmm1,%xmm0
+ lea $STRIDE($bptr),$bptr
movq %xmm0,%rdx # bp[0]
- movq `3*$STRIDE/4-96`($tptr),%xmm0
- lea 2*$STRIDE($bptr),$bptr # next &b[i]
- pand %xmm5,%xmm2
- .byte 0x67,0x67
- pand %xmm6,%xmm3
- ##############################################################
- # $tptr is chosen so that writing to top-most element of the
- # vector occurs just "above" references to powers table,
- # "above" modulo cache-line size, which effectively precludes
- # possibility of memory disambiguation logic failure when
- # accessing the table.
- #
- lea 64+8*4+8(%rsp,%r11,8),$tptr
+ lea 64+8*4+8(%rsp),$tptr
mov %rdx,$bi
mulx 0*8($aptr),$mi,%rax # a[0]*b[0]
@@ -2205,37 +2328,31 @@ $code.=<<___;
xor $zero,$zero # cf=0, of=0
mov $mi,%rdx
- por %xmm2,%xmm1
- pand %xmm7,%xmm0
- por %xmm3,%xmm1
mov $bptr,8+8(%rsp) # off-load &b[i]
- por %xmm1,%xmm0
- .byte 0x48,0x8d,0xb6,0x20,0x00,0x00,0x00 # lea 4*8($aptr),$aptr
+ lea 4*8($aptr),$aptr
adcx %rax,%r13
adcx $zero,%r14 # cf=0
- mulx 0*16($nptr),%rax,%r10
+ mulx 0*8($nptr),%rax,%r10
adcx %rax,%r15 # discarded
adox %r11,%r10
- mulx 1*16($nptr),%rax,%r11
+ mulx 1*8($nptr),%rax,%r11
adcx %rax,%r10
adox %r12,%r11
- mulx 2*16($nptr),%rax,%r12
+ mulx 2*8($nptr),%rax,%r12
mov 24+8(%rsp),$bptr # counter value
- .byte 0x66
mov %r10,-8*4($tptr)
adcx %rax,%r11
adox %r13,%r12
- mulx 3*16($nptr),%rax,%r15
- .byte 0x67,0x67
+ mulx 3*8($nptr),%rax,%r15
mov $bi,%rdx
mov %r11,-8*3($tptr)
adcx %rax,%r12
adox $zero,%r15 # of=0
- .byte 0x48,0x8d,0x89,0x40,0x00,0x00,0x00 # lea 4*16($nptr),$nptr
+ lea 4*8($nptr),$nptr
mov %r12,-8*2($tptr)
- #jmp .Lmulx4x_1st
+ jmp .Lmulx4x_1st
.align 32
.Lmulx4x_1st:
@@ -2255,30 +2372,29 @@ $code.=<<___;
lea 4*8($tptr),$tptr
adox %r15,%r10
- mulx 0*16($nptr),%rax,%r15
+ mulx 0*8($nptr),%rax,%r15
adcx %rax,%r10
adox %r15,%r11
- mulx 1*16($nptr),%rax,%r15
+ mulx 1*8($nptr),%rax,%r15
adcx %rax,%r11
adox %r15,%r12
- mulx 2*16($nptr),%rax,%r15
+ mulx 2*8($nptr),%rax,%r15
mov %r10,-5*8($tptr)
adcx %rax,%r12
mov %r11,-4*8($tptr)
adox %r15,%r13
- mulx 3*16($nptr),%rax,%r15
+ mulx 3*8($nptr),%rax,%r15
mov $bi,%rdx
mov %r12,-3*8($tptr)
adcx %rax,%r13
adox $zero,%r15
- lea 4*16($nptr),$nptr
+ lea 4*8($nptr),$nptr
mov %r13,-2*8($tptr)
dec $bptr # of=0, pass cf
jnz .Lmulx4x_1st
mov 8(%rsp),$num # load -num
- movq %xmm0,%rdx # bp[1]
adc $zero,%r15 # modulo-scheduled
lea ($aptr,$num),$aptr # rewind $aptr
add %r15,%r14
@@ -2289,6 +2405,34 @@ $code.=<<___;
.align 32
.Lmulx4x_outer:
+ lea 16-256($tptr),%r10 # where 256-byte mask is (+density control)
+ pxor %xmm4,%xmm4
+ .byte 0x67,0x67
+ pxor %xmm5,%xmm5
+___
+for($i=0;$i<$STRIDE/16;$i+=4) {
+$code.=<<___;
+ movdqa `16*($i+0)-128`($bptr),%xmm0
+ movdqa `16*($i+1)-128`($bptr),%xmm1
+ movdqa `16*($i+2)-128`($bptr),%xmm2
+ pand `16*($i+0)+256`(%r10),%xmm0
+ movdqa `16*($i+3)-128`($bptr),%xmm3
+ pand `16*($i+1)+256`(%r10),%xmm1
+ por %xmm0,%xmm4
+ pand `16*($i+2)+256`(%r10),%xmm2
+ por %xmm1,%xmm5
+ pand `16*($i+3)+256`(%r10),%xmm3
+ por %xmm2,%xmm4
+ por %xmm3,%xmm5
+___
+}
+$code.=<<___;
+ por %xmm5,%xmm4
+ pshufd \$0x4e,%xmm4,%xmm0
+ por %xmm4,%xmm0
+ lea $STRIDE($bptr),$bptr
+ movq %xmm0,%rdx # m0=bp[i]
+
mov $zero,($tptr) # save top-most carry
lea 4*8($tptr,$num),$tptr # rewind $tptr
mulx 0*8($aptr),$mi,%r11 # a[0]*b[i]
@@ -2303,54 +2447,37 @@ $code.=<<___;
mulx 3*8($aptr),%rdx,%r14
adox -2*8($tptr),%r12
adcx %rdx,%r13
- lea ($nptr,$num,2),$nptr # rewind $nptr
+ lea ($nptr,$num),$nptr # rewind $nptr
lea 4*8($aptr),$aptr
adox -1*8($tptr),%r13
adcx $zero,%r14
adox $zero,%r14
- .byte 0x67
mov $mi,%r15
imulq 32+8(%rsp),$mi # "t[0]"*n0
- movq `0*$STRIDE/4-96`($bptr),%xmm0
- .byte 0x67,0x67
mov $mi,%rdx
- movq `1*$STRIDE/4-96`($bptr),%xmm1
- .byte 0x67
- pand %xmm4,%xmm0
- movq `2*$STRIDE/4-96`($bptr),%xmm2
- .byte 0x67
- pand %xmm5,%xmm1
- movq `3*$STRIDE/4-96`($bptr),%xmm3
- add \$$STRIDE,$bptr # next &b[i]
- .byte 0x67
- pand %xmm6,%xmm2
- por %xmm1,%xmm0
- pand %xmm7,%xmm3
xor $zero,$zero # cf=0, of=0
mov $bptr,8+8(%rsp) # off-load &b[i]
- mulx 0*16($nptr),%rax,%r10
+ mulx 0*8($nptr),%rax,%r10
adcx %rax,%r15 # discarded
adox %r11,%r10
- mulx 1*16($nptr),%rax,%r11
+ mulx 1*8($nptr),%rax,%r11
adcx %rax,%r10
adox %r12,%r11
- mulx 2*16($nptr),%rax,%r12
+ mulx 2*8($nptr),%rax,%r12
adcx %rax,%r11
adox %r13,%r12
- mulx 3*16($nptr),%rax,%r15
+ mulx 3*8($nptr),%rax,%r15
mov $bi,%rdx
- por %xmm2,%xmm0
mov 24+8(%rsp),$bptr # counter value
mov %r10,-8*4($tptr)
- por %xmm3,%xmm0
adcx %rax,%r12
mov %r11,-8*3($tptr)
adox $zero,%r15 # of=0
mov %r12,-8*2($tptr)
- lea 4*16($nptr),$nptr
+ lea 4*8($nptr),$nptr
jmp .Lmulx4x_inner
.align 32
@@ -2375,20 +2502,20 @@ $code.=<<___;
adcx $zero,%r14 # cf=0
adox %r15,%r10
- mulx 0*16($nptr),%rax,%r15
+ mulx 0*8($nptr),%rax,%r15
adcx %rax,%r10
adox %r15,%r11
- mulx 1*16($nptr),%rax,%r15
+ mulx 1*8($nptr),%rax,%r15
adcx %rax,%r11
adox %r15,%r12
- mulx 2*16($nptr),%rax,%r15
+ mulx 2*8($nptr),%rax,%r15
mov %r10,-5*8($tptr)
adcx %rax,%r12
adox %r15,%r13
mov %r11,-4*8($tptr)
- mulx 3*16($nptr),%rax,%r15
+ mulx 3*8($nptr),%rax,%r15
mov $bi,%rdx
- lea 4*16($nptr),$nptr
+ lea 4*8($nptr),$nptr
mov %r12,-3*8($tptr)
adcx %rax,%r13
adox $zero,%r15
@@ -2398,7 +2525,6 @@ $code.=<<___;
jnz .Lmulx4x_inner
mov 0+8(%rsp),$num # load -num
- movq %xmm0,%rdx # bp[i+1]
adc $zero,%r15 # modulo-scheduled
sub 0*8($tptr),$bptr # pull top-most carry to %cf
mov 8+8(%rsp),$bptr # re-load &b[i]
@@ -2411,20 +2537,26 @@ $code.=<<___;
cmp %r10,$bptr
jb .Lmulx4x_outer
- mov -16($nptr),%r10
+ mov -8($nptr),%r10
+ mov $zero,%r8
+ mov ($nptr,$num),%r12
+ lea ($nptr,$num),%rbp # rewind $nptr
+ mov $num,%rcx
+ lea ($tptr,$num),%rdi # rewind $tptr
+ xor %eax,%eax
xor %r15,%r15
sub %r14,%r10 # compare top-most words
adc %r15,%r15
- or %r15,$zero
- xor \$1,$zero
- lea ($tptr,$num),%rdi # rewind $tptr
- lea ($nptr,$num,2),$nptr # rewind $nptr
- .byte 0x67,0x67
- sar \$3+2,$num # cf=0
- lea ($nptr,$zero,8),%rbp
+ or %r15,%r8
+ sar \$3+2,%rcx
+ sub %r8,%rax # %rax=-%r8
mov 56+8(%rsp),%rdx # restore rp
- mov $num,%rcx
- jmp .Lsqrx4x_sub # common post-condition
+ dec %r12 # so that after 'not' we get -n[0]
+ mov 8*1(%rbp),%r13
+ xor %r8,%r8
+ mov 8*2(%rbp),%r14
+ mov 8*3(%rbp),%r15
+ jmp .Lsqrx4x_sub_entry # common post-condition
.size mulx4x_internal,.-mulx4x_internal
___
} {
@@ -2448,7 +2580,6 @@ $code.=<<___;
.align 32
bn_powerx5:
.Lpowerx5_enter:
- .byte 0x67
mov %rsp,%rax
push %rbx
push %rbp
@@ -2456,39 +2587,32 @@ bn_powerx5:
push %r13
push %r14
push %r15
-___
-$code.=<<___ if ($win64);
- lea -0x28(%rsp),%rsp
- movaps %xmm6,(%rsp)
- movaps %xmm7,0x10(%rsp)
-___
-$code.=<<___;
- .byte 0x67
- mov ${num}d,%r10d
+
shl \$3,${num}d # convert $num to bytes
- shl \$3+2,%r10d # 4*$num
+ lea ($num,$num,2),%r10 # 3*$num in bytes
neg $num
mov ($n0),$n0 # *n0
##############################################################
- # ensure that stack frame doesn't alias with $aptr+4*$num
- # modulo 4096, which covers ret[num], am[num] and n[2*num]
- # (see bn_exp.c). this is done to allow memory disambiguation
- # logic do its magic.
+ # Ensure that stack frame doesn't alias with $rptr+3*$num
+ # modulo 4096, which covers ret[num], am[num] and n[num]
+ # (see bn_exp.c). This is done to allow memory disambiguation
+ # logic do its magic. [Extra 256 bytes is for power mask
+ # calculated from 7th argument, the index.]
#
- lea -64(%rsp,$num,2),%r11
- sub $aptr,%r11
+ lea -320(%rsp,$num,2),%r11
+ sub $rptr,%r11
and \$4095,%r11
cmp %r11,%r10
jb .Lpwrx_sp_alt
sub %r11,%rsp # align with $aptr
- lea -64(%rsp,$num,2),%rsp # alloca(frame+2*$num)
+ lea -320(%rsp,$num,2),%rsp # alloca(frame+2*$num*8+256)
jmp .Lpwrx_sp_done
.align 32
.Lpwrx_sp_alt:
- lea 4096-64(,$num,2),%r10 # 4096-frame-2*$num
- lea -64(%rsp,$num,2),%rsp # alloca(frame+2*$num)
+ lea 4096-320(,$num,2),%r10
+ lea -320(%rsp,$num,2),%rsp # alloca(frame+2*$num*8+256)
sub %r10,%r11
mov \$0,%r10
cmovc %r10,%r11
@@ -2519,10 +2643,15 @@ $code.=<<___;
.Lpowerx5_body:
call __bn_sqrx8x_internal
+ call __bn_postx4x_internal
call __bn_sqrx8x_internal
+ call __bn_postx4x_internal
call __bn_sqrx8x_internal
+ call __bn_postx4x_internal
call __bn_sqrx8x_internal
+ call __bn_postx4x_internal
call __bn_sqrx8x_internal
+ call __bn_postx4x_internal
mov %r10,$num # -num
mov $aptr,$rptr
@@ -2534,12 +2663,7 @@ $code.=<<___;
mov 40(%rsp),%rsi # restore %rsp
mov \$1,%rax
-___
-$code.=<<___ if ($win64);
- movaps -88(%rsi),%xmm6
- movaps -72(%rsi),%xmm7
-___
-$code.=<<___;
+
mov -48(%rsi),%r15
mov -40(%rsi),%r14
mov -32(%rsi),%r13
@@ -2973,11 +3097,11 @@ my ($nptr,$carry,$m0)=("%rbp","%rsi","%rdx");
$code.=<<___;
movq %xmm2,$nptr
-sqrx8x_reduction:
+__bn_sqrx8x_reduction:
xor %eax,%eax # initial top-most carry bit
mov 32+8(%rsp),%rbx # n0
mov 48+8(%rsp),%rdx # "%r8", 8*0($tptr)
- lea -128($nptr,$num,2),%rcx # end of n[]
+ lea -8*8($nptr,$num),%rcx # end of n[]
#lea 48+8(%rsp,$num,2),$tptr # end of t[] buffer
mov %rcx, 0+8(%rsp) # save end of n[]
mov $tptr,8+8(%rsp) # save end of t[]
@@ -3006,23 +3130,23 @@ sqrx8x_reduction:
.align 32
.Lsqrx8x_reduce:
mov %r8, %rbx
- mulx 16*0($nptr),%rax,%r8 # n[0]
+ mulx 8*0($nptr),%rax,%r8 # n[0]
adcx %rbx,%rax # discarded
adox %r9,%r8
- mulx 16*1($nptr),%rbx,%r9 # n[1]
+ mulx 8*1($nptr),%rbx,%r9 # n[1]
adcx %rbx,%r8
adox %r10,%r9
- mulx 16*2($nptr),%rbx,%r10
+ mulx 8*2($nptr),%rbx,%r10
adcx %rbx,%r9
adox %r11,%r10
- mulx 16*3($nptr),%rbx,%r11
+ mulx 8*3($nptr),%rbx,%r11
adcx %rbx,%r10
adox %r12,%r11
- .byte 0xc4,0x62,0xe3,0xf6,0xa5,0x40,0x00,0x00,0x00 # mulx 16*4($nptr),%rbx,%r12
+ .byte 0xc4,0x62,0xe3,0xf6,0xa5,0x20,0x00,0x00,0x00 # mulx 8*4($nptr),%rbx,%r12
mov %rdx,%rax
mov %r8,%rdx
adcx %rbx,%r11
@@ -3032,15 +3156,15 @@ sqrx8x_reduction:
mov %rax,%rdx
mov %rax,64+48+8(%rsp,%rcx,8) # put aside n0*a[i]
- mulx 16*5($nptr),%rax,%r13
+ mulx 8*5($nptr),%rax,%r13
adcx %rax,%r12
adox %r14,%r13
- mulx 16*6($nptr),%rax,%r14
+ mulx 8*6($nptr),%rax,%r14
adcx %rax,%r13
adox %r15,%r14
- mulx 16*7($nptr),%rax,%r15
+ mulx 8*7($nptr),%rax,%r15
mov %rbx,%rdx
adcx %rax,%r14
adox $carry,%r15 # $carry is 0
@@ -3056,7 +3180,7 @@ sqrx8x_reduction:
mov 48+8(%rsp),%rdx # pull n0*a[0]
add 8*0($tptr),%r8
- lea 16*8($nptr),$nptr
+ lea 8*8($nptr),$nptr
mov \$-8,%rcx
adcx 8*1($tptr),%r9
adcx 8*2($tptr),%r10
@@ -3075,35 +3199,35 @@ sqrx8x_reduction:
.align 32
.Lsqrx8x_tail:
mov %r8,%rbx
- mulx 16*0($nptr),%rax,%r8
+ mulx 8*0($nptr),%rax,%r8
adcx %rax,%rbx
adox %r9,%r8
- mulx 16*1($nptr),%rax,%r9
+ mulx 8*1($nptr),%rax,%r9
adcx %rax,%r8
adox %r10,%r9
- mulx 16*2($nptr),%rax,%r10
+ mulx 8*2($nptr),%rax,%r10
adcx %rax,%r9
adox %r11,%r10
- mulx 16*3($nptr),%rax,%r11
+ mulx 8*3($nptr),%rax,%r11
adcx %rax,%r10
adox %r12,%r11
- .byte 0xc4,0x62,0xfb,0xf6,0xa5,0x40,0x00,0x00,0x00 # mulx 16*4($nptr),%rax,%r12
+ .byte 0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00 # mulx 8*4($nptr),%rax,%r12
adcx %rax,%r11
adox %r13,%r12
- mulx 16*5($nptr),%rax,%r13
+ mulx 8*5($nptr),%rax,%r13
adcx %rax,%r12
adox %r14,%r13
- mulx 16*6($nptr),%rax,%r14
+ mulx 8*6($nptr),%rax,%r14
adcx %rax,%r13
adox %r15,%r14
- mulx 16*7($nptr),%rax,%r15
+ mulx 8*7($nptr),%rax,%r15
mov 72+48+8(%rsp,%rcx,8),%rdx # pull n0*a[i]
adcx %rax,%r14
adox $carry,%r15
@@ -3119,7 +3243,7 @@ sqrx8x_reduction:
sub 16+8(%rsp),$carry # mov 16(%rsp),%cf
mov 48+8(%rsp),%rdx # pull n0*a[0]
- lea 16*8($nptr),$nptr
+ lea 8*8($nptr),$nptr
adc 8*0($tptr),%r8
adc 8*1($tptr),%r9
adc 8*2($tptr),%r10
@@ -3155,7 +3279,7 @@ sqrx8x_reduction:
adc 8*0($tptr),%r8
movq %xmm3,%rcx
adc 8*1($tptr),%r9
- mov 16*7($nptr),$carry
+ mov 8*7($nptr),$carry
movq %xmm2,$nptr # restore $nptr
adc 8*2($tptr),%r10
adc 8*3($tptr),%r11
@@ -3181,6 +3305,8 @@ sqrx8x_reduction:
lea 8*8($tptr,%rcx),$tptr # start of current t[] window
cmp 8+8(%rsp),%r8 # end of t[]?
jb .Lsqrx8x_reduction_loop
+ ret
+.size bn_sqrx8x_internal,.-bn_sqrx8x_internal
___
}
##############################################################
@@ -3188,52 +3314,59 @@ ___
#
{
my ($rptr,$nptr)=("%rdx","%rbp");
-my @ri=map("%r$_",(10..13));
-my @ni=map("%r$_",(14..15));
$code.=<<___;
- xor %ebx,%ebx
- sub %r15,%rsi # compare top-most words
- adc %rbx,%rbx
+.align 32
+__bn_postx4x_internal:
+ mov 8*0($nptr),%r12
mov %rcx,%r10 # -$num
- or %rbx,%rax
mov %rcx,%r9 # -$num
- xor \$1,%rax
- sar \$3+2,%rcx # cf=0
+ neg %rax
+ sar \$3+2,%rcx
#lea 48+8(%rsp,%r9),$tptr
- lea ($nptr,%rax,8),$nptr
movq %xmm1,$rptr # restore $rptr
movq %xmm1,$aptr # prepare for back-to-back call
- jmp .Lsqrx4x_sub
+ dec %r12 # so that after 'not' we get -n[0]
+ mov 8*1($nptr),%r13
+ xor %r8,%r8
+ mov 8*2($nptr),%r14
+ mov 8*3($nptr),%r15
+ jmp .Lsqrx4x_sub_entry
-.align 32
+.align 16
.Lsqrx4x_sub:
- .byte 0x66
- mov 8*0($tptr),%r12
- mov 8*1($tptr),%r13
- sbb 16*0($nptr),%r12
- mov 8*2($tptr),%r14
- sbb 16*1($nptr),%r13
- mov 8*3($tptr),%r15
- lea 8*4($tptr),$tptr
- sbb 16*2($nptr),%r14
+ mov 8*0($nptr),%r12
+ mov 8*1($nptr),%r13
+ mov 8*2($nptr),%r14
+ mov 8*3($nptr),%r15
+.Lsqrx4x_sub_entry:
+ andn %rax,%r12,%r12
+ lea 8*4($nptr),$nptr
+ andn %rax,%r13,%r13
+ andn %rax,%r14,%r14
+ andn %rax,%r15,%r15
+
+ neg %r8 # mov %r8,%cf
+ adc 8*0($tptr),%r12
+ adc 8*1($tptr),%r13
+ adc 8*2($tptr),%r14
+ adc 8*3($tptr),%r15
mov %r12,8*0($rptr)
- sbb 16*3($nptr),%r15
- lea 16*4($nptr),$nptr
+ lea 8*4($tptr),$tptr
mov %r13,8*1($rptr)
+ sbb %r8,%r8 # mov %cf,%r8
mov %r14,8*2($rptr)
mov %r15,8*3($rptr)
lea 8*4($rptr),$rptr
inc %rcx
jnz .Lsqrx4x_sub
-___
-}
-$code.=<<___;
+
neg %r9 # restore $num
ret
-.size bn_sqrx8x_internal,.-bn_sqrx8x_internal
+.size __bn_postx4x_internal,.-__bn_postx4x_internal
___
+}
}}}
{
my ($inp,$num,$tbl,$idx)=$win64?("%rcx","%edx","%r8", "%r9d") : # Win64 order
@@ -3282,56 +3415,91 @@ bn_scatter5:
.globl bn_gather5
.type bn_gather5,\@abi-omnipotent
-.align 16
+.align 32
bn_gather5:
-___
-$code.=<<___ if ($win64);
-.LSEH_begin_bn_gather5:
+.LSEH_begin_bn_gather5: # Win64 thing, but harmless in other cases
# I can't trust assembler to use specific encoding:-(
- .byte 0x48,0x83,0xec,0x28 #sub \$0x28,%rsp
- .byte 0x0f,0x29,0x34,0x24 #movaps %xmm6,(%rsp)
- .byte 0x0f,0x29,0x7c,0x24,0x10 #movdqa %xmm7,0x10(%rsp)
+ .byte 0x4c,0x8d,0x14,0x24 #lea (%rsp),%r10
+ .byte 0x48,0x81,0xec,0x08,0x01,0x00,0x00 #sub $0x108,%rsp
+ lea .Linc(%rip),%rax
+ and \$-16,%rsp # shouldn't be formally required
+
+ movd $idx,%xmm5
+ movdqa 0(%rax),%xmm0 # 00000001000000010000000000000000
+ movdqa 16(%rax),%xmm1 # 00000002000000020000000200000002
+ lea 128($tbl),%r11 # size optimization
+ lea 128(%rsp),%rax # size optimization
+
+ pshufd \$0,%xmm5,%xmm5 # broadcast $idx
+ movdqa %xmm1,%xmm4
+ movdqa %xmm1,%xmm2
___
+########################################################################
+# calculate mask by comparing 0..31 to $idx and save result to stack
+#
+for($i=0;$i<$STRIDE/16;$i+=4) {
$code.=<<___;
- mov $idx,%r11d
- shr \$`log($N/8)/log(2)`,$idx
- and \$`$N/8-1`,%r11
- not $idx
- lea .Lmagic_masks(%rip),%rax
- and \$`2**5/($N/8)-1`,$idx # 5 is "window size"
- lea 128($tbl,%r11,8),$tbl # pointer within 1st cache line
- movq 0(%rax,$idx,8),%xmm4 # set of masks denoting which
- movq 8(%rax,$idx,8),%xmm5 # cache line contains element
- movq 16(%rax,$idx,8),%xmm6 # denoted by 7th argument
- movq 24(%rax,$idx,8),%xmm7
+ paddd %xmm0,%xmm1
+ pcmpeqd %xmm5,%xmm0 # compare to 1,0
+___
+$code.=<<___ if ($i);
+ movdqa %xmm3,`16*($i-1)-128`(%rax)
+___
+$code.=<<___;
+ movdqa %xmm4,%xmm3
+
+ paddd %xmm1,%xmm2
+ pcmpeqd %xmm5,%xmm1 # compare to 3,2
+ movdqa %xmm0,`16*($i+0)-128`(%rax)
+ movdqa %xmm4,%xmm0
+
+ paddd %xmm2,%xmm3
+ pcmpeqd %xmm5,%xmm2 # compare to 5,4
+ movdqa %xmm1,`16*($i+1)-128`(%rax)
+ movdqa %xmm4,%xmm1
+
+ paddd %xmm3,%xmm0
+ pcmpeqd %xmm5,%xmm3 # compare to 7,6
+ movdqa %xmm2,`16*($i+2)-128`(%rax)
+ movdqa %xmm4,%xmm2
+___
+}
+$code.=<<___;
+ movdqa %xmm3,`16*($i-1)-128`(%rax)
jmp .Lgather
-.align 16
-.Lgather:
- movq `0*$STRIDE/4-128`($tbl),%xmm0
- movq `1*$STRIDE/4-128`($tbl),%xmm1
- pand %xmm4,%xmm0
- movq `2*$STRIDE/4-128`($tbl),%xmm2
- pand %xmm5,%xmm1
- movq `3*$STRIDE/4-128`($tbl),%xmm3
- pand %xmm6,%xmm2
- por %xmm1,%xmm0
- pand %xmm7,%xmm3
- .byte 0x67,0x67
- por %xmm2,%xmm0
- lea $STRIDE($tbl),$tbl
- por %xmm3,%xmm0
+.align 32
+.Lgather:
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
+___
+for($i=0;$i<$STRIDE/16;$i+=4) {
+$code.=<<___;
+ movdqa `16*($i+0)-128`(%r11),%xmm0
+ movdqa `16*($i+1)-128`(%r11),%xmm1
+ movdqa `16*($i+2)-128`(%r11),%xmm2
+ pand `16*($i+0)-128`(%rax),%xmm0
+ movdqa `16*($i+3)-128`(%r11),%xmm3
+ pand `16*($i+1)-128`(%rax),%xmm1
+ por %xmm0,%xmm4
+ pand `16*($i+2)-128`(%rax),%xmm2
+ por %xmm1,%xmm5
+ pand `16*($i+3)-128`(%rax),%xmm3
+ por %xmm2,%xmm4
+ por %xmm3,%xmm5
+___
+}
+$code.=<<___;
+ por %xmm5,%xmm4
+ lea $STRIDE(%r11),%r11
+ pshufd \$0x4e,%xmm4,%xmm0
+ por %xmm4,%xmm0
movq %xmm0,($out) # m0=bp[0]
lea 8($out),$out
sub \$1,$num
jnz .Lgather
-___
-$code.=<<___ if ($win64);
- movaps (%rsp),%xmm6
- movaps 0x10(%rsp),%xmm7
- lea 0x28(%rsp),%rsp
-___
-$code.=<<___;
+
+ lea (%r10),%rsp
ret
.LSEH_end_bn_gather5:
.size bn_gather5,.-bn_gather5
@@ -3339,9 +3507,9 @@ ___
}
$code.=<<___;
.align 64
-.Lmagic_masks:
- .long 0,0, 0,0, 0,0, -1,-1
- .long 0,0, 0,0, 0,0, 0,0
+.Linc:
+ .long 0,0, 1,1
+ .long 2,2, 2,2
.asciz "Montgomery Multiplication with scatter/gather for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
___
@@ -3389,19 +3557,16 @@ mul_handler:
lea .Lmul_epilogue(%rip),%r10
cmp %r10,%rbx
- jb .Lbody_40
+ ja .Lbody_40
mov 192($context),%r10 # pull $num
mov 8(%rax,%r10,8),%rax # pull saved stack pointer
+
jmp .Lbody_proceed
.Lbody_40:
mov 40(%rax),%rax # pull saved stack pointer
.Lbody_proceed:
-
- movaps -88(%rax),%xmm0
- movaps -72(%rax),%xmm1
-
mov -8(%rax),%rbx
mov -16(%rax),%rbp
mov -24(%rax),%r12
@@ -3414,8 +3579,6 @@ mul_handler:
mov %r13,224($context) # restore context->R13
mov %r14,232($context) # restore context->R14
mov %r15,240($context) # restore context->R15
- movups %xmm0,512($context) # restore context->Xmm6
- movups %xmm1,528($context) # restore context->Xmm7
.Lcommon_seh_tail:
mov 8(%rax),%rdi
@@ -3526,10 +3689,9 @@ ___
$code.=<<___;
.align 8
.LSEH_info_bn_gather5:
- .byte 0x01,0x0d,0x05,0x00
- .byte 0x0d,0x78,0x01,0x00 #movaps 0x10(rsp),xmm7
- .byte 0x08,0x68,0x00,0x00 #movaps (rsp),xmm6
- .byte 0x04,0x42,0x00,0x00 #sub rsp,0x28
+ .byte 0x01,0x0b,0x03,0x0a
+ .byte 0x0b,0x01,0x21,0x00 # sub rsp,0x108
+ .byte 0x04,0xa3,0x00,0x00 # lea r10,(rsp)
.align 8
___
}