aboutsummaryrefslogtreecommitdiffstats
path: root/crypto/md5/asm/md5-x86_64.pl
diff options
context:
space:
mode:
authorJung-uk Kim <jkim@FreeBSD.org>2012-07-11 23:31:36 +0000
committerJung-uk Kim <jkim@FreeBSD.org>2012-07-11 23:31:36 +0000
commit0758ab5ea778e4ba36d2150af1bba602a48d6467 (patch)
tree0c30591ac90cb5e07a0763793709fd1056b67f57 /crypto/md5/asm/md5-x86_64.pl
parent2b8b5455829304396e38200c205612c4dc57c052 (diff)
downloadsrc-0758ab5ea778e4ba36d2150af1bba602a48d6467.tar.gz
src-0758ab5ea778e4ba36d2150af1bba602a48d6467.zip
Import OpenSSL 1.0.1c.vendor/openssl/1.0.1c
Approved by: benl (maintainer)
Notes
Notes: svn path=/vendor-crypto/openssl/dist/; revision=238384 svn path=/vendor-crypto/openssl/1.0.1c/; revision=238385; tag=vendor/openssl/1.0.1c
Diffstat (limited to 'crypto/md5/asm/md5-x86_64.pl')
-rwxr-xr-xcrypto/md5/asm/md5-x86_64.pl160
1 files changed, 140 insertions, 20 deletions
diff --git a/crypto/md5/asm/md5-x86_64.pl b/crypto/md5/asm/md5-x86_64.pl
index 05d040f0b928..867885435e29 100755
--- a/crypto/md5/asm/md5-x86_64.pl
+++ b/crypto/md5/asm/md5-x86_64.pl
@@ -15,11 +15,10 @@ my $code;
# dst = x + ((dst + F(x,y,z) + X[k] + T_i) <<< s)
# %r10d = X[k_next]
# %r11d = z' (copy of z for the next step)
-# Each round1_step() takes about 5.71 clocks (9 instructions, 1.58 IPC)
+# Each round1_step() takes about 5.3 clocks (9 instructions, 1.7 IPC)
sub round1_step
{
my ($pos, $dst, $x, $y, $z, $k_next, $T_i, $s) = @_;
- $T_i = unpack("l",pack("l", hex($T_i))); # convert to 32-bit signed decimal
$code .= " mov 0*4(%rsi), %r10d /* (NEXT STEP) X[0] */\n" if ($pos == -1);
$code .= " mov %edx, %r11d /* (NEXT STEP) z' = %edx */\n" if ($pos == -1);
$code .= <<EOF;
@@ -38,23 +37,26 @@ EOF
# round2_step() does:
# dst = x + ((dst + G(x,y,z) + X[k] + T_i) <<< s)
# %r10d = X[k_next]
-# %r11d = y' (copy of y for the next step)
-# Each round2_step() takes about 6.22 clocks (9 instructions, 1.45 IPC)
+# %r11d = z' (copy of z for the next step)
+# %r12d = z' (copy of z for the next step)
+# Each round2_step() takes about 5.4 clocks (11 instructions, 2.0 IPC)
sub round2_step
{
my ($pos, $dst, $x, $y, $z, $k_next, $T_i, $s) = @_;
- $T_i = unpack("l",pack("l", hex($T_i))); # convert to 32-bit signed decimal
$code .= " mov 1*4(%rsi), %r10d /* (NEXT STEP) X[1] */\n" if ($pos == -1);
- $code .= " mov %ecx, %r11d /* (NEXT STEP) y' = %ecx */\n" if ($pos == -1);
+ $code .= " mov %edx, %r11d /* (NEXT STEP) z' = %edx */\n" if ($pos == -1);
+ $code .= " mov %edx, %r12d /* (NEXT STEP) z' = %edx */\n" if ($pos == -1);
$code .= <<EOF;
- xor $x, %r11d /* x ^ ... */
+ not %r11d /* not z */
lea $T_i($dst,%r10d),$dst /* Const + dst + ... */
- and $z, %r11d /* z & ... */
- xor $y, %r11d /* y ^ ... */
+ and $x, %r12d /* x & z */
+ and $y, %r11d /* y & (not z) */
mov $k_next*4(%rsi),%r10d /* (NEXT STEP) X[$k_next] */
- add %r11d, $dst /* dst += ... */
+ or %r11d, %r12d /* (y & (not z)) | (x & z) */
+ mov $y, %r11d /* (NEXT STEP) z' = $y */
+ add %r12d, $dst /* dst += ... */
+ mov $y, %r12d /* (NEXT STEP) z' = $y */
rol \$$s, $dst /* dst <<< s */
- mov $x, %r11d /* (NEXT STEP) y' = $x */
add $x, $dst /* dst += x */
EOF
}
@@ -63,11 +65,10 @@ EOF
# dst = x + ((dst + H(x,y,z) + X[k] + T_i) <<< s)
# %r10d = X[k_next]
# %r11d = y' (copy of y for the next step)
-# Each round3_step() takes about 4.26 clocks (8 instructions, 1.88 IPC)
+# Each round3_step() takes about 4.2 clocks (8 instructions, 1.9 IPC)
sub round3_step
{
my ($pos, $dst, $x, $y, $z, $k_next, $T_i, $s) = @_;
- $T_i = unpack("l",pack("l", hex($T_i))); # convert to 32-bit signed decimal
$code .= " mov 5*4(%rsi), %r10d /* (NEXT STEP) X[5] */\n" if ($pos == -1);
$code .= " mov %ecx, %r11d /* (NEXT STEP) y' = %ecx */\n" if ($pos == -1);
$code .= <<EOF;
@@ -86,11 +87,10 @@ EOF
# dst = x + ((dst + I(x,y,z) + X[k] + T_i) <<< s)
# %r10d = X[k_next]
# %r11d = not z' (copy of not z for the next step)
-# Each round4_step() takes about 5.27 clocks (9 instructions, 1.71 IPC)
+# Each round4_step() takes about 5.2 clocks (9 instructions, 1.7 IPC)
sub round4_step
{
my ($pos, $dst, $x, $y, $z, $k_next, $T_i, $s) = @_;
- $T_i = unpack("l",pack("l", hex($T_i))); # convert to 32-bit signed decimal
$code .= " mov 0*4(%rsi), %r10d /* (NEXT STEP) X[0] */\n" if ($pos == -1);
$code .= " mov \$0xffffffff, %r11d\n" if ($pos == -1);
$code .= " xor %edx, %r11d /* (NEXT STEP) not z' = not %edx*/\n"
@@ -108,8 +108,19 @@ sub round4_step
EOF
}
-my $output = shift;
-open STDOUT,"| $^X ../perlasm/x86_64-xlate.pl $output";
+my $flavour = shift;
+my $output = shift;
+if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
+
+my $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; my $dir=$1; my $xlate;
+( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
+die "can't locate x86_64-xlate.pl";
+
+no warnings qw(uninitialized);
+open STDOUT,"| $^X $xlate $flavour $output";
$code .= <<EOF;
.text
@@ -120,8 +131,10 @@ $code .= <<EOF;
md5_block_asm_data_order:
push %rbp
push %rbx
+ push %r12
push %r14
push %r15
+.Lprologue:
# rdi = arg #1 (ctx, MD5_CTX pointer)
# rsi = arg #2 (ptr, data pointer)
@@ -236,13 +249,120 @@ $code .= <<EOF;
mov %ecx, 2*4(%rbp) # ctx->C = C
mov %edx, 3*4(%rbp) # ctx->D = D
+ mov (%rsp),%r15
+ mov 8(%rsp),%r14
+ mov 16(%rsp),%r12
+ mov 24(%rsp),%rbx
+ mov 32(%rsp),%rbp
+ add \$40,%rsp
+.Lepilogue:
+ ret
+.size md5_block_asm_data_order,.-md5_block_asm_data_order
+EOF
+
+# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
+# CONTEXT *context,DISPATCHER_CONTEXT *disp)
+if ($win64) {
+my $rec="%rcx";
+my $frame="%rdx";
+my $context="%r8";
+my $disp="%r9";
+
+$code.=<<___;
+.extern __imp_RtlVirtualUnwind
+.type se_handler,\@abi-omnipotent
+.align 16
+se_handler:
+ push %rsi
+ push %rdi
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+ pushfq
+ sub \$64,%rsp
+
+ mov 120($context),%rax # pull context->Rax
+ mov 248($context),%rbx # pull context->Rip
+
+ lea .Lprologue(%rip),%r10
+ cmp %r10,%rbx # context->Rip<.Lprologue
+ jb .Lin_prologue
+
+ mov 152($context),%rax # pull context->Rsp
+
+ lea .Lepilogue(%rip),%r10
+ cmp %r10,%rbx # context->Rip>=.Lepilogue
+ jae .Lin_prologue
+
+ lea 40(%rax),%rax
+
+ mov -8(%rax),%rbp
+ mov -16(%rax),%rbx
+ mov -24(%rax),%r12
+ mov -32(%rax),%r14
+ mov -40(%rax),%r15
+ mov %rbx,144($context) # restore context->Rbx
+ mov %rbp,160($context) # restore context->Rbp
+ mov %r12,216($context) # restore context->R12
+ mov %r14,232($context) # restore context->R14
+ mov %r15,240($context) # restore context->R15
+
+.Lin_prologue:
+ mov 8(%rax),%rdi
+ mov 16(%rax),%rsi
+ mov %rax,152($context) # restore context->Rsp
+ mov %rsi,168($context) # restore context->Rsi
+ mov %rdi,176($context) # restore context->Rdi
+
+ mov 40($disp),%rdi # disp->ContextRecord
+ mov $context,%rsi # context
+ mov \$154,%ecx # sizeof(CONTEXT)
+ .long 0xa548f3fc # cld; rep movsq
+
+ mov $disp,%rsi
+ xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
+ mov 8(%rsi),%rdx # arg2, disp->ImageBase
+ mov 0(%rsi),%r8 # arg3, disp->ControlPc
+ mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
+ mov 40(%rsi),%r10 # disp->ContextRecord
+ lea 56(%rsi),%r11 # &disp->HandlerData
+ lea 24(%rsi),%r12 # &disp->EstablisherFrame
+ mov %r10,32(%rsp) # arg5
+ mov %r11,40(%rsp) # arg6
+ mov %r12,48(%rsp) # arg7
+ mov %rcx,56(%rsp) # arg8, (NULL)
+ call *__imp_RtlVirtualUnwind(%rip)
+
+ mov \$1,%eax # ExceptionContinueSearch
+ add \$64,%rsp
+ popfq
pop %r15
pop %r14
- pop %rbx
+ pop %r13
+ pop %r12
pop %rbp
+ pop %rbx
+ pop %rdi
+ pop %rsi
ret
-.size md5_block_asm_data_order,.-md5_block_asm_data_order
-EOF
+.size se_handler,.-se_handler
+
+.section .pdata
+.align 4
+ .rva .LSEH_begin_md5_block_asm_data_order
+ .rva .LSEH_end_md5_block_asm_data_order
+ .rva .LSEH_info_md5_block_asm_data_order
+
+.section .xdata
+.align 8
+.LSEH_info_md5_block_asm_data_order:
+ .byte 9,0,0,0
+ .rva se_handler
+___
+}
print $code;