| field | value | |
|---|---|---|
| author | Jung-uk Kim <jkim@FreeBSD.org> | 2020-03-20 21:43:08 +0000 |
| committer | Jung-uk Kim <jkim@FreeBSD.org> | 2020-03-20 21:43:08 +0000 |
| commit | 737493770cbfb0e6144e7c1eddebbe9160394a0f (patch) | |
| tree | 041bf5f9d13b214279a3737b71d9ed65826ac7a6 | /secure |
| parent | 29b5aa1b8efcee2c420239594c8840a0e465e8dd (diff) | |
| download | src-737493770cbfb0e6144e7c1eddebbe9160394a0f.tar.gz src-737493770cbfb0e6144e7c1eddebbe9160394a0f.zip | |
MFC: r359060, r359061, r359066
Merge OpenSSL 1.1.1e.
Notes:
svn path=/stable/12/; revision=359186
Diffstat (limited to 'secure')
572 files changed, 3176 insertions, 45575 deletions
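The change itself is just the vendor import; a quick way to confirm that the merged headers and the installed libcrypto agree on 1.1.1e is to compare the compile-time and run-time version reports. Below is a minimal sketch using the standard OpenSSL 1.1.x version API (`OpenSSL_version()`, `OpenSSL_version_num()`, `OPENSSL_VERSION_TEXT`); the file name and compile command are illustrative and not part of this commit.

```c
/*
 * Sketch: check that the headers and the runtime libcrypto match after
 * the 1.1.1e merge.  Build (illustrative): cc check_ssl.c -lcrypto
 */
#include <stdio.h>
#include <openssl/crypto.h>	/* OpenSSL_version(), OpenSSL_version_num() */
#include <openssl/opensslv.h>	/* OPENSSL_VERSION_NUMBER, OPENSSL_VERSION_TEXT */

int
main(void)
{
	/* Compile-time view: what the installed headers claim. */
	printf("headers: %s (0x%08lx)\n", OPENSSL_VERSION_TEXT,
	    (unsigned long)OPENSSL_VERSION_NUMBER);

	/* Run-time view: what the linked libcrypto actually reports. */
	printf("library: %s (0x%08lx)\n", OpenSSL_version(OPENSSL_VERSION),
	    OpenSSL_version_num());

	/* After this MFC both lines should name 1.1.1e. */
	return (OpenSSL_version_num() == OPENSSL_VERSION_NUMBER ? 0 : 1);
}
```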
diff --git a/secure/lib/libcrypto/Makefile.inc b/secure/lib/libcrypto/Makefile.inc
index a9d8df50be02..91217dc1e454 100644
--- a/secure/lib/libcrypto/Makefile.inc
+++ b/secure/lib/libcrypto/Makefile.inc
@@ -3,8 +3,8 @@
 .include <bsd.own.mk>

 # OpenSSL version used for manual page generation
-OPENSSL_VER=	1.1.1d
-OPENSSL_DATE=	2019-09-10
+OPENSSL_VER=	1.1.1e
+OPENSSL_DATE=	2020-03-17

 LCRYPTO_SRC=	${SRCTOP}/crypto/openssl
 LCRYPTO_DOC=	${LCRYPTO_SRC}/doc
diff --git a/secure/lib/libcrypto/aarch64/ecp_nistz256-armv8.S b/secure/lib/libcrypto/aarch64/ecp_nistz256-armv8.S
index c0b5f8cede17..f7fcce4365fa 100644
--- a/secure/lib/libcrypto/aarch64/ecp_nistz256-armv8.S
+++ b/secure/lib/libcrypto/aarch64/ecp_nistz256-armv8.S
@@ -3017,7 +3017,7 @@ __ecp_nistz256_div_by_2:
 .align 5
 ecp_nistz256_point_double:
 .inst 0xd503233f // paciasp
- stp x29,x30,[sp,#-80]!
+ stp x29,x30,[sp,#-96]!
 add x29,sp,#0
 stp x19,x20,[sp,#16]
 stp x21,x22,[sp,#32]
@@ -3150,7 +3150,7 @@ ecp_nistz256_point_double:
 add sp,x29,#0 // destroy frame
 ldp x19,x20,[x29,#16]
 ldp x21,x22,[x29,#32]
- ldp x29,x30,[sp],#80
+ ldp x29,x30,[sp],#96
 .inst 0xd50323bf // autiasp
 ret
 .size ecp_nistz256_point_double,.-ecp_nistz256_point_double
@@ -3159,12 +3159,13 @@ ecp_nistz256_point_double:
 .align 5
 ecp_nistz256_point_add:
 .inst 0xd503233f // paciasp
- stp x29,x30,[sp,#-80]!
+ stp x29,x30,[sp,#-96]!
 add x29,sp,#0
 stp x19,x20,[sp,#16]
 stp x21,x22,[sp,#32]
 stp x23,x24,[sp,#48]
 stp x25,x26,[sp,#64]
+ stp x27,x28,[sp,#80]
 sub sp,sp,#32*12

 ldp x4,x5,[x2,#64] // in2_z
@@ -3178,7 +3179,7 @@ ecp_nistz256_point_add:
 orr x10,x6,x7
 orr x25,x8,x10
 cmp x25,#0
- csetm x25,ne // !in2infty
+ csetm x25,ne // ~in2infty

 add x0,sp,#192
 bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Z2sqr, in2_z);
@@ -3188,7 +3189,7 @@ ecp_nistz256_point_add:
 orr x10,x6,x7
 orr x24,x8,x10
 cmp x24,#0
- csetm x24,ne // !in1infty
+ csetm x24,ne // ~in1infty

 add x0,sp,#128
 bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Z1sqr, in1_z);
@@ -3229,7 +3230,7 @@ ecp_nistz256_point_add:
 orr x14,x14,x15 // see if result is zero
 orr x16,x16,x17
- orr x26,x14,x16
+ orr x26,x14,x16 // ~is_equal(S1,S2)

 add x2,sp,#192
 add x0,sp,#256
@@ -3250,32 +3251,21 @@ ecp_nistz256_point_add:
 orr x14,x14,x15 // see if result is zero
 orr x16,x16,x17
- orr x14,x14,x16
- tst x14,x14
- b.ne .Ladd_proceed // is_equal(U1,U2)?
+ orr x14,x14,x16 // ~is_equal(U1,U2)

- tst x24,x25
- b.eq .Ladd_proceed // (in1infty || in2infty)?
+ mvn x27,x24 // -1/0 -> 0/-1
+ mvn x28,x25 // -1/0 -> 0/-1
+ orr x14,x14,x27
+ orr x14,x14,x28
+ orr x14,x14,x26
+ cbnz x14,.Ladd_proceed // if(~is_equal(U1,U2) | in1infty | in2infty | ~is_equal(S1,S2))

- tst x26,x26
- b.eq .Ladd_double // is_equal(S1,S2)?
-
- eor x4,x4,x4
- eor x5,x5,x5
- stp x4,x5,[x21]
- stp x4,x5,[x21,#16]
- stp x4,x5,[x21,#32]
- stp x4,x5,[x21,#48]
- stp x4,x5,[x21,#64]
- stp x4,x5,[x21,#80]
- b .Ladd_done
-
-.align 4
 .Ladd_double:
 mov x1,x22
 mov x0,x21
 ldp x23,x24,[x29,#48]
 ldp x25,x26,[x29,#64]
+ ldp x27,x28,[x29,#80]
 add sp,sp,#32*(12-4) // difference in stack frames
 b .Ldouble_shortcut
@@ -3357,14 +3347,14 @@ ecp_nistz256_point_add:
 ldp x8,x9,[x23] // in2
 ldp x10,x11,[x23,#16]
 ldp x14,x15,[x22,#0] // in1
- cmp x24,#0 // !, remember?
+ cmp x24,#0 // ~, remember?
 ldp x16,x17,[x22,#0+16]
 csel x8,x4,x8,ne
 csel x9,x5,x9,ne
 ldp x4,x5,[sp,#0+0+32] // res
 csel x10,x6,x10,ne
 csel x11,x7,x11,ne
- cmp x25,#0 // !, remember?
+ cmp x25,#0 // ~, remember?
ldp x6,x7,[sp,#0+0+48] csel x14,x8,x14,ne csel x15,x9,x15,ne @@ -3375,14 +3365,14 @@ ecp_nistz256_point_add: stp x14,x15,[x21,#0] stp x16,x17,[x21,#0+16] ldp x14,x15,[x22,#32] // in1 - cmp x24,#0 // !, remember? + cmp x24,#0 // ~, remember? ldp x16,x17,[x22,#32+16] csel x8,x4,x8,ne csel x9,x5,x9,ne ldp x4,x5,[sp,#0+32+32] // res csel x10,x6,x10,ne csel x11,x7,x11,ne - cmp x25,#0 // !, remember? + cmp x25,#0 // ~, remember? ldp x6,x7,[sp,#0+32+48] csel x14,x8,x14,ne csel x15,x9,x15,ne @@ -3393,13 +3383,13 @@ ecp_nistz256_point_add: stp x14,x15,[x21,#32] stp x16,x17,[x21,#32+16] ldp x14,x15,[x22,#64] // in1 - cmp x24,#0 // !, remember? + cmp x24,#0 // ~, remember? ldp x16,x17,[x22,#64+16] csel x8,x4,x8,ne csel x9,x5,x9,ne csel x10,x6,x10,ne csel x11,x7,x11,ne - cmp x25,#0 // !, remember? + cmp x25,#0 // ~, remember? csel x14,x8,x14,ne csel x15,x9,x15,ne csel x16,x10,x16,ne @@ -3413,7 +3403,8 @@ ecp_nistz256_point_add: ldp x21,x22,[x29,#32] ldp x23,x24,[x29,#48] ldp x25,x26,[x29,#64] - ldp x29,x30,[sp],#80 + ldp x27,x28,[x29,#80] + ldp x29,x30,[sp],#96 .inst 0xd50323bf // autiasp ret .size ecp_nistz256_point_add,.-ecp_nistz256_point_add @@ -3442,7 +3433,7 @@ ecp_nistz256_point_add_affine: orr x10,x6,x7 orr x24,x8,x10 cmp x24,#0 - csetm x24,ne // !in1infty + csetm x24,ne // ~in1infty ldp x14,x15,[x2] // in2_x ldp x16,x17,[x2,#16] @@ -3456,7 +3447,7 @@ ecp_nistz256_point_add_affine: orr x8,x8,x10 orr x25,x14,x8 cmp x25,#0 - csetm x25,ne // !in2infty + csetm x25,ne // ~in2infty add x0,sp,#128 bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Z1sqr, in1_z); @@ -3563,14 +3554,14 @@ ecp_nistz256_point_add_affine: ldp x8,x9,[x23] // in2 ldp x10,x11,[x23,#16] ldp x14,x15,[x22,#0] // in1 - cmp x24,#0 // !, remember? + cmp x24,#0 // ~, remember? ldp x16,x17,[x22,#0+16] csel x8,x4,x8,ne csel x9,x5,x9,ne ldp x4,x5,[sp,#0+0+32] // res csel x10,x6,x10,ne csel x11,x7,x11,ne - cmp x25,#0 // !, remember? + cmp x25,#0 // ~, remember? ldp x6,x7,[sp,#0+0+48] csel x14,x8,x14,ne csel x15,x9,x15,ne @@ -3582,14 +3573,14 @@ ecp_nistz256_point_add_affine: stp x16,x17,[x21,#0+16] adr x23,.Lone_mont-64 ldp x14,x15,[x22,#32] // in1 - cmp x24,#0 // !, remember? + cmp x24,#0 // ~, remember? ldp x16,x17,[x22,#32+16] csel x8,x4,x8,ne csel x9,x5,x9,ne ldp x4,x5,[sp,#0+32+32] // res csel x10,x6,x10,ne csel x11,x7,x11,ne - cmp x25,#0 // !, remember? + cmp x25,#0 // ~, remember? ldp x6,x7,[sp,#0+32+48] csel x14,x8,x14,ne csel x15,x9,x15,ne @@ -3600,13 +3591,13 @@ ecp_nistz256_point_add_affine: stp x14,x15,[x21,#32] stp x16,x17,[x21,#32+16] ldp x14,x15,[x22,#64] // in1 - cmp x24,#0 // !, remember? + cmp x24,#0 // ~, remember? ldp x16,x17,[x22,#64+16] csel x8,x4,x8,ne csel x9,x5,x9,ne csel x10,x6,x10,ne csel x11,x7,x11,ne - cmp x25,#0 // !, remember? + cmp x25,#0 // ~, remember? csel x14,x8,x14,ne csel x15,x9,x15,ne csel x16,x10,x16,ne diff --git a/secure/lib/libcrypto/aarch64/sha256-armv8.S b/secure/lib/libcrypto/aarch64/sha256-armv8.S index 40d1fb269b35..35bf48ba5178 100644 --- a/secure/lib/libcrypto/aarch64/sha256-armv8.S +++ b/secure/lib/libcrypto/aarch64/sha256-armv8.S @@ -1,6 +1,6 @@ /* $FreeBSD$ */ /* Do not modify. This file is auto-generated from sha512-armv8.pl. */ -// Copyright 2014-2019 The OpenSSL Project Authors. All Rights Reserved. +// Copyright 2014-2020 The OpenSSL Project Authors. All Rights Reserved. // // Licensed under the OpenSSL license (the "License"). You may not use // this file except in compliance with the License. 
You can obtain a copy diff --git a/secure/lib/libcrypto/aarch64/sha512-armv8.S b/secure/lib/libcrypto/aarch64/sha512-armv8.S index a2a2b030ef4c..06cf5a239d89 100644 --- a/secure/lib/libcrypto/aarch64/sha512-armv8.S +++ b/secure/lib/libcrypto/aarch64/sha512-armv8.S @@ -1,6 +1,6 @@ /* $FreeBSD$ */ /* Do not modify. This file is auto-generated from sha512-armv8.pl. */ -// Copyright 2014-2019 The OpenSSL Project Authors. All Rights Reserved. +// Copyright 2014-2020 The OpenSSL Project Authors. All Rights Reserved. // // Licensed under the OpenSSL license (the "License"). You may not use // this file except in compliance with the License. You can obtain a copy diff --git a/secure/lib/libcrypto/amd64/aesni-gcm-x86_64.S b/secure/lib/libcrypto/amd64/aesni-gcm-x86_64.S index 723abb458f98..1cdcc86043b2 100644 --- a/secure/lib/libcrypto/amd64/aesni-gcm-x86_64.S +++ b/secure/lib/libcrypto/amd64/aesni-gcm-x86_64.S @@ -2,786 +2,20 @@ /* Do not modify. This file is auto-generated from aesni-gcm-x86_64.pl. */ .text -.type _aesni_ctr32_ghash_6x,@function -.align 32 -_aesni_ctr32_ghash_6x: - vmovdqu 32(%r11),%xmm2 - subq $6,%rdx - vpxor %xmm4,%xmm4,%xmm4 - vmovdqu 0-128(%rcx),%xmm15 - vpaddb %xmm2,%xmm1,%xmm10 - vpaddb %xmm2,%xmm10,%xmm11 - vpaddb %xmm2,%xmm11,%xmm12 - vpaddb %xmm2,%xmm12,%xmm13 - vpaddb %xmm2,%xmm13,%xmm14 - vpxor %xmm15,%xmm1,%xmm9 - vmovdqu %xmm4,16+8(%rsp) - jmp .Loop6x - -.align 32 -.Loop6x: - addl $100663296,%ebx - jc .Lhandle_ctr32 - vmovdqu 0-32(%r9),%xmm3 - vpaddb %xmm2,%xmm14,%xmm1 - vpxor %xmm15,%xmm10,%xmm10 - vpxor %xmm15,%xmm11,%xmm11 - -.Lresume_ctr32: - vmovdqu %xmm1,(%r8) - vpclmulqdq $0x10,%xmm3,%xmm7,%xmm5 - vpxor %xmm15,%xmm12,%xmm12 - vmovups 16-128(%rcx),%xmm2 - vpclmulqdq $0x01,%xmm3,%xmm7,%xmm6 - xorq %r12,%r12 - cmpq %r14,%r15 - - vaesenc %xmm2,%xmm9,%xmm9 - vmovdqu 48+8(%rsp),%xmm0 - vpxor %xmm15,%xmm13,%xmm13 - vpclmulqdq $0x00,%xmm3,%xmm7,%xmm1 - vaesenc %xmm2,%xmm10,%xmm10 - vpxor %xmm15,%xmm14,%xmm14 - setnc %r12b - vpclmulqdq $0x11,%xmm3,%xmm7,%xmm7 - vaesenc %xmm2,%xmm11,%xmm11 - vmovdqu 16-32(%r9),%xmm3 - negq %r12 - vaesenc %xmm2,%xmm12,%xmm12 - vpxor %xmm5,%xmm6,%xmm6 - vpclmulqdq $0x00,%xmm3,%xmm0,%xmm5 - vpxor %xmm4,%xmm8,%xmm8 - vaesenc %xmm2,%xmm13,%xmm13 - vpxor %xmm5,%xmm1,%xmm4 - andq $0x60,%r12 - vmovups 32-128(%rcx),%xmm15 - vpclmulqdq $0x10,%xmm3,%xmm0,%xmm1 - vaesenc %xmm2,%xmm14,%xmm14 - - vpclmulqdq $0x01,%xmm3,%xmm0,%xmm2 - leaq (%r14,%r12,1),%r14 - vaesenc %xmm15,%xmm9,%xmm9 - vpxor 16+8(%rsp),%xmm8,%xmm8 - vpclmulqdq $0x11,%xmm3,%xmm0,%xmm3 - vmovdqu 64+8(%rsp),%xmm0 - vaesenc %xmm15,%xmm10,%xmm10 - movbeq 88(%r14),%r13 - vaesenc %xmm15,%xmm11,%xmm11 - movbeq 80(%r14),%r12 - vaesenc %xmm15,%xmm12,%xmm12 - movq %r13,32+8(%rsp) - vaesenc %xmm15,%xmm13,%xmm13 - movq %r12,40+8(%rsp) - vmovdqu 48-32(%r9),%xmm5 - vaesenc %xmm15,%xmm14,%xmm14 - - vmovups 48-128(%rcx),%xmm15 - vpxor %xmm1,%xmm6,%xmm6 - vpclmulqdq $0x00,%xmm5,%xmm0,%xmm1 - vaesenc %xmm15,%xmm9,%xmm9 - vpxor %xmm2,%xmm6,%xmm6 - vpclmulqdq $0x10,%xmm5,%xmm0,%xmm2 - vaesenc %xmm15,%xmm10,%xmm10 - vpxor %xmm3,%xmm7,%xmm7 - vpclmulqdq $0x01,%xmm5,%xmm0,%xmm3 - vaesenc %xmm15,%xmm11,%xmm11 - vpclmulqdq $0x11,%xmm5,%xmm0,%xmm5 - vmovdqu 80+8(%rsp),%xmm0 - vaesenc %xmm15,%xmm12,%xmm12 - vaesenc %xmm15,%xmm13,%xmm13 - vpxor %xmm1,%xmm4,%xmm4 - vmovdqu 64-32(%r9),%xmm1 - vaesenc %xmm15,%xmm14,%xmm14 - - vmovups 64-128(%rcx),%xmm15 - vpxor %xmm2,%xmm6,%xmm6 - vpclmulqdq $0x00,%xmm1,%xmm0,%xmm2 - vaesenc %xmm15,%xmm9,%xmm9 - vpxor %xmm3,%xmm6,%xmm6 - vpclmulqdq $0x10,%xmm1,%xmm0,%xmm3 - vaesenc 
%xmm15,%xmm10,%xmm10 - movbeq 72(%r14),%r13 - vpxor %xmm5,%xmm7,%xmm7 - vpclmulqdq $0x01,%xmm1,%xmm0,%xmm5 - vaesenc %xmm15,%xmm11,%xmm11 - movbeq 64(%r14),%r12 - vpclmulqdq $0x11,%xmm1,%xmm0,%xmm1 - vmovdqu 96+8(%rsp),%xmm0 - vaesenc %xmm15,%xmm12,%xmm12 - movq %r13,48+8(%rsp) - vaesenc %xmm15,%xmm13,%xmm13 - movq %r12,56+8(%rsp) - vpxor %xmm2,%xmm4,%xmm4 - vmovdqu 96-32(%r9),%xmm2 - vaesenc %xmm15,%xmm14,%xmm14 - - vmovups 80-128(%rcx),%xmm15 - vpxor %xmm3,%xmm6,%xmm6 - vpclmulqdq $0x00,%xmm2,%xmm0,%xmm3 - vaesenc %xmm15,%xmm9,%xmm9 - vpxor %xmm5,%xmm6,%xmm6 - vpclmulqdq $0x10,%xmm2,%xmm0,%xmm5 - vaesenc %xmm15,%xmm10,%xmm10 - movbeq 56(%r14),%r13 - vpxor %xmm1,%xmm7,%xmm7 - vpclmulqdq $0x01,%xmm2,%xmm0,%xmm1 - vpxor 112+8(%rsp),%xmm8,%xmm8 - vaesenc %xmm15,%xmm11,%xmm11 - movbeq 48(%r14),%r12 - vpclmulqdq $0x11,%xmm2,%xmm0,%xmm2 - vaesenc %xmm15,%xmm12,%xmm12 - movq %r13,64+8(%rsp) - vaesenc %xmm15,%xmm13,%xmm13 - movq %r12,72+8(%rsp) - vpxor %xmm3,%xmm4,%xmm4 - vmovdqu 112-32(%r9),%xmm3 - vaesenc %xmm15,%xmm14,%xmm14 - - vmovups 96-128(%rcx),%xmm15 - vpxor %xmm5,%xmm6,%xmm6 - vpclmulqdq $0x10,%xmm3,%xmm8,%xmm5 - vaesenc %xmm15,%xmm9,%xmm9 - vpxor %xmm1,%xmm6,%xmm6 - vpclmulqdq $0x01,%xmm3,%xmm8,%xmm1 - vaesenc %xmm15,%xmm10,%xmm10 - movbeq 40(%r14),%r13 - vpxor %xmm2,%xmm7,%xmm7 - vpclmulqdq $0x00,%xmm3,%xmm8,%xmm2 - vaesenc %xmm15,%xmm11,%xmm11 - movbeq 32(%r14),%r12 - vpclmulqdq $0x11,%xmm3,%xmm8,%xmm8 - vaesenc %xmm15,%xmm12,%xmm12 - movq %r13,80+8(%rsp) - vaesenc %xmm15,%xmm13,%xmm13 - movq %r12,88+8(%rsp) - vpxor %xmm5,%xmm6,%xmm6 - vaesenc %xmm15,%xmm14,%xmm14 - vpxor %xmm1,%xmm6,%xmm6 - - vmovups 112-128(%rcx),%xmm15 - vpslldq $8,%xmm6,%xmm5 - vpxor %xmm2,%xmm4,%xmm4 - vmovdqu 16(%r11),%xmm3 - - vaesenc %xmm15,%xmm9,%xmm9 - vpxor %xmm8,%xmm7,%xmm7 - vaesenc %xmm15,%xmm10,%xmm10 - vpxor %xmm5,%xmm4,%xmm4 - movbeq 24(%r14),%r13 - vaesenc %xmm15,%xmm11,%xmm11 - movbeq 16(%r14),%r12 - vpalignr $8,%xmm4,%xmm4,%xmm0 - vpclmulqdq $0x10,%xmm3,%xmm4,%xmm4 - movq %r13,96+8(%rsp) - vaesenc %xmm15,%xmm12,%xmm12 - movq %r12,104+8(%rsp) - vaesenc %xmm15,%xmm13,%xmm13 - vmovups 128-128(%rcx),%xmm1 - vaesenc %xmm15,%xmm14,%xmm14 - - vaesenc %xmm1,%xmm9,%xmm9 - vmovups 144-128(%rcx),%xmm15 - vaesenc %xmm1,%xmm10,%xmm10 - vpsrldq $8,%xmm6,%xmm6 - vaesenc %xmm1,%xmm11,%xmm11 - vpxor %xmm6,%xmm7,%xmm7 - vaesenc %xmm1,%xmm12,%xmm12 - vpxor %xmm0,%xmm4,%xmm4 - movbeq 8(%r14),%r13 - vaesenc %xmm1,%xmm13,%xmm13 - movbeq 0(%r14),%r12 - vaesenc %xmm1,%xmm14,%xmm14 - vmovups 160-128(%rcx),%xmm1 - cmpl $11,%ebp - jb .Lenc_tail - - vaesenc %xmm15,%xmm9,%xmm9 - vaesenc %xmm15,%xmm10,%xmm10 - vaesenc %xmm15,%xmm11,%xmm11 - vaesenc %xmm15,%xmm12,%xmm12 - vaesenc %xmm15,%xmm13,%xmm13 - vaesenc %xmm15,%xmm14,%xmm14 - - vaesenc %xmm1,%xmm9,%xmm9 - vaesenc %xmm1,%xmm10,%xmm10 - vaesenc %xmm1,%xmm11,%xmm11 - vaesenc %xmm1,%xmm12,%xmm12 - vaesenc %xmm1,%xmm13,%xmm13 - vmovups 176-128(%rcx),%xmm15 - vaesenc %xmm1,%xmm14,%xmm14 - vmovups 192-128(%rcx),%xmm1 - je .Lenc_tail - - vaesenc %xmm15,%xmm9,%xmm9 - vaesenc %xmm15,%xmm10,%xmm10 - vaesenc %xmm15,%xmm11,%xmm11 - vaesenc %xmm15,%xmm12,%xmm12 - vaesenc %xmm15,%xmm13,%xmm13 - vaesenc %xmm15,%xmm14,%xmm14 - - vaesenc %xmm1,%xmm9,%xmm9 - vaesenc %xmm1,%xmm10,%xmm10 - vaesenc %xmm1,%xmm11,%xmm11 - vaesenc %xmm1,%xmm12,%xmm12 - vaesenc %xmm1,%xmm13,%xmm13 - vmovups 208-128(%rcx),%xmm15 - vaesenc %xmm1,%xmm14,%xmm14 - vmovups 224-128(%rcx),%xmm1 - jmp .Lenc_tail - -.align 32 -.Lhandle_ctr32: - vmovdqu (%r11),%xmm0 - vpshufb %xmm0,%xmm1,%xmm6 - vmovdqu 48(%r11),%xmm5 - 
vpaddd 64(%r11),%xmm6,%xmm10 - vpaddd %xmm5,%xmm6,%xmm11 - vmovdqu 0-32(%r9),%xmm3 - vpaddd %xmm5,%xmm10,%xmm12 - vpshufb %xmm0,%xmm10,%xmm10 - vpaddd %xmm5,%xmm11,%xmm13 - vpshufb %xmm0,%xmm11,%xmm11 - vpxor %xmm15,%xmm10,%xmm10 - vpaddd %xmm5,%xmm12,%xmm14 - vpshufb %xmm0,%xmm12,%xmm12 - vpxor %xmm15,%xmm11,%xmm11 - vpaddd %xmm5,%xmm13,%xmm1 - vpshufb %xmm0,%xmm13,%xmm13 - vpshufb %xmm0,%xmm14,%xmm14 - vpshufb %xmm0,%xmm1,%xmm1 - jmp .Lresume_ctr32 - -.align 32 -.Lenc_tail: - vaesenc %xmm15,%xmm9,%xmm9 - vmovdqu %xmm7,16+8(%rsp) - vpalignr $8,%xmm4,%xmm4,%xmm8 - vaesenc %xmm15,%xmm10,%xmm10 - vpclmulqdq $0x10,%xmm3,%xmm4,%xmm4 - vpxor 0(%rdi),%xmm1,%xmm2 - vaesenc %xmm15,%xmm11,%xmm11 - vpxor 16(%rdi),%xmm1,%xmm0 - vaesenc %xmm15,%xmm12,%xmm12 - vpxor 32(%rdi),%xmm1,%xmm5 - vaesenc %xmm15,%xmm13,%xmm13 - vpxor 48(%rdi),%xmm1,%xmm6 - vaesenc %xmm15,%xmm14,%xmm14 - vpxor 64(%rdi),%xmm1,%xmm7 - vpxor 80(%rdi),%xmm1,%xmm3 - vmovdqu (%r8),%xmm1 - - vaesenclast %xmm2,%xmm9,%xmm9 - vmovdqu 32(%r11),%xmm2 - vaesenclast %xmm0,%xmm10,%xmm10 - vpaddb %xmm2,%xmm1,%xmm0 - movq %r13,112+8(%rsp) - leaq 96(%rdi),%rdi - vaesenclast %xmm5,%xmm11,%xmm11 - vpaddb %xmm2,%xmm0,%xmm5 - movq %r12,120+8(%rsp) - leaq 96(%rsi),%rsi - vmovdqu 0-128(%rcx),%xmm15 - vaesenclast %xmm6,%xmm12,%xmm12 - vpaddb %xmm2,%xmm5,%xmm6 - vaesenclast %xmm7,%xmm13,%xmm13 - vpaddb %xmm2,%xmm6,%xmm7 - vaesenclast %xmm3,%xmm14,%xmm14 - vpaddb %xmm2,%xmm7,%xmm3 - - addq $0x60,%r10 - subq $0x6,%rdx - jc .L6x_done - - vmovups %xmm9,-96(%rsi) - vpxor %xmm15,%xmm1,%xmm9 - vmovups %xmm10,-80(%rsi) - vmovdqa %xmm0,%xmm10 - vmovups %xmm11,-64(%rsi) - vmovdqa %xmm5,%xmm11 - vmovups %xmm12,-48(%rsi) - vmovdqa %xmm6,%xmm12 - vmovups %xmm13,-32(%rsi) - vmovdqa %xmm7,%xmm13 - vmovups %xmm14,-16(%rsi) - vmovdqa %xmm3,%xmm14 - vmovdqu 32+8(%rsp),%xmm7 - jmp .Loop6x - -.L6x_done: - vpxor 16+8(%rsp),%xmm8,%xmm8 - vpxor %xmm4,%xmm8,%xmm8 - +.globl aesni_gcm_encrypt +.type aesni_gcm_encrypt,@function +aesni_gcm_encrypt: +.cfi_startproc + xorl %eax,%eax .byte 0xf3,0xc3 -.size _aesni_ctr32_ghash_6x,.-_aesni_ctr32_ghash_6x +.cfi_endproc +.size aesni_gcm_encrypt,.-aesni_gcm_encrypt + .globl aesni_gcm_decrypt .type aesni_gcm_decrypt,@function -.align 32 aesni_gcm_decrypt: .cfi_startproc - xorq %r10,%r10 - cmpq $0x60,%rdx - jb .Lgcm_dec_abort - - leaq (%rsp),%rax -.cfi_def_cfa_register %rax - pushq %rbx -.cfi_offset %rbx,-16 - pushq %rbp -.cfi_offset %rbp,-24 - pushq %r12 -.cfi_offset %r12,-32 - pushq %r13 -.cfi_offset %r13,-40 - pushq %r14 -.cfi_offset %r14,-48 - pushq %r15 -.cfi_offset %r15,-56 - vzeroupper - - vmovdqu (%r8),%xmm1 - addq $-128,%rsp - movl 12(%r8),%ebx - leaq .Lbswap_mask(%rip),%r11 - leaq -128(%rcx),%r14 - movq $0xf80,%r15 - vmovdqu (%r9),%xmm8 - andq $-128,%rsp - vmovdqu (%r11),%xmm0 - leaq 128(%rcx),%rcx - leaq 32+32(%r9),%r9 - movl 240-128(%rcx),%ebp - vpshufb %xmm0,%xmm8,%xmm8 - - andq %r15,%r14 - andq %rsp,%r15 - subq %r14,%r15 - jc .Ldec_no_key_aliasing - cmpq $768,%r15 - jnc .Ldec_no_key_aliasing - subq %r15,%rsp -.Ldec_no_key_aliasing: - - vmovdqu 80(%rdi),%xmm7 - leaq (%rdi),%r14 - vmovdqu 64(%rdi),%xmm4 - leaq -192(%rdi,%rdx,1),%r15 - vmovdqu 48(%rdi),%xmm5 - shrq $4,%rdx - xorq %r10,%r10 - vmovdqu 32(%rdi),%xmm6 - vpshufb %xmm0,%xmm7,%xmm7 - vmovdqu 16(%rdi),%xmm2 - vpshufb %xmm0,%xmm4,%xmm4 - vmovdqu (%rdi),%xmm3 - vpshufb %xmm0,%xmm5,%xmm5 - vmovdqu %xmm4,48(%rsp) - vpshufb %xmm0,%xmm6,%xmm6 - vmovdqu %xmm5,64(%rsp) - vpshufb %xmm0,%xmm2,%xmm2 - vmovdqu %xmm6,80(%rsp) - vpshufb %xmm0,%xmm3,%xmm3 - vmovdqu %xmm2,96(%rsp) - 
vmovdqu %xmm3,112(%rsp) - - call _aesni_ctr32_ghash_6x - - vmovups %xmm9,-96(%rsi) - vmovups %xmm10,-80(%rsi) - vmovups %xmm11,-64(%rsi) - vmovups %xmm12,-48(%rsi) - vmovups %xmm13,-32(%rsi) - vmovups %xmm14,-16(%rsi) - - vpshufb (%r11),%xmm8,%xmm8 - vmovdqu %xmm8,-64(%r9) - - vzeroupper - movq -48(%rax),%r15 -.cfi_restore %r15 - movq -40(%rax),%r14 -.cfi_restore %r14 - movq -32(%rax),%r13 -.cfi_restore %r13 - movq -24(%rax),%r12 -.cfi_restore %r12 - movq -16(%rax),%rbp -.cfi_restore %rbp - movq -8(%rax),%rbx -.cfi_restore %rbx - leaq (%rax),%rsp -.cfi_def_cfa_register %rsp -.Lgcm_dec_abort: - movq %r10,%rax + xorl %eax,%eax .byte 0xf3,0xc3 .cfi_endproc .size aesni_gcm_decrypt,.-aesni_gcm_decrypt -.type _aesni_ctr32_6x,@function -.align 32 -_aesni_ctr32_6x: - vmovdqu 0-128(%rcx),%xmm4 - vmovdqu 32(%r11),%xmm2 - leaq -1(%rbp),%r13 - vmovups 16-128(%rcx),%xmm15 - leaq 32-128(%rcx),%r12 - vpxor %xmm4,%xmm1,%xmm9 - addl $100663296,%ebx - jc .Lhandle_ctr32_2 - vpaddb %xmm2,%xmm1,%xmm10 - vpaddb %xmm2,%xmm10,%xmm11 - vpxor %xmm4,%xmm10,%xmm10 - vpaddb %xmm2,%xmm11,%xmm12 - vpxor %xmm4,%xmm11,%xmm11 - vpaddb %xmm2,%xmm12,%xmm13 - vpxor %xmm4,%xmm12,%xmm12 - vpaddb %xmm2,%xmm13,%xmm14 - vpxor %xmm4,%xmm13,%xmm13 - vpaddb %xmm2,%xmm14,%xmm1 - vpxor %xmm4,%xmm14,%xmm14 - jmp .Loop_ctr32 - -.align 16 -.Loop_ctr32: - vaesenc %xmm15,%xmm9,%xmm9 - vaesenc %xmm15,%xmm10,%xmm10 - vaesenc %xmm15,%xmm11,%xmm11 - vaesenc %xmm15,%xmm12,%xmm12 - vaesenc %xmm15,%xmm13,%xmm13 - vaesenc %xmm15,%xmm14,%xmm14 - vmovups (%r12),%xmm15 - leaq 16(%r12),%r12 - decl %r13d - jnz .Loop_ctr32 - - vmovdqu (%r12),%xmm3 - vaesenc %xmm15,%xmm9,%xmm9 - vpxor 0(%rdi),%xmm3,%xmm4 - vaesenc %xmm15,%xmm10,%xmm10 - vpxor 16(%rdi),%xmm3,%xmm5 - vaesenc %xmm15,%xmm11,%xmm11 - vpxor 32(%rdi),%xmm3,%xmm6 - vaesenc %xmm15,%xmm12,%xmm12 - vpxor 48(%rdi),%xmm3,%xmm8 - vaesenc %xmm15,%xmm13,%xmm13 - vpxor 64(%rdi),%xmm3,%xmm2 - vaesenc %xmm15,%xmm14,%xmm14 - vpxor 80(%rdi),%xmm3,%xmm3 - leaq 96(%rdi),%rdi - - vaesenclast %xmm4,%xmm9,%xmm9 - vaesenclast %xmm5,%xmm10,%xmm10 - vaesenclast %xmm6,%xmm11,%xmm11 - vaesenclast %xmm8,%xmm12,%xmm12 - vaesenclast %xmm2,%xmm13,%xmm13 - vaesenclast %xmm3,%xmm14,%xmm14 - vmovups %xmm9,0(%rsi) - vmovups %xmm10,16(%rsi) - vmovups %xmm11,32(%rsi) - vmovups %xmm12,48(%rsi) - vmovups %xmm13,64(%rsi) - vmovups %xmm14,80(%rsi) - leaq 96(%rsi),%rsi - - .byte 0xf3,0xc3 -.align 32 -.Lhandle_ctr32_2: - vpshufb %xmm0,%xmm1,%xmm6 - vmovdqu 48(%r11),%xmm5 - vpaddd 64(%r11),%xmm6,%xmm10 - vpaddd %xmm5,%xmm6,%xmm11 - vpaddd %xmm5,%xmm10,%xmm12 - vpshufb %xmm0,%xmm10,%xmm10 - vpaddd %xmm5,%xmm11,%xmm13 - vpshufb %xmm0,%xmm11,%xmm11 - vpxor %xmm4,%xmm10,%xmm10 - vpaddd %xmm5,%xmm12,%xmm14 - vpshufb %xmm0,%xmm12,%xmm12 - vpxor %xmm4,%xmm11,%xmm11 - vpaddd %xmm5,%xmm13,%xmm1 - vpshufb %xmm0,%xmm13,%xmm13 - vpxor %xmm4,%xmm12,%xmm12 - vpshufb %xmm0,%xmm14,%xmm14 - vpxor %xmm4,%xmm13,%xmm13 - vpshufb %xmm0,%xmm1,%xmm1 - vpxor %xmm4,%xmm14,%xmm14 - jmp .Loop_ctr32 -.size _aesni_ctr32_6x,.-_aesni_ctr32_6x - -.globl aesni_gcm_encrypt -.type aesni_gcm_encrypt,@function -.align 32 -aesni_gcm_encrypt: -.cfi_startproc - xorq %r10,%r10 - cmpq $288,%rdx - jb .Lgcm_enc_abort - - leaq (%rsp),%rax -.cfi_def_cfa_register %rax - pushq %rbx -.cfi_offset %rbx,-16 - pushq %rbp -.cfi_offset %rbp,-24 - pushq %r12 -.cfi_offset %r12,-32 - pushq %r13 -.cfi_offset %r13,-40 - pushq %r14 -.cfi_offset %r14,-48 - pushq %r15 -.cfi_offset %r15,-56 - vzeroupper - - vmovdqu (%r8),%xmm1 - addq $-128,%rsp - movl 12(%r8),%ebx - leaq .Lbswap_mask(%rip),%r11 - 
leaq -128(%rcx),%r14 - movq $0xf80,%r15 - leaq 128(%rcx),%rcx - vmovdqu (%r11),%xmm0 - andq $-128,%rsp - movl 240-128(%rcx),%ebp - - andq %r15,%r14 - andq %rsp,%r15 - subq %r14,%r15 - jc .Lenc_no_key_aliasing - cmpq $768,%r15 - jnc .Lenc_no_key_aliasing - subq %r15,%rsp -.Lenc_no_key_aliasing: - - leaq (%rsi),%r14 - leaq -192(%rsi,%rdx,1),%r15 - shrq $4,%rdx - - call _aesni_ctr32_6x - vpshufb %xmm0,%xmm9,%xmm8 - vpshufb %xmm0,%xmm10,%xmm2 - vmovdqu %xmm8,112(%rsp) - vpshufb %xmm0,%xmm11,%xmm4 - vmovdqu %xmm2,96(%rsp) - vpshufb %xmm0,%xmm12,%xmm5 - vmovdqu %xmm4,80(%rsp) - vpshufb %xmm0,%xmm13,%xmm6 - vmovdqu %xmm5,64(%rsp) - vpshufb %xmm0,%xmm14,%xmm7 - vmovdqu %xmm6,48(%rsp) - - call _aesni_ctr32_6x - - vmovdqu (%r9),%xmm8 - leaq 32+32(%r9),%r9 - subq $12,%rdx - movq $192,%r10 - vpshufb %xmm0,%xmm8,%xmm8 - - call _aesni_ctr32_ghash_6x - vmovdqu 32(%rsp),%xmm7 - vmovdqu (%r11),%xmm0 - vmovdqu 0-32(%r9),%xmm3 - vpunpckhqdq %xmm7,%xmm7,%xmm1 - vmovdqu 32-32(%r9),%xmm15 - vmovups %xmm9,-96(%rsi) - vpshufb %xmm0,%xmm9,%xmm9 - vpxor %xmm7,%xmm1,%xmm1 - vmovups %xmm10,-80(%rsi) - vpshufb %xmm0,%xmm10,%xmm10 - vmovups %xmm11,-64(%rsi) - vpshufb %xmm0,%xmm11,%xmm11 - vmovups %xmm12,-48(%rsi) - vpshufb %xmm0,%xmm12,%xmm12 - vmovups %xmm13,-32(%rsi) - vpshufb %xmm0,%xmm13,%xmm13 - vmovups %xmm14,-16(%rsi) - vpshufb %xmm0,%xmm14,%xmm14 - vmovdqu %xmm9,16(%rsp) - vmovdqu 48(%rsp),%xmm6 - vmovdqu 16-32(%r9),%xmm0 - vpunpckhqdq %xmm6,%xmm6,%xmm2 - vpclmulqdq $0x00,%xmm3,%xmm7,%xmm5 - vpxor %xmm6,%xmm2,%xmm2 - vpclmulqdq $0x11,%xmm3,%xmm7,%xmm7 - vpclmulqdq $0x00,%xmm15,%xmm1,%xmm1 - - vmovdqu 64(%rsp),%xmm9 - vpclmulqdq $0x00,%xmm0,%xmm6,%xmm4 - vmovdqu 48-32(%r9),%xmm3 - vpxor %xmm5,%xmm4,%xmm4 - vpunpckhqdq %xmm9,%xmm9,%xmm5 - vpclmulqdq $0x11,%xmm0,%xmm6,%xmm6 - vpxor %xmm9,%xmm5,%xmm5 - vpxor %xmm7,%xmm6,%xmm6 - vpclmulqdq $0x10,%xmm15,%xmm2,%xmm2 - vmovdqu 80-32(%r9),%xmm15 - vpxor %xmm1,%xmm2,%xmm2 - - vmovdqu 80(%rsp),%xmm1 - vpclmulqdq $0x00,%xmm3,%xmm9,%xmm7 - vmovdqu 64-32(%r9),%xmm0 - vpxor %xmm4,%xmm7,%xmm7 - vpunpckhqdq %xmm1,%xmm1,%xmm4 - vpclmulqdq $0x11,%xmm3,%xmm9,%xmm9 - vpxor %xmm1,%xmm4,%xmm4 - vpxor %xmm6,%xmm9,%xmm9 - vpclmulqdq $0x00,%xmm15,%xmm5,%xmm5 - vpxor %xmm2,%xmm5,%xmm5 - - vmovdqu 96(%rsp),%xmm2 - vpclmulqdq $0x00,%xmm0,%xmm1,%xmm6 - vmovdqu 96-32(%r9),%xmm3 - vpxor %xmm7,%xmm6,%xmm6 - vpunpckhqdq %xmm2,%xmm2,%xmm7 - vpclmulqdq $0x11,%xmm0,%xmm1,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - vpxor %xmm9,%xmm1,%xmm1 - vpclmulqdq $0x10,%xmm15,%xmm4,%xmm4 - vmovdqu 128-32(%r9),%xmm15 - vpxor %xmm5,%xmm4,%xmm4 - - vpxor 112(%rsp),%xmm8,%xmm8 - vpclmulqdq $0x00,%xmm3,%xmm2,%xmm5 - vmovdqu 112-32(%r9),%xmm0 - vpunpckhqdq %xmm8,%xmm8,%xmm9 - vpxor %xmm6,%xmm5,%xmm5 - vpclmulqdq $0x11,%xmm3,%xmm2,%xmm2 - vpxor %xmm8,%xmm9,%xmm9 - vpxor %xmm1,%xmm2,%xmm2 - vpclmulqdq $0x00,%xmm15,%xmm7,%xmm7 - vpxor %xmm4,%xmm7,%xmm4 - - vpclmulqdq $0x00,%xmm0,%xmm8,%xmm6 - vmovdqu 0-32(%r9),%xmm3 - vpunpckhqdq %xmm14,%xmm14,%xmm1 - vpclmulqdq $0x11,%xmm0,%xmm8,%xmm8 - vpxor %xmm14,%xmm1,%xmm1 - vpxor %xmm5,%xmm6,%xmm5 - vpclmulqdq $0x10,%xmm15,%xmm9,%xmm9 - vmovdqu 32-32(%r9),%xmm15 - vpxor %xmm2,%xmm8,%xmm7 - vpxor %xmm4,%xmm9,%xmm6 - - vmovdqu 16-32(%r9),%xmm0 - vpxor %xmm5,%xmm7,%xmm9 - vpclmulqdq $0x00,%xmm3,%xmm14,%xmm4 - vpxor %xmm9,%xmm6,%xmm6 - vpunpckhqdq %xmm13,%xmm13,%xmm2 - vpclmulqdq $0x11,%xmm3,%xmm14,%xmm14 - vpxor %xmm13,%xmm2,%xmm2 - vpslldq $8,%xmm6,%xmm9 - vpclmulqdq $0x00,%xmm15,%xmm1,%xmm1 - vpxor %xmm9,%xmm5,%xmm8 - vpsrldq $8,%xmm6,%xmm6 - vpxor %xmm6,%xmm7,%xmm7 - - vpclmulqdq 
$0x00,%xmm0,%xmm13,%xmm5 - vmovdqu 48-32(%r9),%xmm3 - vpxor %xmm4,%xmm5,%xmm5 - vpunpckhqdq %xmm12,%xmm12,%xmm9 - vpclmulqdq $0x11,%xmm0,%xmm13,%xmm13 - vpxor %xmm12,%xmm9,%xmm9 - vpxor %xmm14,%xmm13,%xmm13 - vpalignr $8,%xmm8,%xmm8,%xmm14 - vpclmulqdq $0x10,%xmm15,%xmm2,%xmm2 - vmovdqu 80-32(%r9),%xmm15 - vpxor %xmm1,%xmm2,%xmm2 - - vpclmulqdq $0x00,%xmm3,%xmm12,%xmm4 - vmovdqu 64-32(%r9),%xmm0 - vpxor %xmm5,%xmm4,%xmm4 - vpunpckhqdq %xmm11,%xmm11,%xmm1 - vpclmulqdq $0x11,%xmm3,%xmm12,%xmm12 - vpxor %xmm11,%xmm1,%xmm1 - vpxor %xmm13,%xmm12,%xmm12 - vxorps 16(%rsp),%xmm7,%xmm7 - vpclmulqdq $0x00,%xmm15,%xmm9,%xmm9 - vpxor %xmm2,%xmm9,%xmm9 - - vpclmulqdq $0x10,16(%r11),%xmm8,%xmm8 - vxorps %xmm14,%xmm8,%xmm8 - - vpclmulqdq $0x00,%xmm0,%xmm11,%xmm5 - vmovdqu 96-32(%r9),%xmm3 - vpxor %xmm4,%xmm5,%xmm5 - vpunpckhqdq %xmm10,%xmm10,%xmm2 - vpclmulqdq $0x11,%xmm0,%xmm11,%xmm11 - vpxor %xmm10,%xmm2,%xmm2 - vpalignr $8,%xmm8,%xmm8,%xmm14 - vpxor %xmm12,%xmm11,%xmm11 - vpclmulqdq $0x10,%xmm15,%xmm1,%xmm1 - vmovdqu 128-32(%r9),%xmm15 - vpxor %xmm9,%xmm1,%xmm1 - - vxorps %xmm7,%xmm14,%xmm14 - vpclmulqdq $0x10,16(%r11),%xmm8,%xmm8 - vxorps %xmm14,%xmm8,%xmm8 - - vpclmulqdq $0x00,%xmm3,%xmm10,%xmm4 - vmovdqu 112-32(%r9),%xmm0 - vpxor %xmm5,%xmm4,%xmm4 - vpunpckhqdq %xmm8,%xmm8,%xmm9 - vpclmulqdq $0x11,%xmm3,%xmm10,%xmm10 - vpxor %xmm8,%xmm9,%xmm9 - vpxor %xmm11,%xmm10,%xmm10 - vpclmulqdq $0x00,%xmm15,%xmm2,%xmm2 - vpxor %xmm1,%xmm2,%xmm2 - - vpclmulqdq $0x00,%xmm0,%xmm8,%xmm5 - vpclmulqdq $0x11,%xmm0,%xmm8,%xmm7 - vpxor %xmm4,%xmm5,%xmm5 - vpclmulqdq $0x10,%xmm15,%xmm9,%xmm6 - vpxor %xmm10,%xmm7,%xmm7 - vpxor %xmm2,%xmm6,%xmm6 - - vpxor %xmm5,%xmm7,%xmm4 - vpxor %xmm4,%xmm6,%xmm6 - vpslldq $8,%xmm6,%xmm1 - vmovdqu 16(%r11),%xmm3 - vpsrldq $8,%xmm6,%xmm6 - vpxor %xmm1,%xmm5,%xmm8 - vpxor %xmm6,%xmm7,%xmm7 - - vpalignr $8,%xmm8,%xmm8,%xmm2 - vpclmulqdq $0x10,%xmm3,%xmm8,%xmm8 - vpxor %xmm2,%xmm8,%xmm8 - - vpalignr $8,%xmm8,%xmm8,%xmm2 - vpclmulqdq $0x10,%xmm3,%xmm8,%xmm8 - vpxor %xmm7,%xmm2,%xmm2 - vpxor %xmm2,%xmm8,%xmm8 - vpshufb (%r11),%xmm8,%xmm8 - vmovdqu %xmm8,-64(%r9) - - vzeroupper - movq -48(%rax),%r15 -.cfi_restore %r15 - movq -40(%rax),%r14 -.cfi_restore %r14 - movq -32(%rax),%r13 -.cfi_restore %r13 - movq -24(%rax),%r12 -.cfi_restore %r12 - movq -16(%rax),%rbp -.cfi_restore %rbp - movq -8(%rax),%rbx -.cfi_restore %rbx - leaq (%rax),%rsp -.cfi_def_cfa_register %rsp -.Lgcm_enc_abort: - movq %r10,%rax - .byte 0xf3,0xc3 -.cfi_endproc -.size aesni_gcm_encrypt,.-aesni_gcm_encrypt -.align 64 -.Lbswap_mask: -.byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 -.Lpoly: -.byte 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2 -.Lone_msb: -.byte 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1 -.Ltwo_lsb: -.byte 2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 -.Lone_lsb: -.byte 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 -.byte 65,69,83,45,78,73,32,71,67,77,32,109,111,100,117,108,101,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 -.align 64 diff --git a/secure/lib/libcrypto/amd64/aesni-mb-x86_64.S b/secure/lib/libcrypto/amd64/aesni-mb-x86_64.S index 706c5c59d38d..de4bac9488f7 100644 --- a/secure/lib/libcrypto/amd64/aesni-mb-x86_64.S +++ b/secure/lib/libcrypto/amd64/aesni-mb-x86_64.S @@ -9,14 +9,6 @@ .align 32 aesni_multi_cbc_encrypt: .cfi_startproc - cmpl $2,%edx - jb .Lenc_non_avx - movl OPENSSL_ia32cap_P+4(%rip),%ecx - testl $268435456,%ecx - jnz _avx_cbc_enc_shortcut - jmp .Lenc_non_avx -.align 16 -.Lenc_non_avx: movq %rsp,%rax .cfi_def_cfa_register %rax 
pushq %rbx @@ -291,14 +283,6 @@ aesni_multi_cbc_encrypt: .align 32 aesni_multi_cbc_decrypt: .cfi_startproc - cmpl $2,%edx - jb .Ldec_non_avx - movl OPENSSL_ia32cap_P+4(%rip),%ecx - testl $268435456,%ecx - jnz _avx_cbc_dec_shortcut - jmp .Ldec_non_avx -.align 16 -.Ldec_non_avx: movq %rsp,%rax .cfi_def_cfa_register %rax pushq %rbx @@ -558,952 +542,3 @@ aesni_multi_cbc_decrypt: .byte 0xf3,0xc3 .cfi_endproc .size aesni_multi_cbc_decrypt,.-aesni_multi_cbc_decrypt -.type aesni_multi_cbc_encrypt_avx,@function -.align 32 -aesni_multi_cbc_encrypt_avx: -.cfi_startproc -_avx_cbc_enc_shortcut: - movq %rsp,%rax -.cfi_def_cfa_register %rax - pushq %rbx -.cfi_offset %rbx,-16 - pushq %rbp -.cfi_offset %rbp,-24 - pushq %r12 -.cfi_offset %r12,-32 - pushq %r13 -.cfi_offset %r13,-40 - pushq %r14 -.cfi_offset %r14,-48 - pushq %r15 -.cfi_offset %r15,-56 - - - - - - - - - subq $192,%rsp - andq $-128,%rsp - movq %rax,16(%rsp) -.cfi_escape 0x0f,0x05,0x77,0x10,0x06,0x23,0x08 - -.Lenc8x_body: - vzeroupper - vmovdqu (%rsi),%xmm15 - leaq 120(%rsi),%rsi - leaq 160(%rdi),%rdi - shrl $1,%edx - -.Lenc8x_loop_grande: - - xorl %edx,%edx - movl -144(%rdi),%ecx - movq -160(%rdi),%r8 - cmpl %edx,%ecx - movq -152(%rdi),%rbx - cmovgl %ecx,%edx - testl %ecx,%ecx - vmovdqu -136(%rdi),%xmm2 - movl %ecx,32(%rsp) - cmovleq %rsp,%r8 - subq %r8,%rbx - movq %rbx,64(%rsp) - movl -104(%rdi),%ecx - movq -120(%rdi),%r9 - cmpl %edx,%ecx - movq -112(%rdi),%rbp - cmovgl %ecx,%edx - testl %ecx,%ecx - vmovdqu -96(%rdi),%xmm3 - movl %ecx,36(%rsp) - cmovleq %rsp,%r9 - subq %r9,%rbp - movq %rbp,72(%rsp) - movl -64(%rdi),%ecx - movq -80(%rdi),%r10 - cmpl %edx,%ecx - movq -72(%rdi),%rbp - cmovgl %ecx,%edx - testl %ecx,%ecx - vmovdqu -56(%rdi),%xmm4 - movl %ecx,40(%rsp) - cmovleq %rsp,%r10 - subq %r10,%rbp - movq %rbp,80(%rsp) - movl -24(%rdi),%ecx - movq -40(%rdi),%r11 - cmpl %edx,%ecx - movq -32(%rdi),%rbp - cmovgl %ecx,%edx - testl %ecx,%ecx - vmovdqu -16(%rdi),%xmm5 - movl %ecx,44(%rsp) - cmovleq %rsp,%r11 - subq %r11,%rbp - movq %rbp,88(%rsp) - movl 16(%rdi),%ecx - movq 0(%rdi),%r12 - cmpl %edx,%ecx - movq 8(%rdi),%rbp - cmovgl %ecx,%edx - testl %ecx,%ecx - vmovdqu 24(%rdi),%xmm6 - movl %ecx,48(%rsp) - cmovleq %rsp,%r12 - subq %r12,%rbp - movq %rbp,96(%rsp) - movl 56(%rdi),%ecx - movq 40(%rdi),%r13 - cmpl %edx,%ecx - movq 48(%rdi),%rbp - cmovgl %ecx,%edx - testl %ecx,%ecx - vmovdqu 64(%rdi),%xmm7 - movl %ecx,52(%rsp) - cmovleq %rsp,%r13 - subq %r13,%rbp - movq %rbp,104(%rsp) - movl 96(%rdi),%ecx - movq 80(%rdi),%r14 - cmpl %edx,%ecx - movq 88(%rdi),%rbp - cmovgl %ecx,%edx - testl %ecx,%ecx - vmovdqu 104(%rdi),%xmm8 - movl %ecx,56(%rsp) - cmovleq %rsp,%r14 - subq %r14,%rbp - movq %rbp,112(%rsp) - movl 136(%rdi),%ecx - movq 120(%rdi),%r15 - cmpl %edx,%ecx - movq 128(%rdi),%rbp - cmovgl %ecx,%edx - testl %ecx,%ecx - vmovdqu 144(%rdi),%xmm9 - movl %ecx,60(%rsp) - cmovleq %rsp,%r15 - subq %r15,%rbp - movq %rbp,120(%rsp) - testl %edx,%edx - jz .Lenc8x_done - - vmovups 16-120(%rsi),%xmm1 - vmovups 32-120(%rsi),%xmm0 - movl 240-120(%rsi),%eax - - vpxor (%r8),%xmm15,%xmm10 - leaq 128(%rsp),%rbp - vpxor (%r9),%xmm15,%xmm11 - vpxor (%r10),%xmm15,%xmm12 - vpxor (%r11),%xmm15,%xmm13 - vpxor %xmm10,%xmm2,%xmm2 - vpxor (%r12),%xmm15,%xmm10 - vpxor %xmm11,%xmm3,%xmm3 - vpxor (%r13),%xmm15,%xmm11 - vpxor %xmm12,%xmm4,%xmm4 - vpxor (%r14),%xmm15,%xmm12 - vpxor %xmm13,%xmm5,%xmm5 - vpxor (%r15),%xmm15,%xmm13 - vpxor %xmm10,%xmm6,%xmm6 - movl $1,%ecx - vpxor %xmm11,%xmm7,%xmm7 - vpxor %xmm12,%xmm8,%xmm8 - vpxor %xmm13,%xmm9,%xmm9 - jmp .Loop_enc8x - -.align 32 
-.Loop_enc8x: - vaesenc %xmm1,%xmm2,%xmm2 - cmpl 32+0(%rsp),%ecx - vaesenc %xmm1,%xmm3,%xmm3 - prefetcht0 31(%r8) - vaesenc %xmm1,%xmm4,%xmm4 - vaesenc %xmm1,%xmm5,%xmm5 - leaq (%r8,%rbx,1),%rbx - cmovgeq %rsp,%r8 - vaesenc %xmm1,%xmm6,%xmm6 - cmovgq %rsp,%rbx - vaesenc %xmm1,%xmm7,%xmm7 - subq %r8,%rbx - vaesenc %xmm1,%xmm8,%xmm8 - vpxor 16(%r8),%xmm15,%xmm10 - movq %rbx,64+0(%rsp) - vaesenc %xmm1,%xmm9,%xmm9 - vmovups -72(%rsi),%xmm1 - leaq 16(%r8,%rbx,1),%r8 - vmovdqu %xmm10,0(%rbp) - vaesenc %xmm0,%xmm2,%xmm2 - cmpl 32+4(%rsp),%ecx - movq 64+8(%rsp),%rbx - vaesenc %xmm0,%xmm3,%xmm3 - prefetcht0 31(%r9) - vaesenc %xmm0,%xmm4,%xmm4 - vaesenc %xmm0,%xmm5,%xmm5 - leaq (%r9,%rbx,1),%rbx - cmovgeq %rsp,%r9 - vaesenc %xmm0,%xmm6,%xmm6 - cmovgq %rsp,%rbx - vaesenc %xmm0,%xmm7,%xmm7 - subq %r9,%rbx - vaesenc %xmm0,%xmm8,%xmm8 - vpxor 16(%r9),%xmm15,%xmm11 - movq %rbx,64+8(%rsp) - vaesenc %xmm0,%xmm9,%xmm9 - vmovups -56(%rsi),%xmm0 - leaq 16(%r9,%rbx,1),%r9 - vmovdqu %xmm11,16(%rbp) - vaesenc %xmm1,%xmm2,%xmm2 - cmpl 32+8(%rsp),%ecx - movq 64+16(%rsp),%rbx - vaesenc %xmm1,%xmm3,%xmm3 - prefetcht0 31(%r10) - vaesenc %xmm1,%xmm4,%xmm4 - prefetcht0 15(%r8) - vaesenc %xmm1,%xmm5,%xmm5 - leaq (%r10,%rbx,1),%rbx - cmovgeq %rsp,%r10 - vaesenc %xmm1,%xmm6,%xmm6 - cmovgq %rsp,%rbx - vaesenc %xmm1,%xmm7,%xmm7 - subq %r10,%rbx - vaesenc %xmm1,%xmm8,%xmm8 - vpxor 16(%r10),%xmm15,%xmm12 - movq %rbx,64+16(%rsp) - vaesenc %xmm1,%xmm9,%xmm9 - vmovups -40(%rsi),%xmm1 - leaq 16(%r10,%rbx,1),%r10 - vmovdqu %xmm12,32(%rbp) - vaesenc %xmm0,%xmm2,%xmm2 - cmpl 32+12(%rsp),%ecx - movq 64+24(%rsp),%rbx - vaesenc %xmm0,%xmm3,%xmm3 - prefetcht0 31(%r11) - vaesenc %xmm0,%xmm4,%xmm4 - prefetcht0 15(%r9) - vaesenc %xmm0,%xmm5,%xmm5 - leaq (%r11,%rbx,1),%rbx - cmovgeq %rsp,%r11 - vaesenc %xmm0,%xmm6,%xmm6 - cmovgq %rsp,%rbx - vaesenc %xmm0,%xmm7,%xmm7 - subq %r11,%rbx - vaesenc %xmm0,%xmm8,%xmm8 - vpxor 16(%r11),%xmm15,%xmm13 - movq %rbx,64+24(%rsp) - vaesenc %xmm0,%xmm9,%xmm9 - vmovups -24(%rsi),%xmm0 - leaq 16(%r11,%rbx,1),%r11 - vmovdqu %xmm13,48(%rbp) - vaesenc %xmm1,%xmm2,%xmm2 - cmpl 32+16(%rsp),%ecx - movq 64+32(%rsp),%rbx - vaesenc %xmm1,%xmm3,%xmm3 - prefetcht0 31(%r12) - vaesenc %xmm1,%xmm4,%xmm4 - prefetcht0 15(%r10) - vaesenc %xmm1,%xmm5,%xmm5 - leaq (%r12,%rbx,1),%rbx - cmovgeq %rsp,%r12 - vaesenc %xmm1,%xmm6,%xmm6 - cmovgq %rsp,%rbx - vaesenc %xmm1,%xmm7,%xmm7 - subq %r12,%rbx - vaesenc %xmm1,%xmm8,%xmm8 - vpxor 16(%r12),%xmm15,%xmm10 - movq %rbx,64+32(%rsp) - vaesenc %xmm1,%xmm9,%xmm9 - vmovups -8(%rsi),%xmm1 - leaq 16(%r12,%rbx,1),%r12 - vaesenc %xmm0,%xmm2,%xmm2 - cmpl 32+20(%rsp),%ecx - movq 64+40(%rsp),%rbx - vaesenc %xmm0,%xmm3,%xmm3 - prefetcht0 31(%r13) - vaesenc %xmm0,%xmm4,%xmm4 - prefetcht0 15(%r11) - vaesenc %xmm0,%xmm5,%xmm5 - leaq (%rbx,%r13,1),%rbx - cmovgeq %rsp,%r13 - vaesenc %xmm0,%xmm6,%xmm6 - cmovgq %rsp,%rbx - vaesenc %xmm0,%xmm7,%xmm7 - subq %r13,%rbx - vaesenc %xmm0,%xmm8,%xmm8 - vpxor 16(%r13),%xmm15,%xmm11 - movq %rbx,64+40(%rsp) - vaesenc %xmm0,%xmm9,%xmm9 - vmovups 8(%rsi),%xmm0 - leaq 16(%r13,%rbx,1),%r13 - vaesenc %xmm1,%xmm2,%xmm2 - cmpl 32+24(%rsp),%ecx - movq 64+48(%rsp),%rbx - vaesenc %xmm1,%xmm3,%xmm3 - prefetcht0 31(%r14) - vaesenc %xmm1,%xmm4,%xmm4 - prefetcht0 15(%r12) - vaesenc %xmm1,%xmm5,%xmm5 - leaq (%r14,%rbx,1),%rbx - cmovgeq %rsp,%r14 - vaesenc %xmm1,%xmm6,%xmm6 - cmovgq %rsp,%rbx - vaesenc %xmm1,%xmm7,%xmm7 - subq %r14,%rbx - vaesenc %xmm1,%xmm8,%xmm8 - vpxor 16(%r14),%xmm15,%xmm12 - movq %rbx,64+48(%rsp) - vaesenc %xmm1,%xmm9,%xmm9 - vmovups 24(%rsi),%xmm1 - leaq 
16(%r14,%rbx,1),%r14 - vaesenc %xmm0,%xmm2,%xmm2 - cmpl 32+28(%rsp),%ecx - movq 64+56(%rsp),%rbx - vaesenc %xmm0,%xmm3,%xmm3 - prefetcht0 31(%r15) - vaesenc %xmm0,%xmm4,%xmm4 - prefetcht0 15(%r13) - vaesenc %xmm0,%xmm5,%xmm5 - leaq (%r15,%rbx,1),%rbx - cmovgeq %rsp,%r15 - vaesenc %xmm0,%xmm6,%xmm6 - cmovgq %rsp,%rbx - vaesenc %xmm0,%xmm7,%xmm7 - subq %r15,%rbx - vaesenc %xmm0,%xmm8,%xmm8 - vpxor 16(%r15),%xmm15,%xmm13 - movq %rbx,64+56(%rsp) - vaesenc %xmm0,%xmm9,%xmm9 - vmovups 40(%rsi),%xmm0 - leaq 16(%r15,%rbx,1),%r15 - vmovdqu 32(%rsp),%xmm14 - prefetcht0 15(%r14) - prefetcht0 15(%r15) - cmpl $11,%eax - jb .Lenc8x_tail - - vaesenc %xmm1,%xmm2,%xmm2 - vaesenc %xmm1,%xmm3,%xmm3 - vaesenc %xmm1,%xmm4,%xmm4 - vaesenc %xmm1,%xmm5,%xmm5 - vaesenc %xmm1,%xmm6,%xmm6 - vaesenc %xmm1,%xmm7,%xmm7 - vaesenc %xmm1,%xmm8,%xmm8 - vaesenc %xmm1,%xmm9,%xmm9 - vmovups 176-120(%rsi),%xmm1 - - vaesenc %xmm0,%xmm2,%xmm2 - vaesenc %xmm0,%xmm3,%xmm3 - vaesenc %xmm0,%xmm4,%xmm4 - vaesenc %xmm0,%xmm5,%xmm5 - vaesenc %xmm0,%xmm6,%xmm6 - vaesenc %xmm0,%xmm7,%xmm7 - vaesenc %xmm0,%xmm8,%xmm8 - vaesenc %xmm0,%xmm9,%xmm9 - vmovups 192-120(%rsi),%xmm0 - je .Lenc8x_tail - - vaesenc %xmm1,%xmm2,%xmm2 - vaesenc %xmm1,%xmm3,%xmm3 - vaesenc %xmm1,%xmm4,%xmm4 - vaesenc %xmm1,%xmm5,%xmm5 - vaesenc %xmm1,%xmm6,%xmm6 - vaesenc %xmm1,%xmm7,%xmm7 - vaesenc %xmm1,%xmm8,%xmm8 - vaesenc %xmm1,%xmm9,%xmm9 - vmovups 208-120(%rsi),%xmm1 - - vaesenc %xmm0,%xmm2,%xmm2 - vaesenc %xmm0,%xmm3,%xmm3 - vaesenc %xmm0,%xmm4,%xmm4 - vaesenc %xmm0,%xmm5,%xmm5 - vaesenc %xmm0,%xmm6,%xmm6 - vaesenc %xmm0,%xmm7,%xmm7 - vaesenc %xmm0,%xmm8,%xmm8 - vaesenc %xmm0,%xmm9,%xmm9 - vmovups 224-120(%rsi),%xmm0 - -.Lenc8x_tail: - vaesenc %xmm1,%xmm2,%xmm2 - vpxor %xmm15,%xmm15,%xmm15 - vaesenc %xmm1,%xmm3,%xmm3 - vaesenc %xmm1,%xmm4,%xmm4 - vpcmpgtd %xmm15,%xmm14,%xmm15 - vaesenc %xmm1,%xmm5,%xmm5 - vaesenc %xmm1,%xmm6,%xmm6 - vpaddd %xmm14,%xmm15,%xmm15 - vmovdqu 48(%rsp),%xmm14 - vaesenc %xmm1,%xmm7,%xmm7 - movq 64(%rsp),%rbx - vaesenc %xmm1,%xmm8,%xmm8 - vaesenc %xmm1,%xmm9,%xmm9 - vmovups 16-120(%rsi),%xmm1 - - vaesenclast %xmm0,%xmm2,%xmm2 - vmovdqa %xmm15,32(%rsp) - vpxor %xmm15,%xmm15,%xmm15 - vaesenclast %xmm0,%xmm3,%xmm3 - vaesenclast %xmm0,%xmm4,%xmm4 - vpcmpgtd %xmm15,%xmm14,%xmm15 - vaesenclast %xmm0,%xmm5,%xmm5 - vaesenclast %xmm0,%xmm6,%xmm6 - vpaddd %xmm15,%xmm14,%xmm14 - vmovdqu -120(%rsi),%xmm15 - vaesenclast %xmm0,%xmm7,%xmm7 - vaesenclast %xmm0,%xmm8,%xmm8 - vmovdqa %xmm14,48(%rsp) - vaesenclast %xmm0,%xmm9,%xmm9 - vmovups 32-120(%rsi),%xmm0 - - vmovups %xmm2,-16(%r8) - subq %rbx,%r8 - vpxor 0(%rbp),%xmm2,%xmm2 - vmovups %xmm3,-16(%r9) - subq 72(%rsp),%r9 - vpxor 16(%rbp),%xmm3,%xmm3 - vmovups %xmm4,-16(%r10) - subq 80(%rsp),%r10 - vpxor 32(%rbp),%xmm4,%xmm4 - vmovups %xmm5,-16(%r11) - subq 88(%rsp),%r11 - vpxor 48(%rbp),%xmm5,%xmm5 - vmovups %xmm6,-16(%r12) - subq 96(%rsp),%r12 - vpxor %xmm10,%xmm6,%xmm6 - vmovups %xmm7,-16(%r13) - subq 104(%rsp),%r13 - vpxor %xmm11,%xmm7,%xmm7 - vmovups %xmm8,-16(%r14) - subq 112(%rsp),%r14 - vpxor %xmm12,%xmm8,%xmm8 - vmovups %xmm9,-16(%r15) - subq 120(%rsp),%r15 - vpxor %xmm13,%xmm9,%xmm9 - - decl %edx - jnz .Loop_enc8x - - movq 16(%rsp),%rax -.cfi_def_cfa %rax,8 - - - - - -.Lenc8x_done: - vzeroupper - movq -48(%rax),%r15 -.cfi_restore %r15 - movq -40(%rax),%r14 -.cfi_restore %r14 - movq -32(%rax),%r13 -.cfi_restore %r13 - movq -24(%rax),%r12 -.cfi_restore %r12 - movq -16(%rax),%rbp -.cfi_restore %rbp - movq -8(%rax),%rbx -.cfi_restore %rbx - leaq (%rax),%rsp -.cfi_def_cfa_register %rsp 
-.Lenc8x_epilogue: - .byte 0xf3,0xc3 -.cfi_endproc -.size aesni_multi_cbc_encrypt_avx,.-aesni_multi_cbc_encrypt_avx - -.type aesni_multi_cbc_decrypt_avx,@function -.align 32 -aesni_multi_cbc_decrypt_avx: -.cfi_startproc -_avx_cbc_dec_shortcut: - movq %rsp,%rax -.cfi_def_cfa_register %rax - pushq %rbx -.cfi_offset %rbx,-16 - pushq %rbp -.cfi_offset %rbp,-24 - pushq %r12 -.cfi_offset %r12,-32 - pushq %r13 -.cfi_offset %r13,-40 - pushq %r14 -.cfi_offset %r14,-48 - pushq %r15 -.cfi_offset %r15,-56 - - - - - - - - - - subq $256,%rsp - andq $-256,%rsp - subq $192,%rsp - movq %rax,16(%rsp) -.cfi_escape 0x0f,0x05,0x77,0x10,0x06,0x23,0x08 - -.Ldec8x_body: - vzeroupper - vmovdqu (%rsi),%xmm15 - leaq 120(%rsi),%rsi - leaq 160(%rdi),%rdi - shrl $1,%edx - -.Ldec8x_loop_grande: - - xorl %edx,%edx - movl -144(%rdi),%ecx - movq -160(%rdi),%r8 - cmpl %edx,%ecx - movq -152(%rdi),%rbx - cmovgl %ecx,%edx - testl %ecx,%ecx - vmovdqu -136(%rdi),%xmm2 - movl %ecx,32(%rsp) - cmovleq %rsp,%r8 - subq %r8,%rbx - movq %rbx,64(%rsp) - vmovdqu %xmm2,192(%rsp) - movl -104(%rdi),%ecx - movq -120(%rdi),%r9 - cmpl %edx,%ecx - movq -112(%rdi),%rbp - cmovgl %ecx,%edx - testl %ecx,%ecx - vmovdqu -96(%rdi),%xmm3 - movl %ecx,36(%rsp) - cmovleq %rsp,%r9 - subq %r9,%rbp - movq %rbp,72(%rsp) - vmovdqu %xmm3,208(%rsp) - movl -64(%rdi),%ecx - movq -80(%rdi),%r10 - cmpl %edx,%ecx - movq -72(%rdi),%rbp - cmovgl %ecx,%edx - testl %ecx,%ecx - vmovdqu -56(%rdi),%xmm4 - movl %ecx,40(%rsp) - cmovleq %rsp,%r10 - subq %r10,%rbp - movq %rbp,80(%rsp) - vmovdqu %xmm4,224(%rsp) - movl -24(%rdi),%ecx - movq -40(%rdi),%r11 - cmpl %edx,%ecx - movq -32(%rdi),%rbp - cmovgl %ecx,%edx - testl %ecx,%ecx - vmovdqu -16(%rdi),%xmm5 - movl %ecx,44(%rsp) - cmovleq %rsp,%r11 - subq %r11,%rbp - movq %rbp,88(%rsp) - vmovdqu %xmm5,240(%rsp) - movl 16(%rdi),%ecx - movq 0(%rdi),%r12 - cmpl %edx,%ecx - movq 8(%rdi),%rbp - cmovgl %ecx,%edx - testl %ecx,%ecx - vmovdqu 24(%rdi),%xmm6 - movl %ecx,48(%rsp) - cmovleq %rsp,%r12 - subq %r12,%rbp - movq %rbp,96(%rsp) - vmovdqu %xmm6,256(%rsp) - movl 56(%rdi),%ecx - movq 40(%rdi),%r13 - cmpl %edx,%ecx - movq 48(%rdi),%rbp - cmovgl %ecx,%edx - testl %ecx,%ecx - vmovdqu 64(%rdi),%xmm7 - movl %ecx,52(%rsp) - cmovleq %rsp,%r13 - subq %r13,%rbp - movq %rbp,104(%rsp) - vmovdqu %xmm7,272(%rsp) - movl 96(%rdi),%ecx - movq 80(%rdi),%r14 - cmpl %edx,%ecx - movq 88(%rdi),%rbp - cmovgl %ecx,%edx - testl %ecx,%ecx - vmovdqu 104(%rdi),%xmm8 - movl %ecx,56(%rsp) - cmovleq %rsp,%r14 - subq %r14,%rbp - movq %rbp,112(%rsp) - vmovdqu %xmm8,288(%rsp) - movl 136(%rdi),%ecx - movq 120(%rdi),%r15 - cmpl %edx,%ecx - movq 128(%rdi),%rbp - cmovgl %ecx,%edx - testl %ecx,%ecx - vmovdqu 144(%rdi),%xmm9 - movl %ecx,60(%rsp) - cmovleq %rsp,%r15 - subq %r15,%rbp - movq %rbp,120(%rsp) - vmovdqu %xmm9,304(%rsp) - testl %edx,%edx - jz .Ldec8x_done - - vmovups 16-120(%rsi),%xmm1 - vmovups 32-120(%rsi),%xmm0 - movl 240-120(%rsi),%eax - leaq 192+128(%rsp),%rbp - - vmovdqu (%r8),%xmm2 - vmovdqu (%r9),%xmm3 - vmovdqu (%r10),%xmm4 - vmovdqu (%r11),%xmm5 - vmovdqu (%r12),%xmm6 - vmovdqu (%r13),%xmm7 - vmovdqu (%r14),%xmm8 - vmovdqu (%r15),%xmm9 - vmovdqu %xmm2,0(%rbp) - vpxor %xmm15,%xmm2,%xmm2 - vmovdqu %xmm3,16(%rbp) - vpxor %xmm15,%xmm3,%xmm3 - vmovdqu %xmm4,32(%rbp) - vpxor %xmm15,%xmm4,%xmm4 - vmovdqu %xmm5,48(%rbp) - vpxor %xmm15,%xmm5,%xmm5 - vmovdqu %xmm6,64(%rbp) - vpxor %xmm15,%xmm6,%xmm6 - vmovdqu %xmm7,80(%rbp) - vpxor %xmm15,%xmm7,%xmm7 - vmovdqu %xmm8,96(%rbp) - vpxor %xmm15,%xmm8,%xmm8 - vmovdqu %xmm9,112(%rbp) - vpxor %xmm15,%xmm9,%xmm9 - xorq 
$0x80,%rbp - movl $1,%ecx - jmp .Loop_dec8x - -.align 32 -.Loop_dec8x: - vaesdec %xmm1,%xmm2,%xmm2 - cmpl 32+0(%rsp),%ecx - vaesdec %xmm1,%xmm3,%xmm3 - prefetcht0 31(%r8) - vaesdec %xmm1,%xmm4,%xmm4 - vaesdec %xmm1,%xmm5,%xmm5 - leaq (%r8,%rbx,1),%rbx - cmovgeq %rsp,%r8 - vaesdec %xmm1,%xmm6,%xmm6 - cmovgq %rsp,%rbx - vaesdec %xmm1,%xmm7,%xmm7 - subq %r8,%rbx - vaesdec %xmm1,%xmm8,%xmm8 - vmovdqu 16(%r8),%xmm10 - movq %rbx,64+0(%rsp) - vaesdec %xmm1,%xmm9,%xmm9 - vmovups -72(%rsi),%xmm1 - leaq 16(%r8,%rbx,1),%r8 - vmovdqu %xmm10,128(%rsp) - vaesdec %xmm0,%xmm2,%xmm2 - cmpl 32+4(%rsp),%ecx - movq 64+8(%rsp),%rbx - vaesdec %xmm0,%xmm3,%xmm3 - prefetcht0 31(%r9) - vaesdec %xmm0,%xmm4,%xmm4 - vaesdec %xmm0,%xmm5,%xmm5 - leaq (%r9,%rbx,1),%rbx - cmovgeq %rsp,%r9 - vaesdec %xmm0,%xmm6,%xmm6 - cmovgq %rsp,%rbx - vaesdec %xmm0,%xmm7,%xmm7 - subq %r9,%rbx - vaesdec %xmm0,%xmm8,%xmm8 - vmovdqu 16(%r9),%xmm11 - movq %rbx,64+8(%rsp) - vaesdec %xmm0,%xmm9,%xmm9 - vmovups -56(%rsi),%xmm0 - leaq 16(%r9,%rbx,1),%r9 - vmovdqu %xmm11,144(%rsp) - vaesdec %xmm1,%xmm2,%xmm2 - cmpl 32+8(%rsp),%ecx - movq 64+16(%rsp),%rbx - vaesdec %xmm1,%xmm3,%xmm3 - prefetcht0 31(%r10) - vaesdec %xmm1,%xmm4,%xmm4 - prefetcht0 15(%r8) - vaesdec %xmm1,%xmm5,%xmm5 - leaq (%r10,%rbx,1),%rbx - cmovgeq %rsp,%r10 - vaesdec %xmm1,%xmm6,%xmm6 - cmovgq %rsp,%rbx - vaesdec %xmm1,%xmm7,%xmm7 - subq %r10,%rbx - vaesdec %xmm1,%xmm8,%xmm8 - vmovdqu 16(%r10),%xmm12 - movq %rbx,64+16(%rsp) - vaesdec %xmm1,%xmm9,%xmm9 - vmovups -40(%rsi),%xmm1 - leaq 16(%r10,%rbx,1),%r10 - vmovdqu %xmm12,160(%rsp) - vaesdec %xmm0,%xmm2,%xmm2 - cmpl 32+12(%rsp),%ecx - movq 64+24(%rsp),%rbx - vaesdec %xmm0,%xmm3,%xmm3 - prefetcht0 31(%r11) - vaesdec %xmm0,%xmm4,%xmm4 - prefetcht0 15(%r9) - vaesdec %xmm0,%xmm5,%xmm5 - leaq (%r11,%rbx,1),%rbx - cmovgeq %rsp,%r11 - vaesdec %xmm0,%xmm6,%xmm6 - cmovgq %rsp,%rbx - vaesdec %xmm0,%xmm7,%xmm7 - subq %r11,%rbx - vaesdec %xmm0,%xmm8,%xmm8 - vmovdqu 16(%r11),%xmm13 - movq %rbx,64+24(%rsp) - vaesdec %xmm0,%xmm9,%xmm9 - vmovups -24(%rsi),%xmm0 - leaq 16(%r11,%rbx,1),%r11 - vmovdqu %xmm13,176(%rsp) - vaesdec %xmm1,%xmm2,%xmm2 - cmpl 32+16(%rsp),%ecx - movq 64+32(%rsp),%rbx - vaesdec %xmm1,%xmm3,%xmm3 - prefetcht0 31(%r12) - vaesdec %xmm1,%xmm4,%xmm4 - prefetcht0 15(%r10) - vaesdec %xmm1,%xmm5,%xmm5 - leaq (%r12,%rbx,1),%rbx - cmovgeq %rsp,%r12 - vaesdec %xmm1,%xmm6,%xmm6 - cmovgq %rsp,%rbx - vaesdec %xmm1,%xmm7,%xmm7 - subq %r12,%rbx - vaesdec %xmm1,%xmm8,%xmm8 - vmovdqu 16(%r12),%xmm10 - movq %rbx,64+32(%rsp) - vaesdec %xmm1,%xmm9,%xmm9 - vmovups -8(%rsi),%xmm1 - leaq 16(%r12,%rbx,1),%r12 - vaesdec %xmm0,%xmm2,%xmm2 - cmpl 32+20(%rsp),%ecx - movq 64+40(%rsp),%rbx - vaesdec %xmm0,%xmm3,%xmm3 - prefetcht0 31(%r13) - vaesdec %xmm0,%xmm4,%xmm4 - prefetcht0 15(%r11) - vaesdec %xmm0,%xmm5,%xmm5 - leaq (%rbx,%r13,1),%rbx - cmovgeq %rsp,%r13 - vaesdec %xmm0,%xmm6,%xmm6 - cmovgq %rsp,%rbx - vaesdec %xmm0,%xmm7,%xmm7 - subq %r13,%rbx - vaesdec %xmm0,%xmm8,%xmm8 - vmovdqu 16(%r13),%xmm11 - movq %rbx,64+40(%rsp) - vaesdec %xmm0,%xmm9,%xmm9 - vmovups 8(%rsi),%xmm0 - leaq 16(%r13,%rbx,1),%r13 - vaesdec %xmm1,%xmm2,%xmm2 - cmpl 32+24(%rsp),%ecx - movq 64+48(%rsp),%rbx - vaesdec %xmm1,%xmm3,%xmm3 - prefetcht0 31(%r14) - vaesdec %xmm1,%xmm4,%xmm4 - prefetcht0 15(%r12) - vaesdec %xmm1,%xmm5,%xmm5 - leaq (%r14,%rbx,1),%rbx - cmovgeq %rsp,%r14 - vaesdec %xmm1,%xmm6,%xmm6 - cmovgq %rsp,%rbx - vaesdec %xmm1,%xmm7,%xmm7 - subq %r14,%rbx - vaesdec %xmm1,%xmm8,%xmm8 - vmovdqu 16(%r14),%xmm12 - movq %rbx,64+48(%rsp) - vaesdec %xmm1,%xmm9,%xmm9 - 
vmovups 24(%rsi),%xmm1 - leaq 16(%r14,%rbx,1),%r14 - vaesdec %xmm0,%xmm2,%xmm2 - cmpl 32+28(%rsp),%ecx - movq 64+56(%rsp),%rbx - vaesdec %xmm0,%xmm3,%xmm3 - prefetcht0 31(%r15) - vaesdec %xmm0,%xmm4,%xmm4 - prefetcht0 15(%r13) - vaesdec %xmm0,%xmm5,%xmm5 - leaq (%r15,%rbx,1),%rbx - cmovgeq %rsp,%r15 - vaesdec %xmm0,%xmm6,%xmm6 - cmovgq %rsp,%rbx - vaesdec %xmm0,%xmm7,%xmm7 - subq %r15,%rbx - vaesdec %xmm0,%xmm8,%xmm8 - vmovdqu 16(%r15),%xmm13 - movq %rbx,64+56(%rsp) - vaesdec %xmm0,%xmm9,%xmm9 - vmovups 40(%rsi),%xmm0 - leaq 16(%r15,%rbx,1),%r15 - vmovdqu 32(%rsp),%xmm14 - prefetcht0 15(%r14) - prefetcht0 15(%r15) - cmpl $11,%eax - jb .Ldec8x_tail - - vaesdec %xmm1,%xmm2,%xmm2 - vaesdec %xmm1,%xmm3,%xmm3 - vaesdec %xmm1,%xmm4,%xmm4 - vaesdec %xmm1,%xmm5,%xmm5 - vaesdec %xmm1,%xmm6,%xmm6 - vaesdec %xmm1,%xmm7,%xmm7 - vaesdec %xmm1,%xmm8,%xmm8 - vaesdec %xmm1,%xmm9,%xmm9 - vmovups 176-120(%rsi),%xmm1 - - vaesdec %xmm0,%xmm2,%xmm2 - vaesdec %xmm0,%xmm3,%xmm3 - vaesdec %xmm0,%xmm4,%xmm4 - vaesdec %xmm0,%xmm5,%xmm5 - vaesdec %xmm0,%xmm6,%xmm6 - vaesdec %xmm0,%xmm7,%xmm7 - vaesdec %xmm0,%xmm8,%xmm8 - vaesdec %xmm0,%xmm9,%xmm9 - vmovups 192-120(%rsi),%xmm0 - je .Ldec8x_tail - - vaesdec %xmm1,%xmm2,%xmm2 - vaesdec %xmm1,%xmm3,%xmm3 - vaesdec %xmm1,%xmm4,%xmm4 - vaesdec %xmm1,%xmm5,%xmm5 - vaesdec %xmm1,%xmm6,%xmm6 - vaesdec %xmm1,%xmm7,%xmm7 - vaesdec %xmm1,%xmm8,%xmm8 - vaesdec %xmm1,%xmm9,%xmm9 - vmovups 208-120(%rsi),%xmm1 - - vaesdec %xmm0,%xmm2,%xmm2 - vaesdec %xmm0,%xmm3,%xmm3 - vaesdec %xmm0,%xmm4,%xmm4 - vaesdec %xmm0,%xmm5,%xmm5 - vaesdec %xmm0,%xmm6,%xmm6 - vaesdec %xmm0,%xmm7,%xmm7 - vaesdec %xmm0,%xmm8,%xmm8 - vaesdec %xmm0,%xmm9,%xmm9 - vmovups 224-120(%rsi),%xmm0 - -.Ldec8x_tail: - vaesdec %xmm1,%xmm2,%xmm2 - vpxor %xmm15,%xmm15,%xmm15 - vaesdec %xmm1,%xmm3,%xmm3 - vaesdec %xmm1,%xmm4,%xmm4 - vpcmpgtd %xmm15,%xmm14,%xmm15 - vaesdec %xmm1,%xmm5,%xmm5 - vaesdec %xmm1,%xmm6,%xmm6 - vpaddd %xmm14,%xmm15,%xmm15 - vmovdqu 48(%rsp),%xmm14 - vaesdec %xmm1,%xmm7,%xmm7 - movq 64(%rsp),%rbx - vaesdec %xmm1,%xmm8,%xmm8 - vaesdec %xmm1,%xmm9,%xmm9 - vmovups 16-120(%rsi),%xmm1 - - vaesdeclast %xmm0,%xmm2,%xmm2 - vmovdqa %xmm15,32(%rsp) - vpxor %xmm15,%xmm15,%xmm15 - vaesdeclast %xmm0,%xmm3,%xmm3 - vpxor 0(%rbp),%xmm2,%xmm2 - vaesdeclast %xmm0,%xmm4,%xmm4 - vpxor 16(%rbp),%xmm3,%xmm3 - vpcmpgtd %xmm15,%xmm14,%xmm15 - vaesdeclast %xmm0,%xmm5,%xmm5 - vpxor 32(%rbp),%xmm4,%xmm4 - vaesdeclast %xmm0,%xmm6,%xmm6 - vpxor 48(%rbp),%xmm5,%xmm5 - vpaddd %xmm15,%xmm14,%xmm14 - vmovdqu -120(%rsi),%xmm15 - vaesdeclast %xmm0,%xmm7,%xmm7 - vpxor 64(%rbp),%xmm6,%xmm6 - vaesdeclast %xmm0,%xmm8,%xmm8 - vpxor 80(%rbp),%xmm7,%xmm7 - vmovdqa %xmm14,48(%rsp) - vaesdeclast %xmm0,%xmm9,%xmm9 - vpxor 96(%rbp),%xmm8,%xmm8 - vmovups 32-120(%rsi),%xmm0 - - vmovups %xmm2,-16(%r8) - subq %rbx,%r8 - vmovdqu 128+0(%rsp),%xmm2 - vpxor 112(%rbp),%xmm9,%xmm9 - vmovups %xmm3,-16(%r9) - subq 72(%rsp),%r9 - vmovdqu %xmm2,0(%rbp) - vpxor %xmm15,%xmm2,%xmm2 - vmovdqu 128+16(%rsp),%xmm3 - vmovups %xmm4,-16(%r10) - subq 80(%rsp),%r10 - vmovdqu %xmm3,16(%rbp) - vpxor %xmm15,%xmm3,%xmm3 - vmovdqu 128+32(%rsp),%xmm4 - vmovups %xmm5,-16(%r11) - subq 88(%rsp),%r11 - vmovdqu %xmm4,32(%rbp) - vpxor %xmm15,%xmm4,%xmm4 - vmovdqu 128+48(%rsp),%xmm5 - vmovups %xmm6,-16(%r12) - subq 96(%rsp),%r12 - vmovdqu %xmm5,48(%rbp) - vpxor %xmm15,%xmm5,%xmm5 - vmovdqu %xmm10,64(%rbp) - vpxor %xmm10,%xmm15,%xmm6 - vmovups %xmm7,-16(%r13) - subq 104(%rsp),%r13 - vmovdqu %xmm11,80(%rbp) - vpxor %xmm11,%xmm15,%xmm7 - vmovups %xmm8,-16(%r14) - subq 112(%rsp),%r14 - 
vmovdqu %xmm12,96(%rbp) - vpxor %xmm12,%xmm15,%xmm8 - vmovups %xmm9,-16(%r15) - subq 120(%rsp),%r15 - vmovdqu %xmm13,112(%rbp) - vpxor %xmm13,%xmm15,%xmm9 - - xorq $128,%rbp - decl %edx - jnz .Loop_dec8x - - movq 16(%rsp),%rax -.cfi_def_cfa %rax,8 - - - - - -.Ldec8x_done: - vzeroupper - movq -48(%rax),%r15 -.cfi_restore %r15 - movq -40(%rax),%r14 -.cfi_restore %r14 - movq -32(%rax),%r13 -.cfi_restore %r13 - movq -24(%rax),%r12 -.cfi_restore %r12 - movq -16(%rax),%rbp -.cfi_restore %rbp - movq -8(%rax),%rbx -.cfi_restore %rbx - leaq (%rax),%rsp -.cfi_def_cfa_register %rsp -.Ldec8x_epilogue: - .byte 0xf3,0xc3 -.cfi_endproc -.size aesni_multi_cbc_decrypt_avx,.-aesni_multi_cbc_decrypt_avx diff --git a/secure/lib/libcrypto/amd64/aesni-sha1-x86_64.S b/secure/lib/libcrypto/amd64/aesni-sha1-x86_64.S index 92fa5bfd685d..294db310a06a 100644 --- a/secure/lib/libcrypto/amd64/aesni-sha1-x86_64.S +++ b/secure/lib/libcrypto/amd64/aesni-sha1-x86_64.S @@ -7,18 +7,15 @@ .type aesni_cbc_sha1_enc,@function .align 32 aesni_cbc_sha1_enc: +.cfi_startproc movl OPENSSL_ia32cap_P+0(%rip),%r10d movq OPENSSL_ia32cap_P+4(%rip),%r11 btq $61,%r11 jc aesni_cbc_sha1_enc_shaext - andl $268435456,%r11d - andl $1073741824,%r10d - orl %r11d,%r10d - cmpl $1342177280,%r10d - je aesni_cbc_sha1_enc_avx jmp aesni_cbc_sha1_enc_ssse3 .byte 0xf3,0xc3 +.cfi_endproc .size aesni_cbc_sha1_enc,.-aesni_cbc_sha1_enc .type aesni_cbc_sha1_enc_ssse3,@function .align 32 @@ -1397,1327 +1394,6 @@ aesni_cbc_sha1_enc_ssse3: .byte 0xf3,0xc3 .cfi_endproc .size aesni_cbc_sha1_enc_ssse3,.-aesni_cbc_sha1_enc_ssse3 -.type aesni_cbc_sha1_enc_avx,@function -.align 32 -aesni_cbc_sha1_enc_avx: -.cfi_startproc - movq 8(%rsp),%r10 - - - pushq %rbx -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbx,-16 - pushq %rbp -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbp,-24 - pushq %r12 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r12,-32 - pushq %r13 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r13,-40 - pushq %r14 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r14,-48 - pushq %r15 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r15,-56 - leaq -104(%rsp),%rsp -.cfi_adjust_cfa_offset 104 - - - vzeroall - movq %rdi,%r12 - movq %rsi,%r13 - movq %rdx,%r14 - leaq 112(%rcx),%r15 - vmovdqu (%r8),%xmm12 - movq %r8,88(%rsp) - shlq $6,%r14 - subq %r12,%r13 - movl 240-112(%r15),%r8d - addq %r10,%r14 - - leaq K_XX_XX(%rip),%r11 - movl 0(%r9),%eax - movl 4(%r9),%ebx - movl 8(%r9),%ecx - movl 12(%r9),%edx - movl %ebx,%esi - movl 16(%r9),%ebp - movl %ecx,%edi - xorl %edx,%edi - andl %edi,%esi - - vmovdqa 64(%r11),%xmm6 - vmovdqa 0(%r11),%xmm10 - vmovdqu 0(%r10),%xmm0 - vmovdqu 16(%r10),%xmm1 - vmovdqu 32(%r10),%xmm2 - vmovdqu 48(%r10),%xmm3 - vpshufb %xmm6,%xmm0,%xmm0 - addq $64,%r10 - vpshufb %xmm6,%xmm1,%xmm1 - vpshufb %xmm6,%xmm2,%xmm2 - vpshufb %xmm6,%xmm3,%xmm3 - vpaddd %xmm10,%xmm0,%xmm4 - vpaddd %xmm10,%xmm1,%xmm5 - vpaddd %xmm10,%xmm2,%xmm6 - vmovdqa %xmm4,0(%rsp) - vmovdqa %xmm5,16(%rsp) - vmovdqa %xmm6,32(%rsp) - vmovups -112(%r15),%xmm15 - vmovups 16-112(%r15),%xmm14 - jmp .Loop_avx -.align 32 -.Loop_avx: - shrdl $2,%ebx,%ebx - vmovdqu 0(%r12),%xmm13 - vpxor %xmm15,%xmm13,%xmm13 - vpxor %xmm13,%xmm12,%xmm12 - vaesenc %xmm14,%xmm12,%xmm12 - vmovups -80(%r15),%xmm15 - xorl %edx,%esi - vpalignr $8,%xmm0,%xmm1,%xmm4 - movl %eax,%edi - addl 0(%rsp),%ebp - vpaddd %xmm3,%xmm10,%xmm9 - xorl %ecx,%ebx - shldl $5,%eax,%eax - vpsrldq $4,%xmm3,%xmm8 - addl %esi,%ebp - andl %ebx,%edi - vpxor %xmm0,%xmm4,%xmm4 - xorl %ecx,%ebx - addl %eax,%ebp - vpxor %xmm2,%xmm8,%xmm8 - shrdl $7,%eax,%eax - xorl %ecx,%edi - movl 
%ebp,%esi - addl 4(%rsp),%edx - vpxor %xmm8,%xmm4,%xmm4 - xorl %ebx,%eax - shldl $5,%ebp,%ebp - vmovdqa %xmm9,48(%rsp) - addl %edi,%edx - vaesenc %xmm15,%xmm12,%xmm12 - vmovups -64(%r15),%xmm14 - andl %eax,%esi - vpsrld $31,%xmm4,%xmm8 - xorl %ebx,%eax - addl %ebp,%edx - shrdl $7,%ebp,%ebp - xorl %ebx,%esi - vpslldq $12,%xmm4,%xmm9 - vpaddd %xmm4,%xmm4,%xmm4 - movl %edx,%edi - addl 8(%rsp),%ecx - xorl %eax,%ebp - shldl $5,%edx,%edx - vpor %xmm8,%xmm4,%xmm4 - vpsrld $30,%xmm9,%xmm8 - addl %esi,%ecx - andl %ebp,%edi - xorl %eax,%ebp - addl %edx,%ecx - vpslld $2,%xmm9,%xmm9 - vpxor %xmm8,%xmm4,%xmm4 - shrdl $7,%edx,%edx - xorl %eax,%edi - movl %ecx,%esi - addl 12(%rsp),%ebx - vaesenc %xmm14,%xmm12,%xmm12 - vmovups -48(%r15),%xmm15 - vpxor %xmm9,%xmm4,%xmm4 - xorl %ebp,%edx - shldl $5,%ecx,%ecx - addl %edi,%ebx - andl %edx,%esi - xorl %ebp,%edx - addl %ecx,%ebx - shrdl $7,%ecx,%ecx - xorl %ebp,%esi - vpalignr $8,%xmm1,%xmm2,%xmm5 - movl %ebx,%edi - addl 16(%rsp),%eax - vpaddd %xmm4,%xmm10,%xmm9 - xorl %edx,%ecx - shldl $5,%ebx,%ebx - vpsrldq $4,%xmm4,%xmm8 - addl %esi,%eax - andl %ecx,%edi - vpxor %xmm1,%xmm5,%xmm5 - xorl %edx,%ecx - addl %ebx,%eax - vpxor %xmm3,%xmm8,%xmm8 - shrdl $7,%ebx,%ebx - vaesenc %xmm15,%xmm12,%xmm12 - vmovups -32(%r15),%xmm14 - xorl %edx,%edi - movl %eax,%esi - addl 20(%rsp),%ebp - vpxor %xmm8,%xmm5,%xmm5 - xorl %ecx,%ebx - shldl $5,%eax,%eax - vmovdqa %xmm9,0(%rsp) - addl %edi,%ebp - andl %ebx,%esi - vpsrld $31,%xmm5,%xmm8 - xorl %ecx,%ebx - addl %eax,%ebp - shrdl $7,%eax,%eax - xorl %ecx,%esi - vpslldq $12,%xmm5,%xmm9 - vpaddd %xmm5,%xmm5,%xmm5 - movl %ebp,%edi - addl 24(%rsp),%edx - xorl %ebx,%eax - shldl $5,%ebp,%ebp - vpor %xmm8,%xmm5,%xmm5 - vpsrld $30,%xmm9,%xmm8 - addl %esi,%edx - vaesenc %xmm14,%xmm12,%xmm12 - vmovups -16(%r15),%xmm15 - andl %eax,%edi - xorl %ebx,%eax - addl %ebp,%edx - vpslld $2,%xmm9,%xmm9 - vpxor %xmm8,%xmm5,%xmm5 - shrdl $7,%ebp,%ebp - xorl %ebx,%edi - movl %edx,%esi - addl 28(%rsp),%ecx - vpxor %xmm9,%xmm5,%xmm5 - xorl %eax,%ebp - shldl $5,%edx,%edx - vmovdqa 16(%r11),%xmm10 - addl %edi,%ecx - andl %ebp,%esi - xorl %eax,%ebp - addl %edx,%ecx - shrdl $7,%edx,%edx - xorl %eax,%esi - vpalignr $8,%xmm2,%xmm3,%xmm6 - movl %ecx,%edi - addl 32(%rsp),%ebx - vaesenc %xmm15,%xmm12,%xmm12 - vmovups 0(%r15),%xmm14 - vpaddd %xmm5,%xmm10,%xmm9 - xorl %ebp,%edx - shldl $5,%ecx,%ecx - vpsrldq $4,%xmm5,%xmm8 - addl %esi,%ebx - andl %edx,%edi - vpxor %xmm2,%xmm6,%xmm6 - xorl %ebp,%edx - addl %ecx,%ebx - vpxor %xmm4,%xmm8,%xmm8 - shrdl $7,%ecx,%ecx - xorl %ebp,%edi - movl %ebx,%esi - addl 36(%rsp),%eax - vpxor %xmm8,%xmm6,%xmm6 - xorl %edx,%ecx - shldl $5,%ebx,%ebx - vmovdqa %xmm9,16(%rsp) - addl %edi,%eax - andl %ecx,%esi - vpsrld $31,%xmm6,%xmm8 - xorl %edx,%ecx - addl %ebx,%eax - shrdl $7,%ebx,%ebx - vaesenc %xmm14,%xmm12,%xmm12 - vmovups 16(%r15),%xmm15 - xorl %edx,%esi - vpslldq $12,%xmm6,%xmm9 - vpaddd %xmm6,%xmm6,%xmm6 - movl %eax,%edi - addl 40(%rsp),%ebp - xorl %ecx,%ebx - shldl $5,%eax,%eax - vpor %xmm8,%xmm6,%xmm6 - vpsrld $30,%xmm9,%xmm8 - addl %esi,%ebp - andl %ebx,%edi - xorl %ecx,%ebx - addl %eax,%ebp - vpslld $2,%xmm9,%xmm9 - vpxor %xmm8,%xmm6,%xmm6 - shrdl $7,%eax,%eax - xorl %ecx,%edi - movl %ebp,%esi - addl 44(%rsp),%edx - vpxor %xmm9,%xmm6,%xmm6 - xorl %ebx,%eax - shldl $5,%ebp,%ebp - addl %edi,%edx - vaesenc %xmm15,%xmm12,%xmm12 - vmovups 32(%r15),%xmm14 - andl %eax,%esi - xorl %ebx,%eax - addl %ebp,%edx - shrdl $7,%ebp,%ebp - xorl %ebx,%esi - vpalignr $8,%xmm3,%xmm4,%xmm7 - movl %edx,%edi - addl 48(%rsp),%ecx - vpaddd 
%xmm6,%xmm10,%xmm9 - xorl %eax,%ebp - shldl $5,%edx,%edx - vpsrldq $4,%xmm6,%xmm8 - addl %esi,%ecx - andl %ebp,%edi - vpxor %xmm3,%xmm7,%xmm7 - xorl %eax,%ebp - addl %edx,%ecx - vpxor %xmm5,%xmm8,%xmm8 - shrdl $7,%edx,%edx - xorl %eax,%edi - movl %ecx,%esi - addl 52(%rsp),%ebx - vaesenc %xmm14,%xmm12,%xmm12 - vmovups 48(%r15),%xmm15 - vpxor %xmm8,%xmm7,%xmm7 - xorl %ebp,%edx - shldl $5,%ecx,%ecx - vmovdqa %xmm9,32(%rsp) - addl %edi,%ebx - andl %edx,%esi - vpsrld $31,%xmm7,%xmm8 - xorl %ebp,%edx - addl %ecx,%ebx - shrdl $7,%ecx,%ecx - xorl %ebp,%esi - vpslldq $12,%xmm7,%xmm9 - vpaddd %xmm7,%xmm7,%xmm7 - movl %ebx,%edi - addl 56(%rsp),%eax - xorl %edx,%ecx - shldl $5,%ebx,%ebx - vpor %xmm8,%xmm7,%xmm7 - vpsrld $30,%xmm9,%xmm8 - addl %esi,%eax - andl %ecx,%edi - xorl %edx,%ecx - addl %ebx,%eax - vpslld $2,%xmm9,%xmm9 - vpxor %xmm8,%xmm7,%xmm7 - shrdl $7,%ebx,%ebx - cmpl $11,%r8d - jb .Lvaesenclast6 - vaesenc %xmm15,%xmm12,%xmm12 - vmovups 64(%r15),%xmm14 - vaesenc %xmm14,%xmm12,%xmm12 - vmovups 80(%r15),%xmm15 - je .Lvaesenclast6 - vaesenc %xmm15,%xmm12,%xmm12 - vmovups 96(%r15),%xmm14 - vaesenc %xmm14,%xmm12,%xmm12 - vmovups 112(%r15),%xmm15 -.Lvaesenclast6: - vaesenclast %xmm15,%xmm12,%xmm12 - vmovups -112(%r15),%xmm15 - vmovups 16-112(%r15),%xmm14 - xorl %edx,%edi - movl %eax,%esi - addl 60(%rsp),%ebp - vpxor %xmm9,%xmm7,%xmm7 - xorl %ecx,%ebx - shldl $5,%eax,%eax - addl %edi,%ebp - andl %ebx,%esi - xorl %ecx,%ebx - addl %eax,%ebp - vpalignr $8,%xmm6,%xmm7,%xmm8 - vpxor %xmm4,%xmm0,%xmm0 - shrdl $7,%eax,%eax - xorl %ecx,%esi - movl %ebp,%edi - addl 0(%rsp),%edx - vpxor %xmm1,%xmm0,%xmm0 - xorl %ebx,%eax - shldl $5,%ebp,%ebp - vpaddd %xmm7,%xmm10,%xmm9 - addl %esi,%edx - vmovdqu 16(%r12),%xmm13 - vpxor %xmm15,%xmm13,%xmm13 - vmovups %xmm12,0(%r12,%r13,1) - vpxor %xmm13,%xmm12,%xmm12 - vaesenc %xmm14,%xmm12,%xmm12 - vmovups -80(%r15),%xmm15 - andl %eax,%edi - vpxor %xmm8,%xmm0,%xmm0 - xorl %ebx,%eax - addl %ebp,%edx - shrdl $7,%ebp,%ebp - xorl %ebx,%edi - vpsrld $30,%xmm0,%xmm8 - vmovdqa %xmm9,48(%rsp) - movl %edx,%esi - addl 4(%rsp),%ecx - xorl %eax,%ebp - shldl $5,%edx,%edx - vpslld $2,%xmm0,%xmm0 - addl %edi,%ecx - andl %ebp,%esi - xorl %eax,%ebp - addl %edx,%ecx - shrdl $7,%edx,%edx - xorl %eax,%esi - movl %ecx,%edi - addl 8(%rsp),%ebx - vaesenc %xmm15,%xmm12,%xmm12 - vmovups -64(%r15),%xmm14 - vpor %xmm8,%xmm0,%xmm0 - xorl %ebp,%edx - shldl $5,%ecx,%ecx - addl %esi,%ebx - andl %edx,%edi - xorl %ebp,%edx - addl %ecx,%ebx - addl 12(%rsp),%eax - xorl %ebp,%edi - movl %ebx,%esi - shldl $5,%ebx,%ebx - addl %edi,%eax - xorl %edx,%esi - shrdl $7,%ecx,%ecx - addl %ebx,%eax - vpalignr $8,%xmm7,%xmm0,%xmm8 - vpxor %xmm5,%xmm1,%xmm1 - addl 16(%rsp),%ebp - vaesenc %xmm14,%xmm12,%xmm12 - vmovups -48(%r15),%xmm15 - xorl %ecx,%esi - movl %eax,%edi - shldl $5,%eax,%eax - vpxor %xmm2,%xmm1,%xmm1 - addl %esi,%ebp - xorl %ecx,%edi - vpaddd %xmm0,%xmm10,%xmm9 - shrdl $7,%ebx,%ebx - addl %eax,%ebp - vpxor %xmm8,%xmm1,%xmm1 - addl 20(%rsp),%edx - xorl %ebx,%edi - movl %ebp,%esi - shldl $5,%ebp,%ebp - vpsrld $30,%xmm1,%xmm8 - vmovdqa %xmm9,0(%rsp) - addl %edi,%edx - xorl %ebx,%esi - shrdl $7,%eax,%eax - addl %ebp,%edx - vpslld $2,%xmm1,%xmm1 - addl 24(%rsp),%ecx - xorl %eax,%esi - movl %edx,%edi - shldl $5,%edx,%edx - addl %esi,%ecx - vaesenc %xmm15,%xmm12,%xmm12 - vmovups -32(%r15),%xmm14 - xorl %eax,%edi - shrdl $7,%ebp,%ebp - addl %edx,%ecx - vpor %xmm8,%xmm1,%xmm1 - addl 28(%rsp),%ebx - xorl %ebp,%edi - movl %ecx,%esi - shldl $5,%ecx,%ecx - addl %edi,%ebx - xorl %ebp,%esi - shrdl $7,%edx,%edx - addl 
%ecx,%ebx - vpalignr $8,%xmm0,%xmm1,%xmm8 - vpxor %xmm6,%xmm2,%xmm2 - addl 32(%rsp),%eax - xorl %edx,%esi - movl %ebx,%edi - shldl $5,%ebx,%ebx - vpxor %xmm3,%xmm2,%xmm2 - addl %esi,%eax - xorl %edx,%edi - vpaddd %xmm1,%xmm10,%xmm9 - vmovdqa 32(%r11),%xmm10 - shrdl $7,%ecx,%ecx - addl %ebx,%eax - vpxor %xmm8,%xmm2,%xmm2 - addl 36(%rsp),%ebp - vaesenc %xmm14,%xmm12,%xmm12 - vmovups -16(%r15),%xmm15 - xorl %ecx,%edi - movl %eax,%esi - shldl $5,%eax,%eax - vpsrld $30,%xmm2,%xmm8 - vmovdqa %xmm9,16(%rsp) - addl %edi,%ebp - xorl %ecx,%esi - shrdl $7,%ebx,%ebx - addl %eax,%ebp - vpslld $2,%xmm2,%xmm2 - addl 40(%rsp),%edx - xorl %ebx,%esi - movl %ebp,%edi - shldl $5,%ebp,%ebp - addl %esi,%edx - xorl %ebx,%edi - shrdl $7,%eax,%eax - addl %ebp,%edx - vpor %xmm8,%xmm2,%xmm2 - addl 44(%rsp),%ecx - xorl %eax,%edi - movl %edx,%esi - shldl $5,%edx,%edx - addl %edi,%ecx - vaesenc %xmm15,%xmm12,%xmm12 - vmovups 0(%r15),%xmm14 - xorl %eax,%esi - shrdl $7,%ebp,%ebp - addl %edx,%ecx - vpalignr $8,%xmm1,%xmm2,%xmm8 - vpxor %xmm7,%xmm3,%xmm3 - addl 48(%rsp),%ebx - xorl %ebp,%esi - movl %ecx,%edi - shldl $5,%ecx,%ecx - vpxor %xmm4,%xmm3,%xmm3 - addl %esi,%ebx - xorl %ebp,%edi - vpaddd %xmm2,%xmm10,%xmm9 - shrdl $7,%edx,%edx - addl %ecx,%ebx - vpxor %xmm8,%xmm3,%xmm3 - addl 52(%rsp),%eax - xorl %edx,%edi - movl %ebx,%esi - shldl $5,%ebx,%ebx - vpsrld $30,%xmm3,%xmm8 - vmovdqa %xmm9,32(%rsp) - addl %edi,%eax - xorl %edx,%esi - shrdl $7,%ecx,%ecx - addl %ebx,%eax - vpslld $2,%xmm3,%xmm3 - addl 56(%rsp),%ebp - vaesenc %xmm14,%xmm12,%xmm12 - vmovups 16(%r15),%xmm15 - xorl %ecx,%esi - movl %eax,%edi - shldl $5,%eax,%eax - addl %esi,%ebp - xorl %ecx,%edi - shrdl $7,%ebx,%ebx - addl %eax,%ebp - vpor %xmm8,%xmm3,%xmm3 - addl 60(%rsp),%edx - xorl %ebx,%edi - movl %ebp,%esi - shldl $5,%ebp,%ebp - addl %edi,%edx - xorl %ebx,%esi - shrdl $7,%eax,%eax - addl %ebp,%edx - vpalignr $8,%xmm2,%xmm3,%xmm8 - vpxor %xmm0,%xmm4,%xmm4 - addl 0(%rsp),%ecx - xorl %eax,%esi - movl %edx,%edi - shldl $5,%edx,%edx - vpxor %xmm5,%xmm4,%xmm4 - addl %esi,%ecx - vaesenc %xmm15,%xmm12,%xmm12 - vmovups 32(%r15),%xmm14 - xorl %eax,%edi - vpaddd %xmm3,%xmm10,%xmm9 - shrdl $7,%ebp,%ebp - addl %edx,%ecx - vpxor %xmm8,%xmm4,%xmm4 - addl 4(%rsp),%ebx - xorl %ebp,%edi - movl %ecx,%esi - shldl $5,%ecx,%ecx - vpsrld $30,%xmm4,%xmm8 - vmovdqa %xmm9,48(%rsp) - addl %edi,%ebx - xorl %ebp,%esi - shrdl $7,%edx,%edx - addl %ecx,%ebx - vpslld $2,%xmm4,%xmm4 - addl 8(%rsp),%eax - xorl %edx,%esi - movl %ebx,%edi - shldl $5,%ebx,%ebx - addl %esi,%eax - xorl %edx,%edi - shrdl $7,%ecx,%ecx - addl %ebx,%eax - vpor %xmm8,%xmm4,%xmm4 - addl 12(%rsp),%ebp - vaesenc %xmm14,%xmm12,%xmm12 - vmovups 48(%r15),%xmm15 - xorl %ecx,%edi - movl %eax,%esi - shldl $5,%eax,%eax - addl %edi,%ebp - xorl %ecx,%esi - shrdl $7,%ebx,%ebx - addl %eax,%ebp - vpalignr $8,%xmm3,%xmm4,%xmm8 - vpxor %xmm1,%xmm5,%xmm5 - addl 16(%rsp),%edx - xorl %ebx,%esi - movl %ebp,%edi - shldl $5,%ebp,%ebp - vpxor %xmm6,%xmm5,%xmm5 - addl %esi,%edx - xorl %ebx,%edi - vpaddd %xmm4,%xmm10,%xmm9 - shrdl $7,%eax,%eax - addl %ebp,%edx - vpxor %xmm8,%xmm5,%xmm5 - addl 20(%rsp),%ecx - xorl %eax,%edi - movl %edx,%esi - shldl $5,%edx,%edx - vpsrld $30,%xmm5,%xmm8 - vmovdqa %xmm9,0(%rsp) - addl %edi,%ecx - cmpl $11,%r8d - jb .Lvaesenclast7 - vaesenc %xmm15,%xmm12,%xmm12 - vmovups 64(%r15),%xmm14 - vaesenc %xmm14,%xmm12,%xmm12 - vmovups 80(%r15),%xmm15 - je .Lvaesenclast7 - vaesenc %xmm15,%xmm12,%xmm12 - vmovups 96(%r15),%xmm14 - vaesenc %xmm14,%xmm12,%xmm12 - vmovups 112(%r15),%xmm15 -.Lvaesenclast7: - vaesenclast 
%xmm15,%xmm12,%xmm12 - vmovups -112(%r15),%xmm15 - vmovups 16-112(%r15),%xmm14 - xorl %eax,%esi - shrdl $7,%ebp,%ebp - addl %edx,%ecx - vpslld $2,%xmm5,%xmm5 - addl 24(%rsp),%ebx - xorl %ebp,%esi - movl %ecx,%edi - shldl $5,%ecx,%ecx - addl %esi,%ebx - xorl %ebp,%edi - shrdl $7,%edx,%edx - addl %ecx,%ebx - vpor %xmm8,%xmm5,%xmm5 - addl 28(%rsp),%eax - shrdl $7,%ecx,%ecx - movl %ebx,%esi - xorl %edx,%edi - shldl $5,%ebx,%ebx - addl %edi,%eax - xorl %ecx,%esi - xorl %edx,%ecx - addl %ebx,%eax - vpalignr $8,%xmm4,%xmm5,%xmm8 - vpxor %xmm2,%xmm6,%xmm6 - addl 32(%rsp),%ebp - vmovdqu 32(%r12),%xmm13 - vpxor %xmm15,%xmm13,%xmm13 - vmovups %xmm12,16(%r13,%r12,1) - vpxor %xmm13,%xmm12,%xmm12 - vaesenc %xmm14,%xmm12,%xmm12 - vmovups -80(%r15),%xmm15 - andl %ecx,%esi - xorl %edx,%ecx - shrdl $7,%ebx,%ebx - vpxor %xmm7,%xmm6,%xmm6 - movl %eax,%edi - xorl %ecx,%esi - vpaddd %xmm5,%xmm10,%xmm9 - shldl $5,%eax,%eax - addl %esi,%ebp - vpxor %xmm8,%xmm6,%xmm6 - xorl %ebx,%edi - xorl %ecx,%ebx - addl %eax,%ebp - addl 36(%rsp),%edx - vpsrld $30,%xmm6,%xmm8 - vmovdqa %xmm9,16(%rsp) - andl %ebx,%edi - xorl %ecx,%ebx - shrdl $7,%eax,%eax - movl %ebp,%esi - vpslld $2,%xmm6,%xmm6 - xorl %ebx,%edi - shldl $5,%ebp,%ebp - addl %edi,%edx - vaesenc %xmm15,%xmm12,%xmm12 - vmovups -64(%r15),%xmm14 - xorl %eax,%esi - xorl %ebx,%eax - addl %ebp,%edx - addl 40(%rsp),%ecx - andl %eax,%esi - vpor %xmm8,%xmm6,%xmm6 - xorl %ebx,%eax - shrdl $7,%ebp,%ebp - movl %edx,%edi - xorl %eax,%esi - shldl $5,%edx,%edx - addl %esi,%ecx - xorl %ebp,%edi - xorl %eax,%ebp - addl %edx,%ecx - addl 44(%rsp),%ebx - andl %ebp,%edi - xorl %eax,%ebp - shrdl $7,%edx,%edx - vaesenc %xmm14,%xmm12,%xmm12 - vmovups -48(%r15),%xmm15 - movl %ecx,%esi - xorl %ebp,%edi - shldl $5,%ecx,%ecx - addl %edi,%ebx - xorl %edx,%esi - xorl %ebp,%edx - addl %ecx,%ebx - vpalignr $8,%xmm5,%xmm6,%xmm8 - vpxor %xmm3,%xmm7,%xmm7 - addl 48(%rsp),%eax - andl %edx,%esi - xorl %ebp,%edx - shrdl $7,%ecx,%ecx - vpxor %xmm0,%xmm7,%xmm7 - movl %ebx,%edi - xorl %edx,%esi - vpaddd %xmm6,%xmm10,%xmm9 - vmovdqa 48(%r11),%xmm10 - shldl $5,%ebx,%ebx - addl %esi,%eax - vpxor %xmm8,%xmm7,%xmm7 - xorl %ecx,%edi - xorl %edx,%ecx - addl %ebx,%eax - addl 52(%rsp),%ebp - vaesenc %xmm15,%xmm12,%xmm12 - vmovups -32(%r15),%xmm14 - vpsrld $30,%xmm7,%xmm8 - vmovdqa %xmm9,32(%rsp) - andl %ecx,%edi - xorl %edx,%ecx - shrdl $7,%ebx,%ebx - movl %eax,%esi - vpslld $2,%xmm7,%xmm7 - xorl %ecx,%edi - shldl $5,%eax,%eax - addl %edi,%ebp - xorl %ebx,%esi - xorl %ecx,%ebx - addl %eax,%ebp - addl 56(%rsp),%edx - andl %ebx,%esi - vpor %xmm8,%xmm7,%xmm7 - xorl %ecx,%ebx - shrdl $7,%eax,%eax - movl %ebp,%edi - xorl %ebx,%esi - shldl $5,%ebp,%ebp - addl %esi,%edx - vaesenc %xmm14,%xmm12,%xmm12 - vmovups -16(%r15),%xmm15 - xorl %eax,%edi - xorl %ebx,%eax - addl %ebp,%edx - addl 60(%rsp),%ecx - andl %eax,%edi - xorl %ebx,%eax - shrdl $7,%ebp,%ebp - movl %edx,%esi - xorl %eax,%edi - shldl $5,%edx,%edx - addl %edi,%ecx - xorl %ebp,%esi - xorl %eax,%ebp - addl %edx,%ecx - vpalignr $8,%xmm6,%xmm7,%xmm8 - vpxor %xmm4,%xmm0,%xmm0 - addl 0(%rsp),%ebx - andl %ebp,%esi - xorl %eax,%ebp - shrdl $7,%edx,%edx - vaesenc %xmm15,%xmm12,%xmm12 - vmovups 0(%r15),%xmm14 - vpxor %xmm1,%xmm0,%xmm0 - movl %ecx,%edi - xorl %ebp,%esi - vpaddd %xmm7,%xmm10,%xmm9 - shldl $5,%ecx,%ecx - addl %esi,%ebx - vpxor %xmm8,%xmm0,%xmm0 - xorl %edx,%edi - xorl %ebp,%edx - addl %ecx,%ebx - addl 4(%rsp),%eax - vpsrld $30,%xmm0,%xmm8 - vmovdqa %xmm9,48(%rsp) - andl %edx,%edi - xorl %ebp,%edx - shrdl $7,%ecx,%ecx - movl %ebx,%esi - vpslld $2,%xmm0,%xmm0 - 
xorl %edx,%edi - shldl $5,%ebx,%ebx - addl %edi,%eax - xorl %ecx,%esi - xorl %edx,%ecx - addl %ebx,%eax - addl 8(%rsp),%ebp - vaesenc %xmm14,%xmm12,%xmm12 - vmovups 16(%r15),%xmm15 - andl %ecx,%esi - vpor %xmm8,%xmm0,%xmm0 - xorl %edx,%ecx - shrdl $7,%ebx,%ebx - movl %eax,%edi - xorl %ecx,%esi - shldl $5,%eax,%eax - addl %esi,%ebp - xorl %ebx,%edi - xorl %ecx,%ebx - addl %eax,%ebp - addl 12(%rsp),%edx - andl %ebx,%edi - xorl %ecx,%ebx - shrdl $7,%eax,%eax - movl %ebp,%esi - xorl %ebx,%edi - shldl $5,%ebp,%ebp - addl %edi,%edx - vaesenc %xmm15,%xmm12,%xmm12 - vmovups 32(%r15),%xmm14 - xorl %eax,%esi - xorl %ebx,%eax - addl %ebp,%edx - vpalignr $8,%xmm7,%xmm0,%xmm8 - vpxor %xmm5,%xmm1,%xmm1 - addl 16(%rsp),%ecx - andl %eax,%esi - xorl %ebx,%eax - shrdl $7,%ebp,%ebp - vpxor %xmm2,%xmm1,%xmm1 - movl %edx,%edi - xorl %eax,%esi - vpaddd %xmm0,%xmm10,%xmm9 - shldl $5,%edx,%edx - addl %esi,%ecx - vpxor %xmm8,%xmm1,%xmm1 - xorl %ebp,%edi - xorl %eax,%ebp - addl %edx,%ecx - addl 20(%rsp),%ebx - vpsrld $30,%xmm1,%xmm8 - vmovdqa %xmm9,0(%rsp) - andl %ebp,%edi - xorl %eax,%ebp - shrdl $7,%edx,%edx - vaesenc %xmm14,%xmm12,%xmm12 - vmovups 48(%r15),%xmm15 - movl %ecx,%esi - vpslld $2,%xmm1,%xmm1 - xorl %ebp,%edi - shldl $5,%ecx,%ecx - addl %edi,%ebx - xorl %edx,%esi - xorl %ebp,%edx - addl %ecx,%ebx - addl 24(%rsp),%eax - andl %edx,%esi - vpor %xmm8,%xmm1,%xmm1 - xorl %ebp,%edx - shrdl $7,%ecx,%ecx - movl %ebx,%edi - xorl %edx,%esi - shldl $5,%ebx,%ebx - addl %esi,%eax - xorl %ecx,%edi - xorl %edx,%ecx - addl %ebx,%eax - addl 28(%rsp),%ebp - cmpl $11,%r8d - jb .Lvaesenclast8 - vaesenc %xmm15,%xmm12,%xmm12 - vmovups 64(%r15),%xmm14 - vaesenc %xmm14,%xmm12,%xmm12 - vmovups 80(%r15),%xmm15 - je .Lvaesenclast8 - vaesenc %xmm15,%xmm12,%xmm12 - vmovups 96(%r15),%xmm14 - vaesenc %xmm14,%xmm12,%xmm12 - vmovups 112(%r15),%xmm15 -.Lvaesenclast8: - vaesenclast %xmm15,%xmm12,%xmm12 - vmovups -112(%r15),%xmm15 - vmovups 16-112(%r15),%xmm14 - andl %ecx,%edi - xorl %edx,%ecx - shrdl $7,%ebx,%ebx - movl %eax,%esi - xorl %ecx,%edi - shldl $5,%eax,%eax - addl %edi,%ebp - xorl %ebx,%esi - xorl %ecx,%ebx - addl %eax,%ebp - vpalignr $8,%xmm0,%xmm1,%xmm8 - vpxor %xmm6,%xmm2,%xmm2 - addl 32(%rsp),%edx - andl %ebx,%esi - xorl %ecx,%ebx - shrdl $7,%eax,%eax - vpxor %xmm3,%xmm2,%xmm2 - movl %ebp,%edi - xorl %ebx,%esi - vpaddd %xmm1,%xmm10,%xmm9 - shldl $5,%ebp,%ebp - addl %esi,%edx - vmovdqu 48(%r12),%xmm13 - vpxor %xmm15,%xmm13,%xmm13 - vmovups %xmm12,32(%r13,%r12,1) - vpxor %xmm13,%xmm12,%xmm12 - vaesenc %xmm14,%xmm12,%xmm12 - vmovups -80(%r15),%xmm15 - vpxor %xmm8,%xmm2,%xmm2 - xorl %eax,%edi - xorl %ebx,%eax - addl %ebp,%edx - addl 36(%rsp),%ecx - vpsrld $30,%xmm2,%xmm8 - vmovdqa %xmm9,16(%rsp) - andl %eax,%edi - xorl %ebx,%eax - shrdl $7,%ebp,%ebp - movl %edx,%esi - vpslld $2,%xmm2,%xmm2 - xorl %eax,%edi - shldl $5,%edx,%edx - addl %edi,%ecx - xorl %ebp,%esi - xorl %eax,%ebp - addl %edx,%ecx - addl 40(%rsp),%ebx - andl %ebp,%esi - vpor %xmm8,%xmm2,%xmm2 - xorl %eax,%ebp - shrdl $7,%edx,%edx - vaesenc %xmm15,%xmm12,%xmm12 - vmovups -64(%r15),%xmm14 - movl %ecx,%edi - xorl %ebp,%esi - shldl $5,%ecx,%ecx - addl %esi,%ebx - xorl %edx,%edi - xorl %ebp,%edx - addl %ecx,%ebx - addl 44(%rsp),%eax - andl %edx,%edi - xorl %ebp,%edx - shrdl $7,%ecx,%ecx - movl %ebx,%esi - xorl %edx,%edi - shldl $5,%ebx,%ebx - addl %edi,%eax - xorl %edx,%esi - addl %ebx,%eax - vpalignr $8,%xmm1,%xmm2,%xmm8 - vpxor %xmm7,%xmm3,%xmm3 - addl 48(%rsp),%ebp - vaesenc %xmm14,%xmm12,%xmm12 - vmovups -48(%r15),%xmm15 - xorl %ecx,%esi - movl %eax,%edi - shldl 
$5,%eax,%eax - vpxor %xmm4,%xmm3,%xmm3 - addl %esi,%ebp - xorl %ecx,%edi - vpaddd %xmm2,%xmm10,%xmm9 - shrdl $7,%ebx,%ebx - addl %eax,%ebp - vpxor %xmm8,%xmm3,%xmm3 - addl 52(%rsp),%edx - xorl %ebx,%edi - movl %ebp,%esi - shldl $5,%ebp,%ebp - vpsrld $30,%xmm3,%xmm8 - vmovdqa %xmm9,32(%rsp) - addl %edi,%edx - xorl %ebx,%esi - shrdl $7,%eax,%eax - addl %ebp,%edx - vpslld $2,%xmm3,%xmm3 - addl 56(%rsp),%ecx - xorl %eax,%esi - movl %edx,%edi - shldl $5,%edx,%edx - addl %esi,%ecx - vaesenc %xmm15,%xmm12,%xmm12 - vmovups -32(%r15),%xmm14 - xorl %eax,%edi - shrdl $7,%ebp,%ebp - addl %edx,%ecx - vpor %xmm8,%xmm3,%xmm3 - addl 60(%rsp),%ebx - xorl %ebp,%edi - movl %ecx,%esi - shldl $5,%ecx,%ecx - addl %edi,%ebx - xorl %ebp,%esi - shrdl $7,%edx,%edx - addl %ecx,%ebx - addl 0(%rsp),%eax - vpaddd %xmm3,%xmm10,%xmm9 - xorl %edx,%esi - movl %ebx,%edi - shldl $5,%ebx,%ebx - addl %esi,%eax - vmovdqa %xmm9,48(%rsp) - xorl %edx,%edi - shrdl $7,%ecx,%ecx - addl %ebx,%eax - addl 4(%rsp),%ebp - vaesenc %xmm14,%xmm12,%xmm12 - vmovups -16(%r15),%xmm15 - xorl %ecx,%edi - movl %eax,%esi - shldl $5,%eax,%eax - addl %edi,%ebp - xorl %ecx,%esi - shrdl $7,%ebx,%ebx - addl %eax,%ebp - addl 8(%rsp),%edx - xorl %ebx,%esi - movl %ebp,%edi - shldl $5,%ebp,%ebp - addl %esi,%edx - xorl %ebx,%edi - shrdl $7,%eax,%eax - addl %ebp,%edx - addl 12(%rsp),%ecx - xorl %eax,%edi - movl %edx,%esi - shldl $5,%edx,%edx - addl %edi,%ecx - vaesenc %xmm15,%xmm12,%xmm12 - vmovups 0(%r15),%xmm14 - xorl %eax,%esi - shrdl $7,%ebp,%ebp - addl %edx,%ecx - cmpq %r14,%r10 - je .Ldone_avx - vmovdqa 64(%r11),%xmm9 - vmovdqa 0(%r11),%xmm10 - vmovdqu 0(%r10),%xmm0 - vmovdqu 16(%r10),%xmm1 - vmovdqu 32(%r10),%xmm2 - vmovdqu 48(%r10),%xmm3 - vpshufb %xmm9,%xmm0,%xmm0 - addq $64,%r10 - addl 16(%rsp),%ebx - xorl %ebp,%esi - vpshufb %xmm9,%xmm1,%xmm1 - movl %ecx,%edi - shldl $5,%ecx,%ecx - vpaddd %xmm10,%xmm0,%xmm8 - addl %esi,%ebx - xorl %ebp,%edi - shrdl $7,%edx,%edx - addl %ecx,%ebx - vmovdqa %xmm8,0(%rsp) - addl 20(%rsp),%eax - xorl %edx,%edi - movl %ebx,%esi - shldl $5,%ebx,%ebx - addl %edi,%eax - xorl %edx,%esi - shrdl $7,%ecx,%ecx - addl %ebx,%eax - addl 24(%rsp),%ebp - vaesenc %xmm14,%xmm12,%xmm12 - vmovups 16(%r15),%xmm15 - xorl %ecx,%esi - movl %eax,%edi - shldl $5,%eax,%eax - addl %esi,%ebp - xorl %ecx,%edi - shrdl $7,%ebx,%ebx - addl %eax,%ebp - addl 28(%rsp),%edx - xorl %ebx,%edi - movl %ebp,%esi - shldl $5,%ebp,%ebp - addl %edi,%edx - xorl %ebx,%esi - shrdl $7,%eax,%eax - addl %ebp,%edx - addl 32(%rsp),%ecx - xorl %eax,%esi - vpshufb %xmm9,%xmm2,%xmm2 - movl %edx,%edi - shldl $5,%edx,%edx - vpaddd %xmm10,%xmm1,%xmm8 - addl %esi,%ecx - vaesenc %xmm15,%xmm12,%xmm12 - vmovups 32(%r15),%xmm14 - xorl %eax,%edi - shrdl $7,%ebp,%ebp - addl %edx,%ecx - vmovdqa %xmm8,16(%rsp) - addl 36(%rsp),%ebx - xorl %ebp,%edi - movl %ecx,%esi - shldl $5,%ecx,%ecx - addl %edi,%ebx - xorl %ebp,%esi - shrdl $7,%edx,%edx - addl %ecx,%ebx - addl 40(%rsp),%eax - xorl %edx,%esi - movl %ebx,%edi - shldl $5,%ebx,%ebx - addl %esi,%eax - xorl %edx,%edi - shrdl $7,%ecx,%ecx - addl %ebx,%eax - addl 44(%rsp),%ebp - vaesenc %xmm14,%xmm12,%xmm12 - vmovups 48(%r15),%xmm15 - xorl %ecx,%edi - movl %eax,%esi - shldl $5,%eax,%eax - addl %edi,%ebp - xorl %ecx,%esi - shrdl $7,%ebx,%ebx - addl %eax,%ebp - addl 48(%rsp),%edx - xorl %ebx,%esi - vpshufb %xmm9,%xmm3,%xmm3 - movl %ebp,%edi - shldl $5,%ebp,%ebp - vpaddd %xmm10,%xmm2,%xmm8 - addl %esi,%edx - xorl %ebx,%edi - shrdl $7,%eax,%eax - addl %ebp,%edx - vmovdqa %xmm8,32(%rsp) - addl 52(%rsp),%ecx - xorl %eax,%edi - movl %edx,%esi - 
shldl $5,%edx,%edx - addl %edi,%ecx - cmpl $11,%r8d - jb .Lvaesenclast9 - vaesenc %xmm15,%xmm12,%xmm12 - vmovups 64(%r15),%xmm14 - vaesenc %xmm14,%xmm12,%xmm12 - vmovups 80(%r15),%xmm15 - je .Lvaesenclast9 - vaesenc %xmm15,%xmm12,%xmm12 - vmovups 96(%r15),%xmm14 - vaesenc %xmm14,%xmm12,%xmm12 - vmovups 112(%r15),%xmm15 -.Lvaesenclast9: - vaesenclast %xmm15,%xmm12,%xmm12 - vmovups -112(%r15),%xmm15 - vmovups 16-112(%r15),%xmm14 - xorl %eax,%esi - shrdl $7,%ebp,%ebp - addl %edx,%ecx - addl 56(%rsp),%ebx - xorl %ebp,%esi - movl %ecx,%edi - shldl $5,%ecx,%ecx - addl %esi,%ebx - xorl %ebp,%edi - shrdl $7,%edx,%edx - addl %ecx,%ebx - addl 60(%rsp),%eax - xorl %edx,%edi - movl %ebx,%esi - shldl $5,%ebx,%ebx - addl %edi,%eax - shrdl $7,%ecx,%ecx - addl %ebx,%eax - vmovups %xmm12,48(%r13,%r12,1) - leaq 64(%r12),%r12 - - addl 0(%r9),%eax - addl 4(%r9),%esi - addl 8(%r9),%ecx - addl 12(%r9),%edx - movl %eax,0(%r9) - addl 16(%r9),%ebp - movl %esi,4(%r9) - movl %esi,%ebx - movl %ecx,8(%r9) - movl %ecx,%edi - movl %edx,12(%r9) - xorl %edx,%edi - movl %ebp,16(%r9) - andl %edi,%esi - jmp .Loop_avx - -.Ldone_avx: - addl 16(%rsp),%ebx - xorl %ebp,%esi - movl %ecx,%edi - shldl $5,%ecx,%ecx - addl %esi,%ebx - xorl %ebp,%edi - shrdl $7,%edx,%edx - addl %ecx,%ebx - addl 20(%rsp),%eax - xorl %edx,%edi - movl %ebx,%esi - shldl $5,%ebx,%ebx - addl %edi,%eax - xorl %edx,%esi - shrdl $7,%ecx,%ecx - addl %ebx,%eax - addl 24(%rsp),%ebp - vaesenc %xmm14,%xmm12,%xmm12 - vmovups 16(%r15),%xmm15 - xorl %ecx,%esi - movl %eax,%edi - shldl $5,%eax,%eax - addl %esi,%ebp - xorl %ecx,%edi - shrdl $7,%ebx,%ebx - addl %eax,%ebp - addl 28(%rsp),%edx - xorl %ebx,%edi - movl %ebp,%esi - shldl $5,%ebp,%ebp - addl %edi,%edx - xorl %ebx,%esi - shrdl $7,%eax,%eax - addl %ebp,%edx - addl 32(%rsp),%ecx - xorl %eax,%esi - movl %edx,%edi - shldl $5,%edx,%edx - addl %esi,%ecx - vaesenc %xmm15,%xmm12,%xmm12 - vmovups 32(%r15),%xmm14 - xorl %eax,%edi - shrdl $7,%ebp,%ebp - addl %edx,%ecx - addl 36(%rsp),%ebx - xorl %ebp,%edi - movl %ecx,%esi - shldl $5,%ecx,%ecx - addl %edi,%ebx - xorl %ebp,%esi - shrdl $7,%edx,%edx - addl %ecx,%ebx - addl 40(%rsp),%eax - xorl %edx,%esi - movl %ebx,%edi - shldl $5,%ebx,%ebx - addl %esi,%eax - xorl %edx,%edi - shrdl $7,%ecx,%ecx - addl %ebx,%eax - addl 44(%rsp),%ebp - vaesenc %xmm14,%xmm12,%xmm12 - vmovups 48(%r15),%xmm15 - xorl %ecx,%edi - movl %eax,%esi - shldl $5,%eax,%eax - addl %edi,%ebp - xorl %ecx,%esi - shrdl $7,%ebx,%ebx - addl %eax,%ebp - addl 48(%rsp),%edx - xorl %ebx,%esi - movl %ebp,%edi - shldl $5,%ebp,%ebp - addl %esi,%edx - xorl %ebx,%edi - shrdl $7,%eax,%eax - addl %ebp,%edx - addl 52(%rsp),%ecx - xorl %eax,%edi - movl %edx,%esi - shldl $5,%edx,%edx - addl %edi,%ecx - cmpl $11,%r8d - jb .Lvaesenclast10 - vaesenc %xmm15,%xmm12,%xmm12 - vmovups 64(%r15),%xmm14 - vaesenc %xmm14,%xmm12,%xmm12 - vmovups 80(%r15),%xmm15 - je .Lvaesenclast10 - vaesenc %xmm15,%xmm12,%xmm12 - vmovups 96(%r15),%xmm14 - vaesenc %xmm14,%xmm12,%xmm12 - vmovups 112(%r15),%xmm15 -.Lvaesenclast10: - vaesenclast %xmm15,%xmm12,%xmm12 - vmovups -112(%r15),%xmm15 - vmovups 16-112(%r15),%xmm14 - xorl %eax,%esi - shrdl $7,%ebp,%ebp - addl %edx,%ecx - addl 56(%rsp),%ebx - xorl %ebp,%esi - movl %ecx,%edi - shldl $5,%ecx,%ecx - addl %esi,%ebx - xorl %ebp,%edi - shrdl $7,%edx,%edx - addl %ecx,%ebx - addl 60(%rsp),%eax - xorl %edx,%edi - movl %ebx,%esi - shldl $5,%ebx,%ebx - addl %edi,%eax - shrdl $7,%ecx,%ecx - addl %ebx,%eax - vmovups %xmm12,48(%r13,%r12,1) - movq 88(%rsp),%r8 - - addl 0(%r9),%eax - addl 4(%r9),%esi - addl 8(%r9),%ecx 
- movl %eax,0(%r9) - addl 12(%r9),%edx - movl %esi,4(%r9) - addl 16(%r9),%ebp - movl %ecx,8(%r9) - movl %edx,12(%r9) - movl %ebp,16(%r9) - vmovups %xmm12,(%r8) - vzeroall - leaq 104(%rsp),%rsi -.cfi_def_cfa %rsi,56 - movq 0(%rsi),%r15 -.cfi_restore %r15 - movq 8(%rsi),%r14 -.cfi_restore %r14 - movq 16(%rsi),%r13 -.cfi_restore %r13 - movq 24(%rsi),%r12 -.cfi_restore %r12 - movq 32(%rsi),%rbp -.cfi_restore %rbp - movq 40(%rsi),%rbx -.cfi_restore %rbx - leaq 48(%rsi),%rsp -.cfi_def_cfa %rsp,8 -.Lepilogue_avx: - .byte 0xf3,0xc3 -.cfi_endproc -.size aesni_cbc_sha1_enc_avx,.-aesni_cbc_sha1_enc_avx .align 64 K_XX_XX: .long 0x5a827999,0x5a827999,0x5a827999,0x5a827999 @@ -2732,6 +1408,7 @@ K_XX_XX: .type aesni_cbc_sha1_enc_shaext,@function .align 32 aesni_cbc_sha1_enc_shaext: +.cfi_startproc movq 8(%rsp),%r10 movdqu (%r9),%xmm8 movd 16(%r9),%xmm9 @@ -2808,17 +1485,17 @@ aesni_cbc_sha1_enc_shaext: pxor %xmm3,%xmm5 .byte 15,56,201,243 cmpl $11,%r11d - jb .Laesenclast11 + jb .Laesenclast6 movups 64(%rcx),%xmm0 .byte 102,15,56,220,209 movups 80(%rcx),%xmm1 .byte 102,15,56,220,208 - je .Laesenclast11 + je .Laesenclast6 movups 96(%rcx),%xmm0 .byte 102,15,56,220,209 movups 112(%rcx),%xmm1 .byte 102,15,56,220,208 -.Laesenclast11: +.Laesenclast6: .byte 102,15,56,221,209 movups 16-112(%rcx),%xmm0 movdqa %xmm8,%xmm10 @@ -2874,17 +1551,17 @@ aesni_cbc_sha1_enc_shaext: pxor %xmm4,%xmm6 .byte 15,56,201,220 cmpl $11,%r11d - jb .Laesenclast12 + jb .Laesenclast7 movups 64(%rcx),%xmm0 .byte 102,15,56,220,209 movups 80(%rcx),%xmm1 .byte 102,15,56,220,208 - je .Laesenclast12 + je .Laesenclast7 movups 96(%rcx),%xmm0 .byte 102,15,56,220,209 movups 112(%rcx),%xmm1 .byte 102,15,56,220,208 -.Laesenclast12: +.Laesenclast7: .byte 102,15,56,221,209 movups 16-112(%rcx),%xmm0 movdqa %xmm8,%xmm9 @@ -2940,17 +1617,17 @@ aesni_cbc_sha1_enc_shaext: pxor %xmm5,%xmm3 .byte 15,56,201,229 cmpl $11,%r11d - jb .Laesenclast13 + jb .Laesenclast8 movups 64(%rcx),%xmm0 .byte 102,15,56,220,209 movups 80(%rcx),%xmm1 .byte 102,15,56,220,208 - je .Laesenclast13 + je .Laesenclast8 movups 96(%rcx),%xmm0 .byte 102,15,56,220,209 movups 112(%rcx),%xmm1 .byte 102,15,56,220,208 -.Laesenclast13: +.Laesenclast8: .byte 102,15,56,221,209 movups 16-112(%rcx),%xmm0 movdqa %xmm8,%xmm10 @@ -3004,17 +1681,17 @@ aesni_cbc_sha1_enc_shaext: movups 48(%rcx),%xmm1 .byte 102,15,56,220,208 cmpl $11,%r11d - jb .Laesenclast14 + jb .Laesenclast9 movups 64(%rcx),%xmm0 .byte 102,15,56,220,209 movups 80(%rcx),%xmm1 .byte 102,15,56,220,208 - je .Laesenclast14 + je .Laesenclast9 movups 96(%rcx),%xmm0 .byte 102,15,56,220,209 movups 112(%rcx),%xmm1 .byte 102,15,56,220,208 -.Laesenclast14: +.Laesenclast9: .byte 102,15,56,221,209 movups 16-112(%rcx),%xmm0 decq %rdx @@ -3030,4 +1707,5 @@ aesni_cbc_sha1_enc_shaext: movdqu %xmm8,(%r9) movd %xmm9,16(%r9) .byte 0xf3,0xc3 +.cfi_endproc .size aesni_cbc_sha1_enc_shaext,.-aesni_cbc_sha1_enc_shaext diff --git a/secure/lib/libcrypto/amd64/aesni-sha256-x86_64.S b/secure/lib/libcrypto/amd64/aesni-sha256-x86_64.S index e013190f8727..e42a02ebe647 100644 --- a/secure/lib/libcrypto/amd64/aesni-sha256-x86_64.S +++ b/secure/lib/libcrypto/amd64/aesni-sha256-x86_64.S @@ -7,31 +7,14 @@ .type aesni_cbc_sha256_enc,@function .align 16 aesni_cbc_sha256_enc: - leaq OPENSSL_ia32cap_P(%rip),%r11 - movl $1,%eax - cmpq $0,%rdi - je .Lprobe - movl 0(%r11),%eax - movq 4(%r11),%r10 - btq $61,%r10 - jc aesni_cbc_sha256_enc_shaext - movq %r10,%r11 - shrq $32,%r11 - - testl $2048,%r10d - jnz aesni_cbc_sha256_enc_xop - andl $296,%r11d - cmpl $296,%r11d - je 
aesni_cbc_sha256_enc_avx2 - andl $268435456,%r10d - jnz aesni_cbc_sha256_enc_avx - ud2 +.cfi_startproc xorl %eax,%eax cmpq $0,%rdi je .Lprobe ud2 .Lprobe: .byte 0xf3,0xc3 +.cfi_endproc .size aesni_cbc_sha256_enc,.-aesni_cbc_sha256_enc .align 64 @@ -76,4336 +59,3 @@ K256: .long 0,0,0,0, 0,0,0,0 .byte 65,69,83,78,73,45,67,66,67,43,83,72,65,50,53,54,32,115,116,105,116,99,104,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 .align 64 -.type aesni_cbc_sha256_enc_xop,@function -.align 64 -aesni_cbc_sha256_enc_xop: -.cfi_startproc -.Lxop_shortcut: - movq 8(%rsp),%r10 - movq %rsp,%rax -.cfi_def_cfa_register %rax - pushq %rbx -.cfi_offset %rbx,-16 - pushq %rbp -.cfi_offset %rbp,-24 - pushq %r12 -.cfi_offset %r12,-32 - pushq %r13 -.cfi_offset %r13,-40 - pushq %r14 -.cfi_offset %r14,-48 - pushq %r15 -.cfi_offset %r15,-56 - subq $128,%rsp - andq $-64,%rsp - - shlq $6,%rdx - subq %rdi,%rsi - subq %rdi,%r10 - addq %rdi,%rdx - - - movq %rsi,64+8(%rsp) - movq %rdx,64+16(%rsp) - - movq %r8,64+32(%rsp) - movq %r9,64+40(%rsp) - movq %r10,64+48(%rsp) - movq %rax,120(%rsp) -.cfi_escape 0x0f,0x06,0x77,0xf8,0x00,0x06,0x23,0x08 -.Lprologue_xop: - vzeroall - - movq %rdi,%r12 - leaq 128(%rcx),%rdi - leaq K256+544(%rip),%r13 - movl 240-128(%rdi),%r14d - movq %r9,%r15 - movq %r10,%rsi - vmovdqu (%r8),%xmm8 - subq $9,%r14 - - movl 0(%r15),%eax - movl 4(%r15),%ebx - movl 8(%r15),%ecx - movl 12(%r15),%edx - movl 16(%r15),%r8d - movl 20(%r15),%r9d - movl 24(%r15),%r10d - movl 28(%r15),%r11d - - vmovdqa 0(%r13,%r14,8),%xmm14 - vmovdqa 16(%r13,%r14,8),%xmm13 - vmovdqa 32(%r13,%r14,8),%xmm12 - vmovdqu 0-128(%rdi),%xmm10 - jmp .Lloop_xop -.align 16 -.Lloop_xop: - vmovdqa K256+512(%rip),%xmm7 - vmovdqu 0(%rsi,%r12,1),%xmm0 - vmovdqu 16(%rsi,%r12,1),%xmm1 - vmovdqu 32(%rsi,%r12,1),%xmm2 - vmovdqu 48(%rsi,%r12,1),%xmm3 - vpshufb %xmm7,%xmm0,%xmm0 - leaq K256(%rip),%rbp - vpshufb %xmm7,%xmm1,%xmm1 - vpshufb %xmm7,%xmm2,%xmm2 - vpaddd 0(%rbp),%xmm0,%xmm4 - vpshufb %xmm7,%xmm3,%xmm3 - vpaddd 32(%rbp),%xmm1,%xmm5 - vpaddd 64(%rbp),%xmm2,%xmm6 - vpaddd 96(%rbp),%xmm3,%xmm7 - vmovdqa %xmm4,0(%rsp) - movl %eax,%r14d - vmovdqa %xmm5,16(%rsp) - movl %ebx,%esi - vmovdqa %xmm6,32(%rsp) - xorl %ecx,%esi - vmovdqa %xmm7,48(%rsp) - movl %r8d,%r13d - jmp .Lxop_00_47 - -.align 16 -.Lxop_00_47: - subq $-32*4,%rbp - vmovdqu (%r12),%xmm9 - movq %r12,64+0(%rsp) - vpalignr $4,%xmm0,%xmm1,%xmm4 - rorl $14,%r13d - movl %r14d,%eax - vpalignr $4,%xmm2,%xmm3,%xmm7 - movl %r9d,%r12d - xorl %r8d,%r13d -.byte 143,232,120,194,236,14 - rorl $9,%r14d - xorl %r10d,%r12d - vpsrld $3,%xmm4,%xmm4 - rorl $5,%r13d - xorl %eax,%r14d - vpaddd %xmm7,%xmm0,%xmm0 - andl %r8d,%r12d - vpxor %xmm10,%xmm9,%xmm9 - vmovdqu 16-128(%rdi),%xmm10 - xorl %r8d,%r13d - addl 0(%rsp),%r11d - movl %eax,%r15d -.byte 143,232,120,194,245,11 - rorl $11,%r14d - xorl %r10d,%r12d - vpxor %xmm5,%xmm4,%xmm4 - xorl %ebx,%r15d - rorl $6,%r13d - addl %r12d,%r11d - andl %r15d,%esi -.byte 143,232,120,194,251,13 - xorl %eax,%r14d - addl %r13d,%r11d - vpxor %xmm6,%xmm4,%xmm4 - xorl %ebx,%esi - addl %r11d,%edx - vpsrld $10,%xmm3,%xmm6 - rorl $2,%r14d - addl %esi,%r11d - vpaddd %xmm4,%xmm0,%xmm0 - movl %edx,%r13d - addl %r11d,%r14d -.byte 143,232,120,194,239,2 - rorl $14,%r13d - movl %r14d,%r11d - vpxor %xmm6,%xmm7,%xmm7 - movl %r8d,%r12d - xorl %edx,%r13d - rorl $9,%r14d - xorl %r9d,%r12d - vpxor %xmm5,%xmm7,%xmm7 - rorl $5,%r13d - xorl %r11d,%r14d - andl %edx,%r12d - vpxor %xmm8,%xmm9,%xmm9 - 
xorl %edx,%r13d - vpsrldq $8,%xmm7,%xmm7 - addl 4(%rsp),%r10d - movl %r11d,%esi - rorl $11,%r14d - xorl %r9d,%r12d - vpaddd %xmm7,%xmm0,%xmm0 - xorl %eax,%esi - rorl $6,%r13d - addl %r12d,%r10d - andl %esi,%r15d -.byte 143,232,120,194,248,13 - xorl %r11d,%r14d - addl %r13d,%r10d - vpsrld $10,%xmm0,%xmm6 - xorl %eax,%r15d - addl %r10d,%ecx -.byte 143,232,120,194,239,2 - rorl $2,%r14d - addl %r15d,%r10d - vpxor %xmm6,%xmm7,%xmm7 - movl %ecx,%r13d - addl %r10d,%r14d - rorl $14,%r13d - movl %r14d,%r10d - vpxor %xmm5,%xmm7,%xmm7 - movl %edx,%r12d - xorl %ecx,%r13d - rorl $9,%r14d - xorl %r8d,%r12d - vpslldq $8,%xmm7,%xmm7 - rorl $5,%r13d - xorl %r10d,%r14d - andl %ecx,%r12d - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 32-128(%rdi),%xmm10 - xorl %ecx,%r13d - vpaddd %xmm7,%xmm0,%xmm0 - addl 8(%rsp),%r9d - movl %r10d,%r15d - rorl $11,%r14d - xorl %r8d,%r12d - vpaddd 0(%rbp),%xmm0,%xmm6 - xorl %r11d,%r15d - rorl $6,%r13d - addl %r12d,%r9d - andl %r15d,%esi - xorl %r10d,%r14d - addl %r13d,%r9d - xorl %r11d,%esi - addl %r9d,%ebx - rorl $2,%r14d - addl %esi,%r9d - movl %ebx,%r13d - addl %r9d,%r14d - rorl $14,%r13d - movl %r14d,%r9d - movl %ecx,%r12d - xorl %ebx,%r13d - rorl $9,%r14d - xorl %edx,%r12d - rorl $5,%r13d - xorl %r9d,%r14d - andl %ebx,%r12d - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 48-128(%rdi),%xmm10 - xorl %ebx,%r13d - addl 12(%rsp),%r8d - movl %r9d,%esi - rorl $11,%r14d - xorl %edx,%r12d - xorl %r10d,%esi - rorl $6,%r13d - addl %r12d,%r8d - andl %esi,%r15d - xorl %r9d,%r14d - addl %r13d,%r8d - xorl %r10d,%r15d - addl %r8d,%eax - rorl $2,%r14d - addl %r15d,%r8d - movl %eax,%r13d - addl %r8d,%r14d - vmovdqa %xmm6,0(%rsp) - vpalignr $4,%xmm1,%xmm2,%xmm4 - rorl $14,%r13d - movl %r14d,%r8d - vpalignr $4,%xmm3,%xmm0,%xmm7 - movl %ebx,%r12d - xorl %eax,%r13d -.byte 143,232,120,194,236,14 - rorl $9,%r14d - xorl %ecx,%r12d - vpsrld $3,%xmm4,%xmm4 - rorl $5,%r13d - xorl %r8d,%r14d - vpaddd %xmm7,%xmm1,%xmm1 - andl %eax,%r12d - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 64-128(%rdi),%xmm10 - xorl %eax,%r13d - addl 16(%rsp),%edx - movl %r8d,%r15d -.byte 143,232,120,194,245,11 - rorl $11,%r14d - xorl %ecx,%r12d - vpxor %xmm5,%xmm4,%xmm4 - xorl %r9d,%r15d - rorl $6,%r13d - addl %r12d,%edx - andl %r15d,%esi -.byte 143,232,120,194,248,13 - xorl %r8d,%r14d - addl %r13d,%edx - vpxor %xmm6,%xmm4,%xmm4 - xorl %r9d,%esi - addl %edx,%r11d - vpsrld $10,%xmm0,%xmm6 - rorl $2,%r14d - addl %esi,%edx - vpaddd %xmm4,%xmm1,%xmm1 - movl %r11d,%r13d - addl %edx,%r14d -.byte 143,232,120,194,239,2 - rorl $14,%r13d - movl %r14d,%edx - vpxor %xmm6,%xmm7,%xmm7 - movl %eax,%r12d - xorl %r11d,%r13d - rorl $9,%r14d - xorl %ebx,%r12d - vpxor %xmm5,%xmm7,%xmm7 - rorl $5,%r13d - xorl %edx,%r14d - andl %r11d,%r12d - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 80-128(%rdi),%xmm10 - xorl %r11d,%r13d - vpsrldq $8,%xmm7,%xmm7 - addl 20(%rsp),%ecx - movl %edx,%esi - rorl $11,%r14d - xorl %ebx,%r12d - vpaddd %xmm7,%xmm1,%xmm1 - xorl %r8d,%esi - rorl $6,%r13d - addl %r12d,%ecx - andl %esi,%r15d -.byte 143,232,120,194,249,13 - xorl %edx,%r14d - addl %r13d,%ecx - vpsrld $10,%xmm1,%xmm6 - xorl %r8d,%r15d - addl %ecx,%r10d -.byte 143,232,120,194,239,2 - rorl $2,%r14d - addl %r15d,%ecx - vpxor %xmm6,%xmm7,%xmm7 - movl %r10d,%r13d - addl %ecx,%r14d - rorl $14,%r13d - movl %r14d,%ecx - vpxor %xmm5,%xmm7,%xmm7 - movl %r11d,%r12d - xorl %r10d,%r13d - rorl $9,%r14d - xorl %eax,%r12d - vpslldq $8,%xmm7,%xmm7 - rorl $5,%r13d - xorl %ecx,%r14d - andl %r10d,%r12d - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 96-128(%rdi),%xmm10 - xorl %r10d,%r13d - vpaddd %xmm7,%xmm1,%xmm1 - 
addl 24(%rsp),%ebx - movl %ecx,%r15d - rorl $11,%r14d - xorl %eax,%r12d - vpaddd 32(%rbp),%xmm1,%xmm6 - xorl %edx,%r15d - rorl $6,%r13d - addl %r12d,%ebx - andl %r15d,%esi - xorl %ecx,%r14d - addl %r13d,%ebx - xorl %edx,%esi - addl %ebx,%r9d - rorl $2,%r14d - addl %esi,%ebx - movl %r9d,%r13d - addl %ebx,%r14d - rorl $14,%r13d - movl %r14d,%ebx - movl %r10d,%r12d - xorl %r9d,%r13d - rorl $9,%r14d - xorl %r11d,%r12d - rorl $5,%r13d - xorl %ebx,%r14d - andl %r9d,%r12d - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 112-128(%rdi),%xmm10 - xorl %r9d,%r13d - addl 28(%rsp),%eax - movl %ebx,%esi - rorl $11,%r14d - xorl %r11d,%r12d - xorl %ecx,%esi - rorl $6,%r13d - addl %r12d,%eax - andl %esi,%r15d - xorl %ebx,%r14d - addl %r13d,%eax - xorl %ecx,%r15d - addl %eax,%r8d - rorl $2,%r14d - addl %r15d,%eax - movl %r8d,%r13d - addl %eax,%r14d - vmovdqa %xmm6,16(%rsp) - vpalignr $4,%xmm2,%xmm3,%xmm4 - rorl $14,%r13d - movl %r14d,%eax - vpalignr $4,%xmm0,%xmm1,%xmm7 - movl %r9d,%r12d - xorl %r8d,%r13d -.byte 143,232,120,194,236,14 - rorl $9,%r14d - xorl %r10d,%r12d - vpsrld $3,%xmm4,%xmm4 - rorl $5,%r13d - xorl %eax,%r14d - vpaddd %xmm7,%xmm2,%xmm2 - andl %r8d,%r12d - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 128-128(%rdi),%xmm10 - xorl %r8d,%r13d - addl 32(%rsp),%r11d - movl %eax,%r15d -.byte 143,232,120,194,245,11 - rorl $11,%r14d - xorl %r10d,%r12d - vpxor %xmm5,%xmm4,%xmm4 - xorl %ebx,%r15d - rorl $6,%r13d - addl %r12d,%r11d - andl %r15d,%esi -.byte 143,232,120,194,249,13 - xorl %eax,%r14d - addl %r13d,%r11d - vpxor %xmm6,%xmm4,%xmm4 - xorl %ebx,%esi - addl %r11d,%edx - vpsrld $10,%xmm1,%xmm6 - rorl $2,%r14d - addl %esi,%r11d - vpaddd %xmm4,%xmm2,%xmm2 - movl %edx,%r13d - addl %r11d,%r14d -.byte 143,232,120,194,239,2 - rorl $14,%r13d - movl %r14d,%r11d - vpxor %xmm6,%xmm7,%xmm7 - movl %r8d,%r12d - xorl %edx,%r13d - rorl $9,%r14d - xorl %r9d,%r12d - vpxor %xmm5,%xmm7,%xmm7 - rorl $5,%r13d - xorl %r11d,%r14d - andl %edx,%r12d - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 144-128(%rdi),%xmm10 - xorl %edx,%r13d - vpsrldq $8,%xmm7,%xmm7 - addl 36(%rsp),%r10d - movl %r11d,%esi - rorl $11,%r14d - xorl %r9d,%r12d - vpaddd %xmm7,%xmm2,%xmm2 - xorl %eax,%esi - rorl $6,%r13d - addl %r12d,%r10d - andl %esi,%r15d -.byte 143,232,120,194,250,13 - xorl %r11d,%r14d - addl %r13d,%r10d - vpsrld $10,%xmm2,%xmm6 - xorl %eax,%r15d - addl %r10d,%ecx -.byte 143,232,120,194,239,2 - rorl $2,%r14d - addl %r15d,%r10d - vpxor %xmm6,%xmm7,%xmm7 - movl %ecx,%r13d - addl %r10d,%r14d - rorl $14,%r13d - movl %r14d,%r10d - vpxor %xmm5,%xmm7,%xmm7 - movl %edx,%r12d - xorl %ecx,%r13d - rorl $9,%r14d - xorl %r8d,%r12d - vpslldq $8,%xmm7,%xmm7 - rorl $5,%r13d - xorl %r10d,%r14d - andl %ecx,%r12d - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 160-128(%rdi),%xmm10 - xorl %ecx,%r13d - vpaddd %xmm7,%xmm2,%xmm2 - addl 40(%rsp),%r9d - movl %r10d,%r15d - rorl $11,%r14d - xorl %r8d,%r12d - vpaddd 64(%rbp),%xmm2,%xmm6 - xorl %r11d,%r15d - rorl $6,%r13d - addl %r12d,%r9d - andl %r15d,%esi - xorl %r10d,%r14d - addl %r13d,%r9d - xorl %r11d,%esi - addl %r9d,%ebx - rorl $2,%r14d - addl %esi,%r9d - movl %ebx,%r13d - addl %r9d,%r14d - rorl $14,%r13d - movl %r14d,%r9d - movl %ecx,%r12d - xorl %ebx,%r13d - rorl $9,%r14d - xorl %edx,%r12d - rorl $5,%r13d - xorl %r9d,%r14d - andl %ebx,%r12d - vaesenclast %xmm10,%xmm9,%xmm11 - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 176-128(%rdi),%xmm10 - xorl %ebx,%r13d - addl 44(%rsp),%r8d - movl %r9d,%esi - rorl $11,%r14d - xorl %edx,%r12d - xorl %r10d,%esi - rorl $6,%r13d - addl %r12d,%r8d - andl %esi,%r15d - xorl %r9d,%r14d - addl %r13d,%r8d - xorl 
%r10d,%r15d - addl %r8d,%eax - rorl $2,%r14d - addl %r15d,%r8d - movl %eax,%r13d - addl %r8d,%r14d - vmovdqa %xmm6,32(%rsp) - vpalignr $4,%xmm3,%xmm0,%xmm4 - rorl $14,%r13d - movl %r14d,%r8d - vpalignr $4,%xmm1,%xmm2,%xmm7 - movl %ebx,%r12d - xorl %eax,%r13d -.byte 143,232,120,194,236,14 - rorl $9,%r14d - xorl %ecx,%r12d - vpsrld $3,%xmm4,%xmm4 - rorl $5,%r13d - xorl %r8d,%r14d - vpaddd %xmm7,%xmm3,%xmm3 - andl %eax,%r12d - vpand %xmm12,%xmm11,%xmm8 - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 192-128(%rdi),%xmm10 - xorl %eax,%r13d - addl 48(%rsp),%edx - movl %r8d,%r15d -.byte 143,232,120,194,245,11 - rorl $11,%r14d - xorl %ecx,%r12d - vpxor %xmm5,%xmm4,%xmm4 - xorl %r9d,%r15d - rorl $6,%r13d - addl %r12d,%edx - andl %r15d,%esi -.byte 143,232,120,194,250,13 - xorl %r8d,%r14d - addl %r13d,%edx - vpxor %xmm6,%xmm4,%xmm4 - xorl %r9d,%esi - addl %edx,%r11d - vpsrld $10,%xmm2,%xmm6 - rorl $2,%r14d - addl %esi,%edx - vpaddd %xmm4,%xmm3,%xmm3 - movl %r11d,%r13d - addl %edx,%r14d -.byte 143,232,120,194,239,2 - rorl $14,%r13d - movl %r14d,%edx - vpxor %xmm6,%xmm7,%xmm7 - movl %eax,%r12d - xorl %r11d,%r13d - rorl $9,%r14d - xorl %ebx,%r12d - vpxor %xmm5,%xmm7,%xmm7 - rorl $5,%r13d - xorl %edx,%r14d - andl %r11d,%r12d - vaesenclast %xmm10,%xmm9,%xmm11 - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 208-128(%rdi),%xmm10 - xorl %r11d,%r13d - vpsrldq $8,%xmm7,%xmm7 - addl 52(%rsp),%ecx - movl %edx,%esi - rorl $11,%r14d - xorl %ebx,%r12d - vpaddd %xmm7,%xmm3,%xmm3 - xorl %r8d,%esi - rorl $6,%r13d - addl %r12d,%ecx - andl %esi,%r15d -.byte 143,232,120,194,251,13 - xorl %edx,%r14d - addl %r13d,%ecx - vpsrld $10,%xmm3,%xmm6 - xorl %r8d,%r15d - addl %ecx,%r10d -.byte 143,232,120,194,239,2 - rorl $2,%r14d - addl %r15d,%ecx - vpxor %xmm6,%xmm7,%xmm7 - movl %r10d,%r13d - addl %ecx,%r14d - rorl $14,%r13d - movl %r14d,%ecx - vpxor %xmm5,%xmm7,%xmm7 - movl %r11d,%r12d - xorl %r10d,%r13d - rorl $9,%r14d - xorl %eax,%r12d - vpslldq $8,%xmm7,%xmm7 - rorl $5,%r13d - xorl %ecx,%r14d - andl %r10d,%r12d - vpand %xmm13,%xmm11,%xmm11 - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 224-128(%rdi),%xmm10 - xorl %r10d,%r13d - vpaddd %xmm7,%xmm3,%xmm3 - addl 56(%rsp),%ebx - movl %ecx,%r15d - rorl $11,%r14d - xorl %eax,%r12d - vpaddd 96(%rbp),%xmm3,%xmm6 - xorl %edx,%r15d - rorl $6,%r13d - addl %r12d,%ebx - andl %r15d,%esi - xorl %ecx,%r14d - addl %r13d,%ebx - xorl %edx,%esi - addl %ebx,%r9d - rorl $2,%r14d - addl %esi,%ebx - movl %r9d,%r13d - addl %ebx,%r14d - rorl $14,%r13d - movl %r14d,%ebx - movl %r10d,%r12d - xorl %r9d,%r13d - rorl $9,%r14d - xorl %r11d,%r12d - rorl $5,%r13d - xorl %ebx,%r14d - andl %r9d,%r12d - vpor %xmm11,%xmm8,%xmm8 - vaesenclast %xmm10,%xmm9,%xmm11 - vmovdqu 0-128(%rdi),%xmm10 - xorl %r9d,%r13d - addl 60(%rsp),%eax - movl %ebx,%esi - rorl $11,%r14d - xorl %r11d,%r12d - xorl %ecx,%esi - rorl $6,%r13d - addl %r12d,%eax - andl %esi,%r15d - xorl %ebx,%r14d - addl %r13d,%eax - xorl %ecx,%r15d - addl %eax,%r8d - rorl $2,%r14d - addl %r15d,%eax - movl %r8d,%r13d - addl %eax,%r14d - vmovdqa %xmm6,48(%rsp) - movq 64+0(%rsp),%r12 - vpand %xmm14,%xmm11,%xmm11 - movq 64+8(%rsp),%r15 - vpor %xmm11,%xmm8,%xmm8 - vmovdqu %xmm8,(%r15,%r12,1) - leaq 16(%r12),%r12 - cmpb $0,131(%rbp) - jne .Lxop_00_47 - vmovdqu (%r12),%xmm9 - movq %r12,64+0(%rsp) - rorl $14,%r13d - movl %r14d,%eax - movl %r9d,%r12d - xorl %r8d,%r13d - rorl $9,%r14d - xorl %r10d,%r12d - rorl $5,%r13d - xorl %eax,%r14d - andl %r8d,%r12d - vpxor %xmm10,%xmm9,%xmm9 - vmovdqu 16-128(%rdi),%xmm10 - xorl %r8d,%r13d - addl 0(%rsp),%r11d - movl %eax,%r15d - rorl $11,%r14d - xorl 
%r10d,%r12d - xorl %ebx,%r15d - rorl $6,%r13d - addl %r12d,%r11d - andl %r15d,%esi - xorl %eax,%r14d - addl %r13d,%r11d - xorl %ebx,%esi - addl %r11d,%edx - rorl $2,%r14d - addl %esi,%r11d - movl %edx,%r13d - addl %r11d,%r14d - rorl $14,%r13d - movl %r14d,%r11d - movl %r8d,%r12d - xorl %edx,%r13d - rorl $9,%r14d - xorl %r9d,%r12d - rorl $5,%r13d - xorl %r11d,%r14d - andl %edx,%r12d - vpxor %xmm8,%xmm9,%xmm9 - xorl %edx,%r13d - addl 4(%rsp),%r10d - movl %r11d,%esi - rorl $11,%r14d - xorl %r9d,%r12d - xorl %eax,%esi - rorl $6,%r13d - addl %r12d,%r10d - andl %esi,%r15d - xorl %r11d,%r14d - addl %r13d,%r10d - xorl %eax,%r15d - addl %r10d,%ecx - rorl $2,%r14d - addl %r15d,%r10d - movl %ecx,%r13d - addl %r10d,%r14d - rorl $14,%r13d - movl %r14d,%r10d - movl %edx,%r12d - xorl %ecx,%r13d - rorl $9,%r14d - xorl %r8d,%r12d - rorl $5,%r13d - xorl %r10d,%r14d - andl %ecx,%r12d - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 32-128(%rdi),%xmm10 - xorl %ecx,%r13d - addl 8(%rsp),%r9d - movl %r10d,%r15d - rorl $11,%r14d - xorl %r8d,%r12d - xorl %r11d,%r15d - rorl $6,%r13d - addl %r12d,%r9d - andl %r15d,%esi - xorl %r10d,%r14d - addl %r13d,%r9d - xorl %r11d,%esi - addl %r9d,%ebx - rorl $2,%r14d - addl %esi,%r9d - movl %ebx,%r13d - addl %r9d,%r14d - rorl $14,%r13d - movl %r14d,%r9d - movl %ecx,%r12d - xorl %ebx,%r13d - rorl $9,%r14d - xorl %edx,%r12d - rorl $5,%r13d - xorl %r9d,%r14d - andl %ebx,%r12d - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 48-128(%rdi),%xmm10 - xorl %ebx,%r13d - addl 12(%rsp),%r8d - movl %r9d,%esi - rorl $11,%r14d - xorl %edx,%r12d - xorl %r10d,%esi - rorl $6,%r13d - addl %r12d,%r8d - andl %esi,%r15d - xorl %r9d,%r14d - addl %r13d,%r8d - xorl %r10d,%r15d - addl %r8d,%eax - rorl $2,%r14d - addl %r15d,%r8d - movl %eax,%r13d - addl %r8d,%r14d - rorl $14,%r13d - movl %r14d,%r8d - movl %ebx,%r12d - xorl %eax,%r13d - rorl $9,%r14d - xorl %ecx,%r12d - rorl $5,%r13d - xorl %r8d,%r14d - andl %eax,%r12d - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 64-128(%rdi),%xmm10 - xorl %eax,%r13d - addl 16(%rsp),%edx - movl %r8d,%r15d - rorl $11,%r14d - xorl %ecx,%r12d - xorl %r9d,%r15d - rorl $6,%r13d - addl %r12d,%edx - andl %r15d,%esi - xorl %r8d,%r14d - addl %r13d,%edx - xorl %r9d,%esi - addl %edx,%r11d - rorl $2,%r14d - addl %esi,%edx - movl %r11d,%r13d - addl %edx,%r14d - rorl $14,%r13d - movl %r14d,%edx - movl %eax,%r12d - xorl %r11d,%r13d - rorl $9,%r14d - xorl %ebx,%r12d - rorl $5,%r13d - xorl %edx,%r14d - andl %r11d,%r12d - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 80-128(%rdi),%xmm10 - xorl %r11d,%r13d - addl 20(%rsp),%ecx - movl %edx,%esi - rorl $11,%r14d - xorl %ebx,%r12d - xorl %r8d,%esi - rorl $6,%r13d - addl %r12d,%ecx - andl %esi,%r15d - xorl %edx,%r14d - addl %r13d,%ecx - xorl %r8d,%r15d - addl %ecx,%r10d - rorl $2,%r14d - addl %r15d,%ecx - movl %r10d,%r13d - addl %ecx,%r14d - rorl $14,%r13d - movl %r14d,%ecx - movl %r11d,%r12d - xorl %r10d,%r13d - rorl $9,%r14d - xorl %eax,%r12d - rorl $5,%r13d - xorl %ecx,%r14d - andl %r10d,%r12d - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 96-128(%rdi),%xmm10 - xorl %r10d,%r13d - addl 24(%rsp),%ebx - movl %ecx,%r15d - rorl $11,%r14d - xorl %eax,%r12d - xorl %edx,%r15d - rorl $6,%r13d - addl %r12d,%ebx - andl %r15d,%esi - xorl %ecx,%r14d - addl %r13d,%ebx - xorl %edx,%esi - addl %ebx,%r9d - rorl $2,%r14d - addl %esi,%ebx - movl %r9d,%r13d - addl %ebx,%r14d - rorl $14,%r13d - movl %r14d,%ebx - movl %r10d,%r12d - xorl %r9d,%r13d - rorl $9,%r14d - xorl %r11d,%r12d - rorl $5,%r13d - xorl %ebx,%r14d - andl %r9d,%r12d - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 112-128(%rdi),%xmm10 - xorl 
%r9d,%r13d - addl 28(%rsp),%eax - movl %ebx,%esi - rorl $11,%r14d - xorl %r11d,%r12d - xorl %ecx,%esi - rorl $6,%r13d - addl %r12d,%eax - andl %esi,%r15d - xorl %ebx,%r14d - addl %r13d,%eax - xorl %ecx,%r15d - addl %eax,%r8d - rorl $2,%r14d - addl %r15d,%eax - movl %r8d,%r13d - addl %eax,%r14d - rorl $14,%r13d - movl %r14d,%eax - movl %r9d,%r12d - xorl %r8d,%r13d - rorl $9,%r14d - xorl %r10d,%r12d - rorl $5,%r13d - xorl %eax,%r14d - andl %r8d,%r12d - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 128-128(%rdi),%xmm10 - xorl %r8d,%r13d - addl 32(%rsp),%r11d - movl %eax,%r15d - rorl $11,%r14d - xorl %r10d,%r12d - xorl %ebx,%r15d - rorl $6,%r13d - addl %r12d,%r11d - andl %r15d,%esi - xorl %eax,%r14d - addl %r13d,%r11d - xorl %ebx,%esi - addl %r11d,%edx - rorl $2,%r14d - addl %esi,%r11d - movl %edx,%r13d - addl %r11d,%r14d - rorl $14,%r13d - movl %r14d,%r11d - movl %r8d,%r12d - xorl %edx,%r13d - rorl $9,%r14d - xorl %r9d,%r12d - rorl $5,%r13d - xorl %r11d,%r14d - andl %edx,%r12d - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 144-128(%rdi),%xmm10 - xorl %edx,%r13d - addl 36(%rsp),%r10d - movl %r11d,%esi - rorl $11,%r14d - xorl %r9d,%r12d - xorl %eax,%esi - rorl $6,%r13d - addl %r12d,%r10d - andl %esi,%r15d - xorl %r11d,%r14d - addl %r13d,%r10d - xorl %eax,%r15d - addl %r10d,%ecx - rorl $2,%r14d - addl %r15d,%r10d - movl %ecx,%r13d - addl %r10d,%r14d - rorl $14,%r13d - movl %r14d,%r10d - movl %edx,%r12d - xorl %ecx,%r13d - rorl $9,%r14d - xorl %r8d,%r12d - rorl $5,%r13d - xorl %r10d,%r14d - andl %ecx,%r12d - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 160-128(%rdi),%xmm10 - xorl %ecx,%r13d - addl 40(%rsp),%r9d - movl %r10d,%r15d - rorl $11,%r14d - xorl %r8d,%r12d - xorl %r11d,%r15d - rorl $6,%r13d - addl %r12d,%r9d - andl %r15d,%esi - xorl %r10d,%r14d - addl %r13d,%r9d - xorl %r11d,%esi - addl %r9d,%ebx - rorl $2,%r14d - addl %esi,%r9d - movl %ebx,%r13d - addl %r9d,%r14d - rorl $14,%r13d - movl %r14d,%r9d - movl %ecx,%r12d - xorl %ebx,%r13d - rorl $9,%r14d - xorl %edx,%r12d - rorl $5,%r13d - xorl %r9d,%r14d - andl %ebx,%r12d - vaesenclast %xmm10,%xmm9,%xmm11 - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 176-128(%rdi),%xmm10 - xorl %ebx,%r13d - addl 44(%rsp),%r8d - movl %r9d,%esi - rorl $11,%r14d - xorl %edx,%r12d - xorl %r10d,%esi - rorl $6,%r13d - addl %r12d,%r8d - andl %esi,%r15d - xorl %r9d,%r14d - addl %r13d,%r8d - xorl %r10d,%r15d - addl %r8d,%eax - rorl $2,%r14d - addl %r15d,%r8d - movl %eax,%r13d - addl %r8d,%r14d - rorl $14,%r13d - movl %r14d,%r8d - movl %ebx,%r12d - xorl %eax,%r13d - rorl $9,%r14d - xorl %ecx,%r12d - rorl $5,%r13d - xorl %r8d,%r14d - andl %eax,%r12d - vpand %xmm12,%xmm11,%xmm8 - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 192-128(%rdi),%xmm10 - xorl %eax,%r13d - addl 48(%rsp),%edx - movl %r8d,%r15d - rorl $11,%r14d - xorl %ecx,%r12d - xorl %r9d,%r15d - rorl $6,%r13d - addl %r12d,%edx - andl %r15d,%esi - xorl %r8d,%r14d - addl %r13d,%edx - xorl %r9d,%esi - addl %edx,%r11d - rorl $2,%r14d - addl %esi,%edx - movl %r11d,%r13d - addl %edx,%r14d - rorl $14,%r13d - movl %r14d,%edx - movl %eax,%r12d - xorl %r11d,%r13d - rorl $9,%r14d - xorl %ebx,%r12d - rorl $5,%r13d - xorl %edx,%r14d - andl %r11d,%r12d - vaesenclast %xmm10,%xmm9,%xmm11 - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 208-128(%rdi),%xmm10 - xorl %r11d,%r13d - addl 52(%rsp),%ecx - movl %edx,%esi - rorl $11,%r14d - xorl %ebx,%r12d - xorl %r8d,%esi - rorl $6,%r13d - addl %r12d,%ecx - andl %esi,%r15d - xorl %edx,%r14d - addl %r13d,%ecx - xorl %r8d,%r15d - addl %ecx,%r10d - rorl $2,%r14d - addl %r15d,%ecx - movl %r10d,%r13d - addl %ecx,%r14d - rorl $14,%r13d - 
movl %r14d,%ecx - movl %r11d,%r12d - xorl %r10d,%r13d - rorl $9,%r14d - xorl %eax,%r12d - rorl $5,%r13d - xorl %ecx,%r14d - andl %r10d,%r12d - vpand %xmm13,%xmm11,%xmm11 - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 224-128(%rdi),%xmm10 - xorl %r10d,%r13d - addl 56(%rsp),%ebx - movl %ecx,%r15d - rorl $11,%r14d - xorl %eax,%r12d - xorl %edx,%r15d - rorl $6,%r13d - addl %r12d,%ebx - andl %r15d,%esi - xorl %ecx,%r14d - addl %r13d,%ebx - xorl %edx,%esi - addl %ebx,%r9d - rorl $2,%r14d - addl %esi,%ebx - movl %r9d,%r13d - addl %ebx,%r14d - rorl $14,%r13d - movl %r14d,%ebx - movl %r10d,%r12d - xorl %r9d,%r13d - rorl $9,%r14d - xorl %r11d,%r12d - rorl $5,%r13d - xorl %ebx,%r14d - andl %r9d,%r12d - vpor %xmm11,%xmm8,%xmm8 - vaesenclast %xmm10,%xmm9,%xmm11 - vmovdqu 0-128(%rdi),%xmm10 - xorl %r9d,%r13d - addl 60(%rsp),%eax - movl %ebx,%esi - rorl $11,%r14d - xorl %r11d,%r12d - xorl %ecx,%esi - rorl $6,%r13d - addl %r12d,%eax - andl %esi,%r15d - xorl %ebx,%r14d - addl %r13d,%eax - xorl %ecx,%r15d - addl %eax,%r8d - rorl $2,%r14d - addl %r15d,%eax - movl %r8d,%r13d - addl %eax,%r14d - movq 64+0(%rsp),%r12 - movq 64+8(%rsp),%r13 - movq 64+40(%rsp),%r15 - movq 64+48(%rsp),%rsi - - vpand %xmm14,%xmm11,%xmm11 - movl %r14d,%eax - vpor %xmm11,%xmm8,%xmm8 - vmovdqu %xmm8,(%r12,%r13,1) - leaq 16(%r12),%r12 - - addl 0(%r15),%eax - addl 4(%r15),%ebx - addl 8(%r15),%ecx - addl 12(%r15),%edx - addl 16(%r15),%r8d - addl 20(%r15),%r9d - addl 24(%r15),%r10d - addl 28(%r15),%r11d - - cmpq 64+16(%rsp),%r12 - - movl %eax,0(%r15) - movl %ebx,4(%r15) - movl %ecx,8(%r15) - movl %edx,12(%r15) - movl %r8d,16(%r15) - movl %r9d,20(%r15) - movl %r10d,24(%r15) - movl %r11d,28(%r15) - - jb .Lloop_xop - - movq 64+32(%rsp),%r8 - movq 120(%rsp),%rsi -.cfi_def_cfa %rsi,8 - vmovdqu %xmm8,(%r8) - vzeroall - movq -48(%rsi),%r15 -.cfi_restore %r15 - movq -40(%rsi),%r14 -.cfi_restore %r14 - movq -32(%rsi),%r13 -.cfi_restore %r13 - movq -24(%rsi),%r12 -.cfi_restore %r12 - movq -16(%rsi),%rbp -.cfi_restore %rbp - movq -8(%rsi),%rbx -.cfi_restore %rbx - leaq (%rsi),%rsp -.cfi_def_cfa_register %rsp -.Lepilogue_xop: - .byte 0xf3,0xc3 -.cfi_endproc -.size aesni_cbc_sha256_enc_xop,.-aesni_cbc_sha256_enc_xop -.type aesni_cbc_sha256_enc_avx,@function -.align 64 -aesni_cbc_sha256_enc_avx: -.cfi_startproc -.Lavx_shortcut: - movq 8(%rsp),%r10 - movq %rsp,%rax -.cfi_def_cfa_register %rax - pushq %rbx -.cfi_offset %rbx,-16 - pushq %rbp -.cfi_offset %rbp,-24 - pushq %r12 -.cfi_offset %r12,-32 - pushq %r13 -.cfi_offset %r13,-40 - pushq %r14 -.cfi_offset %r14,-48 - pushq %r15 -.cfi_offset %r15,-56 - subq $128,%rsp - andq $-64,%rsp - - shlq $6,%rdx - subq %rdi,%rsi - subq %rdi,%r10 - addq %rdi,%rdx - - - movq %rsi,64+8(%rsp) - movq %rdx,64+16(%rsp) - - movq %r8,64+32(%rsp) - movq %r9,64+40(%rsp) - movq %r10,64+48(%rsp) - movq %rax,120(%rsp) -.cfi_escape 0x0f,0x06,0x77,0xf8,0x00,0x06,0x23,0x08 -.Lprologue_avx: - vzeroall - - movq %rdi,%r12 - leaq 128(%rcx),%rdi - leaq K256+544(%rip),%r13 - movl 240-128(%rdi),%r14d - movq %r9,%r15 - movq %r10,%rsi - vmovdqu (%r8),%xmm8 - subq $9,%r14 - - movl 0(%r15),%eax - movl 4(%r15),%ebx - movl 8(%r15),%ecx - movl 12(%r15),%edx - movl 16(%r15),%r8d - movl 20(%r15),%r9d - movl 24(%r15),%r10d - movl 28(%r15),%r11d - - vmovdqa 0(%r13,%r14,8),%xmm14 - vmovdqa 16(%r13,%r14,8),%xmm13 - vmovdqa 32(%r13,%r14,8),%xmm12 - vmovdqu 0-128(%rdi),%xmm10 - jmp .Lloop_avx -.align 16 -.Lloop_avx: - vmovdqa K256+512(%rip),%xmm7 - vmovdqu 0(%rsi,%r12,1),%xmm0 - vmovdqu 16(%rsi,%r12,1),%xmm1 - vmovdqu 32(%rsi,%r12,1),%xmm2 - vmovdqu 
48(%rsi,%r12,1),%xmm3 - vpshufb %xmm7,%xmm0,%xmm0 - leaq K256(%rip),%rbp - vpshufb %xmm7,%xmm1,%xmm1 - vpshufb %xmm7,%xmm2,%xmm2 - vpaddd 0(%rbp),%xmm0,%xmm4 - vpshufb %xmm7,%xmm3,%xmm3 - vpaddd 32(%rbp),%xmm1,%xmm5 - vpaddd 64(%rbp),%xmm2,%xmm6 - vpaddd 96(%rbp),%xmm3,%xmm7 - vmovdqa %xmm4,0(%rsp) - movl %eax,%r14d - vmovdqa %xmm5,16(%rsp) - movl %ebx,%esi - vmovdqa %xmm6,32(%rsp) - xorl %ecx,%esi - vmovdqa %xmm7,48(%rsp) - movl %r8d,%r13d - jmp .Lavx_00_47 - -.align 16 -.Lavx_00_47: - subq $-32*4,%rbp - vmovdqu (%r12),%xmm9 - movq %r12,64+0(%rsp) - vpalignr $4,%xmm0,%xmm1,%xmm4 - shrdl $14,%r13d,%r13d - movl %r14d,%eax - movl %r9d,%r12d - vpalignr $4,%xmm2,%xmm3,%xmm7 - xorl %r8d,%r13d - shrdl $9,%r14d,%r14d - xorl %r10d,%r12d - vpsrld $7,%xmm4,%xmm6 - shrdl $5,%r13d,%r13d - xorl %eax,%r14d - andl %r8d,%r12d - vpaddd %xmm7,%xmm0,%xmm0 - vpxor %xmm10,%xmm9,%xmm9 - vmovdqu 16-128(%rdi),%xmm10 - xorl %r8d,%r13d - addl 0(%rsp),%r11d - movl %eax,%r15d - vpsrld $3,%xmm4,%xmm7 - shrdl $11,%r14d,%r14d - xorl %r10d,%r12d - xorl %ebx,%r15d - vpslld $14,%xmm4,%xmm5 - shrdl $6,%r13d,%r13d - addl %r12d,%r11d - andl %r15d,%esi - vpxor %xmm6,%xmm7,%xmm4 - xorl %eax,%r14d - addl %r13d,%r11d - xorl %ebx,%esi - vpshufd $250,%xmm3,%xmm7 - addl %r11d,%edx - shrdl $2,%r14d,%r14d - addl %esi,%r11d - vpsrld $11,%xmm6,%xmm6 - movl %edx,%r13d - addl %r11d,%r14d - shrdl $14,%r13d,%r13d - vpxor %xmm5,%xmm4,%xmm4 - movl %r14d,%r11d - movl %r8d,%r12d - xorl %edx,%r13d - vpslld $11,%xmm5,%xmm5 - shrdl $9,%r14d,%r14d - xorl %r9d,%r12d - shrdl $5,%r13d,%r13d - vpxor %xmm6,%xmm4,%xmm4 - xorl %r11d,%r14d - andl %edx,%r12d - vpxor %xmm8,%xmm9,%xmm9 - xorl %edx,%r13d - vpsrld $10,%xmm7,%xmm6 - addl 4(%rsp),%r10d - movl %r11d,%esi - shrdl $11,%r14d,%r14d - vpxor %xmm5,%xmm4,%xmm4 - xorl %r9d,%r12d - xorl %eax,%esi - shrdl $6,%r13d,%r13d - vpsrlq $17,%xmm7,%xmm7 - addl %r12d,%r10d - andl %esi,%r15d - xorl %r11d,%r14d - vpaddd %xmm4,%xmm0,%xmm0 - addl %r13d,%r10d - xorl %eax,%r15d - addl %r10d,%ecx - vpxor %xmm7,%xmm6,%xmm6 - shrdl $2,%r14d,%r14d - addl %r15d,%r10d - movl %ecx,%r13d - vpsrlq $2,%xmm7,%xmm7 - addl %r10d,%r14d - shrdl $14,%r13d,%r13d - movl %r14d,%r10d - vpxor %xmm7,%xmm6,%xmm6 - movl %edx,%r12d - xorl %ecx,%r13d - shrdl $9,%r14d,%r14d - vpshufd $132,%xmm6,%xmm6 - xorl %r8d,%r12d - shrdl $5,%r13d,%r13d - xorl %r10d,%r14d - vpsrldq $8,%xmm6,%xmm6 - andl %ecx,%r12d - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 32-128(%rdi),%xmm10 - xorl %ecx,%r13d - addl 8(%rsp),%r9d - vpaddd %xmm6,%xmm0,%xmm0 - movl %r10d,%r15d - shrdl $11,%r14d,%r14d - xorl %r8d,%r12d - vpshufd $80,%xmm0,%xmm7 - xorl %r11d,%r15d - shrdl $6,%r13d,%r13d - addl %r12d,%r9d - vpsrld $10,%xmm7,%xmm6 - andl %r15d,%esi - xorl %r10d,%r14d - addl %r13d,%r9d - vpsrlq $17,%xmm7,%xmm7 - xorl %r11d,%esi - addl %r9d,%ebx - shrdl $2,%r14d,%r14d - vpxor %xmm7,%xmm6,%xmm6 - addl %esi,%r9d - movl %ebx,%r13d - addl %r9d,%r14d - vpsrlq $2,%xmm7,%xmm7 - shrdl $14,%r13d,%r13d - movl %r14d,%r9d - movl %ecx,%r12d - vpxor %xmm7,%xmm6,%xmm6 - xorl %ebx,%r13d - shrdl $9,%r14d,%r14d - xorl %edx,%r12d - vpshufd $232,%xmm6,%xmm6 - shrdl $5,%r13d,%r13d - xorl %r9d,%r14d - andl %ebx,%r12d - vpslldq $8,%xmm6,%xmm6 - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 48-128(%rdi),%xmm10 - xorl %ebx,%r13d - addl 12(%rsp),%r8d - movl %r9d,%esi - vpaddd %xmm6,%xmm0,%xmm0 - shrdl $11,%r14d,%r14d - xorl %edx,%r12d - xorl %r10d,%esi - vpaddd 0(%rbp),%xmm0,%xmm6 - shrdl $6,%r13d,%r13d - addl %r12d,%r8d - andl %esi,%r15d - xorl %r9d,%r14d - addl %r13d,%r8d - xorl %r10d,%r15d - addl %r8d,%eax - shrdl 
$2,%r14d,%r14d - addl %r15d,%r8d - movl %eax,%r13d - addl %r8d,%r14d - vmovdqa %xmm6,0(%rsp) - vpalignr $4,%xmm1,%xmm2,%xmm4 - shrdl $14,%r13d,%r13d - movl %r14d,%r8d - movl %ebx,%r12d - vpalignr $4,%xmm3,%xmm0,%xmm7 - xorl %eax,%r13d - shrdl $9,%r14d,%r14d - xorl %ecx,%r12d - vpsrld $7,%xmm4,%xmm6 - shrdl $5,%r13d,%r13d - xorl %r8d,%r14d - andl %eax,%r12d - vpaddd %xmm7,%xmm1,%xmm1 - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 64-128(%rdi),%xmm10 - xorl %eax,%r13d - addl 16(%rsp),%edx - movl %r8d,%r15d - vpsrld $3,%xmm4,%xmm7 - shrdl $11,%r14d,%r14d - xorl %ecx,%r12d - xorl %r9d,%r15d - vpslld $14,%xmm4,%xmm5 - shrdl $6,%r13d,%r13d - addl %r12d,%edx - andl %r15d,%esi - vpxor %xmm6,%xmm7,%xmm4 - xorl %r8d,%r14d - addl %r13d,%edx - xorl %r9d,%esi - vpshufd $250,%xmm0,%xmm7 - addl %edx,%r11d - shrdl $2,%r14d,%r14d - addl %esi,%edx - vpsrld $11,%xmm6,%xmm6 - movl %r11d,%r13d - addl %edx,%r14d - shrdl $14,%r13d,%r13d - vpxor %xmm5,%xmm4,%xmm4 - movl %r14d,%edx - movl %eax,%r12d - xorl %r11d,%r13d - vpslld $11,%xmm5,%xmm5 - shrdl $9,%r14d,%r14d - xorl %ebx,%r12d - shrdl $5,%r13d,%r13d - vpxor %xmm6,%xmm4,%xmm4 - xorl %edx,%r14d - andl %r11d,%r12d - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 80-128(%rdi),%xmm10 - xorl %r11d,%r13d - vpsrld $10,%xmm7,%xmm6 - addl 20(%rsp),%ecx - movl %edx,%esi - shrdl $11,%r14d,%r14d - vpxor %xmm5,%xmm4,%xmm4 - xorl %ebx,%r12d - xorl %r8d,%esi - shrdl $6,%r13d,%r13d - vpsrlq $17,%xmm7,%xmm7 - addl %r12d,%ecx - andl %esi,%r15d - xorl %edx,%r14d - vpaddd %xmm4,%xmm1,%xmm1 - addl %r13d,%ecx - xorl %r8d,%r15d - addl %ecx,%r10d - vpxor %xmm7,%xmm6,%xmm6 - shrdl $2,%r14d,%r14d - addl %r15d,%ecx - movl %r10d,%r13d - vpsrlq $2,%xmm7,%xmm7 - addl %ecx,%r14d - shrdl $14,%r13d,%r13d - movl %r14d,%ecx - vpxor %xmm7,%xmm6,%xmm6 - movl %r11d,%r12d - xorl %r10d,%r13d - shrdl $9,%r14d,%r14d - vpshufd $132,%xmm6,%xmm6 - xorl %eax,%r12d - shrdl $5,%r13d,%r13d - xorl %ecx,%r14d - vpsrldq $8,%xmm6,%xmm6 - andl %r10d,%r12d - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 96-128(%rdi),%xmm10 - xorl %r10d,%r13d - addl 24(%rsp),%ebx - vpaddd %xmm6,%xmm1,%xmm1 - movl %ecx,%r15d - shrdl $11,%r14d,%r14d - xorl %eax,%r12d - vpshufd $80,%xmm1,%xmm7 - xorl %edx,%r15d - shrdl $6,%r13d,%r13d - addl %r12d,%ebx - vpsrld $10,%xmm7,%xmm6 - andl %r15d,%esi - xorl %ecx,%r14d - addl %r13d,%ebx - vpsrlq $17,%xmm7,%xmm7 - xorl %edx,%esi - addl %ebx,%r9d - shrdl $2,%r14d,%r14d - vpxor %xmm7,%xmm6,%xmm6 - addl %esi,%ebx - movl %r9d,%r13d - addl %ebx,%r14d - vpsrlq $2,%xmm7,%xmm7 - shrdl $14,%r13d,%r13d - movl %r14d,%ebx - movl %r10d,%r12d - vpxor %xmm7,%xmm6,%xmm6 - xorl %r9d,%r13d - shrdl $9,%r14d,%r14d - xorl %r11d,%r12d - vpshufd $232,%xmm6,%xmm6 - shrdl $5,%r13d,%r13d - xorl %ebx,%r14d - andl %r9d,%r12d - vpslldq $8,%xmm6,%xmm6 - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 112-128(%rdi),%xmm10 - xorl %r9d,%r13d - addl 28(%rsp),%eax - movl %ebx,%esi - vpaddd %xmm6,%xmm1,%xmm1 - shrdl $11,%r14d,%r14d - xorl %r11d,%r12d - xorl %ecx,%esi - vpaddd 32(%rbp),%xmm1,%xmm6 - shrdl $6,%r13d,%r13d - addl %r12d,%eax - andl %esi,%r15d - xorl %ebx,%r14d - addl %r13d,%eax - xorl %ecx,%r15d - addl %eax,%r8d - shrdl $2,%r14d,%r14d - addl %r15d,%eax - movl %r8d,%r13d - addl %eax,%r14d - vmovdqa %xmm6,16(%rsp) - vpalignr $4,%xmm2,%xmm3,%xmm4 - shrdl $14,%r13d,%r13d - movl %r14d,%eax - movl %r9d,%r12d - vpalignr $4,%xmm0,%xmm1,%xmm7 - xorl %r8d,%r13d - shrdl $9,%r14d,%r14d - xorl %r10d,%r12d - vpsrld $7,%xmm4,%xmm6 - shrdl $5,%r13d,%r13d - xorl %eax,%r14d - andl %r8d,%r12d - vpaddd %xmm7,%xmm2,%xmm2 - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 
128-128(%rdi),%xmm10 - xorl %r8d,%r13d - addl 32(%rsp),%r11d - movl %eax,%r15d - vpsrld $3,%xmm4,%xmm7 - shrdl $11,%r14d,%r14d - xorl %r10d,%r12d - xorl %ebx,%r15d - vpslld $14,%xmm4,%xmm5 - shrdl $6,%r13d,%r13d - addl %r12d,%r11d - andl %r15d,%esi - vpxor %xmm6,%xmm7,%xmm4 - xorl %eax,%r14d - addl %r13d,%r11d - xorl %ebx,%esi - vpshufd $250,%xmm1,%xmm7 - addl %r11d,%edx - shrdl $2,%r14d,%r14d - addl %esi,%r11d - vpsrld $11,%xmm6,%xmm6 - movl %edx,%r13d - addl %r11d,%r14d - shrdl $14,%r13d,%r13d - vpxor %xmm5,%xmm4,%xmm4 - movl %r14d,%r11d - movl %r8d,%r12d - xorl %edx,%r13d - vpslld $11,%xmm5,%xmm5 - shrdl $9,%r14d,%r14d - xorl %r9d,%r12d - shrdl $5,%r13d,%r13d - vpxor %xmm6,%xmm4,%xmm4 - xorl %r11d,%r14d - andl %edx,%r12d - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 144-128(%rdi),%xmm10 - xorl %edx,%r13d - vpsrld $10,%xmm7,%xmm6 - addl 36(%rsp),%r10d - movl %r11d,%esi - shrdl $11,%r14d,%r14d - vpxor %xmm5,%xmm4,%xmm4 - xorl %r9d,%r12d - xorl %eax,%esi - shrdl $6,%r13d,%r13d - vpsrlq $17,%xmm7,%xmm7 - addl %r12d,%r10d - andl %esi,%r15d - xorl %r11d,%r14d - vpaddd %xmm4,%xmm2,%xmm2 - addl %r13d,%r10d - xorl %eax,%r15d - addl %r10d,%ecx - vpxor %xmm7,%xmm6,%xmm6 - shrdl $2,%r14d,%r14d - addl %r15d,%r10d - movl %ecx,%r13d - vpsrlq $2,%xmm7,%xmm7 - addl %r10d,%r14d - shrdl $14,%r13d,%r13d - movl %r14d,%r10d - vpxor %xmm7,%xmm6,%xmm6 - movl %edx,%r12d - xorl %ecx,%r13d - shrdl $9,%r14d,%r14d - vpshufd $132,%xmm6,%xmm6 - xorl %r8d,%r12d - shrdl $5,%r13d,%r13d - xorl %r10d,%r14d - vpsrldq $8,%xmm6,%xmm6 - andl %ecx,%r12d - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 160-128(%rdi),%xmm10 - xorl %ecx,%r13d - addl 40(%rsp),%r9d - vpaddd %xmm6,%xmm2,%xmm2 - movl %r10d,%r15d - shrdl $11,%r14d,%r14d - xorl %r8d,%r12d - vpshufd $80,%xmm2,%xmm7 - xorl %r11d,%r15d - shrdl $6,%r13d,%r13d - addl %r12d,%r9d - vpsrld $10,%xmm7,%xmm6 - andl %r15d,%esi - xorl %r10d,%r14d - addl %r13d,%r9d - vpsrlq $17,%xmm7,%xmm7 - xorl %r11d,%esi - addl %r9d,%ebx - shrdl $2,%r14d,%r14d - vpxor %xmm7,%xmm6,%xmm6 - addl %esi,%r9d - movl %ebx,%r13d - addl %r9d,%r14d - vpsrlq $2,%xmm7,%xmm7 - shrdl $14,%r13d,%r13d - movl %r14d,%r9d - movl %ecx,%r12d - vpxor %xmm7,%xmm6,%xmm6 - xorl %ebx,%r13d - shrdl $9,%r14d,%r14d - xorl %edx,%r12d - vpshufd $232,%xmm6,%xmm6 - shrdl $5,%r13d,%r13d - xorl %r9d,%r14d - andl %ebx,%r12d - vpslldq $8,%xmm6,%xmm6 - vaesenclast %xmm10,%xmm9,%xmm11 - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 176-128(%rdi),%xmm10 - xorl %ebx,%r13d - addl 44(%rsp),%r8d - movl %r9d,%esi - vpaddd %xmm6,%xmm2,%xmm2 - shrdl $11,%r14d,%r14d - xorl %edx,%r12d - xorl %r10d,%esi - vpaddd 64(%rbp),%xmm2,%xmm6 - shrdl $6,%r13d,%r13d - addl %r12d,%r8d - andl %esi,%r15d - xorl %r9d,%r14d - addl %r13d,%r8d - xorl %r10d,%r15d - addl %r8d,%eax - shrdl $2,%r14d,%r14d - addl %r15d,%r8d - movl %eax,%r13d - addl %r8d,%r14d - vmovdqa %xmm6,32(%rsp) - vpalignr $4,%xmm3,%xmm0,%xmm4 - shrdl $14,%r13d,%r13d - movl %r14d,%r8d - movl %ebx,%r12d - vpalignr $4,%xmm1,%xmm2,%xmm7 - xorl %eax,%r13d - shrdl $9,%r14d,%r14d - xorl %ecx,%r12d - vpsrld $7,%xmm4,%xmm6 - shrdl $5,%r13d,%r13d - xorl %r8d,%r14d - andl %eax,%r12d - vpaddd %xmm7,%xmm3,%xmm3 - vpand %xmm12,%xmm11,%xmm8 - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 192-128(%rdi),%xmm10 - xorl %eax,%r13d - addl 48(%rsp),%edx - movl %r8d,%r15d - vpsrld $3,%xmm4,%xmm7 - shrdl $11,%r14d,%r14d - xorl %ecx,%r12d - xorl %r9d,%r15d - vpslld $14,%xmm4,%xmm5 - shrdl $6,%r13d,%r13d - addl %r12d,%edx - andl %r15d,%esi - vpxor %xmm6,%xmm7,%xmm4 - xorl %r8d,%r14d - addl %r13d,%edx - xorl %r9d,%esi - vpshufd $250,%xmm2,%xmm7 - addl 
%edx,%r11d - shrdl $2,%r14d,%r14d - addl %esi,%edx - vpsrld $11,%xmm6,%xmm6 - movl %r11d,%r13d - addl %edx,%r14d - shrdl $14,%r13d,%r13d - vpxor %xmm5,%xmm4,%xmm4 - movl %r14d,%edx - movl %eax,%r12d - xorl %r11d,%r13d - vpslld $11,%xmm5,%xmm5 - shrdl $9,%r14d,%r14d - xorl %ebx,%r12d - shrdl $5,%r13d,%r13d - vpxor %xmm6,%xmm4,%xmm4 - xorl %edx,%r14d - andl %r11d,%r12d - vaesenclast %xmm10,%xmm9,%xmm11 - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 208-128(%rdi),%xmm10 - xorl %r11d,%r13d - vpsrld $10,%xmm7,%xmm6 - addl 52(%rsp),%ecx - movl %edx,%esi - shrdl $11,%r14d,%r14d - vpxor %xmm5,%xmm4,%xmm4 - xorl %ebx,%r12d - xorl %r8d,%esi - shrdl $6,%r13d,%r13d - vpsrlq $17,%xmm7,%xmm7 - addl %r12d,%ecx - andl %esi,%r15d - xorl %edx,%r14d - vpaddd %xmm4,%xmm3,%xmm3 - addl %r13d,%ecx - xorl %r8d,%r15d - addl %ecx,%r10d - vpxor %xmm7,%xmm6,%xmm6 - shrdl $2,%r14d,%r14d - addl %r15d,%ecx - movl %r10d,%r13d - vpsrlq $2,%xmm7,%xmm7 - addl %ecx,%r14d - shrdl $14,%r13d,%r13d - movl %r14d,%ecx - vpxor %xmm7,%xmm6,%xmm6 - movl %r11d,%r12d - xorl %r10d,%r13d - shrdl $9,%r14d,%r14d - vpshufd $132,%xmm6,%xmm6 - xorl %eax,%r12d - shrdl $5,%r13d,%r13d - xorl %ecx,%r14d - vpsrldq $8,%xmm6,%xmm6 - andl %r10d,%r12d - vpand %xmm13,%xmm11,%xmm11 - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 224-128(%rdi),%xmm10 - xorl %r10d,%r13d - addl 56(%rsp),%ebx - vpaddd %xmm6,%xmm3,%xmm3 - movl %ecx,%r15d - shrdl $11,%r14d,%r14d - xorl %eax,%r12d - vpshufd $80,%xmm3,%xmm7 - xorl %edx,%r15d - shrdl $6,%r13d,%r13d - addl %r12d,%ebx - vpsrld $10,%xmm7,%xmm6 - andl %r15d,%esi - xorl %ecx,%r14d - addl %r13d,%ebx - vpsrlq $17,%xmm7,%xmm7 - xorl %edx,%esi - addl %ebx,%r9d - shrdl $2,%r14d,%r14d - vpxor %xmm7,%xmm6,%xmm6 - addl %esi,%ebx - movl %r9d,%r13d - addl %ebx,%r14d - vpsrlq $2,%xmm7,%xmm7 - shrdl $14,%r13d,%r13d - movl %r14d,%ebx - movl %r10d,%r12d - vpxor %xmm7,%xmm6,%xmm6 - xorl %r9d,%r13d - shrdl $9,%r14d,%r14d - xorl %r11d,%r12d - vpshufd $232,%xmm6,%xmm6 - shrdl $5,%r13d,%r13d - xorl %ebx,%r14d - andl %r9d,%r12d - vpslldq $8,%xmm6,%xmm6 - vpor %xmm11,%xmm8,%xmm8 - vaesenclast %xmm10,%xmm9,%xmm11 - vmovdqu 0-128(%rdi),%xmm10 - xorl %r9d,%r13d - addl 60(%rsp),%eax - movl %ebx,%esi - vpaddd %xmm6,%xmm3,%xmm3 - shrdl $11,%r14d,%r14d - xorl %r11d,%r12d - xorl %ecx,%esi - vpaddd 96(%rbp),%xmm3,%xmm6 - shrdl $6,%r13d,%r13d - addl %r12d,%eax - andl %esi,%r15d - xorl %ebx,%r14d - addl %r13d,%eax - xorl %ecx,%r15d - addl %eax,%r8d - shrdl $2,%r14d,%r14d - addl %r15d,%eax - movl %r8d,%r13d - addl %eax,%r14d - vmovdqa %xmm6,48(%rsp) - movq 64+0(%rsp),%r12 - vpand %xmm14,%xmm11,%xmm11 - movq 64+8(%rsp),%r15 - vpor %xmm11,%xmm8,%xmm8 - vmovdqu %xmm8,(%r15,%r12,1) - leaq 16(%r12),%r12 - cmpb $0,131(%rbp) - jne .Lavx_00_47 - vmovdqu (%r12),%xmm9 - movq %r12,64+0(%rsp) - shrdl $14,%r13d,%r13d - movl %r14d,%eax - movl %r9d,%r12d - xorl %r8d,%r13d - shrdl $9,%r14d,%r14d - xorl %r10d,%r12d - shrdl $5,%r13d,%r13d - xorl %eax,%r14d - andl %r8d,%r12d - vpxor %xmm10,%xmm9,%xmm9 - vmovdqu 16-128(%rdi),%xmm10 - xorl %r8d,%r13d - addl 0(%rsp),%r11d - movl %eax,%r15d - shrdl $11,%r14d,%r14d - xorl %r10d,%r12d - xorl %ebx,%r15d - shrdl $6,%r13d,%r13d - addl %r12d,%r11d - andl %r15d,%esi - xorl %eax,%r14d - addl %r13d,%r11d - xorl %ebx,%esi - addl %r11d,%edx - shrdl $2,%r14d,%r14d - addl %esi,%r11d - movl %edx,%r13d - addl %r11d,%r14d - shrdl $14,%r13d,%r13d - movl %r14d,%r11d - movl %r8d,%r12d - xorl %edx,%r13d - shrdl $9,%r14d,%r14d - xorl %r9d,%r12d - shrdl $5,%r13d,%r13d - xorl %r11d,%r14d - andl %edx,%r12d - vpxor %xmm8,%xmm9,%xmm9 - xorl %edx,%r13d - addl 
4(%rsp),%r10d - movl %r11d,%esi - shrdl $11,%r14d,%r14d - xorl %r9d,%r12d - xorl %eax,%esi - shrdl $6,%r13d,%r13d - addl %r12d,%r10d - andl %esi,%r15d - xorl %r11d,%r14d - addl %r13d,%r10d - xorl %eax,%r15d - addl %r10d,%ecx - shrdl $2,%r14d,%r14d - addl %r15d,%r10d - movl %ecx,%r13d - addl %r10d,%r14d - shrdl $14,%r13d,%r13d - movl %r14d,%r10d - movl %edx,%r12d - xorl %ecx,%r13d - shrdl $9,%r14d,%r14d - xorl %r8d,%r12d - shrdl $5,%r13d,%r13d - xorl %r10d,%r14d - andl %ecx,%r12d - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 32-128(%rdi),%xmm10 - xorl %ecx,%r13d - addl 8(%rsp),%r9d - movl %r10d,%r15d - shrdl $11,%r14d,%r14d - xorl %r8d,%r12d - xorl %r11d,%r15d - shrdl $6,%r13d,%r13d - addl %r12d,%r9d - andl %r15d,%esi - xorl %r10d,%r14d - addl %r13d,%r9d - xorl %r11d,%esi - addl %r9d,%ebx - shrdl $2,%r14d,%r14d - addl %esi,%r9d - movl %ebx,%r13d - addl %r9d,%r14d - shrdl $14,%r13d,%r13d - movl %r14d,%r9d - movl %ecx,%r12d - xorl %ebx,%r13d - shrdl $9,%r14d,%r14d - xorl %edx,%r12d - shrdl $5,%r13d,%r13d - xorl %r9d,%r14d - andl %ebx,%r12d - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 48-128(%rdi),%xmm10 - xorl %ebx,%r13d - addl 12(%rsp),%r8d - movl %r9d,%esi - shrdl $11,%r14d,%r14d - xorl %edx,%r12d - xorl %r10d,%esi - shrdl $6,%r13d,%r13d - addl %r12d,%r8d - andl %esi,%r15d - xorl %r9d,%r14d - addl %r13d,%r8d - xorl %r10d,%r15d - addl %r8d,%eax - shrdl $2,%r14d,%r14d - addl %r15d,%r8d - movl %eax,%r13d - addl %r8d,%r14d - shrdl $14,%r13d,%r13d - movl %r14d,%r8d - movl %ebx,%r12d - xorl %eax,%r13d - shrdl $9,%r14d,%r14d - xorl %ecx,%r12d - shrdl $5,%r13d,%r13d - xorl %r8d,%r14d - andl %eax,%r12d - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 64-128(%rdi),%xmm10 - xorl %eax,%r13d - addl 16(%rsp),%edx - movl %r8d,%r15d - shrdl $11,%r14d,%r14d - xorl %ecx,%r12d - xorl %r9d,%r15d - shrdl $6,%r13d,%r13d - addl %r12d,%edx - andl %r15d,%esi - xorl %r8d,%r14d - addl %r13d,%edx - xorl %r9d,%esi - addl %edx,%r11d - shrdl $2,%r14d,%r14d - addl %esi,%edx - movl %r11d,%r13d - addl %edx,%r14d - shrdl $14,%r13d,%r13d - movl %r14d,%edx - movl %eax,%r12d - xorl %r11d,%r13d - shrdl $9,%r14d,%r14d - xorl %ebx,%r12d - shrdl $5,%r13d,%r13d - xorl %edx,%r14d - andl %r11d,%r12d - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 80-128(%rdi),%xmm10 - xorl %r11d,%r13d - addl 20(%rsp),%ecx - movl %edx,%esi - shrdl $11,%r14d,%r14d - xorl %ebx,%r12d - xorl %r8d,%esi - shrdl $6,%r13d,%r13d - addl %r12d,%ecx - andl %esi,%r15d - xorl %edx,%r14d - addl %r13d,%ecx - xorl %r8d,%r15d - addl %ecx,%r10d - shrdl $2,%r14d,%r14d - addl %r15d,%ecx - movl %r10d,%r13d - addl %ecx,%r14d - shrdl $14,%r13d,%r13d - movl %r14d,%ecx - movl %r11d,%r12d - xorl %r10d,%r13d - shrdl $9,%r14d,%r14d - xorl %eax,%r12d - shrdl $5,%r13d,%r13d - xorl %ecx,%r14d - andl %r10d,%r12d - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 96-128(%rdi),%xmm10 - xorl %r10d,%r13d - addl 24(%rsp),%ebx - movl %ecx,%r15d - shrdl $11,%r14d,%r14d - xorl %eax,%r12d - xorl %edx,%r15d - shrdl $6,%r13d,%r13d - addl %r12d,%ebx - andl %r15d,%esi - xorl %ecx,%r14d - addl %r13d,%ebx - xorl %edx,%esi - addl %ebx,%r9d - shrdl $2,%r14d,%r14d - addl %esi,%ebx - movl %r9d,%r13d - addl %ebx,%r14d - shrdl $14,%r13d,%r13d - movl %r14d,%ebx - movl %r10d,%r12d - xorl %r9d,%r13d - shrdl $9,%r14d,%r14d - xorl %r11d,%r12d - shrdl $5,%r13d,%r13d - xorl %ebx,%r14d - andl %r9d,%r12d - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 112-128(%rdi),%xmm10 - xorl %r9d,%r13d - addl 28(%rsp),%eax - movl %ebx,%esi - shrdl $11,%r14d,%r14d - xorl %r11d,%r12d - xorl %ecx,%esi - shrdl $6,%r13d,%r13d - addl %r12d,%eax - andl %esi,%r15d - xorl %ebx,%r14d - 
addl %r13d,%eax - xorl %ecx,%r15d - addl %eax,%r8d - shrdl $2,%r14d,%r14d - addl %r15d,%eax - movl %r8d,%r13d - addl %eax,%r14d - shrdl $14,%r13d,%r13d - movl %r14d,%eax - movl %r9d,%r12d - xorl %r8d,%r13d - shrdl $9,%r14d,%r14d - xorl %r10d,%r12d - shrdl $5,%r13d,%r13d - xorl %eax,%r14d - andl %r8d,%r12d - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 128-128(%rdi),%xmm10 - xorl %r8d,%r13d - addl 32(%rsp),%r11d - movl %eax,%r15d - shrdl $11,%r14d,%r14d - xorl %r10d,%r12d - xorl %ebx,%r15d - shrdl $6,%r13d,%r13d - addl %r12d,%r11d - andl %r15d,%esi - xorl %eax,%r14d - addl %r13d,%r11d - xorl %ebx,%esi - addl %r11d,%edx - shrdl $2,%r14d,%r14d - addl %esi,%r11d - movl %edx,%r13d - addl %r11d,%r14d - shrdl $14,%r13d,%r13d - movl %r14d,%r11d - movl %r8d,%r12d - xorl %edx,%r13d - shrdl $9,%r14d,%r14d - xorl %r9d,%r12d - shrdl $5,%r13d,%r13d - xorl %r11d,%r14d - andl %edx,%r12d - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 144-128(%rdi),%xmm10 - xorl %edx,%r13d - addl 36(%rsp),%r10d - movl %r11d,%esi - shrdl $11,%r14d,%r14d - xorl %r9d,%r12d - xorl %eax,%esi - shrdl $6,%r13d,%r13d - addl %r12d,%r10d - andl %esi,%r15d - xorl %r11d,%r14d - addl %r13d,%r10d - xorl %eax,%r15d - addl %r10d,%ecx - shrdl $2,%r14d,%r14d - addl %r15d,%r10d - movl %ecx,%r13d - addl %r10d,%r14d - shrdl $14,%r13d,%r13d - movl %r14d,%r10d - movl %edx,%r12d - xorl %ecx,%r13d - shrdl $9,%r14d,%r14d - xorl %r8d,%r12d - shrdl $5,%r13d,%r13d - xorl %r10d,%r14d - andl %ecx,%r12d - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 160-128(%rdi),%xmm10 - xorl %ecx,%r13d - addl 40(%rsp),%r9d - movl %r10d,%r15d - shrdl $11,%r14d,%r14d - xorl %r8d,%r12d - xorl %r11d,%r15d - shrdl $6,%r13d,%r13d - addl %r12d,%r9d - andl %r15d,%esi - xorl %r10d,%r14d - addl %r13d,%r9d - xorl %r11d,%esi - addl %r9d,%ebx - shrdl $2,%r14d,%r14d - addl %esi,%r9d - movl %ebx,%r13d - addl %r9d,%r14d - shrdl $14,%r13d,%r13d - movl %r14d,%r9d - movl %ecx,%r12d - xorl %ebx,%r13d - shrdl $9,%r14d,%r14d - xorl %edx,%r12d - shrdl $5,%r13d,%r13d - xorl %r9d,%r14d - andl %ebx,%r12d - vaesenclast %xmm10,%xmm9,%xmm11 - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 176-128(%rdi),%xmm10 - xorl %ebx,%r13d - addl 44(%rsp),%r8d - movl %r9d,%esi - shrdl $11,%r14d,%r14d - xorl %edx,%r12d - xorl %r10d,%esi - shrdl $6,%r13d,%r13d - addl %r12d,%r8d - andl %esi,%r15d - xorl %r9d,%r14d - addl %r13d,%r8d - xorl %r10d,%r15d - addl %r8d,%eax - shrdl $2,%r14d,%r14d - addl %r15d,%r8d - movl %eax,%r13d - addl %r8d,%r14d - shrdl $14,%r13d,%r13d - movl %r14d,%r8d - movl %ebx,%r12d - xorl %eax,%r13d - shrdl $9,%r14d,%r14d - xorl %ecx,%r12d - shrdl $5,%r13d,%r13d - xorl %r8d,%r14d - andl %eax,%r12d - vpand %xmm12,%xmm11,%xmm8 - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 192-128(%rdi),%xmm10 - xorl %eax,%r13d - addl 48(%rsp),%edx - movl %r8d,%r15d - shrdl $11,%r14d,%r14d - xorl %ecx,%r12d - xorl %r9d,%r15d - shrdl $6,%r13d,%r13d - addl %r12d,%edx - andl %r15d,%esi - xorl %r8d,%r14d - addl %r13d,%edx - xorl %r9d,%esi - addl %edx,%r11d - shrdl $2,%r14d,%r14d - addl %esi,%edx - movl %r11d,%r13d - addl %edx,%r14d - shrdl $14,%r13d,%r13d - movl %r14d,%edx - movl %eax,%r12d - xorl %r11d,%r13d - shrdl $9,%r14d,%r14d - xorl %ebx,%r12d - shrdl $5,%r13d,%r13d - xorl %edx,%r14d - andl %r11d,%r12d - vaesenclast %xmm10,%xmm9,%xmm11 - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 208-128(%rdi),%xmm10 - xorl %r11d,%r13d - addl 52(%rsp),%ecx - movl %edx,%esi - shrdl $11,%r14d,%r14d - xorl %ebx,%r12d - xorl %r8d,%esi - shrdl $6,%r13d,%r13d - addl %r12d,%ecx - andl %esi,%r15d - xorl %edx,%r14d - addl %r13d,%ecx - xorl %r8d,%r15d - addl %ecx,%r10d - shrdl 
$2,%r14d,%r14d - addl %r15d,%ecx - movl %r10d,%r13d - addl %ecx,%r14d - shrdl $14,%r13d,%r13d - movl %r14d,%ecx - movl %r11d,%r12d - xorl %r10d,%r13d - shrdl $9,%r14d,%r14d - xorl %eax,%r12d - shrdl $5,%r13d,%r13d - xorl %ecx,%r14d - andl %r10d,%r12d - vpand %xmm13,%xmm11,%xmm11 - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 224-128(%rdi),%xmm10 - xorl %r10d,%r13d - addl 56(%rsp),%ebx - movl %ecx,%r15d - shrdl $11,%r14d,%r14d - xorl %eax,%r12d - xorl %edx,%r15d - shrdl $6,%r13d,%r13d - addl %r12d,%ebx - andl %r15d,%esi - xorl %ecx,%r14d - addl %r13d,%ebx - xorl %edx,%esi - addl %ebx,%r9d - shrdl $2,%r14d,%r14d - addl %esi,%ebx - movl %r9d,%r13d - addl %ebx,%r14d - shrdl $14,%r13d,%r13d - movl %r14d,%ebx - movl %r10d,%r12d - xorl %r9d,%r13d - shrdl $9,%r14d,%r14d - xorl %r11d,%r12d - shrdl $5,%r13d,%r13d - xorl %ebx,%r14d - andl %r9d,%r12d - vpor %xmm11,%xmm8,%xmm8 - vaesenclast %xmm10,%xmm9,%xmm11 - vmovdqu 0-128(%rdi),%xmm10 - xorl %r9d,%r13d - addl 60(%rsp),%eax - movl %ebx,%esi - shrdl $11,%r14d,%r14d - xorl %r11d,%r12d - xorl %ecx,%esi - shrdl $6,%r13d,%r13d - addl %r12d,%eax - andl %esi,%r15d - xorl %ebx,%r14d - addl %r13d,%eax - xorl %ecx,%r15d - addl %eax,%r8d - shrdl $2,%r14d,%r14d - addl %r15d,%eax - movl %r8d,%r13d - addl %eax,%r14d - movq 64+0(%rsp),%r12 - movq 64+8(%rsp),%r13 - movq 64+40(%rsp),%r15 - movq 64+48(%rsp),%rsi - - vpand %xmm14,%xmm11,%xmm11 - movl %r14d,%eax - vpor %xmm11,%xmm8,%xmm8 - vmovdqu %xmm8,(%r12,%r13,1) - leaq 16(%r12),%r12 - - addl 0(%r15),%eax - addl 4(%r15),%ebx - addl 8(%r15),%ecx - addl 12(%r15),%edx - addl 16(%r15),%r8d - addl 20(%r15),%r9d - addl 24(%r15),%r10d - addl 28(%r15),%r11d - - cmpq 64+16(%rsp),%r12 - - movl %eax,0(%r15) - movl %ebx,4(%r15) - movl %ecx,8(%r15) - movl %edx,12(%r15) - movl %r8d,16(%r15) - movl %r9d,20(%r15) - movl %r10d,24(%r15) - movl %r11d,28(%r15) - jb .Lloop_avx - - movq 64+32(%rsp),%r8 - movq 120(%rsp),%rsi -.cfi_def_cfa %rsi,8 - vmovdqu %xmm8,(%r8) - vzeroall - movq -48(%rsi),%r15 -.cfi_restore %r15 - movq -40(%rsi),%r14 -.cfi_restore %r14 - movq -32(%rsi),%r13 -.cfi_restore %r13 - movq -24(%rsi),%r12 -.cfi_restore %r12 - movq -16(%rsi),%rbp -.cfi_restore %rbp - movq -8(%rsi),%rbx -.cfi_restore %rbx - leaq (%rsi),%rsp -.cfi_def_cfa_register %rsp -.Lepilogue_avx: - .byte 0xf3,0xc3 -.cfi_endproc -.size aesni_cbc_sha256_enc_avx,.-aesni_cbc_sha256_enc_avx -.type aesni_cbc_sha256_enc_avx2,@function -.align 64 -aesni_cbc_sha256_enc_avx2: -.cfi_startproc -.Lavx2_shortcut: - movq 8(%rsp),%r10 - movq %rsp,%rax -.cfi_def_cfa_register %rax - pushq %rbx -.cfi_offset %rbx,-16 - pushq %rbp -.cfi_offset %rbp,-24 - pushq %r12 -.cfi_offset %r12,-32 - pushq %r13 -.cfi_offset %r13,-40 - pushq %r14 -.cfi_offset %r14,-48 - pushq %r15 -.cfi_offset %r15,-56 - subq $576,%rsp - andq $-1024,%rsp - addq $448,%rsp - - shlq $6,%rdx - subq %rdi,%rsi - subq %rdi,%r10 - addq %rdi,%rdx - - - - movq %rdx,64+16(%rsp) - - movq %r8,64+32(%rsp) - movq %r9,64+40(%rsp) - movq %r10,64+48(%rsp) - movq %rax,120(%rsp) -.cfi_escape 0x0f,0x06,0x77,0xf8,0x00,0x06,0x23,0x08 -.Lprologue_avx2: - vzeroall - - movq %rdi,%r13 - vpinsrq $1,%rsi,%xmm15,%xmm15 - leaq 128(%rcx),%rdi - leaq K256+544(%rip),%r12 - movl 240-128(%rdi),%r14d - movq %r9,%r15 - movq %r10,%rsi - vmovdqu (%r8),%xmm8 - leaq -9(%r14),%r14 - - vmovdqa 0(%r12,%r14,8),%xmm14 - vmovdqa 16(%r12,%r14,8),%xmm13 - vmovdqa 32(%r12,%r14,8),%xmm12 - - subq $-64,%r13 - movl 0(%r15),%eax - leaq (%rsi,%r13,1),%r12 - movl 4(%r15),%ebx - cmpq %rdx,%r13 - movl 8(%r15),%ecx - cmoveq %rsp,%r12 - movl 12(%r15),%edx - movl 
16(%r15),%r8d - movl 20(%r15),%r9d - movl 24(%r15),%r10d - movl 28(%r15),%r11d - vmovdqu 0-128(%rdi),%xmm10 - jmp .Loop_avx2 -.align 16 -.Loop_avx2: - vmovdqa K256+512(%rip),%ymm7 - vmovdqu -64+0(%rsi,%r13,1),%xmm0 - vmovdqu -64+16(%rsi,%r13,1),%xmm1 - vmovdqu -64+32(%rsi,%r13,1),%xmm2 - vmovdqu -64+48(%rsi,%r13,1),%xmm3 - - vinserti128 $1,(%r12),%ymm0,%ymm0 - vinserti128 $1,16(%r12),%ymm1,%ymm1 - vpshufb %ymm7,%ymm0,%ymm0 - vinserti128 $1,32(%r12),%ymm2,%ymm2 - vpshufb %ymm7,%ymm1,%ymm1 - vinserti128 $1,48(%r12),%ymm3,%ymm3 - - leaq K256(%rip),%rbp - vpshufb %ymm7,%ymm2,%ymm2 - leaq -64(%r13),%r13 - vpaddd 0(%rbp),%ymm0,%ymm4 - vpshufb %ymm7,%ymm3,%ymm3 - vpaddd 32(%rbp),%ymm1,%ymm5 - vpaddd 64(%rbp),%ymm2,%ymm6 - vpaddd 96(%rbp),%ymm3,%ymm7 - vmovdqa %ymm4,0(%rsp) - xorl %r14d,%r14d - vmovdqa %ymm5,32(%rsp) - leaq -64(%rsp),%rsp - movl %ebx,%esi - vmovdqa %ymm6,0(%rsp) - xorl %ecx,%esi - vmovdqa %ymm7,32(%rsp) - movl %r9d,%r12d - subq $-32*4,%rbp - jmp .Lavx2_00_47 - -.align 16 -.Lavx2_00_47: - vmovdqu (%r13),%xmm9 - vpinsrq $0,%r13,%xmm15,%xmm15 - leaq -64(%rsp),%rsp - vpalignr $4,%ymm0,%ymm1,%ymm4 - addl 0+128(%rsp),%r11d - andl %r8d,%r12d - rorxl $25,%r8d,%r13d - vpalignr $4,%ymm2,%ymm3,%ymm7 - rorxl $11,%r8d,%r15d - leal (%rax,%r14,1),%eax - leal (%r11,%r12,1),%r11d - vpsrld $7,%ymm4,%ymm6 - andnl %r10d,%r8d,%r12d - xorl %r15d,%r13d - rorxl $6,%r8d,%r14d - vpaddd %ymm7,%ymm0,%ymm0 - leal (%r11,%r12,1),%r11d - xorl %r14d,%r13d - movl %eax,%r15d - vpsrld $3,%ymm4,%ymm7 - rorxl $22,%eax,%r12d - leal (%r11,%r13,1),%r11d - xorl %ebx,%r15d - vpslld $14,%ymm4,%ymm5 - rorxl $13,%eax,%r14d - rorxl $2,%eax,%r13d - leal (%rdx,%r11,1),%edx - vpxor %ymm6,%ymm7,%ymm4 - andl %r15d,%esi - vpxor %xmm10,%xmm9,%xmm9 - vmovdqu 16-128(%rdi),%xmm10 - xorl %r12d,%r14d - xorl %ebx,%esi - vpshufd $250,%ymm3,%ymm7 - xorl %r13d,%r14d - leal (%r11,%rsi,1),%r11d - movl %r8d,%r12d - vpsrld $11,%ymm6,%ymm6 - addl 4+128(%rsp),%r10d - andl %edx,%r12d - rorxl $25,%edx,%r13d - vpxor %ymm5,%ymm4,%ymm4 - rorxl $11,%edx,%esi - leal (%r11,%r14,1),%r11d - leal (%r10,%r12,1),%r10d - vpslld $11,%ymm5,%ymm5 - andnl %r9d,%edx,%r12d - xorl %esi,%r13d - rorxl $6,%edx,%r14d - vpxor %ymm6,%ymm4,%ymm4 - leal (%r10,%r12,1),%r10d - xorl %r14d,%r13d - movl %r11d,%esi - vpsrld $10,%ymm7,%ymm6 - rorxl $22,%r11d,%r12d - leal (%r10,%r13,1),%r10d - xorl %eax,%esi - vpxor %ymm5,%ymm4,%ymm4 - rorxl $13,%r11d,%r14d - rorxl $2,%r11d,%r13d - leal (%rcx,%r10,1),%ecx - vpsrlq $17,%ymm7,%ymm7 - andl %esi,%r15d - vpxor %xmm8,%xmm9,%xmm9 - xorl %r12d,%r14d - xorl %eax,%r15d - vpaddd %ymm4,%ymm0,%ymm0 - xorl %r13d,%r14d - leal (%r10,%r15,1),%r10d - movl %edx,%r12d - vpxor %ymm7,%ymm6,%ymm6 - addl 8+128(%rsp),%r9d - andl %ecx,%r12d - rorxl $25,%ecx,%r13d - vpsrlq $2,%ymm7,%ymm7 - rorxl $11,%ecx,%r15d - leal (%r10,%r14,1),%r10d - leal (%r9,%r12,1),%r9d - vpxor %ymm7,%ymm6,%ymm6 - andnl %r8d,%ecx,%r12d - xorl %r15d,%r13d - rorxl $6,%ecx,%r14d - vpshufd $132,%ymm6,%ymm6 - leal (%r9,%r12,1),%r9d - xorl %r14d,%r13d - movl %r10d,%r15d - vpsrldq $8,%ymm6,%ymm6 - rorxl $22,%r10d,%r12d - leal (%r9,%r13,1),%r9d - xorl %r11d,%r15d - vpaddd %ymm6,%ymm0,%ymm0 - rorxl $13,%r10d,%r14d - rorxl $2,%r10d,%r13d - leal (%rbx,%r9,1),%ebx - vpshufd $80,%ymm0,%ymm7 - andl %r15d,%esi - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 32-128(%rdi),%xmm10 - xorl %r12d,%r14d - xorl %r11d,%esi - vpsrld $10,%ymm7,%ymm6 - xorl %r13d,%r14d - leal (%r9,%rsi,1),%r9d - movl %ecx,%r12d - vpsrlq $17,%ymm7,%ymm7 - addl 12+128(%rsp),%r8d - andl %ebx,%r12d - rorxl $25,%ebx,%r13d - vpxor 
%ymm7,%ymm6,%ymm6 - rorxl $11,%ebx,%esi - leal (%r9,%r14,1),%r9d - leal (%r8,%r12,1),%r8d - vpsrlq $2,%ymm7,%ymm7 - andnl %edx,%ebx,%r12d - xorl %esi,%r13d - rorxl $6,%ebx,%r14d - vpxor %ymm7,%ymm6,%ymm6 - leal (%r8,%r12,1),%r8d - xorl %r14d,%r13d - movl %r9d,%esi - vpshufd $232,%ymm6,%ymm6 - rorxl $22,%r9d,%r12d - leal (%r8,%r13,1),%r8d - xorl %r10d,%esi - vpslldq $8,%ymm6,%ymm6 - rorxl $13,%r9d,%r14d - rorxl $2,%r9d,%r13d - leal (%rax,%r8,1),%eax - vpaddd %ymm6,%ymm0,%ymm0 - andl %esi,%r15d - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 48-128(%rdi),%xmm10 - xorl %r12d,%r14d - xorl %r10d,%r15d - vpaddd 0(%rbp),%ymm0,%ymm6 - xorl %r13d,%r14d - leal (%r8,%r15,1),%r8d - movl %ebx,%r12d - vmovdqa %ymm6,0(%rsp) - vpalignr $4,%ymm1,%ymm2,%ymm4 - addl 32+128(%rsp),%edx - andl %eax,%r12d - rorxl $25,%eax,%r13d - vpalignr $4,%ymm3,%ymm0,%ymm7 - rorxl $11,%eax,%r15d - leal (%r8,%r14,1),%r8d - leal (%rdx,%r12,1),%edx - vpsrld $7,%ymm4,%ymm6 - andnl %ecx,%eax,%r12d - xorl %r15d,%r13d - rorxl $6,%eax,%r14d - vpaddd %ymm7,%ymm1,%ymm1 - leal (%rdx,%r12,1),%edx - xorl %r14d,%r13d - movl %r8d,%r15d - vpsrld $3,%ymm4,%ymm7 - rorxl $22,%r8d,%r12d - leal (%rdx,%r13,1),%edx - xorl %r9d,%r15d - vpslld $14,%ymm4,%ymm5 - rorxl $13,%r8d,%r14d - rorxl $2,%r8d,%r13d - leal (%r11,%rdx,1),%r11d - vpxor %ymm6,%ymm7,%ymm4 - andl %r15d,%esi - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 64-128(%rdi),%xmm10 - xorl %r12d,%r14d - xorl %r9d,%esi - vpshufd $250,%ymm0,%ymm7 - xorl %r13d,%r14d - leal (%rdx,%rsi,1),%edx - movl %eax,%r12d - vpsrld $11,%ymm6,%ymm6 - addl 36+128(%rsp),%ecx - andl %r11d,%r12d - rorxl $25,%r11d,%r13d - vpxor %ymm5,%ymm4,%ymm4 - rorxl $11,%r11d,%esi - leal (%rdx,%r14,1),%edx - leal (%rcx,%r12,1),%ecx - vpslld $11,%ymm5,%ymm5 - andnl %ebx,%r11d,%r12d - xorl %esi,%r13d - rorxl $6,%r11d,%r14d - vpxor %ymm6,%ymm4,%ymm4 - leal (%rcx,%r12,1),%ecx - xorl %r14d,%r13d - movl %edx,%esi - vpsrld $10,%ymm7,%ymm6 - rorxl $22,%edx,%r12d - leal (%rcx,%r13,1),%ecx - xorl %r8d,%esi - vpxor %ymm5,%ymm4,%ymm4 - rorxl $13,%edx,%r14d - rorxl $2,%edx,%r13d - leal (%r10,%rcx,1),%r10d - vpsrlq $17,%ymm7,%ymm7 - andl %esi,%r15d - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 80-128(%rdi),%xmm10 - xorl %r12d,%r14d - xorl %r8d,%r15d - vpaddd %ymm4,%ymm1,%ymm1 - xorl %r13d,%r14d - leal (%rcx,%r15,1),%ecx - movl %r11d,%r12d - vpxor %ymm7,%ymm6,%ymm6 - addl 40+128(%rsp),%ebx - andl %r10d,%r12d - rorxl $25,%r10d,%r13d - vpsrlq $2,%ymm7,%ymm7 - rorxl $11,%r10d,%r15d - leal (%rcx,%r14,1),%ecx - leal (%rbx,%r12,1),%ebx - vpxor %ymm7,%ymm6,%ymm6 - andnl %eax,%r10d,%r12d - xorl %r15d,%r13d - rorxl $6,%r10d,%r14d - vpshufd $132,%ymm6,%ymm6 - leal (%rbx,%r12,1),%ebx - xorl %r14d,%r13d - movl %ecx,%r15d - vpsrldq $8,%ymm6,%ymm6 - rorxl $22,%ecx,%r12d - leal (%rbx,%r13,1),%ebx - xorl %edx,%r15d - vpaddd %ymm6,%ymm1,%ymm1 - rorxl $13,%ecx,%r14d - rorxl $2,%ecx,%r13d - leal (%r9,%rbx,1),%r9d - vpshufd $80,%ymm1,%ymm7 - andl %r15d,%esi - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 96-128(%rdi),%xmm10 - xorl %r12d,%r14d - xorl %edx,%esi - vpsrld $10,%ymm7,%ymm6 - xorl %r13d,%r14d - leal (%rbx,%rsi,1),%ebx - movl %r10d,%r12d - vpsrlq $17,%ymm7,%ymm7 - addl 44+128(%rsp),%eax - andl %r9d,%r12d - rorxl $25,%r9d,%r13d - vpxor %ymm7,%ymm6,%ymm6 - rorxl $11,%r9d,%esi - leal (%rbx,%r14,1),%ebx - leal (%rax,%r12,1),%eax - vpsrlq $2,%ymm7,%ymm7 - andnl %r11d,%r9d,%r12d - xorl %esi,%r13d - rorxl $6,%r9d,%r14d - vpxor %ymm7,%ymm6,%ymm6 - leal (%rax,%r12,1),%eax - xorl %r14d,%r13d - movl %ebx,%esi - vpshufd $232,%ymm6,%ymm6 - rorxl $22,%ebx,%r12d - leal (%rax,%r13,1),%eax - xorl 
%ecx,%esi - vpslldq $8,%ymm6,%ymm6 - rorxl $13,%ebx,%r14d - rorxl $2,%ebx,%r13d - leal (%r8,%rax,1),%r8d - vpaddd %ymm6,%ymm1,%ymm1 - andl %esi,%r15d - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 112-128(%rdi),%xmm10 - xorl %r12d,%r14d - xorl %ecx,%r15d - vpaddd 32(%rbp),%ymm1,%ymm6 - xorl %r13d,%r14d - leal (%rax,%r15,1),%eax - movl %r9d,%r12d - vmovdqa %ymm6,32(%rsp) - leaq -64(%rsp),%rsp - vpalignr $4,%ymm2,%ymm3,%ymm4 - addl 0+128(%rsp),%r11d - andl %r8d,%r12d - rorxl $25,%r8d,%r13d - vpalignr $4,%ymm0,%ymm1,%ymm7 - rorxl $11,%r8d,%r15d - leal (%rax,%r14,1),%eax - leal (%r11,%r12,1),%r11d - vpsrld $7,%ymm4,%ymm6 - andnl %r10d,%r8d,%r12d - xorl %r15d,%r13d - rorxl $6,%r8d,%r14d - vpaddd %ymm7,%ymm2,%ymm2 - leal (%r11,%r12,1),%r11d - xorl %r14d,%r13d - movl %eax,%r15d - vpsrld $3,%ymm4,%ymm7 - rorxl $22,%eax,%r12d - leal (%r11,%r13,1),%r11d - xorl %ebx,%r15d - vpslld $14,%ymm4,%ymm5 - rorxl $13,%eax,%r14d - rorxl $2,%eax,%r13d - leal (%rdx,%r11,1),%edx - vpxor %ymm6,%ymm7,%ymm4 - andl %r15d,%esi - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 128-128(%rdi),%xmm10 - xorl %r12d,%r14d - xorl %ebx,%esi - vpshufd $250,%ymm1,%ymm7 - xorl %r13d,%r14d - leal (%r11,%rsi,1),%r11d - movl %r8d,%r12d - vpsrld $11,%ymm6,%ymm6 - addl 4+128(%rsp),%r10d - andl %edx,%r12d - rorxl $25,%edx,%r13d - vpxor %ymm5,%ymm4,%ymm4 - rorxl $11,%edx,%esi - leal (%r11,%r14,1),%r11d - leal (%r10,%r12,1),%r10d - vpslld $11,%ymm5,%ymm5 - andnl %r9d,%edx,%r12d - xorl %esi,%r13d - rorxl $6,%edx,%r14d - vpxor %ymm6,%ymm4,%ymm4 - leal (%r10,%r12,1),%r10d - xorl %r14d,%r13d - movl %r11d,%esi - vpsrld $10,%ymm7,%ymm6 - rorxl $22,%r11d,%r12d - leal (%r10,%r13,1),%r10d - xorl %eax,%esi - vpxor %ymm5,%ymm4,%ymm4 - rorxl $13,%r11d,%r14d - rorxl $2,%r11d,%r13d - leal (%rcx,%r10,1),%ecx - vpsrlq $17,%ymm7,%ymm7 - andl %esi,%r15d - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 144-128(%rdi),%xmm10 - xorl %r12d,%r14d - xorl %eax,%r15d - vpaddd %ymm4,%ymm2,%ymm2 - xorl %r13d,%r14d - leal (%r10,%r15,1),%r10d - movl %edx,%r12d - vpxor %ymm7,%ymm6,%ymm6 - addl 8+128(%rsp),%r9d - andl %ecx,%r12d - rorxl $25,%ecx,%r13d - vpsrlq $2,%ymm7,%ymm7 - rorxl $11,%ecx,%r15d - leal (%r10,%r14,1),%r10d - leal (%r9,%r12,1),%r9d - vpxor %ymm7,%ymm6,%ymm6 - andnl %r8d,%ecx,%r12d - xorl %r15d,%r13d - rorxl $6,%ecx,%r14d - vpshufd $132,%ymm6,%ymm6 - leal (%r9,%r12,1),%r9d - xorl %r14d,%r13d - movl %r10d,%r15d - vpsrldq $8,%ymm6,%ymm6 - rorxl $22,%r10d,%r12d - leal (%r9,%r13,1),%r9d - xorl %r11d,%r15d - vpaddd %ymm6,%ymm2,%ymm2 - rorxl $13,%r10d,%r14d - rorxl $2,%r10d,%r13d - leal (%rbx,%r9,1),%ebx - vpshufd $80,%ymm2,%ymm7 - andl %r15d,%esi - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 160-128(%rdi),%xmm10 - xorl %r12d,%r14d - xorl %r11d,%esi - vpsrld $10,%ymm7,%ymm6 - xorl %r13d,%r14d - leal (%r9,%rsi,1),%r9d - movl %ecx,%r12d - vpsrlq $17,%ymm7,%ymm7 - addl 12+128(%rsp),%r8d - andl %ebx,%r12d - rorxl $25,%ebx,%r13d - vpxor %ymm7,%ymm6,%ymm6 - rorxl $11,%ebx,%esi - leal (%r9,%r14,1),%r9d - leal (%r8,%r12,1),%r8d - vpsrlq $2,%ymm7,%ymm7 - andnl %edx,%ebx,%r12d - xorl %esi,%r13d - rorxl $6,%ebx,%r14d - vpxor %ymm7,%ymm6,%ymm6 - leal (%r8,%r12,1),%r8d - xorl %r14d,%r13d - movl %r9d,%esi - vpshufd $232,%ymm6,%ymm6 - rorxl $22,%r9d,%r12d - leal (%r8,%r13,1),%r8d - xorl %r10d,%esi - vpslldq $8,%ymm6,%ymm6 - rorxl $13,%r9d,%r14d - rorxl $2,%r9d,%r13d - leal (%rax,%r8,1),%eax - vpaddd %ymm6,%ymm2,%ymm2 - andl %esi,%r15d - vaesenclast %xmm10,%xmm9,%xmm11 - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 176-128(%rdi),%xmm10 - xorl %r12d,%r14d - xorl %r10d,%r15d - vpaddd 64(%rbp),%ymm2,%ymm6 - xorl 
%r13d,%r14d - leal (%r8,%r15,1),%r8d - movl %ebx,%r12d - vmovdqa %ymm6,0(%rsp) - vpalignr $4,%ymm3,%ymm0,%ymm4 - addl 32+128(%rsp),%edx - andl %eax,%r12d - rorxl $25,%eax,%r13d - vpalignr $4,%ymm1,%ymm2,%ymm7 - rorxl $11,%eax,%r15d - leal (%r8,%r14,1),%r8d - leal (%rdx,%r12,1),%edx - vpsrld $7,%ymm4,%ymm6 - andnl %ecx,%eax,%r12d - xorl %r15d,%r13d - rorxl $6,%eax,%r14d - vpaddd %ymm7,%ymm3,%ymm3 - leal (%rdx,%r12,1),%edx - xorl %r14d,%r13d - movl %r8d,%r15d - vpsrld $3,%ymm4,%ymm7 - rorxl $22,%r8d,%r12d - leal (%rdx,%r13,1),%edx - xorl %r9d,%r15d - vpslld $14,%ymm4,%ymm5 - rorxl $13,%r8d,%r14d - rorxl $2,%r8d,%r13d - leal (%r11,%rdx,1),%r11d - vpxor %ymm6,%ymm7,%ymm4 - andl %r15d,%esi - vpand %xmm12,%xmm11,%xmm8 - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 192-128(%rdi),%xmm10 - xorl %r12d,%r14d - xorl %r9d,%esi - vpshufd $250,%ymm2,%ymm7 - xorl %r13d,%r14d - leal (%rdx,%rsi,1),%edx - movl %eax,%r12d - vpsrld $11,%ymm6,%ymm6 - addl 36+128(%rsp),%ecx - andl %r11d,%r12d - rorxl $25,%r11d,%r13d - vpxor %ymm5,%ymm4,%ymm4 - rorxl $11,%r11d,%esi - leal (%rdx,%r14,1),%edx - leal (%rcx,%r12,1),%ecx - vpslld $11,%ymm5,%ymm5 - andnl %ebx,%r11d,%r12d - xorl %esi,%r13d - rorxl $6,%r11d,%r14d - vpxor %ymm6,%ymm4,%ymm4 - leal (%rcx,%r12,1),%ecx - xorl %r14d,%r13d - movl %edx,%esi - vpsrld $10,%ymm7,%ymm6 - rorxl $22,%edx,%r12d - leal (%rcx,%r13,1),%ecx - xorl %r8d,%esi - vpxor %ymm5,%ymm4,%ymm4 - rorxl $13,%edx,%r14d - rorxl $2,%edx,%r13d - leal (%r10,%rcx,1),%r10d - vpsrlq $17,%ymm7,%ymm7 - andl %esi,%r15d - vaesenclast %xmm10,%xmm9,%xmm11 - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 208-128(%rdi),%xmm10 - xorl %r12d,%r14d - xorl %r8d,%r15d - vpaddd %ymm4,%ymm3,%ymm3 - xorl %r13d,%r14d - leal (%rcx,%r15,1),%ecx - movl %r11d,%r12d - vpxor %ymm7,%ymm6,%ymm6 - addl 40+128(%rsp),%ebx - andl %r10d,%r12d - rorxl $25,%r10d,%r13d - vpsrlq $2,%ymm7,%ymm7 - rorxl $11,%r10d,%r15d - leal (%rcx,%r14,1),%ecx - leal (%rbx,%r12,1),%ebx - vpxor %ymm7,%ymm6,%ymm6 - andnl %eax,%r10d,%r12d - xorl %r15d,%r13d - rorxl $6,%r10d,%r14d - vpshufd $132,%ymm6,%ymm6 - leal (%rbx,%r12,1),%ebx - xorl %r14d,%r13d - movl %ecx,%r15d - vpsrldq $8,%ymm6,%ymm6 - rorxl $22,%ecx,%r12d - leal (%rbx,%r13,1),%ebx - xorl %edx,%r15d - vpaddd %ymm6,%ymm3,%ymm3 - rorxl $13,%ecx,%r14d - rorxl $2,%ecx,%r13d - leal (%r9,%rbx,1),%r9d - vpshufd $80,%ymm3,%ymm7 - andl %r15d,%esi - vpand %xmm13,%xmm11,%xmm11 - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 224-128(%rdi),%xmm10 - xorl %r12d,%r14d - xorl %edx,%esi - vpsrld $10,%ymm7,%ymm6 - xorl %r13d,%r14d - leal (%rbx,%rsi,1),%ebx - movl %r10d,%r12d - vpsrlq $17,%ymm7,%ymm7 - addl 44+128(%rsp),%eax - andl %r9d,%r12d - rorxl $25,%r9d,%r13d - vpxor %ymm7,%ymm6,%ymm6 - rorxl $11,%r9d,%esi - leal (%rbx,%r14,1),%ebx - leal (%rax,%r12,1),%eax - vpsrlq $2,%ymm7,%ymm7 - andnl %r11d,%r9d,%r12d - xorl %esi,%r13d - rorxl $6,%r9d,%r14d - vpxor %ymm7,%ymm6,%ymm6 - leal (%rax,%r12,1),%eax - xorl %r14d,%r13d - movl %ebx,%esi - vpshufd $232,%ymm6,%ymm6 - rorxl $22,%ebx,%r12d - leal (%rax,%r13,1),%eax - xorl %ecx,%esi - vpslldq $8,%ymm6,%ymm6 - rorxl $13,%ebx,%r14d - rorxl $2,%ebx,%r13d - leal (%r8,%rax,1),%r8d - vpaddd %ymm6,%ymm3,%ymm3 - andl %esi,%r15d - vpor %xmm11,%xmm8,%xmm8 - vaesenclast %xmm10,%xmm9,%xmm11 - vmovdqu 0-128(%rdi),%xmm10 - xorl %r12d,%r14d - xorl %ecx,%r15d - vpaddd 96(%rbp),%ymm3,%ymm6 - xorl %r13d,%r14d - leal (%rax,%r15,1),%eax - movl %r9d,%r12d - vmovdqa %ymm6,32(%rsp) - vmovq %xmm15,%r13 - vpextrq $1,%xmm15,%r15 - vpand %xmm14,%xmm11,%xmm11 - vpor %xmm11,%xmm8,%xmm8 - vmovdqu %xmm8,(%r15,%r13,1) - leaq 16(%r13),%r13 
- leaq 128(%rbp),%rbp - cmpb $0,3(%rbp) - jne .Lavx2_00_47 - vmovdqu (%r13),%xmm9 - vpinsrq $0,%r13,%xmm15,%xmm15 - addl 0+64(%rsp),%r11d - andl %r8d,%r12d - rorxl $25,%r8d,%r13d - rorxl $11,%r8d,%r15d - leal (%rax,%r14,1),%eax - leal (%r11,%r12,1),%r11d - andnl %r10d,%r8d,%r12d - xorl %r15d,%r13d - rorxl $6,%r8d,%r14d - leal (%r11,%r12,1),%r11d - xorl %r14d,%r13d - movl %eax,%r15d - rorxl $22,%eax,%r12d - leal (%r11,%r13,1),%r11d - xorl %ebx,%r15d - rorxl $13,%eax,%r14d - rorxl $2,%eax,%r13d - leal (%rdx,%r11,1),%edx - andl %r15d,%esi - vpxor %xmm10,%xmm9,%xmm9 - vmovdqu 16-128(%rdi),%xmm10 - xorl %r12d,%r14d - xorl %ebx,%esi - xorl %r13d,%r14d - leal (%r11,%rsi,1),%r11d - movl %r8d,%r12d - addl 4+64(%rsp),%r10d - andl %edx,%r12d - rorxl $25,%edx,%r13d - rorxl $11,%edx,%esi - leal (%r11,%r14,1),%r11d - leal (%r10,%r12,1),%r10d - andnl %r9d,%edx,%r12d - xorl %esi,%r13d - rorxl $6,%edx,%r14d - leal (%r10,%r12,1),%r10d - xorl %r14d,%r13d - movl %r11d,%esi - rorxl $22,%r11d,%r12d - leal (%r10,%r13,1),%r10d - xorl %eax,%esi - rorxl $13,%r11d,%r14d - rorxl $2,%r11d,%r13d - leal (%rcx,%r10,1),%ecx - andl %esi,%r15d - vpxor %xmm8,%xmm9,%xmm9 - xorl %r12d,%r14d - xorl %eax,%r15d - xorl %r13d,%r14d - leal (%r10,%r15,1),%r10d - movl %edx,%r12d - addl 8+64(%rsp),%r9d - andl %ecx,%r12d - rorxl $25,%ecx,%r13d - rorxl $11,%ecx,%r15d - leal (%r10,%r14,1),%r10d - leal (%r9,%r12,1),%r9d - andnl %r8d,%ecx,%r12d - xorl %r15d,%r13d - rorxl $6,%ecx,%r14d - leal (%r9,%r12,1),%r9d - xorl %r14d,%r13d - movl %r10d,%r15d - rorxl $22,%r10d,%r12d - leal (%r9,%r13,1),%r9d - xorl %r11d,%r15d - rorxl $13,%r10d,%r14d - rorxl $2,%r10d,%r13d - leal (%rbx,%r9,1),%ebx - andl %r15d,%esi - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 32-128(%rdi),%xmm10 - xorl %r12d,%r14d - xorl %r11d,%esi - xorl %r13d,%r14d - leal (%r9,%rsi,1),%r9d - movl %ecx,%r12d - addl 12+64(%rsp),%r8d - andl %ebx,%r12d - rorxl $25,%ebx,%r13d - rorxl $11,%ebx,%esi - leal (%r9,%r14,1),%r9d - leal (%r8,%r12,1),%r8d - andnl %edx,%ebx,%r12d - xorl %esi,%r13d - rorxl $6,%ebx,%r14d - leal (%r8,%r12,1),%r8d - xorl %r14d,%r13d - movl %r9d,%esi - rorxl $22,%r9d,%r12d - leal (%r8,%r13,1),%r8d - xorl %r10d,%esi - rorxl $13,%r9d,%r14d - rorxl $2,%r9d,%r13d - leal (%rax,%r8,1),%eax - andl %esi,%r15d - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 48-128(%rdi),%xmm10 - xorl %r12d,%r14d - xorl %r10d,%r15d - xorl %r13d,%r14d - leal (%r8,%r15,1),%r8d - movl %ebx,%r12d - addl 32+64(%rsp),%edx - andl %eax,%r12d - rorxl $25,%eax,%r13d - rorxl $11,%eax,%r15d - leal (%r8,%r14,1),%r8d - leal (%rdx,%r12,1),%edx - andnl %ecx,%eax,%r12d - xorl %r15d,%r13d - rorxl $6,%eax,%r14d - leal (%rdx,%r12,1),%edx - xorl %r14d,%r13d - movl %r8d,%r15d - rorxl $22,%r8d,%r12d - leal (%rdx,%r13,1),%edx - xorl %r9d,%r15d - rorxl $13,%r8d,%r14d - rorxl $2,%r8d,%r13d - leal (%r11,%rdx,1),%r11d - andl %r15d,%esi - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 64-128(%rdi),%xmm10 - xorl %r12d,%r14d - xorl %r9d,%esi - xorl %r13d,%r14d - leal (%rdx,%rsi,1),%edx - movl %eax,%r12d - addl 36+64(%rsp),%ecx - andl %r11d,%r12d - rorxl $25,%r11d,%r13d - rorxl $11,%r11d,%esi - leal (%rdx,%r14,1),%edx - leal (%rcx,%r12,1),%ecx - andnl %ebx,%r11d,%r12d - xorl %esi,%r13d - rorxl $6,%r11d,%r14d - leal (%rcx,%r12,1),%ecx - xorl %r14d,%r13d - movl %edx,%esi - rorxl $22,%edx,%r12d - leal (%rcx,%r13,1),%ecx - xorl %r8d,%esi - rorxl $13,%edx,%r14d - rorxl $2,%edx,%r13d - leal (%r10,%rcx,1),%r10d - andl %esi,%r15d - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 80-128(%rdi),%xmm10 - xorl %r12d,%r14d - xorl %r8d,%r15d - xorl %r13d,%r14d - leal 
(%rcx,%r15,1),%ecx - movl %r11d,%r12d - addl 40+64(%rsp),%ebx - andl %r10d,%r12d - rorxl $25,%r10d,%r13d - rorxl $11,%r10d,%r15d - leal (%rcx,%r14,1),%ecx - leal (%rbx,%r12,1),%ebx - andnl %eax,%r10d,%r12d - xorl %r15d,%r13d - rorxl $6,%r10d,%r14d - leal (%rbx,%r12,1),%ebx - xorl %r14d,%r13d - movl %ecx,%r15d - rorxl $22,%ecx,%r12d - leal (%rbx,%r13,1),%ebx - xorl %edx,%r15d - rorxl $13,%ecx,%r14d - rorxl $2,%ecx,%r13d - leal (%r9,%rbx,1),%r9d - andl %r15d,%esi - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 96-128(%rdi),%xmm10 - xorl %r12d,%r14d - xorl %edx,%esi - xorl %r13d,%r14d - leal (%rbx,%rsi,1),%ebx - movl %r10d,%r12d - addl 44+64(%rsp),%eax - andl %r9d,%r12d - rorxl $25,%r9d,%r13d - rorxl $11,%r9d,%esi - leal (%rbx,%r14,1),%ebx - leal (%rax,%r12,1),%eax - andnl %r11d,%r9d,%r12d - xorl %esi,%r13d - rorxl $6,%r9d,%r14d - leal (%rax,%r12,1),%eax - xorl %r14d,%r13d - movl %ebx,%esi - rorxl $22,%ebx,%r12d - leal (%rax,%r13,1),%eax - xorl %ecx,%esi - rorxl $13,%ebx,%r14d - rorxl $2,%ebx,%r13d - leal (%r8,%rax,1),%r8d - andl %esi,%r15d - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 112-128(%rdi),%xmm10 - xorl %r12d,%r14d - xorl %ecx,%r15d - xorl %r13d,%r14d - leal (%rax,%r15,1),%eax - movl %r9d,%r12d - addl 0(%rsp),%r11d - andl %r8d,%r12d - rorxl $25,%r8d,%r13d - rorxl $11,%r8d,%r15d - leal (%rax,%r14,1),%eax - leal (%r11,%r12,1),%r11d - andnl %r10d,%r8d,%r12d - xorl %r15d,%r13d - rorxl $6,%r8d,%r14d - leal (%r11,%r12,1),%r11d - xorl %r14d,%r13d - movl %eax,%r15d - rorxl $22,%eax,%r12d - leal (%r11,%r13,1),%r11d - xorl %ebx,%r15d - rorxl $13,%eax,%r14d - rorxl $2,%eax,%r13d - leal (%rdx,%r11,1),%edx - andl %r15d,%esi - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 128-128(%rdi),%xmm10 - xorl %r12d,%r14d - xorl %ebx,%esi - xorl %r13d,%r14d - leal (%r11,%rsi,1),%r11d - movl %r8d,%r12d - addl 4(%rsp),%r10d - andl %edx,%r12d - rorxl $25,%edx,%r13d - rorxl $11,%edx,%esi - leal (%r11,%r14,1),%r11d - leal (%r10,%r12,1),%r10d - andnl %r9d,%edx,%r12d - xorl %esi,%r13d - rorxl $6,%edx,%r14d - leal (%r10,%r12,1),%r10d - xorl %r14d,%r13d - movl %r11d,%esi - rorxl $22,%r11d,%r12d - leal (%r10,%r13,1),%r10d - xorl %eax,%esi - rorxl $13,%r11d,%r14d - rorxl $2,%r11d,%r13d - leal (%rcx,%r10,1),%ecx - andl %esi,%r15d - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 144-128(%rdi),%xmm10 - xorl %r12d,%r14d - xorl %eax,%r15d - xorl %r13d,%r14d - leal (%r10,%r15,1),%r10d - movl %edx,%r12d - addl 8(%rsp),%r9d - andl %ecx,%r12d - rorxl $25,%ecx,%r13d - rorxl $11,%ecx,%r15d - leal (%r10,%r14,1),%r10d - leal (%r9,%r12,1),%r9d - andnl %r8d,%ecx,%r12d - xorl %r15d,%r13d - rorxl $6,%ecx,%r14d - leal (%r9,%r12,1),%r9d - xorl %r14d,%r13d - movl %r10d,%r15d - rorxl $22,%r10d,%r12d - leal (%r9,%r13,1),%r9d - xorl %r11d,%r15d - rorxl $13,%r10d,%r14d - rorxl $2,%r10d,%r13d - leal (%rbx,%r9,1),%ebx - andl %r15d,%esi - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 160-128(%rdi),%xmm10 - xorl %r12d,%r14d - xorl %r11d,%esi - xorl %r13d,%r14d - leal (%r9,%rsi,1),%r9d - movl %ecx,%r12d - addl 12(%rsp),%r8d - andl %ebx,%r12d - rorxl $25,%ebx,%r13d - rorxl $11,%ebx,%esi - leal (%r9,%r14,1),%r9d - leal (%r8,%r12,1),%r8d - andnl %edx,%ebx,%r12d - xorl %esi,%r13d - rorxl $6,%ebx,%r14d - leal (%r8,%r12,1),%r8d - xorl %r14d,%r13d - movl %r9d,%esi - rorxl $22,%r9d,%r12d - leal (%r8,%r13,1),%r8d - xorl %r10d,%esi - rorxl $13,%r9d,%r14d - rorxl $2,%r9d,%r13d - leal (%rax,%r8,1),%eax - andl %esi,%r15d - vaesenclast %xmm10,%xmm9,%xmm11 - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 176-128(%rdi),%xmm10 - xorl %r12d,%r14d - xorl %r10d,%r15d - xorl %r13d,%r14d - leal (%r8,%r15,1),%r8d - movl 
%ebx,%r12d - addl 32(%rsp),%edx - andl %eax,%r12d - rorxl $25,%eax,%r13d - rorxl $11,%eax,%r15d - leal (%r8,%r14,1),%r8d - leal (%rdx,%r12,1),%edx - andnl %ecx,%eax,%r12d - xorl %r15d,%r13d - rorxl $6,%eax,%r14d - leal (%rdx,%r12,1),%edx - xorl %r14d,%r13d - movl %r8d,%r15d - rorxl $22,%r8d,%r12d - leal (%rdx,%r13,1),%edx - xorl %r9d,%r15d - rorxl $13,%r8d,%r14d - rorxl $2,%r8d,%r13d - leal (%r11,%rdx,1),%r11d - andl %r15d,%esi - vpand %xmm12,%xmm11,%xmm8 - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 192-128(%rdi),%xmm10 - xorl %r12d,%r14d - xorl %r9d,%esi - xorl %r13d,%r14d - leal (%rdx,%rsi,1),%edx - movl %eax,%r12d - addl 36(%rsp),%ecx - andl %r11d,%r12d - rorxl $25,%r11d,%r13d - rorxl $11,%r11d,%esi - leal (%rdx,%r14,1),%edx - leal (%rcx,%r12,1),%ecx - andnl %ebx,%r11d,%r12d - xorl %esi,%r13d - rorxl $6,%r11d,%r14d - leal (%rcx,%r12,1),%ecx - xorl %r14d,%r13d - movl %edx,%esi - rorxl $22,%edx,%r12d - leal (%rcx,%r13,1),%ecx - xorl %r8d,%esi - rorxl $13,%edx,%r14d - rorxl $2,%edx,%r13d - leal (%r10,%rcx,1),%r10d - andl %esi,%r15d - vaesenclast %xmm10,%xmm9,%xmm11 - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 208-128(%rdi),%xmm10 - xorl %r12d,%r14d - xorl %r8d,%r15d - xorl %r13d,%r14d - leal (%rcx,%r15,1),%ecx - movl %r11d,%r12d - addl 40(%rsp),%ebx - andl %r10d,%r12d - rorxl $25,%r10d,%r13d - rorxl $11,%r10d,%r15d - leal (%rcx,%r14,1),%ecx - leal (%rbx,%r12,1),%ebx - andnl %eax,%r10d,%r12d - xorl %r15d,%r13d - rorxl $6,%r10d,%r14d - leal (%rbx,%r12,1),%ebx - xorl %r14d,%r13d - movl %ecx,%r15d - rorxl $22,%ecx,%r12d - leal (%rbx,%r13,1),%ebx - xorl %edx,%r15d - rorxl $13,%ecx,%r14d - rorxl $2,%ecx,%r13d - leal (%r9,%rbx,1),%r9d - andl %r15d,%esi - vpand %xmm13,%xmm11,%xmm11 - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 224-128(%rdi),%xmm10 - xorl %r12d,%r14d - xorl %edx,%esi - xorl %r13d,%r14d - leal (%rbx,%rsi,1),%ebx - movl %r10d,%r12d - addl 44(%rsp),%eax - andl %r9d,%r12d - rorxl $25,%r9d,%r13d - rorxl $11,%r9d,%esi - leal (%rbx,%r14,1),%ebx - leal (%rax,%r12,1),%eax - andnl %r11d,%r9d,%r12d - xorl %esi,%r13d - rorxl $6,%r9d,%r14d - leal (%rax,%r12,1),%eax - xorl %r14d,%r13d - movl %ebx,%esi - rorxl $22,%ebx,%r12d - leal (%rax,%r13,1),%eax - xorl %ecx,%esi - rorxl $13,%ebx,%r14d - rorxl $2,%ebx,%r13d - leal (%r8,%rax,1),%r8d - andl %esi,%r15d - vpor %xmm11,%xmm8,%xmm8 - vaesenclast %xmm10,%xmm9,%xmm11 - vmovdqu 0-128(%rdi),%xmm10 - xorl %r12d,%r14d - xorl %ecx,%r15d - xorl %r13d,%r14d - leal (%rax,%r15,1),%eax - movl %r9d,%r12d - vpextrq $1,%xmm15,%r12 - vmovq %xmm15,%r13 - movq 552(%rsp),%r15 - addl %r14d,%eax - leaq 448(%rsp),%rbp - - vpand %xmm14,%xmm11,%xmm11 - vpor %xmm11,%xmm8,%xmm8 - vmovdqu %xmm8,(%r12,%r13,1) - leaq 16(%r13),%r13 - - addl 0(%r15),%eax - addl 4(%r15),%ebx - addl 8(%r15),%ecx - addl 12(%r15),%edx - addl 16(%r15),%r8d - addl 20(%r15),%r9d - addl 24(%r15),%r10d - addl 28(%r15),%r11d - - movl %eax,0(%r15) - movl %ebx,4(%r15) - movl %ecx,8(%r15) - movl %edx,12(%r15) - movl %r8d,16(%r15) - movl %r9d,20(%r15) - movl %r10d,24(%r15) - movl %r11d,28(%r15) - - cmpq 80(%rbp),%r13 - je .Ldone_avx2 - - xorl %r14d,%r14d - movl %ebx,%esi - movl %r9d,%r12d - xorl %ecx,%esi - jmp .Lower_avx2 -.align 16 -.Lower_avx2: - vmovdqu (%r13),%xmm9 - vpinsrq $0,%r13,%xmm15,%xmm15 - addl 0+16(%rbp),%r11d - andl %r8d,%r12d - rorxl $25,%r8d,%r13d - rorxl $11,%r8d,%r15d - leal (%rax,%r14,1),%eax - leal (%r11,%r12,1),%r11d - andnl %r10d,%r8d,%r12d - xorl %r15d,%r13d - rorxl $6,%r8d,%r14d - leal (%r11,%r12,1),%r11d - xorl %r14d,%r13d - movl %eax,%r15d - rorxl $22,%eax,%r12d - leal (%r11,%r13,1),%r11d - xorl 
%ebx,%r15d - rorxl $13,%eax,%r14d - rorxl $2,%eax,%r13d - leal (%rdx,%r11,1),%edx - andl %r15d,%esi - vpxor %xmm10,%xmm9,%xmm9 - vmovdqu 16-128(%rdi),%xmm10 - xorl %r12d,%r14d - xorl %ebx,%esi - xorl %r13d,%r14d - leal (%r11,%rsi,1),%r11d - movl %r8d,%r12d - addl 4+16(%rbp),%r10d - andl %edx,%r12d - rorxl $25,%edx,%r13d - rorxl $11,%edx,%esi - leal (%r11,%r14,1),%r11d - leal (%r10,%r12,1),%r10d - andnl %r9d,%edx,%r12d - xorl %esi,%r13d - rorxl $6,%edx,%r14d - leal (%r10,%r12,1),%r10d - xorl %r14d,%r13d - movl %r11d,%esi - rorxl $22,%r11d,%r12d - leal (%r10,%r13,1),%r10d - xorl %eax,%esi - rorxl $13,%r11d,%r14d - rorxl $2,%r11d,%r13d - leal (%rcx,%r10,1),%ecx - andl %esi,%r15d - vpxor %xmm8,%xmm9,%xmm9 - xorl %r12d,%r14d - xorl %eax,%r15d - xorl %r13d,%r14d - leal (%r10,%r15,1),%r10d - movl %edx,%r12d - addl 8+16(%rbp),%r9d - andl %ecx,%r12d - rorxl $25,%ecx,%r13d - rorxl $11,%ecx,%r15d - leal (%r10,%r14,1),%r10d - leal (%r9,%r12,1),%r9d - andnl %r8d,%ecx,%r12d - xorl %r15d,%r13d - rorxl $6,%ecx,%r14d - leal (%r9,%r12,1),%r9d - xorl %r14d,%r13d - movl %r10d,%r15d - rorxl $22,%r10d,%r12d - leal (%r9,%r13,1),%r9d - xorl %r11d,%r15d - rorxl $13,%r10d,%r14d - rorxl $2,%r10d,%r13d - leal (%rbx,%r9,1),%ebx - andl %r15d,%esi - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 32-128(%rdi),%xmm10 - xorl %r12d,%r14d - xorl %r11d,%esi - xorl %r13d,%r14d - leal (%r9,%rsi,1),%r9d - movl %ecx,%r12d - addl 12+16(%rbp),%r8d - andl %ebx,%r12d - rorxl $25,%ebx,%r13d - rorxl $11,%ebx,%esi - leal (%r9,%r14,1),%r9d - leal (%r8,%r12,1),%r8d - andnl %edx,%ebx,%r12d - xorl %esi,%r13d - rorxl $6,%ebx,%r14d - leal (%r8,%r12,1),%r8d - xorl %r14d,%r13d - movl %r9d,%esi - rorxl $22,%r9d,%r12d - leal (%r8,%r13,1),%r8d - xorl %r10d,%esi - rorxl $13,%r9d,%r14d - rorxl $2,%r9d,%r13d - leal (%rax,%r8,1),%eax - andl %esi,%r15d - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 48-128(%rdi),%xmm10 - xorl %r12d,%r14d - xorl %r10d,%r15d - xorl %r13d,%r14d - leal (%r8,%r15,1),%r8d - movl %ebx,%r12d - addl 32+16(%rbp),%edx - andl %eax,%r12d - rorxl $25,%eax,%r13d - rorxl $11,%eax,%r15d - leal (%r8,%r14,1),%r8d - leal (%rdx,%r12,1),%edx - andnl %ecx,%eax,%r12d - xorl %r15d,%r13d - rorxl $6,%eax,%r14d - leal (%rdx,%r12,1),%edx - xorl %r14d,%r13d - movl %r8d,%r15d - rorxl $22,%r8d,%r12d - leal (%rdx,%r13,1),%edx - xorl %r9d,%r15d - rorxl $13,%r8d,%r14d - rorxl $2,%r8d,%r13d - leal (%r11,%rdx,1),%r11d - andl %r15d,%esi - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 64-128(%rdi),%xmm10 - xorl %r12d,%r14d - xorl %r9d,%esi - xorl %r13d,%r14d - leal (%rdx,%rsi,1),%edx - movl %eax,%r12d - addl 36+16(%rbp),%ecx - andl %r11d,%r12d - rorxl $25,%r11d,%r13d - rorxl $11,%r11d,%esi - leal (%rdx,%r14,1),%edx - leal (%rcx,%r12,1),%ecx - andnl %ebx,%r11d,%r12d - xorl %esi,%r13d - rorxl $6,%r11d,%r14d - leal (%rcx,%r12,1),%ecx - xorl %r14d,%r13d - movl %edx,%esi - rorxl $22,%edx,%r12d - leal (%rcx,%r13,1),%ecx - xorl %r8d,%esi - rorxl $13,%edx,%r14d - rorxl $2,%edx,%r13d - leal (%r10,%rcx,1),%r10d - andl %esi,%r15d - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 80-128(%rdi),%xmm10 - xorl %r12d,%r14d - xorl %r8d,%r15d - xorl %r13d,%r14d - leal (%rcx,%r15,1),%ecx - movl %r11d,%r12d - addl 40+16(%rbp),%ebx - andl %r10d,%r12d - rorxl $25,%r10d,%r13d - rorxl $11,%r10d,%r15d - leal (%rcx,%r14,1),%ecx - leal (%rbx,%r12,1),%ebx - andnl %eax,%r10d,%r12d - xorl %r15d,%r13d - rorxl $6,%r10d,%r14d - leal (%rbx,%r12,1),%ebx - xorl %r14d,%r13d - movl %ecx,%r15d - rorxl $22,%ecx,%r12d - leal (%rbx,%r13,1),%ebx - xorl %edx,%r15d - rorxl $13,%ecx,%r14d - rorxl $2,%ecx,%r13d - leal (%r9,%rbx,1),%r9d - andl 
%r15d,%esi - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 96-128(%rdi),%xmm10 - xorl %r12d,%r14d - xorl %edx,%esi - xorl %r13d,%r14d - leal (%rbx,%rsi,1),%ebx - movl %r10d,%r12d - addl 44+16(%rbp),%eax - andl %r9d,%r12d - rorxl $25,%r9d,%r13d - rorxl $11,%r9d,%esi - leal (%rbx,%r14,1),%ebx - leal (%rax,%r12,1),%eax - andnl %r11d,%r9d,%r12d - xorl %esi,%r13d - rorxl $6,%r9d,%r14d - leal (%rax,%r12,1),%eax - xorl %r14d,%r13d - movl %ebx,%esi - rorxl $22,%ebx,%r12d - leal (%rax,%r13,1),%eax - xorl %ecx,%esi - rorxl $13,%ebx,%r14d - rorxl $2,%ebx,%r13d - leal (%r8,%rax,1),%r8d - andl %esi,%r15d - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 112-128(%rdi),%xmm10 - xorl %r12d,%r14d - xorl %ecx,%r15d - xorl %r13d,%r14d - leal (%rax,%r15,1),%eax - movl %r9d,%r12d - leaq -64(%rbp),%rbp - addl 0+16(%rbp),%r11d - andl %r8d,%r12d - rorxl $25,%r8d,%r13d - rorxl $11,%r8d,%r15d - leal (%rax,%r14,1),%eax - leal (%r11,%r12,1),%r11d - andnl %r10d,%r8d,%r12d - xorl %r15d,%r13d - rorxl $6,%r8d,%r14d - leal (%r11,%r12,1),%r11d - xorl %r14d,%r13d - movl %eax,%r15d - rorxl $22,%eax,%r12d - leal (%r11,%r13,1),%r11d - xorl %ebx,%r15d - rorxl $13,%eax,%r14d - rorxl $2,%eax,%r13d - leal (%rdx,%r11,1),%edx - andl %r15d,%esi - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 128-128(%rdi),%xmm10 - xorl %r12d,%r14d - xorl %ebx,%esi - xorl %r13d,%r14d - leal (%r11,%rsi,1),%r11d - movl %r8d,%r12d - addl 4+16(%rbp),%r10d - andl %edx,%r12d - rorxl $25,%edx,%r13d - rorxl $11,%edx,%esi - leal (%r11,%r14,1),%r11d - leal (%r10,%r12,1),%r10d - andnl %r9d,%edx,%r12d - xorl %esi,%r13d - rorxl $6,%edx,%r14d - leal (%r10,%r12,1),%r10d - xorl %r14d,%r13d - movl %r11d,%esi - rorxl $22,%r11d,%r12d - leal (%r10,%r13,1),%r10d - xorl %eax,%esi - rorxl $13,%r11d,%r14d - rorxl $2,%r11d,%r13d - leal (%rcx,%r10,1),%ecx - andl %esi,%r15d - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 144-128(%rdi),%xmm10 - xorl %r12d,%r14d - xorl %eax,%r15d - xorl %r13d,%r14d - leal (%r10,%r15,1),%r10d - movl %edx,%r12d - addl 8+16(%rbp),%r9d - andl %ecx,%r12d - rorxl $25,%ecx,%r13d - rorxl $11,%ecx,%r15d - leal (%r10,%r14,1),%r10d - leal (%r9,%r12,1),%r9d - andnl %r8d,%ecx,%r12d - xorl %r15d,%r13d - rorxl $6,%ecx,%r14d - leal (%r9,%r12,1),%r9d - xorl %r14d,%r13d - movl %r10d,%r15d - rorxl $22,%r10d,%r12d - leal (%r9,%r13,1),%r9d - xorl %r11d,%r15d - rorxl $13,%r10d,%r14d - rorxl $2,%r10d,%r13d - leal (%rbx,%r9,1),%ebx - andl %r15d,%esi - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 160-128(%rdi),%xmm10 - xorl %r12d,%r14d - xorl %r11d,%esi - xorl %r13d,%r14d - leal (%r9,%rsi,1),%r9d - movl %ecx,%r12d - addl 12+16(%rbp),%r8d - andl %ebx,%r12d - rorxl $25,%ebx,%r13d - rorxl $11,%ebx,%esi - leal (%r9,%r14,1),%r9d - leal (%r8,%r12,1),%r8d - andnl %edx,%ebx,%r12d - xorl %esi,%r13d - rorxl $6,%ebx,%r14d - leal (%r8,%r12,1),%r8d - xorl %r14d,%r13d - movl %r9d,%esi - rorxl $22,%r9d,%r12d - leal (%r8,%r13,1),%r8d - xorl %r10d,%esi - rorxl $13,%r9d,%r14d - rorxl $2,%r9d,%r13d - leal (%rax,%r8,1),%eax - andl %esi,%r15d - vaesenclast %xmm10,%xmm9,%xmm11 - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 176-128(%rdi),%xmm10 - xorl %r12d,%r14d - xorl %r10d,%r15d - xorl %r13d,%r14d - leal (%r8,%r15,1),%r8d - movl %ebx,%r12d - addl 32+16(%rbp),%edx - andl %eax,%r12d - rorxl $25,%eax,%r13d - rorxl $11,%eax,%r15d - leal (%r8,%r14,1),%r8d - leal (%rdx,%r12,1),%edx - andnl %ecx,%eax,%r12d - xorl %r15d,%r13d - rorxl $6,%eax,%r14d - leal (%rdx,%r12,1),%edx - xorl %r14d,%r13d - movl %r8d,%r15d - rorxl $22,%r8d,%r12d - leal (%rdx,%r13,1),%edx - xorl %r9d,%r15d - rorxl $13,%r8d,%r14d - rorxl $2,%r8d,%r13d - leal (%r11,%rdx,1),%r11d - 
andl %r15d,%esi - vpand %xmm12,%xmm11,%xmm8 - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 192-128(%rdi),%xmm10 - xorl %r12d,%r14d - xorl %r9d,%esi - xorl %r13d,%r14d - leal (%rdx,%rsi,1),%edx - movl %eax,%r12d - addl 36+16(%rbp),%ecx - andl %r11d,%r12d - rorxl $25,%r11d,%r13d - rorxl $11,%r11d,%esi - leal (%rdx,%r14,1),%edx - leal (%rcx,%r12,1),%ecx - andnl %ebx,%r11d,%r12d - xorl %esi,%r13d - rorxl $6,%r11d,%r14d - leal (%rcx,%r12,1),%ecx - xorl %r14d,%r13d - movl %edx,%esi - rorxl $22,%edx,%r12d - leal (%rcx,%r13,1),%ecx - xorl %r8d,%esi - rorxl $13,%edx,%r14d - rorxl $2,%edx,%r13d - leal (%r10,%rcx,1),%r10d - andl %esi,%r15d - vaesenclast %xmm10,%xmm9,%xmm11 - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 208-128(%rdi),%xmm10 - xorl %r12d,%r14d - xorl %r8d,%r15d - xorl %r13d,%r14d - leal (%rcx,%r15,1),%ecx - movl %r11d,%r12d - addl 40+16(%rbp),%ebx - andl %r10d,%r12d - rorxl $25,%r10d,%r13d - rorxl $11,%r10d,%r15d - leal (%rcx,%r14,1),%ecx - leal (%rbx,%r12,1),%ebx - andnl %eax,%r10d,%r12d - xorl %r15d,%r13d - rorxl $6,%r10d,%r14d - leal (%rbx,%r12,1),%ebx - xorl %r14d,%r13d - movl %ecx,%r15d - rorxl $22,%ecx,%r12d - leal (%rbx,%r13,1),%ebx - xorl %edx,%r15d - rorxl $13,%ecx,%r14d - rorxl $2,%ecx,%r13d - leal (%r9,%rbx,1),%r9d - andl %r15d,%esi - vpand %xmm13,%xmm11,%xmm11 - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 224-128(%rdi),%xmm10 - xorl %r12d,%r14d - xorl %edx,%esi - xorl %r13d,%r14d - leal (%rbx,%rsi,1),%ebx - movl %r10d,%r12d - addl 44+16(%rbp),%eax - andl %r9d,%r12d - rorxl $25,%r9d,%r13d - rorxl $11,%r9d,%esi - leal (%rbx,%r14,1),%ebx - leal (%rax,%r12,1),%eax - andnl %r11d,%r9d,%r12d - xorl %esi,%r13d - rorxl $6,%r9d,%r14d - leal (%rax,%r12,1),%eax - xorl %r14d,%r13d - movl %ebx,%esi - rorxl $22,%ebx,%r12d - leal (%rax,%r13,1),%eax - xorl %ecx,%esi - rorxl $13,%ebx,%r14d - rorxl $2,%ebx,%r13d - leal (%r8,%rax,1),%r8d - andl %esi,%r15d - vpor %xmm11,%xmm8,%xmm8 - vaesenclast %xmm10,%xmm9,%xmm11 - vmovdqu 0-128(%rdi),%xmm10 - xorl %r12d,%r14d - xorl %ecx,%r15d - xorl %r13d,%r14d - leal (%rax,%r15,1),%eax - movl %r9d,%r12d - vmovq %xmm15,%r13 - vpextrq $1,%xmm15,%r15 - vpand %xmm14,%xmm11,%xmm11 - vpor %xmm11,%xmm8,%xmm8 - leaq -64(%rbp),%rbp - vmovdqu %xmm8,(%r15,%r13,1) - leaq 16(%r13),%r13 - cmpq %rsp,%rbp - jae .Lower_avx2 - - movq 552(%rsp),%r15 - leaq 64(%r13),%r13 - movq 560(%rsp),%rsi - addl %r14d,%eax - leaq 448(%rsp),%rsp - - addl 0(%r15),%eax - addl 4(%r15),%ebx - addl 8(%r15),%ecx - addl 12(%r15),%edx - addl 16(%r15),%r8d - addl 20(%r15),%r9d - addl 24(%r15),%r10d - leaq (%rsi,%r13,1),%r12 - addl 28(%r15),%r11d - - cmpq 64+16(%rsp),%r13 - - movl %eax,0(%r15) - cmoveq %rsp,%r12 - movl %ebx,4(%r15) - movl %ecx,8(%r15) - movl %edx,12(%r15) - movl %r8d,16(%r15) - movl %r9d,20(%r15) - movl %r10d,24(%r15) - movl %r11d,28(%r15) - - jbe .Loop_avx2 - leaq (%rsp),%rbp - -.Ldone_avx2: - leaq (%rbp),%rsp - movq 64+32(%rsp),%r8 - movq 120(%rsp),%rsi -.cfi_def_cfa %rsi,8 - vmovdqu %xmm8,(%r8) - vzeroall - movq -48(%rsi),%r15 -.cfi_restore %r15 - movq -40(%rsi),%r14 -.cfi_restore %r14 - movq -32(%rsi),%r13 -.cfi_restore %r13 - movq -24(%rsi),%r12 -.cfi_restore %r12 - movq -16(%rsi),%rbp -.cfi_restore %rbp - movq -8(%rsi),%rbx -.cfi_restore %rbx - leaq (%rsi),%rsp -.cfi_def_cfa_register %rsp -.Lepilogue_avx2: - .byte 0xf3,0xc3 -.cfi_endproc -.size aesni_cbc_sha256_enc_avx2,.-aesni_cbc_sha256_enc_avx2 -.type aesni_cbc_sha256_enc_shaext,@function -.align 32 -aesni_cbc_sha256_enc_shaext: - movq 8(%rsp),%r10 - leaq K256+128(%rip),%rax - movdqu (%r9),%xmm1 - movdqu 16(%r9),%xmm2 - movdqa 512-128(%rax),%xmm3 
- - movl 240(%rcx),%r11d - subq %rdi,%rsi - movups (%rcx),%xmm15 - movups (%r8),%xmm6 - movups 16(%rcx),%xmm4 - leaq 112(%rcx),%rcx - - pshufd $0x1b,%xmm1,%xmm0 - pshufd $0xb1,%xmm1,%xmm1 - pshufd $0x1b,%xmm2,%xmm2 - movdqa %xmm3,%xmm7 -.byte 102,15,58,15,202,8 - punpcklqdq %xmm0,%xmm2 - - jmp .Loop_shaext - -.align 16 -.Loop_shaext: - movdqu (%r10),%xmm10 - movdqu 16(%r10),%xmm11 - movdqu 32(%r10),%xmm12 -.byte 102,68,15,56,0,211 - movdqu 48(%r10),%xmm13 - - movdqa 0-128(%rax),%xmm0 - paddd %xmm10,%xmm0 -.byte 102,68,15,56,0,219 - movdqa %xmm2,%xmm9 - movdqa %xmm1,%xmm8 - movups 0(%rdi),%xmm14 - xorps %xmm15,%xmm14 - xorps %xmm14,%xmm6 - movups -80(%rcx),%xmm5 - aesenc %xmm4,%xmm6 -.byte 15,56,203,209 - pshufd $0x0e,%xmm0,%xmm0 - movups -64(%rcx),%xmm4 - aesenc %xmm5,%xmm6 -.byte 15,56,203,202 - - movdqa 32-128(%rax),%xmm0 - paddd %xmm11,%xmm0 -.byte 102,68,15,56,0,227 - leaq 64(%r10),%r10 - movups -48(%rcx),%xmm5 - aesenc %xmm4,%xmm6 -.byte 15,56,203,209 - pshufd $0x0e,%xmm0,%xmm0 - movups -32(%rcx),%xmm4 - aesenc %xmm5,%xmm6 -.byte 15,56,203,202 - - movdqa 64-128(%rax),%xmm0 - paddd %xmm12,%xmm0 -.byte 102,68,15,56,0,235 -.byte 69,15,56,204,211 - movups -16(%rcx),%xmm5 - aesenc %xmm4,%xmm6 -.byte 15,56,203,209 - pshufd $0x0e,%xmm0,%xmm0 - movdqa %xmm13,%xmm3 -.byte 102,65,15,58,15,220,4 - paddd %xmm3,%xmm10 - movups 0(%rcx),%xmm4 - aesenc %xmm5,%xmm6 -.byte 15,56,203,202 - - movdqa 96-128(%rax),%xmm0 - paddd %xmm13,%xmm0 -.byte 69,15,56,205,213 -.byte 69,15,56,204,220 - movups 16(%rcx),%xmm5 - aesenc %xmm4,%xmm6 -.byte 15,56,203,209 - pshufd $0x0e,%xmm0,%xmm0 - movups 32(%rcx),%xmm4 - aesenc %xmm5,%xmm6 - movdqa %xmm10,%xmm3 -.byte 102,65,15,58,15,221,4 - paddd %xmm3,%xmm11 -.byte 15,56,203,202 - movdqa 128-128(%rax),%xmm0 - paddd %xmm10,%xmm0 -.byte 69,15,56,205,218 -.byte 69,15,56,204,229 - movups 48(%rcx),%xmm5 - aesenc %xmm4,%xmm6 -.byte 15,56,203,209 - pshufd $0x0e,%xmm0,%xmm0 - movdqa %xmm11,%xmm3 -.byte 102,65,15,58,15,218,4 - paddd %xmm3,%xmm12 - cmpl $11,%r11d - jb .Laesenclast1 - movups 64(%rcx),%xmm4 - aesenc %xmm5,%xmm6 - movups 80(%rcx),%xmm5 - aesenc %xmm4,%xmm6 - je .Laesenclast1 - movups 96(%rcx),%xmm4 - aesenc %xmm5,%xmm6 - movups 112(%rcx),%xmm5 - aesenc %xmm4,%xmm6 -.Laesenclast1: - aesenclast %xmm5,%xmm6 - movups 16-112(%rcx),%xmm4 - nop -.byte 15,56,203,202 - movups 16(%rdi),%xmm14 - xorps %xmm15,%xmm14 - movups %xmm6,0(%rsi,%rdi,1) - xorps %xmm14,%xmm6 - movups -80(%rcx),%xmm5 - aesenc %xmm4,%xmm6 - movdqa 160-128(%rax),%xmm0 - paddd %xmm11,%xmm0 -.byte 69,15,56,205,227 -.byte 69,15,56,204,234 - movups -64(%rcx),%xmm4 - aesenc %xmm5,%xmm6 -.byte 15,56,203,209 - pshufd $0x0e,%xmm0,%xmm0 - movdqa %xmm12,%xmm3 -.byte 102,65,15,58,15,219,4 - paddd %xmm3,%xmm13 - movups -48(%rcx),%xmm5 - aesenc %xmm4,%xmm6 -.byte 15,56,203,202 - movdqa 192-128(%rax),%xmm0 - paddd %xmm12,%xmm0 -.byte 69,15,56,205,236 -.byte 69,15,56,204,211 - movups -32(%rcx),%xmm4 - aesenc %xmm5,%xmm6 -.byte 15,56,203,209 - pshufd $0x0e,%xmm0,%xmm0 - movdqa %xmm13,%xmm3 -.byte 102,65,15,58,15,220,4 - paddd %xmm3,%xmm10 - movups -16(%rcx),%xmm5 - aesenc %xmm4,%xmm6 -.byte 15,56,203,202 - movdqa 224-128(%rax),%xmm0 - paddd %xmm13,%xmm0 -.byte 69,15,56,205,213 -.byte 69,15,56,204,220 - movups 0(%rcx),%xmm4 - aesenc %xmm5,%xmm6 -.byte 15,56,203,209 - pshufd $0x0e,%xmm0,%xmm0 - movdqa %xmm10,%xmm3 -.byte 102,65,15,58,15,221,4 - paddd %xmm3,%xmm11 - movups 16(%rcx),%xmm5 - aesenc %xmm4,%xmm6 -.byte 15,56,203,202 - movdqa 256-128(%rax),%xmm0 - paddd %xmm10,%xmm0 -.byte 69,15,56,205,218 -.byte 69,15,56,204,229 
- movups 32(%rcx),%xmm4 - aesenc %xmm5,%xmm6 -.byte 15,56,203,209 - pshufd $0x0e,%xmm0,%xmm0 - movdqa %xmm11,%xmm3 -.byte 102,65,15,58,15,218,4 - paddd %xmm3,%xmm12 - movups 48(%rcx),%xmm5 - aesenc %xmm4,%xmm6 - cmpl $11,%r11d - jb .Laesenclast2 - movups 64(%rcx),%xmm4 - aesenc %xmm5,%xmm6 - movups 80(%rcx),%xmm5 - aesenc %xmm4,%xmm6 - je .Laesenclast2 - movups 96(%rcx),%xmm4 - aesenc %xmm5,%xmm6 - movups 112(%rcx),%xmm5 - aesenc %xmm4,%xmm6 -.Laesenclast2: - aesenclast %xmm5,%xmm6 - movups 16-112(%rcx),%xmm4 - nop -.byte 15,56,203,202 - movups 32(%rdi),%xmm14 - xorps %xmm15,%xmm14 - movups %xmm6,16(%rsi,%rdi,1) - xorps %xmm14,%xmm6 - movups -80(%rcx),%xmm5 - aesenc %xmm4,%xmm6 - movdqa 288-128(%rax),%xmm0 - paddd %xmm11,%xmm0 -.byte 69,15,56,205,227 -.byte 69,15,56,204,234 - movups -64(%rcx),%xmm4 - aesenc %xmm5,%xmm6 -.byte 15,56,203,209 - pshufd $0x0e,%xmm0,%xmm0 - movdqa %xmm12,%xmm3 -.byte 102,65,15,58,15,219,4 - paddd %xmm3,%xmm13 - movups -48(%rcx),%xmm5 - aesenc %xmm4,%xmm6 -.byte 15,56,203,202 - movdqa 320-128(%rax),%xmm0 - paddd %xmm12,%xmm0 -.byte 69,15,56,205,236 -.byte 69,15,56,204,211 - movups -32(%rcx),%xmm4 - aesenc %xmm5,%xmm6 -.byte 15,56,203,209 - pshufd $0x0e,%xmm0,%xmm0 - movdqa %xmm13,%xmm3 -.byte 102,65,15,58,15,220,4 - paddd %xmm3,%xmm10 - movups -16(%rcx),%xmm5 - aesenc %xmm4,%xmm6 -.byte 15,56,203,202 - movdqa 352-128(%rax),%xmm0 - paddd %xmm13,%xmm0 -.byte 69,15,56,205,213 -.byte 69,15,56,204,220 - movups 0(%rcx),%xmm4 - aesenc %xmm5,%xmm6 -.byte 15,56,203,209 - pshufd $0x0e,%xmm0,%xmm0 - movdqa %xmm10,%xmm3 -.byte 102,65,15,58,15,221,4 - paddd %xmm3,%xmm11 - movups 16(%rcx),%xmm5 - aesenc %xmm4,%xmm6 -.byte 15,56,203,202 - movdqa 384-128(%rax),%xmm0 - paddd %xmm10,%xmm0 -.byte 69,15,56,205,218 -.byte 69,15,56,204,229 - movups 32(%rcx),%xmm4 - aesenc %xmm5,%xmm6 -.byte 15,56,203,209 - pshufd $0x0e,%xmm0,%xmm0 - movdqa %xmm11,%xmm3 -.byte 102,65,15,58,15,218,4 - paddd %xmm3,%xmm12 - movups 48(%rcx),%xmm5 - aesenc %xmm4,%xmm6 -.byte 15,56,203,202 - movdqa 416-128(%rax),%xmm0 - paddd %xmm11,%xmm0 -.byte 69,15,56,205,227 -.byte 69,15,56,204,234 - cmpl $11,%r11d - jb .Laesenclast3 - movups 64(%rcx),%xmm4 - aesenc %xmm5,%xmm6 - movups 80(%rcx),%xmm5 - aesenc %xmm4,%xmm6 - je .Laesenclast3 - movups 96(%rcx),%xmm4 - aesenc %xmm5,%xmm6 - movups 112(%rcx),%xmm5 - aesenc %xmm4,%xmm6 -.Laesenclast3: - aesenclast %xmm5,%xmm6 - movups 16-112(%rcx),%xmm4 - nop -.byte 15,56,203,209 - pshufd $0x0e,%xmm0,%xmm0 - movdqa %xmm12,%xmm3 -.byte 102,65,15,58,15,219,4 - paddd %xmm3,%xmm13 - movups 48(%rdi),%xmm14 - xorps %xmm15,%xmm14 - movups %xmm6,32(%rsi,%rdi,1) - xorps %xmm14,%xmm6 - movups -80(%rcx),%xmm5 - aesenc %xmm4,%xmm6 - movups -64(%rcx),%xmm4 - aesenc %xmm5,%xmm6 -.byte 15,56,203,202 - - movdqa 448-128(%rax),%xmm0 - paddd %xmm12,%xmm0 -.byte 69,15,56,205,236 - movdqa %xmm7,%xmm3 - movups -48(%rcx),%xmm5 - aesenc %xmm4,%xmm6 -.byte 15,56,203,209 - pshufd $0x0e,%xmm0,%xmm0 - movups -32(%rcx),%xmm4 - aesenc %xmm5,%xmm6 -.byte 15,56,203,202 - - movdqa 480-128(%rax),%xmm0 - paddd %xmm13,%xmm0 - movups -16(%rcx),%xmm5 - aesenc %xmm4,%xmm6 - movups 0(%rcx),%xmm4 - aesenc %xmm5,%xmm6 -.byte 15,56,203,209 - pshufd $0x0e,%xmm0,%xmm0 - movups 16(%rcx),%xmm5 - aesenc %xmm4,%xmm6 -.byte 15,56,203,202 - - movups 32(%rcx),%xmm4 - aesenc %xmm5,%xmm6 - movups 48(%rcx),%xmm5 - aesenc %xmm4,%xmm6 - cmpl $11,%r11d - jb .Laesenclast4 - movups 64(%rcx),%xmm4 - aesenc %xmm5,%xmm6 - movups 80(%rcx),%xmm5 - aesenc %xmm4,%xmm6 - je .Laesenclast4 - movups 96(%rcx),%xmm4 - aesenc %xmm5,%xmm6 - movups 
112(%rcx),%xmm5 - aesenc %xmm4,%xmm6 -.Laesenclast4: - aesenclast %xmm5,%xmm6 - movups 16-112(%rcx),%xmm4 - nop - - paddd %xmm9,%xmm2 - paddd %xmm8,%xmm1 - - decq %rdx - movups %xmm6,48(%rsi,%rdi,1) - leaq 64(%rdi),%rdi - jnz .Loop_shaext - - pshufd $0xb1,%xmm2,%xmm2 - pshufd $0x1b,%xmm1,%xmm3 - pshufd $0xb1,%xmm1,%xmm1 - punpckhqdq %xmm2,%xmm1 -.byte 102,15,58,15,211,8 - - movups %xmm6,(%r8) - movdqu %xmm1,(%r9) - movdqu %xmm2,16(%r9) - .byte 0xf3,0xc3 -.size aesni_cbc_sha256_enc_shaext,.-aesni_cbc_sha256_enc_shaext diff --git a/secure/lib/libcrypto/amd64/aesni-x86_64.S b/secure/lib/libcrypto/amd64/aesni-x86_64.S index e2ef2d6666cb..ce3ba1266de1 100644 --- a/secure/lib/libcrypto/amd64/aesni-x86_64.S +++ b/secure/lib/libcrypto/amd64/aesni-x86_64.S @@ -863,6 +863,7 @@ aesni_ecb_encrypt: .type aesni_ccm64_encrypt_blocks,@function .align 16 aesni_ccm64_encrypt_blocks: +.cfi_startproc movl 240(%rcx),%eax movdqu (%r8),%xmm6 movdqa .Lincrement64(%rip),%xmm9 @@ -921,11 +922,13 @@ aesni_ccm64_encrypt_blocks: pxor %xmm8,%xmm8 pxor %xmm6,%xmm6 .byte 0xf3,0xc3 +.cfi_endproc .size aesni_ccm64_encrypt_blocks,.-aesni_ccm64_encrypt_blocks .globl aesni_ccm64_decrypt_blocks .type aesni_ccm64_decrypt_blocks,@function .align 16 aesni_ccm64_decrypt_blocks: +.cfi_startproc movl 240(%rcx),%eax movups (%r8),%xmm6 movdqu (%r9),%xmm3 @@ -1018,6 +1021,7 @@ aesni_ccm64_decrypt_blocks: pxor %xmm8,%xmm8 pxor %xmm6,%xmm6 .byte 0xf3,0xc3 +.cfi_endproc .size aesni_ccm64_decrypt_blocks,.-aesni_ccm64_decrypt_blocks .globl aesni_ctr32_encrypt_blocks .type aesni_ctr32_encrypt_blocks,@function @@ -2792,6 +2796,7 @@ aesni_ocb_encrypt: .type __ocb_encrypt6,@function .align 32 __ocb_encrypt6: +.cfi_startproc pxor %xmm9,%xmm15 movdqu (%rbx,%r12,1),%xmm11 movdqa %xmm10,%xmm12 @@ -2889,11 +2894,13 @@ __ocb_encrypt6: .byte 102,65,15,56,221,246 .byte 102,65,15,56,221,255 .byte 0xf3,0xc3 +.cfi_endproc .size __ocb_encrypt6,.-__ocb_encrypt6 .type __ocb_encrypt4,@function .align 32 __ocb_encrypt4: +.cfi_startproc pxor %xmm9,%xmm15 movdqu (%rbx,%r12,1),%xmm11 movdqa %xmm10,%xmm12 @@ -2958,11 +2965,13 @@ __ocb_encrypt4: .byte 102,65,15,56,221,228 .byte 102,65,15,56,221,237 .byte 0xf3,0xc3 +.cfi_endproc .size __ocb_encrypt4,.-__ocb_encrypt4 .type __ocb_encrypt1,@function .align 32 __ocb_encrypt1: +.cfi_startproc pxor %xmm15,%xmm7 pxor %xmm9,%xmm7 pxor %xmm2,%xmm8 @@ -2993,6 +3002,7 @@ __ocb_encrypt1: .byte 102,15,56,221,215 .byte 0xf3,0xc3 +.cfi_endproc .size __ocb_encrypt1,.-__ocb_encrypt1 .globl aesni_ocb_decrypt @@ -3235,6 +3245,7 @@ aesni_ocb_decrypt: .type __ocb_decrypt6,@function .align 32 __ocb_decrypt6: +.cfi_startproc pxor %xmm9,%xmm15 movdqu (%rbx,%r12,1),%xmm11 movdqa %xmm10,%xmm12 @@ -3326,11 +3337,13 @@ __ocb_decrypt6: .byte 102,65,15,56,223,246 .byte 102,65,15,56,223,255 .byte 0xf3,0xc3 +.cfi_endproc .size __ocb_decrypt6,.-__ocb_decrypt6 .type __ocb_decrypt4,@function .align 32 __ocb_decrypt4: +.cfi_startproc pxor %xmm9,%xmm15 movdqu (%rbx,%r12,1),%xmm11 movdqa %xmm10,%xmm12 @@ -3391,11 +3404,13 @@ __ocb_decrypt4: .byte 102,65,15,56,223,228 .byte 102,65,15,56,223,237 .byte 0xf3,0xc3 +.cfi_endproc .size __ocb_decrypt4,.-__ocb_decrypt4 .type __ocb_decrypt1,@function .align 32 __ocb_decrypt1: +.cfi_startproc pxor %xmm15,%xmm7 pxor %xmm9,%xmm7 pxor %xmm7,%xmm2 @@ -3425,6 +3440,7 @@ __ocb_decrypt1: .byte 102,15,56,223,215 .byte 0xf3,0xc3 +.cfi_endproc .size __ocb_decrypt1,.-__ocb_decrypt1 .globl aesni_cbc_encrypt .type aesni_cbc_encrypt,@function @@ -4363,7 +4379,6 @@ __aesni_set_encrypt_key: addq $8,%rsp .cfi_adjust_cfa_offset -8 
.byte 0xf3,0xc3 -.cfi_endproc .LSEH_end_set_encrypt_key: .align 16 @@ -4434,6 +4449,7 @@ __aesni_set_encrypt_key: shufps $170,%xmm1,%xmm1 xorps %xmm1,%xmm2 .byte 0xf3,0xc3 +.cfi_endproc .size aesni_set_encrypt_key,.-aesni_set_encrypt_key .size __aesni_set_encrypt_key,.-__aesni_set_encrypt_key .align 64 diff --git a/secure/lib/libcrypto/amd64/chacha-x86_64.S b/secure/lib/libcrypto/amd64/chacha-x86_64.S index b01c1b87d47b..0b3d5b8b6db4 100644 --- a/secure/lib/libcrypto/amd64/chacha-x86_64.S +++ b/secure/lib/libcrypto/amd64/chacha-x86_64.S @@ -331,8 +331,6 @@ ChaCha20_ssse3: .LChaCha20_ssse3: movq %rsp,%r9 .cfi_def_cfa_register %r9 - testl $2048,%r10d - jnz .LChaCha20_4xop cmpq $128,%rdx je .LChaCha20_128 ja .LChaCha20_4x @@ -628,9 +626,6 @@ ChaCha20_4x: movq %rsp,%r9 .cfi_def_cfa_register %r9 movq %r10,%r11 - shrq $32,%r10 - testq $32,%r10 - jnz .LChaCha20_8x cmpq $192,%rdx ja .Lproceed4x @@ -1172,1024 +1167,3 @@ ChaCha20_4x: .byte 0xf3,0xc3 .cfi_endproc .size ChaCha20_4x,.-ChaCha20_4x -.type ChaCha20_4xop,@function -.align 32 -ChaCha20_4xop: -.cfi_startproc -.LChaCha20_4xop: - movq %rsp,%r9 -.cfi_def_cfa_register %r9 - subq $0x140+8,%rsp - vzeroupper - - vmovdqa .Lsigma(%rip),%xmm11 - vmovdqu (%rcx),%xmm3 - vmovdqu 16(%rcx),%xmm15 - vmovdqu (%r8),%xmm7 - leaq 256(%rsp),%rcx - - vpshufd $0x00,%xmm11,%xmm8 - vpshufd $0x55,%xmm11,%xmm9 - vmovdqa %xmm8,64(%rsp) - vpshufd $0xaa,%xmm11,%xmm10 - vmovdqa %xmm9,80(%rsp) - vpshufd $0xff,%xmm11,%xmm11 - vmovdqa %xmm10,96(%rsp) - vmovdqa %xmm11,112(%rsp) - - vpshufd $0x00,%xmm3,%xmm0 - vpshufd $0x55,%xmm3,%xmm1 - vmovdqa %xmm0,128-256(%rcx) - vpshufd $0xaa,%xmm3,%xmm2 - vmovdqa %xmm1,144-256(%rcx) - vpshufd $0xff,%xmm3,%xmm3 - vmovdqa %xmm2,160-256(%rcx) - vmovdqa %xmm3,176-256(%rcx) - - vpshufd $0x00,%xmm15,%xmm12 - vpshufd $0x55,%xmm15,%xmm13 - vmovdqa %xmm12,192-256(%rcx) - vpshufd $0xaa,%xmm15,%xmm14 - vmovdqa %xmm13,208-256(%rcx) - vpshufd $0xff,%xmm15,%xmm15 - vmovdqa %xmm14,224-256(%rcx) - vmovdqa %xmm15,240-256(%rcx) - - vpshufd $0x00,%xmm7,%xmm4 - vpshufd $0x55,%xmm7,%xmm5 - vpaddd .Linc(%rip),%xmm4,%xmm4 - vpshufd $0xaa,%xmm7,%xmm6 - vmovdqa %xmm5,272-256(%rcx) - vpshufd $0xff,%xmm7,%xmm7 - vmovdqa %xmm6,288-256(%rcx) - vmovdqa %xmm7,304-256(%rcx) - - jmp .Loop_enter4xop - -.align 32 -.Loop_outer4xop: - vmovdqa 64(%rsp),%xmm8 - vmovdqa 80(%rsp),%xmm9 - vmovdqa 96(%rsp),%xmm10 - vmovdqa 112(%rsp),%xmm11 - vmovdqa 128-256(%rcx),%xmm0 - vmovdqa 144-256(%rcx),%xmm1 - vmovdqa 160-256(%rcx),%xmm2 - vmovdqa 176-256(%rcx),%xmm3 - vmovdqa 192-256(%rcx),%xmm12 - vmovdqa 208-256(%rcx),%xmm13 - vmovdqa 224-256(%rcx),%xmm14 - vmovdqa 240-256(%rcx),%xmm15 - vmovdqa 256-256(%rcx),%xmm4 - vmovdqa 272-256(%rcx),%xmm5 - vmovdqa 288-256(%rcx),%xmm6 - vmovdqa 304-256(%rcx),%xmm7 - vpaddd .Lfour(%rip),%xmm4,%xmm4 - -.Loop_enter4xop: - movl $10,%eax - vmovdqa %xmm4,256-256(%rcx) - jmp .Loop4xop - -.align 32 -.Loop4xop: - vpaddd %xmm0,%xmm8,%xmm8 - vpaddd %xmm1,%xmm9,%xmm9 - vpaddd %xmm2,%xmm10,%xmm10 - vpaddd %xmm3,%xmm11,%xmm11 - vpxor %xmm4,%xmm8,%xmm4 - vpxor %xmm5,%xmm9,%xmm5 - vpxor %xmm6,%xmm10,%xmm6 - vpxor %xmm7,%xmm11,%xmm7 -.byte 143,232,120,194,228,16 -.byte 143,232,120,194,237,16 -.byte 143,232,120,194,246,16 -.byte 143,232,120,194,255,16 - vpaddd %xmm4,%xmm12,%xmm12 - vpaddd %xmm5,%xmm13,%xmm13 - vpaddd %xmm6,%xmm14,%xmm14 - vpaddd %xmm7,%xmm15,%xmm15 - vpxor %xmm0,%xmm12,%xmm0 - vpxor %xmm1,%xmm13,%xmm1 - vpxor %xmm14,%xmm2,%xmm2 - vpxor %xmm15,%xmm3,%xmm3 -.byte 143,232,120,194,192,12 -.byte 143,232,120,194,201,12 -.byte 143,232,120,194,210,12 
-.byte 143,232,120,194,219,12 - vpaddd %xmm8,%xmm0,%xmm8 - vpaddd %xmm9,%xmm1,%xmm9 - vpaddd %xmm2,%xmm10,%xmm10 - vpaddd %xmm3,%xmm11,%xmm11 - vpxor %xmm4,%xmm8,%xmm4 - vpxor %xmm5,%xmm9,%xmm5 - vpxor %xmm6,%xmm10,%xmm6 - vpxor %xmm7,%xmm11,%xmm7 -.byte 143,232,120,194,228,8 -.byte 143,232,120,194,237,8 -.byte 143,232,120,194,246,8 -.byte 143,232,120,194,255,8 - vpaddd %xmm4,%xmm12,%xmm12 - vpaddd %xmm5,%xmm13,%xmm13 - vpaddd %xmm6,%xmm14,%xmm14 - vpaddd %xmm7,%xmm15,%xmm15 - vpxor %xmm0,%xmm12,%xmm0 - vpxor %xmm1,%xmm13,%xmm1 - vpxor %xmm14,%xmm2,%xmm2 - vpxor %xmm15,%xmm3,%xmm3 -.byte 143,232,120,194,192,7 -.byte 143,232,120,194,201,7 -.byte 143,232,120,194,210,7 -.byte 143,232,120,194,219,7 - vpaddd %xmm1,%xmm8,%xmm8 - vpaddd %xmm2,%xmm9,%xmm9 - vpaddd %xmm3,%xmm10,%xmm10 - vpaddd %xmm0,%xmm11,%xmm11 - vpxor %xmm7,%xmm8,%xmm7 - vpxor %xmm4,%xmm9,%xmm4 - vpxor %xmm5,%xmm10,%xmm5 - vpxor %xmm6,%xmm11,%xmm6 -.byte 143,232,120,194,255,16 -.byte 143,232,120,194,228,16 -.byte 143,232,120,194,237,16 -.byte 143,232,120,194,246,16 - vpaddd %xmm7,%xmm14,%xmm14 - vpaddd %xmm4,%xmm15,%xmm15 - vpaddd %xmm5,%xmm12,%xmm12 - vpaddd %xmm6,%xmm13,%xmm13 - vpxor %xmm1,%xmm14,%xmm1 - vpxor %xmm2,%xmm15,%xmm2 - vpxor %xmm12,%xmm3,%xmm3 - vpxor %xmm13,%xmm0,%xmm0 -.byte 143,232,120,194,201,12 -.byte 143,232,120,194,210,12 -.byte 143,232,120,194,219,12 -.byte 143,232,120,194,192,12 - vpaddd %xmm8,%xmm1,%xmm8 - vpaddd %xmm9,%xmm2,%xmm9 - vpaddd %xmm3,%xmm10,%xmm10 - vpaddd %xmm0,%xmm11,%xmm11 - vpxor %xmm7,%xmm8,%xmm7 - vpxor %xmm4,%xmm9,%xmm4 - vpxor %xmm5,%xmm10,%xmm5 - vpxor %xmm6,%xmm11,%xmm6 -.byte 143,232,120,194,255,8 -.byte 143,232,120,194,228,8 -.byte 143,232,120,194,237,8 -.byte 143,232,120,194,246,8 - vpaddd %xmm7,%xmm14,%xmm14 - vpaddd %xmm4,%xmm15,%xmm15 - vpaddd %xmm5,%xmm12,%xmm12 - vpaddd %xmm6,%xmm13,%xmm13 - vpxor %xmm1,%xmm14,%xmm1 - vpxor %xmm2,%xmm15,%xmm2 - vpxor %xmm12,%xmm3,%xmm3 - vpxor %xmm13,%xmm0,%xmm0 -.byte 143,232,120,194,201,7 -.byte 143,232,120,194,210,7 -.byte 143,232,120,194,219,7 -.byte 143,232,120,194,192,7 - decl %eax - jnz .Loop4xop - - vpaddd 64(%rsp),%xmm8,%xmm8 - vpaddd 80(%rsp),%xmm9,%xmm9 - vpaddd 96(%rsp),%xmm10,%xmm10 - vpaddd 112(%rsp),%xmm11,%xmm11 - - vmovdqa %xmm14,32(%rsp) - vmovdqa %xmm15,48(%rsp) - - vpunpckldq %xmm9,%xmm8,%xmm14 - vpunpckldq %xmm11,%xmm10,%xmm15 - vpunpckhdq %xmm9,%xmm8,%xmm8 - vpunpckhdq %xmm11,%xmm10,%xmm10 - vpunpcklqdq %xmm15,%xmm14,%xmm9 - vpunpckhqdq %xmm15,%xmm14,%xmm14 - vpunpcklqdq %xmm10,%xmm8,%xmm11 - vpunpckhqdq %xmm10,%xmm8,%xmm8 - vpaddd 128-256(%rcx),%xmm0,%xmm0 - vpaddd 144-256(%rcx),%xmm1,%xmm1 - vpaddd 160-256(%rcx),%xmm2,%xmm2 - vpaddd 176-256(%rcx),%xmm3,%xmm3 - - vmovdqa %xmm9,0(%rsp) - vmovdqa %xmm14,16(%rsp) - vmovdqa 32(%rsp),%xmm9 - vmovdqa 48(%rsp),%xmm14 - - vpunpckldq %xmm1,%xmm0,%xmm10 - vpunpckldq %xmm3,%xmm2,%xmm15 - vpunpckhdq %xmm1,%xmm0,%xmm0 - vpunpckhdq %xmm3,%xmm2,%xmm2 - vpunpcklqdq %xmm15,%xmm10,%xmm1 - vpunpckhqdq %xmm15,%xmm10,%xmm10 - vpunpcklqdq %xmm2,%xmm0,%xmm3 - vpunpckhqdq %xmm2,%xmm0,%xmm0 - vpaddd 192-256(%rcx),%xmm12,%xmm12 - vpaddd 208-256(%rcx),%xmm13,%xmm13 - vpaddd 224-256(%rcx),%xmm9,%xmm9 - vpaddd 240-256(%rcx),%xmm14,%xmm14 - - vpunpckldq %xmm13,%xmm12,%xmm2 - vpunpckldq %xmm14,%xmm9,%xmm15 - vpunpckhdq %xmm13,%xmm12,%xmm12 - vpunpckhdq %xmm14,%xmm9,%xmm9 - vpunpcklqdq %xmm15,%xmm2,%xmm13 - vpunpckhqdq %xmm15,%xmm2,%xmm2 - vpunpcklqdq %xmm9,%xmm12,%xmm14 - vpunpckhqdq %xmm9,%xmm12,%xmm12 - vpaddd 256-256(%rcx),%xmm4,%xmm4 - vpaddd 272-256(%rcx),%xmm5,%xmm5 - vpaddd 
288-256(%rcx),%xmm6,%xmm6 - vpaddd 304-256(%rcx),%xmm7,%xmm7 - - vpunpckldq %xmm5,%xmm4,%xmm9 - vpunpckldq %xmm7,%xmm6,%xmm15 - vpunpckhdq %xmm5,%xmm4,%xmm4 - vpunpckhdq %xmm7,%xmm6,%xmm6 - vpunpcklqdq %xmm15,%xmm9,%xmm5 - vpunpckhqdq %xmm15,%xmm9,%xmm9 - vpunpcklqdq %xmm6,%xmm4,%xmm7 - vpunpckhqdq %xmm6,%xmm4,%xmm4 - vmovdqa 0(%rsp),%xmm6 - vmovdqa 16(%rsp),%xmm15 - - cmpq $256,%rdx - jb .Ltail4xop - - vpxor 0(%rsi),%xmm6,%xmm6 - vpxor 16(%rsi),%xmm1,%xmm1 - vpxor 32(%rsi),%xmm13,%xmm13 - vpxor 48(%rsi),%xmm5,%xmm5 - vpxor 64(%rsi),%xmm15,%xmm15 - vpxor 80(%rsi),%xmm10,%xmm10 - vpxor 96(%rsi),%xmm2,%xmm2 - vpxor 112(%rsi),%xmm9,%xmm9 - leaq 128(%rsi),%rsi - vpxor 0(%rsi),%xmm11,%xmm11 - vpxor 16(%rsi),%xmm3,%xmm3 - vpxor 32(%rsi),%xmm14,%xmm14 - vpxor 48(%rsi),%xmm7,%xmm7 - vpxor 64(%rsi),%xmm8,%xmm8 - vpxor 80(%rsi),%xmm0,%xmm0 - vpxor 96(%rsi),%xmm12,%xmm12 - vpxor 112(%rsi),%xmm4,%xmm4 - leaq 128(%rsi),%rsi - - vmovdqu %xmm6,0(%rdi) - vmovdqu %xmm1,16(%rdi) - vmovdqu %xmm13,32(%rdi) - vmovdqu %xmm5,48(%rdi) - vmovdqu %xmm15,64(%rdi) - vmovdqu %xmm10,80(%rdi) - vmovdqu %xmm2,96(%rdi) - vmovdqu %xmm9,112(%rdi) - leaq 128(%rdi),%rdi - vmovdqu %xmm11,0(%rdi) - vmovdqu %xmm3,16(%rdi) - vmovdqu %xmm14,32(%rdi) - vmovdqu %xmm7,48(%rdi) - vmovdqu %xmm8,64(%rdi) - vmovdqu %xmm0,80(%rdi) - vmovdqu %xmm12,96(%rdi) - vmovdqu %xmm4,112(%rdi) - leaq 128(%rdi),%rdi - - subq $256,%rdx - jnz .Loop_outer4xop - - jmp .Ldone4xop - -.align 32 -.Ltail4xop: - cmpq $192,%rdx - jae .L192_or_more4xop - cmpq $128,%rdx - jae .L128_or_more4xop - cmpq $64,%rdx - jae .L64_or_more4xop - - xorq %r10,%r10 - vmovdqa %xmm6,0(%rsp) - vmovdqa %xmm1,16(%rsp) - vmovdqa %xmm13,32(%rsp) - vmovdqa %xmm5,48(%rsp) - jmp .Loop_tail4xop - -.align 32 -.L64_or_more4xop: - vpxor 0(%rsi),%xmm6,%xmm6 - vpxor 16(%rsi),%xmm1,%xmm1 - vpxor 32(%rsi),%xmm13,%xmm13 - vpxor 48(%rsi),%xmm5,%xmm5 - vmovdqu %xmm6,0(%rdi) - vmovdqu %xmm1,16(%rdi) - vmovdqu %xmm13,32(%rdi) - vmovdqu %xmm5,48(%rdi) - je .Ldone4xop - - leaq 64(%rsi),%rsi - vmovdqa %xmm15,0(%rsp) - xorq %r10,%r10 - vmovdqa %xmm10,16(%rsp) - leaq 64(%rdi),%rdi - vmovdqa %xmm2,32(%rsp) - subq $64,%rdx - vmovdqa %xmm9,48(%rsp) - jmp .Loop_tail4xop - -.align 32 -.L128_or_more4xop: - vpxor 0(%rsi),%xmm6,%xmm6 - vpxor 16(%rsi),%xmm1,%xmm1 - vpxor 32(%rsi),%xmm13,%xmm13 - vpxor 48(%rsi),%xmm5,%xmm5 - vpxor 64(%rsi),%xmm15,%xmm15 - vpxor 80(%rsi),%xmm10,%xmm10 - vpxor 96(%rsi),%xmm2,%xmm2 - vpxor 112(%rsi),%xmm9,%xmm9 - - vmovdqu %xmm6,0(%rdi) - vmovdqu %xmm1,16(%rdi) - vmovdqu %xmm13,32(%rdi) - vmovdqu %xmm5,48(%rdi) - vmovdqu %xmm15,64(%rdi) - vmovdqu %xmm10,80(%rdi) - vmovdqu %xmm2,96(%rdi) - vmovdqu %xmm9,112(%rdi) - je .Ldone4xop - - leaq 128(%rsi),%rsi - vmovdqa %xmm11,0(%rsp) - xorq %r10,%r10 - vmovdqa %xmm3,16(%rsp) - leaq 128(%rdi),%rdi - vmovdqa %xmm14,32(%rsp) - subq $128,%rdx - vmovdqa %xmm7,48(%rsp) - jmp .Loop_tail4xop - -.align 32 -.L192_or_more4xop: - vpxor 0(%rsi),%xmm6,%xmm6 - vpxor 16(%rsi),%xmm1,%xmm1 - vpxor 32(%rsi),%xmm13,%xmm13 - vpxor 48(%rsi),%xmm5,%xmm5 - vpxor 64(%rsi),%xmm15,%xmm15 - vpxor 80(%rsi),%xmm10,%xmm10 - vpxor 96(%rsi),%xmm2,%xmm2 - vpxor 112(%rsi),%xmm9,%xmm9 - leaq 128(%rsi),%rsi - vpxor 0(%rsi),%xmm11,%xmm11 - vpxor 16(%rsi),%xmm3,%xmm3 - vpxor 32(%rsi),%xmm14,%xmm14 - vpxor 48(%rsi),%xmm7,%xmm7 - - vmovdqu %xmm6,0(%rdi) - vmovdqu %xmm1,16(%rdi) - vmovdqu %xmm13,32(%rdi) - vmovdqu %xmm5,48(%rdi) - vmovdqu %xmm15,64(%rdi) - vmovdqu %xmm10,80(%rdi) - vmovdqu %xmm2,96(%rdi) - vmovdqu %xmm9,112(%rdi) - leaq 128(%rdi),%rdi - vmovdqu %xmm11,0(%rdi) - 
vmovdqu %xmm3,16(%rdi) - vmovdqu %xmm14,32(%rdi) - vmovdqu %xmm7,48(%rdi) - je .Ldone4xop - - leaq 64(%rsi),%rsi - vmovdqa %xmm8,0(%rsp) - xorq %r10,%r10 - vmovdqa %xmm0,16(%rsp) - leaq 64(%rdi),%rdi - vmovdqa %xmm12,32(%rsp) - subq $192,%rdx - vmovdqa %xmm4,48(%rsp) - -.Loop_tail4xop: - movzbl (%rsi,%r10,1),%eax - movzbl (%rsp,%r10,1),%ecx - leaq 1(%r10),%r10 - xorl %ecx,%eax - movb %al,-1(%rdi,%r10,1) - decq %rdx - jnz .Loop_tail4xop - -.Ldone4xop: - vzeroupper - leaq (%r9),%rsp -.cfi_def_cfa_register %rsp -.L4xop_epilogue: - .byte 0xf3,0xc3 -.cfi_endproc -.size ChaCha20_4xop,.-ChaCha20_4xop -.type ChaCha20_8x,@function -.align 32 -ChaCha20_8x: -.cfi_startproc -.LChaCha20_8x: - movq %rsp,%r9 -.cfi_def_cfa_register %r9 - subq $0x280+8,%rsp - andq $-32,%rsp - vzeroupper - - - - - - - - - - - vbroadcasti128 .Lsigma(%rip),%ymm11 - vbroadcasti128 (%rcx),%ymm3 - vbroadcasti128 16(%rcx),%ymm15 - vbroadcasti128 (%r8),%ymm7 - leaq 256(%rsp),%rcx - leaq 512(%rsp),%rax - leaq .Lrot16(%rip),%r10 - leaq .Lrot24(%rip),%r11 - - vpshufd $0x00,%ymm11,%ymm8 - vpshufd $0x55,%ymm11,%ymm9 - vmovdqa %ymm8,128-256(%rcx) - vpshufd $0xaa,%ymm11,%ymm10 - vmovdqa %ymm9,160-256(%rcx) - vpshufd $0xff,%ymm11,%ymm11 - vmovdqa %ymm10,192-256(%rcx) - vmovdqa %ymm11,224-256(%rcx) - - vpshufd $0x00,%ymm3,%ymm0 - vpshufd $0x55,%ymm3,%ymm1 - vmovdqa %ymm0,256-256(%rcx) - vpshufd $0xaa,%ymm3,%ymm2 - vmovdqa %ymm1,288-256(%rcx) - vpshufd $0xff,%ymm3,%ymm3 - vmovdqa %ymm2,320-256(%rcx) - vmovdqa %ymm3,352-256(%rcx) - - vpshufd $0x00,%ymm15,%ymm12 - vpshufd $0x55,%ymm15,%ymm13 - vmovdqa %ymm12,384-512(%rax) - vpshufd $0xaa,%ymm15,%ymm14 - vmovdqa %ymm13,416-512(%rax) - vpshufd $0xff,%ymm15,%ymm15 - vmovdqa %ymm14,448-512(%rax) - vmovdqa %ymm15,480-512(%rax) - - vpshufd $0x00,%ymm7,%ymm4 - vpshufd $0x55,%ymm7,%ymm5 - vpaddd .Lincy(%rip),%ymm4,%ymm4 - vpshufd $0xaa,%ymm7,%ymm6 - vmovdqa %ymm5,544-512(%rax) - vpshufd $0xff,%ymm7,%ymm7 - vmovdqa %ymm6,576-512(%rax) - vmovdqa %ymm7,608-512(%rax) - - jmp .Loop_enter8x - -.align 32 -.Loop_outer8x: - vmovdqa 128-256(%rcx),%ymm8 - vmovdqa 160-256(%rcx),%ymm9 - vmovdqa 192-256(%rcx),%ymm10 - vmovdqa 224-256(%rcx),%ymm11 - vmovdqa 256-256(%rcx),%ymm0 - vmovdqa 288-256(%rcx),%ymm1 - vmovdqa 320-256(%rcx),%ymm2 - vmovdqa 352-256(%rcx),%ymm3 - vmovdqa 384-512(%rax),%ymm12 - vmovdqa 416-512(%rax),%ymm13 - vmovdqa 448-512(%rax),%ymm14 - vmovdqa 480-512(%rax),%ymm15 - vmovdqa 512-512(%rax),%ymm4 - vmovdqa 544-512(%rax),%ymm5 - vmovdqa 576-512(%rax),%ymm6 - vmovdqa 608-512(%rax),%ymm7 - vpaddd .Leight(%rip),%ymm4,%ymm4 - -.Loop_enter8x: - vmovdqa %ymm14,64(%rsp) - vmovdqa %ymm15,96(%rsp) - vbroadcasti128 (%r10),%ymm15 - vmovdqa %ymm4,512-512(%rax) - movl $10,%eax - jmp .Loop8x - -.align 32 -.Loop8x: - vpaddd %ymm0,%ymm8,%ymm8 - vpxor %ymm4,%ymm8,%ymm4 - vpshufb %ymm15,%ymm4,%ymm4 - vpaddd %ymm1,%ymm9,%ymm9 - vpxor %ymm5,%ymm9,%ymm5 - vpshufb %ymm15,%ymm5,%ymm5 - vpaddd %ymm4,%ymm12,%ymm12 - vpxor %ymm0,%ymm12,%ymm0 - vpslld $12,%ymm0,%ymm14 - vpsrld $20,%ymm0,%ymm0 - vpor %ymm0,%ymm14,%ymm0 - vbroadcasti128 (%r11),%ymm14 - vpaddd %ymm5,%ymm13,%ymm13 - vpxor %ymm1,%ymm13,%ymm1 - vpslld $12,%ymm1,%ymm15 - vpsrld $20,%ymm1,%ymm1 - vpor %ymm1,%ymm15,%ymm1 - vpaddd %ymm0,%ymm8,%ymm8 - vpxor %ymm4,%ymm8,%ymm4 - vpshufb %ymm14,%ymm4,%ymm4 - vpaddd %ymm1,%ymm9,%ymm9 - vpxor %ymm5,%ymm9,%ymm5 - vpshufb %ymm14,%ymm5,%ymm5 - vpaddd %ymm4,%ymm12,%ymm12 - vpxor %ymm0,%ymm12,%ymm0 - vpslld $7,%ymm0,%ymm15 - vpsrld $25,%ymm0,%ymm0 - vpor %ymm0,%ymm15,%ymm0 - vbroadcasti128 (%r10),%ymm15 - vpaddd 
%ymm5,%ymm13,%ymm13 - vpxor %ymm1,%ymm13,%ymm1 - vpslld $7,%ymm1,%ymm14 - vpsrld $25,%ymm1,%ymm1 - vpor %ymm1,%ymm14,%ymm1 - vmovdqa %ymm12,0(%rsp) - vmovdqa %ymm13,32(%rsp) - vmovdqa 64(%rsp),%ymm12 - vmovdqa 96(%rsp),%ymm13 - vpaddd %ymm2,%ymm10,%ymm10 - vpxor %ymm6,%ymm10,%ymm6 - vpshufb %ymm15,%ymm6,%ymm6 - vpaddd %ymm3,%ymm11,%ymm11 - vpxor %ymm7,%ymm11,%ymm7 - vpshufb %ymm15,%ymm7,%ymm7 - vpaddd %ymm6,%ymm12,%ymm12 - vpxor %ymm2,%ymm12,%ymm2 - vpslld $12,%ymm2,%ymm14 - vpsrld $20,%ymm2,%ymm2 - vpor %ymm2,%ymm14,%ymm2 - vbroadcasti128 (%r11),%ymm14 - vpaddd %ymm7,%ymm13,%ymm13 - vpxor %ymm3,%ymm13,%ymm3 - vpslld $12,%ymm3,%ymm15 - vpsrld $20,%ymm3,%ymm3 - vpor %ymm3,%ymm15,%ymm3 - vpaddd %ymm2,%ymm10,%ymm10 - vpxor %ymm6,%ymm10,%ymm6 - vpshufb %ymm14,%ymm6,%ymm6 - vpaddd %ymm3,%ymm11,%ymm11 - vpxor %ymm7,%ymm11,%ymm7 - vpshufb %ymm14,%ymm7,%ymm7 - vpaddd %ymm6,%ymm12,%ymm12 - vpxor %ymm2,%ymm12,%ymm2 - vpslld $7,%ymm2,%ymm15 - vpsrld $25,%ymm2,%ymm2 - vpor %ymm2,%ymm15,%ymm2 - vbroadcasti128 (%r10),%ymm15 - vpaddd %ymm7,%ymm13,%ymm13 - vpxor %ymm3,%ymm13,%ymm3 - vpslld $7,%ymm3,%ymm14 - vpsrld $25,%ymm3,%ymm3 - vpor %ymm3,%ymm14,%ymm3 - vpaddd %ymm1,%ymm8,%ymm8 - vpxor %ymm7,%ymm8,%ymm7 - vpshufb %ymm15,%ymm7,%ymm7 - vpaddd %ymm2,%ymm9,%ymm9 - vpxor %ymm4,%ymm9,%ymm4 - vpshufb %ymm15,%ymm4,%ymm4 - vpaddd %ymm7,%ymm12,%ymm12 - vpxor %ymm1,%ymm12,%ymm1 - vpslld $12,%ymm1,%ymm14 - vpsrld $20,%ymm1,%ymm1 - vpor %ymm1,%ymm14,%ymm1 - vbroadcasti128 (%r11),%ymm14 - vpaddd %ymm4,%ymm13,%ymm13 - vpxor %ymm2,%ymm13,%ymm2 - vpslld $12,%ymm2,%ymm15 - vpsrld $20,%ymm2,%ymm2 - vpor %ymm2,%ymm15,%ymm2 - vpaddd %ymm1,%ymm8,%ymm8 - vpxor %ymm7,%ymm8,%ymm7 - vpshufb %ymm14,%ymm7,%ymm7 - vpaddd %ymm2,%ymm9,%ymm9 - vpxor %ymm4,%ymm9,%ymm4 - vpshufb %ymm14,%ymm4,%ymm4 - vpaddd %ymm7,%ymm12,%ymm12 - vpxor %ymm1,%ymm12,%ymm1 - vpslld $7,%ymm1,%ymm15 - vpsrld $25,%ymm1,%ymm1 - vpor %ymm1,%ymm15,%ymm1 - vbroadcasti128 (%r10),%ymm15 - vpaddd %ymm4,%ymm13,%ymm13 - vpxor %ymm2,%ymm13,%ymm2 - vpslld $7,%ymm2,%ymm14 - vpsrld $25,%ymm2,%ymm2 - vpor %ymm2,%ymm14,%ymm2 - vmovdqa %ymm12,64(%rsp) - vmovdqa %ymm13,96(%rsp) - vmovdqa 0(%rsp),%ymm12 - vmovdqa 32(%rsp),%ymm13 - vpaddd %ymm3,%ymm10,%ymm10 - vpxor %ymm5,%ymm10,%ymm5 - vpshufb %ymm15,%ymm5,%ymm5 - vpaddd %ymm0,%ymm11,%ymm11 - vpxor %ymm6,%ymm11,%ymm6 - vpshufb %ymm15,%ymm6,%ymm6 - vpaddd %ymm5,%ymm12,%ymm12 - vpxor %ymm3,%ymm12,%ymm3 - vpslld $12,%ymm3,%ymm14 - vpsrld $20,%ymm3,%ymm3 - vpor %ymm3,%ymm14,%ymm3 - vbroadcasti128 (%r11),%ymm14 - vpaddd %ymm6,%ymm13,%ymm13 - vpxor %ymm0,%ymm13,%ymm0 - vpslld $12,%ymm0,%ymm15 - vpsrld $20,%ymm0,%ymm0 - vpor %ymm0,%ymm15,%ymm0 - vpaddd %ymm3,%ymm10,%ymm10 - vpxor %ymm5,%ymm10,%ymm5 - vpshufb %ymm14,%ymm5,%ymm5 - vpaddd %ymm0,%ymm11,%ymm11 - vpxor %ymm6,%ymm11,%ymm6 - vpshufb %ymm14,%ymm6,%ymm6 - vpaddd %ymm5,%ymm12,%ymm12 - vpxor %ymm3,%ymm12,%ymm3 - vpslld $7,%ymm3,%ymm15 - vpsrld $25,%ymm3,%ymm3 - vpor %ymm3,%ymm15,%ymm3 - vbroadcasti128 (%r10),%ymm15 - vpaddd %ymm6,%ymm13,%ymm13 - vpxor %ymm0,%ymm13,%ymm0 - vpslld $7,%ymm0,%ymm14 - vpsrld $25,%ymm0,%ymm0 - vpor %ymm0,%ymm14,%ymm0 - decl %eax - jnz .Loop8x - - leaq 512(%rsp),%rax - vpaddd 128-256(%rcx),%ymm8,%ymm8 - vpaddd 160-256(%rcx),%ymm9,%ymm9 - vpaddd 192-256(%rcx),%ymm10,%ymm10 - vpaddd 224-256(%rcx),%ymm11,%ymm11 - - vpunpckldq %ymm9,%ymm8,%ymm14 - vpunpckldq %ymm11,%ymm10,%ymm15 - vpunpckhdq %ymm9,%ymm8,%ymm8 - vpunpckhdq %ymm11,%ymm10,%ymm10 - vpunpcklqdq %ymm15,%ymm14,%ymm9 - vpunpckhqdq %ymm15,%ymm14,%ymm14 - vpunpcklqdq %ymm10,%ymm8,%ymm11 - 
vpunpckhqdq %ymm10,%ymm8,%ymm8 - vpaddd 256-256(%rcx),%ymm0,%ymm0 - vpaddd 288-256(%rcx),%ymm1,%ymm1 - vpaddd 320-256(%rcx),%ymm2,%ymm2 - vpaddd 352-256(%rcx),%ymm3,%ymm3 - - vpunpckldq %ymm1,%ymm0,%ymm10 - vpunpckldq %ymm3,%ymm2,%ymm15 - vpunpckhdq %ymm1,%ymm0,%ymm0 - vpunpckhdq %ymm3,%ymm2,%ymm2 - vpunpcklqdq %ymm15,%ymm10,%ymm1 - vpunpckhqdq %ymm15,%ymm10,%ymm10 - vpunpcklqdq %ymm2,%ymm0,%ymm3 - vpunpckhqdq %ymm2,%ymm0,%ymm0 - vperm2i128 $0x20,%ymm1,%ymm9,%ymm15 - vperm2i128 $0x31,%ymm1,%ymm9,%ymm1 - vperm2i128 $0x20,%ymm10,%ymm14,%ymm9 - vperm2i128 $0x31,%ymm10,%ymm14,%ymm10 - vperm2i128 $0x20,%ymm3,%ymm11,%ymm14 - vperm2i128 $0x31,%ymm3,%ymm11,%ymm3 - vperm2i128 $0x20,%ymm0,%ymm8,%ymm11 - vperm2i128 $0x31,%ymm0,%ymm8,%ymm0 - vmovdqa %ymm15,0(%rsp) - vmovdqa %ymm9,32(%rsp) - vmovdqa 64(%rsp),%ymm15 - vmovdqa 96(%rsp),%ymm9 - - vpaddd 384-512(%rax),%ymm12,%ymm12 - vpaddd 416-512(%rax),%ymm13,%ymm13 - vpaddd 448-512(%rax),%ymm15,%ymm15 - vpaddd 480-512(%rax),%ymm9,%ymm9 - - vpunpckldq %ymm13,%ymm12,%ymm2 - vpunpckldq %ymm9,%ymm15,%ymm8 - vpunpckhdq %ymm13,%ymm12,%ymm12 - vpunpckhdq %ymm9,%ymm15,%ymm15 - vpunpcklqdq %ymm8,%ymm2,%ymm13 - vpunpckhqdq %ymm8,%ymm2,%ymm2 - vpunpcklqdq %ymm15,%ymm12,%ymm9 - vpunpckhqdq %ymm15,%ymm12,%ymm12 - vpaddd 512-512(%rax),%ymm4,%ymm4 - vpaddd 544-512(%rax),%ymm5,%ymm5 - vpaddd 576-512(%rax),%ymm6,%ymm6 - vpaddd 608-512(%rax),%ymm7,%ymm7 - - vpunpckldq %ymm5,%ymm4,%ymm15 - vpunpckldq %ymm7,%ymm6,%ymm8 - vpunpckhdq %ymm5,%ymm4,%ymm4 - vpunpckhdq %ymm7,%ymm6,%ymm6 - vpunpcklqdq %ymm8,%ymm15,%ymm5 - vpunpckhqdq %ymm8,%ymm15,%ymm15 - vpunpcklqdq %ymm6,%ymm4,%ymm7 - vpunpckhqdq %ymm6,%ymm4,%ymm4 - vperm2i128 $0x20,%ymm5,%ymm13,%ymm8 - vperm2i128 $0x31,%ymm5,%ymm13,%ymm5 - vperm2i128 $0x20,%ymm15,%ymm2,%ymm13 - vperm2i128 $0x31,%ymm15,%ymm2,%ymm15 - vperm2i128 $0x20,%ymm7,%ymm9,%ymm2 - vperm2i128 $0x31,%ymm7,%ymm9,%ymm7 - vperm2i128 $0x20,%ymm4,%ymm12,%ymm9 - vperm2i128 $0x31,%ymm4,%ymm12,%ymm4 - vmovdqa 0(%rsp),%ymm6 - vmovdqa 32(%rsp),%ymm12 - - cmpq $512,%rdx - jb .Ltail8x - - vpxor 0(%rsi),%ymm6,%ymm6 - vpxor 32(%rsi),%ymm8,%ymm8 - vpxor 64(%rsi),%ymm1,%ymm1 - vpxor 96(%rsi),%ymm5,%ymm5 - leaq 128(%rsi),%rsi - vmovdqu %ymm6,0(%rdi) - vmovdqu %ymm8,32(%rdi) - vmovdqu %ymm1,64(%rdi) - vmovdqu %ymm5,96(%rdi) - leaq 128(%rdi),%rdi - - vpxor 0(%rsi),%ymm12,%ymm12 - vpxor 32(%rsi),%ymm13,%ymm13 - vpxor 64(%rsi),%ymm10,%ymm10 - vpxor 96(%rsi),%ymm15,%ymm15 - leaq 128(%rsi),%rsi - vmovdqu %ymm12,0(%rdi) - vmovdqu %ymm13,32(%rdi) - vmovdqu %ymm10,64(%rdi) - vmovdqu %ymm15,96(%rdi) - leaq 128(%rdi),%rdi - - vpxor 0(%rsi),%ymm14,%ymm14 - vpxor 32(%rsi),%ymm2,%ymm2 - vpxor 64(%rsi),%ymm3,%ymm3 - vpxor 96(%rsi),%ymm7,%ymm7 - leaq 128(%rsi),%rsi - vmovdqu %ymm14,0(%rdi) - vmovdqu %ymm2,32(%rdi) - vmovdqu %ymm3,64(%rdi) - vmovdqu %ymm7,96(%rdi) - leaq 128(%rdi),%rdi - - vpxor 0(%rsi),%ymm11,%ymm11 - vpxor 32(%rsi),%ymm9,%ymm9 - vpxor 64(%rsi),%ymm0,%ymm0 - vpxor 96(%rsi),%ymm4,%ymm4 - leaq 128(%rsi),%rsi - vmovdqu %ymm11,0(%rdi) - vmovdqu %ymm9,32(%rdi) - vmovdqu %ymm0,64(%rdi) - vmovdqu %ymm4,96(%rdi) - leaq 128(%rdi),%rdi - - subq $512,%rdx - jnz .Loop_outer8x - - jmp .Ldone8x - -.Ltail8x: - cmpq $448,%rdx - jae .L448_or_more8x - cmpq $384,%rdx - jae .L384_or_more8x - cmpq $320,%rdx - jae .L320_or_more8x - cmpq $256,%rdx - jae .L256_or_more8x - cmpq $192,%rdx - jae .L192_or_more8x - cmpq $128,%rdx - jae .L128_or_more8x - cmpq $64,%rdx - jae .L64_or_more8x - - xorq %r10,%r10 - vmovdqa %ymm6,0(%rsp) - vmovdqa %ymm8,32(%rsp) - jmp .Loop_tail8x - -.align 32 -.L64_or_more8x: 
- vpxor 0(%rsi),%ymm6,%ymm6 - vpxor 32(%rsi),%ymm8,%ymm8 - vmovdqu %ymm6,0(%rdi) - vmovdqu %ymm8,32(%rdi) - je .Ldone8x - - leaq 64(%rsi),%rsi - xorq %r10,%r10 - vmovdqa %ymm1,0(%rsp) - leaq 64(%rdi),%rdi - subq $64,%rdx - vmovdqa %ymm5,32(%rsp) - jmp .Loop_tail8x - -.align 32 -.L128_or_more8x: - vpxor 0(%rsi),%ymm6,%ymm6 - vpxor 32(%rsi),%ymm8,%ymm8 - vpxor 64(%rsi),%ymm1,%ymm1 - vpxor 96(%rsi),%ymm5,%ymm5 - vmovdqu %ymm6,0(%rdi) - vmovdqu %ymm8,32(%rdi) - vmovdqu %ymm1,64(%rdi) - vmovdqu %ymm5,96(%rdi) - je .Ldone8x - - leaq 128(%rsi),%rsi - xorq %r10,%r10 - vmovdqa %ymm12,0(%rsp) - leaq 128(%rdi),%rdi - subq $128,%rdx - vmovdqa %ymm13,32(%rsp) - jmp .Loop_tail8x - -.align 32 -.L192_or_more8x: - vpxor 0(%rsi),%ymm6,%ymm6 - vpxor 32(%rsi),%ymm8,%ymm8 - vpxor 64(%rsi),%ymm1,%ymm1 - vpxor 96(%rsi),%ymm5,%ymm5 - vpxor 128(%rsi),%ymm12,%ymm12 - vpxor 160(%rsi),%ymm13,%ymm13 - vmovdqu %ymm6,0(%rdi) - vmovdqu %ymm8,32(%rdi) - vmovdqu %ymm1,64(%rdi) - vmovdqu %ymm5,96(%rdi) - vmovdqu %ymm12,128(%rdi) - vmovdqu %ymm13,160(%rdi) - je .Ldone8x - - leaq 192(%rsi),%rsi - xorq %r10,%r10 - vmovdqa %ymm10,0(%rsp) - leaq 192(%rdi),%rdi - subq $192,%rdx - vmovdqa %ymm15,32(%rsp) - jmp .Loop_tail8x - -.align 32 -.L256_or_more8x: - vpxor 0(%rsi),%ymm6,%ymm6 - vpxor 32(%rsi),%ymm8,%ymm8 - vpxor 64(%rsi),%ymm1,%ymm1 - vpxor 96(%rsi),%ymm5,%ymm5 - vpxor 128(%rsi),%ymm12,%ymm12 - vpxor 160(%rsi),%ymm13,%ymm13 - vpxor 192(%rsi),%ymm10,%ymm10 - vpxor 224(%rsi),%ymm15,%ymm15 - vmovdqu %ymm6,0(%rdi) - vmovdqu %ymm8,32(%rdi) - vmovdqu %ymm1,64(%rdi) - vmovdqu %ymm5,96(%rdi) - vmovdqu %ymm12,128(%rdi) - vmovdqu %ymm13,160(%rdi) - vmovdqu %ymm10,192(%rdi) - vmovdqu %ymm15,224(%rdi) - je .Ldone8x - - leaq 256(%rsi),%rsi - xorq %r10,%r10 - vmovdqa %ymm14,0(%rsp) - leaq 256(%rdi),%rdi - subq $256,%rdx - vmovdqa %ymm2,32(%rsp) - jmp .Loop_tail8x - -.align 32 -.L320_or_more8x: - vpxor 0(%rsi),%ymm6,%ymm6 - vpxor 32(%rsi),%ymm8,%ymm8 - vpxor 64(%rsi),%ymm1,%ymm1 - vpxor 96(%rsi),%ymm5,%ymm5 - vpxor 128(%rsi),%ymm12,%ymm12 - vpxor 160(%rsi),%ymm13,%ymm13 - vpxor 192(%rsi),%ymm10,%ymm10 - vpxor 224(%rsi),%ymm15,%ymm15 - vpxor 256(%rsi),%ymm14,%ymm14 - vpxor 288(%rsi),%ymm2,%ymm2 - vmovdqu %ymm6,0(%rdi) - vmovdqu %ymm8,32(%rdi) - vmovdqu %ymm1,64(%rdi) - vmovdqu %ymm5,96(%rdi) - vmovdqu %ymm12,128(%rdi) - vmovdqu %ymm13,160(%rdi) - vmovdqu %ymm10,192(%rdi) - vmovdqu %ymm15,224(%rdi) - vmovdqu %ymm14,256(%rdi) - vmovdqu %ymm2,288(%rdi) - je .Ldone8x - - leaq 320(%rsi),%rsi - xorq %r10,%r10 - vmovdqa %ymm3,0(%rsp) - leaq 320(%rdi),%rdi - subq $320,%rdx - vmovdqa %ymm7,32(%rsp) - jmp .Loop_tail8x - -.align 32 -.L384_or_more8x: - vpxor 0(%rsi),%ymm6,%ymm6 - vpxor 32(%rsi),%ymm8,%ymm8 - vpxor 64(%rsi),%ymm1,%ymm1 - vpxor 96(%rsi),%ymm5,%ymm5 - vpxor 128(%rsi),%ymm12,%ymm12 - vpxor 160(%rsi),%ymm13,%ymm13 - vpxor 192(%rsi),%ymm10,%ymm10 - vpxor 224(%rsi),%ymm15,%ymm15 - vpxor 256(%rsi),%ymm14,%ymm14 - vpxor 288(%rsi),%ymm2,%ymm2 - vpxor 320(%rsi),%ymm3,%ymm3 - vpxor 352(%rsi),%ymm7,%ymm7 - vmovdqu %ymm6,0(%rdi) - vmovdqu %ymm8,32(%rdi) - vmovdqu %ymm1,64(%rdi) - vmovdqu %ymm5,96(%rdi) - vmovdqu %ymm12,128(%rdi) - vmovdqu %ymm13,160(%rdi) - vmovdqu %ymm10,192(%rdi) - vmovdqu %ymm15,224(%rdi) - vmovdqu %ymm14,256(%rdi) - vmovdqu %ymm2,288(%rdi) - vmovdqu %ymm3,320(%rdi) - vmovdqu %ymm7,352(%rdi) - je .Ldone8x - - leaq 384(%rsi),%rsi - xorq %r10,%r10 - vmovdqa %ymm11,0(%rsp) - leaq 384(%rdi),%rdi - subq $384,%rdx - vmovdqa %ymm9,32(%rsp) - jmp .Loop_tail8x - -.align 32 -.L448_or_more8x: - vpxor 0(%rsi),%ymm6,%ymm6 - vpxor 
32(%rsi),%ymm8,%ymm8 - vpxor 64(%rsi),%ymm1,%ymm1 - vpxor 96(%rsi),%ymm5,%ymm5 - vpxor 128(%rsi),%ymm12,%ymm12 - vpxor 160(%rsi),%ymm13,%ymm13 - vpxor 192(%rsi),%ymm10,%ymm10 - vpxor 224(%rsi),%ymm15,%ymm15 - vpxor 256(%rsi),%ymm14,%ymm14 - vpxor 288(%rsi),%ymm2,%ymm2 - vpxor 320(%rsi),%ymm3,%ymm3 - vpxor 352(%rsi),%ymm7,%ymm7 - vpxor 384(%rsi),%ymm11,%ymm11 - vpxor 416(%rsi),%ymm9,%ymm9 - vmovdqu %ymm6,0(%rdi) - vmovdqu %ymm8,32(%rdi) - vmovdqu %ymm1,64(%rdi) - vmovdqu %ymm5,96(%rdi) - vmovdqu %ymm12,128(%rdi) - vmovdqu %ymm13,160(%rdi) - vmovdqu %ymm10,192(%rdi) - vmovdqu %ymm15,224(%rdi) - vmovdqu %ymm14,256(%rdi) - vmovdqu %ymm2,288(%rdi) - vmovdqu %ymm3,320(%rdi) - vmovdqu %ymm7,352(%rdi) - vmovdqu %ymm11,384(%rdi) - vmovdqu %ymm9,416(%rdi) - je .Ldone8x - - leaq 448(%rsi),%rsi - xorq %r10,%r10 - vmovdqa %ymm0,0(%rsp) - leaq 448(%rdi),%rdi - subq $448,%rdx - vmovdqa %ymm4,32(%rsp) - -.Loop_tail8x: - movzbl (%rsi,%r10,1),%eax - movzbl (%rsp,%r10,1),%ecx - leaq 1(%r10),%r10 - xorl %ecx,%eax - movb %al,-1(%rdi,%r10,1) - decq %rdx - jnz .Loop_tail8x - -.Ldone8x: - vzeroall - leaq (%r9),%rsp -.cfi_def_cfa_register %rsp -.L8x_epilogue: - .byte 0xf3,0xc3 -.cfi_endproc -.size ChaCha20_8x,.-ChaCha20_8x diff --git a/secure/lib/libcrypto/amd64/cmll-x86_64.S b/secure/lib/libcrypto/amd64/cmll-x86_64.S index 7feb198a7c1c..d1d284b5a32b 100644 --- a/secure/lib/libcrypto/amd64/cmll-x86_64.S +++ b/secure/lib/libcrypto/amd64/cmll-x86_64.S @@ -7,11 +7,13 @@ .type Camellia_EncryptBlock,@function .align 16 Camellia_EncryptBlock: +.cfi_startproc movl $128,%eax subl %edi,%eax movl $3,%edi adcl $0,%edi jmp .Lenc_rounds +.cfi_endproc .size Camellia_EncryptBlock,.-Camellia_EncryptBlock .globl Camellia_EncryptBlock_Rounds @@ -85,6 +87,7 @@ Camellia_EncryptBlock_Rounds: .type _x86_64_Camellia_encrypt,@function .align 16 _x86_64_Camellia_encrypt: +.cfi_startproc xorl 0(%r14),%r9d xorl 4(%r14),%r8d xorl 8(%r14),%r11d @@ -287,6 +290,7 @@ _x86_64_Camellia_encrypt: movl %edx,%r11d .byte 0xf3,0xc3 +.cfi_endproc .size _x86_64_Camellia_encrypt,.-_x86_64_Camellia_encrypt @@ -294,11 +298,13 @@ _x86_64_Camellia_encrypt: .type Camellia_DecryptBlock,@function .align 16 Camellia_DecryptBlock: +.cfi_startproc movl $128,%eax subl %edi,%eax movl $3,%edi adcl $0,%edi jmp .Ldec_rounds +.cfi_endproc .size Camellia_DecryptBlock,.-Camellia_DecryptBlock .globl Camellia_DecryptBlock_Rounds @@ -372,6 +378,7 @@ Camellia_DecryptBlock_Rounds: .type _x86_64_Camellia_decrypt,@function .align 16 _x86_64_Camellia_decrypt: +.cfi_startproc xorl 0(%r14),%r9d xorl 4(%r14),%r8d xorl 8(%r14),%r11d @@ -575,6 +582,7 @@ _x86_64_Camellia_decrypt: movl %ebx,%r11d .byte 0xf3,0xc3 +.cfi_endproc .size _x86_64_Camellia_decrypt,.-_x86_64_Camellia_decrypt .globl Camellia_Ekeygen .type Camellia_Ekeygen,@function diff --git a/secure/lib/libcrypto/amd64/ecp_nistz256-x86_64.S b/secure/lib/libcrypto/amd64/ecp_nistz256-x86_64.S index 1176feea40c2..c69b4d978f39 100644 --- a/secure/lib/libcrypto/amd64/ecp_nistz256-x86_64.S +++ b/secure/lib/libcrypto/amd64/ecp_nistz256-x86_64.S @@ -2790,10 +2790,6 @@ ecp_nistz256_neg: .align 32 ecp_nistz256_ord_mul_mont: .cfi_startproc - movl $0x80100,%ecx - andl OPENSSL_ia32cap_P+8(%rip),%ecx - cmpl $0x80100,%ecx - je .Lecp_nistz256_ord_mul_montx pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 @@ -3122,10 +3118,6 @@ ecp_nistz256_ord_mul_mont: .align 32 ecp_nistz256_ord_sqr_mont: .cfi_startproc - movl $0x80100,%ecx - andl OPENSSL_ia32cap_P+8(%rip),%ecx - cmpl $0x80100,%ecx - je .Lecp_nistz256_ord_sqr_montx pushq %rbp 
.cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 @@ -3413,462 +3405,6 @@ ecp_nistz256_ord_sqr_mont: .cfi_endproc .size ecp_nistz256_ord_sqr_mont,.-ecp_nistz256_ord_sqr_mont -.type ecp_nistz256_ord_mul_montx,@function -.align 32 -ecp_nistz256_ord_mul_montx: -.cfi_startproc -.Lecp_nistz256_ord_mul_montx: - pushq %rbp -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbp,-16 - pushq %rbx -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbx,-24 - pushq %r12 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r12,-32 - pushq %r13 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r13,-40 - pushq %r14 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r14,-48 - pushq %r15 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r15,-56 -.Lord_mulx_body: - - movq %rdx,%rbx - movq 0(%rdx),%rdx - movq 0(%rsi),%r9 - movq 8(%rsi),%r10 - movq 16(%rsi),%r11 - movq 24(%rsi),%r12 - leaq -128(%rsi),%rsi - leaq .Lord-128(%rip),%r14 - movq .LordK(%rip),%r15 - - - mulxq %r9,%r8,%r9 - mulxq %r10,%rcx,%r10 - mulxq %r11,%rbp,%r11 - addq %rcx,%r9 - mulxq %r12,%rcx,%r12 - movq %r8,%rdx - mulxq %r15,%rdx,%rax - adcq %rbp,%r10 - adcq %rcx,%r11 - adcq $0,%r12 - - - xorq %r13,%r13 - mulxq 0+128(%r14),%rcx,%rbp - adcxq %rcx,%r8 - adoxq %rbp,%r9 - - mulxq 8+128(%r14),%rcx,%rbp - adcxq %rcx,%r9 - adoxq %rbp,%r10 - - mulxq 16+128(%r14),%rcx,%rbp - adcxq %rcx,%r10 - adoxq %rbp,%r11 - - mulxq 24+128(%r14),%rcx,%rbp - movq 8(%rbx),%rdx - adcxq %rcx,%r11 - adoxq %rbp,%r12 - adcxq %r8,%r12 - adoxq %r8,%r13 - adcq $0,%r13 - - - mulxq 0+128(%rsi),%rcx,%rbp - adcxq %rcx,%r9 - adoxq %rbp,%r10 - - mulxq 8+128(%rsi),%rcx,%rbp - adcxq %rcx,%r10 - adoxq %rbp,%r11 - - mulxq 16+128(%rsi),%rcx,%rbp - adcxq %rcx,%r11 - adoxq %rbp,%r12 - - mulxq 24+128(%rsi),%rcx,%rbp - movq %r9,%rdx - mulxq %r15,%rdx,%rax - adcxq %rcx,%r12 - adoxq %rbp,%r13 - - adcxq %r8,%r13 - adoxq %r8,%r8 - adcq $0,%r8 - - - mulxq 0+128(%r14),%rcx,%rbp - adcxq %rcx,%r9 - adoxq %rbp,%r10 - - mulxq 8+128(%r14),%rcx,%rbp - adcxq %rcx,%r10 - adoxq %rbp,%r11 - - mulxq 16+128(%r14),%rcx,%rbp - adcxq %rcx,%r11 - adoxq %rbp,%r12 - - mulxq 24+128(%r14),%rcx,%rbp - movq 16(%rbx),%rdx - adcxq %rcx,%r12 - adoxq %rbp,%r13 - adcxq %r9,%r13 - adoxq %r9,%r8 - adcq $0,%r8 - - - mulxq 0+128(%rsi),%rcx,%rbp - adcxq %rcx,%r10 - adoxq %rbp,%r11 - - mulxq 8+128(%rsi),%rcx,%rbp - adcxq %rcx,%r11 - adoxq %rbp,%r12 - - mulxq 16+128(%rsi),%rcx,%rbp - adcxq %rcx,%r12 - adoxq %rbp,%r13 - - mulxq 24+128(%rsi),%rcx,%rbp - movq %r10,%rdx - mulxq %r15,%rdx,%rax - adcxq %rcx,%r13 - adoxq %rbp,%r8 - - adcxq %r9,%r8 - adoxq %r9,%r9 - adcq $0,%r9 - - - mulxq 0+128(%r14),%rcx,%rbp - adcxq %rcx,%r10 - adoxq %rbp,%r11 - - mulxq 8+128(%r14),%rcx,%rbp - adcxq %rcx,%r11 - adoxq %rbp,%r12 - - mulxq 16+128(%r14),%rcx,%rbp - adcxq %rcx,%r12 - adoxq %rbp,%r13 - - mulxq 24+128(%r14),%rcx,%rbp - movq 24(%rbx),%rdx - adcxq %rcx,%r13 - adoxq %rbp,%r8 - adcxq %r10,%r8 - adoxq %r10,%r9 - adcq $0,%r9 - - - mulxq 0+128(%rsi),%rcx,%rbp - adcxq %rcx,%r11 - adoxq %rbp,%r12 - - mulxq 8+128(%rsi),%rcx,%rbp - adcxq %rcx,%r12 - adoxq %rbp,%r13 - - mulxq 16+128(%rsi),%rcx,%rbp - adcxq %rcx,%r13 - adoxq %rbp,%r8 - - mulxq 24+128(%rsi),%rcx,%rbp - movq %r11,%rdx - mulxq %r15,%rdx,%rax - adcxq %rcx,%r8 - adoxq %rbp,%r9 - - adcxq %r10,%r9 - adoxq %r10,%r10 - adcq $0,%r10 - - - mulxq 0+128(%r14),%rcx,%rbp - adcxq %rcx,%r11 - adoxq %rbp,%r12 - - mulxq 8+128(%r14),%rcx,%rbp - adcxq %rcx,%r12 - adoxq %rbp,%r13 - - mulxq 16+128(%r14),%rcx,%rbp - adcxq %rcx,%r13 - adoxq %rbp,%r8 - - mulxq 24+128(%r14),%rcx,%rbp - leaq 128(%r14),%r14 - movq %r12,%rbx - adcxq %rcx,%r8 - adoxq %rbp,%r9 - movq %r13,%rdx - adcxq 
%r11,%r9 - adoxq %r11,%r10 - adcq $0,%r10 - - - - movq %r8,%rcx - subq 0(%r14),%r12 - sbbq 8(%r14),%r13 - sbbq 16(%r14),%r8 - movq %r9,%rbp - sbbq 24(%r14),%r9 - sbbq $0,%r10 - - cmovcq %rbx,%r12 - cmovcq %rdx,%r13 - cmovcq %rcx,%r8 - cmovcq %rbp,%r9 - - movq %r12,0(%rdi) - movq %r13,8(%rdi) - movq %r8,16(%rdi) - movq %r9,24(%rdi) - - movq 0(%rsp),%r15 -.cfi_restore %r15 - movq 8(%rsp),%r14 -.cfi_restore %r14 - movq 16(%rsp),%r13 -.cfi_restore %r13 - movq 24(%rsp),%r12 -.cfi_restore %r12 - movq 32(%rsp),%rbx -.cfi_restore %rbx - movq 40(%rsp),%rbp -.cfi_restore %rbp - leaq 48(%rsp),%rsp -.cfi_adjust_cfa_offset -48 -.Lord_mulx_epilogue: - .byte 0xf3,0xc3 -.cfi_endproc -.size ecp_nistz256_ord_mul_montx,.-ecp_nistz256_ord_mul_montx - -.type ecp_nistz256_ord_sqr_montx,@function -.align 32 -ecp_nistz256_ord_sqr_montx: -.cfi_startproc -.Lecp_nistz256_ord_sqr_montx: - pushq %rbp -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbp,-16 - pushq %rbx -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbx,-24 - pushq %r12 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r12,-32 - pushq %r13 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r13,-40 - pushq %r14 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r14,-48 - pushq %r15 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r15,-56 -.Lord_sqrx_body: - - movq %rdx,%rbx - movq 0(%rsi),%rdx - movq 8(%rsi),%r14 - movq 16(%rsi),%r15 - movq 24(%rsi),%r8 - leaq .Lord(%rip),%rsi - jmp .Loop_ord_sqrx - -.align 32 -.Loop_ord_sqrx: - mulxq %r14,%r9,%r10 - mulxq %r15,%rcx,%r11 - movq %rdx,%rax -.byte 102,73,15,110,206 - mulxq %r8,%rbp,%r12 - movq %r14,%rdx - addq %rcx,%r10 -.byte 102,73,15,110,215 - adcq %rbp,%r11 - adcq $0,%r12 - xorq %r13,%r13 - - mulxq %r15,%rcx,%rbp - adcxq %rcx,%r11 - adoxq %rbp,%r12 - - mulxq %r8,%rcx,%rbp - movq %r15,%rdx - adcxq %rcx,%r12 - adoxq %rbp,%r13 - adcq $0,%r13 - - mulxq %r8,%rcx,%r14 - movq %rax,%rdx -.byte 102,73,15,110,216 - xorq %r15,%r15 - adcxq %r9,%r9 - adoxq %rcx,%r13 - adcxq %r10,%r10 - adoxq %r15,%r14 - - - mulxq %rdx,%r8,%rbp -.byte 102,72,15,126,202 - adcxq %r11,%r11 - adoxq %rbp,%r9 - adcxq %r12,%r12 - mulxq %rdx,%rcx,%rax -.byte 102,72,15,126,210 - adcxq %r13,%r13 - adoxq %rcx,%r10 - adcxq %r14,%r14 - mulxq %rdx,%rcx,%rbp -.byte 0x67 -.byte 102,72,15,126,218 - adoxq %rax,%r11 - adcxq %r15,%r15 - adoxq %rcx,%r12 - adoxq %rbp,%r13 - mulxq %rdx,%rcx,%rax - adoxq %rcx,%r14 - adoxq %rax,%r15 - - - movq %r8,%rdx - mulxq 32(%rsi),%rdx,%rcx - - xorq %rax,%rax - mulxq 0(%rsi),%rcx,%rbp - adcxq %rcx,%r8 - adoxq %rbp,%r9 - mulxq 8(%rsi),%rcx,%rbp - adcxq %rcx,%r9 - adoxq %rbp,%r10 - mulxq 16(%rsi),%rcx,%rbp - adcxq %rcx,%r10 - adoxq %rbp,%r11 - mulxq 24(%rsi),%rcx,%rbp - adcxq %rcx,%r11 - adoxq %rbp,%r8 - adcxq %rax,%r8 - - - movq %r9,%rdx - mulxq 32(%rsi),%rdx,%rcx - - mulxq 0(%rsi),%rcx,%rbp - adoxq %rcx,%r9 - adcxq %rbp,%r10 - mulxq 8(%rsi),%rcx,%rbp - adoxq %rcx,%r10 - adcxq %rbp,%r11 - mulxq 16(%rsi),%rcx,%rbp - adoxq %rcx,%r11 - adcxq %rbp,%r8 - mulxq 24(%rsi),%rcx,%rbp - adoxq %rcx,%r8 - adcxq %rbp,%r9 - adoxq %rax,%r9 - - - movq %r10,%rdx - mulxq 32(%rsi),%rdx,%rcx - - mulxq 0(%rsi),%rcx,%rbp - adcxq %rcx,%r10 - adoxq %rbp,%r11 - mulxq 8(%rsi),%rcx,%rbp - adcxq %rcx,%r11 - adoxq %rbp,%r8 - mulxq 16(%rsi),%rcx,%rbp - adcxq %rcx,%r8 - adoxq %rbp,%r9 - mulxq 24(%rsi),%rcx,%rbp - adcxq %rcx,%r9 - adoxq %rbp,%r10 - adcxq %rax,%r10 - - - movq %r11,%rdx - mulxq 32(%rsi),%rdx,%rcx - - mulxq 0(%rsi),%rcx,%rbp - adoxq %rcx,%r11 - adcxq %rbp,%r8 - mulxq 8(%rsi),%rcx,%rbp - adoxq %rcx,%r8 - adcxq %rbp,%r9 - mulxq 16(%rsi),%rcx,%rbp - adoxq %rcx,%r9 - adcxq %rbp,%r10 - mulxq 
24(%rsi),%rcx,%rbp - adoxq %rcx,%r10 - adcxq %rbp,%r11 - adoxq %rax,%r11 - - - addq %r8,%r12 - adcq %r13,%r9 - movq %r12,%rdx - adcq %r14,%r10 - adcq %r15,%r11 - movq %r9,%r14 - adcq $0,%rax - - - subq 0(%rsi),%r12 - movq %r10,%r15 - sbbq 8(%rsi),%r9 - sbbq 16(%rsi),%r10 - movq %r11,%r8 - sbbq 24(%rsi),%r11 - sbbq $0,%rax - - cmovncq %r12,%rdx - cmovncq %r9,%r14 - cmovncq %r10,%r15 - cmovncq %r11,%r8 - - decq %rbx - jnz .Loop_ord_sqrx - - movq %rdx,0(%rdi) - movq %r14,8(%rdi) - pxor %xmm1,%xmm1 - movq %r15,16(%rdi) - pxor %xmm2,%xmm2 - movq %r8,24(%rdi) - pxor %xmm3,%xmm3 - - movq 0(%rsp),%r15 -.cfi_restore %r15 - movq 8(%rsp),%r14 -.cfi_restore %r14 - movq 16(%rsp),%r13 -.cfi_restore %r13 - movq 24(%rsp),%r12 -.cfi_restore %r12 - movq 32(%rsp),%rbx -.cfi_restore %rbx - movq 40(%rsp),%rbp -.cfi_restore %rbp - leaq 48(%rsp),%rsp -.cfi_adjust_cfa_offset -48 -.Lord_sqrx_epilogue: - .byte 0xf3,0xc3 -.cfi_endproc -.size ecp_nistz256_ord_sqr_montx,.-ecp_nistz256_ord_sqr_montx - @@ -3876,10 +3412,10 @@ ecp_nistz256_ord_sqr_montx: .type ecp_nistz256_to_mont,@function .align 32 ecp_nistz256_to_mont: - movl $0x80100,%ecx - andl OPENSSL_ia32cap_P+8(%rip),%ecx +.cfi_startproc leaq .LRR(%rip),%rdx jmp .Lmul_mont +.cfi_endproc .size ecp_nistz256_to_mont,.-ecp_nistz256_to_mont @@ -3893,8 +3429,6 @@ ecp_nistz256_to_mont: .align 32 ecp_nistz256_mul_mont: .cfi_startproc - movl $0x80100,%ecx - andl OPENSSL_ia32cap_P+8(%rip),%ecx .Lmul_mont: pushq %rbp .cfi_adjust_cfa_offset 8 @@ -3915,8 +3449,6 @@ ecp_nistz256_mul_mont: .cfi_adjust_cfa_offset 8 .cfi_offset %r15,-56 .Lmul_body: - cmpl $0x80100,%ecx - je .Lmul_montx movq %rdx,%rbx movq 0(%rdx),%rax movq 0(%rsi),%r9 @@ -3925,19 +3457,6 @@ ecp_nistz256_mul_mont: movq 24(%rsi),%r12 call __ecp_nistz256_mul_montq - jmp .Lmul_mont_done - -.align 32 -.Lmul_montx: - movq %rdx,%rbx - movq 0(%rdx),%rdx - movq 0(%rsi),%r9 - movq 8(%rsi),%r10 - movq 16(%rsi),%r11 - movq 24(%rsi),%r12 - leaq -128(%rsi),%rsi - - call __ecp_nistz256_mul_montx .Lmul_mont_done: movq 0(%rsp),%r15 .cfi_restore %r15 @@ -4188,8 +3707,6 @@ __ecp_nistz256_mul_montq: .align 32 ecp_nistz256_sqr_mont: .cfi_startproc - movl $0x80100,%ecx - andl OPENSSL_ia32cap_P+8(%rip),%ecx pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 @@ -4209,25 +3726,12 @@ ecp_nistz256_sqr_mont: .cfi_adjust_cfa_offset 8 .cfi_offset %r15,-56 .Lsqr_body: - cmpl $0x80100,%ecx - je .Lsqr_montx movq 0(%rsi),%rax movq 8(%rsi),%r14 movq 16(%rsi),%r15 movq 24(%rsi),%r8 call __ecp_nistz256_sqr_montq - jmp .Lsqr_mont_done - -.align 32 -.Lsqr_montx: - movq 0(%rsi),%rdx - movq 8(%rsi),%r14 - movq 16(%rsi),%r15 - movq 24(%rsi),%r8 - leaq -128(%rsi),%rsi - - call __ecp_nistz256_sqr_montx .Lsqr_mont_done: movq 0(%rsp),%r15 .cfi_restore %r15 @@ -4411,304 +3915,6 @@ __ecp_nistz256_sqr_montq: .byte 0xf3,0xc3 .cfi_endproc .size __ecp_nistz256_sqr_montq,.-__ecp_nistz256_sqr_montq -.type __ecp_nistz256_mul_montx,@function -.align 32 -__ecp_nistz256_mul_montx: -.cfi_startproc - - - mulxq %r9,%r8,%r9 - mulxq %r10,%rcx,%r10 - movq $32,%r14 - xorq %r13,%r13 - mulxq %r11,%rbp,%r11 - movq .Lpoly+24(%rip),%r15 - adcq %rcx,%r9 - mulxq %r12,%rcx,%r12 - movq %r8,%rdx - adcq %rbp,%r10 - shlxq %r14,%r8,%rbp - adcq %rcx,%r11 - shrxq %r14,%r8,%rcx - adcq $0,%r12 - - - - addq %rbp,%r9 - adcq %rcx,%r10 - - mulxq %r15,%rcx,%rbp - movq 8(%rbx),%rdx - adcq %rcx,%r11 - adcq %rbp,%r12 - adcq $0,%r13 - xorq %r8,%r8 - - - - mulxq 0+128(%rsi),%rcx,%rbp - adcxq %rcx,%r9 - adoxq %rbp,%r10 - - mulxq 8+128(%rsi),%rcx,%rbp - adcxq %rcx,%r10 - adoxq %rbp,%r11 - - mulxq 
16+128(%rsi),%rcx,%rbp - adcxq %rcx,%r11 - adoxq %rbp,%r12 - - mulxq 24+128(%rsi),%rcx,%rbp - movq %r9,%rdx - adcxq %rcx,%r12 - shlxq %r14,%r9,%rcx - adoxq %rbp,%r13 - shrxq %r14,%r9,%rbp - - adcxq %r8,%r13 - adoxq %r8,%r8 - adcq $0,%r8 - - - - addq %rcx,%r10 - adcq %rbp,%r11 - - mulxq %r15,%rcx,%rbp - movq 16(%rbx),%rdx - adcq %rcx,%r12 - adcq %rbp,%r13 - adcq $0,%r8 - xorq %r9,%r9 - - - - mulxq 0+128(%rsi),%rcx,%rbp - adcxq %rcx,%r10 - adoxq %rbp,%r11 - - mulxq 8+128(%rsi),%rcx,%rbp - adcxq %rcx,%r11 - adoxq %rbp,%r12 - - mulxq 16+128(%rsi),%rcx,%rbp - adcxq %rcx,%r12 - adoxq %rbp,%r13 - - mulxq 24+128(%rsi),%rcx,%rbp - movq %r10,%rdx - adcxq %rcx,%r13 - shlxq %r14,%r10,%rcx - adoxq %rbp,%r8 - shrxq %r14,%r10,%rbp - - adcxq %r9,%r8 - adoxq %r9,%r9 - adcq $0,%r9 - - - - addq %rcx,%r11 - adcq %rbp,%r12 - - mulxq %r15,%rcx,%rbp - movq 24(%rbx),%rdx - adcq %rcx,%r13 - adcq %rbp,%r8 - adcq $0,%r9 - xorq %r10,%r10 - - - - mulxq 0+128(%rsi),%rcx,%rbp - adcxq %rcx,%r11 - adoxq %rbp,%r12 - - mulxq 8+128(%rsi),%rcx,%rbp - adcxq %rcx,%r12 - adoxq %rbp,%r13 - - mulxq 16+128(%rsi),%rcx,%rbp - adcxq %rcx,%r13 - adoxq %rbp,%r8 - - mulxq 24+128(%rsi),%rcx,%rbp - movq %r11,%rdx - adcxq %rcx,%r8 - shlxq %r14,%r11,%rcx - adoxq %rbp,%r9 - shrxq %r14,%r11,%rbp - - adcxq %r10,%r9 - adoxq %r10,%r10 - adcq $0,%r10 - - - - addq %rcx,%r12 - adcq %rbp,%r13 - - mulxq %r15,%rcx,%rbp - movq %r12,%rbx - movq .Lpoly+8(%rip),%r14 - adcq %rcx,%r8 - movq %r13,%rdx - adcq %rbp,%r9 - adcq $0,%r10 - - - - xorl %eax,%eax - movq %r8,%rcx - sbbq $-1,%r12 - sbbq %r14,%r13 - sbbq $0,%r8 - movq %r9,%rbp - sbbq %r15,%r9 - sbbq $0,%r10 - - cmovcq %rbx,%r12 - cmovcq %rdx,%r13 - movq %r12,0(%rdi) - cmovcq %rcx,%r8 - movq %r13,8(%rdi) - cmovcq %rbp,%r9 - movq %r8,16(%rdi) - movq %r9,24(%rdi) - - .byte 0xf3,0xc3 -.cfi_endproc -.size __ecp_nistz256_mul_montx,.-__ecp_nistz256_mul_montx - -.type __ecp_nistz256_sqr_montx,@function -.align 32 -__ecp_nistz256_sqr_montx: -.cfi_startproc - mulxq %r14,%r9,%r10 - mulxq %r15,%rcx,%r11 - xorl %eax,%eax - adcq %rcx,%r10 - mulxq %r8,%rbp,%r12 - movq %r14,%rdx - adcq %rbp,%r11 - adcq $0,%r12 - xorq %r13,%r13 - - - mulxq %r15,%rcx,%rbp - adcxq %rcx,%r11 - adoxq %rbp,%r12 - - mulxq %r8,%rcx,%rbp - movq %r15,%rdx - adcxq %rcx,%r12 - adoxq %rbp,%r13 - adcq $0,%r13 - - - mulxq %r8,%rcx,%r14 - movq 0+128(%rsi),%rdx - xorq %r15,%r15 - adcxq %r9,%r9 - adoxq %rcx,%r13 - adcxq %r10,%r10 - adoxq %r15,%r14 - - mulxq %rdx,%r8,%rbp - movq 8+128(%rsi),%rdx - adcxq %r11,%r11 - adoxq %rbp,%r9 - adcxq %r12,%r12 - mulxq %rdx,%rcx,%rax - movq 16+128(%rsi),%rdx - adcxq %r13,%r13 - adoxq %rcx,%r10 - adcxq %r14,%r14 -.byte 0x67 - mulxq %rdx,%rcx,%rbp - movq 24+128(%rsi),%rdx - adoxq %rax,%r11 - adcxq %r15,%r15 - adoxq %rcx,%r12 - movq $32,%rsi - adoxq %rbp,%r13 -.byte 0x67,0x67 - mulxq %rdx,%rcx,%rax - movq .Lpoly+24(%rip),%rdx - adoxq %rcx,%r14 - shlxq %rsi,%r8,%rcx - adoxq %rax,%r15 - shrxq %rsi,%r8,%rax - movq %rdx,%rbp - - - addq %rcx,%r9 - adcq %rax,%r10 - - mulxq %r8,%rcx,%r8 - adcq %rcx,%r11 - shlxq %rsi,%r9,%rcx - adcq $0,%r8 - shrxq %rsi,%r9,%rax - - - addq %rcx,%r10 - adcq %rax,%r11 - - mulxq %r9,%rcx,%r9 - adcq %rcx,%r8 - shlxq %rsi,%r10,%rcx - adcq $0,%r9 - shrxq %rsi,%r10,%rax - - - addq %rcx,%r11 - adcq %rax,%r8 - - mulxq %r10,%rcx,%r10 - adcq %rcx,%r9 - shlxq %rsi,%r11,%rcx - adcq $0,%r10 - shrxq %rsi,%r11,%rax - - - addq %rcx,%r8 - adcq %rax,%r9 - - mulxq %r11,%rcx,%r11 - adcq %rcx,%r10 - adcq $0,%r11 - - xorq %rdx,%rdx - addq %r8,%r12 - movq .Lpoly+8(%rip),%rsi - adcq %r9,%r13 - movq %r12,%r8 - adcq 
%r10,%r14 - adcq %r11,%r15 - movq %r13,%r9 - adcq $0,%rdx - - subq $-1,%r12 - movq %r14,%r10 - sbbq %rsi,%r13 - sbbq $0,%r14 - movq %r15,%r11 - sbbq %rbp,%r15 - sbbq $0,%rdx - - cmovcq %r8,%r12 - cmovcq %r9,%r13 - movq %r12,0(%rdi) - cmovcq %r10,%r14 - movq %r13,8(%rdi) - cmovcq %r11,%r15 - movq %r14,16(%rdi) - movq %r15,24(%rdi) - - .byte 0xf3,0xc3 -.cfi_endproc -.size __ecp_nistz256_sqr_montx,.-__ecp_nistz256_sqr_montx @@ -4823,6 +4029,7 @@ ecp_nistz256_from_mont: .type ecp_nistz256_scatter_w5,@function .align 32 ecp_nistz256_scatter_w5: +.cfi_startproc leal -3(%rdx,%rdx,2),%edx movdqa 0(%rsi),%xmm0 shll $5,%edx @@ -4839,6 +4046,7 @@ ecp_nistz256_scatter_w5: movdqa %xmm5,80(%rdi,%rdx,1) .byte 0xf3,0xc3 +.cfi_endproc .size ecp_nistz256_scatter_w5,.-ecp_nistz256_scatter_w5 @@ -4848,9 +4056,6 @@ ecp_nistz256_scatter_w5: .align 32 ecp_nistz256_gather_w5: .cfi_startproc - movl OPENSSL_ia32cap_P+8(%rip),%eax - testl $32,%eax - jnz .Lavx2_gather_w5 movdqa .LOne(%rip),%xmm0 movd %edx,%xmm1 @@ -4912,6 +4117,7 @@ ecp_nistz256_gather_w5: .type ecp_nistz256_scatter_w7,@function .align 32 ecp_nistz256_scatter_w7: +.cfi_startproc movdqu 0(%rsi),%xmm0 shll $6,%edx movdqu 16(%rsi),%xmm1 @@ -4923,6 +4129,7 @@ ecp_nistz256_scatter_w7: movdqa %xmm3,48(%rdi,%rdx,1) .byte 0xf3,0xc3 +.cfi_endproc .size ecp_nistz256_scatter_w7,.-ecp_nistz256_scatter_w7 @@ -4932,9 +4139,6 @@ ecp_nistz256_scatter_w7: .align 32 ecp_nistz256_gather_w7: .cfi_startproc - movl OPENSSL_ia32cap_P+8(%rip),%eax - testl $32,%eax - jnz .Lavx2_gather_w7 movdqa .LOne(%rip),%xmm8 movd %edx,%xmm1 @@ -4978,148 +4182,14 @@ ecp_nistz256_gather_w7: .cfi_endproc .LSEH_end_ecp_nistz256_gather_w7: .size ecp_nistz256_gather_w7,.-ecp_nistz256_gather_w7 - - -.type ecp_nistz256_avx2_gather_w5,@function -.align 32 -ecp_nistz256_avx2_gather_w5: -.cfi_startproc -.Lavx2_gather_w5: - vzeroupper - vmovdqa .LTwo(%rip),%ymm0 - - vpxor %ymm2,%ymm2,%ymm2 - vpxor %ymm3,%ymm3,%ymm3 - vpxor %ymm4,%ymm4,%ymm4 - - vmovdqa .LOne(%rip),%ymm5 - vmovdqa .LTwo(%rip),%ymm10 - - vmovd %edx,%xmm1 - vpermd %ymm1,%ymm2,%ymm1 - - movq $8,%rax -.Lselect_loop_avx2_w5: - - vmovdqa 0(%rsi),%ymm6 - vmovdqa 32(%rsi),%ymm7 - vmovdqa 64(%rsi),%ymm8 - - vmovdqa 96(%rsi),%ymm11 - vmovdqa 128(%rsi),%ymm12 - vmovdqa 160(%rsi),%ymm13 - - vpcmpeqd %ymm1,%ymm5,%ymm9 - vpcmpeqd %ymm1,%ymm10,%ymm14 - - vpaddd %ymm0,%ymm5,%ymm5 - vpaddd %ymm0,%ymm10,%ymm10 - leaq 192(%rsi),%rsi - - vpand %ymm9,%ymm6,%ymm6 - vpand %ymm9,%ymm7,%ymm7 - vpand %ymm9,%ymm8,%ymm8 - vpand %ymm14,%ymm11,%ymm11 - vpand %ymm14,%ymm12,%ymm12 - vpand %ymm14,%ymm13,%ymm13 - - vpxor %ymm6,%ymm2,%ymm2 - vpxor %ymm7,%ymm3,%ymm3 - vpxor %ymm8,%ymm4,%ymm4 - vpxor %ymm11,%ymm2,%ymm2 - vpxor %ymm12,%ymm3,%ymm3 - vpxor %ymm13,%ymm4,%ymm4 - - decq %rax - jnz .Lselect_loop_avx2_w5 - - vmovdqu %ymm2,0(%rdi) - vmovdqu %ymm3,32(%rdi) - vmovdqu %ymm4,64(%rdi) - vzeroupper - .byte 0xf3,0xc3 -.cfi_endproc -.LSEH_end_ecp_nistz256_avx2_gather_w5: -.size ecp_nistz256_avx2_gather_w5,.-ecp_nistz256_avx2_gather_w5 - - - .globl ecp_nistz256_avx2_gather_w7 .type ecp_nistz256_avx2_gather_w7,@function .align 32 ecp_nistz256_avx2_gather_w7: .cfi_startproc -.Lavx2_gather_w7: - vzeroupper - vmovdqa .LThree(%rip),%ymm0 - - vpxor %ymm2,%ymm2,%ymm2 - vpxor %ymm3,%ymm3,%ymm3 - - vmovdqa .LOne(%rip),%ymm4 - vmovdqa .LTwo(%rip),%ymm8 - vmovdqa .LThree(%rip),%ymm12 - - vmovd %edx,%xmm1 - vpermd %ymm1,%ymm2,%ymm1 - - - movq $21,%rax -.Lselect_loop_avx2_w7: - - vmovdqa 0(%rsi),%ymm5 - vmovdqa 32(%rsi),%ymm6 - - vmovdqa 64(%rsi),%ymm9 - vmovdqa 96(%rsi),%ymm10 - - 
vmovdqa 128(%rsi),%ymm13 - vmovdqa 160(%rsi),%ymm14 - - vpcmpeqd %ymm1,%ymm4,%ymm7 - vpcmpeqd %ymm1,%ymm8,%ymm11 - vpcmpeqd %ymm1,%ymm12,%ymm15 - - vpaddd %ymm0,%ymm4,%ymm4 - vpaddd %ymm0,%ymm8,%ymm8 - vpaddd %ymm0,%ymm12,%ymm12 - leaq 192(%rsi),%rsi - - vpand %ymm7,%ymm5,%ymm5 - vpand %ymm7,%ymm6,%ymm6 - vpand %ymm11,%ymm9,%ymm9 - vpand %ymm11,%ymm10,%ymm10 - vpand %ymm15,%ymm13,%ymm13 - vpand %ymm15,%ymm14,%ymm14 - - vpxor %ymm5,%ymm2,%ymm2 - vpxor %ymm6,%ymm3,%ymm3 - vpxor %ymm9,%ymm2,%ymm2 - vpxor %ymm10,%ymm3,%ymm3 - vpxor %ymm13,%ymm2,%ymm2 - vpxor %ymm14,%ymm3,%ymm3 - - decq %rax - jnz .Lselect_loop_avx2_w7 - - - vmovdqa 0(%rsi),%ymm5 - vmovdqa 32(%rsi),%ymm6 - - vpcmpeqd %ymm1,%ymm4,%ymm7 - - vpand %ymm7,%ymm5,%ymm5 - vpand %ymm7,%ymm6,%ymm6 - - vpxor %ymm5,%ymm2,%ymm2 - vpxor %ymm6,%ymm3,%ymm3 - - vmovdqu %ymm2,0(%rdi) - vmovdqu %ymm3,32(%rdi) - vzeroupper +.byte 0x0f,0x0b .byte 0xf3,0xc3 .cfi_endproc -.LSEH_end_ecp_nistz256_avx2_gather_w7: .size ecp_nistz256_avx2_gather_w7,.-ecp_nistz256_avx2_gather_w7 .type __ecp_nistz256_add_toq,@function .align 32 @@ -5255,10 +4325,6 @@ __ecp_nistz256_mul_by_2q: .align 32 ecp_nistz256_point_double: .cfi_startproc - movl $0x80100,%ecx - andl OPENSSL_ia32cap_P+8(%rip),%ecx - cmpl $0x80100,%ecx - je .Lpoint_doublex pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 @@ -5487,10 +4553,6 @@ ecp_nistz256_point_double: .align 32 ecp_nistz256_point_add: .cfi_startproc - movl $0x80100,%ecx - andl OPENSSL_ia32cap_P+8(%rip),%ecx - cmpl $0x80100,%ecx - je .Lpoint_addx pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 @@ -5657,26 +4719,16 @@ ecp_nistz256_point_add: orq %r8,%r12 orq %r9,%r12 -.byte 0x3e - jnz .Ladd_proceedq .byte 102,73,15,126,208 .byte 102,73,15,126,217 - testq %r8,%r8 - jnz .Ladd_proceedq - testq %r9,%r9 - jz .Ladd_doubleq -.byte 102,72,15,126,199 - pxor %xmm0,%xmm0 - movdqu %xmm0,0(%rdi) - movdqu %xmm0,16(%rdi) - movdqu %xmm0,32(%rdi) - movdqu %xmm0,48(%rdi) - movdqu %xmm0,64(%rdi) - movdqu %xmm0,80(%rdi) - jmp .Ladd_doneq + orq %r8,%r12 + orq %r9,%r12 + + +.byte 0x3e + jnz .Ladd_proceedq -.align 32 .Ladd_doubleq: .byte 102,72,15,126,206 .byte 102,72,15,126,199 @@ -5915,10 +4967,6 @@ ecp_nistz256_point_add: .align 32 ecp_nistz256_point_add_affine: .cfi_startproc - movl $0x80100,%ecx - andl OPENSSL_ia32cap_P+8(%rip),%ecx - cmpl $0x80100,%ecx - je .Lpoint_add_affinex pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 @@ -6242,1118 +5290,3 @@ ecp_nistz256_point_add_affine: .byte 0xf3,0xc3 .cfi_endproc .size ecp_nistz256_point_add_affine,.-ecp_nistz256_point_add_affine -.type __ecp_nistz256_add_tox,@function -.align 32 -__ecp_nistz256_add_tox: -.cfi_startproc - xorq %r11,%r11 - adcq 0(%rbx),%r12 - adcq 8(%rbx),%r13 - movq %r12,%rax - adcq 16(%rbx),%r8 - adcq 24(%rbx),%r9 - movq %r13,%rbp - adcq $0,%r11 - - xorq %r10,%r10 - sbbq $-1,%r12 - movq %r8,%rcx - sbbq %r14,%r13 - sbbq $0,%r8 - movq %r9,%r10 - sbbq %r15,%r9 - sbbq $0,%r11 - - cmovcq %rax,%r12 - cmovcq %rbp,%r13 - movq %r12,0(%rdi) - cmovcq %rcx,%r8 - movq %r13,8(%rdi) - cmovcq %r10,%r9 - movq %r8,16(%rdi) - movq %r9,24(%rdi) - - .byte 0xf3,0xc3 -.cfi_endproc -.size __ecp_nistz256_add_tox,.-__ecp_nistz256_add_tox - -.type __ecp_nistz256_sub_fromx,@function -.align 32 -__ecp_nistz256_sub_fromx: -.cfi_startproc - xorq %r11,%r11 - sbbq 0(%rbx),%r12 - sbbq 8(%rbx),%r13 - movq %r12,%rax - sbbq 16(%rbx),%r8 - sbbq 24(%rbx),%r9 - movq %r13,%rbp - sbbq $0,%r11 - - xorq %r10,%r10 - adcq $-1,%r12 - movq %r8,%rcx - adcq %r14,%r13 - adcq $0,%r8 - movq %r9,%r10 - adcq %r15,%r9 - - 
btq $0,%r11 - cmovncq %rax,%r12 - cmovncq %rbp,%r13 - movq %r12,0(%rdi) - cmovncq %rcx,%r8 - movq %r13,8(%rdi) - cmovncq %r10,%r9 - movq %r8,16(%rdi) - movq %r9,24(%rdi) - - .byte 0xf3,0xc3 -.cfi_endproc -.size __ecp_nistz256_sub_fromx,.-__ecp_nistz256_sub_fromx - -.type __ecp_nistz256_subx,@function -.align 32 -__ecp_nistz256_subx: -.cfi_startproc - xorq %r11,%r11 - sbbq %r12,%rax - sbbq %r13,%rbp - movq %rax,%r12 - sbbq %r8,%rcx - sbbq %r9,%r10 - movq %rbp,%r13 - sbbq $0,%r11 - - xorq %r9,%r9 - adcq $-1,%rax - movq %rcx,%r8 - adcq %r14,%rbp - adcq $0,%rcx - movq %r10,%r9 - adcq %r15,%r10 - - btq $0,%r11 - cmovcq %rax,%r12 - cmovcq %rbp,%r13 - cmovcq %rcx,%r8 - cmovcq %r10,%r9 - - .byte 0xf3,0xc3 -.cfi_endproc -.size __ecp_nistz256_subx,.-__ecp_nistz256_subx - -.type __ecp_nistz256_mul_by_2x,@function -.align 32 -__ecp_nistz256_mul_by_2x: -.cfi_startproc - xorq %r11,%r11 - adcq %r12,%r12 - adcq %r13,%r13 - movq %r12,%rax - adcq %r8,%r8 - adcq %r9,%r9 - movq %r13,%rbp - adcq $0,%r11 - - xorq %r10,%r10 - sbbq $-1,%r12 - movq %r8,%rcx - sbbq %r14,%r13 - sbbq $0,%r8 - movq %r9,%r10 - sbbq %r15,%r9 - sbbq $0,%r11 - - cmovcq %rax,%r12 - cmovcq %rbp,%r13 - movq %r12,0(%rdi) - cmovcq %rcx,%r8 - movq %r13,8(%rdi) - cmovcq %r10,%r9 - movq %r8,16(%rdi) - movq %r9,24(%rdi) - - .byte 0xf3,0xc3 -.cfi_endproc -.size __ecp_nistz256_mul_by_2x,.-__ecp_nistz256_mul_by_2x -.type ecp_nistz256_point_doublex,@function -.align 32 -ecp_nistz256_point_doublex: -.cfi_startproc -.Lpoint_doublex: - pushq %rbp -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbp,-16 - pushq %rbx -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbx,-24 - pushq %r12 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r12,-32 - pushq %r13 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r13,-40 - pushq %r14 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r14,-48 - pushq %r15 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r15,-56 - subq $160+8,%rsp -.cfi_adjust_cfa_offset 32*5+8 -.Lpoint_doublex_body: - -.Lpoint_double_shortcutx: - movdqu 0(%rsi),%xmm0 - movq %rsi,%rbx - movdqu 16(%rsi),%xmm1 - movq 32+0(%rsi),%r12 - movq 32+8(%rsi),%r13 - movq 32+16(%rsi),%r8 - movq 32+24(%rsi),%r9 - movq .Lpoly+8(%rip),%r14 - movq .Lpoly+24(%rip),%r15 - movdqa %xmm0,96(%rsp) - movdqa %xmm1,96+16(%rsp) - leaq 32(%rdi),%r10 - leaq 64(%rdi),%r11 -.byte 102,72,15,110,199 -.byte 102,73,15,110,202 -.byte 102,73,15,110,211 - - leaq 0(%rsp),%rdi - call __ecp_nistz256_mul_by_2x - - movq 64+0(%rsi),%rdx - movq 64+8(%rsi),%r14 - movq 64+16(%rsi),%r15 - movq 64+24(%rsi),%r8 - leaq 64-128(%rsi),%rsi - leaq 64(%rsp),%rdi - call __ecp_nistz256_sqr_montx - - movq 0+0(%rsp),%rdx - movq 8+0(%rsp),%r14 - leaq -128+0(%rsp),%rsi - movq 16+0(%rsp),%r15 - movq 24+0(%rsp),%r8 - leaq 0(%rsp),%rdi - call __ecp_nistz256_sqr_montx - - movq 32(%rbx),%rdx - movq 64+0(%rbx),%r9 - movq 64+8(%rbx),%r10 - movq 64+16(%rbx),%r11 - movq 64+24(%rbx),%r12 - leaq 64-128(%rbx),%rsi - leaq 32(%rbx),%rbx -.byte 102,72,15,126,215 - call __ecp_nistz256_mul_montx - call __ecp_nistz256_mul_by_2x - - movq 96+0(%rsp),%r12 - movq 96+8(%rsp),%r13 - leaq 64(%rsp),%rbx - movq 96+16(%rsp),%r8 - movq 96+24(%rsp),%r9 - leaq 32(%rsp),%rdi - call __ecp_nistz256_add_tox - - movq 96+0(%rsp),%r12 - movq 96+8(%rsp),%r13 - leaq 64(%rsp),%rbx - movq 96+16(%rsp),%r8 - movq 96+24(%rsp),%r9 - leaq 64(%rsp),%rdi - call __ecp_nistz256_sub_fromx - - movq 0+0(%rsp),%rdx - movq 8+0(%rsp),%r14 - leaq -128+0(%rsp),%rsi - movq 16+0(%rsp),%r15 - movq 24+0(%rsp),%r8 -.byte 102,72,15,126,207 - call __ecp_nistz256_sqr_montx - xorq %r9,%r9 - movq %r12,%rax - addq $-1,%r12 - movq 
%r13,%r10 - adcq %rsi,%r13 - movq %r14,%rcx - adcq $0,%r14 - movq %r15,%r8 - adcq %rbp,%r15 - adcq $0,%r9 - xorq %rsi,%rsi - testq $1,%rax - - cmovzq %rax,%r12 - cmovzq %r10,%r13 - cmovzq %rcx,%r14 - cmovzq %r8,%r15 - cmovzq %rsi,%r9 - - movq %r13,%rax - shrq $1,%r12 - shlq $63,%rax - movq %r14,%r10 - shrq $1,%r13 - orq %rax,%r12 - shlq $63,%r10 - movq %r15,%rcx - shrq $1,%r14 - orq %r10,%r13 - shlq $63,%rcx - movq %r12,0(%rdi) - shrq $1,%r15 - movq %r13,8(%rdi) - shlq $63,%r9 - orq %rcx,%r14 - orq %r9,%r15 - movq %r14,16(%rdi) - movq %r15,24(%rdi) - movq 64(%rsp),%rdx - leaq 64(%rsp),%rbx - movq 0+32(%rsp),%r9 - movq 8+32(%rsp),%r10 - leaq -128+32(%rsp),%rsi - movq 16+32(%rsp),%r11 - movq 24+32(%rsp),%r12 - leaq 32(%rsp),%rdi - call __ecp_nistz256_mul_montx - - leaq 128(%rsp),%rdi - call __ecp_nistz256_mul_by_2x - - leaq 32(%rsp),%rbx - leaq 32(%rsp),%rdi - call __ecp_nistz256_add_tox - - movq 96(%rsp),%rdx - leaq 96(%rsp),%rbx - movq 0+0(%rsp),%r9 - movq 8+0(%rsp),%r10 - leaq -128+0(%rsp),%rsi - movq 16+0(%rsp),%r11 - movq 24+0(%rsp),%r12 - leaq 0(%rsp),%rdi - call __ecp_nistz256_mul_montx - - leaq 128(%rsp),%rdi - call __ecp_nistz256_mul_by_2x - - movq 0+32(%rsp),%rdx - movq 8+32(%rsp),%r14 - leaq -128+32(%rsp),%rsi - movq 16+32(%rsp),%r15 - movq 24+32(%rsp),%r8 -.byte 102,72,15,126,199 - call __ecp_nistz256_sqr_montx - - leaq 128(%rsp),%rbx - movq %r14,%r8 - movq %r15,%r9 - movq %rsi,%r14 - movq %rbp,%r15 - call __ecp_nistz256_sub_fromx - - movq 0+0(%rsp),%rax - movq 0+8(%rsp),%rbp - movq 0+16(%rsp),%rcx - movq 0+24(%rsp),%r10 - leaq 0(%rsp),%rdi - call __ecp_nistz256_subx - - movq 32(%rsp),%rdx - leaq 32(%rsp),%rbx - movq %r12,%r14 - xorl %ecx,%ecx - movq %r12,0+0(%rsp) - movq %r13,%r10 - movq %r13,0+8(%rsp) - cmovzq %r8,%r11 - movq %r8,0+16(%rsp) - leaq 0-128(%rsp),%rsi - cmovzq %r9,%r12 - movq %r9,0+24(%rsp) - movq %r14,%r9 - leaq 0(%rsp),%rdi - call __ecp_nistz256_mul_montx - -.byte 102,72,15,126,203 -.byte 102,72,15,126,207 - call __ecp_nistz256_sub_fromx - - leaq 160+56(%rsp),%rsi -.cfi_def_cfa %rsi,8 - movq -48(%rsi),%r15 -.cfi_restore %r15 - movq -40(%rsi),%r14 -.cfi_restore %r14 - movq -32(%rsi),%r13 -.cfi_restore %r13 - movq -24(%rsi),%r12 -.cfi_restore %r12 - movq -16(%rsi),%rbx -.cfi_restore %rbx - movq -8(%rsi),%rbp -.cfi_restore %rbp - leaq (%rsi),%rsp -.cfi_def_cfa_register %rsp -.Lpoint_doublex_epilogue: - .byte 0xf3,0xc3 -.cfi_endproc -.size ecp_nistz256_point_doublex,.-ecp_nistz256_point_doublex -.type ecp_nistz256_point_addx,@function -.align 32 -ecp_nistz256_point_addx: -.cfi_startproc -.Lpoint_addx: - pushq %rbp -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbp,-16 - pushq %rbx -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbx,-24 - pushq %r12 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r12,-32 - pushq %r13 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r13,-40 - pushq %r14 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r14,-48 - pushq %r15 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r15,-56 - subq $576+8,%rsp -.cfi_adjust_cfa_offset 32*18+8 -.Lpoint_addx_body: - - movdqu 0(%rsi),%xmm0 - movdqu 16(%rsi),%xmm1 - movdqu 32(%rsi),%xmm2 - movdqu 48(%rsi),%xmm3 - movdqu 64(%rsi),%xmm4 - movdqu 80(%rsi),%xmm5 - movq %rsi,%rbx - movq %rdx,%rsi - movdqa %xmm0,384(%rsp) - movdqa %xmm1,384+16(%rsp) - movdqa %xmm2,416(%rsp) - movdqa %xmm3,416+16(%rsp) - movdqa %xmm4,448(%rsp) - movdqa %xmm5,448+16(%rsp) - por %xmm4,%xmm5 - - movdqu 0(%rsi),%xmm0 - pshufd $0xb1,%xmm5,%xmm3 - movdqu 16(%rsi),%xmm1 - movdqu 32(%rsi),%xmm2 - por %xmm3,%xmm5 - movdqu 48(%rsi),%xmm3 - movq 64+0(%rsi),%rdx - movq 64+8(%rsi),%r14 - 
movq 64+16(%rsi),%r15 - movq 64+24(%rsi),%r8 - movdqa %xmm0,480(%rsp) - pshufd $0x1e,%xmm5,%xmm4 - movdqa %xmm1,480+16(%rsp) - movdqu 64(%rsi),%xmm0 - movdqu 80(%rsi),%xmm1 - movdqa %xmm2,512(%rsp) - movdqa %xmm3,512+16(%rsp) - por %xmm4,%xmm5 - pxor %xmm4,%xmm4 - por %xmm0,%xmm1 -.byte 102,72,15,110,199 - - leaq 64-128(%rsi),%rsi - movq %rdx,544+0(%rsp) - movq %r14,544+8(%rsp) - movq %r15,544+16(%rsp) - movq %r8,544+24(%rsp) - leaq 96(%rsp),%rdi - call __ecp_nistz256_sqr_montx - - pcmpeqd %xmm4,%xmm5 - pshufd $0xb1,%xmm1,%xmm4 - por %xmm1,%xmm4 - pshufd $0,%xmm5,%xmm5 - pshufd $0x1e,%xmm4,%xmm3 - por %xmm3,%xmm4 - pxor %xmm3,%xmm3 - pcmpeqd %xmm3,%xmm4 - pshufd $0,%xmm4,%xmm4 - movq 64+0(%rbx),%rdx - movq 64+8(%rbx),%r14 - movq 64+16(%rbx),%r15 - movq 64+24(%rbx),%r8 -.byte 102,72,15,110,203 - - leaq 64-128(%rbx),%rsi - leaq 32(%rsp),%rdi - call __ecp_nistz256_sqr_montx - - movq 544(%rsp),%rdx - leaq 544(%rsp),%rbx - movq 0+96(%rsp),%r9 - movq 8+96(%rsp),%r10 - leaq -128+96(%rsp),%rsi - movq 16+96(%rsp),%r11 - movq 24+96(%rsp),%r12 - leaq 224(%rsp),%rdi - call __ecp_nistz256_mul_montx - - movq 448(%rsp),%rdx - leaq 448(%rsp),%rbx - movq 0+32(%rsp),%r9 - movq 8+32(%rsp),%r10 - leaq -128+32(%rsp),%rsi - movq 16+32(%rsp),%r11 - movq 24+32(%rsp),%r12 - leaq 256(%rsp),%rdi - call __ecp_nistz256_mul_montx - - movq 416(%rsp),%rdx - leaq 416(%rsp),%rbx - movq 0+224(%rsp),%r9 - movq 8+224(%rsp),%r10 - leaq -128+224(%rsp),%rsi - movq 16+224(%rsp),%r11 - movq 24+224(%rsp),%r12 - leaq 224(%rsp),%rdi - call __ecp_nistz256_mul_montx - - movq 512(%rsp),%rdx - leaq 512(%rsp),%rbx - movq 0+256(%rsp),%r9 - movq 8+256(%rsp),%r10 - leaq -128+256(%rsp),%rsi - movq 16+256(%rsp),%r11 - movq 24+256(%rsp),%r12 - leaq 256(%rsp),%rdi - call __ecp_nistz256_mul_montx - - leaq 224(%rsp),%rbx - leaq 64(%rsp),%rdi - call __ecp_nistz256_sub_fromx - - orq %r13,%r12 - movdqa %xmm4,%xmm2 - orq %r8,%r12 - orq %r9,%r12 - por %xmm5,%xmm2 -.byte 102,73,15,110,220 - - movq 384(%rsp),%rdx - leaq 384(%rsp),%rbx - movq 0+96(%rsp),%r9 - movq 8+96(%rsp),%r10 - leaq -128+96(%rsp),%rsi - movq 16+96(%rsp),%r11 - movq 24+96(%rsp),%r12 - leaq 160(%rsp),%rdi - call __ecp_nistz256_mul_montx - - movq 480(%rsp),%rdx - leaq 480(%rsp),%rbx - movq 0+32(%rsp),%r9 - movq 8+32(%rsp),%r10 - leaq -128+32(%rsp),%rsi - movq 16+32(%rsp),%r11 - movq 24+32(%rsp),%r12 - leaq 192(%rsp),%rdi - call __ecp_nistz256_mul_montx - - leaq 160(%rsp),%rbx - leaq 0(%rsp),%rdi - call __ecp_nistz256_sub_fromx - - orq %r13,%r12 - orq %r8,%r12 - orq %r9,%r12 - -.byte 0x3e - jnz .Ladd_proceedx -.byte 102,73,15,126,208 -.byte 102,73,15,126,217 - testq %r8,%r8 - jnz .Ladd_proceedx - testq %r9,%r9 - jz .Ladd_doublex - -.byte 102,72,15,126,199 - pxor %xmm0,%xmm0 - movdqu %xmm0,0(%rdi) - movdqu %xmm0,16(%rdi) - movdqu %xmm0,32(%rdi) - movdqu %xmm0,48(%rdi) - movdqu %xmm0,64(%rdi) - movdqu %xmm0,80(%rdi) - jmp .Ladd_donex - -.align 32 -.Ladd_doublex: -.byte 102,72,15,126,206 -.byte 102,72,15,126,199 - addq $416,%rsp -.cfi_adjust_cfa_offset -416 - jmp .Lpoint_double_shortcutx -.cfi_adjust_cfa_offset 416 - -.align 32 -.Ladd_proceedx: - movq 0+64(%rsp),%rdx - movq 8+64(%rsp),%r14 - leaq -128+64(%rsp),%rsi - movq 16+64(%rsp),%r15 - movq 24+64(%rsp),%r8 - leaq 96(%rsp),%rdi - call __ecp_nistz256_sqr_montx - - movq 448(%rsp),%rdx - leaq 448(%rsp),%rbx - movq 0+0(%rsp),%r9 - movq 8+0(%rsp),%r10 - leaq -128+0(%rsp),%rsi - movq 16+0(%rsp),%r11 - movq 24+0(%rsp),%r12 - leaq 352(%rsp),%rdi - call __ecp_nistz256_mul_montx - - movq 0+0(%rsp),%rdx - movq 8+0(%rsp),%r14 - leaq 
-128+0(%rsp),%rsi - movq 16+0(%rsp),%r15 - movq 24+0(%rsp),%r8 - leaq 32(%rsp),%rdi - call __ecp_nistz256_sqr_montx - - movq 544(%rsp),%rdx - leaq 544(%rsp),%rbx - movq 0+352(%rsp),%r9 - movq 8+352(%rsp),%r10 - leaq -128+352(%rsp),%rsi - movq 16+352(%rsp),%r11 - movq 24+352(%rsp),%r12 - leaq 352(%rsp),%rdi - call __ecp_nistz256_mul_montx - - movq 0(%rsp),%rdx - leaq 0(%rsp),%rbx - movq 0+32(%rsp),%r9 - movq 8+32(%rsp),%r10 - leaq -128+32(%rsp),%rsi - movq 16+32(%rsp),%r11 - movq 24+32(%rsp),%r12 - leaq 128(%rsp),%rdi - call __ecp_nistz256_mul_montx - - movq 160(%rsp),%rdx - leaq 160(%rsp),%rbx - movq 0+32(%rsp),%r9 - movq 8+32(%rsp),%r10 - leaq -128+32(%rsp),%rsi - movq 16+32(%rsp),%r11 - movq 24+32(%rsp),%r12 - leaq 192(%rsp),%rdi - call __ecp_nistz256_mul_montx - - - - - xorq %r11,%r11 - addq %r12,%r12 - leaq 96(%rsp),%rsi - adcq %r13,%r13 - movq %r12,%rax - adcq %r8,%r8 - adcq %r9,%r9 - movq %r13,%rbp - adcq $0,%r11 - - subq $-1,%r12 - movq %r8,%rcx - sbbq %r14,%r13 - sbbq $0,%r8 - movq %r9,%r10 - sbbq %r15,%r9 - sbbq $0,%r11 - - cmovcq %rax,%r12 - movq 0(%rsi),%rax - cmovcq %rbp,%r13 - movq 8(%rsi),%rbp - cmovcq %rcx,%r8 - movq 16(%rsi),%rcx - cmovcq %r10,%r9 - movq 24(%rsi),%r10 - - call __ecp_nistz256_subx - - leaq 128(%rsp),%rbx - leaq 288(%rsp),%rdi - call __ecp_nistz256_sub_fromx - - movq 192+0(%rsp),%rax - movq 192+8(%rsp),%rbp - movq 192+16(%rsp),%rcx - movq 192+24(%rsp),%r10 - leaq 320(%rsp),%rdi - - call __ecp_nistz256_subx - - movq %r12,0(%rdi) - movq %r13,8(%rdi) - movq %r8,16(%rdi) - movq %r9,24(%rdi) - movq 128(%rsp),%rdx - leaq 128(%rsp),%rbx - movq 0+224(%rsp),%r9 - movq 8+224(%rsp),%r10 - leaq -128+224(%rsp),%rsi - movq 16+224(%rsp),%r11 - movq 24+224(%rsp),%r12 - leaq 256(%rsp),%rdi - call __ecp_nistz256_mul_montx - - movq 320(%rsp),%rdx - leaq 320(%rsp),%rbx - movq 0+64(%rsp),%r9 - movq 8+64(%rsp),%r10 - leaq -128+64(%rsp),%rsi - movq 16+64(%rsp),%r11 - movq 24+64(%rsp),%r12 - leaq 320(%rsp),%rdi - call __ecp_nistz256_mul_montx - - leaq 256(%rsp),%rbx - leaq 320(%rsp),%rdi - call __ecp_nistz256_sub_fromx - -.byte 102,72,15,126,199 - - movdqa %xmm5,%xmm0 - movdqa %xmm5,%xmm1 - pandn 352(%rsp),%xmm0 - movdqa %xmm5,%xmm2 - pandn 352+16(%rsp),%xmm1 - movdqa %xmm5,%xmm3 - pand 544(%rsp),%xmm2 - pand 544+16(%rsp),%xmm3 - por %xmm0,%xmm2 - por %xmm1,%xmm3 - - movdqa %xmm4,%xmm0 - movdqa %xmm4,%xmm1 - pandn %xmm2,%xmm0 - movdqa %xmm4,%xmm2 - pandn %xmm3,%xmm1 - movdqa %xmm4,%xmm3 - pand 448(%rsp),%xmm2 - pand 448+16(%rsp),%xmm3 - por %xmm0,%xmm2 - por %xmm1,%xmm3 - movdqu %xmm2,64(%rdi) - movdqu %xmm3,80(%rdi) - - movdqa %xmm5,%xmm0 - movdqa %xmm5,%xmm1 - pandn 288(%rsp),%xmm0 - movdqa %xmm5,%xmm2 - pandn 288+16(%rsp),%xmm1 - movdqa %xmm5,%xmm3 - pand 480(%rsp),%xmm2 - pand 480+16(%rsp),%xmm3 - por %xmm0,%xmm2 - por %xmm1,%xmm3 - - movdqa %xmm4,%xmm0 - movdqa %xmm4,%xmm1 - pandn %xmm2,%xmm0 - movdqa %xmm4,%xmm2 - pandn %xmm3,%xmm1 - movdqa %xmm4,%xmm3 - pand 384(%rsp),%xmm2 - pand 384+16(%rsp),%xmm3 - por %xmm0,%xmm2 - por %xmm1,%xmm3 - movdqu %xmm2,0(%rdi) - movdqu %xmm3,16(%rdi) - - movdqa %xmm5,%xmm0 - movdqa %xmm5,%xmm1 - pandn 320(%rsp),%xmm0 - movdqa %xmm5,%xmm2 - pandn 320+16(%rsp),%xmm1 - movdqa %xmm5,%xmm3 - pand 512(%rsp),%xmm2 - pand 512+16(%rsp),%xmm3 - por %xmm0,%xmm2 - por %xmm1,%xmm3 - - movdqa %xmm4,%xmm0 - movdqa %xmm4,%xmm1 - pandn %xmm2,%xmm0 - movdqa %xmm4,%xmm2 - pandn %xmm3,%xmm1 - movdqa %xmm4,%xmm3 - pand 416(%rsp),%xmm2 - pand 416+16(%rsp),%xmm3 - por %xmm0,%xmm2 - por %xmm1,%xmm3 - movdqu %xmm2,32(%rdi) - movdqu %xmm3,48(%rdi) - -.Ladd_donex: - leaq 
576+56(%rsp),%rsi -.cfi_def_cfa %rsi,8 - movq -48(%rsi),%r15 -.cfi_restore %r15 - movq -40(%rsi),%r14 -.cfi_restore %r14 - movq -32(%rsi),%r13 -.cfi_restore %r13 - movq -24(%rsi),%r12 -.cfi_restore %r12 - movq -16(%rsi),%rbx -.cfi_restore %rbx - movq -8(%rsi),%rbp -.cfi_restore %rbp - leaq (%rsi),%rsp -.cfi_def_cfa_register %rsp -.Lpoint_addx_epilogue: - .byte 0xf3,0xc3 -.cfi_endproc -.size ecp_nistz256_point_addx,.-ecp_nistz256_point_addx -.type ecp_nistz256_point_add_affinex,@function -.align 32 -ecp_nistz256_point_add_affinex: -.cfi_startproc -.Lpoint_add_affinex: - pushq %rbp -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbp,-16 - pushq %rbx -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbx,-24 - pushq %r12 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r12,-32 - pushq %r13 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r13,-40 - pushq %r14 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r14,-48 - pushq %r15 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r15,-56 - subq $480+8,%rsp -.cfi_adjust_cfa_offset 32*15+8 -.Ladd_affinex_body: - - movdqu 0(%rsi),%xmm0 - movq %rdx,%rbx - movdqu 16(%rsi),%xmm1 - movdqu 32(%rsi),%xmm2 - movdqu 48(%rsi),%xmm3 - movdqu 64(%rsi),%xmm4 - movdqu 80(%rsi),%xmm5 - movq 64+0(%rsi),%rdx - movq 64+8(%rsi),%r14 - movq 64+16(%rsi),%r15 - movq 64+24(%rsi),%r8 - movdqa %xmm0,320(%rsp) - movdqa %xmm1,320+16(%rsp) - movdqa %xmm2,352(%rsp) - movdqa %xmm3,352+16(%rsp) - movdqa %xmm4,384(%rsp) - movdqa %xmm5,384+16(%rsp) - por %xmm4,%xmm5 - - movdqu 0(%rbx),%xmm0 - pshufd $0xb1,%xmm5,%xmm3 - movdqu 16(%rbx),%xmm1 - movdqu 32(%rbx),%xmm2 - por %xmm3,%xmm5 - movdqu 48(%rbx),%xmm3 - movdqa %xmm0,416(%rsp) - pshufd $0x1e,%xmm5,%xmm4 - movdqa %xmm1,416+16(%rsp) - por %xmm0,%xmm1 -.byte 102,72,15,110,199 - movdqa %xmm2,448(%rsp) - movdqa %xmm3,448+16(%rsp) - por %xmm2,%xmm3 - por %xmm4,%xmm5 - pxor %xmm4,%xmm4 - por %xmm1,%xmm3 - - leaq 64-128(%rsi),%rsi - leaq 32(%rsp),%rdi - call __ecp_nistz256_sqr_montx - - pcmpeqd %xmm4,%xmm5 - pshufd $0xb1,%xmm3,%xmm4 - movq 0(%rbx),%rdx - - movq %r12,%r9 - por %xmm3,%xmm4 - pshufd $0,%xmm5,%xmm5 - pshufd $0x1e,%xmm4,%xmm3 - movq %r13,%r10 - por %xmm3,%xmm4 - pxor %xmm3,%xmm3 - movq %r14,%r11 - pcmpeqd %xmm3,%xmm4 - pshufd $0,%xmm4,%xmm4 - - leaq 32-128(%rsp),%rsi - movq %r15,%r12 - leaq 0(%rsp),%rdi - call __ecp_nistz256_mul_montx - - leaq 320(%rsp),%rbx - leaq 64(%rsp),%rdi - call __ecp_nistz256_sub_fromx - - movq 384(%rsp),%rdx - leaq 384(%rsp),%rbx - movq 0+32(%rsp),%r9 - movq 8+32(%rsp),%r10 - leaq -128+32(%rsp),%rsi - movq 16+32(%rsp),%r11 - movq 24+32(%rsp),%r12 - leaq 32(%rsp),%rdi - call __ecp_nistz256_mul_montx - - movq 384(%rsp),%rdx - leaq 384(%rsp),%rbx - movq 0+64(%rsp),%r9 - movq 8+64(%rsp),%r10 - leaq -128+64(%rsp),%rsi - movq 16+64(%rsp),%r11 - movq 24+64(%rsp),%r12 - leaq 288(%rsp),%rdi - call __ecp_nistz256_mul_montx - - movq 448(%rsp),%rdx - leaq 448(%rsp),%rbx - movq 0+32(%rsp),%r9 - movq 8+32(%rsp),%r10 - leaq -128+32(%rsp),%rsi - movq 16+32(%rsp),%r11 - movq 24+32(%rsp),%r12 - leaq 32(%rsp),%rdi - call __ecp_nistz256_mul_montx - - leaq 352(%rsp),%rbx - leaq 96(%rsp),%rdi - call __ecp_nistz256_sub_fromx - - movq 0+64(%rsp),%rdx - movq 8+64(%rsp),%r14 - leaq -128+64(%rsp),%rsi - movq 16+64(%rsp),%r15 - movq 24+64(%rsp),%r8 - leaq 128(%rsp),%rdi - call __ecp_nistz256_sqr_montx - - movq 0+96(%rsp),%rdx - movq 8+96(%rsp),%r14 - leaq -128+96(%rsp),%rsi - movq 16+96(%rsp),%r15 - movq 24+96(%rsp),%r8 - leaq 192(%rsp),%rdi - call __ecp_nistz256_sqr_montx - - movq 128(%rsp),%rdx - leaq 128(%rsp),%rbx - movq 0+64(%rsp),%r9 - movq 8+64(%rsp),%r10 - leaq 
-128+64(%rsp),%rsi - movq 16+64(%rsp),%r11 - movq 24+64(%rsp),%r12 - leaq 160(%rsp),%rdi - call __ecp_nistz256_mul_montx - - movq 320(%rsp),%rdx - leaq 320(%rsp),%rbx - movq 0+128(%rsp),%r9 - movq 8+128(%rsp),%r10 - leaq -128+128(%rsp),%rsi - movq 16+128(%rsp),%r11 - movq 24+128(%rsp),%r12 - leaq 0(%rsp),%rdi - call __ecp_nistz256_mul_montx - - - - - xorq %r11,%r11 - addq %r12,%r12 - leaq 192(%rsp),%rsi - adcq %r13,%r13 - movq %r12,%rax - adcq %r8,%r8 - adcq %r9,%r9 - movq %r13,%rbp - adcq $0,%r11 - - subq $-1,%r12 - movq %r8,%rcx - sbbq %r14,%r13 - sbbq $0,%r8 - movq %r9,%r10 - sbbq %r15,%r9 - sbbq $0,%r11 - - cmovcq %rax,%r12 - movq 0(%rsi),%rax - cmovcq %rbp,%r13 - movq 8(%rsi),%rbp - cmovcq %rcx,%r8 - movq 16(%rsi),%rcx - cmovcq %r10,%r9 - movq 24(%rsi),%r10 - - call __ecp_nistz256_subx - - leaq 160(%rsp),%rbx - leaq 224(%rsp),%rdi - call __ecp_nistz256_sub_fromx - - movq 0+0(%rsp),%rax - movq 0+8(%rsp),%rbp - movq 0+16(%rsp),%rcx - movq 0+24(%rsp),%r10 - leaq 64(%rsp),%rdi - - call __ecp_nistz256_subx - - movq %r12,0(%rdi) - movq %r13,8(%rdi) - movq %r8,16(%rdi) - movq %r9,24(%rdi) - movq 352(%rsp),%rdx - leaq 352(%rsp),%rbx - movq 0+160(%rsp),%r9 - movq 8+160(%rsp),%r10 - leaq -128+160(%rsp),%rsi - movq 16+160(%rsp),%r11 - movq 24+160(%rsp),%r12 - leaq 32(%rsp),%rdi - call __ecp_nistz256_mul_montx - - movq 96(%rsp),%rdx - leaq 96(%rsp),%rbx - movq 0+64(%rsp),%r9 - movq 8+64(%rsp),%r10 - leaq -128+64(%rsp),%rsi - movq 16+64(%rsp),%r11 - movq 24+64(%rsp),%r12 - leaq 64(%rsp),%rdi - call __ecp_nistz256_mul_montx - - leaq 32(%rsp),%rbx - leaq 256(%rsp),%rdi - call __ecp_nistz256_sub_fromx - -.byte 102,72,15,126,199 - - movdqa %xmm5,%xmm0 - movdqa %xmm5,%xmm1 - pandn 288(%rsp),%xmm0 - movdqa %xmm5,%xmm2 - pandn 288+16(%rsp),%xmm1 - movdqa %xmm5,%xmm3 - pand .LONE_mont(%rip),%xmm2 - pand .LONE_mont+16(%rip),%xmm3 - por %xmm0,%xmm2 - por %xmm1,%xmm3 - - movdqa %xmm4,%xmm0 - movdqa %xmm4,%xmm1 - pandn %xmm2,%xmm0 - movdqa %xmm4,%xmm2 - pandn %xmm3,%xmm1 - movdqa %xmm4,%xmm3 - pand 384(%rsp),%xmm2 - pand 384+16(%rsp),%xmm3 - por %xmm0,%xmm2 - por %xmm1,%xmm3 - movdqu %xmm2,64(%rdi) - movdqu %xmm3,80(%rdi) - - movdqa %xmm5,%xmm0 - movdqa %xmm5,%xmm1 - pandn 224(%rsp),%xmm0 - movdqa %xmm5,%xmm2 - pandn 224+16(%rsp),%xmm1 - movdqa %xmm5,%xmm3 - pand 416(%rsp),%xmm2 - pand 416+16(%rsp),%xmm3 - por %xmm0,%xmm2 - por %xmm1,%xmm3 - - movdqa %xmm4,%xmm0 - movdqa %xmm4,%xmm1 - pandn %xmm2,%xmm0 - movdqa %xmm4,%xmm2 - pandn %xmm3,%xmm1 - movdqa %xmm4,%xmm3 - pand 320(%rsp),%xmm2 - pand 320+16(%rsp),%xmm3 - por %xmm0,%xmm2 - por %xmm1,%xmm3 - movdqu %xmm2,0(%rdi) - movdqu %xmm3,16(%rdi) - - movdqa %xmm5,%xmm0 - movdqa %xmm5,%xmm1 - pandn 256(%rsp),%xmm0 - movdqa %xmm5,%xmm2 - pandn 256+16(%rsp),%xmm1 - movdqa %xmm5,%xmm3 - pand 448(%rsp),%xmm2 - pand 448+16(%rsp),%xmm3 - por %xmm0,%xmm2 - por %xmm1,%xmm3 - - movdqa %xmm4,%xmm0 - movdqa %xmm4,%xmm1 - pandn %xmm2,%xmm0 - movdqa %xmm4,%xmm2 - pandn %xmm3,%xmm1 - movdqa %xmm4,%xmm3 - pand 352(%rsp),%xmm2 - pand 352+16(%rsp),%xmm3 - por %xmm0,%xmm2 - por %xmm1,%xmm3 - movdqu %xmm2,32(%rdi) - movdqu %xmm3,48(%rdi) - - leaq 480+56(%rsp),%rsi -.cfi_def_cfa %rsi,8 - movq -48(%rsi),%r15 -.cfi_restore %r15 - movq -40(%rsi),%r14 -.cfi_restore %r14 - movq -32(%rsi),%r13 -.cfi_restore %r13 - movq -24(%rsi),%r12 -.cfi_restore %r12 - movq -16(%rsi),%rbx -.cfi_restore %rbx - movq -8(%rsi),%rbp -.cfi_restore %rbp - leaq (%rsi),%rsp -.cfi_def_cfa_register %rsp -.Ladd_affinex_epilogue: - .byte 0xf3,0xc3 -.cfi_endproc -.size 
ecp_nistz256_point_add_affinex,.-ecp_nistz256_point_add_affinex diff --git a/secure/lib/libcrypto/amd64/ghash-x86_64.S b/secure/lib/libcrypto/amd64/ghash-x86_64.S index 078353528d5f..55ad7db1f240 100644 --- a/secure/lib/libcrypto/amd64/ghash-x86_64.S +++ b/secure/lib/libcrypto/amd64/ghash-x86_64.S @@ -1304,108 +1304,7 @@ gcm_ghash_clmul: .align 32 gcm_init_avx: .cfi_startproc - vzeroupper - - vmovdqu (%rsi),%xmm2 - vpshufd $78,%xmm2,%xmm2 - - - vpshufd $255,%xmm2,%xmm4 - vpsrlq $63,%xmm2,%xmm3 - vpsllq $1,%xmm2,%xmm2 - vpxor %xmm5,%xmm5,%xmm5 - vpcmpgtd %xmm4,%xmm5,%xmm5 - vpslldq $8,%xmm3,%xmm3 - vpor %xmm3,%xmm2,%xmm2 - - - vpand .L0x1c2_polynomial(%rip),%xmm5,%xmm5 - vpxor %xmm5,%xmm2,%xmm2 - - vpunpckhqdq %xmm2,%xmm2,%xmm6 - vmovdqa %xmm2,%xmm0 - vpxor %xmm2,%xmm6,%xmm6 - movq $4,%r10 - jmp .Linit_start_avx -.align 32 -.Linit_loop_avx: - vpalignr $8,%xmm3,%xmm4,%xmm5 - vmovdqu %xmm5,-16(%rdi) - vpunpckhqdq %xmm0,%xmm0,%xmm3 - vpxor %xmm0,%xmm3,%xmm3 - vpclmulqdq $0x11,%xmm2,%xmm0,%xmm1 - vpclmulqdq $0x00,%xmm2,%xmm0,%xmm0 - vpclmulqdq $0x00,%xmm6,%xmm3,%xmm3 - vpxor %xmm0,%xmm1,%xmm4 - vpxor %xmm4,%xmm3,%xmm3 - - vpslldq $8,%xmm3,%xmm4 - vpsrldq $8,%xmm3,%xmm3 - vpxor %xmm4,%xmm0,%xmm0 - vpxor %xmm3,%xmm1,%xmm1 - vpsllq $57,%xmm0,%xmm3 - vpsllq $62,%xmm0,%xmm4 - vpxor %xmm3,%xmm4,%xmm4 - vpsllq $63,%xmm0,%xmm3 - vpxor %xmm3,%xmm4,%xmm4 - vpslldq $8,%xmm4,%xmm3 - vpsrldq $8,%xmm4,%xmm4 - vpxor %xmm3,%xmm0,%xmm0 - vpxor %xmm4,%xmm1,%xmm1 - - vpsrlq $1,%xmm0,%xmm4 - vpxor %xmm0,%xmm1,%xmm1 - vpxor %xmm4,%xmm0,%xmm0 - vpsrlq $5,%xmm4,%xmm4 - vpxor %xmm4,%xmm0,%xmm0 - vpsrlq $1,%xmm0,%xmm0 - vpxor %xmm1,%xmm0,%xmm0 -.Linit_start_avx: - vmovdqa %xmm0,%xmm5 - vpunpckhqdq %xmm0,%xmm0,%xmm3 - vpxor %xmm0,%xmm3,%xmm3 - vpclmulqdq $0x11,%xmm2,%xmm0,%xmm1 - vpclmulqdq $0x00,%xmm2,%xmm0,%xmm0 - vpclmulqdq $0x00,%xmm6,%xmm3,%xmm3 - vpxor %xmm0,%xmm1,%xmm4 - vpxor %xmm4,%xmm3,%xmm3 - - vpslldq $8,%xmm3,%xmm4 - vpsrldq $8,%xmm3,%xmm3 - vpxor %xmm4,%xmm0,%xmm0 - vpxor %xmm3,%xmm1,%xmm1 - vpsllq $57,%xmm0,%xmm3 - vpsllq $62,%xmm0,%xmm4 - vpxor %xmm3,%xmm4,%xmm4 - vpsllq $63,%xmm0,%xmm3 - vpxor %xmm3,%xmm4,%xmm4 - vpslldq $8,%xmm4,%xmm3 - vpsrldq $8,%xmm4,%xmm4 - vpxor %xmm3,%xmm0,%xmm0 - vpxor %xmm4,%xmm1,%xmm1 - - vpsrlq $1,%xmm0,%xmm4 - vpxor %xmm0,%xmm1,%xmm1 - vpxor %xmm4,%xmm0,%xmm0 - vpsrlq $5,%xmm4,%xmm4 - vpxor %xmm4,%xmm0,%xmm0 - vpsrlq $1,%xmm0,%xmm0 - vpxor %xmm1,%xmm0,%xmm0 - vpshufd $78,%xmm5,%xmm3 - vpshufd $78,%xmm0,%xmm4 - vpxor %xmm5,%xmm3,%xmm3 - vmovdqu %xmm5,0(%rdi) - vpxor %xmm0,%xmm4,%xmm4 - vmovdqu %xmm0,16(%rdi) - leaq 48(%rdi),%rdi - subq $1,%r10 - jnz .Linit_loop_avx - - vpalignr $8,%xmm4,%xmm3,%xmm5 - vmovdqu %xmm5,-16(%rdi) - - vzeroupper - .byte 0xf3,0xc3 + jmp .L_init_clmul .cfi_endproc .size gcm_init_avx,.-gcm_init_avx .globl gcm_gmult_avx @@ -1421,377 +1320,7 @@ gcm_gmult_avx: .align 32 gcm_ghash_avx: .cfi_startproc - vzeroupper - - vmovdqu (%rdi),%xmm10 - leaq .L0x1c2_polynomial(%rip),%r10 - leaq 64(%rsi),%rsi - vmovdqu .Lbswap_mask(%rip),%xmm13 - vpshufb %xmm13,%xmm10,%xmm10 - cmpq $0x80,%rcx - jb .Lshort_avx - subq $0x80,%rcx - - vmovdqu 112(%rdx),%xmm14 - vmovdqu 0-64(%rsi),%xmm6 - vpshufb %xmm13,%xmm14,%xmm14 - vmovdqu 32-64(%rsi),%xmm7 - - vpunpckhqdq %xmm14,%xmm14,%xmm9 - vmovdqu 96(%rdx),%xmm15 - vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0 - vpxor %xmm14,%xmm9,%xmm9 - vpshufb %xmm13,%xmm15,%xmm15 - vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1 - vmovdqu 16-64(%rsi),%xmm6 - vpunpckhqdq %xmm15,%xmm15,%xmm8 - vmovdqu 80(%rdx),%xmm14 - vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2 - vpxor 
%xmm15,%xmm8,%xmm8 - - vpshufb %xmm13,%xmm14,%xmm14 - vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3 - vpunpckhqdq %xmm14,%xmm14,%xmm9 - vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4 - vmovdqu 48-64(%rsi),%xmm6 - vpxor %xmm14,%xmm9,%xmm9 - vmovdqu 64(%rdx),%xmm15 - vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5 - vmovdqu 80-64(%rsi),%xmm7 - - vpshufb %xmm13,%xmm15,%xmm15 - vpxor %xmm0,%xmm3,%xmm3 - vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0 - vpxor %xmm1,%xmm4,%xmm4 - vpunpckhqdq %xmm15,%xmm15,%xmm8 - vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1 - vmovdqu 64-64(%rsi),%xmm6 - vpxor %xmm2,%xmm5,%xmm5 - vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2 - vpxor %xmm15,%xmm8,%xmm8 - - vmovdqu 48(%rdx),%xmm14 - vpxor %xmm3,%xmm0,%xmm0 - vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3 - vpxor %xmm4,%xmm1,%xmm1 - vpshufb %xmm13,%xmm14,%xmm14 - vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4 - vmovdqu 96-64(%rsi),%xmm6 - vpxor %xmm5,%xmm2,%xmm2 - vpunpckhqdq %xmm14,%xmm14,%xmm9 - vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5 - vmovdqu 128-64(%rsi),%xmm7 - vpxor %xmm14,%xmm9,%xmm9 - - vmovdqu 32(%rdx),%xmm15 - vpxor %xmm0,%xmm3,%xmm3 - vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0 - vpxor %xmm1,%xmm4,%xmm4 - vpshufb %xmm13,%xmm15,%xmm15 - vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1 - vmovdqu 112-64(%rsi),%xmm6 - vpxor %xmm2,%xmm5,%xmm5 - vpunpckhqdq %xmm15,%xmm15,%xmm8 - vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2 - vpxor %xmm15,%xmm8,%xmm8 - - vmovdqu 16(%rdx),%xmm14 - vpxor %xmm3,%xmm0,%xmm0 - vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3 - vpxor %xmm4,%xmm1,%xmm1 - vpshufb %xmm13,%xmm14,%xmm14 - vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4 - vmovdqu 144-64(%rsi),%xmm6 - vpxor %xmm5,%xmm2,%xmm2 - vpunpckhqdq %xmm14,%xmm14,%xmm9 - vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5 - vmovdqu 176-64(%rsi),%xmm7 - vpxor %xmm14,%xmm9,%xmm9 - - vmovdqu (%rdx),%xmm15 - vpxor %xmm0,%xmm3,%xmm3 - vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0 - vpxor %xmm1,%xmm4,%xmm4 - vpshufb %xmm13,%xmm15,%xmm15 - vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1 - vmovdqu 160-64(%rsi),%xmm6 - vpxor %xmm2,%xmm5,%xmm5 - vpclmulqdq $0x10,%xmm7,%xmm9,%xmm2 - - leaq 128(%rdx),%rdx - cmpq $0x80,%rcx - jb .Ltail_avx - - vpxor %xmm10,%xmm15,%xmm15 - subq $0x80,%rcx - jmp .Loop8x_avx - -.align 32 -.Loop8x_avx: - vpunpckhqdq %xmm15,%xmm15,%xmm8 - vmovdqu 112(%rdx),%xmm14 - vpxor %xmm0,%xmm3,%xmm3 - vpxor %xmm15,%xmm8,%xmm8 - vpclmulqdq $0x00,%xmm6,%xmm15,%xmm10 - vpshufb %xmm13,%xmm14,%xmm14 - vpxor %xmm1,%xmm4,%xmm4 - vpclmulqdq $0x11,%xmm6,%xmm15,%xmm11 - vmovdqu 0-64(%rsi),%xmm6 - vpunpckhqdq %xmm14,%xmm14,%xmm9 - vpxor %xmm2,%xmm5,%xmm5 - vpclmulqdq $0x00,%xmm7,%xmm8,%xmm12 - vmovdqu 32-64(%rsi),%xmm7 - vpxor %xmm14,%xmm9,%xmm9 - - vmovdqu 96(%rdx),%xmm15 - vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0 - vpxor %xmm3,%xmm10,%xmm10 - vpshufb %xmm13,%xmm15,%xmm15 - vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1 - vxorps %xmm4,%xmm11,%xmm11 - vmovdqu 16-64(%rsi),%xmm6 - vpunpckhqdq %xmm15,%xmm15,%xmm8 - vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2 - vpxor %xmm5,%xmm12,%xmm12 - vxorps %xmm15,%xmm8,%xmm8 - - vmovdqu 80(%rdx),%xmm14 - vpxor %xmm10,%xmm12,%xmm12 - vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3 - vpxor %xmm11,%xmm12,%xmm12 - vpslldq $8,%xmm12,%xmm9 - vpxor %xmm0,%xmm3,%xmm3 - vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4 - vpsrldq $8,%xmm12,%xmm12 - vpxor %xmm9,%xmm10,%xmm10 - vmovdqu 48-64(%rsi),%xmm6 - vpshufb %xmm13,%xmm14,%xmm14 - vxorps %xmm12,%xmm11,%xmm11 - vpxor %xmm1,%xmm4,%xmm4 - vpunpckhqdq %xmm14,%xmm14,%xmm9 - vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5 - vmovdqu 80-64(%rsi),%xmm7 - vpxor %xmm14,%xmm9,%xmm9 - vpxor %xmm2,%xmm5,%xmm5 - - vmovdqu 64(%rdx),%xmm15 - vpalignr $8,%xmm10,%xmm10,%xmm12 - vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0 - vpshufb 
%xmm13,%xmm15,%xmm15 - vpxor %xmm3,%xmm0,%xmm0 - vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1 - vmovdqu 64-64(%rsi),%xmm6 - vpunpckhqdq %xmm15,%xmm15,%xmm8 - vpxor %xmm4,%xmm1,%xmm1 - vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2 - vxorps %xmm15,%xmm8,%xmm8 - vpxor %xmm5,%xmm2,%xmm2 - - vmovdqu 48(%rdx),%xmm14 - vpclmulqdq $0x10,(%r10),%xmm10,%xmm10 - vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3 - vpshufb %xmm13,%xmm14,%xmm14 - vpxor %xmm0,%xmm3,%xmm3 - vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4 - vmovdqu 96-64(%rsi),%xmm6 - vpunpckhqdq %xmm14,%xmm14,%xmm9 - vpxor %xmm1,%xmm4,%xmm4 - vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5 - vmovdqu 128-64(%rsi),%xmm7 - vpxor %xmm14,%xmm9,%xmm9 - vpxor %xmm2,%xmm5,%xmm5 - - vmovdqu 32(%rdx),%xmm15 - vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0 - vpshufb %xmm13,%xmm15,%xmm15 - vpxor %xmm3,%xmm0,%xmm0 - vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1 - vmovdqu 112-64(%rsi),%xmm6 - vpunpckhqdq %xmm15,%xmm15,%xmm8 - vpxor %xmm4,%xmm1,%xmm1 - vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2 - vpxor %xmm15,%xmm8,%xmm8 - vpxor %xmm5,%xmm2,%xmm2 - vxorps %xmm12,%xmm10,%xmm10 - - vmovdqu 16(%rdx),%xmm14 - vpalignr $8,%xmm10,%xmm10,%xmm12 - vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3 - vpshufb %xmm13,%xmm14,%xmm14 - vpxor %xmm0,%xmm3,%xmm3 - vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4 - vmovdqu 144-64(%rsi),%xmm6 - vpclmulqdq $0x10,(%r10),%xmm10,%xmm10 - vxorps %xmm11,%xmm12,%xmm12 - vpunpckhqdq %xmm14,%xmm14,%xmm9 - vpxor %xmm1,%xmm4,%xmm4 - vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5 - vmovdqu 176-64(%rsi),%xmm7 - vpxor %xmm14,%xmm9,%xmm9 - vpxor %xmm2,%xmm5,%xmm5 - - vmovdqu (%rdx),%xmm15 - vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0 - vpshufb %xmm13,%xmm15,%xmm15 - vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1 - vmovdqu 160-64(%rsi),%xmm6 - vpxor %xmm12,%xmm15,%xmm15 - vpclmulqdq $0x10,%xmm7,%xmm9,%xmm2 - vpxor %xmm10,%xmm15,%xmm15 - - leaq 128(%rdx),%rdx - subq $0x80,%rcx - jnc .Loop8x_avx - - addq $0x80,%rcx - jmp .Ltail_no_xor_avx - -.align 32 -.Lshort_avx: - vmovdqu -16(%rdx,%rcx,1),%xmm14 - leaq (%rdx,%rcx,1),%rdx - vmovdqu 0-64(%rsi),%xmm6 - vmovdqu 32-64(%rsi),%xmm7 - vpshufb %xmm13,%xmm14,%xmm15 - - vmovdqa %xmm0,%xmm3 - vmovdqa %xmm1,%xmm4 - vmovdqa %xmm2,%xmm5 - subq $0x10,%rcx - jz .Ltail_avx - - vpunpckhqdq %xmm15,%xmm15,%xmm8 - vpxor %xmm0,%xmm3,%xmm3 - vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0 - vpxor %xmm15,%xmm8,%xmm8 - vmovdqu -32(%rdx),%xmm14 - vpxor %xmm1,%xmm4,%xmm4 - vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1 - vmovdqu 16-64(%rsi),%xmm6 - vpshufb %xmm13,%xmm14,%xmm15 - vpxor %xmm2,%xmm5,%xmm5 - vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2 - vpsrldq $8,%xmm7,%xmm7 - subq $0x10,%rcx - jz .Ltail_avx - - vpunpckhqdq %xmm15,%xmm15,%xmm8 - vpxor %xmm0,%xmm3,%xmm3 - vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0 - vpxor %xmm15,%xmm8,%xmm8 - vmovdqu -48(%rdx),%xmm14 - vpxor %xmm1,%xmm4,%xmm4 - vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1 - vmovdqu 48-64(%rsi),%xmm6 - vpshufb %xmm13,%xmm14,%xmm15 - vpxor %xmm2,%xmm5,%xmm5 - vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2 - vmovdqu 80-64(%rsi),%xmm7 - subq $0x10,%rcx - jz .Ltail_avx - - vpunpckhqdq %xmm15,%xmm15,%xmm8 - vpxor %xmm0,%xmm3,%xmm3 - vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0 - vpxor %xmm15,%xmm8,%xmm8 - vmovdqu -64(%rdx),%xmm14 - vpxor %xmm1,%xmm4,%xmm4 - vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1 - vmovdqu 64-64(%rsi),%xmm6 - vpshufb %xmm13,%xmm14,%xmm15 - vpxor %xmm2,%xmm5,%xmm5 - vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2 - vpsrldq $8,%xmm7,%xmm7 - subq $0x10,%rcx - jz .Ltail_avx - - vpunpckhqdq %xmm15,%xmm15,%xmm8 - vpxor %xmm0,%xmm3,%xmm3 - vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0 - vpxor %xmm15,%xmm8,%xmm8 - vmovdqu -80(%rdx),%xmm14 - vpxor %xmm1,%xmm4,%xmm4 - vpclmulqdq 
$0x11,%xmm6,%xmm15,%xmm1 - vmovdqu 96-64(%rsi),%xmm6 - vpshufb %xmm13,%xmm14,%xmm15 - vpxor %xmm2,%xmm5,%xmm5 - vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2 - vmovdqu 128-64(%rsi),%xmm7 - subq $0x10,%rcx - jz .Ltail_avx - - vpunpckhqdq %xmm15,%xmm15,%xmm8 - vpxor %xmm0,%xmm3,%xmm3 - vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0 - vpxor %xmm15,%xmm8,%xmm8 - vmovdqu -96(%rdx),%xmm14 - vpxor %xmm1,%xmm4,%xmm4 - vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1 - vmovdqu 112-64(%rsi),%xmm6 - vpshufb %xmm13,%xmm14,%xmm15 - vpxor %xmm2,%xmm5,%xmm5 - vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2 - vpsrldq $8,%xmm7,%xmm7 - subq $0x10,%rcx - jz .Ltail_avx - - vpunpckhqdq %xmm15,%xmm15,%xmm8 - vpxor %xmm0,%xmm3,%xmm3 - vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0 - vpxor %xmm15,%xmm8,%xmm8 - vmovdqu -112(%rdx),%xmm14 - vpxor %xmm1,%xmm4,%xmm4 - vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1 - vmovdqu 144-64(%rsi),%xmm6 - vpshufb %xmm13,%xmm14,%xmm15 - vpxor %xmm2,%xmm5,%xmm5 - vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2 - vmovq 184-64(%rsi),%xmm7 - subq $0x10,%rcx - jmp .Ltail_avx - -.align 32 -.Ltail_avx: - vpxor %xmm10,%xmm15,%xmm15 -.Ltail_no_xor_avx: - vpunpckhqdq %xmm15,%xmm15,%xmm8 - vpxor %xmm0,%xmm3,%xmm3 - vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0 - vpxor %xmm15,%xmm8,%xmm8 - vpxor %xmm1,%xmm4,%xmm4 - vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1 - vpxor %xmm2,%xmm5,%xmm5 - vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2 - - vmovdqu (%r10),%xmm12 - - vpxor %xmm0,%xmm3,%xmm10 - vpxor %xmm1,%xmm4,%xmm11 - vpxor %xmm2,%xmm5,%xmm5 - - vpxor %xmm10,%xmm5,%xmm5 - vpxor %xmm11,%xmm5,%xmm5 - vpslldq $8,%xmm5,%xmm9 - vpsrldq $8,%xmm5,%xmm5 - vpxor %xmm9,%xmm10,%xmm10 - vpxor %xmm5,%xmm11,%xmm11 - - vpclmulqdq $0x10,%xmm12,%xmm10,%xmm9 - vpalignr $8,%xmm10,%xmm10,%xmm10 - vpxor %xmm9,%xmm10,%xmm10 - - vpclmulqdq $0x10,%xmm12,%xmm10,%xmm9 - vpalignr $8,%xmm10,%xmm10,%xmm10 - vpxor %xmm11,%xmm10,%xmm10 - vpxor %xmm9,%xmm10,%xmm10 - - cmpq $0,%rcx - jne .Lshort_avx - - vpshufb %xmm13,%xmm10,%xmm10 - vmovdqu %xmm10,(%rdi) - vzeroupper - .byte 0xf3,0xc3 + jmp .L_ghash_clmul .cfi_endproc .size gcm_ghash_avx,.-gcm_ghash_avx .align 64 diff --git a/secure/lib/libcrypto/amd64/keccak1600-x86_64.S b/secure/lib/libcrypto/amd64/keccak1600-x86_64.S index 582740bd2802..d36758807990 100644 --- a/secure/lib/libcrypto/amd64/keccak1600-x86_64.S +++ b/secure/lib/libcrypto/amd64/keccak1600-x86_64.S @@ -5,6 +5,7 @@ .type __KeccakF1600,@function .align 32 __KeccakF1600: +.cfi_startproc movq 60(%rdi),%rax movq 68(%rdi),%rbx movq 76(%rdi),%rcx @@ -257,6 +258,7 @@ __KeccakF1600: leaq -192(%r15),%r15 .byte 0xf3,0xc3 +.cfi_endproc .size __KeccakF1600,.-__KeccakF1600 .type KeccakF1600,@function diff --git a/secure/lib/libcrypto/amd64/poly1305-x86_64.S b/secure/lib/libcrypto/amd64/poly1305-x86_64.S index 6973743427f3..d74ee9b45052 100644 --- a/secure/lib/libcrypto/amd64/poly1305-x86_64.S +++ b/secure/lib/libcrypto/amd64/poly1305-x86_64.S @@ -14,6 +14,7 @@ .type poly1305_init,@function .align 32 poly1305_init: +.cfi_startproc xorq %rax,%rax movq %rax,0(%rdi) movq %rax,8(%rdi) @@ -24,15 +25,6 @@ poly1305_init: leaq poly1305_blocks(%rip),%r10 leaq poly1305_emit(%rip),%r11 - movq OPENSSL_ia32cap_P+4(%rip),%r9 - leaq poly1305_blocks_avx(%rip),%rax - leaq poly1305_emit_avx(%rip),%rcx - btq $28,%r9 - cmovcq %rax,%r10 - cmovcq %rcx,%r11 - leaq poly1305_blocks_avx2(%rip),%rax - btq $37,%r9 - cmovcq %rax,%r10 movq $0x0ffffffc0fffffff,%rax movq $0x0ffffffc0ffffffc,%rcx andq 0(%rsi),%rax @@ -44,6 +36,7 @@ poly1305_init: movl $1,%eax .Lno_key: .byte 0xf3,0xc3 +.cfi_endproc .size poly1305_init,.-poly1305_init .type poly1305_blocks,@function @@ 
-164,6 +157,7 @@ poly1305_blocks: .type poly1305_emit,@function .align 32 poly1305_emit: +.cfi_startproc .Lemit: movq 0(%rdi),%r8 movq 8(%rdi),%r9 @@ -184,1783 +178,15 @@ poly1305_emit: movq %rcx,8(%rsi) .byte 0xf3,0xc3 -.size poly1305_emit,.-poly1305_emit -.type __poly1305_block,@function -.align 32 -__poly1305_block: - mulq %r14 - movq %rax,%r9 - movq %r11,%rax - movq %rdx,%r10 - - mulq %r14 - movq %rax,%r14 - movq %r11,%rax - movq %rdx,%r8 - - mulq %rbx - addq %rax,%r9 - movq %r13,%rax - adcq %rdx,%r10 - - mulq %rbx - movq %rbp,%rbx - addq %rax,%r14 - adcq %rdx,%r8 - - imulq %r13,%rbx - addq %rbx,%r9 - movq %r8,%rbx - adcq $0,%r10 - - imulq %r11,%rbp - addq %r9,%rbx - movq $-4,%rax - adcq %rbp,%r10 - - andq %r10,%rax - movq %r10,%rbp - shrq $2,%r10 - andq $3,%rbp - addq %r10,%rax - addq %rax,%r14 - adcq $0,%rbx - adcq $0,%rbp - .byte 0xf3,0xc3 -.size __poly1305_block,.-__poly1305_block - -.type __poly1305_init_avx,@function -.align 32 -__poly1305_init_avx: - movq %r11,%r14 - movq %r12,%rbx - xorq %rbp,%rbp - - leaq 48+64(%rdi),%rdi - - movq %r12,%rax - call __poly1305_block - - movl $0x3ffffff,%eax - movl $0x3ffffff,%edx - movq %r14,%r8 - andl %r14d,%eax - movq %r11,%r9 - andl %r11d,%edx - movl %eax,-64(%rdi) - shrq $26,%r8 - movl %edx,-60(%rdi) - shrq $26,%r9 - - movl $0x3ffffff,%eax - movl $0x3ffffff,%edx - andl %r8d,%eax - andl %r9d,%edx - movl %eax,-48(%rdi) - leal (%rax,%rax,4),%eax - movl %edx,-44(%rdi) - leal (%rdx,%rdx,4),%edx - movl %eax,-32(%rdi) - shrq $26,%r8 - movl %edx,-28(%rdi) - shrq $26,%r9 - - movq %rbx,%rax - movq %r12,%rdx - shlq $12,%rax - shlq $12,%rdx - orq %r8,%rax - orq %r9,%rdx - andl $0x3ffffff,%eax - andl $0x3ffffff,%edx - movl %eax,-16(%rdi) - leal (%rax,%rax,4),%eax - movl %edx,-12(%rdi) - leal (%rdx,%rdx,4),%edx - movl %eax,0(%rdi) - movq %rbx,%r8 - movl %edx,4(%rdi) - movq %r12,%r9 - - movl $0x3ffffff,%eax - movl $0x3ffffff,%edx - shrq $14,%r8 - shrq $14,%r9 - andl %r8d,%eax - andl %r9d,%edx - movl %eax,16(%rdi) - leal (%rax,%rax,4),%eax - movl %edx,20(%rdi) - leal (%rdx,%rdx,4),%edx - movl %eax,32(%rdi) - shrq $26,%r8 - movl %edx,36(%rdi) - shrq $26,%r9 - - movq %rbp,%rax - shlq $24,%rax - orq %rax,%r8 - movl %r8d,48(%rdi) - leaq (%r8,%r8,4),%r8 - movl %r9d,52(%rdi) - leaq (%r9,%r9,4),%r9 - movl %r8d,64(%rdi) - movl %r9d,68(%rdi) - - movq %r12,%rax - call __poly1305_block - - movl $0x3ffffff,%eax - movq %r14,%r8 - andl %r14d,%eax - shrq $26,%r8 - movl %eax,-52(%rdi) - - movl $0x3ffffff,%edx - andl %r8d,%edx - movl %edx,-36(%rdi) - leal (%rdx,%rdx,4),%edx - shrq $26,%r8 - movl %edx,-20(%rdi) - - movq %rbx,%rax - shlq $12,%rax - orq %r8,%rax - andl $0x3ffffff,%eax - movl %eax,-4(%rdi) - leal (%rax,%rax,4),%eax - movq %rbx,%r8 - movl %eax,12(%rdi) - - movl $0x3ffffff,%edx - shrq $14,%r8 - andl %r8d,%edx - movl %edx,28(%rdi) - leal (%rdx,%rdx,4),%edx - shrq $26,%r8 - movl %edx,44(%rdi) - - movq %rbp,%rax - shlq $24,%rax - orq %rax,%r8 - movl %r8d,60(%rdi) - leaq (%r8,%r8,4),%r8 - movl %r8d,76(%rdi) - - movq %r12,%rax - call __poly1305_block - - movl $0x3ffffff,%eax - movq %r14,%r8 - andl %r14d,%eax - shrq $26,%r8 - movl %eax,-56(%rdi) - - movl $0x3ffffff,%edx - andl %r8d,%edx - movl %edx,-40(%rdi) - leal (%rdx,%rdx,4),%edx - shrq $26,%r8 - movl %edx,-24(%rdi) - - movq %rbx,%rax - shlq $12,%rax - orq %r8,%rax - andl $0x3ffffff,%eax - movl %eax,-8(%rdi) - leal (%rax,%rax,4),%eax - movq %rbx,%r8 - movl %eax,8(%rdi) - - movl $0x3ffffff,%edx - shrq $14,%r8 - andl %r8d,%edx - movl %edx,24(%rdi) - leal (%rdx,%rdx,4),%edx - shrq $26,%r8 - movl %edx,40(%rdi) - - movq 
%rbp,%rax - shlq $24,%rax - orq %rax,%r8 - movl %r8d,56(%rdi) - leaq (%r8,%r8,4),%r8 - movl %r8d,72(%rdi) - - leaq -48-64(%rdi),%rdi - .byte 0xf3,0xc3 -.size __poly1305_init_avx,.-__poly1305_init_avx - -.type poly1305_blocks_avx,@function -.align 32 -poly1305_blocks_avx: -.cfi_startproc - movl 20(%rdi),%r8d - cmpq $128,%rdx - jae .Lblocks_avx - testl %r8d,%r8d - jz .Lblocks - -.Lblocks_avx: - andq $-16,%rdx - jz .Lno_data_avx - - vzeroupper - - testl %r8d,%r8d - jz .Lbase2_64_avx - - testq $31,%rdx - jz .Leven_avx - - pushq %rbx -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbx,-16 - pushq %rbp -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbp,-24 - pushq %r12 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r12,-32 - pushq %r13 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r13,-40 - pushq %r14 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r14,-48 - pushq %r15 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r15,-56 -.Lblocks_avx_body: - - movq %rdx,%r15 - - movq 0(%rdi),%r8 - movq 8(%rdi),%r9 - movl 16(%rdi),%ebp - - movq 24(%rdi),%r11 - movq 32(%rdi),%r13 - - - movl %r8d,%r14d - andq $-2147483648,%r8 - movq %r9,%r12 - movl %r9d,%ebx - andq $-2147483648,%r9 - - shrq $6,%r8 - shlq $52,%r12 - addq %r8,%r14 - shrq $12,%rbx - shrq $18,%r9 - addq %r12,%r14 - adcq %r9,%rbx - - movq %rbp,%r8 - shlq $40,%r8 - shrq $24,%rbp - addq %r8,%rbx - adcq $0,%rbp - - movq $-4,%r9 - movq %rbp,%r8 - andq %rbp,%r9 - shrq $2,%r8 - andq $3,%rbp - addq %r9,%r8 - addq %r8,%r14 - adcq $0,%rbx - adcq $0,%rbp - - movq %r13,%r12 - movq %r13,%rax - shrq $2,%r13 - addq %r12,%r13 - - addq 0(%rsi),%r14 - adcq 8(%rsi),%rbx - leaq 16(%rsi),%rsi - adcq %rcx,%rbp - - call __poly1305_block - - testq %rcx,%rcx - jz .Lstore_base2_64_avx - - - movq %r14,%rax - movq %r14,%rdx - shrq $52,%r14 - movq %rbx,%r11 - movq %rbx,%r12 - shrq $26,%rdx - andq $0x3ffffff,%rax - shlq $12,%r11 - andq $0x3ffffff,%rdx - shrq $14,%rbx - orq %r11,%r14 - shlq $24,%rbp - andq $0x3ffffff,%r14 - shrq $40,%r12 - andq $0x3ffffff,%rbx - orq %r12,%rbp - - subq $16,%r15 - jz .Lstore_base2_26_avx - - vmovd %eax,%xmm0 - vmovd %edx,%xmm1 - vmovd %r14d,%xmm2 - vmovd %ebx,%xmm3 - vmovd %ebp,%xmm4 - jmp .Lproceed_avx - -.align 32 -.Lstore_base2_64_avx: - movq %r14,0(%rdi) - movq %rbx,8(%rdi) - movq %rbp,16(%rdi) - jmp .Ldone_avx - -.align 16 -.Lstore_base2_26_avx: - movl %eax,0(%rdi) - movl %edx,4(%rdi) - movl %r14d,8(%rdi) - movl %ebx,12(%rdi) - movl %ebp,16(%rdi) -.align 16 -.Ldone_avx: - movq 0(%rsp),%r15 -.cfi_restore %r15 - movq 8(%rsp),%r14 -.cfi_restore %r14 - movq 16(%rsp),%r13 -.cfi_restore %r13 - movq 24(%rsp),%r12 -.cfi_restore %r12 - movq 32(%rsp),%rbp -.cfi_restore %rbp - movq 40(%rsp),%rbx -.cfi_restore %rbx - leaq 48(%rsp),%rsp -.cfi_adjust_cfa_offset -48 -.Lno_data_avx: -.Lblocks_avx_epilogue: - .byte 0xf3,0xc3 -.cfi_endproc - -.align 32 -.Lbase2_64_avx: -.cfi_startproc - pushq %rbx -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbx,-16 - pushq %rbp -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbp,-24 - pushq %r12 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r12,-32 - pushq %r13 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r13,-40 - pushq %r14 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r14,-48 - pushq %r15 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r15,-56 -.Lbase2_64_avx_body: - - movq %rdx,%r15 - - movq 24(%rdi),%r11 - movq 32(%rdi),%r13 - - movq 0(%rdi),%r14 - movq 8(%rdi),%rbx - movl 16(%rdi),%ebp - - movq %r13,%r12 - movq %r13,%rax - shrq $2,%r13 - addq %r12,%r13 - - testq $31,%rdx - jz .Linit_avx - - addq 0(%rsi),%r14 - adcq 8(%rsi),%rbx - leaq 16(%rsi),%rsi - adcq %rcx,%rbp - subq $16,%r15 - - call 
__poly1305_block - -.Linit_avx: - - movq %r14,%rax - movq %r14,%rdx - shrq $52,%r14 - movq %rbx,%r8 - movq %rbx,%r9 - shrq $26,%rdx - andq $0x3ffffff,%rax - shlq $12,%r8 - andq $0x3ffffff,%rdx - shrq $14,%rbx - orq %r8,%r14 - shlq $24,%rbp - andq $0x3ffffff,%r14 - shrq $40,%r9 - andq $0x3ffffff,%rbx - orq %r9,%rbp - - vmovd %eax,%xmm0 - vmovd %edx,%xmm1 - vmovd %r14d,%xmm2 - vmovd %ebx,%xmm3 - vmovd %ebp,%xmm4 - movl $1,20(%rdi) - - call __poly1305_init_avx - -.Lproceed_avx: - movq %r15,%rdx - - movq 0(%rsp),%r15 -.cfi_restore %r15 - movq 8(%rsp),%r14 -.cfi_restore %r14 - movq 16(%rsp),%r13 -.cfi_restore %r13 - movq 24(%rsp),%r12 -.cfi_restore %r12 - movq 32(%rsp),%rbp -.cfi_restore %rbp - movq 40(%rsp),%rbx -.cfi_restore %rbx - leaq 48(%rsp),%rax - leaq 48(%rsp),%rsp -.cfi_adjust_cfa_offset -48 -.Lbase2_64_avx_epilogue: - jmp .Ldo_avx -.cfi_endproc - -.align 32 -.Leven_avx: -.cfi_startproc - vmovd 0(%rdi),%xmm0 - vmovd 4(%rdi),%xmm1 - vmovd 8(%rdi),%xmm2 - vmovd 12(%rdi),%xmm3 - vmovd 16(%rdi),%xmm4 - -.Ldo_avx: - leaq -88(%rsp),%r11 -.cfi_def_cfa %r11,0x60 - subq $0x178,%rsp - subq $64,%rdx - leaq -32(%rsi),%rax - cmovcq %rax,%rsi - - vmovdqu 48(%rdi),%xmm14 - leaq 112(%rdi),%rdi - leaq .Lconst(%rip),%rcx - - - - vmovdqu 32(%rsi),%xmm5 - vmovdqu 48(%rsi),%xmm6 - vmovdqa 64(%rcx),%xmm15 - - vpsrldq $6,%xmm5,%xmm7 - vpsrldq $6,%xmm6,%xmm8 - vpunpckhqdq %xmm6,%xmm5,%xmm9 - vpunpcklqdq %xmm6,%xmm5,%xmm5 - vpunpcklqdq %xmm8,%xmm7,%xmm8 - - vpsrlq $40,%xmm9,%xmm9 - vpsrlq $26,%xmm5,%xmm6 - vpand %xmm15,%xmm5,%xmm5 - vpsrlq $4,%xmm8,%xmm7 - vpand %xmm15,%xmm6,%xmm6 - vpsrlq $30,%xmm8,%xmm8 - vpand %xmm15,%xmm7,%xmm7 - vpand %xmm15,%xmm8,%xmm8 - vpor 32(%rcx),%xmm9,%xmm9 - - jbe .Lskip_loop_avx - - - vmovdqu -48(%rdi),%xmm11 - vmovdqu -32(%rdi),%xmm12 - vpshufd $0xEE,%xmm14,%xmm13 - vpshufd $0x44,%xmm14,%xmm10 - vmovdqa %xmm13,-144(%r11) - vmovdqa %xmm10,0(%rsp) - vpshufd $0xEE,%xmm11,%xmm14 - vmovdqu -16(%rdi),%xmm10 - vpshufd $0x44,%xmm11,%xmm11 - vmovdqa %xmm14,-128(%r11) - vmovdqa %xmm11,16(%rsp) - vpshufd $0xEE,%xmm12,%xmm13 - vmovdqu 0(%rdi),%xmm11 - vpshufd $0x44,%xmm12,%xmm12 - vmovdqa %xmm13,-112(%r11) - vmovdqa %xmm12,32(%rsp) - vpshufd $0xEE,%xmm10,%xmm14 - vmovdqu 16(%rdi),%xmm12 - vpshufd $0x44,%xmm10,%xmm10 - vmovdqa %xmm14,-96(%r11) - vmovdqa %xmm10,48(%rsp) - vpshufd $0xEE,%xmm11,%xmm13 - vmovdqu 32(%rdi),%xmm10 - vpshufd $0x44,%xmm11,%xmm11 - vmovdqa %xmm13,-80(%r11) - vmovdqa %xmm11,64(%rsp) - vpshufd $0xEE,%xmm12,%xmm14 - vmovdqu 48(%rdi),%xmm11 - vpshufd $0x44,%xmm12,%xmm12 - vmovdqa %xmm14,-64(%r11) - vmovdqa %xmm12,80(%rsp) - vpshufd $0xEE,%xmm10,%xmm13 - vmovdqu 64(%rdi),%xmm12 - vpshufd $0x44,%xmm10,%xmm10 - vmovdqa %xmm13,-48(%r11) - vmovdqa %xmm10,96(%rsp) - vpshufd $0xEE,%xmm11,%xmm14 - vpshufd $0x44,%xmm11,%xmm11 - vmovdqa %xmm14,-32(%r11) - vmovdqa %xmm11,112(%rsp) - vpshufd $0xEE,%xmm12,%xmm13 - vmovdqa 0(%rsp),%xmm14 - vpshufd $0x44,%xmm12,%xmm12 - vmovdqa %xmm13,-16(%r11) - vmovdqa %xmm12,128(%rsp) - - jmp .Loop_avx - -.align 32 -.Loop_avx: - - - - - - - - - - - - - - - - - - - - - vpmuludq %xmm5,%xmm14,%xmm10 - vpmuludq %xmm6,%xmm14,%xmm11 - vmovdqa %xmm2,32(%r11) - vpmuludq %xmm7,%xmm14,%xmm12 - vmovdqa 16(%rsp),%xmm2 - vpmuludq %xmm8,%xmm14,%xmm13 - vpmuludq %xmm9,%xmm14,%xmm14 - - vmovdqa %xmm0,0(%r11) - vpmuludq 32(%rsp),%xmm9,%xmm0 - vmovdqa %xmm1,16(%r11) - vpmuludq %xmm8,%xmm2,%xmm1 - vpaddq %xmm0,%xmm10,%xmm10 - vpaddq %xmm1,%xmm14,%xmm14 - vmovdqa %xmm3,48(%r11) - vpmuludq %xmm7,%xmm2,%xmm0 - vpmuludq %xmm6,%xmm2,%xmm1 - vpaddq %xmm0,%xmm13,%xmm13 - 
vmovdqa 48(%rsp),%xmm3 - vpaddq %xmm1,%xmm12,%xmm12 - vmovdqa %xmm4,64(%r11) - vpmuludq %xmm5,%xmm2,%xmm2 - vpmuludq %xmm7,%xmm3,%xmm0 - vpaddq %xmm2,%xmm11,%xmm11 - - vmovdqa 64(%rsp),%xmm4 - vpaddq %xmm0,%xmm14,%xmm14 - vpmuludq %xmm6,%xmm3,%xmm1 - vpmuludq %xmm5,%xmm3,%xmm3 - vpaddq %xmm1,%xmm13,%xmm13 - vmovdqa 80(%rsp),%xmm2 - vpaddq %xmm3,%xmm12,%xmm12 - vpmuludq %xmm9,%xmm4,%xmm0 - vpmuludq %xmm8,%xmm4,%xmm4 - vpaddq %xmm0,%xmm11,%xmm11 - vmovdqa 96(%rsp),%xmm3 - vpaddq %xmm4,%xmm10,%xmm10 - - vmovdqa 128(%rsp),%xmm4 - vpmuludq %xmm6,%xmm2,%xmm1 - vpmuludq %xmm5,%xmm2,%xmm2 - vpaddq %xmm1,%xmm14,%xmm14 - vpaddq %xmm2,%xmm13,%xmm13 - vpmuludq %xmm9,%xmm3,%xmm0 - vpmuludq %xmm8,%xmm3,%xmm1 - vpaddq %xmm0,%xmm12,%xmm12 - vmovdqu 0(%rsi),%xmm0 - vpaddq %xmm1,%xmm11,%xmm11 - vpmuludq %xmm7,%xmm3,%xmm3 - vpmuludq %xmm7,%xmm4,%xmm7 - vpaddq %xmm3,%xmm10,%xmm10 - - vmovdqu 16(%rsi),%xmm1 - vpaddq %xmm7,%xmm11,%xmm11 - vpmuludq %xmm8,%xmm4,%xmm8 - vpmuludq %xmm9,%xmm4,%xmm9 - vpsrldq $6,%xmm0,%xmm2 - vpaddq %xmm8,%xmm12,%xmm12 - vpaddq %xmm9,%xmm13,%xmm13 - vpsrldq $6,%xmm1,%xmm3 - vpmuludq 112(%rsp),%xmm5,%xmm9 - vpmuludq %xmm6,%xmm4,%xmm5 - vpunpckhqdq %xmm1,%xmm0,%xmm4 - vpaddq %xmm9,%xmm14,%xmm14 - vmovdqa -144(%r11),%xmm9 - vpaddq %xmm5,%xmm10,%xmm10 - - vpunpcklqdq %xmm1,%xmm0,%xmm0 - vpunpcklqdq %xmm3,%xmm2,%xmm3 - - - vpsrldq $5,%xmm4,%xmm4 - vpsrlq $26,%xmm0,%xmm1 - vpand %xmm15,%xmm0,%xmm0 - vpsrlq $4,%xmm3,%xmm2 - vpand %xmm15,%xmm1,%xmm1 - vpand 0(%rcx),%xmm4,%xmm4 - vpsrlq $30,%xmm3,%xmm3 - vpand %xmm15,%xmm2,%xmm2 - vpand %xmm15,%xmm3,%xmm3 - vpor 32(%rcx),%xmm4,%xmm4 - - vpaddq 0(%r11),%xmm0,%xmm0 - vpaddq 16(%r11),%xmm1,%xmm1 - vpaddq 32(%r11),%xmm2,%xmm2 - vpaddq 48(%r11),%xmm3,%xmm3 - vpaddq 64(%r11),%xmm4,%xmm4 - - leaq 32(%rsi),%rax - leaq 64(%rsi),%rsi - subq $64,%rdx - cmovcq %rax,%rsi - - - - - - - - - - - vpmuludq %xmm0,%xmm9,%xmm5 - vpmuludq %xmm1,%xmm9,%xmm6 - vpaddq %xmm5,%xmm10,%xmm10 - vpaddq %xmm6,%xmm11,%xmm11 - vmovdqa -128(%r11),%xmm7 - vpmuludq %xmm2,%xmm9,%xmm5 - vpmuludq %xmm3,%xmm9,%xmm6 - vpaddq %xmm5,%xmm12,%xmm12 - vpaddq %xmm6,%xmm13,%xmm13 - vpmuludq %xmm4,%xmm9,%xmm9 - vpmuludq -112(%r11),%xmm4,%xmm5 - vpaddq %xmm9,%xmm14,%xmm14 - - vpaddq %xmm5,%xmm10,%xmm10 - vpmuludq %xmm2,%xmm7,%xmm6 - vpmuludq %xmm3,%xmm7,%xmm5 - vpaddq %xmm6,%xmm13,%xmm13 - vmovdqa -96(%r11),%xmm8 - vpaddq %xmm5,%xmm14,%xmm14 - vpmuludq %xmm1,%xmm7,%xmm6 - vpmuludq %xmm0,%xmm7,%xmm7 - vpaddq %xmm6,%xmm12,%xmm12 - vpaddq %xmm7,%xmm11,%xmm11 - - vmovdqa -80(%r11),%xmm9 - vpmuludq %xmm2,%xmm8,%xmm5 - vpmuludq %xmm1,%xmm8,%xmm6 - vpaddq %xmm5,%xmm14,%xmm14 - vpaddq %xmm6,%xmm13,%xmm13 - vmovdqa -64(%r11),%xmm7 - vpmuludq %xmm0,%xmm8,%xmm8 - vpmuludq %xmm4,%xmm9,%xmm5 - vpaddq %xmm8,%xmm12,%xmm12 - vpaddq %xmm5,%xmm11,%xmm11 - vmovdqa -48(%r11),%xmm8 - vpmuludq %xmm3,%xmm9,%xmm9 - vpmuludq %xmm1,%xmm7,%xmm6 - vpaddq %xmm9,%xmm10,%xmm10 - - vmovdqa -16(%r11),%xmm9 - vpaddq %xmm6,%xmm14,%xmm14 - vpmuludq %xmm0,%xmm7,%xmm7 - vpmuludq %xmm4,%xmm8,%xmm5 - vpaddq %xmm7,%xmm13,%xmm13 - vpaddq %xmm5,%xmm12,%xmm12 - vmovdqu 32(%rsi),%xmm5 - vpmuludq %xmm3,%xmm8,%xmm7 - vpmuludq %xmm2,%xmm8,%xmm8 - vpaddq %xmm7,%xmm11,%xmm11 - vmovdqu 48(%rsi),%xmm6 - vpaddq %xmm8,%xmm10,%xmm10 - - vpmuludq %xmm2,%xmm9,%xmm2 - vpmuludq %xmm3,%xmm9,%xmm3 - vpsrldq $6,%xmm5,%xmm7 - vpaddq %xmm2,%xmm11,%xmm11 - vpmuludq %xmm4,%xmm9,%xmm4 - vpsrldq $6,%xmm6,%xmm8 - vpaddq %xmm3,%xmm12,%xmm2 - vpaddq %xmm4,%xmm13,%xmm3 - vpmuludq -32(%r11),%xmm0,%xmm4 - vpmuludq %xmm1,%xmm9,%xmm0 - vpunpckhqdq %xmm6,%xmm5,%xmm9 - 
vpaddq %xmm4,%xmm14,%xmm4 - vpaddq %xmm0,%xmm10,%xmm0 - - vpunpcklqdq %xmm6,%xmm5,%xmm5 - vpunpcklqdq %xmm8,%xmm7,%xmm8 - - - vpsrldq $5,%xmm9,%xmm9 - vpsrlq $26,%xmm5,%xmm6 - vmovdqa 0(%rsp),%xmm14 - vpand %xmm15,%xmm5,%xmm5 - vpsrlq $4,%xmm8,%xmm7 - vpand %xmm15,%xmm6,%xmm6 - vpand 0(%rcx),%xmm9,%xmm9 - vpsrlq $30,%xmm8,%xmm8 - vpand %xmm15,%xmm7,%xmm7 - vpand %xmm15,%xmm8,%xmm8 - vpor 32(%rcx),%xmm9,%xmm9 - - - - - - vpsrlq $26,%xmm3,%xmm13 - vpand %xmm15,%xmm3,%xmm3 - vpaddq %xmm13,%xmm4,%xmm4 - - vpsrlq $26,%xmm0,%xmm10 - vpand %xmm15,%xmm0,%xmm0 - vpaddq %xmm10,%xmm11,%xmm1 - - vpsrlq $26,%xmm4,%xmm10 - vpand %xmm15,%xmm4,%xmm4 - - vpsrlq $26,%xmm1,%xmm11 - vpand %xmm15,%xmm1,%xmm1 - vpaddq %xmm11,%xmm2,%xmm2 - - vpaddq %xmm10,%xmm0,%xmm0 - vpsllq $2,%xmm10,%xmm10 - vpaddq %xmm10,%xmm0,%xmm0 - - vpsrlq $26,%xmm2,%xmm12 - vpand %xmm15,%xmm2,%xmm2 - vpaddq %xmm12,%xmm3,%xmm3 - - vpsrlq $26,%xmm0,%xmm10 - vpand %xmm15,%xmm0,%xmm0 - vpaddq %xmm10,%xmm1,%xmm1 - - vpsrlq $26,%xmm3,%xmm13 - vpand %xmm15,%xmm3,%xmm3 - vpaddq %xmm13,%xmm4,%xmm4 - - ja .Loop_avx - -.Lskip_loop_avx: - - - - vpshufd $0x10,%xmm14,%xmm14 - addq $32,%rdx - jnz .Long_tail_avx - - vpaddq %xmm2,%xmm7,%xmm7 - vpaddq %xmm0,%xmm5,%xmm5 - vpaddq %xmm1,%xmm6,%xmm6 - vpaddq %xmm3,%xmm8,%xmm8 - vpaddq %xmm4,%xmm9,%xmm9 - -.Long_tail_avx: - vmovdqa %xmm2,32(%r11) - vmovdqa %xmm0,0(%r11) - vmovdqa %xmm1,16(%r11) - vmovdqa %xmm3,48(%r11) - vmovdqa %xmm4,64(%r11) - - - - - - - - vpmuludq %xmm7,%xmm14,%xmm12 - vpmuludq %xmm5,%xmm14,%xmm10 - vpshufd $0x10,-48(%rdi),%xmm2 - vpmuludq %xmm6,%xmm14,%xmm11 - vpmuludq %xmm8,%xmm14,%xmm13 - vpmuludq %xmm9,%xmm14,%xmm14 - - vpmuludq %xmm8,%xmm2,%xmm0 - vpaddq %xmm0,%xmm14,%xmm14 - vpshufd $0x10,-32(%rdi),%xmm3 - vpmuludq %xmm7,%xmm2,%xmm1 - vpaddq %xmm1,%xmm13,%xmm13 - vpshufd $0x10,-16(%rdi),%xmm4 - vpmuludq %xmm6,%xmm2,%xmm0 - vpaddq %xmm0,%xmm12,%xmm12 - vpmuludq %xmm5,%xmm2,%xmm2 - vpaddq %xmm2,%xmm11,%xmm11 - vpmuludq %xmm9,%xmm3,%xmm3 - vpaddq %xmm3,%xmm10,%xmm10 - - vpshufd $0x10,0(%rdi),%xmm2 - vpmuludq %xmm7,%xmm4,%xmm1 - vpaddq %xmm1,%xmm14,%xmm14 - vpmuludq %xmm6,%xmm4,%xmm0 - vpaddq %xmm0,%xmm13,%xmm13 - vpshufd $0x10,16(%rdi),%xmm3 - vpmuludq %xmm5,%xmm4,%xmm4 - vpaddq %xmm4,%xmm12,%xmm12 - vpmuludq %xmm9,%xmm2,%xmm1 - vpaddq %xmm1,%xmm11,%xmm11 - vpshufd $0x10,32(%rdi),%xmm4 - vpmuludq %xmm8,%xmm2,%xmm2 - vpaddq %xmm2,%xmm10,%xmm10 - - vpmuludq %xmm6,%xmm3,%xmm0 - vpaddq %xmm0,%xmm14,%xmm14 - vpmuludq %xmm5,%xmm3,%xmm3 - vpaddq %xmm3,%xmm13,%xmm13 - vpshufd $0x10,48(%rdi),%xmm2 - vpmuludq %xmm9,%xmm4,%xmm1 - vpaddq %xmm1,%xmm12,%xmm12 - vpshufd $0x10,64(%rdi),%xmm3 - vpmuludq %xmm8,%xmm4,%xmm0 - vpaddq %xmm0,%xmm11,%xmm11 - vpmuludq %xmm7,%xmm4,%xmm4 - vpaddq %xmm4,%xmm10,%xmm10 - - vpmuludq %xmm5,%xmm2,%xmm2 - vpaddq %xmm2,%xmm14,%xmm14 - vpmuludq %xmm9,%xmm3,%xmm1 - vpaddq %xmm1,%xmm13,%xmm13 - vpmuludq %xmm8,%xmm3,%xmm0 - vpaddq %xmm0,%xmm12,%xmm12 - vpmuludq %xmm7,%xmm3,%xmm1 - vpaddq %xmm1,%xmm11,%xmm11 - vpmuludq %xmm6,%xmm3,%xmm3 - vpaddq %xmm3,%xmm10,%xmm10 - - jz .Lshort_tail_avx - - vmovdqu 0(%rsi),%xmm0 - vmovdqu 16(%rsi),%xmm1 - - vpsrldq $6,%xmm0,%xmm2 - vpsrldq $6,%xmm1,%xmm3 - vpunpckhqdq %xmm1,%xmm0,%xmm4 - vpunpcklqdq %xmm1,%xmm0,%xmm0 - vpunpcklqdq %xmm3,%xmm2,%xmm3 - - vpsrlq $40,%xmm4,%xmm4 - vpsrlq $26,%xmm0,%xmm1 - vpand %xmm15,%xmm0,%xmm0 - vpsrlq $4,%xmm3,%xmm2 - vpand %xmm15,%xmm1,%xmm1 - vpsrlq $30,%xmm3,%xmm3 - vpand %xmm15,%xmm2,%xmm2 - vpand %xmm15,%xmm3,%xmm3 - vpor 32(%rcx),%xmm4,%xmm4 - - vpshufd $0x32,-64(%rdi),%xmm9 - vpaddq 
0(%r11),%xmm0,%xmm0 - vpaddq 16(%r11),%xmm1,%xmm1 - vpaddq 32(%r11),%xmm2,%xmm2 - vpaddq 48(%r11),%xmm3,%xmm3 - vpaddq 64(%r11),%xmm4,%xmm4 - - - - - vpmuludq %xmm0,%xmm9,%xmm5 - vpaddq %xmm5,%xmm10,%xmm10 - vpmuludq %xmm1,%xmm9,%xmm6 - vpaddq %xmm6,%xmm11,%xmm11 - vpmuludq %xmm2,%xmm9,%xmm5 - vpaddq %xmm5,%xmm12,%xmm12 - vpshufd $0x32,-48(%rdi),%xmm7 - vpmuludq %xmm3,%xmm9,%xmm6 - vpaddq %xmm6,%xmm13,%xmm13 - vpmuludq %xmm4,%xmm9,%xmm9 - vpaddq %xmm9,%xmm14,%xmm14 - - vpmuludq %xmm3,%xmm7,%xmm5 - vpaddq %xmm5,%xmm14,%xmm14 - vpshufd $0x32,-32(%rdi),%xmm8 - vpmuludq %xmm2,%xmm7,%xmm6 - vpaddq %xmm6,%xmm13,%xmm13 - vpshufd $0x32,-16(%rdi),%xmm9 - vpmuludq %xmm1,%xmm7,%xmm5 - vpaddq %xmm5,%xmm12,%xmm12 - vpmuludq %xmm0,%xmm7,%xmm7 - vpaddq %xmm7,%xmm11,%xmm11 - vpmuludq %xmm4,%xmm8,%xmm8 - vpaddq %xmm8,%xmm10,%xmm10 - - vpshufd $0x32,0(%rdi),%xmm7 - vpmuludq %xmm2,%xmm9,%xmm6 - vpaddq %xmm6,%xmm14,%xmm14 - vpmuludq %xmm1,%xmm9,%xmm5 - vpaddq %xmm5,%xmm13,%xmm13 - vpshufd $0x32,16(%rdi),%xmm8 - vpmuludq %xmm0,%xmm9,%xmm9 - vpaddq %xmm9,%xmm12,%xmm12 - vpmuludq %xmm4,%xmm7,%xmm6 - vpaddq %xmm6,%xmm11,%xmm11 - vpshufd $0x32,32(%rdi),%xmm9 - vpmuludq %xmm3,%xmm7,%xmm7 - vpaddq %xmm7,%xmm10,%xmm10 - - vpmuludq %xmm1,%xmm8,%xmm5 - vpaddq %xmm5,%xmm14,%xmm14 - vpmuludq %xmm0,%xmm8,%xmm8 - vpaddq %xmm8,%xmm13,%xmm13 - vpshufd $0x32,48(%rdi),%xmm7 - vpmuludq %xmm4,%xmm9,%xmm6 - vpaddq %xmm6,%xmm12,%xmm12 - vpshufd $0x32,64(%rdi),%xmm8 - vpmuludq %xmm3,%xmm9,%xmm5 - vpaddq %xmm5,%xmm11,%xmm11 - vpmuludq %xmm2,%xmm9,%xmm9 - vpaddq %xmm9,%xmm10,%xmm10 - - vpmuludq %xmm0,%xmm7,%xmm7 - vpaddq %xmm7,%xmm14,%xmm14 - vpmuludq %xmm4,%xmm8,%xmm6 - vpaddq %xmm6,%xmm13,%xmm13 - vpmuludq %xmm3,%xmm8,%xmm5 - vpaddq %xmm5,%xmm12,%xmm12 - vpmuludq %xmm2,%xmm8,%xmm6 - vpaddq %xmm6,%xmm11,%xmm11 - vpmuludq %xmm1,%xmm8,%xmm8 - vpaddq %xmm8,%xmm10,%xmm10 - -.Lshort_tail_avx: - - - - vpsrldq $8,%xmm14,%xmm9 - vpsrldq $8,%xmm13,%xmm8 - vpsrldq $8,%xmm11,%xmm6 - vpsrldq $8,%xmm10,%xmm5 - vpsrldq $8,%xmm12,%xmm7 - vpaddq %xmm8,%xmm13,%xmm13 - vpaddq %xmm9,%xmm14,%xmm14 - vpaddq %xmm5,%xmm10,%xmm10 - vpaddq %xmm6,%xmm11,%xmm11 - vpaddq %xmm7,%xmm12,%xmm12 - - - - - vpsrlq $26,%xmm13,%xmm3 - vpand %xmm15,%xmm13,%xmm13 - vpaddq %xmm3,%xmm14,%xmm14 - - vpsrlq $26,%xmm10,%xmm0 - vpand %xmm15,%xmm10,%xmm10 - vpaddq %xmm0,%xmm11,%xmm11 - - vpsrlq $26,%xmm14,%xmm4 - vpand %xmm15,%xmm14,%xmm14 - - vpsrlq $26,%xmm11,%xmm1 - vpand %xmm15,%xmm11,%xmm11 - vpaddq %xmm1,%xmm12,%xmm12 - - vpaddq %xmm4,%xmm10,%xmm10 - vpsllq $2,%xmm4,%xmm4 - vpaddq %xmm4,%xmm10,%xmm10 - - vpsrlq $26,%xmm12,%xmm2 - vpand %xmm15,%xmm12,%xmm12 - vpaddq %xmm2,%xmm13,%xmm13 - - vpsrlq $26,%xmm10,%xmm0 - vpand %xmm15,%xmm10,%xmm10 - vpaddq %xmm0,%xmm11,%xmm11 - - vpsrlq $26,%xmm13,%xmm3 - vpand %xmm15,%xmm13,%xmm13 - vpaddq %xmm3,%xmm14,%xmm14 - - vmovd %xmm10,-112(%rdi) - vmovd %xmm11,-108(%rdi) - vmovd %xmm12,-104(%rdi) - vmovd %xmm13,-100(%rdi) - vmovd %xmm14,-96(%rdi) - leaq 88(%r11),%rsp -.cfi_def_cfa %rsp,8 - vzeroupper - .byte 0xf3,0xc3 -.cfi_endproc -.size poly1305_blocks_avx,.-poly1305_blocks_avx - -.type poly1305_emit_avx,@function -.align 32 -poly1305_emit_avx: - cmpl $0,20(%rdi) - je .Lemit - - movl 0(%rdi),%eax - movl 4(%rdi),%ecx - movl 8(%rdi),%r8d - movl 12(%rdi),%r11d - movl 16(%rdi),%r10d - - shlq $26,%rcx - movq %r8,%r9 - shlq $52,%r8 - addq %rcx,%rax - shrq $12,%r9 - addq %rax,%r8 - adcq $0,%r9 - - shlq $14,%r11 - movq %r10,%rax - shrq $24,%r10 - addq %r11,%r9 - shlq $40,%rax - addq %rax,%r9 - adcq $0,%r10 - - movq %r10,%rax - movq %r10,%rcx - 
andq $3,%r10 - shrq $2,%rax - andq $-4,%rcx - addq %rcx,%rax - addq %rax,%r8 - adcq $0,%r9 - adcq $0,%r10 - - movq %r8,%rax - addq $5,%r8 - movq %r9,%rcx - adcq $0,%r9 - adcq $0,%r10 - shrq $2,%r10 - cmovnzq %r8,%rax - cmovnzq %r9,%rcx - - addq 0(%rdx),%rax - adcq 8(%rdx),%rcx - movq %rax,0(%rsi) - movq %rcx,8(%rsi) - - .byte 0xf3,0xc3 -.size poly1305_emit_avx,.-poly1305_emit_avx -.type poly1305_blocks_avx2,@function -.align 32 -poly1305_blocks_avx2: -.cfi_startproc - movl 20(%rdi),%r8d - cmpq $128,%rdx - jae .Lblocks_avx2 - testl %r8d,%r8d - jz .Lblocks - -.Lblocks_avx2: - andq $-16,%rdx - jz .Lno_data_avx2 - - vzeroupper - - testl %r8d,%r8d - jz .Lbase2_64_avx2 - - testq $63,%rdx - jz .Leven_avx2 - - pushq %rbx -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbx,-16 - pushq %rbp -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbp,-24 - pushq %r12 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r12,-32 - pushq %r13 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r13,-40 - pushq %r14 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r14,-48 - pushq %r15 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r15,-56 -.Lblocks_avx2_body: - - movq %rdx,%r15 - - movq 0(%rdi),%r8 - movq 8(%rdi),%r9 - movl 16(%rdi),%ebp - - movq 24(%rdi),%r11 - movq 32(%rdi),%r13 - - - movl %r8d,%r14d - andq $-2147483648,%r8 - movq %r9,%r12 - movl %r9d,%ebx - andq $-2147483648,%r9 - - shrq $6,%r8 - shlq $52,%r12 - addq %r8,%r14 - shrq $12,%rbx - shrq $18,%r9 - addq %r12,%r14 - adcq %r9,%rbx - - movq %rbp,%r8 - shlq $40,%r8 - shrq $24,%rbp - addq %r8,%rbx - adcq $0,%rbp - - movq $-4,%r9 - movq %rbp,%r8 - andq %rbp,%r9 - shrq $2,%r8 - andq $3,%rbp - addq %r9,%r8 - addq %r8,%r14 - adcq $0,%rbx - adcq $0,%rbp - - movq %r13,%r12 - movq %r13,%rax - shrq $2,%r13 - addq %r12,%r13 - -.Lbase2_26_pre_avx2: - addq 0(%rsi),%r14 - adcq 8(%rsi),%rbx - leaq 16(%rsi),%rsi - adcq %rcx,%rbp - subq $16,%r15 - - call __poly1305_block - movq %r12,%rax - - testq $63,%r15 - jnz .Lbase2_26_pre_avx2 - - testq %rcx,%rcx - jz .Lstore_base2_64_avx2 - - - movq %r14,%rax - movq %r14,%rdx - shrq $52,%r14 - movq %rbx,%r11 - movq %rbx,%r12 - shrq $26,%rdx - andq $0x3ffffff,%rax - shlq $12,%r11 - andq $0x3ffffff,%rdx - shrq $14,%rbx - orq %r11,%r14 - shlq $24,%rbp - andq $0x3ffffff,%r14 - shrq $40,%r12 - andq $0x3ffffff,%rbx - orq %r12,%rbp - - testq %r15,%r15 - jz .Lstore_base2_26_avx2 - - vmovd %eax,%xmm0 - vmovd %edx,%xmm1 - vmovd %r14d,%xmm2 - vmovd %ebx,%xmm3 - vmovd %ebp,%xmm4 - jmp .Lproceed_avx2 - -.align 32 -.Lstore_base2_64_avx2: - movq %r14,0(%rdi) - movq %rbx,8(%rdi) - movq %rbp,16(%rdi) - jmp .Ldone_avx2 - -.align 16 -.Lstore_base2_26_avx2: - movl %eax,0(%rdi) - movl %edx,4(%rdi) - movl %r14d,8(%rdi) - movl %ebx,12(%rdi) - movl %ebp,16(%rdi) -.align 16 -.Ldone_avx2: - movq 0(%rsp),%r15 -.cfi_restore %r15 - movq 8(%rsp),%r14 -.cfi_restore %r14 - movq 16(%rsp),%r13 -.cfi_restore %r13 - movq 24(%rsp),%r12 -.cfi_restore %r12 - movq 32(%rsp),%rbp -.cfi_restore %rbp - movq 40(%rsp),%rbx -.cfi_restore %rbx - leaq 48(%rsp),%rsp -.cfi_adjust_cfa_offset -48 -.Lno_data_avx2: -.Lblocks_avx2_epilogue: - .byte 0xf3,0xc3 -.cfi_endproc - -.align 32 -.Lbase2_64_avx2: -.cfi_startproc - pushq %rbx -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbx,-16 - pushq %rbp -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbp,-24 - pushq %r12 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r12,-32 - pushq %r13 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r13,-40 - pushq %r14 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r14,-48 - pushq %r15 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r15,-56 -.Lbase2_64_avx2_body: - - movq %rdx,%r15 - - movq 24(%rdi),%r11 
- movq 32(%rdi),%r13 - - movq 0(%rdi),%r14 - movq 8(%rdi),%rbx - movl 16(%rdi),%ebp - - movq %r13,%r12 - movq %r13,%rax - shrq $2,%r13 - addq %r12,%r13 - - testq $63,%rdx - jz .Linit_avx2 - -.Lbase2_64_pre_avx2: - addq 0(%rsi),%r14 - adcq 8(%rsi),%rbx - leaq 16(%rsi),%rsi - adcq %rcx,%rbp - subq $16,%r15 - - call __poly1305_block - movq %r12,%rax - - testq $63,%r15 - jnz .Lbase2_64_pre_avx2 - -.Linit_avx2: - - movq %r14,%rax - movq %r14,%rdx - shrq $52,%r14 - movq %rbx,%r8 - movq %rbx,%r9 - shrq $26,%rdx - andq $0x3ffffff,%rax - shlq $12,%r8 - andq $0x3ffffff,%rdx - shrq $14,%rbx - orq %r8,%r14 - shlq $24,%rbp - andq $0x3ffffff,%r14 - shrq $40,%r9 - andq $0x3ffffff,%rbx - orq %r9,%rbp - - vmovd %eax,%xmm0 - vmovd %edx,%xmm1 - vmovd %r14d,%xmm2 - vmovd %ebx,%xmm3 - vmovd %ebp,%xmm4 - movl $1,20(%rdi) - - call __poly1305_init_avx - -.Lproceed_avx2: - movq %r15,%rdx - movl OPENSSL_ia32cap_P+8(%rip),%r10d - movl $3221291008,%r11d - - movq 0(%rsp),%r15 -.cfi_restore %r15 - movq 8(%rsp),%r14 -.cfi_restore %r14 - movq 16(%rsp),%r13 -.cfi_restore %r13 - movq 24(%rsp),%r12 -.cfi_restore %r12 - movq 32(%rsp),%rbp -.cfi_restore %rbp - movq 40(%rsp),%rbx -.cfi_restore %rbx - leaq 48(%rsp),%rax - leaq 48(%rsp),%rsp -.cfi_adjust_cfa_offset -48 -.Lbase2_64_avx2_epilogue: - jmp .Ldo_avx2 -.cfi_endproc - -.align 32 -.Leven_avx2: -.cfi_startproc - movl OPENSSL_ia32cap_P+8(%rip),%r10d - vmovd 0(%rdi),%xmm0 - vmovd 4(%rdi),%xmm1 - vmovd 8(%rdi),%xmm2 - vmovd 12(%rdi),%xmm3 - vmovd 16(%rdi),%xmm4 - -.Ldo_avx2: - leaq -8(%rsp),%r11 -.cfi_def_cfa %r11,16 - subq $0x128,%rsp - leaq .Lconst(%rip),%rcx - leaq 48+64(%rdi),%rdi - vmovdqa 96(%rcx),%ymm7 - - - vmovdqu -64(%rdi),%xmm9 - andq $-512,%rsp - vmovdqu -48(%rdi),%xmm10 - vmovdqu -32(%rdi),%xmm6 - vmovdqu -16(%rdi),%xmm11 - vmovdqu 0(%rdi),%xmm12 - vmovdqu 16(%rdi),%xmm13 - leaq 144(%rsp),%rax - vmovdqu 32(%rdi),%xmm14 - vpermd %ymm9,%ymm7,%ymm9 - vmovdqu 48(%rdi),%xmm15 - vpermd %ymm10,%ymm7,%ymm10 - vmovdqu 64(%rdi),%xmm5 - vpermd %ymm6,%ymm7,%ymm6 - vmovdqa %ymm9,0(%rsp) - vpermd %ymm11,%ymm7,%ymm11 - vmovdqa %ymm10,32-144(%rax) - vpermd %ymm12,%ymm7,%ymm12 - vmovdqa %ymm6,64-144(%rax) - vpermd %ymm13,%ymm7,%ymm13 - vmovdqa %ymm11,96-144(%rax) - vpermd %ymm14,%ymm7,%ymm14 - vmovdqa %ymm12,128-144(%rax) - vpermd %ymm15,%ymm7,%ymm15 - vmovdqa %ymm13,160-144(%rax) - vpermd %ymm5,%ymm7,%ymm5 - vmovdqa %ymm14,192-144(%rax) - vmovdqa %ymm15,224-144(%rax) - vmovdqa %ymm5,256-144(%rax) - vmovdqa 64(%rcx),%ymm5 - - - - vmovdqu 0(%rsi),%xmm7 - vmovdqu 16(%rsi),%xmm8 - vinserti128 $1,32(%rsi),%ymm7,%ymm7 - vinserti128 $1,48(%rsi),%ymm8,%ymm8 - leaq 64(%rsi),%rsi - - vpsrldq $6,%ymm7,%ymm9 - vpsrldq $6,%ymm8,%ymm10 - vpunpckhqdq %ymm8,%ymm7,%ymm6 - vpunpcklqdq %ymm10,%ymm9,%ymm9 - vpunpcklqdq %ymm8,%ymm7,%ymm7 - - vpsrlq $30,%ymm9,%ymm10 - vpsrlq $4,%ymm9,%ymm9 - vpsrlq $26,%ymm7,%ymm8 - vpsrlq $40,%ymm6,%ymm6 - vpand %ymm5,%ymm9,%ymm9 - vpand %ymm5,%ymm7,%ymm7 - vpand %ymm5,%ymm8,%ymm8 - vpand %ymm5,%ymm10,%ymm10 - vpor 32(%rcx),%ymm6,%ymm6 - - vpaddq %ymm2,%ymm9,%ymm2 - subq $64,%rdx - jz .Ltail_avx2 - jmp .Loop_avx2 - -.align 32 -.Loop_avx2: - - - - - - - - - vpaddq %ymm0,%ymm7,%ymm0 - vmovdqa 0(%rsp),%ymm7 - vpaddq %ymm1,%ymm8,%ymm1 - vmovdqa 32(%rsp),%ymm8 - vpaddq %ymm3,%ymm10,%ymm3 - vmovdqa 96(%rsp),%ymm9 - vpaddq %ymm4,%ymm6,%ymm4 - vmovdqa 48(%rax),%ymm10 - vmovdqa 112(%rax),%ymm5 - - - - - - - - - - - - - - - - - vpmuludq %ymm2,%ymm7,%ymm13 - vpmuludq %ymm2,%ymm8,%ymm14 - vpmuludq %ymm2,%ymm9,%ymm15 - vpmuludq %ymm2,%ymm10,%ymm11 - vpmuludq %ymm2,%ymm5,%ymm12 
- - vpmuludq %ymm0,%ymm8,%ymm6 - vpmuludq %ymm1,%ymm8,%ymm2 - vpaddq %ymm6,%ymm12,%ymm12 - vpaddq %ymm2,%ymm13,%ymm13 - vpmuludq %ymm3,%ymm8,%ymm6 - vpmuludq 64(%rsp),%ymm4,%ymm2 - vpaddq %ymm6,%ymm15,%ymm15 - vpaddq %ymm2,%ymm11,%ymm11 - vmovdqa -16(%rax),%ymm8 - - vpmuludq %ymm0,%ymm7,%ymm6 - vpmuludq %ymm1,%ymm7,%ymm2 - vpaddq %ymm6,%ymm11,%ymm11 - vpaddq %ymm2,%ymm12,%ymm12 - vpmuludq %ymm3,%ymm7,%ymm6 - vpmuludq %ymm4,%ymm7,%ymm2 - vmovdqu 0(%rsi),%xmm7 - vpaddq %ymm6,%ymm14,%ymm14 - vpaddq %ymm2,%ymm15,%ymm15 - vinserti128 $1,32(%rsi),%ymm7,%ymm7 - - vpmuludq %ymm3,%ymm8,%ymm6 - vpmuludq %ymm4,%ymm8,%ymm2 - vmovdqu 16(%rsi),%xmm8 - vpaddq %ymm6,%ymm11,%ymm11 - vpaddq %ymm2,%ymm12,%ymm12 - vmovdqa 16(%rax),%ymm2 - vpmuludq %ymm1,%ymm9,%ymm6 - vpmuludq %ymm0,%ymm9,%ymm9 - vpaddq %ymm6,%ymm14,%ymm14 - vpaddq %ymm9,%ymm13,%ymm13 - vinserti128 $1,48(%rsi),%ymm8,%ymm8 - leaq 64(%rsi),%rsi - - vpmuludq %ymm1,%ymm2,%ymm6 - vpmuludq %ymm0,%ymm2,%ymm2 - vpsrldq $6,%ymm7,%ymm9 - vpaddq %ymm6,%ymm15,%ymm15 - vpaddq %ymm2,%ymm14,%ymm14 - vpmuludq %ymm3,%ymm10,%ymm6 - vpmuludq %ymm4,%ymm10,%ymm2 - vpsrldq $6,%ymm8,%ymm10 - vpaddq %ymm6,%ymm12,%ymm12 - vpaddq %ymm2,%ymm13,%ymm13 - vpunpckhqdq %ymm8,%ymm7,%ymm6 - - vpmuludq %ymm3,%ymm5,%ymm3 - vpmuludq %ymm4,%ymm5,%ymm4 - vpunpcklqdq %ymm8,%ymm7,%ymm7 - vpaddq %ymm3,%ymm13,%ymm2 - vpaddq %ymm4,%ymm14,%ymm3 - vpunpcklqdq %ymm10,%ymm9,%ymm10 - vpmuludq 80(%rax),%ymm0,%ymm4 - vpmuludq %ymm1,%ymm5,%ymm0 - vmovdqa 64(%rcx),%ymm5 - vpaddq %ymm4,%ymm15,%ymm4 - vpaddq %ymm0,%ymm11,%ymm0 - - - - - vpsrlq $26,%ymm3,%ymm14 - vpand %ymm5,%ymm3,%ymm3 - vpaddq %ymm14,%ymm4,%ymm4 - - vpsrlq $26,%ymm0,%ymm11 - vpand %ymm5,%ymm0,%ymm0 - vpaddq %ymm11,%ymm12,%ymm1 - - vpsrlq $26,%ymm4,%ymm15 - vpand %ymm5,%ymm4,%ymm4 - - vpsrlq $4,%ymm10,%ymm9 - - vpsrlq $26,%ymm1,%ymm12 - vpand %ymm5,%ymm1,%ymm1 - vpaddq %ymm12,%ymm2,%ymm2 - - vpaddq %ymm15,%ymm0,%ymm0 - vpsllq $2,%ymm15,%ymm15 - vpaddq %ymm15,%ymm0,%ymm0 - - vpand %ymm5,%ymm9,%ymm9 - vpsrlq $26,%ymm7,%ymm8 - - vpsrlq $26,%ymm2,%ymm13 - vpand %ymm5,%ymm2,%ymm2 - vpaddq %ymm13,%ymm3,%ymm3 - - vpaddq %ymm9,%ymm2,%ymm2 - vpsrlq $30,%ymm10,%ymm10 - - vpsrlq $26,%ymm0,%ymm11 - vpand %ymm5,%ymm0,%ymm0 - vpaddq %ymm11,%ymm1,%ymm1 - - vpsrlq $40,%ymm6,%ymm6 - - vpsrlq $26,%ymm3,%ymm14 - vpand %ymm5,%ymm3,%ymm3 - vpaddq %ymm14,%ymm4,%ymm4 - - vpand %ymm5,%ymm7,%ymm7 - vpand %ymm5,%ymm8,%ymm8 - vpand %ymm5,%ymm10,%ymm10 - vpor 32(%rcx),%ymm6,%ymm6 - - subq $64,%rdx - jnz .Loop_avx2 - -.byte 0x66,0x90 -.Ltail_avx2: - - - - - - - - vpaddq %ymm0,%ymm7,%ymm0 - vmovdqu 4(%rsp),%ymm7 - vpaddq %ymm1,%ymm8,%ymm1 - vmovdqu 36(%rsp),%ymm8 - vpaddq %ymm3,%ymm10,%ymm3 - vmovdqu 100(%rsp),%ymm9 - vpaddq %ymm4,%ymm6,%ymm4 - vmovdqu 52(%rax),%ymm10 - vmovdqu 116(%rax),%ymm5 - - vpmuludq %ymm2,%ymm7,%ymm13 - vpmuludq %ymm2,%ymm8,%ymm14 - vpmuludq %ymm2,%ymm9,%ymm15 - vpmuludq %ymm2,%ymm10,%ymm11 - vpmuludq %ymm2,%ymm5,%ymm12 - - vpmuludq %ymm0,%ymm8,%ymm6 - vpmuludq %ymm1,%ymm8,%ymm2 - vpaddq %ymm6,%ymm12,%ymm12 - vpaddq %ymm2,%ymm13,%ymm13 - vpmuludq %ymm3,%ymm8,%ymm6 - vpmuludq 68(%rsp),%ymm4,%ymm2 - vpaddq %ymm6,%ymm15,%ymm15 - vpaddq %ymm2,%ymm11,%ymm11 - - vpmuludq %ymm0,%ymm7,%ymm6 - vpmuludq %ymm1,%ymm7,%ymm2 - vpaddq %ymm6,%ymm11,%ymm11 - vmovdqu -12(%rax),%ymm8 - vpaddq %ymm2,%ymm12,%ymm12 - vpmuludq %ymm3,%ymm7,%ymm6 - vpmuludq %ymm4,%ymm7,%ymm2 - vpaddq %ymm6,%ymm14,%ymm14 - vpaddq %ymm2,%ymm15,%ymm15 - - vpmuludq %ymm3,%ymm8,%ymm6 - vpmuludq %ymm4,%ymm8,%ymm2 - vpaddq %ymm6,%ymm11,%ymm11 - vpaddq %ymm2,%ymm12,%ymm12 - vmovdqu 
20(%rax),%ymm2 - vpmuludq %ymm1,%ymm9,%ymm6 - vpmuludq %ymm0,%ymm9,%ymm9 - vpaddq %ymm6,%ymm14,%ymm14 - vpaddq %ymm9,%ymm13,%ymm13 - - vpmuludq %ymm1,%ymm2,%ymm6 - vpmuludq %ymm0,%ymm2,%ymm2 - vpaddq %ymm6,%ymm15,%ymm15 - vpaddq %ymm2,%ymm14,%ymm14 - vpmuludq %ymm3,%ymm10,%ymm6 - vpmuludq %ymm4,%ymm10,%ymm2 - vpaddq %ymm6,%ymm12,%ymm12 - vpaddq %ymm2,%ymm13,%ymm13 - - vpmuludq %ymm3,%ymm5,%ymm3 - vpmuludq %ymm4,%ymm5,%ymm4 - vpaddq %ymm3,%ymm13,%ymm2 - vpaddq %ymm4,%ymm14,%ymm3 - vpmuludq 84(%rax),%ymm0,%ymm4 - vpmuludq %ymm1,%ymm5,%ymm0 - vmovdqa 64(%rcx),%ymm5 - vpaddq %ymm4,%ymm15,%ymm4 - vpaddq %ymm0,%ymm11,%ymm0 - - - - - vpsrldq $8,%ymm12,%ymm8 - vpsrldq $8,%ymm2,%ymm9 - vpsrldq $8,%ymm3,%ymm10 - vpsrldq $8,%ymm4,%ymm6 - vpsrldq $8,%ymm0,%ymm7 - vpaddq %ymm8,%ymm12,%ymm12 - vpaddq %ymm9,%ymm2,%ymm2 - vpaddq %ymm10,%ymm3,%ymm3 - vpaddq %ymm6,%ymm4,%ymm4 - vpaddq %ymm7,%ymm0,%ymm0 - - vpermq $0x2,%ymm3,%ymm10 - vpermq $0x2,%ymm4,%ymm6 - vpermq $0x2,%ymm0,%ymm7 - vpermq $0x2,%ymm12,%ymm8 - vpermq $0x2,%ymm2,%ymm9 - vpaddq %ymm10,%ymm3,%ymm3 - vpaddq %ymm6,%ymm4,%ymm4 - vpaddq %ymm7,%ymm0,%ymm0 - vpaddq %ymm8,%ymm12,%ymm12 - vpaddq %ymm9,%ymm2,%ymm2 - - - - - vpsrlq $26,%ymm3,%ymm14 - vpand %ymm5,%ymm3,%ymm3 - vpaddq %ymm14,%ymm4,%ymm4 - - vpsrlq $26,%ymm0,%ymm11 - vpand %ymm5,%ymm0,%ymm0 - vpaddq %ymm11,%ymm12,%ymm1 - - vpsrlq $26,%ymm4,%ymm15 - vpand %ymm5,%ymm4,%ymm4 - - vpsrlq $26,%ymm1,%ymm12 - vpand %ymm5,%ymm1,%ymm1 - vpaddq %ymm12,%ymm2,%ymm2 - - vpaddq %ymm15,%ymm0,%ymm0 - vpsllq $2,%ymm15,%ymm15 - vpaddq %ymm15,%ymm0,%ymm0 - - vpsrlq $26,%ymm2,%ymm13 - vpand %ymm5,%ymm2,%ymm2 - vpaddq %ymm13,%ymm3,%ymm3 - - vpsrlq $26,%ymm0,%ymm11 - vpand %ymm5,%ymm0,%ymm0 - vpaddq %ymm11,%ymm1,%ymm1 - - vpsrlq $26,%ymm3,%ymm14 - vpand %ymm5,%ymm3,%ymm3 - vpaddq %ymm14,%ymm4,%ymm4 - - vmovd %xmm0,-112(%rdi) - vmovd %xmm1,-108(%rdi) - vmovd %xmm2,-104(%rdi) - vmovd %xmm3,-100(%rdi) - vmovd %xmm4,-96(%rdi) - leaq 8(%r11),%rsp -.cfi_def_cfa %rsp,8 - vzeroupper - .byte 0xf3,0xc3 .cfi_endproc -.size poly1305_blocks_avx2,.-poly1305_blocks_avx2 -.align 64 -.Lconst: -.Lmask24: -.long 0x0ffffff,0,0x0ffffff,0,0x0ffffff,0,0x0ffffff,0 -.L129: -.long 16777216,0,16777216,0,16777216,0,16777216,0 -.Lmask26: -.long 0x3ffffff,0,0x3ffffff,0,0x3ffffff,0,0x3ffffff,0 -.Lpermd_avx2: -.long 2,2,2,3,2,0,2,1 -.Lpermd_avx512: -.long 0,0,0,1, 0,2,0,3, 0,4,0,5, 0,6,0,7 - -.L2_44_inp_permd: -.long 0,1,1,2,2,3,7,7 -.L2_44_inp_shift: -.quad 0,12,24,64 -.L2_44_mask: -.quad 0xfffffffffff,0xfffffffffff,0x3ffffffffff,0xffffffffffffffff -.L2_44_shift_rgt: -.quad 44,44,42,64 -.L2_44_shift_lft: -.quad 8,8,10,64 - -.align 64 -.Lx_mask44: -.quad 0xfffffffffff,0xfffffffffff,0xfffffffffff,0xfffffffffff -.quad 0xfffffffffff,0xfffffffffff,0xfffffffffff,0xfffffffffff -.Lx_mask42: -.quad 0x3ffffffffff,0x3ffffffffff,0x3ffffffffff,0x3ffffffffff -.quad 0x3ffffffffff,0x3ffffffffff,0x3ffffffffff,0x3ffffffffff +.size poly1305_emit,.-poly1305_emit .byte 80,111,108,121,49,51,48,53,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 .align 16 .globl xor128_encrypt_n_pad .type xor128_encrypt_n_pad,@function .align 16 xor128_encrypt_n_pad: +.cfi_startproc subq %rdx,%rsi subq %rdx,%rdi movq %rcx,%r10 @@ -2002,12 +228,14 @@ xor128_encrypt_n_pad: .Ldone_enc: movq %rdx,%rax .byte 0xf3,0xc3 +.cfi_endproc .size xor128_encrypt_n_pad,.-xor128_encrypt_n_pad .globl xor128_decrypt_n_pad .type xor128_decrypt_n_pad,@function .align 16 xor128_decrypt_n_pad: 
+.cfi_startproc subq %rdx,%rsi subq %rdx,%rdi movq %rcx,%r10 @@ -2053,4 +281,5 @@ xor128_decrypt_n_pad: .Ldone_dec: movq %rdx,%rax .byte 0xf3,0xc3 +.cfi_endproc .size xor128_decrypt_n_pad,.-xor128_decrypt_n_pad diff --git a/secure/lib/libcrypto/amd64/rc4-x86_64.S b/secure/lib/libcrypto/amd64/rc4-x86_64.S index b77714c170f3..a084e9b9c993 100644 --- a/secure/lib/libcrypto/amd64/rc4-x86_64.S +++ b/secure/lib/libcrypto/amd64/rc4-x86_64.S @@ -6,11 +6,12 @@ .globl RC4 .type RC4,@function .align 16 -RC4: orq %rsi,%rsi +RC4: +.cfi_startproc + orq %rsi,%rsi jne .Lentry .byte 0xf3,0xc3 .Lentry: -.cfi_startproc pushq %rbx .cfi_adjust_cfa_offset 8 .cfi_offset %rbx,-16 @@ -535,6 +536,7 @@ RC4: orq %rsi,%rsi .type RC4_set_key,@function .align 16 RC4_set_key: +.cfi_startproc leaq 8(%rdi),%rdi leaq (%rdx,%rsi,1),%rdx negq %rsi @@ -601,12 +603,14 @@ RC4_set_key: movl %eax,-8(%rdi) movl %eax,-4(%rdi) .byte 0xf3,0xc3 +.cfi_endproc .size RC4_set_key,.-RC4_set_key .globl RC4_options .type RC4_options,@function .align 16 RC4_options: +.cfi_startproc leaq .Lopts(%rip),%rax movl OPENSSL_ia32cap_P(%rip),%edx btl $20,%edx @@ -619,6 +623,7 @@ RC4_options: addq $12,%rax .Ldone: .byte 0xf3,0xc3 +.cfi_endproc .align 64 .Lopts: .byte 114,99,52,40,56,120,44,105,110,116,41,0 diff --git a/secure/lib/libcrypto/amd64/rsaz-avx2.S b/secure/lib/libcrypto/amd64/rsaz-avx2.S index 3075a52d2eec..e957915a7d81 100644 --- a/secure/lib/libcrypto/amd64/rsaz-avx2.S +++ b/secure/lib/libcrypto/amd64/rsaz-avx2.S @@ -2,1745 +2,26 @@ /* Do not modify. This file is auto-generated from rsaz-avx2.pl. */ .text +.globl rsaz_avx2_eligible +.type rsaz_avx2_eligible,@function +rsaz_avx2_eligible: + xorl %eax,%eax + .byte 0xf3,0xc3 +.size rsaz_avx2_eligible,.-rsaz_avx2_eligible + .globl rsaz_1024_sqr_avx2 +.globl rsaz_1024_mul_avx2 +.globl rsaz_1024_norm2red_avx2 +.globl rsaz_1024_red2norm_avx2 +.globl rsaz_1024_scatter5_avx2 +.globl rsaz_1024_gather5_avx2 .type rsaz_1024_sqr_avx2,@function -.align 64 rsaz_1024_sqr_avx2: -.cfi_startproc - leaq (%rsp),%rax -.cfi_def_cfa_register %rax - pushq %rbx -.cfi_offset %rbx,-16 - pushq %rbp -.cfi_offset %rbp,-24 - pushq %r12 -.cfi_offset %r12,-32 - pushq %r13 -.cfi_offset %r13,-40 - pushq %r14 -.cfi_offset %r14,-48 - pushq %r15 -.cfi_offset %r15,-56 - vzeroupper - movq %rax,%rbp -.cfi_def_cfa_register %rbp - movq %rdx,%r13 - subq $832,%rsp - movq %r13,%r15 - subq $-128,%rdi - subq $-128,%rsi - subq $-128,%r13 - - andq $4095,%r15 - addq $320,%r15 - shrq $12,%r15 - vpxor %ymm9,%ymm9,%ymm9 - jz .Lsqr_1024_no_n_copy - - - - - - subq $320,%rsp - vmovdqu 0-128(%r13),%ymm0 - andq $-2048,%rsp - vmovdqu 32-128(%r13),%ymm1 - vmovdqu 64-128(%r13),%ymm2 - vmovdqu 96-128(%r13),%ymm3 - vmovdqu 128-128(%r13),%ymm4 - vmovdqu 160-128(%r13),%ymm5 - vmovdqu 192-128(%r13),%ymm6 - vmovdqu 224-128(%r13),%ymm7 - vmovdqu 256-128(%r13),%ymm8 - leaq 832+128(%rsp),%r13 - vmovdqu %ymm0,0-128(%r13) - vmovdqu %ymm1,32-128(%r13) - vmovdqu %ymm2,64-128(%r13) - vmovdqu %ymm3,96-128(%r13) - vmovdqu %ymm4,128-128(%r13) - vmovdqu %ymm5,160-128(%r13) - vmovdqu %ymm6,192-128(%r13) - vmovdqu %ymm7,224-128(%r13) - vmovdqu %ymm8,256-128(%r13) - vmovdqu %ymm9,288-128(%r13) - -.Lsqr_1024_no_n_copy: - andq $-1024,%rsp - - vmovdqu 32-128(%rsi),%ymm1 - vmovdqu 64-128(%rsi),%ymm2 - vmovdqu 96-128(%rsi),%ymm3 - vmovdqu 128-128(%rsi),%ymm4 - vmovdqu 160-128(%rsi),%ymm5 - vmovdqu 192-128(%rsi),%ymm6 - vmovdqu 224-128(%rsi),%ymm7 - vmovdqu 256-128(%rsi),%ymm8 - - leaq 192(%rsp),%rbx - vmovdqu .Land_mask(%rip),%ymm15 - jmp .LOOP_GRANDE_SQR_1024 - -.align 32 
-.LOOP_GRANDE_SQR_1024: - leaq 576+128(%rsp),%r9 - leaq 448(%rsp),%r12 - - - - - vpaddq %ymm1,%ymm1,%ymm1 - vpbroadcastq 0-128(%rsi),%ymm10 - vpaddq %ymm2,%ymm2,%ymm2 - vmovdqa %ymm1,0-128(%r9) - vpaddq %ymm3,%ymm3,%ymm3 - vmovdqa %ymm2,32-128(%r9) - vpaddq %ymm4,%ymm4,%ymm4 - vmovdqa %ymm3,64-128(%r9) - vpaddq %ymm5,%ymm5,%ymm5 - vmovdqa %ymm4,96-128(%r9) - vpaddq %ymm6,%ymm6,%ymm6 - vmovdqa %ymm5,128-128(%r9) - vpaddq %ymm7,%ymm7,%ymm7 - vmovdqa %ymm6,160-128(%r9) - vpaddq %ymm8,%ymm8,%ymm8 - vmovdqa %ymm7,192-128(%r9) - vpxor %ymm9,%ymm9,%ymm9 - vmovdqa %ymm8,224-128(%r9) - - vpmuludq 0-128(%rsi),%ymm10,%ymm0 - vpbroadcastq 32-128(%rsi),%ymm11 - vmovdqu %ymm9,288-192(%rbx) - vpmuludq %ymm10,%ymm1,%ymm1 - vmovdqu %ymm9,320-448(%r12) - vpmuludq %ymm10,%ymm2,%ymm2 - vmovdqu %ymm9,352-448(%r12) - vpmuludq %ymm10,%ymm3,%ymm3 - vmovdqu %ymm9,384-448(%r12) - vpmuludq %ymm10,%ymm4,%ymm4 - vmovdqu %ymm9,416-448(%r12) - vpmuludq %ymm10,%ymm5,%ymm5 - vmovdqu %ymm9,448-448(%r12) - vpmuludq %ymm10,%ymm6,%ymm6 - vmovdqu %ymm9,480-448(%r12) - vpmuludq %ymm10,%ymm7,%ymm7 - vmovdqu %ymm9,512-448(%r12) - vpmuludq %ymm10,%ymm8,%ymm8 - vpbroadcastq 64-128(%rsi),%ymm10 - vmovdqu %ymm9,544-448(%r12) - - movq %rsi,%r15 - movl $4,%r14d - jmp .Lsqr_entry_1024 -.align 32 -.LOOP_SQR_1024: - vpbroadcastq 32-128(%r15),%ymm11 - vpmuludq 0-128(%rsi),%ymm10,%ymm0 - vpaddq 0-192(%rbx),%ymm0,%ymm0 - vpmuludq 0-128(%r9),%ymm10,%ymm1 - vpaddq 32-192(%rbx),%ymm1,%ymm1 - vpmuludq 32-128(%r9),%ymm10,%ymm2 - vpaddq 64-192(%rbx),%ymm2,%ymm2 - vpmuludq 64-128(%r9),%ymm10,%ymm3 - vpaddq 96-192(%rbx),%ymm3,%ymm3 - vpmuludq 96-128(%r9),%ymm10,%ymm4 - vpaddq 128-192(%rbx),%ymm4,%ymm4 - vpmuludq 128-128(%r9),%ymm10,%ymm5 - vpaddq 160-192(%rbx),%ymm5,%ymm5 - vpmuludq 160-128(%r9),%ymm10,%ymm6 - vpaddq 192-192(%rbx),%ymm6,%ymm6 - vpmuludq 192-128(%r9),%ymm10,%ymm7 - vpaddq 224-192(%rbx),%ymm7,%ymm7 - vpmuludq 224-128(%r9),%ymm10,%ymm8 - vpbroadcastq 64-128(%r15),%ymm10 - vpaddq 256-192(%rbx),%ymm8,%ymm8 -.Lsqr_entry_1024: - vmovdqu %ymm0,0-192(%rbx) - vmovdqu %ymm1,32-192(%rbx) - - vpmuludq 32-128(%rsi),%ymm11,%ymm12 - vpaddq %ymm12,%ymm2,%ymm2 - vpmuludq 32-128(%r9),%ymm11,%ymm14 - vpaddq %ymm14,%ymm3,%ymm3 - vpmuludq 64-128(%r9),%ymm11,%ymm13 - vpaddq %ymm13,%ymm4,%ymm4 - vpmuludq 96-128(%r9),%ymm11,%ymm12 - vpaddq %ymm12,%ymm5,%ymm5 - vpmuludq 128-128(%r9),%ymm11,%ymm14 - vpaddq %ymm14,%ymm6,%ymm6 - vpmuludq 160-128(%r9),%ymm11,%ymm13 - vpaddq %ymm13,%ymm7,%ymm7 - vpmuludq 192-128(%r9),%ymm11,%ymm12 - vpaddq %ymm12,%ymm8,%ymm8 - vpmuludq 224-128(%r9),%ymm11,%ymm0 - vpbroadcastq 96-128(%r15),%ymm11 - vpaddq 288-192(%rbx),%ymm0,%ymm0 - - vmovdqu %ymm2,64-192(%rbx) - vmovdqu %ymm3,96-192(%rbx) - - vpmuludq 64-128(%rsi),%ymm10,%ymm13 - vpaddq %ymm13,%ymm4,%ymm4 - vpmuludq 64-128(%r9),%ymm10,%ymm12 - vpaddq %ymm12,%ymm5,%ymm5 - vpmuludq 96-128(%r9),%ymm10,%ymm14 - vpaddq %ymm14,%ymm6,%ymm6 - vpmuludq 128-128(%r9),%ymm10,%ymm13 - vpaddq %ymm13,%ymm7,%ymm7 - vpmuludq 160-128(%r9),%ymm10,%ymm12 - vpaddq %ymm12,%ymm8,%ymm8 - vpmuludq 192-128(%r9),%ymm10,%ymm14 - vpaddq %ymm14,%ymm0,%ymm0 - vpmuludq 224-128(%r9),%ymm10,%ymm1 - vpbroadcastq 128-128(%r15),%ymm10 - vpaddq 320-448(%r12),%ymm1,%ymm1 - - vmovdqu %ymm4,128-192(%rbx) - vmovdqu %ymm5,160-192(%rbx) - - vpmuludq 96-128(%rsi),%ymm11,%ymm12 - vpaddq %ymm12,%ymm6,%ymm6 - vpmuludq 96-128(%r9),%ymm11,%ymm14 - vpaddq %ymm14,%ymm7,%ymm7 - vpmuludq 128-128(%r9),%ymm11,%ymm13 - vpaddq %ymm13,%ymm8,%ymm8 - vpmuludq 160-128(%r9),%ymm11,%ymm12 - vpaddq %ymm12,%ymm0,%ymm0 - vpmuludq 
192-128(%r9),%ymm11,%ymm14 - vpaddq %ymm14,%ymm1,%ymm1 - vpmuludq 224-128(%r9),%ymm11,%ymm2 - vpbroadcastq 160-128(%r15),%ymm11 - vpaddq 352-448(%r12),%ymm2,%ymm2 - - vmovdqu %ymm6,192-192(%rbx) - vmovdqu %ymm7,224-192(%rbx) - - vpmuludq 128-128(%rsi),%ymm10,%ymm12 - vpaddq %ymm12,%ymm8,%ymm8 - vpmuludq 128-128(%r9),%ymm10,%ymm14 - vpaddq %ymm14,%ymm0,%ymm0 - vpmuludq 160-128(%r9),%ymm10,%ymm13 - vpaddq %ymm13,%ymm1,%ymm1 - vpmuludq 192-128(%r9),%ymm10,%ymm12 - vpaddq %ymm12,%ymm2,%ymm2 - vpmuludq 224-128(%r9),%ymm10,%ymm3 - vpbroadcastq 192-128(%r15),%ymm10 - vpaddq 384-448(%r12),%ymm3,%ymm3 - - vmovdqu %ymm8,256-192(%rbx) - vmovdqu %ymm0,288-192(%rbx) - leaq 8(%rbx),%rbx - - vpmuludq 160-128(%rsi),%ymm11,%ymm13 - vpaddq %ymm13,%ymm1,%ymm1 - vpmuludq 160-128(%r9),%ymm11,%ymm12 - vpaddq %ymm12,%ymm2,%ymm2 - vpmuludq 192-128(%r9),%ymm11,%ymm14 - vpaddq %ymm14,%ymm3,%ymm3 - vpmuludq 224-128(%r9),%ymm11,%ymm4 - vpbroadcastq 224-128(%r15),%ymm11 - vpaddq 416-448(%r12),%ymm4,%ymm4 - - vmovdqu %ymm1,320-448(%r12) - vmovdqu %ymm2,352-448(%r12) - - vpmuludq 192-128(%rsi),%ymm10,%ymm12 - vpaddq %ymm12,%ymm3,%ymm3 - vpmuludq 192-128(%r9),%ymm10,%ymm14 - vpbroadcastq 256-128(%r15),%ymm0 - vpaddq %ymm14,%ymm4,%ymm4 - vpmuludq 224-128(%r9),%ymm10,%ymm5 - vpbroadcastq 0+8-128(%r15),%ymm10 - vpaddq 448-448(%r12),%ymm5,%ymm5 - - vmovdqu %ymm3,384-448(%r12) - vmovdqu %ymm4,416-448(%r12) - leaq 8(%r15),%r15 - - vpmuludq 224-128(%rsi),%ymm11,%ymm12 - vpaddq %ymm12,%ymm5,%ymm5 - vpmuludq 224-128(%r9),%ymm11,%ymm6 - vpaddq 480-448(%r12),%ymm6,%ymm6 - - vpmuludq 256-128(%rsi),%ymm0,%ymm7 - vmovdqu %ymm5,448-448(%r12) - vpaddq 512-448(%r12),%ymm7,%ymm7 - vmovdqu %ymm6,480-448(%r12) - vmovdqu %ymm7,512-448(%r12) - leaq 8(%r12),%r12 - - decl %r14d - jnz .LOOP_SQR_1024 - - vmovdqu 256(%rsp),%ymm8 - vmovdqu 288(%rsp),%ymm1 - vmovdqu 320(%rsp),%ymm2 - leaq 192(%rsp),%rbx - - vpsrlq $29,%ymm8,%ymm14 - vpand %ymm15,%ymm8,%ymm8 - vpsrlq $29,%ymm1,%ymm11 - vpand %ymm15,%ymm1,%ymm1 - - vpermq $0x93,%ymm14,%ymm14 - vpxor %ymm9,%ymm9,%ymm9 - vpermq $0x93,%ymm11,%ymm11 - - vpblendd $3,%ymm9,%ymm14,%ymm10 - vpblendd $3,%ymm14,%ymm11,%ymm14 - vpaddq %ymm10,%ymm8,%ymm8 - vpblendd $3,%ymm11,%ymm9,%ymm11 - vpaddq %ymm14,%ymm1,%ymm1 - vpaddq %ymm11,%ymm2,%ymm2 - vmovdqu %ymm1,288-192(%rbx) - vmovdqu %ymm2,320-192(%rbx) - - movq (%rsp),%rax - movq 8(%rsp),%r10 - movq 16(%rsp),%r11 - movq 24(%rsp),%r12 - vmovdqu 32(%rsp),%ymm1 - vmovdqu 64-192(%rbx),%ymm2 - vmovdqu 96-192(%rbx),%ymm3 - vmovdqu 128-192(%rbx),%ymm4 - vmovdqu 160-192(%rbx),%ymm5 - vmovdqu 192-192(%rbx),%ymm6 - vmovdqu 224-192(%rbx),%ymm7 - - movq %rax,%r9 - imull %ecx,%eax - andl $0x1fffffff,%eax - vmovd %eax,%xmm12 - - movq %rax,%rdx - imulq -128(%r13),%rax - vpbroadcastq %xmm12,%ymm12 - addq %rax,%r9 - movq %rdx,%rax - imulq 8-128(%r13),%rax - shrq $29,%r9 - addq %rax,%r10 - movq %rdx,%rax - imulq 16-128(%r13),%rax - addq %r9,%r10 - addq %rax,%r11 - imulq 24-128(%r13),%rdx - addq %rdx,%r12 - - movq %r10,%rax - imull %ecx,%eax - andl $0x1fffffff,%eax - - movl $9,%r14d - jmp .LOOP_REDUCE_1024 - -.align 32 -.LOOP_REDUCE_1024: - vmovd %eax,%xmm13 - vpbroadcastq %xmm13,%ymm13 - - vpmuludq 32-128(%r13),%ymm12,%ymm10 - movq %rax,%rdx - imulq -128(%r13),%rax - vpaddq %ymm10,%ymm1,%ymm1 - addq %rax,%r10 - vpmuludq 64-128(%r13),%ymm12,%ymm14 - movq %rdx,%rax - imulq 8-128(%r13),%rax - vpaddq %ymm14,%ymm2,%ymm2 - vpmuludq 96-128(%r13),%ymm12,%ymm11 -.byte 0x67 - addq %rax,%r11 -.byte 0x67 - movq %rdx,%rax - imulq 16-128(%r13),%rax - shrq $29,%r10 - vpaddq %ymm11,%ymm3,%ymm3 - 
vpmuludq 128-128(%r13),%ymm12,%ymm10 - addq %rax,%r12 - addq %r10,%r11 - vpaddq %ymm10,%ymm4,%ymm4 - vpmuludq 160-128(%r13),%ymm12,%ymm14 - movq %r11,%rax - imull %ecx,%eax - vpaddq %ymm14,%ymm5,%ymm5 - vpmuludq 192-128(%r13),%ymm12,%ymm11 - andl $0x1fffffff,%eax - vpaddq %ymm11,%ymm6,%ymm6 - vpmuludq 224-128(%r13),%ymm12,%ymm10 - vpaddq %ymm10,%ymm7,%ymm7 - vpmuludq 256-128(%r13),%ymm12,%ymm14 - vmovd %eax,%xmm12 - - vpaddq %ymm14,%ymm8,%ymm8 - - vpbroadcastq %xmm12,%ymm12 - - vpmuludq 32-8-128(%r13),%ymm13,%ymm11 - vmovdqu 96-8-128(%r13),%ymm14 - movq %rax,%rdx - imulq -128(%r13),%rax - vpaddq %ymm11,%ymm1,%ymm1 - vpmuludq 64-8-128(%r13),%ymm13,%ymm10 - vmovdqu 128-8-128(%r13),%ymm11 - addq %rax,%r11 - movq %rdx,%rax - imulq 8-128(%r13),%rax - vpaddq %ymm10,%ymm2,%ymm2 - addq %r12,%rax - shrq $29,%r11 - vpmuludq %ymm13,%ymm14,%ymm14 - vmovdqu 160-8-128(%r13),%ymm10 - addq %r11,%rax - vpaddq %ymm14,%ymm3,%ymm3 - vpmuludq %ymm13,%ymm11,%ymm11 - vmovdqu 192-8-128(%r13),%ymm14 -.byte 0x67 - movq %rax,%r12 - imull %ecx,%eax - vpaddq %ymm11,%ymm4,%ymm4 - vpmuludq %ymm13,%ymm10,%ymm10 -.byte 0xc4,0x41,0x7e,0x6f,0x9d,0x58,0x00,0x00,0x00 - andl $0x1fffffff,%eax - vpaddq %ymm10,%ymm5,%ymm5 - vpmuludq %ymm13,%ymm14,%ymm14 - vmovdqu 256-8-128(%r13),%ymm10 - vpaddq %ymm14,%ymm6,%ymm6 - vpmuludq %ymm13,%ymm11,%ymm11 - vmovdqu 288-8-128(%r13),%ymm9 - vmovd %eax,%xmm0 - imulq -128(%r13),%rax - vpaddq %ymm11,%ymm7,%ymm7 - vpmuludq %ymm13,%ymm10,%ymm10 - vmovdqu 32-16-128(%r13),%ymm14 - vpbroadcastq %xmm0,%ymm0 - vpaddq %ymm10,%ymm8,%ymm8 - vpmuludq %ymm13,%ymm9,%ymm9 - vmovdqu 64-16-128(%r13),%ymm11 - addq %rax,%r12 - - vmovdqu 32-24-128(%r13),%ymm13 - vpmuludq %ymm12,%ymm14,%ymm14 - vmovdqu 96-16-128(%r13),%ymm10 - vpaddq %ymm14,%ymm1,%ymm1 - vpmuludq %ymm0,%ymm13,%ymm13 - vpmuludq %ymm12,%ymm11,%ymm11 -.byte 0xc4,0x41,0x7e,0x6f,0xb5,0xf0,0xff,0xff,0xff - vpaddq %ymm1,%ymm13,%ymm13 - vpaddq %ymm11,%ymm2,%ymm2 - vpmuludq %ymm12,%ymm10,%ymm10 - vmovdqu 160-16-128(%r13),%ymm11 -.byte 0x67 - vmovq %xmm13,%rax - vmovdqu %ymm13,(%rsp) - vpaddq %ymm10,%ymm3,%ymm3 - vpmuludq %ymm12,%ymm14,%ymm14 - vmovdqu 192-16-128(%r13),%ymm10 - vpaddq %ymm14,%ymm4,%ymm4 - vpmuludq %ymm12,%ymm11,%ymm11 - vmovdqu 224-16-128(%r13),%ymm14 - vpaddq %ymm11,%ymm5,%ymm5 - vpmuludq %ymm12,%ymm10,%ymm10 - vmovdqu 256-16-128(%r13),%ymm11 - vpaddq %ymm10,%ymm6,%ymm6 - vpmuludq %ymm12,%ymm14,%ymm14 - shrq $29,%r12 - vmovdqu 288-16-128(%r13),%ymm10 - addq %r12,%rax - vpaddq %ymm14,%ymm7,%ymm7 - vpmuludq %ymm12,%ymm11,%ymm11 - - movq %rax,%r9 - imull %ecx,%eax - vpaddq %ymm11,%ymm8,%ymm8 - vpmuludq %ymm12,%ymm10,%ymm10 - andl $0x1fffffff,%eax - vmovd %eax,%xmm12 - vmovdqu 96-24-128(%r13),%ymm11 -.byte 0x67 - vpaddq %ymm10,%ymm9,%ymm9 - vpbroadcastq %xmm12,%ymm12 - - vpmuludq 64-24-128(%r13),%ymm0,%ymm14 - vmovdqu 128-24-128(%r13),%ymm10 - movq %rax,%rdx - imulq -128(%r13),%rax - movq 8(%rsp),%r10 - vpaddq %ymm14,%ymm2,%ymm1 - vpmuludq %ymm0,%ymm11,%ymm11 - vmovdqu 160-24-128(%r13),%ymm14 - addq %rax,%r9 - movq %rdx,%rax - imulq 8-128(%r13),%rax -.byte 0x67 - shrq $29,%r9 - movq 16(%rsp),%r11 - vpaddq %ymm11,%ymm3,%ymm2 - vpmuludq %ymm0,%ymm10,%ymm10 - vmovdqu 192-24-128(%r13),%ymm11 - addq %rax,%r10 - movq %rdx,%rax - imulq 16-128(%r13),%rax - vpaddq %ymm10,%ymm4,%ymm3 - vpmuludq %ymm0,%ymm14,%ymm14 - vmovdqu 224-24-128(%r13),%ymm10 - imulq 24-128(%r13),%rdx - addq %rax,%r11 - leaq (%r9,%r10,1),%rax - vpaddq %ymm14,%ymm5,%ymm4 - vpmuludq %ymm0,%ymm11,%ymm11 - vmovdqu 256-24-128(%r13),%ymm14 - movq %rax,%r10 - imull %ecx,%eax - vpmuludq 
%ymm0,%ymm10,%ymm10 - vpaddq %ymm11,%ymm6,%ymm5 - vmovdqu 288-24-128(%r13),%ymm11 - andl $0x1fffffff,%eax - vpaddq %ymm10,%ymm7,%ymm6 - vpmuludq %ymm0,%ymm14,%ymm14 - addq 24(%rsp),%rdx - vpaddq %ymm14,%ymm8,%ymm7 - vpmuludq %ymm0,%ymm11,%ymm11 - vpaddq %ymm11,%ymm9,%ymm8 - vmovq %r12,%xmm9 - movq %rdx,%r12 - - decl %r14d - jnz .LOOP_REDUCE_1024 - leaq 448(%rsp),%r12 - vpaddq %ymm9,%ymm13,%ymm0 - vpxor %ymm9,%ymm9,%ymm9 - - vpaddq 288-192(%rbx),%ymm0,%ymm0 - vpaddq 320-448(%r12),%ymm1,%ymm1 - vpaddq 352-448(%r12),%ymm2,%ymm2 - vpaddq 384-448(%r12),%ymm3,%ymm3 - vpaddq 416-448(%r12),%ymm4,%ymm4 - vpaddq 448-448(%r12),%ymm5,%ymm5 - vpaddq 480-448(%r12),%ymm6,%ymm6 - vpaddq 512-448(%r12),%ymm7,%ymm7 - vpaddq 544-448(%r12),%ymm8,%ymm8 - - vpsrlq $29,%ymm0,%ymm14 - vpand %ymm15,%ymm0,%ymm0 - vpsrlq $29,%ymm1,%ymm11 - vpand %ymm15,%ymm1,%ymm1 - vpsrlq $29,%ymm2,%ymm12 - vpermq $0x93,%ymm14,%ymm14 - vpand %ymm15,%ymm2,%ymm2 - vpsrlq $29,%ymm3,%ymm13 - vpermq $0x93,%ymm11,%ymm11 - vpand %ymm15,%ymm3,%ymm3 - vpermq $0x93,%ymm12,%ymm12 - - vpblendd $3,%ymm9,%ymm14,%ymm10 - vpermq $0x93,%ymm13,%ymm13 - vpblendd $3,%ymm14,%ymm11,%ymm14 - vpaddq %ymm10,%ymm0,%ymm0 - vpblendd $3,%ymm11,%ymm12,%ymm11 - vpaddq %ymm14,%ymm1,%ymm1 - vpblendd $3,%ymm12,%ymm13,%ymm12 - vpaddq %ymm11,%ymm2,%ymm2 - vpblendd $3,%ymm13,%ymm9,%ymm13 - vpaddq %ymm12,%ymm3,%ymm3 - vpaddq %ymm13,%ymm4,%ymm4 - - vpsrlq $29,%ymm0,%ymm14 - vpand %ymm15,%ymm0,%ymm0 - vpsrlq $29,%ymm1,%ymm11 - vpand %ymm15,%ymm1,%ymm1 - vpsrlq $29,%ymm2,%ymm12 - vpermq $0x93,%ymm14,%ymm14 - vpand %ymm15,%ymm2,%ymm2 - vpsrlq $29,%ymm3,%ymm13 - vpermq $0x93,%ymm11,%ymm11 - vpand %ymm15,%ymm3,%ymm3 - vpermq $0x93,%ymm12,%ymm12 - - vpblendd $3,%ymm9,%ymm14,%ymm10 - vpermq $0x93,%ymm13,%ymm13 - vpblendd $3,%ymm14,%ymm11,%ymm14 - vpaddq %ymm10,%ymm0,%ymm0 - vpblendd $3,%ymm11,%ymm12,%ymm11 - vpaddq %ymm14,%ymm1,%ymm1 - vmovdqu %ymm0,0-128(%rdi) - vpblendd $3,%ymm12,%ymm13,%ymm12 - vpaddq %ymm11,%ymm2,%ymm2 - vmovdqu %ymm1,32-128(%rdi) - vpblendd $3,%ymm13,%ymm9,%ymm13 - vpaddq %ymm12,%ymm3,%ymm3 - vmovdqu %ymm2,64-128(%rdi) - vpaddq %ymm13,%ymm4,%ymm4 - vmovdqu %ymm3,96-128(%rdi) - vpsrlq $29,%ymm4,%ymm14 - vpand %ymm15,%ymm4,%ymm4 - vpsrlq $29,%ymm5,%ymm11 - vpand %ymm15,%ymm5,%ymm5 - vpsrlq $29,%ymm6,%ymm12 - vpermq $0x93,%ymm14,%ymm14 - vpand %ymm15,%ymm6,%ymm6 - vpsrlq $29,%ymm7,%ymm13 - vpermq $0x93,%ymm11,%ymm11 - vpand %ymm15,%ymm7,%ymm7 - vpsrlq $29,%ymm8,%ymm0 - vpermq $0x93,%ymm12,%ymm12 - vpand %ymm15,%ymm8,%ymm8 - vpermq $0x93,%ymm13,%ymm13 - - vpblendd $3,%ymm9,%ymm14,%ymm10 - vpermq $0x93,%ymm0,%ymm0 - vpblendd $3,%ymm14,%ymm11,%ymm14 - vpaddq %ymm10,%ymm4,%ymm4 - vpblendd $3,%ymm11,%ymm12,%ymm11 - vpaddq %ymm14,%ymm5,%ymm5 - vpblendd $3,%ymm12,%ymm13,%ymm12 - vpaddq %ymm11,%ymm6,%ymm6 - vpblendd $3,%ymm13,%ymm0,%ymm13 - vpaddq %ymm12,%ymm7,%ymm7 - vpaddq %ymm13,%ymm8,%ymm8 - - vpsrlq $29,%ymm4,%ymm14 - vpand %ymm15,%ymm4,%ymm4 - vpsrlq $29,%ymm5,%ymm11 - vpand %ymm15,%ymm5,%ymm5 - vpsrlq $29,%ymm6,%ymm12 - vpermq $0x93,%ymm14,%ymm14 - vpand %ymm15,%ymm6,%ymm6 - vpsrlq $29,%ymm7,%ymm13 - vpermq $0x93,%ymm11,%ymm11 - vpand %ymm15,%ymm7,%ymm7 - vpsrlq $29,%ymm8,%ymm0 - vpermq $0x93,%ymm12,%ymm12 - vpand %ymm15,%ymm8,%ymm8 - vpermq $0x93,%ymm13,%ymm13 - - vpblendd $3,%ymm9,%ymm14,%ymm10 - vpermq $0x93,%ymm0,%ymm0 - vpblendd $3,%ymm14,%ymm11,%ymm14 - vpaddq %ymm10,%ymm4,%ymm4 - vpblendd $3,%ymm11,%ymm12,%ymm11 - vpaddq %ymm14,%ymm5,%ymm5 - vmovdqu %ymm4,128-128(%rdi) - vpblendd $3,%ymm12,%ymm13,%ymm12 - vpaddq %ymm11,%ymm6,%ymm6 - vmovdqu 
%ymm5,160-128(%rdi) - vpblendd $3,%ymm13,%ymm0,%ymm13 - vpaddq %ymm12,%ymm7,%ymm7 - vmovdqu %ymm6,192-128(%rdi) - vpaddq %ymm13,%ymm8,%ymm8 - vmovdqu %ymm7,224-128(%rdi) - vmovdqu %ymm8,256-128(%rdi) - - movq %rdi,%rsi - decl %r8d - jne .LOOP_GRANDE_SQR_1024 - - vzeroall - movq %rbp,%rax -.cfi_def_cfa_register %rax - movq -48(%rax),%r15 -.cfi_restore %r15 - movq -40(%rax),%r14 -.cfi_restore %r14 - movq -32(%rax),%r13 -.cfi_restore %r13 - movq -24(%rax),%r12 -.cfi_restore %r12 - movq -16(%rax),%rbp -.cfi_restore %rbp - movq -8(%rax),%rbx -.cfi_restore %rbx - leaq (%rax),%rsp -.cfi_def_cfa_register %rsp -.Lsqr_1024_epilogue: - .byte 0xf3,0xc3 -.cfi_endproc -.size rsaz_1024_sqr_avx2,.-rsaz_1024_sqr_avx2 -.globl rsaz_1024_mul_avx2 -.type rsaz_1024_mul_avx2,@function -.align 64 rsaz_1024_mul_avx2: -.cfi_startproc - leaq (%rsp),%rax -.cfi_def_cfa_register %rax - pushq %rbx -.cfi_offset %rbx,-16 - pushq %rbp -.cfi_offset %rbp,-24 - pushq %r12 -.cfi_offset %r12,-32 - pushq %r13 -.cfi_offset %r13,-40 - pushq %r14 -.cfi_offset %r14,-48 - pushq %r15 -.cfi_offset %r15,-56 - movq %rax,%rbp -.cfi_def_cfa_register %rbp - vzeroall - movq %rdx,%r13 - subq $64,%rsp - - - - - - -.byte 0x67,0x67 - movq %rsi,%r15 - andq $4095,%r15 - addq $320,%r15 - shrq $12,%r15 - movq %rsi,%r15 - cmovnzq %r13,%rsi - cmovnzq %r15,%r13 - - movq %rcx,%r15 - subq $-128,%rsi - subq $-128,%rcx - subq $-128,%rdi - - andq $4095,%r15 - addq $320,%r15 -.byte 0x67,0x67 - shrq $12,%r15 - jz .Lmul_1024_no_n_copy - - - - - - subq $320,%rsp - vmovdqu 0-128(%rcx),%ymm0 - andq $-512,%rsp - vmovdqu 32-128(%rcx),%ymm1 - vmovdqu 64-128(%rcx),%ymm2 - vmovdqu 96-128(%rcx),%ymm3 - vmovdqu 128-128(%rcx),%ymm4 - vmovdqu 160-128(%rcx),%ymm5 - vmovdqu 192-128(%rcx),%ymm6 - vmovdqu 224-128(%rcx),%ymm7 - vmovdqu 256-128(%rcx),%ymm8 - leaq 64+128(%rsp),%rcx - vmovdqu %ymm0,0-128(%rcx) - vpxor %ymm0,%ymm0,%ymm0 - vmovdqu %ymm1,32-128(%rcx) - vpxor %ymm1,%ymm1,%ymm1 - vmovdqu %ymm2,64-128(%rcx) - vpxor %ymm2,%ymm2,%ymm2 - vmovdqu %ymm3,96-128(%rcx) - vpxor %ymm3,%ymm3,%ymm3 - vmovdqu %ymm4,128-128(%rcx) - vpxor %ymm4,%ymm4,%ymm4 - vmovdqu %ymm5,160-128(%rcx) - vpxor %ymm5,%ymm5,%ymm5 - vmovdqu %ymm6,192-128(%rcx) - vpxor %ymm6,%ymm6,%ymm6 - vmovdqu %ymm7,224-128(%rcx) - vpxor %ymm7,%ymm7,%ymm7 - vmovdqu %ymm8,256-128(%rcx) - vmovdqa %ymm0,%ymm8 - vmovdqu %ymm9,288-128(%rcx) -.Lmul_1024_no_n_copy: - andq $-64,%rsp - - movq (%r13),%rbx - vpbroadcastq (%r13),%ymm10 - vmovdqu %ymm0,(%rsp) - xorq %r9,%r9 -.byte 0x67 - xorq %r10,%r10 - xorq %r11,%r11 - xorq %r12,%r12 - - vmovdqu .Land_mask(%rip),%ymm15 - movl $9,%r14d - vmovdqu %ymm9,288-128(%rdi) - jmp .Loop_mul_1024 - -.align 32 -.Loop_mul_1024: - vpsrlq $29,%ymm3,%ymm9 - movq %rbx,%rax - imulq -128(%rsi),%rax - addq %r9,%rax - movq %rbx,%r10 - imulq 8-128(%rsi),%r10 - addq 8(%rsp),%r10 - - movq %rax,%r9 - imull %r8d,%eax - andl $0x1fffffff,%eax - - movq %rbx,%r11 - imulq 16-128(%rsi),%r11 - addq 16(%rsp),%r11 - - movq %rbx,%r12 - imulq 24-128(%rsi),%r12 - addq 24(%rsp),%r12 - vpmuludq 32-128(%rsi),%ymm10,%ymm0 - vmovd %eax,%xmm11 - vpaddq %ymm0,%ymm1,%ymm1 - vpmuludq 64-128(%rsi),%ymm10,%ymm12 - vpbroadcastq %xmm11,%ymm11 - vpaddq %ymm12,%ymm2,%ymm2 - vpmuludq 96-128(%rsi),%ymm10,%ymm13 - vpand %ymm15,%ymm3,%ymm3 - vpaddq %ymm13,%ymm3,%ymm3 - vpmuludq 128-128(%rsi),%ymm10,%ymm0 - vpaddq %ymm0,%ymm4,%ymm4 - vpmuludq 160-128(%rsi),%ymm10,%ymm12 - vpaddq %ymm12,%ymm5,%ymm5 - vpmuludq 192-128(%rsi),%ymm10,%ymm13 - vpaddq %ymm13,%ymm6,%ymm6 - vpmuludq 224-128(%rsi),%ymm10,%ymm0 - vpermq $0x93,%ymm9,%ymm9 - vpaddq 
%ymm0,%ymm7,%ymm7 - vpmuludq 256-128(%rsi),%ymm10,%ymm12 - vpbroadcastq 8(%r13),%ymm10 - vpaddq %ymm12,%ymm8,%ymm8 - - movq %rax,%rdx - imulq -128(%rcx),%rax - addq %rax,%r9 - movq %rdx,%rax - imulq 8-128(%rcx),%rax - addq %rax,%r10 - movq %rdx,%rax - imulq 16-128(%rcx),%rax - addq %rax,%r11 - shrq $29,%r9 - imulq 24-128(%rcx),%rdx - addq %rdx,%r12 - addq %r9,%r10 - - vpmuludq 32-128(%rcx),%ymm11,%ymm13 - vmovq %xmm10,%rbx - vpaddq %ymm13,%ymm1,%ymm1 - vpmuludq 64-128(%rcx),%ymm11,%ymm0 - vpaddq %ymm0,%ymm2,%ymm2 - vpmuludq 96-128(%rcx),%ymm11,%ymm12 - vpaddq %ymm12,%ymm3,%ymm3 - vpmuludq 128-128(%rcx),%ymm11,%ymm13 - vpaddq %ymm13,%ymm4,%ymm4 - vpmuludq 160-128(%rcx),%ymm11,%ymm0 - vpaddq %ymm0,%ymm5,%ymm5 - vpmuludq 192-128(%rcx),%ymm11,%ymm12 - vpaddq %ymm12,%ymm6,%ymm6 - vpmuludq 224-128(%rcx),%ymm11,%ymm13 - vpblendd $3,%ymm14,%ymm9,%ymm12 - vpaddq %ymm13,%ymm7,%ymm7 - vpmuludq 256-128(%rcx),%ymm11,%ymm0 - vpaddq %ymm12,%ymm3,%ymm3 - vpaddq %ymm0,%ymm8,%ymm8 - - movq %rbx,%rax - imulq -128(%rsi),%rax - addq %rax,%r10 - vmovdqu -8+32-128(%rsi),%ymm12 - movq %rbx,%rax - imulq 8-128(%rsi),%rax - addq %rax,%r11 - vmovdqu -8+64-128(%rsi),%ymm13 - - movq %r10,%rax - vpblendd $0xfc,%ymm14,%ymm9,%ymm9 - imull %r8d,%eax - vpaddq %ymm9,%ymm4,%ymm4 - andl $0x1fffffff,%eax - - imulq 16-128(%rsi),%rbx - addq %rbx,%r12 - vpmuludq %ymm10,%ymm12,%ymm12 - vmovd %eax,%xmm11 - vmovdqu -8+96-128(%rsi),%ymm0 - vpaddq %ymm12,%ymm1,%ymm1 - vpmuludq %ymm10,%ymm13,%ymm13 - vpbroadcastq %xmm11,%ymm11 - vmovdqu -8+128-128(%rsi),%ymm12 - vpaddq %ymm13,%ymm2,%ymm2 - vpmuludq %ymm10,%ymm0,%ymm0 - vmovdqu -8+160-128(%rsi),%ymm13 - vpaddq %ymm0,%ymm3,%ymm3 - vpmuludq %ymm10,%ymm12,%ymm12 - vmovdqu -8+192-128(%rsi),%ymm0 - vpaddq %ymm12,%ymm4,%ymm4 - vpmuludq %ymm10,%ymm13,%ymm13 - vmovdqu -8+224-128(%rsi),%ymm12 - vpaddq %ymm13,%ymm5,%ymm5 - vpmuludq %ymm10,%ymm0,%ymm0 - vmovdqu -8+256-128(%rsi),%ymm13 - vpaddq %ymm0,%ymm6,%ymm6 - vpmuludq %ymm10,%ymm12,%ymm12 - vmovdqu -8+288-128(%rsi),%ymm9 - vpaddq %ymm12,%ymm7,%ymm7 - vpmuludq %ymm10,%ymm13,%ymm13 - vpaddq %ymm13,%ymm8,%ymm8 - vpmuludq %ymm10,%ymm9,%ymm9 - vpbroadcastq 16(%r13),%ymm10 - - movq %rax,%rdx - imulq -128(%rcx),%rax - addq %rax,%r10 - vmovdqu -8+32-128(%rcx),%ymm0 - movq %rdx,%rax - imulq 8-128(%rcx),%rax - addq %rax,%r11 - vmovdqu -8+64-128(%rcx),%ymm12 - shrq $29,%r10 - imulq 16-128(%rcx),%rdx - addq %rdx,%r12 - addq %r10,%r11 - - vpmuludq %ymm11,%ymm0,%ymm0 - vmovq %xmm10,%rbx - vmovdqu -8+96-128(%rcx),%ymm13 - vpaddq %ymm0,%ymm1,%ymm1 - vpmuludq %ymm11,%ymm12,%ymm12 - vmovdqu -8+128-128(%rcx),%ymm0 - vpaddq %ymm12,%ymm2,%ymm2 - vpmuludq %ymm11,%ymm13,%ymm13 - vmovdqu -8+160-128(%rcx),%ymm12 - vpaddq %ymm13,%ymm3,%ymm3 - vpmuludq %ymm11,%ymm0,%ymm0 - vmovdqu -8+192-128(%rcx),%ymm13 - vpaddq %ymm0,%ymm4,%ymm4 - vpmuludq %ymm11,%ymm12,%ymm12 - vmovdqu -8+224-128(%rcx),%ymm0 - vpaddq %ymm12,%ymm5,%ymm5 - vpmuludq %ymm11,%ymm13,%ymm13 - vmovdqu -8+256-128(%rcx),%ymm12 - vpaddq %ymm13,%ymm6,%ymm6 - vpmuludq %ymm11,%ymm0,%ymm0 - vmovdqu -8+288-128(%rcx),%ymm13 - vpaddq %ymm0,%ymm7,%ymm7 - vpmuludq %ymm11,%ymm12,%ymm12 - vpaddq %ymm12,%ymm8,%ymm8 - vpmuludq %ymm11,%ymm13,%ymm13 - vpaddq %ymm13,%ymm9,%ymm9 - - vmovdqu -16+32-128(%rsi),%ymm0 - movq %rbx,%rax - imulq -128(%rsi),%rax - addq %r11,%rax - - vmovdqu -16+64-128(%rsi),%ymm12 - movq %rax,%r11 - imull %r8d,%eax - andl $0x1fffffff,%eax - - imulq 8-128(%rsi),%rbx - addq %rbx,%r12 - vpmuludq %ymm10,%ymm0,%ymm0 - vmovd %eax,%xmm11 - vmovdqu -16+96-128(%rsi),%ymm13 - vpaddq %ymm0,%ymm1,%ymm1 - vpmuludq 
%ymm10,%ymm12,%ymm12 - vpbroadcastq %xmm11,%ymm11 - vmovdqu -16+128-128(%rsi),%ymm0 - vpaddq %ymm12,%ymm2,%ymm2 - vpmuludq %ymm10,%ymm13,%ymm13 - vmovdqu -16+160-128(%rsi),%ymm12 - vpaddq %ymm13,%ymm3,%ymm3 - vpmuludq %ymm10,%ymm0,%ymm0 - vmovdqu -16+192-128(%rsi),%ymm13 - vpaddq %ymm0,%ymm4,%ymm4 - vpmuludq %ymm10,%ymm12,%ymm12 - vmovdqu -16+224-128(%rsi),%ymm0 - vpaddq %ymm12,%ymm5,%ymm5 - vpmuludq %ymm10,%ymm13,%ymm13 - vmovdqu -16+256-128(%rsi),%ymm12 - vpaddq %ymm13,%ymm6,%ymm6 - vpmuludq %ymm10,%ymm0,%ymm0 - vmovdqu -16+288-128(%rsi),%ymm13 - vpaddq %ymm0,%ymm7,%ymm7 - vpmuludq %ymm10,%ymm12,%ymm12 - vpaddq %ymm12,%ymm8,%ymm8 - vpmuludq %ymm10,%ymm13,%ymm13 - vpbroadcastq 24(%r13),%ymm10 - vpaddq %ymm13,%ymm9,%ymm9 - - vmovdqu -16+32-128(%rcx),%ymm0 - movq %rax,%rdx - imulq -128(%rcx),%rax - addq %rax,%r11 - vmovdqu -16+64-128(%rcx),%ymm12 - imulq 8-128(%rcx),%rdx - addq %rdx,%r12 - shrq $29,%r11 - - vpmuludq %ymm11,%ymm0,%ymm0 - vmovq %xmm10,%rbx - vmovdqu -16+96-128(%rcx),%ymm13 - vpaddq %ymm0,%ymm1,%ymm1 - vpmuludq %ymm11,%ymm12,%ymm12 - vmovdqu -16+128-128(%rcx),%ymm0 - vpaddq %ymm12,%ymm2,%ymm2 - vpmuludq %ymm11,%ymm13,%ymm13 - vmovdqu -16+160-128(%rcx),%ymm12 - vpaddq %ymm13,%ymm3,%ymm3 - vpmuludq %ymm11,%ymm0,%ymm0 - vmovdqu -16+192-128(%rcx),%ymm13 - vpaddq %ymm0,%ymm4,%ymm4 - vpmuludq %ymm11,%ymm12,%ymm12 - vmovdqu -16+224-128(%rcx),%ymm0 - vpaddq %ymm12,%ymm5,%ymm5 - vpmuludq %ymm11,%ymm13,%ymm13 - vmovdqu -16+256-128(%rcx),%ymm12 - vpaddq %ymm13,%ymm6,%ymm6 - vpmuludq %ymm11,%ymm0,%ymm0 - vmovdqu -16+288-128(%rcx),%ymm13 - vpaddq %ymm0,%ymm7,%ymm7 - vpmuludq %ymm11,%ymm12,%ymm12 - vmovdqu -24+32-128(%rsi),%ymm0 - vpaddq %ymm12,%ymm8,%ymm8 - vpmuludq %ymm11,%ymm13,%ymm13 - vmovdqu -24+64-128(%rsi),%ymm12 - vpaddq %ymm13,%ymm9,%ymm9 - - addq %r11,%r12 - imulq -128(%rsi),%rbx - addq %rbx,%r12 - - movq %r12,%rax - imull %r8d,%eax - andl $0x1fffffff,%eax - - vpmuludq %ymm10,%ymm0,%ymm0 - vmovd %eax,%xmm11 - vmovdqu -24+96-128(%rsi),%ymm13 - vpaddq %ymm0,%ymm1,%ymm1 - vpmuludq %ymm10,%ymm12,%ymm12 - vpbroadcastq %xmm11,%ymm11 - vmovdqu -24+128-128(%rsi),%ymm0 - vpaddq %ymm12,%ymm2,%ymm2 - vpmuludq %ymm10,%ymm13,%ymm13 - vmovdqu -24+160-128(%rsi),%ymm12 - vpaddq %ymm13,%ymm3,%ymm3 - vpmuludq %ymm10,%ymm0,%ymm0 - vmovdqu -24+192-128(%rsi),%ymm13 - vpaddq %ymm0,%ymm4,%ymm4 - vpmuludq %ymm10,%ymm12,%ymm12 - vmovdqu -24+224-128(%rsi),%ymm0 - vpaddq %ymm12,%ymm5,%ymm5 - vpmuludq %ymm10,%ymm13,%ymm13 - vmovdqu -24+256-128(%rsi),%ymm12 - vpaddq %ymm13,%ymm6,%ymm6 - vpmuludq %ymm10,%ymm0,%ymm0 - vmovdqu -24+288-128(%rsi),%ymm13 - vpaddq %ymm0,%ymm7,%ymm7 - vpmuludq %ymm10,%ymm12,%ymm12 - vpaddq %ymm12,%ymm8,%ymm8 - vpmuludq %ymm10,%ymm13,%ymm13 - vpbroadcastq 32(%r13),%ymm10 - vpaddq %ymm13,%ymm9,%ymm9 - addq $32,%r13 - - vmovdqu -24+32-128(%rcx),%ymm0 - imulq -128(%rcx),%rax - addq %rax,%r12 - shrq $29,%r12 - - vmovdqu -24+64-128(%rcx),%ymm12 - vpmuludq %ymm11,%ymm0,%ymm0 - vmovq %xmm10,%rbx - vmovdqu -24+96-128(%rcx),%ymm13 - vpaddq %ymm0,%ymm1,%ymm0 - vpmuludq %ymm11,%ymm12,%ymm12 - vmovdqu %ymm0,(%rsp) - vpaddq %ymm12,%ymm2,%ymm1 - vmovdqu -24+128-128(%rcx),%ymm0 - vpmuludq %ymm11,%ymm13,%ymm13 - vmovdqu -24+160-128(%rcx),%ymm12 - vpaddq %ymm13,%ymm3,%ymm2 - vpmuludq %ymm11,%ymm0,%ymm0 - vmovdqu -24+192-128(%rcx),%ymm13 - vpaddq %ymm0,%ymm4,%ymm3 - vpmuludq %ymm11,%ymm12,%ymm12 - vmovdqu -24+224-128(%rcx),%ymm0 - vpaddq %ymm12,%ymm5,%ymm4 - vpmuludq %ymm11,%ymm13,%ymm13 - vmovdqu -24+256-128(%rcx),%ymm12 - vpaddq %ymm13,%ymm6,%ymm5 - vpmuludq %ymm11,%ymm0,%ymm0 - vmovdqu 
-24+288-128(%rcx),%ymm13 - movq %r12,%r9 - vpaddq %ymm0,%ymm7,%ymm6 - vpmuludq %ymm11,%ymm12,%ymm12 - addq (%rsp),%r9 - vpaddq %ymm12,%ymm8,%ymm7 - vpmuludq %ymm11,%ymm13,%ymm13 - vmovq %r12,%xmm12 - vpaddq %ymm13,%ymm9,%ymm8 - - decl %r14d - jnz .Loop_mul_1024 - vpaddq (%rsp),%ymm12,%ymm0 - - vpsrlq $29,%ymm0,%ymm12 - vpand %ymm15,%ymm0,%ymm0 - vpsrlq $29,%ymm1,%ymm13 - vpand %ymm15,%ymm1,%ymm1 - vpsrlq $29,%ymm2,%ymm10 - vpermq $0x93,%ymm12,%ymm12 - vpand %ymm15,%ymm2,%ymm2 - vpsrlq $29,%ymm3,%ymm11 - vpermq $0x93,%ymm13,%ymm13 - vpand %ymm15,%ymm3,%ymm3 - - vpblendd $3,%ymm14,%ymm12,%ymm9 - vpermq $0x93,%ymm10,%ymm10 - vpblendd $3,%ymm12,%ymm13,%ymm12 - vpermq $0x93,%ymm11,%ymm11 - vpaddq %ymm9,%ymm0,%ymm0 - vpblendd $3,%ymm13,%ymm10,%ymm13 - vpaddq %ymm12,%ymm1,%ymm1 - vpblendd $3,%ymm10,%ymm11,%ymm10 - vpaddq %ymm13,%ymm2,%ymm2 - vpblendd $3,%ymm11,%ymm14,%ymm11 - vpaddq %ymm10,%ymm3,%ymm3 - vpaddq %ymm11,%ymm4,%ymm4 - - vpsrlq $29,%ymm0,%ymm12 - vpand %ymm15,%ymm0,%ymm0 - vpsrlq $29,%ymm1,%ymm13 - vpand %ymm15,%ymm1,%ymm1 - vpsrlq $29,%ymm2,%ymm10 - vpermq $0x93,%ymm12,%ymm12 - vpand %ymm15,%ymm2,%ymm2 - vpsrlq $29,%ymm3,%ymm11 - vpermq $0x93,%ymm13,%ymm13 - vpand %ymm15,%ymm3,%ymm3 - vpermq $0x93,%ymm10,%ymm10 - - vpblendd $3,%ymm14,%ymm12,%ymm9 - vpermq $0x93,%ymm11,%ymm11 - vpblendd $3,%ymm12,%ymm13,%ymm12 - vpaddq %ymm9,%ymm0,%ymm0 - vpblendd $3,%ymm13,%ymm10,%ymm13 - vpaddq %ymm12,%ymm1,%ymm1 - vpblendd $3,%ymm10,%ymm11,%ymm10 - vpaddq %ymm13,%ymm2,%ymm2 - vpblendd $3,%ymm11,%ymm14,%ymm11 - vpaddq %ymm10,%ymm3,%ymm3 - vpaddq %ymm11,%ymm4,%ymm4 - - vmovdqu %ymm0,0-128(%rdi) - vmovdqu %ymm1,32-128(%rdi) - vmovdqu %ymm2,64-128(%rdi) - vmovdqu %ymm3,96-128(%rdi) - vpsrlq $29,%ymm4,%ymm12 - vpand %ymm15,%ymm4,%ymm4 - vpsrlq $29,%ymm5,%ymm13 - vpand %ymm15,%ymm5,%ymm5 - vpsrlq $29,%ymm6,%ymm10 - vpermq $0x93,%ymm12,%ymm12 - vpand %ymm15,%ymm6,%ymm6 - vpsrlq $29,%ymm7,%ymm11 - vpermq $0x93,%ymm13,%ymm13 - vpand %ymm15,%ymm7,%ymm7 - vpsrlq $29,%ymm8,%ymm0 - vpermq $0x93,%ymm10,%ymm10 - vpand %ymm15,%ymm8,%ymm8 - vpermq $0x93,%ymm11,%ymm11 - - vpblendd $3,%ymm14,%ymm12,%ymm9 - vpermq $0x93,%ymm0,%ymm0 - vpblendd $3,%ymm12,%ymm13,%ymm12 - vpaddq %ymm9,%ymm4,%ymm4 - vpblendd $3,%ymm13,%ymm10,%ymm13 - vpaddq %ymm12,%ymm5,%ymm5 - vpblendd $3,%ymm10,%ymm11,%ymm10 - vpaddq %ymm13,%ymm6,%ymm6 - vpblendd $3,%ymm11,%ymm0,%ymm11 - vpaddq %ymm10,%ymm7,%ymm7 - vpaddq %ymm11,%ymm8,%ymm8 - - vpsrlq $29,%ymm4,%ymm12 - vpand %ymm15,%ymm4,%ymm4 - vpsrlq $29,%ymm5,%ymm13 - vpand %ymm15,%ymm5,%ymm5 - vpsrlq $29,%ymm6,%ymm10 - vpermq $0x93,%ymm12,%ymm12 - vpand %ymm15,%ymm6,%ymm6 - vpsrlq $29,%ymm7,%ymm11 - vpermq $0x93,%ymm13,%ymm13 - vpand %ymm15,%ymm7,%ymm7 - vpsrlq $29,%ymm8,%ymm0 - vpermq $0x93,%ymm10,%ymm10 - vpand %ymm15,%ymm8,%ymm8 - vpermq $0x93,%ymm11,%ymm11 - - vpblendd $3,%ymm14,%ymm12,%ymm9 - vpermq $0x93,%ymm0,%ymm0 - vpblendd $3,%ymm12,%ymm13,%ymm12 - vpaddq %ymm9,%ymm4,%ymm4 - vpblendd $3,%ymm13,%ymm10,%ymm13 - vpaddq %ymm12,%ymm5,%ymm5 - vpblendd $3,%ymm10,%ymm11,%ymm10 - vpaddq %ymm13,%ymm6,%ymm6 - vpblendd $3,%ymm11,%ymm0,%ymm11 - vpaddq %ymm10,%ymm7,%ymm7 - vpaddq %ymm11,%ymm8,%ymm8 - - vmovdqu %ymm4,128-128(%rdi) - vmovdqu %ymm5,160-128(%rdi) - vmovdqu %ymm6,192-128(%rdi) - vmovdqu %ymm7,224-128(%rdi) - vmovdqu %ymm8,256-128(%rdi) - vzeroupper - - movq %rbp,%rax -.cfi_def_cfa_register %rax - movq -48(%rax),%r15 -.cfi_restore %r15 - movq -40(%rax),%r14 -.cfi_restore %r14 - movq -32(%rax),%r13 -.cfi_restore %r13 - movq -24(%rax),%r12 -.cfi_restore %r12 - movq -16(%rax),%rbp -.cfi_restore 
%rbp - movq -8(%rax),%rbx -.cfi_restore %rbx - leaq (%rax),%rsp -.cfi_def_cfa_register %rsp -.Lmul_1024_epilogue: - .byte 0xf3,0xc3 -.cfi_endproc -.size rsaz_1024_mul_avx2,.-rsaz_1024_mul_avx2 -.globl rsaz_1024_red2norm_avx2 -.type rsaz_1024_red2norm_avx2,@function -.align 32 -rsaz_1024_red2norm_avx2: -.cfi_startproc - subq $-128,%rsi - xorq %rax,%rax - movq -128(%rsi),%r8 - movq -120(%rsi),%r9 - movq -112(%rsi),%r10 - shlq $0,%r8 - shlq $29,%r9 - movq %r10,%r11 - shlq $58,%r10 - shrq $6,%r11 - addq %r8,%rax - addq %r9,%rax - addq %r10,%rax - adcq $0,%r11 - movq %rax,0(%rdi) - movq %r11,%rax - movq -104(%rsi),%r8 - movq -96(%rsi),%r9 - shlq $23,%r8 - movq %r9,%r10 - shlq $52,%r9 - shrq $12,%r10 - addq %r8,%rax - addq %r9,%rax - adcq $0,%r10 - movq %rax,8(%rdi) - movq %r10,%rax - movq -88(%rsi),%r11 - movq -80(%rsi),%r8 - shlq $17,%r11 - movq %r8,%r9 - shlq $46,%r8 - shrq $18,%r9 - addq %r11,%rax - addq %r8,%rax - adcq $0,%r9 - movq %rax,16(%rdi) - movq %r9,%rax - movq -72(%rsi),%r10 - movq -64(%rsi),%r11 - shlq $11,%r10 - movq %r11,%r8 - shlq $40,%r11 - shrq $24,%r8 - addq %r10,%rax - addq %r11,%rax - adcq $0,%r8 - movq %rax,24(%rdi) - movq %r8,%rax - movq -56(%rsi),%r9 - movq -48(%rsi),%r10 - movq -40(%rsi),%r11 - shlq $5,%r9 - shlq $34,%r10 - movq %r11,%r8 - shlq $63,%r11 - shrq $1,%r8 - addq %r9,%rax - addq %r10,%rax - addq %r11,%rax - adcq $0,%r8 - movq %rax,32(%rdi) - movq %r8,%rax - movq -32(%rsi),%r9 - movq -24(%rsi),%r10 - shlq $28,%r9 - movq %r10,%r11 - shlq $57,%r10 - shrq $7,%r11 - addq %r9,%rax - addq %r10,%rax - adcq $0,%r11 - movq %rax,40(%rdi) - movq %r11,%rax - movq -16(%rsi),%r8 - movq -8(%rsi),%r9 - shlq $22,%r8 - movq %r9,%r10 - shlq $51,%r9 - shrq $13,%r10 - addq %r8,%rax - addq %r9,%rax - adcq $0,%r10 - movq %rax,48(%rdi) - movq %r10,%rax - movq 0(%rsi),%r11 - movq 8(%rsi),%r8 - shlq $16,%r11 - movq %r8,%r9 - shlq $45,%r8 - shrq $19,%r9 - addq %r11,%rax - addq %r8,%rax - adcq $0,%r9 - movq %rax,56(%rdi) - movq %r9,%rax - movq 16(%rsi),%r10 - movq 24(%rsi),%r11 - shlq $10,%r10 - movq %r11,%r8 - shlq $39,%r11 - shrq $25,%r8 - addq %r10,%rax - addq %r11,%rax - adcq $0,%r8 - movq %rax,64(%rdi) - movq %r8,%rax - movq 32(%rsi),%r9 - movq 40(%rsi),%r10 - movq 48(%rsi),%r11 - shlq $4,%r9 - shlq $33,%r10 - movq %r11,%r8 - shlq $62,%r11 - shrq $2,%r8 - addq %r9,%rax - addq %r10,%rax - addq %r11,%rax - adcq $0,%r8 - movq %rax,72(%rdi) - movq %r8,%rax - movq 56(%rsi),%r9 - movq 64(%rsi),%r10 - shlq $27,%r9 - movq %r10,%r11 - shlq $56,%r10 - shrq $8,%r11 - addq %r9,%rax - addq %r10,%rax - adcq $0,%r11 - movq %rax,80(%rdi) - movq %r11,%rax - movq 72(%rsi),%r8 - movq 80(%rsi),%r9 - shlq $21,%r8 - movq %r9,%r10 - shlq $50,%r9 - shrq $14,%r10 - addq %r8,%rax - addq %r9,%rax - adcq $0,%r10 - movq %rax,88(%rdi) - movq %r10,%rax - movq 88(%rsi),%r11 - movq 96(%rsi),%r8 - shlq $15,%r11 - movq %r8,%r9 - shlq $44,%r8 - shrq $20,%r9 - addq %r11,%rax - addq %r8,%rax - adcq $0,%r9 - movq %rax,96(%rdi) - movq %r9,%rax - movq 104(%rsi),%r10 - movq 112(%rsi),%r11 - shlq $9,%r10 - movq %r11,%r8 - shlq $38,%r11 - shrq $26,%r8 - addq %r10,%rax - addq %r11,%rax - adcq $0,%r8 - movq %rax,104(%rdi) - movq %r8,%rax - movq 120(%rsi),%r9 - movq 128(%rsi),%r10 - movq 136(%rsi),%r11 - shlq $3,%r9 - shlq $32,%r10 - movq %r11,%r8 - shlq $61,%r11 - shrq $3,%r8 - addq %r9,%rax - addq %r10,%rax - addq %r11,%rax - adcq $0,%r8 - movq %rax,112(%rdi) - movq %r8,%rax - movq 144(%rsi),%r9 - movq 152(%rsi),%r10 - shlq $26,%r9 - movq %r10,%r11 - shlq $55,%r10 - shrq $9,%r11 - addq %r9,%rax - addq %r10,%rax - adcq $0,%r11 
- movq %rax,120(%rdi) - movq %r11,%rax - .byte 0xf3,0xc3 -.cfi_endproc -.size rsaz_1024_red2norm_avx2,.-rsaz_1024_red2norm_avx2 - -.globl rsaz_1024_norm2red_avx2 -.type rsaz_1024_norm2red_avx2,@function -.align 32 rsaz_1024_norm2red_avx2: -.cfi_startproc - subq $-128,%rdi - movq (%rsi),%r8 - movl $0x1fffffff,%eax - movq 8(%rsi),%r9 - movq %r8,%r11 - shrq $0,%r11 - andq %rax,%r11 - movq %r11,-128(%rdi) - movq %r8,%r10 - shrq $29,%r10 - andq %rax,%r10 - movq %r10,-120(%rdi) - shrdq $58,%r9,%r8 - andq %rax,%r8 - movq %r8,-112(%rdi) - movq 16(%rsi),%r10 - movq %r9,%r8 - shrq $23,%r8 - andq %rax,%r8 - movq %r8,-104(%rdi) - shrdq $52,%r10,%r9 - andq %rax,%r9 - movq %r9,-96(%rdi) - movq 24(%rsi),%r11 - movq %r10,%r9 - shrq $17,%r9 - andq %rax,%r9 - movq %r9,-88(%rdi) - shrdq $46,%r11,%r10 - andq %rax,%r10 - movq %r10,-80(%rdi) - movq 32(%rsi),%r8 - movq %r11,%r10 - shrq $11,%r10 - andq %rax,%r10 - movq %r10,-72(%rdi) - shrdq $40,%r8,%r11 - andq %rax,%r11 - movq %r11,-64(%rdi) - movq 40(%rsi),%r9 - movq %r8,%r11 - shrq $5,%r11 - andq %rax,%r11 - movq %r11,-56(%rdi) - movq %r8,%r10 - shrq $34,%r10 - andq %rax,%r10 - movq %r10,-48(%rdi) - shrdq $63,%r9,%r8 - andq %rax,%r8 - movq %r8,-40(%rdi) - movq 48(%rsi),%r10 - movq %r9,%r8 - shrq $28,%r8 - andq %rax,%r8 - movq %r8,-32(%rdi) - shrdq $57,%r10,%r9 - andq %rax,%r9 - movq %r9,-24(%rdi) - movq 56(%rsi),%r11 - movq %r10,%r9 - shrq $22,%r9 - andq %rax,%r9 - movq %r9,-16(%rdi) - shrdq $51,%r11,%r10 - andq %rax,%r10 - movq %r10,-8(%rdi) - movq 64(%rsi),%r8 - movq %r11,%r10 - shrq $16,%r10 - andq %rax,%r10 - movq %r10,0(%rdi) - shrdq $45,%r8,%r11 - andq %rax,%r11 - movq %r11,8(%rdi) - movq 72(%rsi),%r9 - movq %r8,%r11 - shrq $10,%r11 - andq %rax,%r11 - movq %r11,16(%rdi) - shrdq $39,%r9,%r8 - andq %rax,%r8 - movq %r8,24(%rdi) - movq 80(%rsi),%r10 - movq %r9,%r8 - shrq $4,%r8 - andq %rax,%r8 - movq %r8,32(%rdi) - movq %r9,%r11 - shrq $33,%r11 - andq %rax,%r11 - movq %r11,40(%rdi) - shrdq $62,%r10,%r9 - andq %rax,%r9 - movq %r9,48(%rdi) - movq 88(%rsi),%r11 - movq %r10,%r9 - shrq $27,%r9 - andq %rax,%r9 - movq %r9,56(%rdi) - shrdq $56,%r11,%r10 - andq %rax,%r10 - movq %r10,64(%rdi) - movq 96(%rsi),%r8 - movq %r11,%r10 - shrq $21,%r10 - andq %rax,%r10 - movq %r10,72(%rdi) - shrdq $50,%r8,%r11 - andq %rax,%r11 - movq %r11,80(%rdi) - movq 104(%rsi),%r9 - movq %r8,%r11 - shrq $15,%r11 - andq %rax,%r11 - movq %r11,88(%rdi) - shrdq $44,%r9,%r8 - andq %rax,%r8 - movq %r8,96(%rdi) - movq 112(%rsi),%r10 - movq %r9,%r8 - shrq $9,%r8 - andq %rax,%r8 - movq %r8,104(%rdi) - shrdq $38,%r10,%r9 - andq %rax,%r9 - movq %r9,112(%rdi) - movq 120(%rsi),%r11 - movq %r10,%r9 - shrq $3,%r9 - andq %rax,%r9 - movq %r9,120(%rdi) - movq %r10,%r8 - shrq $32,%r8 - andq %rax,%r8 - movq %r8,128(%rdi) - shrdq $61,%r11,%r10 - andq %rax,%r10 - movq %r10,136(%rdi) - xorq %r8,%r8 - movq %r11,%r10 - shrq $26,%r10 - andq %rax,%r10 - movq %r10,144(%rdi) - shrdq $55,%r8,%r11 - andq %rax,%r11 - movq %r11,152(%rdi) - movq %r8,160(%rdi) - movq %r8,168(%rdi) - movq %r8,176(%rdi) - movq %r8,184(%rdi) - .byte 0xf3,0xc3 -.cfi_endproc -.size rsaz_1024_norm2red_avx2,.-rsaz_1024_norm2red_avx2 -.globl rsaz_1024_scatter5_avx2 -.type rsaz_1024_scatter5_avx2,@function -.align 32 +rsaz_1024_red2norm_avx2: rsaz_1024_scatter5_avx2: -.cfi_startproc - vzeroupper - vmovdqu .Lscatter_permd(%rip),%ymm5 - shll $4,%edx - leaq (%rdi,%rdx,1),%rdi - movl $9,%eax - jmp .Loop_scatter_1024 - -.align 32 -.Loop_scatter_1024: - vmovdqu (%rsi),%ymm0 - leaq 32(%rsi),%rsi - vpermd %ymm0,%ymm5,%ymm0 - vmovdqu %xmm0,(%rdi) - leaq 
512(%rdi),%rdi - decl %eax - jnz .Loop_scatter_1024 - - vzeroupper - .byte 0xf3,0xc3 -.cfi_endproc -.size rsaz_1024_scatter5_avx2,.-rsaz_1024_scatter5_avx2 - -.globl rsaz_1024_gather5_avx2 -.type rsaz_1024_gather5_avx2,@function -.align 32 rsaz_1024_gather5_avx2: -.cfi_startproc - vzeroupper - movq %rsp,%r11 -.cfi_def_cfa_register %r11 - leaq -256(%rsp),%rsp - andq $-32,%rsp - leaq .Linc(%rip),%r10 - leaq -128(%rsp),%rax - - vmovd %edx,%xmm4 - vmovdqa (%r10),%ymm0 - vmovdqa 32(%r10),%ymm1 - vmovdqa 64(%r10),%ymm5 - vpbroadcastd %xmm4,%ymm4 - - vpaddd %ymm5,%ymm0,%ymm2 - vpcmpeqd %ymm4,%ymm0,%ymm0 - vpaddd %ymm5,%ymm1,%ymm3 - vpcmpeqd %ymm4,%ymm1,%ymm1 - vmovdqa %ymm0,0+128(%rax) - vpaddd %ymm5,%ymm2,%ymm0 - vpcmpeqd %ymm4,%ymm2,%ymm2 - vmovdqa %ymm1,32+128(%rax) - vpaddd %ymm5,%ymm3,%ymm1 - vpcmpeqd %ymm4,%ymm3,%ymm3 - vmovdqa %ymm2,64+128(%rax) - vpaddd %ymm5,%ymm0,%ymm2 - vpcmpeqd %ymm4,%ymm0,%ymm0 - vmovdqa %ymm3,96+128(%rax) - vpaddd %ymm5,%ymm1,%ymm3 - vpcmpeqd %ymm4,%ymm1,%ymm1 - vmovdqa %ymm0,128+128(%rax) - vpaddd %ymm5,%ymm2,%ymm8 - vpcmpeqd %ymm4,%ymm2,%ymm2 - vmovdqa %ymm1,160+128(%rax) - vpaddd %ymm5,%ymm3,%ymm9 - vpcmpeqd %ymm4,%ymm3,%ymm3 - vmovdqa %ymm2,192+128(%rax) - vpaddd %ymm5,%ymm8,%ymm10 - vpcmpeqd %ymm4,%ymm8,%ymm8 - vmovdqa %ymm3,224+128(%rax) - vpaddd %ymm5,%ymm9,%ymm11 - vpcmpeqd %ymm4,%ymm9,%ymm9 - vpaddd %ymm5,%ymm10,%ymm12 - vpcmpeqd %ymm4,%ymm10,%ymm10 - vpaddd %ymm5,%ymm11,%ymm13 - vpcmpeqd %ymm4,%ymm11,%ymm11 - vpaddd %ymm5,%ymm12,%ymm14 - vpcmpeqd %ymm4,%ymm12,%ymm12 - vpaddd %ymm5,%ymm13,%ymm15 - vpcmpeqd %ymm4,%ymm13,%ymm13 - vpcmpeqd %ymm4,%ymm14,%ymm14 - vpcmpeqd %ymm4,%ymm15,%ymm15 - - vmovdqa -32(%r10),%ymm7 - leaq 128(%rsi),%rsi - movl $9,%edx - -.Loop_gather_1024: - vmovdqa 0-128(%rsi),%ymm0 - vmovdqa 32-128(%rsi),%ymm1 - vmovdqa 64-128(%rsi),%ymm2 - vmovdqa 96-128(%rsi),%ymm3 - vpand 0+128(%rax),%ymm0,%ymm0 - vpand 32+128(%rax),%ymm1,%ymm1 - vpand 64+128(%rax),%ymm2,%ymm2 - vpor %ymm0,%ymm1,%ymm4 - vpand 96+128(%rax),%ymm3,%ymm3 - vmovdqa 128-128(%rsi),%ymm0 - vmovdqa 160-128(%rsi),%ymm1 - vpor %ymm2,%ymm3,%ymm5 - vmovdqa 192-128(%rsi),%ymm2 - vmovdqa 224-128(%rsi),%ymm3 - vpand 128+128(%rax),%ymm0,%ymm0 - vpand 160+128(%rax),%ymm1,%ymm1 - vpand 192+128(%rax),%ymm2,%ymm2 - vpor %ymm0,%ymm4,%ymm4 - vpand 224+128(%rax),%ymm3,%ymm3 - vpand 256-128(%rsi),%ymm8,%ymm0 - vpor %ymm1,%ymm5,%ymm5 - vpand 288-128(%rsi),%ymm9,%ymm1 - vpor %ymm2,%ymm4,%ymm4 - vpand 320-128(%rsi),%ymm10,%ymm2 - vpor %ymm3,%ymm5,%ymm5 - vpand 352-128(%rsi),%ymm11,%ymm3 - vpor %ymm0,%ymm4,%ymm4 - vpand 384-128(%rsi),%ymm12,%ymm0 - vpor %ymm1,%ymm5,%ymm5 - vpand 416-128(%rsi),%ymm13,%ymm1 - vpor %ymm2,%ymm4,%ymm4 - vpand 448-128(%rsi),%ymm14,%ymm2 - vpor %ymm3,%ymm5,%ymm5 - vpand 480-128(%rsi),%ymm15,%ymm3 - leaq 512(%rsi),%rsi - vpor %ymm0,%ymm4,%ymm4 - vpor %ymm1,%ymm5,%ymm5 - vpor %ymm2,%ymm4,%ymm4 - vpor %ymm3,%ymm5,%ymm5 - - vpor %ymm5,%ymm4,%ymm4 - vextracti128 $1,%ymm4,%xmm5 - vpor %xmm4,%xmm5,%xmm5 - vpermd %ymm5,%ymm7,%ymm5 - vmovdqu %ymm5,(%rdi) - leaq 32(%rdi),%rdi - decl %edx - jnz .Loop_gather_1024 - - vpxor %ymm0,%ymm0,%ymm0 - vmovdqu %ymm0,(%rdi) - vzeroupper - leaq (%r11),%rsp -.cfi_def_cfa_register %rsp +.byte 0x0f,0x0b .byte 0xf3,0xc3 -.cfi_endproc -.LSEH_end_rsaz_1024_gather5: -.size rsaz_1024_gather5_avx2,.-rsaz_1024_gather5_avx2 - -.globl rsaz_avx2_eligible -.type rsaz_avx2_eligible,@function -.align 32 -rsaz_avx2_eligible: - movl OPENSSL_ia32cap_P+8(%rip),%eax - movl $524544,%ecx - movl $0,%edx - andl %eax,%ecx - cmpl $524544,%ecx - cmovel %edx,%eax - andl 
$32,%eax - shrl $5,%eax - .byte 0xf3,0xc3 -.size rsaz_avx2_eligible,.-rsaz_avx2_eligible - -.align 64 -.Land_mask: -.quad 0x1fffffff,0x1fffffff,0x1fffffff,0x1fffffff -.Lscatter_permd: -.long 0,2,4,6,7,7,7,7 -.Lgather_permd: -.long 0,7,1,7,2,7,3,7 -.Linc: -.long 0,0,0,0, 1,1,1,1 -.long 2,2,2,2, 3,3,3,3 -.long 4,4,4,4, 4,4,4,4 -.align 64 +.size rsaz_1024_sqr_avx2,.-rsaz_1024_sqr_avx2 diff --git a/secure/lib/libcrypto/amd64/rsaz-x86_64.S b/secure/lib/libcrypto/amd64/rsaz-x86_64.S index e4e7b0469a53..ae64f7a73987 100644 --- a/secure/lib/libcrypto/amd64/rsaz-x86_64.S +++ b/secure/lib/libcrypto/amd64/rsaz-x86_64.S @@ -31,14 +31,10 @@ rsaz_512_sqr: subq $128+24,%rsp .cfi_adjust_cfa_offset 128+24 .Lsqr_body: - movq %rdx,%rbp +.byte 102,72,15,110,202 movq (%rsi),%rdx movq 8(%rsi),%rax movq %rcx,128(%rsp) - movl $0x80100,%r11d - andl OPENSSL_ia32cap_P+8(%rip),%r11d - cmpl $0x80100,%r11d - je .Loop_sqrx jmp .Loop_sqr .align 32 @@ -46,6 +42,7 @@ rsaz_512_sqr: movl %r8d,128+8(%rsp) movq %rdx,%rbx + movq %rax,%rbp mulq %rdx movq %rax,%r8 movq 16(%rsi),%rax @@ -84,31 +81,29 @@ rsaz_512_sqr: mulq %rbx addq %rax,%r14 movq %rbx,%rax - movq %rdx,%r15 - adcq $0,%r15 + adcq $0,%rdx + xorq %rcx,%rcx addq %r8,%r8 - movq %r9,%rcx - adcq %r9,%r9 + movq %rdx,%r15 + adcq $0,%rcx mulq %rax - movq %rax,(%rsp) - addq %rdx,%r8 - adcq $0,%r9 + addq %r8,%rdx + adcq $0,%rcx - movq %r8,8(%rsp) - shrq $63,%rcx + movq %rax,(%rsp) + movq %rdx,8(%rsp) - movq 8(%rsi),%r8 movq 16(%rsi),%rax - mulq %r8 + mulq %rbp addq %rax,%r10 movq 24(%rsi),%rax movq %rdx,%rbx adcq $0,%rbx - mulq %r8 + mulq %rbp addq %rax,%r11 movq 32(%rsi),%rax adcq $0,%rdx @@ -116,7 +111,7 @@ rsaz_512_sqr: movq %rdx,%rbx adcq $0,%rbx - mulq %r8 + mulq %rbp addq %rax,%r12 movq 40(%rsi),%rax adcq $0,%rdx @@ -124,7 +119,7 @@ rsaz_512_sqr: movq %rdx,%rbx adcq $0,%rbx - mulq %r8 + mulq %rbp addq %rax,%r13 movq 48(%rsi),%rax adcq $0,%rdx @@ -132,7 +127,7 @@ rsaz_512_sqr: movq %rdx,%rbx adcq $0,%rbx - mulq %r8 + mulq %rbp addq %rax,%r14 movq 56(%rsi),%rax adcq $0,%rdx @@ -140,39 +135,39 @@ rsaz_512_sqr: movq %rdx,%rbx adcq $0,%rbx - mulq %r8 + mulq %rbp addq %rax,%r15 - movq %r8,%rax + movq %rbp,%rax adcq $0,%rdx addq %rbx,%r15 - movq %rdx,%r8 - movq %r10,%rdx - adcq $0,%r8 + adcq $0,%rdx - addq %rdx,%rdx - leaq (%rcx,%r10,2),%r10 - movq %r11,%rbx - adcq %r11,%r11 + xorq %rbx,%rbx + addq %r9,%r9 + movq %rdx,%r8 + adcq %r10,%r10 + adcq $0,%rbx mulq %rax + + addq %rcx,%rax + movq 16(%rsi),%rbp addq %rax,%r9 + movq 24(%rsi),%rax adcq %rdx,%r10 - adcq $0,%r11 + adcq $0,%rbx movq %r9,16(%rsp) movq %r10,24(%rsp) - shrq $63,%rbx - movq 16(%rsi),%r9 - movq 24(%rsi),%rax - mulq %r9 + mulq %rbp addq %rax,%r12 movq 32(%rsi),%rax movq %rdx,%rcx adcq $0,%rcx - mulq %r9 + mulq %rbp addq %rax,%r13 movq 40(%rsi),%rax adcq $0,%rdx @@ -180,7 +175,7 @@ rsaz_512_sqr: movq %rdx,%rcx adcq $0,%rcx - mulq %r9 + mulq %rbp addq %rax,%r14 movq 48(%rsi),%rax adcq $0,%rdx @@ -188,9 +183,7 @@ rsaz_512_sqr: movq %rdx,%rcx adcq $0,%rcx - mulq %r9 - movq %r12,%r10 - leaq (%rbx,%r12,2),%r12 + mulq %rbp addq %rax,%r15 movq 56(%rsi),%rax adcq $0,%rdx @@ -198,36 +191,40 @@ rsaz_512_sqr: movq %rdx,%rcx adcq $0,%rcx - mulq %r9 - shrq $63,%r10 + mulq %rbp addq %rax,%r8 - movq %r9,%rax + movq %rbp,%rax adcq $0,%rdx addq %rcx,%r8 - movq %rdx,%r9 - adcq $0,%r9 + adcq $0,%rdx - movq %r13,%rcx - leaq (%r10,%r13,2),%r13 + xorq %rcx,%rcx + addq %r11,%r11 + movq %rdx,%r9 + adcq %r12,%r12 + adcq $0,%rcx mulq %rax + + addq %rbx,%rax + movq 24(%rsi),%r10 addq %rax,%r11 + movq 32(%rsi),%rax adcq %rdx,%r12 - adcq 
$0,%r13 + adcq $0,%rcx movq %r11,32(%rsp) movq %r12,40(%rsp) - shrq $63,%rcx - movq 24(%rsi),%r10 - movq 32(%rsi),%rax + movq %rax,%r11 mulq %r10 addq %rax,%r14 movq 40(%rsi),%rax movq %rdx,%rbx adcq $0,%rbx + movq %rax,%r12 mulq %r10 addq %rax,%r15 movq 48(%rsi),%rax @@ -236,9 +233,8 @@ rsaz_512_sqr: movq %rdx,%rbx adcq $0,%rbx + movq %rax,%rbp mulq %r10 - movq %r14,%r12 - leaq (%rcx,%r14,2),%r14 addq %rax,%r8 movq 56(%rsi),%rax adcq $0,%rdx @@ -247,32 +243,33 @@ rsaz_512_sqr: adcq $0,%rbx mulq %r10 - shrq $63,%r12 addq %rax,%r9 movq %r10,%rax adcq $0,%rdx addq %rbx,%r9 - movq %rdx,%r10 - adcq $0,%r10 + adcq $0,%rdx - movq %r15,%rbx - leaq (%r12,%r15,2),%r15 + xorq %rbx,%rbx + addq %r13,%r13 + movq %rdx,%r10 + adcq %r14,%r14 + adcq $0,%rbx mulq %rax + + addq %rcx,%rax addq %rax,%r13 + movq %r12,%rax adcq %rdx,%r14 - adcq $0,%r15 + adcq $0,%rbx movq %r13,48(%rsp) movq %r14,56(%rsp) - shrq $63,%rbx - movq 32(%rsi),%r11 - movq 40(%rsi),%rax mulq %r11 addq %rax,%r8 - movq 48(%rsi),%rax + movq %rbp,%rax movq %rdx,%rcx adcq $0,%rcx @@ -280,97 +277,99 @@ rsaz_512_sqr: addq %rax,%r9 movq 56(%rsi),%rax adcq $0,%rdx - movq %r8,%r12 - leaq (%rbx,%r8,2),%r8 addq %rcx,%r9 movq %rdx,%rcx adcq $0,%rcx + movq %rax,%r14 mulq %r11 - shrq $63,%r12 addq %rax,%r10 movq %r11,%rax adcq $0,%rdx addq %rcx,%r10 - movq %rdx,%r11 - adcq $0,%r11 + adcq $0,%rdx - movq %r9,%rcx - leaq (%r12,%r9,2),%r9 + xorq %rcx,%rcx + addq %r15,%r15 + movq %rdx,%r11 + adcq %r8,%r8 + adcq $0,%rcx mulq %rax + + addq %rbx,%rax addq %rax,%r15 + movq %rbp,%rax adcq %rdx,%r8 - adcq $0,%r9 + adcq $0,%rcx movq %r15,64(%rsp) movq %r8,72(%rsp) - shrq $63,%rcx - movq 40(%rsi),%r12 - movq 48(%rsi),%rax mulq %r12 addq %rax,%r10 - movq 56(%rsi),%rax + movq %r14,%rax movq %rdx,%rbx adcq $0,%rbx mulq %r12 addq %rax,%r11 movq %r12,%rax - movq %r10,%r15 - leaq (%rcx,%r10,2),%r10 adcq $0,%rdx - shrq $63,%r15 addq %rbx,%r11 - movq %rdx,%r12 - adcq $0,%r12 + adcq $0,%rdx - movq %r11,%rbx - leaq (%r15,%r11,2),%r11 + xorq %rbx,%rbx + addq %r9,%r9 + movq %rdx,%r12 + adcq %r10,%r10 + adcq $0,%rbx mulq %rax + + addq %rcx,%rax addq %rax,%r9 + movq %r14,%rax adcq %rdx,%r10 - adcq $0,%r11 + adcq $0,%rbx movq %r9,80(%rsp) movq %r10,88(%rsp) - movq 48(%rsi),%r13 - movq 56(%rsi),%rax - mulq %r13 + mulq %rbp addq %rax,%r12 - movq %r13,%rax - movq %rdx,%r13 - adcq $0,%r13 + movq %rbp,%rax + adcq $0,%rdx - xorq %r14,%r14 - shlq $1,%rbx + xorq %rcx,%rcx + addq %r11,%r11 + movq %rdx,%r13 adcq %r12,%r12 - adcq %r13,%r13 - adcq %r14,%r14 + adcq $0,%rcx mulq %rax + + addq %rbx,%rax addq %rax,%r11 + movq %r14,%rax adcq %rdx,%r12 - adcq $0,%r13 + adcq $0,%rcx movq %r11,96(%rsp) movq %r12,104(%rsp) - movq 56(%rsi),%rax - mulq %rax - addq %rax,%r13 - adcq $0,%rdx + xorq %rbx,%rbx + addq %r13,%r13 + adcq $0,%rbx - addq %rdx,%r14 + mulq %rax - movq %r13,112(%rsp) - movq %r14,120(%rsp) + addq %rcx,%rax + addq %r13,%rax + adcq %rbx,%rdx movq (%rsp),%r8 movq 8(%rsp),%r9 @@ -380,276 +379,12 @@ rsaz_512_sqr: movq 40(%rsp),%r13 movq 48(%rsp),%r14 movq 56(%rsp),%r15 - - call __rsaz_512_reduce - - addq 64(%rsp),%r8 - adcq 72(%rsp),%r9 - adcq 80(%rsp),%r10 - adcq 88(%rsp),%r11 - adcq 96(%rsp),%r12 - adcq 104(%rsp),%r13 - adcq 112(%rsp),%r14 - adcq 120(%rsp),%r15 - sbbq %rcx,%rcx - - call __rsaz_512_subtract - - movq %r8,%rdx - movq %r9,%rax - movl 128+8(%rsp),%r8d - movq %rdi,%rsi - - decl %r8d - jnz .Loop_sqr - jmp .Lsqr_tail - -.align 32 -.Loop_sqrx: - movl %r8d,128+8(%rsp) -.byte 102,72,15,110,199 -.byte 102,72,15,110,205 - - mulxq %rax,%r8,%r9 - - mulxq 16(%rsi),%rcx,%r10 - xorq 
%rbp,%rbp - - mulxq 24(%rsi),%rax,%r11 - adcxq %rcx,%r9 - - mulxq 32(%rsi),%rcx,%r12 - adcxq %rax,%r10 - - mulxq 40(%rsi),%rax,%r13 - adcxq %rcx,%r11 - -.byte 0xc4,0x62,0xf3,0xf6,0xb6,0x30,0x00,0x00,0x00 - adcxq %rax,%r12 - adcxq %rcx,%r13 - -.byte 0xc4,0x62,0xfb,0xf6,0xbe,0x38,0x00,0x00,0x00 - adcxq %rax,%r14 - adcxq %rbp,%r15 - - movq %r9,%rcx - shldq $1,%r8,%r9 - shlq $1,%r8 - - xorl %ebp,%ebp - mulxq %rdx,%rax,%rdx - adcxq %rdx,%r8 - movq 8(%rsi),%rdx - adcxq %rbp,%r9 - - movq %rax,(%rsp) - movq %r8,8(%rsp) - - - mulxq 16(%rsi),%rax,%rbx - adoxq %rax,%r10 - adcxq %rbx,%r11 - -.byte 0xc4,0x62,0xc3,0xf6,0x86,0x18,0x00,0x00,0x00 - adoxq %rdi,%r11 - adcxq %r8,%r12 - - mulxq 32(%rsi),%rax,%rbx - adoxq %rax,%r12 - adcxq %rbx,%r13 - - mulxq 40(%rsi),%rdi,%r8 - adoxq %rdi,%r13 - adcxq %r8,%r14 - -.byte 0xc4,0xe2,0xfb,0xf6,0x9e,0x30,0x00,0x00,0x00 - adoxq %rax,%r14 - adcxq %rbx,%r15 - -.byte 0xc4,0x62,0xc3,0xf6,0x86,0x38,0x00,0x00,0x00 - adoxq %rdi,%r15 - adcxq %rbp,%r8 - adoxq %rbp,%r8 - - movq %r11,%rbx - shldq $1,%r10,%r11 - shldq $1,%rcx,%r10 - - xorl %ebp,%ebp - mulxq %rdx,%rax,%rcx - movq 16(%rsi),%rdx - adcxq %rax,%r9 - adcxq %rcx,%r10 - adcxq %rbp,%r11 - - movq %r9,16(%rsp) -.byte 0x4c,0x89,0x94,0x24,0x18,0x00,0x00,0x00 - - -.byte 0xc4,0x62,0xc3,0xf6,0x8e,0x18,0x00,0x00,0x00 - adoxq %rdi,%r12 - adcxq %r9,%r13 - - mulxq 32(%rsi),%rax,%rcx - adoxq %rax,%r13 - adcxq %rcx,%r14 - - mulxq 40(%rsi),%rdi,%r9 - adoxq %rdi,%r14 - adcxq %r9,%r15 - -.byte 0xc4,0xe2,0xfb,0xf6,0x8e,0x30,0x00,0x00,0x00 - adoxq %rax,%r15 - adcxq %rcx,%r8 - -.byte 0xc4,0x62,0xc3,0xf6,0x8e,0x38,0x00,0x00,0x00 - adoxq %rdi,%r8 - adcxq %rbp,%r9 - adoxq %rbp,%r9 - - movq %r13,%rcx - shldq $1,%r12,%r13 - shldq $1,%rbx,%r12 - - xorl %ebp,%ebp - mulxq %rdx,%rax,%rdx - adcxq %rax,%r11 - adcxq %rdx,%r12 - movq 24(%rsi),%rdx - adcxq %rbp,%r13 - - movq %r11,32(%rsp) -.byte 0x4c,0x89,0xa4,0x24,0x28,0x00,0x00,0x00 - - -.byte 0xc4,0xe2,0xfb,0xf6,0x9e,0x20,0x00,0x00,0x00 - adoxq %rax,%r14 - adcxq %rbx,%r15 - - mulxq 40(%rsi),%rdi,%r10 - adoxq %rdi,%r15 - adcxq %r10,%r8 - - mulxq 48(%rsi),%rax,%rbx - adoxq %rax,%r8 - adcxq %rbx,%r9 - - mulxq 56(%rsi),%rdi,%r10 - adoxq %rdi,%r9 - adcxq %rbp,%r10 - adoxq %rbp,%r10 - -.byte 0x66 - movq %r15,%rbx - shldq $1,%r14,%r15 - shldq $1,%rcx,%r14 - - xorl %ebp,%ebp - mulxq %rdx,%rax,%rdx - adcxq %rax,%r13 - adcxq %rdx,%r14 - movq 32(%rsi),%rdx - adcxq %rbp,%r15 - - movq %r13,48(%rsp) - movq %r14,56(%rsp) - - -.byte 0xc4,0x62,0xc3,0xf6,0x9e,0x28,0x00,0x00,0x00 - adoxq %rdi,%r8 - adcxq %r11,%r9 - - mulxq 48(%rsi),%rax,%rcx - adoxq %rax,%r9 - adcxq %rcx,%r10 - - mulxq 56(%rsi),%rdi,%r11 - adoxq %rdi,%r10 - adcxq %rbp,%r11 - adoxq %rbp,%r11 - - movq %r9,%rcx - shldq $1,%r8,%r9 - shldq $1,%rbx,%r8 - - xorl %ebp,%ebp - mulxq %rdx,%rax,%rdx - adcxq %rax,%r15 - adcxq %rdx,%r8 - movq 40(%rsi),%rdx - adcxq %rbp,%r9 - - movq %r15,64(%rsp) - movq %r8,72(%rsp) - - -.byte 0xc4,0xe2,0xfb,0xf6,0x9e,0x30,0x00,0x00,0x00 - adoxq %rax,%r10 - adcxq %rbx,%r11 - -.byte 0xc4,0x62,0xc3,0xf6,0xa6,0x38,0x00,0x00,0x00 - adoxq %rdi,%r11 - adcxq %rbp,%r12 - adoxq %rbp,%r12 - - movq %r11,%rbx - shldq $1,%r10,%r11 - shldq $1,%rcx,%r10 - - xorl %ebp,%ebp - mulxq %rdx,%rax,%rdx - adcxq %rax,%r9 - adcxq %rdx,%r10 - movq 48(%rsi),%rdx - adcxq %rbp,%r11 - - movq %r9,80(%rsp) - movq %r10,88(%rsp) - - -.byte 0xc4,0x62,0xfb,0xf6,0xae,0x38,0x00,0x00,0x00 - adoxq %rax,%r12 - adoxq %rbp,%r13 - - xorq %r14,%r14 - shldq $1,%r13,%r14 - shldq $1,%r12,%r13 - shldq $1,%rbx,%r12 - - xorl %ebp,%ebp - mulxq %rdx,%rax,%rdx - adcxq %rax,%r11 - adcxq 
%rdx,%r12 - movq 56(%rsi),%rdx - adcxq %rbp,%r13 - -.byte 0x4c,0x89,0x9c,0x24,0x60,0x00,0x00,0x00 -.byte 0x4c,0x89,0xa4,0x24,0x68,0x00,0x00,0x00 - - - mulxq %rdx,%rax,%rdx - adoxq %rax,%r13 - adoxq %rbp,%rdx - -.byte 0x66 - addq %rdx,%r14 - - movq %r13,112(%rsp) - movq %r14,120(%rsp) -.byte 102,72,15,126,199 .byte 102,72,15,126,205 - movq 128(%rsp),%rdx - movq (%rsp),%r8 - movq 8(%rsp),%r9 - movq 16(%rsp),%r10 - movq 24(%rsp),%r11 - movq 32(%rsp),%r12 - movq 40(%rsp),%r13 - movq 48(%rsp),%r14 - movq 56(%rsp),%r15 + movq %rax,112(%rsp) + movq %rdx,120(%rsp) - call __rsaz_512_reducex + call __rsaz_512_reduce addq 64(%rsp),%r8 adcq 72(%rsp),%r9 @@ -669,9 +404,7 @@ rsaz_512_sqr: movq %rdi,%rsi decl %r8d - jnz .Loop_sqrx - -.Lsqr_tail: + jnz .Loop_sqr leaq 128+24+48(%rsp),%rax .cfi_def_cfa %rax,8 @@ -723,10 +456,6 @@ rsaz_512_mul: .byte 102,72,15,110,199 .byte 102,72,15,110,201 movq %r8,128(%rsp) - movl $0x80100,%r11d - andl OPENSSL_ia32cap_P+8(%rip),%r11d - cmpl $0x80100,%r11d - je .Lmulx movq (%rdx),%rbx movq %rdx,%rbp call __rsaz_512_mul @@ -744,29 +473,6 @@ rsaz_512_mul: movq 56(%rsp),%r15 call __rsaz_512_reduce - jmp .Lmul_tail - -.align 32 -.Lmulx: - movq %rdx,%rbp - movq (%rdx),%rdx - call __rsaz_512_mulx - -.byte 102,72,15,126,199 -.byte 102,72,15,126,205 - - movq 128(%rsp),%rdx - movq (%rsp),%r8 - movq 8(%rsp),%r9 - movq 16(%rsp),%r10 - movq 24(%rsp),%r11 - movq 32(%rsp),%r12 - movq 40(%rsp),%r13 - movq 48(%rsp),%r14 - movq 56(%rsp),%r15 - - call __rsaz_512_reducex -.Lmul_tail: addq 64(%rsp),%r8 adcq 72(%rsp),%r9 adcq 80(%rsp),%r10 @@ -880,10 +586,6 @@ rsaz_512_mul_gather4: por %xmm9,%xmm8 pshufd $0x4e,%xmm8,%xmm9 por %xmm9,%xmm8 - movl $0x80100,%r11d - andl OPENSSL_ia32cap_P+8(%rip),%r11d - cmpl $0x80100,%r11d - je .Lmulx_gather .byte 102,76,15,126,195 movq %r8,128(%rsp) @@ -1064,142 +766,6 @@ rsaz_512_mul_gather4: movq 56(%rsp),%r15 call __rsaz_512_reduce - jmp .Lmul_gather_tail - -.align 32 -.Lmulx_gather: -.byte 102,76,15,126,194 - - movq %r8,128(%rsp) - movq %rdi,128+8(%rsp) - movq %rcx,128+16(%rsp) - - mulxq (%rsi),%rbx,%r8 - movq %rbx,(%rsp) - xorl %edi,%edi - - mulxq 8(%rsi),%rax,%r9 - - mulxq 16(%rsi),%rbx,%r10 - adcxq %rax,%r8 - - mulxq 24(%rsi),%rax,%r11 - adcxq %rbx,%r9 - - mulxq 32(%rsi),%rbx,%r12 - adcxq %rax,%r10 - - mulxq 40(%rsi),%rax,%r13 - adcxq %rbx,%r11 - - mulxq 48(%rsi),%rbx,%r14 - adcxq %rax,%r12 - - mulxq 56(%rsi),%rax,%r15 - adcxq %rbx,%r13 - adcxq %rax,%r14 -.byte 0x67 - movq %r8,%rbx - adcxq %rdi,%r15 - - movq $-7,%rcx - jmp .Loop_mulx_gather - -.align 32 -.Loop_mulx_gather: - movdqa 0(%rbp),%xmm8 - movdqa 16(%rbp),%xmm9 - movdqa 32(%rbp),%xmm10 - movdqa 48(%rbp),%xmm11 - pand %xmm0,%xmm8 - movdqa 64(%rbp),%xmm12 - pand %xmm1,%xmm9 - movdqa 80(%rbp),%xmm13 - pand %xmm2,%xmm10 - movdqa 96(%rbp),%xmm14 - pand %xmm3,%xmm11 - movdqa 112(%rbp),%xmm15 - leaq 128(%rbp),%rbp - pand %xmm4,%xmm12 - pand %xmm5,%xmm13 - pand %xmm6,%xmm14 - pand %xmm7,%xmm15 - por %xmm10,%xmm8 - por %xmm11,%xmm9 - por %xmm12,%xmm8 - por %xmm13,%xmm9 - por %xmm14,%xmm8 - por %xmm15,%xmm9 - - por %xmm9,%xmm8 - pshufd $0x4e,%xmm8,%xmm9 - por %xmm9,%xmm8 -.byte 102,76,15,126,194 - -.byte 0xc4,0x62,0xfb,0xf6,0x86,0x00,0x00,0x00,0x00 - adcxq %rax,%rbx - adoxq %r9,%r8 - - mulxq 8(%rsi),%rax,%r9 - adcxq %rax,%r8 - adoxq %r10,%r9 - - mulxq 16(%rsi),%rax,%r10 - adcxq %rax,%r9 - adoxq %r11,%r10 - -.byte 0xc4,0x62,0xfb,0xf6,0x9e,0x18,0x00,0x00,0x00 - adcxq %rax,%r10 - adoxq %r12,%r11 - - mulxq 32(%rsi),%rax,%r12 - adcxq %rax,%r11 - adoxq %r13,%r12 - - mulxq 40(%rsi),%rax,%r13 - adcxq %rax,%r12 - 
adoxq %r14,%r13 - -.byte 0xc4,0x62,0xfb,0xf6,0xb6,0x30,0x00,0x00,0x00 - adcxq %rax,%r13 -.byte 0x67 - adoxq %r15,%r14 - - mulxq 56(%rsi),%rax,%r15 - movq %rbx,64(%rsp,%rcx,8) - adcxq %rax,%r14 - adoxq %rdi,%r15 - movq %r8,%rbx - adcxq %rdi,%r15 - - incq %rcx - jnz .Loop_mulx_gather - - movq %r8,64(%rsp) - movq %r9,64+8(%rsp) - movq %r10,64+16(%rsp) - movq %r11,64+24(%rsp) - movq %r12,64+32(%rsp) - movq %r13,64+40(%rsp) - movq %r14,64+48(%rsp) - movq %r15,64+56(%rsp) - - movq 128(%rsp),%rdx - movq 128+8(%rsp),%rdi - movq 128+16(%rsp),%rbp - - movq (%rsp),%r8 - movq 8(%rsp),%r9 - movq 16(%rsp),%r10 - movq 24(%rsp),%r11 - movq 32(%rsp),%r12 - movq 40(%rsp),%r13 - movq 48(%rsp),%r14 - movq 56(%rsp),%r15 - - call __rsaz_512_reducex - -.Lmul_gather_tail: addq 64(%rsp),%r8 adcq 72(%rsp),%r9 adcq 80(%rsp),%r10 @@ -1267,10 +833,6 @@ rsaz_512_mul_scatter4: movq %rcx,128(%rsp) movq %rdi,%rbp - movl $0x80100,%r11d - andl OPENSSL_ia32cap_P+8(%rip),%r11d - cmpl $0x80100,%r11d - je .Lmulx_scatter movq (%rdi),%rbx call __rsaz_512_mul @@ -1287,29 +849,6 @@ rsaz_512_mul_scatter4: movq 56(%rsp),%r15 call __rsaz_512_reduce - jmp .Lmul_scatter_tail - -.align 32 -.Lmulx_scatter: - movq (%rdi),%rdx - call __rsaz_512_mulx - -.byte 102,72,15,126,199 -.byte 102,72,15,126,205 - - movq 128(%rsp),%rdx - movq (%rsp),%r8 - movq 8(%rsp),%r9 - movq 16(%rsp),%r10 - movq 24(%rsp),%r11 - movq 32(%rsp),%r12 - movq 40(%rsp),%r13 - movq 48(%rsp),%r14 - movq 56(%rsp),%r15 - - call __rsaz_512_reducex - -.Lmul_scatter_tail: addq 64(%rsp),%r8 adcq 72(%rsp),%r9 adcq 80(%rsp),%r10 @@ -1379,7 +918,6 @@ rsaz_512_mul_by_one: subq $128+24,%rsp .cfi_adjust_cfa_offset 128+24 .Lmul_by_one_body: - movl OPENSSL_ia32cap_P+8(%rip),%eax movq %rdx,%rbp movq %rcx,128(%rsp) @@ -1400,16 +938,7 @@ rsaz_512_mul_by_one: movdqa %xmm0,64(%rsp) movdqa %xmm0,80(%rsp) movdqa %xmm0,96(%rsp) - andl $0x80100,%eax - cmpl $0x80100,%eax - je .Lby_one_callx call __rsaz_512_reduce - jmp .Lby_one_tail -.align 32 -.Lby_one_callx: - movq 128(%rsp),%rdx - call __rsaz_512_reducex -.Lby_one_tail: movq %r8,(%rdi) movq %r9,8(%rdi) movq %r10,16(%rdi) @@ -1442,6 +971,7 @@ rsaz_512_mul_by_one: .type __rsaz_512_reduce,@function .align 32 __rsaz_512_reduce: +.cfi_startproc movq %r8,%rbx imulq 128+8(%rsp),%rbx movq 0(%rbp),%rax @@ -1521,66 +1051,12 @@ __rsaz_512_reduce: jne .Lreduction_loop .byte 0xf3,0xc3 +.cfi_endproc .size __rsaz_512_reduce,.-__rsaz_512_reduce -.type __rsaz_512_reducex,@function -.align 32 -__rsaz_512_reducex: - - imulq %r8,%rdx - xorq %rsi,%rsi - movl $8,%ecx - jmp .Lreduction_loopx - -.align 32 -.Lreduction_loopx: - movq %r8,%rbx - mulxq 0(%rbp),%rax,%r8 - adcxq %rbx,%rax - adoxq %r9,%r8 - - mulxq 8(%rbp),%rax,%r9 - adcxq %rax,%r8 - adoxq %r10,%r9 - - mulxq 16(%rbp),%rbx,%r10 - adcxq %rbx,%r9 - adoxq %r11,%r10 - - mulxq 24(%rbp),%rbx,%r11 - adcxq %rbx,%r10 - adoxq %r12,%r11 - -.byte 0xc4,0x62,0xe3,0xf6,0xa5,0x20,0x00,0x00,0x00 - movq %rdx,%rax - movq %r8,%rdx - adcxq %rbx,%r11 - adoxq %r13,%r12 - - mulxq 128+8(%rsp),%rbx,%rdx - movq %rax,%rdx - - mulxq 40(%rbp),%rax,%r13 - adcxq %rax,%r12 - adoxq %r14,%r13 - -.byte 0xc4,0x62,0xfb,0xf6,0xb5,0x30,0x00,0x00,0x00 - adcxq %rax,%r13 - adoxq %r15,%r14 - - mulxq 56(%rbp),%rax,%r15 - movq %rbx,%rdx - adcxq %rax,%r14 - adoxq %rsi,%r15 - adcxq %rsi,%r15 - - decl %ecx - jne .Lreduction_loopx - - .byte 0xf3,0xc3 -.size __rsaz_512_reducex,.-__rsaz_512_reducex .type __rsaz_512_subtract,@function .align 32 __rsaz_512_subtract: +.cfi_startproc movq %r8,(%rdi) movq %r9,8(%rdi) movq %r10,16(%rdi) @@ -1634,10 +1110,12 @@ 
__rsaz_512_subtract: movq %r15,56(%rdi) .byte 0xf3,0xc3 +.cfi_endproc .size __rsaz_512_subtract,.-__rsaz_512_subtract .type __rsaz_512_mul,@function .align 32 __rsaz_512_mul: +.cfi_startproc leaq 8(%rsp),%rdi movq (%rsi),%rax @@ -1776,131 +1254,13 @@ __rsaz_512_mul: movq %r15,56(%rdi) .byte 0xf3,0xc3 +.cfi_endproc .size __rsaz_512_mul,.-__rsaz_512_mul -.type __rsaz_512_mulx,@function -.align 32 -__rsaz_512_mulx: - mulxq (%rsi),%rbx,%r8 - movq $-6,%rcx - - mulxq 8(%rsi),%rax,%r9 - movq %rbx,8(%rsp) - - mulxq 16(%rsi),%rbx,%r10 - adcq %rax,%r8 - - mulxq 24(%rsi),%rax,%r11 - adcq %rbx,%r9 - - mulxq 32(%rsi),%rbx,%r12 - adcq %rax,%r10 - - mulxq 40(%rsi),%rax,%r13 - adcq %rbx,%r11 - - mulxq 48(%rsi),%rbx,%r14 - adcq %rax,%r12 - - mulxq 56(%rsi),%rax,%r15 - movq 8(%rbp),%rdx - adcq %rbx,%r13 - adcq %rax,%r14 - adcq $0,%r15 - - xorq %rdi,%rdi - jmp .Loop_mulx - -.align 32 -.Loop_mulx: - movq %r8,%rbx - mulxq (%rsi),%rax,%r8 - adcxq %rax,%rbx - adoxq %r9,%r8 - - mulxq 8(%rsi),%rax,%r9 - adcxq %rax,%r8 - adoxq %r10,%r9 - - mulxq 16(%rsi),%rax,%r10 - adcxq %rax,%r9 - adoxq %r11,%r10 - - mulxq 24(%rsi),%rax,%r11 - adcxq %rax,%r10 - adoxq %r12,%r11 - -.byte 0x3e,0xc4,0x62,0xfb,0xf6,0xa6,0x20,0x00,0x00,0x00 - adcxq %rax,%r11 - adoxq %r13,%r12 - - mulxq 40(%rsi),%rax,%r13 - adcxq %rax,%r12 - adoxq %r14,%r13 - - mulxq 48(%rsi),%rax,%r14 - adcxq %rax,%r13 - adoxq %r15,%r14 - - mulxq 56(%rsi),%rax,%r15 - movq 64(%rbp,%rcx,8),%rdx - movq %rbx,8+64-8(%rsp,%rcx,8) - adcxq %rax,%r14 - adoxq %rdi,%r15 - adcxq %rdi,%r15 - - incq %rcx - jnz .Loop_mulx - - movq %r8,%rbx - mulxq (%rsi),%rax,%r8 - adcxq %rax,%rbx - adoxq %r9,%r8 - -.byte 0xc4,0x62,0xfb,0xf6,0x8e,0x08,0x00,0x00,0x00 - adcxq %rax,%r8 - adoxq %r10,%r9 - -.byte 0xc4,0x62,0xfb,0xf6,0x96,0x10,0x00,0x00,0x00 - adcxq %rax,%r9 - adoxq %r11,%r10 - - mulxq 24(%rsi),%rax,%r11 - adcxq %rax,%r10 - adoxq %r12,%r11 - - mulxq 32(%rsi),%rax,%r12 - adcxq %rax,%r11 - adoxq %r13,%r12 - - mulxq 40(%rsi),%rax,%r13 - adcxq %rax,%r12 - adoxq %r14,%r13 - -.byte 0xc4,0x62,0xfb,0xf6,0xb6,0x30,0x00,0x00,0x00 - adcxq %rax,%r13 - adoxq %r15,%r14 - -.byte 0xc4,0x62,0xfb,0xf6,0xbe,0x38,0x00,0x00,0x00 - adcxq %rax,%r14 - adoxq %rdi,%r15 - adcxq %rdi,%r15 - - movq %rbx,8+64-8(%rsp) - movq %r8,8+64(%rsp) - movq %r9,8+64+8(%rsp) - movq %r10,8+64+16(%rsp) - movq %r11,8+64+24(%rsp) - movq %r12,8+64+32(%rsp) - movq %r13,8+64+40(%rsp) - movq %r14,8+64+48(%rsp) - movq %r15,8+64+56(%rsp) - - .byte 0xf3,0xc3 -.size __rsaz_512_mulx,.-__rsaz_512_mulx .globl rsaz_512_scatter4 .type rsaz_512_scatter4,@function .align 16 rsaz_512_scatter4: +.cfi_startproc leaq (%rdi,%rdx,8),%rdi movl $8,%r9d jmp .Loop_scatter @@ -1913,12 +1273,14 @@ rsaz_512_scatter4: decl %r9d jnz .Loop_scatter .byte 0xf3,0xc3 +.cfi_endproc .size rsaz_512_scatter4,.-rsaz_512_scatter4 .globl rsaz_512_gather4 .type rsaz_512_gather4,@function .align 16 rsaz_512_gather4: +.cfi_startproc movd %edx,%xmm8 movdqa .Linc+16(%rip),%xmm1 movdqa .Linc(%rip),%xmm0 @@ -1982,6 +1344,7 @@ rsaz_512_gather4: jnz .Loop_gather .byte 0xf3,0xc3 .LSEH_end_rsaz_512_gather4: +.cfi_endproc .size rsaz_512_gather4,.-rsaz_512_gather4 .align 64 diff --git a/secure/lib/libcrypto/amd64/sha1-mb-x86_64.S b/secure/lib/libcrypto/amd64/sha1-mb-x86_64.S index 0090e020c573..488e554c247e 100644 --- a/secure/lib/libcrypto/amd64/sha1-mb-x86_64.S +++ b/secure/lib/libcrypto/amd64/sha1-mb-x86_64.S @@ -12,8 +12,6 @@ sha1_multi_block: movq OPENSSL_ia32cap_P+4(%rip),%rcx btq $61,%rcx jc _shaext_shortcut - testl $268435456,%ecx - jnz _avx_shortcut movq %rsp,%rax 
.cfi_def_cfa_register %rax pushq %rbx @@ -2939,4319 +2937,6 @@ _shaext_shortcut: .byte 0xf3,0xc3 .cfi_endproc .size sha1_multi_block_shaext,.-sha1_multi_block_shaext -.type sha1_multi_block_avx,@function -.align 32 -sha1_multi_block_avx: -.cfi_startproc -_avx_shortcut: - shrq $32,%rcx - cmpl $2,%edx - jb .Lavx - testl $32,%ecx - jnz _avx2_shortcut - jmp .Lavx -.align 32 -.Lavx: - movq %rsp,%rax -.cfi_def_cfa_register %rax - pushq %rbx -.cfi_offset %rbx,-16 - pushq %rbp -.cfi_offset %rbp,-24 - subq $288,%rsp - andq $-256,%rsp - movq %rax,272(%rsp) -.cfi_escape 0x0f,0x06,0x77,0x90,0x02,0x06,0x23,0x08 -.Lbody_avx: - leaq K_XX_XX(%rip),%rbp - leaq 256(%rsp),%rbx - - vzeroupper -.Loop_grande_avx: - movl %edx,280(%rsp) - xorl %edx,%edx - movq 0(%rsi),%r8 - movl 8(%rsi),%ecx - cmpl %edx,%ecx - cmovgl %ecx,%edx - testl %ecx,%ecx - movl %ecx,0(%rbx) - cmovleq %rbp,%r8 - movq 16(%rsi),%r9 - movl 24(%rsi),%ecx - cmpl %edx,%ecx - cmovgl %ecx,%edx - testl %ecx,%ecx - movl %ecx,4(%rbx) - cmovleq %rbp,%r9 - movq 32(%rsi),%r10 - movl 40(%rsi),%ecx - cmpl %edx,%ecx - cmovgl %ecx,%edx - testl %ecx,%ecx - movl %ecx,8(%rbx) - cmovleq %rbp,%r10 - movq 48(%rsi),%r11 - movl 56(%rsi),%ecx - cmpl %edx,%ecx - cmovgl %ecx,%edx - testl %ecx,%ecx - movl %ecx,12(%rbx) - cmovleq %rbp,%r11 - testl %edx,%edx - jz .Ldone_avx - - vmovdqu 0(%rdi),%xmm10 - leaq 128(%rsp),%rax - vmovdqu 32(%rdi),%xmm11 - vmovdqu 64(%rdi),%xmm12 - vmovdqu 96(%rdi),%xmm13 - vmovdqu 128(%rdi),%xmm14 - vmovdqu 96(%rbp),%xmm5 - jmp .Loop_avx - -.align 32 -.Loop_avx: - vmovdqa -32(%rbp),%xmm15 - vmovd (%r8),%xmm0 - leaq 64(%r8),%r8 - vmovd (%r9),%xmm2 - leaq 64(%r9),%r9 - vpinsrd $1,(%r10),%xmm0,%xmm0 - leaq 64(%r10),%r10 - vpinsrd $1,(%r11),%xmm2,%xmm2 - leaq 64(%r11),%r11 - vmovd -60(%r8),%xmm1 - vpunpckldq %xmm2,%xmm0,%xmm0 - vmovd -60(%r9),%xmm9 - vpshufb %xmm5,%xmm0,%xmm0 - vpinsrd $1,-60(%r10),%xmm1,%xmm1 - vpinsrd $1,-60(%r11),%xmm9,%xmm9 - vpaddd %xmm15,%xmm14,%xmm14 - vpslld $5,%xmm10,%xmm8 - vpandn %xmm13,%xmm11,%xmm7 - vpand %xmm12,%xmm11,%xmm6 - - vmovdqa %xmm0,0-128(%rax) - vpaddd %xmm0,%xmm14,%xmm14 - vpunpckldq %xmm9,%xmm1,%xmm1 - vpsrld $27,%xmm10,%xmm9 - vpxor %xmm7,%xmm6,%xmm6 - vmovd -56(%r8),%xmm2 - - vpslld $30,%xmm11,%xmm7 - vpor %xmm9,%xmm8,%xmm8 - vmovd -56(%r9),%xmm9 - vpaddd %xmm6,%xmm14,%xmm14 - - vpsrld $2,%xmm11,%xmm11 - vpaddd %xmm8,%xmm14,%xmm14 - vpshufb %xmm5,%xmm1,%xmm1 - vpor %xmm7,%xmm11,%xmm11 - vpinsrd $1,-56(%r10),%xmm2,%xmm2 - vpinsrd $1,-56(%r11),%xmm9,%xmm9 - vpaddd %xmm15,%xmm13,%xmm13 - vpslld $5,%xmm14,%xmm8 - vpandn %xmm12,%xmm10,%xmm7 - vpand %xmm11,%xmm10,%xmm6 - - vmovdqa %xmm1,16-128(%rax) - vpaddd %xmm1,%xmm13,%xmm13 - vpunpckldq %xmm9,%xmm2,%xmm2 - vpsrld $27,%xmm14,%xmm9 - vpxor %xmm7,%xmm6,%xmm6 - vmovd -52(%r8),%xmm3 - - vpslld $30,%xmm10,%xmm7 - vpor %xmm9,%xmm8,%xmm8 - vmovd -52(%r9),%xmm9 - vpaddd %xmm6,%xmm13,%xmm13 - - vpsrld $2,%xmm10,%xmm10 - vpaddd %xmm8,%xmm13,%xmm13 - vpshufb %xmm5,%xmm2,%xmm2 - vpor %xmm7,%xmm10,%xmm10 - vpinsrd $1,-52(%r10),%xmm3,%xmm3 - vpinsrd $1,-52(%r11),%xmm9,%xmm9 - vpaddd %xmm15,%xmm12,%xmm12 - vpslld $5,%xmm13,%xmm8 - vpandn %xmm11,%xmm14,%xmm7 - vpand %xmm10,%xmm14,%xmm6 - - vmovdqa %xmm2,32-128(%rax) - vpaddd %xmm2,%xmm12,%xmm12 - vpunpckldq %xmm9,%xmm3,%xmm3 - vpsrld $27,%xmm13,%xmm9 - vpxor %xmm7,%xmm6,%xmm6 - vmovd -48(%r8),%xmm4 - - vpslld $30,%xmm14,%xmm7 - vpor %xmm9,%xmm8,%xmm8 - vmovd -48(%r9),%xmm9 - vpaddd %xmm6,%xmm12,%xmm12 - - vpsrld $2,%xmm14,%xmm14 - vpaddd %xmm8,%xmm12,%xmm12 - vpshufb %xmm5,%xmm3,%xmm3 - vpor %xmm7,%xmm14,%xmm14 - vpinsrd 
$1,-48(%r10),%xmm4,%xmm4 - vpinsrd $1,-48(%r11),%xmm9,%xmm9 - vpaddd %xmm15,%xmm11,%xmm11 - vpslld $5,%xmm12,%xmm8 - vpandn %xmm10,%xmm13,%xmm7 - vpand %xmm14,%xmm13,%xmm6 - - vmovdqa %xmm3,48-128(%rax) - vpaddd %xmm3,%xmm11,%xmm11 - vpunpckldq %xmm9,%xmm4,%xmm4 - vpsrld $27,%xmm12,%xmm9 - vpxor %xmm7,%xmm6,%xmm6 - vmovd -44(%r8),%xmm0 - - vpslld $30,%xmm13,%xmm7 - vpor %xmm9,%xmm8,%xmm8 - vmovd -44(%r9),%xmm9 - vpaddd %xmm6,%xmm11,%xmm11 - - vpsrld $2,%xmm13,%xmm13 - vpaddd %xmm8,%xmm11,%xmm11 - vpshufb %xmm5,%xmm4,%xmm4 - vpor %xmm7,%xmm13,%xmm13 - vpinsrd $1,-44(%r10),%xmm0,%xmm0 - vpinsrd $1,-44(%r11),%xmm9,%xmm9 - vpaddd %xmm15,%xmm10,%xmm10 - vpslld $5,%xmm11,%xmm8 - vpandn %xmm14,%xmm12,%xmm7 - vpand %xmm13,%xmm12,%xmm6 - - vmovdqa %xmm4,64-128(%rax) - vpaddd %xmm4,%xmm10,%xmm10 - vpunpckldq %xmm9,%xmm0,%xmm0 - vpsrld $27,%xmm11,%xmm9 - vpxor %xmm7,%xmm6,%xmm6 - vmovd -40(%r8),%xmm1 - - vpslld $30,%xmm12,%xmm7 - vpor %xmm9,%xmm8,%xmm8 - vmovd -40(%r9),%xmm9 - vpaddd %xmm6,%xmm10,%xmm10 - - vpsrld $2,%xmm12,%xmm12 - vpaddd %xmm8,%xmm10,%xmm10 - vpshufb %xmm5,%xmm0,%xmm0 - vpor %xmm7,%xmm12,%xmm12 - vpinsrd $1,-40(%r10),%xmm1,%xmm1 - vpinsrd $1,-40(%r11),%xmm9,%xmm9 - vpaddd %xmm15,%xmm14,%xmm14 - vpslld $5,%xmm10,%xmm8 - vpandn %xmm13,%xmm11,%xmm7 - vpand %xmm12,%xmm11,%xmm6 - - vmovdqa %xmm0,80-128(%rax) - vpaddd %xmm0,%xmm14,%xmm14 - vpunpckldq %xmm9,%xmm1,%xmm1 - vpsrld $27,%xmm10,%xmm9 - vpxor %xmm7,%xmm6,%xmm6 - vmovd -36(%r8),%xmm2 - - vpslld $30,%xmm11,%xmm7 - vpor %xmm9,%xmm8,%xmm8 - vmovd -36(%r9),%xmm9 - vpaddd %xmm6,%xmm14,%xmm14 - - vpsrld $2,%xmm11,%xmm11 - vpaddd %xmm8,%xmm14,%xmm14 - vpshufb %xmm5,%xmm1,%xmm1 - vpor %xmm7,%xmm11,%xmm11 - vpinsrd $1,-36(%r10),%xmm2,%xmm2 - vpinsrd $1,-36(%r11),%xmm9,%xmm9 - vpaddd %xmm15,%xmm13,%xmm13 - vpslld $5,%xmm14,%xmm8 - vpandn %xmm12,%xmm10,%xmm7 - vpand %xmm11,%xmm10,%xmm6 - - vmovdqa %xmm1,96-128(%rax) - vpaddd %xmm1,%xmm13,%xmm13 - vpunpckldq %xmm9,%xmm2,%xmm2 - vpsrld $27,%xmm14,%xmm9 - vpxor %xmm7,%xmm6,%xmm6 - vmovd -32(%r8),%xmm3 - - vpslld $30,%xmm10,%xmm7 - vpor %xmm9,%xmm8,%xmm8 - vmovd -32(%r9),%xmm9 - vpaddd %xmm6,%xmm13,%xmm13 - - vpsrld $2,%xmm10,%xmm10 - vpaddd %xmm8,%xmm13,%xmm13 - vpshufb %xmm5,%xmm2,%xmm2 - vpor %xmm7,%xmm10,%xmm10 - vpinsrd $1,-32(%r10),%xmm3,%xmm3 - vpinsrd $1,-32(%r11),%xmm9,%xmm9 - vpaddd %xmm15,%xmm12,%xmm12 - vpslld $5,%xmm13,%xmm8 - vpandn %xmm11,%xmm14,%xmm7 - vpand %xmm10,%xmm14,%xmm6 - - vmovdqa %xmm2,112-128(%rax) - vpaddd %xmm2,%xmm12,%xmm12 - vpunpckldq %xmm9,%xmm3,%xmm3 - vpsrld $27,%xmm13,%xmm9 - vpxor %xmm7,%xmm6,%xmm6 - vmovd -28(%r8),%xmm4 - - vpslld $30,%xmm14,%xmm7 - vpor %xmm9,%xmm8,%xmm8 - vmovd -28(%r9),%xmm9 - vpaddd %xmm6,%xmm12,%xmm12 - - vpsrld $2,%xmm14,%xmm14 - vpaddd %xmm8,%xmm12,%xmm12 - vpshufb %xmm5,%xmm3,%xmm3 - vpor %xmm7,%xmm14,%xmm14 - vpinsrd $1,-28(%r10),%xmm4,%xmm4 - vpinsrd $1,-28(%r11),%xmm9,%xmm9 - vpaddd %xmm15,%xmm11,%xmm11 - vpslld $5,%xmm12,%xmm8 - vpandn %xmm10,%xmm13,%xmm7 - vpand %xmm14,%xmm13,%xmm6 - - vmovdqa %xmm3,128-128(%rax) - vpaddd %xmm3,%xmm11,%xmm11 - vpunpckldq %xmm9,%xmm4,%xmm4 - vpsrld $27,%xmm12,%xmm9 - vpxor %xmm7,%xmm6,%xmm6 - vmovd -24(%r8),%xmm0 - - vpslld $30,%xmm13,%xmm7 - vpor %xmm9,%xmm8,%xmm8 - vmovd -24(%r9),%xmm9 - vpaddd %xmm6,%xmm11,%xmm11 - - vpsrld $2,%xmm13,%xmm13 - vpaddd %xmm8,%xmm11,%xmm11 - vpshufb %xmm5,%xmm4,%xmm4 - vpor %xmm7,%xmm13,%xmm13 - vpinsrd $1,-24(%r10),%xmm0,%xmm0 - vpinsrd $1,-24(%r11),%xmm9,%xmm9 - vpaddd %xmm15,%xmm10,%xmm10 - vpslld $5,%xmm11,%xmm8 - vpandn %xmm14,%xmm12,%xmm7 - vpand 
%xmm13,%xmm12,%xmm6 - - vmovdqa %xmm4,144-128(%rax) - vpaddd %xmm4,%xmm10,%xmm10 - vpunpckldq %xmm9,%xmm0,%xmm0 - vpsrld $27,%xmm11,%xmm9 - vpxor %xmm7,%xmm6,%xmm6 - vmovd -20(%r8),%xmm1 - - vpslld $30,%xmm12,%xmm7 - vpor %xmm9,%xmm8,%xmm8 - vmovd -20(%r9),%xmm9 - vpaddd %xmm6,%xmm10,%xmm10 - - vpsrld $2,%xmm12,%xmm12 - vpaddd %xmm8,%xmm10,%xmm10 - vpshufb %xmm5,%xmm0,%xmm0 - vpor %xmm7,%xmm12,%xmm12 - vpinsrd $1,-20(%r10),%xmm1,%xmm1 - vpinsrd $1,-20(%r11),%xmm9,%xmm9 - vpaddd %xmm15,%xmm14,%xmm14 - vpslld $5,%xmm10,%xmm8 - vpandn %xmm13,%xmm11,%xmm7 - vpand %xmm12,%xmm11,%xmm6 - - vmovdqa %xmm0,160-128(%rax) - vpaddd %xmm0,%xmm14,%xmm14 - vpunpckldq %xmm9,%xmm1,%xmm1 - vpsrld $27,%xmm10,%xmm9 - vpxor %xmm7,%xmm6,%xmm6 - vmovd -16(%r8),%xmm2 - - vpslld $30,%xmm11,%xmm7 - vpor %xmm9,%xmm8,%xmm8 - vmovd -16(%r9),%xmm9 - vpaddd %xmm6,%xmm14,%xmm14 - - vpsrld $2,%xmm11,%xmm11 - vpaddd %xmm8,%xmm14,%xmm14 - vpshufb %xmm5,%xmm1,%xmm1 - vpor %xmm7,%xmm11,%xmm11 - vpinsrd $1,-16(%r10),%xmm2,%xmm2 - vpinsrd $1,-16(%r11),%xmm9,%xmm9 - vpaddd %xmm15,%xmm13,%xmm13 - vpslld $5,%xmm14,%xmm8 - vpandn %xmm12,%xmm10,%xmm7 - vpand %xmm11,%xmm10,%xmm6 - - vmovdqa %xmm1,176-128(%rax) - vpaddd %xmm1,%xmm13,%xmm13 - vpunpckldq %xmm9,%xmm2,%xmm2 - vpsrld $27,%xmm14,%xmm9 - vpxor %xmm7,%xmm6,%xmm6 - vmovd -12(%r8),%xmm3 - - vpslld $30,%xmm10,%xmm7 - vpor %xmm9,%xmm8,%xmm8 - vmovd -12(%r9),%xmm9 - vpaddd %xmm6,%xmm13,%xmm13 - - vpsrld $2,%xmm10,%xmm10 - vpaddd %xmm8,%xmm13,%xmm13 - vpshufb %xmm5,%xmm2,%xmm2 - vpor %xmm7,%xmm10,%xmm10 - vpinsrd $1,-12(%r10),%xmm3,%xmm3 - vpinsrd $1,-12(%r11),%xmm9,%xmm9 - vpaddd %xmm15,%xmm12,%xmm12 - vpslld $5,%xmm13,%xmm8 - vpandn %xmm11,%xmm14,%xmm7 - vpand %xmm10,%xmm14,%xmm6 - - vmovdqa %xmm2,192-128(%rax) - vpaddd %xmm2,%xmm12,%xmm12 - vpunpckldq %xmm9,%xmm3,%xmm3 - vpsrld $27,%xmm13,%xmm9 - vpxor %xmm7,%xmm6,%xmm6 - vmovd -8(%r8),%xmm4 - - vpslld $30,%xmm14,%xmm7 - vpor %xmm9,%xmm8,%xmm8 - vmovd -8(%r9),%xmm9 - vpaddd %xmm6,%xmm12,%xmm12 - - vpsrld $2,%xmm14,%xmm14 - vpaddd %xmm8,%xmm12,%xmm12 - vpshufb %xmm5,%xmm3,%xmm3 - vpor %xmm7,%xmm14,%xmm14 - vpinsrd $1,-8(%r10),%xmm4,%xmm4 - vpinsrd $1,-8(%r11),%xmm9,%xmm9 - vpaddd %xmm15,%xmm11,%xmm11 - vpslld $5,%xmm12,%xmm8 - vpandn %xmm10,%xmm13,%xmm7 - vpand %xmm14,%xmm13,%xmm6 - - vmovdqa %xmm3,208-128(%rax) - vpaddd %xmm3,%xmm11,%xmm11 - vpunpckldq %xmm9,%xmm4,%xmm4 - vpsrld $27,%xmm12,%xmm9 - vpxor %xmm7,%xmm6,%xmm6 - vmovd -4(%r8),%xmm0 - - vpslld $30,%xmm13,%xmm7 - vpor %xmm9,%xmm8,%xmm8 - vmovd -4(%r9),%xmm9 - vpaddd %xmm6,%xmm11,%xmm11 - - vpsrld $2,%xmm13,%xmm13 - vpaddd %xmm8,%xmm11,%xmm11 - vpshufb %xmm5,%xmm4,%xmm4 - vpor %xmm7,%xmm13,%xmm13 - vmovdqa 0-128(%rax),%xmm1 - vpinsrd $1,-4(%r10),%xmm0,%xmm0 - vpinsrd $1,-4(%r11),%xmm9,%xmm9 - vpaddd %xmm15,%xmm10,%xmm10 - prefetcht0 63(%r8) - vpslld $5,%xmm11,%xmm8 - vpandn %xmm14,%xmm12,%xmm7 - vpand %xmm13,%xmm12,%xmm6 - - vmovdqa %xmm4,224-128(%rax) - vpaddd %xmm4,%xmm10,%xmm10 - vpunpckldq %xmm9,%xmm0,%xmm0 - vpsrld $27,%xmm11,%xmm9 - prefetcht0 63(%r9) - vpxor %xmm7,%xmm6,%xmm6 - - vpslld $30,%xmm12,%xmm7 - vpor %xmm9,%xmm8,%xmm8 - prefetcht0 63(%r10) - vpaddd %xmm6,%xmm10,%xmm10 - - vpsrld $2,%xmm12,%xmm12 - vpaddd %xmm8,%xmm10,%xmm10 - prefetcht0 63(%r11) - vpshufb %xmm5,%xmm0,%xmm0 - vpor %xmm7,%xmm12,%xmm12 - vmovdqa 16-128(%rax),%xmm2 - vpxor %xmm3,%xmm1,%xmm1 - vmovdqa 32-128(%rax),%xmm3 - - vpaddd %xmm15,%xmm14,%xmm14 - vpslld $5,%xmm10,%xmm8 - vpandn %xmm13,%xmm11,%xmm7 - - vpand %xmm12,%xmm11,%xmm6 - - vmovdqa %xmm0,240-128(%rax) - vpaddd %xmm0,%xmm14,%xmm14 
- vpxor 128-128(%rax),%xmm1,%xmm1 - vpsrld $27,%xmm10,%xmm9 - vpxor %xmm7,%xmm6,%xmm6 - vpxor %xmm3,%xmm1,%xmm1 - - - vpslld $30,%xmm11,%xmm7 - vpor %xmm9,%xmm8,%xmm8 - vpaddd %xmm6,%xmm14,%xmm14 - - vpsrld $31,%xmm1,%xmm5 - vpaddd %xmm1,%xmm1,%xmm1 - - vpsrld $2,%xmm11,%xmm11 - - vpaddd %xmm8,%xmm14,%xmm14 - vpor %xmm5,%xmm1,%xmm1 - vpor %xmm7,%xmm11,%xmm11 - vpxor %xmm4,%xmm2,%xmm2 - vmovdqa 48-128(%rax),%xmm4 - - vpaddd %xmm15,%xmm13,%xmm13 - vpslld $5,%xmm14,%xmm8 - vpandn %xm |