aboutsummaryrefslogtreecommitdiffstats
path: root/crypto
diff options
context:
space:
mode:
Diffstat (limited to 'crypto')
-rw-r--r--crypto/asn1/tasn_dec.c14
-rw-r--r--crypto/bio/b_print.c187
-rw-r--r--crypto/bio/bio.h4
-rw-r--r--crypto/bio/bss_mem.c6
-rw-r--r--crypto/bn/Makefile4
-rwxr-xr-xcrypto/bn/asm/rsaz-avx2.pl219
-rwxr-xr-xcrypto/bn/asm/rsaz-x86_64.pl375
-rwxr-xr-xcrypto/bn/asm/x86_64-mont.pl227
-rwxr-xr-xcrypto/bn/asm/x86_64-mont5.pl1276
-rw-r--r--crypto/bn/bn.h14
-rw-r--r--crypto/bn/bn_exp.c103
-rw-r--r--crypto/bn/bn_print.c17
-rw-r--r--crypto/bn/bn_recp.c1
-rw-r--r--crypto/cmac/cmac.c8
-rw-r--r--crypto/cryptlib.c6
-rw-r--r--crypto/crypto.h2
-rw-r--r--crypto/dh/dh.h2
-rw-r--r--crypto/dh/dh_check.c7
-rw-r--r--crypto/dsa/dsa_ameth.c22
-rw-r--r--crypto/dso/dso_lib.c1
-rwxr-xr-xcrypto/ec/asm/ecp_nistz256-x86_64.pl11
-rw-r--r--crypto/ec/ecp_nistp224.c4
-rw-r--r--crypto/ec/ecp_nistp256.c4
-rw-r--r--crypto/ec/ecp_nistp521.c4
-rw-r--r--crypto/ec/ectest.c9
-rw-r--r--crypto/engine/eng_dyn.c4
-rw-r--r--crypto/evp/e_des.c11
-rw-r--r--crypto/evp/e_des3.c13
-rwxr-xr-xcrypto/modes/asm/aesni-gcm-x86_64.pl4
-rwxr-xr-xcrypto/modes/asm/ghash-x86_64.pl2
-rw-r--r--crypto/modes/ctr128.c41
-rw-r--r--crypto/opensslconf.h12
-rw-r--r--crypto/opensslv.h6
-rwxr-xr-xcrypto/perlasm/x86_64-xlate.pl7
-rw-r--r--crypto/pkcs7/pk7_smime.c17
-rw-r--r--crypto/rsa/rsa_sign.c4
-rw-r--r--crypto/srp/srp.h10
-rw-r--r--crypto/srp/srp_vfy.c57
-rw-r--r--crypto/stack/stack.c2
-rw-r--r--crypto/x509/x509_vfy.c70
40 files changed, 1741 insertions, 1046 deletions
diff --git a/crypto/asn1/tasn_dec.c b/crypto/asn1/tasn_dec.c
index 9256049d1588..5a507967c894 100644
--- a/crypto/asn1/tasn_dec.c
+++ b/crypto/asn1/tasn_dec.c
@@ -717,7 +717,7 @@ static int asn1_d2i_ex_primitive(ASN1_VALUE **pval,
long plen;
char cst, inf, free_cont = 0;
const unsigned char *p;
- BUF_MEM buf;
+ BUF_MEM buf = { 0, NULL, 0 };
const unsigned char *cont = NULL;
long len;
if (!pval) {
@@ -793,7 +793,6 @@ static int asn1_d2i_ex_primitive(ASN1_VALUE **pval,
} else {
len = p - cont + plen;
p += plen;
- buf.data = NULL;
}
} else if (cst) {
if (utype == V_ASN1_NULL || utype == V_ASN1_BOOLEAN
@@ -802,9 +801,9 @@ static int asn1_d2i_ex_primitive(ASN1_VALUE **pval,
ASN1err(ASN1_F_ASN1_D2I_EX_PRIMITIVE, ASN1_R_TYPE_NOT_PRIMITIVE);
return 0;
}
- buf.length = 0;
- buf.max = 0;
- buf.data = NULL;
+
+ /* Free any returned 'buf' content */
+ free_cont = 1;
/*
* Should really check the internal tags are correct but some things
* may get this wrong. The relevant specs say that constructed string
@@ -812,18 +811,16 @@ static int asn1_d2i_ex_primitive(ASN1_VALUE **pval,
* So instead just check for UNIVERSAL class and ignore the tag.
*/
if (!asn1_collect(&buf, &p, plen, inf, -1, V_ASN1_UNIVERSAL, 0)) {
- free_cont = 1;
goto err;
}
len = buf.length;
/* Append a final null to string */
if (!BUF_MEM_grow_clean(&buf, len + 1)) {
ASN1err(ASN1_F_ASN1_D2I_EX_PRIMITIVE, ERR_R_MALLOC_FAILURE);
- return 0;
+ goto err;
}
buf.data[len] = 0;
cont = (const unsigned char *)buf.data;
- free_cont = 1;
} else {
cont = p;
len = plen;
@@ -831,6 +828,7 @@ static int asn1_d2i_ex_primitive(ASN1_VALUE **pval,
}
/* We now have content length and type: translate into a structure */
+ /* asn1_ex_c2i may reuse allocated buffer, and so sets free_cont to 0 */
if (!asn1_ex_c2i(pval, cont, len, utype, &free_cont, it))
goto err;
diff --git a/crypto/bio/b_print.c b/crypto/bio/b_print.c
index 7c81e25d482c..90248fa2aaba 100644
--- a/crypto/bio/b_print.c
+++ b/crypto/bio/b_print.c
@@ -125,16 +125,16 @@
# define LLONG long
#endif
-static void fmtstr(char **, char **, size_t *, size_t *,
- const char *, int, int, int);
-static void fmtint(char **, char **, size_t *, size_t *,
- LLONG, int, int, int, int);
-static void fmtfp(char **, char **, size_t *, size_t *,
- LDOUBLE, int, int, int);
-static void doapr_outch(char **, char **, size_t *, size_t *, int);
-static void _dopr(char **sbuffer, char **buffer,
- size_t *maxlen, size_t *retlen, int *truncated,
- const char *format, va_list args);
+static int fmtstr(char **, char **, size_t *, size_t *,
+ const char *, int, int, int);
+static int fmtint(char **, char **, size_t *, size_t *,
+ LLONG, int, int, int, int);
+static int fmtfp(char **, char **, size_t *, size_t *,
+ LDOUBLE, int, int, int);
+static int doapr_outch(char **, char **, size_t *, size_t *, int);
+static int _dopr(char **sbuffer, char **buffer,
+ size_t *maxlen, size_t *retlen, int *truncated,
+ const char *format, va_list args);
/* format read states */
#define DP_S_DEFAULT 0
@@ -165,7 +165,7 @@ static void _dopr(char **sbuffer, char **buffer,
#define char_to_int(p) (p - '0')
#define OSSL_MAX(p,q) ((p >= q) ? p : q)
-static void
+static int
_dopr(char **sbuffer,
char **buffer,
size_t *maxlen,
@@ -196,7 +196,8 @@ _dopr(char **sbuffer,
if (ch == '%')
state = DP_S_FLAGS;
else
- doapr_outch(sbuffer, buffer, &currlen, maxlen, ch);
+ if(!doapr_outch(sbuffer, buffer, &currlen, maxlen, ch))
+ return 0;
ch = *format++;
break;
case DP_S_FLAGS:
@@ -302,8 +303,9 @@ _dopr(char **sbuffer,
value = va_arg(args, int);
break;
}
- fmtint(sbuffer, buffer, &currlen, maxlen,
- value, 10, min, max, flags);
+ if (!fmtint(sbuffer, buffer, &currlen, maxlen, value, 10, min,
+ max, flags))
+ return 0;
break;
case 'X':
flags |= DP_F_UP;
@@ -326,17 +328,19 @@ _dopr(char **sbuffer,
value = (LLONG) va_arg(args, unsigned int);
break;
}
- fmtint(sbuffer, buffer, &currlen, maxlen, value,
- ch == 'o' ? 8 : (ch == 'u' ? 10 : 16),
- min, max, flags);
+ if (!fmtint(sbuffer, buffer, &currlen, maxlen, value,
+ ch == 'o' ? 8 : (ch == 'u' ? 10 : 16),
+ min, max, flags))
+ return 0;
break;
case 'f':
if (cflags == DP_C_LDOUBLE)
fvalue = va_arg(args, LDOUBLE);
else
fvalue = va_arg(args, double);
- fmtfp(sbuffer, buffer, &currlen, maxlen,
- fvalue, min, max, flags);
+ if (!fmtfp(sbuffer, buffer, &currlen, maxlen, fvalue, min, max,
+ flags))
+ return 0;
break;
case 'E':
flags |= DP_F_UP;
@@ -355,8 +359,9 @@ _dopr(char **sbuffer,
fvalue = va_arg(args, double);
break;
case 'c':
- doapr_outch(sbuffer, buffer, &currlen, maxlen,
- va_arg(args, int));
+ if(!doapr_outch(sbuffer, buffer, &currlen, maxlen,
+ va_arg(args, int)))
+ return 0;
break;
case 's':
strvalue = va_arg(args, char *);
@@ -366,13 +371,15 @@ _dopr(char **sbuffer,
else
max = *maxlen;
}
- fmtstr(sbuffer, buffer, &currlen, maxlen, strvalue,
- flags, min, max);
+ if (!fmtstr(sbuffer, buffer, &currlen, maxlen, strvalue,
+ flags, min, max))
+ return 0;
break;
case 'p':
value = (long)va_arg(args, void *);
- fmtint(sbuffer, buffer, &currlen, maxlen,
- value, 16, min, max, flags | DP_F_NUM);
+ if (!fmtint(sbuffer, buffer, &currlen, maxlen,
+ value, 16, min, max, flags | DP_F_NUM))
+ return 0;
break;
case 'n': /* XXX */
if (cflags == DP_C_SHORT) {
@@ -394,7 +401,8 @@ _dopr(char **sbuffer,
}
break;
case '%':
- doapr_outch(sbuffer, buffer, &currlen, maxlen, ch);
+ if(!doapr_outch(sbuffer, buffer, &currlen, maxlen, ch))
+ return 0;
break;
case 'w':
/* not supported yet, treat as next char */
@@ -418,46 +426,56 @@ _dopr(char **sbuffer,
*truncated = (currlen > *maxlen - 1);
if (*truncated)
currlen = *maxlen - 1;
- doapr_outch(sbuffer, buffer, &currlen, maxlen, '\0');
+ if(!doapr_outch(sbuffer, buffer, &currlen, maxlen, '\0'))
+ return 0;
*retlen = currlen - 1;
- return;
+ return 1;
}
-static void
+static int
fmtstr(char **sbuffer,
char **buffer,
size_t *currlen,
size_t *maxlen, const char *value, int flags, int min, int max)
{
- int padlen, strln;
+ int padlen;
+ size_t strln;
int cnt = 0;
if (value == 0)
value = "<NULL>";
- for (strln = 0; value[strln]; ++strln) ;
+
+ strln = strlen(value);
+ if (strln > INT_MAX)
+ strln = INT_MAX;
+
padlen = min - strln;
- if (padlen < 0)
+ if (min < 0 || padlen < 0)
padlen = 0;
if (flags & DP_F_MINUS)
padlen = -padlen;
while ((padlen > 0) && (cnt < max)) {
- doapr_outch(sbuffer, buffer, currlen, maxlen, ' ');
+ if(!doapr_outch(sbuffer, buffer, currlen, maxlen, ' '))
+ return 0;
--padlen;
++cnt;
}
while (*value && (cnt < max)) {
- doapr_outch(sbuffer, buffer, currlen, maxlen, *value++);
+ if(!doapr_outch(sbuffer, buffer, currlen, maxlen, *value++))
+ return 0;
++cnt;
}
while ((padlen < 0) && (cnt < max)) {
- doapr_outch(sbuffer, buffer, currlen, maxlen, ' ');
+ if(!doapr_outch(sbuffer, buffer, currlen, maxlen, ' '))
+ return 0;
++padlen;
++cnt;
}
+ return 1;
}
-static void
+static int
fmtint(char **sbuffer,
char **buffer,
size_t *currlen,
@@ -517,37 +535,44 @@ fmtint(char **sbuffer,
/* spaces */
while (spadlen > 0) {
- doapr_outch(sbuffer, buffer, currlen, maxlen, ' ');
+ if(!doapr_outch(sbuffer, buffer, currlen, maxlen, ' '))
+ return 0;
--spadlen;
}
/* sign */
if (signvalue)
- doapr_outch(sbuffer, buffer, currlen, maxlen, signvalue);
+ if(!doapr_outch(sbuffer, buffer, currlen, maxlen, signvalue))
+ return 0;
/* prefix */
while (*prefix) {
- doapr_outch(sbuffer, buffer, currlen, maxlen, *prefix);
+ if(!doapr_outch(sbuffer, buffer, currlen, maxlen, *prefix))
+ return 0;
prefix++;
}
/* zeros */
if (zpadlen > 0) {
while (zpadlen > 0) {
- doapr_outch(sbuffer, buffer, currlen, maxlen, '0');
+ if(!doapr_outch(sbuffer, buffer, currlen, maxlen, '0'))
+ return 0;
--zpadlen;
}
}
/* digits */
- while (place > 0)
- doapr_outch(sbuffer, buffer, currlen, maxlen, convert[--place]);
+ while (place > 0) {
+ if (!doapr_outch(sbuffer, buffer, currlen, maxlen, convert[--place]))
+ return 0;
+ }
/* left justified spaces */
while (spadlen < 0) {
- doapr_outch(sbuffer, buffer, currlen, maxlen, ' ');
+ if (!doapr_outch(sbuffer, buffer, currlen, maxlen, ' '))
+ return 0;
++spadlen;
}
- return;
+ return 1;
}
static LDOUBLE abs_val(LDOUBLE value)
@@ -578,7 +603,7 @@ static long roundv(LDOUBLE value)
return intpart;
}
-static void
+static int
fmtfp(char **sbuffer,
char **buffer,
size_t *currlen,
@@ -657,47 +682,61 @@ fmtfp(char **sbuffer,
if ((flags & DP_F_ZERO) && (padlen > 0)) {
if (signvalue) {
- doapr_outch(sbuffer, buffer, currlen, maxlen, signvalue);
+ if (!doapr_outch(sbuffer, buffer, currlen, maxlen, signvalue))
+ return 0;
--padlen;
signvalue = 0;
}
while (padlen > 0) {
- doapr_outch(sbuffer, buffer, currlen, maxlen, '0');
+ if (!doapr_outch(sbuffer, buffer, currlen, maxlen, '0'))
+ return 0;
--padlen;
}
}
while (padlen > 0) {
- doapr_outch(sbuffer, buffer, currlen, maxlen, ' ');
+ if (!doapr_outch(sbuffer, buffer, currlen, maxlen, ' '))
+ return 0;
--padlen;
}
- if (signvalue)
- doapr_outch(sbuffer, buffer, currlen, maxlen, signvalue);
+ if (signvalue && !doapr_outch(sbuffer, buffer, currlen, maxlen, signvalue))
+ return 0;
- while (iplace > 0)
- doapr_outch(sbuffer, buffer, currlen, maxlen, iconvert[--iplace]);
+ while (iplace > 0) {
+ if (!doapr_outch(sbuffer, buffer, currlen, maxlen, iconvert[--iplace]))
+ return 0;
+ }
/*
* Decimal point. This should probably use locale to find the correct
* char to print out.
*/
if (max > 0 || (flags & DP_F_NUM)) {
- doapr_outch(sbuffer, buffer, currlen, maxlen, '.');
+ if (!doapr_outch(sbuffer, buffer, currlen, maxlen, '.'))
+ return 0;
- while (fplace > 0)
- doapr_outch(sbuffer, buffer, currlen, maxlen, fconvert[--fplace]);
+ while (fplace > 0) {
+ if(!doapr_outch(sbuffer, buffer, currlen, maxlen,
+ fconvert[--fplace]))
+ return 0;
+ }
}
while (zpadlen > 0) {
- doapr_outch(sbuffer, buffer, currlen, maxlen, '0');
+ if (!doapr_outch(sbuffer, buffer, currlen, maxlen, '0'))
+ return 0;
--zpadlen;
}
while (padlen < 0) {
- doapr_outch(sbuffer, buffer, currlen, maxlen, ' ');
+ if (!doapr_outch(sbuffer, buffer, currlen, maxlen, ' '))
+ return 0;
++padlen;
}
+ return 1;
}
-static void
+#define BUFFER_INC 1024
+
+static int
doapr_outch(char **sbuffer,
char **buffer, size_t *currlen, size_t *maxlen, int c)
{
@@ -708,24 +747,25 @@ doapr_outch(char **sbuffer,
assert(*currlen <= *maxlen);
if (buffer && *currlen == *maxlen) {
- *maxlen += 1024;
+ if (*maxlen > INT_MAX - BUFFER_INC)
+ return 0;
+
+ *maxlen += BUFFER_INC;
if (*buffer == NULL) {
*buffer = OPENSSL_malloc(*maxlen);
- if (!*buffer) {
- /* Panic! Can't really do anything sensible. Just return */
- return;
- }
+ if (*buffer == NULL)
+ return 0;
if (*currlen > 0) {
assert(*sbuffer != NULL);
memcpy(*buffer, *sbuffer, *currlen);
}
*sbuffer = NULL;
} else {
- *buffer = OPENSSL_realloc(*buffer, *maxlen);
- if (!*buffer) {
- /* Panic! Can't really do anything sensible. Just return */
- return;
- }
+ char *tmpbuf;
+ tmpbuf = OPENSSL_realloc(*buffer, *maxlen);
+ if (tmpbuf == NULL)
+ return 0;
+ *buffer = tmpbuf;
}
}
@@ -736,7 +776,7 @@ doapr_outch(char **sbuffer,
(*buffer)[(*currlen)++] = (char)c;
}
- return;
+ return 1;
}
/***************************************************************************/
@@ -768,7 +808,11 @@ int BIO_vprintf(BIO *bio, const char *format, va_list args)
dynbuf = NULL;
CRYPTO_push_info("doapr()");
- _dopr(&hugebufp, &dynbuf, &hugebufsize, &retlen, &ignored, format, args);
+ if (!_dopr(&hugebufp, &dynbuf, &hugebufsize, &retlen, &ignored, format,
+ args)) {
+ OPENSSL_free(dynbuf);
+ return -1;
+ }
if (dynbuf) {
ret = BIO_write(bio, dynbuf, (int)retlen);
OPENSSL_free(dynbuf);
@@ -803,7 +847,8 @@ int BIO_vsnprintf(char *buf, size_t n, const char *format, va_list args)
size_t retlen;
int truncated;
- _dopr(&buf, NULL, &n, &retlen, &truncated, format, args);
+ if(!_dopr(&buf, NULL, &n, &retlen, &truncated, format, args))
+ return -1;
if (truncated)
/*
diff --git a/crypto/bio/bio.h b/crypto/bio/bio.h
index 6e2293bc66da..6790aed28e0b 100644
--- a/crypto/bio/bio.h
+++ b/crypto/bio/bio.h
@@ -479,7 +479,7 @@ struct bio_dgram_sctp_prinfo {
# define BIO_get_conn_hostname(b) BIO_ptr_ctrl(b,BIO_C_GET_CONNECT,0)
# define BIO_get_conn_port(b) BIO_ptr_ctrl(b,BIO_C_GET_CONNECT,1)
# define BIO_get_conn_ip(b) BIO_ptr_ctrl(b,BIO_C_GET_CONNECT,2)
-# define BIO_get_conn_int_port(b) BIO_ctrl(b,BIO_C_GET_CONNECT,3,0,NULL)
+# define BIO_get_conn_int_port(b) BIO_ctrl(b,BIO_C_GET_CONNECT,3,NULL)
# define BIO_set_nbio(b,n) BIO_ctrl(b,BIO_C_SET_NBIO,(n),NULL)
@@ -689,7 +689,7 @@ long BIO_debug_callback(BIO *bio, int cmd, const char *argp, int argi,
long argl, long ret);
BIO_METHOD *BIO_s_mem(void);
-BIO *BIO_new_mem_buf(void *buf, int len);
+BIO *BIO_new_mem_buf(const void *buf, int len);
BIO_METHOD *BIO_s_socket(void);
BIO_METHOD *BIO_s_connect(void);
BIO_METHOD *BIO_s_accept(void);
diff --git a/crypto/bio/bss_mem.c b/crypto/bio/bss_mem.c
index d190765dc201..b0394a960da1 100644
--- a/crypto/bio/bss_mem.c
+++ b/crypto/bio/bss_mem.c
@@ -91,7 +91,8 @@ BIO_METHOD *BIO_s_mem(void)
return (&mem_method);
}
-BIO *BIO_new_mem_buf(void *buf, int len)
+
+BIO *BIO_new_mem_buf(const void *buf, int len)
{
BIO *ret;
BUF_MEM *b;
@@ -105,7 +106,8 @@ BIO *BIO_new_mem_buf(void *buf, int len)
if (!(ret = BIO_new(BIO_s_mem())))
return NULL;
b = (BUF_MEM *)ret->ptr;
- b->data = buf;
+ /* Cast away const and trust in the MEM_RDONLY flag. */
+ b->data = (void *)buf;
b->length = sz;
b->max = sz;
ret->flags |= BIO_FLAGS_MEM_RDONLY;
diff --git a/crypto/bn/Makefile b/crypto/bn/Makefile
index 215855ecae91..c4c640951759 100644
--- a/crypto/bn/Makefile
+++ b/crypto/bn/Makefile
@@ -252,8 +252,8 @@ bn_exp.o: ../../include/openssl/e_os2.h ../../include/openssl/err.h
bn_exp.o: ../../include/openssl/lhash.h ../../include/openssl/opensslconf.h
bn_exp.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h
bn_exp.o: ../../include/openssl/safestack.h ../../include/openssl/stack.h
-bn_exp.o: ../../include/openssl/symhacks.h ../cryptlib.h bn_exp.c bn_lcl.h
-bn_exp.o: rsaz_exp.h
+bn_exp.o: ../../include/openssl/symhacks.h ../constant_time_locl.h
+bn_exp.o: ../cryptlib.h bn_exp.c bn_lcl.h rsaz_exp.h
bn_exp2.o: ../../e_os.h ../../include/openssl/bio.h ../../include/openssl/bn.h
bn_exp2.o: ../../include/openssl/buffer.h ../../include/openssl/crypto.h
bn_exp2.o: ../../include/openssl/e_os2.h ../../include/openssl/err.h
diff --git a/crypto/bn/asm/rsaz-avx2.pl b/crypto/bn/asm/rsaz-avx2.pl
index 3b6ccf83d13e..712a77fe8ca3 100755
--- a/crypto/bn/asm/rsaz-avx2.pl
+++ b/crypto/bn/asm/rsaz-avx2.pl
@@ -443,7 +443,7 @@ $TEMP2 = $B2;
$TEMP3 = $Y1;
$TEMP4 = $Y2;
$code.=<<___;
- #we need to fix indexes 32-39 to avoid overflow
+ # we need to fix indices 32-39 to avoid overflow
vmovdqu 32*8(%rsp), $ACC8 # 32*8-192($tp0),
vmovdqu 32*9(%rsp), $ACC1 # 32*9-192($tp0)
vmovdqu 32*10(%rsp), $ACC2 # 32*10-192($tp0)
@@ -1592,68 +1592,128 @@ rsaz_1024_scatter5_avx2:
.type rsaz_1024_gather5_avx2,\@abi-omnipotent
.align 32
rsaz_1024_gather5_avx2:
+ vzeroupper
+ mov %rsp,%r11
___
$code.=<<___ if ($win64);
lea -0x88(%rsp),%rax
- vzeroupper
.LSEH_begin_rsaz_1024_gather5:
# I can't trust assembler to use specific encoding:-(
- .byte 0x48,0x8d,0x60,0xe0 #lea -0x20(%rax),%rsp
- .byte 0xc5,0xf8,0x29,0x70,0xe0 #vmovaps %xmm6,-0x20(%rax)
- .byte 0xc5,0xf8,0x29,0x78,0xf0 #vmovaps %xmm7,-0x10(%rax)
- .byte 0xc5,0x78,0x29,0x40,0x00 #vmovaps %xmm8,0(%rax)
- .byte 0xc5,0x78,0x29,0x48,0x10 #vmovaps %xmm9,0x10(%rax)
- .byte 0xc5,0x78,0x29,0x50,0x20 #vmovaps %xmm10,0x20(%rax)
- .byte 0xc5,0x78,0x29,0x58,0x30 #vmovaps %xmm11,0x30(%rax)
- .byte 0xc5,0x78,0x29,0x60,0x40 #vmovaps %xmm12,0x40(%rax)
- .byte 0xc5,0x78,0x29,0x68,0x50 #vmovaps %xmm13,0x50(%rax)
- .byte 0xc5,0x78,0x29,0x70,0x60 #vmovaps %xmm14,0x60(%rax)
- .byte 0xc5,0x78,0x29,0x78,0x70 #vmovaps %xmm15,0x70(%rax)
+ .byte 0x48,0x8d,0x60,0xe0 # lea -0x20(%rax),%rsp
+ .byte 0xc5,0xf8,0x29,0x70,0xe0 # vmovaps %xmm6,-0x20(%rax)
+ .byte 0xc5,0xf8,0x29,0x78,0xf0 # vmovaps %xmm7,-0x10(%rax)
+ .byte 0xc5,0x78,0x29,0x40,0x00 # vmovaps %xmm8,0(%rax)
+ .byte 0xc5,0x78,0x29,0x48,0x10 # vmovaps %xmm9,0x10(%rax)
+ .byte 0xc5,0x78,0x29,0x50,0x20 # vmovaps %xmm10,0x20(%rax)
+ .byte 0xc5,0x78,0x29,0x58,0x30 # vmovaps %xmm11,0x30(%rax)
+ .byte 0xc5,0x78,0x29,0x60,0x40 # vmovaps %xmm12,0x40(%rax)
+ .byte 0xc5,0x78,0x29,0x68,0x50 # vmovaps %xmm13,0x50(%rax)
+ .byte 0xc5,0x78,0x29,0x70,0x60 # vmovaps %xmm14,0x60(%rax)
+ .byte 0xc5,0x78,0x29,0x78,0x70 # vmovaps %xmm15,0x70(%rax)
___
$code.=<<___;
- lea .Lgather_table(%rip),%r11
- mov $power,%eax
- and \$3,$power
- shr \$2,%eax # cache line number
- shl \$4,$power # offset within cache line
-
- vmovdqu -32(%r11),%ymm7 # .Lgather_permd
- vpbroadcastb 8(%r11,%rax), %xmm8
- vpbroadcastb 7(%r11,%rax), %xmm9
- vpbroadcastb 6(%r11,%rax), %xmm10
- vpbroadcastb 5(%r11,%rax), %xmm11
- vpbroadcastb 4(%r11,%rax), %xmm12
- vpbroadcastb 3(%r11,%rax), %xmm13
- vpbroadcastb 2(%r11,%rax), %xmm14
- vpbroadcastb 1(%r11,%rax), %xmm15
-
- lea 64($inp,$power),$inp
- mov \$64,%r11 # size optimization
- mov \$9,%eax
- jmp .Loop_gather_1024
+ lea -0x100(%rsp),%rsp
+ and \$-32, %rsp
+ lea .Linc(%rip), %r10
+ lea -128(%rsp),%rax # control u-op density
+
+ vmovd $power, %xmm4
+ vmovdqa (%r10),%ymm0
+ vmovdqa 32(%r10),%ymm1
+ vmovdqa 64(%r10),%ymm5
+ vpbroadcastd %xmm4,%ymm4
+
+ vpaddd %ymm5, %ymm0, %ymm2
+ vpcmpeqd %ymm4, %ymm0, %ymm0
+ vpaddd %ymm5, %ymm1, %ymm3
+ vpcmpeqd %ymm4, %ymm1, %ymm1
+ vmovdqa %ymm0, 32*0+128(%rax)
+ vpaddd %ymm5, %ymm2, %ymm0
+ vpcmpeqd %ymm4, %ymm2, %ymm2
+ vmovdqa %ymm1, 32*1+128(%rax)
+ vpaddd %ymm5, %ymm3, %ymm1
+ vpcmpeqd %ymm4, %ymm3, %ymm3
+ vmovdqa %ymm2, 32*2+128(%rax)
+ vpaddd %ymm5, %ymm0, %ymm2
+ vpcmpeqd %ymm4, %ymm0, %ymm0
+ vmovdqa %ymm3, 32*3+128(%rax)
+ vpaddd %ymm5, %ymm1, %ymm3
+ vpcmpeqd %ymm4, %ymm1, %ymm1
+ vmovdqa %ymm0, 32*4+128(%rax)
+ vpaddd %ymm5, %ymm2, %ymm8
+ vpcmpeqd %ymm4, %ymm2, %ymm2
+ vmovdqa %ymm1, 32*5+128(%rax)
+ vpaddd %ymm5, %ymm3, %ymm9
+ vpcmpeqd %ymm4, %ymm3, %ymm3
+ vmovdqa %ymm2, 32*6+128(%rax)
+ vpaddd %ymm5, %ymm8, %ymm10
+ vpcmpeqd %ymm4, %ymm8, %ymm8
+ vmovdqa %ymm3, 32*7+128(%rax)
+ vpaddd %ymm5, %ymm9, %ymm11
+ vpcmpeqd %ymm4, %ymm9, %ymm9
+ vpaddd %ymm5, %ymm10, %ymm12
+ vpcmpeqd %ymm4, %ymm10, %ymm10
+ vpaddd %ymm5, %ymm11, %ymm13
+ vpcmpeqd %ymm4, %ymm11, %ymm11
+ vpaddd %ymm5, %ymm12, %ymm14
+ vpcmpeqd %ymm4, %ymm12, %ymm12
+ vpaddd %ymm5, %ymm13, %ymm15
+ vpcmpeqd %ymm4, %ymm13, %ymm13
+ vpcmpeqd %ymm4, %ymm14, %ymm14
+ vpcmpeqd %ymm4, %ymm15, %ymm15
+
+ vmovdqa -32(%r10),%ymm7 # .Lgather_permd
+ lea 128($inp), $inp
+ mov \$9,$power
-.align 32
.Loop_gather_1024:
- vpand -64($inp), %xmm8,%xmm0
- vpand ($inp), %xmm9,%xmm1
- vpand 64($inp), %xmm10,%xmm2
- vpand ($inp,%r11,2), %xmm11,%xmm3
- vpor %xmm0,%xmm1,%xmm1
- vpand 64($inp,%r11,2), %xmm12,%xmm4
- vpor %xmm2,%xmm3,%xmm3
- vpand ($inp,%r11,4), %xmm13,%xmm5
- vpor %xmm1,%xmm3,%xmm3
- vpand 64($inp,%r11,4), %xmm14,%xmm6
- vpor %xmm4,%xmm5,%xmm5
- vpand -128($inp,%r11,8), %xmm15,%xmm2
- lea ($inp,%r11,8),$inp
- vpor %xmm3,%xmm5,%xmm5
- vpor %xmm2,%xmm6,%xmm6
- vpor %xmm5,%xmm6,%xmm6
- vpermd %ymm6,%ymm7,%ymm6
- vmovdqu %ymm6,($out)
+ vmovdqa 32*0-128($inp), %ymm0
+ vmovdqa 32*1-128($inp), %ymm1
+ vmovdqa 32*2-128($inp), %ymm2
+ vmovdqa 32*3-128($inp), %ymm3
+ vpand 32*0+128(%rax), %ymm0, %ymm0
+ vpand 32*1+128(%rax), %ymm1, %ymm1
+ vpand 32*2+128(%rax), %ymm2, %ymm2
+ vpor %ymm0, %ymm1, %ymm4
+ vpand 32*3+128(%rax), %ymm3, %ymm3
+ vmovdqa 32*4-128($inp), %ymm0
+ vmovdqa 32*5-128($inp), %ymm1
+ vpor %ymm2, %ymm3, %ymm5
+ vmovdqa 32*6-128($inp), %ymm2
+ vmovdqa 32*7-128($inp), %ymm3
+ vpand 32*4+128(%rax), %ymm0, %ymm0
+ vpand 32*5+128(%rax), %ymm1, %ymm1
+ vpand 32*6+128(%rax), %ymm2, %ymm2
+ vpor %ymm0, %ymm4, %ymm4
+ vpand 32*7+128(%rax), %ymm3, %ymm3
+ vpand 32*8-128($inp), %ymm8, %ymm0
+ vpor %ymm1, %ymm5, %ymm5
+ vpand 32*9-128($inp), %ymm9, %ymm1
+ vpor %ymm2, %ymm4, %ymm4
+ vpand 32*10-128($inp),%ymm10, %ymm2
+ vpor %ymm3, %ymm5, %ymm5
+ vpand 32*11-128($inp),%ymm11, %ymm3
+ vpor %ymm0, %ymm4, %ymm4
+ vpand 32*12-128($inp),%ymm12, %ymm0
+ vpor %ymm1, %ymm5, %ymm5
+ vpand 32*13-128($inp),%ymm13, %ymm1
+ vpor %ymm2, %ymm4, %ymm4
+ vpand 32*14-128($inp),%ymm14, %ymm2
+ vpor %ymm3, %ymm5, %ymm5
+ vpand 32*15-128($inp),%ymm15, %ymm3
+ lea 32*16($inp), $inp
+ vpor %ymm0, %ymm4, %ymm4
+ vpor %ymm1, %ymm5, %ymm5
+ vpor %ymm2, %ymm4, %ymm4
+ vpor %ymm3, %ymm5, %ymm5
+
+ vpor %ymm5, %ymm4, %ymm4
+ vextracti128 \$1, %ymm4, %xmm5 # upper half is cleared
+ vpor %xmm4, %xmm5, %xmm5
+ vpermd %ymm5,%ymm7,%ymm5
+ vmovdqu %ymm5,($out)
lea 32($out),$out
- dec %eax
+ dec $power
jnz .Loop_gather_1024
vpxor %ymm0,%ymm0,%ymm0
@@ -1661,20 +1721,20 @@ $code.=<<___;
vzeroupper
___
$code.=<<___ if ($win64);
- movaps (%rsp),%xmm6
- movaps 0x10(%rsp),%xmm7
- movaps 0x20(%rsp),%xmm8
- movaps 0x30(%rsp),%xmm9
- movaps 0x40(%rsp),%xmm10
- movaps 0x50(%rsp),%xmm11
- movaps 0x60(%rsp),%xmm12
- movaps 0x70(%rsp),%xmm13
- movaps 0x80(%rsp),%xmm14
- movaps 0x90(%rsp),%xmm15
- lea 0xa8(%rsp),%rsp
+ movaps -0xa8(%r11),%xmm6
+ movaps -0x98(%r11),%xmm7
+ movaps -0x88(%r11),%xmm8
+ movaps -0x78(%r11),%xmm9
+ movaps -0x68(%r11),%xmm10
+ movaps -0x58(%r11),%xmm11
+ movaps -0x48(%r11),%xmm12
+ movaps -0x38(%r11),%xmm13
+ movaps -0x28(%r11),%xmm14
+ movaps -0x18(%r11),%xmm15
.LSEH_end_rsaz_1024_gather5:
___
$code.=<<___;
+ lea (%r11),%rsp
ret
.size rsaz_1024_gather5_avx2,.-rsaz_1024_gather5_avx2
___
@@ -1708,8 +1768,10 @@ $code.=<<___;
.long 0,2,4,6,7,7,7,7
.Lgather_permd:
.long 0,7,1,7,2,7,3,7
-.Lgather_table:
- .byte 0,0,0,0,0,0,0,0, 0xff,0,0,0,0,0,0,0
+.Linc:
+ .long 0,0,0,0, 1,1,1,1
+ .long 2,2,2,2, 3,3,3,3
+ .long 4,4,4,4, 4,4,4,4
.align 64
___
@@ -1837,18 +1899,19 @@ rsaz_se_handler:
.rva rsaz_se_handler
.rva .Lmul_1024_body,.Lmul_1024_epilogue
.LSEH_info_rsaz_1024_gather5:
- .byte 0x01,0x33,0x16,0x00
- .byte 0x36,0xf8,0x09,0x00 #vmovaps 0x90(rsp),xmm15
- .byte 0x31,0xe8,0x08,0x00 #vmovaps 0x80(rsp),xmm14
- .byte 0x2c,0xd8,0x07,0x00 #vmovaps 0x70(rsp),xmm13
- .byte 0x27,0xc8,0x06,0x00 #vmovaps 0x60(rsp),xmm12
- .byte 0x22,0xb8,0x05,0x00 #vmovaps 0x50(rsp),xmm11
- .byte 0x1d,0xa8,0x04,0x00 #vmovaps 0x40(rsp),xmm10
- .byte 0x18,0x98,0x03,0x00 #vmovaps 0x30(rsp),xmm9
- .byte 0x13,0x88,0x02,0x00 #vmovaps 0x20(rsp),xmm8
- .byte 0x0e,0x78,0x01,0x00 #vmovaps 0x10(rsp),xmm7
- .byte 0x09,0x68,0x00,0x00 #vmovaps 0x00(rsp),xmm6
- .byte 0x04,0x01,0x15,0x00 #sub rsp,0xa8
+ .byte 0x01,0x36,0x17,0x0b
+ .byte 0x36,0xf8,0x09,0x00 # vmovaps 0x90(rsp),xmm15
+ .byte 0x31,0xe8,0x08,0x00 # vmovaps 0x80(rsp),xmm14
+ .byte 0x2c,0xd8,0x07,0x00 # vmovaps 0x70(rsp),xmm13
+ .byte 0x27,0xc8,0x06,0x00 # vmovaps 0x60(rsp),xmm12
+ .byte 0x22,0xb8,0x05,0x00 # vmovaps 0x50(rsp),xmm11
+ .byte 0x1d,0xa8,0x04,0x00 # vmovaps 0x40(rsp),xmm10
+ .byte 0x18,0x98,0x03,0x00 # vmovaps 0x30(rsp),xmm9
+ .byte 0x13,0x88,0x02,0x00 # vmovaps 0x20(rsp),xmm8
+ .byte 0x0e,0x78,0x01,0x00 # vmovaps 0x10(rsp),xmm7
+ .byte 0x09,0x68,0x00,0x00 # vmovaps 0x00(rsp),xmm6
+ .byte 0x04,0x01,0x15,0x00 # sub rsp,0xa8
+ .byte 0x00,0xb3,0x00,0x00 # set_frame r11
___
}
diff --git a/crypto/bn/asm/rsaz-x86_64.pl b/crypto/bn/asm/rsaz-x86_64.pl
index 091cdc2069da..87ce2c34d90c 100755
--- a/crypto/bn/asm/rsaz-x86_64.pl
+++ b/crypto/bn/asm/rsaz-x86_64.pl
@@ -915,9 +915,76 @@ rsaz_512_mul_gather4:
push %r14
push %r15
- mov $pwr, $pwr
- subq \$128+24, %rsp
+ subq \$`128+24+($win64?0xb0:0)`, %rsp
+___
+$code.=<<___ if ($win64);
+ movaps %xmm6,0xa0(%rsp)
+ movaps %xmm7,0xb0(%rsp)
+ movaps %xmm8,0xc0(%rsp)
+ movaps %xmm9,0xd0(%rsp)
+ movaps %xmm10,0xe0(%rsp)
+ movaps %xmm11,0xf0(%rsp)
+ movaps %xmm12,0x100(%rsp)
+ movaps %xmm13,0x110(%rsp)
+ movaps %xmm14,0x120(%rsp)
+ movaps %xmm15,0x130(%rsp)
+___
+$code.=<<___;
.Lmul_gather4_body:
+ movd $pwr,%xmm8
+ movdqa .Linc+16(%rip),%xmm1 # 00000002000000020000000200000002
+ movdqa .Linc(%rip),%xmm0 # 00000001000000010000000000000000
+
+ pshufd \$0,%xmm8,%xmm8 # broadcast $power
+ movdqa %xmm1,%xmm7
+ movdqa %xmm1,%xmm2
+___
+########################################################################
+# calculate mask by comparing 0..15 to $power
+#
+for($i=0;$i<4;$i++) {
+$code.=<<___;
+ paddd %xmm`$i`,%xmm`$i+1`
+ pcmpeqd %xmm8,%xmm`$i`
+ movdqa %xmm7,%xmm`$i+3`
+___
+}
+for(;$i<7;$i++) {
+$code.=<<___;
+ paddd %xmm`$i`,%xmm`$i+1`
+ pcmpeqd %xmm8,%xmm`$i`
+___
+}
+$code.=<<___;
+ pcmpeqd %xmm8,%xmm7
+
+ movdqa 16*0($bp),%xmm8
+ movdqa 16*1($bp),%xmm9
+ movdqa 16*2($bp),%xmm10
+ movdqa 16*3($bp),%xmm11
+ pand %xmm0,%xmm8
+ movdqa 16*4($bp),%xmm12
+ pand %xmm1,%xmm9
+ movdqa 16*5($bp),%xmm13
+ pand %xmm2,%xmm10
+ movdqa 16*6($bp),%xmm14
+ pand %xmm3,%xmm11
+ movdqa 16*7($bp),%xmm15
+ leaq 128($bp), %rbp
+ pand %xmm4,%xmm12
+ pand %xmm5,%xmm13
+ pand %xmm6,%xmm14
+ pand %xmm7,%xmm15
+ por %xmm10,%xmm8
+ por %xmm11,%xmm9
+ por %xmm12,%xmm8
+ por %xmm13,%xmm9
+ por %xmm14,%xmm8
+ por %xmm15,%xmm9
+
+ por %xmm9,%xmm8
+ pshufd \$0x4e,%xmm8,%xmm9
+ por %xmm9,%xmm8
___
$code.=<<___ if ($addx);
movl \$0x80100,%r11d
@@ -926,45 +993,38 @@ $code.=<<___ if ($addx);
je .Lmulx_gather
___
$code.=<<___;
- movl 64($bp,$pwr,4), %eax
- movq $out, %xmm0 # off-load arguments
- movl ($bp,$pwr,4), %ebx
- movq $mod, %xmm1
- movq $n0, 128(%rsp)
+ movq %xmm8,%rbx
+
+ movq $n0, 128(%rsp) # off-load arguments
+ movq $out, 128+8(%rsp)
+ movq $mod, 128+16(%rsp)
- shlq \$32, %rax
- or %rax, %rbx
movq ($ap), %rax
movq 8($ap), %rcx
- leaq 128($bp,$pwr,4), %rbp
mulq %rbx # 0 iteration
movq %rax, (%rsp)
movq %rcx, %rax
movq %rdx, %r8
mulq %rbx
- movd (%rbp), %xmm4
addq %rax, %r8
movq 16($ap), %rax
movq %rdx, %r9
adcq \$0, %r9
mulq %rbx
- movd 64(%rbp), %xmm5
addq %rax, %r9
movq 24($ap), %rax
movq %rdx, %r10
adcq \$0, %r10
mulq %rbx
- pslldq \$4, %xmm5
addq %rax, %r10
movq 32($ap), %rax
movq %rdx, %r11
adcq \$0, %r11
mulq %rbx
- por %xmm5, %xmm4
addq %rax, %r11
movq 40($ap), %rax
movq %rdx, %r12
@@ -977,14 +1037,12 @@ $code.=<<___;
adcq \$0, %r13
mulq %rbx
- leaq 128(%rbp), %rbp
addq %rax, %r13
movq 56($ap), %rax
movq %rdx, %r14
adcq \$0, %r14
mulq %rbx
- movq %xmm4, %rbx
addq %rax, %r14
movq ($ap), %rax
movq %rdx, %r15
@@ -996,6 +1054,35 @@ $code.=<<___;
.align 32
.Loop_mul_gather:
+ movdqa 16*0(%rbp),%xmm8
+ movdqa 16*1(%rbp),%xmm9
+ movdqa 16*2(%rbp),%xmm10
+ movdqa 16*3(%rbp),%xmm11
+ pand %xmm0,%xmm8
+ movdqa 16*4(%rbp),%xmm12
+ pand %xmm1,%xmm9
+ movdqa 16*5(%rbp),%xmm13
+ pand %xmm2,%xmm10
+ movdqa 16*6(%rbp),%xmm14
+ pand %xmm3,%xmm11
+ movdqa 16*7(%rbp),%xmm15
+ leaq 128(%rbp), %rbp
+ pand %xmm4,%xmm12
+ pand %xmm5,%xmm13
+ pand %xmm6,%xmm14
+ pand %xmm7,%xmm15
+ por %xmm10,%xmm8
+ por %xmm11,%xmm9
+ por %xmm12,%xmm8
+ por %xmm13,%xmm9
+ por %xmm14,%xmm8
+ por %xmm15,%xmm9
+
+ por %xmm9,%xmm8
+ pshufd \$0x4e,%xmm8,%xmm9
+ por %xmm9,%xmm8
+ movq %xmm8,%rbx
+
mulq %rbx
addq %rax, %r8
movq 8($ap), %rax
@@ -1004,7 +1091,6 @@ $code.=<<___;
adcq \$0, %r8
mulq %rbx
- movd (%rbp), %xmm4
addq %rax, %r9
movq 16($ap), %rax
adcq \$0, %rdx
@@ -1013,7 +1099,6 @@ $code.=<<___;
adcq \$0, %r9
mulq %rbx
- movd 64(%rbp), %xmm5
addq %rax, %r10
movq 24($ap), %rax
adcq \$0, %rdx
@@ -1022,7 +1107,6 @@ $code.=<<___;
adcq \$0, %r10
mulq %rbx
- pslldq \$4, %xmm5
addq %rax, %r11
movq 32($ap), %rax
adcq \$0, %rdx
@@ -1031,7 +1115,6 @@ $code.=<<___;
adcq \$0, %r11
mulq %rbx
- por %xmm5, %xmm4
addq %rax, %r12
movq 40($ap), %rax
adcq \$0, %rdx
@@ -1056,7 +1139,6 @@ $code.=<<___;
adcq \$0, %r14
mulq %rbx
- movq %xmm4, %rbx
addq %rax, %r15
movq ($ap), %rax
adcq \$0, %rdx
@@ -1064,7 +1146,6 @@ $code.=<<___;
movq %rdx, %r15
adcq \$0, %r15
- leaq 128(%rbp), %rbp
leaq 8(%rdi), %rdi
decl %ecx
@@ -1079,8 +1160,8 @@ $code.=<<___;
movq %r14, 48(%rdi)
movq %r15, 56(%rdi)
- movq %xmm0, $out
- movq %xmm1, %rbp
+ movq 128+8(%rsp), $out
+ movq 128+16(%rsp), %rbp
movq (%rsp), %r8
movq 8(%rsp), %r9
@@ -1098,45 +1179,37 @@ $code.=<<___ if ($addx);
.align 32
.Lmulx_gather:
- mov 64($bp,$pwr,4), %eax
- movq $out, %xmm0 # off-load arguments
- lea 128($bp,$pwr,4), %rbp
- mov ($bp,$pwr,4), %edx
- movq $mod, %xmm1
- mov $n0, 128(%rsp)
+ movq %xmm8,%rdx
+
+ mov $n0, 128(%rsp) # off-load arguments
+ mov $out, 128+8(%rsp)
+ mov $mod, 128+16(%rsp)
- shl \$32, %rax
- or %rax, %rdx
mulx ($ap), %rbx, %r8 # 0 iteration
mov %rbx, (%rsp)
xor %edi, %edi # cf=0, of=0
mulx 8($ap), %rax, %r9
- movd (%rbp), %xmm4
mulx 16($ap), %rbx, %r10
- movd 64(%rbp), %xmm5
adcx %rax, %r8
mulx 24($ap), %rax, %r11
- pslldq \$4, %xmm5
adcx %rbx, %r9
mulx 32($ap), %rbx, %r12
- por %xmm5, %xmm4
adcx %rax, %r10
mulx 40($ap), %rax, %r13
adcx %rbx, %r11
mulx 48($ap), %rbx, %r14
- lea 128(%rbp), %rbp
adcx %rax, %r12
mulx 56($ap), %rax, %r15
- movq %xmm4, %rdx
adcx %rbx, %r13
adcx %rax, %r14
+ .byte 0x67
mov %r8, %rbx
adcx %rdi, %r15 # %rdi is 0
@@ -1145,24 +1218,48 @@ $code.=<<___ if ($addx);
.align 32
.Loop_mulx_gather:
- mulx ($ap), %rax, %r8
+ movdqa 16*0(%rbp),%xmm8
+ movdqa 16*1(%rbp),%xmm9
+ movdqa 16*2(%rbp),%xmm10
+ movdqa 16*3(%rbp),%xmm11
+ pand %xmm0,%xmm8
+ movdqa 16*4(%rbp),%xmm12
+ pand %xmm1,%xmm9
+ movdqa 16*5(%rbp),%xmm13
+ pand %xmm2,%xmm10
+ movdqa 16*6(%rbp),%xmm14
+ pand %xmm3,%xmm11
+ movdqa 16*7(%rbp),%xmm15
+ leaq 128(%rbp), %rbp
+ pand %xmm4,%xmm12
+ pand %xmm5,%xmm13
+ pand %xmm6,%xmm14
+ pand %xmm7,%xmm15
+ por %xmm10,%xmm8
+ por %xmm11,%xmm9
+ por %xmm12,%xmm8
+ por %xmm13,%xmm9
+ por %xmm14,%xmm8
+ por %xmm15,%xmm9
+
+ por %xmm9,%xmm8
+ pshufd \$0x4e,%xmm8,%xmm9
+ por %xmm9,%xmm8
+ movq %xmm8,%rdx
+
+ .byte 0xc4,0x62,0xfb,0xf6,0x86,0x00,0x00,0x00,0x00 # mulx ($ap), %rax, %r8
adcx %rax, %rbx
adox %r9, %r8
mulx 8($ap), %rax, %r9
- .byte 0x66,0x0f,0x6e,0xa5,0x00,0x00,0x00,0x00 # movd (%rbp), %xmm4
adcx %rax, %r8
adox %r10, %r9
mulx 16($ap), %rax, %r10
- movd 64(%rbp), %xmm5
- lea 128(%rbp), %rbp
adcx %rax, %r9
adox %r11, %r10
.byte 0xc4,0x62,0xfb,0xf6,0x9e,0x18,0x00,0x00,0x00 # mulx 24($ap), %rax, %r11
- pslldq \$4, %xmm5
- por %xmm5, %xmm4
adcx %rax, %r10
adox %r12, %r11
@@ -1176,10 +1273,10 @@ $code.=<<___ if ($addx);
.byte 0xc4,0x62,0xfb,0xf6,0xb6,0x30,0x00,0x00,0x00 # mulx 48($ap), %rax, %r14
adcx %rax, %r13
+ .byte 0x67
adox %r15, %r14
mulx 56($ap), %rax, %r15
- movq %xmm4, %rdx
mov %rbx, 64(%rsp,%rcx,8)
adcx %rax, %r14
adox %rdi, %r15
@@ -1198,10 +1295,10 @@ $code.=<<___ if ($addx);
mov %r14, 64+48(%rsp)
mov %r15, 64+56(%rsp)
- movq %xmm0, $out
- movq %xmm1, %rbp
+ mov 128(%rsp), %rdx # pull arguments
+ mov 128+8(%rsp), $out
+ mov 128+16(%rsp), %rbp
- mov 128(%rsp), %rdx # pull $n0
mov (%rsp), %r8
mov 8(%rsp), %r9
mov 16(%rsp), %r10
@@ -1229,6 +1326,21 @@ $code.=<<___;
call __rsaz_512_subtract
leaq 128+24+48(%rsp), %rax
+___
+$code.=<<___ if ($win64);
+ movaps 0xa0-0xc8(%rax),%xmm6
+ movaps 0xb0-0xc8(%rax),%xmm7
+ movaps 0xc0-0xc8(%rax),%xmm8
+ movaps 0xd0-0xc8(%rax),%xmm9
+ movaps 0xe0-0xc8(%rax),%xmm10
+ movaps 0xf0-0xc8(%rax),%xmm11
+ movaps 0x100-0xc8(%rax),%xmm12
+ movaps 0x110-0xc8(%rax),%xmm13
+ movaps 0x120-0xc8(%rax),%xmm14
+ movaps 0x130-0xc8(%rax),%xmm15
+ lea 0xb0(%rax),%rax
+___
+$code.=<<___;
movq -48(%rax), %r15
movq -40(%rax), %r14
movq -32(%rax), %r13
@@ -1258,7 +1370,7 @@ rsaz_512_mul_scatter4:
mov $pwr, $pwr
subq \$128+24, %rsp
.Lmul_scatter4_body:
- leaq ($tbl,$pwr,4), $tbl
+ leaq ($tbl,$pwr,8), $tbl
movq $out, %xmm0 # off-load arguments
movq $mod, %xmm1
movq $tbl, %xmm2
@@ -1329,30 +1441,14 @@ $code.=<<___;
call __rsaz_512_subtract
- movl %r8d, 64*0($inp) # scatter
- shrq \$32, %r8
- movl %r9d, 64*2($inp)
- shrq \$32, %r9
- movl %r10d, 64*4($inp)
- shrq \$32, %r10
- movl %r11d, 64*6($inp)
- shrq \$32, %r11
- movl %r12d, 64*8($inp)
- shrq \$32, %r12
- movl %r13d, 64*10($inp)
- shrq \$32, %r13
- movl %r14d, 64*12($inp)
- shrq \$32, %r14
- movl %r15d, 64*14($inp)
- shrq \$32, %r15
- movl %r8d, 64*1($inp)
- movl %r9d, 64*3($inp)
- movl %r10d, 64*5($inp)
- movl %r11d, 64*7($inp)
- movl %r12d, 64*9($inp)
- movl %r13d, 64*11($inp)
- movl %r14d, 64*13($inp)
- movl %r15d, 64*15($inp)
+ movq %r8, 128*0($inp) # scatter
+ movq %r9, 128*1($inp)
+ movq %r10, 128*2($inp)
+ movq %r11, 128*3($inp)
+ movq %r12, 128*4($inp)
+ movq %r13, 128*5($inp)
+ movq %r14, 128*6($inp)
+ movq %r15, 128*7($inp)
leaq 128+24+48(%rsp), %rax
movq -48(%rax), %r15
@@ -1956,16 +2052,14 @@ $code.=<<___;
.type rsaz_512_scatter4,\@abi-omnipotent
.align 16
rsaz_512_scatter4:
- leaq ($out,$power,4), $out
+ leaq ($out,$power,8), $out
movl \$8, %r9d
jmp .Loop_scatter
.align 16
.Loop_scatter:
movq ($inp), %rax
leaq 8($inp), $inp
- movl %eax, ($out)
- shrq \$32, %rax
- movl %eax, 64($out)
+ movq %rax, ($out)
leaq 128($out), $out
decl %r9d
jnz .Loop_scatter
@@ -1976,22 +2070,106 @@ rsaz_512_scatter4:
.type rsaz_512_gather4,\@abi-omnipotent
.align 16
rsaz_512_gather4:
- leaq ($inp,$power,4), $inp
+___
+$code.=<<___ if ($win64);
+.LSEH_begin_rsaz_512_gather4:
+ .byte 0x48,0x81,0xec,0xa8,0x00,0x00,0x00 # sub $0xa8,%rsp
+ .byte 0x0f,0x29,0x34,0x24 # movaps %xmm6,(%rsp)
+ .byte 0x0f,0x29,0x7c,0x24,0x10 # movaps %xmm7,0x10(%rsp)
+ .byte 0x44,0x0f,0x29,0x44,0x24,0x20 # movaps %xmm8,0x20(%rsp)
+ .byte 0x44,0x0f,0x29,0x4c,0x24,0x30 # movaps %xmm9,0x30(%rsp)
+ .byte 0x44,0x0f,0x29,0x54,0x24,0x40 # movaps %xmm10,0x40(%rsp)
+ .byte 0x44,0x0f,0x29,0x5c,0x24,0x50 # movaps %xmm11,0x50(%rsp)
+ .byte 0x44,0x0f,0x29,0x64,0x24,0x60 # movaps %xmm12,0x60(%rsp)
+ .byte 0x44,0x0f,0x29,0x6c,0x24,0x70 # movaps %xmm13,0x70(%rsp)
+ .byte 0x44,0x0f,0x29,0xb4,0x24,0x80,0,0,0 # movaps %xmm14,0x80(%rsp)
+ .byte 0x44,0x0f,0x29,0xbc,0x24,0x90,0,0,0 # movaps %xmm15,0x90(%rsp)
+___
+$code.=<<___;
+ movd $power,%xmm8
+ movdqa .Linc+16(%rip),%xmm1 # 00000002000000020000000200000002
+ movdqa .Linc(%rip),%xmm0 # 00000001000000010000000000000000
+
+ pshufd \$0,%xmm8,%xmm8 # broadcast $power
+ movdqa %xmm1,%xmm7
+ movdqa %xmm1,%xmm2
+___
+########################################################################
+# calculate mask by comparing 0..15 to $power
+#
+for($i=0;$i<4;$i++) {
+$code.=<<___;
+ paddd %xmm`$i`,%xmm`$i+1`
+ pcmpeqd %xmm8,%xmm`$i`
+ movdqa %xmm7,%xmm`$i+3`
+___
+}
+for(;$i<7;$i++) {
+$code.=<<___;
+ paddd %xmm`$i`,%xmm`$i+1`
+ pcmpeqd %xmm8,%xmm`$i`
+___
+}
+$code.=<<___;
+ pcmpeqd %xmm8,%xmm7
movl \$8, %r9d
jmp .Loop_gather
.align 16
.Loop_gather:
- movl ($inp), %eax
- movl 64($inp), %r8d
+ movdqa 16*0($inp),%xmm8
+ movdqa 16*1($inp),%xmm9
+ movdqa 16*2($inp),%xmm10
+ movdqa 16*3($inp),%xmm11
+ pand %xmm0,%xmm8
+ movdqa 16*4($inp),%xmm12
+ pand %xmm1,%xmm9
+ movdqa 16*5($inp),%xmm13
+ pand %xmm2,%xmm10
+ movdqa 16*6($inp),%xmm14
+ pand %xmm3,%xmm11
+ movdqa 16*7($inp),%xmm15
leaq 128($inp), $inp
- shlq \$32, %r8
- or %r8, %rax
- movq %rax, ($out)
+ pand %xmm4,%xmm12
+ pand %xmm5,%xmm13
+ pand %xmm6,%xmm14
+ pand %xmm7,%xmm15
+ por %xmm10,%xmm8
+ por %xmm11,%xmm9
+ por %xmm12,%xmm8
+ por %xmm13,%xmm9
+ por %xmm14,%xmm8
+ por %xmm15,%xmm9
+
+ por %xmm9,%xmm8
+ pshufd \$0x4e,%xmm8,%xmm9
+ por %xmm9,%xmm8
+ movq %xmm8,($out)
leaq 8($out), $out
decl %r9d
jnz .Loop_gather
+___
+$code.=<<___ if ($win64);
+ movaps 0x00(%rsp),%xmm6
+ movaps 0x10(%rsp),%xmm7
+ movaps 0x20(%rsp),%xmm8
+ movaps 0x30(%rsp),%xmm9
+ movaps 0x40(%rsp),%xmm10
+ movaps 0x50(%rsp),%xmm11
+ movaps 0x60(%rsp),%xmm12
+ movaps 0x70(%rsp),%xmm13
+ movaps 0x80(%rsp),%xmm14
+ movaps 0x90(%rsp),%xmm15
+ add \$0xa8,%rsp
+___
+$code.=<<___;
ret
+.LSEH_end_rsaz_512_gather4:
.size rsaz_512_gather4,.-rsaz_512_gather4
+
+.align 64
+.Linc:
+ .long 0,0, 1,1
+ .long 2,2, 2,2
___
}
@@ -2039,6 +2217,18 @@ se_handler:
lea 128+24+48(%rax),%rax
+ lea .Lmul_gather4_epilogue(%rip),%rbx
+ cmp %r10,%rbx
+ jne .Lse_not_in_mul_gather4
+
+ lea 0xb0(%rax),%rax
+
+ lea -48-0xa8(%rax),%rsi
+ lea 512($context),%rdi
+ mov \$20,%ecx
+ .long 0xa548f3fc # cld; rep movsq
+
+.Lse_not_in_mul_gather4:
mov -8(%rax),%rbx
mov -16(%rax),%rbp
mov -24(%rax),%r12
@@ -2090,7 +2280,7 @@ se_handler:
pop %rdi
pop %rsi
ret
-.size sqr_handler,.-sqr_handler
+.size se_handler,.-se_handler
.section .pdata
.align 4
@@ -2114,6 +2304,10 @@ se_handler:
.rva .LSEH_end_rsaz_512_mul_by_one
.rva .LSEH_info_rsaz_512_mul_by_one
+ .rva .LSEH_begin_rsaz_512_gather4
+ .rva .LSEH_end_rsaz_512_gather4
+ .rva .LSEH_info_rsaz_512_gather4
+
.section .xdata
.align 8
.LSEH_info_rsaz_512_sqr:
@@ -2136,6 +2330,19 @@ se_handler:
.byte 9,0,0,0
.rva se_handler
.rva .Lmul_by_one_body,.Lmul_by_one_epilogue # HandlerData[]
+.LSEH_info_rsaz_512_gather4:
+ .byte 0x01,0x46,0x16,0x00
+ .byte 0x46,0xf8,0x09,0x00 # vmovaps 0x90(rsp),xmm15
+ .byte 0x3d,0xe8,0x08,0x00 # vmovaps 0x80(rsp),xmm14
+ .byte 0x34,0xd8,0x07,0x00 # vmovaps 0x70(rsp),xmm13
+ .byte 0x2e,0xc8,0x06,0x00 # vmovaps 0x60(rsp),xmm12
+ .byte 0x28,0xb8,0x05,0x00 # vmovaps 0x50(rsp),xmm11
+ .byte 0x22,0xa8,0x04,0x00 # vmovaps 0x40(rsp),xmm10
+ .byte 0x1c,0x98,0x03,0x00 # vmovaps 0x30(rsp),xmm9
+ .byte 0x16,0x88,0x02,0x00 # vmovaps 0x20(rsp),xmm8
+ .byte 0x10,0x78,0x01,0x00 # vmovaps 0x10(rsp),xmm7
+ .byte 0x0b,0x68,0x00,0x00 # vmovaps 0x00(rsp),xmm6
+ .byte 0x07,0x01,0x15,0x00 # sub rsp,0xa8
___
}
diff --git a/crypto/bn/asm/x86_64-mont.pl b/crypto/bn/asm/x86_64-mont.pl
index e82e451388c7..29ba1224e36b 100755
--- a/crypto/bn/asm/x86_64-mont.pl
+++ b/crypto/bn/asm/x86_64-mont.pl
@@ -775,100 +775,126 @@ bn_sqr8x_mont:
# 4096. this is done to allow memory disambiguation logic
# do its job.
#
- lea -64(%rsp,$num,4),%r11
+ lea -64(%rsp,$num,2),%r11
mov ($n0),$n0 # *n0
sub $aptr,%r11
and \$4095,%r11
cmp %r11,%r10
jb .Lsqr8x_sp_alt
sub %r11,%rsp # align with $aptr
- lea -64(%rsp,$num,4),%rsp # alloca(frame+4*$num)
+ lea -64(%rsp,$num,2),%rsp # alloca(frame+2*$num)
jmp .Lsqr8x_sp_done
.align 32
.Lsqr8x_sp_alt:
- lea 4096-64(,$num,4),%r10 # 4096-frame-4*$num
- lea -64(%rsp,$num,4),%rsp # alloca(frame+4*$num)
+ lea 4096-64(,$num,2),%r10 # 4096-frame-2*$num
+ lea -64(%rsp,$num,2),%rsp # alloca(frame+2*$num)
sub %r10,%r11
mov \$0,%r10
cmovc %r10,%r11
sub %r11,%rsp
.Lsqr8x_sp_done:
and \$-64,%rsp
- mov $num,%r10
+ mov $num,%r10
neg $num
- lea 64(%rsp,$num,2),%r11 # copy of modulus
mov $n0, 32(%rsp)
mov %rax, 40(%rsp) # save original %rsp
.Lsqr8x_body:
- mov $num,$i
- movq %r11, %xmm2 # save pointer to modulus copy
- shr \$3+2,$i
- mov OPENSSL_ia32cap_P+8(%rip),%eax
- jmp .Lsqr8x_copy_n
-
-.align 32
-.Lsqr8x_copy_n:
- movq 8*0($nptr),%xmm0
- movq 8*1($nptr),%xmm1
- movq 8*2($nptr),%xmm3
- movq 8*3($nptr),%xmm4
- lea 8*4($nptr),$nptr
- movdqa %xmm0,16*0(%r11)
- movdqa %xmm1,16*1(%r11)
- movdqa %xmm3,16*2(%r11)
- movdqa %xmm4,16*3(%r11)
- lea 16*4(%r11),%r11
- dec $i
- jnz .Lsqr8x_copy_n
-
+ movq $nptr, %xmm2 # save pointer to modulus
pxor %xmm0,%xmm0
movq $rptr,%xmm1 # save $rptr
movq %r10, %xmm3 # -$num
___
$code.=<<___ if ($addx);
+ mov OPENSSL_ia32cap_P+8(%rip),%eax
and \$0x80100,%eax
cmp \$0x80100,%eax
jne .Lsqr8x_nox
call bn_sqrx8x_internal # see x86_64-mont5 module
-
- pxor %xmm0,%xmm0
- lea 48(%rsp),%rax
- lea 64(%rsp,$num,2),%rdx
- shr \$3+2,$num
- mov 40(%rsp),%rsi # restore %rsp
- jmp .Lsqr8x_zero
+ # %rax top-most carry
+ # %rbp nptr
+ # %rcx -8*num
+ # %r8 end of tp[2*num]
+ lea (%r8,%rcx),%rbx
+ mov %rcx,$num
+ mov %rcx,%rdx
+ movq %xmm1,$rptr
+ sar \$3+2,%rcx # %cf=0
+ jmp .Lsqr8x_sub
.align 32
.Lsqr8x_nox:
___
$code.=<<___;
call bn_sqr8x_internal # see x86_64-mont5 module
+ # %rax top-most carry
+ # %rbp nptr
+ # %r8 -8*num
+ # %rdi end of tp[2*num]
+ lea (%rdi,$num),%rbx
+ mov $num,%rcx
+ mov $num,%rdx
+ movq %xmm1,$rptr
+ sar \$3+2,%rcx # %cf=0
+ jmp .Lsqr8x_sub
+.align 32
+.Lsqr8x_sub:
+ mov 8*0(%rbx),%r12
+ mov 8*1(%rbx),%r13
+ mov 8*2(%rbx),%r14
+ mov 8*3(%rbx),%r15
+ lea 8*4(%rbx),%rbx
+ sbb 8*0(%rbp),%r12
+ sbb 8*1(%rbp),%r13
+ sbb 8*2(%rbp),%r14
+ sbb 8*3(%rbp),%r15
+ lea 8*4(%rbp),%rbp
+ mov %r12,8*0($rptr)
+ mov %r13,8*1($rptr)
+ mov %r14,8*2($rptr)
+ mov %r15,8*3($rptr)
+ lea 8*4($rptr),$rptr
+ inc %rcx # preserves %cf
+ jnz .Lsqr8x_sub
+
+ sbb \$0,%rax # top-most carry
+ lea (%rbx,$num),%rbx # rewind
+ lea ($rptr,$num),$rptr # rewind
+
+ movq %rax,%xmm1
pxor %xmm0,%xmm0
- lea 48(%rsp),%rax
- lea 64(%rsp,$num,2),%rdx
- shr \$3+2,$num
+ pshufd \$0,%xmm1,%xmm1
mov 40(%rsp),%rsi # restore %rsp
- jmp .Lsqr8x_zero
+ jmp .Lsqr8x_cond_copy
.align 32
-.Lsqr8x_zero:
- movdqa %xmm0,16*0(%rax) # wipe t
- movdqa %xmm0,16*1(%rax)
- movdqa %xmm0,16*2(%rax)
- movdqa %xmm0,16*3(%rax)
- lea 16*4(%rax),%rax
- movdqa %xmm0,16*0(%rdx) # wipe n
- movdqa %xmm0,16*1(%rdx)
- movdqa %xmm0,16*2(%rdx)
- movdqa %xmm0,16*3(%rdx)
- lea 16*4(%rdx),%rdx
- dec $num
- jnz .Lsqr8x_zero
+.Lsqr8x_cond_copy:
+ movdqa 16*0(%rbx),%xmm2
+ movdqa 16*1(%rbx),%xmm3
+ lea 16*2(%rbx),%rbx
+ movdqu 16*0($rptr),%xmm4
+ movdqu 16*1($rptr),%xmm5
+ lea 16*2($rptr),$rptr
+ movdqa %xmm0,-16*2(%rbx) # zero tp
+ movdqa %xmm0,-16*1(%rbx)
+ movdqa %xmm0,-16*2(%rbx,%rdx)
+ movdqa %xmm0,-16*1(%rbx,%rdx)
+ pcmpeqd %xmm1,%xmm0
+ pand %xmm1,%xmm2
+ pand %xmm1,%xmm3
+ pand %xmm0,%xmm4
+ pand %xmm0,%xmm5
+ pxor %xmm0,%xmm0
+ por %xmm2,%xmm4
+ por %xmm3,%xmm5
+ movdqu %xmm4,-16*2($rptr)
+ movdqu %xmm5,-16*1($rptr)
+ add \$32,$num
+ jnz .Lsqr8x_cond_copy
mov \$1,%rax
mov -48(%rsi),%r15
@@ -1135,64 +1161,75 @@ $code.=<<___;
adc $zero,%r15 # modulo-scheduled
sub 0*8($tptr),$zero # pull top-most carry
adc %r15,%r14
- mov -8($nptr),$mi
sbb %r15,%r15 # top-most carry
mov %r14,-1*8($tptr)
cmp 16(%rsp),$bptr
jne .Lmulx4x_outer
- sub %r14,$mi # compare top-most words
- sbb $mi,$mi
- or $mi,%r15
-
- neg $num
- xor %rdx,%rdx
+ lea 64(%rsp),$tptr
+ sub $num,$nptr # rewind $nptr
+ neg %r15
+ mov $num,%rdx
+ shr \$3+2,$num # %cf=0
mov 32(%rsp),$rptr # restore rp
+ jmp .Lmulx4x_sub
+
+.align 32
+.Lmulx4x_sub:
+ mov 8*0($tptr),%r11
+ mov 8*1($tptr),%r12
+ mov 8*2($tptr),%r13
+ mov 8*3($tptr),%r14
+ lea 8*4($tptr),$tptr
+ sbb 8*0($nptr),%r11
+ sbb 8*1($nptr),%r12
+ sbb 8*2($nptr),%r13
+ sbb 8*3($nptr),%r14
+ lea 8*4($nptr),$nptr
+ mov %r11,8*0($rptr)
+ mov %r12,8*1($rptr)
+ mov %r13,8*2($rptr)
+ mov %r14,8*3($rptr)
+ lea 8*4($rptr),$rptr
+ dec $num # preserves %cf
+ jnz .Lmulx4x_sub
+
+ sbb \$0,%r15 # top-most carry
lea 64(%rsp),$tptr
+ sub %rdx,$rptr # rewind
+ movq %r15,%xmm1
pxor %xmm0,%xmm0
- mov 0*8($nptr,$num),%r8
- mov 1*8($nptr,$num),%r9
- neg %r8
- jmp .Lmulx4x_sub_entry
+ pshufd \$0,%xmm1,%xmm1
+ mov 40(%rsp),%rsi # restore %rsp
+ jmp .Lmulx4x_cond_copy
.align 32
-.Lmulx4x_sub:
- mov 0*8($nptr,$num),%r8
- mov 1*8($nptr,$num),%r9
- not %r8
-.Lmulx4x_sub_entry:
- mov 2*8($nptr,$num),%r10
- not %r9
- and %r15,%r8
- mov 3*8($nptr,$num),%r11
- not %r10
- and %r15,%r9
- not %r11
- and %r15,%r10
- and %r15,%r11
-
- neg %rdx # mov %rdx,%cf
- adc 0*8($tptr),%r8
- adc 1*8($tptr),%r9
- movdqa %xmm0,($tptr)
- adc 2*8($tptr),%r10
- adc 3*8($tptr),%r11
- movdqa %xmm0,16($tptr)
- lea 4*8($tptr),$tptr
- sbb %rdx,%rdx # mov %cf,%rdx
+.Lmulx4x_cond_copy:
+ movdqa 16*0($tptr),%xmm2
+ movdqa 16*1($tptr),%xmm3
+ lea 16*2($tptr),$tptr
+ movdqu 16*0($rptr),%xmm4
+ movdqu 16*1($rptr),%xmm5
+ lea 16*2($rptr),$rptr
+ movdqa %xmm0,-16*2($tptr) # zero tp
+ movdqa %xmm0,-16*1($tptr)
+ pcmpeqd %xmm1,%xmm0
+ pand %xmm1,%xmm2
+ pand %xmm1,%xmm3
+ pand %xmm0,%xmm4
+ pand %xmm0,%xmm5
+ pxor %xmm0,%xmm0
+ por %xmm2,%xmm4
+ por %xmm3,%xmm5
+ movdqu %xmm4,-16*2($rptr)
+ movdqu %xmm5,-16*1($rptr)
+ sub \$32,%rdx
+ jnz .Lmulx4x_cond_copy
- mov %r8,0*8($rptr)
- mov %r9,1*8($rptr)
- mov %r10,2*8($rptr)
- mov %r11,3*8($rptr)
- lea 4*8($rptr),$rptr
+ mov %rdx,($tptr)
- add \$32,$num
- jnz .Lmulx4x_sub
-
- mov 40(%rsp),%rsi # restore %rsp
mov \$1,%rax
mov -48(%rsi),%r15
mov -40(%rsi),%r14
diff --git a/crypto/bn/asm/x86_64-mont5.pl b/crypto/bn/asm/x86_64-mont5.pl
index 292409c4ffb8..2e8c9db32cbc 100755
--- a/crypto/bn/asm/x86_64-mont5.pl
+++ b/crypto/bn/asm/x86_64-mont5.pl
@@ -99,58 +99,111 @@ $code.=<<___;
.Lmul_enter:
mov ${num}d,${num}d
mov %rsp,%rax
- mov `($win64?56:8)`(%rsp),%r10d # load 7th argument
+ movd `($win64?56:8)`(%rsp),%xmm5 # load 7th argument
+ lea .Linc(%rip),%r10
push %rbx
push %rbp
push %r12
push %r13
push %r14
push %r15
-___
-$code.=<<___ if ($win64);
- lea -0x28(%rsp),%rsp
- movaps %xmm6,(%rsp)
- movaps %xmm7,0x10(%rsp)
-___
-$code.=<<___;
+
lea 2($num),%r11
neg %r11
- lea (%rsp,%r11,8),%rsp # tp=alloca(8*(num+2))
+ lea -264(%rsp,%r11,8),%rsp # tp=alloca(8*(num+2)+256+8)
and \$-1024,%rsp # minimize TLB usage
mov %rax,8(%rsp,$num,8) # tp[num+1]=%rsp
.Lmul_body:
- mov $bp,%r12 # reassign $bp
+ lea 128($bp),%r12 # reassign $bp (+size optimization)
___
$bp="%r12";
$STRIDE=2**5*8; # 5 is "window size"
$N=$STRIDE/4; # should match cache line size
$code.=<<___;
- mov %r10,%r11
- shr \$`log($N/8)/log(2)`,%r10
- and \$`$N/8-1`,%r11
- not %r10
- lea .Lmagic_masks(%rip),%rax
- and \$`2**5/($N/8)-1`,%r10 # 5 is "window size"
- lea 96($bp,%r11,8),$bp # pointer within 1st cache line
- movq 0(%rax,%r10,8),%xmm4 # set of masks denoting which
- movq 8(%rax,%r10,8),%xmm5 # cache line contains element
- movq 16(%rax,%r10,8),%xmm6 # denoted by 7th argument
- movq 24(%rax,%r10,8),%xmm7
-
- movq `0*$STRIDE/4-96`($bp),%xmm0
- movq `1*$STRIDE/4-96`($bp),%xmm1
- pand %xmm4,%xmm0
- movq `2*$STRIDE/4-96`($bp),%xmm2
- pand %xmm5,%xmm1
- movq `3*$STRIDE/4-96`($bp),%xmm3
- pand %xmm6,%xmm2
- por %xmm1,%xmm0
- pand %xmm7,%xmm3
+ movdqa 0(%r10),%xmm0 # 00000001000000010000000000000000
+ movdqa 16(%r10),%xmm1 # 00000002000000020000000200000002
+ lea 24-112(%rsp,$num,8),%r10# place the mask after tp[num+3] (+ICache optimization)
+ and \$-16,%r10
+
+ pshufd \$0,%xmm5,%xmm5 # broadcast index
+ movdqa %xmm1,%xmm4
+ movdqa %xmm1,%xmm2
+___
+########################################################################
+# calculate mask by comparing 0..31 to index and save result to stack
+#
+$code.=<<___;
+ paddd %xmm0,%xmm1
+ pcmpeqd %xmm5,%xmm0 # compare to 1,0
+ .byte 0x67
+ movdqa %xmm4,%xmm3
+___
+for($k=0;$k<$STRIDE/16-4;$k+=4) {
+$code.=<<___;
+ paddd %xmm1,%xmm2
+ pcmpeqd %xmm5,%xmm1 # compare to 3,2
+ movdqa %xmm0,`16*($k+0)+112`(%r10)
+ movdqa %xmm4,%xmm0
+
+ paddd %xmm2,%xmm3
+ pcmpeqd %xmm5,%xmm2 # compare to 5,4
+ movdqa %xmm1,`16*($k+1)+112`(%r10)
+ movdqa %xmm4,%xmm1
+
+ paddd %xmm3,%xmm0
+ pcmpeqd %xmm5,%xmm3 # compare to 7,6
+ movdqa %xmm2,`16*($k+2)+112`(%r10)
+ movdqa %xmm4,%xmm2
+
+ paddd %xmm0,%xmm1
+ pcmpeqd %xmm5,%xmm0
+ movdqa %xmm3,`16*($k+3)+112`(%r10)
+ movdqa %xmm4,%xmm3
+___
+}
+$code.=<<___; # last iteration can be optimized
+ paddd %xmm1,%xmm2
+ pcmpeqd %xmm5,%xmm1
+ movdqa %xmm0,`16*($k+0)+112`(%r10)
+
+ paddd %xmm2,%xmm3
+ .byte 0x67
+ pcmpeqd %xmm5,%xmm2
+ movdqa %xmm1,`16*($k+1)+112`(%r10)
+
+ pcmpeqd %xmm5,%xmm3
+ movdqa %xmm2,`16*($k+2)+112`(%r10)
+ pand `16*($k+0)-128`($bp),%xmm0 # while it's still in register
+
+ pand `16*($k+1)-128`($bp),%xmm1
+ pand `16*($k+2)-128`($bp),%xmm2
+ movdqa %xmm3,`16*($k+3)+112`(%r10)
+ pand `16*($k+3)-128`($bp),%xmm3
por %xmm2,%xmm0
+ por %xmm3,%xmm1
+___
+for($k=0;$k<$STRIDE/16-4;$k+=4) {
+$code.=<<___;
+ movdqa `16*($k+0)-128`($bp),%xmm4
+ movdqa `16*($k+1)-128`($bp),%xmm5
+ movdqa `16*($k+2)-128`($bp),%xmm2
+ pand `16*($k+0)+112`(%r10),%xmm4
+ movdqa `16*($k+3)-128`($bp),%xmm3
+ pand `16*($k+1)+112`(%r10),%xmm5
+ por %xmm4,%xmm0
+ pand `16*($k+2)+112`(%r10),%xmm2
+ por %xmm5,%xmm1
+ pand `16*($k+3)+112`(%r10),%xmm3
+ por %xmm2,%xmm0
+ por %xmm3,%xmm1
+___
+}
+$code.=<<___;
+ por %xmm1,%xmm0
+ pshufd \$0x4e,%xmm0,%xmm1
+ por %xmm1,%xmm0
lea $STRIDE($bp),$bp
- por %xmm3,%xmm0
-
movq %xmm0,$m0 # m0=bp[0]
mov ($n0),$n0 # pull n0[0] value
@@ -159,29 +212,14 @@ $code.=<<___;
xor $i,$i # i=0
xor $j,$j # j=0
- movq `0*$STRIDE/4-96`($bp),%xmm0
- movq `1*$STRIDE/4-96`($bp),%xmm1
- pand %xmm4,%xmm0
- movq `2*$STRIDE/4-96`($bp),%xmm2
- pand %xmm5,%xmm1
-
mov $n0,$m1
mulq $m0 # ap[0]*bp[0]
mov %rax,$lo0
mov ($np),%rax
- movq `3*$STRIDE/4-96`($bp),%xmm3
- pand %xmm6,%xmm2
- por %xmm1,%xmm0
- pand %xmm7,%xmm3
-
imulq $lo0,$m1 # "tp[0]"*n0
mov %rdx,$hi0
- por %xmm2,%xmm0
- lea $STRIDE($bp),$bp
- por %xmm3,%xmm0
-
mulq $m1 # np[0]*m1
add %rax,$lo0 # discarded
mov 8($ap),%rax
@@ -212,16 +250,14 @@ $code.=<<___;
mulq $m1 # np[j]*m1
cmp $num,$j
- jne .L1st
-
- movq %xmm0,$m0 # bp[1]
+ jne .L1st # note that upon exit $j==$num, so
+ # they can be used interchangeably
add %rax,$hi1
- mov ($ap),%rax # ap[0]
adc \$0,%rdx
add $hi0,$hi1 # np[j]*m1+ap[j]*bp[0]
adc \$0,%rdx
- mov $hi1,-16(%rsp,$j,8) # tp[j-1]
+ mov $hi1,-16(%rsp,$num,8) # tp[num-1]
mov %rdx,$hi1
mov $lo0,$hi0
@@ -235,33 +271,48 @@ $code.=<<___;
jmp .Louter
.align 16
.Louter:
+ lea 24+128(%rsp,$num,8),%rdx # where 256-byte mask is (+size optimization)
+ and \$-16,%rdx
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
+___
+for($k=0;$k<$STRIDE/16;$k+=4) {
+$code.=<<___;
+ movdqa `16*($k+0)-128`($bp),%xmm0
+ movdqa `16*($k+1)-128`($bp),%xmm1
+ movdqa `16*($k+2)-128`($bp),%xmm2
+ movdqa `16*($k+3)-128`($bp),%xmm3
+ pand `16*($k+0)-128`(%rdx),%xmm0
+ pand `16*($k+1)-128`(%rdx),%xmm1
+ por %xmm0,%xmm4
+ pand `16*($k+2)-128`(%rdx),%xmm2
+ por %xmm1,%xmm5
+ pand `16*($k+3)-128`(%rdx),%xmm3
+ por %xmm2,%xmm4
+ por %xmm3,%xmm5
+___
+}
+$code.=<<___;
+ por %xmm5,%xmm4
+ pshufd \$0x4e,%xmm4,%xmm0
+ por %xmm4,%xmm0
+ lea $STRIDE($bp),$bp
+
+ mov ($ap),%rax # ap[0]
+ movq %xmm0,$m0 # m0=bp[i]
+
xor $j,$j # j=0
mov $n0,$m1
mov (%rsp),$lo0
- movq `0*$STRIDE/4-96`($bp),%xmm0
- movq `1*$STRIDE/4-96`($bp),%xmm1
- pand %xmm4,%xmm0
- movq `2*$STRIDE/4-96`($bp),%xmm2
- pand %xmm5,%xmm1
-
mulq $m0 # ap[0]*bp[i]
add %rax,$lo0 # ap[0]*bp[i]+tp[0]
mov ($np),%rax
adc \$0,%rdx
- movq `3*$STRIDE/4-96`($bp),%xmm3
- pand %xmm6,%xmm2
- por %xmm1,%xmm0
- pand %xmm7,%xmm3
-
imulq $lo0,$m1 # tp[0]*n0
mov %rdx,$hi0
- por %xmm2,%xmm0
- lea $STRIDE($bp),$bp
- por %xmm3,%xmm0
-
mulq $m1 # np[0]*m1
add %rax,$lo0 # discarded
mov 8($ap),%rax
@@ -295,17 +346,14 @@ $code.=<<___;
mulq $m1 # np[j]*m1
cmp $num,$j
- jne .Linner
-
- movq %xmm0,$m0 # bp[i+1]
-
+ jne .Linner # note that upon exit $j==$num, so
+ # they can be used interchangeably
add %rax,$hi1
- mov ($ap),%rax # ap[0]
adc \$0,%rdx
add $lo0,$hi1 # np[j]*m1+ap[j]*bp[i]+tp[j]
- mov (%rsp,$j,8),$lo0
+ mov (%rsp,$num,8),$lo0
adc \$0,%rdx
- mov $hi1,-16(%rsp,$j,8) # tp[j-1]
+ mov $hi1,-16(%rsp,$num,8) # tp[num-1]
mov %rdx,$hi1
xor %rdx,%rdx
@@ -352,12 +400,7 @@ $code.=<<___;
mov 8(%rsp,$num,8),%rsi # restore %rsp
mov \$1,%rax
-___
-$code.=<<___ if ($win64);
- movaps -88(%rsi),%xmm6
- movaps -72(%rsi),%xmm7
-___
-$code.=<<___;
+
mov -48(%rsi),%r15
mov -40(%rsi),%r14
mov -32(%rsi),%r13
@@ -379,8 +422,8 @@ bn_mul4x_mont_gather5:
.Lmul4x_enter:
___
$code.=<<___ if ($addx);
- and \$0x80100,%r11d
- cmp \$0x80100,%r11d
+ and \$0x80108,%r11d
+ cmp \$0x80108,%r11d # check for AD*X+BMI2+BMI1
je .Lmulx4x_enter
___
$code.=<<___;
@@ -392,39 +435,34 @@ $code.=<<___;
push %r13
push %r14
push %r15
-___
-$code.=<<___ if ($win64);
- lea -0x28(%rsp),%rsp
- movaps %xmm6,(%rsp)
- movaps %xmm7,0x10(%rsp)
-___
-$code.=<<___;
+
.byte 0x67
- mov ${num}d,%r10d
- shl \$3,${num}d
- shl \$3+2,%r10d # 4*$num
+ shl \$3,${num}d # convert $num to bytes
+ lea ($num,$num,2),%r10 # 3*$num in bytes
neg $num # -$num
##############################################################
- # ensure that stack frame doesn't alias with $aptr+4*$num
- # modulo 4096, which covers ret[num], am[num] and n[2*num]
- # (see bn_exp.c). this is done to allow memory disambiguation
- # logic do its magic. [excessive frame is allocated in order
- # to allow bn_from_mont8x to clear it.]
+ # Ensure that stack frame doesn't alias with $rptr+3*$num
+ # modulo 4096, which covers ret[num], am[num] and n[num]
+ # (see bn_exp.c). This is done to allow memory disambiguation
+ # logic do its magic. [Extra [num] is allocated in order
+ # to align with bn_power5's frame, which is cleansed after
+ # completing exponentiation. Extra 256 bytes is for power mask
+ # calculated from 7th argument, the index.]
#
- lea -64(%rsp,$num,2),%r11
- sub $ap,%r11
+ lea -320(%rsp,$num,2),%r11
+ sub $rp,%r11
and \$4095,%r11
cmp %r11,%r10
jb .Lmul4xsp_alt
- sub %r11,%rsp # align with $ap
- lea -64(%rsp,$num,2),%rsp # alloca(128+num*8)
+ sub %r11,%rsp # align with $rp
+ lea -320(%rsp,$num,2),%rsp # alloca(frame+2*num*8+256)
jmp .Lmul4xsp_done
.align 32
.Lmul4xsp_alt:
- lea 4096-64(,$num,2),%r10
- lea -64(%rsp,$num,2),%rsp # alloca(128+num*8)
+ lea 4096-320(,$num,2),%r10
+ lea -320(%rsp,$num,2),%rsp # alloca(frame+2*num*8+256)
sub %r10,%r11
mov \$0,%r10
cmovc %r10,%r11
@@ -440,12 +478,7 @@ $code.=<<___;
mov 40(%rsp),%rsi # restore %rsp
mov \$1,%rax
-___
-$code.=<<___ if ($win64);
- movaps -88(%rsi),%xmm6
- movaps -72(%rsi),%xmm7
-___
-$code.=<<___;
+
mov -48(%rsi),%r15
mov -40(%rsi),%r14
mov -32(%rsi),%r13
@@ -460,9 +493,10 @@ $code.=<<___;
.type mul4x_internal,\@abi-omnipotent
.align 32
mul4x_internal:
- shl \$5,$num
- mov `($win64?56:8)`(%rax),%r10d # load 7th argument
- lea 256(%rdx,$num),%r13
+ shl \$5,$num # $num was in bytes
+ movd `($win64?56:8)`(%rax),%xmm5 # load 7th argument, index
+ lea .Linc(%rip),%rax
+ lea 128(%rdx,$num),%r13 # end of powers table (+size optimization)
shr \$5,$num # restore $num
___
$bp="%r12";
@@ -470,44 +504,92 @@ ___
$N=$STRIDE/4; # should match cache line size
$tp=$i;
$code.=<<___;
- mov %r10,%r11
- shr \$`log($N/8)/log(2)`,%r10
- and \$`$N/8-1`,%r11
- not %r10
- lea .Lmagic_masks(%rip),%rax
- and \$`2**5/($N/8)-1`,%r10 # 5 is "window size"
- lea 96(%rdx,%r11,8),$bp # pointer within 1st cache line
- movq 0(%rax,%r10,8),%xmm4 # set of masks denoting which
- movq 8(%rax,%r10,8),%xmm5 # cache line contains element
- add \$7,%r11
- movq 16(%rax,%r10,8),%xmm6 # denoted by 7th argument
- movq 24(%rax,%r10,8),%xmm7
- and \$7,%r11
-
- movq `0*$STRIDE/4-96`($bp),%xmm0
- lea $STRIDE($bp),$tp # borrow $tp
- movq `1*$STRIDE/4-96`($bp),%xmm1
- pand %xmm4,%xmm0
- movq `2*$STRIDE/4-96`($bp),%xmm2
- pand %xmm5,%xmm1
- movq `3*$STRIDE/4-96`($bp),%xmm3
- pand %xmm6,%xmm2
- .byte 0x67
- por %xmm1,%xmm0
- movq `0*$STRIDE/4-96`($tp),%xmm1
- .byte 0x67
- pand %xmm7,%xmm3
- .byte 0x67
- por %xmm2,%xmm0
- movq `1*$STRIDE/4-96`($tp),%xmm2
+ movdqa 0(%rax),%xmm0 # 00000001000000010000000000000000
+ movdqa 16(%rax),%xmm1 # 00000002000000020000000200000002
+ lea 88-112(%rsp,$num),%r10 # place the mask after tp[num+1] (+ICache optimization)
+ lea 128(%rdx),$bp # size optimization
+
+ pshufd \$0,%xmm5,%xmm5 # broadcast index
+ movdqa %xmm1,%xmm4
+ .byte 0x67,0x67
+ movdqa %xmm1,%xmm2
+___
+########################################################################
+# calculate mask by comparing 0..31 to index and save result to stack
+#
+$code.=<<___;
+ paddd %xmm0,%xmm1
+ pcmpeqd %xmm5,%xmm0 # compare to 1,0
.byte 0x67
- pand %xmm4,%xmm1
+ movdqa %xmm4,%xmm3
+___
+for($i=0;$i<$STRIDE/16-4;$i+=4) {
+$code.=<<___;
+ paddd %xmm1,%xmm2
+ pcmpeqd %xmm5,%xmm1 # compare to 3,2
+ movdqa %xmm0,`16*($i+0)+112`(%r10)
+ movdqa %xmm4,%xmm0
+
+ paddd %xmm2,%xmm3
+ pcmpeqd %xmm5,%xmm2 # compare to 5,4
+ movdqa %xmm1,`16*($i+1)+112`(%r10)
+ movdqa %xmm4,%xmm1
+
+ paddd %xmm3,%xmm0
+ pcmpeqd %xmm5,%xmm3 # compare to 7,6
+ movdqa %xmm2,`16*($i+2)+112`(%r10)
+ movdqa %xmm4,%xmm2
+
+ paddd %xmm0,%xmm1
+ pcmpeqd %xmm5,%xmm0
+ movdqa %xmm3,`16*($i+3)+112`(%r10)
+ movdqa %xmm4,%xmm3
+___
+}
+$code.=<<___; # last iteration can be optimized
+ paddd %xmm1,%xmm2
+ pcmpeqd %xmm5,%xmm1
+ movdqa %xmm0,`16*($i+0)+112`(%r10)
+
+ paddd %xmm2,%xmm3
.byte 0x67
- por %xmm3,%xmm0
- movq `2*$STRIDE/4-96`($tp),%xmm3
+ pcmpeqd %xmm5,%xmm2
+ movdqa %xmm1,`16*($i+1)+112`(%r10)
+ pcmpeqd %xmm5,%xmm3
+ movdqa %xmm2,`16*($i+2)+112`(%r10)
+ pand `16*($i+0)-128`($bp),%xmm0 # while it's still in register
+
+ pand `16*($i+1)-128`($bp),%xmm1
+ pand `16*($i+2)-128`($bp),%xmm2
+ movdqa %xmm3,`16*($i+3)+112`(%r10)
+ pand `16*($i+3)-128`($bp),%xmm3
+ por %xmm2,%xmm0
+ por %xmm3,%xmm1
+___
+for($i=0;$i<$STRIDE/16-4;$i+=4) {
+$code.=<<___;
+ movdqa `16*($i+0)-128`($bp),%xmm4
+ movdqa `16*($i+1)-128`($bp),%xmm5
+ movdqa `16*($i+2)-128`($bp),%xmm2
+ pand `16*($i+0)+112`(%r10),%xmm4
+ movdqa `16*($i+3)-128`($bp),%xmm3
+ pand `16*($i+1)+112`(%r10),%xmm5
+ por %xmm4,%xmm0
+ pand `16*($i+2)+112`(%r10),%xmm2
+ por %xmm5,%xmm1
+ pand `16*($i+3)+112`(%r10),%xmm3
+ por %xmm2,%xmm0
+ por %xmm3,%xmm1
+___
+}
+$code.=<<___;
+ por %xmm1,%xmm0
+ pshufd \$0x4e,%xmm0,%xmm1
+ por %xmm1,%xmm0
+ lea $STRIDE($bp),$bp
movq %xmm0,$m0 # m0=bp[0]
- movq `3*$STRIDE/4-96`($tp),%xmm0
+
mov %r13,16+8(%rsp) # save end of b[num]
mov $rp, 56+8(%rsp) # save $rp
@@ -521,26 +603,10 @@ $code.=<<___;
mov %rax,$A[0]
mov ($np),%rax
- pand %xmm5,%xmm2
- pand %xmm6,%xmm3
- por %xmm2,%xmm1
-
imulq $A[0],$m1 # "tp[0]"*n0
- ##############################################################
- # $tp is chosen so that writing to top-most element of the
- # vector occurs just "above" references to powers table,
- # "above" modulo cache-line size, which effectively precludes
- # possibility of memory disambiguation logic failure when
- # accessing the table.
- #
- lea 64+8(%rsp,%r11,8),$tp
+ lea 64+8(%rsp),$tp
mov %rdx,$A[1]
- pand %xmm7,%xmm0
- por %xmm3,%xmm1
- lea 2*$STRIDE($bp),$bp
- por %xmm1,%xmm0
-
mulq $m1 # np[0]*m1
add %rax,$A[0] # discarded
mov 8($ap,$num),%rax
@@ -549,7 +615,7 @@ $code.=<<___;
mulq $m0
add %rax,$A[1]
- mov 16*1($np),%rax # interleaved with 0, therefore 16*n
+ mov 8*1($np),%rax
adc \$0,%rdx
mov %rdx,$A[0]
@@ -559,7 +625,7 @@ $code.=<<___;
adc \$0,%rdx
add $A[1],$N[1]
lea 4*8($num),$j # j=4
- lea 16*4($np),$np
+ lea 8*4($np),$np
adc \$0,%rdx
mov $N[1],($tp)
mov %rdx,$N[0]
@@ -569,7 +635,7 @@ $code.=<<___;
.L1st4x:
mulq $m0 # ap[j]*bp[0]
add %rax,$A[0]
- mov -16*2($np),%rax
+ mov -8*2($np),%rax
lea 32($tp),$tp
adc \$0,%rdx
mov %rdx,$A[1]
@@ -585,7 +651,7 @@ $code.=<<___;
mulq $m0 # ap[j]*bp[0]
add %rax,$A[1]
- mov -16*1($np),%rax
+ mov -8*1($np),%rax
adc \$0,%rdx
mov %rdx,$A[0]
@@ -600,7 +666,7 @@ $code.=<<___;
mulq $m0 # ap[j]*bp[0]
add %rax,$A[0]
- mov 16*0($np),%rax
+ mov 8*0($np),%rax
adc \$0,%rdx
mov %rdx,$A[1]
@@ -615,7 +681,7 @@ $code.=<<___;
mulq $m0 # ap[j]*bp[0]
add %rax,$A[1]
- mov 16*1($np),%rax
+ mov 8*1($np),%rax
adc \$0,%rdx
mov %rdx,$A[0]
@@ -624,7 +690,7 @@ $code.=<<___;
mov 16($ap,$j),%rax
adc \$0,%rdx
add $A[1],$N[1] # np[j]*m1+ap[j]*bp[0]
- lea 16*4($np),$np
+ lea 8*4($np),$np
adc \$0,%rdx
mov $N[1],($tp) # tp[j-1]
mov %rdx,$N[0]
@@ -634,7 +700,7 @@ $code.=<<___;
mulq $m0 # ap[j]*bp[0]
add %rax,$A[0]
- mov -16*2($np),%rax
+ mov -8*2($np),%rax
lea 32($tp),$tp
adc \$0,%rdx
mov %rdx,$A[1]
@@ -650,7 +716,7 @@ $code.=<<___;
mulq $m0 # ap[j]*bp[0]
add %rax,$A[1]
- mov -16*1($np),%rax
+ mov -8*1($np),%rax
adc \$0,%rdx
mov %rdx,$A[0]
@@ -663,8 +729,7 @@ $code.=<<___;
mov $N[1],-16($tp) # tp[j-1]
mov %rdx,$N[0]
- movq %xmm0,$m0 # bp[1]
- lea ($np,$num,2),$np # rewind $np
+ lea ($np,$num),$np # rewind $np
xor $N[1],$N[1]
add $A[0],$N[0]
@@ -675,6 +740,33 @@ $code.=<<___;
.align 32
.Louter4x:
+ lea 16+128($tp),%rdx # where 256-byte mask is (+size optimization)
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
+___
+for($i=0;$i<$STRIDE/16;$i+=4) {
+$code.=<<___;
+ movdqa `16*($i+0)-128`($bp),%xmm0
+ movdqa `16*($i+1)-128`($bp),%xmm1
+ movdqa `16*($i+2)-128`($bp),%xmm2
+ movdqa `16*($i+3)-128`($bp),%xmm3
+ pand `16*($i+0)-128`(%rdx),%xmm0
+ pand `16*($i+1)-128`(%rdx),%xmm1
+ por %xmm0,%xmm4
+ pand `16*($i+2)-128`(%rdx),%xmm2
+ por %xmm1,%xmm5
+ pand `16*($i+3)-128`(%rdx),%xmm3
+ por %xmm2,%xmm4
+ por %xmm3,%xmm5
+___
+}
+$code.=<<___;
+ por %xmm5,%xmm4
+ pshufd \$0x4e,%xmm4,%xmm0
+ por %xmm4,%xmm0
+ lea $STRIDE($bp),$bp
+ movq %xmm0,$m0 # m0=bp[i]
+
mov ($tp,$num),$A[0]
mov $n0,$m1
mulq $m0 # ap[0]*bp[i]
@@ -682,25 +774,11 @@ $code.=<<___;
mov ($np),%rax
adc \$0,%rdx
- movq `0*$STRIDE/4-96`($bp),%xmm0
- movq `1*$STRIDE/4-96`($bp),%xmm1
- pand %xmm4,%xmm0
- movq `2*$STRIDE/4-96`($bp),%xmm2
- pand %xmm5,%xmm1
- movq `3*$STRIDE/4-96`($bp),%xmm3
-
imulq $A[0],$m1 # tp[0]*n0
- .byte 0x67
mov %rdx,$A[1]
mov $N[1],($tp) # store upmost overflow bit
- pand %xmm6,%xmm2
- por %xmm1,%xmm0
- pand %xmm7,%xmm3
- por %xmm2,%xmm0
lea ($tp,$num),$tp # rewind $tp
- lea $STRIDE($bp),$bp
- por %xmm3,%xmm0
mulq $m1 # np[0]*m1
add %rax,$A[0] # "$N[0]", discarded
@@ -710,7 +788,7 @@ $code.=<<___;
mulq $m0 # ap[j]*bp[i]
add %rax,$A[1]
- mov 16*1($np),%rax # interleaved with 0, therefore 16*n
+ mov 8*1($np),%rax
adc \$0,%rdx
add 8($tp),$A[1] # +tp[1]
adc \$0,%rdx
@@ -722,7 +800,7 @@ $code.=<<___;
adc \$0,%rdx
add $A[1],$N[1] # np[j]*m1+ap[j]*bp[i]+tp[j]
lea 4*8($num),$j # j=4
- lea 16*4($np),$np
+ lea 8*4($np),$np
adc \$0,%rdx
mov %rdx,$N[0]
jmp .Linner4x
@@ -731,7 +809,7 @@ $code.=<<___;
.Linner4x:
mulq $m0 # ap[j]*bp[i]
add %rax,$A[0]
- mov -16*2($np),%rax
+ mov -8*2($np),%rax
adc \$0,%rdx
add 16($tp),$A[0] # ap[j]*bp[i]+tp[j]
lea 32($tp),$tp
@@ -749,7 +827,7 @@ $code.=<<___;
mulq $m0 # ap[j]*bp[i]
add %rax,$A[1]
- mov -16*1($np),%rax
+ mov -8*1($np),%rax
adc \$0,%rdx
add -8($tp),$A[1]
adc \$0,%rdx
@@ -766,7 +844,7 @@ $code.=<<___;
mulq $m0 # ap[j]*bp[i]
add %rax,$A[0]
- mov 16*0($np),%rax
+ mov 8*0($np),%rax
adc \$0,%rdx
add ($tp),$A[0] # ap[j]*bp[i]+tp[j]
adc \$0,%rdx
@@ -783,7 +861,7 @@ $code.=<<___;
mulq $m0 # ap[j]*bp[i]
add %rax,$A[1]
- mov 16*1($np),%rax
+ mov 8*1($np),%rax
adc \$0,%rdx
add 8($tp),$A[1]
adc \$0,%rdx
@@ -794,7 +872,7 @@ $code.=<<___;
mov 16($ap,$j),%rax
adc \$0,%rdx
add $A[1],$N[1]
- lea 16*4($np),$np
+ lea 8*4($np),$np
adc \$0,%rdx
mov $N[0],-8($tp) # tp[j-1]
mov %rdx,$N[0]
@@ -804,7 +882,7 @@ $code.=<<___;
mulq $m0 # ap[j]*bp[i]
add %rax,$A[0]
- mov -16*2($np),%rax
+ mov -8*2($np),%rax
adc \$0,%rdx
add 16($tp),$A[0] # ap[j]*bp[i]+tp[j]
lea 32($tp),$tp
@@ -823,7 +901,7 @@ $code.=<<___;
mulq $m0 # ap[j]*bp[i]
add %rax,$A[1]
mov $m1,%rax
- mov -16*1($np),$m1
+ mov -8*1($np),$m1
adc \$0,%rdx
add -8($tp),$A[1]
adc \$0,%rdx
@@ -838,9 +916,8 @@ $code.=<<___;
mov $N[0],-24($tp) # tp[j-1]
mov %rdx,$N[0]
- movq %xmm0,$m0 # bp[i+1]
mov $N[1],-16($tp) # tp[j-1]
- lea ($np,$num,2),$np # rewind $np
+ lea ($np,$num),$np # rewind $np
xor $N[1],$N[1]
add $A[0],$N[0]
@@ -854,16 +931,23 @@ $code.=<<___;
___
if (1) {
$code.=<<___;
+ xor %rax,%rax
sub $N[0],$m1 # compare top-most words
adc $j,$j # $j is zero
or $j,$N[1]
- xor \$1,$N[1]
+ sub $N[1],%rax # %rax=-$N[1]
lea ($tp,$num),%rbx # tptr in .sqr4x_sub
- lea ($np,$N[1],8),%rbp # nptr in .sqr4x_sub
+ mov ($np),%r12
+ lea ($np),%rbp # nptr in .sqr4x_sub
mov %r9,%rcx
- sar \$3+2,%rcx # cf=0
+ sar \$3+2,%rcx
mov 56+8(%rsp),%rdi # rptr in .sqr4x_sub
- jmp .Lsqr4x_sub
+ dec %r12 # so that after 'not' we get -n[0]
+ xor %r10,%r10
+ mov 8*1(%rbp),%r13
+ mov 8*2(%rbp),%r14
+ mov 8*3(%rbp),%r15
+ jmp .Lsqr4x_sub_entry
___
} else {
my @ri=("%rax",$bp,$m0,$m1);
@@ -930,8 +1014,8 @@ bn_power5:
___
$code.=<<___ if ($addx);
mov OPENSSL_ia32cap_P+8(%rip),%r11d
- and \$0x80100,%r11d
- cmp \$0x80100,%r11d
+ and \$0x80108,%r11d
+ cmp \$0x80108,%r11d # check for AD*X+BMI2+BMI1
je .Lpowerx5_enter
___
$code.=<<___;
@@ -942,38 +1026,32 @@ $code.=<<___;
push %r13
push %r14
push %r15
-___
-$code.=<<___ if ($win64);
- lea -0x28(%rsp),%rsp
- movaps %xmm6,(%rsp)
- movaps %xmm7,0x10(%rsp)
-___
-$code.=<<___;
- mov ${num}d,%r10d
+
shl \$3,${num}d # convert $num to bytes
- shl \$3+2,%r10d # 4*$num
+ lea ($num,$num,2),%r10d # 3*$num
neg $num
mov ($n0),$n0 # *n0
##############################################################
- # ensure that stack frame doesn't alias with $aptr+4*$num
- # modulo 4096, which covers ret[num], am[num] and n[2*num]
- # (see bn_exp.c). this is done to allow memory disambiguation
- # logic do its magic.
+ # Ensure that stack frame doesn't alias with $rptr+3*$num
+ # modulo 4096, which covers ret[num], am[num] and n[num]
+ # (see bn_exp.c). This is done to allow memory disambiguation
+ # logic do its magic. [Extra 256 bytes is for power mask
+ # calculated from 7th argument, the index.]
#
- lea -64(%rsp,$num,2),%r11
- sub $aptr,%r11
+ lea -320(%rsp,$num,2),%r11
+ sub $rptr,%r11
and \$4095,%r11
cmp %r11,%r10
jb .Lpwr_sp_alt
sub %r11,%rsp # align with $aptr
- lea -64(%rsp,$num,2),%rsp # alloca(frame+2*$num)
+ lea -320(%rsp,$num,2),%rsp # alloca(frame+2*num*8+256)
jmp .Lpwr_sp_done
.align 32
.Lpwr_sp_alt:
- lea 4096-64(,$num,2),%r10 # 4096-frame-2*$num
- lea -64(%rsp,$num,2),%rsp # alloca(frame+2*$num)
+ lea 4096-320(,$num,2),%r10
+ lea -320(%rsp,$num,2),%rsp # alloca(frame+2*num*8+256)
sub %r10,%r11
mov \$0,%r10
cmovc %r10,%r11
@@ -995,16 +1073,21 @@ $code.=<<___;
mov $n0, 32(%rsp)
mov %rax, 40(%rsp) # save original %rsp
.Lpower5_body:
- movq $rptr,%xmm1 # save $rptr
+ movq $rptr,%xmm1 # save $rptr, used in sqr8x
movq $nptr,%xmm2 # save $nptr
- movq %r10, %xmm3 # -$num
+ movq %r10, %xmm3 # -$num, used in sqr8x
movq $bptr,%xmm4
call __bn_sqr8x_internal
+ call __bn_post4x_internal
call __bn_sqr8x_internal
+ call __bn_post4x_internal
call __bn_sqr8x_internal
+ call __bn_post4x_internal
call __bn_sqr8x_internal
+ call __bn_post4x_internal
call __bn_sqr8x_internal
+ call __bn_post4x_internal
movq %xmm2,$nptr
movq %xmm4,$bptr
@@ -1565,9 +1648,9 @@ my ($nptr,$tptr,$carry,$m0)=("%rbp","%rdi","%rsi","%rbx");
$code.=<<___;
movq %xmm2,$nptr
-sqr8x_reduction:
+__bn_sqr8x_reduction:
xor %rax,%rax
- lea ($nptr,$num,2),%rcx # end of n[]
+ lea ($nptr,$num),%rcx # end of n[]
lea 48+8(%rsp,$num,2),%rdx # end of t[] buffer
mov %rcx,0+8(%rsp)
lea 48+8(%rsp,$num),$tptr # end of initial t[] window
@@ -1593,21 +1676,21 @@ sqr8x_reduction:
.byte 0x67
mov $m0,%r8
imulq 32+8(%rsp),$m0 # n0*a[0]
- mov 16*0($nptr),%rax # n[0]
+ mov 8*0($nptr),%rax # n[0]
mov \$8,%ecx
jmp .L8x_reduce
.align 32
.L8x_reduce:
mulq $m0
- mov 16*1($nptr),%rax # n[1]
+ mov 8*1($nptr),%rax # n[1]
neg %r8
mov %rdx,%r8
adc \$0,%r8
mulq $m0
add %rax,%r9
- mov 16*2($nptr),%rax
+ mov 8*2($nptr),%rax
adc \$0,%rdx
add %r9,%r8
mov $m0,48-8+8(%rsp,%rcx,8) # put aside n0*a[i]
@@ -1616,7 +1699,7 @@ sqr8x_reduction:
mulq $m0
add %rax,%r10
- mov 16*3($nptr),%rax
+ mov 8*3($nptr),%rax
adc \$0,%rdx
add %r10,%r9
mov 32+8(%rsp),$carry # pull n0, borrow $carry
@@ -1625,7 +1708,7 @@ sqr8x_reduction:
mulq $m0
add %rax,%r11
- mov 16*4($nptr),%rax
+ mov 8*4($nptr),%rax
adc \$0,%rdx
imulq %r8,$carry # modulo-scheduled
add %r11,%r10
@@ -1634,7 +1717,7 @@ sqr8x_reduction:
mulq $m0
add %rax,%r12
- mov 16*5($nptr),%rax
+ mov 8*5($nptr),%rax
adc \$0,%rdx
add %r12,%r11
mov %rdx,%r12
@@ -1642,7 +1725,7 @@ sqr8x_reduction:
mulq $m0
add %rax,%r13
- mov 16*6($nptr),%rax
+ mov 8*6($nptr),%rax
adc \$0,%rdx
add %r13,%r12
mov %rdx,%r13
@@ -1650,7 +1733,7 @@ sqr8x_reduction:
mulq $m0
add %rax,%r14
- mov 16*7($nptr),%rax
+ mov 8*7($nptr),%rax
adc \$0,%rdx
add %r14,%r13
mov %rdx,%r14
@@ -1659,7 +1742,7 @@ sqr8x_reduction:
mulq $m0
mov $carry,$m0 # n0*a[i]
add %rax,%r15
- mov 16*0($nptr),%rax # n[0]
+ mov 8*0($nptr),%rax # n[0]
adc \$0,%rdx
add %r15,%r14
mov %rdx,%r15
@@ -1668,7 +1751,7 @@ sqr8x_reduction:
dec %ecx
jnz .L8x_reduce
- lea 16*8($nptr),$nptr
+ lea 8*8($nptr),$nptr
xor %rax,%rax
mov 8+8(%rsp),%rdx # pull end of t[]
cmp 0+8(%rsp),$nptr # end of n[]?
@@ -1687,21 +1770,21 @@ sqr8x_reduction:
mov 48+56+8(%rsp),$m0 # pull n0*a[0]
mov \$8,%ecx
- mov 16*0($nptr),%rax
+ mov 8*0($nptr),%rax
jmp .L8x_tail
.align 32
.L8x_tail:
mulq $m0
add %rax,%r8
- mov 16*1($nptr),%rax
+ mov 8*1($nptr),%rax
mov %r8,($tptr) # save result
mov %rdx,%r8
adc \$0,%r8
mulq $m0
add %rax,%r9
- mov 16*2($nptr),%rax
+ mov 8*2($nptr),%rax
adc \$0,%rdx
add %r9,%r8
lea 8($tptr),$tptr # $tptr++
@@ -1710,7 +1793,7 @@ sqr8x_reduction:
mulq $m0
add %rax,%r10
- mov 16*3($nptr),%rax
+ mov 8*3($nptr),%rax
adc \$0,%rdx
add %r10,%r9
mov %rdx,%r10
@@ -1718,7 +1801,7 @@ sqr8x_reduction:
mulq $m0
add %rax,%r11
- mov 16*4($nptr),%rax
+ mov 8*4($nptr),%rax
adc \$0,%rdx
add %r11,%r10
mov %rdx,%r11
@@ -1726,7 +1809,7 @@ sqr8x_reduction:
mulq $m0
add %rax,%r12
- mov 16*5($nptr),%rax
+ mov 8*5($nptr),%rax
adc \$0,%rdx
add %r12,%r11
mov %rdx,%r12
@@ -1734,7 +1817,7 @@ sqr8x_reduction:
mulq $m0
add %rax,%r13
- mov 16*6($nptr),%rax
+ mov 8*6($nptr),%rax
adc \$0,%rdx
add %r13,%r12
mov %rdx,%r13
@@ -1742,7 +1825,7 @@ sqr8x_reduction:
mulq $m0
add %rax,%r14
- mov 16*7($nptr),%rax
+ mov 8*7($nptr),%rax
adc \$0,%rdx
add %r14,%r13
mov %rdx,%r14
@@ -1753,14 +1836,14 @@ sqr8x_reduction:
add %rax,%r15
adc \$0,%rdx
add %r15,%r14
- mov 16*0($nptr),%rax # pull n[0]
+ mov 8*0($nptr),%rax # pull n[0]
mov %rdx,%r15
adc \$0,%r15
dec %ecx
jnz .L8x_tail
- lea 16*8($nptr),$nptr
+ lea 8*8($nptr),$nptr
mov 8+8(%rsp),%rdx # pull end of t[]
cmp 0+8(%rsp),$nptr # end of n[]?
jae .L8x_tail_done # break out of loop
@@ -1806,7 +1889,7 @@ sqr8x_reduction:
adc 8*6($tptr),%r14
adc 8*7($tptr),%r15
adc \$0,%rax # top-most carry
- mov -16($nptr),%rcx # np[num-1]
+ mov -8($nptr),%rcx # np[num-1]
xor $carry,$carry
movq %xmm2,$nptr # restore $nptr
@@ -1824,6 +1907,8 @@ sqr8x_reduction:
cmp %rdx,$tptr # end of t[]?
jb .L8x_reduction_loop
+ ret
+.size bn_sqr8x_internal,.-bn_sqr8x_internal
___
}
##############################################################
@@ -1832,48 +1917,62 @@ ___
{
my ($tptr,$nptr)=("%rbx","%rbp");
$code.=<<___;
- #xor %rsi,%rsi # %rsi was $carry above
- sub %r15,%rcx # compare top-most words
+.type __bn_post4x_internal,\@abi-omnipotent
+.align 32
+__bn_post4x_internal:
+ mov 8*0($nptr),%r12
lea (%rdi,$num),$tptr # %rdi was $tptr above
- adc %rsi,%rsi
mov $num,%rcx
- or %rsi,%rax
movq %xmm1,$rptr # restore $rptr
- xor \$1,%rax
+ neg %rax
movq %xmm1,$aptr # prepare for back-to-back call
- lea ($nptr,%rax,8),$nptr
- sar \$3+2,%rcx # cf=0
- jmp .Lsqr4x_sub
+ sar \$3+2,%rcx
+ dec %r12 # so that after 'not' we get -n[0]
+ xor %r10,%r10
+ mov 8*1($nptr),%r13
+ mov 8*2($nptr),%r14
+ mov 8*3($nptr),%r15
+ jmp .Lsqr4x_sub_entry
-.align 32
+.align 16
.Lsqr4x_sub:
- .byte 0x66
- mov 8*0($tptr),%r12
- mov 8*1($tptr),%r13
- sbb 16*0($nptr),%r12
- mov 8*2($tptr),%r14
- sbb 16*1($nptr),%r13
- mov 8*3($tptr),%r15
- lea 8*4($tptr),$tptr
- sbb 16*2($nptr),%r14
+ mov 8*0($nptr),%r12
+ mov 8*1($nptr),%r13
+ mov 8*2($nptr),%r14
+ mov 8*3($nptr),%r15
+.Lsqr4x_sub_entry:
+ lea 8*4($nptr),$nptr
+ not %r12
+ not %r13
+ not %r14
+ not %r15
+ and %rax,%r12
+ and %rax,%r13
+ and %rax,%r14
+ and %rax,%r15
+
+ neg %r10 # mov %r10,%cf
+ adc 8*0($tptr),%r12
+ adc 8*1($tptr),%r13
+ adc 8*2($tptr),%r14
+ adc 8*3($tptr),%r15
mov %r12,8*0($rptr)
- sbb 16*3($nptr),%r15
- lea 16*4($nptr),$nptr
+ lea 8*4($tptr),$tptr
mov %r13,8*1($rptr)
+ sbb %r10,%r10 # mov %cf,%r10
mov %r14,8*2($rptr)
mov %r15,8*3($rptr)
lea 8*4($rptr),$rptr
inc %rcx # pass %cf
jnz .Lsqr4x_sub
-___
-}
-$code.=<<___;
+
mov $num,%r10 # prepare for back-to-back call
neg $num # restore $num
ret
-.size bn_sqr8x_internal,.-bn_sqr8x_internal
+.size __bn_post4x_internal,.-__bn_post4x_internal
___
+}
{
$code.=<<___;
.globl bn_from_montgomery
@@ -1897,39 +1996,32 @@ bn_from_mont8x:
push %r13
push %r14
push %r15
-___
-$code.=<<___ if ($win64);
- lea -0x28(%rsp),%rsp
- movaps %xmm6,(%rsp)
- movaps %xmm7,0x10(%rsp)
-___
-$code.=<<___;
- .byte 0x67
- mov ${num}d,%r10d
+
shl \$3,${num}d # convert $num to bytes
- shl \$3+2,%r10d # 4*$num
+ lea ($num,$num,2),%r10 # 3*$num in bytes
neg $num
mov ($n0),$n0 # *n0
##############################################################
- # ensure that stack frame doesn't alias with $aptr+4*$num
- # modulo 4096, which covers ret[num], am[num] and n[2*num]
- # (see bn_exp.c). this is done to allow memory disambiguation
- # logic do its magic.
+ # Ensure that stack frame doesn't alias with $rptr+3*$num
+ # modulo 4096, which covers ret[num], am[num] and n[num]
+ # (see bn_exp.c). The stack is allocated to aligned with
+ # bn_power5's frame, and as bn_from_montgomery happens to be
+ # last operation, we use the opportunity to cleanse it.
#
- lea -64(%rsp,$num,2),%r11
- sub $aptr,%r11
+ lea -320(%rsp,$num,2),%r11
+ sub $rptr,%r11
and \$4095,%r11
cmp %r11,%r10
jb .Lfrom_sp_alt
sub %r11,%rsp # align with $aptr
- lea -64(%rsp,$num,2),%rsp # alloca(frame+2*$num)
+ lea -320(%rsp,$num,2),%rsp # alloca(frame+2*$num*8+256)
jmp .Lfrom_sp_done
.align 32
.Lfrom_sp_alt:
- lea 4096-64(,$num,2),%r10 # 4096-frame-2*$num
- lea -64(%rsp,$num,2),%rsp # alloca(frame+2*$num)
+ lea 4096-320(,$num,2),%r10
+ lea -320(%rsp,$num,2),%rsp # alloca(frame+2*$num*8+256)
sub %r10,%r11
mov \$0,%r10
cmovc %r10,%r11
@@ -1983,12 +2075,13 @@ $code.=<<___;
___
$code.=<<___ if ($addx);
mov OPENSSL_ia32cap_P+8(%rip),%r11d
- and \$0x80100,%r11d
- cmp \$0x80100,%r11d
+ and \$0x80108,%r11d
+ cmp \$0x80108,%r11d # check for AD*X+BMI2+BMI1
jne .Lfrom_mont_nox
lea (%rax,$num),$rptr
- call sqrx8x_reduction
+ call __bn_sqrx8x_reduction
+ call __bn_postx4x_internal
pxor %xmm0,%xmm0
lea 48(%rsp),%rax
@@ -1999,7 +2092,8 @@ $code.=<<___ if ($addx);
.Lfrom_mont_nox:
___
$code.=<<___;
- call sqr8x_reduction
+ call __bn_sqr8x_reduction
+ call __bn_post4x_internal
pxor %xmm0,%xmm0
lea 48(%rsp),%rax
@@ -2039,7 +2133,6 @@ $code.=<<___;
.align 32
bn_mulx4x_mont_gather5:
.Lmulx4x_enter:
- .byte 0x67
mov %rsp,%rax
push %rbx
push %rbp
@@ -2047,40 +2140,33 @@ bn_mulx4x_mont_gather5:
push %r13
push %r14
push %r15
-___
-$code.=<<___ if ($win64);
- lea -0x28(%rsp),%rsp
- movaps %xmm6,(%rsp)
- movaps %xmm7,0x10(%rsp)
-___
-$code.=<<___;
- .byte 0x67
- mov ${num}d,%r10d
+
shl \$3,${num}d # convert $num to bytes
- shl \$3+2,%r10d # 4*$num
+ lea ($num,$num,2),%r10 # 3*$num in bytes
neg $num # -$num
mov ($n0),$n0 # *n0
##############################################################
- # ensure that stack frame doesn't alias with $aptr+4*$num
- # modulo 4096, which covers a[num], ret[num] and n[2*num]
- # (see bn_exp.c). this is done to allow memory disambiguation
- # logic do its magic. [excessive frame is allocated in order
- # to allow bn_from_mont8x to clear it.]
+ # Ensure that stack frame doesn't alias with $rptr+3*$num
+ # modulo 4096, which covers ret[num], am[num] and n[num]
+ # (see bn_exp.c). This is done to allow memory disambiguation
+ # logic do its magic. [Extra [num] is allocated in order
+ # to align with bn_power5's frame, which is cleansed after
+ # completing exponentiation. Extra 256 bytes is for power mask
+ # calculated from 7th argument, the index.]
#
- lea -64(%rsp,$num,2),%r11
- sub $ap,%r11
+ lea -320(%rsp,$num,2),%r11
+ sub $rp,%r11
and \$4095,%r11
cmp %r11,%r10
jb .Lmulx4xsp_alt
sub %r11,%rsp # align with $aptr
- lea -64(%rsp,$num,2),%rsp # alloca(frame+$num)
+ lea -320(%rsp,$num,2),%rsp # alloca(frame+2*$num*8+256)
jmp .Lmulx4xsp_done
-.align 32
.Lmulx4xsp_alt:
- lea 4096-64(,$num,2),%r10 # 4096-frame-$num
- lea -64(%rsp,$num,2),%rsp # alloca(frame+$num)
+ lea 4096-320(,$num,2),%r10
+ lea -320(%rsp,$num,2),%rsp # alloca(frame+2*$num*8+256)
sub %r10,%r11
mov \$0,%r10
cmovc %r10,%r11
@@ -2106,12 +2192,7 @@ $code.=<<___;
mov 40(%rsp),%rsi # restore %rsp
mov \$1,%rax
-___
-$code.=<<___ if ($win64);
- movaps -88(%rsi),%xmm6
- movaps -72(%rsi),%xmm7
-___
-$code.=<<___;
+
mov -48(%rsi),%r15
mov -40(%rsi),%r14
mov -32(%rsi),%r13
@@ -2126,14 +2207,16 @@ $code.=<<___;
.type mulx4x_internal,\@abi-omnipotent
.align 32
mulx4x_internal:
- .byte 0x4c,0x89,0x8c,0x24,0x08,0x00,0x00,0x00 # mov $num,8(%rsp) # save -$num
- .byte 0x67
+ mov $num,8(%rsp) # save -$num (it was in bytes)
+ mov $num,%r10
neg $num # restore $num
shl \$5,$num
- lea 256($bp,$num),%r13
+ neg %r10 # restore $num
+ lea 128($bp,$num),%r13 # end of powers table (+size optimization)
shr \$5+5,$num
- mov `($win64?56:8)`(%rax),%r10d # load 7th argument
+ movd `($win64?56:8)`(%rax),%xmm5 # load 7th argument
sub \$1,$num
+ lea .Linc(%rip),%rax
mov %r13,16+8(%rsp) # end of b[num]
mov $num,24+8(%rsp) # inner counter
mov $rp, 56+8(%rsp) # save $rp
@@ -2144,52 +2227,92 @@ my $rptr=$bptr;
my $STRIDE=2**5*8; # 5 is "window size"
my $N=$STRIDE/4; # should match cache line size
$code.=<<___;
- mov %r10,%r11
- shr \$`log($N/8)/log(2)`,%r10
- and \$`$N/8-1`,%r11
- not %r10
- lea .Lmagic_masks(%rip),%rax
- and \$`2**5/($N/8)-1`,%r10 # 5 is "window size"
- lea 96($bp,%r11,8),$bptr # pointer within 1st cache line
- movq 0(%rax,%r10,8),%xmm4 # set of masks denoting which
- movq 8(%rax,%r10,8),%xmm5 # cache line contains element
- add \$7,%r11
- movq 16(%rax,%r10,8),%xmm6 # denoted by 7th argument
- movq 24(%rax,%r10,8),%xmm7
- and \$7,%r11
-
- movq `0*$STRIDE/4-96`($bptr),%xmm0
- lea $STRIDE($bptr),$tptr # borrow $tptr
- movq `1*$STRIDE/4-96`($bptr),%xmm1
- pand %xmm4,%xmm0
- movq `2*$STRIDE/4-96`($bptr),%xmm2
- pand %xmm5,%xmm1
- movq `3*$STRIDE/4-96`($bptr),%xmm3
- pand %xmm6,%xmm2
- por %xmm1,%xmm0
- movq `0*$STRIDE/4-96`($tptr),%xmm1
- pand %xmm7,%xmm3
- por %xmm2,%xmm0
- movq `1*$STRIDE/4-96`($tptr),%xmm2
- por %xmm3,%xmm0
- .byte 0x67,0x67
- pand %xmm4,%xmm1
- movq `2*$STRIDE/4-96`($tptr),%xmm3
+ movdqa 0(%rax),%xmm0 # 00000001000000010000000000000000
+ movdqa 16(%rax),%xmm1 # 00000002000000020000000200000002
+ lea 88-112(%rsp,%r10),%r10 # place the mask after tp[num+1] (+ICache optimizaton)
+ lea 128($bp),$bptr # size optimization
+ pshufd \$0,%xmm5,%xmm5 # broadcast index
+ movdqa %xmm1,%xmm4
+ .byte 0x67
+ movdqa %xmm1,%xmm2
+___
+########################################################################
+# calculate mask by comparing 0..31 to index and save result to stack
+#
+$code.=<<___;
+ .byte 0x67
+ paddd %xmm0,%xmm1
+ pcmpeqd %xmm5,%xmm0 # compare to 1,0
+ movdqa %xmm4,%xmm3
+___
+for($i=0;$i<$STRIDE/16-4;$i+=4) {
+$code.=<<___;
+ paddd %xmm1,%xmm2
+ pcmpeqd %xmm5,%xmm1 # compare to 3,2
+ movdqa %xmm0,`16*($i+0)+112`(%r10)
+ movdqa %xmm4,%xmm0
+
+ paddd %xmm2,%xmm3
+ pcmpeqd %xmm5,%xmm2 # compare to 5,4
+ movdqa %xmm1,`16*($i+1)+112`(%r10)
+ movdqa %xmm4,%xmm1
+
+ paddd %xmm3,%xmm0
+ pcmpeqd %xmm5,%xmm3 # compare to 7,6
+ movdqa %xmm2,`16*($i+2)+112`(%r10)
+ movdqa %xmm4,%xmm2
+
+ paddd %xmm0,%xmm1
+ pcmpeqd %xmm5,%xmm0
+ movdqa %xmm3,`16*($i+3)+112`(%r10)
+ movdqa %xmm4,%xmm3
+___
+}
+$code.=<<___; # last iteration can be optimized
+ .byte 0x67
+ paddd %xmm1,%xmm2
+ pcmpeqd %xmm5,%xmm1
+ movdqa %xmm0,`16*($i+0)+112`(%r10)
+
+ paddd %xmm2,%xmm3
+ pcmpeqd %xmm5,%xmm2
+ movdqa %xmm1,`16*($i+1)+112`(%r10)
+
+ pcmpeqd %xmm5,%xmm3
+ movdqa %xmm2,`16*($i+2)+112`(%r10)
+
+ pand `16*($i+0)-128`($bptr),%xmm0 # while it's still in register
+ pand `16*($i+1)-128`($bptr),%xmm1
+ pand `16*($i+2)-128`($bptr),%xmm2
+ movdqa %xmm3,`16*($i+3)+112`(%r10)
+ pand `16*($i+3)-128`($bptr),%xmm3
+ por %xmm2,%xmm0
+ por %xmm3,%xmm1
+___
+for($i=0;$i<$STRIDE/16-4;$i+=4) {
+$code.=<<___;
+ movdqa `16*($i+0)-128`($bptr),%xmm4
+ movdqa `16*($i+1)-128`($bptr),%xmm5
+ movdqa `16*($i+2)-128`($bptr),%xmm2
+ pand `16*($i+0)+112`(%r10),%xmm4
+ movdqa `16*($i+3)-128`($bptr),%xmm3
+ pand `16*($i+1)+112`(%r10),%xmm5
+ por %xmm4,%xmm0
+ pand `16*($i+2)+112`(%r10),%xmm2
+ por %xmm5,%xmm1
+ pand `16*($i+3)+112`(%r10),%xmm3
+ por %xmm2,%xmm0
+ por %xmm3,%xmm1
+___
+}
+$code.=<<___;
+ pxor %xmm1,%xmm0
+ pshufd \$0x4e,%xmm0,%xmm1
+ por %xmm1,%xmm0
+ lea $STRIDE($bptr),$bptr
movq %xmm0,%rdx # bp[0]
- movq `3*$STRIDE/4-96`($tptr),%xmm0
- lea 2*$STRIDE($bptr),$bptr # next &b[i]
- pand %xmm5,%xmm2
- .byte 0x67,0x67
- pand %xmm6,%xmm3
- ##############################################################
- # $tptr is chosen so that writing to top-most element of the
- # vector occurs just "above" references to powers table,
- # "above" modulo cache-line size, which effectively precludes
- # possibility of memory disambiguation logic failure when
- # accessing the table.
- #
- lea 64+8*4+8(%rsp,%r11,8),$tptr
+ lea 64+8*4+8(%rsp),$tptr
mov %rdx,$bi
mulx 0*8($aptr),$mi,%rax # a[0]*b[0]
@@ -2205,37 +2328,31 @@ $code.=<<___;
xor $zero,$zero # cf=0, of=0
mov $mi,%rdx
- por %xmm2,%xmm1
- pand %xmm7,%xmm0
- por %xmm3,%xmm1
mov $bptr,8+8(%rsp) # off-load &b[i]
- por %xmm1,%xmm0
- .byte 0x48,0x8d,0xb6,0x20,0x00,0x00,0x00 # lea 4*8($aptr),$aptr
+ lea 4*8($aptr),$aptr
adcx %rax,%r13
adcx $zero,%r14 # cf=0
- mulx 0*16($nptr),%rax,%r10
+ mulx 0*8($nptr),%rax,%r10
adcx %rax,%r15 # discarded
adox %r11,%r10
- mulx 1*16($nptr),%rax,%r11
+ mulx 1*8($nptr),%rax,%r11
adcx %rax,%r10
adox %r12,%r11
- mulx 2*16($nptr),%rax,%r12
+ mulx 2*8($nptr),%rax,%r12
mov 24+8(%rsp),$bptr # counter value
- .byte 0x66
mov %r10,-8*4($tptr)
adcx %rax,%r11
adox %r13,%r12
- mulx 3*16($nptr),%rax,%r15
- .byte 0x67,0x67
+ mulx 3*8($nptr),%rax,%r15
mov $bi,%rdx
mov %r11,-8*3($tptr)
adcx %rax,%r12
adox $zero,%r15 # of=0
- .byte 0x48,0x8d,0x89,0x40,0x00,0x00,0x00 # lea 4*16($nptr),$nptr
+ lea 4*8($nptr),$nptr
mov %r12,-8*2($tptr)
- #jmp .Lmulx4x_1st
+ jmp .Lmulx4x_1st
.align 32
.Lmulx4x_1st:
@@ -2255,30 +2372,29 @@ $code.=<<___;
lea 4*8($tptr),$tptr
adox %r15,%r10
- mulx 0*16($nptr),%rax,%r15
+ mulx 0*8($nptr),%rax,%r15
adcx %rax,%r10
adox %r15,%r11
- mulx 1*16($nptr),%rax,%r15
+ mulx 1*8($nptr),%rax,%r15
adcx %rax,%r11
adox %r15,%r12
- mulx 2*16($nptr),%rax,%r15
+ mulx 2*8($nptr),%rax,%r15
mov %r10,-5*8($tptr)
adcx %rax,%r12
mov %r11,-4*8($tptr)
adox %r15,%r13
- mulx 3*16($nptr),%rax,%r15
+ mulx 3*8($nptr),%rax,%r15
mov $bi,%rdx
mov %r12,-3*8($tptr)
adcx %rax,%r13
adox $zero,%r15
- lea 4*16($nptr),$nptr
+ lea 4*8($nptr),$nptr
mov %r13,-2*8($tptr)
dec $bptr # of=0, pass cf
jnz .Lmulx4x_1st
mov 8(%rsp),$num # load -num
- movq %xmm0,%rdx # bp[1]
adc $zero,%r15 # modulo-scheduled
lea ($aptr,$num),$aptr # rewind $aptr
add %r15,%r14
@@ -2289,6 +2405,34 @@ $code.=<<___;
.align 32
.Lmulx4x_outer:
+ lea 16-256($tptr),%r10 # where 256-byte mask is (+density control)
+ pxor %xmm4,%xmm4
+ .byte 0x67,0x67
+ pxor %xmm5,%xmm5
+___
+for($i=0;$i<$STRIDE/16;$i+=4) {
+$code.=<<___;
+ movdqa `16*($i+0)-128`($bptr),%xmm0
+ movdqa `16*($i+1)-128`($bptr),%xmm1
+ movdqa `16*($i+2)-128`($bptr),%xmm2
+ pand `16*($i+0)+256`(%r10),%xmm0
+ movdqa `16*($i+3)-128`($bptr),%xmm3
+ pand `16*($i+1)+256`(%r10),%xmm1
+ por %xmm0,%xmm4
+ pand `16*($i+2)+256`(%r10),%xmm2
+ por %xmm1,%xmm5
+ pand `16*($i+3)+256`(%r10),%xmm3
+ por %xmm2,%xmm4
+ por %xmm3,%xmm5
+___
+}
+$code.=<<___;
+ por %xmm5,%xmm4
+ pshufd \$0x4e,%xmm4,%xmm0
+ por %xmm4,%xmm0
+ lea $STRIDE($bptr),$bptr
+ movq %xmm0,%rdx # m0=bp[i]
+
mov $zero,($tptr) # save top-most carry
lea 4*8($tptr,$num),$tptr # rewind $tptr
mulx 0*8($aptr),$mi,%r11 # a[0]*b[i]
@@ -2303,54 +2447,37 @@ $code.=<<___;
mulx 3*8($aptr),%rdx,%r14
adox -2*8($tptr),%r12
adcx %rdx,%r13
- lea ($nptr,$num,2),$nptr # rewind $nptr
+ lea ($nptr,$num),$nptr # rewind $nptr
lea 4*8($aptr),$aptr
adox -1*8($tptr),%r13
adcx $zero,%r14
adox $zero,%r14
- .byte 0x67
mov $mi,%r15
imulq 32+8(%rsp),$mi # "t[0]"*n0
- movq `0*$STRIDE/4-96`($bptr),%xmm0
- .byte 0x67,0x67
mov $mi,%rdx
- movq `1*$STRIDE/4-96`($bptr),%xmm1
- .byte 0x67
- pand %xmm4,%xmm0
- movq `2*$STRIDE/4-96`($bptr),%xmm2
- .byte 0x67
- pand %xmm5,%xmm1
- movq `3*$STRIDE/4-96`($bptr),%xmm3
- add \$$STRIDE,$bptr # next &b[i]
- .byte 0x67
- pand %xmm6,%xmm2
- por %xmm1,%xmm0
- pand %xmm7,%xmm3
xor $zero,$zero # cf=0, of=0
mov $bptr,8+8(%rsp) # off-load &b[i]
- mulx 0*16($nptr),%rax,%r10
+ mulx 0*8($nptr),%rax,%r10
adcx %rax,%r15 # discarded
adox %r11,%r10
- mulx 1*16($nptr),%rax,%r11
+ mulx 1*8($nptr),%rax,%r11
adcx %rax,%r10
adox %r12,%r11
- mulx 2*16($nptr),%rax,%r12
+ mulx 2*8($nptr),%rax,%r12
adcx %rax,%r11
adox %r13,%r12
- mulx 3*16($nptr),%rax,%r15
+ mulx 3*8($nptr),%rax,%r15
mov $bi,%rdx
- por %xmm2,%xmm0
mov 24+8(%rsp),$bptr # counter value
mov %r10,-8*4($tptr)
- por %xmm3,%xmm0
adcx %rax,%r12
mov %r11,-8*3($tptr)
adox $zero,%r15 # of=0
mov %r12,-8*2($tptr)
- lea 4*16($nptr),$nptr
+ lea 4*8($nptr),$nptr
jmp .Lmulx4x_inner
.align 32
@@ -2375,20 +2502,20 @@ $code.=<<___;
adcx $zero,%r14 # cf=0
adox %r15,%r10
- mulx 0*16($nptr),%rax,%r15
+ mulx 0*8($nptr),%rax,%r15
adcx %rax,%r10
adox %r15,%r11
- mulx 1*16($nptr),%rax,%r15
+ mulx 1*8($nptr),%rax,%r15
adcx %rax,%r11
adox %r15,%r12
- mulx 2*16($nptr),%rax,%r15
+ mulx 2*8($nptr),%rax,%r15
mov %r10,-5*8($tptr)
adcx %rax,%r12
adox %r15,%r13
mov %r11,-4*8($tptr)
- mulx 3*16($nptr),%rax,%r15
+ mulx 3*8($nptr),%rax,%r15
mov $bi,%rdx
- lea 4*16($nptr),$nptr
+ lea 4*8($nptr),$nptr
mov %r12,-3*8($tptr)
adcx %rax,%r13
adox $zero,%r15
@@ -2398,7 +2525,6 @@ $code.=<<___;
jnz .Lmulx4x_inner
mov 0+8(%rsp),$num # load -num
- movq %xmm0,%rdx # bp[i+1]
adc $zero,%r15 # modulo-scheduled
sub 0*8($tptr),$bptr # pull top-most carry to %cf
mov 8+8(%rsp),$bptr # re-load &b[i]
@@ -2411,20 +2537,26 @@ $code.=<<___;
cmp %r10,$bptr
jb .Lmulx4x_outer
- mov -16($nptr),%r10
+ mov -8($nptr),%r10
+ mov $zero,%r8
+ mov ($nptr,$num),%r12
+ lea ($nptr,$num),%rbp # rewind $nptr
+ mov $num,%rcx
+ lea ($tptr,$num),%rdi # rewind $tptr
+ xor %eax,%eax
xor %r15,%r15
sub %r14,%r10 # compare top-most words
adc %r15,%r15
- or %r15,$zero
- xor \$1,$zero
- lea ($tptr,$num),%rdi # rewind $tptr
- lea ($nptr,$num,2),$nptr # rewind $nptr
- .byte 0x67,0x67
- sar \$3+2,$num # cf=0
- lea ($nptr,$zero,8),%rbp
+ or %r15,%r8
+ sar \$3+2,%rcx
+ sub %r8,%rax # %rax=-%r8
mov 56+8(%rsp),%rdx # restore rp
- mov $num,%rcx
- jmp .Lsqrx4x_sub # common post-condition
+ dec %r12 # so that after 'not' we get -n[0]
+ mov 8*1(%rbp),%r13
+ xor %r8,%r8
+ mov 8*2(%rbp),%r14
+ mov 8*3(%rbp),%r15
+ jmp .Lsqrx4x_sub_entry # common post-condition
.size mulx4x_internal,.-mulx4x_internal
___
} {
@@ -2448,7 +2580,6 @@ $code.=<<___;
.align 32
bn_powerx5:
.Lpowerx5_enter:
- .byte 0x67
mov %rsp,%rax
push %rbx
push %rbp
@@ -2456,39 +2587,32 @@ bn_powerx5:
push %r13
push %r14
push %r15
-___
-$code.=<<___ if ($win64);
- lea -0x28(%rsp),%rsp
- movaps %xmm6,(%rsp)
- movaps %xmm7,0x10(%rsp)
-___
-$code.=<<___;
- .byte 0x67
- mov ${num}d,%r10d
+
shl \$3,${num}d # convert $num to bytes
- shl \$3+2,%r10d # 4*$num
+ lea ($num,$num,2),%r10 # 3*$num in bytes
neg $num
mov ($n0),$n0 # *n0
##############################################################
- # ensure that stack frame doesn't alias with $aptr+4*$num
- # modulo 4096, which covers ret[num], am[num] and n[2*num]
- # (see bn_exp.c). this is done to allow memory disambiguation
- # logic do its magic.
+ # Ensure that stack frame doesn't alias with $rptr+3*$num
+ # modulo 4096, which covers ret[num], am[num] and n[num]
+ # (see bn_exp.c). This is done to allow memory disambiguation
+ # logic do its magic. [Extra 256 bytes is for power mask
+ # calculated from 7th argument, the index.]
#
- lea -64(%rsp,$num,2),%r11
- sub $aptr,%r11
+ lea -320(%rsp,$num,2),%r11
+ sub $rptr,%r11
and \$4095,%r11
cmp %r11,%r10
jb .Lpwrx_sp_alt
sub %r11,%rsp # align with $aptr
- lea -64(%rsp,$num,2),%rsp # alloca(frame+2*$num)
+ lea -320(%rsp,$num,2),%rsp # alloca(frame+2*$num*8+256)
jmp .Lpwrx_sp_done
.align 32
.Lpwrx_sp_alt:
- lea 4096-64(,$num,2),%r10 # 4096-frame-2*$num
- lea -64(%rsp,$num,2),%rsp # alloca(frame+2*$num)
+ lea 4096-320(,$num,2),%r10
+ lea -320(%rsp,$num,2),%rsp # alloca(frame+2*$num*8+256)
sub %r10,%r11
mov \$0,%r10
cmovc %r10,%r11
@@ -2519,10 +2643,15 @@ $code.=<<___;
.Lpowerx5_body:
call __bn_sqrx8x_internal
+ call __bn_postx4x_internal
call __bn_sqrx8x_internal
+ call __bn_postx4x_internal
call __bn_sqrx8x_internal
+ call __bn_postx4x_internal
call __bn_sqrx8x_internal
+ call __bn_postx4x_internal
call __bn_sqrx8x_internal
+ call __bn_postx4x_internal
mov %r10,$num # -num
mov $aptr,$rptr
@@ -2534,12 +2663,7 @@ $code.=<<___;
mov 40(%rsp),%rsi # restore %rsp
mov \$1,%rax
-___
-$code.=<<___ if ($win64);
- movaps -88(%rsi),%xmm6
- movaps -72(%rsi),%xmm7
-___
-$code.=<<___;
+
mov -48(%rsi),%r15
mov -40(%rsi),%r14
mov -32(%rsi),%r13
@@ -2973,11 +3097,11 @@ my ($nptr,$carry,$m0)=("%rbp","%rsi","%rdx");
$code.=<<___;
movq %xmm2,$nptr
-sqrx8x_reduction:
+__bn_sqrx8x_reduction:
xor %eax,%eax # initial top-most carry bit
mov 32+8(%rsp),%rbx # n0
mov 48+8(%rsp),%rdx # "%r8", 8*0($tptr)
- lea -128($nptr,$num,2),%rcx # end of n[]
+ lea -8*8($nptr,$num),%rcx # end of n[]
#lea 48+8(%rsp,$num,2),$tptr # end of t[] buffer
mov %rcx, 0+8(%rsp) # save end of n[]
mov $tptr,8+8(%rsp) # save end of t[]
@@ -3006,23 +3130,23 @@ sqrx8x_reduction:
.align 32
.Lsqrx8x_reduce:
mov %r8, %rbx
- mulx 16*0($nptr),%rax,%r8 # n[0]
+ mulx 8*0($nptr),%rax,%r8 # n[0]
adcx %rbx,%rax # discarded
adox %r9,%r8
- mulx 16*1($nptr),%rbx,%r9 # n[1]
+ mulx 8*1($nptr),%rbx,%r9 # n[1]
adcx %rbx,%r8
adox %r10,%r9
- mulx 16*2($nptr),%rbx,%r10
+ mulx 8*2($nptr),%rbx,%r10
adcx %rbx,%r9
adox %r11,%r10
- mulx 16*3($nptr),%rbx,%r11
+ mulx 8*3($nptr),%rbx,%r11
adcx %rbx,%r10
adox %r12,%r11
- .byte 0xc4,0x62,0xe3,0xf6,0xa5,0x40,0x00,0x00,0x00 # mulx 16*4($nptr),%rbx,%r12
+ .byte 0xc4,0x62,0xe3,0xf6,0xa5,0x20,0x00,0x00,0x00 # mulx 8*4($nptr),%rbx,%r12
mov %rdx,%rax
mov %r8,%rdx
adcx %rbx,%r11
@@ -3032,15 +3156,15 @@ sqrx8x_reduction:
mov %rax,%rdx
mov %rax,64+48+8(%rsp,%rcx,8) # put aside n0*a[i]
- mulx 16*5($nptr),%rax,%r13
+ mulx 8*5($nptr),%rax,%r13
adcx %rax,%r12
adox %r14,%r13
- mulx 16*6($nptr),%rax,%r14
+ mulx 8*6($nptr),%rax,%r14
adcx %rax,%r13
adox %r15,%r14
- mulx 16*7($nptr),%rax,%r15
+ mulx 8*7($nptr),%rax,%r15
mov %rbx,%rdx
adcx %rax,%r14
adox $carry,%r15 # $carry is 0
@@ -3056,7 +3180,7 @@ sqrx8x_reduction:
mov 48+8(%rsp),%rdx # pull n0*a[0]
add 8*0($tptr),%r8
- lea 16*8($nptr),$nptr
+ lea 8*8($nptr),$nptr
mov \$-8,%rcx
adcx 8*1($tptr),%r9
adcx 8*2($tptr),%r10
@@ -3075,35 +3199,35 @@ sqrx8x_reduction:
.align 32
.Lsqrx8x_tail:
mov %r8,%rbx
- mulx 16*0($nptr),%rax,%r8
+ mulx 8*0($nptr),%rax,%r8
adcx %rax,%rbx
adox %r9,%r8
- mulx 16*1($nptr),%rax,%r9
+ mulx 8*1($nptr),%rax,%r9
adcx %rax,%r8
adox %r10,%r9
- mulx 16*2($nptr),%rax,%r10
+ mulx 8*2($nptr),%rax,%r10
adcx %rax,%r9
adox %r11,%r10
- mulx 16*3($nptr),%rax,%r11
+ mulx 8*3($nptr),%rax,%r11
adcx %rax,%r10
adox %r12,%r11
- .byte 0xc4,0x62,0xfb,0xf6,0xa5,0x40,0x00,0x00,0x00 # mulx 16*4($nptr),%rax,%r12
+ .byte 0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00 # mulx 8*4($nptr),%rax,%r12
adcx %rax,%r11
adox %r13,%r12
- mulx 16*5($nptr),%rax,%r13
+ mulx 8*5($nptr),%rax,%r13
adcx %rax,%r12
adox %r14,%r13
- mulx 16*6($nptr),%rax,%r14
+ mulx 8*6($nptr),%rax,%r14
adcx %rax,%r13
adox %r15,%r14
- mulx 16*7($nptr),%rax,%r15
+ mulx 8*7($nptr),%rax,%r15
mov 72+48+8(%rsp,%rcx,8),%rdx # pull n0*a[i]
adcx %rax,%r14
adox $carry,%r15
@@ -3119,7 +3243,7 @@ sqrx8x_reduction:
sub 16+8(%rsp),$carry # mov 16(%rsp),%cf
mov 48+8(%rsp),%rdx # pull n0*a[0]
- lea 16*8($nptr),$nptr
+ lea 8*8($nptr),$nptr
adc 8*0($tptr),%r8
adc 8*1($tptr),%r9
adc 8*2($tptr),%r10
@@ -3155,7 +3279,7 @@ sqrx8x_reduction:
adc 8*0($tptr),%r8
movq %xmm3,%rcx
adc 8*1($tptr),%r9
- mov 16*7($nptr),$carry
+ mov 8*7($nptr),$carry
movq %xmm2,$nptr # restore $nptr
adc 8*2($tptr),%r10
adc 8*3($tptr),%r11
@@ -3181,6 +3305,8 @@ sqrx8x_reduction:
lea 8*8($tptr,%rcx),$tptr # start of current t[] window
cmp 8+8(%rsp),%r8 # end of t[]?
jb .Lsqrx8x_reduction_loop
+ ret
+.size bn_sqrx8x_internal,.-bn_sqrx8x_internal
___
}
##############################################################
@@ -3188,52 +3314,59 @@ ___
#
{
my ($rptr,$nptr)=("%rdx","%rbp");
-my @ri=map("%r$_",(10..13));
-my @ni=map("%r$_",(14..15));
$code.=<<___;
- xor %ebx,%ebx
- sub %r15,%rsi # compare top-most words
- adc %rbx,%rbx
+.align 32
+__bn_postx4x_internal:
+ mov 8*0($nptr),%r12
mov %rcx,%r10 # -$num
- or %rbx,%rax
mov %rcx,%r9 # -$num
- xor \$1,%rax
- sar \$3+2,%rcx # cf=0
+ neg %rax
+ sar \$3+2,%rcx
#lea 48+8(%rsp,%r9),$tptr
- lea ($nptr,%rax,8),$nptr
movq %xmm1,$rptr # restore $rptr
movq %xmm1,$aptr # prepare for back-to-back call
- jmp .Lsqrx4x_sub
+ dec %r12 # so that after 'not' we get -n[0]
+ mov 8*1($nptr),%r13
+ xor %r8,%r8
+ mov 8*2($nptr),%r14
+ mov 8*3($nptr),%r15
+ jmp .Lsqrx4x_sub_entry
-.align 32
+.align 16
.Lsqrx4x_sub:
- .byte 0x66
- mov 8*0($tptr),%r12
- mov 8*1($tptr),%r13
- sbb 16*0($nptr),%r12
- mov 8*2($tptr),%r14
- sbb 16*1($nptr),%r13
- mov 8*3($tptr),%r15
- lea 8*4($tptr),$tptr
- sbb 16*2($nptr),%r14
+ mov 8*0($nptr),%r12
+ mov 8*1($nptr),%r13
+ mov 8*2($nptr),%r14
+ mov 8*3($nptr),%r15
+.Lsqrx4x_sub_entry:
+ andn %rax,%r12,%r12
+ lea 8*4($nptr),$nptr
+ andn %rax,%r13,%r13
+ andn %rax,%r14,%r14
+ andn %rax,%r15,%r15
+
+ neg %r8 # mov %r8,%cf
+ adc 8*0($tptr),%r12
+ adc 8*1($tptr),%r13
+ adc 8*2($tptr),%r14
+ adc 8*3($tptr),%r15
mov %r12,8*0($rptr)
- sbb 16*3($nptr),%r15
- lea 16*4($nptr),$nptr
+ lea 8*4($tptr),$tptr
mov %r13,8*1($rptr)
+ sbb %r8,%r8 # mov %cf,%r8
mov %r14,8*2($rptr)
mov %r15,8*3($rptr)
lea 8*4($rptr),$rptr
inc %rcx
jnz .Lsqrx4x_sub
-___
-}
-$code.=<<___;
+
neg %r9 # restore $num
ret
-.size bn_sqrx8x_internal,.-bn_sqrx8x_internal
+.size __bn_postx4x_internal,.-__bn_postx4x_internal
___
+}
}}}
{
my ($inp,$num,$tbl,$idx)=$win64?("%rcx","%edx","%r8", "%r9d") : # Win64 order
@@ -3282,56 +3415,91 @@ bn_scatter5:
.globl bn_gather5
.type bn_gather5,\@abi-omnipotent
-.align 16
+.align 32
bn_gather5:
-___
-$code.=<<___ if ($win64);
-.LSEH_begin_bn_gather5:
+.LSEH_begin_bn_gather5: # Win64 thing, but harmless in other cases
# I can't trust assembler to use specific encoding:-(
- .byte 0x48,0x83,0xec,0x28 #sub \$0x28,%rsp
- .byte 0x0f,0x29,0x34,0x24 #movaps %xmm6,(%rsp)
- .byte 0x0f,0x29,0x7c,0x24,0x10 #movdqa %xmm7,0x10(%rsp)
+ .byte 0x4c,0x8d,0x14,0x24 #lea (%rsp),%r10
+ .byte 0x48,0x81,0xec,0x08,0x01,0x00,0x00 #sub $0x108,%rsp
+ lea .Linc(%rip),%rax
+ and \$-16,%rsp # shouldn't be formally required
+
+ movd $idx,%xmm5
+ movdqa 0(%rax),%xmm0 # 00000001000000010000000000000000
+ movdqa 16(%rax),%xmm1 # 00000002000000020000000200000002
+ lea 128($tbl),%r11 # size optimization
+ lea 128(%rsp),%rax # size optimization
+
+ pshufd \$0,%xmm5,%xmm5 # broadcast $idx
+ movdqa %xmm1,%xmm4
+ movdqa %xmm1,%xmm2
___
+########################################################################
+# calculate mask by comparing 0..31 to $idx and save result to stack
+#
+for($i=0;$i<$STRIDE/16;$i+=4) {
$code.=<<___;
- mov $idx,%r11d
- shr \$`log($N/8)/log(2)`,$idx
- and \$`$N/8-1`,%r11
- not $idx
- lea .Lmagic_masks(%rip),%rax
- and \$`2**5/($N/8)-1`,$idx # 5 is "window size"
- lea 128($tbl,%r11,8),$tbl # pointer within 1st cache line
- movq 0(%rax,$idx,8),%xmm4 # set of masks denoting which
- movq 8(%rax,$idx,8),%xmm5 # cache line contains element
- movq 16(%rax,$idx,8),%xmm6 # denoted by 7th argument
- movq 24(%rax,$idx,8),%xmm7
+ paddd %xmm0,%xmm1
+ pcmpeqd %xmm5,%xmm0 # compare to 1,0
+___
+$code.=<<___ if ($i);
+ movdqa %xmm3,`16*($i-1)-128`(%rax)
+___
+$code.=<<___;
+ movdqa %xmm4,%xmm3
+
+ paddd %xmm1,%xmm2
+ pcmpeqd %xmm5,%xmm1 # compare to 3,2
+ movdqa %xmm0,`16*($i+0)-128`(%rax)
+ movdqa %xmm4,%xmm0
+
+ paddd %xmm2,%xmm3
+ pcmpeqd %xmm5,%xmm2 # compare to 5,4
+ movdqa %xmm1,`16*($i+1)-128`(%rax)
+ movdqa %xmm4,%xmm1
+
+ paddd %xmm3,%xmm0
+ pcmpeqd %xmm5,%xmm3 # compare to 7,6
+ movdqa %xmm2,`16*($i+2)-128`(%rax)
+ movdqa %xmm4,%xmm2
+___
+}
+$code.=<<___;
+ movdqa %xmm3,`16*($i-1)-128`(%rax)
jmp .Lgather
-.align 16
-.Lgather:
- movq `0*$STRIDE/4-128`($tbl),%xmm0
- movq `1*$STRIDE/4-128`($tbl),%xmm1
- pand %xmm4,%xmm0
- movq `2*$STRIDE/4-128`($tbl),%xmm2
- pand %xmm5,%xmm1
- movq `3*$STRIDE/4-128`($tbl),%xmm3
- pand %xmm6,%xmm2
- por %xmm1,%xmm0
- pand %xmm7,%xmm3
- .byte 0x67,0x67
- por %xmm2,%xmm0
- lea $STRIDE($tbl),$tbl
- por %xmm3,%xmm0
+.align 32
+.Lgather:
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
+___
+for($i=0;$i<$STRIDE/16;$i+=4) {
+$code.=<<___;
+ movdqa `16*($i+0)-128`(%r11),%xmm0
+ movdqa `16*($i+1)-128`(%r11),%xmm1
+ movdqa `16*($i+2)-128`(%r11),%xmm2
+ pand `16*($i+0)-128`(%rax),%xmm0
+ movdqa `16*($i+3)-128`(%r11),%xmm3
+ pand `16*($i+1)-128`(%rax),%xmm1
+ por %xmm0,%xmm4
+ pand `16*($i+2)-128`(%rax),%xmm2
+ por %xmm1,%xmm5
+ pand `16*($i+3)-128`(%rax),%xmm3
+ por %xmm2,%xmm4
+ por %xmm3,%xmm5
+___
+}
+$code.=<<___;
+ por %xmm5,%xmm4
+ lea $STRIDE(%r11),%r11
+ pshufd \$0x4e,%xmm4,%xmm0
+ por %xmm4,%xmm0
movq %xmm0,($out) # m0=bp[0]
lea 8($out),$out
sub \$1,$num
jnz .Lgather
-___
-$code.=<<___ if ($win64);
- movaps (%rsp),%xmm6
- movaps 0x10(%rsp),%xmm7
- lea 0x28(%rsp),%rsp
-___
-$code.=<<___;
+
+ lea (%r10),%rsp
ret
.LSEH_end_bn_gather5:
.size bn_gather5,.-bn_gather5
@@ -3339,9 +3507,9 @@ ___
}
$code.=<<___;
.align 64
-.Lmagic_masks:
- .long 0,0, 0,0, 0,0, -1,-1
- .long 0,0, 0,0, 0,0, 0,0
+.Linc:
+ .long 0,0, 1,1
+ .long 2,2, 2,2
.asciz "Montgomery Multiplication with scatter/gather for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
___
@@ -3389,19 +3557,16 @@ mul_handler:
lea .Lmul_epilogue(%rip),%r10
cmp %r10,%rbx
- jb .Lbody_40
+ ja .Lbody_40
mov 192($context),%r10 # pull $num
mov 8(%rax,%r10,8),%rax # pull saved stack pointer
+
jmp .Lbody_proceed
.Lbody_40:
mov 40(%rax),%rax # pull saved stack pointer
.Lbody_proceed:
-
- movaps -88(%rax),%xmm0
- movaps -72(%rax),%xmm1
-
mov -8(%rax),%rbx
mov -16(%rax),%rbp
mov -24(%rax),%r12
@@ -3414,8 +3579,6 @@ mul_handler:
mov %r13,224($context) # restore context->R13
mov %r14,232($context) # restore context->R14
mov %r15,240($context) # restore context->R15
- movups %xmm0,512($context) # restore context->Xmm6
- movups %xmm1,528($context) # restore context->Xmm7
.Lcommon_seh_tail:
mov 8(%rax),%rdi
@@ -3526,10 +3689,9 @@ ___
$code.=<<___;
.align 8
.LSEH_info_bn_gather5:
- .byte 0x01,0x0d,0x05,0x00
- .byte 0x0d,0x78,0x01,0x00 #movaps 0x10(rsp),xmm7
- .byte 0x08,0x68,0x00,0x00 #movaps (rsp),xmm6
- .byte 0x04,0x42,0x00,0x00 #sub rsp,0x28
+ .byte 0x01,0x0b,0x03,0x0a
+ .byte 0x0b,0x01,0x21,0x00 # sub rsp,0x108
+ .byte 0x04,0xa3,0x00,0x00 # lea r10,(rsp)
.align 8
___
}
diff --git a/crypto/bn/bn.h b/crypto/bn/bn.h
index 5696965e9a09..86264ae6315f 100644
--- a/crypto/bn/bn.h
+++ b/crypto/bn/bn.h
@@ -125,6 +125,7 @@
#ifndef HEADER_BN_H
# define HEADER_BN_H
+# include <limits.h>
# include <openssl/e_os2.h>
# ifndef OPENSSL_NO_FP_API
# include <stdio.h> /* FILE */
@@ -721,8 +722,17 @@ const BIGNUM *BN_get0_nist_prime_521(void);
/* library internal functions */
-# define bn_expand(a,bits) ((((((bits+BN_BITS2-1))/BN_BITS2)) <= (a)->dmax)?\
- (a):bn_expand2((a),(bits+BN_BITS2-1)/BN_BITS2))
+# define bn_expand(a,bits) \
+ ( \
+ bits > (INT_MAX - BN_BITS2 + 1) ? \
+ NULL \
+ : \
+ (((bits+BN_BITS2-1)/BN_BITS2) <= (a)->dmax) ? \
+ (a) \
+ : \
+ bn_expand2((a),(bits+BN_BITS2-1)/BN_BITS2) \
+ )
+
# define bn_wexpand(a,words) (((words) <= (a)->dmax)?(a):bn_expand2((a),(words)))
BIGNUM *bn_expand2(BIGNUM *a, int words);
# ifndef OPENSSL_NO_DEPRECATED
diff --git a/crypto/bn/bn_exp.c b/crypto/bn/bn_exp.c
index 6d30d1e0fff5..1670f01d1d8c 100644
--- a/crypto/bn/bn_exp.c
+++ b/crypto/bn/bn_exp.c
@@ -110,6 +110,7 @@
*/
#include "cryptlib.h"
+#include "constant_time_locl.h"
#include "bn_lcl.h"
#include <stdlib.h>
@@ -606,15 +607,17 @@ static BN_ULONG bn_get_bits(const BIGNUM *a, int bitpos)
static int MOD_EXP_CTIME_COPY_TO_PREBUF(const BIGNUM *b, int top,
unsigned char *buf, int idx,
- int width)
+ int window)
{
- size_t i, j;
+ int i, j;
+ int width = 1 << window;
+ BN_ULONG *table = (BN_ULONG *)buf;
if (top > b->top)
top = b->top; /* this works because 'buf' is explicitly
* zeroed */
- for (i = 0, j = idx; i < top * sizeof b->d[0]; i++, j += width) {
- buf[j] = ((unsigned char *)b->d)[i];
+ for (i = 0, j = idx; i < top; i++, j += width) {
+ table[j] = b->d[i];
}
return 1;
@@ -622,15 +625,51 @@ static int MOD_EXP_CTIME_COPY_TO_PREBUF(const BIGNUM *b, int top,
static int MOD_EXP_CTIME_COPY_FROM_PREBUF(BIGNUM *b, int top,
unsigned char *buf, int idx,
- int width)
+ int window)
{
- size_t i, j;
+ int i, j;
+ int width = 1 << window;
+ volatile BN_ULONG *table = (volatile BN_ULONG *)buf;
if (bn_wexpand(b, top) == NULL)
return 0;
- for (i = 0, j = idx; i < top * sizeof b->d[0]; i++, j += width) {
- ((unsigned char *)b->d)[i] = buf[j];
+ if (window <= 3) {
+ for (i = 0; i < top; i++, table += width) {
+ BN_ULONG acc = 0;
+
+ for (j = 0; j < width; j++) {
+ acc |= table[j] &
+ ((BN_ULONG)0 - (constant_time_eq_int(j,idx)&1));
+ }
+
+ b->d[i] = acc;
+ }
+ } else {
+ int xstride = 1 << (window - 2);
+ BN_ULONG y0, y1, y2, y3;
+
+ i = idx >> (window - 2); /* equivalent of idx / xstride */
+ idx &= xstride - 1; /* equivalent of idx % xstride */
+
+ y0 = (BN_ULONG)0 - (constant_time_eq_int(i,0)&1);
+ y1 = (BN_ULONG)0 - (constant_time_eq_int(i,1)&1);
+ y2 = (BN_ULONG)0 - (constant_time_eq_int(i,2)&1);
+ y3 = (BN_ULONG)0 - (constant_time_eq_int(i,3)&1);
+
+ for (i = 0; i < top; i++, table += width) {
+ BN_ULONG acc = 0;
+
+ for (j = 0; j < xstride; j++) {
+ acc |= ( (table[j + 0 * xstride] & y0) |
+ (table[j + 1 * xstride] & y1) |
+ (table[j + 2 * xstride] & y2) |
+ (table[j + 3 * xstride] & y3) )
+ & ((BN_ULONG)0 - (constant_time_eq_int(j,idx)&1));
+ }
+
+ b->d[i] = acc;
+ }
}
b->top = top;
@@ -749,8 +788,8 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
if (window >= 5) {
window = 5; /* ~5% improvement for RSA2048 sign, and even
* for RSA4096 */
- if ((top & 7) == 0)
- powerbufLen += 2 * top * sizeof(m->d[0]);
+ /* reserve space for mont->N.d[] copy */
+ powerbufLen += top * sizeof(mont->N.d[0]);
}
#endif
(void)0;
@@ -971,7 +1010,7 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
const BN_ULONG *not_used, const BN_ULONG *np,
const BN_ULONG *n0, int num);
- BN_ULONG *np = mont->N.d, *n0 = mont->n0, *np2;
+ BN_ULONG *n0 = mont->n0, *np;
/*
* BN_to_montgomery can contaminate words above .top [in
@@ -982,11 +1021,11 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
for (i = tmp.top; i < top; i++)
tmp.d[i] = 0;
- if (top & 7)
- np2 = np;
- else
- for (np2 = am.d + top, i = 0; i < top; i++)
- np2[2 * i] = np[i];
+ /*
+ * copy mont->N.d[] to improve cache locality
+ */
+ for (np = am.d + top, i = 0; i < top; i++)
+ np[i] = mont->N.d[i];
bn_scatter5(tmp.d, top, powerbuf, 0);
bn_scatter5(am.d, am.top, powerbuf, 1);
@@ -996,7 +1035,7 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
# if 0
for (i = 3; i < 32; i++) {
/* Calculate a^i = a^(i-1) * a */
- bn_mul_mont_gather5(tmp.d, am.d, powerbuf, np2, n0, top, i - 1);
+ bn_mul_mont_gather5(tmp.d, am.d, powerbuf, np, n0, top, i - 1);
bn_scatter5(tmp.d, top, powerbuf, i);
}
# else
@@ -1007,7 +1046,7 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
}
for (i = 3; i < 8; i += 2) {
int j;
- bn_mul_mont_gather5(tmp.d, am.d, powerbuf, np2, n0, top, i - 1);
+ bn_mul_mont_gather5(tmp.d, am.d, powerbuf, np, n0, top, i - 1);
bn_scatter5(tmp.d, top, powerbuf, i);
for (j = 2 * i; j < 32; j *= 2) {
bn_mul_mont(tmp.d, tmp.d, tmp.d, np, n0, top);
@@ -1015,13 +1054,13 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
}
}
for (; i < 16; i += 2) {
- bn_mul_mont_gather5(tmp.d, am.d, powerbuf, np2, n0, top, i - 1);
+ bn_mul_mont_gather5(tmp.d, am.d, powerbuf, np, n0, top, i - 1);
bn_scatter5(tmp.d, top, powerbuf, i);
bn_mul_mont(tmp.d, tmp.d, tmp.d, np, n0, top);
bn_scatter5(tmp.d, top, powerbuf, 2 * i);
}
for (; i < 32; i += 2) {
- bn_mul_mont_gather5(tmp.d, am.d, powerbuf, np2, n0, top, i - 1);
+ bn_mul_mont_gather5(tmp.d, am.d, powerbuf, np, n0, top, i - 1);
bn_scatter5(tmp.d, top, powerbuf, i);
}
# endif
@@ -1050,11 +1089,11 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
while (bits >= 0) {
wvalue = bn_get_bits5(p->d, bits - 4);
bits -= 5;
- bn_power5(tmp.d, tmp.d, powerbuf, np2, n0, top, wvalue);
+ bn_power5(tmp.d, tmp.d, powerbuf, np, n0, top, wvalue);
}
}
- ret = bn_from_montgomery(tmp.d, tmp.d, NULL, np2, n0, top);
+ ret = bn_from_montgomery(tmp.d, tmp.d, NULL, np, n0, top);
tmp.top = top;
bn_correct_top(&tmp);
if (ret) {
@@ -1065,9 +1104,9 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
} else
#endif
{
- if (!MOD_EXP_CTIME_COPY_TO_PREBUF(&tmp, top, powerbuf, 0, numPowers))
+ if (!MOD_EXP_CTIME_COPY_TO_PREBUF(&tmp, top, powerbuf, 0, window))
goto err;
- if (!MOD_EXP_CTIME_COPY_TO_PREBUF(&am, top, powerbuf, 1, numPowers))
+ if (!MOD_EXP_CTIME_COPY_TO_PREBUF(&am, top, powerbuf, 1, window))
goto err;
/*
@@ -1079,15 +1118,15 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
if (window > 1) {
if (!BN_mod_mul_montgomery(&tmp, &am, &am, mont, ctx))
goto err;
- if (!MOD_EXP_CTIME_COPY_TO_PREBUF
- (&tmp, top, powerbuf, 2, numPowers))
+ if (!MOD_EXP_CTIME_COPY_TO_PREBUF(&tmp, top, powerbuf, 2,
+ window))
goto err;
for (i = 3; i < numPowers; i++) {
/* Calculate a^i = a^(i-1) * a */
if (!BN_mod_mul_montgomery(&tmp, &am, &tmp, mont, ctx))
goto err;
- if (!MOD_EXP_CTIME_COPY_TO_PREBUF
- (&tmp, top, powerbuf, i, numPowers))
+ if (!MOD_EXP_CTIME_COPY_TO_PREBUF(&tmp, top, powerbuf, i,
+ window))
goto err;
}
}
@@ -1095,8 +1134,8 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
bits--;
for (wvalue = 0, i = bits % window; i >= 0; i--, bits--)
wvalue = (wvalue << 1) + BN_is_bit_set(p, bits);
- if (!MOD_EXP_CTIME_COPY_FROM_PREBUF
- (&tmp, top, powerbuf, wvalue, numPowers))
+ if (!MOD_EXP_CTIME_COPY_FROM_PREBUF(&tmp, top, powerbuf, wvalue,
+ window))
goto err;
/*
@@ -1116,8 +1155,8 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
/*
* Fetch the appropriate pre-computed value from the pre-buf
*/
- if (!MOD_EXP_CTIME_COPY_FROM_PREBUF
- (&am, top, powerbuf, wvalue, numPowers))
+ if (!MOD_EXP_CTIME_COPY_FROM_PREBUF(&am, top, powerbuf, wvalue,
+ window))
goto err;
/* Multiply the result into the intermediate result */
diff --git a/crypto/bn/bn_print.c b/crypto/bn/bn_print.c
index ab10b957ba27..bfa31efc5621 100644
--- a/crypto/bn/bn_print.c
+++ b/crypto/bn/bn_print.c
@@ -58,6 +58,7 @@
#include <stdio.h>
#include <ctype.h>
+#include <limits.h>
#include "cryptlib.h"
#include <openssl/buffer.h>
#include "bn_lcl.h"
@@ -189,7 +190,11 @@ int BN_hex2bn(BIGNUM **bn, const char *a)
a++;
}
- for (i = 0; isxdigit((unsigned char)a[i]); i++) ;
+ for (i = 0; i <= (INT_MAX/4) && isxdigit((unsigned char)a[i]); i++)
+ continue;
+
+ if (i > INT_MAX/4)
+ goto err;
num = i + neg;
if (bn == NULL)
@@ -204,7 +209,7 @@ int BN_hex2bn(BIGNUM **bn, const char *a)
BN_zero(ret);
}
- /* i is the number of hex digests; */
+ /* i is the number of hex digits */
if (bn_expand(ret, i * 4) == NULL)
goto err;
@@ -260,7 +265,11 @@ int BN_dec2bn(BIGNUM **bn, const char *a)
a++;
}
- for (i = 0; isdigit((unsigned char)a[i]); i++) ;
+ for (i = 0; i <= (INT_MAX/4) && isdigit((unsigned char)a[i]); i++)
+ continue;
+
+ if (i > INT_MAX/4)
+ goto err;
num = i + neg;
if (bn == NULL)
@@ -278,7 +287,7 @@ int BN_dec2bn(BIGNUM **bn, const char *a)
BN_zero(ret);
}
- /* i is the number of digests, a bit of an over expand; */
+ /* i is the number of digits, a bit of an over expand */
if (bn_expand(ret, i * 4) == NULL)
goto err;
diff --git a/crypto/bn/bn_recp.c b/crypto/bn/bn_recp.c
index 7497ac624d94..f047040efe03 100644
--- a/crypto/bn/bn_recp.c
+++ b/crypto/bn/bn_recp.c
@@ -65,6 +65,7 @@ void BN_RECP_CTX_init(BN_RECP_CTX *recp)
BN_init(&(recp->N));
BN_init(&(recp->Nr));
recp->num_bits = 0;
+ recp->shift = 0;
recp->flags = 0;
}
diff --git a/crypto/cmac/cmac.c b/crypto/cmac/cmac.c
index 774e6dc91905..2954b6eb7dcf 100644
--- a/crypto/cmac/cmac.c
+++ b/crypto/cmac/cmac.c
@@ -160,6 +160,14 @@ int CMAC_Init(CMAC_CTX *ctx, const void *key, size_t keylen,
EVPerr(EVP_F_CMAC_INIT, EVP_R_DISABLED_FOR_FIPS);
return 0;
}
+
+ /* Switch to FIPS cipher implementation if possible */
+ if (cipher != NULL) {
+ const EVP_CIPHER *fcipher;
+ fcipher = FIPS_get_cipherbynid(EVP_CIPHER_nid(cipher));
+ if (fcipher != NULL)
+ cipher = fcipher;
+ }
/*
* Other algorithm blocking will be done in FIPS_cmac_init, via
* FIPS_cipherinit().
diff --git a/crypto/cryptlib.c b/crypto/cryptlib.c
index c9f674ba8e62..1925428f5ec5 100644
--- a/crypto/cryptlib.c
+++ b/crypto/cryptlib.c
@@ -1016,11 +1016,11 @@ void *OPENSSL_stderr(void)
return stderr;
}
-int CRYPTO_memcmp(const void *in_a, const void *in_b, size_t len)
+int CRYPTO_memcmp(const volatile void *in_a, const volatile void *in_b, size_t len)
{
size_t i;
- const unsigned char *a = in_a;
- const unsigned char *b = in_b;
+ const volatile unsigned char *a = in_a;
+ const volatile unsigned char *b = in_b;
unsigned char x = 0;
for (i = 0; i < len; i++)
diff --git a/crypto/crypto.h b/crypto/crypto.h
index c450d7a3c374..6c644ce12a82 100644
--- a/crypto/crypto.h
+++ b/crypto/crypto.h
@@ -628,7 +628,7 @@ void OPENSSL_init(void);
* into a defined order as the return value when a != b is undefined, other
* than to be non-zero.
*/
-int CRYPTO_memcmp(const void *a, const void *b, size_t len);
+int CRYPTO_memcmp(const volatile void *a, const volatile void *b, size_t len);
/* BEGIN ERROR CODES */
/*
diff --git a/crypto/dh/dh.h b/crypto/dh/dh.h
index 5498a9dc1060..a5bd9016aae8 100644
--- a/crypto/dh/dh.h
+++ b/crypto/dh/dh.h
@@ -174,7 +174,7 @@ struct dh_st {
/* DH_check_pub_key error codes */
# define DH_CHECK_PUBKEY_TOO_SMALL 0x01
# define DH_CHECK_PUBKEY_TOO_LARGE 0x02
-# define DH_CHECK_PUBKEY_INVALID 0x03
+# define DH_CHECK_PUBKEY_INVALID 0x04
/*
* primes p where (p-1)/2 is prime too are called "safe"; we define this for
diff --git a/crypto/dh/dh_check.c b/crypto/dh/dh_check.c
index 5adedc0d264e..027704111432 100644
--- a/crypto/dh/dh_check.c
+++ b/crypto/dh/dh_check.c
@@ -160,13 +160,12 @@ int DH_check_pub_key(const DH *dh, const BIGNUM *pub_key, int *ret)
goto err;
BN_CTX_start(ctx);
tmp = BN_CTX_get(ctx);
- if (tmp == NULL)
+ if (tmp == NULL || !BN_set_word(tmp, 1))
goto err;
- BN_set_word(tmp, 1);
if (BN_cmp(pub_key, tmp) <= 0)
*ret |= DH_CHECK_PUBKEY_TOO_SMALL;
- BN_copy(tmp, dh->p);
- BN_sub_word(tmp, 1);
+ if (BN_copy(tmp, dh->p) == NULL || !BN_sub_word(tmp, 1))
+ goto err;
if (BN_cmp(pub_key, tmp) >= 0)
*ret |= DH_CHECK_PUBKEY_TOO_LARGE;
diff --git a/crypto/dsa/dsa_ameth.c b/crypto/dsa/dsa_ameth.c
index c40e1777ade1..cc83d6e6ad3b 100644
--- a/crypto/dsa/dsa_ameth.c
+++ b/crypto/dsa/dsa_ameth.c
@@ -191,6 +191,8 @@ static int dsa_priv_decode(EVP_PKEY *pkey, PKCS8_PRIV_KEY_INFO *p8)
STACK_OF(ASN1_TYPE) *ndsa = NULL;
DSA *dsa = NULL;
+ int ret = 0;
+
if (!PKCS8_pkey_get0(NULL, &p, &pklen, &palg, p8))
return 0;
X509_ALGOR_get0(NULL, &ptype, &pval, palg);
@@ -262,23 +264,21 @@ static int dsa_priv_decode(EVP_PKEY *pkey, PKCS8_PRIV_KEY_INFO *p8)
}
EVP_PKEY_assign_DSA(pkey, dsa);
- BN_CTX_free(ctx);
- if (ndsa)
- sk_ASN1_TYPE_pop_free(ndsa, ASN1_TYPE_free);
- else
- ASN1_STRING_clear_free(privkey);
- return 1;
+ ret = 1;
+ goto done;
decerr:
- DSAerr(DSA_F_DSA_PRIV_DECODE, EVP_R_DECODE_ERROR);
+ DSAerr(DSA_F_DSA_PRIV_DECODE, DSA_R_DECODE_ERROR);
dsaerr:
+ DSA_free(dsa);
+ done:
BN_CTX_free(ctx);
- if (privkey)
+ if (ndsa)
+ sk_ASN1_TYPE_pop_free(ndsa, ASN1_TYPE_free);
+ else
ASN1_STRING_clear_free(privkey);
- sk_ASN1_TYPE_pop_free(ndsa, ASN1_TYPE_free);
- DSA_free(dsa);
- return 0;
+ return ret;
}
static int dsa_priv_encode(PKCS8_PRIV_KEY_INFO *p8, const EVP_PKEY *pkey)
diff --git a/crypto/dso/dso_lib.c b/crypto/dso/dso_lib.c
index 3312450eae67..2beb7c1ba542 100644
--- a/crypto/dso/dso_lib.c
+++ b/crypto/dso/dso_lib.c
@@ -122,6 +122,7 @@ DSO *DSO_new_method(DSO_METHOD *meth)
ret->meth = meth;
ret->references = 1;
if ((ret->meth->init != NULL) && !ret->meth->init(ret)) {
+ sk_void_free(ret->meth_data);
OPENSSL_free(ret);
ret = NULL;
}
diff --git a/crypto/ec/asm/ecp_nistz256-x86_64.pl b/crypto/ec/asm/ecp_nistz256-x86_64.pl
index e6acfd59f0d4..7140860e245b 100755
--- a/crypto/ec/asm/ecp_nistz256-x86_64.pl
+++ b/crypto/ec/asm/ecp_nistz256-x86_64.pl
@@ -2001,6 +2001,7 @@ $code.=<<___;
push %r15
sub \$32*5+8, %rsp
+.Lpoint_double_shortcut$x:
movdqu 0x00($a_ptr), %xmm0 # copy *(P256_POINT *)$a_ptr.x
mov $a_ptr, $b_ptr # backup copy
movdqu 0x10($a_ptr), %xmm1
@@ -2291,6 +2292,7 @@ $code.=<<___;
mov 0x40+8*1($b_ptr), $acc6
mov 0x40+8*2($b_ptr), $acc7
mov 0x40+8*3($b_ptr), $acc0
+ movq $b_ptr, %xmm1
lea 0x40-$bias($b_ptr), $a_ptr
lea $Z1sqr(%rsp), $r_ptr # Z1^2
@@ -2346,7 +2348,7 @@ $code.=<<___;
test $acc0, $acc0
jnz .Ladd_proceed$x # (in1infty || in2infty)?
test $acc1, $acc1
- jz .Ladd_proceed$x # is_equal(S1,S2)?
+ jz .Ladd_double$x # is_equal(S1,S2)?
movq %xmm0, $r_ptr # restore $r_ptr
pxor %xmm0, %xmm0
@@ -2359,6 +2361,13 @@ $code.=<<___;
jmp .Ladd_done$x
.align 32
+.Ladd_double$x:
+ movq %xmm1, $a_ptr # restore $a_ptr
+ movq %xmm0, $r_ptr # restore $r_ptr
+ add \$`32*(18-5)`, %rsp # difference in frame sizes
+ jmp .Lpoint_double_shortcut$x
+
+.align 32
.Ladd_proceed$x:
`&load_for_sqr("$R(%rsp)", "$src0")`
lea $Rsqr(%rsp), $r_ptr # R^2
diff --git a/crypto/ec/ecp_nistp224.c b/crypto/ec/ecp_nistp224.c
index ed09f97ade68..d81cc9ce6b1a 100644
--- a/crypto/ec/ecp_nistp224.c
+++ b/crypto/ec/ecp_nistp224.c
@@ -1657,8 +1657,7 @@ int ec_GFp_nistp224_precompute_mult(EC_GROUP *group, BN_CTX *ctx)
*/
if (0 == EC_POINT_cmp(group, generator, group->generator, ctx)) {
memcpy(pre->g_pre_comp, gmul, sizeof(pre->g_pre_comp));
- ret = 1;
- goto err;
+ goto done;
}
if ((!BN_to_felem(pre->g_pre_comp[0][1][0], &group->generator->X)) ||
(!BN_to_felem(pre->g_pre_comp[0][1][1], &group->generator->Y)) ||
@@ -1736,6 +1735,7 @@ int ec_GFp_nistp224_precompute_mult(EC_GROUP *group, BN_CTX *ctx)
}
make_points_affine(31, &(pre->g_pre_comp[0][1]), tmp_felems);
+ done:
if (!EC_EX_DATA_set_data(&group->extra_data, pre, nistp224_pre_comp_dup,
nistp224_pre_comp_free,
nistp224_pre_comp_clear_free))
diff --git a/crypto/ec/ecp_nistp256.c b/crypto/ec/ecp_nistp256.c
index a5887086c638..78d191aac7af 100644
--- a/crypto/ec/ecp_nistp256.c
+++ b/crypto/ec/ecp_nistp256.c
@@ -2249,8 +2249,7 @@ int ec_GFp_nistp256_precompute_mult(EC_GROUP *group, BN_CTX *ctx)
*/
if (0 == EC_POINT_cmp(group, generator, group->generator, ctx)) {
memcpy(pre->g_pre_comp, gmul, sizeof(pre->g_pre_comp));
- ret = 1;
- goto err;
+ goto done;
}
if ((!BN_to_felem(x_tmp, &group->generator->X)) ||
(!BN_to_felem(y_tmp, &group->generator->Y)) ||
@@ -2337,6 +2336,7 @@ int ec_GFp_nistp256_precompute_mult(EC_GROUP *group, BN_CTX *ctx)
}
make_points_affine(31, &(pre->g_pre_comp[0][1]), tmp_smallfelems);
+ done:
if (!EC_EX_DATA_set_data(&group->extra_data, pre, nistp256_pre_comp_dup,
nistp256_pre_comp_free,
nistp256_pre_comp_clear_free))
diff --git a/crypto/ec/ecp_nistp521.c b/crypto/ec/ecp_nistp521.c
index 360b9a3516f6..c53a61bbfb69 100644
--- a/crypto/ec/ecp_nistp521.c
+++ b/crypto/ec/ecp_nistp521.c
@@ -2056,8 +2056,7 @@ int ec_GFp_nistp521_precompute_mult(EC_GROUP *group, BN_CTX *ctx)
*/
if (0 == EC_POINT_cmp(group, generator, group->generator, ctx)) {
memcpy(pre->g_pre_comp, gmul, sizeof(pre->g_pre_comp));
- ret = 1;
- goto err;
+ goto done;
}
if ((!BN_to_felem(pre->g_pre_comp[1][0], &group->generator->X)) ||
(!BN_to_felem(pre->g_pre_comp[1][1], &group->generator->Y)) ||
@@ -2115,6 +2114,7 @@ int ec_GFp_nistp521_precompute_mult(EC_GROUP *group, BN_CTX *ctx)
}
make_points_affine(15, &(pre->g_pre_comp[1]), tmp_felems);
+ done:
if (!EC_EX_DATA_set_data(&group->extra_data, pre, nistp521_pre_comp_dup,
nistp521_pre_comp_free,
nistp521_pre_comp_clear_free))
diff --git a/crypto/ec/ectest.c b/crypto/ec/ectest.c
index efab0b07b1d2..40a1f003259f 100644
--- a/crypto/ec/ectest.c
+++ b/crypto/ec/ectest.c
@@ -1758,9 +1758,18 @@ static void nistp_single_test(const struct nistp_test_params *test)
if (0 != EC_POINT_cmp(NISTP, Q, Q_CHECK, ctx))
ABORT;
+ /*
+ * We have not performed precomputation so have_precompute mult should be
+ * false
+ */
+ if (EC_GROUP_have_precompute_mult(NISTP))
+ ABORT;
+
/* now repeat all tests with precomputation */
if (!EC_GROUP_precompute_mult(NISTP, ctx))
ABORT;
+ if (!EC_GROUP_have_precompute_mult(NISTP))
+ ABORT;
/* fixed point multiplication */
EC_POINT_mul(NISTP, Q, m, NULL, NULL, ctx);
diff --git a/crypto/engine/eng_dyn.c b/crypto/engine/eng_dyn.c
index 3169b09ad865..40f30e9d585e 100644
--- a/crypto/engine/eng_dyn.c
+++ b/crypto/engine/eng_dyn.c
@@ -243,8 +243,10 @@ static int dynamic_set_data_ctx(ENGINE *e, dynamic_data_ctx **ctx)
* If we lost the race to set the context, c is non-NULL and *ctx is the
* context of the thread that won.
*/
- if (c)
+ if (c) {
+ sk_OPENSSL_STRING_free(c->dirs);
OPENSSL_free(c);
+ }
return 1;
}
diff --git a/crypto/evp/e_des.c b/crypto/evp/e_des.c
index aae13a675694..8ca65cd03ae1 100644
--- a/crypto/evp/e_des.c
+++ b/crypto/evp/e_des.c
@@ -71,12 +71,13 @@ typedef struct {
DES_key_schedule ks;
} ks;
union {
- void (*cbc) (const void *, void *, size_t, const void *, void *);
+ void (*cbc) (const void *, void *, size_t,
+ const DES_key_schedule *, unsigned char *);
} stream;
} EVP_DES_KEY;
# if defined(AES_ASM) && (defined(__sparc) || defined(__sparc__))
-/* ---------^^^ this is not a typo, just a way to detect that
+/* ----------^^^ this is not a typo, just a way to detect that
* assembler support was in general requested... */
# include "sparc_arch.h"
@@ -86,9 +87,9 @@ extern unsigned int OPENSSL_sparcv9cap_P[];
void des_t4_key_expand(const void *key, DES_key_schedule *ks);
void des_t4_cbc_encrypt(const void *inp, void *out, size_t len,
- DES_key_schedule *ks, unsigned char iv[8]);
+ const DES_key_schedule *ks, unsigned char iv[8]);
void des_t4_cbc_decrypt(const void *inp, void *out, size_t len,
- DES_key_schedule *ks, unsigned char iv[8]);
+ const DES_key_schedule *ks, unsigned char iv[8]);
# endif
static int des_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,
@@ -130,7 +131,7 @@ static int des_cbc_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
{
EVP_DES_KEY *dat = (EVP_DES_KEY *) ctx->cipher_data;
- if (dat->stream.cbc) {
+ if (dat->stream.cbc != NULL) {
(*dat->stream.cbc) (in, out, inl, &dat->ks.ks, ctx->iv);
return 1;
}
diff --git a/crypto/evp/e_des3.c b/crypto/evp/e_des3.c
index bf6c1d2d3d39..0e910d6d8085 100644
--- a/crypto/evp/e_des3.c
+++ b/crypto/evp/e_des3.c
@@ -75,7 +75,8 @@ typedef struct {
DES_key_schedule ks[3];
} ks;
union {
- void (*cbc) (const void *, void *, size_t, const void *, void *);
+ void (*cbc) (const void *, void *, size_t,
+ const DES_key_schedule *, unsigned char *);
} stream;
} DES_EDE_KEY;
# define ks1 ks.ks[0]
@@ -93,9 +94,9 @@ extern unsigned int OPENSSL_sparcv9cap_P[];
void des_t4_key_expand(const void *key, DES_key_schedule *ks);
void des_t4_ede3_cbc_encrypt(const void *inp, void *out, size_t len,
- DES_key_schedule *ks, unsigned char iv[8]);
+ const DES_key_schedule ks[3], unsigned char iv[8]);
void des_t4_ede3_cbc_decrypt(const void *inp, void *out, size_t len,
- DES_key_schedule *ks, unsigned char iv[8]);
+ const DES_key_schedule ks[3], unsigned char iv[8]);
# endif
static int des_ede_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,
@@ -162,7 +163,7 @@ static int des_ede_cbc_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
}
# endif /* KSSL_DEBUG */
if (dat->stream.cbc) {
- (*dat->stream.cbc) (in, out, inl, &dat->ks, ctx->iv);
+ (*dat->stream.cbc) (in, out, inl, dat->ks.ks, ctx->iv);
return 1;
}
@@ -395,7 +396,7 @@ static int des_ede3_unwrap(EVP_CIPHER_CTX *ctx, unsigned char *out,
int rv = -1;
if (inl < 24)
return -1;
- if (!out)
+ if (out == NULL)
return inl - 16;
memcpy(ctx->iv, wrap_iv, 8);
/* Decrypt first block which will end up as icv */
@@ -438,7 +439,7 @@ static int des_ede3_wrap(EVP_CIPHER_CTX *ctx, unsigned char *out,
const unsigned char *in, size_t inl)
{
unsigned char sha1tmp[SHA_DIGEST_LENGTH];
- if (!out)
+ if (out == NULL)
return inl + 16;
/* Copy input to output buffer + 8 so we have space for IV */
memmove(out + 8, in, inl);
diff --git a/crypto/modes/asm/aesni-gcm-x86_64.pl b/crypto/modes/asm/aesni-gcm-x86_64.pl
index bd6bf72fe487..980cfd23efe3 100755
--- a/crypto/modes/asm/aesni-gcm-x86_64.pl
+++ b/crypto/modes/asm/aesni-gcm-x86_64.pl
@@ -43,7 +43,7 @@ die "can't locate x86_64-xlate.pl";
if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
=~ /GNU assembler version ([2-9]\.[0-9]+)/) {
- $avx = ($1>=2.19) + ($1>=2.22);
+ $avx = ($1>=2.20) + ($1>=2.22);
}
if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
@@ -489,7 +489,7 @@ $code.=<<___;
___
$code.=<<___ if ($win64);
movaps -0xd8(%rax),%xmm6
- movaps -0xd8(%rax),%xmm7
+ movaps -0xc8(%rax),%xmm7
movaps -0xb8(%rax),%xmm8
movaps -0xa8(%rax),%xmm9
movaps -0x98(%rax),%xmm10
diff --git a/crypto/modes/asm/ghash-x86_64.pl b/crypto/modes/asm/ghash-x86_64.pl
index 4ff2d39aa7b2..f889f2018789 100755
--- a/crypto/modes/asm/ghash-x86_64.pl
+++ b/crypto/modes/asm/ghash-x86_64.pl
@@ -92,7 +92,7 @@ die "can't locate x86_64-xlate.pl";
if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
=~ /GNU assembler version ([2-9]\.[0-9]+)/) {
- $avx = ($1>=2.19) + ($1>=2.22);
+ $avx = ($1>=2.20) + ($1>=2.22);
}
if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
diff --git a/crypto/modes/ctr128.c b/crypto/modes/ctr128.c
index f3bbcbf72376..bcafd6b6bfb1 100644
--- a/crypto/modes/ctr128.c
+++ b/crypto/modes/ctr128.c
@@ -67,23 +67,20 @@
/* increment counter (128-bit int) by 1 */
static void ctr128_inc(unsigned char *counter)
{
- u32 n = 16;
- u8 c;
+ u32 n = 16, c = 1;
do {
--n;
- c = counter[n];
- ++c;
- counter[n] = c;
- if (c)
- return;
+ c += counter[n];
+ counter[n] = (u8)c;
+ c >>= 8;
} while (n);
}
#if !defined(OPENSSL_SMALL_FOOTPRINT)
static void ctr128_inc_aligned(unsigned char *counter)
{
- size_t *data, c, n;
+ size_t *data, c, d, n;
const union {
long one;
char little;
@@ -91,20 +88,19 @@ static void ctr128_inc_aligned(unsigned char *counter)
1
};
- if (is_endian.little) {
+ if (is_endian.little || ((size_t)counter % sizeof(size_t)) != 0) {
ctr128_inc(counter);
return;
}
data = (size_t *)counter;
+ c = 1;
n = 16 / sizeof(size_t);
do {
--n;
- c = data[n];
- ++c;
- data[n] = c;
- if (c)
- return;
+ d = data[n] += c;
+ /* did addition carry? */
+ c = ((d - c) ^ d) >> (sizeof(size_t) * 8 - 1);
} while (n);
}
#endif
@@ -144,14 +140,14 @@ void CRYPTO_ctr128_encrypt(const unsigned char *in, unsigned char *out,
}
# if defined(STRICT_ALIGNMENT)
- if (((size_t)in | (size_t)out | (size_t)ivec) % sizeof(size_t) !=
- 0)
+ if (((size_t)in | (size_t)out | (size_t)ecount_buf)
+ % sizeof(size_t) != 0)
break;
# endif
while (len >= 16) {
(*block) (ivec, ecount_buf, key);
ctr128_inc_aligned(ivec);
- for (; n < 16; n += sizeof(size_t))
+ for (n = 0; n < 16; n += sizeof(size_t))
*(size_t *)(out + n) =
*(size_t *)(in + n) ^ *(size_t *)(ecount_buf + n);
len -= 16;
@@ -189,16 +185,13 @@ void CRYPTO_ctr128_encrypt(const unsigned char *in, unsigned char *out,
/* increment upper 96 bits of 128-bit counter by 1 */
static void ctr96_inc(unsigned char *counter)
{
- u32 n = 12;
- u8 c;
+ u32 n = 12, c = 1;
do {
--n;
- c = counter[n];
- ++c;
- counter[n] = c;
- if (c)
- return;
+ c += counter[n];
+ counter[n] = (u8)c;
+ c >>= 8;
} while (n);
}
diff --git a/crypto/opensslconf.h b/crypto/opensslconf.h
index b4d522e68505..f533508b152c 100644
--- a/crypto/opensslconf.h
+++ b/crypto/opensslconf.h
@@ -38,12 +38,18 @@ extern "C" {
#ifndef OPENSSL_NO_SSL_TRACE
# define OPENSSL_NO_SSL_TRACE
#endif
+#ifndef OPENSSL_NO_SSL2
+# define OPENSSL_NO_SSL2
+#endif
#ifndef OPENSSL_NO_STORE
# define OPENSSL_NO_STORE
#endif
#ifndef OPENSSL_NO_UNIT_TEST
# define OPENSSL_NO_UNIT_TEST
#endif
+#ifndef OPENSSL_NO_WEAK_SSL_CIPHERS
+# define OPENSSL_NO_WEAK_SSL_CIPHERS
+#endif
#endif /* OPENSSL_DOING_MAKEDEPEND */
@@ -86,12 +92,18 @@ extern "C" {
# if defined(OPENSSL_NO_SSL_TRACE) && !defined(NO_SSL_TRACE)
# define NO_SSL_TRACE
# endif
+# if defined(OPENSSL_NO_SSL2) && !defined(NO_SSL2)
+# define NO_SSL2
+# endif
# if defined(OPENSSL_NO_STORE) && !defined(NO_STORE)
# define NO_STORE
# endif
# if defined(OPENSSL_NO_UNIT_TEST) && !defined(NO_UNIT_TEST)
# define NO_UNIT_TEST
# endif
+# if defined(OPENSSL_NO_WEAK_SSL_CIPHERS) && !defined(NO_WEAK_SSL_CIPHERS)
+# define NO_WEAK_SSL_CIPHERS
+# endif
#endif
/* crypto/opensslconf.h.in */
diff --git a/crypto/opensslv.h b/crypto/opensslv.h
index 03b8c4843784..4334fd15cd87 100644
--- a/crypto/opensslv.h
+++ b/crypto/opensslv.h
@@ -30,11 +30,11 @@ extern "C" {
* (Prior to 0.9.5a beta1, a different scheme was used: MMNNFFRBB for
* major minor fix final patch/beta)
*/
-# define OPENSSL_VERSION_NUMBER 0x1000206fL
+# define OPENSSL_VERSION_NUMBER 0x1000207fL
# ifdef OPENSSL_FIPS
-# define OPENSSL_VERSION_TEXT "OpenSSL 1.0.2f-fips 28 Jan 2016"
+# define OPENSSL_VERSION_TEXT "OpenSSL 1.0.2g-fips 1 Mar 2016"
# else
-# define OPENSSL_VERSION_TEXT "OpenSSL 1.0.2f 28 Jan 2016"
+# define OPENSSL_VERSION_TEXT "OpenSSL 1.0.2g 1 Mar 2016"
# endif
# define OPENSSL_VERSION_PTEXT " part of " OPENSSL_VERSION_TEXT
diff --git a/crypto/perlasm/x86_64-xlate.pl b/crypto/perlasm/x86_64-xlate.pl
index 9c70b8c2c6e9..ee04221c7e1d 100755
--- a/crypto/perlasm/x86_64-xlate.pl
+++ b/crypto/perlasm/x86_64-xlate.pl
@@ -198,8 +198,11 @@ my %globals;
if ($gas) {
# Solaris /usr/ccs/bin/as can't handle multiplications
# in $self->{value}
- $self->{value} =~ s/(?<![\w\$\.])(0x?[0-9a-f]+)/oct($1)/egi;
- $self->{value} =~ s/([0-9]+\s*[\*\/\%]\s*[0-9]+)/eval($1)/eg;
+ my $value = $self->{value};
+ $value =~ s/(?<![\w\$\.])(0x?[0-9a-f]+)/oct($1)/egi;
+ if ($value =~ s/([0-9]+\s*[\*\/\%]\s*[0-9]+)/eval($1)/eg) {
+ $self->{value} = $value;
+ }
sprintf "\$%s",$self->{value};
} else {
$self->{value} =~ s/(0b[0-1]+)/oct($1)/eig;
diff --git a/crypto/pkcs7/pk7_smime.c b/crypto/pkcs7/pk7_smime.c
index c4d3724d2a48..dc9b484078af 100644
--- a/crypto/pkcs7/pk7_smime.c
+++ b/crypto/pkcs7/pk7_smime.c
@@ -274,12 +274,29 @@ int PKCS7_verify(PKCS7 *p7, STACK_OF(X509) *certs, X509_STORE *store,
PKCS7err(PKCS7_F_PKCS7_VERIFY, PKCS7_R_NO_CONTENT);
return 0;
}
+#if 0
+ /*
+ * NB: this test commented out because some versions of Netscape
+ * illegally include zero length content when signing data. Also
+ * Microsoft Authenticode includes a SpcIndirectDataContent data
+ * structure which describes the content to be protected by the
+ * signature, rather than directly embedding that content. So
+ * Authenticode implementations are also expected to use
+ * PKCS7_verify() with explicit external data, on non-detached
+ * PKCS#7 signatures.
+ *
+ * In OpenSSL 1.1 a new flag PKCS7_NO_DUAL_CONTENT has been
+ * introduced to disable this sanity check. For the 1.0.2 branch
+ * this change is not acceptable, so the check remains completely
+ * commented out (as it has been for a long time).
+ */
/* Check for data and content: two sets of data */
if (!PKCS7_get_detached(p7) && indata) {
PKCS7err(PKCS7_F_PKCS7_VERIFY, PKCS7_R_CONTENT_AND_DATA_PRESENT);
return 0;
}
+#endif
sinfos = PKCS7_get_signer_info(p7);
diff --git a/crypto/rsa/rsa_sign.c b/crypto/rsa/rsa_sign.c
index ed63a1d8b0e3..82ca8324dfbc 100644
--- a/crypto/rsa/rsa_sign.c
+++ b/crypto/rsa/rsa_sign.c
@@ -84,7 +84,7 @@ int RSA_sign(int type, const unsigned char *m, unsigned int m_len,
return 0;
}
#endif
- if (rsa->meth->rsa_sign) {
+ if ((rsa->flags & RSA_FLAG_SIGN_VER) && rsa->meth->rsa_sign) {
return rsa->meth->rsa_sign(type, m, m_len, sigret, siglen, rsa);
}
/* Special case: SSL signature, just check the length */
@@ -293,7 +293,7 @@ int RSA_verify(int dtype, const unsigned char *m, unsigned int m_len,
const unsigned char *sigbuf, unsigned int siglen, RSA *rsa)
{
- if (rsa->meth->rsa_verify) {
+ if ((rsa->flags & RSA_FLAG_SIGN_VER) && rsa->meth->rsa_verify) {
return rsa->meth->rsa_verify(dtype, m, m_len, sigbuf, siglen, rsa);
}
diff --git a/crypto/srp/srp.h b/crypto/srp/srp.h
index d072536fec9b..028892a1ff5e 100644
--- a/crypto/srp/srp.h
+++ b/crypto/srp/srp.h
@@ -82,16 +82,21 @@ typedef struct SRP_gN_cache_st {
DECLARE_STACK_OF(SRP_gN_cache)
typedef struct SRP_user_pwd_st {
+ /* Owned by us. */
char *id;
BIGNUM *s;
BIGNUM *v;
+ /* Not owned by us. */
const BIGNUM *g;
const BIGNUM *N;
+ /* Owned by us. */
char *info;
} SRP_user_pwd;
DECLARE_STACK_OF(SRP_user_pwd)
+void SRP_user_pwd_free(SRP_user_pwd *user_pwd);
+
typedef struct SRP_VBASE_st {
STACK_OF(SRP_user_pwd) *users_pwd;
STACK_OF(SRP_gN_cache) *gN_cache;
@@ -115,7 +120,12 @@ DECLARE_STACK_OF(SRP_gN)
SRP_VBASE *SRP_VBASE_new(char *seed_key);
int SRP_VBASE_free(SRP_VBASE *vb);
int SRP_VBASE_init(SRP_VBASE *vb, char *verifier_file);
+
+/* This method ignores the configured seed and fails for an unknown user. */
SRP_user_pwd *SRP_VBASE_get_by_user(SRP_VBASE *vb, char *username);
+/* NOTE: unlike in SRP_VBASE_get_by_user, caller owns the returned pointer.*/
+SRP_user_pwd *SRP_VBASE_get1_by_user(SRP_VBASE *vb, char *username);
+
char *SRP_create_verifier(const char *user, const char *pass, char **salt,
char **verifier, const char *N, const char *g);
int SRP_create_verifier_BN(const char *user, const char *pass, BIGNUM **salt,
diff --git a/crypto/srp/srp_vfy.c b/crypto/srp/srp_vfy.c
index a3f1a8a0a4d5..26ad3e07b4bb 100644
--- a/crypto/srp/srp_vfy.c
+++ b/crypto/srp/srp_vfy.c
@@ -185,7 +185,7 @@ static char *t_tob64(char *dst, const unsigned char *src, int size)
return olddst;
}
-static void SRP_user_pwd_free(SRP_user_pwd *user_pwd)
+void SRP_user_pwd_free(SRP_user_pwd *user_pwd)
{
if (user_pwd == NULL)
return;
@@ -247,6 +247,24 @@ static int SRP_user_pwd_set_sv_BN(SRP_user_pwd *vinfo, BIGNUM *s, BIGNUM *v)
return (vinfo->s != NULL && vinfo->v != NULL);
}
+static SRP_user_pwd *srp_user_pwd_dup(SRP_user_pwd *src)
+{
+ SRP_user_pwd *ret;
+
+ if (src == NULL)
+ return NULL;
+ if ((ret = SRP_user_pwd_new()) == NULL)
+ return NULL;
+
+ SRP_user_pwd_set_gN(ret, src->g, src->N);
+ if (!SRP_user_pwd_set_ids(ret, src->id, src->info)
+ || !SRP_user_pwd_set_sv_BN(ret, BN_dup(src->s), BN_dup(src->v))) {
+ SRP_user_pwd_free(ret);
+ return NULL;
+ }
+ return ret;
+}
+
SRP_VBASE *SRP_VBASE_new(char *seed_key)
{
SRP_VBASE *vb = (SRP_VBASE *)OPENSSL_malloc(sizeof(SRP_VBASE));
@@ -468,21 +486,50 @@ int SRP_VBASE_init(SRP_VBASE *vb, char *verifier_file)
}
-SRP_user_pwd *SRP_VBASE_get_by_user(SRP_VBASE *vb, char *username)
+static SRP_user_pwd *find_user(SRP_VBASE *vb, char *username)
{
int i;
SRP_user_pwd *user;
- unsigned char digv[SHA_DIGEST_LENGTH];
- unsigned char digs[SHA_DIGEST_LENGTH];
- EVP_MD_CTX ctxt;
if (vb == NULL)
return NULL;
+
for (i = 0; i < sk_SRP_user_pwd_num(vb->users_pwd); i++) {
user = sk_SRP_user_pwd_value(vb->users_pwd, i);
if (strcmp(user->id, username) == 0)
return user;
}
+
+ return NULL;
+}
+
+/*
+ * This method ignores the configured seed and fails for an unknown user.
+ * Ownership of the returned pointer is not released to the caller.
+ * In other words, caller must not free the result.
+ */
+SRP_user_pwd *SRP_VBASE_get_by_user(SRP_VBASE *vb, char *username)
+{
+ return find_user(vb, username);
+}
+
+/*
+ * Ownership of the returned pointer is released to the caller.
+ * In other words, caller must free the result once done.
+ */
+SRP_user_pwd *SRP_VBASE_get1_by_user(SRP_VBASE *vb, char *username)
+{
+ SRP_user_pwd *user;
+ unsigned char digv[SHA_DIGEST_LENGTH];
+ unsigned char digs[SHA_DIGEST_LENGTH];
+ EVP_MD_CTX ctxt;
+
+ if (vb == NULL)
+ return NULL;
+
+ if ((user = find_user(vb, username)) != NULL)
+ return srp_user_pwd_dup(user);
+
if ((vb->seed_key == NULL) ||
(vb->default_g == NULL) || (vb->default_N == NULL))
return NULL;
diff --git a/crypto/stack/stack.c b/crypto/stack/stack.c
index de437acf6a5c..fa50083e22b3 100644
--- a/crypto/stack/stack.c
+++ b/crypto/stack/stack.c
@@ -360,7 +360,7 @@ void *sk_set(_STACK *st, int i, void *value)
void sk_sort(_STACK *st)
{
- if (st && !st->sorted) {
+ if (st && !st->sorted && st->comp != NULL) {
int (*comp_func) (const void *, const void *);
/*
diff --git a/crypto/x509/x509_vfy.c b/crypto/x509/x509_vfy.c
index 0429767032fd..4d34dbac9314 100644
--- a/crypto/x509/x509_vfy.c
+++ b/crypto/x509/x509_vfy.c
@@ -194,6 +194,9 @@ int X509_verify_cert(X509_STORE_CTX *ctx)
int num, j, retry;
int (*cb) (int xok, X509_STORE_CTX *xctx);
STACK_OF(X509) *sktmp = NULL;
+ int trust = X509_TRUST_UNTRUSTED;
+ int err;
+
if (ctx->cert == NULL) {
X509err(X509_F_X509_VERIFY_CERT, X509_R_NO_CERT_SET_FOR_US_TO_VERIFY);
return -1;
@@ -216,7 +219,8 @@ int X509_verify_cert(X509_STORE_CTX *ctx)
if (((ctx->chain = sk_X509_new_null()) == NULL) ||
(!sk_X509_push(ctx->chain, ctx->cert))) {
X509err(X509_F_X509_VERIFY_CERT, ERR_R_MALLOC_FAILURE);
- goto end;
+ ok = -1;
+ goto err;
}
CRYPTO_add(&ctx->cert->references, 1, CRYPTO_LOCK_X509);
ctx->last_untrusted = 1;
@@ -225,7 +229,8 @@ int X509_verify_cert(X509_STORE_CTX *ctx)
if (ctx->untrusted != NULL
&& (sktmp = sk_X509_dup(ctx->untrusted)) == NULL) {
X509err(X509_F_X509_VERIFY_CERT, ERR_R_MALLOC_FAILURE);
- goto end;
+ ok = -1;
+ goto err;
}
num = sk_X509_num(ctx->chain);
@@ -249,7 +254,7 @@ int X509_verify_cert(X509_STORE_CTX *ctx)
if (ctx->param->flags & X509_V_FLAG_TRUSTED_FIRST) {
ok = ctx->get_issuer(&xtmp, ctx, x);
if (ok < 0)
- goto end;
+ goto err;
/*
* If successful for now free up cert so it will be picked up
* again later.
@@ -266,7 +271,8 @@ int X509_verify_cert(X509_STORE_CTX *ctx)
if (xtmp != NULL) {
if (!sk_X509_push(ctx->chain, xtmp)) {
X509err(X509_F_X509_VERIFY_CERT, ERR_R_MALLOC_FAILURE);
- goto end;
+ ok = -1;
+ goto err;
}
CRYPTO_add(&xtmp->references, 1, CRYPTO_LOCK_X509);
(void)sk_X509_delete_ptr(sktmp, xtmp);
@@ -314,7 +320,7 @@ int X509_verify_cert(X509_STORE_CTX *ctx)
bad_chain = 1;
ok = cb(0, ctx);
if (!ok)
- goto end;
+ goto err;
} else {
/*
* We have a match: replace certificate with store
@@ -347,25 +353,26 @@ int X509_verify_cert(X509_STORE_CTX *ctx)
ok = ctx->get_issuer(&xtmp, ctx, x);
if (ok < 0)
- goto end;
+ goto err;
if (ok == 0)
break;
x = xtmp;
if (!sk_X509_push(ctx->chain, x)) {
X509_free(xtmp);
X509err(X509_F_X509_VERIFY_CERT, ERR_R_MALLOC_FAILURE);
- ok = 0;
- goto end;
+ ok = -1;
+ goto err;
}
num++;
}
/* we now have our chain, lets check it... */
- i = check_trust(ctx);
+ if ((trust = check_trust(ctx)) == X509_TRUST_REJECTED) {
+ /* Callback already issued */
+ ok = 0;
+ goto err;
+ }
- /* If explicitly rejected error */
- if (i == X509_TRUST_REJECTED)
- goto end;
/*
* If it's not explicitly trusted then check if there is an alternative
* chain that could be used. We only do this if we haven't already
@@ -373,14 +380,14 @@ int X509_verify_cert(X509_STORE_CTX *ctx)
* chain checking
*/
retry = 0;
- if (i != X509_TRUST_TRUSTED
+ if (trust != X509_TRUST_TRUSTED
&& !(ctx->param->flags & X509_V_FLAG_TRUSTED_FIRST)
&& !(ctx->param->flags & X509_V_FLAG_NO_ALT_CHAINS)) {
while (j-- > 1) {
xtmp2 = sk_X509_value(ctx->chain, j - 1);
ok = ctx->get_issuer(&xtmp, ctx, xtmp2);
if (ok < 0)
- goto end;
+ goto err;
/* Check if we found an alternate chain */
if (ok > 0) {
/*
@@ -410,7 +417,7 @@ int X509_verify_cert(X509_STORE_CTX *ctx)
* self signed certificate in which case we've indicated an error already
* and set bad_chain == 1
*/
- if (i != X509_TRUST_TRUSTED && !bad_chain) {
+ if (trust != X509_TRUST_TRUSTED && !bad_chain) {
if ((chain_ss == NULL) || !ctx->check_issued(ctx, x, chain_ss)) {
if (ctx->last_untrusted >= num)
ctx->error = X509_V_ERR_UNABLE_TO_GET_ISSUER_CERT_LOCALLY;
@@ -431,26 +438,26 @@ int X509_verify_cert(X509_STORE_CTX *ctx)
bad_chain = 1;
ok = cb(0, ctx);
if (!ok)
- goto end;
+ goto err;
}
/* We have the chain complete: now we need to check its purpose */
ok = check_chain_extensions(ctx);
if (!ok)
- goto end;
+ goto err;
/* Check name constraints */
ok = check_name_constraints(ctx);
if (!ok)
- goto end;
+ goto err;
ok = check_id(ctx);
if (!ok)
- goto end;
+ goto err;
/* We may as well copy down any DSA parameters that are required */
X509_get_pubkey_parameters(NULL, ctx->chain);
@@ -462,16 +469,16 @@ int X509_verify_cert(X509_STORE_CTX *ctx)
ok = ctx->check_revocation(ctx);
if (!ok)
- goto end;
+ goto err;
- i = X509_chain_check_suiteb(&ctx->error_depth, NULL, ctx->chain,
- ctx->param->flags);
- if (i != X509_V_OK) {
- ctx->error = i;
+ err = X509_chain_check_suiteb(&ctx->error_depth, NULL, ctx->chain,
+ ctx->param->flags);
+ if (err != X509_V_OK) {
+ ctx->error = err;
ctx->current_cert = sk_X509_value(ctx->chain, ctx->error_depth);
ok = cb(0, ctx);
if (!ok)
- goto end;
+ goto err;
}
/* At this point, we have a chain and need to verify it */
@@ -480,25 +487,28 @@ int X509_verify_cert(X509_STORE_CTX *ctx)
else
ok = internal_verify(ctx);
if (!ok)
- goto end;
+ goto err;
#ifndef OPENSSL_NO_RFC3779
/* RFC 3779 path validation, now that CRL check has been done */
ok = v3_asid_validate_path(ctx);
if (!ok)
- goto end;
+ goto err;
ok = v3_addr_validate_path(ctx);
if (!ok)
- goto end;
+ goto err;
#endif
/* If we get this far evaluate policies */
if (!bad_chain && (ctx->param->flags & X509_V_FLAG_POLICY_CHECK))
ok = ctx->check_policy(ctx);
if (!ok)
- goto end;
+ goto err;
if (0) {
- end:
+ err:
+ /* Ensure we return an error */
+ if (ok > 0)
+ ok = 0;
X509_get_pubkey_parameters(NULL, ctx->chain);
}
if (sktmp != NULL)