| author | Jung-uk Kim <jkim@FreeBSD.org> | 2018-09-13 19:18:07 +0000 |
|---|---|---|
| committer | Jung-uk Kim <jkim@FreeBSD.org> | 2018-09-13 19:18:07 +0000 |
| commit | a43ce912fc025d11e1395506111f75fc194d7ba5 (patch) | |
| tree | 9794cf7720d75938ed0ea4f499c0dcd4b6eacdda /crypto/ec | |
| parent | 02be298e504b8554caca6dc85af450e1ea44d19d (diff) | |
| download | src-a43ce912fc025d11e1395506111f75fc194d7ba5.tar.gz, src-a43ce912fc025d11e1395506111f75fc194d7ba5.zip | |
Import OpenSSL 1.1.1. (vendor/openssl/1.1.1)
Notes:
svn path=/vendor-crypto/openssl/dist/; revision=338658
svn path=/vendor-crypto/openssl/1.1.1/; revision=338659; tag=vendor/openssl/1.1.1
Diffstat (limited to 'crypto/ec')
62 files changed, 29681 insertions, 8596 deletions
diff --git a/crypto/ec/Makefile b/crypto/ec/Makefile deleted file mode 100644 index 6628390ba48e..000000000000 --- a/crypto/ec/Makefile +++ /dev/null @@ -1,274 +0,0 @@ -# -# crypto/ec/Makefile -# - -DIR= ec -TOP= ../.. -CC= cc -INCLUDES= -I.. -I$(TOP) -I../../include -CFLAG=-g -MAKEFILE= Makefile -AR= ar r - -CFLAGS= $(INCLUDES) $(CFLAG) -ASFLAGS= $(INCLUDES) $(ASFLAG) -AFLAGS= $(ASFLAGS) - -GENERAL=Makefile -TEST=ectest.c -APPS= - -LIB=$(TOP)/libcrypto.a -LIBSRC= ec_lib.c ecp_smpl.c ecp_mont.c ecp_nist.c ec_cvt.c ec_mult.c\ - ec_err.c ec_curve.c ec_check.c ec_print.c ec_asn1.c ec_key.c\ - ec2_smpl.c ec2_mult.c ec_ameth.c ec_pmeth.c eck_prn.c \ - ecp_nistp224.c ecp_nistp256.c ecp_nistp521.c ecp_nistputil.c \ - ecp_oct.c ec2_oct.c ec_oct.c - -LIBOBJ= ec_lib.o ecp_smpl.o ecp_mont.o ecp_nist.o ec_cvt.o ec_mult.o\ - ec_err.o ec_curve.o ec_check.o ec_print.o ec_asn1.o ec_key.o\ - ec2_smpl.o ec2_mult.o ec_ameth.o ec_pmeth.o eck_prn.o \ - ecp_nistp224.o ecp_nistp256.o ecp_nistp521.o ecp_nistputil.o \ - ecp_oct.o ec2_oct.o ec_oct.o $(EC_ASM) - -SRC= $(LIBSRC) - -EXHEADER= ec.h -HEADER= ec_lcl.h $(EXHEADER) - -ALL= $(GENERAL) $(SRC) $(HEADER) - -top: - (cd ../..; $(MAKE) DIRS=crypto SDIRS=$(DIR) sub_all) - -all: lib - -lib: $(LIBOBJ) - $(AR) $(LIB) $(LIBOBJ) - $(RANLIB) $(LIB) || echo Never mind. - @touch lib - -ecp_nistz256-x86_64.s: asm/ecp_nistz256-x86_64.pl - $(PERL) asm/ecp_nistz256-x86_64.pl $(PERLASM_SCHEME) > $@ - -ecp_nistz256-avx2.s: asm/ecp_nistz256-avx2.pl - $(PERL) asm/ecp_nistz256-avx2.pl $(PERLASM_SCHEME) > $@ - -files: - $(PERL) $(TOP)/util/files.pl Makefile >> $(TOP)/MINFO - -links: - @$(PERL) $(TOP)/util/mklink.pl ../../include/openssl $(EXHEADER) - @$(PERL) $(TOP)/util/mklink.pl ../../test $(TEST) - @$(PERL) $(TOP)/util/mklink.pl ../../apps $(APPS) - -install: - @[ -n "$(INSTALLTOP)" ] # should be set by top Makefile... - @headerlist="$(EXHEADER)"; for i in $$headerlist ; \ - do \ - (cp $$i $(INSTALL_PREFIX)$(INSTALLTOP)/include/openssl/$$i; \ - chmod 644 $(INSTALL_PREFIX)$(INSTALLTOP)/include/openssl/$$i ); \ - done; - -tags: - ctags $(SRC) - -tests: - -lint: - lint -DLINT $(INCLUDES) $(SRC)>fluff - -update: depend - -depend: - @[ -n "$(MAKEDEPEND)" ] # should be set by upper Makefile... - $(MAKEDEPEND) -- $(CFLAG) $(INCLUDES) $(DEPFLAG) -- $(PROGS) $(LIBSRC) - -dclean: - $(PERL) -pe 'if (/^# DO NOT DELETE THIS LINE/) {print; exit(0);}' $(MAKEFILE) >Makefile.new - mv -f Makefile.new $(MAKEFILE) - -clean: - rm -f *.s *.o */*.o *.obj lib tags core .pure .nfs* *.old *.bak fluff - -# DO NOT DELETE THIS LINE -- make depend depends on it. 
- -ec2_mult.o: ../../include/openssl/asn1.h ../../include/openssl/bio.h -ec2_mult.o: ../../include/openssl/bn.h ../../include/openssl/crypto.h -ec2_mult.o: ../../include/openssl/e_os2.h ../../include/openssl/ec.h -ec2_mult.o: ../../include/openssl/err.h ../../include/openssl/lhash.h -ec2_mult.o: ../../include/openssl/obj_mac.h ../../include/openssl/opensslconf.h -ec2_mult.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h -ec2_mult.o: ../../include/openssl/safestack.h ../../include/openssl/stack.h -ec2_mult.o: ../../include/openssl/symhacks.h ec2_mult.c ec_lcl.h -ec2_oct.o: ../../include/openssl/asn1.h ../../include/openssl/bio.h -ec2_oct.o: ../../include/openssl/bn.h ../../include/openssl/crypto.h -ec2_oct.o: ../../include/openssl/e_os2.h ../../include/openssl/ec.h -ec2_oct.o: ../../include/openssl/err.h ../../include/openssl/lhash.h -ec2_oct.o: ../../include/openssl/obj_mac.h ../../include/openssl/opensslconf.h -ec2_oct.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h -ec2_oct.o: ../../include/openssl/safestack.h ../../include/openssl/stack.h -ec2_oct.o: ../../include/openssl/symhacks.h ec2_oct.c ec_lcl.h -ec2_smpl.o: ../../include/openssl/asn1.h ../../include/openssl/bio.h -ec2_smpl.o: ../../include/openssl/bn.h ../../include/openssl/crypto.h -ec2_smpl.o: ../../include/openssl/e_os2.h ../../include/openssl/ec.h -ec2_smpl.o: ../../include/openssl/err.h ../../include/openssl/lhash.h -ec2_smpl.o: ../../include/openssl/obj_mac.h ../../include/openssl/opensslconf.h -ec2_smpl.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h -ec2_smpl.o: ../../include/openssl/safestack.h ../../include/openssl/stack.h -ec2_smpl.o: ../../include/openssl/symhacks.h ec2_smpl.c ec_lcl.h -ec_ameth.o: ../../e_os.h ../../include/openssl/asn1.h -ec_ameth.o: ../../include/openssl/asn1t.h ../../include/openssl/bio.h -ec_ameth.o: ../../include/openssl/bn.h ../../include/openssl/buffer.h -ec_ameth.o: ../../include/openssl/cms.h ../../include/openssl/crypto.h -ec_ameth.o: ../../include/openssl/e_os2.h ../../include/openssl/ec.h -ec_ameth.o: ../../include/openssl/ecdh.h ../../include/openssl/ecdsa.h -ec_ameth.o: ../../include/openssl/err.h ../../include/openssl/evp.h -ec_ameth.o: ../../include/openssl/lhash.h ../../include/openssl/obj_mac.h -ec_ameth.o: ../../include/openssl/objects.h ../../include/openssl/opensslconf.h -ec_ameth.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h -ec_ameth.o: ../../include/openssl/pkcs7.h ../../include/openssl/safestack.h -ec_ameth.o: ../../include/openssl/sha.h ../../include/openssl/stack.h -ec_ameth.o: ../../include/openssl/symhacks.h ../../include/openssl/x509.h -ec_ameth.o: ../../include/openssl/x509_vfy.h ../asn1/asn1_locl.h ../cryptlib.h -ec_ameth.o: ec_ameth.c ec_lcl.h -ec_asn1.o: ../../include/openssl/asn1.h ../../include/openssl/asn1t.h -ec_asn1.o: ../../include/openssl/bio.h ../../include/openssl/bn.h -ec_asn1.o: ../../include/openssl/crypto.h ../../include/openssl/e_os2.h -ec_asn1.o: ../../include/openssl/ec.h ../../include/openssl/err.h -ec_asn1.o: ../../include/openssl/lhash.h ../../include/openssl/obj_mac.h -ec_asn1.o: ../../include/openssl/objects.h ../../include/openssl/opensslconf.h -ec_asn1.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h -ec_asn1.o: ../../include/openssl/safestack.h ../../include/openssl/stack.h -ec_asn1.o: ../../include/openssl/symhacks.h ec_asn1.c ec_lcl.h -ec_check.o: ../../include/openssl/asn1.h ../../include/openssl/bio.h -ec_check.o: 
../../include/openssl/bn.h ../../include/openssl/crypto.h -ec_check.o: ../../include/openssl/e_os2.h ../../include/openssl/ec.h -ec_check.o: ../../include/openssl/err.h ../../include/openssl/lhash.h -ec_check.o: ../../include/openssl/obj_mac.h ../../include/openssl/opensslconf.h -ec_check.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h -ec_check.o: ../../include/openssl/safestack.h ../../include/openssl/stack.h -ec_check.o: ../../include/openssl/symhacks.h ec_check.c ec_lcl.h -ec_curve.o: ../../include/openssl/asn1.h ../../include/openssl/bio.h -ec_curve.o: ../../include/openssl/bn.h ../../include/openssl/crypto.h -ec_curve.o: ../../include/openssl/e_os2.h ../../include/openssl/ec.h -ec_curve.o: ../../include/openssl/err.h ../../include/openssl/lhash.h -ec_curve.o: ../../include/openssl/obj_mac.h ../../include/openssl/opensslconf.h -ec_curve.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h -ec_curve.o: ../../include/openssl/safestack.h ../../include/openssl/stack.h -ec_curve.o: ../../include/openssl/symhacks.h ec_curve.c ec_lcl.h -ec_cvt.o: ../../include/openssl/asn1.h ../../include/openssl/bio.h -ec_cvt.o: ../../include/openssl/bn.h ../../include/openssl/crypto.h -ec_cvt.o: ../../include/openssl/e_os2.h ../../include/openssl/ec.h -ec_cvt.o: ../../include/openssl/err.h ../../include/openssl/lhash.h -ec_cvt.o: ../../include/openssl/obj_mac.h ../../include/openssl/opensslconf.h -ec_cvt.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h -ec_cvt.o: ../../include/openssl/safestack.h ../../include/openssl/stack.h -ec_cvt.o: ../../include/openssl/symhacks.h ec_cvt.c ec_lcl.h -ec_err.o: ../../include/openssl/asn1.h ../../include/openssl/bio.h -ec_err.o: ../../include/openssl/crypto.h ../../include/openssl/e_os2.h -ec_err.o: ../../include/openssl/ec.h ../../include/openssl/err.h -ec_err.o: ../../include/openssl/lhash.h ../../include/openssl/opensslconf.h -ec_err.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h -ec_err.o: ../../include/openssl/safestack.h ../../include/openssl/stack.h -ec_err.o: ../../include/openssl/symhacks.h ec_err.c -ec_key.o: ../../include/openssl/asn1.h ../../include/openssl/bio.h -ec_key.o: ../../include/openssl/bn.h ../../include/openssl/crypto.h -ec_key.o: ../../include/openssl/e_os2.h ../../include/openssl/ec.h -ec_key.o: ../../include/openssl/err.h ../../include/openssl/lhash.h -ec_key.o: ../../include/openssl/obj_mac.h ../../include/openssl/opensslconf.h -ec_key.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h -ec_key.o: ../../include/openssl/safestack.h ../../include/openssl/stack.h -ec_key.o: ../../include/openssl/symhacks.h ec_key.c ec_lcl.h -ec_lib.o: ../../include/openssl/asn1.h ../../include/openssl/bio.h -ec_lib.o: ../../include/openssl/bn.h ../../include/openssl/crypto.h -ec_lib.o: ../../include/openssl/e_os2.h ../../include/openssl/ec.h -ec_lib.o: ../../include/openssl/err.h ../../include/openssl/lhash.h -ec_lib.o: ../../include/openssl/obj_mac.h ../../include/openssl/opensslconf.h -ec_lib.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h -ec_lib.o: ../../include/openssl/safestack.h ../../include/openssl/stack.h -ec_lib.o: ../../include/openssl/symhacks.h ec_lcl.h ec_lib.c -ec_mult.o: ../../include/openssl/asn1.h ../../include/openssl/bio.h -ec_mult.o: ../../include/openssl/bn.h ../../include/openssl/crypto.h -ec_mult.o: ../../include/openssl/e_os2.h ../../include/openssl/ec.h -ec_mult.o: ../../include/openssl/err.h ../../include/openssl/lhash.h 
-ec_mult.o: ../../include/openssl/obj_mac.h ../../include/openssl/opensslconf.h -ec_mult.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h -ec_mult.o: ../../include/openssl/safestack.h ../../include/openssl/stack.h -ec_mult.o: ../../include/openssl/symhacks.h ec_lcl.h ec_mult.c -ec_oct.o: ../../include/openssl/asn1.h ../../include/openssl/bio.h -ec_oct.o: ../../include/openssl/bn.h ../../include/openssl/crypto.h -ec_oct.o: ../../include/openssl/e_os2.h ../../include/openssl/ec.h -ec_oct.o: ../../include/openssl/err.h ../../include/openssl/lhash.h -ec_oct.o: ../../include/openssl/obj_mac.h ../../include/openssl/opensslconf.h -ec_oct.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h -ec_oct.o: ../../include/openssl/safestack.h ../../include/openssl/stack.h -ec_oct.o: ../../include/openssl/symhacks.h ec_lcl.h ec_oct.c -ec_pmeth.o: ../../e_os.h ../../include/openssl/asn1.h -ec_pmeth.o: ../../include/openssl/asn1t.h ../../include/openssl/bio.h -ec_pmeth.o: ../../include/openssl/bn.h ../../include/openssl/buffer.h -ec_pmeth.o: ../../include/openssl/crypto.h ../../include/openssl/e_os2.h -ec_pmeth.o: ../../include/openssl/ec.h ../../include/openssl/ecdh.h -ec_pmeth.o: ../../include/openssl/ecdsa.h ../../include/openssl/err.h -ec_pmeth.o: ../../include/openssl/evp.h ../../include/openssl/lhash.h -ec_pmeth.o: ../../include/openssl/obj_mac.h ../../include/openssl/objects.h -ec_pmeth.o: ../../include/openssl/opensslconf.h -ec_pmeth.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h -ec_pmeth.o: ../../include/openssl/pkcs7.h ../../include/openssl/safestack.h -ec_pmeth.o: ../../include/openssl/sha.h ../../include/openssl/stack.h -ec_pmeth.o: ../../include/openssl/symhacks.h ../../include/openssl/x509.h -ec_pmeth.o: ../../include/openssl/x509_vfy.h ../cryptlib.h ../evp/evp_locl.h -ec_pmeth.o: ec_lcl.h ec_pmeth.c -ec_print.o: ../../include/openssl/asn1.h ../../include/openssl/bio.h -ec_print.o: ../../include/openssl/bn.h ../../include/openssl/crypto.h -ec_print.o: ../../include/openssl/e_os2.h ../../include/openssl/ec.h -ec_print.o: ../../include/openssl/obj_mac.h ../../include/openssl/opensslconf.h -ec_print.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h -ec_print.o: ../../include/openssl/safestack.h ../../include/openssl/stack.h -ec_print.o: ../../include/openssl/symhacks.h ec_lcl.h ec_print.c -eck_prn.o: ../../e_os.h ../../include/openssl/asn1.h -eck_prn.o: ../../include/openssl/bio.h ../../include/openssl/bn.h -eck_prn.o: ../../include/openssl/buffer.h ../../include/openssl/crypto.h -eck_prn.o: ../../include/openssl/e_os2.h ../../include/openssl/ec.h -eck_prn.o: ../../include/openssl/err.h ../../include/openssl/evp.h -eck_prn.o: ../../include/openssl/lhash.h ../../include/openssl/obj_mac.h -eck_prn.o: ../../include/openssl/objects.h ../../include/openssl/opensslconf.h -eck_prn.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h -eck_prn.o: ../../include/openssl/safestack.h ../../include/openssl/stack.h -eck_prn.o: ../../include/openssl/symhacks.h ../cryptlib.h eck_prn.c -ecp_mont.o: ../../include/openssl/asn1.h ../../include/openssl/bio.h -ecp_mont.o: ../../include/openssl/bn.h ../../include/openssl/crypto.h -ecp_mont.o: ../../include/openssl/e_os2.h ../../include/openssl/ec.h -ecp_mont.o: ../../include/openssl/err.h ../../include/openssl/lhash.h -ecp_mont.o: ../../include/openssl/obj_mac.h ../../include/openssl/opensslconf.h -ecp_mont.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h 
-ecp_mont.o: ../../include/openssl/safestack.h ../../include/openssl/stack.h -ecp_mont.o: ../../include/openssl/symhacks.h ec_lcl.h ecp_mont.c -ecp_nist.o: ../../include/openssl/asn1.h ../../include/openssl/bio.h -ecp_nist.o: ../../include/openssl/bn.h ../../include/openssl/crypto.h -ecp_nist.o: ../../include/openssl/e_os2.h ../../include/openssl/ec.h -ecp_nist.o: ../../include/openssl/err.h ../../include/openssl/lhash.h -ecp_nist.o: ../../include/openssl/obj_mac.h ../../include/openssl/opensslconf.h -ecp_nist.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h -ecp_nist.o: ../../include/openssl/safestack.h ../../include/openssl/stack.h -ecp_nist.o: ../../include/openssl/symhacks.h ec_lcl.h ecp_nist.c -ecp_nistp224.o: ../../include/openssl/opensslconf.h ecp_nistp224.c -ecp_nistp256.o: ../../include/openssl/opensslconf.h ecp_nistp256.c -ecp_nistp521.o: ../../include/openssl/opensslconf.h ecp_nistp521.c -ecp_nistputil.o: ../../include/openssl/opensslconf.h ecp_nistputil.c -ecp_oct.o: ../../include/openssl/asn1.h ../../include/openssl/bio.h -ecp_oct.o: ../../include/openssl/bn.h ../../include/openssl/crypto.h -ecp_oct.o: ../../include/openssl/e_os2.h ../../include/openssl/ec.h -ecp_oct.o: ../../include/openssl/err.h ../../include/openssl/lhash.h -ecp_oct.o: ../../include/openssl/obj_mac.h ../../include/openssl/opensslconf.h -ecp_oct.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h -ecp_oct.o: ../../include/openssl/safestack.h ../../include/openssl/stack.h -ecp_oct.o: ../../include/openssl/symhacks.h ec_lcl.h ecp_oct.c -ecp_smpl.o: ../../include/openssl/asn1.h ../../include/openssl/bio.h -ecp_smpl.o: ../../include/openssl/bn.h ../../include/openssl/crypto.h -ecp_smpl.o: ../../include/openssl/e_os2.h ../../include/openssl/ec.h -ecp_smpl.o: ../../include/openssl/err.h ../../include/openssl/lhash.h -ecp_smpl.o: ../../include/openssl/obj_mac.h ../../include/openssl/opensslconf.h -ecp_smpl.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h -ecp_smpl.o: ../../include/openssl/safestack.h ../../include/openssl/stack.h -ecp_smpl.o: ../../include/openssl/symhacks.h ec_lcl.h ecp_smpl.c diff --git a/crypto/ec/asm/ecp_nistz256-armv4.pl b/crypto/ec/asm/ecp_nistz256-armv4.pl new file mode 100755 index 000000000000..83abbdd89578 --- /dev/null +++ b/crypto/ec/asm/ecp_nistz256-armv4.pl @@ -0,0 +1,1865 @@ +#! /usr/bin/env perl +# Copyright 2015-2018 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + + +# ==================================================================== +# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== +# +# ECP_NISTZ256 module for ARMv4. +# +# October 2014. +# +# Original ECP_NISTZ256 submission targeting x86_64 is detailed in +# http://eprint.iacr.org/2013/816. In the process of adaptation +# original .c module was made 32-bit savvy in order to make this +# implementation possible. 
+# +# with/without -DECP_NISTZ256_ASM +# Cortex-A8 +53-170% +# Cortex-A9 +76-205% +# Cortex-A15 +100-316% +# Snapdragon S4 +66-187% +# +# Ranges denote minimum and maximum improvement coefficients depending +# on benchmark. Lower coefficients are for ECDSA sign, server-side +# operation. Keep in mind that +200% means 3x improvement. + +$flavour = shift; +if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; } +else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} } + +if ($flavour && $flavour ne "void") { + $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; + ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or + ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or + die "can't locate arm-xlate.pl"; + + open STDOUT,"| \"$^X\" $xlate $flavour $output"; +} else { + open STDOUT,">$output"; +} + +$code.=<<___; +#include "arm_arch.h" + +.text +#if defined(__thumb2__) +.syntax unified +.thumb +#else +.code 32 +#endif +___ +######################################################################## +# Convert ecp_nistz256_table.c to layout expected by ecp_nistz_gather_w7 +# +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +open TABLE,"<ecp_nistz256_table.c" or +open TABLE,"<${dir}../ecp_nistz256_table.c" or +die "failed to open ecp_nistz256_table.c:",$!; + +use integer; + +foreach(<TABLE>) { + s/TOBN\(\s*(0x[0-9a-f]+),\s*(0x[0-9a-f]+)\s*\)/push @arr,hex($2),hex($1)/geo; +} +close TABLE; + +# See ecp_nistz256_table.c for explanation for why it's 64*16*37. +# 64*16*37-1 is because $#arr returns last valid index or @arr, not +# amount of elements. +die "insane number of elements" if ($#arr != 64*16*37-1); + +$code.=<<___; +.globl ecp_nistz256_precomputed +.type ecp_nistz256_precomputed,%object +.align 12 +ecp_nistz256_precomputed: +___ +######################################################################## +# this conversion smashes P256_POINT_AFFINE by individual bytes with +# 64 byte interval, similar to +# 1111222233334444 +# 1234123412341234 +for(1..37) { + @tbl = splice(@arr,0,64*16); + for($i=0;$i<64;$i++) { + undef @line; + for($j=0;$j<64;$j++) { + push @line,(@tbl[$j*16+$i/4]>>(($i%4)*8))&0xff; + } + $code.=".byte\t"; + $code.=join(',',map { sprintf "0x%02x",$_} @line); + $code.="\n"; + } +} +$code.=<<___; +.size ecp_nistz256_precomputed,.-ecp_nistz256_precomputed +.align 5 +.LRR: @ 2^512 mod P precomputed for NIST P256 polynomial +.long 0x00000003, 0x00000000, 0xffffffff, 0xfffffffb +.long 0xfffffffe, 0xffffffff, 0xfffffffd, 0x00000004 +.Lone: +.long 1,0,0,0,0,0,0,0 +.asciz "ECP_NISTZ256 for ARMv4, CRYPTOGAMS by <appro\@openssl.org>" +.align 6 +___ + +######################################################################## +# common register layout, note that $t2 is link register, so that if +# internal subroutine uses $t2, then it has to offload lr... 
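The table conversion above stores byte k of each of the 64 entries in a block contiguously, so a single 64-byte P256_POINT_AFFINE entry ends up spread across its block at a fixed 64-byte stride; this is the layout the w7 gather defined further down walks. A minimal standalone Perl sketch of the same interleave, using toy entry contents (not part of the imported file):

```perl
#!/usr/bin/env perl
# Sketch of the byte-interleaved table layout: interleaved[64*k + j] holds
# byte k of entry j, so entry j is reassembled by reading offsets
# j, j+64, j+128, ... (one byte per 64-byte group), as the w7 gather does.
use strict;
use warnings;

# toy block of 64 entries, 64 bytes each (contents are arbitrary)
my @entries = map { my $j = $_; [ map { ($j * 7 + $_) & 0xff } 0 .. 63 ] } 0 .. 63;

# scatter: byte k of every entry goes into one contiguous 64-byte row
my @interleaved;
for my $j (0 .. 63) {
    for my $k (0 .. 63) {
        $interleaved[64 * $k + $j] = $entries[$j][$k];
    }
}

# gather entry j back with the fixed 64-byte stride
sub gather {
    my ($j) = @_;
    return [ map { $interleaved[$_ * 64 + $j] } 0 .. 63 ];
}

my $ok = 1;
for my $j (0 .. 63) {
    my $g = gather($j);
    for my $k (0 .. 63) {
        $ok = 0 if $g->[$k] != $entries[$j][$k];
    }
}
print $ok ? "ok\n" : "mismatch\n";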
+ +($r_ptr,$a_ptr,$b_ptr,$ff,$a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7,$t1,$t2)= + map("r$_",(0..12,14)); +($t0,$t3)=($ff,$a_ptr); + +$code.=<<___; +@ void ecp_nistz256_to_mont(BN_ULONG r0[8],const BN_ULONG r1[8]); +.globl ecp_nistz256_to_mont +.type ecp_nistz256_to_mont,%function +ecp_nistz256_to_mont: + adr $b_ptr,.LRR + b .Lecp_nistz256_mul_mont +.size ecp_nistz256_to_mont,.-ecp_nistz256_to_mont + +@ void ecp_nistz256_from_mont(BN_ULONG r0[8],const BN_ULONG r1[8]); +.globl ecp_nistz256_from_mont +.type ecp_nistz256_from_mont,%function +ecp_nistz256_from_mont: + adr $b_ptr,.Lone + b .Lecp_nistz256_mul_mont +.size ecp_nistz256_from_mont,.-ecp_nistz256_from_mont + +@ void ecp_nistz256_mul_by_2(BN_ULONG r0[8],const BN_ULONG r1[8]); +.globl ecp_nistz256_mul_by_2 +.type ecp_nistz256_mul_by_2,%function +.align 4 +ecp_nistz256_mul_by_2: + stmdb sp!,{r4-r12,lr} + bl __ecp_nistz256_mul_by_2 +#if __ARM_ARCH__>=5 || !defined(__thumb__) + ldmia sp!,{r4-r12,pc} +#else + ldmia sp!,{r4-r12,lr} + bx lr @ interoperable with Thumb ISA:-) +#endif +.size ecp_nistz256_mul_by_2,.-ecp_nistz256_mul_by_2 + +.type __ecp_nistz256_mul_by_2,%function +.align 4 +__ecp_nistz256_mul_by_2: + ldr $a0,[$a_ptr,#0] + ldr $a1,[$a_ptr,#4] + ldr $a2,[$a_ptr,#8] + adds $a0,$a0,$a0 @ a[0:7]+=a[0:7], i.e. add with itself + ldr $a3,[$a_ptr,#12] + adcs $a1,$a1,$a1 + ldr $a4,[$a_ptr,#16] + adcs $a2,$a2,$a2 + ldr $a5,[$a_ptr,#20] + adcs $a3,$a3,$a3 + ldr $a6,[$a_ptr,#24] + adcs $a4,$a4,$a4 + ldr $a7,[$a_ptr,#28] + adcs $a5,$a5,$a5 + adcs $a6,$a6,$a6 + mov $ff,#0 + adcs $a7,$a7,$a7 + adc $ff,$ff,#0 + + b .Lreduce_by_sub +.size __ecp_nistz256_mul_by_2,.-__ecp_nistz256_mul_by_2 + +@ void ecp_nistz256_add(BN_ULONG r0[8],const BN_ULONG r1[8], +@ const BN_ULONG r2[8]); +.globl ecp_nistz256_add +.type ecp_nistz256_add,%function +.align 4 +ecp_nistz256_add: + stmdb sp!,{r4-r12,lr} + bl __ecp_nistz256_add +#if __ARM_ARCH__>=5 || !defined(__thumb__) + ldmia sp!,{r4-r12,pc} +#else + ldmia sp!,{r4-r12,lr} + bx lr @ interoperable with Thumb ISA:-) +#endif +.size ecp_nistz256_add,.-ecp_nistz256_add + +.type __ecp_nistz256_add,%function +.align 4 +__ecp_nistz256_add: + str lr,[sp,#-4]! @ push lr + + ldr $a0,[$a_ptr,#0] + ldr $a1,[$a_ptr,#4] + ldr $a2,[$a_ptr,#8] + ldr $a3,[$a_ptr,#12] + ldr $a4,[$a_ptr,#16] + ldr $t0,[$b_ptr,#0] + ldr $a5,[$a_ptr,#20] + ldr $t1,[$b_ptr,#4] + ldr $a6,[$a_ptr,#24] + ldr $t2,[$b_ptr,#8] + ldr $a7,[$a_ptr,#28] + ldr $t3,[$b_ptr,#12] + adds $a0,$a0,$t0 + ldr $t0,[$b_ptr,#16] + adcs $a1,$a1,$t1 + ldr $t1,[$b_ptr,#20] + adcs $a2,$a2,$t2 + ldr $t2,[$b_ptr,#24] + adcs $a3,$a3,$t3 + ldr $t3,[$b_ptr,#28] + adcs $a4,$a4,$t0 + adcs $a5,$a5,$t1 + adcs $a6,$a6,$t2 + mov $ff,#0 + adcs $a7,$a7,$t3 + adc $ff,$ff,#0 + ldr lr,[sp],#4 @ pop lr + +.Lreduce_by_sub: + + @ if a+b >= modulus, subtract modulus. + @ + @ But since comparison implies subtraction, we subtract + @ modulus and then add it back if subtraction borrowed. + + subs $a0,$a0,#-1 + sbcs $a1,$a1,#-1 + sbcs $a2,$a2,#-1 + sbcs $a3,$a3,#0 + sbcs $a4,$a4,#0 + sbcs $a5,$a5,#0 + sbcs $a6,$a6,#1 + sbcs $a7,$a7,#-1 + sbc $ff,$ff,#0 + + @ Note that because mod has special form, i.e. consists of + @ 0xffffffff, 1 and 0s, we can conditionally synthesize it by + @ using value of borrow as a whole or extracting single bit. + @ Follow $ff register... 
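The reduce-by-sub step above is plain big-integer arithmetic: compute a+b, subtract the modulus, and keep the subtracted value only if the subtraction did not borrow. A minimal standalone Perl sketch of that identity (not part of the imported file; the assembly performs the final selection branchlessly, via the borrow mask broadcast into $ff):

```perl
#!/usr/bin/env perl
# Sketch of .Lreduce_by_sub: after the add, subtract the NIST P-256 prime
# and add it back if the subtraction borrowed.
use strict;
use warnings;
use Math::BigInt;

# p = 2^256 - 2^224 + 2^192 + 2^96 - 1; as 32-bit words (LSW first):
# ffffffff ffffffff ffffffff 00000000 00000000 00000000 00000001 ffffffff
my $p = (Math::BigInt->new(1) << 256) - (Math::BigInt->new(1) << 224)
      + (Math::BigInt->new(1) << 192) + (Math::BigInt->new(1) << 96) - 1;

sub add_mod_p {               # assumes 0 <= $a, $b < p
    my ($a, $b) = @_;
    my $sum = $a + $b;        # < 2p, so at most one reduction is needed
    my $t   = $sum - $p;      # "comparison implies subtraction"
    return $t->is_neg() ? $sum : $t;   # borrowed => keep sum, else keep t
}

my $a = $p - 2;
my $b = $p - 3;
print add_mod_p($a, $b) == ($a + $b)->bmod($p) ? "ok\n" : "mismatch\n";
```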
+ + adds $a0,$a0,$ff @ add synthesized modulus + adcs $a1,$a1,$ff + str $a0,[$r_ptr,#0] + adcs $a2,$a2,$ff + str $a1,[$r_ptr,#4] + adcs $a3,$a3,#0 + str $a2,[$r_ptr,#8] + adcs $a4,$a4,#0 + str $a3,[$r_ptr,#12] + adcs $a5,$a5,#0 + str $a4,[$r_ptr,#16] + adcs $a6,$a6,$ff,lsr#31 + str $a5,[$r_ptr,#20] + adcs $a7,$a7,$ff + str $a6,[$r_ptr,#24] + str $a7,[$r_ptr,#28] + + mov pc,lr +.size __ecp_nistz256_add,.-__ecp_nistz256_add + +@ void ecp_nistz256_mul_by_3(BN_ULONG r0[8],const BN_ULONG r1[8]); +.globl ecp_nistz256_mul_by_3 +.type ecp_nistz256_mul_by_3,%function +.align 4 +ecp_nistz256_mul_by_3: + stmdb sp!,{r4-r12,lr} + bl __ecp_nistz256_mul_by_3 +#if __ARM_ARCH__>=5 || !defined(__thumb__) + ldmia sp!,{r4-r12,pc} +#else + ldmia sp!,{r4-r12,lr} + bx lr @ interoperable with Thumb ISA:-) +#endif +.size ecp_nistz256_mul_by_3,.-ecp_nistz256_mul_by_3 + +.type __ecp_nistz256_mul_by_3,%function +.align 4 +__ecp_nistz256_mul_by_3: + str lr,[sp,#-4]! @ push lr + + @ As multiplication by 3 is performed as 2*n+n, below are inline + @ copies of __ecp_nistz256_mul_by_2 and __ecp_nistz256_add, see + @ corresponding subroutines for details. + + ldr $a0,[$a_ptr,#0] + ldr $a1,[$a_ptr,#4] + ldr $a2,[$a_ptr,#8] + adds $a0,$a0,$a0 @ a[0:7]+=a[0:7] + ldr $a3,[$a_ptr,#12] + adcs $a1,$a1,$a1 + ldr $a4,[$a_ptr,#16] + adcs $a2,$a2,$a2 + ldr $a5,[$a_ptr,#20] + adcs $a3,$a3,$a3 + ldr $a6,[$a_ptr,#24] + adcs $a4,$a4,$a4 + ldr $a7,[$a_ptr,#28] + adcs $a5,$a5,$a5 + adcs $a6,$a6,$a6 + mov $ff,#0 + adcs $a7,$a7,$a7 + adc $ff,$ff,#0 + + subs $a0,$a0,#-1 @ .Lreduce_by_sub but without stores + sbcs $a1,$a1,#-1 + sbcs $a2,$a2,#-1 + sbcs $a3,$a3,#0 + sbcs $a4,$a4,#0 + sbcs $a5,$a5,#0 + sbcs $a6,$a6,#1 + sbcs $a7,$a7,#-1 + sbc $ff,$ff,#0 + + adds $a0,$a0,$ff @ add synthesized modulus + adcs $a1,$a1,$ff + adcs $a2,$a2,$ff + adcs $a3,$a3,#0 + adcs $a4,$a4,#0 + ldr $b_ptr,[$a_ptr,#0] + adcs $a5,$a5,#0 + ldr $t1,[$a_ptr,#4] + adcs $a6,$a6,$ff,lsr#31 + ldr $t2,[$a_ptr,#8] + adc $a7,$a7,$ff + + ldr $t0,[$a_ptr,#12] + adds $a0,$a0,$b_ptr @ 2*a[0:7]+=a[0:7] + ldr $b_ptr,[$a_ptr,#16] + adcs $a1,$a1,$t1 + ldr $t1,[$a_ptr,#20] + adcs $a2,$a2,$t2 + ldr $t2,[$a_ptr,#24] + adcs $a3,$a3,$t0 + ldr $t3,[$a_ptr,#28] + adcs $a4,$a4,$b_ptr + adcs $a5,$a5,$t1 + adcs $a6,$a6,$t2 + mov $ff,#0 + adcs $a7,$a7,$t3 + adc $ff,$ff,#0 + ldr lr,[sp],#4 @ pop lr + + b .Lreduce_by_sub +.size ecp_nistz256_mul_by_3,.-ecp_nistz256_mul_by_3 + +@ void ecp_nistz256_div_by_2(BN_ULONG r0[8],const BN_ULONG r1[8]); +.globl ecp_nistz256_div_by_2 +.type ecp_nistz256_div_by_2,%function +.align 4 +ecp_nistz256_div_by_2: + stmdb sp!,{r4-r12,lr} + bl __ecp_nistz256_div_by_2 +#if __ARM_ARCH__>=5 || !defined(__thumb__) + ldmia sp!,{r4-r12,pc} +#else + ldmia sp!,{r4-r12,lr} + bx lr @ interoperable with Thumb ISA:-) +#endif +.size ecp_nistz256_div_by_2,.-ecp_nistz256_div_by_2 + +.type __ecp_nistz256_div_by_2,%function +.align 4 +__ecp_nistz256_div_by_2: + @ ret = (a is odd ? a+mod : a) >> 1 + + ldr $a0,[$a_ptr,#0] + ldr $a1,[$a_ptr,#4] + ldr $a2,[$a_ptr,#8] + mov $ff,$a0,lsl#31 @ place least significant bit to most + @ significant position, now arithmetic + @ right shift by 31 will produce -1 or + @ 0, while logical right shift 1 or 0, + @ this is how modulus is conditionally + @ synthesized in this case... 
+ ldr $a3,[$a_ptr,#12] + adds $a0,$a0,$ff,asr#31 + ldr $a4,[$a_ptr,#16] + adcs $a1,$a1,$ff,asr#31 + ldr $a5,[$a_ptr,#20] + adcs $a2,$a2,$ff,asr#31 + ldr $a6,[$a_ptr,#24] + adcs $a3,$a3,#0 + ldr $a7,[$a_ptr,#28] + adcs $a4,$a4,#0 + mov $a0,$a0,lsr#1 @ a[0:7]>>=1, we can start early + @ because it doesn't affect flags + adcs $a5,$a5,#0 + orr $a0,$a0,$a1,lsl#31 + adcs $a6,$a6,$ff,lsr#31 + mov $b_ptr,#0 + adcs $a7,$a7,$ff,asr#31 + mov $a1,$a1,lsr#1 + adc $b_ptr,$b_ptr,#0 @ top-most carry bit from addition + + orr $a1,$a1,$a2,lsl#31 + mov $a2,$a2,lsr#1 + str $a0,[$r_ptr,#0] + orr $a2,$a2,$a3,lsl#31 + mov $a3,$a3,lsr#1 + str $a1,[$r_ptr,#4] + orr $a3,$a3,$a4,lsl#31 + mov $a4,$a4,lsr#1 + str $a2,[$r_ptr,#8] + orr $a4,$a4,$a5,lsl#31 + mov $a5,$a5,lsr#1 + str $a3,[$r_ptr,#12] + orr $a5,$a5,$a6,lsl#31 + mov $a6,$a6,lsr#1 + str $a4,[$r_ptr,#16] + orr $a6,$a6,$a7,lsl#31 + mov $a7,$a7,lsr#1 + str $a5,[$r_ptr,#20] + orr $a7,$a7,$b_ptr,lsl#31 @ don't forget the top-most carry bit + str $a6,[$r_ptr,#24] + str $a7,[$r_ptr,#28] + + mov pc,lr +.size __ecp_nistz256_div_by_2,.-__ecp_nistz256_div_by_2 + +@ void ecp_nistz256_sub(BN_ULONG r0[8],const BN_ULONG r1[8], +@ const BN_ULONG r2[8]); +.globl ecp_nistz256_sub +.type ecp_nistz256_sub,%function +.align 4 +ecp_nistz256_sub: + stmdb sp!,{r4-r12,lr} + bl __ecp_nistz256_sub +#if __ARM_ARCH__>=5 || !defined(__thumb__) + ldmia sp!,{r4-r12,pc} +#else + ldmia sp!,{r4-r12,lr} + bx lr @ interoperable with Thumb ISA:-) +#endif +.size ecp_nistz256_sub,.-ecp_nistz256_sub + +.type __ecp_nistz256_sub,%function +.align 4 +__ecp_nistz256_sub: + str lr,[sp,#-4]! @ push lr + + ldr $a0,[$a_ptr,#0] + ldr $a1,[$a_ptr,#4] + ldr $a2,[$a_ptr,#8] + ldr $a3,[$a_ptr,#12] + ldr $a4,[$a_ptr,#16] + ldr $t0,[$b_ptr,#0] + ldr $a5,[$a_ptr,#20] + ldr $t1,[$b_ptr,#4] + ldr $a6,[$a_ptr,#24] + ldr $t2,[$b_ptr,#8] + ldr $a7,[$a_ptr,#28] + ldr $t3,[$b_ptr,#12] + subs $a0,$a0,$t0 + ldr $t0,[$b_ptr,#16] + sbcs $a1,$a1,$t1 + ldr $t1,[$b_ptr,#20] + sbcs $a2,$a2,$t2 + ldr $t2,[$b_ptr,#24] + sbcs $a3,$a3,$t3 + ldr $t3,[$b_ptr,#28] + sbcs $a4,$a4,$t0 + sbcs $a5,$a5,$t1 + sbcs $a6,$a6,$t2 + sbcs $a7,$a7,$t3 + sbc $ff,$ff,$ff @ broadcast borrow bit + ldr lr,[sp],#4 @ pop lr + +.Lreduce_by_add: + + @ if a-b borrows, add modulus. + @ + @ Note that because mod has special form, i.e. consists of + @ 0xffffffff, 1 and 0s, we can conditionally synthesize it by + @ broadcasting borrow bit to a register, $ff, and using it as + @ a whole or extracting single bit. 
+ + adds $a0,$a0,$ff @ add synthesized modulus + adcs $a1,$a1,$ff + str $a0,[$r_ptr,#0] + adcs $a2,$a2,$ff + str $a1,[$r_ptr,#4] + adcs $a3,$a3,#0 + str $a2,[$r_ptr,#8] + adcs $a4,$a4,#0 + str $a3,[$r_ptr,#12] + adcs $a5,$a5,#0 + str $a4,[$r_ptr,#16] + adcs $a6,$a6,$ff,lsr#31 + str $a5,[$r_ptr,#20] + adcs $a7,$a7,$ff + str $a6,[$r_ptr,#24] + str $a7,[$r_ptr,#28] + + mov pc,lr +.size __ecp_nistz256_sub,.-__ecp_nistz256_sub + +@ void ecp_nistz256_neg(BN_ULONG r0[8],const BN_ULONG r1[8]); +.globl ecp_nistz256_neg +.type ecp_nistz256_neg,%function +.align 4 +ecp_nistz256_neg: + stmdb sp!,{r4-r12,lr} + bl __ecp_nistz256_neg +#if __ARM_ARCH__>=5 || !defined(__thumb__) + ldmia sp!,{r4-r12,pc} +#else + ldmia sp!,{r4-r12,lr} + bx lr @ interoperable with Thumb ISA:-) +#endif +.size ecp_nistz256_neg,.-ecp_nistz256_neg + +.type __ecp_nistz256_neg,%function +.align 4 +__ecp_nistz256_neg: + ldr $a0,[$a_ptr,#0] + eor $ff,$ff,$ff + ldr $a1,[$a_ptr,#4] + ldr $a2,[$a_ptr,#8] + subs $a0,$ff,$a0 + ldr $a3,[$a_ptr,#12] + sbcs $a1,$ff,$a1 + ldr $a4,[$a_ptr,#16] + sbcs $a2,$ff,$a2 + ldr $a5,[$a_ptr,#20] + sbcs $a3,$ff,$a3 + ldr $a6,[$a_ptr,#24] + sbcs $a4,$ff,$a4 + ldr $a7,[$a_ptr,#28] + sbcs $a5,$ff,$a5 + sbcs $a6,$ff,$a6 + sbcs $a7,$ff,$a7 + sbc $ff,$ff,$ff + + b .Lreduce_by_add +.size __ecp_nistz256_neg,.-__ecp_nistz256_neg +___ +{ +my @acc=map("r$_",(3..11)); +my ($t0,$t1,$bj,$t2,$t3)=map("r$_",(0,1,2,12,14)); + +$code.=<<___; +@ void ecp_nistz256_sqr_mont(BN_ULONG r0[8],const BN_ULONG r1[8]); +.globl ecp_nistz256_sqr_mont +.type ecp_nistz256_sqr_mont,%function +.align 4 +ecp_nistz256_sqr_mont: + mov $b_ptr,$a_ptr + b .Lecp_nistz256_mul_mont +.size ecp_nistz256_sqr_mont,.-ecp_nistz256_sqr_mont + +@ void ecp_nistz256_mul_mont(BN_ULONG r0[8],const BN_ULONG r1[8], +@ const BN_ULONG r2[8]); +.globl ecp_nistz256_mul_mont +.type ecp_nistz256_mul_mont,%function +.align 4 +ecp_nistz256_mul_mont: +.Lecp_nistz256_mul_mont: + stmdb sp!,{r4-r12,lr} + bl __ecp_nistz256_mul_mont +#if __ARM_ARCH__>=5 || !defined(__thumb__) + ldmia sp!,{r4-r12,pc} +#else + ldmia sp!,{r4-r12,lr} + bx lr @ interoperable with Thumb ISA:-) +#endif +.size ecp_nistz256_mul_mont,.-ecp_nistz256_mul_mont + +.type __ecp_nistz256_mul_mont,%function +.align 4 +__ecp_nistz256_mul_mont: + stmdb sp!,{r0-r2,lr} @ make a copy of arguments too + + ldr $bj,[$b_ptr,#0] @ b[0] + ldmia $a_ptr,{@acc[1]-@acc[8]} + + umull @acc[0],$t3,@acc[1],$bj @ r[0]=a[0]*b[0] + stmdb sp!,{$acc[1]-@acc[8]} @ copy a[0-7] to stack, so + @ that it can be addressed + @ without spending register + @ on address + umull @acc[1],$t0,@acc[2],$bj @ r[1]=a[1]*b[0] + umull @acc[2],$t1,@acc[3],$bj + adds @acc[1],@acc[1],$t3 @ accumulate high part of mult + umull @acc[3],$t2,@acc[4],$bj + adcs @acc[2],@acc[2],$t0 + umull @acc[4],$t3,@acc[5],$bj + adcs @acc[3],@acc[3],$t1 + umull @acc[5],$t0,@acc[6],$bj + adcs @acc[4],@acc[4],$t2 + umull @acc[6],$t1,@acc[7],$bj + adcs @acc[5],@acc[5],$t3 + umull @acc[7],$t2,@acc[8],$bj + adcs @acc[6],@acc[6],$t0 + adcs @acc[7],@acc[7],$t1 + eor $t3,$t3,$t3 @ first overflow bit is zero + adc @acc[8],$t2,#0 +___ +for(my $i=1;$i<8;$i++) { +my $t4=@acc[0]; + + # Reduction iteration is normally performed by accumulating + # result of multiplication of modulus by "magic" digit [and + # omitting least significant word, which is guaranteed to + # be 0], but thanks to special form of modulus and "magic" + # digit being equal to least significant word, it can be + # performed with additions and subtractions alone. 
Indeed: + # + # ffff.0001.0000.0000.0000.ffff.ffff.ffff + # * abcd + # + xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.abcd + # + # Now observing that ff..ff*x = (2^n-1)*x = 2^n*x-x, we + # rewrite above as: + # + # xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.abcd + # + abcd.0000.abcd.0000.0000.abcd.0000.0000.0000 + # - abcd.0000.0000.0000.0000.0000.0000.abcd + # + # or marking redundant operations: + # + # xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.---- + # + abcd.0000.abcd.0000.0000.abcd.----.----.---- + # - abcd.----.----.----.----.----.----.---- + +$code.=<<___; + @ multiplication-less reduction $i + adds @acc[3],@acc[3],@acc[0] @ r[3]+=r[0] + ldr $bj,[sp,#40] @ restore b_ptr + adcs @acc[4],@acc[4],#0 @ r[4]+=0 + adcs @acc[5],@acc[5],#0 @ r[5]+=0 + adcs @acc[6],@acc[6],@acc[0] @ r[6]+=r[0] + ldr $t1,[sp,#0] @ load a[0] + adcs @acc[7],@acc[7],#0 @ r[7]+=0 + ldr $bj,[$bj,#4*$i] @ load b[i] + adcs @acc[8],@acc[8],@acc[0] @ r[8]+=r[0] + eor $t0,$t0,$t0 + adc $t3,$t3,#0 @ overflow bit + subs @acc[7],@acc[7],@acc[0] @ r[7]-=r[0] + ldr $t2,[sp,#4] @ a[1] + sbcs @acc[8],@acc[8],#0 @ r[8]-=0 + umlal @acc[1],$t0,$t1,$bj @ "r[0]"+=a[0]*b[i] + eor $t1,$t1,$t1 + sbc @acc[0],$t3,#0 @ overflow bit, keep in mind + @ that netto result is + @ addition of a value which + @ makes underflow impossible + + ldr $t3,[sp,#8] @ a[2] + umlal @acc[2],$t1,$t2,$bj @ "r[1]"+=a[1]*b[i] + str @acc[0],[sp,#36] @ temporarily offload overflow + eor $t2,$t2,$t2 + ldr $t4,[sp,#12] @ a[3], $t4 is alias @acc[0] + umlal @acc[3],$t2,$t3,$bj @ "r[2]"+=a[2]*b[i] + eor $t3,$t3,$t3 + adds @acc[2],@acc[2],$t0 @ accumulate high part of mult + ldr $t0,[sp,#16] @ a[4] + umlal @acc[4],$t3,$t4,$bj @ "r[3]"+=a[3]*b[i] + eor $t4,$t4,$t4 + adcs @acc[3],@acc[3],$t1 + ldr $t1,[sp,#20] @ a[5] + umlal @acc[5],$t4,$t0,$bj @ "r[4]"+=a[4]*b[i] + eor $t0,$t0,$t0 + adcs @acc[4],@acc[4],$t2 + ldr $t2,[sp,#24] @ a[6] + umlal @acc[6],$t0,$t1,$bj @ "r[5]"+=a[5]*b[i] + eor $t1,$t1,$t1 + adcs @acc[5],@acc[5],$t3 + ldr $t3,[sp,#28] @ a[7] + umlal @acc[7],$t1,$t2,$bj @ "r[6]"+=a[6]*b[i] + eor $t2,$t2,$t2 + adcs @acc[6],@acc[6],$t4 + ldr @acc[0],[sp,#36] @ restore overflow bit + umlal @acc[8],$t2,$t3,$bj @ "r[7]"+=a[7]*b[i] + eor $t3,$t3,$t3 + adcs @acc[7],@acc[7],$t0 + adcs @acc[8],@acc[8],$t1 + adcs @acc[0],$acc[0],$t2 + adc $t3,$t3,#0 @ new overflow bit +___ + push(@acc,shift(@acc)); # rotate registers, so that + # "r[i]" becomes r[i] +} +$code.=<<___; + @ last multiplication-less reduction + adds @acc[3],@acc[3],@acc[0] + ldr $r_ptr,[sp,#32] @ restore r_ptr + adcs @acc[4],@acc[4],#0 + adcs @acc[5],@acc[5],#0 + adcs @acc[6],@acc[6],@acc[0] + adcs @acc[7],@acc[7],#0 + adcs @acc[8],@acc[8],@acc[0] + adc $t3,$t3,#0 + subs @acc[7],@acc[7],@acc[0] + sbcs @acc[8],@acc[8],#0 + sbc @acc[0],$t3,#0 @ overflow bit + + @ Final step is "if result > mod, subtract mod", but we do it + @ "other way around", namely subtract modulus from result + @ and if it borrowed, add modulus back. + + adds @acc[1],@acc[1],#1 @ subs @acc[1],@acc[1],#-1 + adcs @acc[2],@acc[2],#0 @ sbcs @acc[2],@acc[2],#-1 + adcs @acc[3],@acc[3],#0 @ sbcs @acc[3],@acc[3],#-1 + sbcs @acc[4],@acc[4],#0 + sbcs @acc[5],@acc[5],#0 + sbcs @acc[6],@acc[6],#0 + sbcs @acc[7],@acc[7],#1 + adcs @acc[8],@acc[8],#0 @ sbcs @acc[8],@acc[8],#-1 + ldr lr,[sp,#44] @ restore lr + sbc @acc[0],@acc[0],#0 @ broadcast borrow bit + add sp,sp,#48 + + @ Note that because mod has special form, i.e. 
consists of + @ 0xffffffff, 1 and 0s, we can conditionally synthesize it by + @ broadcasting borrow bit to a register, @acc[0], and using it as + @ a whole or extracting single bit. + + adds @acc[1],@acc[1],@acc[0] @ add modulus or zero + adcs @acc[2],@acc[2],@acc[0] + str @acc[1],[$r_ptr,#0] + adcs @acc[3],@acc[3],@acc[0] + str @acc[2],[$r_ptr,#4] + adcs @acc[4],@acc[4],#0 + str @acc[3],[$r_ptr,#8] + adcs @acc[5],@acc[5],#0 + str @acc[4],[$r_ptr,#12] + adcs @acc[6],@acc[6],#0 + str @acc[5],[$r_ptr,#16] + adcs @acc[7],@acc[7],@acc[0],lsr#31 + str @acc[6],[$r_ptr,#20] + adc @acc[8],@acc[8],@acc[0] + str @acc[7],[$r_ptr,#24] + str @acc[8],[$r_ptr,#28] + + mov pc,lr +.size __ecp_nistz256_mul_mont,.-__ecp_nistz256_mul_mont +___ +} + +{ +my ($out,$inp,$index,$mask)=map("r$_",(0..3)); +$code.=<<___; +@ void ecp_nistz256_scatter_w5(void *r0,const P256_POINT *r1, +@ int r2); +.globl ecp_nistz256_scatter_w5 +.type ecp_nistz256_scatter_w5,%function +.align 5 +ecp_nistz256_scatter_w5: + stmdb sp!,{r4-r11} + + add $out,$out,$index,lsl#2 + + ldmia $inp!,{r4-r11} @ X + str r4,[$out,#64*0-4] + str r5,[$out,#64*1-4] + str r6,[$out,#64*2-4] + str r7,[$out,#64*3-4] + str r8,[$out,#64*4-4] + str r9,[$out,#64*5-4] + str r10,[$out,#64*6-4] + str r11,[$out,#64*7-4] + add $out,$out,#64*8 + + ldmia $inp!,{r4-r11} @ Y + str r4,[$out,#64*0-4] + str r5,[$out,#64*1-4] + str r6,[$out,#64*2-4] + str r7,[$out,#64*3-4] + str r8,[$out,#64*4-4] + str r9,[$out,#64*5-4] + str r10,[$out,#64*6-4] + str r11,[$out,#64*7-4] + add $out,$out,#64*8 + + ldmia $inp,{r4-r11} @ Z + str r4,[$out,#64*0-4] + str r5,[$out,#64*1-4] + str r6,[$out,#64*2-4] + str r7,[$out,#64*3-4] + str r8,[$out,#64*4-4] + str r9,[$out,#64*5-4] + str r10,[$out,#64*6-4] + str r11,[$out,#64*7-4] + + ldmia sp!,{r4-r11} +#if __ARM_ARCH__>=5 || defined(__thumb__) + bx lr +#else + mov pc,lr +#endif +.size ecp_nistz256_scatter_w5,.-ecp_nistz256_scatter_w5 + +@ void ecp_nistz256_gather_w5(P256_POINT *r0,const void *r1, +@ int r2); +.globl ecp_nistz256_gather_w5 +.type ecp_nistz256_gather_w5,%function +.align 5 +ecp_nistz256_gather_w5: + stmdb sp!,{r4-r11} + + cmp $index,#0 + mov $mask,#0 +#ifdef __thumb2__ + itt ne +#endif + subne $index,$index,#1 + movne $mask,#-1 + add $inp,$inp,$index,lsl#2 + + ldr r4,[$inp,#64*0] + ldr r5,[$inp,#64*1] + ldr r6,[$inp,#64*2] + and r4,r4,$mask + ldr r7,[$inp,#64*3] + and r5,r5,$mask + ldr r8,[$inp,#64*4] + and r6,r6,$mask + ldr r9,[$inp,#64*5] + and r7,r7,$mask + ldr r10,[$inp,#64*6] + and r8,r8,$mask + ldr r11,[$inp,#64*7] + add $inp,$inp,#64*8 + and r9,r9,$mask + and r10,r10,$mask + and r11,r11,$mask + stmia $out!,{r4-r11} @ X + + ldr r4,[$inp,#64*0] + ldr r5,[$inp,#64*1] + ldr r6,[$inp,#64*2] + and r4,r4,$mask + ldr r7,[$inp,#64*3] + and r5,r5,$mask + ldr r8,[$inp,#64*4] + and r6,r6,$mask + ldr r9,[$inp,#64*5] + and r7,r7,$mask + ldr r10,[$inp,#64*6] + and r8,r8,$mask + ldr r11,[$inp,#64*7] + add $inp,$inp,#64*8 + and r9,r9,$mask + and r10,r10,$mask + and r11,r11,$mask + stmia $out!,{r4-r11} @ Y + + ldr r4,[$inp,#64*0] + ldr r5,[$inp,#64*1] + ldr r6,[$inp,#64*2] + and r4,r4,$mask + ldr r7,[$inp,#64*3] + and r5,r5,$mask + ldr r8,[$inp,#64*4] + and r6,r6,$mask + ldr r9,[$inp,#64*5] + and r7,r7,$mask + ldr r10,[$inp,#64*6] + and r8,r8,$mask + ldr r11,[$inp,#64*7] + and r9,r9,$mask + and r10,r10,$mask + and r11,r11,$mask + stmia $out,{r4-r11} @ Z + + ldmia sp!,{r4-r11} +#if __ARM_ARCH__>=5 || defined(__thumb__) + bx lr +#else + mov pc,lr +#endif +.size ecp_nistz256_gather_w5,.-ecp_nistz256_gather_w5 + +@ void 
ecp_nistz256_scatter_w7(void *r0,const P256_POINT_AFFINE *r1, +@ int r2); +.globl ecp_nistz256_scatter_w7 +.type ecp_nistz256_scatter_w7,%function +.align 5 +ecp_nistz256_scatter_w7: + add $out,$out,$index + mov $index,#64/4 +.Loop_scatter_w7: + ldr $mask,[$inp],#4 + subs $index,$index,#1 + strb $mask,[$out,#64*0] + mov $mask,$mask,lsr#8 + strb $mask,[$out,#64*1] + mov $mask,$mask,lsr#8 + strb $mask,[$out,#64*2] + mov $mask,$mask,lsr#8 + strb $mask,[$out,#64*3] + add $out,$out,#64*4 + bne .Loop_scatter_w7 + +#if __ARM_ARCH__>=5 || defined(__thumb__) + bx lr +#else + mov pc,lr +#endif +.size ecp_nistz256_scatter_w7,.-ecp_nistz256_scatter_w7 + +@ void ecp_nistz256_gather_w7(P256_POINT_AFFINE *r0,const void *r1, +@ int r2); +.globl ecp_nistz256_gather_w7 +.type ecp_nistz256_gather_w7,%function +.align 5 +ecp_nistz256_gather_w7: + stmdb sp!,{r4-r7} + + cmp $index,#0 + mov $mask,#0 +#ifdef __thumb2__ + itt ne +#endif + subne $index,$index,#1 + movne $mask,#-1 + add $inp,$inp,$index + mov $index,#64/4 + nop +.Loop_gather_w7: + ldrb r4,[$inp,#64*0] + subs $index,$index,#1 + ldrb r5,[$inp,#64*1] + ldrb r6,[$inp,#64*2] + ldrb r7,[$inp,#64*3] + add $inp,$inp,#64*4 + orr r4,r4,r5,lsl#8 + orr r4,r4,r6,lsl#16 + orr r4,r4,r7,lsl#24 + and r4,r4,$mask + str r4,[$out],#4 + bne .Loop_gather_w7 + + ldmia sp!,{r4-r7} +#if __ARM_ARCH__>=5 || defined(__thumb__) + bx lr +#else + mov pc,lr +#endif +.size ecp_nistz256_gather_w7,.-ecp_nistz256_gather_w7 +___ +} +if (0) { +# In comparison to integer-only equivalent of below subroutine: +# +# Cortex-A8 +10% +# Cortex-A9 -10% +# Snapdragon S4 +5% +# +# As not all time is spent in multiplication, overall impact is deemed +# too low to care about. + +my ($A0,$A1,$A2,$A3,$Bi,$zero,$temp)=map("d$_",(0..7)); +my $mask="q4"; +my $mult="q5"; +my @AxB=map("q$_",(8..15)); + +my ($rptr,$aptr,$bptr,$toutptr)=map("r$_",(0..3)); + +$code.=<<___; +#if __ARM_ARCH__>=7 +.fpu neon + +.globl ecp_nistz256_mul_mont_neon +.type ecp_nistz256_mul_mont_neon,%function +.align 5 +ecp_nistz256_mul_mont_neon: + mov ip,sp + stmdb sp!,{r4-r9} + vstmdb sp!,{q4-q5} @ ABI specification says so + + sub $toutptr,sp,#40 + vld1.32 {${Bi}[0]},[$bptr,:32]! + veor $zero,$zero,$zero + vld1.32 {$A0-$A3}, [$aptr] @ can't specify :32 :-( + vzip.16 $Bi,$zero + mov sp,$toutptr @ alloca + vmov.i64 $mask,#0xffff + + vmull.u32 @AxB[0],$Bi,${A0}[0] + vmull.u32 @AxB[1],$Bi,${A0}[1] + vmull.u32 @AxB[2],$Bi,${A1}[0] + vmull.u32 @AxB[3],$Bi,${A1}[1] + vshr.u64 $temp,@AxB[0]#lo,#16 + vmull.u32 @AxB[4],$Bi,${A2}[0] + vadd.u64 @AxB[0]#hi,@AxB[0]#hi,$temp + vmull.u32 @AxB[5],$Bi,${A2}[1] + vshr.u64 $temp,@AxB[0]#hi,#16 @ upper 32 bits of a[0]*b[0] + vmull.u32 @AxB[6],$Bi,${A3}[0] + vand.u64 @AxB[0],@AxB[0],$mask @ lower 32 bits of a[0]*b[0] + vmull.u32 @AxB[7],$Bi,${A3}[1] +___ +for($i=1;$i<8;$i++) { +$code.=<<___; + vld1.32 {${Bi}[0]},[$bptr,:32]! 
+ veor $zero,$zero,$zero + vadd.u64 @AxB[1]#lo,@AxB[1]#lo,$temp @ reduction + vshl.u64 $mult,@AxB[0],#32 + vadd.u64 @AxB[3],@AxB[3],@AxB[0] + vsub.u64 $mult,$mult,@AxB[0] + vzip.16 $Bi,$zero + vadd.u64 @AxB[6],@AxB[6],@AxB[0] + vadd.u64 @AxB[7],@AxB[7],$mult +___ + push(@AxB,shift(@AxB)); +$code.=<<___; + vmlal.u32 @AxB[0],$Bi,${A0}[0] + vmlal.u32 @AxB[1],$Bi,${A0}[1] + vmlal.u32 @AxB[2],$Bi,${A1}[0] + vmlal.u32 @AxB[3],$Bi,${A1}[1] + vshr.u64 $temp,@AxB[0]#lo,#16 + vmlal.u32 @AxB[4],$Bi,${A2}[0] + vadd.u64 @AxB[0]#hi,@AxB[0]#hi,$temp + vmlal.u32 @AxB[5],$Bi,${A2}[1] + vshr.u64 $temp,@AxB[0]#hi,#16 @ upper 33 bits of a[0]*b[i]+t[0] + vmlal.u32 @AxB[6],$Bi,${A3}[0] + vand.u64 @AxB[0],@AxB[0],$mask @ lower 32 bits of a[0]*b[0] + vmull.u32 @AxB[7],$Bi,${A3}[1] +___ +} +$code.=<<___; + vadd.u64 @AxB[1]#lo,@AxB[1]#lo,$temp @ last reduction + vshl.u64 $mult,@AxB[0],#32 + vadd.u64 @AxB[3],@AxB[3],@AxB[0] + vsub.u64 $mult,$mult,@AxB[0] + vadd.u64 @AxB[6],@AxB[6],@AxB[0] + vadd.u64 @AxB[7],@AxB[7],$mult + + vshr.u64 $temp,@AxB[1]#lo,#16 @ convert + vadd.u64 @AxB[1]#hi,@AxB[1]#hi,$temp + vshr.u64 $temp,@AxB[1]#hi,#16 + vzip.16 @AxB[1]#lo,@AxB[1]#hi +___ +foreach (2..7) { +$code.=<<___; + vadd.u64 @AxB[$_]#lo,@AxB[$_]#lo,$temp + vst1.32 {@AxB[$_-1]#lo[0]},[$toutptr,:32]! + vshr.u64 $temp,@AxB[$_]#lo,#16 + vadd.u64 @AxB[$_]#hi,@AxB[$_]#hi,$temp + vshr.u64 $temp,@AxB[$_]#hi,#16 + vzip.16 @AxB[$_]#lo,@AxB[$_]#hi +___ +} +$code.=<<___; + vst1.32 {@AxB[7]#lo[0]},[$toutptr,:32]! + vst1.32 {$temp},[$toutptr] @ upper 33 bits + + ldr r1,[sp,#0] + ldr r2,[sp,#4] + ldr r3,[sp,#8] + subs r1,r1,#-1 + ldr r4,[sp,#12] + sbcs r2,r2,#-1 + ldr r5,[sp,#16] + sbcs r3,r3,#-1 + ldr r6,[sp,#20] + sbcs r4,r4,#0 + ldr r7,[sp,#24] + sbcs r5,r5,#0 + ldr r8,[sp,#28] + sbcs r6,r6,#0 + ldr r9,[sp,#32] @ top-most bit + sbcs r7,r7,#1 + sub sp,ip,#40+16 + sbcs r8,r8,#-1 + sbc r9,r9,#0 + vldmia sp!,{q4-q5} + + adds r1,r1,r9 + adcs r2,r2,r9 + str r1,[$rptr,#0] + adcs r3,r3,r9 + str r2,[$rptr,#4] + adcs r4,r4,#0 + str r3,[$rptr,#8] + adcs r5,r5,#0 + str r4,[$rptr,#12] + adcs r6,r6,#0 + str r5,[$rptr,#16] + adcs r7,r7,r9,lsr#31 + str r6,[$rptr,#20] + adcs r8,r8,r9 + str r7,[$rptr,#24] + str r8,[$rptr,#28] + + ldmia sp!,{r4-r9} + bx lr +.size ecp_nistz256_mul_mont_neon,.-ecp_nistz256_mul_mont_neon +#endif +___ +} + +{{{ +######################################################################## +# Below $aN assignment matches order in which 256-bit result appears in +# register bank at return from __ecp_nistz256_mul_mont, so that we can +# skip over reloading it from memory. This means that below functions +# use custom calling sequence accepting 256-bit input in registers, +# output pointer in r0, $r_ptr, and optional pointer in r2, $b_ptr. +# +# See their "normal" counterparts for insights on calculations. + +my ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7, + $t0,$t1,$t2,$t3)=map("r$_",(11,3..10,12,14,1)); +my $ff=$b_ptr; + +$code.=<<___; +.type __ecp_nistz256_sub_from,%function +.align 5 +__ecp_nistz256_sub_from: + str lr,[sp,#-4]! 
@ push lr + + ldr $t0,[$b_ptr,#0] + ldr $t1,[$b_ptr,#4] + ldr $t2,[$b_ptr,#8] + ldr $t3,[$b_ptr,#12] + subs $a0,$a0,$t0 + ldr $t0,[$b_ptr,#16] + sbcs $a1,$a1,$t1 + ldr $t1,[$b_ptr,#20] + sbcs $a2,$a2,$t2 + ldr $t2,[$b_ptr,#24] + sbcs $a3,$a3,$t3 + ldr $t3,[$b_ptr,#28] + sbcs $a4,$a4,$t0 + sbcs $a5,$a5,$t1 + sbcs $a6,$a6,$t2 + sbcs $a7,$a7,$t3 + sbc $ff,$ff,$ff @ broadcast borrow bit + ldr lr,[sp],#4 @ pop lr + + adds $a0,$a0,$ff @ add synthesized modulus + adcs $a1,$a1,$ff + str $a0,[$r_ptr,#0] + adcs $a2,$a2,$ff + str $a1,[$r_ptr,#4] + adcs $a3,$a3,#0 + str $a2,[$r_ptr,#8] + adcs $a4,$a4,#0 + str $a3,[$r_ptr,#12] + adcs $a5,$a5,#0 + str $a4,[$r_ptr,#16] + adcs $a6,$a6,$ff,lsr#31 + str $a5,[$r_ptr,#20] + adcs $a7,$a7,$ff + str $a6,[$r_ptr,#24] + str $a7,[$r_ptr,#28] + + mov pc,lr +.size __ecp_nistz256_sub_from,.-__ecp_nistz256_sub_from + +.type __ecp_nistz256_sub_morf,%function +.align 5 +__ecp_nistz256_sub_morf: + str lr,[sp,#-4]! @ push lr + + ldr $t0,[$b_ptr,#0] + ldr $t1,[$b_ptr,#4] + ldr $t2,[$b_ptr,#8] + ldr $t3,[$b_ptr,#12] + subs $a0,$t0,$a0 + ldr $t0,[$b_ptr,#16] + sbcs $a1,$t1,$a1 + ldr $t1,[$b_ptr,#20] + sbcs $a2,$t2,$a2 + ldr $t2,[$b_ptr,#24] + sbcs $a3,$t3,$a3 + ldr $t3,[$b_ptr,#28] + sbcs $a4,$t0,$a4 + sbcs $a5,$t1,$a5 + sbcs $a6,$t2,$a6 + sbcs $a7,$t3,$a7 + sbc $ff,$ff,$ff @ broadcast borrow bit + ldr lr,[sp],#4 @ pop lr + + adds $a0,$a0,$ff @ add synthesized modulus + adcs $a1,$a1,$ff + str $a0,[$r_ptr,#0] + adcs $a2,$a2,$ff + str $a1,[$r_ptr,#4] + adcs $a3,$a3,#0 + str $a2,[$r_ptr,#8] + adcs $a4,$a4,#0 + str $a3,[$r_ptr,#12] + adcs $a5,$a5,#0 + str $a4,[$r_ptr,#16] + adcs $a6,$a6,$ff,lsr#31 + str $a5,[$r_ptr,#20] + adcs $a7,$a7,$ff + str $a6,[$r_ptr,#24] + str $a7,[$r_ptr,#28] + + mov pc,lr +.size __ecp_nistz256_sub_morf,.-__ecp_nistz256_sub_morf + +.type __ecp_nistz256_add_self,%function +.align 4 +__ecp_nistz256_add_self: + adds $a0,$a0,$a0 @ a[0:7]+=a[0:7] + adcs $a1,$a1,$a1 + adcs $a2,$a2,$a2 + adcs $a3,$a3,$a3 + adcs $a4,$a4,$a4 + adcs $a5,$a5,$a5 + adcs $a6,$a6,$a6 + mov $ff,#0 + adcs $a7,$a7,$a7 + adc $ff,$ff,#0 + + @ if a+b >= modulus, subtract modulus. + @ + @ But since comparison implies subtraction, we subtract + @ modulus and then add it back if subtraction borrowed. + + subs $a0,$a0,#-1 + sbcs $a1,$a1,#-1 + sbcs $a2,$a2,#-1 + sbcs $a3,$a3,#0 + sbcs $a4,$a4,#0 + sbcs $a5,$a5,#0 + sbcs $a6,$a6,#1 + sbcs $a7,$a7,#-1 + sbc $ff,$ff,#0 + + @ Note that because mod has special form, i.e. consists of + @ 0xffffffff, 1 and 0s, we can conditionally synthesize it by + @ using value of borrow as a whole or extracting single bit. + @ Follow $ff register... + + adds $a0,$a0,$ff @ add synthesized modulus + adcs $a1,$a1,$ff + str $a0,[$r_ptr,#0] + adcs $a2,$a2,$ff + str $a1,[$r_ptr,#4] + adcs $a3,$a3,#0 + str $a2,[$r_ptr,#8] + adcs $a4,$a4,#0 + str $a3,[$r_ptr,#12] + adcs $a5,$a5,#0 + str $a4,[$r_ptr,#16] + adcs $a6,$a6,$ff,lsr#31 + str $a5,[$r_ptr,#20] + adcs $a7,$a7,$ff + str $a6,[$r_ptr,#24] + str $a7,[$r_ptr,#28] + + mov pc,lr +.size __ecp_nistz256_add_self,.-__ecp_nistz256_add_self + +___ + +######################################################################## +# following subroutines are "literal" implementation of those found in +# ecp_nistz256.c +# +######################################################################## +# void ecp_nistz256_point_double(P256_POINT *out,const P256_POINT *inp); +# +{ +my ($S,$M,$Zsqr,$in_x,$tmp0)=map(32*$_,(0..4)); +# above map() describes stack layout with 5 temporary +# 256-bit vectors on top. 
Then note that we push +# starting from r0, which means that we have copy of +# input arguments just below these temporary vectors. + +$code.=<<___; +.globl ecp_nistz256_point_double +.type ecp_nistz256_point_double,%function +.align 5 +ecp_nistz256_point_double: + stmdb sp!,{r0-r12,lr} @ push from r0, unusual, but intentional + sub sp,sp,#32*5 + +.Lpoint_double_shortcut: + add r3,sp,#$in_x + ldmia $a_ptr!,{r4-r11} @ copy in_x + stmia r3,{r4-r11} + + add $r_ptr,sp,#$S + bl __ecp_nistz256_mul_by_2 @ p256_mul_by_2(S, in_y); + + add $b_ptr,$a_ptr,#32 + add $a_ptr,$a_ptr,#32 + add $r_ptr,sp,#$Zsqr + bl __ecp_nistz256_mul_mont @ p256_sqr_mont(Zsqr, in_z); + + add $a_ptr,sp,#$S + add $b_ptr,sp,#$S + add $r_ptr,sp,#$S + bl __ecp_nistz256_mul_mont @ p256_sqr_mont(S, S); + + ldr $b_ptr,[sp,#32*5+4] + add $a_ptr,$b_ptr,#32 + add $b_ptr,$b_ptr,#64 + add $r_ptr,sp,#$tmp0 + bl __ecp_nistz256_mul_mont @ p256_mul_mont(tmp0, in_z, in_y); + + ldr $r_ptr,[sp,#32*5] + add $r_ptr,$r_ptr,#64 + bl __ecp_nistz256_add_self @ p256_mul_by_2(res_z, tmp0); + + add $a_ptr,sp,#$in_x + add $b_ptr,sp,#$Zsqr + add $r_ptr,sp,#$M + bl __ecp_nistz256_add @ p256_add(M, in_x, Zsqr); + + add $a_ptr,sp,#$in_x + add $b_ptr,sp,#$Zsqr + add $r_ptr,sp,#$Zsqr + bl __ecp_nistz256_sub @ p256_sub(Zsqr, in_x, Zsqr); + + add $a_ptr,sp,#$S + add $b_ptr,sp,#$S + add $r_ptr,sp,#$tmp0 + bl __ecp_nistz256_mul_mont @ p256_sqr_mont(tmp0, S); + + add $a_ptr,sp,#$Zsqr + add $b_ptr,sp,#$M + add $r_ptr,sp,#$M + bl __ecp_nistz256_mul_mont @ p256_mul_mont(M, M, Zsqr); + + ldr $r_ptr,[sp,#32*5] + add $a_ptr,sp,#$tmp0 + add $r_ptr,$r_ptr,#32 + bl __ecp_nistz256_div_by_2 @ p256_div_by_2(res_y, tmp0); + + add $a_ptr,sp,#$M + add $r_ptr,sp,#$M + bl __ecp_nistz256_mul_by_3 @ p256_mul_by_3(M, M); + + add $a_ptr,sp,#$in_x + add $b_ptr,sp,#$S + add $r_ptr,sp,#$S + bl __ecp_nistz256_mul_mont @ p256_mul_mont(S, S, in_x); + + add $r_ptr,sp,#$tmp0 + bl __ecp_nistz256_add_self @ p256_mul_by_2(tmp0, S); + + ldr $r_ptr,[sp,#32*5] + add $a_ptr,sp,#$M + add $b_ptr,sp,#$M + bl __ecp_nistz256_mul_mont @ p256_sqr_mont(res_x, M); + + add $b_ptr,sp,#$tmp0 + bl __ecp_nistz256_sub_from @ p256_sub(res_x, res_x, tmp0); + + add $b_ptr,sp,#$S + add $r_ptr,sp,#$S + bl __ecp_nistz256_sub_morf @ p256_sub(S, S, res_x); + + add $a_ptr,sp,#$M + add $b_ptr,sp,#$S + bl __ecp_nistz256_mul_mont @ p256_mul_mont(S, S, M); + + ldr $r_ptr,[sp,#32*5] + add $b_ptr,$r_ptr,#32 + add $r_ptr,$r_ptr,#32 + bl __ecp_nistz256_sub_from @ p256_sub(res_y, S, res_y); + + add sp,sp,#32*5+16 @ +16 means "skip even over saved r0-r3" +#if __ARM_ARCH__>=5 || !defined(__thumb__) + ldmia sp!,{r4-r12,pc} +#else + ldmia sp!,{r4-r12,lr} + bx lr @ interoperable with Thumb ISA:-) +#endif +.size ecp_nistz256_point_double,.-ecp_nistz256_point_double +___ +} + +######################################################################## +# void ecp_nistz256_point_add(P256_POINT *out,const P256_POINT *in1, +# const P256_POINT *in2); +{ +my ($res_x,$res_y,$res_z, + $in1_x,$in1_y,$in1_z, + $in2_x,$in2_y,$in2_z, + $H,$Hsqr,$R,$Rsqr,$Hcub, + $U1,$U2,$S1,$S2)=map(32*$_,(0..17)); +my ($Z1sqr, $Z2sqr) = ($Hsqr, $Rsqr); +# above map() describes stack layout with 18 temporary +# 256-bit vectors on top. Then note that we push +# starting from r0, which means that we have copy of +# input arguments just below these temporary vectors. +# We use three of them for !in1infty, !in2intfy and +# result of check for zero. 
+ +$code.=<<___; +.globl ecp_nistz256_point_add +.type ecp_nistz256_point_add,%function +.align 5 +ecp_nistz256_point_add: + stmdb sp!,{r0-r12,lr} @ push from r0, unusual, but intentional + sub sp,sp,#32*18+16 + + ldmia $b_ptr!,{r4-r11} @ copy in2_x + add r3,sp,#$in2_x + stmia r3!,{r4-r11} + ldmia $b_ptr!,{r4-r11} @ copy in2_y + stmia r3!,{r4-r11} + ldmia $b_ptr,{r4-r11} @ copy in2_z + orr r12,r4,r5 + orr r12,r12,r6 + orr r12,r12,r7 + orr r12,r12,r8 + orr r12,r12,r9 + orr r12,r12,r10 + orr r12,r12,r11 + cmp r12,#0 +#ifdef __thumb2__ + it ne +#endif + movne r12,#-1 + stmia r3,{r4-r11} + str r12,[sp,#32*18+8] @ !in2infty + + ldmia $a_ptr!,{r4-r11} @ copy in1_x + add r3,sp,#$in1_x + stmia r3!,{r4-r11} + ldmia $a_ptr!,{r4-r11} @ copy in1_y + stmia r3!,{r4-r11} + ldmia $a_ptr,{r4-r11} @ copy in1_z + orr r12,r4,r5 + orr r12,r12,r6 + orr r12,r12,r7 + orr r12,r12,r8 + orr r12,r12,r9 + orr r12,r12,r10 + orr r12,r12,r11 + cmp r12,#0 +#ifdef __thumb2__ + it ne +#endif + movne r12,#-1 + stmia r3,{r4-r11} + str r12,[sp,#32*18+4] @ !in1infty + + add $a_ptr,sp,#$in2_z + add $b_ptr,sp,#$in2_z + add $r_ptr,sp,#$Z2sqr + bl __ecp_nistz256_mul_mont @ p256_sqr_mont(Z2sqr, in2_z); + + add $a_ptr,sp,#$in1_z + add $b_ptr,sp,#$in1_z + add $r_ptr,sp,#$Z1sqr + bl __ecp_nistz256_mul_mont @ p256_sqr_mont(Z1sqr, in1_z); + + add $a_ptr,sp,#$in2_z + add $b_ptr,sp,#$Z2sqr + add $r_ptr,sp,#$S1 + bl __ecp_nistz256_mul_mont @ p256_mul_mont(S1, Z2sqr, in2_z); + + add $a_ptr,sp,#$in1_z + add $b_ptr,sp,#$Z1sqr + add $r_ptr,sp,#$S2 + bl __ecp_nistz256_mul_mont @ p256_mul_mont(S2, Z1sqr, in1_z); + + add $a_ptr,sp,#$in1_y + add $b_ptr,sp,#$S1 + add $r_ptr,sp,#$S1 + bl __ecp_nistz256_mul_mont @ p256_mul_mont(S1, S1, in1_y); + + add $a_ptr,sp,#$in2_y + add $b_ptr,sp,#$S2 + add $r_ptr,sp,#$S2 + bl __ecp_nistz256_mul_mont @ p256_mul_mont(S2, S2, in2_y); + + add $b_ptr,sp,#$S1 + add $r_ptr,sp,#$R + bl __ecp_nistz256_sub_from @ p256_sub(R, S2, S1); + + orr $a0,$a0,$a1 @ see if result is zero + orr $a2,$a2,$a3 + orr $a4,$a4,$a5 + orr $a0,$a0,$a2 + orr $a4,$a4,$a6 + orr $a0,$a0,$a7 + add $a_ptr,sp,#$in1_x + orr $a0,$a0,$a4 + add $b_ptr,sp,#$Z2sqr + str $a0,[sp,#32*18+12] + + add $r_ptr,sp,#$U1 + bl __ecp_nistz256_mul_mont @ p256_mul_mont(U1, in1_x, Z2sqr); + + add $a_ptr,sp,#$in2_x + add $b_ptr,sp,#$Z1sqr + add $r_ptr,sp,#$U2 + bl __ecp_nistz256_mul_mont @ p256_mul_mont(U2, in2_x, Z1sqr); + + add $b_ptr,sp,#$U1 + add $r_ptr,sp,#$H + bl __ecp_nistz256_sub_from @ p256_sub(H, U2, U1); + + orr $a0,$a0,$a1 @ see if result is zero + orr $a2,$a2,$a3 + orr $a4,$a4,$a5 + orr $a0,$a0,$a2 + orr $a4,$a4,$a6 + orr $a0,$a0,$a7 + orrs $a0,$a0,$a4 + + bne .Ladd_proceed @ is_equal(U1,U2)? + + ldr $t0,[sp,#32*18+4] + ldr $t1,[sp,#32*18+8] + ldr $t2,[sp,#32*18+12] + tst $t0,$t1 + beq .Ladd_proceed @ (in1infty || in2infty)? + tst $t2,$t2 + beq .Ladd_double @ is_equal(S1,S2)? 
+ + ldr $r_ptr,[sp,#32*18+16] + eor r4,r4,r4 + eor r5,r5,r5 + eor r6,r6,r6 + eor r7,r7,r7 + eor r8,r8,r8 + eor r9,r9,r9 + eor r10,r10,r10 + eor r11,r11,r11 + stmia $r_ptr!,{r4-r11} + stmia $r_ptr!,{r4-r11} + stmia $r_ptr!,{r4-r11} + b .Ladd_done + +.align 4 +.Ladd_double: + ldr $a_ptr,[sp,#32*18+20] + add sp,sp,#32*(18-5)+16 @ difference in frame sizes + b .Lpoint_double_shortcut + +.align 4 +.Ladd_proceed: + add $a_ptr,sp,#$R + add $b_ptr,sp,#$R + add $r_ptr,sp,#$Rsqr + bl __ecp_nistz256_mul_mont @ p256_sqr_mont(Rsqr, R); + + add $a_ptr,sp,#$H + add $b_ptr,sp,#$in1_z + add $r_ptr,sp,#$res_z + bl __ecp_nistz256_mul_mont @ p256_mul_mont(res_z, H, in1_z); + + add $a_ptr,sp,#$H + add $b_ptr,sp,#$H + add $r_ptr,sp,#$Hsqr + bl __ecp_nistz256_mul_mont @ p256_sqr_mont(Hsqr, H); + + add $a_ptr,sp,#$in2_z + add $b_ptr,sp,#$res_z + add $r_ptr,sp,#$res_z + bl __ecp_nistz256_mul_mont @ p256_mul_mont(res_z, res_z, in2_z); + + add $a_ptr,sp,#$H + add $b_ptr,sp,#$Hsqr + add $r_ptr,sp,#$Hcub + bl __ecp_nistz256_mul_mont @ p256_mul_mont(Hcub, Hsqr, H); + + add $a_ptr,sp,#$Hsqr + add $b_ptr,sp,#$U1 + add $r_ptr,sp,#$U2 + bl __ecp_nistz256_mul_mont @ p256_mul_mont(U2, U1, Hsqr); + + add $r_ptr,sp,#$Hsqr + bl __ecp_nistz256_add_self @ p256_mul_by_2(Hsqr, U2); + + add $b_ptr,sp,#$Rsqr + add $r_ptr,sp,#$res_x + bl __ecp_nistz256_sub_morf @ p256_sub(res_x, Rsqr, Hsqr); + + add $b_ptr,sp,#$Hcub + bl __ecp_nistz256_sub_from @ p256_sub(res_x, res_x, Hcub); + + add $b_ptr,sp,#$U2 + add $r_ptr,sp,#$res_y + bl __ecp_nistz256_sub_morf @ p256_sub(res_y, U2, res_x); + + add $a_ptr,sp,#$Hcub + add $b_ptr,sp,#$S1 + add $r_ptr,sp,#$S2 + bl __ecp_nistz256_mul_mont @ p256_mul_mont(S2, S1, Hcub); + + add $a_ptr,sp,#$R + add $b_ptr,sp,#$res_y + add $r_ptr,sp,#$res_y + bl __ecp_nistz256_mul_mont @ p256_mul_mont(res_y, res_y, R); + + add $b_ptr,sp,#$S2 + bl __ecp_nistz256_sub_from @ p256_sub(res_y, res_y, S2); + + ldr r11,[sp,#32*18+4] @ !in1intfy + ldr r12,[sp,#32*18+8] @ !in2intfy + add r1,sp,#$res_x + add r2,sp,#$in2_x + and r10,r11,r12 + mvn r11,r11 + add r3,sp,#$in1_x + and r11,r11,r12 + mvn r12,r12 + ldr $r_ptr,[sp,#32*18+16] +___ +for($i=0;$i<96;$i+=8) { # conditional moves +$code.=<<___; + ldmia r1!,{r4-r5} @ res_x + ldmia r2!,{r6-r7} @ in2_x + ldmia r3!,{r8-r9} @ in1_x + and r4,r4,r10 + and r5,r5,r10 + and r6,r6,r11 + and r7,r7,r11 + and r8,r8,r12 + and r9,r9,r12 + orr r4,r4,r6 + orr r5,r5,r7 + orr r4,r4,r8 + orr r5,r5,r9 + stmia $r_ptr!,{r4-r5} +___ +} +$code.=<<___; +.Ladd_done: + add sp,sp,#32*18+16+16 @ +16 means "skip even over saved r0-r3" +#if __ARM_ARCH__>=5 || !defined(__thumb__) + ldmia sp!,{r4-r12,pc} +#else + ldmia sp!,{r4-r12,lr} + bx lr @ interoperable with Thumb ISA:-) +#endif +.size ecp_nistz256_point_add,.-ecp_nistz256_point_add +___ +} + +######################################################################## +# void ecp_nistz256_point_add_affine(P256_POINT *out,const P256_POINT *in1, +# const P256_POINT_AFFINE *in2); +{ +my ($res_x,$res_y,$res_z, + $in1_x,$in1_y,$in1_z, + $in2_x,$in2_y, + $U2,$S2,$H,$R,$Hsqr,$Hcub,$Rsqr)=map(32*$_,(0..14)); +my $Z1sqr = $S2; +# above map() describes stack layout with 18 temporary +# 256-bit vectors on top. Then note that we push +# starting from r0, which means that we have copy of +# input arguments just below these temporary vectors. +# We use two of them for !in1infty, !in2intfy. 
+ +my @ONE_mont=(1,0,0,-1,-1,-1,-2,0); + +$code.=<<___; +.globl ecp_nistz256_point_add_affine +.type ecp_nistz256_point_add_affine,%function +.align 5 +ecp_nistz256_point_add_affine: + stmdb sp!,{r0-r12,lr} @ push from r0, unusual, but intentional + sub sp,sp,#32*15 + + ldmia $a_ptr!,{r4-r11} @ copy in1_x + add r3,sp,#$in1_x + stmia r3!,{r4-r11} + ldmia $a_ptr!,{r4-r11} @ copy in1_y + stmia r3!,{r4-r11} + ldmia $a_ptr,{r4-r11} @ copy in1_z + orr r12,r4,r5 + orr r12,r12,r6 + orr r12,r12,r7 + orr r12,r12,r8 + orr r12,r12,r9 + orr r12,r12,r10 + orr r12,r12,r11 + cmp r12,#0 +#ifdef __thumb2__ + it ne +#endif + movne r12,#-1 + stmia r3,{r4-r11} + str r12,[sp,#32*15+4] @ !in1infty + + ldmia $b_ptr!,{r4-r11} @ copy in2_x + add r3,sp,#$in2_x + orr r12,r4,r5 + orr r12,r12,r6 + orr r12,r12,r7 + orr r12,r12,r8 + orr r12,r12,r9 + orr r12,r12,r10 + orr r12,r12,r11 + stmia r3!,{r4-r11} + ldmia $b_ptr!,{r4-r11} @ copy in2_y + orr r12,r12,r4 + orr r12,r12,r5 + orr r12,r12,r6 + orr r12,r12,r7 + orr r12,r12,r8 + orr r12,r12,r9 + orr r12,r12,r10 + orr r12,r12,r11 + stmia r3!,{r4-r11} + cmp r12,#0 +#ifdef __thumb2__ + it ne +#endif + movne r12,#-1 + str r12,[sp,#32*15+8] @ !in2infty + + add $a_ptr,sp,#$in1_z + add $b_ptr,sp,#$in1_z + add $r_ptr,sp,#$Z1sqr + bl __ecp_nistz256_mul_mont @ p256_sqr_mont(Z1sqr, in1_z); + + add $a_ptr,sp,#$Z1sqr + add $b_ptr,sp,#$in2_x + add $r_ptr,sp,#$U2 + bl __ecp_nistz256_mul_mont @ p256_mul_mont(U2, Z1sqr, in2_x); + + add $b_ptr,sp,#$in1_x + add $r_ptr,sp,#$H + bl __ecp_nistz256_sub_from @ p256_sub(H, U2, in1_x); + + add $a_ptr,sp,#$Z1sqr + add $b_ptr,sp,#$in1_z + add $r_ptr,sp,#$S2 + bl __ecp_nistz256_mul_mont @ p256_mul_mont(S2, Z1sqr, in1_z); + + add $a_ptr,sp,#$H + add $b_ptr,sp,#$in1_z + add $r_ptr,sp,#$res_z + bl __ecp_nistz256_mul_mont @ p256_mul_mont(res_z, H, in1_z); + + add $a_ptr,sp,#$in2_y + add $b_ptr,sp,#$S2 + add $r_ptr,sp,#$S2 + bl __ecp_nistz256_mul_mont @ p256_mul_mont(S2, S2, in2_y); + + add $b_ptr,sp,#$in1_y + add $r_ptr,sp,#$R + bl __ecp_nistz256_sub_from @ p256_sub(R, S2, in1_y); + + add $a_ptr,sp,#$H + add $b_ptr,sp,#$H + add $r_ptr,sp,#$Hsqr + bl __ecp_nistz256_mul_mont @ p256_sqr_mont(Hsqr, H); + + add $a_ptr,sp,#$R + add $b_ptr,sp,#$R + add $r_ptr,sp,#$Rsqr + bl __ecp_nistz256_mul_mont @ p256_sqr_mont(Rsqr, R); + + add $a_ptr,sp,#$H + add $b_ptr,sp,#$Hsqr + add $r_ptr,sp,#$Hcub + bl __ecp_nistz256_mul_mont @ p256_mul_mont(Hcub, Hsqr, H); + + add $a_ptr,sp,#$Hsqr + add $b_ptr,sp,#$in1_x + add $r_ptr,sp,#$U2 + bl __ecp_nistz256_mul_mont @ p256_mul_mont(U2, in1_x, Hsqr); + + add $r_ptr,sp,#$Hsqr + bl __ecp_nistz256_add_self @ p256_mul_by_2(Hsqr, U2); + + add $b_ptr,sp,#$Rsqr + add $r_ptr,sp,#$res_x + bl __ecp_nistz256_sub_morf @ p256_sub(res_x, Rsqr, Hsqr); + + add $b_ptr,sp,#$Hcub + bl __ecp_nistz256_sub_from @ p256_sub(res_x, res_x, Hcub); + + add $b_ptr,sp,#$U2 + add $r_ptr,sp,#$res_y + bl __ecp_nistz256_sub_morf @ p256_sub(res_y, U2, res_x); + + add $a_ptr,sp,#$Hcub + add $b_ptr,sp,#$in1_y + add $r_ptr,sp,#$S2 + bl __ecp_nistz256_mul_mont @ p256_mul_mont(S2, in1_y, Hcub); + + add $a_ptr,sp,#$R + add $b_ptr,sp,#$res_y + add $r_ptr,sp,#$res_y + bl __ecp_nistz256_mul_mont @ p256_mul_mont(res_y, res_y, R); + + add $b_ptr,sp,#$S2 + bl __ecp_nistz256_sub_from @ p256_sub(res_y, res_y, S2); + + ldr r11,[sp,#32*15+4] @ !in1intfy + ldr r12,[sp,#32*15+8] @ !in2intfy + add r1,sp,#$res_x + add r2,sp,#$in2_x + and r10,r11,r12 + mvn r11,r11 + add r3,sp,#$in1_x + and r11,r11,r12 + mvn r12,r12 + ldr $r_ptr,[sp,#32*15] +___ +for($i=0;$i<64;$i+=8) { # conditional moves 
+$code.=<<___; + ldmia r1!,{r4-r5} @ res_x + ldmia r2!,{r6-r7} @ in2_x + ldmia r3!,{r8-r9} @ in1_x + and r4,r4,r10 + and r5,r5,r10 + and r6,r6,r11 + and r7,r7,r11 + and r8,r8,r12 + and r9,r9,r12 + orr r4,r4,r6 + orr r5,r5,r7 + orr r4,r4,r8 + orr r5,r5,r9 + stmia $r_ptr!,{r4-r5} +___ +} +for(;$i<96;$i+=8) { +my $j=($i-64)/4; +$code.=<<___; + ldmia r1!,{r4-r5} @ res_z + ldmia r3!,{r8-r9} @ in1_z + and r4,r4,r10 + and r5,r5,r10 + and r6,r11,#@ONE_mont[$j] + and r7,r11,#@ONE_mont[$j+1] + and r8,r8,r12 + and r9,r9,r12 + orr r4,r4,r6 + orr r5,r5,r7 + orr r4,r4,r8 + orr r5,r5,r9 + stmia $r_ptr!,{r4-r5} +___ +} +$code.=<<___; + add sp,sp,#32*15+16 @ +16 means "skip even over saved r0-r3" +#if __ARM_ARCH__>=5 || !defined(__thumb__) + ldmia sp!,{r4-r12,pc} +#else + ldmia sp!,{r4-r12,lr} + bx lr @ interoperable with Thumb ISA:-) +#endif +.size ecp_nistz256_point_add_affine,.-ecp_nistz256_point_add_affine +___ +} }}} + +foreach (split("\n",$code)) { + s/\`([^\`]*)\`/eval $1/geo; + + s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo; + + print $_,"\n"; +} +close STDOUT; # enforce flush diff --git a/crypto/ec/asm/ecp_nistz256-armv8.pl b/crypto/ec/asm/ecp_nistz256-armv8.pl new file mode 100755 index 000000000000..1361cb395ffb --- /dev/null +++ b/crypto/ec/asm/ecp_nistz256-armv8.pl @@ -0,0 +1,1857 @@ +#! /usr/bin/env perl +# Copyright 2015-2018 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + + +# ==================================================================== +# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== +# +# ECP_NISTZ256 module for ARMv8. +# +# February 2015. +# +# Original ECP_NISTZ256 submission targeting x86_64 is detailed in +# http://eprint.iacr.org/2013/816. +# +# with/without -DECP_NISTZ256_ASM +# Apple A7 +190-360% +# Cortex-A53 +190-400% +# Cortex-A57 +190-350% +# Denver +230-400% +# +# Ranges denote minimum and maximum improvement coefficients depending +# on benchmark. Lower coefficients are for ECDSA sign, server-side +# operation. Keep in mind that +400% means 5x improvement. 
+ +$flavour = shift; +while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or +( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or +die "can't locate arm-xlate.pl"; + +open OUT,"| \"$^X\" $xlate $flavour $output"; +*STDOUT=*OUT; + +{ +my ($rp,$ap,$bp,$bi,$a0,$a1,$a2,$a3,$t0,$t1,$t2,$t3,$poly1,$poly3, + $acc0,$acc1,$acc2,$acc3,$acc4,$acc5) = + map("x$_",(0..17,19,20)); + +my ($acc6,$acc7)=($ap,$bp); # used in __ecp_nistz256_sqr_mont + +$code.=<<___; +#include "arm_arch.h" + +.text +___ +######################################################################## +# Convert ecp_nistz256_table.c to layout expected by ecp_nistz_gather_w7 +# +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +open TABLE,"<ecp_nistz256_table.c" or +open TABLE,"<${dir}../ecp_nistz256_table.c" or +die "failed to open ecp_nistz256_table.c:",$!; + +use integer; + +foreach(<TABLE>) { + s/TOBN\(\s*(0x[0-9a-f]+),\s*(0x[0-9a-f]+)\s*\)/push @arr,hex($2),hex($1)/geo; +} +close TABLE; + +# See ecp_nistz256_table.c for explanation for why it's 64*16*37. +# 64*16*37-1 is because $#arr returns last valid index or @arr, not +# amount of elements. +die "insane number of elements" if ($#arr != 64*16*37-1); + +$code.=<<___; +.globl ecp_nistz256_precomputed +.type ecp_nistz256_precomputed,%object +.align 12 +ecp_nistz256_precomputed: +___ +######################################################################## +# this conversion smashes P256_POINT_AFFINE by individual bytes with +# 64 byte interval, similar to +# 1111222233334444 +# 1234123412341234 +for(1..37) { + @tbl = splice(@arr,0,64*16); + for($i=0;$i<64;$i++) { + undef @line; + for($j=0;$j<64;$j++) { + push @line,(@tbl[$j*16+$i/4]>>(($i%4)*8))&0xff; + } + $code.=".byte\t"; + $code.=join(',',map { sprintf "0x%02x",$_} @line); + $code.="\n"; + } +} +$code.=<<___; +.size ecp_nistz256_precomputed,.-ecp_nistz256_precomputed +.align 5 +.Lpoly: +.quad 0xffffffffffffffff,0x00000000ffffffff,0x0000000000000000,0xffffffff00000001 +.LRR: // 2^512 mod P precomputed for NIST P256 polynomial +.quad 0x0000000000000003,0xfffffffbffffffff,0xfffffffffffffffe,0x00000004fffffffd +.Lone_mont: +.quad 0x0000000000000001,0xffffffff00000000,0xffffffffffffffff,0x00000000fffffffe +.Lone: +.quad 1,0,0,0 +.Lord: +.quad 0xf3b9cac2fc632551,0xbce6faada7179e84,0xffffffffffffffff,0xffffffff00000000 +.LordK: +.quad 0xccd1c8aaee00bc4f +.asciz "ECP_NISTZ256 for ARMv8, CRYPTOGAMS by <appro\@openssl.org>" + +// void ecp_nistz256_to_mont(BN_ULONG x0[4],const BN_ULONG x1[4]); +.globl ecp_nistz256_to_mont +.type ecp_nistz256_to_mont,%function +.align 6 +ecp_nistz256_to_mont: + stp x29,x30,[sp,#-32]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + + ldr $bi,.LRR // bp[0] + ldp $a0,$a1,[$ap] + ldp $a2,$a3,[$ap,#16] + ldr $poly1,.Lpoly+8 + ldr $poly3,.Lpoly+24 + adr $bp,.LRR // &bp[0] + + bl __ecp_nistz256_mul_mont + + ldp x19,x20,[sp,#16] + ldp x29,x30,[sp],#32 + ret +.size ecp_nistz256_to_mont,.-ecp_nistz256_to_mont + +// void ecp_nistz256_from_mont(BN_ULONG x0[4],const BN_ULONG x1[4]); +.globl ecp_nistz256_from_mont +.type ecp_nistz256_from_mont,%function +.align 4 +ecp_nistz256_from_mont: + stp x29,x30,[sp,#-32]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + + mov $bi,#1 // bp[0] + ldp $a0,$a1,[$ap] + ldp $a2,$a3,[$ap,#16] + ldr $poly1,.Lpoly+8 + ldr $poly3,.Lpoly+24 + adr $bp,.Lone // &bp[0] + + bl __ecp_nistz256_mul_mont + + ldp x19,x20,[sp,#16] + ldp x29,x30,[sp],#32 + ret +.size ecp_nistz256_from_mont,.-ecp_nistz256_from_mont + +// void ecp_nistz256_mul_mont(BN_ULONG x0[4],const BN_ULONG x1[4], +// const BN_ULONG x2[4]); +.globl ecp_nistz256_mul_mont +.type ecp_nistz256_mul_mont,%function +.align 4 +ecp_nistz256_mul_mont: + stp x29,x30,[sp,#-32]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + + ldr $bi,[$bp] // bp[0] + ldp $a0,$a1,[$ap] + ldp $a2,$a3,[$ap,#16] + ldr $poly1,.Lpoly+8 + ldr $poly3,.Lpoly+24 + + bl __ecp_nistz256_mul_mont + + ldp x19,x20,[sp,#16] + ldp x29,x30,[sp],#32 + ret +.size ecp_nistz256_mul_mont,.-ecp_nistz256_mul_mont + +// void ecp_nistz256_sqr_mont(BN_ULONG x0[4],const BN_ULONG x1[4]); +.globl ecp_nistz256_sqr_mont +.type ecp_nistz256_sqr_mont,%function +.align 4 +ecp_nistz256_sqr_mont: + stp x29,x30,[sp,#-32]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + + ldp $a0,$a1,[$ap] + ldp $a2,$a3,[$ap,#16] + ldr $poly1,.Lpoly+8 + ldr $poly3,.Lpoly+24 + + bl __ecp_nistz256_sqr_mont + + ldp x19,x20,[sp,#16] + ldp x29,x30,[sp],#32 + ret +.size ecp_nistz256_sqr_mont,.-ecp_nistz256_sqr_mont + +// void ecp_nistz256_add(BN_ULONG x0[4],const BN_ULONG x1[4], +// const BN_ULONG x2[4]); +.globl ecp_nistz256_add +.type ecp_nistz256_add,%function +.align 4 +ecp_nistz256_add: + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + + ldp $acc0,$acc1,[$ap] + ldp $t0,$t1,[$bp] + ldp $acc2,$acc3,[$ap,#16] + ldp $t2,$t3,[$bp,#16] + ldr $poly1,.Lpoly+8 + ldr $poly3,.Lpoly+24 + + bl __ecp_nistz256_add + + ldp x29,x30,[sp],#16 + ret +.size ecp_nistz256_add,.-ecp_nistz256_add + +// void ecp_nistz256_div_by_2(BN_ULONG x0[4],const BN_ULONG x1[4]); +.globl ecp_nistz256_div_by_2 +.type ecp_nistz256_div_by_2,%function +.align 4 +ecp_nistz256_div_by_2: + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + + ldp $acc0,$acc1,[$ap] + ldp $acc2,$acc3,[$ap,#16] + ldr $poly1,.Lpoly+8 + ldr $poly3,.Lpoly+24 + + bl __ecp_nistz256_div_by_2 + + ldp x29,x30,[sp],#16 + ret +.size ecp_nistz256_div_by_2,.-ecp_nistz256_div_by_2 + +// void ecp_nistz256_mul_by_2(BN_ULONG x0[4],const BN_ULONG x1[4]); +.globl ecp_nistz256_mul_by_2 +.type ecp_nistz256_mul_by_2,%function +.align 4 +ecp_nistz256_mul_by_2: + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + + ldp $acc0,$acc1,[$ap] + ldp $acc2,$acc3,[$ap,#16] + ldr $poly1,.Lpoly+8 + ldr $poly3,.Lpoly+24 + mov $t0,$acc0 + mov $t1,$acc1 + mov $t2,$acc2 + mov $t3,$acc3 + + bl __ecp_nistz256_add // ret = a+a // 2*a + + ldp x29,x30,[sp],#16 + ret +.size ecp_nistz256_mul_by_2,.-ecp_nistz256_mul_by_2 + +// void ecp_nistz256_mul_by_3(BN_ULONG x0[4],const BN_ULONG x1[4]); +.globl ecp_nistz256_mul_by_3 +.type ecp_nistz256_mul_by_3,%function +.align 4 +ecp_nistz256_mul_by_3: + stp x29,x30,[sp,#-16]! 
+ add x29,sp,#0 + + ldp $acc0,$acc1,[$ap] + ldp $acc2,$acc3,[$ap,#16] + ldr $poly1,.Lpoly+8 + ldr $poly3,.Lpoly+24 + mov $t0,$acc0 + mov $t1,$acc1 + mov $t2,$acc2 + mov $t3,$acc3 + mov $a0,$acc0 + mov $a1,$acc1 + mov $a2,$acc2 + mov $a3,$acc3 + + bl __ecp_nistz256_add // ret = a+a // 2*a + + mov $t0,$a0 + mov $t1,$a1 + mov $t2,$a2 + mov $t3,$a3 + + bl __ecp_nistz256_add // ret += a // 2*a+a=3*a + + ldp x29,x30,[sp],#16 + ret +.size ecp_nistz256_mul_by_3,.-ecp_nistz256_mul_by_3 + +// void ecp_nistz256_sub(BN_ULONG x0[4],const BN_ULONG x1[4], +// const BN_ULONG x2[4]); +.globl ecp_nistz256_sub +.type ecp_nistz256_sub,%function +.align 4 +ecp_nistz256_sub: + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + + ldp $acc0,$acc1,[$ap] + ldp $acc2,$acc3,[$ap,#16] + ldr $poly1,.Lpoly+8 + ldr $poly3,.Lpoly+24 + + bl __ecp_nistz256_sub_from + + ldp x29,x30,[sp],#16 + ret +.size ecp_nistz256_sub,.-ecp_nistz256_sub + +// void ecp_nistz256_neg(BN_ULONG x0[4],const BN_ULONG x1[4]); +.globl ecp_nistz256_neg +.type ecp_nistz256_neg,%function +.align 4 +ecp_nistz256_neg: + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + + mov $bp,$ap + mov $acc0,xzr // a = 0 + mov $acc1,xzr + mov $acc2,xzr + mov $acc3,xzr + ldr $poly1,.Lpoly+8 + ldr $poly3,.Lpoly+24 + + bl __ecp_nistz256_sub_from + + ldp x29,x30,[sp],#16 + ret +.size ecp_nistz256_neg,.-ecp_nistz256_neg + +// note that __ecp_nistz256_mul_mont expects a[0-3] input pre-loaded +// to $a0-$a3 and b[0] - to $bi +.type __ecp_nistz256_mul_mont,%function +.align 4 +__ecp_nistz256_mul_mont: + mul $acc0,$a0,$bi // a[0]*b[0] + umulh $t0,$a0,$bi + + mul $acc1,$a1,$bi // a[1]*b[0] + umulh $t1,$a1,$bi + + mul $acc2,$a2,$bi // a[2]*b[0] + umulh $t2,$a2,$bi + + mul $acc3,$a3,$bi // a[3]*b[0] + umulh $t3,$a3,$bi + ldr $bi,[$bp,#8] // b[1] + + adds $acc1,$acc1,$t0 // accumulate high parts of multiplication + lsl $t0,$acc0,#32 + adcs $acc2,$acc2,$t1 + lsr $t1,$acc0,#32 + adcs $acc3,$acc3,$t2 + adc $acc4,xzr,$t3 + mov $acc5,xzr +___ +for($i=1;$i<4;$i++) { + # Reduction iteration is normally performed by accumulating + # result of multiplication of modulus by "magic" digit [and + # omitting least significant word, which is guaranteed to + # be 0], but thanks to special form of modulus and "magic" + # digit being equal to least significant word, it can be + # performed with additions and subtractions alone. 
Indeed: + # + # ffff0001.00000000.0000ffff.ffffffff + # * abcdefgh + # + xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.abcdefgh + # + # Now observing that ff..ff*x = (2^n-1)*x = 2^n*x-x, we + # rewrite above as: + # + # xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.abcdefgh + # + abcdefgh.abcdefgh.0000abcd.efgh0000.00000000 + # - 0000abcd.efgh0000.00000000.00000000.abcdefgh + # + # or marking redundant operations: + # + # xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.-------- + # + abcdefgh.abcdefgh.0000abcd.efgh0000.-------- + # - 0000abcd.efgh0000.--------.--------.-------- + +$code.=<<___; + subs $t2,$acc0,$t0 // "*0xffff0001" + sbc $t3,$acc0,$t1 + adds $acc0,$acc1,$t0 // +=acc[0]<<96 and omit acc[0] + mul $t0,$a0,$bi // lo(a[0]*b[i]) + adcs $acc1,$acc2,$t1 + mul $t1,$a1,$bi // lo(a[1]*b[i]) + adcs $acc2,$acc3,$t2 // +=acc[0]*0xffff0001 + mul $t2,$a2,$bi // lo(a[2]*b[i]) + adcs $acc3,$acc4,$t3 + mul $t3,$a3,$bi // lo(a[3]*b[i]) + adc $acc4,$acc5,xzr + + adds $acc0,$acc0,$t0 // accumulate low parts of multiplication + umulh $t0,$a0,$bi // hi(a[0]*b[i]) + adcs $acc1,$acc1,$t1 + umulh $t1,$a1,$bi // hi(a[1]*b[i]) + adcs $acc2,$acc2,$t2 + umulh $t2,$a2,$bi // hi(a[2]*b[i]) + adcs $acc3,$acc3,$t3 + umulh $t3,$a3,$bi // hi(a[3]*b[i]) + adc $acc4,$acc4,xzr +___ +$code.=<<___ if ($i<3); + ldr $bi,[$bp,#8*($i+1)] // b[$i+1] +___ +$code.=<<___; + adds $acc1,$acc1,$t0 // accumulate high parts of multiplication + lsl $t0,$acc0,#32 + adcs $acc2,$acc2,$t1 + lsr $t1,$acc0,#32 + adcs $acc3,$acc3,$t2 + adcs $acc4,$acc4,$t3 + adc $acc5,xzr,xzr +___ +} +$code.=<<___; + // last reduction + subs $t2,$acc0,$t0 // "*0xffff0001" + sbc $t3,$acc0,$t1 + adds $acc0,$acc1,$t0 // +=acc[0]<<96 and omit acc[0] + adcs $acc1,$acc2,$t1 + adcs $acc2,$acc3,$t2 // +=acc[0]*0xffff0001 + adcs $acc3,$acc4,$t3 + adc $acc4,$acc5,xzr + + adds $t0,$acc0,#1 // subs $t0,$acc0,#-1 // tmp = ret-modulus + sbcs $t1,$acc1,$poly1 + sbcs $t2,$acc2,xzr + sbcs $t3,$acc3,$poly3 + sbcs xzr,$acc4,xzr // did it borrow? + + csel $acc0,$acc0,$t0,lo // ret = borrow ? ret : ret-modulus + csel $acc1,$acc1,$t1,lo + csel $acc2,$acc2,$t2,lo + stp $acc0,$acc1,[$rp] + csel $acc3,$acc3,$t3,lo + stp $acc2,$acc3,[$rp,#16] + + ret +.size __ecp_nistz256_mul_mont,.-__ecp_nistz256_mul_mont + +// note that __ecp_nistz256_sqr_mont expects a[0-3] input pre-loaded +// to $a0-$a3 +.type __ecp_nistz256_sqr_mont,%function +.align 4 +__ecp_nistz256_sqr_mont: + // | | | | | |a1*a0| | + // | | | | |a2*a0| | | + // | |a3*a2|a3*a0| | | | + // | | | |a2*a1| | | | + // | | |a3*a1| | | | | + // *| | | | | | | | 2| + // +|a3*a3|a2*a2|a1*a1|a0*a0| + // |--+--+--+--+--+--+--+--| + // |A7|A6|A5|A4|A3|A2|A1|A0|, where Ax is $accx, i.e. follow $accx + // + // "can't overflow" below mark carrying into high part of + // multiplication result, which can't overflow, because it + // can never be all ones. 
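+	// The interleaved reductions below use the same special-form trick as
+	// __ecp_nistz256_mul_mont above: p == -1 (mod 2^64), so the Montgomery
+	// "magic" digit is acc[0] itself, and acc += acc[0]*p is performed by
+	// omitting acc[0], adding acc[0]<<96, and adding the 128-bit product
+	// acc[0]*0xffffffff00000001 (one subs/sbc pair) 192 bits up, because
+	// p = 2^256 - 2^224 + 2^192 + 2^96 - 1.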
+ + mul $acc1,$a1,$a0 // a[1]*a[0] + umulh $t1,$a1,$a0 + mul $acc2,$a2,$a0 // a[2]*a[0] + umulh $t2,$a2,$a0 + mul $acc3,$a3,$a0 // a[3]*a[0] + umulh $acc4,$a3,$a0 + + adds $acc2,$acc2,$t1 // accumulate high parts of multiplication + mul $t0,$a2,$a1 // a[2]*a[1] + umulh $t1,$a2,$a1 + adcs $acc3,$acc3,$t2 + mul $t2,$a3,$a1 // a[3]*a[1] + umulh $t3,$a3,$a1 + adc $acc4,$acc4,xzr // can't overflow + + mul $acc5,$a3,$a2 // a[3]*a[2] + umulh $acc6,$a3,$a2 + + adds $t1,$t1,$t2 // accumulate high parts of multiplication + mul $acc0,$a0,$a0 // a[0]*a[0] + adc $t2,$t3,xzr // can't overflow + + adds $acc3,$acc3,$t0 // accumulate low parts of multiplication + umulh $a0,$a0,$a0 + adcs $acc4,$acc4,$t1 + mul $t1,$a1,$a1 // a[1]*a[1] + adcs $acc5,$acc5,$t2 + umulh $a1,$a1,$a1 + adc $acc6,$acc6,xzr // can't overflow + + adds $acc1,$acc1,$acc1 // acc[1-6]*=2 + mul $t2,$a2,$a2 // a[2]*a[2] + adcs $acc2,$acc2,$acc2 + umulh $a2,$a2,$a2 + adcs $acc3,$acc3,$acc3 + mul $t3,$a3,$a3 // a[3]*a[3] + adcs $acc4,$acc4,$acc4 + umulh $a3,$a3,$a3 + adcs $acc5,$acc5,$acc5 + adcs $acc6,$acc6,$acc6 + adc $acc7,xzr,xzr + + adds $acc1,$acc1,$a0 // +a[i]*a[i] + adcs $acc2,$acc2,$t1 + adcs $acc3,$acc3,$a1 + adcs $acc4,$acc4,$t2 + adcs $acc5,$acc5,$a2 + lsl $t0,$acc0,#32 + adcs $acc6,$acc6,$t3 + lsr $t1,$acc0,#32 + adc $acc7,$acc7,$a3 +___ +for($i=0;$i<3;$i++) { # reductions, see commentary in + # multiplication for details +$code.=<<___; + subs $t2,$acc0,$t0 // "*0xffff0001" + sbc $t3,$acc0,$t1 + adds $acc0,$acc1,$t0 // +=acc[0]<<96 and omit acc[0] + adcs $acc1,$acc2,$t1 + lsl $t0,$acc0,#32 + adcs $acc2,$acc3,$t2 // +=acc[0]*0xffff0001 + lsr $t1,$acc0,#32 + adc $acc3,$t3,xzr // can't overflow +___ +} +$code.=<<___; + subs $t2,$acc0,$t0 // "*0xffff0001" + sbc $t3,$acc0,$t1 + adds $acc0,$acc1,$t0 // +=acc[0]<<96 and omit acc[0] + adcs $acc1,$acc2,$t1 + adcs $acc2,$acc3,$t2 // +=acc[0]*0xffff0001 + adc $acc3,$t3,xzr // can't overflow + + adds $acc0,$acc0,$acc4 // accumulate upper half + adcs $acc1,$acc1,$acc5 + adcs $acc2,$acc2,$acc6 + adcs $acc3,$acc3,$acc7 + adc $acc4,xzr,xzr + + adds $t0,$acc0,#1 // subs $t0,$acc0,#-1 // tmp = ret-modulus + sbcs $t1,$acc1,$poly1 + sbcs $t2,$acc2,xzr + sbcs $t3,$acc3,$poly3 + sbcs xzr,$acc4,xzr // did it borrow? + + csel $acc0,$acc0,$t0,lo // ret = borrow ? ret : ret-modulus + csel $acc1,$acc1,$t1,lo + csel $acc2,$acc2,$t2,lo + stp $acc0,$acc1,[$rp] + csel $acc3,$acc3,$t3,lo + stp $acc2,$acc3,[$rp,#16] + + ret +.size __ecp_nistz256_sqr_mont,.-__ecp_nistz256_sqr_mont + +// Note that __ecp_nistz256_add expects both input vectors pre-loaded to +// $a0-$a3 and $t0-$t3. This is done because it's used in multiple +// contexts, e.g. in multiplication by 2 and 3... +.type __ecp_nistz256_add,%function +.align 4 +__ecp_nistz256_add: + adds $acc0,$acc0,$t0 // ret = a+b + adcs $acc1,$acc1,$t1 + adcs $acc2,$acc2,$t2 + adcs $acc3,$acc3,$t3 + adc $ap,xzr,xzr // zap $ap + + adds $t0,$acc0,#1 // subs $t0,$a0,#-1 // tmp = ret-modulus + sbcs $t1,$acc1,$poly1 + sbcs $t2,$acc2,xzr + sbcs $t3,$acc3,$poly3 + sbcs xzr,$ap,xzr // did subtraction borrow? + + csel $acc0,$acc0,$t0,lo // ret = borrow ? 
ret : ret-modulus + csel $acc1,$acc1,$t1,lo + csel $acc2,$acc2,$t2,lo + stp $acc0,$acc1,[$rp] + csel $acc3,$acc3,$t3,lo + stp $acc2,$acc3,[$rp,#16] + + ret +.size __ecp_nistz256_add,.-__ecp_nistz256_add + +.type __ecp_nistz256_sub_from,%function +.align 4 +__ecp_nistz256_sub_from: + ldp $t0,$t1,[$bp] + ldp $t2,$t3,[$bp,#16] + subs $acc0,$acc0,$t0 // ret = a-b + sbcs $acc1,$acc1,$t1 + sbcs $acc2,$acc2,$t2 + sbcs $acc3,$acc3,$t3 + sbc $ap,xzr,xzr // zap $ap + + subs $t0,$acc0,#1 // adds $t0,$a0,#-1 // tmp = ret+modulus + adcs $t1,$acc1,$poly1 + adcs $t2,$acc2,xzr + adc $t3,$acc3,$poly3 + cmp $ap,xzr // did subtraction borrow? + + csel $acc0,$acc0,$t0,eq // ret = borrow ? ret+modulus : ret + csel $acc1,$acc1,$t1,eq + csel $acc2,$acc2,$t2,eq + stp $acc0,$acc1,[$rp] + csel $acc3,$acc3,$t3,eq + stp $acc2,$acc3,[$rp,#16] + + ret +.size __ecp_nistz256_sub_from,.-__ecp_nistz256_sub_from + +.type __ecp_nistz256_sub_morf,%function +.align 4 +__ecp_nistz256_sub_morf: + ldp $t0,$t1,[$bp] + ldp $t2,$t3,[$bp,#16] + subs $acc0,$t0,$acc0 // ret = b-a + sbcs $acc1,$t1,$acc1 + sbcs $acc2,$t2,$acc2 + sbcs $acc3,$t3,$acc3 + sbc $ap,xzr,xzr // zap $ap + + subs $t0,$acc0,#1 // adds $t0,$a0,#-1 // tmp = ret+modulus + adcs $t1,$acc1,$poly1 + adcs $t2,$acc2,xzr + adc $t3,$acc3,$poly3 + cmp $ap,xzr // did subtraction borrow? + + csel $acc0,$acc0,$t0,eq // ret = borrow ? ret+modulus : ret + csel $acc1,$acc1,$t1,eq + csel $acc2,$acc2,$t2,eq + stp $acc0,$acc1,[$rp] + csel $acc3,$acc3,$t3,eq + stp $acc2,$acc3,[$rp,#16] + + ret +.size __ecp_nistz256_sub_morf,.-__ecp_nistz256_sub_morf + +.type __ecp_nistz256_div_by_2,%function +.align 4 +__ecp_nistz256_div_by_2: + subs $t0,$acc0,#1 // adds $t0,$a0,#-1 // tmp = a+modulus + adcs $t1,$acc1,$poly1 + adcs $t2,$acc2,xzr + adcs $t3,$acc3,$poly3 + adc $ap,xzr,xzr // zap $ap + tst $acc0,#1 // is a even? + + csel $acc0,$acc0,$t0,eq // ret = even ? a : a+modulus + csel $acc1,$acc1,$t1,eq + csel $acc2,$acc2,$t2,eq + csel $acc3,$acc3,$t3,eq + csel $ap,xzr,$ap,eq + + lsr $acc0,$acc0,#1 // ret >>= 1 + orr $acc0,$acc0,$acc1,lsl#63 + lsr $acc1,$acc1,#1 + orr $acc1,$acc1,$acc2,lsl#63 + lsr $acc2,$acc2,#1 + orr $acc2,$acc2,$acc3,lsl#63 + lsr $acc3,$acc3,#1 + stp $acc0,$acc1,[$rp] + orr $acc3,$acc3,$ap,lsl#63 + stp $acc2,$acc3,[$rp,#16] + + ret +.size __ecp_nistz256_div_by_2,.-__ecp_nistz256_div_by_2 +___ +######################################################################## +# following subroutines are "literal" implementation of those found in +# ecp_nistz256.c +# +######################################################################## +# void ecp_nistz256_point_double(P256_POINT *out,const P256_POINT *inp); +# +{ +my ($S,$M,$Zsqr,$tmp0)=map(32*$_,(0..3)); +# above map() describes stack layout with 4 temporary +# 256-bit vectors on top. +my ($rp_real,$ap_real) = map("x$_",(21,22)); + +$code.=<<___; +.globl ecp_nistz256_point_double +.type ecp_nistz256_point_double,%function +.align 5 +ecp_nistz256_point_double: + stp x29,x30,[sp,#-80]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + sub sp,sp,#32*4 + +.Ldouble_shortcut: + ldp $acc0,$acc1,[$ap,#32] + mov $rp_real,$rp + ldp $acc2,$acc3,[$ap,#48] + mov $ap_real,$ap + ldr $poly1,.Lpoly+8 + mov $t0,$acc0 + ldr $poly3,.Lpoly+24 + mov $t1,$acc1 + ldp $a0,$a1,[$ap_real,#64] // forward load for p256_sqr_mont + mov $t2,$acc2 + mov $t3,$acc3 + ldp $a2,$a3,[$ap_real,#64+16] + add $rp,sp,#$S + bl __ecp_nistz256_add // p256_mul_by_2(S, in_y); + + add $rp,sp,#$Zsqr + bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Zsqr, in_z); + + ldp $t0,$t1,[$ap_real] + ldp $t2,$t3,[$ap_real,#16] + mov $a0,$acc0 // put Zsqr aside for p256_sub + mov $a1,$acc1 + mov $a2,$acc2 + mov $a3,$acc3 + add $rp,sp,#$M + bl __ecp_nistz256_add // p256_add(M, Zsqr, in_x); + + add $bp,$ap_real,#0 + mov $acc0,$a0 // restore Zsqr + mov $acc1,$a1 + ldp $a0,$a1,[sp,#$S] // forward load for p256_sqr_mont + mov $acc2,$a2 + mov $acc3,$a3 + ldp $a2,$a3,[sp,#$S+16] + add $rp,sp,#$Zsqr + bl __ecp_nistz256_sub_morf // p256_sub(Zsqr, in_x, Zsqr); + + add $rp,sp,#$S + bl __ecp_nistz256_sqr_mont // p256_sqr_mont(S, S); + + ldr $bi,[$ap_real,#32] + ldp $a0,$a1,[$ap_real,#64] + ldp $a2,$a3,[$ap_real,#64+16] + add $bp,$ap_real,#32 + add $rp,sp,#$tmp0 + bl __ecp_nistz256_mul_mont // p256_mul_mont(tmp0, in_z, in_y); + + mov $t0,$acc0 + mov $t1,$acc1 + ldp $a0,$a1,[sp,#$S] // forward load for p256_sqr_mont + mov $t2,$acc2 + mov $t3,$acc3 + ldp $a2,$a3,[sp,#$S+16] + add $rp,$rp_real,#64 + bl __ecp_nistz256_add // p256_mul_by_2(res_z, tmp0); + + add $rp,sp,#$tmp0 + bl __ecp_nistz256_sqr_mont // p256_sqr_mont(tmp0, S); + + ldr $bi,[sp,#$Zsqr] // forward load for p256_mul_mont + ldp $a0,$a1,[sp,#$M] + ldp $a2,$a3,[sp,#$M+16] + add $rp,$rp_real,#32 + bl __ecp_nistz256_div_by_2 // p256_div_by_2(res_y, tmp0); + + add $bp,sp,#$Zsqr + add $rp,sp,#$M + bl __ecp_nistz256_mul_mont // p256_mul_mont(M, M, Zsqr); + + mov $t0,$acc0 // duplicate M + mov $t1,$acc1 + mov $t2,$acc2 + mov $t3,$acc3 + mov $a0,$acc0 // put M aside + mov $a1,$acc1 + mov $a2,$acc2 + mov $a3,$acc3 + add $rp,sp,#$M + bl __ecp_nistz256_add + mov $t0,$a0 // restore M + mov $t1,$a1 + ldr $bi,[$ap_real] // forward load for p256_mul_mont + mov $t2,$a2 + ldp $a0,$a1,[sp,#$S] + mov $t3,$a3 + ldp $a2,$a3,[sp,#$S+16] + bl __ecp_nistz256_add // p256_mul_by_3(M, M); + + add $bp,$ap_real,#0 + add $rp,sp,#$S + bl __ecp_nistz256_mul_mont // p256_mul_mont(S, S, in_x); + + mov $t0,$acc0 + mov $t1,$acc1 + ldp $a0,$a1,[sp,#$M] // forward load for p256_sqr_mont + mov $t2,$acc2 + mov $t3,$acc3 + ldp $a2,$a3,[sp,#$M+16] + add $rp,sp,#$tmp0 + bl __ecp_nistz256_add // p256_mul_by_2(tmp0, S); + + add $rp,$rp_real,#0 + bl __ecp_nistz256_sqr_mont // p256_sqr_mont(res_x, M); + + add $bp,sp,#$tmp0 + bl __ecp_nistz256_sub_from // p256_sub(res_x, res_x, tmp0); + + add $bp,sp,#$S + add $rp,sp,#$S + bl __ecp_nistz256_sub_morf // p256_sub(S, S, res_x); + + ldr $bi,[sp,#$M] + mov $a0,$acc0 // copy S + mov $a1,$acc1 + mov $a2,$acc2 + mov $a3,$acc3 + add $bp,sp,#$M + bl __ecp_nistz256_mul_mont // p256_mul_mont(S, S, M); + + add $bp,$rp_real,#32 + add $rp,$rp_real,#32 + bl __ecp_nistz256_sub_from // p256_sub(res_y, S, res_y); + + add sp,x29,#0 // destroy frame + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x29,x30,[sp],#80 + ret +.size ecp_nistz256_point_double,.-ecp_nistz256_point_double +___ +} + +######################################################################## +# void ecp_nistz256_point_add(P256_POINT *out,const P256_POINT *in1, +# const P256_POINT *in2); +{ +my ($res_x,$res_y,$res_z, 
+ $H,$Hsqr,$R,$Rsqr,$Hcub, + $U1,$U2,$S1,$S2)=map(32*$_,(0..11)); +my ($Z1sqr, $Z2sqr) = ($Hsqr, $Rsqr); +# above map() describes stack layout with 12 temporary +# 256-bit vectors on top. +my ($rp_real,$ap_real,$bp_real,$in1infty,$in2infty,$temp)=map("x$_",(21..26)); + +$code.=<<___; +.globl ecp_nistz256_point_add +.type ecp_nistz256_point_add,%function +.align 5 +ecp_nistz256_point_add: + stp x29,x30,[sp,#-80]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + sub sp,sp,#32*12 + + ldp $a0,$a1,[$bp,#64] // in2_z + ldp $a2,$a3,[$bp,#64+16] + mov $rp_real,$rp + mov $ap_real,$ap + mov $bp_real,$bp + ldr $poly1,.Lpoly+8 + ldr $poly3,.Lpoly+24 + orr $t0,$a0,$a1 + orr $t2,$a2,$a3 + orr $in2infty,$t0,$t2 + cmp $in2infty,#0 + csetm $in2infty,ne // !in2infty + add $rp,sp,#$Z2sqr + bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Z2sqr, in2_z); + + ldp $a0,$a1,[$ap_real,#64] // in1_z + ldp $a2,$a3,[$ap_real,#64+16] + orr $t0,$a0,$a1 + orr $t2,$a2,$a3 + orr $in1infty,$t0,$t2 + cmp $in1infty,#0 + csetm $in1infty,ne // !in1infty + add $rp,sp,#$Z1sqr + bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Z1sqr, in1_z); + + ldr $bi,[$bp_real,#64] + ldp $a0,$a1,[sp,#$Z2sqr] + ldp $a2,$a3,[sp,#$Z2sqr+16] + add $bp,$bp_real,#64 + add $rp,sp,#$S1 + bl __ecp_nistz256_mul_mont // p256_mul_mont(S1, Z2sqr, in2_z); + + ldr $bi,[$ap_real,#64] + ldp $a0,$a1,[sp,#$Z1sqr] + ldp $a2,$a3,[sp,#$Z1sqr+16] + add $bp,$ap_real,#64 + add $rp,sp,#$S2 + bl __ecp_nistz256_mul_mont // p256_mul_mont(S2, Z1sqr, in1_z); + + ldr $bi,[$ap_real,#32] + ldp $a0,$a1,[sp,#$S1] + ldp $a2,$a3,[sp,#$S1+16] + add $bp,$ap_real,#32 + add $rp,sp,#$S1 + bl __ecp_nistz256_mul_mont // p256_mul_mont(S1, S1, in1_y); + + ldr $bi,[$bp_real,#32] + ldp $a0,$a1,[sp,#$S2] + ldp $a2,$a3,[sp,#$S2+16] + add $bp,$bp_real,#32 + add $rp,sp,#$S2 + bl __ecp_nistz256_mul_mont // p256_mul_mont(S2, S2, in2_y); + + add $bp,sp,#$S1 + ldr $bi,[sp,#$Z2sqr] // forward load for p256_mul_mont + ldp $a0,$a1,[$ap_real] + ldp $a2,$a3,[$ap_real,#16] + add $rp,sp,#$R + bl __ecp_nistz256_sub_from // p256_sub(R, S2, S1); + + orr $acc0,$acc0,$acc1 // see if result is zero + orr $acc2,$acc2,$acc3 + orr $temp,$acc0,$acc2 + + add $bp,sp,#$Z2sqr + add $rp,sp,#$U1 + bl __ecp_nistz256_mul_mont // p256_mul_mont(U1, in1_x, Z2sqr); + + ldr $bi,[sp,#$Z1sqr] + ldp $a0,$a1,[$bp_real] + ldp $a2,$a3,[$bp_real,#16] + add $bp,sp,#$Z1sqr + add $rp,sp,#$U2 + bl __ecp_nistz256_mul_mont // p256_mul_mont(U2, in2_x, Z1sqr); + + add $bp,sp,#$U1 + ldp $a0,$a1,[sp,#$R] // forward load for p256_sqr_mont + ldp $a2,$a3,[sp,#$R+16] + add $rp,sp,#$H + bl __ecp_nistz256_sub_from // p256_sub(H, U2, U1); + + orr $acc0,$acc0,$acc1 // see if result is zero + orr $acc2,$acc2,$acc3 + orr $acc0,$acc0,$acc2 + tst $acc0,$acc0 + b.ne .Ladd_proceed // is_equal(U1,U2)? + + tst $in1infty,$in2infty + b.eq .Ladd_proceed // (in1infty || in2infty)? + + tst $temp,$temp + b.eq .Ladd_double // is_equal(S1,S2)? 
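+	// fall through: U1 == U2 and S1 != S2 while both inputs are finite,
+	// i.e. in1 == -in2, so the sum is the point at infinity (all zeros)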
+ + eor $a0,$a0,$a0 + eor $a1,$a1,$a1 + stp $a0,$a1,[$rp_real] + stp $a0,$a1,[$rp_real,#16] + stp $a0,$a1,[$rp_real,#32] + stp $a0,$a1,[$rp_real,#48] + stp $a0,$a1,[$rp_real,#64] + stp $a0,$a1,[$rp_real,#80] + b .Ladd_done + +.align 4 +.Ladd_double: + mov $ap,$ap_real + mov $rp,$rp_real + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + add sp,sp,#32*(12-4) // difference in stack frames + b .Ldouble_shortcut + +.align 4 +.Ladd_proceed: + add $rp,sp,#$Rsqr + bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Rsqr, R); + + ldr $bi,[$ap_real,#64] + ldp $a0,$a1,[sp,#$H] + ldp $a2,$a3,[sp,#$H+16] + add $bp,$ap_real,#64 + add $rp,sp,#$res_z + bl __ecp_nistz256_mul_mont // p256_mul_mont(res_z, H, in1_z); + + ldp $a0,$a1,[sp,#$H] + ldp $a2,$a3,[sp,#$H+16] + add $rp,sp,#$Hsqr + bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Hsqr, H); + + ldr $bi,[$bp_real,#64] + ldp $a0,$a1,[sp,#$res_z] + ldp $a2,$a3,[sp,#$res_z+16] + add $bp,$bp_real,#64 + add $rp,sp,#$res_z + bl __ecp_nistz256_mul_mont // p256_mul_mont(res_z, res_z, in2_z); + + ldr $bi,[sp,#$H] + ldp $a0,$a1,[sp,#$Hsqr] + ldp $a2,$a3,[sp,#$Hsqr+16] + add $bp,sp,#$H + add $rp,sp,#$Hcub + bl __ecp_nistz256_mul_mont // p256_mul_mont(Hcub, Hsqr, H); + + ldr $bi,[sp,#$Hsqr] + ldp $a0,$a1,[sp,#$U1] + ldp $a2,$a3,[sp,#$U1+16] + add $bp,sp,#$Hsqr + add $rp,sp,#$U2 + bl __ecp_nistz256_mul_mont // p256_mul_mont(U2, U1, Hsqr); + + mov $t0,$acc0 + mov $t1,$acc1 + mov $t2,$acc2 + mov $t3,$acc3 + add $rp,sp,#$Hsqr + bl __ecp_nistz256_add // p256_mul_by_2(Hsqr, U2); + + add $bp,sp,#$Rsqr + add $rp,sp,#$res_x + bl __ecp_nistz256_sub_morf // p256_sub(res_x, Rsqr, Hsqr); + + add $bp,sp,#$Hcub + bl __ecp_nistz256_sub_from // p256_sub(res_x, res_x, Hcub); + + add $bp,sp,#$U2 + ldr $bi,[sp,#$Hcub] // forward load for p256_mul_mont + ldp $a0,$a1,[sp,#$S1] + ldp $a2,$a3,[sp,#$S1+16] + add $rp,sp,#$res_y + bl __ecp_nistz256_sub_morf // p256_sub(res_y, U2, res_x); + + add $bp,sp,#$Hcub + add $rp,sp,#$S2 + bl __ecp_nistz256_mul_mont // p256_mul_mont(S2, S1, Hcub); + + ldr $bi,[sp,#$R] + ldp $a0,$a1,[sp,#$res_y] + ldp $a2,$a3,[sp,#$res_y+16] + add $bp,sp,#$R + add $rp,sp,#$res_y + bl __ecp_nistz256_mul_mont // p256_mul_mont(res_y, res_y, R); + + add $bp,sp,#$S2 + bl __ecp_nistz256_sub_from // p256_sub(res_y, res_y, S2); + + ldp $a0,$a1,[sp,#$res_x] // res + ldp $a2,$a3,[sp,#$res_x+16] + ldp $t0,$t1,[$bp_real] // in2 + ldp $t2,$t3,[$bp_real,#16] +___ +for($i=0;$i<64;$i+=32) { # conditional moves +$code.=<<___; + ldp $acc0,$acc1,[$ap_real,#$i] // in1 + cmp $in1infty,#0 // !$in1intfy, remember? + ldp $acc2,$acc3,[$ap_real,#$i+16] + csel $t0,$a0,$t0,ne + csel $t1,$a1,$t1,ne + ldp $a0,$a1,[sp,#$res_x+$i+32] // res + csel $t2,$a2,$t2,ne + csel $t3,$a3,$t3,ne + cmp $in2infty,#0 // !$in2intfy, remember? + ldp $a2,$a3,[sp,#$res_x+$i+48] + csel $acc0,$t0,$acc0,ne + csel $acc1,$t1,$acc1,ne + ldp $t0,$t1,[$bp_real,#$i+32] // in2 + csel $acc2,$t2,$acc2,ne + csel $acc3,$t3,$acc3,ne + ldp $t2,$t3,[$bp_real,#$i+48] + stp $acc0,$acc1,[$rp_real,#$i] + stp $acc2,$acc3,[$rp_real,#$i+16] +___ +} +$code.=<<___; + ldp $acc0,$acc1,[$ap_real,#$i] // in1 + cmp $in1infty,#0 // !$in1intfy, remember? + ldp $acc2,$acc3,[$ap_real,#$i+16] + csel $t0,$a0,$t0,ne + csel $t1,$a1,$t1,ne + csel $t2,$a2,$t2,ne + csel $t3,$a3,$t3,ne + cmp $in2infty,#0 // !$in2intfy, remember? 
+ csel $acc0,$t0,$acc0,ne + csel $acc1,$t1,$acc1,ne + csel $acc2,$t2,$acc2,ne + csel $acc3,$t3,$acc3,ne + stp $acc0,$acc1,[$rp_real,#$i] + stp $acc2,$acc3,[$rp_real,#$i+16] + +.Ladd_done: + add sp,x29,#0 // destroy frame + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x29,x30,[sp],#80 + ret +.size ecp_nistz256_point_add,.-ecp_nistz256_point_add +___ +} + +######################################################################## +# void ecp_nistz256_point_add_affine(P256_POINT *out,const P256_POINT *in1, +# const P256_POINT_AFFINE *in2); +{ +my ($res_x,$res_y,$res_z, + $U2,$S2,$H,$R,$Hsqr,$Hcub,$Rsqr)=map(32*$_,(0..9)); +my $Z1sqr = $S2; +# above map() describes stack layout with 10 temporary +# 256-bit vectors on top. +my ($rp_real,$ap_real,$bp_real,$in1infty,$in2infty,$temp)=map("x$_",(21..26)); + +$code.=<<___; +.globl ecp_nistz256_point_add_affine +.type ecp_nistz256_point_add_affine,%function +.align 5 +ecp_nistz256_point_add_affine: + stp x29,x30,[sp,#-80]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + sub sp,sp,#32*10 + + mov $rp_real,$rp + mov $ap_real,$ap + mov $bp_real,$bp + ldr $poly1,.Lpoly+8 + ldr $poly3,.Lpoly+24 + + ldp $a0,$a1,[$ap,#64] // in1_z + ldp $a2,$a3,[$ap,#64+16] + orr $t0,$a0,$a1 + orr $t2,$a2,$a3 + orr $in1infty,$t0,$t2 + cmp $in1infty,#0 + csetm $in1infty,ne // !in1infty + + ldp $acc0,$acc1,[$bp] // in2_x + ldp $acc2,$acc3,[$bp,#16] + ldp $t0,$t1,[$bp,#32] // in2_y + ldp $t2,$t3,[$bp,#48] + orr $acc0,$acc0,$acc1 + orr $acc2,$acc2,$acc3 + orr $t0,$t0,$t1 + orr $t2,$t2,$t3 + orr $acc0,$acc0,$acc2 + orr $t0,$t0,$t2 + orr $in2infty,$acc0,$t0 + cmp $in2infty,#0 + csetm $in2infty,ne // !in2infty + + add $rp,sp,#$Z1sqr + bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Z1sqr, in1_z); + + mov $a0,$acc0 + mov $a1,$acc1 + mov $a2,$acc2 + mov $a3,$acc3 + ldr $bi,[$bp_real] + add $bp,$bp_real,#0 + add $rp,sp,#$U2 + bl __ecp_nistz256_mul_mont // p256_mul_mont(U2, Z1sqr, in2_x); + + add $bp,$ap_real,#0 + ldr $bi,[$ap_real,#64] // forward load for p256_mul_mont + ldp $a0,$a1,[sp,#$Z1sqr] + ldp $a2,$a3,[sp,#$Z1sqr+16] + add $rp,sp,#$H + bl __ecp_nistz256_sub_from // p256_sub(H, U2, in1_x); + + add $bp,$ap_real,#64 + add $rp,sp,#$S2 + bl __ecp_nistz256_mul_mont // p256_mul_mont(S2, Z1sqr, in1_z); + + ldr $bi,[$ap_real,#64] + ldp $a0,$a1,[sp,#$H] + ldp $a2,$a3,[sp,#$H+16] + add $bp,$ap_real,#64 + add $rp,sp,#$res_z + bl __ecp_nistz256_mul_mont // p256_mul_mont(res_z, H, in1_z); + + ldr $bi,[$bp_real,#32] + ldp $a0,$a1,[sp,#$S2] + ldp $a2,$a3,[sp,#$S2+16] + add $bp,$bp_real,#32 + add $rp,sp,#$S2 + bl __ecp_nistz256_mul_mont // p256_mul_mont(S2, S2, in2_y); + + add $bp,$ap_real,#32 + ldp $a0,$a1,[sp,#$H] // forward load for p256_sqr_mont + ldp $a2,$a3,[sp,#$H+16] + add $rp,sp,#$R + bl __ecp_nistz256_sub_from // p256_sub(R, S2, in1_y); + + add $rp,sp,#$Hsqr + bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Hsqr, H); + + ldp $a0,$a1,[sp,#$R] + ldp $a2,$a3,[sp,#$R+16] + add $rp,sp,#$Rsqr + bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Rsqr, R); + + ldr $bi,[sp,#$H] + ldp $a0,$a1,[sp,#$Hsqr] + ldp $a2,$a3,[sp,#$Hsqr+16] + add $bp,sp,#$H + add $rp,sp,#$Hcub + bl __ecp_nistz256_mul_mont // p256_mul_mont(Hcub, Hsqr, H); + + ldr $bi,[$ap_real] + ldp $a0,$a1,[sp,#$Hsqr] + ldp $a2,$a3,[sp,#$Hsqr+16] + add $bp,$ap_real,#0 + add $rp,sp,#$U2 + bl __ecp_nistz256_mul_mont // p256_mul_mont(U2, in1_x, Hsqr); + + mov $t0,$acc0 + mov $t1,$acc1 + mov $t2,$acc2 + mov $t3,$acc3 + add $rp,sp,#$Hsqr + 
bl __ecp_nistz256_add // p256_mul_by_2(Hsqr, U2); + + add $bp,sp,#$Rsqr + add $rp,sp,#$res_x + bl __ecp_nistz256_sub_morf // p256_sub(res_x, Rsqr, Hsqr); + + add $bp,sp,#$Hcub + bl __ecp_nistz256_sub_from // p256_sub(res_x, res_x, Hcub); + + add $bp,sp,#$U2 + ldr $bi,[$ap_real,#32] // forward load for p256_mul_mont + ldp $a0,$a1,[sp,#$Hcub] + ldp $a2,$a3,[sp,#$Hcub+16] + add $rp,sp,#$res_y + bl __ecp_nistz256_sub_morf // p256_sub(res_y, U2, res_x); + + add $bp,$ap_real,#32 + add $rp,sp,#$S2 + bl __ecp_nistz256_mul_mont // p256_mul_mont(S2, in1_y, Hcub); + + ldr $bi,[sp,#$R] + ldp $a0,$a1,[sp,#$res_y] + ldp $a2,$a3,[sp,#$res_y+16] + add $bp,sp,#$R + add $rp,sp,#$res_y + bl __ecp_nistz256_mul_mont // p256_mul_mont(res_y, res_y, R); + + add $bp,sp,#$S2 + bl __ecp_nistz256_sub_from // p256_sub(res_y, res_y, S2); + + ldp $a0,$a1,[sp,#$res_x] // res + ldp $a2,$a3,[sp,#$res_x+16] + ldp $t0,$t1,[$bp_real] // in2 + ldp $t2,$t3,[$bp_real,#16] +___ +for($i=0;$i<64;$i+=32) { # conditional moves +$code.=<<___; + ldp $acc0,$acc1,[$ap_real,#$i] // in1 + cmp $in1infty,#0 // !$in1intfy, remember? + ldp $acc2,$acc3,[$ap_real,#$i+16] + csel $t0,$a0,$t0,ne + csel $t1,$a1,$t1,ne + ldp $a0,$a1,[sp,#$res_x+$i+32] // res + csel $t2,$a2,$t2,ne + csel $t3,$a3,$t3,ne + cmp $in2infty,#0 // !$in2intfy, remember? + ldp $a2,$a3,[sp,#$res_x+$i+48] + csel $acc0,$t0,$acc0,ne + csel $acc1,$t1,$acc1,ne + ldp $t0,$t1,[$bp_real,#$i+32] // in2 + csel $acc2,$t2,$acc2,ne + csel $acc3,$t3,$acc3,ne + ldp $t2,$t3,[$bp_real,#$i+48] + stp $acc0,$acc1,[$rp_real,#$i] + stp $acc2,$acc3,[$rp_real,#$i+16] +___ +$code.=<<___ if ($i == 0); + adr $bp_real,.Lone_mont-64 +___ +} +$code.=<<___; + ldp $acc0,$acc1,[$ap_real,#$i] // in1 + cmp $in1infty,#0 // !$in1intfy, remember? + ldp $acc2,$acc3,[$ap_real,#$i+16] + csel $t0,$a0,$t0,ne + csel $t1,$a1,$t1,ne + csel $t2,$a2,$t2,ne + csel $t3,$a3,$t3,ne + cmp $in2infty,#0 // !$in2intfy, remember? + csel $acc0,$t0,$acc0,ne + csel $acc1,$t1,$acc1,ne + csel $acc2,$t2,$acc2,ne + csel $acc3,$t3,$acc3,ne + stp $acc0,$acc1,[$rp_real,#$i] + stp $acc2,$acc3,[$rp_real,#$i+16] + + add sp,x29,#0 // destroy frame + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x29,x30,[sp],#80 + ret +.size ecp_nistz256_point_add_affine,.-ecp_nistz256_point_add_affine +___ +} +if (1) { +my ($ord0,$ord1) = ($poly1,$poly3); +my ($ord2,$ord3,$ordk,$t4) = map("x$_",(21..24)); +my $acc7 = $bi; + +$code.=<<___; +//////////////////////////////////////////////////////////////////////// +// void ecp_nistz256_ord_mul_mont(uint64_t res[4], uint64_t a[4], +// uint64_t b[4]); +.globl ecp_nistz256_ord_mul_mont +.type ecp_nistz256_ord_mul_mont,%function +.align 4 +ecp_nistz256_ord_mul_mont: + stp x29,x30,[sp,#-64]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + + adr $ordk,.Lord + ldr $bi,[$bp] // bp[0] + ldp $a0,$a1,[$ap] + ldp $a2,$a3,[$ap,#16] + + ldp $ord0,$ord1,[$ordk,#0] + ldp $ord2,$ord3,[$ordk,#16] + ldr $ordk,[$ordk,#32] + + mul $acc0,$a0,$bi // a[0]*b[0] + umulh $t0,$a0,$bi + + mul $acc1,$a1,$bi // a[1]*b[0] + umulh $t1,$a1,$bi + + mul $acc2,$a2,$bi // a[2]*b[0] + umulh $t2,$a2,$bi + + mul $acc3,$a3,$bi // a[3]*b[0] + umulh $acc4,$a3,$bi + + mul $t4,$acc0,$ordk + + adds $acc1,$acc1,$t0 // accumulate high parts of multiplication + adcs $acc2,$acc2,$t1 + adcs $acc3,$acc3,$t2 + adc $acc4,$acc4,xzr + mov $acc5,xzr +___ +for ($i=1;$i<4;$i++) { + ################################################################ + # ffff0000.ffffffff.yyyyyyyy.zzzzzzzz + # * abcdefgh + # + xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx + # + # Now observing that ff..ff*x = (2^n-1)*x = 2^n*x-x, we + # rewrite above as: + # + # xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx + # - 0000abcd.efgh0000.abcdefgh.00000000.00000000 + # + abcdefgh.abcdefgh.yzayzbyz.cyzdyzey.zfyzgyzh +$code.=<<___; + ldr $bi,[$bp,#8*$i] // b[i] + + lsl $t0,$t4,#32 + subs $acc2,$acc2,$t4 + lsr $t1,$t4,#32 + sbcs $acc3,$acc3,$t0 + sbcs $acc4,$acc4,$t1 + sbc $acc5,$acc5,xzr + + subs xzr,$acc0,#1 + umulh $t1,$ord0,$t4 + mul $t2,$ord1,$t4 + umulh $t3,$ord1,$t4 + + adcs $t2,$t2,$t1 + mul $t0,$a0,$bi + adc $t3,$t3,xzr + mul $t1,$a1,$bi + + adds $acc0,$acc1,$t2 + mul $t2,$a2,$bi + adcs $acc1,$acc2,$t3 + mul $t3,$a3,$bi + adcs $acc2,$acc3,$t4 + adcs $acc3,$acc4,$t4 + adc $acc4,$acc5,xzr + + adds $acc0,$acc0,$t0 // accumulate low parts + umulh $t0,$a0,$bi + adcs $acc1,$acc1,$t1 + umulh $t1,$a1,$bi + adcs $acc2,$acc2,$t2 + umulh $t2,$a2,$bi + adcs $acc3,$acc3,$t3 + umulh $t3,$a3,$bi + adc $acc4,$acc4,xzr + mul $t4,$acc0,$ordk + adds $acc1,$acc1,$t0 // accumulate high parts + adcs $acc2,$acc2,$t1 + adcs $acc3,$acc3,$t2 + adcs $acc4,$acc4,$t3 + adc $acc5,xzr,xzr +___ +} +$code.=<<___; + lsl $t0,$t4,#32 // last reduction + subs $acc2,$acc2,$t4 + lsr $t1,$t4,#32 + sbcs $acc3,$acc3,$t0 + sbcs $acc4,$acc4,$t1 + sbc $acc5,$acc5,xzr + + subs xzr,$acc0,#1 + umulh $t1,$ord0,$t4 + mul $t2,$ord1,$t4 + umulh $t3,$ord1,$t4 + + adcs $t2,$t2,$t1 + adc $t3,$t3,xzr + + adds $acc0,$acc1,$t2 + adcs $acc1,$acc2,$t3 + adcs $acc2,$acc3,$t4 + adcs $acc3,$acc4,$t4 + adc $acc4,$acc5,xzr + + subs $t0,$acc0,$ord0 // ret -= modulus + sbcs $t1,$acc1,$ord1 + sbcs $t2,$acc2,$ord2 + sbcs $t3,$acc3,$ord3 + sbcs xzr,$acc4,xzr + + csel $acc0,$acc0,$t0,lo // ret = borrow ? ret : ret-modulus + csel $acc1,$acc1,$t1,lo + csel $acc2,$acc2,$t2,lo + stp $acc0,$acc1,[$rp] + csel $acc3,$acc3,$t3,lo + stp $acc2,$acc3,[$rp,#16] + + ldp x19,x20,[sp,#16] + ldp x21,x22,[sp,#32] + ldp x23,x24,[sp,#48] + ldr x29,[sp],#64 + ret +.size ecp_nistz256_ord_mul_mont,.-ecp_nistz256_ord_mul_mont + +//////////////////////////////////////////////////////////////////////// +// void ecp_nistz256_ord_sqr_mont(uint64_t res[4], uint64_t a[4], +// int rep); +.globl ecp_nistz256_ord_sqr_mont +.type ecp_nistz256_ord_sqr_mont,%function +.align 4 +ecp_nistz256_ord_sqr_mont: + stp x29,x30,[sp,#-64]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + + adr $ordk,.Lord + ldp $a0,$a1,[$ap] + ldp $a2,$a3,[$ap,#16] + + ldp $ord0,$ord1,[$ordk,#0] + ldp $ord2,$ord3,[$ordk,#16] + ldr $ordk,[$ordk,#32] + b .Loop_ord_sqr + +.align 4 +.Loop_ord_sqr: + sub $bp,$bp,#1 + //////////////////////////////////////////////////////////////// + // | | | | | |a1*a0| | + // | | | | |a2*a0| | | + // | |a3*a2|a3*a0| | | | + // | | | |a2*a1| | | | + // | | |a3*a1| | | | | + // *| | | | | | | | 2| + // +|a3*a3|a2*a2|a1*a1|a0*a0| + // |--+--+--+--+--+--+--+--| + // |A7|A6|A5|A4|A3|A2|A1|A0|, where Ax is $accx, i.e. follow $accx + // + // "can't overflow" below mark carrying into high part of + // multiplication result, which can't overflow, because it + // can never be all ones. + + mul $acc1,$a1,$a0 // a[1]*a[0] + umulh $t1,$a1,$a0 + mul $acc2,$a2,$a0 // a[2]*a[0] + umulh $t2,$a2,$a0 + mul $acc3,$a3,$a0 // a[3]*a[0] + umulh $acc4,$a3,$a0 + + adds $acc2,$acc2,$t1 // accumulate high parts of multiplication + mul $t0,$a2,$a1 // a[2]*a[1] + umulh $t1,$a2,$a1 + adcs $acc3,$acc3,$t2 + mul $t2,$a3,$a1 // a[3]*a[1] + umulh $t3,$a3,$a1 + adc $acc4,$acc4,xzr // can't overflow + + mul $acc5,$a3,$a2 // a[3]*a[2] + umulh $acc6,$a3,$a2 + + adds $t1,$t1,$t2 // accumulate high parts of multiplication + mul $acc0,$a0,$a0 // a[0]*a[0] + adc $t2,$t3,xzr // can't overflow + + adds $acc3,$acc3,$t0 // accumulate low parts of multiplication + umulh $a0,$a0,$a0 + adcs $acc4,$acc4,$t1 + mul $t1,$a1,$a1 // a[1]*a[1] + adcs $acc5,$acc5,$t2 + umulh $a1,$a1,$a1 + adc $acc6,$acc6,xzr // can't overflow + + adds $acc1,$acc1,$acc1 // acc[1-6]*=2 + mul $t2,$a2,$a2 // a[2]*a[2] + adcs $acc2,$acc2,$acc2 + umulh $a2,$a2,$a2 + adcs $acc3,$acc3,$acc3 + mul $t3,$a3,$a3 // a[3]*a[3] + adcs $acc4,$acc4,$acc4 + umulh $a3,$a3,$a3 + adcs $acc5,$acc5,$acc5 + adcs $acc6,$acc6,$acc6 + adc $acc7,xzr,xzr + + adds $acc1,$acc1,$a0 // +a[i]*a[i] + mul $t4,$acc0,$ordk + adcs $acc2,$acc2,$t1 + adcs $acc3,$acc3,$a1 + adcs $acc4,$acc4,$t2 + adcs $acc5,$acc5,$a2 + adcs $acc6,$acc6,$t3 + adc $acc7,$acc7,$a3 +___ +for($i=0; $i<4; $i++) { # reductions +$code.=<<___; + subs xzr,$acc0,#1 + umulh $t1,$ord0,$t4 + mul $t2,$ord1,$t4 + umulh $t3,$ord1,$t4 + + adcs $t2,$t2,$t1 + adc $t3,$t3,xzr + + adds $acc0,$acc1,$t2 + adcs $acc1,$acc2,$t3 + adcs $acc2,$acc3,$t4 + adc $acc3,xzr,$t4 // can't overflow +___ +$code.=<<___ if ($i<3); + mul $t3,$acc0,$ordk +___ +$code.=<<___; + lsl $t0,$t4,#32 + subs $acc1,$acc1,$t4 + lsr $t1,$t4,#32 + sbcs $acc2,$acc2,$t0 + sbc $acc3,$acc3,$t1 // can't borrow +___ + ($t3,$t4) = ($t4,$t3); +} +$code.=<<___; + adds $acc0,$acc0,$acc4 // accumulate upper half + adcs $acc1,$acc1,$acc5 + adcs $acc2,$acc2,$acc6 + adcs $acc3,$acc3,$acc7 + adc $acc4,xzr,xzr + + subs $t0,$acc0,$ord0 // ret -= modulus + sbcs $t1,$acc1,$ord1 + sbcs $t2,$acc2,$ord2 + sbcs $t3,$acc3,$ord3 + sbcs xzr,$acc4,xzr + + csel $a0,$acc0,$t0,lo // ret = borrow ? 
ret : ret-modulus + csel $a1,$acc1,$t1,lo + csel $a2,$acc2,$t2,lo + csel $a3,$acc3,$t3,lo + + cbnz $bp,.Loop_ord_sqr + + stp $a0,$a1,[$rp] + stp $a2,$a3,[$rp,#16] + + ldp x19,x20,[sp,#16] + ldp x21,x22,[sp,#32] + ldp x23,x24,[sp,#48] + ldr x29,[sp],#64 + ret +.size ecp_nistz256_ord_sqr_mont,.-ecp_nistz256_ord_sqr_mont +___ +} } + +######################################################################## +# scatter-gather subroutines +{ +my ($out,$inp,$index,$mask)=map("x$_",(0..3)); +$code.=<<___; +// void ecp_nistz256_scatter_w5(void *x0,const P256_POINT *x1, +// int x2); +.globl ecp_nistz256_scatter_w5 +.type ecp_nistz256_scatter_w5,%function +.align 4 +ecp_nistz256_scatter_w5: + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + + add $out,$out,$index,lsl#2 + + ldp x4,x5,[$inp] // X + ldp x6,x7,[$inp,#16] + str w4,[$out,#64*0-4] + lsr x4,x4,#32 + str w5,[$out,#64*1-4] + lsr x5,x5,#32 + str w6,[$out,#64*2-4] + lsr x6,x6,#32 + str w7,[$out,#64*3-4] + lsr x7,x7,#32 + str w4,[$out,#64*4-4] + str w5,[$out,#64*5-4] + str w6,[$out,#64*6-4] + str w7,[$out,#64*7-4] + add $out,$out,#64*8 + + ldp x4,x5,[$inp,#32] // Y + ldp x6,x7,[$inp,#48] + str w4,[$out,#64*0-4] + lsr x4,x4,#32 + str w5,[$out,#64*1-4] + lsr x5,x5,#32 + str w6,[$out,#64*2-4] + lsr x6,x6,#32 + str w7,[$out,#64*3-4] + lsr x7,x7,#32 + str w4,[$out,#64*4-4] + str w5,[$out,#64*5-4] + str w6,[$out,#64*6-4] + str w7,[$out,#64*7-4] + add $out,$out,#64*8 + + ldp x4,x5,[$inp,#64] // Z + ldp x6,x7,[$inp,#80] + str w4,[$out,#64*0-4] + lsr x4,x4,#32 + str w5,[$out,#64*1-4] + lsr x5,x5,#32 + str w6,[$out,#64*2-4] + lsr x6,x6,#32 + str w7,[$out,#64*3-4] + lsr x7,x7,#32 + str w4,[$out,#64*4-4] + str w5,[$out,#64*5-4] + str w6,[$out,#64*6-4] + str w7,[$out,#64*7-4] + + ldr x29,[sp],#16 + ret +.size ecp_nistz256_scatter_w5,.-ecp_nistz256_scatter_w5 + +// void ecp_nistz256_gather_w5(P256_POINT *x0,const void *x1, +// int x2); +.globl ecp_nistz256_gather_w5 +.type ecp_nistz256_gather_w5,%function +.align 4 +ecp_nistz256_gather_w5: + stp x29,x30,[sp,#-16]! 
+ add x29,sp,#0 + + cmp $index,xzr + csetm x3,ne + add $index,$index,x3 + add $inp,$inp,$index,lsl#2 + + ldr w4,[$inp,#64*0] + ldr w5,[$inp,#64*1] + ldr w6,[$inp,#64*2] + ldr w7,[$inp,#64*3] + ldr w8,[$inp,#64*4] + ldr w9,[$inp,#64*5] + ldr w10,[$inp,#64*6] + ldr w11,[$inp,#64*7] + add $inp,$inp,#64*8 + orr x4,x4,x8,lsl#32 + orr x5,x5,x9,lsl#32 + orr x6,x6,x10,lsl#32 + orr x7,x7,x11,lsl#32 + csel x4,x4,xzr,ne + csel x5,x5,xzr,ne + csel x6,x6,xzr,ne + csel x7,x7,xzr,ne + stp x4,x5,[$out] // X + stp x6,x7,[$out,#16] + + ldr w4,[$inp,#64*0] + ldr w5,[$inp,#64*1] + ldr w6,[$inp,#64*2] + ldr w7,[$inp,#64*3] + ldr w8,[$inp,#64*4] + ldr w9,[$inp,#64*5] + ldr w10,[$inp,#64*6] + ldr w11,[$inp,#64*7] + add $inp,$inp,#64*8 + orr x4,x4,x8,lsl#32 + orr x5,x5,x9,lsl#32 + orr x6,x6,x10,lsl#32 + orr x7,x7,x11,lsl#32 + csel x4,x4,xzr,ne + csel x5,x5,xzr,ne + csel x6,x6,xzr,ne + csel x7,x7,xzr,ne + stp x4,x5,[$out,#32] // Y + stp x6,x7,[$out,#48] + + ldr w4,[$inp,#64*0] + ldr w5,[$inp,#64*1] + ldr w6,[$inp,#64*2] + ldr w7,[$inp,#64*3] + ldr w8,[$inp,#64*4] + ldr w9,[$inp,#64*5] + ldr w10,[$inp,#64*6] + ldr w11,[$inp,#64*7] + orr x4,x4,x8,lsl#32 + orr x5,x5,x9,lsl#32 + orr x6,x6,x10,lsl#32 + orr x7,x7,x11,lsl#32 + csel x4,x4,xzr,ne + csel x5,x5,xzr,ne + csel x6,x6,xzr,ne + csel x7,x7,xzr,ne + stp x4,x5,[$out,#64] // Z + stp x6,x7,[$out,#80] + + ldr x29,[sp],#16 + ret +.size ecp_nistz256_gather_w5,.-ecp_nistz256_gather_w5 + +// void ecp_nistz256_scatter_w7(void *x0,const P256_POINT_AFFINE *x1, +// int x2); +.globl ecp_nistz256_scatter_w7 +.type ecp_nistz256_scatter_w7,%function +.align 4 +ecp_nistz256_scatter_w7: + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + + add $out,$out,$index + mov $index,#64/8 +.Loop_scatter_w7: + ldr x3,[$inp],#8 + subs $index,$index,#1 + prfm pstl1strm,[$out,#4096+64*0] + prfm pstl1strm,[$out,#4096+64*1] + prfm pstl1strm,[$out,#4096+64*2] + prfm pstl1strm,[$out,#4096+64*3] + prfm pstl1strm,[$out,#4096+64*4] + prfm pstl1strm,[$out,#4096+64*5] + prfm pstl1strm,[$out,#4096+64*6] + prfm pstl1strm,[$out,#4096+64*7] + strb w3,[$out,#64*0] + lsr x3,x3,#8 + strb w3,[$out,#64*1] + lsr x3,x3,#8 + strb w3,[$out,#64*2] + lsr x3,x3,#8 + strb w3,[$out,#64*3] + lsr x3,x3,#8 + strb w3,[$out,#64*4] + lsr x3,x3,#8 + strb w3,[$out,#64*5] + lsr x3,x3,#8 + strb w3,[$out,#64*6] + lsr x3,x3,#8 + strb w3,[$out,#64*7] + add $out,$out,#64*8 + b.ne .Loop_scatter_w7 + + ldr x29,[sp],#16 + ret +.size ecp_nistz256_scatter_w7,.-ecp_nistz256_scatter_w7 + +// void ecp_nistz256_gather_w7(P256_POINT_AFFINE *x0,const void *x1, +// int x2); +.globl ecp_nistz256_gather_w7 +.type ecp_nistz256_gather_w7,%function +.align 4 +ecp_nistz256_gather_w7: + stp x29,x30,[sp,#-16]! 
+ add x29,sp,#0 + + cmp $index,xzr + csetm x3,ne + add $index,$index,x3 + add $inp,$inp,$index + mov $index,#64/8 + nop +.Loop_gather_w7: + ldrb w4,[$inp,#64*0] + prfm pldl1strm,[$inp,#4096+64*0] + subs $index,$index,#1 + ldrb w5,[$inp,#64*1] + prfm pldl1strm,[$inp,#4096+64*1] + ldrb w6,[$inp,#64*2] + prfm pldl1strm,[$inp,#4096+64*2] + ldrb w7,[$inp,#64*3] + prfm pldl1strm,[$inp,#4096+64*3] + ldrb w8,[$inp,#64*4] + prfm pldl1strm,[$inp,#4096+64*4] + ldrb w9,[$inp,#64*5] + prfm pldl1strm,[$inp,#4096+64*5] + ldrb w10,[$inp,#64*6] + prfm pldl1strm,[$inp,#4096+64*6] + ldrb w11,[$inp,#64*7] + prfm pldl1strm,[$inp,#4096+64*7] + add $inp,$inp,#64*8 + orr x4,x4,x5,lsl#8 + orr x6,x6,x7,lsl#8 + orr x8,x8,x9,lsl#8 + orr x4,x4,x6,lsl#16 + orr x10,x10,x11,lsl#8 + orr x4,x4,x8,lsl#32 + orr x4,x4,x10,lsl#48 + and x4,x4,x3 + str x4,[$out],#8 + b.ne .Loop_gather_w7 + + ldr x29,[sp],#16 + ret +.size ecp_nistz256_gather_w7,.-ecp_nistz256_gather_w7 +___ +} + +foreach (split("\n",$code)) { + s/\`([^\`]*)\`/eval $1/ge; + + print $_,"\n"; +} +close STDOUT; # enforce flush diff --git a/crypto/ec/asm/ecp_nistz256-avx2.pl b/crypto/ec/asm/ecp_nistz256-avx2.pl index 4c220aa645f1..794e56a082fc 100755 --- a/crypto/ec/asm/ecp_nistz256-avx2.pl +++ b/crypto/ec/asm/ecp_nistz256-avx2.pl @@ -1,32 +1,19 @@ -#!/usr/bin/env perl - -############################################################################## -# # -# Copyright 2014 Intel Corporation # -# # -# Licensed under the Apache License, Version 2.0 (the "License"); # -# you may not use this file except in compliance with the License. # -# You may obtain a copy of the License at # -# # -# http://www.apache.org/licenses/LICENSE-2.0 # -# # -# Unless required by applicable law or agreed to in writing, software # -# distributed under the License is distributed on an "AS IS" BASIS, # -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # -# See the License for the specific language governing permissions and # -# limitations under the License. # -# # -############################################################################## -# # -# Developers and authors: # -# Shay Gueron (1, 2), and Vlad Krasnov (1) # -# (1) Intel Corporation, Israel Development Center # -# (2) University of Haifa # -# Reference: # -# S.Gueron and V.Krasnov, "Fast Prime Field Elliptic Curve Cryptography with# -# 256 Bit Primes" # -# # -############################################################################## +#! /usr/bin/env perl +# Copyright 2014-2018 The OpenSSL Project Authors. All Rights Reserved. +# Copyright (c) 2014, Intel Corporation. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. 
You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html +# +# Originally written by Shay Gueron (1, 2), and Vlad Krasnov (1) +# (1) Intel Corporation, Israel Development Center, Haifa, Israel +# (2) University of Haifa, Israel +# +# Reference: +# S.Gueron and V.Krasnov, "Fast Prime Field Elliptic Curve Cryptography with +# 256 Bit Primes" $flavour = shift; $output = shift; @@ -60,7 +47,7 @@ if (!$addx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) && $addx = ($1>=12); } -if (!$addx && `$ENV{CC} -v 2>&1` =~ /(^clang version|based on LLVM) ([3-9])\.([0-9]+)/) { +if (!$addx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|based on LLVM) ([3-9])\.([0-9]+)/) { my $ver = $2 + $3/100.0; # 3.1->3.01, 3.10->3.10 $avx = ($ver>=3.0) + ($ver>=3.01); $addx = ($ver>=3.03); @@ -149,8 +136,8 @@ $code.=<<___; ___ { -# This function recieves a pointer to an array of four affine points -# (X, Y, <1>) and rearanges the data for AVX2 execution, while +# This function receives a pointer to an array of four affine points +# (X, Y, <1>) and rearranges the data for AVX2 execution, while # converting it to 2^29 radix redundant form my ($X0,$X1,$X2,$X3, $Y0,$Y1,$Y2,$Y3, @@ -301,8 +288,8 @@ ___ } { ################################################################################ -# This function recieves a pointer to an array of four AVX2 formatted points -# (X, Y, Z) convert the data to normal representation, and rearanges the data +# This function receives a pointer to an array of four AVX2 formatted points +# (X, Y, Z) convert the data to normal representation, and rearranges the data my ($D0,$D1,$D2,$D3, $D4,$D5,$D6,$D7, $D8)=map("%ymm$_",(0..8)); my ($T0,$T1,$T2,$T3, $T4,$T5,$T6)=map("%ymm$_",(9..15)); @@ -1909,7 +1896,7 @@ ___ } { ################################################################################ -# void ecp_nistz256_avx2_multi_select_w7(void* RESULT, void *in, +# void ecp_nistz256_avx2_multi_gather_w7(void* RESULT, void *in, # int index0, int index1, int index2, int index3); ################################################################################ @@ -1919,10 +1906,10 @@ my ($R0a,$R0b,$R1a,$R1b,$R2a,$R2b,$R3a,$R3b)=map("%ymm$_",(4..11)); my ($M0,$T0,$T1,$TMP0)=map("%ymm$_",(12..15)); $code.=<<___; -.globl ecp_nistz256_avx2_multi_select_w7 -.type ecp_nistz256_avx2_multi_select_w7,\@function,6 +.globl ecp_nistz256_avx2_multi_gather_w7 +.type ecp_nistz256_avx2_multi_gather_w7,\@function,6 .align 32 -ecp_nistz256_avx2_multi_select_w7: +ecp_nistz256_avx2_multi_gather_w7: vzeroupper ___ $code.=<<___ if ($win64); @@ -2036,7 +2023,7 @@ $code.=<<___ if ($win64); ___ $code.=<<___; ret -.size ecp_nistz256_avx2_multi_select_w7,.-ecp_nistz256_avx2_multi_select_w7 +.size ecp_nistz256_avx2_multi_gather_w7,.-ecp_nistz256_avx2_multi_gather_w7 .extern OPENSSL_ia32cap_P .globl ecp_nistz_avx2_eligible @@ -2061,8 +2048,8 @@ $code.=<<___; .globl ecp_nistz256_avx2_to_mont .globl ecp_nistz256_avx2_from_mont .globl ecp_nistz256_avx2_set1 -.globl ecp_nistz256_avx2_multi_select_w7 -.type ecp_nistz256_avx2_multi_select_w7,\@abi-omnipotent +.globl ecp_nistz256_avx2_multi_gather_w7 +.type ecp_nistz256_avx2_multi_gather_w7,\@abi-omnipotent ecp_nistz256_avx2_transpose_convert: ecp_nistz256_avx2_convert_transpose_back: ecp_nistz256_avx2_point_add_affine_x4: @@ -2070,10 +2057,10 @@ ecp_nistz256_avx2_point_add_affines_x4: ecp_nistz256_avx2_to_mont: ecp_nistz256_avx2_from_mont: ecp_nistz256_avx2_set1: -ecp_nistz256_avx2_multi_select_w7: 
+ecp_nistz256_avx2_multi_gather_w7: .byte 0x0f,0x0b # ud2 ret -.size ecp_nistz256_avx2_multi_select_w7,.-ecp_nistz256_avx2_multi_select_w7 +.size ecp_nistz256_avx2_multi_gather_w7,.-ecp_nistz256_avx2_multi_gather_w7 .globl ecp_nistz_avx2_eligible .type ecp_nistz_avx2_eligible,\@abi-omnipotent diff --git a/crypto/ec/asm/ecp_nistz256-ppc64.pl b/crypto/ec/asm/ecp_nistz256-ppc64.pl new file mode 100755 index 000000000000..984c7f205056 --- /dev/null +++ b/crypto/ec/asm/ecp_nistz256-ppc64.pl @@ -0,0 +1,2382 @@ +#! /usr/bin/env perl +# Copyright 2016-2018 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + +# +# ==================================================================== +# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== +# +# ECP_NISTZ256 module for PPC64. +# +# August 2016. +# +# Original ECP_NISTZ256 submission targeting x86_64 is detailed in +# http://eprint.iacr.org/2013/816. +# +# with/without -DECP_NISTZ256_ASM +# POWER7 +260-530% +# POWER8 +220-340% + +$flavour = shift; +while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or +( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or +die "can't locate ppc-xlate.pl"; + +open OUT,"| \"$^X\" $xlate $flavour $output"; +*STDOUT=*OUT; + +my $sp="r1"; + +{ +my ($rp,$ap,$bp,$bi,$acc0,$acc1,$acc2,$acc3,$poly1,$poly3, + $acc4,$acc5,$a0,$a1,$a2,$a3,$t0,$t1,$t2,$t3) = + map("r$_",(3..12,22..31)); + +my ($acc6,$acc7)=($bp,$bi); # used in __ecp_nistz256_sqr_mont + +$code.=<<___; +.machine "any" +.text +___ +######################################################################## +# Convert ecp_nistz256_table.c to layout expected by ecp_nistz_gather_w7 +# +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +open TABLE,"<ecp_nistz256_table.c" or +open TABLE,"<${dir}../ecp_nistz256_table.c" or +die "failed to open ecp_nistz256_table.c:",$!; + +use integer; + +foreach(<TABLE>) { + s/TOBN\(\s*(0x[0-9a-f]+),\s*(0x[0-9a-f]+)\s*\)/push @arr,hex($2),hex($1)/geo; +} +close TABLE; + +# See ecp_nistz256_table.c for explanation for why it's 64*16*37. +# 64*16*37-1 is because $#arr returns last valid index or @arr, not +# amount of elements. 
+die "insane number of elements" if ($#arr != 64*16*37-1); + +$code.=<<___; +.type ecp_nistz256_precomputed,\@object +.globl ecp_nistz256_precomputed +.align 12 +ecp_nistz256_precomputed: +___ +######################################################################## +# this conversion smashes P256_POINT_AFFINE by individual bytes with +# 64 byte interval, similar to +# 1111222233334444 +# 1234123412341234 +for(1..37) { + @tbl = splice(@arr,0,64*16); + for($i=0;$i<64;$i++) { + undef @line; + for($j=0;$j<64;$j++) { + push @line,(@tbl[$j*16+$i/4]>>(($i%4)*8))&0xff; + } + $code.=".byte\t"; + $code.=join(',',map { sprintf "0x%02x",$_} @line); + $code.="\n"; + } +} + +$code.=<<___; +.size ecp_nistz256_precomputed,.-ecp_nistz256_precomputed +.asciz "ECP_NISTZ256 for PPC64, CRYPTOGAMS by <appro\@openssl.org>" + +# void ecp_nistz256_mul_mont(BN_ULONG x0[4],const BN_ULONG x1[4], +# const BN_ULONG x2[4]); +.globl ecp_nistz256_mul_mont +.align 5 +ecp_nistz256_mul_mont: + stdu $sp,-128($sp) + mflr r0 + std r22,48($sp) + std r23,56($sp) + std r24,64($sp) + std r25,72($sp) + std r26,80($sp) + std r27,88($sp) + std r28,96($sp) + std r29,104($sp) + std r30,112($sp) + std r31,120($sp) + + ld $a0,0($ap) + ld $bi,0($bp) + ld $a1,8($ap) + ld $a2,16($ap) + ld $a3,24($ap) + + li $poly1,-1 + srdi $poly1,$poly1,32 # 0x00000000ffffffff + li $poly3,1 + orc $poly3,$poly3,$poly1 # 0xffffffff00000001 + + bl __ecp_nistz256_mul_mont + + mtlr r0 + ld r22,48($sp) + ld r23,56($sp) + ld r24,64($sp) + ld r25,72($sp) + ld r26,80($sp) + ld r27,88($sp) + ld r28,96($sp) + ld r29,104($sp) + ld r30,112($sp) + ld r31,120($sp) + addi $sp,$sp,128 + blr + .long 0 + .byte 0,12,4,0,0x80,10,3,0 + .long 0 +.size ecp_nistz256_mul_mont,.-ecp_nistz256_mul_mont + +# void ecp_nistz256_sqr_mont(BN_ULONG x0[4],const BN_ULONG x1[4]); +.globl ecp_nistz256_sqr_mont +.align 4 +ecp_nistz256_sqr_mont: + stdu $sp,-128($sp) + mflr r0 + std r22,48($sp) + std r23,56($sp) + std r24,64($sp) + std r25,72($sp) + std r26,80($sp) + std r27,88($sp) + std r28,96($sp) + std r29,104($sp) + std r30,112($sp) + std r31,120($sp) + + ld $a0,0($ap) + ld $a1,8($ap) + ld $a2,16($ap) + ld $a3,24($ap) + + li $poly1,-1 + srdi $poly1,$poly1,32 # 0x00000000ffffffff + li $poly3,1 + orc $poly3,$poly3,$poly1 # 0xffffffff00000001 + + bl __ecp_nistz256_sqr_mont + + mtlr r0 + ld r22,48($sp) + ld r23,56($sp) + ld r24,64($sp) + ld r25,72($sp) + ld r26,80($sp) + ld r27,88($sp) + ld r28,96($sp) + ld r29,104($sp) + ld r30,112($sp) + ld r31,120($sp) + addi $sp,$sp,128 + blr + .long 0 + .byte 0,12,4,0,0x80,10,2,0 + .long 0 +.size ecp_nistz256_sqr_mont,.-ecp_nistz256_sqr_mont + +# void ecp_nistz256_add(BN_ULONG x0[4],const BN_ULONG x1[4], +# const BN_ULONG x2[4]); +.globl ecp_nistz256_add +.align 4 +ecp_nistz256_add: + stdu $sp,-128($sp) + mflr r0 + std r28,96($sp) + std r29,104($sp) + std r30,112($sp) + std r31,120($sp) + + ld $acc0,0($ap) + ld $t0, 0($bp) + ld $acc1,8($ap) + ld $t1, 8($bp) + ld $acc2,16($ap) + ld $t2, 16($bp) + ld $acc3,24($ap) + ld $t3, 24($bp) + + li $poly1,-1 + srdi $poly1,$poly1,32 # 0x00000000ffffffff + li $poly3,1 + orc $poly3,$poly3,$poly1 # 0xffffffff00000001 + + bl __ecp_nistz256_add + + mtlr r0 + ld r28,96($sp) + ld r29,104($sp) + ld r30,112($sp) + ld r31,120($sp) + addi $sp,$sp,128 + blr + .long 0 + .byte 0,12,4,0,0x80,4,3,0 + .long 0 +.size ecp_nistz256_add,.-ecp_nistz256_add + +# void ecp_nistz256_div_by_2(BN_ULONG x0[4],const BN_ULONG x1[4]); +.globl ecp_nistz256_div_by_2 +.align 4 +ecp_nistz256_div_by_2: + stdu $sp,-128($sp) + mflr r0 + std r28,96($sp) + std 
r29,104($sp) + std r30,112($sp) + std r31,120($sp) + + ld $acc0,0($ap) + ld $acc1,8($ap) + ld $acc2,16($ap) + ld $acc3,24($ap) + + li $poly1,-1 + srdi $poly1,$poly1,32 # 0x00000000ffffffff + li $poly3,1 + orc $poly3,$poly3,$poly1 # 0xffffffff00000001 + + bl __ecp_nistz256_div_by_2 + + mtlr r0 + ld r28,96($sp) + ld r29,104($sp) + ld r30,112($sp) + ld r31,120($sp) + addi $sp,$sp,128 + blr + .long 0 + .byte 0,12,4,0,0x80,4,2,0 + .long 0 +.size ecp_nistz256_div_by_2,.-ecp_nistz256_div_by_2 + +# void ecp_nistz256_mul_by_2(BN_ULONG x0[4],const BN_ULONG x1[4]); +.globl ecp_nistz256_mul_by_2 +.align 4 +ecp_nistz256_mul_by_2: + stdu $sp,-128($sp) + mflr r0 + std r28,96($sp) + std r29,104($sp) + std r30,112($sp) + std r31,120($sp) + + ld $acc0,0($ap) + ld $acc1,8($ap) + ld $acc2,16($ap) + ld $acc3,24($ap) + + mr $t0,$acc0 + mr $t1,$acc1 + mr $t2,$acc2 + mr $t3,$acc3 + + li $poly1,-1 + srdi $poly1,$poly1,32 # 0x00000000ffffffff + li $poly3,1 + orc $poly3,$poly3,$poly1 # 0xffffffff00000001 + + bl __ecp_nistz256_add # ret = a+a // 2*a + + mtlr r0 + ld r28,96($sp) + ld r29,104($sp) + ld r30,112($sp) + ld r31,120($sp) + addi $sp,$sp,128 + blr + .long 0 + .byte 0,12,4,0,0x80,4,3,0 + .long 0 +.size ecp_nistz256_mul_by_2,.-ecp_nistz256_mul_by_2 + +# void ecp_nistz256_mul_by_3(BN_ULONG x0[4],const BN_ULONG x1[4]); +.globl ecp_nistz256_mul_by_3 +.align 4 +ecp_nistz256_mul_by_3: + stdu $sp,-128($sp) + mflr r0 + std r28,96($sp) + std r29,104($sp) + std r30,112($sp) + std r31,120($sp) + + ld $acc0,0($ap) + ld $acc1,8($ap) + ld $acc2,16($ap) + ld $acc3,24($ap) + + mr $t0,$acc0 + std $acc0,64($sp) + mr $t1,$acc1 + std $acc1,72($sp) + mr $t2,$acc2 + std $acc2,80($sp) + mr $t3,$acc3 + std $acc3,88($sp) + + li $poly1,-1 + srdi $poly1,$poly1,32 # 0x00000000ffffffff + li $poly3,1 + orc $poly3,$poly3,$poly1 # 0xffffffff00000001 + + bl __ecp_nistz256_add # ret = a+a // 2*a + + ld $t0,64($sp) + ld $t1,72($sp) + ld $t2,80($sp) + ld $t3,88($sp) + + bl __ecp_nistz256_add # ret += a // 2*a+a=3*a + + mtlr r0 + ld r28,96($sp) + ld r29,104($sp) + ld r30,112($sp) + ld r31,120($sp) + addi $sp,$sp,128 + blr + .long 0 + .byte 0,12,4,0,0x80,4,2,0 + .long 0 +.size ecp_nistz256_mul_by_3,.-ecp_nistz256_mul_by_3 + +# void ecp_nistz256_sub(BN_ULONG x0[4],const BN_ULONG x1[4], +# const BN_ULONG x2[4]); +.globl ecp_nistz256_sub +.align 4 +ecp_nistz256_sub: + stdu $sp,-128($sp) + mflr r0 + std r28,96($sp) + std r29,104($sp) + std r30,112($sp) + std r31,120($sp) + + ld $acc0,0($ap) + ld $acc1,8($ap) + ld $acc2,16($ap) + ld $acc3,24($ap) + + li $poly1,-1 + srdi $poly1,$poly1,32 # 0x00000000ffffffff + li $poly3,1 + orc $poly3,$poly3,$poly1 # 0xffffffff00000001 + + bl __ecp_nistz256_sub_from + + mtlr r0 + ld r28,96($sp) + ld r29,104($sp) + ld r30,112($sp) + ld r31,120($sp) + addi $sp,$sp,128 + blr + .long 0 + .byte 0,12,4,0,0x80,4,3,0 + .long 0 +.size ecp_nistz256_sub,.-ecp_nistz256_sub + +# void ecp_nistz256_neg(BN_ULONG x0[4],const BN_ULONG x1[4]); +.globl ecp_nistz256_neg +.align 4 +ecp_nistz256_neg: + stdu $sp,-128($sp) + mflr r0 + std r28,96($sp) + std r29,104($sp) + std r30,112($sp) + std r31,120($sp) + + mr $bp,$ap + li $acc0,0 + li $acc1,0 + li $acc2,0 + li $acc3,0 + + li $poly1,-1 + srdi $poly1,$poly1,32 # 0x00000000ffffffff + li $poly3,1 + orc $poly3,$poly3,$poly1 # 0xffffffff00000001 + + bl __ecp_nistz256_sub_from + + mtlr r0 + ld r28,96($sp) + ld r29,104($sp) + ld r30,112($sp) + ld r31,120($sp) + addi $sp,$sp,128 + blr + .long 0 + .byte 0,12,4,0,0x80,4,2,0 + .long 0 +.size ecp_nistz256_neg,.-ecp_nistz256_neg + +# note that 
__ecp_nistz256_mul_mont expects a[0-3] input pre-loaded +# to $a0-$a3 and b[0] - to $bi +.type __ecp_nistz256_mul_mont,\@function +.align 4 +__ecp_nistz256_mul_mont: + mulld $acc0,$a0,$bi # a[0]*b[0] + mulhdu $t0,$a0,$bi + + mulld $acc1,$a1,$bi # a[1]*b[0] + mulhdu $t1,$a1,$bi + + mulld $acc2,$a2,$bi # a[2]*b[0] + mulhdu $t2,$a2,$bi + + mulld $acc3,$a3,$bi # a[3]*b[0] + mulhdu $t3,$a3,$bi + ld $bi,8($bp) # b[1] + + addc $acc1,$acc1,$t0 # accumulate high parts of multiplication + sldi $t0,$acc0,32 + adde $acc2,$acc2,$t1 + srdi $t1,$acc0,32 + adde $acc3,$acc3,$t2 + addze $acc4,$t3 + li $acc5,0 +___ +for($i=1;$i<4;$i++) { + ################################################################ + # Reduction iteration is normally performed by accumulating + # result of multiplication of modulus by "magic" digit [and + # omitting least significant word, which is guaranteed to + # be 0], but thanks to special form of modulus and "magic" + # digit being equal to least significant word, it can be + # performed with additions and subtractions alone. Indeed: + # + # ffff0001.00000000.0000ffff.ffffffff + # * abcdefgh + # + xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.abcdefgh + # + # Now observing that ff..ff*x = (2^n-1)*x = 2^n*x-x, we + # rewrite above as: + # + # xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.abcdefgh + # + abcdefgh.abcdefgh.0000abcd.efgh0000.00000000 + # - 0000abcd.efgh0000.00000000.00000000.abcdefgh + # + # or marking redundant operations: + # + # xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.-------- + # + abcdefgh.abcdefgh.0000abcd.efgh0000.-------- + # - 0000abcd.efgh0000.--------.--------.-------- + +$code.=<<___; + subfc $t2,$t0,$acc0 # "*0xffff0001" + subfe $t3,$t1,$acc0 + addc $acc0,$acc1,$t0 # +=acc[0]<<96 and omit acc[0] + adde $acc1,$acc2,$t1 + adde $acc2,$acc3,$t2 # +=acc[0]*0xffff0001 + adde $acc3,$acc4,$t3 + addze $acc4,$acc5 + + mulld $t0,$a0,$bi # lo(a[0]*b[i]) + mulld $t1,$a1,$bi # lo(a[1]*b[i]) + mulld $t2,$a2,$bi # lo(a[2]*b[i]) + mulld $t3,$a3,$bi # lo(a[3]*b[i]) + addc $acc0,$acc0,$t0 # accumulate low parts of multiplication + mulhdu $t0,$a0,$bi # hi(a[0]*b[i]) + adde $acc1,$acc1,$t1 + mulhdu $t1,$a1,$bi # hi(a[1]*b[i]) + adde $acc2,$acc2,$t2 + mulhdu $t2,$a2,$bi # hi(a[2]*b[i]) + adde $acc3,$acc3,$t3 + mulhdu $t3,$a3,$bi # hi(a[3]*b[i]) + addze $acc4,$acc4 +___ +$code.=<<___ if ($i<3); + ld $bi,8*($i+1)($bp) # b[$i+1] +___ +$code.=<<___; + addc $acc1,$acc1,$t0 # accumulate high parts of multiplication + sldi $t0,$acc0,32 + adde $acc2,$acc2,$t1 + srdi $t1,$acc0,32 + adde $acc3,$acc3,$t2 + adde $acc4,$acc4,$t3 + li $acc5,0 + addze $acc5,$acc5 +___ +} +$code.=<<___; + # last reduction + subfc $t2,$t0,$acc0 # "*0xffff0001" + subfe $t3,$t1,$acc0 + addc $acc0,$acc1,$t0 # +=acc[0]<<96 and omit acc[0] + adde $acc1,$acc2,$t1 + adde $acc2,$acc3,$t2 # +=acc[0]*0xffff0001 + adde $acc3,$acc4,$t3 + addze $acc4,$acc5 + + li $t2,0 + addic $acc0,$acc0,1 # ret -= modulus + subfe $acc1,$poly1,$acc1 + subfe $acc2,$t2,$acc2 + subfe $acc3,$poly3,$acc3 + subfe $acc4,$t2,$acc4 + + addc $acc0,$acc0,$acc4 # ret += modulus if borrow + and $t1,$poly1,$acc4 + and $t3,$poly3,$acc4 + adde $acc1,$acc1,$t1 + addze $acc2,$acc2 + adde $acc3,$acc3,$t3 + + std $acc0,0($rp) + std $acc1,8($rp) + std $acc2,16($rp) + std $acc3,24($rp) + + blr + .long 0 + .byte 0,12,0x14,0,0,0,1,0 + .long 0 +.size __ecp_nistz256_mul_mont,.-__ecp_nistz256_mul_mont + +# note that __ecp_nistz256_sqr_mont expects a[0-3] input pre-loaded +# to $a0-$a3 +.type __ecp_nistz256_sqr_mont,\@function +.align 4 +__ecp_nistz256_sqr_mont: + 
################################################################ + # | | | | | |a1*a0| | + # | | | | |a2*a0| | | + # | |a3*a2|a3*a0| | | | + # | | | |a2*a1| | | | + # | | |a3*a1| | | | | + # *| | | | | | | | 2| + # +|a3*a3|a2*a2|a1*a1|a0*a0| + # |--+--+--+--+--+--+--+--| + # |A7|A6|A5|A4|A3|A2|A1|A0|, where Ax is $accx, i.e. follow $accx + # + # "can't overflow" below mark carrying into high part of + # multiplication result, which can't overflow, because it + # can never be all ones. + + mulld $acc1,$a1,$a0 # a[1]*a[0] + mulhdu $t1,$a1,$a0 + mulld $acc2,$a2,$a0 # a[2]*a[0] + mulhdu $t2,$a2,$a0 + mulld $acc3,$a3,$a0 # a[3]*a[0] + mulhdu $acc4,$a3,$a0 + + addc $acc2,$acc2,$t1 # accumulate high parts of multiplication + mulld $t0,$a2,$a1 # a[2]*a[1] + mulhdu $t1,$a2,$a1 + adde $acc3,$acc3,$t2 + mulld $t2,$a3,$a1 # a[3]*a[1] + mulhdu $t3,$a3,$a1 + addze $acc4,$acc4 # can't overflow + + mulld $acc5,$a3,$a2 # a[3]*a[2] + mulhdu $acc6,$a3,$a2 + + addc $t1,$t1,$t2 # accumulate high parts of multiplication + addze $t2,$t3 # can't overflow + + addc $acc3,$acc3,$t0 # accumulate low parts of multiplication + adde $acc4,$acc4,$t1 + adde $acc5,$acc5,$t2 + addze $acc6,$acc6 # can't overflow + + addc $acc1,$acc1,$acc1 # acc[1-6]*=2 + adde $acc2,$acc2,$acc2 + adde $acc3,$acc3,$acc3 + adde $acc4,$acc4,$acc4 + adde $acc5,$acc5,$acc5 + adde $acc6,$acc6,$acc6 + li $acc7,0 + addze $acc7,$acc7 + + mulld $acc0,$a0,$a0 # a[0]*a[0] + mulhdu $a0,$a0,$a0 + mulld $t1,$a1,$a1 # a[1]*a[1] + mulhdu $a1,$a1,$a1 + mulld $t2,$a2,$a2 # a[2]*a[2] + mulhdu $a2,$a2,$a2 + mulld $t3,$a3,$a3 # a[3]*a[3] + mulhdu $a3,$a3,$a3 + addc $acc1,$acc1,$a0 # +a[i]*a[i] + sldi $t0,$acc0,32 + adde $acc2,$acc2,$t1 + srdi $t1,$acc0,32 + adde $acc3,$acc3,$a1 + adde $acc4,$acc4,$t2 + adde $acc5,$acc5,$a2 + adde $acc6,$acc6,$t3 + adde $acc7,$acc7,$a3 +___ +for($i=0;$i<3;$i++) { # reductions, see commentary in + # multiplication for details +$code.=<<___; + subfc $t2,$t0,$acc0 # "*0xffff0001" + subfe $t3,$t1,$acc0 + addc $acc0,$acc1,$t0 # +=acc[0]<<96 and omit acc[0] + sldi $t0,$acc0,32 + adde $acc1,$acc2,$t1 + srdi $t1,$acc0,32 + adde $acc2,$acc3,$t2 # +=acc[0]*0xffff0001 + addze $acc3,$t3 # can't overflow +___ +} +$code.=<<___; + subfc $t2,$t0,$acc0 # "*0xffff0001" + subfe $t3,$t1,$acc0 + addc $acc0,$acc1,$t0 # +=acc[0]<<96 and omit acc[0] + adde $acc1,$acc2,$t1 + adde $acc2,$acc3,$t2 # +=acc[0]*0xffff0001 + addze $acc3,$t3 # can't overflow + + addc $acc0,$acc0,$acc4 # accumulate upper half + adde $acc1,$acc1,$acc5 + adde $acc2,$acc2,$acc6 + adde $acc3,$acc3,$acc7 + li $t2,0 + addze $acc4,$t2 + + addic $acc0,$acc0,1 # ret -= modulus + subfe $acc1,$poly1,$acc1 + subfe $acc2,$t2,$acc2 + subfe $acc3,$poly3,$acc3 + subfe $acc4,$t2,$acc4 + + addc $acc0,$acc0,$acc4 # ret += modulus if borrow + and $t1,$poly1,$acc4 + and $t3,$poly3,$acc4 + adde $acc1,$acc1,$t1 + addze $acc2,$acc2 + adde $acc3,$acc3,$t3 + + std $acc0,0($rp) + std $acc1,8($rp) + std $acc2,16($rp) + std $acc3,24($rp) + + blr + .long 0 + .byte 0,12,0x14,0,0,0,1,0 + .long 0 +.size __ecp_nistz256_sqr_mont,.-__ecp_nistz256_sqr_mont + +# Note that __ecp_nistz256_add expects both input vectors pre-loaded to +# $a0-$a3 and $t0-$t3. This is done because it's used in multiple +# contexts, e.g. in multiplication by 2 and 3... 
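The add/sub helpers that follow all use the trick spelled out in their comments: since comparison implies subtraction, reduce by subtracting the modulus unconditionally and adding it back under a mask if the subtraction borrowed. A minimal C sketch of that pattern for a + b mod p (assuming 64-bit limbs, a compiler with __uint128_t, and inputs already below p; the function name is illustrative, not an OpenSSL API):

#include <stdint.h>

/* a + b mod p for p = 2^256 - 2^224 + 2^192 + 2^96 - 1, following the
 * "comparison implies subtraction" idea: add, unconditionally subtract p,
 * then add p back under a mask if the subtraction borrowed. Sketch only. */
static void p256_add_sketch(uint64_t r[4], const uint64_t a[4], const uint64_t b[4])
{
    static const uint64_t P[4] = {
        0xffffffffffffffffULL, 0x00000000ffffffffULL,
        0x0000000000000000ULL, 0xffffffff00000001ULL
    };
    uint64_t t[4], carry, borrow = 0;
    __uint128_t acc = 0;

    for (int i = 0; i < 4; i++) {                   /* t = a + b               */
        acc += (__uint128_t)a[i] + b[i];
        t[i] = (uint64_t)acc;
        acc >>= 64;
    }
    carry = (uint64_t)acc;                          /* 0 or 1                  */

    for (int i = 0; i < 4; i++) {                   /* t -= p                  */
        __uint128_t d = (__uint128_t)t[i] - P[i] - borrow;
        t[i]   = (uint64_t)d;
        borrow = (uint64_t)(d >> 64) & 1;
    }

    /* overall borrow only if the 257-bit value a+b was below p */
    uint64_t mask = (uint64_t)0 - (uint64_t)(borrow > carry);

    acc = 0;
    for (int i = 0; i < 4; i++) {                   /* add p back if borrowed  */
        acc += (__uint128_t)t[i] + (P[i] & mask);
        r[i] = (uint64_t)acc;
        acc >>= 64;
    }
}
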
+.type __ecp_nistz256_add,\@function +.align 4 +__ecp_nistz256_add: + addc $acc0,$acc0,$t0 # ret = a+b + adde $acc1,$acc1,$t1 + adde $acc2,$acc2,$t2 + li $t2,0 + adde $acc3,$acc3,$t3 + addze $t0,$t2 + + # if a+b >= modulus, subtract modulus + # + # But since comparison implies subtraction, we subtract + # modulus and then add it back if subtraction borrowed. + + subic $acc0,$acc0,-1 + subfe $acc1,$poly1,$acc1 + subfe $acc2,$t2,$acc2 + subfe $acc3,$poly3,$acc3 + subfe $t0,$t2,$t0 + + addc $acc0,$acc0,$t0 + and $t1,$poly1,$t0 + and $t3,$poly3,$t0 + adde $acc1,$acc1,$t1 + addze $acc2,$acc2 + adde $acc3,$acc3,$t3 + + std $acc0,0($rp) + std $acc1,8($rp) + std $acc2,16($rp) + std $acc3,24($rp) + + blr + .long 0 + .byte 0,12,0x14,0,0,0,3,0 + .long 0 +.size __ecp_nistz256_add,.-__ecp_nistz256_add + +.type __ecp_nistz256_sub_from,\@function +.align 4 +__ecp_nistz256_sub_from: + ld $t0,0($bp) + ld $t1,8($bp) + ld $t2,16($bp) + ld $t3,24($bp) + subfc $acc0,$t0,$acc0 # ret = a-b + subfe $acc1,$t1,$acc1 + subfe $acc2,$t2,$acc2 + subfe $acc3,$t3,$acc3 + subfe $t0,$t0,$t0 # t0 = borrow ? -1 : 0 + + # if a-b borrowed, add modulus + + addc $acc0,$acc0,$t0 # ret -= modulus & t0 + and $t1,$poly1,$t0 + and $t3,$poly3,$t0 + adde $acc1,$acc1,$t1 + addze $acc2,$acc2 + adde $acc3,$acc3,$t3 + + std $acc0,0($rp) + std $acc1,8($rp) + std $acc2,16($rp) + std $acc3,24($rp) + + blr + .long 0 + .byte 0,12,0x14,0,0,0,3,0 + .long 0 +.size __ecp_nistz256_sub_from,.-__ecp_nistz256_sub_from + +.type __ecp_nistz256_sub_morf,\@function +.align 4 +__ecp_nistz256_sub_morf: + ld $t0,0($bp) + ld $t1,8($bp) + ld $t2,16($bp) + ld $t3,24($bp) + subfc $acc0,$acc0,$t0 # ret = b-a + subfe $acc1,$acc1,$t1 + subfe $acc2,$acc2,$t2 + subfe $acc3,$acc3,$t3 + subfe $t0,$t0,$t0 # t0 = borrow ? -1 : 0 + + # if b-a borrowed, add modulus + + addc $acc0,$acc0,$t0 # ret -= modulus & t0 + and $t1,$poly1,$t0 + and $t3,$poly3,$t0 + adde $acc1,$acc1,$t1 + addze $acc2,$acc2 + adde $acc3,$acc3,$t3 + + std $acc0,0($rp) + std $acc1,8($rp) + std $acc2,16($rp) + std $acc3,24($rp) + + blr + .long 0 + .byte 0,12,0x14,0,0,0,3,0 + .long 0 +.size __ecp_nistz256_sub_morf,.-__ecp_nistz256_sub_morf + +.type __ecp_nistz256_div_by_2,\@function +.align 4 +__ecp_nistz256_div_by_2: + andi. $t0,$acc0,1 + addic $acc0,$acc0,-1 # a += modulus + neg $t0,$t0 + adde $acc1,$acc1,$poly1 + not $t0,$t0 + addze $acc2,$acc2 + li $t2,0 + adde $acc3,$acc3,$poly3 + and $t1,$poly1,$t0 + addze $ap,$t2 # ap = carry + and $t3,$poly3,$t0 + + subfc $acc0,$t0,$acc0 # a -= modulus if a was even + subfe $acc1,$t1,$acc1 + subfe $acc2,$t2,$acc2 + subfe $acc3,$t3,$acc3 + subfe $ap, $t2,$ap + + srdi $acc0,$acc0,1 + sldi $t0,$acc1,63 + srdi $acc1,$acc1,1 + sldi $t1,$acc2,63 + srdi $acc2,$acc2,1 + sldi $t2,$acc3,63 + srdi $acc3,$acc3,1 + sldi $t3,$ap,63 + or $acc0,$acc0,$t0 + or $acc1,$acc1,$t1 + or $acc2,$acc2,$t2 + or $acc3,$acc3,$t3 + + std $acc0,0($rp) + std $acc1,8($rp) + std $acc2,16($rp) + std $acc3,24($rp) + + blr + .long 0 + .byte 0,12,0x14,0,0,0,1,0 + .long 0 +.size __ecp_nistz256_div_by_2,.-__ecp_nistz256_div_by_2 +___ +######################################################################## +# following subroutines are "literal" implementation of those found in +# ecp_nistz256.c +# +######################################################################## +# void ecp_nistz256_point_double(P256_POINT *out,const P256_POINT *inp); +# +if (1) { +my $FRAME=64+32*4+12*8; +my ($S,$M,$Zsqr,$tmp0)=map(64+32*$_,(0..3)); +# above map() describes stack layout with 4 temporary +# 256-bit vectors on top. 
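Returning to the reduction commentary in __ecp_nistz256_mul_mont above: because the low limb of the P-256 prime is 2^64-1, the Montgomery digit is acc[0] itself and the whole step reduces to shifts, adds and subtracts. A sketch of one such step in C, mirroring the subfc/subfe/addc sequence (5-limb running value {acc0..acc3, overflow}; assumes __uint128_t; a sketch, not the vendor code):

#include <stdint.h>

/* One word-level Montgomery reduction step for p = 2^256 - 2^224 + 2^192
 * + 2^96 - 1 in radix 2^64: acc = (acc + acc[0]*p) >> 64, done without
 * multiplications because the low limb of p is 2^64-1. */
static void p256_mont_reduce_step(uint64_t acc[5])
{
    uint64_t m = acc[0];
    /* m*(2^64 - 2^32 + 1): the contribution of the two top limbs of p */
    __uint128_t hi2 = ((__uint128_t)m << 64) + m - ((__uint128_t)m << 32);
    __uint128_t t;

    t = (__uint128_t)acc[1] + (m << 32);            /* += m*2^96, low half   */
    uint64_t r0 = (uint64_t)t;
    t = (t >> 64) + acc[2] + (m >> 32);             /* += m*2^96, high half  */
    uint64_t r1 = (uint64_t)t;
    t = (t >> 64) + acc[3] + (uint64_t)hi2;         /* += low(hi2) at 2^192  */
    uint64_t r2 = (uint64_t)t;
    t = (t >> 64) + acc[4] + (uint64_t)(hi2 >> 64); /* += high(hi2)          */
    uint64_t r3 = (uint64_t)t;

    acc[0] = r0;  acc[1] = r1;  acc[2] = r2;  acc[3] = r3;
    acc[4] = (uint64_t)(t >> 64);                   /* top carry             */
}

The -m term of m*p exactly cancels acc[0] (they are equal), which is why the low limb is simply dropped when shifting down by one word.
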
+my ($rp_real,$ap_real) = map("r$_",(20,21)); + +$code.=<<___; +.globl ecp_nistz256_point_double +.align 5 +ecp_nistz256_point_double: + stdu $sp,-$FRAME($sp) + mflr r0 + std r20,$FRAME-8*12($sp) + std r21,$FRAME-8*11($sp) + std r22,$FRAME-8*10($sp) + std r23,$FRAME-8*9($sp) + std r24,$FRAME-8*8($sp) + std r25,$FRAME-8*7($sp) + std r26,$FRAME-8*6($sp) + std r27,$FRAME-8*5($sp) + std r28,$FRAME-8*4($sp) + std r29,$FRAME-8*3($sp) + std r30,$FRAME-8*2($sp) + std r31,$FRAME-8*1($sp) + + li $poly1,-1 + srdi $poly1,$poly1,32 # 0x00000000ffffffff + li $poly3,1 + orc $poly3,$poly3,$poly1 # 0xffffffff00000001 +.Ldouble_shortcut: + ld $acc0,32($ap) + ld $acc1,40($ap) + ld $acc2,48($ap) + ld $acc3,56($ap) + mr $t0,$acc0 + mr $t1,$acc1 + mr $t2,$acc2 + mr $t3,$acc3 + ld $a0,64($ap) # forward load for p256_sqr_mont + ld $a1,72($ap) + ld $a2,80($ap) + ld $a3,88($ap) + mr $rp_real,$rp + mr $ap_real,$ap + addi $rp,$sp,$S + bl __ecp_nistz256_add # p256_mul_by_2(S, in_y); + + addi $rp,$sp,$Zsqr + bl __ecp_nistz256_sqr_mont # p256_sqr_mont(Zsqr, in_z); + + ld $t0,0($ap_real) + ld $t1,8($ap_real) + ld $t2,16($ap_real) + ld $t3,24($ap_real) + mr $a0,$acc0 # put Zsqr aside for p256_sub + mr $a1,$acc1 + mr $a2,$acc2 + mr $a3,$acc3 + addi $rp,$sp,$M + bl __ecp_nistz256_add # p256_add(M, Zsqr, in_x); + + addi $bp,$ap_real,0 + mr $acc0,$a0 # restore Zsqr + mr $acc1,$a1 + mr $acc2,$a2 + mr $acc3,$a3 + ld $a0,$S+0($sp) # forward load for p256_sqr_mont + ld $a1,$S+8($sp) + ld $a2,$S+16($sp) + ld $a3,$S+24($sp) + addi $rp,$sp,$Zsqr + bl __ecp_nistz256_sub_morf # p256_sub(Zsqr, in_x, Zsqr); + + addi $rp,$sp,$S + bl __ecp_nistz256_sqr_mont # p256_sqr_mont(S, S); + + ld $bi,32($ap_real) + ld $a0,64($ap_real) + ld $a1,72($ap_real) + ld $a2,80($ap_real) + ld $a3,88($ap_real) + addi $bp,$ap_real,32 + addi $rp,$sp,$tmp0 + bl __ecp_nistz256_mul_mont # p256_mul_mont(tmp0, in_z, in_y); + + mr $t0,$acc0 + mr $t1,$acc1 + mr $t2,$acc2 + mr $t3,$acc3 + ld $a0,$S+0($sp) # forward load for p256_sqr_mont + ld $a1,$S+8($sp) + ld $a2,$S+16($sp) + ld $a3,$S+24($sp) + addi $rp,$rp_real,64 + bl __ecp_nistz256_add # p256_mul_by_2(res_z, tmp0); + + addi $rp,$sp,$tmp0 + bl __ecp_nistz256_sqr_mont # p256_sqr_mont(tmp0, S); + + ld $bi,$Zsqr($sp) # forward load for p256_mul_mont + ld $a0,$M+0($sp) + ld $a1,$M+8($sp) + ld $a2,$M+16($sp) + ld $a3,$M+24($sp) + addi $rp,$rp_real,32 + bl __ecp_nistz256_div_by_2 # p256_div_by_2(res_y, tmp0); + + addi $bp,$sp,$Zsqr + addi $rp,$sp,$M + bl __ecp_nistz256_mul_mont # p256_mul_mont(M, M, Zsqr); + + mr $t0,$acc0 # duplicate M + mr $t1,$acc1 + mr $t2,$acc2 + mr $t3,$acc3 + mr $a0,$acc0 # put M aside + mr $a1,$acc1 + mr $a2,$acc2 + mr $a3,$acc3 + addi $rp,$sp,$M + bl __ecp_nistz256_add + mr $t0,$a0 # restore M + mr $t1,$a1 + mr $t2,$a2 + mr $t3,$a3 + ld $bi,0($ap_real) # forward load for p256_mul_mont + ld $a0,$S+0($sp) + ld $a1,$S+8($sp) + ld $a2,$S+16($sp) + ld $a3,$S+24($sp) + bl __ecp_nistz256_add # p256_mul_by_3(M, M); + + addi $bp,$ap_real,0 + addi $rp,$sp,$S + bl __ecp_nistz256_mul_mont # p256_mul_mont(S, S, in_x); + + mr $t0,$acc0 + mr $t1,$acc1 + mr $t2,$acc2 + mr $t3,$acc3 + ld $a0,$M+0($sp) # forward load for p256_sqr_mont + ld $a1,$M+8($sp) + ld $a2,$M+16($sp) + ld $a3,$M+24($sp) + addi $rp,$sp,$tmp0 + bl __ecp_nistz256_add # p256_mul_by_2(tmp0, S); + + addi $rp,$rp_real,0 + bl __ecp_nistz256_sqr_mont # p256_sqr_mont(res_x, M); + + addi $bp,$sp,$tmp0 + bl __ecp_nistz256_sub_from # p256_sub(res_x, res_x, tmp0); + + addi $bp,$sp,$S + addi $rp,$sp,$S + bl __ecp_nistz256_sub_morf # p256_sub(S, S, 
res_x); + + ld $bi,$M($sp) + mr $a0,$acc0 # copy S + mr $a1,$acc1 + mr $a2,$acc2 + mr $a3,$acc3 + addi $bp,$sp,$M + bl __ecp_nistz256_mul_mont # p256_mul_mont(S, S, M); + + addi $bp,$rp_real,32 + addi $rp,$rp_real,32 + bl __ecp_nistz256_sub_from # p256_sub(res_y, S, res_y); + + mtlr r0 + ld r20,$FRAME-8*12($sp) + ld r21,$FRAME-8*11($sp) + ld r22,$FRAME-8*10($sp) + ld r23,$FRAME-8*9($sp) + ld r24,$FRAME-8*8($sp) + ld r25,$FRAME-8*7($sp) + ld r26,$FRAME-8*6($sp) + ld r27,$FRAME-8*5($sp) + ld r28,$FRAME-8*4($sp) + ld r29,$FRAME-8*3($sp) + ld r30,$FRAME-8*2($sp) + ld r31,$FRAME-8*1($sp) + addi $sp,$sp,$FRAME + blr + .long 0 + .byte 0,12,4,0,0x80,12,2,0 + .long 0 +.size ecp_nistz256_point_double,.-ecp_nistz256_point_double +___ +} + +######################################################################## +# void ecp_nistz256_point_add(P256_POINT *out,const P256_POINT *in1, +# const P256_POINT *in2); +if (1) { +my $FRAME = 64 + 32*12 + 16*8; +my ($res_x,$res_y,$res_z, + $H,$Hsqr,$R,$Rsqr,$Hcub, + $U1,$U2,$S1,$S2)=map(64+32*$_,(0..11)); +my ($Z1sqr, $Z2sqr) = ($Hsqr, $Rsqr); +# above map() describes stack layout with 12 temporary +# 256-bit vectors on top. +my ($rp_real,$ap_real,$bp_real,$in1infty,$in2infty,$temp)=map("r$_",(16..21)); + +$code.=<<___; +.globl ecp_nistz256_point_add +.align 5 +ecp_nistz256_point_add: + stdu $sp,-$FRAME($sp) + mflr r0 + std r16,$FRAME-8*16($sp) + std r17,$FRAME-8*15($sp) + std r18,$FRAME-8*14($sp) + std r19,$FRAME-8*13($sp) + std r20,$FRAME-8*12($sp) + std r21,$FRAME-8*11($sp) + std r22,$FRAME-8*10($sp) + std r23,$FRAME-8*9($sp) + std r24,$FRAME-8*8($sp) + std r25,$FRAME-8*7($sp) + std r26,$FRAME-8*6($sp) + std r27,$FRAME-8*5($sp) + std r28,$FRAME-8*4($sp) + std r29,$FRAME-8*3($sp) + std r30,$FRAME-8*2($sp) + std r31,$FRAME-8*1($sp) + + li $poly1,-1 + srdi $poly1,$poly1,32 # 0x00000000ffffffff + li $poly3,1 + orc $poly3,$poly3,$poly1 # 0xffffffff00000001 + + ld $a0,64($bp) # in2_z + ld $a1,72($bp) + ld $a2,80($bp) + ld $a3,88($bp) + mr $rp_real,$rp + mr $ap_real,$ap + mr $bp_real,$bp + or $t0,$a0,$a1 + or $t2,$a2,$a3 + or $in2infty,$t0,$t2 + neg $t0,$in2infty + or $in2infty,$in2infty,$t0 + sradi $in2infty,$in2infty,63 # !in2infty + addi $rp,$sp,$Z2sqr + bl __ecp_nistz256_sqr_mont # p256_sqr_mont(Z2sqr, in2_z); + + ld $a0,64($ap_real) # in1_z + ld $a1,72($ap_real) + ld $a2,80($ap_real) + ld $a3,88($ap_real) + or $t0,$a0,$a1 + or $t2,$a2,$a3 + or $in1infty,$t0,$t2 + neg $t0,$in1infty + or $in1infty,$in1infty,$t0 + sradi $in1infty,$in1infty,63 # !in1infty + addi $rp,$sp,$Z1sqr + bl __ecp_nistz256_sqr_mont # p256_sqr_mont(Z1sqr, in1_z); + + ld $bi,64($bp_real) + ld $a0,$Z2sqr+0($sp) + ld $a1,$Z2sqr+8($sp) + ld $a2,$Z2sqr+16($sp) + ld $a3,$Z2sqr+24($sp) + addi $bp,$bp_real,64 + addi $rp,$sp,$S1 + bl __ecp_nistz256_mul_mont # p256_mul_mont(S1, Z2sqr, in2_z); + + ld $bi,64($ap_real) + ld $a0,$Z1sqr+0($sp) + ld $a1,$Z1sqr+8($sp) + ld $a2,$Z1sqr+16($sp) + ld $a3,$Z1sqr+24($sp) + addi $bp,$ap_real,64 + addi $rp,$sp,$S2 + bl __ecp_nistz256_mul_mont # p256_mul_mont(S2, Z1sqr, in1_z); + + ld $bi,32($ap_real) + ld $a0,$S1+0($sp) + ld $a1,$S1+8($sp) + ld $a2,$S1+16($sp) + ld $a3,$S1+24($sp) + addi $bp,$ap_real,32 + addi $rp,$sp,$S1 + bl __ecp_nistz256_mul_mont # p256_mul_mont(S1, S1, in1_y); + + ld $bi,32($bp_real) + ld $a0,$S2+0($sp) + ld $a1,$S2+8($sp) + ld $a2,$S2+16($sp) + ld $a3,$S2+24($sp) + addi $bp,$bp_real,32 + addi $rp,$sp,$S2 + bl __ecp_nistz256_mul_mont # p256_mul_mont(S2, S2, in2_y); + + addi $bp,$sp,$S1 + ld $bi,$Z2sqr($sp) # forward load for p256_mul_mont + ld 
$a0,0($ap_real) + ld $a1,8($ap_real) + ld $a2,16($ap_real) + ld $a3,24($ap_real) + addi $rp,$sp,$R + bl __ecp_nistz256_sub_from # p256_sub(R, S2, S1); + + or $acc0,$acc0,$acc1 # see if result is zero + or $acc2,$acc2,$acc3 + or $temp,$acc0,$acc2 + + addi $bp,$sp,$Z2sqr + addi $rp,$sp,$U1 + bl __ecp_nistz256_mul_mont # p256_mul_mont(U1, in1_x, Z2sqr); + + ld $bi,$Z1sqr($sp) + ld $a0,0($bp_real) + ld $a1,8($bp_real) + ld $a2,16($bp_real) + ld $a3,24($bp_real) + addi $bp,$sp,$Z1sqr + addi $rp,$sp,$U2 + bl __ecp_nistz256_mul_mont # p256_mul_mont(U2, in2_x, Z1sqr); + + addi $bp,$sp,$U1 + ld $a0,$R+0($sp) # forward load for p256_sqr_mont + ld $a1,$R+8($sp) + ld $a2,$R+16($sp) + ld $a3,$R+24($sp) + addi $rp,$sp,$H + bl __ecp_nistz256_sub_from # p256_sub(H, U2, U1); + + or $acc0,$acc0,$acc1 # see if result is zero + or $acc2,$acc2,$acc3 + or. $acc0,$acc0,$acc2 + bne .Ladd_proceed # is_equal(U1,U2)? + + and. $t0,$in1infty,$in2infty + beq .Ladd_proceed # (in1infty || in2infty)? + + cmpldi $temp,0 + beq .Ladd_double # is_equal(S1,S2)? + + xor $a0,$a0,$a0 + std $a0,0($rp_real) + std $a0,8($rp_real) + std $a0,16($rp_real) + std $a0,24($rp_real) + std $a0,32($rp_real) + std $a0,40($rp_real) + std $a0,48($rp_real) + std $a0,56($rp_real) + std $a0,64($rp_real) + std $a0,72($rp_real) + std $a0,80($rp_real) + std $a0,88($rp_real) + b .Ladd_done + +.align 4 +.Ladd_double: + ld $bp,0($sp) # back-link + mr $ap,$ap_real + mr $rp,$rp_real + ld r16,$FRAME-8*16($sp) + ld r17,$FRAME-8*15($sp) + ld r18,$FRAME-8*14($sp) + ld r19,$FRAME-8*13($sp) + stdu $bp,$FRAME-288($sp) # difference in stack frame sizes + b .Ldouble_shortcut + +.align 4 +.Ladd_proceed: + addi $rp,$sp,$Rsqr + bl __ecp_nistz256_sqr_mont # p256_sqr_mont(Rsqr, R); + + ld $bi,64($ap_real) + ld $a0,$H+0($sp) + ld $a1,$H+8($sp) + ld $a2,$H+16($sp) + ld $a3,$H+24($sp) + addi $bp,$ap_real,64 + addi $rp,$sp,$res_z + bl __ecp_nistz256_mul_mont # p256_mul_mont(res_z, H, in1_z); + + ld $a0,$H+0($sp) + ld $a1,$H+8($sp) + ld $a2,$H+16($sp) + ld $a3,$H+24($sp) + addi $rp,$sp,$Hsqr + bl __ecp_nistz256_sqr_mont # p256_sqr_mont(Hsqr, H); + + ld $bi,64($bp_real) + ld $a0,$res_z+0($sp) + ld $a1,$res_z+8($sp) + ld $a2,$res_z+16($sp) + ld $a3,$res_z+24($sp) + addi $bp,$bp_real,64 + addi $rp,$sp,$res_z + bl __ecp_nistz256_mul_mont # p256_mul_mont(res_z, res_z, in2_z); + + ld $bi,$H($sp) + ld $a0,$Hsqr+0($sp) + ld $a1,$Hsqr+8($sp) + ld $a2,$Hsqr+16($sp) + ld $a3,$Hsqr+24($sp) + addi $bp,$sp,$H + addi $rp,$sp,$Hcub + bl __ecp_nistz256_mul_mont # p256_mul_mont(Hcub, Hsqr, H); + + ld $bi,$Hsqr($sp) + ld $a0,$U1+0($sp) + ld $a1,$U1+8($sp) + ld $a2,$U1+16($sp) + ld $a3,$U1+24($sp) + addi $bp,$sp,$Hsqr + addi $rp,$sp,$U2 + bl __ecp_nistz256_mul_mont # p256_mul_mont(U2, U1, Hsqr); + + mr $t0,$acc0 + mr $t1,$acc1 + mr $t2,$acc2 + mr $t3,$acc3 + addi $rp,$sp,$Hsqr + bl __ecp_nistz256_add # p256_mul_by_2(Hsqr, U2); + + addi $bp,$sp,$Rsqr + addi $rp,$sp,$res_x + bl __ecp_nistz256_sub_morf # p256_sub(res_x, Rsqr, Hsqr); + + addi $bp,$sp,$Hcub + bl __ecp_nistz256_sub_from # p256_sub(res_x, res_x, Hcub); + + addi $bp,$sp,$U2 + ld $bi,$Hcub($sp) # forward load for p256_mul_mont + ld $a0,$S1+0($sp) + ld $a1,$S1+8($sp) + ld $a2,$S1+16($sp) + ld $a3,$S1+24($sp) + addi $rp,$sp,$res_y + bl __ecp_nistz256_sub_morf # p256_sub(res_y, U2, res_x); + + addi $bp,$sp,$Hcub + addi $rp,$sp,$S2 + bl __ecp_nistz256_mul_mont # p256_mul_mont(S2, S1, Hcub); + + ld $bi,$R($sp) + ld $a0,$res_y+0($sp) + ld $a1,$res_y+8($sp) + ld $a2,$res_y+16($sp) + ld $a3,$res_y+24($sp) + addi $bp,$sp,$R + addi $rp,$sp,$res_y 
+ bl __ecp_nistz256_mul_mont # p256_mul_mont(res_y, res_y, R); + + addi $bp,$sp,$S2 + bl __ecp_nistz256_sub_from # p256_sub(res_y, res_y, S2); + + ld $t0,0($bp_real) # in2 + ld $t1,8($bp_real) + ld $t2,16($bp_real) + ld $t3,24($bp_real) + ld $a0,$res_x+0($sp) # res + ld $a1,$res_x+8($sp) + ld $a2,$res_x+16($sp) + ld $a3,$res_x+24($sp) +___ +for($i=0;$i<64;$i+=32) { # conditional moves +$code.=<<___; + ld $acc0,$i+0($ap_real) # in1 + ld $acc1,$i+8($ap_real) + ld $acc2,$i+16($ap_real) + ld $acc3,$i+24($ap_real) + andc $t0,$t0,$in1infty + andc $t1,$t1,$in1infty + andc $t2,$t2,$in1infty + andc $t3,$t3,$in1infty + and $a0,$a0,$in1infty + and $a1,$a1,$in1infty + and $a2,$a2,$in1infty + and $a3,$a3,$in1infty + or $t0,$t0,$a0 + or $t1,$t1,$a1 + or $t2,$t2,$a2 + or $t3,$t3,$a3 + andc $acc0,$acc0,$in2infty + andc $acc1,$acc1,$in2infty + andc $acc2,$acc2,$in2infty + andc $acc3,$acc3,$in2infty + and $t0,$t0,$in2infty + and $t1,$t1,$in2infty + and $t2,$t2,$in2infty + and $t3,$t3,$in2infty + or $acc0,$acc0,$t0 + or $acc1,$acc1,$t1 + or $acc2,$acc2,$t2 + or $acc3,$acc3,$t3 + + ld $t0,$i+32($bp_real) # in2 + ld $t1,$i+40($bp_real) + ld $t2,$i+48($bp_real) + ld $t3,$i+56($bp_real) + ld $a0,$res_x+$i+32($sp) + ld $a1,$res_x+$i+40($sp) + ld $a2,$res_x+$i+48($sp) + ld $a3,$res_x+$i+56($sp) + std $acc0,$i+0($rp_real) + std $acc1,$i+8($rp_real) + std $acc2,$i+16($rp_real) + std $acc3,$i+24($rp_real) +___ +} +$code.=<<___; + ld $acc0,$i+0($ap_real) # in1 + ld $acc1,$i+8($ap_real) + ld $acc2,$i+16($ap_real) + ld $acc3,$i+24($ap_real) + andc $t0,$t0,$in1infty + andc $t1,$t1,$in1infty + andc $t2,$t2,$in1infty + andc $t3,$t3,$in1infty + and $a0,$a0,$in1infty + and $a1,$a1,$in1infty + and $a2,$a2,$in1infty + and $a3,$a3,$in1infty + or $t0,$t0,$a0 + or $t1,$t1,$a1 + or $t2,$t2,$a2 + or $t3,$t3,$a3 + andc $acc0,$acc0,$in2infty + andc $acc1,$acc1,$in2infty + andc $acc2,$acc2,$in2infty + andc $acc3,$acc3,$in2infty + and $t0,$t0,$in2infty + and $t1,$t1,$in2infty + and $t2,$t2,$in2infty + and $t3,$t3,$in2infty + or $acc0,$acc0,$t0 + or $acc1,$acc1,$t1 + or $acc2,$acc2,$t2 + or $acc3,$acc3,$t3 + std $acc0,$i+0($rp_real) + std $acc1,$i+8($rp_real) + std $acc2,$i+16($rp_real) + std $acc3,$i+24($rp_real) + +.Ladd_done: + mtlr r0 + ld r16,$FRAME-8*16($sp) + ld r17,$FRAME-8*15($sp) + ld r18,$FRAME-8*14($sp) + ld r19,$FRAME-8*13($sp) + ld r20,$FRAME-8*12($sp) + ld r21,$FRAME-8*11($sp) + ld r22,$FRAME-8*10($sp) + ld r23,$FRAME-8*9($sp) + ld r24,$FRAME-8*8($sp) + ld r25,$FRAME-8*7($sp) + ld r26,$FRAME-8*6($sp) + ld r27,$FRAME-8*5($sp) + ld r28,$FRAME-8*4($sp) + ld r29,$FRAME-8*3($sp) + ld r30,$FRAME-8*2($sp) + ld r31,$FRAME-8*1($sp) + addi $sp,$sp,$FRAME + blr + .long 0 + .byte 0,12,4,0,0x80,16,3,0 + .long 0 +.size ecp_nistz256_point_add,.-ecp_nistz256_point_add +___ +} + +######################################################################## +# void ecp_nistz256_point_add_affine(P256_POINT *out,const P256_POINT *in1, +# const P256_POINT_AFFINE *in2); +if (1) { +my $FRAME = 64 + 32*10 + 16*8; +my ($res_x,$res_y,$res_z, + $U2,$S2,$H,$R,$Hsqr,$Hcub,$Rsqr)=map(64+32*$_,(0..9)); +my $Z1sqr = $S2; +# above map() describes stack layout with 10 temporary +# 256-bit vectors on top. 
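The conditional-move blocks above implement a branchless selection between the computed result and the two inputs, keyed on all-ones masks meaning "this input is not the point at infinity" (derived from the Z coordinate with the neg/or/sradi idiom). A compact C sketch of both pieces (12-limb X|Y|Z points; assumes an arithmetic right shift on int64_t, like sradi; names are illustrative, not the vendor interface):

#include <stdint.h>

/* All-ones iff z != 0, i.e. the point is NOT at infinity. */
static uint64_t p256_not_infinity_mask(const uint64_t z[4])
{
    uint64_t v = z[0] | z[1] | z[2] | z[3];
    v |= (uint64_t)0 - v;                 /* sign bit set iff v != 0 (neg/or trick) */
    return (uint64_t)((int64_t)v >> 63);  /* broadcast it, as sradi does            */
}

/* out = in2 infinite ? in1 : (in1 infinite ? in2 : res), without branches. */
static void p256_select_result(uint64_t out[12], const uint64_t res[12],
                               const uint64_t in1[12], const uint64_t in2[12],
                               uint64_t in1_not_inf, uint64_t in2_not_inf)
{
    for (int i = 0; i < 12; i++) {
        uint64_t t = (in2[i] & ~in1_not_inf) | (res[i] & in1_not_inf);
        out[i]     = (in1[i] & ~in2_not_inf) | (t      & in2_not_inf);
    }
}
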
+my ($rp_real,$ap_real,$bp_real,$in1infty,$in2infty,$temp)=map("r$_",(16..21)); + +$code.=<<___; +.globl ecp_nistz256_point_add_affine +.align 5 +ecp_nistz256_point_add_affine: + stdu $sp,-$FRAME($sp) + mflr r0 + std r16,$FRAME-8*16($sp) + std r17,$FRAME-8*15($sp) + std r18,$FRAME-8*14($sp) + std r19,$FRAME-8*13($sp) + std r20,$FRAME-8*12($sp) + std r21,$FRAME-8*11($sp) + std r22,$FRAME-8*10($sp) + std r23,$FRAME-8*9($sp) + std r24,$FRAME-8*8($sp) + std r25,$FRAME-8*7($sp) + std r26,$FRAME-8*6($sp) + std r27,$FRAME-8*5($sp) + std r28,$FRAME-8*4($sp) + std r29,$FRAME-8*3($sp) + std r30,$FRAME-8*2($sp) + std r31,$FRAME-8*1($sp) + + li $poly1,-1 + srdi $poly1,$poly1,32 # 0x00000000ffffffff + li $poly3,1 + orc $poly3,$poly3,$poly1 # 0xffffffff00000001 + + mr $rp_real,$rp + mr $ap_real,$ap + mr $bp_real,$bp + + ld $a0,64($ap) # in1_z + ld $a1,72($ap) + ld $a2,80($ap) + ld $a3,88($ap) + or $t0,$a0,$a1 + or $t2,$a2,$a3 + or $in1infty,$t0,$t2 + neg $t0,$in1infty + or $in1infty,$in1infty,$t0 + sradi $in1infty,$in1infty,63 # !in1infty + + ld $acc0,0($bp) # in2_x + ld $acc1,8($bp) + ld $acc2,16($bp) + ld $acc3,24($bp) + ld $t0,32($bp) # in2_y + ld $t1,40($bp) + ld $t2,48($bp) + ld $t3,56($bp) + or $acc0,$acc0,$acc1 + or $acc2,$acc2,$acc3 + or $acc0,$acc0,$acc2 + or $t0,$t0,$t1 + or $t2,$t2,$t3 + or $t0,$t0,$t2 + or $in2infty,$acc0,$t0 + neg $t0,$in2infty + or $in2infty,$in2infty,$t0 + sradi $in2infty,$in2infty,63 # !in2infty + + addi $rp,$sp,$Z1sqr + bl __ecp_nistz256_sqr_mont # p256_sqr_mont(Z1sqr, in1_z); + + mr $a0,$acc0 + mr $a1,$acc1 + mr $a2,$acc2 + mr $a3,$acc3 + ld $bi,0($bp_real) + addi $bp,$bp_real,0 + addi $rp,$sp,$U2 + bl __ecp_nistz256_mul_mont # p256_mul_mont(U2, Z1sqr, in2_x); + + addi $bp,$ap_real,0 + ld $bi,64($ap_real) # forward load for p256_mul_mont + ld $a0,$Z1sqr+0($sp) + ld $a1,$Z1sqr+8($sp) + ld $a2,$Z1sqr+16($sp) + ld $a3,$Z1sqr+24($sp) + addi $rp,$sp,$H + bl __ecp_nistz256_sub_from # p256_sub(H, U2, in1_x); + + addi $bp,$ap_real,64 + addi $rp,$sp,$S2 + bl __ecp_nistz256_mul_mont # p256_mul_mont(S2, Z1sqr, in1_z); + + ld $bi,64($ap_real) + ld $a0,$H+0($sp) + ld $a1,$H+8($sp) + ld $a2,$H+16($sp) + ld $a3,$H+24($sp) + addi $bp,$ap_real,64 + addi $rp,$sp,$res_z + bl __ecp_nistz256_mul_mont # p256_mul_mont(res_z, H, in1_z); + + ld $bi,32($bp_real) + ld $a0,$S2+0($sp) + ld $a1,$S2+8($sp) + ld $a2,$S2+16($sp) + ld $a3,$S2+24($sp) + addi $bp,$bp_real,32 + addi $rp,$sp,$S2 + bl __ecp_nistz256_mul_mont # p256_mul_mont(S2, S2, in2_y); + + addi $bp,$ap_real,32 + ld $a0,$H+0($sp) # forward load for p256_sqr_mont + ld $a1,$H+8($sp) + ld $a2,$H+16($sp) + ld $a3,$H+24($sp) + addi $rp,$sp,$R + bl __ecp_nistz256_sub_from # p256_sub(R, S2, in1_y); + + addi $rp,$sp,$Hsqr + bl __ecp_nistz256_sqr_mont # p256_sqr_mont(Hsqr, H); + + ld $a0,$R+0($sp) + ld $a1,$R+8($sp) + ld $a2,$R+16($sp) + ld $a3,$R+24($sp) + addi $rp,$sp,$Rsqr + bl __ecp_nistz256_sqr_mont # p256_sqr_mont(Rsqr, R); + + ld $bi,$H($sp) + ld $a0,$Hsqr+0($sp) + ld $a1,$Hsqr+8($sp) + ld $a2,$Hsqr+16($sp) + ld $a3,$Hsqr+24($sp) + addi $bp,$sp,$H + addi $rp,$sp,$Hcub + bl __ecp_nistz256_mul_mont # p256_mul_mont(Hcub, Hsqr, H); + + ld $bi,0($ap_real) + ld $a0,$Hsqr+0($sp) + ld $a1,$Hsqr+8($sp) + ld $a2,$Hsqr+16($sp) + ld $a3,$Hsqr+24($sp) + addi $bp,$ap_real,0 + addi $rp,$sp,$U2 + bl __ecp_nistz256_mul_mont # p256_mul_mont(U2, in1_x, Hsqr); + + mr $t0,$acc0 + mr $t1,$acc1 + mr $t2,$acc2 + mr $t3,$acc3 + addi $rp,$sp,$Hsqr + bl __ecp_nistz256_add # p256_mul_by_2(Hsqr, U2); + + addi $bp,$sp,$Rsqr + addi $rp,$sp,$res_x + bl 
__ecp_nistz256_sub_morf # p256_sub(res_x, Rsqr, Hsqr); + + addi $bp,$sp,$Hcub + bl __ecp_nistz256_sub_from # p256_sub(res_x, res_x, Hcub); + + addi $bp,$sp,$U2 + ld $bi,32($ap_real) # forward load for p256_mul_mont + ld $a0,$Hcub+0($sp) + ld $a1,$Hcub+8($sp) + ld $a2,$Hcub+16($sp) + ld $a3,$Hcub+24($sp) + addi $rp,$sp,$res_y + bl __ecp_nistz256_sub_morf # p256_sub(res_y, U2, res_x); + + addi $bp,$ap_real,32 + addi $rp,$sp,$S2 + bl __ecp_nistz256_mul_mont # p256_mul_mont(S2, in1_y, Hcub); + + ld $bi,$R($sp) + ld $a0,$res_y+0($sp) + ld $a1,$res_y+8($sp) + ld $a2,$res_y+16($sp) + ld $a3,$res_y+24($sp) + addi $bp,$sp,$R + addi $rp,$sp,$res_y + bl __ecp_nistz256_mul_mont # p256_mul_mont(res_y, res_y, R); + + addi $bp,$sp,$S2 + bl __ecp_nistz256_sub_from # p256_sub(res_y, res_y, S2); + + ld $t0,0($bp_real) # in2 + ld $t1,8($bp_real) + ld $t2,16($bp_real) + ld $t3,24($bp_real) + ld $a0,$res_x+0($sp) # res + ld $a1,$res_x+8($sp) + ld $a2,$res_x+16($sp) + ld $a3,$res_x+24($sp) +___ +for($i=0;$i<64;$i+=32) { # conditional moves +$code.=<<___; + ld $acc0,$i+0($ap_real) # in1 + ld $acc1,$i+8($ap_real) + ld $acc2,$i+16($ap_real) + ld $acc3,$i+24($ap_real) + andc $t0,$t0,$in1infty + andc $t1,$t1,$in1infty + andc $t2,$t2,$in1infty + andc $t3,$t3,$in1infty + and $a0,$a0,$in1infty + and $a1,$a1,$in1infty + and $a2,$a2,$in1infty + and $a3,$a3,$in1infty + or $t0,$t0,$a0 + or $t1,$t1,$a1 + or $t2,$t2,$a2 + or $t3,$t3,$a3 + andc $acc0,$acc0,$in2infty + andc $acc1,$acc1,$in2infty + andc $acc2,$acc2,$in2infty + andc $acc3,$acc3,$in2infty + and $t0,$t0,$in2infty + and $t1,$t1,$in2infty + and $t2,$t2,$in2infty + and $t3,$t3,$in2infty + or $acc0,$acc0,$t0 + or $acc1,$acc1,$t1 + or $acc2,$acc2,$t2 + or $acc3,$acc3,$t3 +___ +$code.=<<___ if ($i==0); + ld $t0,32($bp_real) # in2 + ld $t1,40($bp_real) + ld $t2,48($bp_real) + ld $t3,56($bp_real) +___ +$code.=<<___ if ($i==32); + li $t0,1 # Lone_mont + not $t1,$poly1 + li $t2,-1 + not $t3,$poly3 +___ +$code.=<<___; + ld $a0,$res_x+$i+32($sp) + ld $a1,$res_x+$i+40($sp) + ld $a2,$res_x+$i+48($sp) + ld $a3,$res_x+$i+56($sp) + std $acc0,$i+0($rp_real) + std $acc1,$i+8($rp_real) + std $acc2,$i+16($rp_real) + std $acc3,$i+24($rp_real) +___ +} +$code.=<<___; + ld $acc0,$i+0($ap_real) # in1 + ld $acc1,$i+8($ap_real) + ld $acc2,$i+16($ap_real) + ld $acc3,$i+24($ap_real) + andc $t0,$t0,$in1infty + andc $t1,$t1,$in1infty + andc $t2,$t2,$in1infty + andc $t3,$t3,$in1infty + and $a0,$a0,$in1infty + and $a1,$a1,$in1infty + and $a2,$a2,$in1infty + and $a3,$a3,$in1infty + or $t0,$t0,$a0 + or $t1,$t1,$a1 + or $t2,$t2,$a2 + or $t3,$t3,$a3 + andc $acc0,$acc0,$in2infty + andc $acc1,$acc1,$in2infty + andc $acc2,$acc2,$in2infty + andc $acc3,$acc3,$in2infty + and $t0,$t0,$in2infty + and $t1,$t1,$in2infty + and $t2,$t2,$in2infty + and $t3,$t3,$in2infty + or $acc0,$acc0,$t0 + or $acc1,$acc1,$t1 + or $acc2,$acc2,$t2 + or $acc3,$acc3,$t3 + std $acc0,$i+0($rp_real) + std $acc1,$i+8($rp_real) + std $acc2,$i+16($rp_real) + std $acc3,$i+24($rp_real) + + mtlr r0 + ld r16,$FRAME-8*16($sp) + ld r17,$FRAME-8*15($sp) + ld r18,$FRAME-8*14($sp) + ld r19,$FRAME-8*13($sp) + ld r20,$FRAME-8*12($sp) + ld r21,$FRAME-8*11($sp) + ld r22,$FRAME-8*10($sp) + ld r23,$FRAME-8*9($sp) + ld r24,$FRAME-8*8($sp) + ld r25,$FRAME-8*7($sp) + ld r26,$FRAME-8*6($sp) + ld r27,$FRAME-8*5($sp) + ld r28,$FRAME-8*4($sp) + ld r29,$FRAME-8*3($sp) + ld r30,$FRAME-8*2($sp) + ld r31,$FRAME-8*1($sp) + addi $sp,$sp,$FRAME + blr + .long 0 + .byte 0,12,4,0,0x80,16,3,0 + .long 0 +.size ecp_nistz256_point_add_affine,.-ecp_nistz256_point_add_affine 
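The "Lone_mont" constant loaded above as {1, ~poly1, -1, ~poly3} is the number 1 in the Montgomery domain, i.e. 2^256 mod p, substituted for the missing Z coordinate of the affine in2. A small runnable check that those four words plus p add up to exactly 2^256 (poly1/poly3 copied from the setup above; assumes __uint128_t):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
    const uint64_t poly1 = 0x00000000ffffffffULL, poly3 = 0xffffffff00000001ULL;
    const uint64_t p[4]  = { 0xffffffffffffffffULL, poly1, 0, poly3 };
    const uint64_t one_mont[4] = { 1, ~poly1, 0xffffffffffffffffULL, ~poly3 };

    __uint128_t c = 0;
    int ok = 1;
    for (int i = 0; i < 4; i++) {
        c += (__uint128_t)one_mont[i] + p[i];
        ok &= ((uint64_t)c == 0);   /* every limb of one_mont + p must be zero */
        c >>= 64;
    }
    ok &= (c == 1);                 /* ...with a single carry out of 2^256     */
    printf("one_mont + p == 2^256: %s\n", ok ? "yes" : "no");
    return ok ? 0 : 1;
}
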
+___ +} +if (1) { +my ($ordk,$ord0,$ord1,$t4) = map("r$_",(18..21)); +my ($ord2,$ord3,$zr) = ($poly1,$poly3,"r0"); + +$code.=<<___; +######################################################################## +# void ecp_nistz256_ord_mul_mont(uint64_t res[4], uint64_t a[4], +# uint64_t b[4]); +.globl ecp_nistz256_ord_mul_mont +.align 5 +ecp_nistz256_ord_mul_mont: + stdu $sp,-160($sp) + std r18,48($sp) + std r19,56($sp) + std r20,64($sp) + std r21,72($sp) + std r22,80($sp) + std r23,88($sp) + std r24,96($sp) + std r25,104($sp) + std r26,112($sp) + std r27,120($sp) + std r28,128($sp) + std r29,136($sp) + std r30,144($sp) + std r31,152($sp) + + ld $a0,0($ap) + ld $bi,0($bp) + ld $a1,8($ap) + ld $a2,16($ap) + ld $a3,24($ap) + + lis $ordk,0xccd1 + lis $ord0,0xf3b9 + lis $ord1,0xbce6 + ori $ordk,$ordk,0xc8aa + ori $ord0,$ord0,0xcac2 + ori $ord1,$ord1,0xfaad + sldi $ordk,$ordk,32 + sldi $ord0,$ord0,32 + sldi $ord1,$ord1,32 + oris $ordk,$ordk,0xee00 + oris $ord0,$ord0,0xfc63 + oris $ord1,$ord1,0xa717 + ori $ordk,$ordk,0xbc4f # 0xccd1c8aaee00bc4f + ori $ord0,$ord0,0x2551 # 0xf3b9cac2fc632551 + ori $ord1,$ord1,0x9e84 # 0xbce6faada7179e84 + li $ord2,-1 # 0xffffffffffffffff + sldi $ord3,$ord2,32 # 0xffffffff00000000 + li $zr,0 + + mulld $acc0,$a0,$bi # a[0]*b[0] + mulhdu $t0,$a0,$bi + + mulld $acc1,$a1,$bi # a[1]*b[0] + mulhdu $t1,$a1,$bi + + mulld $acc2,$a2,$bi # a[2]*b[0] + mulhdu $t2,$a2,$bi + + mulld $acc3,$a3,$bi # a[3]*b[0] + mulhdu $acc4,$a3,$bi + + mulld $t4,$acc0,$ordk + + addc $acc1,$acc1,$t0 # accumulate high parts of multiplication + adde $acc2,$acc2,$t1 + adde $acc3,$acc3,$t2 + addze $acc4,$acc4 + li $acc5,0 +___ +for ($i=1;$i<4;$i++) { + ################################################################ + # ffff0000.ffffffff.yyyyyyyy.zzzzzzzz + # * abcdefgh + # + xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx + # + # Now observing that ff..ff*x = (2^n-1)*x = 2^n*x-x, we + # rewrite above as: + # + # xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx + # - 0000abcd.efgh0000.abcdefgh.00000000.00000000 + # + abcdefgh.abcdefgh.yzayzbyz.cyzdyzey.zfyzgyzh +$code.=<<___; + ld $bi,8*$i($bp) # b[i] + + sldi $t0,$t4,32 + subfc $acc2,$t4,$acc2 + srdi $t1,$t4,32 + subfe $acc3,$t0,$acc3 + subfe $acc4,$t1,$acc4 + subfe $acc5,$zr,$acc5 + + addic $t0,$acc0,-1 # discarded + mulhdu $t1,$ord0,$t4 + mulld $t2,$ord1,$t4 + mulhdu $t3,$ord1,$t4 + + adde $t2,$t2,$t1 + mulld $t0,$a0,$bi + addze $t3,$t3 + mulld $t1,$a1,$bi + + addc $acc0,$acc1,$t2 + mulld $t2,$a2,$bi + adde $acc1,$acc2,$t3 + mulld $t3,$a3,$bi + adde $acc2,$acc3,$t4 + adde $acc3,$acc4,$t4 + addze $acc4,$acc5 + + addc $acc0,$acc0,$t0 # accumulate low parts + mulhdu $t0,$a0,$bi + adde $acc1,$acc1,$t1 + mulhdu $t1,$a1,$bi + adde $acc2,$acc2,$t2 + mulhdu $t2,$a2,$bi + adde $acc3,$acc3,$t3 + mulhdu $t3,$a3,$bi + addze $acc4,$acc4 + mulld $t4,$acc0,$ordk + addc $acc1,$acc1,$t0 # accumulate high parts + adde $acc2,$acc2,$t1 + adde $acc3,$acc3,$t2 + adde $acc4,$acc4,$t3 + addze $acc5,$zr +___ +} +$code.=<<___; + sldi $t0,$t4,32 # last reduction + subfc $acc2,$t4,$acc2 + srdi $t1,$t4,32 + subfe $acc3,$t0,$acc3 + subfe $acc4,$t1,$acc4 + subfe $acc5,$zr,$acc5 + + addic $t0,$acc0,-1 # discarded + mulhdu $t1,$ord0,$t4 + mulld $t2,$ord1,$t4 + mulhdu $t3,$ord1,$t4 + + adde $t2,$t2,$t1 + addze $t3,$t3 + + addc $acc0,$acc1,$t2 + adde $acc1,$acc2,$t3 + adde $acc2,$acc3,$t4 + adde $acc3,$acc4,$t4 + addze $acc4,$acc5 + + subfc $acc0,$ord0,$acc0 # ret -= modulus + subfe $acc1,$ord1,$acc1 + subfe $acc2,$ord2,$acc2 + subfe $acc3,$ord3,$acc3 + subfe $acc4,$zr,$acc4 + + and 
$t0,$ord0,$acc4 + and $t1,$ord1,$acc4 + addc $acc0,$acc0,$t0 # ret += modulus if borrow + and $t3,$ord3,$acc4 + adde $acc1,$acc1,$t1 + adde $acc2,$acc2,$acc4 + adde $acc3,$acc3,$t3 + + std $acc0,0($rp) + std $acc1,8($rp) + std $acc2,16($rp) + std $acc3,24($rp) + + ld r18,48($sp) + ld r19,56($sp) + ld r20,64($sp) + ld r21,72($sp) + ld r22,80($sp) + ld r23,88($sp) + ld r24,96($sp) + ld r25,104($sp) + ld r26,112($sp) + ld r27,120($sp) + ld r28,128($sp) + ld r29,136($sp) + ld r30,144($sp) + ld r31,152($sp) + addi $sp,$sp,160 + blr + .long 0 + .byte 0,12,4,0,0x80,14,3,0 + .long 0 +.size ecp_nistz256_ord_mul_mont,.-ecp_nistz256_ord_mul_mont + +################################################################################ +# void ecp_nistz256_ord_sqr_mont(uint64_t res[4], uint64_t a[4], +# int rep); +.globl ecp_nistz256_ord_sqr_mont +.align 5 +ecp_nistz256_ord_sqr_mont: + stdu $sp,-160($sp) + std r18,48($sp) + std r19,56($sp) + std r20,64($sp) + std r21,72($sp) + std r22,80($sp) + std r23,88($sp) + std r24,96($sp) + std r25,104($sp) + std r26,112($sp) + std r27,120($sp) + std r28,128($sp) + std r29,136($sp) + std r30,144($sp) + std r31,152($sp) + + mtctr $bp + + ld $a0,0($ap) + ld $a1,8($ap) + ld $a2,16($ap) + ld $a3,24($ap) + + lis $ordk,0xccd1 + lis $ord0,0xf3b9 + lis $ord1,0xbce6 + ori $ordk,$ordk,0xc8aa + ori $ord0,$ord0,0xcac2 + ori $ord1,$ord1,0xfaad + sldi $ordk,$ordk,32 + sldi $ord0,$ord0,32 + sldi $ord1,$ord1,32 + oris $ordk,$ordk,0xee00 + oris $ord0,$ord0,0xfc63 + oris $ord1,$ord1,0xa717 + ori $ordk,$ordk,0xbc4f # 0xccd1c8aaee00bc4f + ori $ord0,$ord0,0x2551 # 0xf3b9cac2fc632551 + ori $ord1,$ord1,0x9e84 # 0xbce6faada7179e84 + li $ord2,-1 # 0xffffffffffffffff + sldi $ord3,$ord2,32 # 0xffffffff00000000 + li $zr,0 + b .Loop_ord_sqr + +.align 5 +.Loop_ord_sqr: + ################################################################ + # | | | | | |a1*a0| | + # | | | | |a2*a0| | | + # | |a3*a2|a3*a0| | | | + # | | | |a2*a1| | | | + # | | |a3*a1| | | | | + # *| | | | | | | | 2| + # +|a3*a3|a2*a2|a1*a1|a0*a0| + # |--+--+--+--+--+--+--+--| + # |A7|A6|A5|A4|A3|A2|A1|A0|, where Ax is $accx, i.e. follow $accx + # + # "can't overflow" below mark carrying into high part of + # multiplication result, which can't overflow, because it + # can never be all ones. 
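The $ordk multiplier assembled above should be -1/ord mod 2^64 for the P-256 group order, which is what lets `mulld $t4,$acc0,$ordk` produce a digit that clears the low limb in each reduction. A quick runnable check of that relationship (constants copied from the li/oris/ori sequences; the interpretation is the standard Montgomery setup, not something this diff states explicitly):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
    const uint64_t ord0 = 0xf3b9cac2fc632551ULL;  /* low limb of the group order */
    const uint64_t ordk = 0xccd1c8aaee00bc4fULL;  /* the "magic" k               */

    uint64_t prod = ord0 * ordk;                  /* low 64 bits only            */
    printf("ord0*ordk mod 2^64 = 0x%016llx (expect 0xffffffffffffffff)\n",
           (unsigned long long)prod);
    return prod == UINT64_MAX ? 0 : 1;
}
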
+ + mulld $acc1,$a1,$a0 # a[1]*a[0] + mulhdu $t1,$a1,$a0 + mulld $acc2,$a2,$a0 # a[2]*a[0] + mulhdu $t2,$a2,$a0 + mulld $acc3,$a3,$a0 # a[3]*a[0] + mulhdu $acc4,$a3,$a0 + + addc $acc2,$acc2,$t1 # accumulate high parts of multiplication + mulld $t0,$a2,$a1 # a[2]*a[1] + mulhdu $t1,$a2,$a1 + adde $acc3,$acc3,$t2 + mulld $t2,$a3,$a1 # a[3]*a[1] + mulhdu $t3,$a3,$a1 + addze $acc4,$acc4 # can't overflow + + mulld $acc5,$a3,$a2 # a[3]*a[2] + mulhdu $acc6,$a3,$a2 + + addc $t1,$t1,$t2 # accumulate high parts of multiplication + mulld $acc0,$a0,$a0 # a[0]*a[0] + addze $t2,$t3 # can't overflow + + addc $acc3,$acc3,$t0 # accumulate low parts of multiplication + mulhdu $a0,$a0,$a0 + adde $acc4,$acc4,$t1 + mulld $t1,$a1,$a1 # a[1]*a[1] + adde $acc5,$acc5,$t2 + mulhdu $a1,$a1,$a1 + addze $acc6,$acc6 # can't overflow + + addc $acc1,$acc1,$acc1 # acc[1-6]*=2 + mulld $t2,$a2,$a2 # a[2]*a[2] + adde $acc2,$acc2,$acc2 + mulhdu $a2,$a2,$a2 + adde $acc3,$acc3,$acc3 + mulld $t3,$a3,$a3 # a[3]*a[3] + adde $acc4,$acc4,$acc4 + mulhdu $a3,$a3,$a3 + adde $acc5,$acc5,$acc5 + adde $acc6,$acc6,$acc6 + addze $acc7,$zr + + addc $acc1,$acc1,$a0 # +a[i]*a[i] + mulld $t4,$acc0,$ordk + adde $acc2,$acc2,$t1 + adde $acc3,$acc3,$a1 + adde $acc4,$acc4,$t2 + adde $acc5,$acc5,$a2 + adde $acc6,$acc6,$t3 + adde $acc7,$acc7,$a3 +___ +for($i=0; $i<4; $i++) { # reductions +$code.=<<___; + addic $t0,$acc0,-1 # discarded + mulhdu $t1,$ord0,$t4 + mulld $t2,$ord1,$t4 + mulhdu $t3,$ord1,$t4 + + adde $t2,$t2,$t1 + addze $t3,$t3 + + addc $acc0,$acc1,$t2 + adde $acc1,$acc2,$t3 + adde $acc2,$acc3,$t4 + adde $acc3,$zr,$t4 # can't overflow +___ +$code.=<<___ if ($i<3); + mulld $t3,$acc0,$ordk +___ +$code.=<<___; + sldi $t0,$t4,32 + subfc $acc1,$t4,$acc1 + srdi $t1,$t4,32 + subfe $acc2,$t0,$acc2 + subfe $acc3,$t1,$acc3 # can't borrow +___ + ($t3,$t4) = ($t4,$t3); +} +$code.=<<___; + addc $acc0,$acc0,$acc4 # accumulate upper half + adde $acc1,$acc1,$acc5 + adde $acc2,$acc2,$acc6 + adde $acc3,$acc3,$acc7 + addze $acc4,$zr + + subfc $acc0,$ord0,$acc0 # ret -= modulus + subfe $acc1,$ord1,$acc1 + subfe $acc2,$ord2,$acc2 + subfe $acc3,$ord3,$acc3 + subfe $acc4,$zr,$acc4 + + and $t0,$ord0,$acc4 + and $t1,$ord1,$acc4 + addc $a0,$acc0,$t0 # ret += modulus if borrow + and $t3,$ord3,$acc4 + adde $a1,$acc1,$t1 + adde $a2,$acc2,$acc4 + adde $a3,$acc3,$t3 + + bdnz .Loop_ord_sqr + + std $a0,0($rp) + std $a1,8($rp) + std $a2,16($rp) + std $a3,24($rp) + + ld r18,48($sp) + ld r19,56($sp) + ld r20,64($sp) + ld r21,72($sp) + ld r22,80($sp) + ld r23,88($sp) + ld r24,96($sp) + ld r25,104($sp) + ld r26,112($sp) + ld r27,120($sp) + ld r28,128($sp) + ld r29,136($sp) + ld r30,144($sp) + ld r31,152($sp) + addi $sp,$sp,160 + blr + .long 0 + .byte 0,12,4,0,0x80,14,3,0 + .long 0 +.size ecp_nistz256_ord_sqr_mont,.-ecp_nistz256_ord_sqr_mont +___ +} } + +######################################################################## +# scatter-gather subroutines +{ +my ($out,$inp,$index,$mask)=map("r$_",(3..7)); +$code.=<<___; +######################################################################## +# void ecp_nistz256_scatter_w5(void *out, const P256_POINT *inp, +# int index); +.globl ecp_nistz256_scatter_w5 +.align 4 +ecp_nistz256_scatter_w5: + slwi $index,$index,2 + add $out,$out,$index + + ld r8, 0($inp) # X + ld r9, 8($inp) + ld r10,16($inp) + ld r11,24($inp) + + stw r8, 64*0-4($out) + srdi r8, r8, 32 + stw r9, 64*1-4($out) + srdi r9, r9, 32 + stw r10,64*2-4($out) + srdi r10,r10,32 + stw r11,64*3-4($out) + srdi r11,r11,32 + stw r8, 64*4-4($out) + stw r9, 64*5-4($out) + stw 
r10,64*6-4($out) + stw r11,64*7-4($out) + addi $out,$out,64*8 + + ld r8, 32($inp) # Y + ld r9, 40($inp) + ld r10,48($inp) + ld r11,56($inp) + + stw r8, 64*0-4($out) + srdi r8, r8, 32 + stw r9, 64*1-4($out) + srdi r9, r9, 32 + stw r10,64*2-4($out) + srdi r10,r10,32 + stw r11,64*3-4($out) + srdi r11,r11,32 + stw r8, 64*4-4($out) + stw r9, 64*5-4($out) + stw r10,64*6-4($out) + stw r11,64*7-4($out) + addi $out,$out,64*8 + + ld r8, 64($inp) # Z + ld r9, 72($inp) + ld r10,80($inp) + ld r11,88($inp) + + stw r8, 64*0-4($out) + srdi r8, r8, 32 + stw r9, 64*1-4($out) + srdi r9, r9, 32 + stw r10,64*2-4($out) + srdi r10,r10,32 + stw r11,64*3-4($out) + srdi r11,r11,32 + stw r8, 64*4-4($out) + stw r9, 64*5-4($out) + stw r10,64*6-4($out) + stw r11,64*7-4($out) + + blr + .long 0 + .byte 0,12,0x14,0,0,0,3,0 + .long 0 +.size ecp_nistz256_scatter_w5,.-ecp_nistz256_scatter_w5 + +######################################################################## +# void ecp_nistz256_gather_w5(P256_POINT *out, const void *inp, +# int index); +.globl ecp_nistz256_gather_w5 +.align 4 +ecp_nistz256_gather_w5: + neg r0,$index + sradi r0,r0,63 + + add $index,$index,r0 + slwi $index,$index,2 + add $inp,$inp,$index + + lwz r5, 64*0($inp) + lwz r6, 64*1($inp) + lwz r7, 64*2($inp) + lwz r8, 64*3($inp) + lwz r9, 64*4($inp) + lwz r10,64*5($inp) + lwz r11,64*6($inp) + lwz r12,64*7($inp) + addi $inp,$inp,64*8 + sldi r9, r9, 32 + sldi r10,r10,32 + sldi r11,r11,32 + sldi r12,r12,32 + or r5,r5,r9 + or r6,r6,r10 + or r7,r7,r11 + or r8,r8,r12 + and r5,r5,r0 + and r6,r6,r0 + and r7,r7,r0 + and r8,r8,r0 + std r5,0($out) # X + std r6,8($out) + std r7,16($out) + std r8,24($out) + + lwz r5, 64*0($inp) + lwz r6, 64*1($inp) + lwz r7, 64*2($inp) + lwz r8, 64*3($inp) + lwz r9, 64*4($inp) + lwz r10,64*5($inp) + lwz r11,64*6($inp) + lwz r12,64*7($inp) + addi $inp,$inp,64*8 + sldi r9, r9, 32 + sldi r10,r10,32 + sldi r11,r11,32 + sldi r12,r12,32 + or r5,r5,r9 + or r6,r6,r10 + or r7,r7,r11 + or r8,r8,r12 + and r5,r5,r0 + and r6,r6,r0 + and r7,r7,r0 + and r8,r8,r0 + std r5,32($out) # Y + std r6,40($out) + std r7,48($out) + std r8,56($out) + + lwz r5, 64*0($inp) + lwz r6, 64*1($inp) + lwz r7, 64*2($inp) + lwz r8, 64*3($inp) + lwz r9, 64*4($inp) + lwz r10,64*5($inp) + lwz r11,64*6($inp) + lwz r12,64*7($inp) + sldi r9, r9, 32 + sldi r10,r10,32 + sldi r11,r11,32 + sldi r12,r12,32 + or r5,r5,r9 + or r6,r6,r10 + or r7,r7,r11 + or r8,r8,r12 + and r5,r5,r0 + and r6,r6,r0 + and r7,r7,r0 + and r8,r8,r0 + std r5,64($out) # Z + std r6,72($out) + std r7,80($out) + std r8,88($out) + + blr + .long 0 + .byte 0,12,0x14,0,0,0,3,0 + .long 0 +.size ecp_nistz256_gather_w5,.-ecp_nistz256_gather_w5 + +######################################################################## +# void ecp_nistz256_scatter_w7(void *out, const P256_POINT_AFFINE *inp, +# int index); +.globl ecp_nistz256_scatter_w7 +.align 4 +ecp_nistz256_scatter_w7: + li r0,8 + mtctr r0 + add $out,$out,$index + subi $inp,$inp,8 + +.Loop_scatter_w7: + ldu r0,8($inp) + stb r0,64*0($out) + srdi r0,r0,8 + stb r0,64*1($out) + srdi r0,r0,8 + stb r0,64*2($out) + srdi r0,r0,8 + stb r0,64*3($out) + srdi r0,r0,8 + stb r0,64*4($out) + srdi r0,r0,8 + stb r0,64*5($out) + srdi r0,r0,8 + stb r0,64*6($out) + srdi r0,r0,8 + stb r0,64*7($out) + addi $out,$out,64*8 + bdnz .Loop_scatter_w7 + + blr + .long 0 + .byte 0,12,0x14,0,0,0,3,0 + .long 0 +.size ecp_nistz256_scatter_w7,.-ecp_nistz256_scatter_w7 + +######################################################################## +# void ecp_nistz256_gather_w7(P256_POINT_AFFINE *out, const 
void *inp, +# int index); +.globl ecp_nistz256_gather_w7 +.align 4 +ecp_nistz256_gather_w7: + li r0,8 + mtctr r0 + neg r0,$index + sradi r0,r0,63 + + add $index,$index,r0 + add $inp,$inp,$index + subi $out,$out,8 + +.Loop_gather_w7: + lbz r5, 64*0($inp) + lbz r6, 64*1($inp) + lbz r7, 64*2($inp) + lbz r8, 64*3($inp) + lbz r9, 64*4($inp) + lbz r10,64*5($inp) + lbz r11,64*6($inp) + lbz r12,64*7($inp) + addi $inp,$inp,64*8 + + sldi r6, r6, 8 + sldi r7, r7, 16 + sldi r8, r8, 24 + sldi r9, r9, 32 + sldi r10,r10,40 + sldi r11,r11,48 + sldi r12,r12,56 + + or r5,r5,r6 + or r7,r7,r8 + or r9,r9,r10 + or r11,r11,r12 + or r5,r5,r7 + or r9,r9,r11 + or r5,r5,r9 + and r5,r5,r0 + stdu r5,8($out) + bdnz .Loop_gather_w7 + + blr + .long 0 + .byte 0,12,0x14,0,0,0,3,0 + .long 0 +.size ecp_nistz256_gather_w7,.-ecp_nistz256_gather_w7 +___ +} + +foreach (split("\n",$code)) { + s/\`([^\`]*)\`/eval $1/ge; + + print $_,"\n"; +} +close STDOUT; # enforce flush diff --git a/crypto/ec/asm/ecp_nistz256-sparcv9.pl b/crypto/ec/asm/ecp_nistz256-sparcv9.pl new file mode 100755 index 000000000000..0a4def6e2bf6 --- /dev/null +++ b/crypto/ec/asm/ecp_nistz256-sparcv9.pl @@ -0,0 +1,3061 @@ +#! /usr/bin/env perl +# Copyright 2015-2018 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + + +# ==================================================================== +# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== +# +# ECP_NISTZ256 module for SPARCv9. +# +# February 2015. +# +# Original ECP_NISTZ256 submission targeting x86_64 is detailed in +# http://eprint.iacr.org/2013/816. In the process of adaptation +# original .c module was made 32-bit savvy in order to make this +# implementation possible. +# +# with/without -DECP_NISTZ256_ASM +# UltraSPARC III +12-18% +# SPARC T4 +99-550% (+66-150% on 32-bit Solaris) +# +# Ranges denote minimum and maximum improvement coefficients depending +# on benchmark. Lower coefficients are for ECDSA sign, server-side +# operation. Keep in mind that +200% means 3x improvement. + +$output = pop; +open STDOUT,">$output"; + +$code.=<<___; +#include "sparc_arch.h" + +#define LOCALS (STACK_BIAS+STACK_FRAME) +#ifdef __arch64__ +.register %g2,#scratch +.register %g3,#scratch +# define STACK64_FRAME STACK_FRAME +# define LOCALS64 LOCALS +#else +# define STACK64_FRAME (2047+192) +# define LOCALS64 STACK64_FRAME +#endif + +.section ".text",#alloc,#execinstr +___ +######################################################################## +# Convert ecp_nistz256_table.c to layout expected by ecp_nistz_gather_w7 +# +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +open TABLE,"<ecp_nistz256_table.c" or +open TABLE,"<${dir}../ecp_nistz256_table.c" or +die "failed to open ecp_nistz256_table.c:",$!; + +use integer; + +foreach(<TABLE>) { + s/TOBN\(\s*(0x[0-9a-f]+),\s*(0x[0-9a-f]+)\s*\)/push @arr,hex($2),hex($1)/geo; +} +close TABLE; + +# See ecp_nistz256_table.c for explanation for why it's 64*16*37. +# 64*16*37-1 is because $#arr returns last valid index or @arr, not +# amount of elements. 
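The scatter_w7/gather_w7 pair above uses a byte-sliced table layout: within one 4096-byte window block, byte b of 64-bit word w of point column c sits at offset w*512 + b*64 + c, so each of the gather's strided byte loads lands at the same cache-line offsets regardless of the column, which is the usual rationale for this kind of layout. The SPARC file below converts ecp_nistz256_table.c into the same layout. A C sketch of the gather side under that reading (index 0 means "no point" and returns zeros, matching the neg/sradi masking; the offset formula is derived from the scatter code, not stated in the diff):

#include <stddef.h>
#include <stdint.h>

/* Gather the 8 words (X[0..3], Y[0..3]) of point column index-1 from one
 * byte-sliced window block; index == 0 yields all zeros. Sketch only. */
static void gather_w7_sketch(uint64_t out[8], const unsigned char tbl[64 * 64], int index)
{
    uint64_t mask = (uint64_t)0 - (uint64_t)(index != 0);
    size_t   col  = (size_t)(index - (index != 0));   /* index-1, or 0 */

    for (int w = 0; w < 8; w++) {
        uint64_t v = 0;
        for (int b = 0; b < 8; b++)
            v |= (uint64_t)tbl[(size_t)w * 512 + (size_t)b * 64 + col] << (8 * b);
        out[w] = v & mask;
    }
}
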
+die "insane number of elements" if ($#arr != 64*16*37-1); + +$code.=<<___; +.globl ecp_nistz256_precomputed +.align 4096 +ecp_nistz256_precomputed: +___ +######################################################################## +# this conversion smashes P256_POINT_AFFINE by individual bytes with +# 64 byte interval, similar to +# 1111222233334444 +# 1234123412341234 +for(1..37) { + @tbl = splice(@arr,0,64*16); + for($i=0;$i<64;$i++) { + undef @line; + for($j=0;$j<64;$j++) { + push @line,(@tbl[$j*16+$i/4]>>(($i%4)*8))&0xff; + } + $code.=".byte\t"; + $code.=join(',',map { sprintf "0x%02x",$_} @line); + $code.="\n"; + } +} + +{{{ +my ($rp,$ap,$bp)=map("%i$_",(0..2)); +my @acc=map("%l$_",(0..7)); +my ($t0,$t1,$t2,$t3,$t4,$t5,$t6,$t7)=(map("%o$_",(0..5)),"%g4","%g5"); +my ($bi,$a0,$mask,$carry)=(map("%i$_",(3..5)),"%g1"); +my ($rp_real,$ap_real)=("%g2","%g3"); + +$code.=<<___; +.type ecp_nistz256_precomputed,#object +.size ecp_nistz256_precomputed,.-ecp_nistz256_precomputed +.align 64 +.LRR: ! 2^512 mod P precomputed for NIST P256 polynomial +.long 0x00000003, 0x00000000, 0xffffffff, 0xfffffffb +.long 0xfffffffe, 0xffffffff, 0xfffffffd, 0x00000004 +.Lone: +.long 1,0,0,0,0,0,0,0 +.asciz "ECP_NISTZ256 for SPARCv9, CRYPTOGAMS by <appro\@openssl.org>" + +! void ecp_nistz256_to_mont(BN_ULONG %i0[8],const BN_ULONG %i1[8]); +.globl ecp_nistz256_to_mont +.align 64 +ecp_nistz256_to_mont: + save %sp,-STACK_FRAME,%sp + nop +1: call .+8 + add %o7,.LRR-1b,$bp + call __ecp_nistz256_mul_mont + nop + ret + restore +.type ecp_nistz256_to_mont,#function +.size ecp_nistz256_to_mont,.-ecp_nistz256_to_mont + +! void ecp_nistz256_from_mont(BN_ULONG %i0[8],const BN_ULONG %i1[8]); +.globl ecp_nistz256_from_mont +.align 32 +ecp_nistz256_from_mont: + save %sp,-STACK_FRAME,%sp + nop +1: call .+8 + add %o7,.Lone-1b,$bp + call __ecp_nistz256_mul_mont + nop + ret + restore +.type ecp_nistz256_from_mont,#function +.size ecp_nistz256_from_mont,.-ecp_nistz256_from_mont + +! void ecp_nistz256_mul_mont(BN_ULONG %i0[8],const BN_ULONG %i1[8], +! const BN_ULONG %i2[8]); +.globl ecp_nistz256_mul_mont +.align 32 +ecp_nistz256_mul_mont: + save %sp,-STACK_FRAME,%sp + nop + call __ecp_nistz256_mul_mont + nop + ret + restore +.type ecp_nistz256_mul_mont,#function +.size ecp_nistz256_mul_mont,.-ecp_nistz256_mul_mont + +! void ecp_nistz256_sqr_mont(BN_ULONG %i0[8],const BN_ULONG %i2[8]); +.globl ecp_nistz256_sqr_mont +.align 32 +ecp_nistz256_sqr_mont: + save %sp,-STACK_FRAME,%sp + mov $ap,$bp + call __ecp_nistz256_mul_mont + nop + ret + restore +.type ecp_nistz256_sqr_mont,#function +.size ecp_nistz256_sqr_mont,.-ecp_nistz256_sqr_mont +___ + +######################################################################## +# Special thing to keep in mind is that $t0-$t7 hold 64-bit values, +# while all others are meant to keep 32. "Meant to" means that additions +# to @acc[0-7] do "contaminate" upper bits, but they are cleared before +# they can affect outcome (follow 'and' with $mask). Also keep in mind +# that addition with carry is addition with 32-bit carry, even though +# CPU is 64-bit. [Addition with 64-bit carry was introduced in T3, see +# below for VIS3 code paths.] + +$code.=<<___; +.align 32 +__ecp_nistz256_mul_mont: + ld [$bp+0],$bi ! b[0] + mov -1,$mask + ld [$ap+0],$a0 + srl $mask,0,$mask ! 0xffffffff + ld [$ap+4],$t1 + ld [$ap+8],$t2 + ld [$ap+12],$t3 + ld [$ap+16],$t4 + ld [$ap+20],$t5 + ld [$ap+24],$t6 + ld [$ap+28],$t7 + mulx $a0,$bi,$t0 ! 
a[0-7]*b[0], 64-bit results + mulx $t1,$bi,$t1 + mulx $t2,$bi,$t2 + mulx $t3,$bi,$t3 + mulx $t4,$bi,$t4 + mulx $t5,$bi,$t5 + mulx $t6,$bi,$t6 + mulx $t7,$bi,$t7 + srlx $t0,32,@acc[1] ! extract high parts + srlx $t1,32,@acc[2] + srlx $t2,32,@acc[3] + srlx $t3,32,@acc[4] + srlx $t4,32,@acc[5] + srlx $t5,32,@acc[6] + srlx $t6,32,@acc[7] + srlx $t7,32,@acc[0] ! "@acc[8]" + mov 0,$carry +___ +for($i=1;$i<8;$i++) { +$code.=<<___; + addcc @acc[1],$t1,@acc[1] ! accumulate high parts + ld [$bp+4*$i],$bi ! b[$i] + ld [$ap+4],$t1 ! re-load a[1-7] + addccc @acc[2],$t2,@acc[2] + addccc @acc[3],$t3,@acc[3] + ld [$ap+8],$t2 + ld [$ap+12],$t3 + addccc @acc[4],$t4,@acc[4] + addccc @acc[5],$t5,@acc[5] + ld [$ap+16],$t4 + ld [$ap+20],$t5 + addccc @acc[6],$t6,@acc[6] + addccc @acc[7],$t7,@acc[7] + ld [$ap+24],$t6 + ld [$ap+28],$t7 + addccc @acc[0],$carry,@acc[0] ! "@acc[8]" + addc %g0,%g0,$carry +___ + # Reduction iteration is normally performed by accumulating + # result of multiplication of modulus by "magic" digit [and + # omitting least significant word, which is guaranteed to + # be 0], but thanks to special form of modulus and "magic" + # digit being equal to least significant word, it can be + # performed with additions and subtractions alone. Indeed: + # + # ffff.0001.0000.0000.0000.ffff.ffff.ffff + # * abcd + # + xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.abcd + # + # Now observing that ff..ff*x = (2^n-1)*x = 2^n*x-x, we + # rewrite above as: + # + # xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.abcd + # + abcd.0000.abcd.0000.0000.abcd.0000.0000.0000 + # - abcd.0000.0000.0000.0000.0000.0000.abcd + # + # or marking redundant operations: + # + # xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.---- + # + abcd.0000.abcd.0000.0000.abcd.----.----.---- + # - abcd.----.----.----.----.----.----.---- + +$code.=<<___; + ! multiplication-less reduction + addcc @acc[3],$t0,@acc[3] ! r[3]+=r[0] + addccc @acc[4],%g0,@acc[4] ! r[4]+=0 + and @acc[1],$mask,@acc[1] + and @acc[2],$mask,@acc[2] + addccc @acc[5],%g0,@acc[5] ! r[5]+=0 + addccc @acc[6],$t0,@acc[6] ! r[6]+=r[0] + and @acc[3],$mask,@acc[3] + and @acc[4],$mask,@acc[4] + addccc @acc[7],%g0,@acc[7] ! r[7]+=0 + addccc @acc[0],$t0,@acc[0] ! r[8]+=r[0] "@acc[8]" + and @acc[5],$mask,@acc[5] + and @acc[6],$mask,@acc[6] + addc $carry,%g0,$carry ! top-most carry + subcc @acc[7],$t0,@acc[7] ! r[7]-=r[0] + subccc @acc[0],%g0,@acc[0] ! r[8]-=0 "@acc[8]" + subc $carry,%g0,$carry ! top-most carry + and @acc[7],$mask,@acc[7] + and @acc[0],$mask,@acc[0] ! "@acc[8]" +___ + push(@acc,shift(@acc)); # rotate registers to "omit" acc[0] +$code.=<<___; + mulx $a0,$bi,$t0 ! a[0-7]*b[$i], 64-bit results + mulx $t1,$bi,$t1 + mulx $t2,$bi,$t2 + mulx $t3,$bi,$t3 + mulx $t4,$bi,$t4 + mulx $t5,$bi,$t5 + mulx $t6,$bi,$t6 + mulx $t7,$bi,$t7 + add @acc[0],$t0,$t0 ! accumulate low parts, can't overflow + add @acc[1],$t1,$t1 + srlx $t0,32,@acc[1] ! extract high parts + add @acc[2],$t2,$t2 + srlx $t1,32,@acc[2] + add @acc[3],$t3,$t3 + srlx $t2,32,@acc[3] + add @acc[4],$t4,$t4 + srlx $t3,32,@acc[4] + add @acc[5],$t5,$t5 + srlx $t4,32,@acc[5] + add @acc[6],$t6,$t6 + srlx $t5,32,@acc[6] + add @acc[7],$t7,$t7 + srlx $t6,32,@acc[7] + srlx $t7,32,@acc[0] ! "@acc[8]" +___ +} +$code.=<<___; + addcc @acc[1],$t1,@acc[1] ! accumulate high parts + addccc @acc[2],$t2,@acc[2] + addccc @acc[3],$t3,@acc[3] + addccc @acc[4],$t4,@acc[4] + addccc @acc[5],$t5,@acc[5] + addccc @acc[6],$t6,@acc[6] + addccc @acc[7],$t7,@acc[7] + addccc @acc[0],$carry,@acc[0] ! "@acc[8]" + addc %g0,%g0,$carry + + addcc @acc[3],$t0,@acc[3] ! 
multiplication-less reduction + addccc @acc[4],%g0,@acc[4] + addccc @acc[5],%g0,@acc[5] + addccc @acc[6],$t0,@acc[6] + addccc @acc[7],%g0,@acc[7] + addccc @acc[0],$t0,@acc[0] ! "@acc[8]" + addc $carry,%g0,$carry + subcc @acc[7],$t0,@acc[7] + subccc @acc[0],%g0,@acc[0] ! "@acc[8]" + subc $carry,%g0,$carry ! top-most carry +___ + push(@acc,shift(@acc)); # rotate registers to omit acc[0] +$code.=<<___; + ! Final step is "if result > mod, subtract mod", but we do it + ! "other way around", namely subtract modulus from result + ! and if it borrowed, add modulus back. + + subcc @acc[0],-1,@acc[0] ! subtract modulus + subccc @acc[1],-1,@acc[1] + subccc @acc[2],-1,@acc[2] + subccc @acc[3],0,@acc[3] + subccc @acc[4],0,@acc[4] + subccc @acc[5],0,@acc[5] + subccc @acc[6],1,@acc[6] + subccc @acc[7],-1,@acc[7] + subc $carry,0,$carry ! broadcast borrow bit + + ! Note that because mod has special form, i.e. consists of + ! 0xffffffff, 1 and 0s, we can conditionally synthesize it by + ! using value of broadcasted borrow and the borrow bit itself. + ! To minimize dependency chain we first broadcast and then + ! extract the bit by negating (follow $bi). + + addcc @acc[0],$carry,@acc[0] ! add modulus or zero + addccc @acc[1],$carry,@acc[1] + neg $carry,$bi + st @acc[0],[$rp] + addccc @acc[2],$carry,@acc[2] + st @acc[1],[$rp+4] + addccc @acc[3],0,@acc[3] + st @acc[2],[$rp+8] + addccc @acc[4],0,@acc[4] + st @acc[3],[$rp+12] + addccc @acc[5],0,@acc[5] + st @acc[4],[$rp+16] + addccc @acc[6],$bi,@acc[6] + st @acc[5],[$rp+20] + addc @acc[7],$carry,@acc[7] + st @acc[6],[$rp+24] + retl + st @acc[7],[$rp+28] +.type __ecp_nistz256_mul_mont,#function +.size __ecp_nistz256_mul_mont,.-__ecp_nistz256_mul_mont + +! void ecp_nistz256_add(BN_ULONG %i0[8],const BN_ULONG %i1[8], +! const BN_ULONG %i2[8]); +.globl ecp_nistz256_add +.align 32 +ecp_nistz256_add: + save %sp,-STACK_FRAME,%sp + ld [$ap],@acc[0] + ld [$ap+4],@acc[1] + ld [$ap+8],@acc[2] + ld [$ap+12],@acc[3] + ld [$ap+16],@acc[4] + ld [$ap+20],@acc[5] + ld [$ap+24],@acc[6] + call __ecp_nistz256_add + ld [$ap+28],@acc[7] + ret + restore +.type ecp_nistz256_add,#function +.size ecp_nistz256_add,.-ecp_nistz256_add + +.align 32 +__ecp_nistz256_add: + ld [$bp+0],$t0 ! b[0] + ld [$bp+4],$t1 + ld [$bp+8],$t2 + ld [$bp+12],$t3 + addcc @acc[0],$t0,@acc[0] + ld [$bp+16],$t4 + ld [$bp+20],$t5 + addccc @acc[1],$t1,@acc[1] + ld [$bp+24],$t6 + ld [$bp+28],$t7 + addccc @acc[2],$t2,@acc[2] + addccc @acc[3],$t3,@acc[3] + addccc @acc[4],$t4,@acc[4] + addccc @acc[5],$t5,@acc[5] + addccc @acc[6],$t6,@acc[6] + addccc @acc[7],$t7,@acc[7] + addc %g0,%g0,$carry + +.Lreduce_by_sub: + + ! if a+b >= modulus, subtract modulus. + ! + ! But since comparison implies subtraction, we subtract + ! modulus and then add it back if subtraction borrowed. + + subcc @acc[0],-1,@acc[0] + subccc @acc[1],-1,@acc[1] + subccc @acc[2],-1,@acc[2] + subccc @acc[3], 0,@acc[3] + subccc @acc[4], 0,@acc[4] + subccc @acc[5], 0,@acc[5] + subccc @acc[6], 1,@acc[6] + subccc @acc[7],-1,@acc[7] + subc $carry,0,$carry + + ! Note that because mod has special form, i.e. consists of + ! 0xffffffff, 1 and 0s, we can conditionally synthesize it by + ! using value of borrow and its negative. + + addcc @acc[0],$carry,@acc[0] ! 
add synthesized modulus + addccc @acc[1],$carry,@acc[1] + neg $carry,$bi + st @acc[0],[$rp] + addccc @acc[2],$carry,@acc[2] + st @acc[1],[$rp+4] + addccc @acc[3],0,@acc[3] + st @acc[2],[$rp+8] + addccc @acc[4],0,@acc[4] + st @acc[3],[$rp+12] + addccc @acc[5],0,@acc[5] + st @acc[4],[$rp+16] + addccc @acc[6],$bi,@acc[6] + st @acc[5],[$rp+20] + addc @acc[7],$carry,@acc[7] + st @acc[6],[$rp+24] + retl + st @acc[7],[$rp+28] +.type __ecp_nistz256_add,#function +.size __ecp_nistz256_add,.-__ecp_nistz256_add + +! void ecp_nistz256_mul_by_2(BN_ULONG %i0[8],const BN_ULONG %i1[8]); +.globl ecp_nistz256_mul_by_2 +.align 32 +ecp_nistz256_mul_by_2: + save %sp,-STACK_FRAME,%sp + ld [$ap],@acc[0] + ld [$ap+4],@acc[1] + ld [$ap+8],@acc[2] + ld [$ap+12],@acc[3] + ld [$ap+16],@acc[4] + ld [$ap+20],@acc[5] + ld [$ap+24],@acc[6] + call __ecp_nistz256_mul_by_2 + ld [$ap+28],@acc[7] + ret + restore +.type ecp_nistz256_mul_by_2,#function +.size ecp_nistz256_mul_by_2,.-ecp_nistz256_mul_by_2 + +.align 32 +__ecp_nistz256_mul_by_2: + addcc @acc[0],@acc[0],@acc[0] ! a+a=2*a + addccc @acc[1],@acc[1],@acc[1] + addccc @acc[2],@acc[2],@acc[2] + addccc @acc[3],@acc[3],@acc[3] + addccc @acc[4],@acc[4],@acc[4] + addccc @acc[5],@acc[5],@acc[5] + addccc @acc[6],@acc[6],@acc[6] + addccc @acc[7],@acc[7],@acc[7] + b .Lreduce_by_sub + addc %g0,%g0,$carry +.type __ecp_nistz256_mul_by_2,#function +.size __ecp_nistz256_mul_by_2,.-__ecp_nistz256_mul_by_2 + +! void ecp_nistz256_mul_by_3(BN_ULONG %i0[8],const BN_ULONG %i1[8]); +.globl ecp_nistz256_mul_by_3 +.align 32 +ecp_nistz256_mul_by_3: + save %sp,-STACK_FRAME,%sp + ld [$ap],@acc[0] + ld [$ap+4],@acc[1] + ld [$ap+8],@acc[2] + ld [$ap+12],@acc[3] + ld [$ap+16],@acc[4] + ld [$ap+20],@acc[5] + ld [$ap+24],@acc[6] + call __ecp_nistz256_mul_by_3 + ld [$ap+28],@acc[7] + ret + restore +.type ecp_nistz256_mul_by_3,#function +.size ecp_nistz256_mul_by_3,.-ecp_nistz256_mul_by_3 + +.align 32 +__ecp_nistz256_mul_by_3: + addcc @acc[0],@acc[0],$t0 ! a+a=2*a + addccc @acc[1],@acc[1],$t1 + addccc @acc[2],@acc[2],$t2 + addccc @acc[3],@acc[3],$t3 + addccc @acc[4],@acc[4],$t4 + addccc @acc[5],@acc[5],$t5 + addccc @acc[6],@acc[6],$t6 + addccc @acc[7],@acc[7],$t7 + addc %g0,%g0,$carry + + subcc $t0,-1,$t0 ! .Lreduce_by_sub but without stores + subccc $t1,-1,$t1 + subccc $t2,-1,$t2 + subccc $t3, 0,$t3 + subccc $t4, 0,$t4 + subccc $t5, 0,$t5 + subccc $t6, 1,$t6 + subccc $t7,-1,$t7 + subc $carry,0,$carry + + addcc $t0,$carry,$t0 ! add synthesized modulus + addccc $t1,$carry,$t1 + neg $carry,$bi + addccc $t2,$carry,$t2 + addccc $t3,0,$t3 + addccc $t4,0,$t4 + addccc $t5,0,$t5 + addccc $t6,$bi,$t6 + addc $t7,$carry,$t7 + + addcc $t0,@acc[0],@acc[0] ! 2*a+a=3*a + addccc $t1,@acc[1],@acc[1] + addccc $t2,@acc[2],@acc[2] + addccc $t3,@acc[3],@acc[3] + addccc $t4,@acc[4],@acc[4] + addccc $t5,@acc[5],@acc[5] + addccc $t6,@acc[6],@acc[6] + addccc $t7,@acc[7],@acc[7] + b .Lreduce_by_sub + addc %g0,%g0,$carry +.type __ecp_nistz256_mul_by_3,#function +.size __ecp_nistz256_mul_by_3,.-__ecp_nistz256_mul_by_3 + +! void ecp_nistz256_sub(BN_ULONG %i0[8],const BN_ULONG %i1[8], +! const BN_ULONG %i2[8]); +.globl ecp_nistz256_sub +.align 32 +ecp_nistz256_sub: + save %sp,-STACK_FRAME,%sp + ld [$ap],@acc[0] + ld [$ap+4],@acc[1] + ld [$ap+8],@acc[2] + ld [$ap+12],@acc[3] + ld [$ap+16],@acc[4] + ld [$ap+20],@acc[5] + ld [$ap+24],@acc[6] + call __ecp_nistz256_sub_from + ld [$ap+28],@acc[7] + ret + restore +.type ecp_nistz256_sub,#function +.size ecp_nistz256_sub,.-ecp_nistz256_sub + +! 
void ecp_nistz256_neg(BN_ULONG %i0[8],const BN_ULONG %i1[8]); +.globl ecp_nistz256_neg +.align 32 +ecp_nistz256_neg: + save %sp,-STACK_FRAME,%sp + mov $ap,$bp + mov 0,@acc[0] + mov 0,@acc[1] + mov 0,@acc[2] + mov 0,@acc[3] + mov 0,@acc[4] + mov 0,@acc[5] + mov 0,@acc[6] + call __ecp_nistz256_sub_from + mov 0,@acc[7] + ret + restore +.type ecp_nistz256_neg,#function +.size ecp_nistz256_neg,.-ecp_nistz256_neg + +.align 32 +__ecp_nistz256_sub_from: + ld [$bp+0],$t0 ! b[0] + ld [$bp+4],$t1 + ld [$bp+8],$t2 + ld [$bp+12],$t3 + subcc @acc[0],$t0,@acc[0] + ld [$bp+16],$t4 + ld [$bp+20],$t5 + subccc @acc[1],$t1,@acc[1] + subccc @acc[2],$t2,@acc[2] + ld [$bp+24],$t6 + ld [$bp+28],$t7 + subccc @acc[3],$t3,@acc[3] + subccc @acc[4],$t4,@acc[4] + subccc @acc[5],$t5,@acc[5] + subccc @acc[6],$t6,@acc[6] + subccc @acc[7],$t7,@acc[7] + subc %g0,%g0,$carry ! broadcast borrow bit + +.Lreduce_by_add: + + ! if a-b borrows, add modulus. + ! + ! Note that because mod has special form, i.e. consists of + ! 0xffffffff, 1 and 0s, we can conditionally synthesize it by + ! using value of broadcasted borrow and the borrow bit itself. + ! To minimize dependency chain we first broadcast and then + ! extract the bit by negating (follow $bi). + + addcc @acc[0],$carry,@acc[0] ! add synthesized modulus + addccc @acc[1],$carry,@acc[1] + neg $carry,$bi + st @acc[0],[$rp] + addccc @acc[2],$carry,@acc[2] + st @acc[1],[$rp+4] + addccc @acc[3],0,@acc[3] + st @acc[2],[$rp+8] + addccc @acc[4],0,@acc[4] + st @acc[3],[$rp+12] + addccc @acc[5],0,@acc[5] + st @acc[4],[$rp+16] + addccc @acc[6],$bi,@acc[6] + st @acc[5],[$rp+20] + addc @acc[7],$carry,@acc[7] + st @acc[6],[$rp+24] + retl + st @acc[7],[$rp+28] +.type __ecp_nistz256_sub_from,#function +.size __ecp_nistz256_sub_from,.-__ecp_nistz256_sub_from + +.align 32 +__ecp_nistz256_sub_morf: + ld [$bp+0],$t0 ! b[0] + ld [$bp+4],$t1 + ld [$bp+8],$t2 + ld [$bp+12],$t3 + subcc $t0,@acc[0],@acc[0] + ld [$bp+16],$t4 + ld [$bp+20],$t5 + subccc $t1,@acc[1],@acc[1] + subccc $t2,@acc[2],@acc[2] + ld [$bp+24],$t6 + ld [$bp+28],$t7 + subccc $t3,@acc[3],@acc[3] + subccc $t4,@acc[4],@acc[4] + subccc $t5,@acc[5],@acc[5] + subccc $t6,@acc[6],@acc[6] + subccc $t7,@acc[7],@acc[7] + b .Lreduce_by_add + subc %g0,%g0,$carry ! broadcast borrow bit +.type __ecp_nistz256_sub_morf,#function +.size __ecp_nistz256_sub_morf,.-__ecp_nistz256_sub_morf + +! void ecp_nistz256_div_by_2(BN_ULONG %i0[8],const BN_ULONG %i1[8]); +.globl ecp_nistz256_div_by_2 +.align 32 +ecp_nistz256_div_by_2: + save %sp,-STACK_FRAME,%sp + ld [$ap],@acc[0] + ld [$ap+4],@acc[1] + ld [$ap+8],@acc[2] + ld [$ap+12],@acc[3] + ld [$ap+16],@acc[4] + ld [$ap+20],@acc[5] + ld [$ap+24],@acc[6] + call __ecp_nistz256_div_by_2 + ld [$ap+28],@acc[7] + ret + restore +.type ecp_nistz256_div_by_2,#function +.size ecp_nistz256_div_by_2,.-ecp_nistz256_div_by_2 + +.align 32 +__ecp_nistz256_div_by_2: + ! ret = (a is odd ? a+mod : a) >> 1 + + and @acc[0],1,$bi + neg $bi,$carry + addcc @acc[0],$carry,@acc[0] + addccc @acc[1],$carry,@acc[1] + addccc @acc[2],$carry,@acc[2] + addccc @acc[3],0,@acc[3] + addccc @acc[4],0,@acc[4] + addccc @acc[5],0,@acc[5] + addccc @acc[6],$bi,@acc[6] + addccc @acc[7],$carry,@acc[7] + addc %g0,%g0,$carry + + ! 
ret >>= 1 + + srl @acc[0],1,@acc[0] + sll @acc[1],31,$t0 + srl @acc[1],1,@acc[1] + or @acc[0],$t0,@acc[0] + sll @acc[2],31,$t1 + srl @acc[2],1,@acc[2] + or @acc[1],$t1,@acc[1] + sll @acc[3],31,$t2 + st @acc[0],[$rp] + srl @acc[3],1,@acc[3] + or @acc[2],$t2,@acc[2] + sll @acc[4],31,$t3 + st @acc[1],[$rp+4] + srl @acc[4],1,@acc[4] + or @acc[3],$t3,@acc[3] + sll @acc[5],31,$t4 + st @acc[2],[$rp+8] + srl @acc[5],1,@acc[5] + or @acc[4],$t4,@acc[4] + sll @acc[6],31,$t5 + st @acc[3],[$rp+12] + srl @acc[6],1,@acc[6] + or @acc[5],$t5,@acc[5] + sll @acc[7],31,$t6 + st @acc[4],[$rp+16] + srl @acc[7],1,@acc[7] + or @acc[6],$t6,@acc[6] + sll $carry,31,$t7 + st @acc[5],[$rp+20] + or @acc[7],$t7,@acc[7] + st @acc[6],[$rp+24] + retl + st @acc[7],[$rp+28] +.type __ecp_nistz256_div_by_2,#function +.size __ecp_nistz256_div_by_2,.-__ecp_nistz256_div_by_2 +___ + +######################################################################## +# following subroutines are "literal" implementation of those found in +# ecp_nistz256.c +# +######################################################################## +# void ecp_nistz256_point_double(P256_POINT *out,const P256_POINT *inp); +# +{ +my ($S,$M,$Zsqr,$tmp0)=map(32*$_,(0..3)); +# above map() describes stack layout with 4 temporary +# 256-bit vectors on top. + +$code.=<<___; +#ifdef __PIC__ +SPARC_PIC_THUNK(%g1) +#endif + +.globl ecp_nistz256_point_double +.align 32 +ecp_nistz256_point_double: + SPARC_LOAD_ADDRESS_LEAF(OPENSSL_sparcv9cap_P,%g1,%g5) + ld [%g1],%g1 ! OPENSSL_sparcv9cap_P[0] + and %g1,(SPARCV9_VIS3|SPARCV9_64BIT_STACK),%g1 + cmp %g1,(SPARCV9_VIS3|SPARCV9_64BIT_STACK) + be ecp_nistz256_point_double_vis3 + nop + + save %sp,-STACK_FRAME-32*4,%sp + + mov $rp,$rp_real + mov $ap,$ap_real + +.Lpoint_double_shortcut: + ld [$ap+32],@acc[0] + ld [$ap+32+4],@acc[1] + ld [$ap+32+8],@acc[2] + ld [$ap+32+12],@acc[3] + ld [$ap+32+16],@acc[4] + ld [$ap+32+20],@acc[5] + ld [$ap+32+24],@acc[6] + ld [$ap+32+28],@acc[7] + call __ecp_nistz256_mul_by_2 ! p256_mul_by_2(S, in_y); + add %sp,LOCALS+$S,$rp + + add $ap_real,64,$bp + add $ap_real,64,$ap + call __ecp_nistz256_mul_mont ! p256_sqr_mont(Zsqr, in_z); + add %sp,LOCALS+$Zsqr,$rp + + add $ap_real,0,$bp + call __ecp_nistz256_add ! p256_add(M, Zsqr, in_x); + add %sp,LOCALS+$M,$rp + + add %sp,LOCALS+$S,$bp + add %sp,LOCALS+$S,$ap + call __ecp_nistz256_mul_mont ! p256_sqr_mont(S, S); + add %sp,LOCALS+$S,$rp + + ld [$ap_real],@acc[0] + add %sp,LOCALS+$Zsqr,$bp + ld [$ap_real+4],@acc[1] + ld [$ap_real+8],@acc[2] + ld [$ap_real+12],@acc[3] + ld [$ap_real+16],@acc[4] + ld [$ap_real+20],@acc[5] + ld [$ap_real+24],@acc[6] + ld [$ap_real+28],@acc[7] + call __ecp_nistz256_sub_from ! p256_sub(Zsqr, in_x, Zsqr); + add %sp,LOCALS+$Zsqr,$rp + + add $ap_real,32,$bp + add $ap_real,64,$ap + call __ecp_nistz256_mul_mont ! p256_mul_mont(tmp0, in_z, in_y); + add %sp,LOCALS+$tmp0,$rp + + call __ecp_nistz256_mul_by_2 ! p256_mul_by_2(res_z, tmp0); + add $rp_real,64,$rp + + add %sp,LOCALS+$Zsqr,$bp + add %sp,LOCALS+$M,$ap + call __ecp_nistz256_mul_mont ! p256_mul_mont(M, M, Zsqr); + add %sp,LOCALS+$M,$rp + + call __ecp_nistz256_mul_by_3 ! p256_mul_by_3(M, M); + add %sp,LOCALS+$M,$rp + + add %sp,LOCALS+$S,$bp + add %sp,LOCALS+$S,$ap + call __ecp_nistz256_mul_mont ! p256_sqr_mont(tmp0, S); + add %sp,LOCALS+$tmp0,$rp + + call __ecp_nistz256_div_by_2 ! p256_div_by_2(res_y, tmp0); + add $rp_real,32,$rp + + add $ap_real,0,$bp + add %sp,LOCALS+$S,$ap + call __ecp_nistz256_mul_mont ! 
p256_mul_mont(S, S, in_x); + add %sp,LOCALS+$S,$rp + + call __ecp_nistz256_mul_by_2 ! p256_mul_by_2(tmp0, S); + add %sp,LOCALS+$tmp0,$rp + + add %sp,LOCALS+$M,$bp + add %sp,LOCALS+$M,$ap + call __ecp_nistz256_mul_mont ! p256_sqr_mont(res_x, M); + add $rp_real,0,$rp + + add %sp,LOCALS+$tmp0,$bp + call __ecp_nistz256_sub_from ! p256_sub(res_x, res_x, tmp0); + add $rp_real,0,$rp + + add %sp,LOCALS+$S,$bp + call __ecp_nistz256_sub_morf ! p256_sub(S, S, res_x); + add %sp,LOCALS+$S,$rp + + add %sp,LOCALS+$M,$bp + add %sp,LOCALS+$S,$ap + call __ecp_nistz256_mul_mont ! p256_mul_mont(S, S, M); + add %sp,LOCALS+$S,$rp + + add $rp_real,32,$bp + call __ecp_nistz256_sub_from ! p256_sub(res_y, S, res_y); + add $rp_real,32,$rp + + ret + restore +.type ecp_nistz256_point_double,#function +.size ecp_nistz256_point_double,.-ecp_nistz256_point_double +___ +} + +######################################################################## +# void ecp_nistz256_point_add(P256_POINT *out,const P256_POINT *in1, +# const P256_POINT *in2); +{ +my ($res_x,$res_y,$res_z, + $H,$Hsqr,$R,$Rsqr,$Hcub, + $U1,$U2,$S1,$S2)=map(32*$_,(0..11)); +my ($Z1sqr, $Z2sqr) = ($Hsqr, $Rsqr); + +# above map() describes stack layout with 12 temporary +# 256-bit vectors on top. Then we reserve some space for +# !in1infty, !in2infty, result of check for zero and return pointer. + +my $bp_real=$rp_real; + +$code.=<<___; +.globl ecp_nistz256_point_add +.align 32 +ecp_nistz256_point_add: + SPARC_LOAD_ADDRESS_LEAF(OPENSSL_sparcv9cap_P,%g1,%g5) + ld [%g1],%g1 ! OPENSSL_sparcv9cap_P[0] + and %g1,(SPARCV9_VIS3|SPARCV9_64BIT_STACK),%g1 + cmp %g1,(SPARCV9_VIS3|SPARCV9_64BIT_STACK) + be ecp_nistz256_point_add_vis3 + nop + + save %sp,-STACK_FRAME-32*12-32,%sp + + stx $rp,[%fp+STACK_BIAS-8] ! off-load $rp + mov $ap,$ap_real + mov $bp,$bp_real + + ld [$bp+64],$t0 ! in2_z + ld [$bp+64+4],$t1 + ld [$bp+64+8],$t2 + ld [$bp+64+12],$t3 + ld [$bp+64+16],$t4 + ld [$bp+64+20],$t5 + ld [$bp+64+24],$t6 + ld [$bp+64+28],$t7 + or $t1,$t0,$t0 + or $t3,$t2,$t2 + or $t5,$t4,$t4 + or $t7,$t6,$t6 + or $t2,$t0,$t0 + or $t6,$t4,$t4 + or $t4,$t0,$t0 ! !in2infty + movrnz $t0,-1,$t0 + st $t0,[%fp+STACK_BIAS-12] + + ld [$ap+64],$t0 ! in1_z + ld [$ap+64+4],$t1 + ld [$ap+64+8],$t2 + ld [$ap+64+12],$t3 + ld [$ap+64+16],$t4 + ld [$ap+64+20],$t5 + ld [$ap+64+24],$t6 + ld [$ap+64+28],$t7 + or $t1,$t0,$t0 + or $t3,$t2,$t2 + or $t5,$t4,$t4 + or $t7,$t6,$t6 + or $t2,$t0,$t0 + or $t6,$t4,$t4 + or $t4,$t0,$t0 ! !in1infty + movrnz $t0,-1,$t0 + st $t0,[%fp+STACK_BIAS-16] + + add $bp_real,64,$bp + add $bp_real,64,$ap + call __ecp_nistz256_mul_mont ! p256_sqr_mont(Z2sqr, in2_z); + add %sp,LOCALS+$Z2sqr,$rp + + add $ap_real,64,$bp + add $ap_real,64,$ap + call __ecp_nistz256_mul_mont ! p256_sqr_mont(Z1sqr, in1_z); + add %sp,LOCALS+$Z1sqr,$rp + + add $bp_real,64,$bp + add %sp,LOCALS+$Z2sqr,$ap + call __ecp_nistz256_mul_mont ! p256_mul_mont(S1, Z2sqr, in2_z); + add %sp,LOCALS+$S1,$rp + + add $ap_real,64,$bp + add %sp,LOCALS+$Z1sqr,$ap + call __ecp_nistz256_mul_mont ! p256_mul_mont(S2, Z1sqr, in1_z); + add %sp,LOCALS+$S2,$rp + + add $ap_real,32,$bp + add %sp,LOCALS+$S1,$ap + call __ecp_nistz256_mul_mont ! p256_mul_mont(S1, S1, in1_y); + add %sp,LOCALS+$S1,$rp + + add $bp_real,32,$bp + add %sp,LOCALS+$S2,$ap + call __ecp_nistz256_mul_mont ! p256_mul_mont(S2, S2, in2_y); + add %sp,LOCALS+$S2,$rp + + add %sp,LOCALS+$S1,$bp + call __ecp_nistz256_sub_from ! p256_sub(R, S2, S1); + add %sp,LOCALS+$R,$rp + + or @acc[1],@acc[0],@acc[0] ! 
see if result is zero + or @acc[3],@acc[2],@acc[2] + or @acc[5],@acc[4],@acc[4] + or @acc[7],@acc[6],@acc[6] + or @acc[2],@acc[0],@acc[0] + or @acc[6],@acc[4],@acc[4] + or @acc[4],@acc[0],@acc[0] + st @acc[0],[%fp+STACK_BIAS-20] + + add $ap_real,0,$bp + add %sp,LOCALS+$Z2sqr,$ap + call __ecp_nistz256_mul_mont ! p256_mul_mont(U1, in1_x, Z2sqr); + add %sp,LOCALS+$U1,$rp + + add $bp_real,0,$bp + add %sp,LOCALS+$Z1sqr,$ap + call __ecp_nistz256_mul_mont ! p256_mul_mont(U2, in2_x, Z1sqr); + add %sp,LOCALS+$U2,$rp + + add %sp,LOCALS+$U1,$bp + call __ecp_nistz256_sub_from ! p256_sub(H, U2, U1); + add %sp,LOCALS+$H,$rp + + or @acc[1],@acc[0],@acc[0] ! see if result is zero + or @acc[3],@acc[2],@acc[2] + or @acc[5],@acc[4],@acc[4] + or @acc[7],@acc[6],@acc[6] + or @acc[2],@acc[0],@acc[0] + or @acc[6],@acc[4],@acc[4] + orcc @acc[4],@acc[0],@acc[0] + + bne,pt %icc,.Ladd_proceed ! is_equal(U1,U2)? + nop + + ld [%fp+STACK_BIAS-12],$t0 + ld [%fp+STACK_BIAS-16],$t1 + ld [%fp+STACK_BIAS-20],$t2 + andcc $t0,$t1,%g0 + be,pt %icc,.Ladd_proceed ! (in1infty || in2infty)? + nop + andcc $t2,$t2,%g0 + be,pt %icc,.Ladd_double ! is_equal(S1,S2)? + nop + + ldx [%fp+STACK_BIAS-8],$rp + st %g0,[$rp] + st %g0,[$rp+4] + st %g0,[$rp+8] + st %g0,[$rp+12] + st %g0,[$rp+16] + st %g0,[$rp+20] + st %g0,[$rp+24] + st %g0,[$rp+28] + st %g0,[$rp+32] + st %g0,[$rp+32+4] + st %g0,[$rp+32+8] + st %g0,[$rp+32+12] + st %g0,[$rp+32+16] + st %g0,[$rp+32+20] + st %g0,[$rp+32+24] + st %g0,[$rp+32+28] + st %g0,[$rp+64] + st %g0,[$rp+64+4] + st %g0,[$rp+64+8] + st %g0,[$rp+64+12] + st %g0,[$rp+64+16] + st %g0,[$rp+64+20] + st %g0,[$rp+64+24] + st %g0,[$rp+64+28] + b .Ladd_done + nop + +.align 16 +.Ladd_double: + ldx [%fp+STACK_BIAS-8],$rp_real + mov $ap_real,$ap + b .Lpoint_double_shortcut + add %sp,32*(12-4)+32,%sp ! difference in frame sizes + +.align 16 +.Ladd_proceed: + add %sp,LOCALS+$R,$bp + add %sp,LOCALS+$R,$ap + call __ecp_nistz256_mul_mont ! p256_sqr_mont(Rsqr, R); + add %sp,LOCALS+$Rsqr,$rp + + add $ap_real,64,$bp + add %sp,LOCALS+$H,$ap + call __ecp_nistz256_mul_mont ! p256_mul_mont(res_z, H, in1_z); + add %sp,LOCALS+$res_z,$rp + + add %sp,LOCALS+$H,$bp + add %sp,LOCALS+$H,$ap + call __ecp_nistz256_mul_mont ! p256_sqr_mont(Hsqr, H); + add %sp,LOCALS+$Hsqr,$rp + + add $bp_real,64,$bp + add %sp,LOCALS+$res_z,$ap + call __ecp_nistz256_mul_mont ! p256_mul_mont(res_z, res_z, in2_z); + add %sp,LOCALS+$res_z,$rp + + add %sp,LOCALS+$H,$bp + add %sp,LOCALS+$Hsqr,$ap + call __ecp_nistz256_mul_mont ! p256_mul_mont(Hcub, Hsqr, H); + add %sp,LOCALS+$Hcub,$rp + + add %sp,LOCALS+$U1,$bp + add %sp,LOCALS+$Hsqr,$ap + call __ecp_nistz256_mul_mont ! p256_mul_mont(U2, U1, Hsqr); + add %sp,LOCALS+$U2,$rp + + call __ecp_nistz256_mul_by_2 ! p256_mul_by_2(Hsqr, U2); + add %sp,LOCALS+$Hsqr,$rp + + add %sp,LOCALS+$Rsqr,$bp + call __ecp_nistz256_sub_morf ! p256_sub(res_x, Rsqr, Hsqr); + add %sp,LOCALS+$res_x,$rp + + add %sp,LOCALS+$Hcub,$bp + call __ecp_nistz256_sub_from ! p256_sub(res_x, res_x, Hcub); + add %sp,LOCALS+$res_x,$rp + + add %sp,LOCALS+$U2,$bp + call __ecp_nistz256_sub_morf ! p256_sub(res_y, U2, res_x); + add %sp,LOCALS+$res_y,$rp + + add %sp,LOCALS+$Hcub,$bp + add %sp,LOCALS+$S1,$ap + call __ecp_nistz256_mul_mont ! p256_mul_mont(S2, S1, Hcub); + add %sp,LOCALS+$S2,$rp + + add %sp,LOCALS+$R,$bp + add %sp,LOCALS+$res_y,$ap + call __ecp_nistz256_mul_mont ! p256_mul_mont(res_y, res_y, R); + add %sp,LOCALS+$res_y,$rp + + add %sp,LOCALS+$S2,$bp + call __ecp_nistz256_sub_from ! 
p256_sub(res_y, res_y, S2); + add %sp,LOCALS+$res_y,$rp + + ld [%fp+STACK_BIAS-16],$t1 ! !in1infty + ld [%fp+STACK_BIAS-12],$t2 ! !in2infty + ldx [%fp+STACK_BIAS-8],$rp +___ +for($i=0;$i<96;$i+=8) { # conditional moves +$code.=<<___; + ld [%sp+LOCALS+$i],@acc[0] ! res + ld [%sp+LOCALS+$i+4],@acc[1] + ld [$bp_real+$i],@acc[2] ! in2 + ld [$bp_real+$i+4],@acc[3] + ld [$ap_real+$i],@acc[4] ! in1 + ld [$ap_real+$i+4],@acc[5] + movrz $t1,@acc[2],@acc[0] + movrz $t1,@acc[3],@acc[1] + movrz $t2,@acc[4],@acc[0] + movrz $t2,@acc[5],@acc[1] + st @acc[0],[$rp+$i] + st @acc[1],[$rp+$i+4] +___ +} +$code.=<<___; +.Ladd_done: + ret + restore +.type ecp_nistz256_point_add,#function +.size ecp_nistz256_point_add,.-ecp_nistz256_point_add +___ +} + +######################################################################## +# void ecp_nistz256_point_add_affine(P256_POINT *out,const P256_POINT *in1, +# const P256_POINT_AFFINE *in2); +{ +my ($res_x,$res_y,$res_z, + $U2,$S2,$H,$R,$Hsqr,$Hcub,$Rsqr)=map(32*$_,(0..9)); +my $Z1sqr = $S2; +# above map() describes stack layout with 10 temporary +# 256-bit vectors on top. Then we reserve some space for +# !in1infty, !in2infty, result of check for zero and return pointer. + +my @ONE_mont=(1,0,0,-1,-1,-1,-2,0); +my $bp_real=$rp_real; + +$code.=<<___; +.globl ecp_nistz256_point_add_affine +.align 32 +ecp_nistz256_point_add_affine: + SPARC_LOAD_ADDRESS_LEAF(OPENSSL_sparcv9cap_P,%g1,%g5) + ld [%g1],%g1 ! OPENSSL_sparcv9cap_P[0] + and %g1,(SPARCV9_VIS3|SPARCV9_64BIT_STACK),%g1 + cmp %g1,(SPARCV9_VIS3|SPARCV9_64BIT_STACK) + be ecp_nistz256_point_add_affine_vis3 + nop + + save %sp,-STACK_FRAME-32*10-32,%sp + + stx $rp,[%fp+STACK_BIAS-8] ! off-load $rp + mov $ap,$ap_real + mov $bp,$bp_real + + ld [$ap+64],$t0 ! in1_z + ld [$ap+64+4],$t1 + ld [$ap+64+8],$t2 + ld [$ap+64+12],$t3 + ld [$ap+64+16],$t4 + ld [$ap+64+20],$t5 + ld [$ap+64+24],$t6 + ld [$ap+64+28],$t7 + or $t1,$t0,$t0 + or $t3,$t2,$t2 + or $t5,$t4,$t4 + or $t7,$t6,$t6 + or $t2,$t0,$t0 + or $t6,$t4,$t4 + or $t4,$t0,$t0 ! !in1infty + movrnz $t0,-1,$t0 + st $t0,[%fp+STACK_BIAS-16] + + ld [$bp],@acc[0] ! in2_x + ld [$bp+4],@acc[1] + ld [$bp+8],@acc[2] + ld [$bp+12],@acc[3] + ld [$bp+16],@acc[4] + ld [$bp+20],@acc[5] + ld [$bp+24],@acc[6] + ld [$bp+28],@acc[7] + ld [$bp+32],$t0 ! in2_y + ld [$bp+32+4],$t1 + ld [$bp+32+8],$t2 + ld [$bp+32+12],$t3 + ld [$bp+32+16],$t4 + ld [$bp+32+20],$t5 + ld [$bp+32+24],$t6 + ld [$bp+32+28],$t7 + or @acc[1],@acc[0],@acc[0] + or @acc[3],@acc[2],@acc[2] + or @acc[5],@acc[4],@acc[4] + or @acc[7],@acc[6],@acc[6] + or @acc[2],@acc[0],@acc[0] + or @acc[6],@acc[4],@acc[4] + or @acc[4],@acc[0],@acc[0] + or $t1,$t0,$t0 + or $t3,$t2,$t2 + or $t5,$t4,$t4 + or $t7,$t6,$t6 + or $t2,$t0,$t0 + or $t6,$t4,$t4 + or $t4,$t0,$t0 + or @acc[0],$t0,$t0 ! !in2infty + movrnz $t0,-1,$t0 + st $t0,[%fp+STACK_BIAS-12] + + add $ap_real,64,$bp + add $ap_real,64,$ap + call __ecp_nistz256_mul_mont ! p256_sqr_mont(Z1sqr, in1_z); + add %sp,LOCALS+$Z1sqr,$rp + + add $bp_real,0,$bp + add %sp,LOCALS+$Z1sqr,$ap + call __ecp_nistz256_mul_mont ! p256_mul_mont(U2, Z1sqr, in2_x); + add %sp,LOCALS+$U2,$rp + + add $ap_real,0,$bp + call __ecp_nistz256_sub_from ! p256_sub(H, U2, in1_x); + add %sp,LOCALS+$H,$rp + + add $ap_real,64,$bp + add %sp,LOCALS+$Z1sqr,$ap + call __ecp_nistz256_mul_mont ! p256_mul_mont(S2, Z1sqr, in1_z); + add %sp,LOCALS+$S2,$rp + + add $ap_real,64,$bp + add %sp,LOCALS+$H,$ap + call __ecp_nistz256_mul_mont ! 
p256_mul_mont(res_z, H, in1_z); + add %sp,LOCALS+$res_z,$rp + + add $bp_real,32,$bp + add %sp,LOCALS+$S2,$ap + call __ecp_nistz256_mul_mont ! p256_mul_mont(S2, S2, in2_y); + add %sp,LOCALS+$S2,$rp + + add $ap_real,32,$bp + call __ecp_nistz256_sub_from ! p256_sub(R, S2, in1_y); + add %sp,LOCALS+$R,$rp + + add %sp,LOCALS+$H,$bp + add %sp,LOCALS+$H,$ap + call __ecp_nistz256_mul_mont ! p256_sqr_mont(Hsqr, H); + add %sp,LOCALS+$Hsqr,$rp + + add %sp,LOCALS+$R,$bp + add %sp,LOCALS+$R,$ap + call __ecp_nistz256_mul_mont ! p256_sqr_mont(Rsqr, R); + add %sp,LOCALS+$Rsqr,$rp + + add %sp,LOCALS+$H,$bp + add %sp,LOCALS+$Hsqr,$ap + call __ecp_nistz256_mul_mont ! p256_mul_mont(Hcub, Hsqr, H); + add %sp,LOCALS+$Hcub,$rp + + add $ap_real,0,$bp + add %sp,LOCALS+$Hsqr,$ap + call __ecp_nistz256_mul_mont ! p256_mul_mont(U2, in1_x, Hsqr); + add %sp,LOCALS+$U2,$rp + + call __ecp_nistz256_mul_by_2 ! p256_mul_by_2(Hsqr, U2); + add %sp,LOCALS+$Hsqr,$rp + + add %sp,LOCALS+$Rsqr,$bp + call __ecp_nistz256_sub_morf ! p256_sub(res_x, Rsqr, Hsqr); + add %sp,LOCALS+$res_x,$rp + + add %sp,LOCALS+$Hcub,$bp + call __ecp_nistz256_sub_from ! p256_sub(res_x, res_x, Hcub); + add %sp,LOCALS+$res_x,$rp + + add %sp,LOCALS+$U2,$bp + call __ecp_nistz256_sub_morf ! p256_sub(res_y, U2, res_x); + add %sp,LOCALS+$res_y,$rp + + add $ap_real,32,$bp + add %sp,LOCALS+$Hcub,$ap + call __ecp_nistz256_mul_mont ! p256_mul_mont(S2, in1_y, Hcub); + add %sp,LOCALS+$S2,$rp + + add %sp,LOCALS+$R,$bp + add %sp,LOCALS+$res_y,$ap + call __ecp_nistz256_mul_mont ! p256_mul_mont(res_y, res_y, R); + add %sp,LOCALS+$res_y,$rp + + add %sp,LOCALS+$S2,$bp + call __ecp_nistz256_sub_from ! p256_sub(res_y, res_y, S2); + add %sp,LOCALS+$res_y,$rp + + ld [%fp+STACK_BIAS-16],$t1 ! !in1infty + ld [%fp+STACK_BIAS-12],$t2 ! !in2infty + ldx [%fp+STACK_BIAS-8],$rp +___ +for($i=0;$i<64;$i+=8) { # conditional moves +$code.=<<___; + ld [%sp+LOCALS+$i],@acc[0] ! res + ld [%sp+LOCALS+$i+4],@acc[1] + ld [$bp_real+$i],@acc[2] ! in2 + ld [$bp_real+$i+4],@acc[3] + ld [$ap_real+$i],@acc[4] ! in1 + ld [$ap_real+$i+4],@acc[5] + movrz $t1,@acc[2],@acc[0] + movrz $t1,@acc[3],@acc[1] + movrz $t2,@acc[4],@acc[0] + movrz $t2,@acc[5],@acc[1] + st @acc[0],[$rp+$i] + st @acc[1],[$rp+$i+4] +___ +} +for(;$i<96;$i+=8) { +my $j=($i-64)/4; +$code.=<<___; + ld [%sp+LOCALS+$i],@acc[0] ! res + ld [%sp+LOCALS+$i+4],@acc[1] + ld [$ap_real+$i],@acc[4] ! in1 + ld [$ap_real+$i+4],@acc[5] + movrz $t1,@ONE_mont[$j],@acc[0] + movrz $t1,@ONE_mont[$j+1],@acc[1] + movrz $t2,@acc[4],@acc[0] + movrz $t2,@acc[5],@acc[1] + st @acc[0],[$rp+$i] + st @acc[1],[$rp+$i+4] +___ +} +$code.=<<___; + ret + restore +.type ecp_nistz256_point_add_affine,#function +.size ecp_nistz256_point_add_affine,.-ecp_nistz256_point_add_affine +___ +} }}} +{{{ +my ($out,$inp,$index)=map("%i$_",(0..2)); +my $mask="%o0"; + +$code.=<<___; +! void ecp_nistz256_scatter_w5(void *%i0,const P256_POINT *%i1, +! int %i2); +.globl ecp_nistz256_scatter_w5 +.align 32 +ecp_nistz256_scatter_w5: + save %sp,-STACK_FRAME,%sp + + sll $index,2,$index + add $out,$index,$out + + ld [$inp],%l0 ! X + ld [$inp+4],%l1 + ld [$inp+8],%l2 + ld [$inp+12],%l3 + ld [$inp+16],%l4 + ld [$inp+20],%l5 + ld [$inp+24],%l6 + ld [$inp+28],%l7 + add $inp,32,$inp + st %l0,[$out+64*0-4] + st %l1,[$out+64*1-4] + st %l2,[$out+64*2-4] + st %l3,[$out+64*3-4] + st %l4,[$out+64*4-4] + st %l5,[$out+64*5-4] + st %l6,[$out+64*6-4] + st %l7,[$out+64*7-4] + add $out,64*8,$out + + ld [$inp],%l0 ! 
Y + ld [$inp+4],%l1 + ld [$inp+8],%l2 + ld [$inp+12],%l3 + ld [$inp+16],%l4 + ld [$inp+20],%l5 + ld [$inp+24],%l6 + ld [$inp+28],%l7 + add $inp,32,$inp + st %l0,[$out+64*0-4] + st %l1,[$out+64*1-4] + st %l2,[$out+64*2-4] + st %l3,[$out+64*3-4] + st %l4,[$out+64*4-4] + st %l5,[$out+64*5-4] + st %l6,[$out+64*6-4] + st %l7,[$out+64*7-4] + add $out,64*8,$out + + ld [$inp],%l0 ! Z + ld [$inp+4],%l1 + ld [$inp+8],%l2 + ld [$inp+12],%l3 + ld [$inp+16],%l4 + ld [$inp+20],%l5 + ld [$inp+24],%l6 + ld [$inp+28],%l7 + st %l0,[$out+64*0-4] + st %l1,[$out+64*1-4] + st %l2,[$out+64*2-4] + st %l3,[$out+64*3-4] + st %l4,[$out+64*4-4] + st %l5,[$out+64*5-4] + st %l6,[$out+64*6-4] + st %l7,[$out+64*7-4] + + ret + restore +.type ecp_nistz256_scatter_w5,#function +.size ecp_nistz256_scatter_w5,.-ecp_nistz256_scatter_w5 + +! void ecp_nistz256_gather_w5(P256_POINT *%i0,const void *%i1, +! int %i2); +.globl ecp_nistz256_gather_w5 +.align 32 +ecp_nistz256_gather_w5: + save %sp,-STACK_FRAME,%sp + + neg $index,$mask + srax $mask,63,$mask + + add $index,$mask,$index + sll $index,2,$index + add $inp,$index,$inp + + ld [$inp+64*0],%l0 + ld [$inp+64*1],%l1 + ld [$inp+64*2],%l2 + ld [$inp+64*3],%l3 + ld [$inp+64*4],%l4 + ld [$inp+64*5],%l5 + ld [$inp+64*6],%l6 + ld [$inp+64*7],%l7 + add $inp,64*8,$inp + and %l0,$mask,%l0 + and %l1,$mask,%l1 + st %l0,[$out] ! X + and %l2,$mask,%l2 + st %l1,[$out+4] + and %l3,$mask,%l3 + st %l2,[$out+8] + and %l4,$mask,%l4 + st %l3,[$out+12] + and %l5,$mask,%l5 + st %l4,[$out+16] + and %l6,$mask,%l6 + st %l5,[$out+20] + and %l7,$mask,%l7 + st %l6,[$out+24] + st %l7,[$out+28] + add $out,32,$out + + ld [$inp+64*0],%l0 + ld [$inp+64*1],%l1 + ld [$inp+64*2],%l2 + ld [$inp+64*3],%l3 + ld [$inp+64*4],%l4 + ld [$inp+64*5],%l5 + ld [$inp+64*6],%l6 + ld [$inp+64*7],%l7 + add $inp,64*8,$inp + and %l0,$mask,%l0 + and %l1,$mask,%l1 + st %l0,[$out] ! Y + and %l2,$mask,%l2 + st %l1,[$out+4] + and %l3,$mask,%l3 + st %l2,[$out+8] + and %l4,$mask,%l4 + st %l3,[$out+12] + and %l5,$mask,%l5 + st %l4,[$out+16] + and %l6,$mask,%l6 + st %l5,[$out+20] + and %l7,$mask,%l7 + st %l6,[$out+24] + st %l7,[$out+28] + add $out,32,$out + + ld [$inp+64*0],%l0 + ld [$inp+64*1],%l1 + ld [$inp+64*2],%l2 + ld [$inp+64*3],%l3 + ld [$inp+64*4],%l4 + ld [$inp+64*5],%l5 + ld [$inp+64*6],%l6 + ld [$inp+64*7],%l7 + and %l0,$mask,%l0 + and %l1,$mask,%l1 + st %l0,[$out] ! Z + and %l2,$mask,%l2 + st %l1,[$out+4] + and %l3,$mask,%l3 + st %l2,[$out+8] + and %l4,$mask,%l4 + st %l3,[$out+12] + and %l5,$mask,%l5 + st %l4,[$out+16] + and %l6,$mask,%l6 + st %l5,[$out+20] + and %l7,$mask,%l7 + st %l6,[$out+24] + st %l7,[$out+28] + + ret + restore +.type ecp_nistz256_gather_w5,#function +.size ecp_nistz256_gather_w5,.-ecp_nistz256_gather_w5 + +! void ecp_nistz256_scatter_w7(void *%i0,const P256_POINT_AFFINE *%i1, +! int %i2); +.globl ecp_nistz256_scatter_w7 +.align 32 +ecp_nistz256_scatter_w7: + save %sp,-STACK_FRAME,%sp + nop + add $out,$index,$out + mov 64/4,$index +.Loop_scatter_w7: + ld [$inp],%l0 + add $inp,4,$inp + subcc $index,1,$index + stb %l0,[$out+64*0] + srl %l0,8,%l1 + stb %l1,[$out+64*1] + srl %l0,16,%l2 + stb %l2,[$out+64*2] + srl %l0,24,%l3 + stb %l3,[$out+64*3] + bne .Loop_scatter_w7 + add $out,64*4,$out + + ret + restore +.type ecp_nistz256_scatter_w7,#function +.size ecp_nistz256_scatter_w7,.-ecp_nistz256_scatter_w7 + +! void ecp_nistz256_gather_w7(P256_POINT_AFFINE *%i0,const void *%i1, +! 
int %i2); +.globl ecp_nistz256_gather_w7 +.align 32 +ecp_nistz256_gather_w7: + save %sp,-STACK_FRAME,%sp + + neg $index,$mask + srax $mask,63,$mask + + add $index,$mask,$index + add $inp,$index,$inp + mov 64/4,$index + +.Loop_gather_w7: + ldub [$inp+64*0],%l0 + prefetch [$inp+3840+64*0],1 + subcc $index,1,$index + ldub [$inp+64*1],%l1 + prefetch [$inp+3840+64*1],1 + ldub [$inp+64*2],%l2 + prefetch [$inp+3840+64*2],1 + ldub [$inp+64*3],%l3 + prefetch [$inp+3840+64*3],1 + add $inp,64*4,$inp + sll %l1,8,%l1 + sll %l2,16,%l2 + or %l0,%l1,%l0 + sll %l3,24,%l3 + or %l0,%l2,%l0 + or %l0,%l3,%l0 + and %l0,$mask,%l0 + st %l0,[$out] + bne .Loop_gather_w7 + add $out,4,$out + + ret + restore +.type ecp_nistz256_gather_w7,#function +.size ecp_nistz256_gather_w7,.-ecp_nistz256_gather_w7 +___ +}}} +{{{ +######################################################################## +# Following subroutines are VIS3 counterparts of those above that +# implement ones found in ecp_nistz256.c. Key difference is that they +# use 128-bit multiplication and addition with 64-bit carry, and in order +# to do that they perform conversion from uin32_t[8] to uint64_t[4] upon +# entry and vice versa on return. +# +my ($rp,$ap,$bp)=map("%i$_",(0..2)); +my ($t0,$t1,$t2,$t3,$a0,$a1,$a2,$a3)=map("%l$_",(0..7)); +my ($acc0,$acc1,$acc2,$acc3,$acc4,$acc5)=map("%o$_",(0..5)); +my ($bi,$poly1,$poly3,$minus1)=(map("%i$_",(3..5)),"%g1"); +my ($rp_real,$ap_real)=("%g2","%g3"); +my ($acc6,$acc7)=($bp,$bi); # used in squaring + +$code.=<<___; +.align 32 +__ecp_nistz256_mul_by_2_vis3: + addcc $acc0,$acc0,$acc0 + addxccc $acc1,$acc1,$acc1 + addxccc $acc2,$acc2,$acc2 + addxccc $acc3,$acc3,$acc3 + b .Lreduce_by_sub_vis3 + addxc %g0,%g0,$acc4 ! did it carry? +.type __ecp_nistz256_mul_by_2_vis3,#function +.size __ecp_nistz256_mul_by_2_vis3,.-__ecp_nistz256_mul_by_2_vis3 + +.align 32 +__ecp_nistz256_add_vis3: + ldx [$bp+0],$t0 + ldx [$bp+8],$t1 + ldx [$bp+16],$t2 + ldx [$bp+24],$t3 + +__ecp_nistz256_add_noload_vis3: + + addcc $t0,$acc0,$acc0 + addxccc $t1,$acc1,$acc1 + addxccc $t2,$acc2,$acc2 + addxccc $t3,$acc3,$acc3 + addxc %g0,%g0,$acc4 ! did it carry? + +.Lreduce_by_sub_vis3: + + addcc $acc0,1,$t0 ! add -modulus, i.e. subtract + addxccc $acc1,$poly1,$t1 + addxccc $acc2,$minus1,$t2 + addxccc $acc3,$poly3,$t3 + addxc $acc4,$minus1,$acc4 + + movrz $acc4,$t0,$acc0 ! ret = borrow ? ret : ret-modulus + movrz $acc4,$t1,$acc1 + stx $acc0,[$rp] + movrz $acc4,$t2,$acc2 + stx $acc1,[$rp+8] + movrz $acc4,$t3,$acc3 + stx $acc2,[$rp+16] + retl + stx $acc3,[$rp+24] +.type __ecp_nistz256_add_vis3,#function +.size __ecp_nistz256_add_vis3,.-__ecp_nistz256_add_vis3 + +! Trouble with subtraction is that there is no subtraction with 64-bit +! borrow, only with 32-bit one. For this reason we "decompose" 64-bit +! $acc0-$acc3 to 32-bit values and pick b[4] in 32-bit pieces. But +! recall that SPARC is big-endian, which is why you'll observe that +! b[4] is accessed as 4-0-12-8-20-16-28-24. And prior reduction we +! "collect" result back to 64-bit $acc0-$acc3. 
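+!
+! Concretely, for the first limb: the 64-bit b[0] sits at [$bp+0..7]
+! with its high half at offset 0 (big-endian), so its low half is the
+! word at [$bp+4]. Loading 4 before 0 (then 12 before 8, and so on)
+! lines each 32-bit piece of b[] up with the matching 32-bit half that
+! srlx splits out of $acc0-$acc3 below.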
+.align 32 +__ecp_nistz256_sub_from_vis3: + ld [$bp+4],$t0 + ld [$bp+0],$t1 + ld [$bp+12],$t2 + ld [$bp+8],$t3 + + srlx $acc0,32,$acc4 + not $poly1,$poly1 + srlx $acc1,32,$acc5 + subcc $acc0,$t0,$acc0 + ld [$bp+20],$t0 + subccc $acc4,$t1,$acc4 + ld [$bp+16],$t1 + subccc $acc1,$t2,$acc1 + ld [$bp+28],$t2 + and $acc0,$poly1,$acc0 + subccc $acc5,$t3,$acc5 + ld [$bp+24],$t3 + sllx $acc4,32,$acc4 + and $acc1,$poly1,$acc1 + sllx $acc5,32,$acc5 + or $acc0,$acc4,$acc0 + srlx $acc2,32,$acc4 + or $acc1,$acc5,$acc1 + srlx $acc3,32,$acc5 + subccc $acc2,$t0,$acc2 + subccc $acc4,$t1,$acc4 + subccc $acc3,$t2,$acc3 + and $acc2,$poly1,$acc2 + subccc $acc5,$t3,$acc5 + sllx $acc4,32,$acc4 + and $acc3,$poly1,$acc3 + sllx $acc5,32,$acc5 + or $acc2,$acc4,$acc2 + subc %g0,%g0,$acc4 ! did it borrow? + b .Lreduce_by_add_vis3 + or $acc3,$acc5,$acc3 +.type __ecp_nistz256_sub_from_vis3,#function +.size __ecp_nistz256_sub_from_vis3,.-__ecp_nistz256_sub_from_vis3 + +.align 32 +__ecp_nistz256_sub_morf_vis3: + ld [$bp+4],$t0 + ld [$bp+0],$t1 + ld [$bp+12],$t2 + ld [$bp+8],$t3 + + srlx $acc0,32,$acc4 + not $poly1,$poly1 + srlx $acc1,32,$acc5 + subcc $t0,$acc0,$acc0 + ld [$bp+20],$t0 + subccc $t1,$acc4,$acc4 + ld [$bp+16],$t1 + subccc $t2,$acc1,$acc1 + ld [$bp+28],$t2 + and $acc0,$poly1,$acc0 + subccc $t3,$acc5,$acc5 + ld [$bp+24],$t3 + sllx $acc4,32,$acc4 + and $acc1,$poly1,$acc1 + sllx $acc5,32,$acc5 + or $acc0,$acc4,$acc0 + srlx $acc2,32,$acc4 + or $acc1,$acc5,$acc1 + srlx $acc3,32,$acc5 + subccc $t0,$acc2,$acc2 + subccc $t1,$acc4,$acc4 + subccc $t2,$acc3,$acc3 + and $acc2,$poly1,$acc2 + subccc $t3,$acc5,$acc5 + sllx $acc4,32,$acc4 + and $acc3,$poly1,$acc3 + sllx $acc5,32,$acc5 + or $acc2,$acc4,$acc2 + subc %g0,%g0,$acc4 ! did it borrow? + or $acc3,$acc5,$acc3 + +.Lreduce_by_add_vis3: + + addcc $acc0,-1,$t0 ! add modulus + not $poly3,$t3 + addxccc $acc1,$poly1,$t1 + not $poly1,$poly1 ! restore $poly1 + addxccc $acc2,%g0,$t2 + addxc $acc3,$t3,$t3 + + movrnz $acc4,$t0,$acc0 ! if a-b borrowed, ret = ret+mod + movrnz $acc4,$t1,$acc1 + stx $acc0,[$rp] + movrnz $acc4,$t2,$acc2 + stx $acc1,[$rp+8] + movrnz $acc4,$t3,$acc3 + stx $acc2,[$rp+16] + retl + stx $acc3,[$rp+24] +.type __ecp_nistz256_sub_morf_vis3,#function +.size __ecp_nistz256_sub_morf_vis3,.-__ecp_nistz256_sub_morf_vis3 + +.align 32 +__ecp_nistz256_div_by_2_vis3: + ! ret = (a is odd ? a+mod : a) >> 1 + + not $poly1,$t1 + not $poly3,$t3 + and $acc0,1,$acc5 + addcc $acc0,-1,$t0 ! add modulus + addxccc $acc1,$t1,$t1 + addxccc $acc2,%g0,$t2 + addxccc $acc3,$t3,$t3 + addxc %g0,%g0,$acc4 ! carry bit + + movrnz $acc5,$t0,$acc0 + movrnz $acc5,$t1,$acc1 + movrnz $acc5,$t2,$acc2 + movrnz $acc5,$t3,$acc3 + movrz $acc5,%g0,$acc4 + + ! ret >>= 1 + + srlx $acc0,1,$acc0 + sllx $acc1,63,$t0 + srlx $acc1,1,$acc1 + or $acc0,$t0,$acc0 + sllx $acc2,63,$t1 + srlx $acc2,1,$acc2 + or $acc1,$t1,$acc1 + sllx $acc3,63,$t2 + stx $acc0,[$rp] + srlx $acc3,1,$acc3 + or $acc2,$t2,$acc2 + sllx $acc4,63,$t3 ! don't forget carry bit + stx $acc1,[$rp+8] + or $acc3,$t3,$acc3 + stx $acc2,[$rp+16] + retl + stx $acc3,[$rp+24] +.type __ecp_nistz256_div_by_2_vis3,#function +.size __ecp_nistz256_div_by_2_vis3,.-__ecp_nistz256_div_by_2_vis3 + +! compared to __ecp_nistz256_mul_mont it's almost 4x smaller and +! 4x faster [on T4]... +.align 32 +__ecp_nistz256_mul_mont_vis3: + mulx $a0,$bi,$acc0 + not $poly3,$poly3 ! 0xFFFFFFFF00000001 + umulxhi $a0,$bi,$t0 + mulx $a1,$bi,$acc1 + umulxhi $a1,$bi,$t1 + mulx $a2,$bi,$acc2 + umulxhi $a2,$bi,$t2 + mulx $a3,$bi,$acc3 + umulxhi $a3,$bi,$t3 + ldx [$bp+8],$bi ! 
b[1] + + addcc $acc1,$t0,$acc1 ! accumulate high parts of multiplication + sllx $acc0,32,$t0 + addxccc $acc2,$t1,$acc2 + srlx $acc0,32,$t1 + addxccc $acc3,$t2,$acc3 + addxc %g0,$t3,$acc4 + mov 0,$acc5 +___ +for($i=1;$i<4;$i++) { + # Reduction iteration is normally performed by accumulating + # result of multiplication of modulus by "magic" digit [and + # omitting least significant word, which is guaranteed to + # be 0], but thanks to special form of modulus and "magic" + # digit being equal to least significant word, it can be + # performed with additions and subtractions alone. Indeed: + # + # ffff0001.00000000.0000ffff.ffffffff + # * abcdefgh + # + xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.abcdefgh + # + # Now observing that ff..ff*x = (2^n-1)*x = 2^n*x-x, we + # rewrite above as: + # + # xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.abcdefgh + # + abcdefgh.abcdefgh.0000abcd.efgh0000.00000000 + # - 0000abcd.efgh0000.00000000.00000000.abcdefgh + # + # or marking redundant operations: + # + # xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.-------- + # + abcdefgh.abcdefgh.0000abcd.efgh0000.-------- + # - 0000abcd.efgh0000.--------.--------.-------- + # ^^^^^^^^ but this word is calculated with umulxhi, because + # there is no subtract with 64-bit borrow:-( + +$code.=<<___; + sub $acc0,$t0,$t2 ! acc0*0xFFFFFFFF00000001, low part + umulxhi $acc0,$poly3,$t3 ! acc0*0xFFFFFFFF00000001, high part + addcc $acc1,$t0,$acc0 ! +=acc[0]<<96 and omit acc[0] + mulx $a0,$bi,$t0 + addxccc $acc2,$t1,$acc1 + mulx $a1,$bi,$t1 + addxccc $acc3,$t2,$acc2 ! +=acc[0]*0xFFFFFFFF00000001 + mulx $a2,$bi,$t2 + addxccc $acc4,$t3,$acc3 + mulx $a3,$bi,$t3 + addxc $acc5,%g0,$acc4 + + addcc $acc0,$t0,$acc0 ! accumulate low parts of multiplication + umulxhi $a0,$bi,$t0 + addxccc $acc1,$t1,$acc1 + umulxhi $a1,$bi,$t1 + addxccc $acc2,$t2,$acc2 + umulxhi $a2,$bi,$t2 + addxccc $acc3,$t3,$acc3 + umulxhi $a3,$bi,$t3 + addxc $acc4,%g0,$acc4 +___ +$code.=<<___ if ($i<3); + ldx [$bp+8*($i+1)],$bi ! bp[$i+1] +___ +$code.=<<___; + addcc $acc1,$t0,$acc1 ! accumulate high parts of multiplication + sllx $acc0,32,$t0 + addxccc $acc2,$t1,$acc2 + srlx $acc0,32,$t1 + addxccc $acc3,$t2,$acc3 + addxccc $acc4,$t3,$acc4 + addxc %g0,%g0,$acc5 +___ +} +$code.=<<___; + sub $acc0,$t0,$t2 ! acc0*0xFFFFFFFF00000001, low part + umulxhi $acc0,$poly3,$t3 ! acc0*0xFFFFFFFF00000001, high part + addcc $acc1,$t0,$acc0 ! +=acc[0]<<96 and omit acc[0] + addxccc $acc2,$t1,$acc1 + addxccc $acc3,$t2,$acc2 ! +=acc[0]*0xFFFFFFFF00000001 + addxccc $acc4,$t3,$acc3 + b .Lmul_final_vis3 ! see below + addxc $acc5,%g0,$acc4 +.type __ecp_nistz256_mul_mont_vis3,#function +.size __ecp_nistz256_mul_mont_vis3,.-__ecp_nistz256_mul_mont_vis3 + +! compared to above __ecp_nistz256_mul_mont_vis3 it's 21% less +! instructions, but only 14% faster [on T4]... +.align 32 +__ecp_nistz256_sqr_mont_vis3: + ! | | | | | |a1*a0| | + ! | | | | |a2*a0| | | + ! | |a3*a2|a3*a0| | | | + ! | | | |a2*a1| | | | + ! | | |a3*a1| | | | | + ! *| | | | | | | | 2| + ! +|a3*a3|a2*a2|a1*a1|a0*a0| + ! |--+--+--+--+--+--+--+--| + ! |A7|A6|A5|A4|A3|A2|A1|A0|, where Ax is $accx, i.e. follow $accx + ! + ! "can't overflow" below mark carrying into high part of + ! multiplication result, which can't overflow, because it + ! can never be all ones. + + mulx $a1,$a0,$acc1 ! a[1]*a[0] + umulxhi $a1,$a0,$t1 + mulx $a2,$a0,$acc2 ! a[2]*a[0] + umulxhi $a2,$a0,$t2 + mulx $a3,$a0,$acc3 ! a[3]*a[0] + umulxhi $a3,$a0,$acc4 + + addcc $acc2,$t1,$acc2 ! accumulate high parts of multiplication + mulx $a2,$a1,$t0 ! 
a[2]*a[1] + umulxhi $a2,$a1,$t1 + addxccc $acc3,$t2,$acc3 + mulx $a3,$a1,$t2 ! a[3]*a[1] + umulxhi $a3,$a1,$t3 + addxc $acc4,%g0,$acc4 ! can't overflow + + mulx $a3,$a2,$acc5 ! a[3]*a[2] + not $poly3,$poly3 ! 0xFFFFFFFF00000001 + umulxhi $a3,$a2,$acc6 + + addcc $t2,$t1,$t1 ! accumulate high parts of multiplication + mulx $a0,$a0,$acc0 ! a[0]*a[0] + addxc $t3,%g0,$t2 ! can't overflow + + addcc $acc3,$t0,$acc3 ! accumulate low parts of multiplication + umulxhi $a0,$a0,$a0 + addxccc $acc4,$t1,$acc4 + mulx $a1,$a1,$t1 ! a[1]*a[1] + addxccc $acc5,$t2,$acc5 + umulxhi $a1,$a1,$a1 + addxc $acc6,%g0,$acc6 ! can't overflow + + addcc $acc1,$acc1,$acc1 ! acc[1-6]*=2 + mulx $a2,$a2,$t2 ! a[2]*a[2] + addxccc $acc2,$acc2,$acc2 + umulxhi $a2,$a2,$a2 + addxccc $acc3,$acc3,$acc3 + mulx $a3,$a3,$t3 ! a[3]*a[3] + addxccc $acc4,$acc4,$acc4 + umulxhi $a3,$a3,$a3 + addxccc $acc5,$acc5,$acc5 + addxccc $acc6,$acc6,$acc6 + addxc %g0,%g0,$acc7 + + addcc $acc1,$a0,$acc1 ! +a[i]*a[i] + addxccc $acc2,$t1,$acc2 + addxccc $acc3,$a1,$acc3 + addxccc $acc4,$t2,$acc4 + sllx $acc0,32,$t0 + addxccc $acc5,$a2,$acc5 + srlx $acc0,32,$t1 + addxccc $acc6,$t3,$acc6 + sub $acc0,$t0,$t2 ! acc0*0xFFFFFFFF00000001, low part + addxc $acc7,$a3,$acc7 +___ +for($i=0;$i<3;$i++) { # reductions, see commentary + # in multiplication for details +$code.=<<___; + umulxhi $acc0,$poly3,$t3 ! acc0*0xFFFFFFFF00000001, high part + addcc $acc1,$t0,$acc0 ! +=acc[0]<<96 and omit acc[0] + sllx $acc0,32,$t0 + addxccc $acc2,$t1,$acc1 + srlx $acc0,32,$t1 + addxccc $acc3,$t2,$acc2 ! +=acc[0]*0xFFFFFFFF00000001 + sub $acc0,$t0,$t2 ! acc0*0xFFFFFFFF00000001, low part + addxc %g0,$t3,$acc3 ! can't overflow +___ +} +$code.=<<___; + umulxhi $acc0,$poly3,$t3 ! acc0*0xFFFFFFFF00000001, high part + addcc $acc1,$t0,$acc0 ! +=acc[0]<<96 and omit acc[0] + addxccc $acc2,$t1,$acc1 + addxccc $acc3,$t2,$acc2 ! +=acc[0]*0xFFFFFFFF00000001 + addxc %g0,$t3,$acc3 ! can't overflow + + addcc $acc0,$acc4,$acc0 ! accumulate upper half + addxccc $acc1,$acc5,$acc1 + addxccc $acc2,$acc6,$acc2 + addxccc $acc3,$acc7,$acc3 + addxc %g0,%g0,$acc4 + +.Lmul_final_vis3: + + ! Final step is "if result > mod, subtract mod", but as comparison + ! means subtraction, we do the subtraction and then copy outcome + ! if it didn't borrow. But note that as we [have to] replace + ! subtraction with addition with negative, carry/borrow logic is + ! inverse. + + addcc $acc0,1,$t0 ! add -modulus, i.e. subtract + not $poly3,$poly3 ! restore 0x00000000FFFFFFFE + addxccc $acc1,$poly1,$t1 + addxccc $acc2,$minus1,$t2 + addxccc $acc3,$poly3,$t3 + addxccc $acc4,$minus1,%g0 ! did it carry? + + movcs %xcc,$t0,$acc0 + movcs %xcc,$t1,$acc1 + stx $acc0,[$rp] + movcs %xcc,$t2,$acc2 + stx $acc1,[$rp+8] + movcs %xcc,$t3,$acc3 + stx $acc2,[$rp+16] + retl + stx $acc3,[$rp+24] +.type __ecp_nistz256_sqr_mont_vis3,#function +.size __ecp_nistz256_sqr_mont_vis3,.-__ecp_nistz256_sqr_mont_vis3 +___ + +######################################################################## +# void ecp_nistz256_point_double(P256_POINT *out,const P256_POINT *inp); +# +{ +my ($res_x,$res_y,$res_z, + $in_x,$in_y,$in_z, + $S,$M,$Zsqr,$tmp0)=map(32*$_,(0..9)); +# above map() describes stack layout with 10 temporary +# 256-bit vectors on top. + +$code.=<<___; +.align 32 +ecp_nistz256_point_double_vis3: + save %sp,-STACK64_FRAME-32*10,%sp + + mov $rp,$rp_real +.Ldouble_shortcut_vis3: + mov -1,$minus1 + mov -2,$poly3 + sllx $minus1,32,$poly1 ! 0xFFFFFFFF00000000 + srl $poly3,0,$poly3 ! 0x00000000FFFFFFFE + + ! convert input to uint64_t[4] + ld [$ap],$a0 ! 
in_x + ld [$ap+4],$t0 + ld [$ap+8],$a1 + ld [$ap+12],$t1 + ld [$ap+16],$a2 + ld [$ap+20],$t2 + ld [$ap+24],$a3 + ld [$ap+28],$t3 + sllx $t0,32,$t0 + sllx $t1,32,$t1 + ld [$ap+32],$acc0 ! in_y + or $a0,$t0,$a0 + ld [$ap+32+4],$t0 + sllx $t2,32,$t2 + ld [$ap+32+8],$acc1 + or $a1,$t1,$a1 + ld [$ap+32+12],$t1 + sllx $t3,32,$t3 + ld [$ap+32+16],$acc2 + or $a2,$t2,$a2 + ld [$ap+32+20],$t2 + or $a3,$t3,$a3 + ld [$ap+32+24],$acc3 + sllx $t0,32,$t0 + ld [$ap+32+28],$t3 + sllx $t1,32,$t1 + stx $a0,[%sp+LOCALS64+$in_x] + sllx $t2,32,$t2 + stx $a1,[%sp+LOCALS64+$in_x+8] + sllx $t3,32,$t3 + stx $a2,[%sp+LOCALS64+$in_x+16] + or $acc0,$t0,$acc0 + stx $a3,[%sp+LOCALS64+$in_x+24] + or $acc1,$t1,$acc1 + stx $acc0,[%sp+LOCALS64+$in_y] + or $acc2,$t2,$acc2 + stx $acc1,[%sp+LOCALS64+$in_y+8] + or $acc3,$t3,$acc3 + stx $acc2,[%sp+LOCALS64+$in_y+16] + stx $acc3,[%sp+LOCALS64+$in_y+24] + + ld [$ap+64],$a0 ! in_z + ld [$ap+64+4],$t0 + ld [$ap+64+8],$a1 + ld [$ap+64+12],$t1 + ld [$ap+64+16],$a2 + ld [$ap+64+20],$t2 + ld [$ap+64+24],$a3 + ld [$ap+64+28],$t3 + sllx $t0,32,$t0 + sllx $t1,32,$t1 + or $a0,$t0,$a0 + sllx $t2,32,$t2 + or $a1,$t1,$a1 + sllx $t3,32,$t3 + or $a2,$t2,$a2 + or $a3,$t3,$a3 + sllx $t0,32,$t0 + sllx $t1,32,$t1 + stx $a0,[%sp+LOCALS64+$in_z] + sllx $t2,32,$t2 + stx $a1,[%sp+LOCALS64+$in_z+8] + sllx $t3,32,$t3 + stx $a2,[%sp+LOCALS64+$in_z+16] + stx $a3,[%sp+LOCALS64+$in_z+24] + + ! in_y is still in $acc0-$acc3 + call __ecp_nistz256_mul_by_2_vis3 ! p256_mul_by_2(S, in_y); + add %sp,LOCALS64+$S,$rp + + ! in_z is still in $a0-$a3 + call __ecp_nistz256_sqr_mont_vis3 ! p256_sqr_mont(Zsqr, in_z); + add %sp,LOCALS64+$Zsqr,$rp + + mov $acc0,$a0 ! put Zsqr aside + mov $acc1,$a1 + mov $acc2,$a2 + mov $acc3,$a3 + + add %sp,LOCALS64+$in_x,$bp + call __ecp_nistz256_add_vis3 ! p256_add(M, Zsqr, in_x); + add %sp,LOCALS64+$M,$rp + + mov $a0,$acc0 ! restore Zsqr + ldx [%sp+LOCALS64+$S],$a0 ! forward load + mov $a1,$acc1 + ldx [%sp+LOCALS64+$S+8],$a1 + mov $a2,$acc2 + ldx [%sp+LOCALS64+$S+16],$a2 + mov $a3,$acc3 + ldx [%sp+LOCALS64+$S+24],$a3 + + add %sp,LOCALS64+$in_x,$bp + call __ecp_nistz256_sub_morf_vis3 ! p256_sub(Zsqr, in_x, Zsqr); + add %sp,LOCALS64+$Zsqr,$rp + + call __ecp_nistz256_sqr_mont_vis3 ! p256_sqr_mont(S, S); + add %sp,LOCALS64+$S,$rp + + ldx [%sp+LOCALS64+$in_z],$bi + ldx [%sp+LOCALS64+$in_y],$a0 + ldx [%sp+LOCALS64+$in_y+8],$a1 + ldx [%sp+LOCALS64+$in_y+16],$a2 + ldx [%sp+LOCALS64+$in_y+24],$a3 + add %sp,LOCALS64+$in_z,$bp + call __ecp_nistz256_mul_mont_vis3 ! p256_mul_mont(tmp0, in_z, in_y); + add %sp,LOCALS64+$tmp0,$rp + + ldx [%sp+LOCALS64+$M],$bi ! forward load + ldx [%sp+LOCALS64+$Zsqr],$a0 + ldx [%sp+LOCALS64+$Zsqr+8],$a1 + ldx [%sp+LOCALS64+$Zsqr+16],$a2 + ldx [%sp+LOCALS64+$Zsqr+24],$a3 + + call __ecp_nistz256_mul_by_2_vis3 ! p256_mul_by_2(res_z, tmp0); + add %sp,LOCALS64+$res_z,$rp + + add %sp,LOCALS64+$M,$bp + call __ecp_nistz256_mul_mont_vis3 ! p256_mul_mont(M, M, Zsqr); + add %sp,LOCALS64+$M,$rp + + mov $acc0,$a0 ! put aside M + mov $acc1,$a1 + mov $acc2,$a2 + mov $acc3,$a3 + call __ecp_nistz256_mul_by_2_vis3 + add %sp,LOCALS64+$M,$rp + mov $a0,$t0 ! copy M + ldx [%sp+LOCALS64+$S],$a0 ! forward load + mov $a1,$t1 + ldx [%sp+LOCALS64+$S+8],$a1 + mov $a2,$t2 + ldx [%sp+LOCALS64+$S+16],$a2 + mov $a3,$t3 + ldx [%sp+LOCALS64+$S+24],$a3 + call __ecp_nistz256_add_noload_vis3 ! p256_mul_by_3(M, M); + add %sp,LOCALS64+$M,$rp + + call __ecp_nistz256_sqr_mont_vis3 ! p256_sqr_mont(tmp0, S); + add %sp,LOCALS64+$tmp0,$rp + + ldx [%sp+LOCALS64+$S],$bi ! 
forward load + ldx [%sp+LOCALS64+$in_x],$a0 + ldx [%sp+LOCALS64+$in_x+8],$a1 + ldx [%sp+LOCALS64+$in_x+16],$a2 + ldx [%sp+LOCALS64+$in_x+24],$a3 + + call __ecp_nistz256_div_by_2_vis3 ! p256_div_by_2(res_y, tmp0); + add %sp,LOCALS64+$res_y,$rp + + add %sp,LOCALS64+$S,$bp + call __ecp_nistz256_mul_mont_vis3 ! p256_mul_mont(S, S, in_x); + add %sp,LOCALS64+$S,$rp + + ldx [%sp+LOCALS64+$M],$a0 ! forward load + ldx [%sp+LOCALS64+$M+8],$a1 + ldx [%sp+LOCALS64+$M+16],$a2 + ldx [%sp+LOCALS64+$M+24],$a3 + + call __ecp_nistz256_mul_by_2_vis3 ! p256_mul_by_2(tmp0, S); + add %sp,LOCALS64+$tmp0,$rp + + call __ecp_nistz256_sqr_mont_vis3 ! p256_sqr_mont(res_x, M); + add %sp,LOCALS64+$res_x,$rp + + add %sp,LOCALS64+$tmp0,$bp + call __ecp_nistz256_sub_from_vis3 ! p256_sub(res_x, res_x, tmp0); + add %sp,LOCALS64+$res_x,$rp + + ldx [%sp+LOCALS64+$M],$a0 ! forward load + ldx [%sp+LOCALS64+$M+8],$a1 + ldx [%sp+LOCALS64+$M+16],$a2 + ldx [%sp+LOCALS64+$M+24],$a3 + + add %sp,LOCALS64+$S,$bp + call __ecp_nistz256_sub_morf_vis3 ! p256_sub(S, S, res_x); + add %sp,LOCALS64+$S,$rp + + mov $acc0,$bi + call __ecp_nistz256_mul_mont_vis3 ! p256_mul_mont(S, S, M); + add %sp,LOCALS64+$S,$rp + + ldx [%sp+LOCALS64+$res_x],$a0 ! forward load + ldx [%sp+LOCALS64+$res_x+8],$a1 + ldx [%sp+LOCALS64+$res_x+16],$a2 + ldx [%sp+LOCALS64+$res_x+24],$a3 + + add %sp,LOCALS64+$res_y,$bp + call __ecp_nistz256_sub_from_vis3 ! p256_sub(res_y, S, res_y); + add %sp,LOCALS64+$res_y,$bp + + ! convert output to uint_32[8] + srlx $a0,32,$t0 + srlx $a1,32,$t1 + st $a0,[$rp_real] ! res_x + srlx $a2,32,$t2 + st $t0,[$rp_real+4] + srlx $a3,32,$t3 + st $a1,[$rp_real+8] + st $t1,[$rp_real+12] + st $a2,[$rp_real+16] + st $t2,[$rp_real+20] + st $a3,[$rp_real+24] + st $t3,[$rp_real+28] + + ldx [%sp+LOCALS64+$res_z],$a0 ! forward load + srlx $acc0,32,$t0 + ldx [%sp+LOCALS64+$res_z+8],$a1 + srlx $acc1,32,$t1 + ldx [%sp+LOCALS64+$res_z+16],$a2 + srlx $acc2,32,$t2 + ldx [%sp+LOCALS64+$res_z+24],$a3 + srlx $acc3,32,$t3 + st $acc0,[$rp_real+32] ! res_y + st $t0, [$rp_real+32+4] + st $acc1,[$rp_real+32+8] + st $t1, [$rp_real+32+12] + st $acc2,[$rp_real+32+16] + st $t2, [$rp_real+32+20] + st $acc3,[$rp_real+32+24] + st $t3, [$rp_real+32+28] + + srlx $a0,32,$t0 + srlx $a1,32,$t1 + st $a0,[$rp_real+64] ! res_z + srlx $a2,32,$t2 + st $t0,[$rp_real+64+4] + srlx $a3,32,$t3 + st $a1,[$rp_real+64+8] + st $t1,[$rp_real+64+12] + st $a2,[$rp_real+64+16] + st $t2,[$rp_real+64+20] + st $a3,[$rp_real+64+24] + st $t3,[$rp_real+64+28] + + ret + restore +.type ecp_nistz256_point_double_vis3,#function +.size ecp_nistz256_point_double_vis3,.-ecp_nistz256_point_double_vis3 +___ +} +######################################################################## +# void ecp_nistz256_point_add(P256_POINT *out,const P256_POINT *in1, +# const P256_POINT *in2); +{ +my ($res_x,$res_y,$res_z, + $in1_x,$in1_y,$in1_z, + $in2_x,$in2_y,$in2_z, + $H,$Hsqr,$R,$Rsqr,$Hcub, + $U1,$U2,$S1,$S2)=map(32*$_,(0..17)); +my ($Z1sqr, $Z2sqr) = ($Hsqr, $Rsqr); + +# above map() describes stack layout with 18 temporary +# 256-bit vectors on top. Then we reserve some space for +# !in1infty, !in2infty and result of check for zero. + +$code.=<<___; +.globl ecp_nistz256_point_add_vis3 +.align 32 +ecp_nistz256_point_add_vis3: + save %sp,-STACK64_FRAME-32*18-32,%sp + + mov $rp,$rp_real + mov -1,$minus1 + mov -2,$poly3 + sllx $minus1,32,$poly1 ! 0xFFFFFFFF00000000 + srl $poly3,0,$poly3 ! 0x00000000FFFFFFFE + + ! convert input to uint64_t[4] + ld [$bp],$a0 ! 
in2_x + ld [$bp+4],$t0 + ld [$bp+8],$a1 + ld [$bp+12],$t1 + ld [$bp+16],$a2 + ld [$bp+20],$t2 + ld [$bp+24],$a3 + ld [$bp+28],$t3 + sllx $t0,32,$t0 + sllx $t1,32,$t1 + ld [$bp+32],$acc0 ! in2_y + or $a0,$t0,$a0 + ld [$bp+32+4],$t0 + sllx $t2,32,$t2 + ld [$bp+32+8],$acc1 + or $a1,$t1,$a1 + ld [$bp+32+12],$t1 + sllx $t3,32,$t3 + ld [$bp+32+16],$acc2 + or $a2,$t2,$a2 + ld [$bp+32+20],$t2 + or $a3,$t3,$a3 + ld [$bp+32+24],$acc3 + sllx $t0,32,$t0 + ld [$bp+32+28],$t3 + sllx $t1,32,$t1 + stx $a0,[%sp+LOCALS64+$in2_x] + sllx $t2,32,$t2 + stx $a1,[%sp+LOCALS64+$in2_x+8] + sllx $t3,32,$t3 + stx $a2,[%sp+LOCALS64+$in2_x+16] + or $acc0,$t0,$acc0 + stx $a3,[%sp+LOCALS64+$in2_x+24] + or $acc1,$t1,$acc1 + stx $acc0,[%sp+LOCALS64+$in2_y] + or $acc2,$t2,$acc2 + stx $acc1,[%sp+LOCALS64+$in2_y+8] + or $acc3,$t3,$acc3 + stx $acc2,[%sp+LOCALS64+$in2_y+16] + stx $acc3,[%sp+LOCALS64+$in2_y+24] + + ld [$bp+64],$acc0 ! in2_z + ld [$bp+64+4],$t0 + ld [$bp+64+8],$acc1 + ld [$bp+64+12],$t1 + ld [$bp+64+16],$acc2 + ld [$bp+64+20],$t2 + ld [$bp+64+24],$acc3 + ld [$bp+64+28],$t3 + sllx $t0,32,$t0 + sllx $t1,32,$t1 + ld [$ap],$a0 ! in1_x + or $acc0,$t0,$acc0 + ld [$ap+4],$t0 + sllx $t2,32,$t2 + ld [$ap+8],$a1 + or $acc1,$t1,$acc1 + ld [$ap+12],$t1 + sllx $t3,32,$t3 + ld [$ap+16],$a2 + or $acc2,$t2,$acc2 + ld [$ap+20],$t2 + or $acc3,$t3,$acc3 + ld [$ap+24],$a3 + sllx $t0,32,$t0 + ld [$ap+28],$t3 + sllx $t1,32,$t1 + stx $acc0,[%sp+LOCALS64+$in2_z] + sllx $t2,32,$t2 + stx $acc1,[%sp+LOCALS64+$in2_z+8] + sllx $t3,32,$t3 + stx $acc2,[%sp+LOCALS64+$in2_z+16] + stx $acc3,[%sp+LOCALS64+$in2_z+24] + + or $acc1,$acc0,$acc0 + or $acc3,$acc2,$acc2 + or $acc2,$acc0,$acc0 + movrnz $acc0,-1,$acc0 ! !in2infty + stx $acc0,[%fp+STACK_BIAS-8] + + or $a0,$t0,$a0 + ld [$ap+32],$acc0 ! in1_y + or $a1,$t1,$a1 + ld [$ap+32+4],$t0 + or $a2,$t2,$a2 + ld [$ap+32+8],$acc1 + or $a3,$t3,$a3 + ld [$ap+32+12],$t1 + ld [$ap+32+16],$acc2 + ld [$ap+32+20],$t2 + ld [$ap+32+24],$acc3 + sllx $t0,32,$t0 + ld [$ap+32+28],$t3 + sllx $t1,32,$t1 + stx $a0,[%sp+LOCALS64+$in1_x] + sllx $t2,32,$t2 + stx $a1,[%sp+LOCALS64+$in1_x+8] + sllx $t3,32,$t3 + stx $a2,[%sp+LOCALS64+$in1_x+16] + or $acc0,$t0,$acc0 + stx $a3,[%sp+LOCALS64+$in1_x+24] + or $acc1,$t1,$acc1 + stx $acc0,[%sp+LOCALS64+$in1_y] + or $acc2,$t2,$acc2 + stx $acc1,[%sp+LOCALS64+$in1_y+8] + or $acc3,$t3,$acc3 + stx $acc2,[%sp+LOCALS64+$in1_y+16] + stx $acc3,[%sp+LOCALS64+$in1_y+24] + + ldx [%sp+LOCALS64+$in2_z],$a0 ! forward load + ldx [%sp+LOCALS64+$in2_z+8],$a1 + ldx [%sp+LOCALS64+$in2_z+16],$a2 + ldx [%sp+LOCALS64+$in2_z+24],$a3 + + ld [$ap+64],$acc0 ! in1_z + ld [$ap+64+4],$t0 + ld [$ap+64+8],$acc1 + ld [$ap+64+12],$t1 + ld [$ap+64+16],$acc2 + ld [$ap+64+20],$t2 + ld [$ap+64+24],$acc3 + ld [$ap+64+28],$t3 + sllx $t0,32,$t0 + sllx $t1,32,$t1 + or $acc0,$t0,$acc0 + sllx $t2,32,$t2 + or $acc1,$t1,$acc1 + sllx $t3,32,$t3 + stx $acc0,[%sp+LOCALS64+$in1_z] + or $acc2,$t2,$acc2 + stx $acc1,[%sp+LOCALS64+$in1_z+8] + or $acc3,$t3,$acc3 + stx $acc2,[%sp+LOCALS64+$in1_z+16] + stx $acc3,[%sp+LOCALS64+$in1_z+24] + + or $acc1,$acc0,$acc0 + or $acc3,$acc2,$acc2 + or $acc2,$acc0,$acc0 + movrnz $acc0,-1,$acc0 ! !in1infty + stx $acc0,[%fp+STACK_BIAS-16] + + call __ecp_nistz256_sqr_mont_vis3 ! p256_sqr_mont(Z2sqr, in2_z); + add %sp,LOCALS64+$Z2sqr,$rp + + ldx [%sp+LOCALS64+$in1_z],$a0 + ldx [%sp+LOCALS64+$in1_z+8],$a1 + ldx [%sp+LOCALS64+$in1_z+16],$a2 + ldx [%sp+LOCALS64+$in1_z+24],$a3 + call __ecp_nistz256_sqr_mont_vis3 ! 
p256_sqr_mont(Z1sqr, in1_z); + add %sp,LOCALS64+$Z1sqr,$rp + + ldx [%sp+LOCALS64+$Z2sqr],$bi + ldx [%sp+LOCALS64+$in2_z],$a0 + ldx [%sp+LOCALS64+$in2_z+8],$a1 + ldx [%sp+LOCALS64+$in2_z+16],$a2 + ldx [%sp+LOCALS64+$in2_z+24],$a3 + add %sp,LOCALS64+$Z2sqr,$bp + call __ecp_nistz256_mul_mont_vis3 ! p256_mul_mont(S1, Z2sqr, in2_z); + add %sp,LOCALS64+$S1,$rp + + ldx [%sp+LOCALS64+$Z1sqr],$bi + ldx [%sp+LOCALS64+$in1_z],$a0 + ldx [%sp+LOCALS64+$in1_z+8],$a1 + ldx [%sp+LOCALS64+$in1_z+16],$a2 + ldx [%sp+LOCALS64+$in1_z+24],$a3 + add %sp,LOCALS64+$Z1sqr,$bp + call __ecp_nistz256_mul_mont_vis3 ! p256_mul_mont(S2, Z1sqr, in1_z); + add %sp,LOCALS64+$S2,$rp + + ldx [%sp+LOCALS64+$S1],$bi + ldx [%sp+LOCALS64+$in1_y],$a0 + ldx [%sp+LOCALS64+$in1_y+8],$a1 + ldx [%sp+LOCALS64+$in1_y+16],$a2 + ldx [%sp+LOCALS64+$in1_y+24],$a3 + add %sp,LOCALS64+$S1,$bp + call __ecp_nistz256_mul_mont_vis3 ! p256_mul_mont(S1, S1, in1_y); + add %sp,LOCALS64+$S1,$rp + + ldx [%sp+LOCALS64+$S2],$bi + ldx [%sp+LOCALS64+$in2_y],$a0 + ldx [%sp+LOCALS64+$in2_y+8],$a1 + ldx [%sp+LOCALS64+$in2_y+16],$a2 + ldx [%sp+LOCALS64+$in2_y+24],$a3 + add %sp,LOCALS64+$S2,$bp + call __ecp_nistz256_mul_mont_vis3 ! p256_mul_mont(S2, S2, in2_y); + add %sp,LOCALS64+$S2,$rp + + ldx [%sp+LOCALS64+$Z2sqr],$bi ! forward load + ldx [%sp+LOCALS64+$in1_x],$a0 + ldx [%sp+LOCALS64+$in1_x+8],$a1 + ldx [%sp+LOCALS64+$in1_x+16],$a2 + ldx [%sp+LOCALS64+$in1_x+24],$a3 + + add %sp,LOCALS64+$S1,$bp + call __ecp_nistz256_sub_from_vis3 ! p256_sub(R, S2, S1); + add %sp,LOCALS64+$R,$rp + + or $acc1,$acc0,$acc0 ! see if result is zero + or $acc3,$acc2,$acc2 + or $acc2,$acc0,$acc0 + stx $acc0,[%fp+STACK_BIAS-24] + + add %sp,LOCALS64+$Z2sqr,$bp + call __ecp_nistz256_mul_mont_vis3 ! p256_mul_mont(U1, in1_x, Z2sqr); + add %sp,LOCALS64+$U1,$rp + + ldx [%sp+LOCALS64+$Z1sqr],$bi + ldx [%sp+LOCALS64+$in2_x],$a0 + ldx [%sp+LOCALS64+$in2_x+8],$a1 + ldx [%sp+LOCALS64+$in2_x+16],$a2 + ldx [%sp+LOCALS64+$in2_x+24],$a3 + add %sp,LOCALS64+$Z1sqr,$bp + call __ecp_nistz256_mul_mont_vis3 ! p256_mul_mont(U2, in2_x, Z1sqr); + add %sp,LOCALS64+$U2,$rp + + ldx [%sp+LOCALS64+$R],$a0 ! forward load + ldx [%sp+LOCALS64+$R+8],$a1 + ldx [%sp+LOCALS64+$R+16],$a2 + ldx [%sp+LOCALS64+$R+24],$a3 + + add %sp,LOCALS64+$U1,$bp + call __ecp_nistz256_sub_from_vis3 ! p256_sub(H, U2, U1); + add %sp,LOCALS64+$H,$rp + + or $acc1,$acc0,$acc0 ! see if result is zero + or $acc3,$acc2,$acc2 + orcc $acc2,$acc0,$acc0 + + bne,pt %xcc,.Ladd_proceed_vis3 ! is_equal(U1,U2)? + nop + + ldx [%fp+STACK_BIAS-8],$t0 + ldx [%fp+STACK_BIAS-16],$t1 + ldx [%fp+STACK_BIAS-24],$t2 + andcc $t0,$t1,%g0 + be,pt %xcc,.Ladd_proceed_vis3 ! (in1infty || in2infty)? + nop + andcc $t2,$t2,%g0 + be,a,pt %xcc,.Ldouble_shortcut_vis3 ! is_equal(S1,S2)? + add %sp,32*(12-10)+32,%sp ! difference in frame sizes + + st %g0,[$rp_real] + st %g0,[$rp_real+4] + st %g0,[$rp_real+8] + st %g0,[$rp_real+12] + st %g0,[$rp_real+16] + st %g0,[$rp_real+20] + st %g0,[$rp_real+24] + st %g0,[$rp_real+28] + st %g0,[$rp_real+32] + st %g0,[$rp_real+32+4] + st %g0,[$rp_real+32+8] + st %g0,[$rp_real+32+12] + st %g0,[$rp_real+32+16] + st %g0,[$rp_real+32+20] + st %g0,[$rp_real+32+24] + st %g0,[$rp_real+32+28] + st %g0,[$rp_real+64] + st %g0,[$rp_real+64+4] + st %g0,[$rp_real+64+8] + st %g0,[$rp_real+64+12] + st %g0,[$rp_real+64+16] + st %g0,[$rp_real+64+20] + st %g0,[$rp_real+64+24] + st %g0,[$rp_real+64+28] + b .Ladd_done_vis3 + nop + +.align 16 +.Ladd_proceed_vis3: + call __ecp_nistz256_sqr_mont_vis3 ! 
p256_sqr_mont(Rsqr, R); + add %sp,LOCALS64+$Rsqr,$rp + + ldx [%sp+LOCALS64+$H],$bi + ldx [%sp+LOCALS64+$in1_z],$a0 + ldx [%sp+LOCALS64+$in1_z+8],$a1 + ldx [%sp+LOCALS64+$in1_z+16],$a2 + ldx [%sp+LOCALS64+$in1_z+24],$a3 + add %sp,LOCALS64+$H,$bp + call __ecp_nistz256_mul_mont_vis3 ! p256_mul_mont(res_z, H, in1_z); + add %sp,LOCALS64+$res_z,$rp + + ldx [%sp+LOCALS64+$H],$a0 + ldx [%sp+LOCALS64+$H+8],$a1 + ldx [%sp+LOCALS64+$H+16],$a2 + ldx [%sp+LOCALS64+$H+24],$a3 + call __ecp_nistz256_sqr_mont_vis3 ! p256_sqr_mont(Hsqr, H); + add %sp,LOCALS64+$Hsqr,$rp + + ldx [%sp+LOCALS64+$res_z],$bi + ldx [%sp+LOCALS64+$in2_z],$a0 + ldx [%sp+LOCALS64+$in2_z+8],$a1 + ldx [%sp+LOCALS64+$in2_z+16],$a2 + ldx [%sp+LOCALS64+$in2_z+24],$a3 + add %sp,LOCALS64+$res_z,$bp + call __ecp_nistz256_mul_mont_vis3 ! p256_mul_mont(res_z, res_z, in2_z); + add %sp,LOCALS64+$res_z,$rp + + ldx [%sp+LOCALS64+$H],$bi + ldx [%sp+LOCALS64+$Hsqr],$a0 + ldx [%sp+LOCALS64+$Hsqr+8],$a1 + ldx [%sp+LOCALS64+$Hsqr+16],$a2 + ldx [%sp+LOCALS64+$Hsqr+24],$a3 + add %sp,LOCALS64+$H,$bp + call __ecp_nistz256_mul_mont_vis3 ! p256_mul_mont(Hcub, Hsqr, H); + add %sp,LOCALS64+$Hcub,$rp + + ldx [%sp+LOCALS64+$U1],$bi + ldx [%sp+LOCALS64+$Hsqr],$a0 + ldx [%sp+LOCALS64+$Hsqr+8],$a1 + ldx [%sp+LOCALS64+$Hsqr+16],$a2 + ldx [%sp+LOCALS64+$Hsqr+24],$a3 + add %sp,LOCALS64+$U1,$bp + call __ecp_nistz256_mul_mont_vis3 ! p256_mul_mont(U2, U1, Hsqr); + add %sp,LOCALS64+$U2,$rp + + call __ecp_nistz256_mul_by_2_vis3 ! p256_mul_by_2(Hsqr, U2); + add %sp,LOCALS64+$Hsqr,$rp + + add %sp,LOCALS64+$Rsqr,$bp + call __ecp_nistz256_sub_morf_vis3 ! p256_sub(res_x, Rsqr, Hsqr); + add %sp,LOCALS64+$res_x,$rp + + add %sp,LOCALS64+$Hcub,$bp + call __ecp_nistz256_sub_from_vis3 ! p256_sub(res_x, res_x, Hcub); + add %sp,LOCALS64+$res_x,$rp + + ldx [%sp+LOCALS64+$S1],$bi ! forward load + ldx [%sp+LOCALS64+$Hcub],$a0 + ldx [%sp+LOCALS64+$Hcub+8],$a1 + ldx [%sp+LOCALS64+$Hcub+16],$a2 + ldx [%sp+LOCALS64+$Hcub+24],$a3 + + add %sp,LOCALS64+$U2,$bp + call __ecp_nistz256_sub_morf_vis3 ! p256_sub(res_y, U2, res_x); + add %sp,LOCALS64+$res_y,$rp + + add %sp,LOCALS64+$S1,$bp + call __ecp_nistz256_mul_mont_vis3 ! p256_mul_mont(S2, S1, Hcub); + add %sp,LOCALS64+$S2,$rp + + ldx [%sp+LOCALS64+$R],$bi + ldx [%sp+LOCALS64+$res_y],$a0 + ldx [%sp+LOCALS64+$res_y+8],$a1 + ldx [%sp+LOCALS64+$res_y+16],$a2 + ldx [%sp+LOCALS64+$res_y+24],$a3 + add %sp,LOCALS64+$R,$bp + call __ecp_nistz256_mul_mont_vis3 ! p256_mul_mont(res_y, res_y, R); + add %sp,LOCALS64+$res_y,$rp + + add %sp,LOCALS64+$S2,$bp + call __ecp_nistz256_sub_from_vis3 ! p256_sub(res_y, res_y, S2); + add %sp,LOCALS64+$res_y,$rp + + ldx [%fp+STACK_BIAS-16],$t1 ! !in1infty + ldx [%fp+STACK_BIAS-8],$t2 ! !in2infty +___ +for($i=0;$i<96;$i+=16) { # conditional moves +$code.=<<___; + ldx [%sp+LOCALS64+$res_x+$i],$acc0 ! res + ldx [%sp+LOCALS64+$res_x+$i+8],$acc1 + ldx [%sp+LOCALS64+$in2_x+$i],$acc2 ! in2 + ldx [%sp+LOCALS64+$in2_x+$i+8],$acc3 + ldx [%sp+LOCALS64+$in1_x+$i],$acc4 ! 
in1 + ldx [%sp+LOCALS64+$in1_x+$i+8],$acc5 + movrz $t1,$acc2,$acc0 + movrz $t1,$acc3,$acc1 + movrz $t2,$acc4,$acc0 + movrz $t2,$acc5,$acc1 + srlx $acc0,32,$acc2 + srlx $acc1,32,$acc3 + st $acc0,[$rp_real+$i] + st $acc2,[$rp_real+$i+4] + st $acc1,[$rp_real+$i+8] + st $acc3,[$rp_real+$i+12] +___ +} +$code.=<<___; +.Ladd_done_vis3: + ret + restore +.type ecp_nistz256_point_add_vis3,#function +.size ecp_nistz256_point_add_vis3,.-ecp_nistz256_point_add_vis3 +___ +} +######################################################################## +# void ecp_nistz256_point_add_affine(P256_POINT *out,const P256_POINT *in1, +# const P256_POINT_AFFINE *in2); +{ +my ($res_x,$res_y,$res_z, + $in1_x,$in1_y,$in1_z, + $in2_x,$in2_y, + $U2,$S2,$H,$R,$Hsqr,$Hcub,$Rsqr)=map(32*$_,(0..14)); +my $Z1sqr = $S2; +# above map() describes stack layout with 15 temporary +# 256-bit vectors on top. Then we reserve some space for +# !in1infty and !in2infty. + +$code.=<<___; +.align 32 +ecp_nistz256_point_add_affine_vis3: + save %sp,-STACK64_FRAME-32*15-32,%sp + + mov $rp,$rp_real + mov -1,$minus1 + mov -2,$poly3 + sllx $minus1,32,$poly1 ! 0xFFFFFFFF00000000 + srl $poly3,0,$poly3 ! 0x00000000FFFFFFFE + + ! convert input to uint64_t[4] + ld [$bp],$a0 ! in2_x + ld [$bp+4],$t0 + ld [$bp+8],$a1 + ld [$bp+12],$t1 + ld [$bp+16],$a2 + ld [$bp+20],$t2 + ld [$bp+24],$a3 + ld [$bp+28],$t3 + sllx $t0,32,$t0 + sllx $t1,32,$t1 + ld [$bp+32],$acc0 ! in2_y + or $a0,$t0,$a0 + ld [$bp+32+4],$t0 + sllx $t2,32,$t2 + ld [$bp+32+8],$acc1 + or $a1,$t1,$a1 + ld [$bp+32+12],$t1 + sllx $t3,32,$t3 + ld [$bp+32+16],$acc2 + or $a2,$t2,$a2 + ld [$bp+32+20],$t2 + or $a3,$t3,$a3 + ld [$bp+32+24],$acc3 + sllx $t0,32,$t0 + ld [$bp+32+28],$t3 + sllx $t1,32,$t1 + stx $a0,[%sp+LOCALS64+$in2_x] + sllx $t2,32,$t2 + stx $a1,[%sp+LOCALS64+$in2_x+8] + sllx $t3,32,$t3 + stx $a2,[%sp+LOCALS64+$in2_x+16] + or $acc0,$t0,$acc0 + stx $a3,[%sp+LOCALS64+$in2_x+24] + or $acc1,$t1,$acc1 + stx $acc0,[%sp+LOCALS64+$in2_y] + or $acc2,$t2,$acc2 + stx $acc1,[%sp+LOCALS64+$in2_y+8] + or $acc3,$t3,$acc3 + stx $acc2,[%sp+LOCALS64+$in2_y+16] + stx $acc3,[%sp+LOCALS64+$in2_y+24] + + or $a1,$a0,$a0 + or $a3,$a2,$a2 + or $acc1,$acc0,$acc0 + or $acc3,$acc2,$acc2 + or $a2,$a0,$a0 + or $acc2,$acc0,$acc0 + or $acc0,$a0,$a0 + movrnz $a0,-1,$a0 ! !in2infty + stx $a0,[%fp+STACK_BIAS-8] + + ld [$ap],$a0 ! in1_x + ld [$ap+4],$t0 + ld [$ap+8],$a1 + ld [$ap+12],$t1 + ld [$ap+16],$a2 + ld [$ap+20],$t2 + ld [$ap+24],$a3 + ld [$ap+28],$t3 + sllx $t0,32,$t0 + sllx $t1,32,$t1 + ld [$ap+32],$acc0 ! in1_y + or $a0,$t0,$a0 + ld [$ap+32+4],$t0 + sllx $t2,32,$t2 + ld [$ap+32+8],$acc1 + or $a1,$t1,$a1 + ld [$ap+32+12],$t1 + sllx $t3,32,$t3 + ld [$ap+32+16],$acc2 + or $a2,$t2,$a2 + ld [$ap+32+20],$t2 + or $a3,$t3,$a3 + ld [$ap+32+24],$acc3 + sllx $t0,32,$t0 + ld [$ap+32+28],$t3 + sllx $t1,32,$t1 + stx $a0,[%sp+LOCALS64+$in1_x] + sllx $t2,32,$t2 + stx $a1,[%sp+LOCALS64+$in1_x+8] + sllx $t3,32,$t3 + stx $a2,[%sp+LOCALS64+$in1_x+16] + or $acc0,$t0,$acc0 + stx $a3,[%sp+LOCALS64+$in1_x+24] + or $acc1,$t1,$acc1 + stx $acc0,[%sp+LOCALS64+$in1_y] + or $acc2,$t2,$acc2 + stx $acc1,[%sp+LOCALS64+$in1_y+8] + or $acc3,$t3,$acc3 + stx $acc2,[%sp+LOCALS64+$in1_y+16] + stx $acc3,[%sp+LOCALS64+$in1_y+24] + + ld [$ap+64],$a0 ! 
in1_z + ld [$ap+64+4],$t0 + ld [$ap+64+8],$a1 + ld [$ap+64+12],$t1 + ld [$ap+64+16],$a2 + ld [$ap+64+20],$t2 + ld [$ap+64+24],$a3 + ld [$ap+64+28],$t3 + sllx $t0,32,$t0 + sllx $t1,32,$t1 + or $a0,$t0,$a0 + sllx $t2,32,$t2 + or $a1,$t1,$a1 + sllx $t3,32,$t3 + stx $a0,[%sp+LOCALS64+$in1_z] + or $a2,$t2,$a2 + stx $a1,[%sp+LOCALS64+$in1_z+8] + or $a3,$t3,$a3 + stx $a2,[%sp+LOCALS64+$in1_z+16] + stx $a3,[%sp+LOCALS64+$in1_z+24] + + or $a1,$a0,$t0 + or $a3,$a2,$t2 + or $t2,$t0,$t0 + movrnz $t0,-1,$t0 ! !in1infty + stx $t0,[%fp+STACK_BIAS-16] + + call __ecp_nistz256_sqr_mont_vis3 ! p256_sqr_mont(Z1sqr, in1_z); + add %sp,LOCALS64+$Z1sqr,$rp + + ldx [%sp+LOCALS64+$in2_x],$bi + mov $acc0,$a0 + mov $acc1,$a1 + mov $acc2,$a2 + mov $acc3,$a3 + add %sp,LOCALS64+$in2_x,$bp + call __ecp_nistz256_mul_mont_vis3 ! p256_mul_mont(U2, Z1sqr, in2_x); + add %sp,LOCALS64+$U2,$rp + + ldx [%sp+LOCALS64+$Z1sqr],$bi ! forward load + ldx [%sp+LOCALS64+$in1_z],$a0 + ldx [%sp+LOCALS64+$in1_z+8],$a1 + ldx [%sp+LOCALS64+$in1_z+16],$a2 + ldx [%sp+LOCALS64+$in1_z+24],$a3 + + add %sp,LOCALS64+$in1_x,$bp + call __ecp_nistz256_sub_from_vis3 ! p256_sub(H, U2, in1_x); + add %sp,LOCALS64+$H,$rp + + add %sp,LOCALS64+$Z1sqr,$bp + call __ecp_nistz256_mul_mont_vis3 ! p256_mul_mont(S2, Z1sqr, in1_z); + add %sp,LOCALS64+$S2,$rp + + ldx [%sp+LOCALS64+$H],$bi + ldx [%sp+LOCALS64+$in1_z],$a0 + ldx [%sp+LOCALS64+$in1_z+8],$a1 + ldx [%sp+LOCALS64+$in1_z+16],$a2 + ldx [%sp+LOCALS64+$in1_z+24],$a3 + add %sp,LOCALS64+$H,$bp + call __ecp_nistz256_mul_mont_vis3 ! p256_mul_mont(res_z, H, in1_z); + add %sp,LOCALS64+$res_z,$rp + + ldx [%sp+LOCALS64+$S2],$bi + ldx [%sp+LOCALS64+$in2_y],$a0 + ldx [%sp+LOCALS64+$in2_y+8],$a1 + ldx [%sp+LOCALS64+$in2_y+16],$a2 + ldx [%sp+LOCALS64+$in2_y+24],$a3 + add %sp,LOCALS64+$S2,$bp + call __ecp_nistz256_mul_mont_vis3 ! p256_mul_mont(S2, S2, in2_y); + add %sp,LOCALS64+$S2,$rp + + ldx [%sp+LOCALS64+$H],$a0 ! forward load + ldx [%sp+LOCALS64+$H+8],$a1 + ldx [%sp+LOCALS64+$H+16],$a2 + ldx [%sp+LOCALS64+$H+24],$a3 + + add %sp,LOCALS64+$in1_y,$bp + call __ecp_nistz256_sub_from_vis3 ! p256_sub(R, S2, in1_y); + add %sp,LOCALS64+$R,$rp + + call __ecp_nistz256_sqr_mont_vis3 ! p256_sqr_mont(Hsqr, H); + add %sp,LOCALS64+$Hsqr,$rp + + ldx [%sp+LOCALS64+$R],$a0 + ldx [%sp+LOCALS64+$R+8],$a1 + ldx [%sp+LOCALS64+$R+16],$a2 + ldx [%sp+LOCALS64+$R+24],$a3 + call __ecp_nistz256_sqr_mont_vis3 ! p256_sqr_mont(Rsqr, R); + add %sp,LOCALS64+$Rsqr,$rp + + ldx [%sp+LOCALS64+$H],$bi + ldx [%sp+LOCALS64+$Hsqr],$a0 + ldx [%sp+LOCALS64+$Hsqr+8],$a1 + ldx [%sp+LOCALS64+$Hsqr+16],$a2 + ldx [%sp+LOCALS64+$Hsqr+24],$a3 + add %sp,LOCALS64+$H,$bp + call __ecp_nistz256_mul_mont_vis3 ! p256_mul_mont(Hcub, Hsqr, H); + add %sp,LOCALS64+$Hcub,$rp + + ldx [%sp+LOCALS64+$Hsqr],$bi + ldx [%sp+LOCALS64+$in1_x],$a0 + ldx [%sp+LOCALS64+$in1_x+8],$a1 + ldx [%sp+LOCALS64+$in1_x+16],$a2 + ldx [%sp+LOCALS64+$in1_x+24],$a3 + add %sp,LOCALS64+$Hsqr,$bp + call __ecp_nistz256_mul_mont_vis3 ! p256_mul_mont(U2, in1_x, Hsqr); + add %sp,LOCALS64+$U2,$rp + + call __ecp_nistz256_mul_by_2_vis3 ! p256_mul_by_2(Hsqr, U2); + add %sp,LOCALS64+$Hsqr,$rp + + add %sp,LOCALS64+$Rsqr,$bp + call __ecp_nistz256_sub_morf_vis3 ! p256_sub(res_x, Rsqr, Hsqr); + add %sp,LOCALS64+$res_x,$rp + + add %sp,LOCALS64+$Hcub,$bp + call __ecp_nistz256_sub_from_vis3 ! p256_sub(res_x, res_x, Hcub); + add %sp,LOCALS64+$res_x,$rp + + ldx [%sp+LOCALS64+$Hcub],$bi ! 
forward load + ldx [%sp+LOCALS64+$in1_y],$a0 + ldx [%sp+LOCALS64+$in1_y+8],$a1 + ldx [%sp+LOCALS64+$in1_y+16],$a2 + ldx [%sp+LOCALS64+$in1_y+24],$a3 + + add %sp,LOCALS64+$U2,$bp + call __ecp_nistz256_sub_morf_vis3 ! p256_sub(res_y, U2, res_x); + add %sp,LOCALS64+$res_y,$rp + + add %sp,LOCALS64+$Hcub,$bp + call __ecp_nistz256_mul_mont_vis3 ! p256_mul_mont(S2, in1_y, Hcub); + add %sp,LOCALS64+$S2,$rp + + ldx [%sp+LOCALS64+$R],$bi + ldx [%sp+LOCALS64+$res_y],$a0 + ldx [%sp+LOCALS64+$res_y+8],$a1 + ldx [%sp+LOCALS64+$res_y+16],$a2 + ldx [%sp+LOCALS64+$res_y+24],$a3 + add %sp,LOCALS64+$R,$bp + call __ecp_nistz256_mul_mont_vis3 ! p256_mul_mont(res_y, res_y, R); + add %sp,LOCALS64+$res_y,$rp + + add %sp,LOCALS64+$S2,$bp + call __ecp_nistz256_sub_from_vis3 ! p256_sub(res_y, res_y, S2); + add %sp,LOCALS64+$res_y,$rp + + ldx [%fp+STACK_BIAS-16],$t1 ! !in1infty + ldx [%fp+STACK_BIAS-8],$t2 ! !in2infty +1: call .+8 + add %o7,.Lone_mont_vis3-1b,$bp +___ +for($i=0;$i<64;$i+=16) { # conditional moves +$code.=<<___; + ldx [%sp+LOCALS64+$res_x+$i],$acc0 ! res + ldx [%sp+LOCALS64+$res_x+$i+8],$acc1 + ldx [%sp+LOCALS64+$in2_x+$i],$acc2 ! in2 + ldx [%sp+LOCALS64+$in2_x+$i+8],$acc3 + ldx [%sp+LOCALS64+$in1_x+$i],$acc4 ! in1 + ldx [%sp+LOCALS64+$in1_x+$i+8],$acc5 + movrz $t1,$acc2,$acc0 + movrz $t1,$acc3,$acc1 + movrz $t2,$acc4,$acc0 + movrz $t2,$acc5,$acc1 + srlx $acc0,32,$acc2 + srlx $acc1,32,$acc3 + st $acc0,[$rp_real+$i] + st $acc2,[$rp_real+$i+4] + st $acc1,[$rp_real+$i+8] + st $acc3,[$rp_real+$i+12] +___ +} +for(;$i<96;$i+=16) { +$code.=<<___; + ldx [%sp+LOCALS64+$res_x+$i],$acc0 ! res + ldx [%sp+LOCALS64+$res_x+$i+8],$acc1 + ldx [$bp+$i-64],$acc2 ! "in2" + ldx [$bp+$i-64+8],$acc3 + ldx [%sp+LOCALS64+$in1_x+$i],$acc4 ! in1 + ldx [%sp+LOCALS64+$in1_x+$i+8],$acc5 + movrz $t1,$acc2,$acc0 + movrz $t1,$acc3,$acc1 + movrz $t2,$acc4,$acc0 + movrz $t2,$acc5,$acc1 + srlx $acc0,32,$acc2 + srlx $acc1,32,$acc3 + st $acc0,[$rp_real+$i] + st $acc2,[$rp_real+$i+4] + st $acc1,[$rp_real+$i+8] + st $acc3,[$rp_real+$i+12] +___ +} +$code.=<<___; + ret + restore +.type ecp_nistz256_point_add_affine_vis3,#function +.size ecp_nistz256_point_add_affine_vis3,.-ecp_nistz256_point_add_affine_vis3 +.align 64 +.Lone_mont_vis3: +.long 0x00000000,0x00000001, 0xffffffff,0x00000000 +.long 0xffffffff,0xffffffff, 0x00000000,0xfffffffe +.align 64 +___ +} }}} + +# Purpose of these subroutines is to explicitly encode VIS instructions, +# so that one can compile the module without having to specify VIS +# extensions on compiler command line, e.g. -xarch=v9 vs. -xarch=v9a. +# Idea is to reserve for option to produce "universal" binary and let +# programmer detect if current CPU is VIS capable at run-time. 
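The comment above explains why the VIS3 mnemonics (addxc, addxccc, umulxhi) are never emitted as mnemonics at all: the unvis3 helper that follows rewrites each of them into a raw .word, so the module assembles even with a non-VIS toolchain. For reference, the packing it performs can be sketched in C like this (a minimal illustration; the function name is made up, while the 0x81b00000 base and the opf values are the ones listed in unvis3 below):

    #include <stdint.h>
    #include <stdio.h>

    /* Illustrative only: pack a three-register SPARC VIS3 instruction into a
     * raw word the way unvis3() below does.  rd/rs1/rs2 are 0-31 register
     * numbers (%g0-7 -> 0-7, %o0-7 -> 8-15, %l0-7 -> 16-23, %i0-7 -> 24-31);
     * opf is the VIS3 opcode field, e.g. 0x011 for addxc, 0x016 for umulxhi. */
    static uint32_t vis3_word(unsigned rd, unsigned rs1, unsigned rs2, unsigned opf)
    {
        return 0x81b00000u | (rd << 25) | (rs1 << 14) | (opf << 5) | rs2;
    }

    int main(void)
    {
        /* addxc %o1,%o2,%o3  ->  rs1 = %o1 (9), rs2 = %o2 (10), rd = %o3 (11) */
        printf(".word\t0x%08x\n", (unsigned)vis3_word(11, 9, 10, 0x011));
        return 0;
    }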
+sub unvis3 { +my ($mnemonic,$rs1,$rs2,$rd)=@_; +my %bias = ( "g" => 0, "o" => 8, "l" => 16, "i" => 24 ); +my ($ref,$opf); +my %visopf = ( "addxc" => 0x011, + "addxccc" => 0x013, + "umulxhi" => 0x016 ); + + $ref = "$mnemonic\t$rs1,$rs2,$rd"; + + if ($opf=$visopf{$mnemonic}) { + foreach ($rs1,$rs2,$rd) { + return $ref if (!/%([goli])([0-9])/); + $_=$bias{$1}+$2; + } + + return sprintf ".word\t0x%08x !%s", + 0x81b00000|$rd<<25|$rs1<<14|$opf<<5|$rs2, + $ref; + } else { + return $ref; + } +} + +foreach (split("\n",$code)) { + s/\`([^\`]*)\`/eval $1/ge; + + s/\b(umulxhi|addxc[c]{0,2})\s+(%[goli][0-7]),\s*(%[goli][0-7]),\s*(%[goli][0-7])/ + &unvis3($1,$2,$3,$4) + /ge; + + print $_,"\n"; +} + +close STDOUT; diff --git a/crypto/ec/asm/ecp_nistz256-x86.pl b/crypto/ec/asm/ecp_nistz256-x86.pl new file mode 100755 index 000000000000..0c6fc665bf46 --- /dev/null +++ b/crypto/ec/asm/ecp_nistz256-x86.pl @@ -0,0 +1,1866 @@ +#! /usr/bin/env perl +# Copyright 2015-2018 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + + +# ==================================================================== +# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== +# +# ECP_NISTZ256 module for x86/SSE2. +# +# October 2014. +# +# Original ECP_NISTZ256 submission targeting x86_64 is detailed in +# http://eprint.iacr.org/2013/816. In the process of adaptation +# original .c module was made 32-bit savvy in order to make this +# implementation possible. +# +# with/without -DECP_NISTZ256_ASM +# Pentium +66-163% +# PIII +72-172% +# P4 +65-132% +# Core2 +90-215% +# Sandy Bridge +105-265% (contemporary i[57]-* are all close to this) +# Atom +65-155% +# Opteron +54-110% +# Bulldozer +99-240% +# VIA Nano +93-290% +# +# Ranges denote minimum and maximum improvement coefficients depending +# on benchmark. Lower coefficients are for ECDSA sign, server-side +# operation. Keep in mind that +200% means 3x improvement. + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +push(@INC,"${dir}","${dir}../../perlasm"); +require "x86asm.pl"; + +$output=pop; +open STDOUT,">$output"; + +&asm_init($ARGV[0],$ARGV[$#ARGV] eq "386"); + +$sse2=0; +for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); } + +&external_label("OPENSSL_ia32cap_P") if ($sse2); + + +######################################################################## +# Convert ecp_nistz256_table.c to layout expected by ecp_nistz_gather_w7 +# +open TABLE,"<ecp_nistz256_table.c" or +open TABLE,"<${dir}../ecp_nistz256_table.c" or +die "failed to open ecp_nistz256_table.c:",$!; + +use integer; + +foreach(<TABLE>) { + s/TOBN\(\s*(0x[0-9a-f]+),\s*(0x[0-9a-f]+)\s*\)/push @arr,hex($2),hex($1)/geo; +} +close TABLE; + +# See ecp_nistz256_table.c for explanation for why it's 64*16*37. +# 64*16*37-1 is because $#arr returns last valid index or @arr, not +# amount of elements. 
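The 64*16*37 figure is stated without derivation here; it follows from the table geometry the scatter/gather code below relies on: the 256-bit scalar is processed in 37 windows of 7 bits, each window indexes one of 64 precomputed points, and each affine point is 64 bytes, i.e. 16 32-bit words. A back-of-the-envelope check (a sketch restating constants from this file; see ecp_nistz256_table.c for the authoritative description):

    /* Sanity-check of the expected element count of @arr (a sketch only). */
    enum {
        P256_WINDOWS = (256 + 6) / 7,  /* 37 windows of 7 bits cover 256 bits */
        P256_ENTRIES = 1 << (7 - 1),   /* 64 table entries per window         */
        P256_WORDS   = 64 / 4,         /* 16 32-bit words per affine point    */
        P256_TOTAL   = P256_WINDOWS * P256_ENTRIES * P256_WORDS  /* 64*16*37  */
    };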
+die "insane number of elements" if ($#arr != 64*16*37-1); + +&public_label("ecp_nistz256_precomputed"); +&align(4096); +&set_label("ecp_nistz256_precomputed"); + +######################################################################## +# this conversion smashes P256_POINT_AFFINE by individual bytes with +# 64 byte interval, similar to +# 1111222233334444 +# 1234123412341234 +for(1..37) { + @tbl = splice(@arr,0,64*16); + for($i=0;$i<64;$i++) { + undef @line; + for($j=0;$j<64;$j++) { + push @line,(@tbl[$j*16+$i/4]>>(($i%4)*8))&0xff; + } + &data_byte(join(',',map { sprintf "0x%02x",$_} @line)); + } +} + +######################################################################## +# Keep in mind that constants are stored least to most significant word +&static_label("RR"); +&set_label("RR",64); +&data_word(3,0,-1,-5,-2,-1,-3,4); # 2^512 mod P-256 + +&static_label("ONE_mont"); +&set_label("ONE_mont"); +&data_word(1,0,0,-1,-1,-1,-2,0); + +&static_label("ONE"); +&set_label("ONE"); +&data_word(1,0,0,0,0,0,0,0); +&asciz("ECP_NISZ256 for x86/SSE2, CRYPTOGAMS by <appro\@openssl.org>"); +&align(64); + +######################################################################## +# void ecp_nistz256_mul_by_2(BN_ULONG edi[8],const BN_ULONG esi[8]); +&function_begin("ecp_nistz256_mul_by_2"); + &mov ("esi",&wparam(1)); + &mov ("edi",&wparam(0)); + &mov ("ebp","esi"); +######################################################################## +# common pattern for internal functions is that %edi is result pointer, +# %esi and %ebp are input ones, %ebp being optional. %edi is preserved. + &call ("_ecp_nistz256_add"); +&function_end("ecp_nistz256_mul_by_2"); + +######################################################################## +# void ecp_nistz256_mul_by_3(BN_ULONG edi[8],const BN_ULONG esi[8]); +&function_begin("ecp_nistz256_mul_by_3"); + &mov ("esi",&wparam(1)); + # multiplication by 3 is performed + # as 2*n+n, but we can't use output + # to store 2*n, because if output + # pointer equals to input, then + # we'll get 2*n+2*n. + &stack_push(8); # therefore we need to allocate + # 256-bit intermediate buffer. + &mov ("edi","esp"); + &mov ("ebp","esi"); + &call ("_ecp_nistz256_add"); + &lea ("esi",&DWP(0,"edi")); + &mov ("ebp",&wparam(1)); + &mov ("edi",&wparam(0)); + &call ("_ecp_nistz256_add"); + &stack_pop(8); +&function_end("ecp_nistz256_mul_by_3"); + +######################################################################## +# void ecp_nistz256_div_by_2(BN_ULONG edi[8],const BN_ULONG esi[8]); +&function_begin("ecp_nistz256_div_by_2"); + &mov ("esi",&wparam(1)); + &mov ("edi",&wparam(0)); + &call ("_ecp_nistz256_div_by_2"); +&function_end("ecp_nistz256_div_by_2"); + +&function_begin_B("_ecp_nistz256_div_by_2"); + # tmp = a is odd ? a+mod : a + # + # note that because mod has special form, i.e. consists of + # 0xffffffff, 1 and 0s, we can conditionally synthesize it by + # assigning least significant bit of input to one register, + # %ebp, and its negative to another, %edx. 
+ + &mov ("ebp",&DWP(0,"esi")); + &xor ("edx","edx"); + &mov ("ebx",&DWP(4,"esi")); + &mov ("eax","ebp"); + &and ("ebp",1); + &mov ("ecx",&DWP(8,"esi")); + &sub ("edx","ebp"); + + &add ("eax","edx"); + &adc ("ebx","edx"); + &mov (&DWP(0,"edi"),"eax"); + &adc ("ecx","edx"); + &mov (&DWP(4,"edi"),"ebx"); + &mov (&DWP(8,"edi"),"ecx"); + + &mov ("eax",&DWP(12,"esi")); + &mov ("ebx",&DWP(16,"esi")); + &adc ("eax",0); + &mov ("ecx",&DWP(20,"esi")); + &adc ("ebx",0); + &mov (&DWP(12,"edi"),"eax"); + &adc ("ecx",0); + &mov (&DWP(16,"edi"),"ebx"); + &mov (&DWP(20,"edi"),"ecx"); + + &mov ("eax",&DWP(24,"esi")); + &mov ("ebx",&DWP(28,"esi")); + &adc ("eax","ebp"); + &adc ("ebx","edx"); + &mov (&DWP(24,"edi"),"eax"); + &sbb ("esi","esi"); # broadcast carry bit + &mov (&DWP(28,"edi"),"ebx"); + + # ret = tmp >> 1 + + &mov ("eax",&DWP(0,"edi")); + &mov ("ebx",&DWP(4,"edi")); + &mov ("ecx",&DWP(8,"edi")); + &mov ("edx",&DWP(12,"edi")); + + &shr ("eax",1); + &mov ("ebp","ebx"); + &shl ("ebx",31); + &or ("eax","ebx"); + + &shr ("ebp",1); + &mov ("ebx","ecx"); + &shl ("ecx",31); + &mov (&DWP(0,"edi"),"eax"); + &or ("ebp","ecx"); + &mov ("eax",&DWP(16,"edi")); + + &shr ("ebx",1); + &mov ("ecx","edx"); + &shl ("edx",31); + &mov (&DWP(4,"edi"),"ebp"); + &or ("ebx","edx"); + &mov ("ebp",&DWP(20,"edi")); + + &shr ("ecx",1); + &mov ("edx","eax"); + &shl ("eax",31); + &mov (&DWP(8,"edi"),"ebx"); + &or ("ecx","eax"); + &mov ("ebx",&DWP(24,"edi")); + + &shr ("edx",1); + &mov ("eax","ebp"); + &shl ("ebp",31); + &mov (&DWP(12,"edi"),"ecx"); + &or ("edx","ebp"); + &mov ("ecx",&DWP(28,"edi")); + + &shr ("eax",1); + &mov ("ebp","ebx"); + &shl ("ebx",31); + &mov (&DWP(16,"edi"),"edx"); + &or ("eax","ebx"); + + &shr ("ebp",1); + &mov ("ebx","ecx"); + &shl ("ecx",31); + &mov (&DWP(20,"edi"),"eax"); + &or ("ebp","ecx"); + + &shr ("ebx",1); + &shl ("esi",31); + &mov (&DWP(24,"edi"),"ebp"); + &or ("ebx","esi"); # handle top-most carry bit + &mov (&DWP(28,"edi"),"ebx"); + + &ret (); +&function_end_B("_ecp_nistz256_div_by_2"); + +######################################################################## +# void ecp_nistz256_add(BN_ULONG edi[8],const BN_ULONG esi[8], +# const BN_ULONG ebp[8]); +&function_begin("ecp_nistz256_add"); + &mov ("esi",&wparam(1)); + &mov ("ebp",&wparam(2)); + &mov ("edi",&wparam(0)); + &call ("_ecp_nistz256_add"); +&function_end("ecp_nistz256_add"); + +&function_begin_B("_ecp_nistz256_add"); + &mov ("eax",&DWP(0,"esi")); + &mov ("ebx",&DWP(4,"esi")); + &mov ("ecx",&DWP(8,"esi")); + &add ("eax",&DWP(0,"ebp")); + &mov ("edx",&DWP(12,"esi")); + &adc ("ebx",&DWP(4,"ebp")); + &mov (&DWP(0,"edi"),"eax"); + &adc ("ecx",&DWP(8,"ebp")); + &mov (&DWP(4,"edi"),"ebx"); + &adc ("edx",&DWP(12,"ebp")); + &mov (&DWP(8,"edi"),"ecx"); + &mov (&DWP(12,"edi"),"edx"); + + &mov ("eax",&DWP(16,"esi")); + &mov ("ebx",&DWP(20,"esi")); + &mov ("ecx",&DWP(24,"esi")); + &adc ("eax",&DWP(16,"ebp")); + &mov ("edx",&DWP(28,"esi")); + &adc ("ebx",&DWP(20,"ebp")); + &mov (&DWP(16,"edi"),"eax"); + &adc ("ecx",&DWP(24,"ebp")); + &mov (&DWP(20,"edi"),"ebx"); + &mov ("esi",0); + &adc ("edx",&DWP(28,"ebp")); + &mov (&DWP(24,"edi"),"ecx"); + &adc ("esi",0); + &mov (&DWP(28,"edi"),"edx"); + + # if a+b >= modulus, subtract modulus. + # + # But since comparison implies subtraction, we subtract modulus + # to see if it borrows, and then subtract it for real if + # subtraction didn't borrow. 
+ + &mov ("eax",&DWP(0,"edi")); + &mov ("ebx",&DWP(4,"edi")); + &mov ("ecx",&DWP(8,"edi")); + &sub ("eax",-1); + &mov ("edx",&DWP(12,"edi")); + &sbb ("ebx",-1); + &mov ("eax",&DWP(16,"edi")); + &sbb ("ecx",-1); + &mov ("ebx",&DWP(20,"edi")); + &sbb ("edx",0); + &mov ("ecx",&DWP(24,"edi")); + &sbb ("eax",0); + &mov ("edx",&DWP(28,"edi")); + &sbb ("ebx",0); + &sbb ("ecx",1); + &sbb ("edx",-1); + &sbb ("esi",0); + + # Note that because mod has special form, i.e. consists of + # 0xffffffff, 1 and 0s, we can conditionally synthesize it by + # by using borrow. + + ¬ ("esi"); + &mov ("eax",&DWP(0,"edi")); + &mov ("ebp","esi"); + &mov ("ebx",&DWP(4,"edi")); + &shr ("ebp",31); + &mov ("ecx",&DWP(8,"edi")); + &sub ("eax","esi"); + &mov ("edx",&DWP(12,"edi")); + &sbb ("ebx","esi"); + &mov (&DWP(0,"edi"),"eax"); + &sbb ("ecx","esi"); + &mov (&DWP(4,"edi"),"ebx"); + &sbb ("edx",0); + &mov (&DWP(8,"edi"),"ecx"); + &mov (&DWP(12,"edi"),"edx"); + + &mov ("eax",&DWP(16,"edi")); + &mov ("ebx",&DWP(20,"edi")); + &mov ("ecx",&DWP(24,"edi")); + &sbb ("eax",0); + &mov ("edx",&DWP(28,"edi")); + &sbb ("ebx",0); + &mov (&DWP(16,"edi"),"eax"); + &sbb ("ecx","ebp"); + &mov (&DWP(20,"edi"),"ebx"); + &sbb ("edx","esi"); + &mov (&DWP(24,"edi"),"ecx"); + &mov (&DWP(28,"edi"),"edx"); + + &ret (); +&function_end_B("_ecp_nistz256_add"); + +######################################################################## +# void ecp_nistz256_sub(BN_ULONG edi[8],const BN_ULONG esi[8], +# const BN_ULONG ebp[8]); +&function_begin("ecp_nistz256_sub"); + &mov ("esi",&wparam(1)); + &mov ("ebp",&wparam(2)); + &mov ("edi",&wparam(0)); + &call ("_ecp_nistz256_sub"); +&function_end("ecp_nistz256_sub"); + +&function_begin_B("_ecp_nistz256_sub"); + &mov ("eax",&DWP(0,"esi")); + &mov ("ebx",&DWP(4,"esi")); + &mov ("ecx",&DWP(8,"esi")); + &sub ("eax",&DWP(0,"ebp")); + &mov ("edx",&DWP(12,"esi")); + &sbb ("ebx",&DWP(4,"ebp")); + &mov (&DWP(0,"edi"),"eax"); + &sbb ("ecx",&DWP(8,"ebp")); + &mov (&DWP(4,"edi"),"ebx"); + &sbb ("edx",&DWP(12,"ebp")); + &mov (&DWP(8,"edi"),"ecx"); + &mov (&DWP(12,"edi"),"edx"); + + &mov ("eax",&DWP(16,"esi")); + &mov ("ebx",&DWP(20,"esi")); + &mov ("ecx",&DWP(24,"esi")); + &sbb ("eax",&DWP(16,"ebp")); + &mov ("edx",&DWP(28,"esi")); + &sbb ("ebx",&DWP(20,"ebp")); + &sbb ("ecx",&DWP(24,"ebp")); + &mov (&DWP(16,"edi"),"eax"); + &sbb ("edx",&DWP(28,"ebp")); + &mov (&DWP(20,"edi"),"ebx"); + &sbb ("esi","esi"); # broadcast borrow bit + &mov (&DWP(24,"edi"),"ecx"); + &mov (&DWP(28,"edi"),"edx"); + + # if a-b borrows, add modulus. + # + # Note that because mod has special form, i.e. consists of + # 0xffffffff, 1 and 0s, we can conditionally synthesize it by + # assigning borrow bit to one register, %ebp, and its negative + # to another, %esi. But we started by calculating %esi... 
+ + &mov ("eax",&DWP(0,"edi")); + &mov ("ebp","esi"); + &mov ("ebx",&DWP(4,"edi")); + &shr ("ebp",31); + &mov ("ecx",&DWP(8,"edi")); + &add ("eax","esi"); + &mov ("edx",&DWP(12,"edi")); + &adc ("ebx","esi"); + &mov (&DWP(0,"edi"),"eax"); + &adc ("ecx","esi"); + &mov (&DWP(4,"edi"),"ebx"); + &adc ("edx",0); + &mov (&DWP(8,"edi"),"ecx"); + &mov (&DWP(12,"edi"),"edx"); + + &mov ("eax",&DWP(16,"edi")); + &mov ("ebx",&DWP(20,"edi")); + &mov ("ecx",&DWP(24,"edi")); + &adc ("eax",0); + &mov ("edx",&DWP(28,"edi")); + &adc ("ebx",0); + &mov (&DWP(16,"edi"),"eax"); + &adc ("ecx","ebp"); + &mov (&DWP(20,"edi"),"ebx"); + &adc ("edx","esi"); + &mov (&DWP(24,"edi"),"ecx"); + &mov (&DWP(28,"edi"),"edx"); + + &ret (); +&function_end_B("_ecp_nistz256_sub"); + +######################################################################## +# void ecp_nistz256_neg(BN_ULONG edi[8],const BN_ULONG esi[8]); +&function_begin("ecp_nistz256_neg"); + &mov ("ebp",&wparam(1)); + &mov ("edi",&wparam(0)); + + &xor ("eax","eax"); + &stack_push(8); + &mov (&DWP(0,"esp"),"eax"); + &mov ("esi","esp"); + &mov (&DWP(4,"esp"),"eax"); + &mov (&DWP(8,"esp"),"eax"); + &mov (&DWP(12,"esp"),"eax"); + &mov (&DWP(16,"esp"),"eax"); + &mov (&DWP(20,"esp"),"eax"); + &mov (&DWP(24,"esp"),"eax"); + &mov (&DWP(28,"esp"),"eax"); + + &call ("_ecp_nistz256_sub"); + + &stack_pop(8); +&function_end("ecp_nistz256_neg"); + +&function_begin_B("_picup_eax"); + &mov ("eax",&DWP(0,"esp")); + &ret (); +&function_end_B("_picup_eax"); + +######################################################################## +# void ecp_nistz256_to_mont(BN_ULONG edi[8],const BN_ULONG esi[8]); +&function_begin("ecp_nistz256_to_mont"); + &mov ("esi",&wparam(1)); + &call ("_picup_eax"); + &set_label("pic"); + &lea ("ebp",&DWP(&label("RR")."-".&label("pic"),"eax")); + if ($sse2) { + &picmeup("eax","OPENSSL_ia32cap_P","eax",&label("pic")); + &mov ("eax",&DWP(0,"eax")); } + &mov ("edi",&wparam(0)); + &call ("_ecp_nistz256_mul_mont"); +&function_end("ecp_nistz256_to_mont"); + +######################################################################## +# void ecp_nistz256_from_mont(BN_ULONG edi[8],const BN_ULONG esi[8]); +&function_begin("ecp_nistz256_from_mont"); + &mov ("esi",&wparam(1)); + &call ("_picup_eax"); + &set_label("pic"); + &lea ("ebp",&DWP(&label("ONE")."-".&label("pic"),"eax")); + if ($sse2) { + &picmeup("eax","OPENSSL_ia32cap_P","eax",&label("pic")); + &mov ("eax",&DWP(0,"eax")); } + &mov ("edi",&wparam(0)); + &call ("_ecp_nistz256_mul_mont"); +&function_end("ecp_nistz256_from_mont"); + +######################################################################## +# void ecp_nistz256_mul_mont(BN_ULONG edi[8],const BN_ULONG esi[8], +# const BN_ULONG ebp[8]); +&function_begin("ecp_nistz256_mul_mont"); + &mov ("esi",&wparam(1)); + &mov ("ebp",&wparam(2)); + if ($sse2) { + &call ("_picup_eax"); + &set_label("pic"); + &picmeup("eax","OPENSSL_ia32cap_P","eax",&label("pic")); + &mov ("eax",&DWP(0,"eax")); } + &mov ("edi",&wparam(0)); + &call ("_ecp_nistz256_mul_mont"); +&function_end("ecp_nistz256_mul_mont"); + +######################################################################## +# void ecp_nistz256_sqr_mont(BN_ULONG edi[8],const BN_ULONG esi[8]); +&function_begin("ecp_nistz256_sqr_mont"); + &mov ("esi",&wparam(1)); + if ($sse2) { + &call ("_picup_eax"); + &set_label("pic"); + &picmeup("eax","OPENSSL_ia32cap_P","eax",&label("pic")); + &mov ("eax",&DWP(0,"eax")); } + &mov ("edi",&wparam(0)); + &mov ("ebp","esi"); + &call ("_ecp_nistz256_mul_mont"); 
+&function_end("ecp_nistz256_sqr_mont"); + +&function_begin_B("_ecp_nistz256_mul_mont"); + if ($sse2) { + &and ("eax",1<<24|1<<26); + &cmp ("eax",1<<24|1<<26); # see if XMM+SSE2 is on + &jne (&label("mul_mont_ialu")); + + ######################################## + # SSE2 code path featuring 32x16-bit + # multiplications is ~2x faster than + # IALU counterpart (except on Atom)... + ######################################## + # stack layout: + # +------------------------------------+< %esp + # | 7 16-byte temporary XMM words, | + # | "sliding" toward lower address | + # . . + # +------------------------------------+ + # | unused XMM word | + # +------------------------------------+< +128,%ebx + # | 8 16-byte XMM words holding copies | + # | of a[i]<<64|a[i] | + # . . + # . . + # +------------------------------------+< +256 + &mov ("edx","esp"); + &sub ("esp",0x100); + + &movd ("xmm7",&DWP(0,"ebp")); # b[0] -> 0000.00xy + &lea ("ebp",&DWP(4,"ebp")); + &pcmpeqd("xmm6","xmm6"); + &psrlq ("xmm6",48); # compose 0xffff<<64|0xffff + + &pshuflw("xmm7","xmm7",0b11011100); # 0000.00xy -> 0000.0x0y + &and ("esp",-64); + &pshufd ("xmm7","xmm7",0b11011100); # 0000.0x0y -> 000x.000y + &lea ("ebx",&DWP(0x80,"esp")); + + &movd ("xmm0",&DWP(4*0,"esi")); # a[0] -> 0000.00xy + &pshufd ("xmm0","xmm0",0b11001100); # 0000.00xy -> 00xy.00xy + &movd ("xmm1",&DWP(4*1,"esi")); # a[1] -> ... + &movdqa (&QWP(0x00,"ebx"),"xmm0"); # offload converted a[0] + &pmuludq("xmm0","xmm7"); # a[0]*b[0] + + &movd ("xmm2",&DWP(4*2,"esi")); + &pshufd ("xmm1","xmm1",0b11001100); + &movdqa (&QWP(0x10,"ebx"),"xmm1"); + &pmuludq("xmm1","xmm7"); # a[1]*b[0] + + &movq ("xmm4","xmm0"); # clear upper 64 bits + &pslldq("xmm4",6); + &paddq ("xmm4","xmm0"); + &movdqa("xmm5","xmm4"); + &psrldq("xmm4",10); # upper 32 bits of a[0]*b[0] + &pand ("xmm5","xmm6"); # lower 32 bits of a[0]*b[0] + + # Upper half of a[0]*b[i] is carried into next multiplication + # iteration, while lower one "participates" in actual reduction. + # Normally latter is done by accumulating result of multiplication + # of modulus by "magic" digit, but thanks to special form of modulus + # and "magic" digit it can be performed only with additions and + # subtractions (see note in IALU section below). Note that we are + # not bothered with carry bits, they are accumulated in "flatten" + # phase after all multiplications and reductions. 
+ + &movd ("xmm3",&DWP(4*3,"esi")); + &pshufd ("xmm2","xmm2",0b11001100); + &movdqa (&QWP(0x20,"ebx"),"xmm2"); + &pmuludq("xmm2","xmm7"); # a[2]*b[0] + &paddq ("xmm1","xmm4"); # a[1]*b[0]+hw(a[0]*b[0]), carry + &movdqa (&QWP(0x00,"esp"),"xmm1"); # t[0] + + &movd ("xmm0",&DWP(4*4,"esi")); + &pshufd ("xmm3","xmm3",0b11001100); + &movdqa (&QWP(0x30,"ebx"),"xmm3"); + &pmuludq("xmm3","xmm7"); # a[3]*b[0] + &movdqa (&QWP(0x10,"esp"),"xmm2"); + + &movd ("xmm1",&DWP(4*5,"esi")); + &pshufd ("xmm0","xmm0",0b11001100); + &movdqa (&QWP(0x40,"ebx"),"xmm0"); + &pmuludq("xmm0","xmm7"); # a[4]*b[0] + &paddq ("xmm3","xmm5"); # a[3]*b[0]+lw(a[0]*b[0]), reduction step + &movdqa (&QWP(0x20,"esp"),"xmm3"); + + &movd ("xmm2",&DWP(4*6,"esi")); + &pshufd ("xmm1","xmm1",0b11001100); + &movdqa (&QWP(0x50,"ebx"),"xmm1"); + &pmuludq("xmm1","xmm7"); # a[5]*b[0] + &movdqa (&QWP(0x30,"esp"),"xmm0"); + &pshufd("xmm4","xmm5",0b10110001); # xmm4 = xmm5<<32, reduction step + + &movd ("xmm3",&DWP(4*7,"esi")); + &pshufd ("xmm2","xmm2",0b11001100); + &movdqa (&QWP(0x60,"ebx"),"xmm2"); + &pmuludq("xmm2","xmm7"); # a[6]*b[0] + &movdqa (&QWP(0x40,"esp"),"xmm1"); + &psubq ("xmm4","xmm5"); # xmm4 = xmm5*0xffffffff, reduction step + + &movd ("xmm0",&DWP(0,"ebp")); # b[1] -> 0000.00xy + &pshufd ("xmm3","xmm3",0b11001100); + &movdqa (&QWP(0x70,"ebx"),"xmm3"); + &pmuludq("xmm3","xmm7"); # a[7]*b[0] + + &pshuflw("xmm7","xmm0",0b11011100); # 0000.00xy -> 0000.0x0y + &movdqa ("xmm0",&QWP(0x00,"ebx")); # pre-load converted a[0] + &pshufd ("xmm7","xmm7",0b11011100); # 0000.0x0y -> 000x.000y + + &mov ("ecx",6); + &lea ("ebp",&DWP(4,"ebp")); + &jmp (&label("madd_sse2")); + +&set_label("madd_sse2",16); + &paddq ("xmm2","xmm5"); # a[6]*b[i-1]+lw(a[0]*b[i-1]), reduction step [modulo-scheduled] + &paddq ("xmm3","xmm4"); # a[7]*b[i-1]+lw(a[0]*b[i-1])*0xffffffff, reduction step [modulo-scheduled] + &movdqa ("xmm1",&QWP(0x10,"ebx")); + &pmuludq("xmm0","xmm7"); # a[0]*b[i] + &movdqa(&QWP(0x50,"esp"),"xmm2"); + + &movdqa ("xmm2",&QWP(0x20,"ebx")); + &pmuludq("xmm1","xmm7"); # a[1]*b[i] + &movdqa(&QWP(0x60,"esp"),"xmm3"); + &paddq ("xmm0",&QWP(0x00,"esp")); + + &movdqa ("xmm3",&QWP(0x30,"ebx")); + &pmuludq("xmm2","xmm7"); # a[2]*b[i] + &movq ("xmm4","xmm0"); # clear upper 64 bits + &pslldq("xmm4",6); + &paddq ("xmm1",&QWP(0x10,"esp")); + &paddq ("xmm4","xmm0"); + &movdqa("xmm5","xmm4"); + &psrldq("xmm4",10); # upper 33 bits of a[0]*b[i]+t[0] + + &movdqa ("xmm0",&QWP(0x40,"ebx")); + &pmuludq("xmm3","xmm7"); # a[3]*b[i] + &paddq ("xmm1","xmm4"); # a[1]*b[i]+hw(a[0]*b[i]), carry + &paddq ("xmm2",&QWP(0x20,"esp")); + &movdqa (&QWP(0x00,"esp"),"xmm1"); + + &movdqa ("xmm1",&QWP(0x50,"ebx")); + &pmuludq("xmm0","xmm7"); # a[4]*b[i] + &paddq ("xmm3",&QWP(0x30,"esp")); + &movdqa (&QWP(0x10,"esp"),"xmm2"); + &pand ("xmm5","xmm6"); # lower 32 bits of a[0]*b[i] + + &movdqa ("xmm2",&QWP(0x60,"ebx")); + &pmuludq("xmm1","xmm7"); # a[5]*b[i] + &paddq ("xmm3","xmm5"); # a[3]*b[i]+lw(a[0]*b[i]), reduction step + &paddq ("xmm0",&QWP(0x40,"esp")); + &movdqa (&QWP(0x20,"esp"),"xmm3"); + &pshufd("xmm4","xmm5",0b10110001); # xmm4 = xmm5<<32, reduction step + + &movdqa ("xmm3","xmm7"); + &pmuludq("xmm2","xmm7"); # a[6]*b[i] + &movd ("xmm7",&DWP(0,"ebp")); # b[i++] -> 0000.00xy + &lea ("ebp",&DWP(4,"ebp")); + &paddq ("xmm1",&QWP(0x50,"esp")); + &psubq ("xmm4","xmm5"); # xmm4 = xmm5*0xffffffff, reduction step + &movdqa (&QWP(0x30,"esp"),"xmm0"); + &pshuflw("xmm7","xmm7",0b11011100); # 0000.00xy -> 0000.0x0y + + &pmuludq("xmm3",&QWP(0x70,"ebx")); # a[7]*b[i] + 
&pshufd("xmm7","xmm7",0b11011100); # 0000.0x0y -> 000x.000y + &movdqa("xmm0",&QWP(0x00,"ebx")); # pre-load converted a[0] + &movdqa (&QWP(0x40,"esp"),"xmm1"); + &paddq ("xmm2",&QWP(0x60,"esp")); + + &dec ("ecx"); + &jnz (&label("madd_sse2")); + + &paddq ("xmm2","xmm5"); # a[6]*b[6]+lw(a[0]*b[6]), reduction step [modulo-scheduled] + &paddq ("xmm3","xmm4"); # a[7]*b[6]+lw(a[0]*b[6])*0xffffffff, reduction step [modulo-scheduled] + &movdqa ("xmm1",&QWP(0x10,"ebx")); + &pmuludq("xmm0","xmm7"); # a[0]*b[7] + &movdqa(&QWP(0x50,"esp"),"xmm2"); + + &movdqa ("xmm2",&QWP(0x20,"ebx")); + &pmuludq("xmm1","xmm7"); # a[1]*b[7] + &movdqa(&QWP(0x60,"esp"),"xmm3"); + &paddq ("xmm0",&QWP(0x00,"esp")); + + &movdqa ("xmm3",&QWP(0x30,"ebx")); + &pmuludq("xmm2","xmm7"); # a[2]*b[7] + &movq ("xmm4","xmm0"); # clear upper 64 bits + &pslldq("xmm4",6); + &paddq ("xmm1",&QWP(0x10,"esp")); + &paddq ("xmm4","xmm0"); + &movdqa("xmm5","xmm4"); + &psrldq("xmm4",10); # upper 33 bits of a[0]*b[i]+t[0] + + &movdqa ("xmm0",&QWP(0x40,"ebx")); + &pmuludq("xmm3","xmm7"); # a[3]*b[7] + &paddq ("xmm1","xmm4"); # a[1]*b[7]+hw(a[0]*b[7]), carry + &paddq ("xmm2",&QWP(0x20,"esp")); + &movdqa (&QWP(0x00,"esp"),"xmm1"); + + &movdqa ("xmm1",&QWP(0x50,"ebx")); + &pmuludq("xmm0","xmm7"); # a[4]*b[7] + &paddq ("xmm3",&QWP(0x30,"esp")); + &movdqa (&QWP(0x10,"esp"),"xmm2"); + &pand ("xmm5","xmm6"); # lower 32 bits of a[0]*b[i] + + &movdqa ("xmm2",&QWP(0x60,"ebx")); + &pmuludq("xmm1","xmm7"); # a[5]*b[7] + &paddq ("xmm3","xmm5"); # reduction step + &paddq ("xmm0",&QWP(0x40,"esp")); + &movdqa (&QWP(0x20,"esp"),"xmm3"); + &pshufd("xmm4","xmm5",0b10110001); # xmm4 = xmm5<<32, reduction step + + &movdqa ("xmm3",&QWP(0x70,"ebx")); + &pmuludq("xmm2","xmm7"); # a[6]*b[7] + &paddq ("xmm1",&QWP(0x50,"esp")); + &psubq ("xmm4","xmm5"); # xmm4 = xmm5*0xffffffff, reduction step + &movdqa (&QWP(0x30,"esp"),"xmm0"); + + &pmuludq("xmm3","xmm7"); # a[7]*b[7] + &pcmpeqd("xmm7","xmm7"); + &movdqa ("xmm0",&QWP(0x00,"esp")); + &pslldq ("xmm7",8); + &movdqa (&QWP(0x40,"esp"),"xmm1"); + &paddq ("xmm2",&QWP(0x60,"esp")); + + &paddq ("xmm2","xmm5"); # a[6]*b[7]+lw(a[0]*b[7]), reduction step + &paddq ("xmm3","xmm4"); # a[6]*b[7]+lw(a[0]*b[7])*0xffffffff, reduction step + &movdqa(&QWP(0x50,"esp"),"xmm2"); + &movdqa(&QWP(0x60,"esp"),"xmm3"); + + &movdqa ("xmm1",&QWP(0x10,"esp")); + &movdqa ("xmm2",&QWP(0x20,"esp")); + &movdqa ("xmm3",&QWP(0x30,"esp")); + + &movq ("xmm4","xmm0"); # "flatten" + &pand ("xmm0","xmm7"); + &xor ("ebp","ebp"); + &pslldq ("xmm4",6); + &movq ("xmm5","xmm1"); + &paddq ("xmm0","xmm4"); + &pand ("xmm1","xmm7"); + &psrldq ("xmm0",6); + &movd ("eax","xmm0"); + &psrldq ("xmm0",4); + + &paddq ("xmm5","xmm0"); + &movdqa ("xmm0",&QWP(0x40,"esp")); + &sub ("eax",-1); # start subtracting modulus, + # this is used to determine + # if result is larger/smaller + # than modulus (see below) + &pslldq ("xmm5",6); + &movq ("xmm4","xmm2"); + &paddq ("xmm1","xmm5"); + &pand ("xmm2","xmm7"); + &psrldq ("xmm1",6); + &mov (&DWP(4*0,"edi"),"eax"); + &movd ("eax","xmm1"); + &psrldq ("xmm1",4); + + &paddq ("xmm4","xmm1"); + &movdqa ("xmm1",&QWP(0x50,"esp")); + &sbb ("eax",-1); + &pslldq ("xmm4",6); + &movq ("xmm5","xmm3"); + &paddq ("xmm2","xmm4"); + &pand ("xmm3","xmm7"); + &psrldq ("xmm2",6); + &mov (&DWP(4*1,"edi"),"eax"); + &movd ("eax","xmm2"); + &psrldq ("xmm2",4); + + &paddq ("xmm5","xmm2"); + &movdqa ("xmm2",&QWP(0x60,"esp")); + &sbb ("eax",-1); + &pslldq ("xmm5",6); + &movq ("xmm4","xmm0"); + &paddq ("xmm3","xmm5"); + &pand ("xmm0","xmm7"); + &psrldq ("xmm3",6); 
+ &mov (&DWP(4*2,"edi"),"eax"); + &movd ("eax","xmm3"); + &psrldq ("xmm3",4); + + &paddq ("xmm4","xmm3"); + &sbb ("eax",0); + &pslldq ("xmm4",6); + &movq ("xmm5","xmm1"); + &paddq ("xmm0","xmm4"); + &pand ("xmm1","xmm7"); + &psrldq ("xmm0",6); + &mov (&DWP(4*3,"edi"),"eax"); + &movd ("eax","xmm0"); + &psrldq ("xmm0",4); + + &paddq ("xmm5","xmm0"); + &sbb ("eax",0); + &pslldq ("xmm5",6); + &movq ("xmm4","xmm2"); + &paddq ("xmm1","xmm5"); + &pand ("xmm2","xmm7"); + &psrldq ("xmm1",6); + &movd ("ebx","xmm1"); + &psrldq ("xmm1",4); + &mov ("esp","edx"); + + &paddq ("xmm4","xmm1"); + &pslldq ("xmm4",6); + &paddq ("xmm2","xmm4"); + &psrldq ("xmm2",6); + &movd ("ecx","xmm2"); + &psrldq ("xmm2",4); + &sbb ("ebx",0); + &movd ("edx","xmm2"); + &pextrw ("esi","xmm2",2); # top-most overflow bit + &sbb ("ecx",1); + &sbb ("edx",-1); + &sbb ("esi",0); # borrow from subtraction + + # Final step is "if result > mod, subtract mod", and at this point + # we have result - mod written to output buffer, as well as borrow + # bit from this subtraction, and if borrow bit is set, we add + # modulus back. + # + # Note that because mod has special form, i.e. consists of + # 0xffffffff, 1 and 0s, we can conditionally synthesize it by + # assigning borrow bit to one register, %ebp, and its negative + # to another, %esi. But we started by calculating %esi... + + &sub ("ebp","esi"); + &add (&DWP(4*0,"edi"),"esi"); # add modulus or zero + &adc (&DWP(4*1,"edi"),"esi"); + &adc (&DWP(4*2,"edi"),"esi"); + &adc (&DWP(4*3,"edi"),0); + &adc ("eax",0); + &adc ("ebx",0); + &mov (&DWP(4*4,"edi"),"eax"); + &adc ("ecx","ebp"); + &mov (&DWP(4*5,"edi"),"ebx"); + &adc ("edx","esi"); + &mov (&DWP(4*6,"edi"),"ecx"); + &mov (&DWP(4*7,"edi"),"edx"); + + &ret (); + +&set_label("mul_mont_ialu",16); } + + ######################################## + # IALU code path suitable for all CPUs. + ######################################## + # stack layout: + # +------------------------------------+< %esp + # | 8 32-bit temporary words, accessed | + # | as circular buffer | + # . . + # . . 
+ # +------------------------------------+< +32 + # | offloaded destination pointer | + # +------------------------------------+ + # | unused | + # +------------------------------------+< +40 + &sub ("esp",10*4); + + &mov ("eax",&DWP(0*4,"esi")); # a[0] + &mov ("ebx",&DWP(0*4,"ebp")); # b[0] + &mov (&DWP(8*4,"esp"),"edi"); # off-load dst ptr + + &mul ("ebx"); # a[0]*b[0] + &mov (&DWP(0*4,"esp"),"eax"); # t[0] + &mov ("eax",&DWP(1*4,"esi")); + &mov ("ecx","edx") + + &mul ("ebx"); # a[1]*b[0] + &add ("ecx","eax"); + &mov ("eax",&DWP(2*4,"esi")); + &adc ("edx",0); + &mov (&DWP(1*4,"esp"),"ecx"); # t[1] + &mov ("ecx","edx"); + + &mul ("ebx"); # a[2]*b[0] + &add ("ecx","eax"); + &mov ("eax",&DWP(3*4,"esi")); + &adc ("edx",0); + &mov (&DWP(2*4,"esp"),"ecx"); # t[2] + &mov ("ecx","edx"); + + &mul ("ebx"); # a[3]*b[0] + &add ("ecx","eax"); + &mov ("eax",&DWP(4*4,"esi")); + &adc ("edx",0); + &mov (&DWP(3*4,"esp"),"ecx"); # t[3] + &mov ("ecx","edx"); + + &mul ("ebx"); # a[4]*b[0] + &add ("ecx","eax"); + &mov ("eax",&DWP(5*4,"esi")); + &adc ("edx",0); + &mov (&DWP(4*4,"esp"),"ecx"); # t[4] + &mov ("ecx","edx"); + + &mul ("ebx"); # a[5]*b[0] + &add ("ecx","eax"); + &mov ("eax",&DWP(6*4,"esi")); + &adc ("edx",0); + &mov (&DWP(5*4,"esp"),"ecx"); # t[5] + &mov ("ecx","edx"); + + &mul ("ebx"); # a[6]*b[0] + &add ("ecx","eax"); + &mov ("eax",&DWP(7*4,"esi")); + &adc ("edx",0); + &mov (&DWP(6*4,"esp"),"ecx"); # t[6] + &mov ("ecx","edx"); + + &xor ("edi","edi"); # initial top-most carry + &mul ("ebx"); # a[7]*b[0] + &add ("ecx","eax"); # t[7] + &mov ("eax",&DWP(0*4,"esp")); # t[0] + &adc ("edx",0); # t[8] + +for ($i=0;$i<7;$i++) { + my $j=$i+1; + + # Reduction iteration is normally performed by accumulating + # result of multiplication of modulus by "magic" digit [and + # omitting least significant word, which is guaranteed to + # be 0], but thanks to special form of modulus and "magic" + # digit being equal to least significant word, it can be + # performed with additions and subtractions alone. 
Indeed: + # + # ffff.0001.0000.0000.0000.ffff.ffff.ffff + # * abcd + # + xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.abcd + # + # Now observing that ff..ff*x = (2^n-1)*x = 2^n*x-x, we + # rewrite above as: + # + # xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.abcd + # + abcd.0000.abcd.0000.0000.abcd.0000.0000.0000 + # - abcd.0000.0000.0000.0000.0000.0000.abcd + # + # or marking redundant operations: + # + # xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.---- + # + abcd.0000.abcd.0000.0000.abcd.----.----.---- + # - abcd.----.----.----.----.----.----.---- + + &add (&DWP((($i+3)%8)*4,"esp"),"eax"); # t[3]+=t[0] + &adc (&DWP((($i+4)%8)*4,"esp"),0); # t[4]+=0 + &adc (&DWP((($i+5)%8)*4,"esp"),0); # t[5]+=0 + &adc (&DWP((($i+6)%8)*4,"esp"),"eax"); # t[6]+=t[0] + &adc ("ecx",0); # t[7]+=0 + &adc ("edx","eax"); # t[8]+=t[0] + &adc ("edi",0); # top-most carry + &mov ("ebx",&DWP($j*4,"ebp")); # b[i] + &sub ("ecx","eax"); # t[7]-=t[0] + &mov ("eax",&DWP(0*4,"esi")); # a[0] + &sbb ("edx",0); # t[8]-=0 + &mov (&DWP((($i+7)%8)*4,"esp"),"ecx"); + &sbb ("edi",0); # top-most carry, + # keep in mind that + # netto result is + # *addition* of value + # with (abcd<<32)-abcd + # on top, so that + # underflow is + # impossible, because + # (abcd<<32)-abcd + # doesn't underflow + &mov (&DWP((($i+8)%8)*4,"esp"),"edx"); + + &mul ("ebx"); # a[0]*b[i] + &add ("eax",&DWP((($j+0)%8)*4,"esp")); + &adc ("edx",0); + &mov (&DWP((($j+0)%8)*4,"esp"),"eax"); + &mov ("eax",&DWP(1*4,"esi")); + &mov ("ecx","edx") + + &mul ("ebx"); # a[1]*b[i] + &add ("ecx",&DWP((($j+1)%8)*4,"esp")); + &adc ("edx",0); + &add ("ecx","eax"); + &adc ("edx",0); + &mov ("eax",&DWP(2*4,"esi")); + &mov (&DWP((($j+1)%8)*4,"esp"),"ecx"); + &mov ("ecx","edx"); + + &mul ("ebx"); # a[2]*b[i] + &add ("ecx",&DWP((($j+2)%8)*4,"esp")); + &adc ("edx",0); + &add ("ecx","eax"); + &adc ("edx",0); + &mov ("eax",&DWP(3*4,"esi")); + &mov (&DWP((($j+2)%8)*4,"esp"),"ecx"); + &mov ("ecx","edx"); + + &mul ("ebx"); # a[3]*b[i] + &add ("ecx",&DWP((($j+3)%8)*4,"esp")); + &adc ("edx",0); + &add ("ecx","eax"); + &adc ("edx",0); + &mov ("eax",&DWP(4*4,"esi")); + &mov (&DWP((($j+3)%8)*4,"esp"),"ecx"); + &mov ("ecx","edx"); + + &mul ("ebx"); # a[4]*b[i] + &add ("ecx",&DWP((($j+4)%8)*4,"esp")); + &adc ("edx",0); + &add ("ecx","eax"); + &adc ("edx",0); + &mov ("eax",&DWP(5*4,"esi")); + &mov (&DWP((($j+4)%8)*4,"esp"),"ecx"); + &mov ("ecx","edx"); + + &mul ("ebx"); # a[5]*b[i] + &add ("ecx",&DWP((($j+5)%8)*4,"esp")); + &adc ("edx",0); + &add ("ecx","eax"); + &adc ("edx",0); + &mov ("eax",&DWP(6*4,"esi")); + &mov (&DWP((($j+5)%8)*4,"esp"),"ecx"); + &mov ("ecx","edx"); + + &mul ("ebx"); # a[6]*b[i] + &add ("ecx",&DWP((($j+6)%8)*4,"esp")); + &adc ("edx",0); + &add ("ecx","eax"); + &adc ("edx",0); + &mov ("eax",&DWP(7*4,"esi")); + &mov (&DWP((($j+6)%8)*4,"esp"),"ecx"); + &mov ("ecx","edx"); + + &mul ("ebx"); # a[7]*b[i] + &add ("ecx",&DWP((($j+7)%8)*4,"esp")); + &adc ("edx",0); + &add ("ecx","eax"); # t[7] + &mov ("eax",&DWP((($j+0)%8)*4,"esp")); # t[0] + &adc ("edx","edi"); # t[8] + &mov ("edi",0); + &adc ("edi",0); # top-most carry +} + &mov ("ebp",&DWP(8*4,"esp")); # restore dst ptr + &xor ("esi","esi"); + my $j=$i+1; + + # last multiplication-less reduction + &add (&DWP((($i+3)%8)*4,"esp"),"eax"); # t[3]+=t[0] + &adc (&DWP((($i+4)%8)*4,"esp"),0); # t[4]+=0 + &adc (&DWP((($i+5)%8)*4,"esp"),0); # t[5]+=0 + &adc (&DWP((($i+6)%8)*4,"esp"),"eax"); # t[6]+=t[0] + &adc ("ecx",0); # t[7]+=0 + &adc ("edx","eax"); # t[8]+=t[0] + &adc ("edi",0); # top-most carry + &mov ("ebx",&DWP((($j+1)%8)*4,"esp")); + &sub 
("ecx","eax"); # t[7]-=t[0] + &mov ("eax",&DWP((($j+0)%8)*4,"esp")); + &sbb ("edx",0); # t[8]-=0 + &mov (&DWP((($i+7)%8)*4,"esp"),"ecx"); + &sbb ("edi",0); # top-most carry + &mov (&DWP((($i+8)%8)*4,"esp"),"edx"); + + # Final step is "if result > mod, subtract mod", but we do it + # "other way around", namely write result - mod to output buffer + # and if subtraction borrowed, add modulus back. + + &mov ("ecx",&DWP((($j+2)%8)*4,"esp")); + &sub ("eax",-1); + &mov ("edx",&DWP((($j+3)%8)*4,"esp")); + &sbb ("ebx",-1); + &mov (&DWP(0*4,"ebp"),"eax"); + &sbb ("ecx",-1); + &mov (&DWP(1*4,"ebp"),"ebx"); + &sbb ("edx",0); + &mov (&DWP(2*4,"ebp"),"ecx"); + &mov (&DWP(3*4,"ebp"),"edx"); + + &mov ("eax",&DWP((($j+4)%8)*4,"esp")); + &mov ("ebx",&DWP((($j+5)%8)*4,"esp")); + &mov ("ecx",&DWP((($j+6)%8)*4,"esp")); + &sbb ("eax",0); + &mov ("edx",&DWP((($j+7)%8)*4,"esp")); + &sbb ("ebx",0); + &sbb ("ecx",1); + &sbb ("edx",-1); + &sbb ("edi",0); + + # Note that because mod has special form, i.e. consists of + # 0xffffffff, 1 and 0s, we can conditionally synthesize it by + # assigning borrow bit to one register, %ebp, and its negative + # to another, %esi. But we started by calculating %esi... + + &sub ("esi","edi"); + &add (&DWP(0*4,"ebp"),"edi"); # add modulus or zero + &adc (&DWP(1*4,"ebp"),"edi"); + &adc (&DWP(2*4,"ebp"),"edi"); + &adc (&DWP(3*4,"ebp"),0); + &adc ("eax",0); + &adc ("ebx",0); + &mov (&DWP(4*4,"ebp"),"eax"); + &adc ("ecx","esi"); + &mov (&DWP(5*4,"ebp"),"ebx"); + &adc ("edx","edi"); + &mov (&DWP(6*4,"ebp"),"ecx"); + &mov ("edi","ebp"); # fulfill contract + &mov (&DWP(7*4,"ebp"),"edx"); + + &add ("esp",10*4); + &ret (); +&function_end_B("_ecp_nistz256_mul_mont"); + +######################################################################## +# void ecp_nistz256_scatter_w5(void *edi,const P256_POINT *esi, +# int ebp); +&function_begin("ecp_nistz256_scatter_w5"); + &mov ("edi",&wparam(0)); + &mov ("esi",&wparam(1)); + &mov ("ebp",&wparam(2)); + + &lea ("edi",&DWP(128-4,"edi","ebp",4)); + &mov ("ebp",96/16); +&set_label("scatter_w5_loop"); + &mov ("eax",&DWP(0,"esi")); + &mov ("ebx",&DWP(4,"esi")); + &mov ("ecx",&DWP(8,"esi")); + &mov ("edx",&DWP(12,"esi")); + &lea ("esi",&DWP(16,"esi")); + &mov (&DWP(64*0-128,"edi"),"eax"); + &mov (&DWP(64*1-128,"edi"),"ebx"); + &mov (&DWP(64*2-128,"edi"),"ecx"); + &mov (&DWP(64*3-128,"edi"),"edx"); + &lea ("edi",&DWP(64*4,"edi")); + &dec ("ebp"); + &jnz (&label("scatter_w5_loop")); +&function_end("ecp_nistz256_scatter_w5"); + +######################################################################## +# void ecp_nistz256_gather_w5(P256_POINT *edi,const void *esi, +# int ebp); +&function_begin("ecp_nistz256_gather_w5"); + &mov ("esi",&wparam(1)); + &mov ("ebp",&wparam(2)); + + &lea ("esi",&DWP(0,"esi","ebp",4)); + &neg ("ebp"); + &sar ("ebp",31); + &mov ("edi",&wparam(0)); + &lea ("esi",&DWP(0,"esi","ebp",4)); + + for($i=0;$i<24;$i+=4) { + &mov ("eax",&DWP(64*($i+0),"esi")); + &mov ("ebx",&DWP(64*($i+1),"esi")); + &mov ("ecx",&DWP(64*($i+2),"esi")); + &mov ("edx",&DWP(64*($i+3),"esi")); + &and ("eax","ebp"); + &and ("ebx","ebp"); + &and ("ecx","ebp"); + &and ("edx","ebp"); + &mov (&DWP(4*($i+0),"edi"),"eax"); + &mov (&DWP(4*($i+1),"edi"),"ebx"); + &mov (&DWP(4*($i+2),"edi"),"ecx"); + &mov (&DWP(4*($i+3),"edi"),"edx"); + } +&function_end("ecp_nistz256_gather_w5"); + +######################################################################## +# void ecp_nistz256_scatter_w7(void *edi,const P256_POINT_AFFINE *esi, +# int ebp); 
+&function_begin("ecp_nistz256_scatter_w7"); + &mov ("edi",&wparam(0)); + &mov ("esi",&wparam(1)); + &mov ("ebp",&wparam(2)); + + &lea ("edi",&DWP(0,"edi","ebp")); + &mov ("ebp",64/4); +&set_label("scatter_w7_loop"); + &mov ("eax",&DWP(0,"esi")); + &lea ("esi",&DWP(4,"esi")); + &mov (&BP(64*0,"edi"),"al"); + &mov (&BP(64*1,"edi"),"ah"); + &shr ("eax",16); + &mov (&BP(64*2,"edi"),"al"); + &mov (&BP(64*3,"edi"),"ah"); + &lea ("edi",&DWP(64*4,"edi")); + &dec ("ebp"); + &jnz (&label("scatter_w7_loop")); +&function_end("ecp_nistz256_scatter_w7"); + +######################################################################## +# void ecp_nistz256_gather_w7(P256_POINT_AFFINE *edi,const void *esi, +# int ebp); +&function_begin("ecp_nistz256_gather_w7"); + &mov ("esi",&wparam(1)); + &mov ("ebp",&wparam(2)); + + &add ("esi","ebp"); + &neg ("ebp"), + &sar ("ebp",31); + &mov ("edi",&wparam(0)); + &lea ("esi",&DWP(0,"esi","ebp")); + + for($i=0;$i<64;$i+=4) { + &movz ("eax",&BP(64*($i+0),"esi")); + &movz ("ebx",&BP(64*($i+1),"esi")); + &movz ("ecx",&BP(64*($i+2),"esi")); + &and ("eax","ebp"); + &movz ("edx",&BP(64*($i+3),"esi")); + &and ("ebx","ebp"); + &mov (&BP($i+0,"edi"),"al"); + &and ("ecx","ebp"); + &mov (&BP($i+1,"edi"),"bl"); + &and ("edx","ebp"); + &mov (&BP($i+2,"edi"),"cl"); + &mov (&BP($i+3,"edi"),"dl"); + } +&function_end("ecp_nistz256_gather_w7"); + +######################################################################## +# following subroutines are "literal" implementation of those found in +# ecp_nistz256.c +# +######################################################################## +# void ecp_nistz256_point_double(P256_POINT *out,const P256_POINT *inp); +# +&static_label("point_double_shortcut"); +&function_begin("ecp_nistz256_point_double"); +{ my ($S,$M,$Zsqr,$in_x,$tmp0)=map(32*$_,(0..4)); + + &mov ("esi",&wparam(1)); + + # above map() describes stack layout with 5 temporary + # 256-bit vectors on top, then we take extra word for + # OPENSSL_ia32cap_P copy. 
+ &stack_push(8*5+1); + if ($sse2) { + &call ("_picup_eax"); + &set_label("pic"); + &picmeup("edx","OPENSSL_ia32cap_P","eax",&label("pic")); + &mov ("ebp",&DWP(0,"edx")); } + +&set_label("point_double_shortcut"); + &mov ("eax",&DWP(0,"esi")); # copy in_x + &mov ("ebx",&DWP(4,"esi")); + &mov ("ecx",&DWP(8,"esi")); + &mov ("edx",&DWP(12,"esi")); + &mov (&DWP($in_x+0,"esp"),"eax"); + &mov (&DWP($in_x+4,"esp"),"ebx"); + &mov (&DWP($in_x+8,"esp"),"ecx"); + &mov (&DWP($in_x+12,"esp"),"edx"); + &mov ("eax",&DWP(16,"esi")); + &mov ("ebx",&DWP(20,"esi")); + &mov ("ecx",&DWP(24,"esi")); + &mov ("edx",&DWP(28,"esi")); + &mov (&DWP($in_x+16,"esp"),"eax"); + &mov (&DWP($in_x+20,"esp"),"ebx"); + &mov (&DWP($in_x+24,"esp"),"ecx"); + &mov (&DWP($in_x+28,"esp"),"edx"); + &mov (&DWP(32*5,"esp"),"ebp"); # OPENSSL_ia32cap_P copy + + &lea ("ebp",&DWP(32,"esi")); + &lea ("esi",&DWP(32,"esi")); + &lea ("edi",&DWP($S,"esp")); + &call ("_ecp_nistz256_add"); # p256_mul_by_2(S, in_y); + + &mov ("eax",&DWP(32*5,"esp")); # OPENSSL_ia32cap_P copy + &mov ("esi",64); + &add ("esi",&wparam(1)); + &lea ("edi",&DWP($Zsqr,"esp")); + &mov ("ebp","esi"); + &call ("_ecp_nistz256_mul_mont"); # p256_sqr_mont(Zsqr, in_z); + + &mov ("eax",&DWP(32*5,"esp")); # OPENSSL_ia32cap_P copy + &lea ("esi",&DWP($S,"esp")); + &lea ("ebp",&DWP($S,"esp")); + &lea ("edi",&DWP($S,"esp")); + &call ("_ecp_nistz256_mul_mont"); # p256_sqr_mont(S, S); + + &mov ("eax",&DWP(32*5,"esp")); # OPENSSL_ia32cap_P copy + &mov ("ebp",&wparam(1)); + &lea ("esi",&DWP(32,"ebp")); + &lea ("ebp",&DWP(64,"ebp")); + &lea ("edi",&DWP($tmp0,"esp")); + &call ("_ecp_nistz256_mul_mont"); # p256_mul_mont(tmp0, in_z, in_y); + + &lea ("esi",&DWP($in_x,"esp")); + &lea ("ebp",&DWP($Zsqr,"esp")); + &lea ("edi",&DWP($M,"esp")); + &call ("_ecp_nistz256_add"); # p256_add(M, in_x, Zsqr); + + &mov ("edi",64); + &lea ("esi",&DWP($tmp0,"esp")); + &lea ("ebp",&DWP($tmp0,"esp")); + &add ("edi",&wparam(0)); + &call ("_ecp_nistz256_add"); # p256_mul_by_2(res_z, tmp0); + + &lea ("esi",&DWP($in_x,"esp")); + &lea ("ebp",&DWP($Zsqr,"esp")); + &lea ("edi",&DWP($Zsqr,"esp")); + &call ("_ecp_nistz256_sub"); # p256_sub(Zsqr, in_x, Zsqr); + + &mov ("eax",&DWP(32*5,"esp")); # OPENSSL_ia32cap_P copy + &lea ("esi",&DWP($S,"esp")); + &lea ("ebp",&DWP($S,"esp")); + &lea ("edi",&DWP($tmp0,"esp")); + &call ("_ecp_nistz256_mul_mont"); # p256_sqr_mont(tmp0, S); + + &mov ("eax",&DWP(32*5,"esp")); # OPENSSL_ia32cap_P copy + &lea ("esi",&DWP($M,"esp")); + &lea ("ebp",&DWP($Zsqr,"esp")); + &lea ("edi",&DWP($M,"esp")); + &call ("_ecp_nistz256_mul_mont"); # p256_mul_mont(M, M, Zsqr); + + &mov ("edi",32); + &lea ("esi",&DWP($tmp0,"esp")); + &add ("edi",&wparam(0)); + &call ("_ecp_nistz256_div_by_2"); # p256_div_by_2(res_y, tmp0); + + &lea ("esi",&DWP($M,"esp")); + &lea ("ebp",&DWP($M,"esp")); + &lea ("edi",&DWP($tmp0,"esp")); + &call ("_ecp_nistz256_add"); # 1/2 p256_mul_by_3(M, M); + + &mov ("eax",&DWP(32*5,"esp")); # OPENSSL_ia32cap_P copy + &lea ("esi",&DWP($in_x,"esp")); + &lea ("ebp",&DWP($S,"esp")); + &lea ("edi",&DWP($S,"esp")); + &call ("_ecp_nistz256_mul_mont"); # p256_mul_mont(S, S, in_x); + + &lea ("esi",&DWP($tmp0,"esp")); + &lea ("ebp",&DWP($M,"esp")); + &lea ("edi",&DWP($M,"esp")); + &call ("_ecp_nistz256_add"); # 2/2 p256_mul_by_3(M, M); + + &lea ("esi",&DWP($S,"esp")); + &lea ("ebp",&DWP($S,"esp")); + &lea ("edi",&DWP($tmp0,"esp")); + &call ("_ecp_nistz256_add"); # p256_mul_by_2(tmp0, S); + + &mov ("eax",&DWP(32*5,"esp")); # OPENSSL_ia32cap_P copy + &lea ("esi",&DWP($M,"esp")); + &lea 
("ebp",&DWP($M,"esp")); + &mov ("edi",&wparam(0)); + &call ("_ecp_nistz256_mul_mont"); # p256_sqr_mont(res_x, M); + + &mov ("esi","edi"); # %edi is still res_x here + &lea ("ebp",&DWP($tmp0,"esp")); + &call ("_ecp_nistz256_sub"); # p256_sub(res_x, res_x, tmp0); + + &lea ("esi",&DWP($S,"esp")); + &mov ("ebp","edi"); # %edi is still res_x + &lea ("edi",&DWP($S,"esp")); + &call ("_ecp_nistz256_sub"); # p256_sub(S, S, res_x); + + &mov ("eax",&DWP(32*5,"esp")); # OPENSSL_ia32cap_P copy + &mov ("esi","edi"); # %edi is still &S + &lea ("ebp",&DWP($M,"esp")); + &call ("_ecp_nistz256_mul_mont"); # p256_mul_mont(S, S, M); + + &mov ("ebp",32); + &lea ("esi",&DWP($S,"esp")); + &add ("ebp",&wparam(0)); + &mov ("edi","ebp"); + &call ("_ecp_nistz256_sub"); # p256_sub(res_y, S, res_y); + + &stack_pop(8*5+1); +} &function_end("ecp_nistz256_point_double"); + +######################################################################## +# void ecp_nistz256_point_add(P256_POINT *out,const P256_POINT *in1, +# const P256_POINT *in2); +&function_begin("ecp_nistz256_point_add"); +{ my ($res_x,$res_y,$res_z, + $in1_x,$in1_y,$in1_z, + $in2_x,$in2_y,$in2_z, + $H,$Hsqr,$R,$Rsqr,$Hcub, + $U1,$U2,$S1,$S2)=map(32*$_,(0..17)); + my ($Z1sqr, $Z2sqr) = ($Hsqr, $Rsqr); + + &mov ("esi",&wparam(2)); + + # above map() describes stack layout with 18 temporary + # 256-bit vectors on top, then we take extra words for + # !in1infty, !in2infty, result of check for zero and + # OPENSSL_ia32cap_P copy. [one unused word for padding] + &stack_push(8*18+5); + if ($sse2) { + &call ("_picup_eax"); + &set_label("pic"); + &picmeup("edx","OPENSSL_ia32cap_P","eax",&label("pic")); + &mov ("ebp",&DWP(0,"edx")); } + + &lea ("edi",&DWP($in2_x,"esp")); + for($i=0;$i<96;$i+=16) { + &mov ("eax",&DWP($i+0,"esi")); # copy in2 + &mov ("ebx",&DWP($i+4,"esi")); + &mov ("ecx",&DWP($i+8,"esi")); + &mov ("edx",&DWP($i+12,"esi")); + &mov (&DWP($i+0,"edi"),"eax"); + &mov (&DWP(32*18+12,"esp"),"ebp") if ($i==0); + &mov ("ebp","eax") if ($i==64); + &or ("ebp","eax") if ($i>64); + &mov (&DWP($i+4,"edi"),"ebx"); + &or ("ebp","ebx") if ($i>=64); + &mov (&DWP($i+8,"edi"),"ecx"); + &or ("ebp","ecx") if ($i>=64); + &mov (&DWP($i+12,"edi"),"edx"); + &or ("ebp","edx") if ($i>=64); + } + &xor ("eax","eax"); + &mov ("esi",&wparam(1)); + &sub ("eax","ebp"); + &or ("ebp","eax"); + &sar ("ebp",31); + &mov (&DWP(32*18+4,"esp"),"ebp"); # !in2infty + + &lea ("edi",&DWP($in1_x,"esp")); + for($i=0;$i<96;$i+=16) { + &mov ("eax",&DWP($i+0,"esi")); # copy in1 + &mov ("ebx",&DWP($i+4,"esi")); + &mov ("ecx",&DWP($i+8,"esi")); + &mov ("edx",&DWP($i+12,"esi")); + &mov (&DWP($i+0,"edi"),"eax"); + &mov ("ebp","eax") if ($i==64); + &or ("ebp","eax") if ($i>64); + &mov (&DWP($i+4,"edi"),"ebx"); + &or ("ebp","ebx") if ($i>=64); + &mov (&DWP($i+8,"edi"),"ecx"); + &or ("ebp","ecx") if ($i>=64); + &mov (&DWP($i+12,"edi"),"edx"); + &or ("ebp","edx") if ($i>=64); + } + &xor ("eax","eax"); + &sub ("eax","ebp"); + &or ("ebp","eax"); + &sar ("ebp",31); + &mov (&DWP(32*18+0,"esp"),"ebp"); # !in1infty + + &mov ("eax",&DWP(32*18+12,"esp")); # OPENSSL_ia32cap_P copy + &lea ("esi",&DWP($in2_z,"esp")); + &lea ("ebp",&DWP($in2_z,"esp")); + &lea ("edi",&DWP($Z2sqr,"esp")); + &call ("_ecp_nistz256_mul_mont"); # p256_sqr_mont(Z2sqr, in2_z); + + &mov ("eax",&DWP(32*18+12,"esp")); # OPENSSL_ia32cap_P copy + &lea ("esi",&DWP($in1_z,"esp")); + &lea ("ebp",&DWP($in1_z,"esp")); + &lea ("edi",&DWP($Z1sqr,"esp")); + &call ("_ecp_nistz256_mul_mont"); # p256_sqr_mont(Z1sqr, in1_z); + + &mov ("eax",&DWP(32*18+12,"esp")); # 
OPENSSL_ia32cap_P copy + &lea ("esi",&DWP($Z2sqr,"esp")); + &lea ("ebp",&DWP($in2_z,"esp")); + &lea ("edi",&DWP($S1,"esp")); + &call ("_ecp_nistz256_mul_mont"); # p256_mul_mont(S1, Z2sqr, in2_z); + + &mov ("eax",&DWP(32*18+12,"esp")); # OPENSSL_ia32cap_P copy + &lea ("esi",&DWP($Z1sqr,"esp")); + &lea ("ebp",&DWP($in1_z,"esp")); + &lea ("edi",&DWP($S2,"esp")); + &call ("_ecp_nistz256_mul_mont"); # p256_mul_mont(S2, Z1sqr, in1_z); + + &mov ("eax",&DWP(32*18+12,"esp")); # OPENSSL_ia32cap_P copy + &lea ("esi",&DWP($in1_y,"esp")); + &lea ("ebp",&DWP($S1,"esp")); + &lea ("edi",&DWP($S1,"esp")); + &call ("_ecp_nistz256_mul_mont"); # p256_mul_mont(S1, S1, in1_y); + + &mov ("eax",&DWP(32*18+12,"esp")); # OPENSSL_ia32cap_P copy + &lea ("esi",&DWP($in2_y,"esp")); + &lea ("ebp",&DWP($S2,"esp")); + &lea ("edi",&DWP($S2,"esp")); + &call ("_ecp_nistz256_mul_mont"); # p256_mul_mont(S2, S2, in2_y); + + &lea ("esi",&DWP($S2,"esp")); + &lea ("ebp",&DWP($S1,"esp")); + &lea ("edi",&DWP($R,"esp")); + &call ("_ecp_nistz256_sub"); # p256_sub(R, S2, S1); + + &or ("ebx","eax"); # see if result is zero + &mov ("eax",&DWP(32*18+12,"esp")); # OPENSSL_ia32cap_P copy + &or ("ebx","ecx"); + &or ("ebx","edx"); + &or ("ebx",&DWP(0,"edi")); + &or ("ebx",&DWP(4,"edi")); + &lea ("esi",&DWP($in1_x,"esp")); + &or ("ebx",&DWP(8,"edi")); + &lea ("ebp",&DWP($Z2sqr,"esp")); + &or ("ebx",&DWP(12,"edi")); + &lea ("edi",&DWP($U1,"esp")); + &mov (&DWP(32*18+8,"esp"),"ebx"); + + &call ("_ecp_nistz256_mul_mont"); # p256_mul_mont(U1, in1_x, Z2sqr); + + &mov ("eax",&DWP(32*18+12,"esp")); # OPENSSL_ia32cap_P copy + &lea ("esi",&DWP($in2_x,"esp")); + &lea ("ebp",&DWP($Z1sqr,"esp")); + &lea ("edi",&DWP($U2,"esp")); + &call ("_ecp_nistz256_mul_mont"); # p256_mul_mont(U2, in2_x, Z1sqr); + + &lea ("esi",&DWP($U2,"esp")); + &lea ("ebp",&DWP($U1,"esp")); + &lea ("edi",&DWP($H,"esp")); + &call ("_ecp_nistz256_sub"); # p256_sub(H, U2, U1); + + &or ("eax","ebx"); # see if result is zero + &or ("eax","ecx"); + &or ("eax","edx"); + &or ("eax",&DWP(0,"edi")); + &or ("eax",&DWP(4,"edi")); + &or ("eax",&DWP(8,"edi")); + &or ("eax",&DWP(12,"edi")); + + &data_byte(0x3e); # predict taken + &jnz (&label("add_proceed")); # is_equal(U1,U2)? + + &mov ("eax",&DWP(32*18+0,"esp")); + &and ("eax",&DWP(32*18+4,"esp")); + &mov ("ebx",&DWP(32*18+8,"esp")); + &jz (&label("add_proceed")); # (in1infty || in2infty)? + &test ("ebx","ebx"); + &jz (&label("add_double")); # is_equal(S1,S2)? 
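For readability: in1infty and in2infty are kept on the stack as all-zero/all-ones masks, produced by the xor/sub/or/sar-31 "is this value non-zero" idiom when the inputs were copied in, and the three branches just taken encode the same case analysis as the reference C point addition. Below is a rough C rendering of that decision, a sketch only: is_zero() is a hypothetical 256-bit test, and it assumes (as the stosd fill below does) that the point at infinity is returned as 96 zero bytes.

#include <stdint.h>
#include <string.h>

int is_zero(const uint64_t a[4]);   /* assumed constant-time 256-bit test */

/* Sketch of the control flow around .Ladd_proceed / .Ladd_double.
 * H = U2 - U1 and R = S2 - S1 were computed above. */
void point_add_tail_sketch(int in1infty, int in2infty,
                           const uint64_t H[4], const uint64_t R[4],
                           unsigned char out[96])
{
    if (!is_zero(H)) {
        /* x1 != x2: generic addition (.Ladd_proceed) */
    } else if (in1infty || in2infty) {
        /* one input is the point at infinity: also .Ladd_proceed; the
         * bogus arithmetic is discarded by the final masked selection,
         * which returns the other input unchanged */
    } else if (is_zero(R)) {
        /* same x, same y: P == Q, so jump into the doubling code
         * (.Ladd_double -> point_double_shortcut) */
    } else {
        /* same x, opposite y: P == -Q, so the sum is the point at
         * infinity, emitted as 96 zero bytes (the cld/stosd fill) */
        memset(out, 0, 96);
    }
}

The "conditional moves" loops at the end then assemble every output word as (res & both_finite_mask) | (in2 & only_in1_infinite_mask) | (in1 & in2_infinite_mask), so the choice between the computed sum and either input never takes a data-dependent branch.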
+ + &mov ("edi",&wparam(0)); + &xor ("eax","eax"); + &mov ("ecx",96/4); + &data_byte(0xfc,0xf3,0xab); # cld; stosd + &jmp (&label("add_done")); + +&set_label("add_double",16); + &mov ("esi",&wparam(1)); + &mov ("ebp",&DWP(32*18+12,"esp")); # OPENSSL_ia32cap_P copy + &add ("esp",4*((8*18+5)-(8*5+1))); # difference in frame sizes + &jmp (&label("point_double_shortcut")); + +&set_label("add_proceed",16); + &mov ("eax",&DWP(32*18+12,"esp")); # OPENSSL_ia32cap_P copy + &lea ("esi",&DWP($R,"esp")); + &lea ("ebp",&DWP($R,"esp")); + &lea ("edi",&DWP($Rsqr,"esp")); + &call ("_ecp_nistz256_mul_mont"); # p256_sqr_mont(Rsqr, R); + + &mov ("eax",&DWP(32*18+12,"esp")); # OPENSSL_ia32cap_P copy + &lea ("esi",&DWP($H,"esp")); + &lea ("ebp",&DWP($in1_z,"esp")); + &lea ("edi",&DWP($res_z,"esp")); + &call ("_ecp_nistz256_mul_mont"); # p256_mul_mont(res_z, H, in1_z); + + &mov ("eax",&DWP(32*18+12,"esp")); # OPENSSL_ia32cap_P copy + &lea ("esi",&DWP($H,"esp")); + &lea ("ebp",&DWP($H,"esp")); + &lea ("edi",&DWP($Hsqr,"esp")); + &call ("_ecp_nistz256_mul_mont"); # p256_sqr_mont(Hsqr, H); + + &mov ("eax",&DWP(32*18+12,"esp")); # OPENSSL_ia32cap_P copy + &lea ("esi",&DWP($in2_z,"esp")); + &lea ("ebp",&DWP($res_z,"esp")); + &lea ("edi",&DWP($res_z,"esp")); + &call ("_ecp_nistz256_mul_mont"); # p256_mul_mont(res_z, res_z, in2_z); + + &mov ("eax",&DWP(32*18+12,"esp")); # OPENSSL_ia32cap_P copy + &lea ("esi",&DWP($Hsqr,"esp")); + &lea ("ebp",&DWP($U1,"esp")); + &lea ("edi",&DWP($U2,"esp")); + &call ("_ecp_nistz256_mul_mont"); # p256_mul_mont(U2, U1, Hsqr); + + &mov ("eax",&DWP(32*18+12,"esp")); # OPENSSL_ia32cap_P copy + &lea ("esi",&DWP($H,"esp")); + &lea ("ebp",&DWP($Hsqr,"esp")); + &lea ("edi",&DWP($Hcub,"esp")); + &call ("_ecp_nistz256_mul_mont"); # p256_mul_mont(Hcub, Hsqr, H); + + &lea ("esi",&DWP($U2,"esp")); + &lea ("ebp",&DWP($U2,"esp")); + &lea ("edi",&DWP($Hsqr,"esp")); + &call ("_ecp_nistz256_add"); # p256_mul_by_2(Hsqr, U2); + + &lea ("esi",&DWP($Rsqr,"esp")); + &lea ("ebp",&DWP($Hsqr,"esp")); + &lea ("edi",&DWP($res_x,"esp")); + &call ("_ecp_nistz256_sub"); # p256_sub(res_x, Rsqr, Hsqr); + + &lea ("esi",&DWP($res_x,"esp")); + &lea ("ebp",&DWP($Hcub,"esp")); + &lea ("edi",&DWP($res_x,"esp")); + &call ("_ecp_nistz256_sub"); # p256_sub(res_x, res_x, Hcub); + + &lea ("esi",&DWP($U2,"esp")); + &lea ("ebp",&DWP($res_x,"esp")); + &lea ("edi",&DWP($res_y,"esp")); + &call ("_ecp_nistz256_sub"); # p256_sub(res_y, U2, res_x); + + &mov ("eax",&DWP(32*18+12,"esp")); # OPENSSL_ia32cap_P copy + &lea ("esi",&DWP($Hcub,"esp")); + &lea ("ebp",&DWP($S1,"esp")); + &lea ("edi",&DWP($S2,"esp")); + &call ("_ecp_nistz256_mul_mont"); # p256_mul_mont(S2, S1, Hcub); + + &mov ("eax",&DWP(32*18+12,"esp")); # OPENSSL_ia32cap_P copy + &lea ("esi",&DWP($R,"esp")); + &lea ("ebp",&DWP($res_y,"esp")); + &lea ("edi",&DWP($res_y,"esp")); + &call ("_ecp_nistz256_mul_mont"); # p256_mul_mont(res_y, R, res_y); + + &lea ("esi",&DWP($res_y,"esp")); + &lea ("ebp",&DWP($S2,"esp")); + &lea ("edi",&DWP($res_y,"esp")); + &call ("_ecp_nistz256_sub"); # p256_sub(res_y, res_y, S2); + + &mov ("ebp",&DWP(32*18+0,"esp")); # !in1infty + &mov ("esi",&DWP(32*18+4,"esp")); # !in2infty + &mov ("edi",&wparam(0)); + &mov ("edx","ebp"); + ¬ ("ebp"); + &and ("edx","esi"); + &and ("ebp","esi"); + ¬ ("esi"); + + ######################################## + # conditional moves + for($i=64;$i<96;$i+=4) { + &mov ("eax","edx"); + &and ("eax",&DWP($res_x+$i,"esp")); + &mov ("ebx","ebp"); + &and ("ebx",&DWP($in2_x+$i,"esp")); + &mov ("ecx","esi"); + &and 
("ecx",&DWP($in1_x+$i,"esp")); + &or ("eax","ebx"); + &or ("eax","ecx"); + &mov (&DWP($i,"edi"),"eax"); + } + for($i=0;$i<64;$i+=4) { + &mov ("eax","edx"); + &and ("eax",&DWP($res_x+$i,"esp")); + &mov ("ebx","ebp"); + &and ("ebx",&DWP($in2_x+$i,"esp")); + &mov ("ecx","esi"); + &and ("ecx",&DWP($in1_x+$i,"esp")); + &or ("eax","ebx"); + &or ("eax","ecx"); + &mov (&DWP($i,"edi"),"eax"); + } + &set_label("add_done"); + &stack_pop(8*18+5); +} &function_end("ecp_nistz256_point_add"); + +######################################################################## +# void ecp_nistz256_point_add_affine(P256_POINT *out, +# const P256_POINT *in1, +# const P256_POINT_AFFINE *in2); +&function_begin("ecp_nistz256_point_add_affine"); +{ + my ($res_x,$res_y,$res_z, + $in1_x,$in1_y,$in1_z, + $in2_x,$in2_y, + $U2,$S2,$H,$R,$Hsqr,$Hcub,$Rsqr)=map(32*$_,(0..14)); + my $Z1sqr = $S2; + my @ONE_mont=(1,0,0,-1,-1,-1,-2,0); + + &mov ("esi",&wparam(1)); + + # above map() describes stack layout with 15 temporary + # 256-bit vectors on top, then we take extra words for + # !in1infty, !in2infty, and OPENSSL_ia32cap_P copy. + &stack_push(8*15+3); + if ($sse2) { + &call ("_picup_eax"); + &set_label("pic"); + &picmeup("edx","OPENSSL_ia32cap_P","eax",&label("pic")); + &mov ("ebp",&DWP(0,"edx")); } + + &lea ("edi",&DWP($in1_x,"esp")); + for($i=0;$i<96;$i+=16) { + &mov ("eax",&DWP($i+0,"esi")); # copy in1 + &mov ("ebx",&DWP($i+4,"esi")); + &mov ("ecx",&DWP($i+8,"esi")); + &mov ("edx",&DWP($i+12,"esi")); + &mov (&DWP($i+0,"edi"),"eax"); + &mov (&DWP(32*15+8,"esp"),"ebp") if ($i==0); + &mov ("ebp","eax") if ($i==64); + &or ("ebp","eax") if ($i>64); + &mov (&DWP($i+4,"edi"),"ebx"); + &or ("ebp","ebx") if ($i>=64); + &mov (&DWP($i+8,"edi"),"ecx"); + &or ("ebp","ecx") if ($i>=64); + &mov (&DWP($i+12,"edi"),"edx"); + &or ("ebp","edx") if ($i>=64); + } + &xor ("eax","eax"); + &mov ("esi",&wparam(2)); + &sub ("eax","ebp"); + &or ("ebp","eax"); + &sar ("ebp",31); + &mov (&DWP(32*15+0,"esp"),"ebp"); # !in1infty + + &lea ("edi",&DWP($in2_x,"esp")); + for($i=0;$i<64;$i+=16) { + &mov ("eax",&DWP($i+0,"esi")); # copy in2 + &mov ("ebx",&DWP($i+4,"esi")); + &mov ("ecx",&DWP($i+8,"esi")); + &mov ("edx",&DWP($i+12,"esi")); + &mov (&DWP($i+0,"edi"),"eax"); + &mov ("ebp","eax") if ($i==0); + &or ("ebp","eax") if ($i!=0); + &mov (&DWP($i+4,"edi"),"ebx"); + &or ("ebp","ebx"); + &mov (&DWP($i+8,"edi"),"ecx"); + &or ("ebp","ecx"); + &mov (&DWP($i+12,"edi"),"edx"); + &or ("ebp","edx"); + } + &xor ("ebx","ebx"); + &mov ("eax",&DWP(32*15+8,"esp")); # OPENSSL_ia32cap_P copy + &sub ("ebx","ebp"); + &lea ("esi",&DWP($in1_z,"esp")); + &or ("ebx","ebp"); + &lea ("ebp",&DWP($in1_z,"esp")); + &sar ("ebx",31); + &lea ("edi",&DWP($Z1sqr,"esp")); + &mov (&DWP(32*15+4,"esp"),"ebx"); # !in2infty + + &call ("_ecp_nistz256_mul_mont"); # p256_sqr_mont(Z1sqr, in1_z); + + &mov ("eax",&DWP(32*15+8,"esp")); # OPENSSL_ia32cap_P copy + &lea ("esi",&DWP($in2_x,"esp")); + &mov ("ebp","edi"); # %esi is stull &Z1sqr + &lea ("edi",&DWP($U2,"esp")); + &call ("_ecp_nistz256_mul_mont"); # p256_mul_mont(U2, Z1sqr, in2_x); + + &mov ("eax",&DWP(32*15+8,"esp")); # OPENSSL_ia32cap_P copy + &lea ("esi",&DWP($in1_z,"esp")); + &lea ("ebp",&DWP($Z1sqr,"esp")); + &lea ("edi",&DWP($S2,"esp")); + &call ("_ecp_nistz256_mul_mont"); # p256_mul_mont(S2, Z1sqr, in1_z); + + &lea ("esi",&DWP($U2,"esp")); + &lea ("ebp",&DWP($in1_x,"esp")); + &lea ("edi",&DWP($H,"esp")); + &call ("_ecp_nistz256_sub"); # p256_sub(H, U2, in1_x); + + &mov ("eax",&DWP(32*15+8,"esp")); # OPENSSL_ia32cap_P copy + &lea 
("esi",&DWP($in2_y,"esp")); + &lea ("ebp",&DWP($S2,"esp")); + &lea ("edi",&DWP($S2,"esp")); + &call ("_ecp_nistz256_mul_mont"); # p256_mul_mont(S2, S2, in2_y); + + &mov ("eax",&DWP(32*15+8,"esp")); # OPENSSL_ia32cap_P copy + &lea ("esi",&DWP($in1_z,"esp")); + &lea ("ebp",&DWP($H,"esp")); + &lea ("edi",&DWP($res_z,"esp")); + &call ("_ecp_nistz256_mul_mont"); # p256_mul_mont(res_z, H, in1_z); + + &lea ("esi",&DWP($S2,"esp")); + &lea ("ebp",&DWP($in1_y,"esp")); + &lea ("edi",&DWP($R,"esp")); + &call ("_ecp_nistz256_sub"); # p256_sub(R, S2, in1_y); + + &mov ("eax",&DWP(32*15+8,"esp")); # OPENSSL_ia32cap_P copy + &lea ("esi",&DWP($H,"esp")); + &lea ("ebp",&DWP($H,"esp")); + &lea ("edi",&DWP($Hsqr,"esp")); + &call ("_ecp_nistz256_mul_mont"); # p256_sqr_mont(Hsqr, H); + + &mov ("eax",&DWP(32*15+8,"esp")); # OPENSSL_ia32cap_P copy + &lea ("esi",&DWP($R,"esp")); + &lea ("ebp",&DWP($R,"esp")); + &lea ("edi",&DWP($Rsqr,"esp")); + &call ("_ecp_nistz256_mul_mont"); # p256_sqr_mont(Rsqr, R); + + &mov ("eax",&DWP(32*15+8,"esp")); # OPENSSL_ia32cap_P copy + &lea ("esi",&DWP($in1_x,"esp")); + &lea ("ebp",&DWP($Hsqr,"esp")); + &lea ("edi",&DWP($U2,"esp")); + &call ("_ecp_nistz256_mul_mont"); # p256_mul_mont(U2, in1_x, Hsqr); + + &mov ("eax",&DWP(32*15+8,"esp")); # OPENSSL_ia32cap_P copy + &lea ("esi",&DWP($H,"esp")); + &lea ("ebp",&DWP($Hsqr,"esp")); + &lea ("edi",&DWP($Hcub,"esp")); + &call ("_ecp_nistz256_mul_mont"); # p256_mul_mont(Hcub, Hsqr, H); + + &lea ("esi",&DWP($U2,"esp")); + &lea ("ebp",&DWP($U2,"esp")); + &lea ("edi",&DWP($Hsqr,"esp")); + &call ("_ecp_nistz256_add"); # p256_mul_by_2(Hsqr, U2); + + &lea ("esi",&DWP($Rsqr,"esp")); + &lea ("ebp",&DWP($Hsqr,"esp")); + &lea ("edi",&DWP($res_x,"esp")); + &call ("_ecp_nistz256_sub"); # p256_sub(res_x, Rsqr, Hsqr); + + &lea ("esi",&DWP($res_x,"esp")); + &lea ("ebp",&DWP($Hcub,"esp")); + &lea ("edi",&DWP($res_x,"esp")); + &call ("_ecp_nistz256_sub"); # p256_sub(res_x, res_x, Hcub); + + &lea ("esi",&DWP($U2,"esp")); + &lea ("ebp",&DWP($res_x,"esp")); + &lea ("edi",&DWP($res_y,"esp")); + &call ("_ecp_nistz256_sub"); # p256_sub(res_y, U2, res_x); + + &mov ("eax",&DWP(32*15+8,"esp")); # OPENSSL_ia32cap_P copy + &lea ("esi",&DWP($Hcub,"esp")); + &lea ("ebp",&DWP($in1_y,"esp")); + &lea ("edi",&DWP($S2,"esp")); + &call ("_ecp_nistz256_mul_mont"); # p256_mul_mont(S2, Hcub, in1_y); + + &mov ("eax",&DWP(32*15+8,"esp")); # OPENSSL_ia32cap_P copy + &lea ("esi",&DWP($R,"esp")); + &lea ("ebp",&DWP($res_y,"esp")); + &lea ("edi",&DWP($res_y,"esp")); + &call ("_ecp_nistz256_mul_mont"); # p256_mul_mont(res_y, res_y, R); + + &lea ("esi",&DWP($res_y,"esp")); + &lea ("ebp",&DWP($S2,"esp")); + &lea ("edi",&DWP($res_y,"esp")); + &call ("_ecp_nistz256_sub"); # p256_sub(res_y, res_y, S2); + + &mov ("ebp",&DWP(32*15+0,"esp")); # !in1infty + &mov ("esi",&DWP(32*15+4,"esp")); # !in2infty + &mov ("edi",&wparam(0)); + &mov ("edx","ebp"); + ¬ ("ebp"); + &and ("edx","esi"); + &and ("ebp","esi"); + ¬ ("esi"); + + ######################################## + # conditional moves + for($i=64;$i<96;$i+=4) { + my $one=@ONE_mont[($i-64)/4]; + + &mov ("eax","edx"); + &and ("eax",&DWP($res_x+$i,"esp")); + &mov ("ebx","ebp") if ($one && $one!=-1); + &and ("ebx",$one) if ($one && $one!=-1); + &mov ("ecx","esi"); + &and ("ecx",&DWP($in1_x+$i,"esp")); + &or ("eax",$one==-1?"ebp":"ebx") if ($one); + &or ("eax","ecx"); + &mov (&DWP($i,"edi"),"eax"); + } + for($i=0;$i<64;$i+=4) { + &mov ("eax","edx"); + &and ("eax",&DWP($res_x+$i,"esp")); + &mov ("ebx","ebp"); + &and ("ebx",&DWP($in2_x+$i,"esp")); + 
&mov ("ecx","esi"); + &and ("ecx",&DWP($in1_x+$i,"esp")); + &or ("eax","ebx"); + &or ("eax","ecx"); + &mov (&DWP($i,"edi"),"eax"); + } + &stack_pop(8*15+3); +} &function_end("ecp_nistz256_point_add_affine"); + +&asm_finish(); + +close STDOUT; diff --git a/crypto/ec/asm/ecp_nistz256-x86_64.pl b/crypto/ec/asm/ecp_nistz256-x86_64.pl index 35d2b6d146c1..eba6ffd430be 100755 --- a/crypto/ec/asm/ecp_nistz256-x86_64.pl +++ b/crypto/ec/asm/ecp_nistz256-x86_64.pl @@ -1,53 +1,44 @@ -#!/usr/bin/env perl - -############################################################################## -# # -# Copyright 2014 Intel Corporation # -# # -# Licensed under the Apache License, Version 2.0 (the "License"); # -# you may not use this file except in compliance with the License. # -# You may obtain a copy of the License at # -# # -# http://www.apache.org/licenses/LICENSE-2.0 # -# # -# Unless required by applicable law or agreed to in writing, software # -# distributed under the License is distributed on an "AS IS" BASIS, # -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # -# See the License for the specific language governing permissions and # -# limitations under the License. # -# # -############################################################################## -# # -# Developers and authors: # -# Shay Gueron (1, 2), and Vlad Krasnov (1) # -# (1) Intel Corporation, Israel Development Center # -# (2) University of Haifa # -# Reference: # -# S.Gueron and V.Krasnov, "Fast Prime Field Elliptic Curve Cryptography with# -# 256 Bit Primes" # -# # -############################################################################## +#! /usr/bin/env perl +# Copyright 2014-2018 The OpenSSL Project Authors. All Rights Reserved. +# Copyright (c) 2014, Intel Corporation. All Rights Reserved. +# Copyright (c) 2015 CloudFlare, Inc. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html +# +# Originally written by Shay Gueron (1, 2), and Vlad Krasnov (1, 3) +# (1) Intel Corporation, Israel Development Center, Haifa, Israel +# (2) University of Haifa, Israel +# (3) CloudFlare, Inc. +# +# Reference: +# S.Gueron and V.Krasnov, "Fast Prime Field Elliptic Curve Cryptography with +# 256 Bit Primes" # Further optimization by <appro@openssl.org>: # # this/original with/without -DECP_NISTZ256_ASM(*) -# Opteron +12-49% +110-150% -# Bulldozer +14-45% +175-210% -# P4 +18-46% n/a :-( -# Westmere +12-34% +80-87% -# Sandy Bridge +9-35% +110-120% -# Ivy Bridge +9-35% +110-125% -# Haswell +8-37% +140-160% -# Broadwell +18-58% +145-210% -# Atom +15-50% +130-180% -# VIA Nano +43-160% +300-480% +# Opteron +15-49% +150-195% +# Bulldozer +18-45% +175-240% +# P4 +24-46% +100-150% +# Westmere +18-34% +87-160% +# Sandy Bridge +14-35% +120-185% +# Ivy Bridge +11-35% +125-180% +# Haswell +10-37% +160-200% +# Broadwell +24-58% +210-270% +# Atom +20-50% +180-240% +# VIA Nano +50-160% +480-480% # # (*) "without -DECP_NISTZ256_ASM" refers to build with # "enable-ec_nistp_64_gcc_128"; # # Ranges denote minimum and maximum improvement coefficients depending -# on benchmark. Lower coefficients are for ECDSA sign, relatively fastest -# server-side operation. Keep in mind that +100% means 2x improvement. +# on benchmark. 
In "this/original" column lower coefficient is for +# ECDSA sign, while in "with/without" - for ECDH key agreement, and +# higher - for ECDSA sign, relatively fastest server-side operation. +# Keep in mind that +100% means 2x improvement. $flavour = shift; $output = shift; @@ -60,7 +51,7 @@ $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or die "can't locate x86_64-xlate.pl"; -open OUT,"| \"$^X\" $xlate $flavour $output"; +open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""; *STDOUT=*OUT; if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1` @@ -108,6 +99,12 @@ $code.=<<___; .long 3,3,3,3,3,3,3,3 .LONE_mont: .quad 0x0000000000000001, 0xffffffff00000000, 0xffffffffffffffff, 0x00000000fffffffe + +# Constants for computations modulo ord(p256) +.Lord: +.quad 0xf3b9cac2fc632551, 0xbce6faada7179e84, 0xffffffffffffffff, 0xffffffff00000000 +.LordK: +.quad 0xccd1c8aaee00bc4f ___ { @@ -124,8 +121,12 @@ $code.=<<___; .type ecp_nistz256_mul_by_2,\@function,2 .align 64 ecp_nistz256_mul_by_2: +.cfi_startproc push %r12 +.cfi_push %r12 push %r13 +.cfi_push %r13 +.Lmul_by_2_body: mov 8*0($a_ptr), $a0 xor $t4,$t4 @@ -158,9 +159,15 @@ ecp_nistz256_mul_by_2: mov $a2, 8*2($r_ptr) mov $a3, 8*3($r_ptr) - pop %r13 - pop %r12 + mov 0(%rsp),%r13 +.cfi_restore %r13 + mov 8(%rsp),%r12 +.cfi_restore %r12 + lea 16(%rsp),%rsp +.cfi_adjust_cfa_offset -16 +.Lmul_by_2_epilogue: ret +.cfi_endproc .size ecp_nistz256_mul_by_2,.-ecp_nistz256_mul_by_2 ################################################################################ @@ -169,8 +176,12 @@ ecp_nistz256_mul_by_2: .type ecp_nistz256_div_by_2,\@function,2 .align 32 ecp_nistz256_div_by_2: +.cfi_startproc push %r12 +.cfi_push %r12 push %r13 +.cfi_push %r13 +.Ldiv_by_2_body: mov 8*0($a_ptr), $a0 mov 8*1($a_ptr), $a1 @@ -218,9 +229,15 @@ ecp_nistz256_div_by_2: mov $a2, 8*2($r_ptr) mov $a3, 8*3($r_ptr) - pop %r13 - pop %r12 + mov 0(%rsp),%r13 +.cfi_restore %r13 + mov 8(%rsp),%r12 +.cfi_restore %r12 + lea 16(%rsp),%rsp +.cfi_adjust_cfa_offset -16 +.Ldiv_by_2_epilogue: ret +.cfi_endproc .size ecp_nistz256_div_by_2,.-ecp_nistz256_div_by_2 ################################################################################ @@ -229,8 +246,12 @@ ecp_nistz256_div_by_2: .type ecp_nistz256_mul_by_3,\@function,2 .align 32 ecp_nistz256_mul_by_3: +.cfi_startproc push %r12 +.cfi_push %r12 push %r13 +.cfi_push %r13 +.Lmul_by_3_body: mov 8*0($a_ptr), $a0 xor $t4, $t4 @@ -284,9 +305,15 @@ ecp_nistz256_mul_by_3: mov $a2, 8*2($r_ptr) mov $a3, 8*3($r_ptr) - pop %r13 - pop %r12 + mov 0(%rsp),%r13 +.cfi_restore %r13 + mov 8(%rsp),%r12 +.cfi_restore %r12 + lea 16(%rsp),%rsp +.cfi_adjust_cfa_offset -16 +.Lmul_by_3_epilogue: ret +.cfi_endproc .size ecp_nistz256_mul_by_3,.-ecp_nistz256_mul_by_3 ################################################################################ @@ -295,8 +322,12 @@ ecp_nistz256_mul_by_3: .type ecp_nistz256_add,\@function,3 .align 32 ecp_nistz256_add: +.cfi_startproc push %r12 +.cfi_push %r12 push %r13 +.cfi_push %r13 +.Ladd_body: mov 8*0($a_ptr), $a0 xor $t4, $t4 @@ -330,9 +361,15 @@ ecp_nistz256_add: mov $a2, 8*2($r_ptr) mov $a3, 8*3($r_ptr) - pop %r13 - pop %r12 + mov 0(%rsp),%r13 +.cfi_restore %r13 + mov 8(%rsp),%r12 +.cfi_restore %r12 + lea 16(%rsp),%rsp +.cfi_adjust_cfa_offset -16 +.Ladd_epilogue: ret +.cfi_endproc .size ecp_nistz256_add,.-ecp_nistz256_add ################################################################################ @@ -341,8 +378,12 @@ ecp_nistz256_add: .type 
ecp_nistz256_sub,\@function,3 .align 32 ecp_nistz256_sub: +.cfi_startproc push %r12 +.cfi_push %r12 push %r13 +.cfi_push %r13 +.Lsub_body: mov 8*0($a_ptr), $a0 xor $t4, $t4 @@ -376,9 +417,15 @@ ecp_nistz256_sub: mov $a2, 8*2($r_ptr) mov $a3, 8*3($r_ptr) - pop %r13 - pop %r12 + mov 0(%rsp),%r13 +.cfi_restore %r13 + mov 8(%rsp),%r12 +.cfi_restore %r12 + lea 16(%rsp),%rsp +.cfi_adjust_cfa_offset -16 +.Lsub_epilogue: ret +.cfi_endproc .size ecp_nistz256_sub,.-ecp_nistz256_sub ################################################################################ @@ -387,8 +434,12 @@ ecp_nistz256_sub: .type ecp_nistz256_neg,\@function,2 .align 32 ecp_nistz256_neg: +.cfi_startproc push %r12 +.cfi_push %r12 push %r13 +.cfi_push %r13 +.Lneg_body: xor $a0, $a0 xor $a1, $a1 @@ -422,9 +473,15 @@ ecp_nistz256_neg: mov $a2, 8*2($r_ptr) mov $a3, 8*3($r_ptr) - pop %r13 - pop %r12 + mov 0(%rsp),%r13 +.cfi_restore %r13 + mov 8(%rsp),%r12 +.cfi_restore %r12 + lea 16(%rsp),%rsp +.cfi_adjust_cfa_offset -16 +.Lneg_epilogue: ret +.cfi_endproc .size ecp_nistz256_neg,.-ecp_nistz256_neg ___ } @@ -436,6 +493,1085 @@ my ($poly1,$poly3)=($acc6,$acc7); $code.=<<___; ################################################################################ +# void ecp_nistz256_ord_mul_mont( +# uint64_t res[4], +# uint64_t a[4], +# uint64_t b[4]); + +.globl ecp_nistz256_ord_mul_mont +.type ecp_nistz256_ord_mul_mont,\@function,3 +.align 32 +ecp_nistz256_ord_mul_mont: +.cfi_startproc +___ +$code.=<<___ if ($addx); + mov \$0x80100, %ecx + and OPENSSL_ia32cap_P+8(%rip), %ecx + cmp \$0x80100, %ecx + je .Lecp_nistz256_ord_mul_montx +___ +$code.=<<___; + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 +.Lord_mul_body: + + mov 8*0($b_org), %rax + mov $b_org, $b_ptr + lea .Lord(%rip), %r14 + mov .LordK(%rip), %r15 + + ################################# * b[0] + mov %rax, $t0 + mulq 8*0($a_ptr) + mov %rax, $acc0 + mov $t0, %rax + mov %rdx, $acc1 + + mulq 8*1($a_ptr) + add %rax, $acc1 + mov $t0, %rax + adc \$0, %rdx + mov %rdx, $acc2 + + mulq 8*2($a_ptr) + add %rax, $acc2 + mov $t0, %rax + adc \$0, %rdx + + mov $acc0, $acc5 + imulq %r15,$acc0 + + mov %rdx, $acc3 + mulq 8*3($a_ptr) + add %rax, $acc3 + mov $acc0, %rax + adc \$0, %rdx + mov %rdx, $acc4 + + ################################# First reduction step + mulq 8*0(%r14) + mov $acc0, $t1 + add %rax, $acc5 # guaranteed to be zero + mov $acc0, %rax + adc \$0, %rdx + mov %rdx, $t0 + + sub $acc0, $acc2 + sbb \$0, $acc0 # can't borrow + + mulq 8*1(%r14) + add $t0, $acc1 + adc \$0, %rdx + add %rax, $acc1 + mov $t1, %rax + adc %rdx, $acc2 + mov $t1, %rdx + adc \$0, $acc0 # can't overflow + + shl \$32, %rax + shr \$32, %rdx + sub %rax, $acc3 + mov 8*1($b_ptr), %rax + sbb %rdx, $t1 # can't borrow + + add $acc0, $acc3 + adc $t1, $acc4 + adc \$0, $acc5 + + ################################# * b[1] + mov %rax, $t0 + mulq 8*0($a_ptr) + add %rax, $acc1 + mov $t0, %rax + adc \$0, %rdx + mov %rdx, $t1 + + mulq 8*1($a_ptr) + add $t1, $acc2 + adc \$0, %rdx + add %rax, $acc2 + mov $t0, %rax + adc \$0, %rdx + mov %rdx, $t1 + + mulq 8*2($a_ptr) + add $t1, $acc3 + adc \$0, %rdx + add %rax, $acc3 + mov $t0, %rax + adc \$0, %rdx + + mov $acc1, $t0 + imulq %r15, $acc1 + + mov %rdx, $t1 + mulq 8*3($a_ptr) + add $t1, $acc4 + adc \$0, %rdx + xor $acc0, $acc0 + add %rax, $acc4 + mov $acc1, %rax + adc %rdx, $acc5 + adc \$0, $acc0 + + ################################# Second reduction step + mulq 8*0(%r14) + 
mov $acc1, $t1 + add %rax, $t0 # guaranteed to be zero + mov $acc1, %rax + adc %rdx, $t0 + + sub $acc1, $acc3 + sbb \$0, $acc1 # can't borrow + + mulq 8*1(%r14) + add $t0, $acc2 + adc \$0, %rdx + add %rax, $acc2 + mov $t1, %rax + adc %rdx, $acc3 + mov $t1, %rdx + adc \$0, $acc1 # can't overflow + + shl \$32, %rax + shr \$32, %rdx + sub %rax, $acc4 + mov 8*2($b_ptr), %rax + sbb %rdx, $t1 # can't borrow + + add $acc1, $acc4 + adc $t1, $acc5 + adc \$0, $acc0 + + ################################## * b[2] + mov %rax, $t0 + mulq 8*0($a_ptr) + add %rax, $acc2 + mov $t0, %rax + adc \$0, %rdx + mov %rdx, $t1 + + mulq 8*1($a_ptr) + add $t1, $acc3 + adc \$0, %rdx + add %rax, $acc3 + mov $t0, %rax + adc \$0, %rdx + mov %rdx, $t1 + + mulq 8*2($a_ptr) + add $t1, $acc4 + adc \$0, %rdx + add %rax, $acc4 + mov $t0, %rax + adc \$0, %rdx + + mov $acc2, $t0 + imulq %r15, $acc2 + + mov %rdx, $t1 + mulq 8*3($a_ptr) + add $t1, $acc5 + adc \$0, %rdx + xor $acc1, $acc1 + add %rax, $acc5 + mov $acc2, %rax + adc %rdx, $acc0 + adc \$0, $acc1 + + ################################# Third reduction step + mulq 8*0(%r14) + mov $acc2, $t1 + add %rax, $t0 # guaranteed to be zero + mov $acc2, %rax + adc %rdx, $t0 + + sub $acc2, $acc4 + sbb \$0, $acc2 # can't borrow + + mulq 8*1(%r14) + add $t0, $acc3 + adc \$0, %rdx + add %rax, $acc3 + mov $t1, %rax + adc %rdx, $acc4 + mov $t1, %rdx + adc \$0, $acc2 # can't overflow + + shl \$32, %rax + shr \$32, %rdx + sub %rax, $acc5 + mov 8*3($b_ptr), %rax + sbb %rdx, $t1 # can't borrow + + add $acc2, $acc5 + adc $t1, $acc0 + adc \$0, $acc1 + + ################################# * b[3] + mov %rax, $t0 + mulq 8*0($a_ptr) + add %rax, $acc3 + mov $t0, %rax + adc \$0, %rdx + mov %rdx, $t1 + + mulq 8*1($a_ptr) + add $t1, $acc4 + adc \$0, %rdx + add %rax, $acc4 + mov $t0, %rax + adc \$0, %rdx + mov %rdx, $t1 + + mulq 8*2($a_ptr) + add $t1, $acc5 + adc \$0, %rdx + add %rax, $acc5 + mov $t0, %rax + adc \$0, %rdx + + mov $acc3, $t0 + imulq %r15, $acc3 + + mov %rdx, $t1 + mulq 8*3($a_ptr) + add $t1, $acc0 + adc \$0, %rdx + xor $acc2, $acc2 + add %rax, $acc0 + mov $acc3, %rax + adc %rdx, $acc1 + adc \$0, $acc2 + + ################################# Last reduction step + mulq 8*0(%r14) + mov $acc3, $t1 + add %rax, $t0 # guaranteed to be zero + mov $acc3, %rax + adc %rdx, $t0 + + sub $acc3, $acc5 + sbb \$0, $acc3 # can't borrow + + mulq 8*1(%r14) + add $t0, $acc4 + adc \$0, %rdx + add %rax, $acc4 + mov $t1, %rax + adc %rdx, $acc5 + mov $t1, %rdx + adc \$0, $acc3 # can't overflow + + shl \$32, %rax + shr \$32, %rdx + sub %rax, $acc0 + sbb %rdx, $t1 # can't borrow + + add $acc3, $acc0 + adc $t1, $acc1 + adc \$0, $acc2 + + ################################# Subtract ord + mov $acc4, $a_ptr + sub 8*0(%r14), $acc4 + mov $acc5, $acc3 + sbb 8*1(%r14), $acc5 + mov $acc0, $t0 + sbb 8*2(%r14), $acc0 + mov $acc1, $t1 + sbb 8*3(%r14), $acc1 + sbb \$0, $acc2 + + cmovc $a_ptr, $acc4 + cmovc $acc3, $acc5 + cmovc $t0, $acc0 + cmovc $t1, $acc1 + + mov $acc4, 8*0($r_ptr) + mov $acc5, 8*1($r_ptr) + mov $acc0, 8*2($r_ptr) + mov $acc1, 8*3($r_ptr) + + mov 0(%rsp),%r15 +.cfi_restore %r15 + mov 8(%rsp),%r14 +.cfi_restore %r14 + mov 16(%rsp),%r13 +.cfi_restore %r13 + mov 24(%rsp),%r12 +.cfi_restore %r12 + mov 32(%rsp),%rbx +.cfi_restore %rbx + mov 40(%rsp),%rbp +.cfi_restore %rbp + lea 48(%rsp),%rsp +.cfi_adjust_cfa_offset -48 +.Lord_mul_epilogue: + ret +.cfi_endproc +.size ecp_nistz256_ord_mul_mont,.-ecp_nistz256_ord_mul_mont + +################################################################################ +# void 
ecp_nistz256_ord_sqr_mont( +# uint64_t res[4], +# uint64_t a[4], +# int rep); + +.globl ecp_nistz256_ord_sqr_mont +.type ecp_nistz256_ord_sqr_mont,\@function,3 +.align 32 +ecp_nistz256_ord_sqr_mont: +.cfi_startproc +___ +$code.=<<___ if ($addx); + mov \$0x80100, %ecx + and OPENSSL_ia32cap_P+8(%rip), %ecx + cmp \$0x80100, %ecx + je .Lecp_nistz256_ord_sqr_montx +___ +$code.=<<___; + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 +.Lord_sqr_body: + + mov 8*0($a_ptr), $acc0 + mov 8*1($a_ptr), %rax + mov 8*2($a_ptr), $acc6 + mov 8*3($a_ptr), $acc7 + lea .Lord(%rip), $a_ptr # pointer to modulus + mov $b_org, $b_ptr + jmp .Loop_ord_sqr + +.align 32 +.Loop_ord_sqr: + ################################# a[1:] * a[0] + mov %rax, $t1 # put aside a[1] + mul $acc0 # a[1] * a[0] + mov %rax, $acc1 + movq $t1, %xmm1 # offload a[1] + mov $acc6, %rax + mov %rdx, $acc2 + + mul $acc0 # a[2] * a[0] + add %rax, $acc2 + mov $acc7, %rax + movq $acc6, %xmm2 # offload a[2] + adc \$0, %rdx + mov %rdx, $acc3 + + mul $acc0 # a[3] * a[0] + add %rax, $acc3 + mov $acc7, %rax + movq $acc7, %xmm3 # offload a[3] + adc \$0, %rdx + mov %rdx, $acc4 + + ################################# a[3] * a[2] + mul $acc6 # a[3] * a[2] + mov %rax, $acc5 + mov $acc6, %rax + mov %rdx, $acc6 + + ################################# a[2:] * a[1] + mul $t1 # a[2] * a[1] + add %rax, $acc3 + mov $acc7, %rax + adc \$0, %rdx + mov %rdx, $acc7 + + mul $t1 # a[3] * a[1] + add %rax, $acc4 + adc \$0, %rdx + + add $acc7, $acc4 + adc %rdx, $acc5 + adc \$0, $acc6 # can't overflow + + ################################# *2 + xor $acc7, $acc7 + mov $acc0, %rax + add $acc1, $acc1 + adc $acc2, $acc2 + adc $acc3, $acc3 + adc $acc4, $acc4 + adc $acc5, $acc5 + adc $acc6, $acc6 + adc \$0, $acc7 + + ################################# Missing products + mul %rax # a[0] * a[0] + mov %rax, $acc0 + movq %xmm1, %rax + mov %rdx, $t1 + + mul %rax # a[1] * a[1] + add $t1, $acc1 + adc %rax, $acc2 + movq %xmm2, %rax + adc \$0, %rdx + mov %rdx, $t1 + + mul %rax # a[2] * a[2] + add $t1, $acc3 + adc %rax, $acc4 + movq %xmm3, %rax + adc \$0, %rdx + mov %rdx, $t1 + + mov $acc0, $t0 + imulq 8*4($a_ptr), $acc0 # *= .LordK + + mul %rax # a[3] * a[3] + add $t1, $acc5 + adc %rax, $acc6 + mov 8*0($a_ptr), %rax # modulus[0] + adc %rdx, $acc7 # can't overflow + + ################################# First reduction step + mul $acc0 + mov $acc0, $t1 + add %rax, $t0 # guaranteed to be zero + mov 8*1($a_ptr), %rax # modulus[1] + adc %rdx, $t0 + + sub $acc0, $acc2 + sbb \$0, $t1 # can't borrow + + mul $acc0 + add $t0, $acc1 + adc \$0, %rdx + add %rax, $acc1 + mov $acc0, %rax + adc %rdx, $acc2 + mov $acc0, %rdx + adc \$0, $t1 # can't overflow + + mov $acc1, $t0 + imulq 8*4($a_ptr), $acc1 # *= .LordK + + shl \$32, %rax + shr \$32, %rdx + sub %rax, $acc3 + mov 8*0($a_ptr), %rax + sbb %rdx, $acc0 # can't borrow + + add $t1, $acc3 + adc \$0, $acc0 # can't overflow + + ################################# Second reduction step + mul $acc1 + mov $acc1, $t1 + add %rax, $t0 # guaranteed to be zero + mov 8*1($a_ptr), %rax + adc %rdx, $t0 + + sub $acc1, $acc3 + sbb \$0, $t1 # can't borrow + + mul $acc1 + add $t0, $acc2 + adc \$0, %rdx + add %rax, $acc2 + mov $acc1, %rax + adc %rdx, $acc3 + mov $acc1, %rdx + adc \$0, $t1 # can't overflow + + mov $acc2, $t0 + imulq 8*4($a_ptr), $acc2 # *= .LordK + + shl \$32, %rax + shr \$32, %rdx + sub %rax, $acc0 + mov 8*0($a_ptr), %rax + sbb %rdx, $acc1 # can't 
borrow + + add $t1, $acc0 + adc \$0, $acc1 # can't overflow + + ################################# Third reduction step + mul $acc2 + mov $acc2, $t1 + add %rax, $t0 # guaranteed to be zero + mov 8*1($a_ptr), %rax + adc %rdx, $t0 + + sub $acc2, $acc0 + sbb \$0, $t1 # can't borrow + + mul $acc2 + add $t0, $acc3 + adc \$0, %rdx + add %rax, $acc3 + mov $acc2, %rax + adc %rdx, $acc0 + mov $acc2, %rdx + adc \$0, $t1 # can't overflow + + mov $acc3, $t0 + imulq 8*4($a_ptr), $acc3 # *= .LordK + + shl \$32, %rax + shr \$32, %rdx + sub %rax, $acc1 + mov 8*0($a_ptr), %rax + sbb %rdx, $acc2 # can't borrow + + add $t1, $acc1 + adc \$0, $acc2 # can't overflow + + ################################# Last reduction step + mul $acc3 + mov $acc3, $t1 + add %rax, $t0 # guaranteed to be zero + mov 8*1($a_ptr), %rax + adc %rdx, $t0 + + sub $acc3, $acc1 + sbb \$0, $t1 # can't borrow + + mul $acc3 + add $t0, $acc0 + adc \$0, %rdx + add %rax, $acc0 + mov $acc3, %rax + adc %rdx, $acc1 + mov $acc3, %rdx + adc \$0, $t1 # can't overflow + + shl \$32, %rax + shr \$32, %rdx + sub %rax, $acc2 + sbb %rdx, $acc3 # can't borrow + + add $t1, $acc2 + adc \$0, $acc3 # can't overflow + + ################################# Add bits [511:256] of the sqr result + xor %rdx, %rdx + add $acc4, $acc0 + adc $acc5, $acc1 + mov $acc0, $acc4 + adc $acc6, $acc2 + adc $acc7, $acc3 + mov $acc1, %rax + adc \$0, %rdx + + ################################# Compare to modulus + sub 8*0($a_ptr), $acc0 + mov $acc2, $acc6 + sbb 8*1($a_ptr), $acc1 + sbb 8*2($a_ptr), $acc2 + mov $acc3, $acc7 + sbb 8*3($a_ptr), $acc3 + sbb \$0, %rdx + + cmovc $acc4, $acc0 + cmovnc $acc1, %rax + cmovnc $acc2, $acc6 + cmovnc $acc3, $acc7 + + dec $b_ptr + jnz .Loop_ord_sqr + + mov $acc0, 8*0($r_ptr) + mov %rax, 8*1($r_ptr) + pxor %xmm1, %xmm1 + mov $acc6, 8*2($r_ptr) + pxor %xmm2, %xmm2 + mov $acc7, 8*3($r_ptr) + pxor %xmm3, %xmm3 + + mov 0(%rsp),%r15 +.cfi_restore %r15 + mov 8(%rsp),%r14 +.cfi_restore %r14 + mov 16(%rsp),%r13 +.cfi_restore %r13 + mov 24(%rsp),%r12 +.cfi_restore %r12 + mov 32(%rsp),%rbx +.cfi_restore %rbx + mov 40(%rsp),%rbp +.cfi_restore %rbp + lea 48(%rsp),%rsp +.cfi_adjust_cfa_offset -48 +.Lord_sqr_epilogue: + ret +.cfi_endproc +.size ecp_nistz256_ord_sqr_mont,.-ecp_nistz256_ord_sqr_mont +___ + +$code.=<<___ if ($addx); +################################################################################ +.type ecp_nistz256_ord_mul_montx,\@function,3 +.align 32 +ecp_nistz256_ord_mul_montx: +.cfi_startproc +.Lecp_nistz256_ord_mul_montx: + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 +.Lord_mulx_body: + + mov $b_org, $b_ptr + mov 8*0($b_org), %rdx + mov 8*0($a_ptr), $acc1 + mov 8*1($a_ptr), $acc2 + mov 8*2($a_ptr), $acc3 + mov 8*3($a_ptr), $acc4 + lea -128($a_ptr), $a_ptr # control u-op density + lea .Lord-128(%rip), %r14 + mov .LordK(%rip), %r15 + + ################################# Multiply by b[0] + mulx $acc1, $acc0, $acc1 + mulx $acc2, $t0, $acc2 + mulx $acc3, $t1, $acc3 + add $t0, $acc1 + mulx $acc4, $t0, $acc4 + mov $acc0, %rdx + mulx %r15, %rdx, %rax + adc $t1, $acc2 + adc $t0, $acc3 + adc \$0, $acc4 + + ################################# reduction + xor $acc5, $acc5 # $acc5=0, cf=0, of=0 + mulx 8*0+128(%r14), $t0, $t1 + adcx $t0, $acc0 # guaranteed to be zero + adox $t1, $acc1 + + mulx 8*1+128(%r14), $t0, $t1 + adcx $t0, $acc1 + adox $t1, $acc2 + + mulx 8*2+128(%r14), $t0, $t1 + adcx $t0, $acc2 + adox $t1, $acc3 + + mulx 8*3+128(%r14), 
$t0, $t1 + mov 8*1($b_ptr), %rdx + adcx $t0, $acc3 + adox $t1, $acc4 + adcx $acc0, $acc4 + adox $acc0, $acc5 + adc \$0, $acc5 # cf=0, of=0 + + ################################# Multiply by b[1] + mulx 8*0+128($a_ptr), $t0, $t1 + adcx $t0, $acc1 + adox $t1, $acc2 + + mulx 8*1+128($a_ptr), $t0, $t1 + adcx $t0, $acc2 + adox $t1, $acc3 + + mulx 8*2+128($a_ptr), $t0, $t1 + adcx $t0, $acc3 + adox $t1, $acc4 + + mulx 8*3+128($a_ptr), $t0, $t1 + mov $acc1, %rdx + mulx %r15, %rdx, %rax + adcx $t0, $acc4 + adox $t1, $acc5 + + adcx $acc0, $acc5 + adox $acc0, $acc0 + adc \$0, $acc0 # cf=0, of=0 + + ################################# reduction + mulx 8*0+128(%r14), $t0, $t1 + adcx $t0, $acc1 # guaranteed to be zero + adox $t1, $acc2 + + mulx 8*1+128(%r14), $t0, $t1 + adcx $t0, $acc2 + adox $t1, $acc3 + + mulx 8*2+128(%r14), $t0, $t1 + adcx $t0, $acc3 + adox $t1, $acc4 + + mulx 8*3+128(%r14), $t0, $t1 + mov 8*2($b_ptr), %rdx + adcx $t0, $acc4 + adox $t1, $acc5 + adcx $acc1, $acc5 + adox $acc1, $acc0 + adc \$0, $acc0 # cf=0, of=0 + + ################################# Multiply by b[2] + mulx 8*0+128($a_ptr), $t0, $t1 + adcx $t0, $acc2 + adox $t1, $acc3 + + mulx 8*1+128($a_ptr), $t0, $t1 + adcx $t0, $acc3 + adox $t1, $acc4 + + mulx 8*2+128($a_ptr), $t0, $t1 + adcx $t0, $acc4 + adox $t1, $acc5 + + mulx 8*3+128($a_ptr), $t0, $t1 + mov $acc2, %rdx + mulx %r15, %rdx, %rax + adcx $t0, $acc5 + adox $t1, $acc0 + + adcx $acc1, $acc0 + adox $acc1, $acc1 + adc \$0, $acc1 # cf=0, of=0 + + ################################# reduction + mulx 8*0+128(%r14), $t0, $t1 + adcx $t0, $acc2 # guaranteed to be zero + adox $t1, $acc3 + + mulx 8*1+128(%r14), $t0, $t1 + adcx $t0, $acc3 + adox $t1, $acc4 + + mulx 8*2+128(%r14), $t0, $t1 + adcx $t0, $acc4 + adox $t1, $acc5 + + mulx 8*3+128(%r14), $t0, $t1 + mov 8*3($b_ptr), %rdx + adcx $t0, $acc5 + adox $t1, $acc0 + adcx $acc2, $acc0 + adox $acc2, $acc1 + adc \$0, $acc1 # cf=0, of=0 + + ################################# Multiply by b[3] + mulx 8*0+128($a_ptr), $t0, $t1 + adcx $t0, $acc3 + adox $t1, $acc4 + + mulx 8*1+128($a_ptr), $t0, $t1 + adcx $t0, $acc4 + adox $t1, $acc5 + + mulx 8*2+128($a_ptr), $t0, $t1 + adcx $t0, $acc5 + adox $t1, $acc0 + + mulx 8*3+128($a_ptr), $t0, $t1 + mov $acc3, %rdx + mulx %r15, %rdx, %rax + adcx $t0, $acc0 + adox $t1, $acc1 + + adcx $acc2, $acc1 + adox $acc2, $acc2 + adc \$0, $acc2 # cf=0, of=0 + + ################################# reduction + mulx 8*0+128(%r14), $t0, $t1 + adcx $t0, $acc3 # guranteed to be zero + adox $t1, $acc4 + + mulx 8*1+128(%r14), $t0, $t1 + adcx $t0, $acc4 + adox $t1, $acc5 + + mulx 8*2+128(%r14), $t0, $t1 + adcx $t0, $acc5 + adox $t1, $acc0 + + mulx 8*3+128(%r14), $t0, $t1 + lea 128(%r14),%r14 + mov $acc4, $t2 + adcx $t0, $acc0 + adox $t1, $acc1 + mov $acc5, $t3 + adcx $acc3, $acc1 + adox $acc3, $acc2 + adc \$0, $acc2 + + ################################# + # Branch-less conditional subtraction of P + mov $acc0, $t0 + sub 8*0(%r14), $acc4 + sbb 8*1(%r14), $acc5 + sbb 8*2(%r14), $acc0 + mov $acc1, $t1 + sbb 8*3(%r14), $acc1 + sbb \$0, $acc2 + + cmovc $t2, $acc4 + cmovc $t3, $acc5 + cmovc $t0, $acc0 + cmovc $t1, $acc1 + + mov $acc4, 8*0($r_ptr) + mov $acc5, 8*1($r_ptr) + mov $acc0, 8*2($r_ptr) + mov $acc1, 8*3($r_ptr) + + mov 0(%rsp),%r15 +.cfi_restore %r15 + mov 8(%rsp),%r14 +.cfi_restore %r14 + mov 16(%rsp),%r13 +.cfi_restore %r13 + mov 24(%rsp),%r12 +.cfi_restore %r12 + mov 32(%rsp),%rbx +.cfi_restore %rbx + mov 40(%rsp),%rbp +.cfi_restore %rbp + lea 48(%rsp),%rsp +.cfi_adjust_cfa_offset -48 +.Lord_mulx_epilogue: + ret 
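Both the mulq implementation above and this mulx variant are plain word-wise Montgomery multiplication, specialised to n = ord(p256): for each 64-bit limb of the running product they compute m = limb * .LordK, where .LordK satisfies .LordK * ord[0] == -1 (mod 2^64), add m*ord so that the low limb cancels (the "guaranteed to be zero" comments), and shift the accumulator down one limb. A generic C sketch of one such reduction step follows. It is an illustration, not the code this assembly was generated from; it relies on the GCC/Clang unsigned __int128 extension, and where it multiplies by ord[2] and ord[3] the assembly instead exploits their special values (2^64 - 1 and 0xffffffff00000000) with the sub/sbb and shl/shr sequences.

#include <stdint.h>
#include <string.h>

/* One Montgomery reduction step modulo the P-256 group order.  acc[] is
 * the running accumulator (low limb first), ord[] holds the .Lord limbs
 * and ordK is the .LordK constant. */
static void ord_redc_step(uint64_t acc[6], const uint64_t ord[4], uint64_t ordK)
{
    uint64_t m = acc[0] * ordK;     /* chosen so acc[0] + m*ord[0] == 0 mod 2^64 */
    unsigned __int128 t = 0;

    for (int i = 0; i < 4; i++) {
        t += (unsigned __int128)m * ord[i] + acc[i];
        acc[i] = (uint64_t)t;       /* for i == 0 this is the limb that cancels */
        t >>= 64;
    }
    t += acc[4];
    acc[4] = (uint64_t)t;
    acc[5] += (uint64_t)(t >> 64);  /* carry into the spare top limb */

    memmove(acc, acc + 1, 5 * sizeof(acc[0]));  /* divide by 2^64 */
    acc[5] = 0;
}

After four such steps a single conditional subtraction of ord (the cmovc block) brings the result below the modulus. The real difference in this montx path is that mulx together with adcx/adox keeps two independent carry chains in flight, which is why the dispatcher only jumps here when the 0x80100 (ADX and BMI2) bits of OPENSSL_ia32cap_P are set.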
+.cfi_endproc +.size ecp_nistz256_ord_mul_montx,.-ecp_nistz256_ord_mul_montx + +.type ecp_nistz256_ord_sqr_montx,\@function,3 +.align 32 +ecp_nistz256_ord_sqr_montx: +.cfi_startproc +.Lecp_nistz256_ord_sqr_montx: + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 +.Lord_sqrx_body: + + mov $b_org, $b_ptr + mov 8*0($a_ptr), %rdx + mov 8*1($a_ptr), $acc6 + mov 8*2($a_ptr), $acc7 + mov 8*3($a_ptr), $acc0 + lea .Lord(%rip), $a_ptr + jmp .Loop_ord_sqrx + +.align 32 +.Loop_ord_sqrx: + mulx $acc6, $acc1, $acc2 # a[0]*a[1] + mulx $acc7, $t0, $acc3 # a[0]*a[2] + mov %rdx, %rax # offload a[0] + movq $acc6, %xmm1 # offload a[1] + mulx $acc0, $t1, $acc4 # a[0]*a[3] + mov $acc6, %rdx + add $t0, $acc2 + movq $acc7, %xmm2 # offload a[2] + adc $t1, $acc3 + adc \$0, $acc4 + xor $acc5, $acc5 # $acc5=0,cf=0,of=0 + ################################# + mulx $acc7, $t0, $t1 # a[1]*a[2] + adcx $t0, $acc3 + adox $t1, $acc4 + + mulx $acc0, $t0, $t1 # a[1]*a[3] + mov $acc7, %rdx + adcx $t0, $acc4 + adox $t1, $acc5 + adc \$0, $acc5 + ################################# + mulx $acc0, $t0, $acc6 # a[2]*a[3] + mov %rax, %rdx + movq $acc0, %xmm3 # offload a[3] + xor $acc7, $acc7 # $acc7=0,cf=0,of=0 + adcx $acc1, $acc1 # acc1:6<<1 + adox $t0, $acc5 + adcx $acc2, $acc2 + adox $acc7, $acc6 # of=0 + + ################################# a[i]*a[i] + mulx %rdx, $acc0, $t1 + movq %xmm1, %rdx + adcx $acc3, $acc3 + adox $t1, $acc1 + adcx $acc4, $acc4 + mulx %rdx, $t0, $t4 + movq %xmm2, %rdx + adcx $acc5, $acc5 + adox $t0, $acc2 + adcx $acc6, $acc6 + mulx %rdx, $t0, $t1 + .byte 0x67 + movq %xmm3, %rdx + adox $t4, $acc3 + adcx $acc7, $acc7 + adox $t0, $acc4 + adox $t1, $acc5 + mulx %rdx, $t0, $t4 + adox $t0, $acc6 + adox $t4, $acc7 + + ################################# reduction + mov $acc0, %rdx + mulx 8*4($a_ptr), %rdx, $t0 + + xor %rax, %rax # cf=0, of=0 + mulx 8*0($a_ptr), $t0, $t1 + adcx $t0, $acc0 # guaranteed to be zero + adox $t1, $acc1 + mulx 8*1($a_ptr), $t0, $t1 + adcx $t0, $acc1 + adox $t1, $acc2 + mulx 8*2($a_ptr), $t0, $t1 + adcx $t0, $acc2 + adox $t1, $acc3 + mulx 8*3($a_ptr), $t0, $t1 + adcx $t0, $acc3 + adox $t1, $acc0 # of=0 + adcx %rax, $acc0 # cf=0 + + ################################# + mov $acc1, %rdx + mulx 8*4($a_ptr), %rdx, $t0 + + mulx 8*0($a_ptr), $t0, $t1 + adox $t0, $acc1 # guaranteed to be zero + adcx $t1, $acc2 + mulx 8*1($a_ptr), $t0, $t1 + adox $t0, $acc2 + adcx $t1, $acc3 + mulx 8*2($a_ptr), $t0, $t1 + adox $t0, $acc3 + adcx $t1, $acc0 + mulx 8*3($a_ptr), $t0, $t1 + adox $t0, $acc0 + adcx $t1, $acc1 # cf=0 + adox %rax, $acc1 # of=0 + + ################################# + mov $acc2, %rdx + mulx 8*4($a_ptr), %rdx, $t0 + + mulx 8*0($a_ptr), $t0, $t1 + adcx $t0, $acc2 # guaranteed to be zero + adox $t1, $acc3 + mulx 8*1($a_ptr), $t0, $t1 + adcx $t0, $acc3 + adox $t1, $acc0 + mulx 8*2($a_ptr), $t0, $t1 + adcx $t0, $acc0 + adox $t1, $acc1 + mulx 8*3($a_ptr), $t0, $t1 + adcx $t0, $acc1 + adox $t1, $acc2 # of=0 + adcx %rax, $acc2 # cf=0 + + ################################# + mov $acc3, %rdx + mulx 8*4($a_ptr), %rdx, $t0 + + mulx 8*0($a_ptr), $t0, $t1 + adox $t0, $acc3 # guaranteed to be zero + adcx $t1, $acc0 + mulx 8*1($a_ptr), $t0, $t1 + adox $t0, $acc0 + adcx $t1, $acc1 + mulx 8*2($a_ptr), $t0, $t1 + adox $t0, $acc1 + adcx $t1, $acc2 + mulx 8*3($a_ptr), $t0, $t1 + adox $t0, $acc2 + adcx $t1, $acc3 + adox %rax, $acc3 + + ################################# accumulate upper half + add $acc0, 
$acc4 # add $acc4, $acc0 + adc $acc5, $acc1 + mov $acc4, %rdx + adc $acc6, $acc2 + adc $acc7, $acc3 + mov $acc1, $acc6 + adc \$0, %rax + + ################################# compare to modulus + sub 8*0($a_ptr), $acc4 + mov $acc2, $acc7 + sbb 8*1($a_ptr), $acc1 + sbb 8*2($a_ptr), $acc2 + mov $acc3, $acc0 + sbb 8*3($a_ptr), $acc3 + sbb \$0, %rax + + cmovnc $acc4, %rdx + cmovnc $acc1, $acc6 + cmovnc $acc2, $acc7 + cmovnc $acc3, $acc0 + + dec $b_ptr + jnz .Loop_ord_sqrx + + mov %rdx, 8*0($r_ptr) + mov $acc6, 8*1($r_ptr) + pxor %xmm1, %xmm1 + mov $acc7, 8*2($r_ptr) + pxor %xmm2, %xmm2 + mov $acc0, 8*3($r_ptr) + pxor %xmm3, %xmm3 + + mov 0(%rsp),%r15 +.cfi_restore %r15 + mov 8(%rsp),%r14 +.cfi_restore %r14 + mov 16(%rsp),%r13 +.cfi_restore %r13 + mov 24(%rsp),%r12 +.cfi_restore %r12 + mov 32(%rsp),%rbx +.cfi_restore %rbx + mov 40(%rsp),%rbp +.cfi_restore %rbp + lea 48(%rsp),%rsp +.cfi_adjust_cfa_offset -48 +.Lord_sqrx_epilogue: + ret +.cfi_endproc +.size ecp_nistz256_ord_sqr_montx,.-ecp_nistz256_ord_sqr_montx +___ + +$code.=<<___; +################################################################################ # void ecp_nistz256_to_mont( # uint64_t res[4], # uint64_t in[4]); @@ -463,6 +1599,7 @@ $code.=<<___; .type ecp_nistz256_mul_mont,\@function,3 .align 32 ecp_nistz256_mul_mont: +.cfi_startproc ___ $code.=<<___ if ($addx); mov \$0x80100, %ecx @@ -471,11 +1608,18 @@ ___ $code.=<<___; .Lmul_mont: push %rbp +.cfi_push %rbp push %rbx +.cfi_push %rbx push %r12 +.cfi_push %r12 push %r13 +.cfi_push %r13 push %r14 +.cfi_push %r14 push %r15 +.cfi_push %r15 +.Lmul_body: ___ $code.=<<___ if ($addx); cmp \$0x80100, %ecx @@ -508,13 +1652,23 @@ $code.=<<___ if ($addx); ___ $code.=<<___; .Lmul_mont_done: - pop %r15 - pop %r14 - pop %r13 - pop %r12 - pop %rbx - pop %rbp + mov 0(%rsp),%r15 +.cfi_restore %r15 + mov 8(%rsp),%r14 +.cfi_restore %r14 + mov 16(%rsp),%r13 +.cfi_restore %r13 + mov 24(%rsp),%r12 +.cfi_restore %r12 + mov 32(%rsp),%rbx +.cfi_restore %rbx + mov 40(%rsp),%rbp +.cfi_restore %rbp + lea 48(%rsp),%rsp +.cfi_adjust_cfa_offset -48 +.Lmul_epilogue: ret +.cfi_endproc .size ecp_nistz256_mul_mont,.-ecp_nistz256_mul_mont .type __ecp_nistz256_mul_montq,\@abi-omnipotent @@ -604,7 +1758,7 @@ __ecp_nistz256_mul_montq: adc \$0, $acc0 ######################################################################## - # Second reduction step + # Second reduction step mov $acc1, $t1 shl \$32, $acc1 mulq $poly3 @@ -651,7 +1805,7 @@ __ecp_nistz256_mul_montq: adc \$0, $acc1 ######################################################################## - # Third reduction step + # Third reduction step mov $acc2, $t1 shl \$32, $acc2 mulq $poly3 @@ -698,7 +1852,7 @@ __ecp_nistz256_mul_montq: adc \$0, $acc2 ######################################################################## - # Final reduction step + # Final reduction step mov $acc3, $t1 shl \$32, $acc3 mulq $poly3 @@ -711,7 +1865,7 @@ __ecp_nistz256_mul_montq: mov $acc5, $t1 adc \$0, $acc2 - ######################################################################## + ######################################################################## # Branch-less conditional subtraction of P sub \$-1, $acc4 # .Lpoly[0] mov $acc0, $t2 @@ -744,6 +1898,7 @@ __ecp_nistz256_mul_montq: .type ecp_nistz256_sqr_mont,\@function,2 .align 32 ecp_nistz256_sqr_mont: +.cfi_startproc ___ $code.=<<___ if ($addx); mov \$0x80100, %ecx @@ -751,11 +1906,18 @@ $code.=<<___ if ($addx); ___ $code.=<<___; push %rbp +.cfi_push %rbp push %rbx +.cfi_push %rbx push %r12 +.cfi_push %r12 push %r13 +.cfi_push %r13 
push %r14 +.cfi_push %r14 push %r15 +.cfi_push %r15 +.Lsqr_body: ___ $code.=<<___ if ($addx); cmp \$0x80100, %ecx @@ -784,13 +1946,23 @@ $code.=<<___ if ($addx); ___ $code.=<<___; .Lsqr_mont_done: - pop %r15 - pop %r14 - pop %r13 - pop %r12 - pop %rbx - pop %rbp + mov 0(%rsp),%r15 +.cfi_restore %r15 + mov 8(%rsp),%r14 +.cfi_restore %r14 + mov 16(%rsp),%r13 +.cfi_restore %r13 + mov 24(%rsp),%r12 +.cfi_restore %r12 + mov 32(%rsp),%rbx +.cfi_restore %rbx + mov 40(%rsp),%rbp +.cfi_restore %rbp + lea 48(%rsp),%rsp +.cfi_adjust_cfa_offset -48 +.Lsqr_epilogue: ret +.cfi_endproc .size ecp_nistz256_sqr_mont,.-ecp_nistz256_sqr_mont .type __ecp_nistz256_sqr_montq,\@abi-omnipotent @@ -1271,8 +2443,12 @@ $code.=<<___; .type ecp_nistz256_from_mont,\@function,2 .align 32 ecp_nistz256_from_mont: +.cfi_startproc push %r12 +.cfi_push %r12 push %r13 +.cfi_push %r13 +.Lfrom_body: mov 8*0($in_ptr), %rax mov .Lpoly+8*3(%rip), $t2 @@ -1353,9 +2529,15 @@ ecp_nistz256_from_mont: mov $acc2, 8*2($r_ptr) mov $acc3, 8*3($r_ptr) - pop %r13 - pop %r12 + mov 0(%rsp),%r13 +.cfi_restore %r13 + mov 8(%rsp),%r12 +.cfi_restore %r12 + lea 16(%rsp),%rsp +.cfi_adjust_cfa_offset -16 +.Lfrom_epilogue: ret +.cfi_endproc .size ecp_nistz256_from_mont,.-ecp_nistz256_from_mont ___ } @@ -1367,20 +2549,44 @@ my ($M1,$T2a,$T2b,$TMP2,$M2,$T2a,$T2b,$TMP2)=map("%xmm$_",(8..15)); $code.=<<___; ################################################################################ -# void ecp_nistz256_select_w5(uint64_t *val, uint64_t *in_t, int index); -.globl ecp_nistz256_select_w5 -.type ecp_nistz256_select_w5,\@abi-omnipotent +# void ecp_nistz256_scatter_w5(uint64_t *val, uint64_t *in_t, int index); +.globl ecp_nistz256_scatter_w5 +.type ecp_nistz256_scatter_w5,\@abi-omnipotent +.align 32 +ecp_nistz256_scatter_w5: + lea -3($index,$index,2), $index + movdqa 0x00($in_t), %xmm0 + shl \$5, $index + movdqa 0x10($in_t), %xmm1 + movdqa 0x20($in_t), %xmm2 + movdqa 0x30($in_t), %xmm3 + movdqa 0x40($in_t), %xmm4 + movdqa 0x50($in_t), %xmm5 + movdqa %xmm0, 0x00($val,$index) + movdqa %xmm1, 0x10($val,$index) + movdqa %xmm2, 0x20($val,$index) + movdqa %xmm3, 0x30($val,$index) + movdqa %xmm4, 0x40($val,$index) + movdqa %xmm5, 0x50($val,$index) + + ret +.size ecp_nistz256_scatter_w5,.-ecp_nistz256_scatter_w5 + +################################################################################ +# void ecp_nistz256_gather_w5(uint64_t *val, uint64_t *in_t, int index); +.globl ecp_nistz256_gather_w5 +.type ecp_nistz256_gather_w5,\@abi-omnipotent .align 32 -ecp_nistz256_select_w5: +ecp_nistz256_gather_w5: ___ $code.=<<___ if ($avx>1); mov OPENSSL_ia32cap_P+8(%rip), %eax test \$`1<<5`, %eax - jnz .Lavx2_select_w5 + jnz .Lavx2_gather_w5 ___ $code.=<<___ if ($win64); lea -0x88(%rsp), %rax -.LSEH_begin_ecp_nistz256_select_w5: +.LSEH_begin_ecp_nistz256_gather_w5: .byte 0x48,0x8d,0x60,0xe0 #lea -0x20(%rax), %rsp .byte 0x0f,0x29,0x70,0xe0 #movaps %xmm6, -0x20(%rax) .byte 0x0f,0x29,0x78,0xf0 #movaps %xmm7, -0x10(%rax) @@ -1457,27 +2663,46 @@ $code.=<<___ if ($win64); movaps 0x80(%rsp), %xmm14 movaps 0x90(%rsp), %xmm15 lea 0xa8(%rsp), %rsp -.LSEH_end_ecp_nistz256_select_w5: ___ $code.=<<___; ret -.size ecp_nistz256_select_w5,.-ecp_nistz256_select_w5 +.LSEH_end_ecp_nistz256_gather_w5: +.size ecp_nistz256_gather_w5,.-ecp_nistz256_gather_w5 ################################################################################ -# void ecp_nistz256_select_w7(uint64_t *val, uint64_t *in_t, int index); -.globl ecp_nistz256_select_w7 -.type ecp_nistz256_select_w7,\@abi-omnipotent +# void 
ecp_nistz256_scatter_w7(uint64_t *val, uint64_t *in_t, int index); +.globl ecp_nistz256_scatter_w7 +.type ecp_nistz256_scatter_w7,\@abi-omnipotent .align 32 -ecp_nistz256_select_w7: +ecp_nistz256_scatter_w7: + movdqu 0x00($in_t), %xmm0 + shl \$6, $index + movdqu 0x10($in_t), %xmm1 + movdqu 0x20($in_t), %xmm2 + movdqu 0x30($in_t), %xmm3 + movdqa %xmm0, 0x00($val,$index) + movdqa %xmm1, 0x10($val,$index) + movdqa %xmm2, 0x20($val,$index) + movdqa %xmm3, 0x30($val,$index) + + ret +.size ecp_nistz256_scatter_w7,.-ecp_nistz256_scatter_w7 + +################################################################################ +# void ecp_nistz256_gather_w7(uint64_t *val, uint64_t *in_t, int index); +.globl ecp_nistz256_gather_w7 +.type ecp_nistz256_gather_w7,\@abi-omnipotent +.align 32 +ecp_nistz256_gather_w7: ___ $code.=<<___ if ($avx>1); mov OPENSSL_ia32cap_P+8(%rip), %eax test \$`1<<5`, %eax - jnz .Lavx2_select_w7 + jnz .Lavx2_gather_w7 ___ $code.=<<___ if ($win64); lea -0x88(%rsp), %rax -.LSEH_begin_ecp_nistz256_select_w7: +.LSEH_begin_ecp_nistz256_gather_w7: .byte 0x48,0x8d,0x60,0xe0 #lea -0x20(%rax), %rsp .byte 0x0f,0x29,0x70,0xe0 #movaps %xmm6, -0x20(%rax) .byte 0x0f,0x29,0x78,0xf0 #movaps %xmm7, -0x10(%rax) @@ -1543,11 +2768,11 @@ $code.=<<___ if ($win64); movaps 0x80(%rsp), %xmm14 movaps 0x90(%rsp), %xmm15 lea 0xa8(%rsp), %rsp -.LSEH_end_ecp_nistz256_select_w7: ___ $code.=<<___; ret -.size ecp_nistz256_select_w7,.-ecp_nistz256_select_w7 +.LSEH_end_ecp_nistz256_gather_w7: +.size ecp_nistz256_gather_w7,.-ecp_nistz256_gather_w7 ___ } if ($avx>1) { @@ -1558,27 +2783,28 @@ my ($M1,$T1a,$T1b,$T1c,$TMP1)=map("%ymm$_",(10..14)); $code.=<<___; ################################################################################ -# void ecp_nistz256_avx2_select_w5(uint64_t *val, uint64_t *in_t, int index); -.type ecp_nistz256_avx2_select_w5,\@abi-omnipotent +# void ecp_nistz256_avx2_gather_w5(uint64_t *val, uint64_t *in_t, int index); +.type ecp_nistz256_avx2_gather_w5,\@abi-omnipotent .align 32 -ecp_nistz256_avx2_select_w5: -.Lavx2_select_w5: +ecp_nistz256_avx2_gather_w5: +.Lavx2_gather_w5: vzeroupper ___ $code.=<<___ if ($win64); lea -0x88(%rsp), %rax -.LSEH_begin_ecp_nistz256_avx2_select_w5: - .byte 0x48,0x8d,0x60,0xe0 #lea -0x20(%rax), %rsp - .byte 0xc5,0xf8,0x29,0x70,0xe0 #vmovaps %xmm6, -0x20(%rax) - .byte 0xc5,0xf8,0x29,0x78,0xf0 #vmovaps %xmm7, -0x10(%rax) - .byte 0xc5,0x78,0x29,0x40,0x00 #vmovaps %xmm8, 8(%rax) - .byte 0xc5,0x78,0x29,0x48,0x10 #vmovaps %xmm9, 0x10(%rax) - .byte 0xc5,0x78,0x29,0x50,0x20 #vmovaps %xmm10, 0x20(%rax) - .byte 0xc5,0x78,0x29,0x58,0x30 #vmovaps %xmm11, 0x30(%rax) - .byte 0xc5,0x78,0x29,0x60,0x40 #vmovaps %xmm12, 0x40(%rax) - .byte 0xc5,0x78,0x29,0x68,0x50 #vmovaps %xmm13, 0x50(%rax) - .byte 0xc5,0x78,0x29,0x70,0x60 #vmovaps %xmm14, 0x60(%rax) - .byte 0xc5,0x78,0x29,0x78,0x70 #vmovaps %xmm15, 0x70(%rax) + mov %rsp,%r11 +.LSEH_begin_ecp_nistz256_avx2_gather_w5: + .byte 0x48,0x8d,0x60,0xe0 # lea -0x20(%rax), %rsp + .byte 0xc5,0xf8,0x29,0x70,0xe0 # vmovaps %xmm6, -0x20(%rax) + .byte 0xc5,0xf8,0x29,0x78,0xf0 # vmovaps %xmm7, -0x10(%rax) + .byte 0xc5,0x78,0x29,0x40,0x00 # vmovaps %xmm8, 8(%rax) + .byte 0xc5,0x78,0x29,0x48,0x10 # vmovaps %xmm9, 0x10(%rax) + .byte 0xc5,0x78,0x29,0x50,0x20 # vmovaps %xmm10, 0x20(%rax) + .byte 0xc5,0x78,0x29,0x58,0x30 # vmovaps %xmm11, 0x30(%rax) + .byte 0xc5,0x78,0x29,0x60,0x40 # vmovaps %xmm12, 0x40(%rax) + .byte 0xc5,0x78,0x29,0x68,0x50 # vmovaps %xmm13, 0x50(%rax) + .byte 0xc5,0x78,0x29,0x70,0x60 # vmovaps %xmm14, 0x60(%rax) + .byte 
0xc5,0x78,0x29,0x78,0x70 # vmovaps %xmm15, 0x70(%rax) ___ $code.=<<___; vmovdqa .LTwo(%rip), $TWO @@ -1644,12 +2870,12 @@ $code.=<<___ if ($win64); movaps 0x70(%rsp), %xmm13 movaps 0x80(%rsp), %xmm14 movaps 0x90(%rsp), %xmm15 - lea 0xa8(%rsp), %rsp -.LSEH_end_ecp_nistz256_avx2_select_w5: + lea (%r11), %rsp ___ $code.=<<___; ret -.size ecp_nistz256_avx2_select_w5,.-ecp_nistz256_avx2_select_w5 +.LSEH_end_ecp_nistz256_avx2_gather_w5: +.size ecp_nistz256_avx2_gather_w5,.-ecp_nistz256_avx2_gather_w5 ___ } if ($avx>1) { @@ -1662,28 +2888,29 @@ my ($M2,$T2a,$T2b,$TMP2)=map("%ymm$_",(12..15)); $code.=<<___; ################################################################################ -# void ecp_nistz256_avx2_select_w7(uint64_t *val, uint64_t *in_t, int index); -.globl ecp_nistz256_avx2_select_w7 -.type ecp_nistz256_avx2_select_w7,\@abi-omnipotent +# void ecp_nistz256_avx2_gather_w7(uint64_t *val, uint64_t *in_t, int index); +.globl ecp_nistz256_avx2_gather_w7 +.type ecp_nistz256_avx2_gather_w7,\@abi-omnipotent .align 32 -ecp_nistz256_avx2_select_w7: -.Lavx2_select_w7: +ecp_nistz256_avx2_gather_w7: +.Lavx2_gather_w7: vzeroupper ___ $code.=<<___ if ($win64); + mov %rsp,%r11 lea -0x88(%rsp), %rax -.LSEH_begin_ecp_nistz256_avx2_select_w7: - .byte 0x48,0x8d,0x60,0xe0 #lea -0x20(%rax), %rsp - .byte 0xc5,0xf8,0x29,0x70,0xe0 #vmovaps %xmm6, -0x20(%rax) - .byte 0xc5,0xf8,0x29,0x78,0xf0 #vmovaps %xmm7, -0x10(%rax) - .byte 0xc5,0x78,0x29,0x40,0x00 #vmovaps %xmm8, 8(%rax) - .byte 0xc5,0x78,0x29,0x48,0x10 #vmovaps %xmm9, 0x10(%rax) - .byte 0xc5,0x78,0x29,0x50,0x20 #vmovaps %xmm10, 0x20(%rax) - .byte 0xc5,0x78,0x29,0x58,0x30 #vmovaps %xmm11, 0x30(%rax) - .byte 0xc5,0x78,0x29,0x60,0x40 #vmovaps %xmm12, 0x40(%rax) - .byte 0xc5,0x78,0x29,0x68,0x50 #vmovaps %xmm13, 0x50(%rax) - .byte 0xc5,0x78,0x29,0x70,0x60 #vmovaps %xmm14, 0x60(%rax) - .byte 0xc5,0x78,0x29,0x78,0x70 #vmovaps %xmm15, 0x70(%rax) +.LSEH_begin_ecp_nistz256_avx2_gather_w7: + .byte 0x48,0x8d,0x60,0xe0 # lea -0x20(%rax), %rsp + .byte 0xc5,0xf8,0x29,0x70,0xe0 # vmovaps %xmm6, -0x20(%rax) + .byte 0xc5,0xf8,0x29,0x78,0xf0 # vmovaps %xmm7, -0x10(%rax) + .byte 0xc5,0x78,0x29,0x40,0x00 # vmovaps %xmm8, 8(%rax) + .byte 0xc5,0x78,0x29,0x48,0x10 # vmovaps %xmm9, 0x10(%rax) + .byte 0xc5,0x78,0x29,0x50,0x20 # vmovaps %xmm10, 0x20(%rax) + .byte 0xc5,0x78,0x29,0x58,0x30 # vmovaps %xmm11, 0x30(%rax) + .byte 0xc5,0x78,0x29,0x60,0x40 # vmovaps %xmm12, 0x40(%rax) + .byte 0xc5,0x78,0x29,0x68,0x50 # vmovaps %xmm13, 0x50(%rax) + .byte 0xc5,0x78,0x29,0x70,0x60 # vmovaps %xmm14, 0x60(%rax) + .byte 0xc5,0x78,0x29,0x78,0x70 # vmovaps %xmm15, 0x70(%rax) ___ $code.=<<___; vmovdqa .LThree(%rip), $THREE @@ -1764,22 +2991,22 @@ $code.=<<___ if ($win64); movaps 0x70(%rsp), %xmm13 movaps 0x80(%rsp), %xmm14 movaps 0x90(%rsp), %xmm15 - lea 0xa8(%rsp), %rsp -.LSEH_end_ecp_nistz256_avx2_select_w7: + lea (%r11), %rsp ___ $code.=<<___; ret -.size ecp_nistz256_avx2_select_w7,.-ecp_nistz256_avx2_select_w7 +.LSEH_end_ecp_nistz256_avx2_gather_w7: +.size ecp_nistz256_avx2_gather_w7,.-ecp_nistz256_avx2_gather_w7 ___ } else { $code.=<<___; -.globl ecp_nistz256_avx2_select_w7 -.type ecp_nistz256_avx2_select_w7,\@function,3 +.globl ecp_nistz256_avx2_gather_w7 +.type ecp_nistz256_avx2_gather_w7,\@function,3 .align 32 -ecp_nistz256_avx2_select_w7: +ecp_nistz256_avx2_gather_w7: .byte 0x0f,0x0b # ud2 ret -.size ecp_nistz256_avx2_select_w7,.-ecp_nistz256_avx2_select_w7 +.size ecp_nistz256_avx2_gather_w7,.-ecp_nistz256_avx2_gather_w7 ___ } {{{ @@ -1972,6 +3199,7 @@ $code.=<<___; .type 
ecp_nistz256_point_double,\@function,2 .align 32 ecp_nistz256_point_double: +.cfi_startproc ___ $code.=<<___ if ($addx); mov \$0x80100, %ecx @@ -1988,17 +3216,26 @@ $code.=<<___; .type ecp_nistz256_point_doublex,\@function,2 .align 32 ecp_nistz256_point_doublex: +.cfi_startproc .Lpoint_doublex: ___ } $code.=<<___; push %rbp +.cfi_push %rbp push %rbx +.cfi_push %rbx push %r12 +.cfi_push %r12 push %r13 +.cfi_push %r13 push %r14 +.cfi_push %r14 push %r15 +.cfi_push %r15 sub \$32*5+8, %rsp +.cfi_adjust_cfa_offset 32*5+8 +.Lpoint_double${x}_body: .Lpoint_double_shortcut$x: movdqu 0x00($a_ptr), %xmm0 # copy *(P256_POINT *)$a_ptr.x @@ -2064,7 +3301,7 @@ $code.=<<___; movq %xmm1, $r_ptr call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(res_y, S); ___ -{ +{ ######## ecp_nistz256_div_by_2(res_y, res_y); ########################## # operate in 4-5-6-7 "name space" that matches squaring output # @@ -2153,7 +3390,7 @@ $code.=<<___; lea $M(%rsp), $b_ptr mov $acc4, $acc6 # harmonize sub output and mul input xor %ecx, %ecx - mov $acc4, $S+8*0(%rsp) # have to save:-( + mov $acc4, $S+8*0(%rsp) # have to save:-( mov $acc5, $acc2 mov $acc5, $S+8*1(%rsp) cmovz $acc0, $acc3 @@ -2169,14 +3406,25 @@ $code.=<<___; movq %xmm1, $r_ptr call __ecp_nistz256_sub_from$x # p256_sub(res_y, S, res_y); - add \$32*5+8, %rsp - pop %r15 - pop %r14 - pop %r13 - pop %r12 - pop %rbx - pop %rbp + lea 32*5+56(%rsp), %rsi +.cfi_def_cfa %rsi,8 + mov -48(%rsi),%r15 +.cfi_restore %r15 + mov -40(%rsi),%r14 +.cfi_restore %r14 + mov -32(%rsi),%r13 +.cfi_restore %r13 + mov -24(%rsi),%r12 +.cfi_restore %r12 + mov -16(%rsi),%rbx +.cfi_restore %rbx + mov -8(%rsi),%rbp +.cfi_restore %rbp + lea (%rsi),%rsp +.cfi_def_cfa_register %rsp +.Lpoint_double${x}_epilogue: ret +.cfi_endproc .size ecp_nistz256_point_double$sfx,.-ecp_nistz256_point_double$sfx ___ } @@ -2202,6 +3450,7 @@ $code.=<<___; .type ecp_nistz256_point_add,\@function,3 .align 32 ecp_nistz256_point_add: +.cfi_startproc ___ $code.=<<___ if ($addx); mov \$0x80100, %ecx @@ -2218,17 +3467,26 @@ $code.=<<___; .type ecp_nistz256_point_addx,\@function,3 .align 32 ecp_nistz256_point_addx: +.cfi_startproc .Lpoint_addx: ___ } $code.=<<___; push %rbp +.cfi_push %rbp push %rbx +.cfi_push %rbx push %r12 +.cfi_push %r12 push %r13 +.cfi_push %r13 push %r14 +.cfi_push %r14 push %r15 +.cfi_push %r15 sub \$32*18+8, %rsp +.cfi_adjust_cfa_offset 32*18+8 +.Lpoint_add${x}_body: movdqu 0x00($a_ptr), %xmm0 # copy *(P256_POINT *)$a_ptr movdqu 0x10($a_ptr), %xmm1 @@ -2537,14 +3795,25 @@ $code.=<<___; movdqu %xmm3, 0x30($r_ptr) .Ladd_done$x: - add \$32*18+8, %rsp - pop %r15 - pop %r14 - pop %r13 - pop %r12 - pop %rbx - pop %rbp + lea 32*18+56(%rsp), %rsi +.cfi_def_cfa %rsi,8 + mov -48(%rsi),%r15 +.cfi_restore %r15 + mov -40(%rsi),%r14 +.cfi_restore %r14 + mov -32(%rsi),%r13 +.cfi_restore %r13 + mov -24(%rsi),%r12 +.cfi_restore %r12 + mov -16(%rsi),%rbx +.cfi_restore %rbx + mov -8(%rsi),%rbp +.cfi_restore %rbp + lea (%rsi),%rsp +.cfi_def_cfa_register %rsp +.Lpoint_add${x}_epilogue: ret +.cfi_endproc .size ecp_nistz256_point_add$sfx,.-ecp_nistz256_point_add$sfx ___ } @@ -2569,6 +3838,7 @@ $code.=<<___; .type ecp_nistz256_point_add_affine,\@function,3 .align 32 ecp_nistz256_point_add_affine: +.cfi_startproc ___ $code.=<<___ if ($addx); mov \$0x80100, %ecx @@ -2585,17 +3855,26 @@ $code.=<<___; .type ecp_nistz256_point_add_affinex,\@function,3 .align 32 ecp_nistz256_point_add_affinex: +.cfi_startproc .Lpoint_add_affinex: ___ } $code.=<<___; push %rbp +.cfi_push %rbp push %rbx +.cfi_push %rbx push %r12 +.cfi_push %r12 
push %r13 +.cfi_push %r13 push %r14 +.cfi_push %r14 push %r15 +.cfi_push %r15 sub \$32*15+8, %rsp +.cfi_adjust_cfa_offset 32*15+8 +.Ladd_affine${x}_body: movdqu 0x00($a_ptr), %xmm0 # copy *(P256_POINT *)$a_ptr mov $b_org, $b_ptr # reassign @@ -2840,14 +4119,25 @@ $code.=<<___; movdqu %xmm2, 0x20($r_ptr) movdqu %xmm3, 0x30($r_ptr) - add \$32*15+8, %rsp - pop %r15 - pop %r14 - pop %r13 - pop %r12 - pop %rbx - pop %rbp + lea 32*15+56(%rsp), %rsi +.cfi_def_cfa %rsi,8 + mov -48(%rsi),%r15 +.cfi_restore %r15 + mov -40(%rsi),%r14 +.cfi_restore %r14 + mov -32(%rsi),%r13 +.cfi_restore %r13 + mov -24(%rsi),%r12 +.cfi_restore %r12 + mov -16(%rsi),%rbx +.cfi_restore %rbx + mov -8(%rsi),%rbp +.cfi_restore %rbp + lea (%rsi),%rsp +.cfi_def_cfa_register %rsp +.Ladd_affine${x}_epilogue: ret +.cfi_endproc .size ecp_nistz256_point_add_affine$sfx,.-ecp_nistz256_point_add_affine$sfx ___ } @@ -2998,6 +4288,420 @@ ___ } }}} +# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, +# CONTEXT *context,DISPATCHER_CONTEXT *disp) +if ($win64) { +$rec="%rcx"; +$frame="%rdx"; +$context="%r8"; +$disp="%r9"; + +$code.=<<___; +.extern __imp_RtlVirtualUnwind + +.type short_handler,\@abi-omnipotent +.align 16 +short_handler: + push %rsi + push %rdi + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 + pushfq + sub \$64,%rsp + + mov 120($context),%rax # pull context->Rax + mov 248($context),%rbx # pull context->Rip + + mov 8($disp),%rsi # disp->ImageBase + mov 56($disp),%r11 # disp->HandlerData + + mov 0(%r11),%r10d # HandlerData[0] + lea (%rsi,%r10),%r10 # end of prologue label + cmp %r10,%rbx # context->Rip<end of prologue label + jb .Lcommon_seh_tail + + mov 152($context),%rax # pull context->Rsp + + mov 4(%r11),%r10d # HandlerData[1] + lea (%rsi,%r10),%r10 # epilogue label + cmp %r10,%rbx # context->Rip>=epilogue label + jae .Lcommon_seh_tail + + lea 16(%rax),%rax + + mov -8(%rax),%r12 + mov -16(%rax),%r13 + mov %r12,216($context) # restore context->R12 + mov %r13,224($context) # restore context->R13 + + jmp .Lcommon_seh_tail +.size short_handler,.-short_handler + +.type full_handler,\@abi-omnipotent +.align 16 +full_handler: + push %rsi + push %rdi + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 + pushfq + sub \$64,%rsp + + mov 120($context),%rax # pull context->Rax + mov 248($context),%rbx # pull context->Rip + + mov 8($disp),%rsi # disp->ImageBase + mov 56($disp),%r11 # disp->HandlerData + + mov 0(%r11),%r10d # HandlerData[0] + lea (%rsi,%r10),%r10 # end of prologue label + cmp %r10,%rbx # context->Rip<end of prologue label + jb .Lcommon_seh_tail + + mov 152($context),%rax # pull context->Rsp + + mov 4(%r11),%r10d # HandlerData[1] + lea (%rsi,%r10),%r10 # epilogue label + cmp %r10,%rbx # context->Rip>=epilogue label + jae .Lcommon_seh_tail + + mov 8(%r11),%r10d # HandlerData[2] + lea (%rax,%r10),%rax + + mov -8(%rax),%rbp + mov -16(%rax),%rbx + mov -24(%rax),%r12 + mov -32(%rax),%r13 + mov -40(%rax),%r14 + mov -48(%rax),%r15 + mov %rbx,144($context) # restore context->Rbx + mov %rbp,160($context) # restore context->Rbp + mov %r12,216($context) # restore context->R12 + mov %r13,224($context) # restore context->R13 + mov %r14,232($context) # restore context->R14 + mov %r15,240($context) # restore context->R15 + +.Lcommon_seh_tail: + mov 8(%rax),%rdi + mov 16(%rax),%rsi + mov %rax,152($context) # restore context->Rsp + mov %rsi,168($context) # restore context->Rsi + mov %rdi,176($context) # restore context->Rdi + + mov 40($disp),%rdi # disp->ContextRecord + mov 
$context,%rsi # context + mov \$154,%ecx # sizeof(CONTEXT) + .long 0xa548f3fc # cld; rep movsq + + mov $disp,%rsi + xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER + mov 8(%rsi),%rdx # arg2, disp->ImageBase + mov 0(%rsi),%r8 # arg3, disp->ControlPc + mov 16(%rsi),%r9 # arg4, disp->FunctionEntry + mov 40(%rsi),%r10 # disp->ContextRecord + lea 56(%rsi),%r11 # &disp->HandlerData + lea 24(%rsi),%r12 # &disp->EstablisherFrame + mov %r10,32(%rsp) # arg5 + mov %r11,40(%rsp) # arg6 + mov %r12,48(%rsp) # arg7 + mov %rcx,56(%rsp) # arg8, (NULL) + call *__imp_RtlVirtualUnwind(%rip) + + mov \$1,%eax # ExceptionContinueSearch + add \$64,%rsp + popfq + pop %r15 + pop %r14 + pop %r13 + pop %r12 + pop %rbp + pop %rbx + pop %rdi + pop %rsi + ret +.size full_handler,.-full_handler + +.section .pdata +.align 4 + .rva .LSEH_begin_ecp_nistz256_mul_by_2 + .rva .LSEH_end_ecp_nistz256_mul_by_2 + .rva .LSEH_info_ecp_nistz256_mul_by_2 + + .rva .LSEH_begin_ecp_nistz256_div_by_2 + .rva .LSEH_end_ecp_nistz256_div_by_2 + .rva .LSEH_info_ecp_nistz256_div_by_2 + + .rva .LSEH_begin_ecp_nistz256_mul_by_3 + .rva .LSEH_end_ecp_nistz256_mul_by_3 + .rva .LSEH_info_ecp_nistz256_mul_by_3 + + .rva .LSEH_begin_ecp_nistz256_add + .rva .LSEH_end_ecp_nistz256_add + .rva .LSEH_info_ecp_nistz256_add + + .rva .LSEH_begin_ecp_nistz256_sub + .rva .LSEH_end_ecp_nistz256_sub + .rva .LSEH_info_ecp_nistz256_sub + + .rva .LSEH_begin_ecp_nistz256_neg + .rva .LSEH_end_ecp_nistz256_neg + .rva .LSEH_info_ecp_nistz256_neg + + .rva .LSEH_begin_ecp_nistz256_ord_mul_mont + .rva .LSEH_end_ecp_nistz256_ord_mul_mont + .rva .LSEH_info_ecp_nistz256_ord_mul_mont + + .rva .LSEH_begin_ecp_nistz256_ord_sqr_mont + .rva .LSEH_end_ecp_nistz256_ord_sqr_mont + .rva .LSEH_info_ecp_nistz256_ord_sqr_mont +___ +$code.=<<___ if ($addx); + .rva .LSEH_begin_ecp_nistz256_ord_mul_montx + .rva .LSEH_end_ecp_nistz256_ord_mul_montx + .rva .LSEH_info_ecp_nistz256_ord_mul_montx + + .rva .LSEH_begin_ecp_nistz256_ord_sqr_montx + .rva .LSEH_end_ecp_nistz256_ord_sqr_montx + .rva .LSEH_info_ecp_nistz256_ord_sqr_montx +___ +$code.=<<___; + .rva .LSEH_begin_ecp_nistz256_to_mont + .rva .LSEH_end_ecp_nistz256_to_mont + .rva .LSEH_info_ecp_nistz256_to_mont + + .rva .LSEH_begin_ecp_nistz256_mul_mont + .rva .LSEH_end_ecp_nistz256_mul_mont + .rva .LSEH_info_ecp_nistz256_mul_mont + + .rva .LSEH_begin_ecp_nistz256_sqr_mont + .rva .LSEH_end_ecp_nistz256_sqr_mont + .rva .LSEH_info_ecp_nistz256_sqr_mont + + .rva .LSEH_begin_ecp_nistz256_from_mont + .rva .LSEH_end_ecp_nistz256_from_mont + .rva .LSEH_info_ecp_nistz256_from_mont + + .rva .LSEH_begin_ecp_nistz256_gather_w5 + .rva .LSEH_end_ecp_nistz256_gather_w5 + .rva .LSEH_info_ecp_nistz256_gather_wX + + .rva .LSEH_begin_ecp_nistz256_gather_w7 + .rva .LSEH_end_ecp_nistz256_gather_w7 + .rva .LSEH_info_ecp_nistz256_gather_wX +___ +$code.=<<___ if ($avx>1); + .rva .LSEH_begin_ecp_nistz256_avx2_gather_w5 + .rva .LSEH_end_ecp_nistz256_avx2_gather_w5 + .rva .LSEH_info_ecp_nistz256_avx2_gather_wX + + .rva .LSEH_begin_ecp_nistz256_avx2_gather_w7 + .rva .LSEH_end_ecp_nistz256_avx2_gather_w7 + .rva .LSEH_info_ecp_nistz256_avx2_gather_wX +___ +$code.=<<___; + .rva .LSEH_begin_ecp_nistz256_point_double + .rva .LSEH_end_ecp_nistz256_point_double + .rva .LSEH_info_ecp_nistz256_point_double + + .rva .LSEH_begin_ecp_nistz256_point_add + .rva .LSEH_end_ecp_nistz256_point_add + .rva .LSEH_info_ecp_nistz256_point_add + + .rva .LSEH_begin_ecp_nistz256_point_add_affine + .rva .LSEH_end_ecp_nistz256_point_add_affine + .rva .LSEH_info_ecp_nistz256_point_add_affine +___ 
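The SEH machinery added above follows a fixed convention: each .xdata record names short_handler or full_handler and carries a HandlerData[] array with the RVAs of the function's body and epilogue labels plus, for full_handler, the distance from Rsp to the slot just above the six saved non-volatile registers (e.g. .long 48 for ecp_nistz256_ord_mul_mont, .long 32*5+56 for ecp_nistz256_point_double). The handlers use those values to decide whether the faulting Rip lies inside the body and, if so, where to recover rbp/rbx/r12-r15 from. A minimal C sketch of that layout and check, with hypothetical names (HANDLER_DATA, in_body) that are not part of OpenSSL or the Windows SDK:

#include <stdint.h>

/*
 * Illustrative layout of the HandlerData[] words emitted in the .xdata
 * records above; the real consumers are short_handler/full_handler.
 */
typedef struct {
    uint32_t body_rva;      /* RVA of .L*_body, i.e. end of prologue       */
    uint32_t epilogue_rva;  /* RVA of .L*_epilogue, start of epilogue      */
    uint32_t frame_size;    /* full_handler only: bytes from Rsp up to the */
                            /* slot just above the six pushed registers    */
} HANDLER_DATA;

/*
 * Non-zero when the faulting Rip lies between prologue and epilogue, i.e.
 * when full_handler must reload rbp/rbx/r12-r15 from
 * (Rsp + frame_size) - 8*1 .. - 8*6 before resuming the unwind.
 */
static int in_body(uint64_t rip, uint64_t image_base, const HANDLER_DATA *hd)
{
    return rip >= image_base + hd->body_rva &&
           rip <  image_base + hd->epilogue_rva;
}

Outside that range the handlers fall through to .Lcommon_seh_tail with the context's own Rsp, since no callee-saved registers have been spilled yet (prologue) or they have already been restored (epilogue).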
+$code.=<<___ if ($addx); + .rva .LSEH_begin_ecp_nistz256_point_doublex + .rva .LSEH_end_ecp_nistz256_point_doublex + .rva .LSEH_info_ecp_nistz256_point_doublex + + .rva .LSEH_begin_ecp_nistz256_point_addx + .rva .LSEH_end_ecp_nistz256_point_addx + .rva .LSEH_info_ecp_nistz256_point_addx + + .rva .LSEH_begin_ecp_nistz256_point_add_affinex + .rva .LSEH_end_ecp_nistz256_point_add_affinex + .rva .LSEH_info_ecp_nistz256_point_add_affinex +___ +$code.=<<___; + +.section .xdata +.align 8 +.LSEH_info_ecp_nistz256_mul_by_2: + .byte 9,0,0,0 + .rva short_handler + .rva .Lmul_by_2_body,.Lmul_by_2_epilogue # HandlerData[] +.LSEH_info_ecp_nistz256_div_by_2: + .byte 9,0,0,0 + .rva short_handler + .rva .Ldiv_by_2_body,.Ldiv_by_2_epilogue # HandlerData[] +.LSEH_info_ecp_nistz256_mul_by_3: + .byte 9,0,0,0 + .rva short_handler + .rva .Lmul_by_3_body,.Lmul_by_3_epilogue # HandlerData[] +.LSEH_info_ecp_nistz256_add: + .byte 9,0,0,0 + .rva short_handler + .rva .Ladd_body,.Ladd_epilogue # HandlerData[] +.LSEH_info_ecp_nistz256_sub: + .byte 9,0,0,0 + .rva short_handler + .rva .Lsub_body,.Lsub_epilogue # HandlerData[] +.LSEH_info_ecp_nistz256_neg: + .byte 9,0,0,0 + .rva short_handler + .rva .Lneg_body,.Lneg_epilogue # HandlerData[] +.LSEH_info_ecp_nistz256_ord_mul_mont: + .byte 9,0,0,0 + .rva full_handler + .rva .Lord_mul_body,.Lord_mul_epilogue # HandlerData[] + .long 48,0 +.LSEH_info_ecp_nistz256_ord_sqr_mont: + .byte 9,0,0,0 + .rva full_handler + .rva .Lord_sqr_body,.Lord_sqr_epilogue # HandlerData[] + .long 48,0 +___ +$code.=<<___ if ($addx); +.LSEH_info_ecp_nistz256_ord_mul_montx: + .byte 9,0,0,0 + .rva full_handler + .rva .Lord_mulx_body,.Lord_mulx_epilogue # HandlerData[] + .long 48,0 +.LSEH_info_ecp_nistz256_ord_sqr_montx: + .byte 9,0,0,0 + .rva full_handler + .rva .Lord_sqrx_body,.Lord_sqrx_epilogue # HandlerData[] + .long 48,0 +___ +$code.=<<___; +.LSEH_info_ecp_nistz256_to_mont: + .byte 9,0,0,0 + .rva full_handler + .rva .Lmul_body,.Lmul_epilogue # HandlerData[] + .long 48,0 +.LSEH_info_ecp_nistz256_mul_mont: + .byte 9,0,0,0 + .rva full_handler + .rva .Lmul_body,.Lmul_epilogue # HandlerData[] + .long 48,0 +.LSEH_info_ecp_nistz256_sqr_mont: + .byte 9,0,0,0 + .rva full_handler + .rva .Lsqr_body,.Lsqr_epilogue # HandlerData[] + .long 48,0 +.LSEH_info_ecp_nistz256_from_mont: + .byte 9,0,0,0 + .rva short_handler + .rva .Lfrom_body,.Lfrom_epilogue # HandlerData[] +.LSEH_info_ecp_nistz256_gather_wX: + .byte 0x01,0x33,0x16,0x00 + .byte 0x33,0xf8,0x09,0x00 #movaps 0x90(rsp),xmm15 + .byte 0x2e,0xe8,0x08,0x00 #movaps 0x80(rsp),xmm14 + .byte 0x29,0xd8,0x07,0x00 #movaps 0x70(rsp),xmm13 + .byte 0x24,0xc8,0x06,0x00 #movaps 0x60(rsp),xmm12 + .byte 0x1f,0xb8,0x05,0x00 #movaps 0x50(rsp),xmm11 + .byte 0x1a,0xa8,0x04,0x00 #movaps 0x40(rsp),xmm10 + .byte 0x15,0x98,0x03,0x00 #movaps 0x30(rsp),xmm9 + .byte 0x10,0x88,0x02,0x00 #movaps 0x20(rsp),xmm8 + .byte 0x0c,0x78,0x01,0x00 #movaps 0x10(rsp),xmm7 + .byte 0x08,0x68,0x00,0x00 #movaps 0x00(rsp),xmm6 + .byte 0x04,0x01,0x15,0x00 #sub rsp,0xa8 + .align 8 +___ +$code.=<<___ if ($avx>1); +.LSEH_info_ecp_nistz256_avx2_gather_wX: + .byte 0x01,0x36,0x17,0x0b + .byte 0x36,0xf8,0x09,0x00 # vmovaps 0x90(rsp),xmm15 + .byte 0x31,0xe8,0x08,0x00 # vmovaps 0x80(rsp),xmm14 + .byte 0x2c,0xd8,0x07,0x00 # vmovaps 0x70(rsp),xmm13 + .byte 0x27,0xc8,0x06,0x00 # vmovaps 0x60(rsp),xmm12 + .byte 0x22,0xb8,0x05,0x00 # vmovaps 0x50(rsp),xmm11 + .byte 0x1d,0xa8,0x04,0x00 # vmovaps 0x40(rsp),xmm10 + .byte 0x18,0x98,0x03,0x00 # vmovaps 0x30(rsp),xmm9 + .byte 0x13,0x88,0x02,0x00 # vmovaps 0x20(rsp),xmm8 + 
.byte 0x0e,0x78,0x01,0x00 # vmovaps 0x10(rsp),xmm7 + .byte 0x09,0x68,0x00,0x00 # vmovaps 0x00(rsp),xmm6 + .byte 0x04,0x01,0x15,0x00 # sub rsp,0xa8 + .byte 0x00,0xb3,0x00,0x00 # set_frame r11 + .align 8 +___ +$code.=<<___; +.LSEH_info_ecp_nistz256_point_double: + .byte 9,0,0,0 + .rva full_handler + .rva .Lpoint_doubleq_body,.Lpoint_doubleq_epilogue # HandlerData[] + .long 32*5+56,0 +.LSEH_info_ecp_nistz256_point_add: + .byte 9,0,0,0 + .rva full_handler + .rva .Lpoint_addq_body,.Lpoint_addq_epilogue # HandlerData[] + .long 32*18+56,0 +.LSEH_info_ecp_nistz256_point_add_affine: + .byte 9,0,0,0 + .rva full_handler + .rva .Ladd_affineq_body,.Ladd_affineq_epilogue # HandlerData[] + .long 32*15+56,0 +___ +$code.=<<___ if ($addx); +.align 8 +.LSEH_info_ecp_nistz256_point_doublex: + .byte 9,0,0,0 + .rva full_handler + .rva .Lpoint_doublex_body,.Lpoint_doublex_epilogue # HandlerData[] + .long 32*5+56,0 +.LSEH_info_ecp_nistz256_point_addx: + .byte 9,0,0,0 + .rva full_handler + .rva .Lpoint_addx_body,.Lpoint_addx_epilogue # HandlerData[] + .long 32*18+56,0 +.LSEH_info_ecp_nistz256_point_add_affinex: + .byte 9,0,0,0 + .rva full_handler + .rva .Ladd_affinex_body,.Ladd_affinex_epilogue # HandlerData[] + .long 32*15+56,0 +___ +} + +######################################################################## +# Convert ecp_nistz256_table.c to layout expected by ecp_nistz_gather_w7 +# +open TABLE,"<ecp_nistz256_table.c" or +open TABLE,"<${dir}../ecp_nistz256_table.c" or +die "failed to open ecp_nistz256_table.c:",$!; + +use integer; + +foreach(<TABLE>) { + s/TOBN\(\s*(0x[0-9a-f]+),\s*(0x[0-9a-f]+)\s*\)/push @arr,hex($2),hex($1)/geo; +} +close TABLE; + +die "insane number of elements" if ($#arr != 64*16*37-1); + +print <<___; +.text +.globl ecp_nistz256_precomputed +.type ecp_nistz256_precomputed,\@object +.align 4096 +ecp_nistz256_precomputed: +___ +while (@line=splice(@arr,0,16)) { + print ".long\t",join(',',map { sprintf "0x%08x",$_} @line),"\n"; +} +print <<___; +.size ecp_nistz256_precomputed,.-ecp_nistz256_precomputed +___ + $code =~ s/\`([^\`]*)\`/eval $1/gem; print $code; close STDOUT; diff --git a/crypto/ec/asm/x25519-ppc64.pl b/crypto/ec/asm/x25519-ppc64.pl new file mode 100755 index 000000000000..3773cb27cd65 --- /dev/null +++ b/crypto/ec/asm/x25519-ppc64.pl @@ -0,0 +1,824 @@ +#! /usr/bin/env perl +# Copyright 2018 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html +# +# ==================================================================== +# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== +# +# X25519 lower-level primitives for PPC64. +# +# July 2018. +# +# Base 2^64 is faster than base 2^51 on pre-POWER8, most notably ~15% +# faster on PPC970/G5. POWER8 on the other hand seems to trip on own +# shoelaces when handling longer carry chains. As base 2^51 has just +# single-carry pairs, it's 25% faster than base 2^64. Since PPC970 is +# pretty old, base 2^64 implementation is not engaged. 
Comparison to +# compiler-generated code is complicated by the fact that not all +# compilers support 128-bit integers. When compiler doesn't, like xlc, +# this module delivers more than 2x improvement, and when it does, +# from 12% to 30% improvement was measured... + +$flavour = shift; +while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or +( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or +die "can't locate ppc-xlate.pl"; + +open OUT,"| \"$^X\" $xlate $flavour $output"; +*STDOUT=*OUT; + +my $sp = "r1"; +my ($rp,$ap,$bp) = map("r$_",3..5); + +####################################################### base 2^64 +if (0) { +my ($bi,$a0,$a1,$a2,$a3,$t0,$t1, $t2,$t3, + $acc0,$acc1,$acc2,$acc3,$acc4,$acc5,$acc6,$acc7) = + map("r$_",(6..12,22..31)); +my $zero = "r0"; +my $FRAME = 16*8; + +$code.=<<___; +.text + +.globl x25519_fe64_mul +.type x25519_fe64_mul,\@function +.align 5 +x25519_fe64_mul: + stdu $sp,-$FRAME($sp) + std r22,`$FRAME-8*10`($sp) + std r23,`$FRAME-8*9`($sp) + std r24,`$FRAME-8*8`($sp) + std r25,`$FRAME-8*7`($sp) + std r26,`$FRAME-8*6`($sp) + std r27,`$FRAME-8*5`($sp) + std r28,`$FRAME-8*4`($sp) + std r29,`$FRAME-8*3`($sp) + std r30,`$FRAME-8*2`($sp) + std r31,`$FRAME-8*1`($sp) + + ld $bi,0($bp) + ld $a0,0($ap) + xor $zero,$zero,$zero + ld $a1,8($ap) + ld $a2,16($ap) + ld $a3,24($ap) + + mulld $acc0,$a0,$bi # a[0]*b[0] + mulhdu $t0,$a0,$bi + mulld $acc1,$a1,$bi # a[1]*b[0] + mulhdu $t1,$a1,$bi + mulld $acc2,$a2,$bi # a[2]*b[0] + mulhdu $t2,$a2,$bi + mulld $acc3,$a3,$bi # a[3]*b[0] + mulhdu $t3,$a3,$bi +___ +for(my @acc=($acc0,$acc1,$acc2,$acc3,$acc4,$acc5,$acc6,$acc7), + my $i=1; $i<4; shift(@acc), $i++) { +my $acc4 = $i==1? $zero : @acc[4]; + +$code.=<<___; + ld $bi,`8*$i`($bp) + addc @acc[1],@acc[1],$t0 # accumulate high parts + mulld $t0,$a0,$bi + adde @acc[2],@acc[2],$t1 + mulld $t1,$a1,$bi + adde @acc[3],@acc[3],$t2 + mulld $t2,$a2,$bi + adde @acc[4],$acc4,$t3 + mulld $t3,$a3,$bi + addc @acc[1],@acc[1],$t0 # accumulate low parts + mulhdu $t0,$a0,$bi + adde @acc[2],@acc[2],$t1 + mulhdu $t1,$a1,$bi + adde @acc[3],@acc[3],$t2 + mulhdu $t2,$a2,$bi + adde @acc[4],@acc[4],$t3 + mulhdu $t3,$a3,$bi + adde @acc[5],$zero,$zero +___ +} +$code.=<<___; + li $bi,38 + addc $acc4,$acc4,$t0 + mulld $t0,$acc4,$bi + adde $acc5,$acc5,$t1 + mulld $t1,$acc5,$bi + adde $acc6,$acc6,$t2 + mulld $t2,$acc6,$bi + adde $acc7,$acc7,$t3 + mulld $t3,$acc7,$bi + + addc $acc0,$acc0,$t0 + mulhdu $t0,$acc4,$bi + adde $acc1,$acc1,$t1 + mulhdu $t1,$acc5,$bi + adde $acc2,$acc2,$t2 + mulhdu $t2,$acc6,$bi + adde $acc3,$acc3,$t3 + mulhdu $t3,$acc7,$bi + adde $acc4,$zero,$zero + + addc $acc1,$acc1,$t0 + adde $acc2,$acc2,$t1 + adde $acc3,$acc3,$t2 + adde $acc4,$acc4,$t3 + + mulld $acc4,$acc4,$bi + + addc $acc0,$acc0,$acc4 + addze $acc1,$acc1 + addze $acc2,$acc2 + addze $acc3,$acc3 + + subfe $acc4,$acc4,$acc4 # carry -> ~mask + std $acc1,8($rp) + andc $acc4,$bi,$acc4 + std $acc2,16($rp) + add $acc0,$acc0,$acc4 + std $acc3,24($rp) + std $acc0,0($rp) + + ld r22,`$FRAME-8*10`($sp) + ld r23,`$FRAME-8*9`($sp) + ld r24,`$FRAME-8*8`($sp) + ld r25,`$FRAME-8*7`($sp) + ld r26,`$FRAME-8*6`($sp) + ld r27,`$FRAME-8*5`($sp) + ld r28,`$FRAME-8*4`($sp) + ld r29,`$FRAME-8*3`($sp) + ld r30,`$FRAME-8*2`($sp) + ld r31,`$FRAME-8*1`($sp) + addi $sp,$sp,$FRAME + blr + .long 0 + .byte 0,12,4,0,0x80,10,3,0 + .long 0 +.size x25519_fe64_mul,.-x25519_fe64_mul + +.globl x25519_fe64_sqr +.type x25519_fe64_sqr,\@function +.align 5 
+x25519_fe64_sqr: + stdu $sp,-$FRAME($sp) + std r22,`$FRAME-8*10`($sp) + std r23,`$FRAME-8*9`($sp) + std r24,`$FRAME-8*8`($sp) + std r25,`$FRAME-8*7`($sp) + std r26,`$FRAME-8*6`($sp) + std r27,`$FRAME-8*5`($sp) + std r28,`$FRAME-8*4`($sp) + std r29,`$FRAME-8*3`($sp) + std r30,`$FRAME-8*2`($sp) + std r31,`$FRAME-8*1`($sp) + + ld $a0,0($ap) + xor $zero,$zero,$zero + ld $a1,8($ap) + ld $a2,16($ap) + ld $a3,24($ap) + + ################################ + # | | | | | |a1*a0| | + # | | | | |a2*a0| | | + # | |a3*a2|a3*a0| | | | + # | | | |a2*a1| | | | + # | | |a3*a1| | | | | + # *| | | | | | | | 2| + # +|a3*a3|a2*a2|a1*a1|a0*a0| + # |--+--+--+--+--+--+--+--| + # |A7|A6|A5|A4|A3|A2|A1|A0|, where Ax is $accx, i.e. follow $accx + # + # "can't overflow" below mark carrying into high part of + # multiplication result, which can't overflow, because it + # can never be all ones. + + mulld $acc1,$a1,$a0 # a[1]*a[0] + mulhdu $t1,$a1,$a0 + mulld $acc2,$a2,$a0 # a[2]*a[0] + mulhdu $t2,$a2,$a0 + mulld $acc3,$a3,$a0 # a[3]*a[0] + mulhdu $acc4,$a3,$a0 + + addc $acc2,$acc2,$t1 # accumulate high parts of multiplication + mulld $t0,$a2,$a1 # a[2]*a[1] + mulhdu $t1,$a2,$a1 + adde $acc3,$acc3,$t2 + mulld $t2,$a3,$a1 # a[3]*a[1] + mulhdu $t3,$a3,$a1 + addze $acc4,$acc4 # can't overflow + + mulld $acc5,$a3,$a2 # a[3]*a[2] + mulhdu $acc6,$a3,$a2 + + addc $t1,$t1,$t2 # accumulate high parts of multiplication + mulld $acc0,$a0,$a0 # a[0]*a[0] + addze $t2,$t3 # can't overflow + + addc $acc3,$acc3,$t0 # accumulate low parts of multiplication + mulhdu $a0,$a0,$a0 + adde $acc4,$acc4,$t1 + mulld $t1,$a1,$a1 # a[1]*a[1] + adde $acc5,$acc5,$t2 + mulhdu $a1,$a1,$a1 + addze $acc6,$acc6 # can't overflow + + addc $acc1,$acc1,$acc1 # acc[1-6]*=2 + mulld $t2,$a2,$a2 # a[2]*a[2] + adde $acc2,$acc2,$acc2 + mulhdu $a2,$a2,$a2 + adde $acc3,$acc3,$acc3 + mulld $t3,$a3,$a3 # a[3]*a[3] + adde $acc4,$acc4,$acc4 + mulhdu $a3,$a3,$a3 + adde $acc5,$acc5,$acc5 + adde $acc6,$acc6,$acc6 + addze $acc7,$zero + + addc $acc1,$acc1,$a0 # +a[i]*a[i] + li $bi,38 + adde $acc2,$acc2,$t1 + adde $acc3,$acc3,$a1 + adde $acc4,$acc4,$t2 + adde $acc5,$acc5,$a2 + adde $acc6,$acc6,$t3 + adde $acc7,$acc7,$a3 + + mulld $t0,$acc4,$bi + mulld $t1,$acc5,$bi + mulld $t2,$acc6,$bi + mulld $t3,$acc7,$bi + + addc $acc0,$acc0,$t0 + mulhdu $t0,$acc4,$bi + adde $acc1,$acc1,$t1 + mulhdu $t1,$acc5,$bi + adde $acc2,$acc2,$t2 + mulhdu $t2,$acc6,$bi + adde $acc3,$acc3,$t3 + mulhdu $t3,$acc7,$bi + addze $acc4,$zero + + addc $acc1,$acc1,$t0 + adde $acc2,$acc2,$t1 + adde $acc3,$acc3,$t2 + adde $acc4,$acc4,$t3 + + mulld $acc4,$acc4,$bi + + addc $acc0,$acc0,$acc4 + addze $acc1,$acc1 + addze $acc2,$acc2 + addze $acc3,$acc3 + + subfe $acc4,$acc4,$acc4 # carry -> ~mask + std $acc1,8($rp) + andc $acc4,$bi,$acc4 + std $acc2,16($rp) + add $acc0,$acc0,$acc4 + std $acc3,24($rp) + std $acc0,0($rp) + + ld r22,`$FRAME-8*10`($sp) + ld r23,`$FRAME-8*9`($sp) + ld r24,`$FRAME-8*8`($sp) + ld r25,`$FRAME-8*7`($sp) + ld r26,`$FRAME-8*6`($sp) + ld r27,`$FRAME-8*5`($sp) + ld r28,`$FRAME-8*4`($sp) + ld r29,`$FRAME-8*3`($sp) + ld r30,`$FRAME-8*2`($sp) + ld r31,`$FRAME-8*1`($sp) + addi $sp,$sp,$FRAME + blr + .long 0 + .byte 0,12,4,0,0x80,10,2,0 + .long 0 +.size x25519_fe64_sqr,.-x25519_fe64_sqr + +.globl x25519_fe64_mul121666 +.type x25519_fe64_mul121666,\@function +.align 5 +x25519_fe64_mul121666: + lis $bi,`65536>>16` + ori $bi,$bi,`121666-65536` + + ld $t0,0($ap) + ld $t1,8($ap) + ld $bp,16($ap) + ld $ap,24($ap) + + mulld $a0,$t0,$bi + mulhdu $t0,$t0,$bi + mulld $a1,$t1,$bi + mulhdu $t1,$t1,$bi + mulld 
$a2,$bp,$bi + mulhdu $bp,$bp,$bi + mulld $a3,$ap,$bi + mulhdu $ap,$ap,$bi + + addc $a1,$a1,$t0 + adde $a2,$a2,$t1 + adde $a3,$a3,$bp + addze $ap, $ap + + mulli $ap,$ap,38 + + addc $a0,$a0,$ap + addze $a1,$a1 + addze $a2,$a2 + addze $a3,$a3 + + subfe $t1,$t1,$t1 # carry -> ~mask + std $a1,8($rp) + andc $t0,$t0,$t1 + std $a2,16($rp) + add $a0,$a0,$t0 + std $a3,24($rp) + std $a0,0($rp) + + blr + .long 0 + .byte 0,12,0x14,0,0,0,2,0 + .long 0 +.size x25519_fe64_mul121666,.-x25519_fe64_mul121666 + +.globl x25519_fe64_add +.type x25519_fe64_add,\@function +.align 5 +x25519_fe64_add: + ld $a0,0($ap) + ld $t0,0($bp) + ld $a1,8($ap) + ld $t1,8($bp) + ld $a2,16($ap) + ld $bi,16($bp) + ld $a3,24($ap) + ld $bp,24($bp) + + addc $a0,$a0,$t0 + adde $a1,$a1,$t1 + adde $a2,$a2,$bi + adde $a3,$a3,$bp + + li $t0,38 + subfe $t1,$t1,$t1 # carry -> ~mask + andc $t1,$t0,$t1 + + addc $a0,$a0,$t1 + addze $a1,$a1 + addze $a2,$a2 + addze $a3,$a3 + + subfe $t1,$t1,$t1 # carry -> ~mask + std $a1,8($rp) + andc $t0,$t0,$t1 + std $a2,16($rp) + add $a0,$a0,$t0 + std $a3,24($rp) + std $a0,0($rp) + + blr + .long 0 + .byte 0,12,0x14,0,0,0,3,0 + .long 0 +.size x25519_fe64_add,.-x25519_fe64_add + +.globl x25519_fe64_sub +.type x25519_fe64_sub,\@function +.align 5 +x25519_fe64_sub: + ld $a0,0($ap) + ld $t0,0($bp) + ld $a1,8($ap) + ld $t1,8($bp) + ld $a2,16($ap) + ld $bi,16($bp) + ld $a3,24($ap) + ld $bp,24($bp) + + subfc $a0,$t0,$a0 + subfe $a1,$t1,$a1 + subfe $a2,$bi,$a2 + subfe $a3,$bp,$a3 + + li $t0,38 + subfe $t1,$t1,$t1 # borrow -> mask + xor $zero,$zero,$zero + and $t1,$t0,$t1 + + subfc $a0,$t1,$a0 + subfe $a1,$zero,$a1 + subfe $a2,$zero,$a2 + subfe $a3,$zero,$a3 + + subfe $t1,$t1,$t1 # borrow -> mask + std $a1,8($rp) + and $t0,$t0,$t1 + std $a2,16($rp) + subf $a0,$t0,$a0 + std $a3,24($rp) + std $a0,0($rp) + + blr + .long 0 + .byte 0,12,0x14,0,0,0,3,0 + .long 0 +.size x25519_fe64_sub,.-x25519_fe64_sub + +.globl x25519_fe64_tobytes +.type x25519_fe64_tobytes,\@function +.align 5 +x25519_fe64_tobytes: + ld $a3,24($ap) + ld $a0,0($ap) + ld $a1,8($ap) + ld $a2,16($ap) + + sradi $t0,$a3,63 # most significant bit -> mask + li $t1,19 + and $t0,$t0,$t1 + sldi $a3,$a3,1 + add $t0,$t0,$t1 # compare to modulus in the same go + srdi $a3,$a3,1 # most signifcant bit cleared + + addc $a0,$a0,$t0 + addze $a1,$a1 + addze $a2,$a2 + addze $a3,$a3 + + xor $zero,$zero,$zero + sradi $t0,$a3,63 # most significant bit -> mask + sldi $a3,$a3,1 + andc $t0,$t1,$t0 + srdi $a3,$a3,1 # most signifcant bit cleared + + subi $rp,$rp,1 + subfc $a0,$t0,$a0 + subfe $a1,$zero,$a1 + subfe $a2,$zero,$a2 + subfe $a3,$zero,$a3 + +___ +for (my @a=($a0,$a1,$a2,$a3), my $i=0; $i<4; shift(@a), $i++) { +$code.=<<___; + srdi $t0,@a[0],8 + stbu @a[0],1($rp) + srdi @a[0],@a[0],16 + stbu $t0,1($rp) + srdi $t0,@a[0],8 + stbu @a[0],1($rp) + srdi @a[0],@a[0],16 + stbu $t0,1($rp) + srdi $t0,@a[0],8 + stbu @a[0],1($rp) + srdi @a[0],@a[0],16 + stbu $t0,1($rp) + srdi $t0,@a[0],8 + stbu @a[0],1($rp) + stbu $t0,1($rp) +___ +} +$code.=<<___; + blr + .long 0 + .byte 0,12,0x14,0,0,0,2,0 + .long 0 +.size x25519_fe64_tobytes,.-x25519_fe64_tobytes +___ +} +####################################################### base 2^51 +{ +my ($bi,$a0,$a1,$a2,$a3,$a4,$t0, $t1, + $h0lo,$h0hi,$h1lo,$h1hi,$h2lo,$h2hi,$h3lo,$h3hi,$h4lo,$h4hi) = + map("r$_",(6..12,21..31)); +my $mask = "r0"; +my $FRAME = 18*8; + +$code.=<<___; +.text + +.globl x25519_fe51_mul +.type x25519_fe51_mul,\@function +.align 5 +x25519_fe51_mul: + stdu $sp,-$FRAME($sp) + std r21,`$FRAME-8*11`($sp) + std r22,`$FRAME-8*10`($sp) + std 
r23,`$FRAME-8*9`($sp) + std r24,`$FRAME-8*8`($sp) + std r25,`$FRAME-8*7`($sp) + std r26,`$FRAME-8*6`($sp) + std r27,`$FRAME-8*5`($sp) + std r28,`$FRAME-8*4`($sp) + std r29,`$FRAME-8*3`($sp) + std r30,`$FRAME-8*2`($sp) + std r31,`$FRAME-8*1`($sp) + + ld $bi,0($bp) + ld $a0,0($ap) + ld $a1,8($ap) + ld $a2,16($ap) + ld $a3,24($ap) + ld $a4,32($ap) + + mulld $h0lo,$a0,$bi # a[0]*b[0] + mulhdu $h0hi,$a0,$bi + + mulld $h1lo,$a1,$bi # a[1]*b[0] + mulhdu $h1hi,$a1,$bi + + mulld $h4lo,$a4,$bi # a[4]*b[0] + mulhdu $h4hi,$a4,$bi + ld $ap,8($bp) + mulli $a4,$a4,19 + + mulld $h2lo,$a2,$bi # a[2]*b[0] + mulhdu $h2hi,$a2,$bi + + mulld $h3lo,$a3,$bi # a[3]*b[0] + mulhdu $h3hi,$a3,$bi +___ +for(my @a=($a0,$a1,$a2,$a3,$a4), + my $i=1; $i<4; $i++) { + ($ap,$bi) = ($bi,$ap); +$code.=<<___; + mulld $t0,@a[4],$bi + mulhdu $t1,@a[4],$bi + addc $h0lo,$h0lo,$t0 + adde $h0hi,$h0hi,$t1 + + mulld $t0,@a[0],$bi + mulhdu $t1,@a[0],$bi + addc $h1lo,$h1lo,$t0 + adde $h1hi,$h1hi,$t1 + + mulld $t0,@a[3],$bi + mulhdu $t1,@a[3],$bi + ld $ap,`8*($i+1)`($bp) + mulli @a[3],@a[3],19 + addc $h4lo,$h4lo,$t0 + adde $h4hi,$h4hi,$t1 + + mulld $t0,@a[1],$bi + mulhdu $t1,@a[1],$bi + addc $h2lo,$h2lo,$t0 + adde $h2hi,$h2hi,$t1 + + mulld $t0,@a[2],$bi + mulhdu $t1,@a[2],$bi + addc $h3lo,$h3lo,$t0 + adde $h3hi,$h3hi,$t1 +___ + unshift(@a,pop(@a)); +} + ($ap,$bi) = ($bi,$ap); +$code.=<<___; + mulld $t0,$a1,$bi + mulhdu $t1,$a1,$bi + addc $h0lo,$h0lo,$t0 + adde $h0hi,$h0hi,$t1 + + mulld $t0,$a2,$bi + mulhdu $t1,$a2,$bi + addc $h1lo,$h1lo,$t0 + adde $h1hi,$h1hi,$t1 + + mulld $t0,$a3,$bi + mulhdu $t1,$a3,$bi + addc $h2lo,$h2lo,$t0 + adde $h2hi,$h2hi,$t1 + + mulld $t0,$a4,$bi + mulhdu $t1,$a4,$bi + addc $h3lo,$h3lo,$t0 + adde $h3hi,$h3hi,$t1 + + mulld $t0,$a0,$bi + mulhdu $t1,$a0,$bi + addc $h4lo,$h4lo,$t0 + adde $h4hi,$h4hi,$t1 + +.Lfe51_reduce: + li $mask,-1 + srdi $mask,$mask,13 # 0x7ffffffffffff + + srdi $t0,$h2lo,51 + and $a2,$h2lo,$mask + insrdi $t0,$h2hi,51,0 # h2>>51 + srdi $t1,$h0lo,51 + and $a0,$h0lo,$mask + insrdi $t1,$h0hi,51,0 # h0>>51 + addc $h3lo,$h3lo,$t0 + addze $h3hi,$h3hi + addc $h1lo,$h1lo,$t1 + addze $h1hi,$h1hi + + srdi $t0,$h3lo,51 + and $a3,$h3lo,$mask + insrdi $t0,$h3hi,51,0 # h3>>51 + srdi $t1,$h1lo,51 + and $a1,$h1lo,$mask + insrdi $t1,$h1hi,51,0 # h1>>51 + addc $h4lo,$h4lo,$t0 + addze $h4hi,$h4hi + add $a2,$a2,$t1 + + srdi $t0,$h4lo,51 + and $a4,$h4lo,$mask + insrdi $t0,$h4hi,51,0 + mulli $t0,$t0,19 # (h4 >> 51) * 19 + + add $a0,$a0,$t0 + + srdi $t1,$a2,51 + and $a2,$a2,$mask + add $a3,$a3,$t1 + + srdi $t0,$a0,51 + and $a0,$a0,$mask + add $a1,$a1,$t0 + + std $a2,16($rp) + std $a3,24($rp) + std $a4,32($rp) + std $a0,0($rp) + std $a1,8($rp) + + ld r21,`$FRAME-8*11`($sp) + ld r22,`$FRAME-8*10`($sp) + ld r23,`$FRAME-8*9`($sp) + ld r24,`$FRAME-8*8`($sp) + ld r25,`$FRAME-8*7`($sp) + ld r26,`$FRAME-8*6`($sp) + ld r27,`$FRAME-8*5`($sp) + ld r28,`$FRAME-8*4`($sp) + ld r29,`$FRAME-8*3`($sp) + ld r30,`$FRAME-8*2`($sp) + ld r31,`$FRAME-8*1`($sp) + addi $sp,$sp,$FRAME + blr + .long 0 + .byte 0,12,4,0,0x80,11,3,0 + .long 0 +.size x25519_fe51_mul,.-x25519_fe51_mul +___ +{ +my ($a0,$a1,$a2,$a3,$a4,$t0,$t1) = ($a0,$a1,$a2,$a3,$a4,$t0,$t1); +$code.=<<___; +.globl x25519_fe51_sqr +.type x25519_fe51_sqr,\@function +.align 5 +x25519_fe51_sqr: + stdu $sp,-$FRAME($sp) + std r21,`$FRAME-8*11`($sp) + std r22,`$FRAME-8*10`($sp) + std r23,`$FRAME-8*9`($sp) + std r24,`$FRAME-8*8`($sp) + std r25,`$FRAME-8*7`($sp) + std r26,`$FRAME-8*6`($sp) + std r27,`$FRAME-8*5`($sp) + std r28,`$FRAME-8*4`($sp) + std r29,`$FRAME-8*3`($sp) + std 
r30,`$FRAME-8*2`($sp) + std r31,`$FRAME-8*1`($sp) + + ld $a0,0($ap) + ld $a1,8($ap) + ld $a2,16($ap) + ld $a3,24($ap) + ld $a4,32($ap) + + add $bi,$a0,$a0 # a[0]*2 + mulli $t1,$a4,19 # a[4]*19 + + mulld $h0lo,$a0,$a0 + mulhdu $h0hi,$a0,$a0 + mulld $h1lo,$a1,$bi + mulhdu $h1hi,$a1,$bi + mulld $h2lo,$a2,$bi + mulhdu $h2hi,$a2,$bi + mulld $h3lo,$a3,$bi + mulhdu $h3hi,$a3,$bi + mulld $h4lo,$a4,$bi + mulhdu $h4hi,$a4,$bi + add $bi,$a1,$a1 # a[1]*2 +___ + ($a4,$t1) = ($t1,$a4); +$code.=<<___; + mulld $t0,$t1,$a4 + mulhdu $t1,$t1,$a4 + addc $h3lo,$h3lo,$t0 + adde $h3hi,$h3hi,$t1 + + mulli $bp,$a3,19 # a[3]*19 + + mulld $t0,$a1,$a1 + mulhdu $t1,$a1,$a1 + addc $h2lo,$h2lo,$t0 + adde $h2hi,$h2hi,$t1 + mulld $t0,$a2,$bi + mulhdu $t1,$a2,$bi + addc $h3lo,$h3lo,$t0 + adde $h3hi,$h3hi,$t1 + mulld $t0,$a3,$bi + mulhdu $t1,$a3,$bi + addc $h4lo,$h4lo,$t0 + adde $h4hi,$h4hi,$t1 + mulld $t0,$a4,$bi + mulhdu $t1,$a4,$bi + add $bi,$a3,$a3 # a[3]*2 + addc $h0lo,$h0lo,$t0 + adde $h0hi,$h0hi,$t1 +___ + ($a3,$t1) = ($bp,$a3); +$code.=<<___; + mulld $t0,$t1,$a3 + mulhdu $t1,$t1,$a3 + addc $h1lo,$h1lo,$t0 + adde $h1hi,$h1hi,$t1 + mulld $t0,$bi,$a4 + mulhdu $t1,$bi,$a4 + add $bi,$a2,$a2 # a[2]*2 + addc $h2lo,$h2lo,$t0 + adde $h2hi,$h2hi,$t1 + + mulld $t0,$a2,$a2 + mulhdu $t1,$a2,$a2 + addc $h4lo,$h4lo,$t0 + adde $h4hi,$h4hi,$t1 + mulld $t0,$a3,$bi + mulhdu $t1,$a3,$bi + addc $h0lo,$h0lo,$t0 + adde $h0hi,$h0hi,$t1 + mulld $t0,$a4,$bi + mulhdu $t1,$a4,$bi + addc $h1lo,$h1lo,$t0 + adde $h1hi,$h1hi,$t1 + + b .Lfe51_reduce + .long 0 + .byte 0,12,4,0,0x80,11,2,0 + .long 0 +.size x25519_fe51_sqr,.-x25519_fe51_sqr +___ +} +$code.=<<___; +.globl x25519_fe51_mul121666 +.type x25519_fe51_mul121666,\@function +.align 5 +x25519_fe51_mul121666: + stdu $sp,-$FRAME($sp) + std r21,`$FRAME-8*11`($sp) + std r22,`$FRAME-8*10`($sp) + std r23,`$FRAME-8*9`($sp) + std r24,`$FRAME-8*8`($sp) + std r25,`$FRAME-8*7`($sp) + std r26,`$FRAME-8*6`($sp) + std r27,`$FRAME-8*5`($sp) + std r28,`$FRAME-8*4`($sp) + std r29,`$FRAME-8*3`($sp) + std r30,`$FRAME-8*2`($sp) + std r31,`$FRAME-8*1`($sp) + + lis $bi,`65536>>16` + ori $bi,$bi,`121666-65536` + ld $a0,0($ap) + ld $a1,8($ap) + ld $a2,16($ap) + ld $a3,24($ap) + ld $a4,32($ap) + + mulld $h0lo,$a0,$bi # a[0]*121666 + mulhdu $h0hi,$a0,$bi + mulld $h1lo,$a1,$bi # a[1]*121666 + mulhdu $h1hi,$a1,$bi + mulld $h2lo,$a2,$bi # a[2]*121666 + mulhdu $h2hi,$a2,$bi + mulld $h3lo,$a3,$bi # a[3]*121666 + mulhdu $h3hi,$a3,$bi + mulld $h4lo,$a4,$bi # a[4]*121666 + mulhdu $h4hi,$a4,$bi + + b .Lfe51_reduce + .long 0 + .byte 0,12,4,0,0x80,11,2,0 + .long 0 +.size x25519_fe51_mul121666,.-x25519_fe51_mul121666 +___ +} + +$code =~ s/\`([^\`]*)\`/eval $1/gem; +print $code; +close STDOUT; diff --git a/crypto/ec/asm/x25519-x86_64.pl b/crypto/ec/asm/x25519-x86_64.pl new file mode 100755 index 000000000000..18dc6af9fae9 --- /dev/null +++ b/crypto/ec/asm/x25519-x86_64.pl @@ -0,0 +1,1117 @@ +#!/usr/bin/env perl +# Copyright 2018 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html +# +# ==================================================================== +# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. 
For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== +# +# X25519 lower-level primitives for x86_64. +# +# February 2018. +# +# This module implements radix 2^51 multiplication and squaring, and +# radix 2^64 multiplication, squaring, addition, subtraction and final +# reduction. Latter radix is used on ADCX/ADOX-capable processors such +# as Broadwell. On related note one should mention that there are +# vector implementations that provide significantly better performance +# on some processors(*), but they are large and overly complex. Which +# in combination with them being effectively processor-specific makes +# the undertaking hard to justify. The goal for this implementation +# is rather versatility and simplicity [and ultimately formal +# verification]. +# +# (*) For example sandy2x should provide ~30% improvement on Sandy +# Bridge, but only nominal ~5% on Haswell [and big loss on +# Broadwell and successors]. +# +###################################################################### +# Improvement coefficients: +# +# amd64-51(*) gcc-5.x(**) +# +# P4 +22% +40% +# Sandy Bridge -3% +11% +# Haswell -1% +13% +# Broadwell(***) +30% +35% +# Skylake(***) +33% +47% +# Silvermont +20% +26% +# Goldmont +40% +50% +# Bulldozer +20% +9% +# Ryzen(***) +43% +40% +# VIA +170% +120% +# +# (*) amd64-51 is popular assembly implementation with 2^51 radix, +# only multiplication and squaring subroutines were linked +# for comparison, but not complete ladder step; gain on most +# processors is because this module refrains from shld, and +# minor regression on others is because this does result in +# higher instruction count; +# (**) compiler is free to inline functions, in assembly one would +# need to implement ladder step to do that, and it will improve +# performance by several percent; +# (***) ADCX/ADOX result for 2^64 radix, there is no corresponding +# C implementation, so that comparison is always against +# 2^51 radix; + +$flavour = shift; +$output = shift; +if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } + +$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or +( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or +die "can't locate x86_64-xlate.pl"; + +open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""; +*STDOUT=*OUT; + +if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1` + =~ /GNU assembler version ([2-9]\.[0-9]+)/) { + $addx = ($1>=2.23); +} + +if (!$addx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) && + `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) { + $addx = ($1>=2.10); +} + +if (!$addx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) && + `ml64 2>&1` =~ /Version ([0-9]+)\./) { + $addx = ($1>=12); +} + +if (!$addx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9])\.([0-9]+)/) { + my $ver = $2 + $3/100.0; # 3.1->3.01, 3.10->3.10 + $addx = ($ver>=3.03); +} + +$code.=<<___; +.text + +.globl x25519_fe51_mul +.type x25519_fe51_mul,\@function,3 +.align 32 +x25519_fe51_mul: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + lea -8*5(%rsp),%rsp +.cfi_adjust_cfa_offset 40 +.Lfe51_mul_body: + + mov 8*0(%rsi),%rax # f[0] + mov 8*0(%rdx),%r11 # load g[0-4] + mov 
8*1(%rdx),%r12 + mov 8*2(%rdx),%r13 + mov 8*3(%rdx),%rbp + mov 8*4(%rdx),%r14 + + mov %rdi,8*4(%rsp) # offload 1st argument + mov %rax,%rdi + mulq %r11 # f[0]*g[0] + mov %r11,8*0(%rsp) # offload g[0] + mov %rax,%rbx # %rbx:%rcx = h0 + mov %rdi,%rax + mov %rdx,%rcx + mulq %r12 # f[0]*g[1] + mov %r12,8*1(%rsp) # offload g[1] + mov %rax,%r8 # %r8:%r9 = h1 + mov %rdi,%rax + lea (%r14,%r14,8),%r15 + mov %rdx,%r9 + mulq %r13 # f[0]*g[2] + mov %r13,8*2(%rsp) # offload g[2] + mov %rax,%r10 # %r10:%r11 = h2 + mov %rdi,%rax + lea (%r14,%r15,2),%rdi # g[4]*19 + mov %rdx,%r11 + mulq %rbp # f[0]*g[3] + mov %rax,%r12 # %r12:%r13 = h3 + mov 8*0(%rsi),%rax # f[0] + mov %rdx,%r13 + mulq %r14 # f[0]*g[4] + mov %rax,%r14 # %r14:%r15 = h4 + mov 8*1(%rsi),%rax # f[1] + mov %rdx,%r15 + + mulq %rdi # f[1]*g[4]*19 + add %rax,%rbx + mov 8*2(%rsi),%rax # f[2] + adc %rdx,%rcx + mulq %rdi # f[2]*g[4]*19 + add %rax,%r8 + mov 8*3(%rsi),%rax # f[3] + adc %rdx,%r9 + mulq %rdi # f[3]*g[4]*19 + add %rax,%r10 + mov 8*4(%rsi),%rax # f[4] + adc %rdx,%r11 + mulq %rdi # f[4]*g[4]*19 + imulq \$19,%rbp,%rdi # g[3]*19 + add %rax,%r12 + mov 8*1(%rsi),%rax # f[1] + adc %rdx,%r13 + mulq %rbp # f[1]*g[3] + mov 8*2(%rsp),%rbp # g[2] + add %rax,%r14 + mov 8*2(%rsi),%rax # f[2] + adc %rdx,%r15 + + mulq %rdi # f[2]*g[3]*19 + add %rax,%rbx + mov 8*3(%rsi),%rax # f[3] + adc %rdx,%rcx + mulq %rdi # f[3]*g[3]*19 + add %rax,%r8 + mov 8*4(%rsi),%rax # f[4] + adc %rdx,%r9 + mulq %rdi # f[4]*g[3]*19 + imulq \$19,%rbp,%rdi # g[2]*19 + add %rax,%r10 + mov 8*1(%rsi),%rax # f[1] + adc %rdx,%r11 + mulq %rbp # f[1]*g[2] + add %rax,%r12 + mov 8*2(%rsi),%rax # f[2] + adc %rdx,%r13 + mulq %rbp # f[2]*g[2] + mov 8*1(%rsp),%rbp # g[1] + add %rax,%r14 + mov 8*3(%rsi),%rax # f[3] + adc %rdx,%r15 + + mulq %rdi # f[3]*g[2]*19 + add %rax,%rbx + mov 8*4(%rsi),%rax # f[3] + adc %rdx,%rcx + mulq %rdi # f[4]*g[2]*19 + add %rax,%r8 + mov 8*1(%rsi),%rax # f[1] + adc %rdx,%r9 + mulq %rbp # f[1]*g[1] + imulq \$19,%rbp,%rdi + add %rax,%r10 + mov 8*2(%rsi),%rax # f[2] + adc %rdx,%r11 + mulq %rbp # f[2]*g[1] + add %rax,%r12 + mov 8*3(%rsi),%rax # f[3] + adc %rdx,%r13 + mulq %rbp # f[3]*g[1] + mov 8*0(%rsp),%rbp # g[0] + add %rax,%r14 + mov 8*4(%rsi),%rax # f[4] + adc %rdx,%r15 + + mulq %rdi # f[4]*g[1]*19 + add %rax,%rbx + mov 8*1(%rsi),%rax # f[1] + adc %rdx,%rcx + mul %rbp # f[1]*g[0] + add %rax,%r8 + mov 8*2(%rsi),%rax # f[2] + adc %rdx,%r9 + mul %rbp # f[2]*g[0] + add %rax,%r10 + mov 8*3(%rsi),%rax # f[3] + adc %rdx,%r11 + mul %rbp # f[3]*g[0] + add %rax,%r12 + mov 8*4(%rsi),%rax # f[4] + adc %rdx,%r13 + mulq %rbp # f[4]*g[0] + add %rax,%r14 + adc %rdx,%r15 + + mov 8*4(%rsp),%rdi # restore 1st argument + jmp .Lreduce51 +.Lfe51_mul_epilogue: +.cfi_endproc +.size x25519_fe51_mul,.-x25519_fe51_mul + +.globl x25519_fe51_sqr +.type x25519_fe51_sqr,\@function,2 +.align 32 +x25519_fe51_sqr: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + lea -8*5(%rsp),%rsp +.cfi_adjust_cfa_offset 40 +.Lfe51_sqr_body: + + mov 8*0(%rsi),%rax # g[0] + mov 8*2(%rsi),%r15 # g[2] + mov 8*4(%rsi),%rbp # g[4] + + mov %rdi,8*4(%rsp) # offload 1st argument + lea (%rax,%rax),%r14 + mulq %rax # g[0]*g[0] + mov %rax,%rbx + mov 8*1(%rsi),%rax # g[1] + mov %rdx,%rcx + mulq %r14 # 2*g[0]*g[1] + mov %rax,%r8 + mov %r15,%rax + mov %r15,8*0(%rsp) # offload g[2] + mov %rdx,%r9 + mulq %r14 # 2*g[0]*g[2] + mov %rax,%r10 + mov 8*3(%rsi),%rax + mov %rdx,%r11 + imulq 
\$19,%rbp,%rdi # g[4]*19 + mulq %r14 # 2*g[0]*g[3] + mov %rax,%r12 + mov %rbp,%rax + mov %rdx,%r13 + mulq %r14 # 2*g[0]*g[4] + mov %rax,%r14 + mov %rbp,%rax + mov %rdx,%r15 + + mulq %rdi # g[4]*g[4]*19 + add %rax,%r12 + mov 8*1(%rsi),%rax # g[1] + adc %rdx,%r13 + + mov 8*3(%rsi),%rsi # g[3] + lea (%rax,%rax),%rbp + mulq %rax # g[1]*g[1] + add %rax,%r10 + mov 8*0(%rsp),%rax # g[2] + adc %rdx,%r11 + mulq %rbp # 2*g[1]*g[2] + add %rax,%r12 + mov %rbp,%rax + adc %rdx,%r13 + mulq %rsi # 2*g[1]*g[3] + add %rax,%r14 + mov %rbp,%rax + adc %rdx,%r15 + imulq \$19,%rsi,%rbp # g[3]*19 + mulq %rdi # 2*g[1]*g[4]*19 + add %rax,%rbx + lea (%rsi,%rsi),%rax + adc %rdx,%rcx + + mulq %rdi # 2*g[3]*g[4]*19 + add %rax,%r10 + mov %rsi,%rax + adc %rdx,%r11 + mulq %rbp # g[3]*g[3]*19 + add %rax,%r8 + mov 8*0(%rsp),%rax # g[2] + adc %rdx,%r9 + + lea (%rax,%rax),%rsi + mulq %rax # g[2]*g[2] + add %rax,%r14 + mov %rbp,%rax + adc %rdx,%r15 + mulq %rsi # 2*g[2]*g[3]*19 + add %rax,%rbx + mov %rsi,%rax + adc %rdx,%rcx + mulq %rdi # 2*g[2]*g[4]*19 + add %rax,%r8 + adc %rdx,%r9 + + mov 8*4(%rsp),%rdi # restore 1st argument + jmp .Lreduce51 + +.align 32 +.Lreduce51: + mov \$0x7ffffffffffff,%rbp + + mov %r10,%rdx + shr \$51,%r10 + shl \$13,%r11 + and %rbp,%rdx # %rdx = g2 = h2 & mask + or %r10,%r11 # h2>>51 + add %r11,%r12 + adc \$0,%r13 # h3 += h2>>51 + + mov %rbx,%rax + shr \$51,%rbx + shl \$13,%rcx + and %rbp,%rax # %rax = g0 = h0 & mask + or %rbx,%rcx # h0>>51 + add %rcx,%r8 # h1 += h0>>51 + adc \$0,%r9 + + mov %r12,%rbx + shr \$51,%r12 + shl \$13,%r13 + and %rbp,%rbx # %rbx = g3 = h3 & mask + or %r12,%r13 # h3>>51 + add %r13,%r14 # h4 += h3>>51 + adc \$0,%r15 + + mov %r8,%rcx + shr \$51,%r8 + shl \$13,%r9 + and %rbp,%rcx # %rcx = g1 = h1 & mask + or %r8,%r9 + add %r9,%rdx # g2 += h1>>51 + + mov %r14,%r10 + shr \$51,%r14 + shl \$13,%r15 + and %rbp,%r10 # %r10 = g4 = h0 & mask + or %r14,%r15 # h0>>51 + + lea (%r15,%r15,8),%r14 + lea (%r15,%r14,2),%r15 + add %r15,%rax # g0 += (h0>>51)*19 + + mov %rdx,%r8 + and %rbp,%rdx # g2 &= mask + shr \$51,%r8 + add %r8,%rbx # g3 += g2>>51 + + mov %rax,%r9 + and %rbp,%rax # g0 &= mask + shr \$51,%r9 + add %r9,%rcx # g1 += g0>>51 + + mov %rax,8*0(%rdi) # save the result + mov %rcx,8*1(%rdi) + mov %rdx,8*2(%rdi) + mov %rbx,8*3(%rdi) + mov %r10,8*4(%rdi) + + mov 8*5(%rsp),%r15 +.cfi_restore %r15 + mov 8*6(%rsp),%r14 +.cfi_restore %r14 + mov 8*7(%rsp),%r13 +.cfi_restore %r13 + mov 8*8(%rsp),%r12 +.cfi_restore %r12 + mov 8*9(%rsp),%rbx +.cfi_restore %rbx + mov 8*10(%rsp),%rbp +.cfi_restore %rbp + lea 8*11(%rsp),%rsp +.cfi_adjust_cfa_offset 88 +.Lfe51_sqr_epilogue: + ret +.cfi_endproc +.size x25519_fe51_sqr,.-x25519_fe51_sqr + +.globl x25519_fe51_mul121666 +.type x25519_fe51_mul121666,\@function,2 +.align 32 +x25519_fe51_mul121666: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + lea -8*5(%rsp),%rsp +.cfi_adjust_cfa_offset 40 +.Lfe51_mul121666_body: + mov \$121666,%eax + + mulq 8*0(%rsi) + mov %rax,%rbx # %rbx:%rcx = h0 + mov \$121666,%eax + mov %rdx,%rcx + mulq 8*1(%rsi) + mov %rax,%r8 # %r8:%r9 = h1 + mov \$121666,%eax + mov %rdx,%r9 + mulq 8*2(%rsi) + mov %rax,%r10 # %r10:%r11 = h2 + mov \$121666,%eax + mov %rdx,%r11 + mulq 8*3(%rsi) + mov %rax,%r12 # %r12:%r13 = h3 + mov \$121666,%eax # f[0] + mov %rdx,%r13 + mulq 8*4(%rsi) + mov %rax,%r14 # %r14:%r15 = h4 + mov %rdx,%r15 + + jmp .Lreduce51 +.Lfe51_mul121666_epilogue: +.cfi_endproc +.size 
x25519_fe51_mul121666,.-x25519_fe51_mul121666 +___ +######################################################################## +# Base 2^64 subroutines modulo 2*(2^255-19) +# +if ($addx) { +my ($acc0,$acc1,$acc2,$acc3,$acc4,$acc5,$acc6,$acc7) = map("%r$_",(8..15)); + +$code.=<<___; +.extern OPENSSL_ia32cap_P +.globl x25519_fe64_eligible +.type x25519_fe64_eligible,\@abi-omnipotent +.align 32 +x25519_fe64_eligible: + mov OPENSSL_ia32cap_P+8(%rip),%ecx + xor %eax,%eax + and \$0x80100,%ecx + cmp \$0x80100,%ecx + cmove %ecx,%eax + ret +.size x25519_fe64_eligible,.-x25519_fe64_eligible + +.globl x25519_fe64_mul +.type x25519_fe64_mul,\@function,3 +.align 32 +x25519_fe64_mul: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + push %rdi # offload dst +.cfi_push %rdi + lea -8*2(%rsp),%rsp +.cfi_adjust_cfa_offset 16 +.Lfe64_mul_body: + + mov %rdx,%rax + mov 8*0(%rdx),%rbp # b[0] + mov 8*0(%rsi),%rdx # a[0] + mov 8*1(%rax),%rcx # b[1] + mov 8*2(%rax),$acc6 # b[2] + mov 8*3(%rax),$acc7 # b[3] + + mulx %rbp,$acc0,%rax # a[0]*b[0] + xor %edi,%edi # cf=0,of=0 + mulx %rcx,$acc1,%rbx # a[0]*b[1] + adcx %rax,$acc1 + mulx $acc6,$acc2,%rax # a[0]*b[2] + adcx %rbx,$acc2 + mulx $acc7,$acc3,$acc4 # a[0]*b[3] + mov 8*1(%rsi),%rdx # a[1] + adcx %rax,$acc3 + mov $acc6,(%rsp) # offload b[2] + adcx %rdi,$acc4 # cf=0 + + mulx %rbp,%rax,%rbx # a[1]*b[0] + adox %rax,$acc1 + adcx %rbx,$acc2 + mulx %rcx,%rax,%rbx # a[1]*b[1] + adox %rax,$acc2 + adcx %rbx,$acc3 + mulx $acc6,%rax,%rbx # a[1]*b[2] + adox %rax,$acc3 + adcx %rbx,$acc4 + mulx $acc7,%rax,$acc5 # a[1]*b[3] + mov 8*2(%rsi),%rdx # a[2] + adox %rax,$acc4 + adcx %rdi,$acc5 # cf=0 + adox %rdi,$acc5 # of=0 + + mulx %rbp,%rax,%rbx # a[2]*b[0] + adcx %rax,$acc2 + adox %rbx,$acc3 + mulx %rcx,%rax,%rbx # a[2]*b[1] + adcx %rax,$acc3 + adox %rbx,$acc4 + mulx $acc6,%rax,%rbx # a[2]*b[2] + adcx %rax,$acc4 + adox %rbx,$acc5 + mulx $acc7,%rax,$acc6 # a[2]*b[3] + mov 8*3(%rsi),%rdx # a[3] + adcx %rax,$acc5 + adox %rdi,$acc6 # of=0 + adcx %rdi,$acc6 # cf=0 + + mulx %rbp,%rax,%rbx # a[3]*b[0] + adox %rax,$acc3 + adcx %rbx,$acc4 + mulx %rcx,%rax,%rbx # a[3]*b[1] + adox %rax,$acc4 + adcx %rbx,$acc5 + mulx (%rsp),%rax,%rbx # a[3]*b[2] + adox %rax,$acc5 + adcx %rbx,$acc6 + mulx $acc7,%rax,$acc7 # a[3]*b[3] + mov \$38,%edx + adox %rax,$acc6 + adcx %rdi,$acc7 # cf=0 + adox %rdi,$acc7 # of=0 + + jmp .Lreduce64 +.Lfe64_mul_epilogue: +.cfi_endproc +.size x25519_fe64_mul,.-x25519_fe64_mul + +.globl x25519_fe64_sqr +.type x25519_fe64_sqr,\@function,2 +.align 32 +x25519_fe64_sqr: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + push %rdi # offload dst +.cfi_push %rdi + lea -8*2(%rsp),%rsp +.cfi_adjust_cfa_offset 16 +.Lfe64_sqr_body: + + mov 8*0(%rsi),%rdx # a[0] + mov 8*1(%rsi),%rcx # a[1] + mov 8*2(%rsi),%rbp # a[2] + mov 8*3(%rsi),%rsi # a[3] + + ################################################################ + mulx %rdx,$acc0,$acc7 # a[0]*a[0] + mulx %rcx,$acc1,%rax # a[0]*a[1] + xor %edi,%edi # cf=0,of=0 + mulx %rbp,$acc2,%rbx # a[0]*a[2] + adcx %rax,$acc2 + mulx %rsi,$acc3,$acc4 # a[0]*a[3] + mov %rcx,%rdx # a[1] + adcx %rbx,$acc3 + adcx %rdi,$acc4 # cf=0 + + ################################################################ + mulx %rbp,%rax,%rbx # a[1]*a[2] + adox %rax,$acc3 + adcx %rbx,$acc4 + mulx %rsi,%rax,$acc5 # 
a[1]*a[3] + mov %rbp,%rdx # a[2] + adox %rax,$acc4 + adcx %rdi,$acc5 + + ################################################################ + mulx %rsi,%rax,$acc6 # a[2]*a[3] + mov %rcx,%rdx # a[1] + adox %rax,$acc5 + adcx %rdi,$acc6 # cf=0 + adox %rdi,$acc6 # of=0 + + adcx $acc1,$acc1 # acc1:6<<1 + adox $acc7,$acc1 + adcx $acc2,$acc2 + mulx %rdx,%rax,%rbx # a[1]*a[1] + mov %rbp,%rdx # a[2] + adcx $acc3,$acc3 + adox %rax,$acc2 + adcx $acc4,$acc4 + adox %rbx,$acc3 + mulx %rdx,%rax,%rbx # a[2]*a[2] + mov %rsi,%rdx # a[3] + adcx $acc5,$acc5 + adox %rax,$acc4 + adcx $acc6,$acc6 + adox %rbx,$acc5 + mulx %rdx,%rax,$acc7 # a[3]*a[3] + mov \$38,%edx + adox %rax,$acc6 + adcx %rdi,$acc7 # cf=0 + adox %rdi,$acc7 # of=0 + jmp .Lreduce64 + +.align 32 +.Lreduce64: + mulx $acc4,%rax,%rbx + adcx %rax,$acc0 + adox %rbx,$acc1 + mulx $acc5,%rax,%rbx + adcx %rax,$acc1 + adox %rbx,$acc2 + mulx $acc6,%rax,%rbx + adcx %rax,$acc2 + adox %rbx,$acc3 + mulx $acc7,%rax,$acc4 + adcx %rax,$acc3 + adox %rdi,$acc4 + adcx %rdi,$acc4 + + mov 8*2(%rsp),%rdi # restore dst + imulq %rdx,$acc4 + + add $acc4,$acc0 + adc \$0,$acc1 + adc \$0,$acc2 + adc \$0,$acc3 + + sbb %rax,%rax # cf -> mask + and \$38,%rax + + add %rax,$acc0 + mov $acc1,8*1(%rdi) + mov $acc2,8*2(%rdi) + mov $acc3,8*3(%rdi) + mov $acc0,8*0(%rdi) + + mov 8*3(%rsp),%r15 +.cfi_restore %r15 + mov 8*4(%rsp),%r14 +.cfi_restore %r14 + mov 8*5(%rsp),%r13 +.cfi_restore %r13 + mov 8*6(%rsp),%r12 +.cfi_restore %r12 + mov 8*7(%rsp),%rbx +.cfi_restore %rbx + mov 8*8(%rsp),%rbp +.cfi_restore %rbp + lea 8*9(%rsp),%rsp +.cfi_adjust_cfa_offset 88 +.Lfe64_sqr_epilogue: + ret +.cfi_endproc +.size x25519_fe64_sqr,.-x25519_fe64_sqr + +.globl x25519_fe64_mul121666 +.type x25519_fe64_mul121666,\@function,2 +.align 32 +x25519_fe64_mul121666: +.Lfe64_mul121666_body: + mov \$121666,%edx + mulx 8*0(%rsi),$acc0,%rcx + mulx 8*1(%rsi),$acc1,%rax + add %rcx,$acc1 + mulx 8*2(%rsi),$acc2,%rcx + adc %rax,$acc2 + mulx 8*3(%rsi),$acc3,%rax + adc %rcx,$acc3 + adc \$0,%rax + + imulq \$38,%rax,%rax + + add %rax,$acc0 + adc \$0,$acc1 + adc \$0,$acc2 + adc \$0,$acc3 + + sbb %rax,%rax # cf -> mask + and \$38,%rax + + add %rax,$acc0 + mov $acc1,8*1(%rdi) + mov $acc2,8*2(%rdi) + mov $acc3,8*3(%rdi) + mov $acc0,8*0(%rdi) + +.Lfe64_mul121666_epilogue: + ret +.size x25519_fe64_mul121666,.-x25519_fe64_mul121666 + +.globl x25519_fe64_add +.type x25519_fe64_add,\@function,3 +.align 32 +x25519_fe64_add: +.Lfe64_add_body: + mov 8*0(%rsi),$acc0 + mov 8*1(%rsi),$acc1 + mov 8*2(%rsi),$acc2 + mov 8*3(%rsi),$acc3 + + add 8*0(%rdx),$acc0 + adc 8*1(%rdx),$acc1 + adc 8*2(%rdx),$acc2 + adc 8*3(%rdx),$acc3 + + sbb %rax,%rax # cf -> mask + and \$38,%rax + + add %rax,$acc0 + adc \$0,$acc1 + adc \$0,$acc2 + mov $acc1,8*1(%rdi) + adc \$0,$acc3 + mov $acc2,8*2(%rdi) + sbb %rax,%rax # cf -> mask + mov $acc3,8*3(%rdi) + and \$38,%rax + + add %rax,$acc0 + mov $acc0,8*0(%rdi) + +.Lfe64_add_epilogue: + ret +.size x25519_fe64_add,.-x25519_fe64_add + +.globl x25519_fe64_sub +.type x25519_fe64_sub,\@function,3 +.align 32 +x25519_fe64_sub: +.Lfe64_sub_body: + mov 8*0(%rsi),$acc0 + mov 8*1(%rsi),$acc1 + mov 8*2(%rsi),$acc2 + mov 8*3(%rsi),$acc3 + + sub 8*0(%rdx),$acc0 + sbb 8*1(%rdx),$acc1 + sbb 8*2(%rdx),$acc2 + sbb 8*3(%rdx),$acc3 + + sbb %rax,%rax # cf -> mask + and \$38,%rax + + sub %rax,$acc0 + sbb \$0,$acc1 + sbb \$0,$acc2 + mov $acc1,8*1(%rdi) + sbb \$0,$acc3 + mov $acc2,8*2(%rdi) + sbb %rax,%rax # cf -> mask + mov $acc3,8*3(%rdi) + and \$38,%rax + + sub %rax,$acc0 + mov $acc0,8*0(%rdi) + +.Lfe64_sub_epilogue: + ret +.size 
x25519_fe64_sub,.-x25519_fe64_sub + +.globl x25519_fe64_tobytes +.type x25519_fe64_tobytes,\@function,2 +.align 32 +x25519_fe64_tobytes: +.Lfe64_to_body: + mov 8*0(%rsi),$acc0 + mov 8*1(%rsi),$acc1 + mov 8*2(%rsi),$acc2 + mov 8*3(%rsi),$acc3 + + ################################# reduction modulo 2^255-19 + lea ($acc3,$acc3),%rax + sar \$63,$acc3 # most significant bit -> mask + shr \$1,%rax # most significant bit cleared + and \$19,$acc3 + add \$19,$acc3 # compare to modulus in the same go + + add $acc3,$acc0 + adc \$0,$acc1 + adc \$0,$acc2 + adc \$0,%rax + + lea (%rax,%rax),$acc3 + sar \$63,%rax # most significant bit -> mask + shr \$1,$acc3 # most significant bit cleared + not %rax + and \$19,%rax + + sub %rax,$acc0 + sbb \$0,$acc1 + sbb \$0,$acc2 + sbb \$0,$acc3 + + mov $acc0,8*0(%rdi) + mov $acc1,8*1(%rdi) + mov $acc2,8*2(%rdi) + mov $acc3,8*3(%rdi) + +.Lfe64_to_epilogue: + ret +.size x25519_fe64_tobytes,.-x25519_fe64_tobytes +___ +} else { +$code.=<<___; +.globl x25519_fe64_eligible +.type x25519_fe64_eligible,\@abi-omnipotent +.align 32 +x25519_fe64_eligible: + xor %eax,%eax + ret +.size x25519_fe64_eligible,.-x25519_fe64_eligible + +.globl x25519_fe64_mul +.type x25519_fe64_mul,\@abi-omnipotent +.globl x25519_fe64_sqr +.globl x25519_fe64_mul121666 +.globl x25519_fe64_add +.globl x25519_fe64_sub +.globl x25519_fe64_tobytes +x25519_fe64_mul: +x25519_fe64_sqr: +x25519_fe64_mul121666: +x25519_fe64_add: +x25519_fe64_sub: +x25519_fe64_tobytes: + .byte 0x0f,0x0b # ud2 + ret +.size x25519_fe64_mul,.-x25519_fe64_mul +___ +} +$code.=<<___; +.asciz "X25519 primitives for x86_64, CRYPTOGAMS by <appro\@openssl.org>" +___ + +# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, +# CONTEXT *context,DISPATCHER_CONTEXT *disp) +if ($win64) { +$rec="%rcx"; +$frame="%rdx"; +$context="%r8"; +$disp="%r9"; + +$code.=<<___; +.extern __imp_RtlVirtualUnwind + +.type short_handler,\@abi-omnipotent +.align 16 +short_handler: + push %rsi + push %rdi + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 + pushfq + sub \$64,%rsp + + mov 120($context),%rax # pull context->Rax + mov 248($context),%rbx # pull context->Rip + + mov 8($disp),%rsi # disp->ImageBase + mov 56($disp),%r11 # disp->HandlerData + + mov 0(%r11),%r10d # HandlerData[0] + lea (%rsi,%r10),%r10 # end of prologue label + cmp %r10,%rbx # context->Rip<end of prologue label + jb .Lcommon_seh_tail + + mov 152($context),%rax # pull context->Rsp + jmp .Lcommon_seh_tail +.size short_handler,.-short_handler + +.type full_handler,\@abi-omnipotent +.align 16 +full_handler: + push %rsi + push %rdi + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 + pushfq + sub \$64,%rsp + + mov 120($context),%rax # pull context->Rax + mov 248($context),%rbx # pull context->Rip + + mov 8($disp),%rsi # disp->ImageBase + mov 56($disp),%r11 # disp->HandlerData + + mov 0(%r11),%r10d # HandlerData[0] + lea (%rsi,%r10),%r10 # end of prologue label + cmp %r10,%rbx # context->Rip<end of prologue label + jb .Lcommon_seh_tail + + mov 152($context),%rax # pull context->Rsp + + mov 4(%r11),%r10d # HandlerData[1] + lea (%rsi,%r10),%r10 # epilogue label + cmp %r10,%rbx # context->Rip>=epilogue label + jae .Lcommon_seh_tail + + mov 8(%r11),%r10d # HandlerData[2] + lea (%rax,%r10),%rax + + mov -8(%rax),%rbp + mov -16(%rax),%rbx + mov -24(%rax),%r12 + mov -32(%rax),%r13 + mov -40(%rax),%r14 + mov -48(%rax),%r15 + mov %rbx,144($context) # restore context->Rbx + mov %rbp,160($context) # restore context->Rbp + mov %r12,216($context) # 
restore context->R12 + mov %r13,224($context) # restore context->R13 + mov %r14,232($context) # restore context->R14 + mov %r15,240($context) # restore context->R15 + +.Lcommon_seh_tail: + mov 8(%rax),%rdi + mov 16(%rax),%rsi + mov %rax,152($context) # restore context->Rsp + mov %rsi,168($context) # restore context->Rsi + mov %rdi,176($context) # restore context->Rdi + + mov 40($disp),%rdi # disp->ContextRecord + mov $context,%rsi # context + mov \$154,%ecx # sizeof(CONTEXT) + .long 0xa548f3fc # cld; rep movsq + + mov $disp,%rsi + xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER + mov 8(%rsi),%rdx # arg2, disp->ImageBase + mov 0(%rsi),%r8 # arg3, disp->ControlPc + mov 16(%rsi),%r9 # arg4, disp->FunctionEntry + mov 40(%rsi),%r10 # disp->ContextRecord + lea 56(%rsi),%r11 # &disp->HandlerData + lea 24(%rsi),%r12 # &disp->EstablisherFrame + mov %r10,32(%rsp) # arg5 + mov %r11,40(%rsp) # arg6 + mov %r12,48(%rsp) # arg7 + mov %rcx,56(%rsp) # arg8, (NULL) + call *__imp_RtlVirtualUnwind(%rip) + + mov \$1,%eax # ExceptionContinueSearch + add \$64,%rsp + popfq + pop %r15 + pop %r14 + pop %r13 + pop %r12 + pop %rbp + pop %rbx + pop %rdi + pop %rsi + ret +.size full_handler,.-full_handler + +.section .pdata +.align 4 + .rva .LSEH_begin_x25519_fe51_mul + .rva .LSEH_end_x25519_fe51_mul + .rva .LSEH_info_x25519_fe51_mul + + .rva .LSEH_begin_x25519_fe51_sqr + .rva .LSEH_end_x25519_fe51_sqr + .rva .LSEH_info_x25519_fe51_sqr + + .rva .LSEH_begin_x25519_fe51_mul121666 + .rva .LSEH_end_x25519_fe51_mul121666 + .rva .LSEH_info_x25519_fe51_mul121666 +___ +$code.=<<___ if ($addx); + .rva .LSEH_begin_x25519_fe64_mul + .rva .LSEH_end_x25519_fe64_mul + .rva .LSEH_info_x25519_fe64_mul + + .rva .LSEH_begin_x25519_fe64_sqr + .rva .LSEH_end_x25519_fe64_sqr + .rva .LSEH_info_x25519_fe64_sqr + + .rva .LSEH_begin_x25519_fe64_mul121666 + .rva .LSEH_end_x25519_fe64_mul121666 + .rva .LSEH_info_x25519_fe64_mul121666 + + .rva .LSEH_begin_x25519_fe64_add + .rva .LSEH_end_x25519_fe64_add + .rva .LSEH_info_x25519_fe64_add + + .rva .LSEH_begin_x25519_fe64_sub + .rva .LSEH_end_x25519_fe64_sub + .rva .LSEH_info_x25519_fe64_sub + + .rva .LSEH_begin_x25519_fe64_tobytes + .rva .LSEH_end_x25519_fe64_tobytes + .rva .LSEH_info_x25519_fe64_tobytes +___ +$code.=<<___; +.section .xdata +.align 8 +.LSEH_info_x25519_fe51_mul: + .byte 9,0,0,0 + .rva full_handler + .rva .Lfe51_mul_body,.Lfe51_mul_epilogue # HandlerData[] + .long 88,0 +.LSEH_info_x25519_fe51_sqr: + .byte 9,0,0,0 + .rva full_handler + .rva .Lfe51_sqr_body,.Lfe51_sqr_epilogue # HandlerData[] + .long 88,0 +.LSEH_info_x25519_fe51_mul121666: + .byte 9,0,0,0 + .rva full_handler + .rva .Lfe51_mul121666_body,.Lfe51_mul121666_epilogue # HandlerData[] + .long 88,0 +___ +$code.=<<___ if ($addx); +.LSEH_info_x25519_fe64_mul: + .byte 9,0,0,0 + .rva full_handler + .rva .Lfe64_mul_body,.Lfe64_mul_epilogue # HandlerData[] + .long 72,0 +.LSEH_info_x25519_fe64_sqr: + .byte 9,0,0,0 + .rva full_handler + .rva .Lfe64_sqr_body,.Lfe64_sqr_epilogue # HandlerData[] + .long 72,0 +.LSEH_info_x25519_fe64_mul121666: + .byte 9,0,0,0 + .rva short_handler + .rva .Lfe64_mul121666_body,.Lfe64_mul121666_epilogue # HandlerData[] +.LSEH_info_x25519_fe64_add: + .byte 9,0,0,0 + .rva short_handler + .rva .Lfe64_add_body,.Lfe64_add_epilogue # HandlerData[] +.LSEH_info_x25519_fe64_sub: + .byte 9,0,0,0 + .rva short_handler + .rva .Lfe64_sub_body,.Lfe64_sub_epilogue # HandlerData[] +.LSEH_info_x25519_fe64_tobytes: + .byte 9,0,0,0 + .rva short_handler + .rva .Lfe64_to_body,.Lfe64_to_epilogue # HandlerData[] +___ +} + +$code =~ 
s/\`([^\`]*)\`/eval $1/gem; +print $code; +close STDOUT; diff --git a/crypto/ec/build.info b/crypto/ec/build.info new file mode 100644 index 000000000000..a1e673e347d0 --- /dev/null +++ b/crypto/ec/build.info @@ -0,0 +1,42 @@ +LIBS=../../libcrypto +SOURCE[../../libcrypto]=\ + ec_lib.c ecp_smpl.c ecp_mont.c ecp_nist.c ec_cvt.c ec_mult.c \ + ec_err.c ec_curve.c ec_check.c ec_print.c ec_asn1.c ec_key.c \ + ec2_smpl.c ec_ameth.c ec_pmeth.c eck_prn.c \ + ecp_nistp224.c ecp_nistp256.c ecp_nistp521.c ecp_nistputil.c \ + ecp_oct.c ec2_oct.c ec_oct.c ec_kmeth.c ecdh_ossl.c ecdh_kdf.c \ + ecdsa_ossl.c ecdsa_sign.c ecdsa_vrf.c curve25519.c ecx_meth.c \ + curve448/arch_32/f_impl.c curve448/f_generic.c curve448/scalar.c \ + curve448/curve448_tables.c curve448/eddsa.c curve448/curve448.c \ + {- $target{ec_asm_src} -} + +GENERATE[ecp_nistz256-x86.s]=asm/ecp_nistz256-x86.pl \ + $(PERLASM_SCHEME) $(LIB_CFLAGS) $(LIB_CPPFLAGS) $(PROCESSOR) + +GENERATE[ecp_nistz256-x86_64.s]=asm/ecp_nistz256-x86_64.pl $(PERLASM_SCHEME) + +GENERATE[ecp_nistz256-avx2.s]=asm/ecp_nistz256-avx2.pl $(PERLASM_SCHEME) + +GENERATE[ecp_nistz256-sparcv9.S]=asm/ecp_nistz256-sparcv9.pl $(PERLASM_SCHEME) +INCLUDE[ecp_nistz256-sparcv9.o]=.. + +GENERATE[ecp_nistz256-armv4.S]=asm/ecp_nistz256-armv4.pl $(PERLASM_SCHEME) +INCLUDE[ecp_nistz256-armv4.o]=.. +GENERATE[ecp_nistz256-armv8.S]=asm/ecp_nistz256-armv8.pl $(PERLASM_SCHEME) +INCLUDE[ecp_nistz256-armv8.o]=.. +GENERATE[ecp_nistz256-ppc64.s]=asm/ecp_nistz256-ppc64.pl $(PERLASM_SCHEME) + +GENERATE[x25519-x86_64.s]=asm/x25519-x86_64.pl $(PERLASM_SCHEME) +GENERATE[x25519-ppc64.s]=asm/x25519-ppc64.pl $(PERLASM_SCHEME) + +BEGINRAW[Makefile] +{- $builddir -}/ecp_nistz256-%.S: {- $sourcedir -}/asm/ecp_nistz256-%.pl + CC="$(CC)" $(PERL) $< $(PERLASM_SCHEME) $@ +ENDRAW[Makefile] + +INCLUDE[curve448/arch_32/f_impl.o]=curve448/arch_32 curve448 +INCLUDE[curve448/f_generic.o]=curve448/arch_32 curve448 +INCLUDE[curve448/scalar.o]=curve448/arch_32 curve448 +INCLUDE[curve448/curve448_tables.o]=curve448/arch_32 curve448 +INCLUDE[curve448/eddsa.o]=curve448/arch_32 curve448 +INCLUDE[curve448/curve448.o]=curve448/arch_32 curve448 diff --git a/crypto/ec/curve25519.c b/crypto/ec/curve25519.c new file mode 100644 index 000000000000..abe9b9cbf6dd --- /dev/null +++ b/crypto/ec/curve25519.c @@ -0,0 +1,5457 @@ +/* + * Copyright 2016-2018 The OpenSSL Project Authors. All Rights Reserved. + * + * Licensed under the OpenSSL license (the "License"). You may not use + * this file except in compliance with the License. You can obtain a copy + * in the file LICENSE in the source distribution or at + * https://www.openssl.org/source/license.html + */ + +#include <string.h> +#include "ec_lcl.h" +#include <openssl/sha.h> + +#if defined(X25519_ASM) && (defined(__x86_64) || defined(__x86_64__) || \ + defined(_M_AMD64) || defined(_M_X64)) + +# define BASE_2_64_IMPLEMENTED + +typedef uint64_t fe64[4]; + +int x25519_fe64_eligible(void); + +/* + * Following subroutines perform corresponding operations modulo + * 2^256-38, i.e. double the curve modulus. However, inputs and + * outputs are permitted to be partially reduced, i.e. to remain + * in [0..2^256) range. It's all tied up in final fe64_tobytes + * that performs full reduction modulo 2^255-19. + * + * There are no reference C implementations for these. 
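+ *
+ * x25519_fe64_eligible() reports at run time whether this base 2^64
+ * code may be used; the fallback assembly stub simply returns zero
+ * (see asm/x25519-x86_64.pl), in which case the base 2^51 or the
+ * reference code below is taken instead.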
+ */ +void x25519_fe64_mul(fe64 h, const fe64 f, const fe64 g); +void x25519_fe64_sqr(fe64 h, const fe64 f); +void x25519_fe64_mul121666(fe64 h, fe64 f); +void x25519_fe64_add(fe64 h, const fe64 f, const fe64 g); +void x25519_fe64_sub(fe64 h, const fe64 f, const fe64 g); +void x25519_fe64_tobytes(uint8_t *s, const fe64 f); +# define fe64_mul x25519_fe64_mul +# define fe64_sqr x25519_fe64_sqr +# define fe64_mul121666 x25519_fe64_mul121666 +# define fe64_add x25519_fe64_add +# define fe64_sub x25519_fe64_sub +# define fe64_tobytes x25519_fe64_tobytes + +static uint64_t load_8(const uint8_t *in) +{ + uint64_t result; + + result = in[0]; + result |= ((uint64_t)in[1]) << 8; + result |= ((uint64_t)in[2]) << 16; + result |= ((uint64_t)in[3]) << 24; + result |= ((uint64_t)in[4]) << 32; + result |= ((uint64_t)in[5]) << 40; + result |= ((uint64_t)in[6]) << 48; + result |= ((uint64_t)in[7]) << 56; + + return result; +} + +static void fe64_frombytes(fe64 h, const uint8_t *s) +{ + h[0] = load_8(s); + h[1] = load_8(s + 8); + h[2] = load_8(s + 16); + h[3] = load_8(s + 24) & 0x7fffffffffffffff; +} + +static void fe64_0(fe64 h) +{ + h[0] = 0; + h[1] = 0; + h[2] = 0; + h[3] = 0; +} + +static void fe64_1(fe64 h) +{ + h[0] = 1; + h[1] = 0; + h[2] = 0; + h[3] = 0; +} + +static void fe64_copy(fe64 h, const fe64 f) +{ + h[0] = f[0]; + h[1] = f[1]; + h[2] = f[2]; + h[3] = f[3]; +} + +static void fe64_cswap(fe64 f, fe64 g, unsigned int b) +{ + int i; + uint64_t mask = 0 - (uint64_t)b; + + for (i = 0; i < 4; i++) { + uint64_t x = f[i] ^ g[i]; + x &= mask; + f[i] ^= x; + g[i] ^= x; + } +} + +static void fe64_invert(fe64 out, const fe64 z) +{ + fe64 t0; + fe64 t1; + fe64 t2; + fe64 t3; + int i; + + /* + * Compute z ** -1 = z ** (2 ** 255 - 19 - 2) with the exponent as + * 2 ** 255 - 21 = (2 ** 5) * (2 ** 250 - 1) + 11. + */ + + /* t0 = z ** 2 */ + fe64_sqr(t0, z); + + /* t1 = t0 ** (2 ** 2) = z ** 8 */ + fe64_sqr(t1, t0); + fe64_sqr(t1, t1); + + /* t1 = z * t1 = z ** 9 */ + fe64_mul(t1, z, t1); + /* t0 = t0 * t1 = z ** 11 -- stash t0 away for the end. */ + fe64_mul(t0, t0, t1); + + /* t2 = t0 ** 2 = z ** 22 */ + fe64_sqr(t2, t0); + + /* t1 = t1 * t2 = z ** (2 ** 5 - 1) */ + fe64_mul(t1, t1, t2); + + /* t2 = t1 ** (2 ** 5) = z ** ((2 ** 5) * (2 ** 5 - 1)) */ + fe64_sqr(t2, t1); + for (i = 1; i < 5; ++i) + fe64_sqr(t2, t2); + + /* t1 = t1 * t2 = z ** ((2 ** 5 + 1) * (2 ** 5 - 1)) = z ** (2 ** 10 - 1) */ + fe64_mul(t1, t2, t1); + + /* Continuing similarly... 
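+ * The remaining steps raise z to 2 ** 20 - 1, 2 ** 40 - 1, 2 ** 50 - 1,
+ * 2 ** 100 - 1, 2 ** 200 - 1 and finally 2 ** 250 - 1 by alternating
+ * runs of squarings with multiplications, then square five more times
+ * and multiply by the stashed z ** 11 to obtain z ** (2 ** 255 - 21).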
*/ + + /* t2 = z ** (2 ** 20 - 1) */ + fe64_sqr(t2, t1); + for (i = 1; i < 10; ++i) + fe64_sqr(t2, t2); + + fe64_mul(t2, t2, t1); + + /* t2 = z ** (2 ** 40 - 1) */ + fe64_sqr(t3, t2); + for (i = 1; i < 20; ++i) + fe64_sqr(t3, t3); + + fe64_mul(t2, t3, t2); + + /* t2 = z ** (2 ** 10) * (2 ** 40 - 1) */ + for (i = 0; i < 10; ++i) + fe64_sqr(t2, t2); + + /* t1 = z ** (2 ** 50 - 1) */ + fe64_mul(t1, t2, t1); + + /* t2 = z ** (2 ** 100 - 1) */ + fe64_sqr(t2, t1); + for (i = 1; i < 50; ++i) + fe64_sqr(t2, t2); + + fe64_mul(t2, t2, t1); + + /* t2 = z ** (2 ** 200 - 1) */ + fe64_sqr(t3, t2); + for (i = 1; i < 100; ++i) + fe64_sqr(t3, t3); + + fe64_mul(t2, t3, t2); + + /* t2 = z ** ((2 ** 50) * (2 ** 200 - 1) */ + for (i = 0; i < 50; ++i) + fe64_sqr(t2, t2); + + /* t1 = z ** (2 ** 250 - 1) */ + fe64_mul(t1, t2, t1); + + /* t1 = z ** ((2 ** 5) * (2 ** 250 - 1)) */ + for (i = 0; i < 5; ++i) + fe64_sqr(t1, t1); + + /* Recall t0 = z ** 11; out = z ** (2 ** 255 - 21) */ + fe64_mul(out, t1, t0); +} + +/* + * Duplicate of original x25519_scalar_mult_generic, but using + * fe64_* subroutines. + */ +static void x25519_scalar_mulx(uint8_t out[32], const uint8_t scalar[32], + const uint8_t point[32]) +{ + fe64 x1, x2, z2, x3, z3, tmp0, tmp1; + uint8_t e[32]; + unsigned swap = 0; + int pos; + + memcpy(e, scalar, 32); + e[0] &= 0xf8; + e[31] &= 0x7f; + e[31] |= 0x40; + fe64_frombytes(x1, point); + fe64_1(x2); + fe64_0(z2); + fe64_copy(x3, x1); + fe64_1(z3); + + for (pos = 254; pos >= 0; --pos) { + unsigned int b = 1 & (e[pos / 8] >> (pos & 7)); + + swap ^= b; + fe64_cswap(x2, x3, swap); + fe64_cswap(z2, z3, swap); + swap = b; + fe64_sub(tmp0, x3, z3); + fe64_sub(tmp1, x2, z2); + fe64_add(x2, x2, z2); + fe64_add(z2, x3, z3); + fe64_mul(z3, x2, tmp0); + fe64_mul(z2, z2, tmp1); + fe64_sqr(tmp0, tmp1); + fe64_sqr(tmp1, x2); + fe64_add(x3, z3, z2); + fe64_sub(z2, z3, z2); + fe64_mul(x2, tmp1, tmp0); + fe64_sub(tmp1, tmp1, tmp0); + fe64_sqr(z2, z2); + fe64_mul121666(z3, tmp1); + fe64_sqr(x3, x3); + fe64_add(tmp0, tmp0, z3); + fe64_mul(z3, x1, z2); + fe64_mul(z2, tmp1, tmp0); + } + + fe64_invert(z2, z2); + fe64_mul(x2, x2, z2); + fe64_tobytes(out, x2); + + OPENSSL_cleanse(e, sizeof(e)); +} +#endif + +#if defined(X25519_ASM) \ + || ( (defined(__SIZEOF_INT128__) && __SIZEOF_INT128__ == 16) \ + && !defined(__sparc__) \ + && !(defined(__ANDROID__) && !defined(__clang__)) ) +/* + * Base 2^51 implementation. It's virtually no different from reference + * base 2^25.5 implementation in respect to lax boundary conditions for + * intermediate values and even individual limbs. So that whatever you + * know about the reference, applies even here... 
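+ * A field element t[0..4] represents the integer
+ * t[0] + 2^51 t[1] + 2^102 t[2] + 2^153 t[3] + 2^204 t[4];
+ * carries that overflow the top limb are folded back into t[0]
+ * with a factor of 19, since 2^255 = 19 (mod 2^255 - 19).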
+ */ +# define BASE_2_51_IMPLEMENTED + +typedef uint64_t fe51[5]; + +static const uint64_t MASK51 = 0x7ffffffffffff; + +static uint64_t load_7(const uint8_t *in) +{ + uint64_t result; + + result = in[0]; + result |= ((uint64_t)in[1]) << 8; + result |= ((uint64_t)in[2]) << 16; + result |= ((uint64_t)in[3]) << 24; + result |= ((uint64_t)in[4]) << 32; + result |= ((uint64_t)in[5]) << 40; + result |= ((uint64_t)in[6]) << 48; + + return result; +} + +static uint64_t load_6(const uint8_t *in) +{ + uint64_t result; + + result = in[0]; + result |= ((uint64_t)in[1]) << 8; + result |= ((uint64_t)in[2]) << 16; + result |= ((uint64_t)in[3]) << 24; + result |= ((uint64_t)in[4]) << 32; + result |= ((uint64_t)in[5]) << 40; + + return result; +} + +static void fe51_frombytes(fe51 h, const uint8_t *s) +{ + uint64_t h0 = load_7(s); /* 56 bits */ + uint64_t h1 = load_6(s + 7) << 5; /* 53 bits */ + uint64_t h2 = load_7(s + 13) << 2; /* 58 bits */ + uint64_t h3 = load_6(s + 20) << 7; /* 55 bits */ + uint64_t h4 = (load_6(s + 26) & 0x7fffffffffff) << 4; /* 51 bits */ + + h1 |= h0 >> 51; h0 &= MASK51; + h2 |= h1 >> 51; h1 &= MASK51; + h3 |= h2 >> 51; h2 &= MASK51; + h4 |= h3 >> 51; h3 &= MASK51; + + h[0] = h0; + h[1] = h1; + h[2] = h2; + h[3] = h3; + h[4] = h4; +} + +static void fe51_tobytes(uint8_t *s, const fe51 h) +{ + uint64_t h0 = h[0]; + uint64_t h1 = h[1]; + uint64_t h2 = h[2]; + uint64_t h3 = h[3]; + uint64_t h4 = h[4]; + uint64_t q; + + /* compare to modulus */ + q = (h0 + 19) >> 51; + q = (h1 + q) >> 51; + q = (h2 + q) >> 51; + q = (h3 + q) >> 51; + q = (h4 + q) >> 51; + + /* full reduce */ + h0 += 19 * q; + h1 += h0 >> 51; h0 &= MASK51; + h2 += h1 >> 51; h1 &= MASK51; + h3 += h2 >> 51; h2 &= MASK51; + h4 += h3 >> 51; h3 &= MASK51; + h4 &= MASK51; + + /* smash */ + s[0] = (uint8_t)(h0 >> 0); + s[1] = (uint8_t)(h0 >> 8); + s[2] = (uint8_t)(h0 >> 16); + s[3] = (uint8_t)(h0 >> 24); + s[4] = (uint8_t)(h0 >> 32); + s[5] = (uint8_t)(h0 >> 40); + s[6] = (uint8_t)((h0 >> 48) | ((uint32_t)h1 << 3)); + s[7] = (uint8_t)(h1 >> 5); + s[8] = (uint8_t)(h1 >> 13); + s[9] = (uint8_t)(h1 >> 21); + s[10] = (uint8_t)(h1 >> 29); + s[11] = (uint8_t)(h1 >> 37); + s[12] = (uint8_t)((h1 >> 45) | ((uint32_t)h2 << 6)); + s[13] = (uint8_t)(h2 >> 2); + s[14] = (uint8_t)(h2 >> 10); + s[15] = (uint8_t)(h2 >> 18); + s[16] = (uint8_t)(h2 >> 26); + s[17] = (uint8_t)(h2 >> 34); + s[18] = (uint8_t)(h2 >> 42); + s[19] = (uint8_t)((h2 >> 50) | ((uint32_t)h3 << 1)); + s[20] = (uint8_t)(h3 >> 7); + s[21] = (uint8_t)(h3 >> 15); + s[22] = (uint8_t)(h3 >> 23); + s[23] = (uint8_t)(h3 >> 31); + s[24] = (uint8_t)(h3 >> 39); + s[25] = (uint8_t)((h3 >> 47) | ((uint32_t)h4 << 4)); + s[26] = (uint8_t)(h4 >> 4); + s[27] = (uint8_t)(h4 >> 12); + s[28] = (uint8_t)(h4 >> 20); + s[29] = (uint8_t)(h4 >> 28); + s[30] = (uint8_t)(h4 >> 36); + s[31] = (uint8_t)(h4 >> 44); +} + +# if defined(X25519_ASM) +void x25519_fe51_mul(fe51 h, const fe51 f, const fe51 g); +void x25519_fe51_sqr(fe51 h, const fe51 f); +void x25519_fe51_mul121666(fe51 h, fe51 f); +# define fe51_mul x25519_fe51_mul +# define fe51_sq x25519_fe51_sqr +# define fe51_mul121666 x25519_fe51_mul121666 +# else + +typedef __uint128_t u128; + +static void fe51_mul(fe51 h, const fe51 f, const fe51 g) +{ + u128 h0, h1, h2, h3, h4; + uint64_t f_i, g0, g1, g2, g3, g4; + + f_i = f[0]; + h0 = (u128)f_i * (g0 = g[0]); + h1 = (u128)f_i * (g1 = g[1]); + h2 = (u128)f_i * (g2 = g[2]); + h3 = (u128)f_i * (g3 = g[3]); + h4 = (u128)f_i * (g4 = g[4]); + + f_i = f[1]; + h0 += (u128)f_i * (g4 *= 19); + h1 += (u128)f_i 
* g0; + h2 += (u128)f_i * g1; + h3 += (u128)f_i * g2; + h4 += (u128)f_i * g3; + + f_i = f[2]; + h0 += (u128)f_i * (g3 *= 19); + h1 += (u128)f_i * g4; + h2 += (u128)f_i * g0; + h3 += (u128)f_i * g1; + h4 += (u128)f_i * g2; + + f_i = f[3]; + h0 += (u128)f_i * (g2 *= 19); + h1 += (u128)f_i * g3; + h2 += (u128)f_i * g4; + h3 += (u128)f_i * g0; + h4 += (u128)f_i * g1; + + f_i = f[4]; + h0 += (u128)f_i * (g1 *= 19); + h1 += (u128)f_i * g2; + h2 += (u128)f_i * g3; + h3 += (u128)f_i * g4; + h4 += (u128)f_i * g0; + + /* partial [lazy] reduction */ + h3 += (uint64_t)(h2 >> 51); g2 = (uint64_t)h2 & MASK51; + h1 += (uint64_t)(h0 >> 51); g0 = (uint64_t)h0 & MASK51; + + h4 += (uint64_t)(h3 >> 51); g3 = (uint64_t)h3 & MASK51; + g2 += (uint64_t)(h1 >> 51); g1 = (uint64_t)h1 & MASK51; + + g0 += (uint64_t)(h4 >> 51) * 19; g4 = (uint64_t)h4 & MASK51; + g3 += g2 >> 51; g2 &= MASK51; + g1 += g0 >> 51; g0 &= MASK51; + + h[0] = g0; + h[1] = g1; + h[2] = g2; + h[3] = g3; + h[4] = g4; +} + +static void fe51_sq(fe51 h, const fe51 f) +{ +# if defined(OPENSSL_SMALL_FOOTPRINT) + fe51_mul(h, f, f); +# else + /* dedicated squaring gives 16-25% overall improvement */ + uint64_t g0 = f[0]; + uint64_t g1 = f[1]; + uint64_t g2 = f[2]; + uint64_t g3 = f[3]; + uint64_t g4 = f[4]; + u128 h0, h1, h2, h3, h4; + + h0 = (u128)g0 * g0; g0 *= 2; + h1 = (u128)g0 * g1; + h2 = (u128)g0 * g2; + h3 = (u128)g0 * g3; + h4 = (u128)g0 * g4; + + g0 = g4; /* borrow g0 */ + h3 += (u128)g0 * (g4 *= 19); + + h2 += (u128)g1 * g1; g1 *= 2; + h3 += (u128)g1 * g2; + h4 += (u128)g1 * g3; + h0 += (u128)g1 * g4; + + g0 = g3; /* borrow g0 */ + h1 += (u128)g0 * (g3 *= 19); + h2 += (u128)(g0 * 2) * g4; + + h4 += (u128)g2 * g2; g2 *= 2; + h0 += (u128)g2 * g3; + h1 += (u128)g2 * g4; + + /* partial [lazy] reduction */ + h3 += (uint64_t)(h2 >> 51); g2 = (uint64_t)h2 & MASK51; + h1 += (uint64_t)(h0 >> 51); g0 = (uint64_t)h0 & MASK51; + + h4 += (uint64_t)(h3 >> 51); g3 = (uint64_t)h3 & MASK51; + g2 += (uint64_t)(h1 >> 51); g1 = (uint64_t)h1 & MASK51; + + g0 += (uint64_t)(h4 >> 51) * 19; g4 = (uint64_t)h4 & MASK51; + g3 += g2 >> 51; g2 &= MASK51; + g1 += g0 >> 51; g0 &= MASK51; + + h[0] = g0; + h[1] = g1; + h[2] = g2; + h[3] = g3; + h[4] = g4; +# endif +} + +static void fe51_mul121666(fe51 h, fe51 f) +{ + u128 h0 = f[0] * (u128)121666; + u128 h1 = f[1] * (u128)121666; + u128 h2 = f[2] * (u128)121666; + u128 h3 = f[3] * (u128)121666; + u128 h4 = f[4] * (u128)121666; + uint64_t g0, g1, g2, g3, g4; + + h3 += (uint64_t)(h2 >> 51); g2 = (uint64_t)h2 & MASK51; + h1 += (uint64_t)(h0 >> 51); g0 = (uint64_t)h0 & MASK51; + + h4 += (uint64_t)(h3 >> 51); g3 = (uint64_t)h3 & MASK51; + g2 += (uint64_t)(h1 >> 51); g1 = (uint64_t)h1 & MASK51; + + g0 += (uint64_t)(h4 >> 51) * 19; g4 = (uint64_t)h4 & MASK51; + g3 += g2 >> 51; g2 &= MASK51; + g1 += g0 >> 51; g0 &= MASK51; + + h[0] = g0; + h[1] = g1; + h[2] = g2; + h[3] = g3; + h[4] = g4; +} +# endif + +static void fe51_add(fe51 h, const fe51 f, const fe51 g) +{ + h[0] = f[0] + g[0]; + h[1] = f[1] + g[1]; + h[2] = f[2] + g[2]; + h[3] = f[3] + g[3]; + h[4] = f[4] + g[4]; +} + +static void fe51_sub(fe51 h, const fe51 f, const fe51 g) +{ + /* + * Add 2*modulus to ensure that result remains positive + * even if subtrahend is partially reduced. 
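+ * The constants are 2*(2^51-19) = 0xfffffffffffda for the low limb and
+ * 2*(2^51-1) = 0xffffffffffffe for the others, i.e. 2*(2^255-19)
+ * expressed in radix 2^51.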
+ */ + h[0] = (f[0] + 0xfffffffffffda) - g[0]; + h[1] = (f[1] + 0xffffffffffffe) - g[1]; + h[2] = (f[2] + 0xffffffffffffe) - g[2]; + h[3] = (f[3] + 0xffffffffffffe) - g[3]; + h[4] = (f[4] + 0xffffffffffffe) - g[4]; +} + +static void fe51_0(fe51 h) +{ + h[0] = 0; + h[1] = 0; + h[2] = 0; + h[3] = 0; + h[4] = 0; +} + +static void fe51_1(fe51 h) +{ + h[0] = 1; + h[1] = 0; + h[2] = 0; + h[3] = 0; + h[4] = 0; +} + +static void fe51_copy(fe51 h, const fe51 f) +{ + h[0] = f[0]; + h[1] = f[1]; + h[2] = f[2]; + h[3] = f[3]; + h[4] = f[4]; +} + +static void fe51_cswap(fe51 f, fe51 g, unsigned int b) +{ + int i; + uint64_t mask = 0 - (uint64_t)b; + + for (i = 0; i < 5; i++) { + int64_t x = f[i] ^ g[i]; + x &= mask; + f[i] ^= x; + g[i] ^= x; + } +} + +static void fe51_invert(fe51 out, const fe51 z) +{ + fe51 t0; + fe51 t1; + fe51 t2; + fe51 t3; + int i; + + /* + * Compute z ** -1 = z ** (2 ** 255 - 19 - 2) with the exponent as + * 2 ** 255 - 21 = (2 ** 5) * (2 ** 250 - 1) + 11. + */ + + /* t0 = z ** 2 */ + fe51_sq(t0, z); + + /* t1 = t0 ** (2 ** 2) = z ** 8 */ + fe51_sq(t1, t0); + fe51_sq(t1, t1); + + /* t1 = z * t1 = z ** 9 */ + fe51_mul(t1, z, t1); + /* t0 = t0 * t1 = z ** 11 -- stash t0 away for the end. */ + fe51_mul(t0, t0, t1); + + /* t2 = t0 ** 2 = z ** 22 */ + fe51_sq(t2, t0); + + /* t1 = t1 * t2 = z ** (2 ** 5 - 1) */ + fe51_mul(t1, t1, t2); + + /* t2 = t1 ** (2 ** 5) = z ** ((2 ** 5) * (2 ** 5 - 1)) */ + fe51_sq(t2, t1); + for (i = 1; i < 5; ++i) + fe51_sq(t2, t2); + + /* t1 = t1 * t2 = z ** ((2 ** 5 + 1) * (2 ** 5 - 1)) = z ** (2 ** 10 - 1) */ + fe51_mul(t1, t2, t1); + + /* Continuing similarly... */ + + /* t2 = z ** (2 ** 20 - 1) */ + fe51_sq(t2, t1); + for (i = 1; i < 10; ++i) + fe51_sq(t2, t2); + + fe51_mul(t2, t2, t1); + + /* t2 = z ** (2 ** 40 - 1) */ + fe51_sq(t3, t2); + for (i = 1; i < 20; ++i) + fe51_sq(t3, t3); + + fe51_mul(t2, t3, t2); + + /* t2 = z ** (2 ** 10) * (2 ** 40 - 1) */ + for (i = 0; i < 10; ++i) + fe51_sq(t2, t2); + + /* t1 = z ** (2 ** 50 - 1) */ + fe51_mul(t1, t2, t1); + + /* t2 = z ** (2 ** 100 - 1) */ + fe51_sq(t2, t1); + for (i = 1; i < 50; ++i) + fe51_sq(t2, t2); + + fe51_mul(t2, t2, t1); + + /* t2 = z ** (2 ** 200 - 1) */ + fe51_sq(t3, t2); + for (i = 1; i < 100; ++i) + fe51_sq(t3, t3); + + fe51_mul(t2, t3, t2); + + /* t2 = z ** ((2 ** 50) * (2 ** 200 - 1) */ + for (i = 0; i < 50; ++i) + fe51_sq(t2, t2); + + /* t1 = z ** (2 ** 250 - 1) */ + fe51_mul(t1, t2, t1); + + /* t1 = z ** ((2 ** 5) * (2 ** 250 - 1)) */ + for (i = 0; i < 5; ++i) + fe51_sq(t1, t1); + + /* Recall t0 = z ** 11; out = z ** (2 ** 255 - 21) */ + fe51_mul(out, t1, t0); +} + +/* + * Duplicate of original x25519_scalar_mult_generic, but using + * fe51_* subroutines. 
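+ *
+ * This is the constant-time Montgomery ladder: the scalar is clamped,
+ * then 255 ladder steps are performed with conditional swaps driven by
+ * the scalar bits, and the result is recovered with one field inversion
+ * followed by serialization. When the base 2^64 code is eligible the
+ * call is forwarded to x25519_scalar_mulx() instead.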
+ */ +static void x25519_scalar_mult(uint8_t out[32], const uint8_t scalar[32], + const uint8_t point[32]) +{ + fe51 x1, x2, z2, x3, z3, tmp0, tmp1; + uint8_t e[32]; + unsigned swap = 0; + int pos; + +# ifdef BASE_2_64_IMPLEMENTED + if (x25519_fe64_eligible()) { + x25519_scalar_mulx(out, scalar, point); + return; + } +# endif + + memcpy(e, scalar, 32); + e[0] &= 0xf8; + e[31] &= 0x7f; + e[31] |= 0x40; + fe51_frombytes(x1, point); + fe51_1(x2); + fe51_0(z2); + fe51_copy(x3, x1); + fe51_1(z3); + + for (pos = 254; pos >= 0; --pos) { + unsigned int b = 1 & (e[pos / 8] >> (pos & 7)); + + swap ^= b; + fe51_cswap(x2, x3, swap); + fe51_cswap(z2, z3, swap); + swap = b; + fe51_sub(tmp0, x3, z3); + fe51_sub(tmp1, x2, z2); + fe51_add(x2, x2, z2); + fe51_add(z2, x3, z3); + fe51_mul(z3, tmp0, x2); + fe51_mul(z2, z2, tmp1); + fe51_sq(tmp0, tmp1); + fe51_sq(tmp1, x2); + fe51_add(x3, z3, z2); + fe51_sub(z2, z3, z2); + fe51_mul(x2, tmp1, tmp0); + fe51_sub(tmp1, tmp1, tmp0); + fe51_sq(z2, z2); + fe51_mul121666(z3, tmp1); + fe51_sq(x3, x3); + fe51_add(tmp0, tmp0, z3); + fe51_mul(z3, x1, z2); + fe51_mul(z2, tmp1, tmp0); + } + + fe51_invert(z2, z2); + fe51_mul(x2, x2, z2); + fe51_tobytes(out, x2); + + OPENSSL_cleanse(e, sizeof(e)); +} +#endif + +/* + * Reference base 2^25.5 implementation. + */ +/* + * This code is mostly taken from the ref10 version of Ed25519 in SUPERCOP + * 20141124 (http://bench.cr.yp.to/supercop.html). + * + * The field functions are shared by Ed25519 and X25519 where possible. + */ + +/* fe means field element. Here the field is \Z/(2^255-19). An element t, + * entries t[0]...t[9], represents the integer t[0]+2^26 t[1]+2^51 t[2]+2^77 + * t[3]+2^102 t[4]+...+2^230 t[9]. Bounds on each t[i] vary depending on + * context. */ +typedef int32_t fe[10]; + +static const int64_t kBottom25Bits = 0x1ffffffLL; +static const int64_t kBottom26Bits = 0x3ffffffLL; +static const int64_t kTop39Bits = 0xfffffffffe000000LL; +static const int64_t kTop38Bits = 0xfffffffffc000000LL; + +static uint64_t load_3(const uint8_t *in) { + uint64_t result; + result = (uint64_t)in[0]; + result |= ((uint64_t)in[1]) << 8; + result |= ((uint64_t)in[2]) << 16; + return result; +} + +static uint64_t load_4(const uint8_t *in) { + uint64_t result; + result = (uint64_t)in[0]; + result |= ((uint64_t)in[1]) << 8; + result |= ((uint64_t)in[2]) << 16; + result |= ((uint64_t)in[3]) << 24; + return result; +} + +static void fe_frombytes(fe h, const uint8_t *s) { + /* Ignores top bit of h. 
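+ * Bit 255 of the encoding is dropped by the "& 8388607" mask
+ * (8388607 = 2^23 - 1); the carry chain below then spreads the value
+ * over ten limbs of alternating 26 and 25 bits.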
*/ + int64_t h0 = load_4(s); + int64_t h1 = load_3(s + 4) << 6; + int64_t h2 = load_3(s + 7) << 5; + int64_t h3 = load_3(s + 10) << 3; + int64_t h4 = load_3(s + 13) << 2; + int64_t h5 = load_4(s + 16); + int64_t h6 = load_3(s + 20) << 7; + int64_t h7 = load_3(s + 23) << 5; + int64_t h8 = load_3(s + 26) << 4; + int64_t h9 = (load_3(s + 29) & 8388607) << 2; + int64_t carry0; + int64_t carry1; + int64_t carry2; + int64_t carry3; + int64_t carry4; + int64_t carry5; + int64_t carry6; + int64_t carry7; + int64_t carry8; + int64_t carry9; + + carry9 = h9 + (1 << 24); h0 += (carry9 >> 25) * 19; h9 -= carry9 & kTop39Bits; + carry1 = h1 + (1 << 24); h2 += carry1 >> 25; h1 -= carry1 & kTop39Bits; + carry3 = h3 + (1 << 24); h4 += carry3 >> 25; h3 -= carry3 & kTop39Bits; + carry5 = h5 + (1 << 24); h6 += carry5 >> 25; h5 -= carry5 & kTop39Bits; + carry7 = h7 + (1 << 24); h8 += carry7 >> 25; h7 -= carry7 & kTop39Bits; + + carry0 = h0 + (1 << 25); h1 += carry0 >> 26; h0 -= carry0 & kTop38Bits; + carry2 = h2 + (1 << 25); h3 += carry2 >> 26; h2 -= carry2 & kTop38Bits; + carry4 = h4 + (1 << 25); h5 += carry4 >> 26; h4 -= carry4 & kTop38Bits; + carry6 = h6 + (1 << 25); h7 += carry6 >> 26; h6 -= carry6 & kTop38Bits; + carry8 = h8 + (1 << 25); h9 += carry8 >> 26; h8 -= carry8 & kTop38Bits; + + h[0] = (int32_t)h0; + h[1] = (int32_t)h1; + h[2] = (int32_t)h2; + h[3] = (int32_t)h3; + h[4] = (int32_t)h4; + h[5] = (int32_t)h5; + h[6] = (int32_t)h6; + h[7] = (int32_t)h7; + h[8] = (int32_t)h8; + h[9] = (int32_t)h9; +} + +/* Preconditions: + * |h| bounded by 1.1*2^26,1.1*2^25,1.1*2^26,1.1*2^25,etc. + * + * Write p=2^255-19; q=floor(h/p). + * Basic claim: q = floor(2^(-255)(h + 19 2^(-25)h9 + 2^(-1))). + * + * Proof: + * Have |h|<=p so |q|<=1 so |19^2 2^(-255) q|<1/4. + * Also have |h-2^230 h9|<2^231 so |19 2^(-255)(h-2^230 h9)|<1/4. + * + * Write y=2^(-1)-19^2 2^(-255)q-19 2^(-255)(h-2^230 h9). + * Then 0<y<1. + * + * Write r=h-pq. + * Have 0<=r<=p-1=2^255-20. + * Thus 0<=r+19(2^-255)r<r+19(2^-255)2^255<=2^255-1. + * + * Write x=r+19(2^-255)r+y. + * Then 0<x<2^255 so floor(2^(-255)x) = 0 so floor(q+2^(-255)x) = q. + * + * Have q+2^(-255)x = 2^(-255)(h + 19 2^(-25) h9 + 2^(-1)) + * so floor(2^(-255)(h + 19 2^(-25) h9 + 2^(-1))) = q. */ +static void fe_tobytes(uint8_t *s, const fe h) { + int32_t h0 = h[0]; + int32_t h1 = h[1]; + int32_t h2 = h[2]; + int32_t h3 = h[3]; + int32_t h4 = h[4]; + int32_t h5 = h[5]; + int32_t h6 = h[6]; + int32_t h7 = h[7]; + int32_t h8 = h[8]; + int32_t h9 = h[9]; + int32_t q; + + q = (19 * h9 + (((int32_t) 1) << 24)) >> 25; + q = (h0 + q) >> 26; + q = (h1 + q) >> 25; + q = (h2 + q) >> 26; + q = (h3 + q) >> 25; + q = (h4 + q) >> 26; + q = (h5 + q) >> 25; + q = (h6 + q) >> 26; + q = (h7 + q) >> 25; + q = (h8 + q) >> 26; + q = (h9 + q) >> 25; + + /* Goal: Output h-(2^255-19)q, which is between 0 and 2^255-20. */ + h0 += 19 * q; + /* Goal: Output h-2^255 q, which is between 0 and 2^255-20. */ + + h1 += h0 >> 26; h0 &= kBottom26Bits; + h2 += h1 >> 25; h1 &= kBottom25Bits; + h3 += h2 >> 26; h2 &= kBottom26Bits; + h4 += h3 >> 25; h3 &= kBottom25Bits; + h5 += h4 >> 26; h4 &= kBottom26Bits; + h6 += h5 >> 25; h5 &= kBottom25Bits; + h7 += h6 >> 26; h6 &= kBottom26Bits; + h8 += h7 >> 25; h7 &= kBottom25Bits; + h9 += h8 >> 26; h8 &= kBottom26Bits; + h9 &= kBottom25Bits; + /* h10 = carry9 */ + + /* Goal: Output h0+...+2^255 h10-2^255 q, which is between 0 and 2^255-20. + * Have h0+...+2^230 h9 between 0 and 2^255-1; + * evidently 2^255 h10-2^255 q = 0. + * Goal: Output h0+...+2^230 h9. 
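+ *
+ * The stores below pack the ten 25/26-bit limbs little-endian into
+ * 32 bytes; where a limb boundary is not byte-aligned, adjacent limbs
+ * share a byte.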
*/ + + s[0] = (uint8_t)(h0 >> 0); + s[1] = (uint8_t)(h0 >> 8); + s[2] = (uint8_t)(h0 >> 16); + s[3] = (uint8_t)((h0 >> 24) | ((uint32_t)(h1) << 2)); + s[4] = (uint8_t)(h1 >> 6); + s[5] = (uint8_t)(h1 >> 14); + s[6] = (uint8_t)((h1 >> 22) | ((uint32_t)(h2) << 3)); + s[7] = (uint8_t)(h2 >> 5); + s[8] = (uint8_t)(h2 >> 13); + s[9] = (uint8_t)((h2 >> 21) | ((uint32_t)(h3) << 5)); + s[10] = (uint8_t)(h3 >> 3); + s[11] = (uint8_t)(h3 >> 11); + s[12] = (uint8_t)((h3 >> 19) | ((uint32_t)(h4) << 6)); + s[13] = (uint8_t)(h4 >> 2); + s[14] = (uint8_t)(h4 >> 10); + s[15] = (uint8_t)(h4 >> 18); + s[16] = (uint8_t)(h5 >> 0); + s[17] = (uint8_t)(h5 >> 8); + s[18] = (uint8_t)(h5 >> 16); + s[19] = (uint8_t)((h5 >> 24) | ((uint32_t)(h6) << 1)); + s[20] = (uint8_t)(h6 >> 7); + s[21] = (uint8_t)(h6 >> 15); + s[22] = (uint8_t)((h6 >> 23) | ((uint32_t)(h7) << 3)); + s[23] = (uint8_t)(h7 >> 5); + s[24] = (uint8_t)(h7 >> 13); + s[25] = (uint8_t)((h7 >> 21) | ((uint32_t)(h8) << 4)); + s[26] = (uint8_t)(h8 >> 4); + s[27] = (uint8_t)(h8 >> 12); + s[28] = (uint8_t)((h8 >> 20) | ((uint32_t)(h9) << 6)); + s[29] = (uint8_t)(h9 >> 2); + s[30] = (uint8_t)(h9 >> 10); + s[31] = (uint8_t)(h9 >> 18); +} + +/* h = f */ +static void fe_copy(fe h, const fe f) { + memmove(h, f, sizeof(int32_t) * 10); +} + +/* h = 0 */ +static void fe_0(fe h) { memset(h, 0, sizeof(int32_t) * 10); } + +/* h = 1 */ +static void fe_1(fe h) { + memset(h, 0, sizeof(int32_t) * 10); + h[0] = 1; +} + +/* h = f + g + * Can overlap h with f or g. + * + * Preconditions: + * |f| bounded by 1.1*2^25,1.1*2^24,1.1*2^25,1.1*2^24,etc. + * |g| bounded by 1.1*2^25,1.1*2^24,1.1*2^25,1.1*2^24,etc. + * + * Postconditions: + * |h| bounded by 1.1*2^26,1.1*2^25,1.1*2^26,1.1*2^25,etc. */ +static void fe_add(fe h, const fe f, const fe g) { + unsigned i; + for (i = 0; i < 10; i++) { + h[i] = f[i] + g[i]; + } +} + +/* h = f - g + * Can overlap h with f or g. + * + * Preconditions: + * |f| bounded by 1.1*2^25,1.1*2^24,1.1*2^25,1.1*2^24,etc. + * |g| bounded by 1.1*2^25,1.1*2^24,1.1*2^25,1.1*2^24,etc. + * + * Postconditions: + * |h| bounded by 1.1*2^26,1.1*2^25,1.1*2^26,1.1*2^25,etc. */ +static void fe_sub(fe h, const fe f, const fe g) { + unsigned i; + for (i = 0; i < 10; i++) { + h[i] = f[i] - g[i]; + } +} + +/* h = f * g + * Can overlap h with f or g. + * + * Preconditions: + * |f| bounded by 1.65*2^26,1.65*2^25,1.65*2^26,1.65*2^25,etc. + * |g| bounded by 1.65*2^26,1.65*2^25,1.65*2^26,1.65*2^25,etc. + * + * Postconditions: + * |h| bounded by 1.01*2^25,1.01*2^24,1.01*2^25,1.01*2^24,etc. + * + * Notes on implementation strategy: + * + * Using schoolbook multiplication. + * Karatsuba would save a little in some cost models. + * + * Most multiplications by 2 and 19 are 32-bit precomputations; + * cheaper than 64-bit postcomputations. + * + * There is one remaining multiplication by 19 in the carry chain; + * one *19 precomputation can be merged into this, + * but the resulting data flow is considerably less clean. + * + * There are 12 carries below. + * 10 of them are 2-way parallelizable and vectorizable. + * Can get away with 11 carries, but then data flow is much deeper. + * + * With tighter constraints on inputs can squeeze carries into int32. 
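+ *
+ * The *19 precomputations fold products whose weight is 2^255 or more
+ * back into the low limbs, using 2^255 = 19 (mod 2^255 - 19).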
*/ +static void fe_mul(fe h, const fe f, const fe g) { + int32_t f0 = f[0]; + int32_t f1 = f[1]; + int32_t f2 = f[2]; + int32_t f3 = f[3]; + int32_t f4 = f[4]; + int32_t f5 = f[5]; + int32_t f6 = f[6]; + int32_t f7 = f[7]; + int32_t f8 = f[8]; + int32_t f9 = f[9]; + int32_t g0 = g[0]; + int32_t g1 = g[1]; + int32_t g2 = g[2]; + int32_t g3 = g[3]; + int32_t g4 = g[4]; + int32_t g5 = g[5]; + int32_t g6 = g[6]; + int32_t g7 = g[7]; + int32_t g8 = g[8]; + int32_t g9 = g[9]; + int32_t g1_19 = 19 * g1; /* 1.959375*2^29 */ + int32_t g2_19 = 19 * g2; /* 1.959375*2^30; still ok */ + int32_t g3_19 = 19 * g3; + int32_t g4_19 = 19 * g4; + int32_t g5_19 = 19 * g5; + int32_t g6_19 = 19 * g6; + int32_t g7_19 = 19 * g7; + int32_t g8_19 = 19 * g8; + int32_t g9_19 = 19 * g9; + int32_t f1_2 = 2 * f1; + int32_t f3_2 = 2 * f3; + int32_t f5_2 = 2 * f5; + int32_t f7_2 = 2 * f7; + int32_t f9_2 = 2 * f9; + int64_t f0g0 = f0 * (int64_t) g0; + int64_t f0g1 = f0 * (int64_t) g1; + int64_t f0g2 = f0 * (int64_t) g2; + int64_t f0g3 = f0 * (int64_t) g3; + int64_t f0g4 = f0 * (int64_t) g4; + int64_t f0g5 = f0 * (int64_t) g5; + int64_t f0g6 = f0 * (int64_t) g6; + int64_t f0g7 = f0 * (int64_t) g7; + int64_t f0g8 = f0 * (int64_t) g8; + int64_t f0g9 = f0 * (int64_t) g9; + int64_t f1g0 = f1 * (int64_t) g0; + int64_t f1g1_2 = f1_2 * (int64_t) g1; + int64_t f1g2 = f1 * (int64_t) g2; + int64_t f1g3_2 = f1_2 * (int64_t) g3; + int64_t f1g4 = f1 * (int64_t) g4; + int64_t f1g5_2 = f1_2 * (int64_t) g5; + int64_t f1g6 = f1 * (int64_t) g6; + int64_t f1g7_2 = f1_2 * (int64_t) g7; + int64_t f1g8 = f1 * (int64_t) g8; + int64_t f1g9_38 = f1_2 * (int64_t) g9_19; + int64_t f2g0 = f2 * (int64_t) g0; + int64_t f2g1 = f2 * (int64_t) g1; + int64_t f2g2 = f2 * (int64_t) g2; + int64_t f2g3 = f2 * (int64_t) g3; + int64_t f2g4 = f2 * (int64_t) g4; + int64_t f2g5 = f2 * (int64_t) g5; + int64_t f2g6 = f2 * (int64_t) g6; + int64_t f2g7 = f2 * (int64_t) g7; + int64_t f2g8_19 = f2 * (int64_t) g8_19; + int64_t f2g9_19 = f2 * (int64_t) g9_19; + int64_t f3g0 = f3 * (int64_t) g0; + int64_t f3g1_2 = f3_2 * (int64_t) g1; + int64_t f3g2 = f3 * (int64_t) g2; + int64_t f3g3_2 = f3_2 * (int64_t) g3; + int64_t f3g4 = f3 * (int64_t) g4; + int64_t f3g5_2 = f3_2 * (int64_t) g5; + int64_t f3g6 = f3 * (int64_t) g6; + int64_t f3g7_38 = f3_2 * (int64_t) g7_19; + int64_t f3g8_19 = f3 * (int64_t) g8_19; + int64_t f3g9_38 = f3_2 * (int64_t) g9_19; + int64_t f4g0 = f4 * (int64_t) g0; + int64_t f4g1 = f4 * (int64_t) g1; + int64_t f4g2 = f4 * (int64_t) g2; + int64_t f4g3 = f4 * (int64_t) g3; + int64_t f4g4 = f4 * (int64_t) g4; + int64_t f4g5 = f4 * (int64_t) g5; + int64_t f4g6_19 = f4 * (int64_t) g6_19; + int64_t f4g7_19 = f4 * (int64_t) g7_19; + int64_t f4g8_19 = f4 * (int64_t) g8_19; + int64_t f4g9_19 = f4 * (int64_t) g9_19; + int64_t f5g0 = f5 * (int64_t) g0; + int64_t f5g1_2 = f5_2 * (int64_t) g1; + int64_t f5g2 = f5 * (int64_t) g2; + int64_t f5g3_2 = f5_2 * (int64_t) g3; + int64_t f5g4 = f5 * (int64_t) g4; + int64_t f5g5_38 = f5_2 * (int64_t) g5_19; + int64_t f5g6_19 = f5 * (int64_t) g6_19; + int64_t f5g7_38 = f5_2 * (int64_t) g7_19; + int64_t f5g8_19 = f5 * (int64_t) g8_19; + int64_t f5g9_38 = f5_2 * (int64_t) g9_19; + int64_t f6g0 = f6 * (int64_t) g0; + int64_t f6g1 = f6 * (int64_t) g1; + int64_t f6g2 = f6 * (int64_t) g2; + int64_t f6g3 = f6 * (int64_t) g3; + int64_t f6g4_19 = f6 * (int64_t) g4_19; + int64_t f6g5_19 = f6 * (int64_t) g5_19; + int64_t f6g6_19 = f6 * (int64_t) g6_19; + int64_t f6g7_19 = f6 * (int64_t) g7_19; + int64_t f6g8_19 = f6 * (int64_t) g8_19; + int64_t 
f6g9_19 = f6 * (int64_t) g9_19; + int64_t f7g0 = f7 * (int64_t) g0; + int64_t f7g1_2 = f7_2 * (int64_t) g1; + int64_t f7g2 = f7 * (int64_t) g2; + int64_t f7g3_38 = f7_2 * (int64_t) g3_19; + int64_t f7g4_19 = f7 * (int64_t) g4_19; + int64_t f7g5_38 = f7_2 * (int64_t) g5_19; + int64_t f7g6_19 = f7 * (int64_t) g6_19; + int64_t f7g7_38 = f7_2 * (int64_t) g7_19; + int64_t f7g8_19 = f7 * (int64_t) g8_19; + int64_t f7g9_38 = f7_2 * (int64_t) g9_19; + int64_t f8g0 = f8 * (int64_t) g0; + int64_t f8g1 = f8 * (int64_t) g1; + int64_t f8g2_19 = f8 * (int64_t) g2_19; + int64_t f8g3_19 = f8 * (int64_t) g3_19; + int64_t f8g4_19 = f8 * (int64_t) g4_19; + int64_t f8g5_19 = f8 * (int64_t) g5_19; + int64_t f8g6_19 = f8 * (int64_t) g6_19; + int64_t f8g7_19 = f8 * (int64_t) g7_19; + int64_t f8g8_19 = f8 * (int64_t) g8_19; + int64_t f8g9_19 = f8 * (int64_t) g9_19; + int64_t f9g0 = f9 * (int64_t) g0; + int64_t f9g1_38 = f9_2 * (int64_t) g1_19; + int64_t f9g2_19 = f9 * (int64_t) g2_19; + int64_t f9g3_38 = f9_2 * (int64_t) g3_19; + int64_t f9g4_19 = f9 * (int64_t) g4_19; + int64_t f9g5_38 = f9_2 * (int64_t) g5_19; + int64_t f9g6_19 = f9 * (int64_t) g6_19; + int64_t f9g7_38 = f9_2 * (int64_t) g7_19; + int64_t f9g8_19 = f9 * (int64_t) g8_19; + int64_t f9g9_38 = f9_2 * (int64_t) g9_19; + int64_t h0 = f0g0+f1g9_38+f2g8_19+f3g7_38+f4g6_19+f5g5_38+f6g4_19+f7g3_38+f8g2_19+f9g1_38; + int64_t h1 = f0g1+f1g0 +f2g9_19+f3g8_19+f4g7_19+f5g6_19+f6g5_19+f7g4_19+f8g3_19+f9g2_19; + int64_t h2 = f0g2+f1g1_2 +f2g0 +f3g9_38+f4g8_19+f5g7_38+f6g6_19+f7g5_38+f8g4_19+f9g3_38; + int64_t h3 = f0g3+f1g2 +f2g1 +f3g0 +f4g9_19+f5g8_19+f6g7_19+f7g6_19+f8g5_19+f9g4_19; + int64_t h4 = f0g4+f1g3_2 +f2g2 +f3g1_2 +f4g0 +f5g9_38+f6g8_19+f7g7_38+f8g6_19+f9g5_38; + int64_t h5 = f0g5+f1g4 +f2g3 +f3g2 +f4g1 +f5g0 +f6g9_19+f7g8_19+f8g7_19+f9g6_19; + int64_t h6 = f0g6+f1g5_2 +f2g4 +f3g3_2 +f4g2 +f5g1_2 +f6g0 +f7g9_38+f8g8_19+f9g7_38; + int64_t h7 = f0g7+f1g6 +f2g5 +f3g4 +f4g3 +f5g2 +f6g1 +f7g0 +f8g9_19+f9g8_19; + int64_t h8 = f0g8+f1g7_2 +f2g6 +f3g5_2 +f4g4 +f5g3_2 +f6g2 +f7g1_2 +f8g0 +f9g9_38; + int64_t h9 = f0g9+f1g8 +f2g7 +f3g6 +f4g5 +f5g4 +f6g3 +f7g2 +f8g1 +f9g0 ; + int64_t carry0; + int64_t carry1; + int64_t carry2; + int64_t carry3; + int64_t carry4; + int64_t carry5; + int64_t carry6; + int64_t carry7; + int64_t carry8; + int64_t carry9; + + /* |h0| <= (1.65*1.65*2^52*(1+19+19+19+19)+1.65*1.65*2^50*(38+38+38+38+38)) + * i.e. |h0| <= 1.4*2^60; narrower ranges for h2, h4, h6, h8 + * |h1| <= (1.65*1.65*2^51*(1+1+19+19+19+19+19+19+19+19)) + * i.e. 
|h1| <= 1.7*2^59; narrower ranges for h3, h5, h7, h9 */ + + carry0 = h0 + (1 << 25); h1 += carry0 >> 26; h0 -= carry0 & kTop38Bits; + carry4 = h4 + (1 << 25); h5 += carry4 >> 26; h4 -= carry4 & kTop38Bits; + /* |h0| <= 2^25 */ + /* |h4| <= 2^25 */ + /* |h1| <= 1.71*2^59 */ + /* |h5| <= 1.71*2^59 */ + + carry1 = h1 + (1 << 24); h2 += carry1 >> 25; h1 -= carry1 & kTop39Bits; + carry5 = h5 + (1 << 24); h6 += carry5 >> 25; h5 -= carry5 & kTop39Bits; + /* |h1| <= 2^24; from now on fits into int32 */ + /* |h5| <= 2^24; from now on fits into int32 */ + /* |h2| <= 1.41*2^60 */ + /* |h6| <= 1.41*2^60 */ + + carry2 = h2 + (1 << 25); h3 += carry2 >> 26; h2 -= carry2 & kTop38Bits; + carry6 = h6 + (1 << 25); h7 += carry6 >> 26; h6 -= carry6 & kTop38Bits; + /* |h2| <= 2^25; from now on fits into int32 unchanged */ + /* |h6| <= 2^25; from now on fits into int32 unchanged */ + /* |h3| <= 1.71*2^59 */ + /* |h7| <= 1.71*2^59 */ + + carry3 = h3 + (1 << 24); h4 += carry3 >> 25; h3 -= carry3 & kTop39Bits; + carry7 = h7 + (1 << 24); h8 += carry7 >> 25; h7 -= carry7 & kTop39Bits; + /* |h3| <= 2^24; from now on fits into int32 unchanged */ + /* |h7| <= 2^24; from now on fits into int32 unchanged */ + /* |h4| <= 1.72*2^34 */ + /* |h8| <= 1.41*2^60 */ + + carry4 = h4 + (1 << 25); h5 += carry4 >> 26; h4 -= carry4 & kTop38Bits; + carry8 = h8 + (1 << 25); h9 += carry8 >> 26; h8 -= carry8 & kTop38Bits; + /* |h4| <= 2^25; from now on fits into int32 unchanged */ + /* |h8| <= 2^25; from now on fits into int32 unchanged */ + /* |h5| <= 1.01*2^24 */ + /* |h9| <= 1.71*2^59 */ + + carry9 = h9 + (1 << 24); h0 += (carry9 >> 25) * 19; h9 -= carry9 & kTop39Bits; + /* |h9| <= 2^24; from now on fits into int32 unchanged */ + /* |h0| <= 1.1*2^39 */ + + carry0 = h0 + (1 << 25); h1 += carry0 >> 26; h0 -= carry0 & kTop38Bits; + /* |h0| <= 2^25; from now on fits into int32 unchanged */ + /* |h1| <= 1.01*2^24 */ + + h[0] = (int32_t)h0; + h[1] = (int32_t)h1; + h[2] = (int32_t)h2; + h[3] = (int32_t)h3; + h[4] = (int32_t)h4; + h[5] = (int32_t)h5; + h[6] = (int32_t)h6; + h[7] = (int32_t)h7; + h[8] = (int32_t)h8; + h[9] = (int32_t)h9; +} + +/* h = f * f + * Can overlap h with f. + * + * Preconditions: + * |f| bounded by 1.65*2^26,1.65*2^25,1.65*2^26,1.65*2^25,etc. + * + * Postconditions: + * |h| bounded by 1.01*2^25,1.01*2^24,1.01*2^25,1.01*2^24,etc. + * + * See fe_mul.c for discussion of implementation strategy. 
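+ *
+ * Squaring computes each cross term f_i*f_j only once and doubles it
+ * via the f*_2 and *38 precomputations, roughly halving the number of
+ * 64-bit products relative to fe_mul.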
*/ +static void fe_sq(fe h, const fe f) { + int32_t f0 = f[0]; + int32_t f1 = f[1]; + int32_t f2 = f[2]; + int32_t f3 = f[3]; + int32_t f4 = f[4]; + int32_t f5 = f[5]; + int32_t f6 = f[6]; + int32_t f7 = f[7]; + int32_t f8 = f[8]; + int32_t f9 = f[9]; + int32_t f0_2 = 2 * f0; + int32_t f1_2 = 2 * f1; + int32_t f2_2 = 2 * f2; + int32_t f3_2 = 2 * f3; + int32_t f4_2 = 2 * f4; + int32_t f5_2 = 2 * f5; + int32_t f6_2 = 2 * f6; + int32_t f7_2 = 2 * f7; + int32_t f5_38 = 38 * f5; /* 1.959375*2^30 */ + int32_t f6_19 = 19 * f6; /* 1.959375*2^30 */ + int32_t f7_38 = 38 * f7; /* 1.959375*2^30 */ + int32_t f8_19 = 19 * f8; /* 1.959375*2^30 */ + int32_t f9_38 = 38 * f9; /* 1.959375*2^30 */ + int64_t f0f0 = f0 * (int64_t) f0; + int64_t f0f1_2 = f0_2 * (int64_t) f1; + int64_t f0f2_2 = f0_2 * (int64_t) f2; + int64_t f0f3_2 = f0_2 * (int64_t) f3; + int64_t f0f4_2 = f0_2 * (int64_t) f4; + int64_t f0f5_2 = f0_2 * (int64_t) f5; + int64_t f0f6_2 = f0_2 * (int64_t) f6; + int64_t f0f7_2 = f0_2 * (int64_t) f7; + int64_t f0f8_2 = f0_2 * (int64_t) f8; + int64_t f0f9_2 = f0_2 * (int64_t) f9; + int64_t f1f1_2 = f1_2 * (int64_t) f1; + int64_t f1f2_2 = f1_2 * (int64_t) f2; + int64_t f1f3_4 = f1_2 * (int64_t) f3_2; + int64_t f1f4_2 = f1_2 * (int64_t) f4; + int64_t f1f5_4 = f1_2 * (int64_t) f5_2; + int64_t f1f6_2 = f1_2 * (int64_t) f6; + int64_t f1f7_4 = f1_2 * (int64_t) f7_2; + int64_t f1f8_2 = f1_2 * (int64_t) f8; + int64_t f1f9_76 = f1_2 * (int64_t) f9_38; + int64_t f2f2 = f2 * (int64_t) f2; + int64_t f2f3_2 = f2_2 * (int64_t) f3; + int64_t f2f4_2 = f2_2 * (int64_t) f4; + int64_t f2f5_2 = f2_2 * (int64_t) f5; + int64_t f2f6_2 = f2_2 * (int64_t) f6; + int64_t f2f7_2 = f2_2 * (int64_t) f7; + int64_t f2f8_38 = f2_2 * (int64_t) f8_19; + int64_t f2f9_38 = f2 * (int64_t) f9_38; + int64_t f3f3_2 = f3_2 * (int64_t) f3; + int64_t f3f4_2 = f3_2 * (int64_t) f4; + int64_t f3f5_4 = f3_2 * (int64_t) f5_2; + int64_t f3f6_2 = f3_2 * (int64_t) f6; + int64_t f3f7_76 = f3_2 * (int64_t) f7_38; + int64_t f3f8_38 = f3_2 * (int64_t) f8_19; + int64_t f3f9_76 = f3_2 * (int64_t) f9_38; + int64_t f4f4 = f4 * (int64_t) f4; + int64_t f4f5_2 = f4_2 * (int64_t) f5; + int64_t f4f6_38 = f4_2 * (int64_t) f6_19; + int64_t f4f7_38 = f4 * (int64_t) f7_38; + int64_t f4f8_38 = f4_2 * (int64_t) f8_19; + int64_t f4f9_38 = f4 * (int64_t) f9_38; + int64_t f5f5_38 = f5 * (int64_t) f5_38; + int64_t f5f6_38 = f5_2 * (int64_t) f6_19; + int64_t f5f7_76 = f5_2 * (int64_t) f7_38; + int64_t f5f8_38 = f5_2 * (int64_t) f8_19; + int64_t f5f9_76 = f5_2 * (int64_t) f9_38; + int64_t f6f6_19 = f6 * (int64_t) f6_19; + int64_t f6f7_38 = f6 * (int64_t) f7_38; + int64_t f6f8_38 = f6_2 * (int64_t) f8_19; + int64_t f6f9_38 = f6 * (int64_t) f9_38; + int64_t f7f7_38 = f7 * (int64_t) f7_38; + int64_t f7f8_38 = f7_2 * (int64_t) f8_19; + int64_t f7f9_76 = f7_2 * (int64_t) f9_38; + int64_t f8f8_19 = f8 * (int64_t) f8_19; + int64_t f8f9_38 = f8 * (int64_t) f9_38; + int64_t f9f9_38 = f9 * (int64_t) f9_38; + int64_t h0 = f0f0 +f1f9_76+f2f8_38+f3f7_76+f4f6_38+f5f5_38; + int64_t h1 = f0f1_2+f2f9_38+f3f8_38+f4f7_38+f5f6_38; + int64_t h2 = f0f2_2+f1f1_2 +f3f9_76+f4f8_38+f5f7_76+f6f6_19; + int64_t h3 = f0f3_2+f1f2_2 +f4f9_38+f5f8_38+f6f7_38; + int64_t h4 = f0f4_2+f1f3_4 +f2f2 +f5f9_76+f6f8_38+f7f7_38; + int64_t h5 = f0f5_2+f1f4_2 +f2f3_2 +f6f9_38+f7f8_38; + int64_t h6 = f0f6_2+f1f5_4 +f2f4_2 +f3f3_2 +f7f9_76+f8f8_19; + int64_t h7 = f0f7_2+f1f6_2 +f2f5_2 +f3f4_2 +f8f9_38; + int64_t h8 = f0f8_2+f1f7_4 +f2f6_2 +f3f5_4 +f4f4 +f9f9_38; + int64_t h9 = f0f9_2+f1f8_2 +f2f7_2 +f3f6_2 +f4f5_2; + int64_t 
carry0; + int64_t carry1; + int64_t carry2; + int64_t carry3; + int64_t carry4; + int64_t carry5; + int64_t carry6; + int64_t carry7; + int64_t carry8; + int64_t carry9; + + carry0 = h0 + (1 << 25); h1 += carry0 >> 26; h0 -= carry0 & kTop38Bits; + carry4 = h4 + (1 << 25); h5 += carry4 >> 26; h4 -= carry4 & kTop38Bits; + + carry1 = h1 + (1 << 24); h2 += carry1 >> 25; h1 -= carry1 & kTop39Bits; + carry5 = h5 + (1 << 24); h6 += carry5 >> 25; h5 -= carry5 & kTop39Bits; + + carry2 = h2 + (1 << 25); h3 += carry2 >> 26; h2 -= carry2 & kTop38Bits; + carry6 = h6 + (1 << 25); h7 += carry6 >> 26; h6 -= carry6 & kTop38Bits; + + carry3 = h3 + (1 << 24); h4 += carry3 >> 25; h3 -= carry3 & kTop39Bits; + carry7 = h7 + (1 << 24); h8 += carry7 >> 25; h7 -= carry7 & kTop39Bits; + + carry4 = h4 + (1 << 25); h5 += carry4 >> 26; h4 -= carry4 & kTop38Bits; + carry8 = h8 + (1 << 25); h9 += carry8 >> 26; h8 -= carry8 & kTop38Bits; + + carry9 = h9 + (1 << 24); h0 += (carry9 >> 25) * 19; h9 -= carry9 & kTop39Bits; + + carry0 = h0 + (1 << 25); h1 += carry0 >> 26; h0 -= carry0 & kTop38Bits; + + h[0] = (int32_t)h0; + h[1] = (int32_t)h1; + h[2] = (int32_t)h2; + h[3] = (int32_t)h3; + h[4] = (int32_t)h4; + h[5] = (int32_t)h5; + h[6] = (int32_t)h6; + h[7] = (int32_t)h7; + h[8] = (int32_t)h8; + h[9] = (int32_t)h9; +} + +static void fe_invert(fe out, const fe z) { + fe t0; + fe t1; + fe t2; + fe t3; + int i; + + /* + * Compute z ** -1 = z ** (2 ** 255 - 19 - 2) with the exponent as + * 2 ** 255 - 21 = (2 ** 5) * (2 ** 250 - 1) + 11. + */ + + /* t0 = z ** 2 */ + fe_sq(t0, z); + + /* t1 = t0 ** (2 ** 2) = z ** 8 */ + fe_sq(t1, t0); + fe_sq(t1, t1); + + /* t1 = z * t1 = z ** 9 */ + fe_mul(t1, z, t1); + /* t0 = t0 * t1 = z ** 11 -- stash t0 away for the end. */ + fe_mul(t0, t0, t1); + + /* t2 = t0 ** 2 = z ** 22 */ + fe_sq(t2, t0); + + /* t1 = t1 * t2 = z ** (2 ** 5 - 1) */ + fe_mul(t1, t1, t2); + + /* t2 = t1 ** (2 ** 5) = z ** ((2 ** 5) * (2 ** 5 - 1)) */ + fe_sq(t2, t1); + for (i = 1; i < 5; ++i) { + fe_sq(t2, t2); + } + + /* t1 = t1 * t2 = z ** ((2 ** 5 + 1) * (2 ** 5 - 1)) = z ** (2 ** 10 - 1) */ + fe_mul(t1, t2, t1); + + /* Continuing similarly... */ + + /* t2 = z ** (2 ** 20 - 1) */ + fe_sq(t2, t1); + for (i = 1; i < 10; ++i) { + fe_sq(t2, t2); + } + fe_mul(t2, t2, t1); + + /* t2 = z ** (2 ** 40 - 1) */ + fe_sq(t3, t2); + for (i = 1; i < 20; ++i) { + fe_sq(t3, t3); + } + fe_mul(t2, t3, t2); + + /* t2 = z ** (2 ** 10) * (2 ** 40 - 1) */ + for (i = 0; i < 10; ++i) { + fe_sq(t2, t2); + } + /* t1 = z ** (2 ** 50 - 1) */ + fe_mul(t1, t2, t1); + + /* t2 = z ** (2 ** 100 - 1) */ + fe_sq(t2, t1); + for (i = 1; i < 50; ++i) { + fe_sq(t2, t2); + } + fe_mul(t2, t2, t1); + + /* t2 = z ** (2 ** 200 - 1) */ + fe_sq(t3, t2); + for (i = 1; i < 100; ++i) { + fe_sq(t3, t3); + } + fe_mul(t2, t3, t2); + + /* t2 = z ** ((2 ** 50) * (2 ** 200 - 1) */ + fe_sq(t2, t2); + for (i = 1; i < 50; ++i) { + fe_sq(t2, t2); + } + + /* t1 = z ** (2 ** 250 - 1) */ + fe_mul(t1, t2, t1); + + /* t1 = z ** ((2 ** 5) * (2 ** 250 - 1)) */ + fe_sq(t1, t1); + for (i = 1; i < 5; ++i) { + fe_sq(t1, t1); + } + + /* Recall t0 = z ** 11; out = z ** (2 ** 255 - 21) */ + fe_mul(out, t1, t0); +} + +/* h = -f + * + * Preconditions: + * |f| bounded by 1.1*2^25,1.1*2^24,1.1*2^25,1.1*2^24,etc. + * + * Postconditions: + * |h| bounded by 1.1*2^25,1.1*2^24,1.1*2^25,1.1*2^24,etc. */ +static void fe_neg(fe h, const fe f) { + unsigned i; + for (i = 0; i < 10; i++) { + h[i] = -f[i]; + } +} + +/* Replace (f,g) with (g,g) if b == 1; + * replace (f,g) with (f,g) if b == 0. 
+ * + * Preconditions: b in {0,1}. */ +static void fe_cmov(fe f, const fe g, unsigned b) { + size_t i; + b = 0-b; + for (i = 0; i < 10; i++) { + int32_t x = f[i] ^ g[i]; + x &= b; + f[i] ^= x; + } +} + +/* return 0 if f == 0 + * return 1 if f != 0 + * + * Preconditions: + * |f| bounded by 1.1*2^26,1.1*2^25,1.1*2^26,1.1*2^25,etc. */ +static int fe_isnonzero(const fe f) { + uint8_t s[32]; + static const uint8_t zero[32] = {0}; + fe_tobytes(s, f); + + return CRYPTO_memcmp(s, zero, sizeof(zero)) != 0; +} + +/* return 1 if f is in {1,3,5,...,q-2} + * return 0 if f is in {0,2,4,...,q-1} + * + * Preconditions: + * |f| bounded by 1.1*2^26,1.1*2^25,1.1*2^26,1.1*2^25,etc. */ +static int fe_isnegative(const fe f) { + uint8_t s[32]; + fe_tobytes(s, f); + return s[0] & 1; +} + +/* h = 2 * f * f + * Can overlap h with f. + * + * Preconditions: + * |f| bounded by 1.65*2^26,1.65*2^25,1.65*2^26,1.65*2^25,etc. + * + * Postconditions: + * |h| bounded by 1.01*2^25,1.01*2^24,1.01*2^25,1.01*2^24,etc. + * + * See fe_mul.c for discussion of implementation strategy. */ +static void fe_sq2(fe h, const fe f) { + int32_t f0 = f[0]; + int32_t f1 = f[1]; + int32_t f2 = f[2]; + int32_t f3 = f[3]; + int32_t f4 = f[4]; + int32_t f5 = f[5]; + int32_t f6 = f[6]; + int32_t f7 = f[7]; + int32_t f8 = f[8]; + int32_t f9 = f[9]; + int32_t f0_2 = 2 * f0; + int32_t f1_2 = 2 * f1; + int32_t f2_2 = 2 * f2; + int32_t f3_2 = 2 * f3; + int32_t f4_2 = 2 * f4; + int32_t f5_2 = 2 * f5; + int32_t f6_2 = 2 * f6; + int32_t f7_2 = 2 * f7; + int32_t f5_38 = 38 * f5; /* 1.959375*2^30 */ + int32_t f6_19 = 19 * f6; /* 1.959375*2^30 */ + int32_t f7_38 = 38 * f7; /* 1.959375*2^30 */ + int32_t f8_19 = 19 * f8; /* 1.959375*2^30 */ + int32_t f9_38 = 38 * f9; /* 1.959375*2^30 */ + int64_t f0f0 = f0 * (int64_t) f0; + int64_t f0f1_2 = f0_2 * (int64_t) f1; + int64_t f0f2_2 = f0_2 * (int64_t) f2; + int64_t f0f3_2 = f0_2 * (int64_t) f3; + int64_t f0f4_2 = f0_2 * (int64_t) f4; + int64_t f0f5_2 = f0_2 * (int64_t) f5; + int64_t f0f6_2 = f0_2 * (int64_t) f6; + int64_t f0f7_2 = f0_2 * (int64_t) f7; + int64_t f0f8_2 = f0_2 * (int64_t) f8; + int64_t f0f9_2 = f0_2 * (int64_t) f9; + int64_t f1f1_2 = f1_2 * (int64_t) f1; + int64_t f1f2_2 = f1_2 * (int64_t) f2; + int64_t f1f3_4 = f1_2 * (int64_t) f3_2; + int64_t f1f4_2 = f1_2 * (int64_t) f4; + int64_t f1f5_4 = f1_2 * (int64_t) f5_2; + int64_t f1f6_2 = f1_2 * (int64_t) f6; + int64_t f1f7_4 = f1_2 * (int64_t) f7_2; + int64_t f1f8_2 = f1_2 * (int64_t) f8; + int64_t f1f9_76 = f1_2 * (int64_t) f9_38; + int64_t f2f2 = f2 * (int64_t) f2; + int64_t f2f3_2 = f2_2 * (int64_t) f3; + int64_t f2f4_2 = f2_2 * (int64_t) f4; + int64_t f2f5_2 = f2_2 * (int64_t) f5; + int64_t f2f6_2 = f2_2 * (int64_t) f6; + int64_t f2f7_2 = f2_2 * (int64_t) f7; + int64_t f2f8_38 = f2_2 * (int64_t) f8_19; + int64_t f2f9_38 = f2 * (int64_t) f9_38; + int64_t f3f3_2 = f3_2 * (int64_t) f3; + int64_t f3f4_2 = f3_2 * (int64_t) f4; + int64_t f3f5_4 = f3_2 * (int64_t) f5_2; + int64_t f3f6_2 = f3_2 * (int64_t) f6; + int64_t f3f7_76 = f3_2 * (int64_t) f7_38; + int64_t f3f8_38 = f3_2 * (int64_t) f8_19; + int64_t f3f9_76 = f3_2 * (int64_t) f9_38; + int64_t f4f4 = f4 * (int64_t) f4; + int64_t f4f5_2 = f4_2 * (int64_t) f5; + int64_t f4f6_38 = f4_2 * (int64_t) f6_19; + int64_t f4f7_38 = f4 * (int64_t) f7_38; + int64_t f4f8_38 = f4_2 * (int64_t) f8_19; + int64_t f4f9_38 = f4 * (int64_t) f9_38; + int64_t f5f5_38 = f5 * (int64_t) f5_38; + int64_t f5f6_38 = f5_2 * (int64_t) f6_19; + int64_t f5f7_76 = f5_2 * (int64_t) f7_38; + int64_t f5f8_38 = f5_2 * (int64_t) f8_19; + 
int64_t f5f9_76 = f5_2 * (int64_t) f9_38; + int64_t f6f6_19 = f6 * (int64_t) f6_19; + int64_t f6f7_38 = f6 * (int64_t) f7_38; + int64_t f6f8_38 = f6_2 * (int64_t) f8_19; + int64_t f6f9_38 = f6 * (int64_t) f9_38; + int64_t f7f7_38 = f7 * (int64_t) f7_38; + int64_t f7f8_38 = f7_2 * (int64_t) f8_19; + int64_t f7f9_76 = f7_2 * (int64_t) f9_38; + int64_t f8f8_19 = f8 * (int64_t) f8_19; + int64_t f8f9_38 = f8 * (int64_t) f9_38; + int64_t f9f9_38 = f9 * (int64_t) f9_38; + int64_t h0 = f0f0 +f1f9_76+f2f8_38+f3f7_76+f4f6_38+f5f5_38; + int64_t h1 = f0f1_2+f2f9_38+f3f8_38+f4f7_38+f5f6_38; + int64_t h2 = f0f2_2+f1f1_2 +f3f9_76+f4f8_38+f5f7_76+f6f6_19; + int64_t h3 = f0f3_2+f1f2_2 +f4f9_38+f5f8_38+f6f7_38; + int64_t h4 = f0f4_2+f1f3_4 +f2f2 +f5f9_76+f6f8_38+f7f7_38; + int64_t h5 = f0f5_2+f1f4_2 +f2f3_2 +f6f9_38+f7f8_38; + int64_t h6 = f0f6_2+f1f5_4 +f2f4_2 +f3f3_2 +f7f9_76+f8f8_19; + int64_t h7 = f0f7_2+f1f6_2 +f2f5_2 +f3f4_2 +f8f9_38; + int64_t h8 = f0f8_2+f1f7_4 +f2f6_2 +f3f5_4 +f4f4 +f9f9_38; + int64_t h9 = f0f9_2+f1f8_2 +f2f7_2 +f3f6_2 +f4f5_2; + int64_t carry0; + int64_t carry1; + int64_t carry2; + int64_t carry3; + int64_t carry4; + int64_t carry5; + int64_t carry6; + int64_t carry7; + int64_t carry8; + int64_t carry9; + + h0 += h0; + h1 += h1; + h2 += h2; + h3 += h3; + h4 += h4; + h5 += h5; + h6 += h6; + h7 += h7; + h8 += h8; + h9 += h9; + + carry0 = h0 + (1 << 25); h1 += carry0 >> 26; h0 -= carry0 & kTop38Bits; + carry4 = h4 + (1 << 25); h5 += carry4 >> 26; h4 -= carry4 & kTop38Bits; + + carry1 = h1 + (1 << 24); h2 += carry1 >> 25; h1 -= carry1 & kTop39Bits; + carry5 = h5 + (1 << 24); h6 += carry5 >> 25; h5 -= carry5 & kTop39Bits; + + carry2 = h2 + (1 << 25); h3 += carry2 >> 26; h2 -= carry2 & kTop38Bits; + carry6 = h6 + (1 << 25); h7 += carry6 >> 26; h6 -= carry6 & kTop38Bits; + + carry3 = h3 + (1 << 24); h4 += carry3 >> 25; h3 -= carry3 & kTop39Bits; + carry7 = h7 + (1 << 24); h8 += carry7 >> 25; h7 -= carry7 & kTop39Bits; + + carry4 = h4 + (1 << 25); h5 += carry4 >> 26; h4 -= carry4 & kTop38Bits; + carry8 = h8 + (1 << 25); h9 += carry8 >> 26; h8 -= carry8 & kTop38Bits; + + carry9 = h9 + (1 << 24); h0 += (carry9 >> 25) * 19; h9 -= carry9 & kTop39Bits; + + carry0 = h0 + (1 << 25); h1 += carry0 >> 26; h0 -= carry0 & kTop38Bits; + + h[0] = (int32_t)h0; + h[1] = (int32_t)h1; + h[2] = (int32_t)h2; + h[3] = (int32_t)h3; + h[4] = (int32_t)h4; + h[5] = (int32_t)h5; + h[6] = (int32_t)h6; + h[7] = (int32_t)h7; + h[8] = (int32_t)h8; + h[9] = (int32_t)h9; +} + +static void fe_pow22523(fe out, const fe z) { + fe t0; + fe t1; + fe t2; + int i; + + fe_sq(t0, z); + fe_sq(t1, t0); + for (i = 1; i < 2; ++i) { + fe_sq(t1, t1); + } + fe_mul(t1, z, t1); + fe_mul(t0, t0, t1); + fe_sq(t0, t0); + fe_mul(t0, t1, t0); + fe_sq(t1, t0); + for (i = 1; i < 5; ++i) { + fe_sq(t1, t1); + } + fe_mul(t0, t1, t0); + fe_sq(t1, t0); + for (i = 1; i < 10; ++i) { + fe_sq(t1, t1); + } + fe_mul(t1, t1, t0); + fe_sq(t2, t1); + for (i = 1; i < 20; ++i) { + fe_sq(t2, t2); + } + fe_mul(t1, t2, t1); + fe_sq(t1, t1); + for (i = 1; i < 10; ++i) { + fe_sq(t1, t1); + } + fe_mul(t0, t1, t0); + fe_sq(t1, t0); + for (i = 1; i < 50; ++i) { + fe_sq(t1, t1); + } + fe_mul(t1, t1, t0); + fe_sq(t2, t1); + for (i = 1; i < 100; ++i) { + fe_sq(t2, t2); + } + fe_mul(t1, t2, t1); + fe_sq(t1, t1); + for (i = 1; i < 50; ++i) { + fe_sq(t1, t1); + } + fe_mul(t0, t1, t0); + fe_sq(t0, t0); + for (i = 1; i < 2; ++i) { + fe_sq(t0, t0); + } + fe_mul(out, t0, z); +} + +/* ge means group element. 
+ + * Here the group is the set of pairs (x,y) of field elements (see fe.h) + * satisfying -x^2 + y^2 = 1 + d x^2y^2 + * where d = -121665/121666. + * + * Representations: + * ge_p2 (projective): (X:Y:Z) satisfying x=X/Z, y=Y/Z + * ge_p3 (extended): (X:Y:Z:T) satisfying x=X/Z, y=Y/Z, XY=ZT + * ge_p1p1 (completed): ((X:Z),(Y:T)) satisfying x=X/Z, y=Y/T + * ge_precomp (Duif): (y+x,y-x,2dxy) */ + +typedef struct { + fe X; + fe Y; + fe Z; +} ge_p2; + +typedef struct { + fe X; + fe Y; + fe Z; + fe T; +} ge_p3; + +typedef struct { + fe X; + fe Y; + fe Z; + fe T; +} ge_p1p1; + +typedef struct { + fe yplusx; + fe yminusx; + fe xy2d; +} ge_precomp; + +typedef struct { + fe YplusX; + fe YminusX; + fe Z; + fe T2d; +} ge_cached; + +static void ge_tobytes(uint8_t *s, const ge_p2 *h) { + fe recip; + fe x; + fe y; + + fe_invert(recip, h->Z); + fe_mul(x, h->X, recip); + fe_mul(y, h->Y, recip); + fe_tobytes(s, y); + s[31] ^= fe_isnegative(x) << 7; +} + +static void ge_p3_tobytes(uint8_t *s, const ge_p3 *h) { + fe recip; + fe x; + fe y; + + fe_invert(recip, h->Z); + fe_mul(x, h->X, recip); + fe_mul(y, h->Y, recip); + fe_tobytes(s, y); + s[31] ^= fe_isnegative(x) << 7; +} + +static const fe d = {-10913610, 13857413, -15372611, 6949391, 114729, + -8787816, -6275908, -3247719, -18696448, -12055116}; + +static const fe sqrtm1 = {-32595792, -7943725, 9377950, 3500415, 12389472, + -272473, -25146209, -2005654, 326686, 11406482}; + +static int ge_frombytes_vartime(ge_p3 *h, const uint8_t *s) { + fe u; + fe v; + fe v3; + fe vxx; + fe check; + + fe_frombytes(h->Y, s); + fe_1(h->Z); + fe_sq(u, h->Y); + fe_mul(v, u, d); + fe_sub(u, u, h->Z); /* u = y^2-1 */ + fe_add(v, v, h->Z); /* v = dy^2+1 */ + + fe_sq(v3, v); + fe_mul(v3, v3, v); /* v3 = v^3 */ + fe_sq(h->X, v3); + fe_mul(h->X, h->X, v); + fe_mul(h->X, h->X, u); /* x = uv^7 */ + + fe_pow22523(h->X, h->X); /* x = (uv^7)^((q-5)/8) */ + fe_mul(h->X, h->X, v3); + fe_mul(h->X, h->X, u); /* x = uv^3(uv^7)^((q-5)/8) */ + + fe_sq(vxx, h->X); + fe_mul(vxx, vxx, v); + fe_sub(check, vxx, u); /* vx^2-u */ + if (fe_isnonzero(check)) { + fe_add(check, vxx, u); /* vx^2+u */ + if (fe_isnonzero(check)) { + return -1; + } + fe_mul(h->X, h->X, sqrtm1); + } + + if (fe_isnegative(h->X) != (s[31] >> 7)) { + fe_neg(h->X, h->X); + } + + fe_mul(h->T, h->X, h->Y); + return 0; +} + +static void ge_p2_0(ge_p2 *h) { + fe_0(h->X); + fe_1(h->Y); + fe_1(h->Z); +} + +static void ge_p3_0(ge_p3 *h) { + fe_0(h->X); + fe_1(h->Y); + fe_1(h->Z); + fe_0(h->T); +} + +static void ge_precomp_0(ge_precomp *h) { + fe_1(h->yplusx); + fe_1(h->yminusx); + fe_0(h->xy2d); +} + +/* r = p */ +static void ge_p3_to_p2(ge_p2 *r, const ge_p3 *p) { + fe_copy(r->X, p->X); + fe_copy(r->Y, p->Y); + fe_copy(r->Z, p->Z); +} + +static const fe d2 = {-21827239, -5839606, -30745221, 13898782, 229458, + 15978800, -12551817, -6495438, 29715968, 9444199}; + +/* r = p */ +static void ge_p3_to_cached(ge_cached *r, const ge_p3 *p) { + fe_add(r->YplusX, p->Y, p->X); + fe_sub(r->YminusX, p->Y, p->X); + fe_copy(r->Z, p->Z); + fe_mul(r->T2d, p->T, d2); +} + +/* r = p */ +static void ge_p1p1_to_p2(ge_p2 *r, const ge_p1p1 *p) { + fe_mul(r->X, p->X, p->T); + fe_mul(r->Y, p->Y, p->Z); + fe_mul(r->Z, p->Z, p->T); +} + +/* r = p */ +static void ge_p1p1_to_p3(ge_p3 *r, const ge_p1p1 *p) { + fe_mul(r->X, p->X, p->T); + fe_mul(r->Y, p->Y, p->Z); + fe_mul(r->Z, p->Z, p->T); + fe_mul(r->T, p->X, p->Y); +} + +/* r = 2 * p */ +static void ge_p2_dbl(ge_p1p1 *r, const ge_p2 *p) { + fe t0; + + fe_sq(r->X, p->X); + fe_sq(r->Z, p->Y); + fe_sq2(r->T, 
p->Z); + fe_add(r->Y, p->X, p->Y); + fe_sq(t0, r->Y); + fe_add(r->Y, r->Z, r->X); + fe_sub(r->Z, r->Z, r->X); + fe_sub(r->X, t0, r->Y); + fe_sub(r->T, r->T, r->Z); +} + +/* r = 2 * p */ +static void ge_p3_dbl(ge_p1p1 *r, const ge_p3 *p) { + ge_p2 q; + ge_p3_to_p2(&q, p); + ge_p2_dbl(r, &q); +} + +/* r = p + q */ +static void ge_madd(ge_p1p1 *r, const ge_p3 *p, const ge_precomp *q) { + fe t0; + + fe_add(r->X, p->Y, p->X); + fe_sub(r->Y, p->Y, p->X); + fe_mul(r->Z, r->X, q->yplusx); + fe_mul(r->Y, r->Y, q->yminusx); + fe_mul(r->T, q->xy2d, p->T); + fe_add(t0, p->Z, p->Z); + fe_sub(r->X, r->Z, r->Y); + fe_add(r->Y, r->Z, r->Y); + fe_add(r->Z, t0, r->T); + fe_sub(r->T, t0, r->T); +} + +/* r = p - q */ +static void ge_msub(ge_p1p1 *r, const ge_p3 *p, const ge_precomp *q) { + fe t0; + + fe_add(r->X, p->Y, p->X); + fe_sub(r->Y, p->Y, p->X); + fe_mul(r->Z, r->X, q->yminusx); + fe_mul(r->Y, r->Y, q->yplusx); + fe_mul(r->T, q->xy2d, p->T); + fe_add(t0, p->Z, p->Z); + fe_sub(r->X, r->Z, r->Y); + fe_add(r->Y, r->Z, r->Y); + fe_sub(r->Z, t0, r->T); + fe_add(r->T, t0, r->T); +} + +/* r = p + q */ +static void ge_add(ge_p1p1 *r, const ge_p3 *p, const ge_cached *q) { + fe t0; + + fe_add(r->X, p->Y, p->X); + fe_sub(r->Y, p->Y, p->X); + fe_mul(r->Z, r->X, q->YplusX); + fe_mul(r->Y, r->Y, q->YminusX); + fe_mul(r->T, q->T2d, p->T); + fe_mul(r->X, p->Z, q->Z); + fe_add(t0, r->X, r->X); + fe_sub(r->X, r->Z, r->Y); + fe_add(r->Y, r->Z, r->Y); + fe_add(r->Z, t0, r->T); + fe_sub(r->T, t0, r->T); +} + +/* r = p - q */ +static void ge_sub(ge_p1p1 *r, const ge_p3 *p, const ge_cached *q) { + fe t0; + + fe_add(r->X, p->Y, p->X); + fe_sub(r->Y, p->Y, p->X); + fe_mul(r->Z, r->X, q->YminusX); + fe_mul(r->Y, r->Y, q->YplusX); + fe_mul(r->T, q->T2d, p->T); + fe_mul(r->X, p->Z, q->Z); + fe_add(t0, r->X, r->X); + fe_sub(r->X, r->Z, r->Y); + fe_add(r->Y, r->Z, r->Y); + fe_sub(r->Z, t0, r->T); + fe_add(r->T, t0, r->T); +} + +static uint8_t equal(signed char b, signed char c) { + uint8_t ub = b; + uint8_t uc = c; + uint8_t x = ub ^ uc; /* 0: yes; 1..255: no */ + uint32_t y = x; /* 0: yes; 1..255: no */ + y -= 1; /* 4294967295: yes; 0..254: no */ + y >>= 31; /* 1: yes; 0: no */ + return y; +} + +static void cmov(ge_precomp *t, const ge_precomp *u, uint8_t b) { + fe_cmov(t->yplusx, u->yplusx, b); + fe_cmov(t->yminusx, u->yminusx, b); + fe_cmov(t->xy2d, u->xy2d, b); +} + +/* k25519Precomp[i][j] = (j+1)*256^i*B */ +static const ge_precomp k25519Precomp[32][8] = { + { + { + {25967493, -14356035, 29566456, 3660896, -12694345, 4014787, + 27544626, -11754271, -6079156, 2047605}, + {-12545711, 934262, -2722910, 3049990, -727428, 9406986, 12720692, + 5043384, 19500929, -15469378}, + {-8738181, 4489570, 9688441, -14785194, 10184609, -12363380, + 29287919, 11864899, -24514362, -4438546}, + }, + { + {-12815894, -12976347, -21581243, 11784320, -25355658, -2750717, + -11717903, -3814571, -358445, -10211303}, + {-21703237, 6903825, 27185491, 6451973, -29577724, -9554005, + -15616551, 11189268, -26829678, -5319081}, + {26966642, 11152617, 32442495, 15396054, 14353839, -12752335, + -3128826, -9541118, -15472047, -4166697}, + }, + { + {15636291, -9688557, 24204773, -7912398, 616977, -16685262, + 27787600, -14772189, 28944400, -1550024}, + {16568933, 4717097, -11556148, -1102322, 15682896, -11807043, + 16354577, -11775962, 7689662, 11199574}, + {30464156, -5976125, -11779434, -15670865, 23220365, 15915852, + 7512774, 10017326, -17749093, -9920357}, + }, + { + {-17036878, 13921892, 10945806, -6033431, 27105052, -16084379, + -28926210, 15006023, 
3284568, -6276540}, + {23599295, -8306047, -11193664, -7687416, 13236774, 10506355, + 7464579, 9656445, 13059162, 10374397}, + {7798556, 16710257, 3033922, 2874086, 28997861, 2835604, 32406664, + -3839045, -641708, -101325}, + }, + { + {10861363, 11473154, 27284546, 1981175, -30064349, 12577861, + 32867885, 14515107, -15438304, 10819380}, + {4708026, 6336745, 20377586, 9066809, -11272109, 6594696, -25653668, + 12483688, -12668491, 5581306}, + {19563160, 16186464, -29386857, 4097519, 10237984, -4348115, + 28542350, 13850243, -23678021, -15815942}, + }, + { + {-15371964, -12862754, 32573250, 4720197, -26436522, 5875511, + -19188627, -15224819, -9818940, -12085777}, + {-8549212, 109983, 15149363, 2178705, 22900618, 4543417, 3044240, + -15689887, 1762328, 14866737}, + {-18199695, -15951423, -10473290, 1707278, -17185920, 3916101, + -28236412, 3959421, 27914454, 4383652}, + }, + { + {5153746, 9909285, 1723747, -2777874, 30523605, 5516873, 19480852, + 5230134, -23952439, -15175766}, + {-30269007, -3463509, 7665486, 10083793, 28475525, 1649722, + 20654025, 16520125, 30598449, 7715701}, + {28881845, 14381568, 9657904, 3680757, -20181635, 7843316, + -31400660, 1370708, 29794553, -1409300}, + }, + { + {14499471, -2729599, -33191113, -4254652, 28494862, 14271267, + 30290735, 10876454, -33154098, 2381726}, + {-7195431, -2655363, -14730155, 462251, -27724326, 3941372, + -6236617, 3696005, -32300832, 15351955}, + {27431194, 8222322, 16448760, -3907995, -18707002, 11938355, + -32961401, -2970515, 29551813, 10109425}, + }, + }, + { + { + {-13657040, -13155431, -31283750, 11777098, 21447386, 6519384, + -2378284, -1627556, 10092783, -4764171}, + {27939166, 14210322, 4677035, 16277044, -22964462, -12398139, + -32508754, 12005538, -17810127, 12803510}, + {17228999, -15661624, -1233527, 300140, -1224870, -11714777, + 30364213, -9038194, 18016357, 4397660}, + }, + { + {-10958843, -7690207, 4776341, -14954238, 27850028, -15602212, + -26619106, 14544525, -17477504, 982639}, + {29253598, 15796703, -2863982, -9908884, 10057023, 3163536, 7332899, + -4120128, -21047696, 9934963}, + {5793303, 16271923, -24131614, -10116404, 29188560, 1206517, + -14747930, 4559895, -30123922, -10897950}, + }, + { + {-27643952, -11493006, 16282657, -11036493, 28414021, -15012264, + 24191034, 4541697, -13338309, 5500568}, + {12650548, -1497113, 9052871, 11355358, -17680037, -8400164, + -17430592, 12264343, 10874051, 13524335}, + {25556948, -3045990, 714651, 2510400, 23394682, -10415330, 33119038, + 5080568, -22528059, 5376628}, + }, + { + {-26088264, -4011052, -17013699, -3537628, -6726793, 1920897, + -22321305, -9447443, 4535768, 1569007}, + {-2255422, 14606630, -21692440, -8039818, 28430649, 8775819, + -30494562, 3044290, 31848280, 12543772}, + {-22028579, 2943893, -31857513, 6777306, 13784462, -4292203, + -27377195, -2062731, 7718482, 14474653}, + }, + { + {2385315, 2454213, -22631320, 46603, -4437935, -15680415, 656965, + -7236665, 24316168, -5253567}, + {13741529, 10911568, -33233417, -8603737, -20177830, -1033297, + 33040651, -13424532, -20729456, 8321686}, + {21060490, -2212744, 15712757, -4336099, 1639040, 10656336, + 23845965, -11874838, -9984458, 608372}, + }, + { + {-13672732, -15087586, -10889693, -7557059, -6036909, 11305547, + 1123968, -6780577, 27229399, 23887}, + {-23244140, -294205, -11744728, 14712571, -29465699, -2029617, + 12797024, -6440308, -1633405, 16678954}, + {-29500620, 4770662, -16054387, 14001338, 7830047, 9564805, + -1508144, -4795045, -17169265, 4904953}, + }, + { + {24059557, 14617003, 19037157, -15039908, 
19766093, -14906429, + 5169211, 16191880, 2128236, -4326833}, + {-16981152, 4124966, -8540610, -10653797, 30336522, -14105247, + -29806336, 916033, -6882542, -2986532}, + {-22630907, 12419372, -7134229, -7473371, -16478904, 16739175, + 285431, 2763829, 15736322, 4143876}, + }, + { + {2379352, 11839345, -4110402, -5988665, 11274298, 794957, 212801, + -14594663, 23527084, -16458268}, + {33431127, -11130478, -17838966, -15626900, 8909499, 8376530, + -32625340, 4087881, -15188911, -14416214}, + {1767683, 7197987, -13205226, -2022635, -13091350, 448826, 5799055, + 4357868, -4774191, -16323038}, + }, + }, + { + { + {6721966, 13833823, -23523388, -1551314, 26354293, -11863321, + 23365147, -3949732, 7390890, 2759800}, + {4409041, 2052381, 23373853, 10530217, 7676779, -12885954, 21302353, + -4264057, 1244380, -12919645}, + {-4421239, 7169619, 4982368, -2957590, 30256825, -2777540, 14086413, + 9208236, 15886429, 16489664}, + }, + { + {1996075, 10375649, 14346367, 13311202, -6874135, -16438411, + -13693198, 398369, -30606455, -712933}, + {-25307465, 9795880, -2777414, 14878809, -33531835, 14780363, + 13348553, 12076947, -30836462, 5113182}, + {-17770784, 11797796, 31950843, 13929123, -25888302, 12288344, + -30341101, -7336386, 13847711, 5387222}, + }, + { + {-18582163, -3416217, 17824843, -2340966, 22744343, -10442611, + 8763061, 3617786, -19600662, 10370991}, + {20246567, -14369378, 22358229, -543712, 18507283, -10413996, + 14554437, -8746092, 32232924, 16763880}, + {9648505, 10094563, 26416693, 14745928, -30374318, -6472621, + 11094161, 15689506, 3140038, -16510092}, + }, + { + {-16160072, 5472695, 31895588, 4744994, 8823515, 10365685, + -27224800, 9448613, -28774454, 366295}, + {19153450, 11523972, -11096490, -6503142, -24647631, 5420647, + 28344573, 8041113, 719605, 11671788}, + {8678025, 2694440, -6808014, 2517372, 4964326, 11152271, -15432916, + -15266516, 27000813, -10195553}, + }, + { + {-15157904, 7134312, 8639287, -2814877, -7235688, 10421742, 564065, + 5336097, 6750977, -14521026}, + {11836410, -3979488, 26297894, 16080799, 23455045, 15735944, + 1695823, -8819122, 8169720, 16220347}, + {-18115838, 8653647, 17578566, -6092619, -8025777, -16012763, + -11144307, -2627664, -5990708, -14166033}, + }, + { + {-23308498, -10968312, 15213228, -10081214, -30853605, -11050004, + 27884329, 2847284, 2655861, 1738395}, + {-27537433, -14253021, -25336301, -8002780, -9370762, 8129821, + 21651608, -3239336, -19087449, -11005278}, + {1533110, 3437855, 23735889, 459276, 29970501, 11335377, 26030092, + 5821408, 10478196, 8544890}, + }, + { + {32173121, -16129311, 24896207, 3921497, 22579056, -3410854, + 19270449, 12217473, 17789017, -3395995}, + {-30552961, -2228401, -15578829, -10147201, 13243889, 517024, + 15479401, -3853233, 30460520, 1052596}, + {-11614875, 13323618, 32618793, 8175907, -15230173, 12596687, + 27491595, -4612359, 3179268, -9478891}, + }, + { + {31947069, -14366651, -4640583, -15339921, -15125977, -6039709, + -14756777, -16411740, 19072640, -9511060}, + {11685058, 11822410, 3158003, -13952594, 33402194, -4165066, + 5977896, -5215017, 473099, 5040608}, + {-20290863, 8198642, -27410132, 11602123, 1290375, -2799760, + 28326862, 1721092, -19558642, -3131606}, + }, + }, + { + { + {7881532, 10687937, 7578723, 7738378, -18951012, -2553952, 21820786, + 8076149, -27868496, 11538389}, + {-19935666, 3899861, 18283497, -6801568, -15728660, -11249211, + 8754525, 7446702, -5676054, 5797016}, + {-11295600, -3793569, -15782110, -7964573, 12708869, -8456199, + 2014099, -9050574, -2369172, -5877341}, + }, + { 
+ {-22472376, -11568741, -27682020, 1146375, 18956691, 16640559, + 1192730, -3714199, 15123619, 10811505}, + {14352098, -3419715, -18942044, 10822655, 32750596, 4699007, -70363, + 15776356, -28886779, -11974553}, + {-28241164, -8072475, -4978962, -5315317, 29416931, 1847569, + -20654173, -16484855, 4714547, -9600655}, + }, + { + {15200332, 8368572, 19679101, 15970074, -31872674, 1959451, + 24611599, -4543832, -11745876, 12340220}, + {12876937, -10480056, 33134381, 6590940, -6307776, 14872440, + 9613953, 8241152, 15370987, 9608631}, + {-4143277, -12014408, 8446281, -391603, 4407738, 13629032, -7724868, + 15866074, -28210621, -8814099}, + }, + { + {26660628, -15677655, 8393734, 358047, -7401291, 992988, -23904233, + 858697, 20571223, 8420556}, + {14620715, 13067227, -15447274, 8264467, 14106269, 15080814, + 33531827, 12516406, -21574435, -12476749}, + {236881, 10476226, 57258, -14677024, 6472998, 2466984, 17258519, + 7256740, 8791136, 15069930}, + }, + { + {1276410, -9371918, 22949635, -16322807, -23493039, -5702186, + 14711875, 4874229, -30663140, -2331391}, + {5855666, 4990204, -13711848, 7294284, -7804282, 1924647, -1423175, + -7912378, -33069337, 9234253}, + {20590503, -9018988, 31529744, -7352666, -2706834, 10650548, + 31559055, -11609587, 18979186, 13396066}, + }, + { + {24474287, 4968103, 22267082, 4407354, 24063882, -8325180, + -18816887, 13594782, 33514650, 7021958}, + {-11566906, -6565505, -21365085, 15928892, -26158305, 4315421, + -25948728, -3916677, -21480480, 12868082}, + {-28635013, 13504661, 19988037, -2132761, 21078225, 6443208, + -21446107, 2244500, -12455797, -8089383}, + }, + { + {-30595528, 13793479, -5852820, 319136, -25723172, -6263899, + 33086546, 8957937, -15233648, 5540521}, + {-11630176, -11503902, -8119500, -7643073, 2620056, 1022908, + -23710744, -1568984, -16128528, -14962807}, + {23152971, 775386, 27395463, 14006635, -9701118, 4649512, 1689819, + 892185, -11513277, -15205948}, + }, + { + {9770129, 9586738, 26496094, 4324120, 1556511, -3550024, 27453819, + 4763127, -19179614, 5867134}, + {-32765025, 1927590, 31726409, -4753295, 23962434, -16019500, + 27846559, 5931263, -29749703, -16108455}, + {27461885, -2977536, 22380810, 1815854, -23033753, -3031938, + 7283490, -15148073, -19526700, 7734629}, + }, + }, + { + { + {-8010264, -9590817, -11120403, 6196038, 29344158, -13430885, + 7585295, -3176626, 18549497, 15302069}, + {-32658337, -6171222, -7672793, -11051681, 6258878, 13504381, + 10458790, -6418461, -8872242, 8424746}, + {24687205, 8613276, -30667046, -3233545, 1863892, -1830544, + 19206234, 7134917, -11284482, -828919}, + }, + { + {11334899, -9218022, 8025293, 12707519, 17523892, -10476071, + 10243738, -14685461, -5066034, 16498837}, + {8911542, 6887158, -9584260, -6958590, 11145641, -9543680, 17303925, + -14124238, 6536641, 10543906}, + {-28946384, 15479763, -17466835, 568876, -1497683, 11223454, + -2669190, -16625574, -27235709, 8876771}, + }, + { + {-25742899, -12566864, -15649966, -846607, -33026686, -796288, + -33481822, 15824474, -604426, -9039817}, + {10330056, 70051, 7957388, -9002667, 9764902, 15609756, 27698697, + -4890037, 1657394, 3084098}, + {10477963, -7470260, 12119566, -13250805, 29016247, -5365589, + 31280319, 14396151, -30233575, 15272409}, + }, + { + {-12288309, 3169463, 28813183, 16658753, 25116432, -5630466, + -25173957, -12636138, -25014757, 1950504}, + {-26180358, 9489187, 11053416, -14746161, -31053720, 5825630, + -8384306, -8767532, 15341279, 8373727}, + {28685821, 7759505, -14378516, -12002860, -31971820, 4079242, + 298136, 
-10232602, -2878207, 15190420}, + }, + { + {-32932876, 13806336, -14337485, -15794431, -24004620, 10940928, + 8669718, 2742393, -26033313, -6875003}, + {-1580388, -11729417, -25979658, -11445023, -17411874, -10912854, + 9291594, -16247779, -12154742, 6048605}, + {-30305315, 14843444, 1539301, 11864366, 20201677, 1900163, + 13934231, 5128323, 11213262, 9168384}, + }, + { + {-26280513, 11007847, 19408960, -940758, -18592965, -4328580, + -5088060, -11105150, 20470157, -16398701}, + {-23136053, 9282192, 14855179, -15390078, -7362815, -14408560, + -22783952, 14461608, 14042978, 5230683}, + {29969567, -2741594, -16711867, -8552442, 9175486, -2468974, + 21556951, 3506042, -5933891, -12449708}, + }, + { + {-3144746, 8744661, 19704003, 4581278, -20430686, 6830683, + -21284170, 8971513, -28539189, 15326563}, + {-19464629, 10110288, -17262528, -3503892, -23500387, 1355669, + -15523050, 15300988, -20514118, 9168260}, + {-5353335, 4488613, -23803248, 16314347, 7780487, -15638939, + -28948358, 9601605, 33087103, -9011387}, + }, + { + {-19443170, -15512900, -20797467, -12445323, -29824447, 10229461, + -27444329, -15000531, -5996870, 15664672}, + {23294591, -16632613, -22650781, -8470978, 27844204, 11461195, + 13099750, -2460356, 18151676, 13417686}, + {-24722913, -4176517, -31150679, 5988919, -26858785, 6685065, + 1661597, -12551441, 15271676, -15452665}, + }, + }, + { + { + {11433042, -13228665, 8239631, -5279517, -1985436, -725718, + -18698764, 2167544, -6921301, -13440182}, + {-31436171, 15575146, 30436815, 12192228, -22463353, 9395379, + -9917708, -8638997, 12215110, 12028277}, + {14098400, 6555944, 23007258, 5757252, -15427832, -12950502, + 30123440, 4617780, -16900089, -655628}, + }, + { + {-4026201, -15240835, 11893168, 13718664, -14809462, 1847385, + -15819999, 10154009, 23973261, -12684474}, + {-26531820, -3695990, -1908898, 2534301, -31870557, -16550355, + 18341390, -11419951, 32013174, -10103539}, + {-25479301, 10876443, -11771086, -14625140, -12369567, 1838104, + 21911214, 6354752, 4425632, -837822}, + }, + { + {-10433389, -14612966, 22229858, -3091047, -13191166, 776729, + -17415375, -12020462, 4725005, 14044970}, + {19268650, -7304421, 1555349, 8692754, -21474059, -9910664, 6347390, + -1411784, -19522291, -16109756}, + {-24864089, 12986008, -10898878, -5558584, -11312371, -148526, + 19541418, 8180106, 9282262, 10282508}, + }, + { + {-26205082, 4428547, -8661196, -13194263, 4098402, -14165257, + 15522535, 8372215, 5542595, -10702683}, + {-10562541, 14895633, 26814552, -16673850, -17480754, -2489360, + -2781891, 6993761, -18093885, 10114655}, + {-20107055, -929418, 31422704, 10427861, -7110749, 6150669, + -29091755, -11529146, 25953725, -106158}, + }, + { + {-4234397, -8039292, -9119125, 3046000, 2101609, -12607294, + 19390020, 6094296, -3315279, 12831125}, + {-15998678, 7578152, 5310217, 14408357, -33548620, -224739, + 31575954, 6326196, 7381791, -2421839}, + {-20902779, 3296811, 24736065, -16328389, 18374254, 7318640, + 6295303, 8082724, -15362489, 12339664}, + }, + { + {27724736, 2291157, 6088201, -14184798, 1792727, 5857634, 13848414, + 15768922, 25091167, 14856294}, + {-18866652, 8331043, 24373479, 8541013, -701998, -9269457, 12927300, + -12695493, -22182473, -9012899}, + {-11423429, -5421590, 11632845, 3405020, 30536730, -11674039, + -27260765, 13866390, 30146206, 9142070}, + }, + { + {3924129, -15307516, -13817122, -10054960, 12291820, -668366, + -27702774, 9326384, -8237858, 4171294}, + {-15921940, 16037937, 6713787, 16606682, -21612135, 2790944, + 26396185, 3731949, 345228, 
-5462949}, + {-21327538, 13448259, 25284571, 1143661, 20614966, -8849387, + 2031539, -12391231, -16253183, -13582083}, + }, + { + {31016211, -16722429, 26371392, -14451233, -5027349, 14854137, + 17477601, 3842657, 28012650, -16405420}, + {-5075835, 9368966, -8562079, -4600902, -15249953, 6970560, + -9189873, 16292057, -8867157, 3507940}, + {29439664, 3537914, 23333589, 6997794, -17555561, -11018068, + -15209202, -15051267, -9164929, 6580396}, + }, + }, + { + { + {-12185861, -7679788, 16438269, 10826160, -8696817, -6235611, + 17860444, -9273846, -2095802, 9304567}, + {20714564, -4336911, 29088195, 7406487, 11426967, -5095705, + 14792667, -14608617, 5289421, -477127}, + {-16665533, -10650790, -6160345, -13305760, 9192020, -1802462, + 17271490, 12349094, 26939669, -3752294}, + }, + { + {-12889898, 9373458, 31595848, 16374215, 21471720, 13221525, + -27283495, -12348559, -3698806, 117887}, + {22263325, -6560050, 3984570, -11174646, -15114008, -566785, + 28311253, 5358056, -23319780, 541964}, + {16259219, 3261970, 2309254, -15534474, -16885711, -4581916, + 24134070, -16705829, -13337066, -13552195}, + }, + { + {9378160, -13140186, -22845982, -12745264, 28198281, -7244098, + -2399684, -717351, 690426, 14876244}, + {24977353, -314384, -8223969, -13465086, 28432343, -1176353, + -13068804, -12297348, -22380984, 6618999}, + {-1538174, 11685646, 12944378, 13682314, -24389511, -14413193, + 8044829, -13817328, 32239829, -5652762}, + }, + { + {-18603066, 4762990, -926250, 8885304, -28412480, -3187315, 9781647, + -10350059, 32779359, 5095274}, + {-33008130, -5214506, -32264887, -3685216, 9460461, -9327423, + -24601656, 14506724, 21639561, -2630236}, + {-16400943, -13112215, 25239338, 15531969, 3987758, -4499318, + -1289502, -6863535, 17874574, 558605}, + }, + { + {-13600129, 10240081, 9171883, 16131053, -20869254, 9599700, + 33499487, 5080151, 2085892, 5119761}, + {-22205145, -2519528, -16381601, 414691, -25019550, 2170430, + 30634760, -8363614, -31999993, -5759884}, + {-6845704, 15791202, 8550074, -1312654, 29928809, -12092256, + 27534430, -7192145, -22351378, 12961482}, + }, + { + {-24492060, -9570771, 10368194, 11582341, -23397293, -2245287, + 16533930, 8206996, -30194652, -5159638}, + {-11121496, -3382234, 2307366, 6362031, -135455, 8868177, -16835630, + 7031275, 7589640, 8945490}, + {-32152748, 8917967, 6661220, -11677616, -1192060, -15793393, + 7251489, -11182180, 24099109, -14456170}, + }, + { + {5019558, -7907470, 4244127, -14714356, -26933272, 6453165, + -19118182, -13289025, -6231896, -10280736}, + {10853594, 10721687, 26480089, 5861829, -22995819, 1972175, + -1866647, -10557898, -3363451, -6441124}, + {-17002408, 5906790, 221599, -6563147, 7828208, -13248918, 24362661, + -2008168, -13866408, 7421392}, + }, + { + {8139927, -6546497, 32257646, -5890546, 30375719, 1886181, + -21175108, 15441252, 28826358, -4123029}, + {6267086, 9695052, 7709135, -16603597, -32869068, -1886135, + 14795160, -7840124, 13746021, -1742048}, + {28584902, 7787108, -6732942, -15050729, 22846041, -7571236, + -3181936, -363524, 4771362, -8419958}, + }, + }, + { + { + {24949256, 6376279, -27466481, -8174608, -18646154, -9930606, + 33543569, -12141695, 3569627, 11342593}, + {26514989, 4740088, 27912651, 3697550, 19331575, -11472339, 6809886, + 4608608, 7325975, -14801071}, + {-11618399, -14554430, -24321212, 7655128, -1369274, 5214312, + -27400540, 10258390, -17646694, -8186692}, + }, + { + {11431204, 15823007, 26570245, 14329124, 18029990, 4796082, + -31446179, 15580664, 9280358, -3973687}, + {-160783, -10326257, -22855316, 
-4304997, -20861367, -13621002, + -32810901, -11181622, -15545091, 4387441}, + {-20799378, 12194512, 3937617, -5805892, -27154820, 9340370, + -24513992, 8548137, 20617071, -7482001}, + }, + { + {-938825, -3930586, -8714311, 16124718, 24603125, -6225393, + -13775352, -11875822, 24345683, 10325460}, + {-19855277, -1568885, -22202708, 8714034, 14007766, 6928528, + 16318175, -1010689, 4766743, 3552007}, + {-21751364, -16730916, 1351763, -803421, -4009670, 3950935, 3217514, + 14481909, 10988822, -3994762}, + }, + { + {15564307, -14311570, 3101243, 5684148, 30446780, -8051356, + 12677127, -6505343, -8295852, 13296005}, + {-9442290, 6624296, -30298964, -11913677, -4670981, -2057379, + 31521204, 9614054, -30000824, 12074674}, + {4771191, -135239, 14290749, -13089852, 27992298, 14998318, + -1413936, -1556716, 29832613, -16391035}, + }, + { + {7064884, -7541174, -19161962, -5067537, -18891269, -2912736, + 25825242, 5293297, -27122660, 13101590}, + {-2298563, 2439670, -7466610, 1719965, -27267541, -16328445, + 32512469, -5317593, -30356070, -4190957}, + {-30006540, 10162316, -33180176, 3981723, -16482138, -13070044, + 14413974, 9515896, 19568978, 9628812}, + }, + { + {33053803, 199357, 15894591, 1583059, 27380243, -4580435, -17838894, + -6106839, -6291786, 3437740}, + {-18978877, 3884493, 19469877, 12726490, 15913552, 13614290, + -22961733, 70104, 7463304, 4176122}, + {-27124001, 10659917, 11482427, -16070381, 12771467, -6635117, + -32719404, -5322751, 24216882, 5944158}, + }, + { + {8894125, 7450974, -2664149, -9765752, -28080517, -12389115, + 19345746, 14680796, 11632993, 5847885}, + {26942781, -2315317, 9129564, -4906607, 26024105, 11769399, + -11518837, 6367194, -9727230, 4782140}, + {19916461, -4828410, -22910704, -11414391, 25606324, -5972441, + 33253853, 8220911, 6358847, -1873857}, + }, + { + {801428, -2081702, 16569428, 11065167, 29875704, 96627, 7908388, + -4480480, -13538503, 1387155}, + {19646058, 5720633, -11416706, 12814209, 11607948, 12749789, + 14147075, 15156355, -21866831, 11835260}, + {19299512, 1155910, 28703737, 14890794, 2925026, 7269399, 26121523, + 15467869, -26560550, 5052483}, + }, + }, + { + { + {-3017432, 10058206, 1980837, 3964243, 22160966, 12322533, -6431123, + -12618185, 12228557, -7003677}, + {32944382, 14922211, -22844894, 5188528, 21913450, -8719943, + 4001465, 13238564, -6114803, 8653815}, + {22865569, -4652735, 27603668, -12545395, 14348958, 8234005, + 24808405, 5719875, 28483275, 2841751}, + }, + { + {-16420968, -1113305, -327719, -12107856, 21886282, -15552774, + -1887966, -315658, 19932058, -12739203}, + {-11656086, 10087521, -8864888, -5536143, -19278573, -3055912, + 3999228, 13239134, -4777469, -13910208}, + {1382174, -11694719, 17266790, 9194690, -13324356, 9720081, + 20403944, 11284705, -14013818, 3093230}, + }, + { + {16650921, -11037932, -1064178, 1570629, -8329746, 7352753, -302424, + 16271225, -24049421, -6691850}, + {-21911077, -5927941, -4611316, -5560156, -31744103, -10785293, + 24123614, 15193618, -21652117, -16739389}, + {-9935934, -4289447, -25279823, 4372842, 2087473, 10399484, + 31870908, 14690798, 17361620, 11864968}, + }, + { + {-11307610, 6210372, 13206574, 5806320, -29017692, -13967200, + -12331205, -7486601, -25578460, -16240689}, + {14668462, -12270235, 26039039, 15305210, 25515617, 4542480, + 10453892, 6577524, 9145645, -6443880}, + {5974874, 3053895, -9433049, -10385191, -31865124, 3225009, + -7972642, 3936128, -5652273, -3050304}, + }, + { + {30625386, -4729400, -25555961, -12792866, -20484575, 7695099, + 17097188, -16303496, -27999779, 
1803632}, + {-3553091, 9865099, -5228566, 4272701, -5673832, -16689700, + 14911344, 12196514, -21405489, 7047412}, + {20093277, 9920966, -11138194, -5343857, 13161587, 12044805, + -32856851, 4124601, -32343828, -10257566}, + }, + { + {-20788824, 14084654, -13531713, 7842147, 19119038, -13822605, + 4752377, -8714640, -21679658, 2288038}, + {-26819236, -3283715, 29965059, 3039786, -14473765, 2540457, + 29457502, 14625692, -24819617, 12570232}, + {-1063558, -11551823, 16920318, 12494842, 1278292, -5869109, + -21159943, -3498680, -11974704, 4724943}, + }, + { + {17960970, -11775534, -4140968, -9702530, -8876562, -1410617, + -12907383, -8659932, -29576300, 1903856}, + {23134274, -14279132, -10681997, -1611936, 20684485, 15770816, + -12989750, 3190296, 26955097, 14109738}, + {15308788, 5320727, -30113809, -14318877, 22902008, 7767164, + 29425325, -11277562, 31960942, 11934971}, + }, + { + {-27395711, 8435796, 4109644, 12222639, -24627868, 14818669, + 20638173, 4875028, 10491392, 1379718}, + {-13159415, 9197841, 3875503, -8936108, -1383712, -5879801, + 33518459, 16176658, 21432314, 12180697}, + {-11787308, 11500838, 13787581, -13832590, -22430679, 10140205, + 1465425, 12689540, -10301319, -13872883}, + }, + }, + { + { + {5414091, -15386041, -21007664, 9643570, 12834970, 1186149, + -2622916, -1342231, 26128231, 6032912}, + {-26337395, -13766162, 32496025, -13653919, 17847801, -12669156, + 3604025, 8316894, -25875034, -10437358}, + {3296484, 6223048, 24680646, -12246460, -23052020, 5903205, + -8862297, -4639164, 12376617, 3188849}, + }, + { + {29190488, -14659046, 27549113, -1183516, 3520066, -10697301, + 32049515, -7309113, -16109234, -9852307}, + {-14744486, -9309156, 735818, -598978, -20407687, -5057904, + 25246078, -15795669, 18640741, -960977}, + {-6928835, -16430795, 10361374, 5642961, 4910474, 12345252, + -31638386, -494430, 10530747, 1053335}, + }, + { + {-29265967, -14186805, -13538216, -12117373, -19457059, -10655384, + -31462369, -2948985, 24018831, 15026644}, + {-22592535, -3145277, -2289276, 5953843, -13440189, 9425631, + 25310643, 13003497, -2314791, -15145616}, + {-27419985, -603321, -8043984, -1669117, -26092265, 13987819, + -27297622, 187899, -23166419, -2531735}, + }, + { + {-21744398, -13810475, 1844840, 5021428, -10434399, -15911473, + 9716667, 16266922, -5070217, 726099}, + {29370922, -6053998, 7334071, -15342259, 9385287, 2247707, + -13661962, -4839461, 30007388, -15823341}, + {-936379, 16086691, 23751945, -543318, -1167538, -5189036, 9137109, + 730663, 9835848, 4555336}, + }, + { + {-23376435, 1410446, -22253753, -12899614, 30867635, 15826977, + 17693930, 544696, -11985298, 12422646}, + {31117226, -12215734, -13502838, 6561947, -9876867, -12757670, + -5118685, -4096706, 29120153, 13924425}, + {-17400879, -14233209, 19675799, -2734756, -11006962, -5858820, + -9383939, -11317700, 7240931, -237388}, + }, + { + {-31361739, -11346780, -15007447, -5856218, -22453340, -12152771, + 1222336, 4389483, 3293637, -15551743}, + {-16684801, -14444245, 11038544, 11054958, -13801175, -3338533, + -24319580, 7733547, 12796905, -6335822}, + {-8759414, -10817836, -25418864, 10783769, -30615557, -9746811, + -28253339, 3647836, 3222231, -11160462}, + }, + { + {18606113, 1693100, -25448386, -15170272, 4112353, 10045021, + 23603893, -2048234, -7550776, 2484985}, + {9255317, -3131197, -12156162, -1004256, 13098013, -9214866, + 16377220, -2102812, -19802075, -3034702}, + {-22729289, 7496160, -5742199, 11329249, 19991973, -3347502, + -31718148, 9936966, -30097688, -10618797}, + }, + { + {21878590, 
-5001297, 4338336, 13643897, -3036865, 13160960, + 19708896, 5415497, -7360503, -4109293}, + {27736861, 10103576, 12500508, 8502413, -3413016, -9633558, + 10436918, -1550276, -23659143, -8132100}, + {19492550, -12104365, -29681976, -852630, -3208171, 12403437, + 30066266, 8367329, 13243957, 8709688}, + }, + }, + { + { + {12015105, 2801261, 28198131, 10151021, 24818120, -4743133, + -11194191, -5645734, 5150968, 7274186}, + {2831366, -12492146, 1478975, 6122054, 23825128, -12733586, + 31097299, 6083058, 31021603, -9793610}, + {-2529932, -2229646, 445613, 10720828, -13849527, -11505937, + -23507731, 16354465, 15067285, -14147707}, + }, + { + {7840942, 14037873, -33364863, 15934016, -728213, -3642706, + 21403988, 1057586, -19379462, -12403220}, + {915865, -16469274, 15608285, -8789130, -24357026, 6060030, + -17371319, 8410997, -7220461, 16527025}, + {32922597, -556987, 20336074, -16184568, 10903705, -5384487, + 16957574, 52992, 23834301, 6588044}, + }, + { + {32752030, 11232950, 3381995, -8714866, 22652988, -10744103, + 17159699, 16689107, -20314580, -1305992}, + {-4689649, 9166776, -25710296, -10847306, 11576752, 12733943, + 7924251, -2752281, 1976123, -7249027}, + {21251222, 16309901, -2983015, -6783122, 30810597, 12967303, 156041, + -3371252, 12331345, -8237197}, + }, + { + {8651614, -4477032, -16085636, -4996994, 13002507, 2950805, + 29054427, -5106970, 10008136, -4667901}, + {31486080, 15114593, -14261250, 12951354, 14369431, -7387845, + 16347321, -13662089, 8684155, -10532952}, + {19443825, 11385320, 24468943, -9659068, -23919258, 2187569, + -26263207, -6086921, 31316348, 14219878}, + }, + { + {-28594490, 1193785, 32245219, 11392485, 31092169, 15722801, + 27146014, 6992409, 29126555, 9207390}, + {32382935, 1110093, 18477781, 11028262, -27411763, -7548111, + -4980517, 10843782, -7957600, -14435730}, + {2814918, 7836403, 27519878, -7868156, -20894015, -11553689, + -21494559, 8550130, 28346258, 1994730}, + }, + { + {-19578299, 8085545, -14000519, -3948622, 2785838, -16231307, + -19516951, 7174894, 22628102, 8115180}, + {-30405132, 955511, -11133838, -15078069, -32447087, -13278079, + -25651578, 3317160, -9943017, 930272}, + {-15303681, -6833769, 28856490, 1357446, 23421993, 1057177, + 24091212, -1388970, -22765376, -10650715}, + }, + { + {-22751231, -5303997, -12907607, -12768866, -15811511, -7797053, + -14839018, -16554220, -1867018, 8398970}, + {-31969310, 2106403, -4736360, 1362501, 12813763, 16200670, + 22981545, -6291273, 18009408, -15772772}, + {-17220923, -9545221, -27784654, 14166835, 29815394, 7444469, + 29551787, -3727419, 19288549, 1325865}, + }, + { + {15100157, -15835752, -23923978, -1005098, -26450192, 15509408, + 12376730, -3479146, 33166107, -8042750}, + {20909231, 13023121, -9209752, 16251778, -5778415, -8094914, + 12412151, 10018715, 2213263, -13878373}, + {32529814, -11074689, 30361439, -16689753, -9135940, 1513226, + 22922121, 6382134, -5766928, 8371348}, + }, + }, + { + { + {9923462, 11271500, 12616794, 3544722, -29998368, -1721626, + 12891687, -8193132, -26442943, 10486144}, + {-22597207, -7012665, 8587003, -8257861, 4084309, -12970062, 361726, + 2610596, -23921530, -11455195}, + {5408411, -1136691, -4969122, 10561668, 24145918, 14240566, + 31319731, -4235541, 19985175, -3436086}, + }, + { + {-13994457, 16616821, 14549246, 3341099, 32155958, 13648976, + -17577068, 8849297, 65030, 8370684}, + {-8320926, -12049626, 31204563, 5839400, -20627288, -1057277, + -19442942, 6922164, 12743482, -9800518}, + {-2361371, 12678785, 28815050, 4759974, -23893047, 4884717, + 23783145, 
11038569, 18800704, 255233}, + }, + { + {-5269658, -1773886, 13957886, 7990715, 23132995, 728773, 13393847, + 9066957, 19258688, -14753793}, + {-2936654, -10827535, -10432089, 14516793, -3640786, 4372541, + -31934921, 2209390, -1524053, 2055794}, + {580882, 16705327, 5468415, -2683018, -30926419, -14696000, + -7203346, -8994389, -30021019, 7394435}, + }, + { + {23838809, 1822728, -15738443, 15242727, 8318092, -3733104, + -21672180, -3492205, -4821741, 14799921}, + {13345610, 9759151, 3371034, -16137791, 16353039, 8577942, 31129804, + 13496856, -9056018, 7402518}, + {2286874, -4435931, -20042458, -2008336, -13696227, 5038122, + 11006906, -15760352, 8205061, 1607563}, + }, + { + {14414086, -8002132, 3331830, -3208217, 22249151, -5594188, + 18364661, -2906958, 30019587, -9029278}, + {-27688051, 1585953, -10775053, 931069, -29120221, -11002319, + -14410829, 12029093, 9944378, 8024}, + {4368715, -3709630, 29874200, -15022983, -20230386, -11410704, + -16114594, -999085, -8142388, 5640030}, + }, + { + {10299610, 13746483, 11661824, 16234854, 7630238, 5998374, 9809887, + -16694564, 15219798, -14327783}, + {27425505, -5719081, 3055006, 10660664, 23458024, 595578, -15398605, + -1173195, -18342183, 9742717}, + {6744077, 2427284, 26042789, 2720740, -847906, 1118974, 32324614, + 7406442, 12420155, 1994844}, + }, + { + {14012521, -5024720, -18384453, -9578469, -26485342, -3936439, + -13033478, -10909803, 24319929, -6446333}, + {16412690, -4507367, 10772641, 15929391, -17068788, -4658621, + 10555945, -10484049, -30102368, -4739048}, + {22397382, -7767684, -9293161, -12792868, 17166287, -9755136, + -27333065, 6199366, 21880021, -12250760}, + }, + { + {-4283307, 5368523, -31117018, 8163389, -30323063, 3209128, + 16557151, 8890729, 8840445, 4957760}, + {-15447727, 709327, -6919446, -10870178, -29777922, 6522332, + -21720181, 12130072, -14796503, 5005757}, + {-2114751, -14308128, 23019042, 15765735, -25269683, 6002752, + 10183197, -13239326, -16395286, -2176112}, + }, + }, + { + { + {-19025756, 1632005, 13466291, -7995100, -23640451, 16573537, + -32013908, -3057104, 22208662, 2000468}, + {3065073, -1412761, -25598674, -361432, -17683065, -5703415, + -8164212, 11248527, -3691214, -7414184}, + {10379208, -6045554, 8877319, 1473647, -29291284, -12507580, + 16690915, 2553332, -3132688, 16400289}, + }, + { + {15716668, 1254266, -18472690, 7446274, -8448918, 6344164, + -22097271, -7285580, 26894937, 9132066}, + {24158887, 12938817, 11085297, -8177598, -28063478, -4457083, + -30576463, 64452, -6817084, -2692882}, + {13488534, 7794716, 22236231, 5989356, 25426474, -12578208, 2350710, + -3418511, -4688006, 2364226}, + }, + { + {16335052, 9132434, 25640582, 6678888, 1725628, 8517937, -11807024, + -11697457, 15445875, -7798101}, + {29004207, -7867081, 28661402, -640412, -12794003, -7943086, + 31863255, -4135540, -278050, -15759279}, + {-6122061, -14866665, -28614905, 14569919, -10857999, -3591829, + 10343412, -6976290, -29828287, -10815811}, + }, + { + {27081650, 3463984, 14099042, -4517604, 1616303, -6205604, 29542636, + 15372179, 17293797, 960709}, + {20263915, 11434237, -5765435, 11236810, 13505955, -10857102, + -16111345, 6493122, -19384511, 7639714}, + {-2830798, -14839232, 25403038, -8215196, -8317012, -16173699, + 18006287, -16043750, 29994677, -15808121}, + }, + { + {9769828, 5202651, -24157398, -13631392, -28051003, -11561624, + -24613141, -13860782, -31184575, 709464}, + {12286395, 13076066, -21775189, -1176622, -25003198, 4057652, + -32018128, -8890874, 16102007, 13205847}, + {13733362, 5599946, 10557076, 
3195751, -5557991, 8536970, -25540170, + 8525972, 10151379, 10394400}, + }, + { + {4024660, -16137551, 22436262, 12276534, -9099015, -2686099, + 19698229, 11743039, -33302334, 8934414}, + {-15879800, -4525240, -8580747, -2934061, 14634845, -698278, + -9449077, 3137094, -11536886, 11721158}, + {17555939, -5013938, 8268606, 2331751, -22738815, 9761013, 9319229, + 8835153, -9205489, -1280045}, + }, + { + {-461409, -7830014, 20614118, 16688288, -7514766, -4807119, + 22300304, 505429, 6108462, -6183415}, + {-5070281, 12367917, -30663534, 3234473, 32617080, -8422642, + 29880583, -13483331, -26898490, -7867459}, + {-31975283, 5726539, 26934134, 10237677, -3173717, -605053, + 24199304, 3795095, 7592688, -14992079}, + }, + { + {21594432, -14964228, 17466408, -4077222, 32537084, 2739898, + 6407723, 12018833, -28256052, 4298412}, + {-20650503, -11961496, -27236275, 570498, 3767144, -1717540, + 13891942, -1569194, 13717174, 10805743}, + {-14676630, -15644296, 15287174, 11927123, 24177847, -8175568, + -796431, 14860609, -26938930, -5863836}, + }, + }, + { + { + {12962541, 5311799, -10060768, 11658280, 18855286, -7954201, + 13286263, -12808704, -4381056, 9882022}, + {18512079, 11319350, -20123124, 15090309, 18818594, 5271736, + -22727904, 3666879, -23967430, -3299429}, + {-6789020, -3146043, 16192429, 13241070, 15898607, -14206114, + -10084880, -6661110, -2403099, 5276065}, + }, + { + {30169808, -5317648, 26306206, -11750859, 27814964, 7069267, + 7152851, 3684982, 1449224, 13082861}, + {10342826, 3098505, 2119311, 193222, 25702612, 12233820, 23697382, + 15056736, -21016438, -8202000}, + {-33150110, 3261608, 22745853, 7948688, 19370557, -15177665, + -26171976, 6482814, -10300080, -11060101}, + }, + { + {32869458, -5408545, 25609743, 15678670, -10687769, -15471071, + 26112421, 2521008, -22664288, 6904815}, + {29506923, 4457497, 3377935, -9796444, -30510046, 12935080, 1561737, + 3841096, -29003639, -6657642}, + {10340844, -6630377, -18656632, -2278430, 12621151, -13339055, + 30878497, -11824370, -25584551, 5181966}, + }, + { + {25940115, -12658025, 17324188, -10307374, -8671468, 15029094, + 24396252, -16450922, -2322852, -12388574}, + {-21765684, 9916823, -1300409, 4079498, -1028346, 11909559, 1782390, + 12641087, 20603771, -6561742}, + {-18882287, -11673380, 24849422, 11501709, 13161720, -4768874, + 1925523, 11914390, 4662781, 7820689}, + }, + { + {12241050, -425982, 8132691, 9393934, 32846760, -1599620, 29749456, + 12172924, 16136752, 15264020}, + {-10349955, -14680563, -8211979, 2330220, -17662549, -14545780, + 10658213, 6671822, 19012087, 3772772}, + {3753511, -3421066, 10617074, 2028709, 14841030, -6721664, 28718732, + -15762884, 20527771, 12988982}, + }, + { + {-14822485, -5797269, -3707987, 12689773, -898983, -10914866, + -24183046, -10564943, 3299665, -12424953}, + {-16777703, -15253301, -9642417, 4978983, 3308785, 8755439, 6943197, + 6461331, -25583147, 8991218}, + {-17226263, 1816362, -1673288, -6086439, 31783888, -8175991, + -32948145, 7417950, -30242287, 1507265}, + }, + { + {29692663, 6829891, -10498800, 4334896, 20945975, -11906496, + -28887608, 8209391, 14606362, -10647073}, + {-3481570, 8707081, 32188102, 5672294, 22096700, 1711240, -33020695, + 9761487, 4170404, -2085325}, + {-11587470, 14855945, -4127778, -1531857, -26649089, 15084046, + 22186522, 16002000, -14276837, -8400798}, + }, + { + {-4811456, 13761029, -31703877, -2483919, -3312471, 7869047, + -7113572, -9620092, 13240845, 10965870}, + {-7742563, -8256762, -14768334, -13656260, -23232383, 12387166, + 4498947, 14147411, 29514390, 
4302863}, + {-13413405, -12407859, 20757302, -13801832, 14785143, 8976368, + -5061276, -2144373, 17846988, -13971927}, + }, + }, + { + { + {-2244452, -754728, -4597030, -1066309, -6247172, 1455299, + -21647728, -9214789, -5222701, 12650267}, + {-9906797, -16070310, 21134160, 12198166, -27064575, 708126, 387813, + 13770293, -19134326, 10958663}, + {22470984, 12369526, 23446014, -5441109, -21520802, -9698723, + -11772496, -11574455, -25083830, 4271862}, + }, + { + {-25169565, -10053642, -19909332, 15361595, -5984358, 2159192, + 75375, -4278529, -32526221, 8469673}, + {15854970, 4148314, -8893890, 7259002, 11666551, 13824734, + -30531198, 2697372, 24154791, -9460943}, + {15446137, -15806644, 29759747, 14019369, 30811221, -9610191, + -31582008, 12840104, 24913809, 9815020}, + }, + { + {-4709286, -5614269, -31841498, -12288893, -14443537, 10799414, + -9103676, 13438769, 18735128, 9466238}, + {11933045, 9281483, 5081055, -5183824, -2628162, -4905629, -7727821, + -10896103, -22728655, 16199064}, + {14576810, 379472, -26786533, -8317236, -29426508, -10812974, + -102766, 1876699, 30801119, 2164795}, + }, + { + {15995086, 3199873, 13672555, 13712240, -19378835, -4647646, + -13081610, -15496269, -13492807, 1268052}, + {-10290614, -3659039, -3286592, 10948818, 23037027, 3794475, + -3470338, -12600221, -17055369, 3565904}, + {29210088, -9419337, -5919792, -4952785, 10834811, -13327726, + -16512102, -10820713, -27162222, -14030531}, + }, + { + {-13161890, 15508588, 16663704, -8156150, -28349942, 9019123, + -29183421, -3769423, 2244111, -14001979}, + {-5152875, -3800936, -9306475, -6071583, 16243069, 14684434, + -25673088, -16180800, 13491506, 4641841}, + {10813417, 643330, -19188515, -728916, 30292062, -16600078, + 27548447, -7721242, 14476989, -12767431}, + }, + { + {10292079, 9984945, 6481436, 8279905, -7251514, 7032743, 27282937, + -1644259, -27912810, 12651324}, + {-31185513, -813383, 22271204, 11835308, 10201545, 15351028, + 17099662, 3988035, 21721536, -3148940}, + {10202177, -6545839, -31373232, -9574638, -32150642, -8119683, + -12906320, 3852694, 13216206, 14842320}, + }, + { + {-15815640, -10601066, -6538952, -7258995, -6984659, -6581778, + -31500847, 13765824, -27434397, 9900184}, + {14465505, -13833331, -32133984, -14738873, -27443187, 12990492, + 33046193, 15796406, -7051866, -8040114}, + {30924417, -8279620, 6359016, -12816335, 16508377, 9071735, + -25488601, 15413635, 9524356, -7018878}, + }, + { + {12274201, -13175547, 32627641, -1785326, 6736625, 13267305, + 5237659, -5109483, 15663516, 4035784}, + {-2951309, 8903985, 17349946, 601635, -16432815, -4612556, + -13732739, -15889334, -22258478, 4659091}, + {-16916263, -4952973, -30393711, -15158821, 20774812, 15897498, + 5736189, 15026997, -2178256, -13455585}, + }, + }, + { + { + {-8858980, -2219056, 28571666, -10155518, -474467, -10105698, + -3801496, 278095, 23440562, -290208}, + {10226241, -5928702, 15139956, 120818, -14867693, 5218603, 32937275, + 11551483, -16571960, -7442864}, + {17932739, -12437276, -24039557, 10749060, 11316803, 7535897, + 22503767, 5561594, -3646624, 3898661}, + }, + { + {7749907, -969567, -16339731, -16464, -25018111, 15122143, -1573531, + 7152530, 21831162, 1245233}, + {26958459, -14658026, 4314586, 8346991, -5677764, 11960072, + -32589295, -620035, -30402091, -16716212}, + {-12165896, 9166947, 33491384, 13673479, 29787085, 13096535, + 6280834, 14587357, -22338025, 13987525}, + }, + { + {-24349909, 7778775, 21116000, 15572597, -4833266, -5357778, + -4300898, -5124639, -7469781, -2858068}, + {9681908, -6737123, 
-31951644, 13591838, -6883821, 386950, 31622781, + 6439245, -14581012, 4091397}, + {-8426427, 1470727, -28109679, -1596990, 3978627, -5123623, + -19622683, 12092163, 29077877, -14741988}, + }, + { + {5269168, -6859726, -13230211, -8020715, 25932563, 1763552, + -5606110, -5505881, -20017847, 2357889}, + {32264008, -15407652, -5387735, -1160093, -2091322, -3946900, + 23104804, -12869908, 5727338, 189038}, + {14609123, -8954470, -6000566, -16622781, -14577387, -7743898, + -26745169, 10942115, -25888931, -14884697}, + }, + { + {20513500, 5557931, -15604613, 7829531, 26413943, -2019404, + -21378968, 7471781, 13913677, -5137875}, + {-25574376, 11967826, 29233242, 12948236, -6754465, 4713227, + -8940970, 14059180, 12878652, 8511905}, + {-25656801, 3393631, -2955415, -7075526, -2250709, 9366908, + -30223418, 6812974, 5568676, -3127656}, + }, + { + {11630004, 12144454, 2116339, 13606037, 27378885, 15676917, + -17408753, -13504373, -14395196, 8070818}, + {27117696, -10007378, -31282771, -5570088, 1127282, 12772488, + -29845906, 10483306, -11552749, -1028714}, + {10637467, -5688064, 5674781, 1072708, -26343588, -6982302, + -1683975, 9177853, -27493162, 15431203}, + }, + { + {20525145, 10892566, -12742472, 12779443, -29493034, 16150075, + -28240519, 14943142, -15056790, -7935931}, + {-30024462, 5626926, -551567, -9981087, 753598, 11981191, 25244767, + -3239766, -3356550, 9594024}, + {-23752644, 2636870, -5163910, -10103818, 585134, 7877383, 11345683, + -6492290, 13352335, -10977084}, + }, + { + {-1931799, -5407458, 3304649, -12884869, 17015806, -4877091, + -29783850, -7752482, -13215537, -319204}, + {20239939, 6607058, 6203985, 3483793, -18386976, -779229, -20723742, + 15077870, -22750759, 14523817}, + {27406042, -6041657, 27423596, -4497394, 4996214, 10002360, + -28842031, -4545494, -30172742, -4805667}, + }, + }, + { + { + {11374242, 12660715, 17861383, -12540833, 10935568, 1099227, + -13886076, -9091740, -27727044, 11358504}, + {-12730809, 10311867, 1510375, 10778093, -2119455, -9145702, + 32676003, 11149336, -26123651, 4985768}, + {-19096303, 341147, -6197485, -239033, 15756973, -8796662, -983043, + 13794114, -19414307, -15621255}, + }, + { + {6490081, 11940286, 25495923, -7726360, 8668373, -8751316, 3367603, + 6970005, -1691065, -9004790}, + {1656497, 13457317, 15370807, 6364910, 13605745, 8362338, -19174622, + -5475723, -16796596, -5031438}, + {-22273315, -13524424, -64685, -4334223, -18605636, -10921968, + -20571065, -7007978, -99853, -10237333}, + }, + { + {17747465, 10039260, 19368299, -4050591, -20630635, -16041286, + 31992683, -15857976, -29260363, -5511971}, + {31932027, -4986141, -19612382, 16366580, 22023614, 88450, 11371999, + -3744247, 4882242, -10626905}, + {29796507, 37186, 19818052, 10115756, -11829032, 3352736, 18551198, + 3272828, -5190932, -4162409}, + }, + { + {12501286, 4044383, -8612957, -13392385, -32430052, 5136599, + -19230378, -3529697, 330070, -3659409}, + {6384877, 2899513, 17807477, 7663917, -2358888, 12363165, 25366522, + -8573892, -271295, 12071499}, + {-8365515, -4042521, 25133448, -4517355, -6211027, 2265927, + -32769618, 1936675, -5159697, 3829363}, + }, + { + {28425966, -5835433, -577090, -4697198, -14217555, 6870930, 7921550, + -6567787, 26333140, 14267664}, + {-11067219, 11871231, 27385719, -10559544, -4585914, -11189312, + 10004786, -8709488, -21761224, 8930324}, + {-21197785, -16396035, 25654216, -1725397, 12282012, 11008919, + 1541940, 4757911, -26491501, -16408940}, + }, + { + {13537262, -7759490, -20604840, 10961927, -5922820, -13218065, + -13156584, 
6217254, -15943699, 13814990}, + {-17422573, 15157790, 18705543, 29619, 24409717, -260476, 27361681, + 9257833, -1956526, -1776914}, + {-25045300, -10191966, 15366585, 15166509, -13105086, 8423556, + -29171540, 12361135, -18685978, 4578290}, + }, + { + {24579768, 3711570, 1342322, -11180126, -27005135, 14124956, + -22544529, 14074919, 21964432, 8235257}, + {-6528613, -2411497, 9442966, -5925588, 12025640, -1487420, + -2981514, -1669206, 13006806, 2355433}, + {-16304899, -13605259, -6632427, -5142349, 16974359, -10911083, + 27202044, 1719366, 1141648, -12796236}, + }, + { + {-12863944, -13219986, -8318266, -11018091, -6810145, -4843894, + 13475066, -3133972, 32674895, 13715045}, + {11423335, -5468059, 32344216, 8962751, 24989809, 9241752, + -13265253, 16086212, -28740881, -15642093}, + {-1409668, 12530728, -6368726, 10847387, 19531186, -14132160, + -11709148, 7791794, -27245943, 4383347}, + }, + }, + { + { + {-28970898, 5271447, -1266009, -9736989, -12455236, 16732599, + -4862407, -4906449, 27193557, 6245191}, + {-15193956, 5362278, -1783893, 2695834, 4960227, 12840725, 23061898, + 3260492, 22510453, 8577507}, + {-12632451, 11257346, -32692994, 13548177, -721004, 10879011, + 31168030, 13952092, -29571492, -3635906}, + }, + { + {3877321, -9572739, 32416692, 5405324, -11004407, -13656635, + 3759769, 11935320, 5611860, 8164018}, + {-16275802, 14667797, 15906460, 12155291, -22111149, -9039718, + 32003002, -8832289, 5773085, -8422109}, + {-23788118, -8254300, 1950875, 8937633, 18686727, 16459170, -905725, + 12376320, 31632953, 190926}, + }, + { + {-24593607, -16138885, -8423991, 13378746, 14162407, 6901328, + -8288749, 4508564, -25341555, -3627528}, + {8884438, -5884009, 6023974, 10104341, -6881569, -4941533, 18722941, + -14786005, -1672488, 827625}, + {-32720583, -16289296, -32503547, 7101210, 13354605, 2659080, + -1800575, -14108036, -24878478, 1541286}, + }, + { + {2901347, -1117687, 3880376, -10059388, -17620940, -3612781, + -21802117, -3567481, 20456845, -1885033}, + {27019610, 12299467, -13658288, -1603234, -12861660, -4861471, + -19540150, -5016058, 29439641, 15138866}, + {21536104, -6626420, -32447818, -10690208, -22408077, 5175814, + -5420040, -16361163, 7779328, 109896}, + }, + { + {30279744, 14648750, -8044871, 6425558, 13639621, -743509, 28698390, + 12180118, 23177719, -554075}, + {26572847, 3405927, -31701700, 12890905, -19265668, 5335866, + -6493768, 2378492, 4439158, -13279347}, + {-22716706, 3489070, -9225266, -332753, 18875722, -1140095, + 14819434, -12731527, -17717757, -5461437}, + }, + { + {-5056483, 16566551, 15953661, 3767752, -10436499, 15627060, + -820954, 2177225, 8550082, -15114165}, + {-18473302, 16596775, -381660, 15663611, 22860960, 15585581, + -27844109, -3582739, -23260460, -8428588}, + {-32480551, 15707275, -8205912, -5652081, 29464558, 2713815, + -22725137, 15860482, -21902570, 1494193}, + }, + { + {-19562091, -14087393, -25583872, -9299552, 13127842, 759709, + 21923482, 16529112, 8742704, 12967017}, + {-28464899, 1553205, 32536856, -10473729, -24691605, -406174, + -8914625, -2933896, -29903758, 15553883}, + {21877909, 3230008, 9881174, 10539357, -4797115, 2841332, 11543572, + 14513274, 19375923, -12647961}, + }, + { + {8832269, -14495485, 13253511, 5137575, 5037871, 4078777, 24880818, + -6222716, 2862653, 9455043}, + {29306751, 5123106, 20245049, -14149889, 9592566, 8447059, -2077124, + -2990080, 15511449, 4789663}, + {-20679756, 7004547, 8824831, -9434977, -4045704, -3750736, + -5754762, 108893, 23513200, 16652362}, + }, + }, + { + { + {-33256173, 4144782, 
-4476029, -6579123, 10770039, -7155542, + -6650416, -12936300, -18319198, 10212860}, + {2756081, 8598110, 7383731, -6859892, 22312759, -1105012, 21179801, + 2600940, -9988298, -12506466}, + {-24645692, 13317462, -30449259, -15653928, 21365574, -10869657, + 11344424, 864440, -2499677, -16710063}, + }, + { + {-26432803, 6148329, -17184412, -14474154, 18782929, -275997, + -22561534, 211300, 2719757, 4940997}, + {-1323882, 3911313, -6948744, 14759765, -30027150, 7851207, + 21690126, 8518463, 26699843, 5276295}, + {-13149873, -6429067, 9396249, 365013, 24703301, -10488939, 1321586, + 149635, -15452774, 7159369}, + }, + { + {9987780, -3404759, 17507962, 9505530, 9731535, -2165514, 22356009, + 8312176, 22477218, -8403385}, + {18155857, -16504990, 19744716, 9006923, 15154154, -10538976, + 24256460, -4864995, -22548173, 9334109}, + {2986088, -4911893, 10776628, -3473844, 10620590, -7083203, + -21413845, 14253545, -22587149, 536906}, + }, + { + {4377756, 8115836, 24567078, 15495314, 11625074, 13064599, 7390551, + 10589625, 10838060, -15420424}, + {-19342404, 867880, 9277171, -3218459, -14431572, -1986443, + 19295826, -15796950, 6378260, 699185}, + {7895026, 4057113, -7081772, -13077756, -17886831, -323126, -716039, + 15693155, -5045064, -13373962}, + }, + { + {-7737563, -5869402, -14566319, -7406919, 11385654, 13201616, + 31730678, -10962840, -3918636, -9669325}, + {10188286, -15770834, -7336361, 13427543, 22223443, 14896287, + 30743455, 7116568, -21786507, 5427593}, + {696102, 13206899, 27047647, -10632082, 15285305, -9853179, + 10798490, -4578720, 19236243, 12477404}, + }, + { + {-11229439, 11243796, -17054270, -8040865, -788228, -8167967, + -3897669, 11180504, -23169516, 7733644}, + {17800790, -14036179, -27000429, -11766671, 23887827, 3149671, + 23466177, -10538171, 10322027, 15313801}, + {26246234, 11968874, 32263343, -5468728, 6830755, -13323031, + -15794704, -101982, -24449242, 10890804}, + }, + { + {-31365647, 10271363, -12660625, -6267268, 16690207, -13062544, + -14982212, 16484931, 25180797, -5334884}, + {-586574, 10376444, -32586414, -11286356, 19801893, 10997610, + 2276632, 9482883, 316878, 13820577}, + {-9882808, -4510367, -2115506, 16457136, -11100081, 11674996, + 30756178, -7515054, 30696930, -3712849}, + }, + { + {32988917, -9603412, 12499366, 7910787, -10617257, -11931514, + -7342816, -9985397, -32349517, 7392473}, + {-8855661, 15927861, 9866406, -3649411, -2396914, -16655781, + -30409476, -9134995, 25112947, -2926644}, + {-2504044, -436966, 25621774, -5678772, 15085042, -5479877, + -24884878, -13526194, 5537438, -13914319}, + }, + }, + { + { + {-11225584, 2320285, -9584280, 10149187, -33444663, 5808648, + -14876251, -1729667, 31234590, 6090599}, + {-9633316, 116426, 26083934, 2897444, -6364437, -2688086, 609721, + 15878753, -6970405, -9034768}, + {-27757857, 247744, -15194774, -9002551, 23288161, -10011936, + -23869595, 6503646, 20650474, 1804084}, + }, + { + {-27589786, 15456424, 8972517, 8469608, 15640622, 4439847, 3121995, + -10329713, 27842616, -202328}, + {-15306973, 2839644, 22530074, 10026331, 4602058, 5048462, 28248656, + 5031932, -11375082, 12714369}, + {20807691, -7270825, 29286141, 11421711, -27876523, -13868230, + -21227475, 1035546, -19733229, 12796920}, + }, + { + {12076899, -14301286, -8785001, -11848922, -25012791, 16400684, + -17591495, -12899438, 3480665, -15182815}, + {-32361549, 5457597, 28548107, 7833186, 7303070, -11953545, + -24363064, -15921875, -33374054, 2771025}, + {-21389266, 421932, 26597266, 6860826, 22486084, -6737172, + -17137485, -4210226, 
-24552282, 15673397}, + }, + { + {-20184622, 2338216, 19788685, -9620956, -4001265, -8740893, + -20271184, 4733254, 3727144, -12934448}, + {6120119, 814863, -11794402, -622716, 6812205, -15747771, 2019594, + 7975683, 31123697, -10958981}, + {30069250, -11435332, 30434654, 2958439, 18399564, -976289, + 12296869, 9204260, -16432438, 9648165}, + }, + { + {32705432, -1550977, 30705658, 7451065, -11805606, 9631813, 3305266, + 5248604, -26008332, -11377501}, + {17219865, 2375039, -31570947, -5575615, -19459679, 9219903, 294711, + 15298639, 2662509, -16297073}, + {-1172927, -7558695, -4366770, -4287744, -21346413, -8434326, + 32087529, -1222777, 32247248, -14389861}, + }, + { + {14312628, 1221556, 17395390, -8700143, -4945741, -8684635, + -28197744, -9637817, -16027623, -13378845}, + {-1428825, -9678990, -9235681, 6549687, -7383069, -468664, 23046502, + 9803137, 17597934, 2346211}, + {18510800, 15337574, 26171504, 981392, -22241552, 7827556, + -23491134, -11323352, 3059833, -11782870}, + }, + { + {10141598, 6082907, 17829293, -1947643, 9830092, 13613136, + -25556636, -5544586, -33502212, 3592096}, + {33114168, -15889352, -26525686, -13343397, 33076705, 8716171, + 1151462, 1521897, -982665, -6837803}, + {-32939165, -4255815, 23947181, -324178, -33072974, -12305637, + -16637686, 3891704, 26353178, 693168}, + }, + { + {30374239, 1595580, -16884039, 13186931, 4600344, 406904, 9585294, + -400668, 31375464, 14369965}, + {-14370654, -7772529, 1510301, 6434173, -18784789, -6262728, + 32732230, -13108839, 17901441, 16011505}, + {18171223, -11934626, -12500402, 15197122, -11038147, -15230035, + -19172240, -16046376, 8764035, 12309598}, + }, + }, + { + { + {5975908, -5243188, -19459362, -9681747, -11541277, 14015782, + -23665757, 1228319, 17544096, -10593782}, + {5811932, -1715293, 3442887, -2269310, -18367348, -8359541, + -18044043, -15410127, -5565381, 12348900}, + {-31399660, 11407555, 25755363, 6891399, -3256938, 14872274, + -24849353, 8141295, -10632534, -585479}, + }, + { + {-12675304, 694026, -5076145, 13300344, 14015258, -14451394, + -9698672, -11329050, 30944593, 1130208}, + {8247766, -6710942, -26562381, -7709309, -14401939, -14648910, + 4652152, 2488540, 23550156, -271232}, + {17294316, -3788438, 7026748, 15626851, 22990044, 113481, 2267737, + -5908146, -408818, -137719}, + }, + { + {16091085, -16253926, 18599252, 7340678, 2137637, -1221657, + -3364161, 14550936, 3260525, -7166271}, + {-4910104, -13332887, 18550887, 10864893, -16459325, -7291596, + -23028869, -13204905, -12748722, 2701326}, + {-8574695, 16099415, 4629974, -16340524, -20786213, -6005432, + -10018363, 9276971, 11329923, 1862132}, + }, + { + {14763076, -15903608, -30918270, 3689867, 3511892, 10313526, + -21951088, 12219231, -9037963, -940300}, + {8894987, -3446094, 6150753, 3013931, 301220, 15693451, -31981216, + -2909717, -15438168, 11595570}, + {15214962, 3537601, -26238722, -14058872, 4418657, -15230761, + 13947276, 10730794, -13489462, -4363670}, + }, + { + {-2538306, 7682793, 32759013, 263109, -29984731, -7955452, + -22332124, -10188635, 977108, 699994}, + {-12466472, 4195084, -9211532, 550904, -15565337, 12917920, + 19118110, -439841, -30534533, -14337913}, + {31788461, -14507657, 4799989, 7372237, 8808585, -14747943, 9408237, + -10051775, 12493932, -5409317}, + }, + { + {-25680606, 5260744, -19235809, -6284470, -3695942, 16566087, + 27218280, 2607121, 29375955, 6024730}, + {842132, -2794693, -4763381, -8722815, 26332018, -12405641, + 11831880, 6985184, -9940361, 2854096}, + {-4847262, -7969331, 2516242, -5847713, 9695691, 
-7221186, 16512645, + 960770, 12121869, 16648078}, + }, + { + {-15218652, 14667096, -13336229, 2013717, 30598287, -464137, + -31504922, -7882064, 20237806, 2838411}, + {-19288047, 4453152, 15298546, -16178388, 22115043, -15972604, + 12544294, -13470457, 1068881, -12499905}, + {-9558883, -16518835, 33238498, 13506958, 30505848, -1114596, + -8486907, -2630053, 12521378, 4845654}, + }, + { + {-28198521, 10744108, -2958380, 10199664, 7759311, -13088600, + 3409348, -873400, -6482306, -12885870}, + {-23561822, 6230156, -20382013, 10655314, -24040585, -11621172, + 10477734, -1240216, -3113227, 13974498}, + {12966261, 15550616, -32038948, -1615346, 21025980, -629444, + 5642325, 7188737, 18895762, 12629579}, + }, + }, + { + { + {14741879, -14946887, 22177208, -11721237, 1279741, 8058600, + 11758140, 789443, 32195181, 3895677}, + {10758205, 15755439, -4509950, 9243698, -4879422, 6879879, -2204575, + -3566119, -8982069, 4429647}, + {-2453894, 15725973, -20436342, -10410672, -5803908, -11040220, + -7135870, -11642895, 18047436, -15281743}, + }, + { + {-25173001, -11307165, 29759956, 11776784, -22262383, -15820455, + 10993114, -12850837, -17620701, -9408468}, + {21987233, 700364, -24505048, 14972008, -7774265, -5718395, + 32155026, 2581431, -29958985, 8773375}, + {-25568350, 454463, -13211935, 16126715, 25240068, 8594567, + 20656846, 12017935, -7874389, -13920155}, + }, + { + {6028182, 6263078, -31011806, -11301710, -818919, 2461772, + -31841174, -5468042, -1721788, -2776725}, + {-12278994, 16624277, 987579, -5922598, 32908203, 1248608, 7719845, + -4166698, 28408820, 6816612}, + {-10358094, -8237829, 19549651, -12169222, 22082623, 16147817, + 20613181, 13982702, -10339570, 5067943}, + }, + { + {-30505967, -3821767, 12074681, 13582412, -19877972, 2443951, + -19719286, 12746132, 5331210, -10105944}, + {30528811, 3601899, -1957090, 4619785, -27361822, -15436388, + 24180793, -12570394, 27679908, -1648928}, + {9402404, -13957065, 32834043, 10838634, -26580150, -13237195, + 26653274, -8685565, 22611444, -12715406}, + }, + { + {22190590, 1118029, 22736441, 15130463, -30460692, -5991321, + 19189625, -4648942, 4854859, 6622139}, + {-8310738, -2953450, -8262579, -3388049, -10401731, -271929, + 13424426, -3567227, 26404409, 13001963}, + {-31241838, -15415700, -2994250, 8939346, 11562230, -12840670, + -26064365, -11621720, -15405155, 11020693}, + }, + { + {1866042, -7949489, -7898649, -10301010, 12483315, 13477547, + 3175636, -12424163, 28761762, 1406734}, + {-448555, -1777666, 13018551, 3194501, -9580420, -11161737, + 24760585, -4347088, 25577411, -13378680}, + {-24290378, 4759345, -690653, -1852816, 2066747, 10693769, + -29595790, 9884936, -9368926, 4745410}, + }, + { + {-9141284, 6049714, -19531061, -4341411, -31260798, 9944276, + -15462008, -11311852, 10931924, -11931931}, + {-16561513, 14112680, -8012645, 4817318, -8040464, -11414606, + -22853429, 10856641, -20470770, 13434654}, + {22759489, -10073434, -16766264, -1871422, 13637442, -10168091, + 1765144, -12654326, 28445307, -5364710}, + }, + { + {29875063, 12493613, 2795536, -3786330, 1710620, 15181182, + -10195717, -8788675, 9074234, 1167180}, + {-26205683, 11014233, -9842651, -2635485, -26908120, 7532294, + -18716888, -9535498, 3843903, 9367684}, + {-10969595, -6403711, 9591134, 9582310, 11349256, 108879, 16235123, + 8601684, -139197, 4242895}, + }, + }, + { + { + {22092954, -13191123, -2042793, -11968512, 32186753, -11517388, + -6574341, 2470660, -27417366, 16625501}, + {-11057722, 3042016, 13770083, -9257922, 584236, -544855, -7770857, + 2602725, 
-27351616, 14247413}, + {6314175, -10264892, -32772502, 15957557, -10157730, 168750, + -8618807, 14290061, 27108877, -1180880}, + }, + { + {-8586597, -7170966, 13241782, 10960156, -32991015, -13794596, + 33547976, -11058889, -27148451, 981874}, + {22833440, 9293594, -32649448, -13618667, -9136966, 14756819, + -22928859, -13970780, -10479804, -16197962}, + {-7768587, 3326786, -28111797, 10783824, 19178761, 14905060, + 22680049, 13906969, -15933690, 3797899}, + }, + { + {21721356, -4212746, -12206123, 9310182, -3882239, -13653110, + 23740224, -2709232, 20491983, -8042152}, + {9209270, -15135055, -13256557, -6167798, -731016, 15289673, + 25947805, 15286587, 30997318, -6703063}, + {7392032, 16618386, 23946583, -8039892, -13265164, -1533858, + -14197445, -2321576, 17649998, -250080}, + }, + { + {-9301088, -14193827, 30609526, -3049543, -25175069, -1283752, + -15241566, -9525724, -2233253, 7662146}, + {-17558673, 1763594, -33114336, 15908610, -30040870, -12174295, + 7335080, -8472199, -3174674, 3440183}, + {-19889700, -5977008, -24111293, -9688870, 10799743, -16571957, + 40450, -4431835, 4862400, 1133}, + }, + { + {-32856209, -7873957, -5422389, 14860950, -16319031, 7956142, + 7258061, 311861, -30594991, -7379421}, + {-3773428, -1565936, 28985340, 7499440, 24445838, 9325937, 29727763, + 16527196, 18278453, 15405622}, + {-4381906, 8508652, -19898366, -3674424, -5984453, 15149970, + -13313598, 843523, -21875062, 13626197}, + }, + { + {2281448, -13487055, -10915418, -2609910, 1879358, 16164207, + -10783882, 3953792, 13340839, 15928663}, + {31727126, -7179855, -18437503, -8283652, 2875793, -16390330, + -25269894, -7014826, -23452306, 5964753}, + {4100420, -5959452, -17179337, 6017714, -18705837, 12227141, + -26684835, 11344144, 2538215, -7570755}, + }, + { + {-9433605, 6123113, 11159803, -2156608, 30016280, 14966241, + -20474983, 1485421, -629256, -15958862}, + {-26804558, 4260919, 11851389, 9658551, -32017107, 16367492, + -20205425, -13191288, 11659922, -11115118}, + {26180396, 10015009, -30844224, -8581293, 5418197, 9480663, 2231568, + -10170080, 33100372, -1306171}, + }, + { + {15121113, -5201871, -10389905, 15427821, -27509937, -15992507, + 21670947, 4486675, -5931810, -14466380}, + {16166486, -9483733, -11104130, 6023908, -31926798, -1364923, + 2340060, -16254968, -10735770, -10039824}, + {28042865, -3557089, -12126526, 12259706, -3717498, -6945899, + 6766453, -8689599, 18036436, 5803270}, + }, + }, + { + { + {-817581, 6763912, 11803561, 1585585, 10958447, -2671165, 23855391, + 4598332, -6159431, -14117438}, + {-31031306, -14256194, 17332029, -2383520, 31312682, -5967183, + 696309, 50292, -20095739, 11763584}, + {-594563, -2514283, -32234153, 12643980, 12650761, 14811489, 665117, + -12613632, -19773211, -10713562}, + }, + { + {30464590, -11262872, -4127476, -12734478, 19835327, -7105613, + -24396175, 2075773, -17020157, 992471}, + {18357185, -6994433, 7766382, 16342475, -29324918, 411174, 14578841, + 8080033, -11574335, -10601610}, + {19598397, 10334610, 12555054, 2555664, 18821899, -10339780, + 21873263, 16014234, 26224780, 16452269}, + }, + { + {-30223925, 5145196, 5944548, 16385966, 3976735, 2009897, -11377804, + -7618186, -20533829, 3698650}, + {14187449, 3448569, -10636236, -10810935, -22663880, -3433596, + 7268410, -10890444, 27394301, 12015369}, + {19695761, 16087646, 28032085, 12999827, 6817792, 11427614, + 20244189, -1312777, -13259127, -3402461}, + }, + { + {30860103, 12735208, -1888245, -4699734, -16974906, 2256940, + -8166013, 12298312, -8550524, -10393462}, + {-5719826, -11245325, 
-1910649, 15569035, 26642876, -7587760, + -5789354, -15118654, -4976164, 12651793}, + {-2848395, 9953421, 11531313, -5282879, 26895123, -12697089, + -13118820, -16517902, 9768698, -2533218}, + }, + { + {-24719459, 1894651, -287698, -4704085, 15348719, -8156530, + 32767513, 12765450, 4940095, 10678226}, + {18860224, 15980149, -18987240, -1562570, -26233012, -11071856, + -7843882, 13944024, -24372348, 16582019}, + {-15504260, 4970268, |