/*
   Public domain by Andrew M. <liquidsun@gmail.com>
   See: https://github.com/floodyberry/curve25519-donna
   SSE2 curve25519 implementation
*/
#if defined(ED25519_SSE2)

#include <emmintrin.h>
typedef __m128i xmmi;

typedef union packedelem8_t {
    unsigned char u[16];
    xmmi v;
} packedelem8;

typedef union packedelem32_t {
    uint32_t u[4];
    xmmi v;
} packedelem32;

typedef union packedelem64_t {
    uint64_t u[2];
    xmmi v;
} packedelem64;

/* 10 elements + an extra 2 to fit in 3 xmm registers */
typedef uint32_t bignum25519[12];
typedef packedelem32 packed32bignum25519[5];
typedef packedelem64 packed64bignum25519[10];
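/*
   A bignum25519 holds a field element mod 2^255 - 19 in 10 limbs of
   alternating width (limbs 0,2,4,6,8 are 26 bits, limbs 1,3,5,7,9 are
   25 bits, i.e. radix 2^25.5); the last two words are zero padding so
   the element fills exactly three 16-byte xmm registers.
*/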
static const packedelem32 bot32bitmask = {{0xffffffff, 0x00000000, 0xffffffff, 0x00000000}};
static const packedelem32 top32bitmask = {{0x00000000, 0xffffffff, 0x00000000, 0xffffffff}};
static const packedelem32 top64bitmask = {{0x00000000, 0x00000000, 0xffffffff, 0xffffffff}};
static const packedelem32 bot64bitmask = {{0xffffffff, 0xffffffff, 0x00000000, 0x00000000}};

/* reduction masks */
static const packedelem64 packedmask26 = {{0x03ffffff, 0x03ffffff}};
static const packedelem64 packedmask25 = {{0x01ffffff, 0x01ffffff}};
static const packedelem32 packedmask2625 = {{0x3ffffff,0,0x1ffffff,0}};
static const packedelem32 packedmask26262626 = {{0x03ffffff, 0x03ffffff, 0x03ffffff, 0x03ffffff}};
static const packedelem32 packedmask25252525 = {{0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff}};

/* multipliers */
static const packedelem64 packednineteen = {{19, 19}};
static const packedelem64 packednineteenone = {{19, 1}};
static const packedelem64 packedthirtyeight = {{38, 38}};
static const packedelem64 packed3819 = {{19*2,19}};
static const packedelem64 packed9638 = {{19*4,19*2}};

/* 121666,121665 */
static const packedelem64 packed121666121665 = {{121666, 121665}};

/* 2*(2^255 - 19) = 0 mod p */
static const packedelem32 packed2p0 = {{0x7ffffda,0x3fffffe,0x7fffffe,0x3fffffe}};
static const packedelem32 packed2p1 = {{0x7fffffe,0x3fffffe,0x7fffffe,0x3fffffe}};
static const packedelem32 packed2p2 = {{0x7fffffe,0x3fffffe,0x0000000,0x0000000}};
static const packedelem32 packed32packed2p0 = {{0x7ffffda,0x7ffffda,0x3fffffe,0x3fffffe}};
static const packedelem32 packed32packed2p1 = {{0x7fffffe,0x7fffffe,0x3fffffe,0x3fffffe}};

/* 4*(2^255 - 19) = 0 mod p */
static const packedelem32 packed4p0 = {{0xfffffb4,0x7fffffc,0xffffffc,0x7fffffc}};
static const packedelem32 packed4p1 = {{0xffffffc,0x7fffffc,0xffffffc,0x7fffffc}};
static const packedelem32 packed4p2 = {{0xffffffc,0x7fffffc,0x0000000,0x0000000}};
static const packedelem32 packed32packed4p0 = {{0xfffffb4,0xfffffb4,0x7fffffc,0x7fffffc}};
static const packedelem32 packed32packed4p1 = {{0xffffffc,0xffffffc,0x7fffffc,0x7fffffc}};
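/*
   The packed2p and packed4p constants spread 2*p and 4*p across the same
   limb layout. They are added to the minuend before a subtraction so no
   limb goes negative; the 4*p variants are used when the minuend may
   itself be an unreduced sum with larger limbs.
*/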
/* out = in */
DONNA_INLINE static void
curve25519_copy(bignum25519 out, const bignum25519 in) {
    xmmi x0,x1,x2;
    x0 = _mm_load_si128((xmmi*)in + 0);
    x1 = _mm_load_si128((xmmi*)in + 1);
    x2 = _mm_load_si128((xmmi*)in + 2);
    _mm_store_si128((xmmi*)out + 0, x0);
    _mm_store_si128((xmmi*)out + 1, x1);
    _mm_store_si128((xmmi*)out + 2, x2);
}

/* out = a + b */
DONNA_INLINE static void
curve25519_add(bignum25519 out, const bignum25519 a, const bignum25519 b) {
    xmmi a0,a1,a2,b0,b1,b2;
    a0 = _mm_load_si128((xmmi*)a + 0);
    a1 = _mm_load_si128((xmmi*)a + 1);
    a2 = _mm_load_si128((xmmi*)a + 2);
    b0 = _mm_load_si128((xmmi*)b + 0);
    b1 = _mm_load_si128((xmmi*)b + 1);
    b2 = _mm_load_si128((xmmi*)b + 2);
    a0 = _mm_add_epi32(a0, b0);
    a1 = _mm_add_epi32(a1, b1);
    a2 = _mm_add_epi32(a2, b2);
    _mm_store_si128((xmmi*)out + 0, a0);
    _mm_store_si128((xmmi*)out + 1, a1);
    _mm_store_si128((xmmi*)out + 2, a2);
}

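/* out = a + b, followed by a carry/reduce pass; also used as curve25519_add_after_basic */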
#define curve25519_add_after_basic curve25519_add_reduce
DONNA_INLINE static void
curve25519_add_reduce(bignum25519 out, const bignum25519 a, const bignum25519 b) {
    xmmi a0,a1,a2,b0,b1,b2;
    xmmi c1,c2,c3;
    xmmi r0,r1,r2,r3,r4,r5;
    a0 = _mm_load_si128((xmmi*)a + 0);
    a1 = _mm_load_si128((xmmi*)a + 1);
    a2 = _mm_load_si128((xmmi*)a + 2);
    b0 = _mm_load_si128((xmmi*)b + 0);
    b1 = _mm_load_si128((xmmi*)b + 1);
    b2 = _mm_load_si128((xmmi*)b + 2);
    a0 = _mm_add_epi32(a0, b0);
    a1 = _mm_add_epi32(a1, b1);
    a2 = _mm_add_epi32(a2, b2);
    r0 = _mm_and_si128(_mm_unpacklo_epi64(a0, a1), bot32bitmask.v);
    r1 = _mm_srli_epi64(_mm_unpacklo_epi64(a0, a1), 32);
    r2 = _mm_and_si128(_mm_unpackhi_epi64(a0, a1), bot32bitmask.v);
    r3 = _mm_srli_epi64(_mm_unpackhi_epi64(a0, a1), 32);
    r4 = _mm_and_si128(_mm_unpacklo_epi64(_mm_setzero_si128(), a2), bot32bitmask.v);
    r5 = _mm_srli_epi64(_mm_unpacklo_epi64(_mm_setzero_si128(), a2), 32);
    c1 = _mm_srli_epi64(r0, 26); c2 = _mm_srli_epi64(r2, 26); r0 = _mm_and_si128(r0, packedmask26.v); r2 = _mm_and_si128(r2, packedmask26.v); r1 = _mm_add_epi64(r1, c1); r3 = _mm_add_epi64(r3, c2);
    c1 = _mm_srli_epi64(r1, 25); c2 = _mm_srli_epi64(r3, 25); r1 = _mm_and_si128(r1, packedmask25.v); r3 = _mm_and_si128(r3, packedmask25.v); r2 = _mm_add_epi64(r2, c1); r4 = _mm_add_epi64(r4, c2); c3 = _mm_slli_si128(c2, 8);
    c1 = _mm_srli_epi64(r4, 26); r4 = _mm_and_si128(r4, packedmask26.v); r5 = _mm_add_epi64(r5, c1);
    c1 = _mm_srli_epi64(r5, 25); r5 = _mm_and_si128(r5, packedmask25.v); r0 = _mm_add_epi64(r0, _mm_unpackhi_epi64(_mm_mul_epu32(c1, packednineteen.v), c3));
    c1 = _mm_srli_epi64(r0, 26); c2 = _mm_srli_epi64(r2, 26); r0 = _mm_and_si128(r0, packedmask26.v); r2 = _mm_and_si128(r2, packedmask26.v); r1 = _mm_add_epi64(r1, c1); r3 = _mm_add_epi64(r3, c2);
    _mm_store_si128((xmmi*)out + 0, _mm_unpacklo_epi64(_mm_unpacklo_epi32(r0, r1), _mm_unpacklo_epi32(r2, r3)));
    _mm_store_si128((xmmi*)out + 1, _mm_unpacklo_epi64(_mm_unpackhi_epi32(r0, r1), _mm_unpackhi_epi32(r2, r3)));
    _mm_store_si128((xmmi*)out + 2, _mm_unpackhi_epi32(r4, r5));
}

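/* out = a - b; 2*p is added to a first so no limb underflows, and only a partial carry of the low limbs is performed */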
DONNA_INLINE static void
curve25519_sub(bignum25519 out, const bignum25519 a, const bignum25519 b) {
    xmmi a0,a1,a2,b0,b1,b2;
    xmmi c1,c2;
    xmmi r0,r1;
    a0 = _mm_load_si128((xmmi*)a + 0);
    a1 = _mm_load_si128((xmmi*)a + 1);
    a2 = _mm_load_si128((xmmi*)a + 2);
    a0 = _mm_add_epi32(a0, packed2p0.v);
    a1 = _mm_add_epi32(a1, packed2p1.v);
    a2 = _mm_add_epi32(a2, packed2p2.v);
    b0 = _mm_load_si128((xmmi*)b + 0);
    b1 = _mm_load_si128((xmmi*)b + 1);
    b2 = _mm_load_si128((xmmi*)b + 2);
    a0 = _mm_sub_epi32(a0, b0);
    a1 = _mm_sub_epi32(a1, b1);
    a2 = _mm_sub_epi32(a2, b2);
    r0 = _mm_and_si128(_mm_shuffle_epi32(a0, _MM_SHUFFLE(2,2,0,0)), bot32bitmask.v);
    r1 = _mm_and_si128(_mm_shuffle_epi32(a0, _MM_SHUFFLE(3,3,1,1)), bot32bitmask.v);
    c1 = _mm_srli_epi32(r0, 26);
    c2 = _mm_srli_epi32(r1, 25);
    r0 = _mm_and_si128(r0, packedmask26.v);
    r1 = _mm_and_si128(r1, packedmask25.v);
    r0 = _mm_add_epi32(r0, _mm_slli_si128(c2, 8));
    r1 = _mm_add_epi32(r1, c1);
    a0 = _mm_unpacklo_epi64(_mm_unpacklo_epi32(r0, r1), _mm_unpackhi_epi32(r0, r1));
    a1 = _mm_add_epi32(a1, _mm_srli_si128(c2, 8));
    _mm_store_si128((xmmi*)out + 0, a0);
    _mm_store_si128((xmmi*)out + 1, a1);
    _mm_store_si128((xmmi*)out + 2, a2);
}

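/* out = a - b, where a may be the unreduced result of a basic add (4*p is added to absorb the larger limbs) */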
DONNA_INLINE static void
curve25519_sub_after_basic(bignum25519 out, const bignum25519 a, const bignum25519 b) {
    xmmi a0,a1,a2,b0,b1,b2;
    xmmi c1,c2,c3;
    xmmi r0,r1,r2,r3,r4,r5;
    a0 = _mm_load_si128((xmmi*)a + 0);
    a1 = _mm_load_si128((xmmi*)a + 1);
    a2 = _mm_load_si128((xmmi*)a + 2);
    a0 = _mm_add_epi32(a0, packed4p0.v);
    a1 = _mm_add_epi32(a1, packed4p1.v);
    a2 = _mm_add_epi32(a2, packed4p2.v);
    b0 = _mm_load_si128((xmmi*)b + 0);
    b1 = _mm_load_si128((xmmi*)b + 1);
    b2 = _mm_load_si128((xmmi*)b + 2);
    a0 = _mm_sub_epi32(a0, b0);
    a1 = _mm_sub_epi32(a1, b1);
    a2 = _mm_sub_epi32(a2, b2);
    r0 = _mm_and_si128(_mm_unpacklo_epi64(a0, a1), bot32bitmask.v);
    r1 = _mm_srli_epi64(_mm_unpacklo_epi64(a0, a1), 32);
    r2 = _mm_and_si128(_mm_unpackhi_epi64(a0, a1), bot32bitmask.v);
    r3 = _mm_srli_epi64(_mm_unpackhi_epi64(a0, a1), 32);
    r4 = _mm_and_si128(_mm_unpacklo_epi64(_mm_setzero_si128(), a2), bot32bitmask.v);
    r5 = _mm_srli_epi64(_mm_unpacklo_epi64(_mm_setzero_si128(), a2), 32);
    c1 = _mm_srli_epi64(r0, 26); c2 = _mm_srli_epi64(r2, 26); r0 = _mm_and_si128(r0, packedmask26.v); r2 = _mm_and_si128(r2, packedmask26.v); r1 = _mm_add_epi64(r1, c1); r3 = _mm_add_epi64(r3, c2);
    c1 = _mm_srli_epi64(r1, 25); c2 = _mm_srli_epi64(r3, 25); r1 = _mm_and_si128(r1, packedmask25.v); r3 = _mm_and_si128(r3, packedmask25.v); r2 = _mm_add_epi64(r2, c1); r4 = _mm_add_epi64(r4, c2); c3 = _mm_slli_si128(c2, 8);
    c1 = _mm_srli_epi64(r4, 26); r4 = _mm_and_si128(r4, packedmask26.v); r5 = _mm_add_epi64(r5, c1);
    c1 = _mm_srli_epi64(r5, 25); r5 = _mm_and_si128(r5, packedmask25.v); r0 = _mm_add_epi64(r0, _mm_unpackhi_epi64(_mm_mul_epu32(c1, packednineteen.v), c3));
    c1 = _mm_srli_epi64(r0, 26); c2 = _mm_srli_epi64(r2, 26); r0 = _mm_and_si128(r0, packedmask26.v); r2 = _mm_and_si128(r2, packedmask26.v); r1 = _mm_add_epi64(r1, c1); r3 = _mm_add_epi64(r3, c2);
    _mm_store_si128((xmmi*)out + 0, _mm_unpacklo_epi64(_mm_unpacklo_epi32(r0, r1), _mm_unpacklo_epi32(r2, r3)));
    _mm_store_si128((xmmi*)out + 1, _mm_unpacklo_epi64(_mm_unpackhi_epi32(r0, r1), _mm_unpackhi_epi32(r2, r3)));
    _mm_store_si128((xmmi*)out + 2, _mm_unpackhi_epi32(r4, r5));
}

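/* out = a - b, followed by a full carry/reduce pass */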
DONNA_INLINE static void
curve25519_sub_reduce(bignum25519 out, const bignum25519 a, const bignum25519 b) {
    xmmi a0,a1,a2,b0,b1,b2;
    xmmi c1,c2,c3;
    xmmi r0,r1,r2,r3,r4,r5;
    a0 = _mm_load_si128((xmmi*)a + 0);
    a1 = _mm_load_si128((xmmi*)a + 1);
    a2 = _mm_load_si128((xmmi*)a + 2);
    a0 = _mm_add_epi32(a0, packed2p0.v);
    a1 = _mm_add_epi32(a1, packed2p1.v);
    a2 = _mm_add_epi32(a2, packed2p2.v);
    b0 = _mm_load_si128((xmmi*)b + 0);
    b1 = _mm_load_si128((xmmi*)b + 1);
    b2 = _mm_load_si128((xmmi*)b + 2);
    a0 = _mm_sub_epi32(a0, b0);
    a1 = _mm_sub_epi32(a1, b1);
    a2 = _mm_sub_epi32(a2, b2);
    r0 = _mm_and_si128(_mm_unpacklo_epi64(a0, a1), bot32bitmask.v);
    r1 = _mm_srli_epi64(_mm_unpacklo_epi64(a0, a1), 32);
    r2 = _mm_and_si128(_mm_unpackhi_epi64(a0, a1), bot32bitmask.v);
    r3 = _mm_srli_epi64(_mm_unpackhi_epi64(a0, a1), 32);
    r4 = _mm_and_si128(_mm_unpacklo_epi64(_mm_setzero_si128(), a2), bot32bitmask.v);
    r5 = _mm_srli_epi64(_mm_unpacklo_epi64(_mm_setzero_si128(), a2), 32);
    c1 = _mm_srli_epi64(r0, 26); c2 = _mm_srli_epi64(r2, 26); r0 = _mm_and_si128(r0, packedmask26.v); r2 = _mm_and_si128(r2, packedmask26.v); r1 = _mm_add_epi64(r1, c1); r3 = _mm_add_epi64(r3, c2);
    c1 = _mm_srli_epi64(r1, 25); c2 = _mm_srli_epi64(r3, 25); r1 = _mm_and_si128(r1, packedmask25.v); r3 = _mm_and_si128(r3, packedmask25.v); r2 = _mm_add_epi64(r2, c1); r4 = _mm_add_epi64(r4, c2); c3 = _mm_slli_si128(c2, 8);
    c1 = _mm_srli_epi64(r4, 26); r4 = _mm_and_si128(r4, packedmask26.v); r5 = _mm_add_epi64(r5, c1);
    c1 = _mm_srli_epi64(r5, 25); r5 = _mm_and_si128(r5, packedmask25.v); r0 = _mm_add_epi64(r0, _mm_unpackhi_epi64(_mm_mul_epu32(c1, packednineteen.v), c3));
    c1 = _mm_srli_epi64(r0, 26); c2 = _mm_srli_epi64(r2, 26); r0 = _mm_and_si128(r0, packedmask26.v); r2 = _mm_and_si128(r2, packedmask26.v); r1 = _mm_add_epi64(r1, c1); r3 = _mm_add_epi64(r3, c2);
    _mm_store_si128((xmmi*)out + 0, _mm_unpacklo_epi64(_mm_unpacklo_epi32(r0, r1), _mm_unpacklo_epi32(r2, r3)));
    _mm_store_si128((xmmi*)out + 1, _mm_unpacklo_epi64(_mm_unpackhi_epi32(r0, r1), _mm_unpackhi_epi32(r2, r3)));
    _mm_store_si128((xmmi*)out + 2, _mm_unpackhi_epi32(r4, r5));
}

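/* out = -b mod p, computed as 2*p - b and then carried */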
DONNA_INLINE static void
curve25519_neg(bignum25519 out, const bignum25519 b) {
    xmmi a0,a1,a2,b0,b1,b2;
    xmmi c1,c2,c3;
    xmmi r0,r1,r2,r3,r4,r5;
    a0 = packed2p0.v;
    a1 = packed2p1.v;
    a2 = packed2p2.v;
    b0 = _mm_load_si128((xmmi*)b + 0);
    b1 = _mm_load_si128((xmmi*)b + 1);
    b2 = _mm_load_si128((xmmi*)b + 2);
    a0 = _mm_sub_epi32(a0, b0);
    a1 = _mm_sub_epi32(a1, b1);
    a2 = _mm_sub_epi32(a2, b2);
    r0 = _mm_and_si128(_mm_unpacklo_epi64(a0, a1), bot32bitmask.v);
    r1 = _mm_srli_epi64(_mm_unpacklo_epi64(a0, a1), 32);
    r2 = _mm_and_si128(_mm_unpackhi_epi64(a0, a1), bot32bitmask.v);
    r3 = _mm_srli_epi64(_mm_unpackhi_epi64(a0, a1), 32);
    r4 = _mm_and_si128(_mm_unpacklo_epi64(_mm_setzero_si128(), a2), bot32bitmask.v);
    r5 = _mm_srli_epi64(_mm_unpacklo_epi64(_mm_setzero_si128(), a2), 32);
    c1 = _mm_srli_epi64(r0, 26); c2 = _mm_srli_epi64(r2, 26); r0 = _mm_and_si128(r0, packedmask26.v); r2 = _mm_and_si128(r2, packedmask26.v); r1 = _mm_add_epi64(r1, c1); r3 = _mm_add_epi64(r3, c2);
    c1 = _mm_srli_epi64(r1, 25); c2 = _mm_srli_epi64(r3, 25); r1 = _mm_and_si128(r1, packedmask25.v); r3 = _mm_and_si128(r3, packedmask25.v); r2 = _mm_add_epi64(r2, c1); r4 = _mm_add_epi64(r4, c2); c3 = _mm_slli_si128(c2, 8);
    c1 = _mm_srli_epi64(r4, 26); r4 = _mm_and_si128(r4, packedmask26.v); r5 = _mm_add_epi64(r5, c1);
    c1 = _mm_srli_epi64(r5, 25); r5 = _mm_and_si128(r5, packedmask25.v); r0 = _mm_add_epi64(r0, _mm_unpackhi_epi64(_mm_mul_epu32(c1, packednineteen.v), c3));
    c1 = _mm_srli_epi64(r0, 26); c2 = _mm_srli_epi64(r2, 26); r0 = _mm_and_si128(r0, packedmask26.v); r2 = _mm_and_si128(r2, packedmask26.v); r1 = _mm_add_epi64(r1, c1); r3 = _mm_add_epi64(r3, c2);
    _mm_store_si128((xmmi*)out + 0, _mm_unpacklo_epi64(_mm_unpacklo_epi32(r0, r1), _mm_unpacklo_epi32(r2, r3)));
    _mm_store_si128((xmmi*)out + 1, _mm_unpacklo_epi64(_mm_unpackhi_epi32(r0, r1), _mm_unpackhi_epi32(r2, r3)));
    _mm_store_si128((xmmi*)out + 2, _mm_unpackhi_epi32(r4, r5));
}

/* Multiply two numbers: out = r * s */
static void
curve25519_mul(bignum25519 out, const bignum25519 r, const bignum25519 s) {
    xmmi m01,m23,m45,m67,m89;
    xmmi m0123,m4567;
    xmmi s0123,s4567;
    xmmi s01,s23,s45,s67,s89;
    xmmi s12,s34,s56,s78,s9;
    xmmi r0,r2,r4,r6,r8;
    xmmi r1,r3,r5,r7,r9;
    xmmi r119,r219,r319,r419,r519,r619,r719,r819,r919;
    xmmi c1,c2,c3;
    s0123 = _mm_load_si128((xmmi*)s + 0);
    s01 = _mm_shuffle_epi32(s0123,_MM_SHUFFLE(3,1,2,0));
    s12 = _mm_shuffle_epi32(s0123, _MM_SHUFFLE(2,2,1,1));
    s23 = _mm_shuffle_epi32(s0123,_MM_SHUFFLE(3,3,2,2));
    s4567 = _mm_load_si128((xmmi*)s + 1);
    s34 = _mm_unpacklo_epi64(_mm_srli_si128(s0123,12),s4567);
    s45 = _mm_shuffle_epi32(s4567,_MM_SHUFFLE(3,1,2,0));
    s56 = _mm_shuffle_epi32(s4567, _MM_SHUFFLE(2,2,1,1));
    s67 = _mm_shuffle_epi32(s4567,_MM_SHUFFLE(3,3,2,2));
    s89 = _mm_load_si128((xmmi*)s + 2);
    s78 = _mm_unpacklo_epi64(_mm_srli_si128(s4567,12),s89);
    s89 = _mm_shuffle_epi32(s89,_MM_SHUFFLE(3,1,2,0));
    s9 = _mm_shuffle_epi32(s89, _MM_SHUFFLE(3,3,2,2));
    r0 = _mm_load_si128((xmmi*)r + 0);
    r1 = _mm_shuffle_epi32(r0, _MM_SHUFFLE(1,1,1,1));
    r1 = _mm_add_epi64(r1, _mm_and_si128(r1, top64bitmask.v));
    r2 = _mm_shuffle_epi32(r0, _MM_SHUFFLE(2,2,2,2));
    r3 = _mm_shuffle_epi32(r0, _MM_SHUFFLE(3,3,3,3));
    r3 = _mm_add_epi64(r3, _mm_and_si128(r3, top64bitmask.v));
    r0 = _mm_shuffle_epi32(r0, _MM_SHUFFLE(0,0,0,0));
    r4 = _mm_load_si128((xmmi*)r + 1);
    r5 = _mm_shuffle_epi32(r4, _MM_SHUFFLE(1,1,1,1));
    r5 = _mm_add_epi64(r5, _mm_and_si128(r5, top64bitmask.v));
    r6 = _mm_shuffle_epi32(r4, _MM_SHUFFLE(2,2,2,2));
    r7 = _mm_shuffle_epi32(r4, _MM_SHUFFLE(3,3,3,3));
    r7 = _mm_add_epi64(r7, _mm_and_si128(r7, top64bitmask.v));
    r4 = _mm_shuffle_epi32(r4, _MM_SHUFFLE(0,0,0,0));
    r8 = _mm_load_si128((xmmi*)r + 2);
    r9 = _mm_shuffle_epi32(r8, _MM_SHUFFLE(3,1,3,1));
    r9 = _mm_add_epi64(r9, _mm_and_si128(r9, top64bitmask.v));
    r8 = _mm_shuffle_epi32(r8, _MM_SHUFFLE(3,0,3,0));
    m01 = _mm_mul_epu32(r1,s01);
    m23 = _mm_mul_epu32(r1,s23);
    m45 = _mm_mul_epu32(r1,s45);
    m67 = _mm_mul_epu32(r1,s67);
    m23 = _mm_add_epi64(m23,_mm_mul_epu32(r3,s01));
    m45 = _mm_add_epi64(m45,_mm_mul_epu32(r3,s23));
    m67 = _mm_add_epi64(m67,_mm_mul_epu32(r3,s45));
    m89 = _mm_mul_epu32(r1,s89);
    m45 = _mm_add_epi64(m45,_mm_mul_epu32(r5,s01));
    m67 = _mm_add_epi64(m67,_mm_mul_epu32(r5,s23));
    m89 = _mm_add_epi64(m89,_mm_mul_epu32(r3,s67));
    m67 = _mm_add_epi64(m67,_mm_mul_epu32(r7,s01));
    m89 = _mm_add_epi64(m89,_mm_mul_epu32(r5,s45));
    m89 = _mm_add_epi64(m89,_mm_mul_epu32(r7,s23));
    m89 = _mm_add_epi64(m89,_mm_mul_epu32(r9,s01));
    /* shift up */
    m89 = _mm_unpackhi_epi64(m67,_mm_slli_si128(m89,8));
    m67 = _mm_unpackhi_epi64(m45,_mm_slli_si128(m67,8));
    m45 = _mm_unpackhi_epi64(m23,_mm_slli_si128(m45,8));
    m23 = _mm_unpackhi_epi64(m01,_mm_slli_si128(m23,8));
    m01 = _mm_unpackhi_epi64(_mm_setzero_si128(),_mm_slli_si128(m01,8));
    m01 = _mm_add_epi64(m01,_mm_mul_epu32(r0,s01));
    m23 = _mm_add_epi64(m23,_mm_mul_epu32(r0,s23));
    m45 = _mm_add_epi64(m45,_mm_mul_epu32(r0,s45));
    m67 = _mm_add_epi64(m67,_mm_mul_epu32(r0,s67));
    m23 = _mm_add_epi64(m23,_mm_mul_epu32(r2,s01));
    m45 = _mm_add_epi64(m45,_mm_mul_epu32(r2,s23));
    m67 = _mm_add_epi64(m67,_mm_mul_epu32(r4,s23));
    m89 = _mm_add_epi64(m89,_mm_mul_epu32(r0,s89));
    m45 = _mm_add_epi64(m45,_mm_mul_epu32(r4,s01));
    m67 = _mm_add_epi64(m67,_mm_mul_epu32(r2,s45));
    m89 = _mm_add_epi64(m89,_mm_mul_epu32(r2,s67));
    m67 = _mm_add_epi64(m67,_mm_mul_epu32(r6,s01));
    m89 = _mm_add_epi64(m89,_mm_mul_epu32(r4,s45));
    m89 = _mm_add_epi64(m89,_mm_mul_epu32(r6,s23));
    m89 = _mm_add_epi64(m89,_mm_mul_epu32(r8,s01));
    r219 = _mm_mul_epu32(r2, packednineteen.v);
    r419 = _mm_mul_epu32(r4, packednineteen.v);
    r619 = _mm_mul_epu32(r6, packednineteen.v);
    r819 = _mm_mul_epu32(r8, packednineteen.v);
    r119 = _mm_shuffle_epi32(r1,_MM_SHUFFLE(0,0,2,2)); r119 = _mm_mul_epu32(r119, packednineteen.v);
    r319 = _mm_shuffle_epi32(r3,_MM_SHUFFLE(0,0,2,2)); r319 = _mm_mul_epu32(r319, packednineteen.v);
    r519 = _mm_shuffle_epi32(r5,_MM_SHUFFLE(0,0,2,2)); r519 = _mm_mul_epu32(r519, packednineteen.v);
    r719 = _mm_shuffle_epi32(r7,_MM_SHUFFLE(0,0,2,2)); r719 = _mm_mul_epu32(r719, packednineteen.v);
    r919 = _mm_shuffle_epi32(r9,_MM_SHUFFLE(0,0,2,2)); r919 = _mm_mul_epu32(r919, packednineteen.v);
    m01 = _mm_add_epi64(m01,_mm_mul_epu32(r919,s12));
    m23 = _mm_add_epi64(m23,_mm_mul_epu32(r919,s34));
    m45 = _mm_add_epi64(m45,_mm_mul_epu32(r919,s56));
    m67 = _mm_add_epi64(m67,_mm_mul_epu32(r919,s78));
    m01 = _mm_add_epi64(m01,_mm_mul_epu32(r719,s34));
    m23 = _mm_add_epi64(m23,_mm_mul_epu32(r719,s56));
    m45 = _mm_add_epi64(m45,_mm_mul_epu32(r719,s78));
    m67 = _mm_add_epi64(m67,_mm_mul_epu32(r719,s9));
    m01 = _mm_add_epi64(m01,_mm_mul_epu32(r519,s56));
    m23 = _mm_add_epi64(m23,_mm_mul_epu32(r519,s78));
    m45 = _mm_add_epi64(m45,_mm_mul_epu32(r519,s9));
    m67 = _mm_add_epi64(m67,_mm_mul_epu32(r819,s89));
    m01 = _mm_add_epi64(m01,_mm_mul_epu32(r319,s78));
    m23 = _mm_add_epi64(m23,_mm_mul_epu32(r319,s9));
    m45 = _mm_add_epi64(m45,_mm_mul_epu32(r619,s89));
    m89 = _mm_add_epi64(m89,_mm_mul_epu32(r919,s9));
    m01 = _mm_add_epi64(m01,_mm_mul_epu32(r819,s23));
    m23 = _mm_add_epi64(m23,_mm_mul_epu32(r819,s45));
    m45 = _mm_add_epi64(m45,_mm_mul_epu32(r819,s67));
    m01 = _mm_add_epi64(m01,_mm_mul_epu32(r619,s45));
    m23 = _mm_add_epi64(m23,_mm_mul_epu32(r619,s67));
    m01 = _mm_add_epi64(m01,_mm_mul_epu32(r419,s67));
    m23 = _mm_add_epi64(m23,_mm_mul_epu32(r419,s89));
    m01 = _mm_add_epi64(m01,_mm_mul_epu32(r219,s89));
    m01 = _mm_add_epi64(m01,_mm_mul_epu32(r119,s9));
    r0 = _mm_unpacklo_epi64(m01, m45);
    r1 = _mm_unpackhi_epi64(m01, m45);
    r2 = _mm_unpacklo_epi64(m23, m67);
    r3 = _mm_unpackhi_epi64(m23, m67);
    r4 = _mm_unpacklo_epi64(m89, m89);
    r5 = _mm_unpackhi_epi64(m89, m89);
    c1 = _mm_srli_epi64(r0, 26); c2 = _mm_srli_epi64(r2, 26); r0 = _mm_and_si128(r0, packedmask26.v); r2 = _mm_and_si128(r2, packedmask26.v); r1 = _mm_add_epi64(r1, c1); r3 = _mm_add_epi64(r3, c2);
    c1 = _mm_srli_epi64(r1, 25); c2 = _mm_srli_epi64(r3, 25); r1 = _mm_and_si128(r1, packedmask25.v); r3 = _mm_and_si128(r3, packedmask25.v); r2 = _mm_add_epi64(r2, c1); r4 = _mm_add_epi64(r4, c2); c3 = _mm_slli_si128(c2, 8);
    c1 = _mm_srli_epi64(r4, 26); r4 = _mm_and_si128(r4, packedmask26.v); r5 = _mm_add_epi64(r5, c1);
    c1 = _mm_srli_epi64(r5, 25); r5 = _mm_and_si128(r5, packedmask25.v); r0 = _mm_add_epi64(r0, _mm_unpackhi_epi64(_mm_mul_epu32(c1, packednineteen.v), c3));
    c1 = _mm_srli_epi64(r0, 26); c2 = _mm_srli_epi64(r2, 26); r0 = _mm_and_si128(r0, packedmask26.v); r2 = _mm_and_si128(r2, packedmask26.v); r1 = _mm_add_epi64(r1, c1); r3 = _mm_add_epi64(r3, c2);
    m0123 = _mm_unpacklo_epi32(r0, r1);
    m4567 = _mm_unpackhi_epi32(r0, r1);
    m0123 = _mm_unpacklo_epi64(m0123, _mm_unpacklo_epi32(r2, r3));
    m4567 = _mm_unpacklo_epi64(m4567, _mm_unpackhi_epi32(r2, r3));
    m89 = _mm_unpackhi_epi32(r4, r5);
    _mm_store_si128((xmmi*)out + 0, m0123);
    _mm_store_si128((xmmi*)out + 1, m4567);
    _mm_store_si128((xmmi*)out + 2, m89);
}

DONNA_NOINLINE static void
curve25519_mul_noinline(bignum25519 out, const bignum25519 r, const bignum25519 s) {
    curve25519_mul(out, r, s);
}

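/* curve25519_square_times: r = in^(2^count), i.e. square `in` count times (count >= 1); curve25519_square squares once */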
#define curve25519_square(r, n) curve25519_square_times(r, n, 1)
static void
curve25519_square_times(bignum25519 r, const bignum25519 in, int count) {
    xmmi m01,m23,m45,m67,m89;
    xmmi r0,r1,r2,r3,r4,r5,r6,r7,r8,r9;
    xmmi r0a,r1a,r2a,r3a,r7a,r9a;
    xmmi r0123,r4567;
    xmmi r01,r23,r45,r67,r6x,r89,r8x;
    xmmi r12,r34,r56,r78,r9x;
    xmmi r5619;
    xmmi c1,c2,c3;
    r0123 = _mm_load_si128((xmmi*)in + 0);
    r01 = _mm_shuffle_epi32(r0123,_MM_SHUFFLE(3,1,2,0));
    r23 = _mm_shuffle_epi32(r0123,_MM_SHUFFLE(3,3,2,2));
    r4567 = _mm_load_si128((xmmi*)in + 1);
    r45 = _mm_shuffle_epi32(r4567,_MM_SHUFFLE(3,1,2,0));
    r67 = _mm_shuffle_epi32(r4567,_MM_SHUFFLE(3,3,2,2));
    r89 = _mm_load_si128((xmmi*)in + 2);
    r89 = _mm_shuffle_epi32(r89,_MM_SHUFFLE(3,1,2,0));
    do {
        r12 = _mm_unpackhi_epi64(r01, _mm_slli_si128(r23, 8));
        r0 = _mm_shuffle_epi32(r01, _MM_SHUFFLE(0,0,0,0));
        r0 = _mm_add_epi64(r0, _mm_and_si128(r0, top64bitmask.v));
        r0a = _mm_shuffle_epi32(r0,_MM_SHUFFLE(3,2,1,2));
        r1 = _mm_shuffle_epi32(r01, _MM_SHUFFLE(2,2,2,2));
        r2 = _mm_shuffle_epi32(r23, _MM_SHUFFLE(0,0,0,0));
        r2 = _mm_add_epi64(r2, _mm_and_si128(r2, top64bitmask.v));
        r2a = _mm_shuffle_epi32(r2,_MM_SHUFFLE(3,2,1,2));
        r3 = _mm_shuffle_epi32(r23, _MM_SHUFFLE(2,2,2,2));
        r34 = _mm_unpackhi_epi64(r23, _mm_slli_si128(r45, 8));
        r4 = _mm_shuffle_epi32(r45, _MM_SHUFFLE(0,0,0,0));
        r4 = _mm_add_epi64(r4, _mm_and_si128(r4, top64bitmask.v));
        r56 = _mm_unpackhi_epi64(r45, _mm_slli_si128(r67, 8));
        r5619 = _mm_mul_epu32(r56, packednineteen.v);
        r5 = _mm_shuffle_epi32(r5619, _MM_SHUFFLE(1,1,1,0));
        r6 = _mm_shuffle_epi32(r5619, _MM_SHUFFLE(3,2,3,2));
        r78 = _mm_unpackhi_epi64(r67, _mm_slli_si128(r89, 8));
        r6x = _mm_unpacklo_epi64(r67, _mm_setzero_si128());
        r7 = _mm_shuffle_epi32(r67, _MM_SHUFFLE(2,2,2,2));
        r7 = _mm_mul_epu32(r7, packed3819.v);
        r7a = _mm_shuffle_epi32(r7, _MM_SHUFFLE(3,3,3,2));
        r8x = _mm_unpacklo_epi64(r89, _mm_setzero_si128());
        r8 = _mm_shuffle_epi32(r89, _MM_SHUFFLE(0,0,0,0));
        r8 = _mm_mul_epu32(r8, packednineteen.v);
        r9 = _mm_shuffle_epi32(r89, _MM_SHUFFLE(2,2,2,2));
        r9x = _mm_slli_epi32(_mm_shuffle_epi32(r89, _MM_SHUFFLE(3,3,3,2)), 1);
        r9 = _mm_mul_epu32(r9, packed3819.v);
        r9a = _mm_shuffle_epi32(r9, _MM_SHUFFLE(2,2,2,2));
        m01 = _mm_mul_epu32(r01, r0);
        m23 = _mm_mul_epu32(r23, r0a);
        m45 = _mm_mul_epu32(r45, r0a);
        m45 = _mm_add_epi64(m45, _mm_mul_epu32(r23, r2));
        r23 = _mm_slli_epi32(r23, 1);
        m67 = _mm_mul_epu32(r67, r0a);
        m67 = _mm_add_epi64(m67, _mm_mul_epu32(r45, r2a));
        m89 = _mm_mul_epu32(r89, r0a);
        m89 = _mm_add_epi64(m89, _mm_mul_epu32(r67, r2a));
        r67 = _mm_slli_epi32(r67, 1);
        m89 = _mm_add_epi64(m89, _mm_mul_epu32(r45, r4));
        r45 = _mm_slli_epi32(r45, 1);
        r1 = _mm_slli_epi32(r1, 1);
        r3 = _mm_slli_epi32(r3, 1);
        r1a = _mm_add_epi64(r1, _mm_and_si128(r1, bot64bitmask.v));
        r3a = _mm_add_epi64(r3, _mm_and_si128(r3, bot64bitmask.v));
        m23 = _mm_add_epi64(m23, _mm_mul_epu32(r12, r1));
        m45 = _mm_add_epi64(m45, _mm_mul_epu32(r34, r1a));
        m67 = _mm_add_epi64(m67, _mm_mul_epu32(r56, r1a));
        m67 = _mm_add_epi64(m67, _mm_mul_epu32(r34, r3));
        r34 = _mm_slli_epi32(r34, 1);
        m89 = _mm_add_epi64(m89, _mm_mul_epu32(r78, r1a));
        r78 = _mm_slli_epi32(r78, 1);
        m89 = _mm_add_epi64(m89, _mm_mul_epu32(r56, r3a));
        r56 = _mm_slli_epi32(r56, 1);
        m01 = _mm_add_epi64(m01, _mm_mul_epu32(_mm_slli_epi32(r12, 1), r9));
        m01 = _mm_add_epi64(m01, _mm_mul_epu32(r34, r7));
        m23 = _mm_add_epi64(m23, _mm_mul_epu32(r34, r9));
        m01 = _mm_add_epi64(m01, _mm_mul_epu32(r56, r5));
        m23 = _mm_add_epi64(m23, _mm_mul_epu32(r56, r7));
        m45 = _mm_add_epi64(m45, _mm_mul_epu32(r56, r9));
        m01 = _mm_add_epi64(m01, _mm_mul_epu32(r23, r8));
        m01 = _mm_add_epi64(m01, _mm_mul_epu32(r45, r6));
        m23 = _mm_add_epi64(m23, _mm_mul_epu32(r45, r8));
        m23 = _mm_add_epi64(m23, _mm_mul_epu32(r6x, r6));
        m45 = _mm_add_epi64(m45, _mm_mul_epu32(r78, r7a));
        m67 = _mm_add_epi64(m67, _mm_mul_epu32(r78, r9));
        m45 = _mm_add_epi64(m45, _mm_mul_epu32(r67, r8));
        m67 = _mm_add_epi64(m67, _mm_mul_epu32(r8x, r8));
        m89 = _mm_add_epi64(m89, _mm_mul_epu32(r9x, r9a));
        r0 = _mm_unpacklo_epi64(m01, m45);
        r1 = _mm_unpackhi_epi64(m01, m45);
        r2 = _mm_unpacklo_epi64(m23, m67);
        r3 = _mm_unpackhi_epi64(m23, m67);
        r4 = _mm_unpacklo_epi64(m89, m89);
        r5 = _mm_unpackhi_epi64(m89, m89);
        c1 = _mm_srli_epi64(r0, 26); c2 = _mm_srli_epi64(r2, 26); r0 = _mm_and_si128(r0, packedmask26.v); r2 = _mm_and_si128(r2, packedmask26.v); r1 = _mm_add_epi64(r1, c1); r3 = _mm_add_epi64(r3, c2);
        c1 = _mm_srli_epi64(r1, 25); c2 = _mm_srli_epi64(r3, 25); r1 = _mm_and_si128(r1, packedmask25.v); r3 = _mm_and_si128(r3, packedmask25.v); r2 = _mm_add_epi64(r2, c1); r4 = _mm_add_epi64(r4, c2); c3 = _mm_slli_si128(c2, 8);
        c1 = _mm_srli_epi64(r4, 26); r4 = _mm_and_si128(r4, packedmask26.v); r5 = _mm_add_epi64(r5, c1);
        c1 = _mm_srli_epi64(r5, 25); r5 = _mm_and_si128(r5, packedmask25.v); r0 = _mm_add_epi64(r0, _mm_unpackhi_epi64(_mm_mul_epu32(c1, packednineteen.v), c3));
        c1 = _mm_srli_epi64(r0, 26); c2 = _mm_srli_epi64(r2, 26); r0 = _mm_and_si128(r0, packedmask26.v); r2 = _mm_and_si128(r2, packedmask26.v); r1 = _mm_add_epi64(r1, c1); r3 = _mm_add_epi64(r3, c2);
        r01 = _mm_unpacklo_epi64(r0, r1);
        r45 = _mm_unpackhi_epi64(r0, r1);
        r23 = _mm_unpacklo_epi64(r2, r3);
        r67 = _mm_unpackhi_epi64(r2, r3);
        r89 = _mm_unpackhi_epi64(r4, r5);
    } while (--count);
    r0123 = _mm_shuffle_epi32(r23, _MM_SHUFFLE(2,0,3,3));
    r4567 = _mm_shuffle_epi32(r67, _MM_SHUFFLE(2,0,3,3));
    r0123 = _mm_or_si128(r0123, _mm_shuffle_epi32(r01, _MM_SHUFFLE(3,3,2,0)));
    r4567 = _mm_or_si128(r4567, _mm_shuffle_epi32(r45, _MM_SHUFFLE(3,3,2,0)));
    r89 = _mm_shuffle_epi32(r89, _MM_SHUFFLE(3,3,2,0));
    _mm_store_si128((xmmi*)r + 0, r0123);
    _mm_store_si128((xmmi*)r + 1, r4567);
    _mm_store_si128((xmmi*)r + 2, r89);
}

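/*
   "Tangled" forms interleave the limbs of two field elements (here named x
   and z) so that one SSE operation works on both elements at once:
   packedelem32 pairs 32-bit limbs side by side for add/sub, while
   packedelem64 puts one limb of each element in its own 64-bit lane for
   _mm_mul_epu32-based multiply/square.
*/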
DONNA_INLINE static void
curve25519_tangle32(packedelem32 *out, const bignum25519 x, const bignum25519 z) {
    xmmi x0,x1,x2,z0,z1,z2;
    x0 = _mm_load_si128((xmmi *)(x + 0));
    x1 = _mm_load_si128((xmmi *)(x + 4));
    x2 = _mm_load_si128((xmmi *)(x + 8));
    z0 = _mm_load_si128((xmmi *)(z + 0));
    z1 = _mm_load_si128((xmmi *)(z + 4));
    z2 = _mm_load_si128((xmmi *)(z + 8));
    out[0].v = _mm_unpacklo_epi32(x0, z0);
    out[1].v = _mm_unpackhi_epi32(x0, z0);
    out[2].v = _mm_unpacklo_epi32(x1, z1);
    out[3].v = _mm_unpackhi_epi32(x1, z1);
    out[4].v = _mm_unpacklo_epi32(x2, z2);
}

DONNA_INLINE static void
curve25519_untangle32(bignum25519 x, bignum25519 z, const packedelem32 *in) {
    xmmi t0,t1,t2,t3,t4,zero;
    t0 = _mm_shuffle_epi32(in[0].v, _MM_SHUFFLE(3,1,2,0));
    t1 = _mm_shuffle_epi32(in[1].v, _MM_SHUFFLE(3,1,2,0));
    t2 = _mm_shuffle_epi32(in[2].v, _MM_SHUFFLE(3,1,2,0));
    t3 = _mm_shuffle_epi32(in[3].v, _MM_SHUFFLE(3,1,2,0));
    t4 = _mm_shuffle_epi32(in[4].v, _MM_SHUFFLE(3,1,2,0));
    zero = _mm_setzero_si128();
    _mm_store_si128((xmmi *)x + 0, _mm_unpacklo_epi64(t0, t1));
    _mm_store_si128((xmmi *)x + 1, _mm_unpacklo_epi64(t2, t3));
    _mm_store_si128((xmmi *)x + 2, _mm_unpacklo_epi64(t4, zero));
    _mm_store_si128((xmmi *)z + 0, _mm_unpackhi_epi64(t0, t1));
    _mm_store_si128((xmmi *)z + 1, _mm_unpackhi_epi64(t2, t3));
    _mm_store_si128((xmmi *)z + 2, _mm_unpackhi_epi64(t4, zero));
}

DONNA_INLINE static void
curve25519_add_reduce_packed32(packedelem32 *out, const packedelem32 *r, const packedelem32 *s) {
    xmmi r0,r1,r2,r3,r4;
    xmmi s0,s1,s2,s3,s4,s5;
    xmmi c1,c2;
    r0 = _mm_add_epi32(r[0].v, s[0].v);
    r1 = _mm_add_epi32(r[1].v, s[1].v);
    r2 = _mm_add_epi32(r[2].v, s[2].v);
    r3 = _mm_add_epi32(r[3].v, s[3].v);
    r4 = _mm_add_epi32(r[4].v, s[4].v);
    s0 = _mm_unpacklo_epi64(r0, r2); /* 00 44 */
    s1 = _mm_unpackhi_epi64(r0, r2); /* 11 55 */
    s2 = _mm_unpacklo_epi64(r1, r3); /* 22 66 */
    s3 = _mm_unpackhi_epi64(r1, r3); /* 33 77 */
    s4 = _mm_unpacklo_epi64(_mm_setzero_si128(), r4); /* 00 88 */
    s5 = _mm_unpackhi_epi64(_mm_setzero_si128(), r4); /* 00 99 */
    c1 = _mm_srli_epi32(s0, 26); c2 = _mm_srli_epi32(s2, 26); s0 = _mm_and_si128(s0, packedmask26262626.v); s2 = _mm_and_si128(s2, packedmask26262626.v); s1 = _mm_add_epi32(s1, c1); s3 = _mm_add_epi32(s3, c2);
    c1 = _mm_srli_epi32(s1, 25); c2 = _mm_srli_epi32(s3, 25); s1 = _mm_and_si128(s1, packedmask25252525.v); s3 = _mm_and_si128(s3, packedmask25252525.v); s2 = _mm_add_epi32(s2, c1); s4 = _mm_add_epi32(s4, _mm_unpackhi_epi64(_mm_setzero_si128(), c2)); s0 = _mm_add_epi32(s0, _mm_unpacklo_epi64(_mm_setzero_si128(), c2));
    c1 = _mm_srli_epi32(s2, 26); c2 = _mm_srli_epi32(s4, 26); s2 = _mm_and_si128(s2, packedmask26262626.v); s4 = _mm_and_si128(s4, packedmask26262626.v); s3 = _mm_add_epi32(s3, c1); s5 = _mm_add_epi32(s5, c2);
    c1 = _mm_srli_epi32(s3, 25); c2 = _mm_srli_epi32(s5, 25); s3 = _mm_and_si128(s3, packedmask25252525.v); s5 = _mm_and_si128(s5, packedmask25252525.v); s4 = _mm_add_epi32(s4, c1); s0 = _mm_add_epi32(s0, _mm_or_si128(_mm_slli_si128(c1, 8), _mm_srli_si128(_mm_add_epi32(_mm_add_epi32(_mm_slli_epi32(c2, 4), _mm_slli_epi32(c2, 1)), c2), 8)));
    c1 = _mm_srli_epi32(s0, 26); c2 = _mm_srli_epi32(s2, 26); s0 = _mm_and_si128(s0, packedmask26262626.v); s2 = _mm_and_si128(s2, packedmask26262626.v); s1 = _mm_add_epi32(s1, c1); s3 = _mm_add_epi32(s3, c2);
    out[0].v = _mm_unpacklo_epi64(s0, s1); /* 00 11 */
    out[1].v = _mm_unpacklo_epi64(s2, s3); /* 22 33 */
    out[2].v = _mm_unpackhi_epi64(s0, s1); /* 44 55 */
    out[3].v = _mm_unpackhi_epi64(s2, s3); /* 66 77 */
    out[4].v = _mm_unpackhi_epi64(s4, s5); /* 88 99 */
}

DONNA_INLINE static void
curve25519_add_packed32(packedelem32 *out, const packedelem32 *r, const packedelem32 *s) {
    out[0].v = _mm_add_epi32(r[0].v, s[0].v);
    out[1].v = _mm_add_epi32(r[1].v, s[1].v);
    out[2].v = _mm_add_epi32(r[2].v, s[2].v);
    out[3].v = _mm_add_epi32(r[3].v, s[3].v);
    out[4].v = _mm_add_epi32(r[4].v, s[4].v);
}

DONNA_INLINE static void
curve25519_sub_packed32(packedelem32 *out, const packedelem32 *r, const packedelem32 *s) {
    xmmi r0,r1,r2,r3,r4;
    xmmi s0,s1,s2,s3;
    xmmi c1,c2;
    r0 = _mm_add_epi32(r[0].v, packed32packed2p0.v);
    r1 = _mm_add_epi32(r[1].v, packed32packed2p1.v);
    r2 = _mm_add_epi32(r[2].v, packed32packed2p1.v);
    r3 = _mm_add_epi32(r[3].v, packed32packed2p1.v);
    r4 = _mm_add_epi32(r[4].v, packed32packed2p1.v);
    r0 = _mm_sub_epi32(r0, s[0].v); /* 00 11 */
    r1 = _mm_sub_epi32(r1, s[1].v); /* 22 33 */
    r2 = _mm_sub_epi32(r2, s[2].v); /* 44 55 */
    r3 = _mm_sub_epi32(r3, s[3].v); /* 66 77 */
    r4 = _mm_sub_epi32(r4, s[4].v); /* 88 99 */
    s0 = _mm_unpacklo_epi64(r0, r2); /* 00 44 */
    s1 = _mm_unpackhi_epi64(r0, r2); /* 11 55 */
    s2 = _mm_unpacklo_epi64(r1, r3); /* 22 66 */
    s3 = _mm_unpackhi_epi64(r1, r3); /* 33 77 */
    c1 = _mm_srli_epi32(s0, 26); c2 = _mm_srli_epi32(s2, 26); s0 = _mm_and_si128(s0, packedmask26262626.v); s2 = _mm_and_si128(s2, packedmask26262626.v); s1 = _mm_add_epi32(s1, c1); s3 = _mm_add_epi32(s3, c2);
    c1 = _mm_srli_epi32(s1, 25); c2 = _mm_srli_epi32(s3, 25); s1 = _mm_and_si128(s1, packedmask25252525.v); s3 = _mm_and_si128(s3, packedmask25252525.v); s2 = _mm_add_epi32(s2, c1); r4 = _mm_add_epi32(r4, _mm_srli_si128(c2, 8)); s0 = _mm_add_epi32(s0, _mm_slli_si128(c2, 8));
    out[0].v = _mm_unpacklo_epi64(s0, s1); /* 00 11 */
    out[1].v = _mm_unpacklo_epi64(s2, s3); /* 22 33 */
    out[2].v = _mm_unpackhi_epi64(s0, s1); /* 44 55 */
    out[3].v = _mm_unpackhi_epi64(s2, s3); /* 66 77 */
    out[4].v = r4;
}

DONNA_INLINE static void
curve25519_sub_after_basic_packed32(packedelem32 *out, const packedelem32 *r, const packedelem32 *s) {
    xmmi r0,r1,r2,r3,r4;
    xmmi s0,s1,s2,s3,s4,s5;
    xmmi c1,c2;
    r0 = _mm_add_epi32(r[0].v, packed32packed4p0.v);
    r1 = _mm_add_epi32(r[1].v, packed32packed4p1.v);
    r2 = _mm_add_epi32(r[2].v, packed32packed4p1.v);
    r3 = _mm_add_epi32(r[3].v, packed32packed4p1.v);
    r4 = _mm_add_epi32(r[4].v, packed32packed4p1.v);
    r0 = _mm_sub_epi32(r0, s[0].v); /* 00 11 */
    r1 = _mm_sub_epi32(r1, s[1].v); /* 22 33 */
    r2 = _mm_sub_epi32(r2, s[2].v); /* 44 55 */
    r3 = _mm_sub_epi32(r3, s[3].v); /* 66 77 */
    r4 = _mm_sub_epi32(r4, s[4].v); /* 88 99 */
    s0 = _mm_unpacklo_epi64(r0, r2); /* 00 44 */
    s1 = _mm_unpackhi_epi64(r0, r2); /* 11 55 */
    s2 = _mm_unpacklo_epi64(r1, r3); /* 22 66 */
    s3 = _mm_unpackhi_epi64(r1, r3); /* 33 77 */
    s4 = _mm_unpacklo_epi64(_mm_setzero_si128(), r4); /* 00 88 */
    s5 = _mm_unpackhi_epi64(_mm_setzero_si128(), r4); /* 00 99 */
    c1 = _mm_srli_epi32(s0, 26); c2 = _mm_srli_epi32(s2, 26); s0 = _mm_and_si128(s0, packedmask26262626.v); s2 = _mm_and_si128(s2, packedmask26262626.v); s1 = _mm_add_epi32(s1, c1); s3 = _mm_add_epi32(s3, c2);
    c1 = _mm_srli_epi32(s1, 25); c2 = _mm_srli_epi32(s3, 25); s1 = _mm_and_si128(s1, packedmask25252525.v); s3 = _mm_and_si128(s3, packedmask25252525.v); s2 = _mm_add_epi32(s2, c1); s4 = _mm_add_epi32(s4, _mm_unpackhi_epi64(_mm_setzero_si128(), c2)); s0 = _mm_add_epi32(s0, _mm_unpacklo_epi64(_mm_setzero_si128(), c2));
    c1 = _mm_srli_epi32(s2, 26); c2 = _mm_srli_epi32(s4, 26); s2 = _mm_and_si128(s2, packedmask26262626.v); s4 = _mm_and_si128(s4, packedmask26262626.v); s3 = _mm_add_epi32(s3, c1); s5 = _mm_add_epi32(s5, c2);
    c1 = _mm_srli_epi32(s3, 25); c2 = _mm_srli_epi32(s5, 25); s3 = _mm_and_si128(s3, packedmask25252525.v); s5 = _mm_and_si128(s5, packedmask25252525.v); s4 = _mm_add_epi32(s4, c1); s0 = _mm_add_epi32(s0, _mm_or_si128(_mm_slli_si128(c1, 8), _mm_srli_si128(_mm_add_epi32(_mm_add_epi32(_mm_slli_epi32(c2, 4), _mm_slli_epi32(c2, 1)), c2), 8)));
    c1 = _mm_srli_epi32(s0, 26); c2 = _mm_srli_epi32(s2, 26); s0 = _mm_and_si128(s0, packedmask26262626.v); s2 = _mm_and_si128(s2, packedmask26262626.v); s1 = _mm_add_epi32(s1, c1); s3 = _mm_add_epi32(s3, c2);
    out[0].v = _mm_unpacklo_epi64(s0, s1); /* 00 11 */
    out[1].v = _mm_unpacklo_epi64(s2, s3); /* 22 33 */
    out[2].v = _mm_unpackhi_epi64(s0, s1); /* 44 55 */
    out[3].v = _mm_unpackhi_epi64(s2, s3); /* 66 77 */
    out[4].v = _mm_unpackhi_epi64(s4, s5); /* 88 99 */
}

DONNA_INLINE static void
curve25519_tangle64_from32(packedelem64 *a, packedelem64 *b, const packedelem32 *c, const packedelem32 *d) {
    xmmi c0,c1,c2,c3,c4,c5,t;
    xmmi d0,d1,d2,d3,d4,d5;
    xmmi t0,t1,t2,t3,t4,zero;
    t0 = _mm_shuffle_epi32(c[0].v, _MM_SHUFFLE(3,1,2,0));
    t1 = _mm_shuffle_epi32(c[1].v, _MM_SHUFFLE(3,1,2,0));
    t2 = _mm_shuffle_epi32(d[0].v, _MM_SHUFFLE(3,1,2,0));
    t3 = _mm_shuffle_epi32(d[1].v, _MM_SHUFFLE(3,1,2,0));
    c0 = _mm_unpacklo_epi64(t0, t1);
    c3 = _mm_unpackhi_epi64(t0, t1);
    d0 = _mm_unpacklo_epi64(t2, t3);
    d3 = _mm_unpackhi_epi64(t2, t3);
    t = _mm_unpacklo_epi64(c0, d0); a[0].v = t; a[1].v = _mm_srli_epi64(t, 32);
    t = _mm_unpackhi_epi64(c0, d0); a[2].v = t; a[3].v = _mm_srli_epi64(t, 32);
    t = _mm_unpacklo_epi64(c3, d3); b[0].v = t; b[1].v = _mm_srli_epi64(t, 32);
    t = _mm_unpackhi_epi64(c3, d3); b[2].v = t; b[3].v = _mm_srli_epi64(t, 32);
    t0 = _mm_shuffle_epi32(c[2].v, _MM_SHUFFLE(3,1,2,0));
    t1 = _mm_shuffle_epi32(c[3].v, _MM_SHUFFLE(3,1,2,0));
    t2 = _mm_shuffle_epi32(d[2].v, _MM_SHUFFLE(3,1,2,0));
    t3 = _mm_shuffle_epi32(d[3].v, _MM_SHUFFLE(3,1,2,0));
    c1 = _mm_unpacklo_epi64(t0, t1);
    c4 = _mm_unpackhi_epi64(t0, t1);
    d1 = _mm_unpacklo_epi64(t2, t3);
    d4 = _mm_unpackhi_epi64(t2, t3);
    t = _mm_unpacklo_epi64(c1, d1); a[4].v = t; a[5].v = _mm_srli_epi64(t, 32);
    t = _mm_unpackhi_epi64(c1, d1); a[6].v = t; a[7].v = _mm_srli_epi64(t, 32);
    t = _mm_unpacklo_epi64(c4, d4); b[4].v = t; b[5].v = _mm_srli_epi64(t, 32);
    t = _mm_unpackhi_epi64(c4, d4); b[6].v = t; b[7].v = _mm_srli_epi64(t, 32);
    t4 = _mm_shuffle_epi32(c[4].v, _MM_SHUFFLE(3,1,2,0));
    zero = _mm_setzero_si128();
    c2 = _mm_unpacklo_epi64(t4, zero);
    c5 = _mm_unpackhi_epi64(t4, zero);
    t4 = _mm_shuffle_epi32(d[4].v, _MM_SHUFFLE(3,1,2,0));
    d2 = _mm_unpacklo_epi64(t4, zero);
    d5 = _mm_unpackhi_epi64(t4, zero);
    t = _mm_unpacklo_epi64(c2, d2); a[8].v = t; a[9].v = _mm_srli_epi64(t, 32);
    t = _mm_unpacklo_epi64(c5, d5); b[8].v = t; b[9].v = _mm_srli_epi64(t, 32);
}

DONNA_INLINE static void
curve25519_tangle64(packedelem64 *out, const bignum25519 x, const bignum25519 z) {
    xmmi x0,x1,x2,z0,z1,z2,t;
    x0 = _mm_load_si128((xmmi *)x + 0);
    x1 = _mm_load_si128((xmmi *)x + 1);
    x2 = _mm_load_si128((xmmi *)x + 2);
    z0 = _mm_load_si128((xmmi *)z + 0);
    z1 = _mm_load_si128((xmmi *)z + 1);
    z2 = _mm_load_si128((xmmi *)z + 2);
    t = _mm_unpacklo_epi64(x0, z0); out[0].v = t; out[1].v = _mm_srli_epi64(t, 32);
    t = _mm_unpackhi_epi64(x0, z0); out[2].v = t; out[3].v = _mm_srli_epi64(t, 32);
    t = _mm_unpacklo_epi64(x1, z1); out[4].v = t; out[5].v = _mm_srli_epi64(t, 32);
    t = _mm_unpackhi_epi64(x1, z1); out[6].v = t; out[7].v = _mm_srli_epi64(t, 32);
    t = _mm_unpacklo_epi64(x2, z2); out[8].v = t; out[9].v = _mm_srli_epi64(t, 32);
}

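/* tangle a single element with itself: broadcast each limb of x into every lane */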
DONNA_INLINE static void
curve25519_tangleone64(packedelem64 *out, const bignum25519 x) {
    xmmi x0,x1,x2;
    x0 = _mm_load_si128((xmmi *)(x + 0));
    x1 = _mm_load_si128((xmmi *)(x + 4));
    x2 = _mm_load_si128((xmmi *)(x + 8));
    out[0].v = _mm_shuffle_epi32(x0, _MM_SHUFFLE(0,0,0,0));
    out[1].v = _mm_shuffle_epi32(x0, _MM_SHUFFLE(1,1,1,1));
    out[2].v = _mm_shuffle_epi32(x0, _MM_SHUFFLE(2,2,2,2));
    out[3].v = _mm_shuffle_epi32(x0, _MM_SHUFFLE(3,3,3,3));
    out[4].v = _mm_shuffle_epi32(x1, _MM_SHUFFLE(0,0,0,0));
    out[5].v = _mm_shuffle_epi32(x1, _MM_SHUFFLE(1,1,1,1));
    out[6].v = _mm_shuffle_epi32(x1, _MM_SHUFFLE(2,2,2,2));
    out[7].v = _mm_shuffle_epi32(x1, _MM_SHUFFLE(3,3,3,3));
    out[8].v = _mm_shuffle_epi32(x2, _MM_SHUFFLE(0,0,0,0));
    out[9].v = _mm_shuffle_epi32(x2, _MM_SHUFFLE(1,1,1,1));
}

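/* swap the x and z halves of each tangled limb in place */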
DONNA_INLINE static void
curve25519_swap64(packedelem64 *out) {
    out[0].v = _mm_shuffle_epi32(out[0].v, _MM_SHUFFLE(1,0,3,2));
    out[1].v = _mm_shuffle_epi32(out[1].v, _MM_SHUFFLE(1,0,3,2));
    out[2].v = _mm_shuffle_epi32(out[2].v, _MM_SHUFFLE(1,0,3,2));
    out[3].v = _mm_shuffle_epi32(out[3].v, _MM_SHUFFLE(1,0,3,2));
    out[4].v = _mm_shuffle_epi32(out[4].v, _MM_SHUFFLE(1,0,3,2));
    out[5].v = _mm_shuffle_epi32(out[5].v, _MM_SHUFFLE(1,0,3,2));
    out[6].v = _mm_shuffle_epi32(out[6].v, _MM_SHUFFLE(1,0,3,2));
    out[7].v = _mm_shuffle_epi32(out[7].v, _MM_SHUFFLE(1,0,3,2));
    out[8].v = _mm_shuffle_epi32(out[8].v, _MM_SHUFFLE(1,0,3,2));
    out[9].v = _mm_shuffle_epi32(out[9].v, _MM_SHUFFLE(1,0,3,2));
}

DONNA_INLINE static void
curve25519_untangle64(bignum25519 x, bignum25519 z, const packedelem64 *in) {
    _mm_store_si128((xmmi *)(x + 0), _mm_unpacklo_epi64(_mm_unpacklo_epi32(in[0].v, in[1].v), _mm_unpacklo_epi32(in[2].v, in[3].v)));
    _mm_store_si128((xmmi *)(x + 4), _mm_unpacklo_epi64(_mm_unpacklo_epi32(in[4].v, in[5].v), _mm_unpacklo_epi32(in[6].v, in[7].v)));
    _mm_store_si128((xmmi *)(x + 8), _mm_unpacklo_epi32(in[8].v, in[9].v) );
    _mm_store_si128((xmmi *)(z + 0), _mm_unpacklo_epi64(_mm_unpackhi_epi32(in[0].v, in[1].v), _mm_unpackhi_epi32(in[2].v, in[3].v)));
    _mm_store_si128((xmmi *)(z + 4), _mm_unpacklo_epi64(_mm_unpackhi_epi32(in[4].v, in[5].v), _mm_unpackhi_epi32(in[6].v, in[7].v)));
    _mm_store_si128((xmmi *)(z + 8), _mm_unpackhi_epi32(in[8].v, in[9].v) );
}

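/* lane-wise multiply of two tangled elements: one 64-bit lane accumulates the x product, the other the z product */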
DONNA_INLINE static void
curve25519_mul_packed64(packedelem64 *out, const packedelem64 *r, const packedelem64 *s) {
    xmmi r1,r2,r3,r4,r5,r6,r7,r8,r9;
    xmmi r1_2,r3_2,r5_2,r7_2,r9_2;
    xmmi c1,c2;
    out[0].v = _mm_mul_epu32(r[0].v, s[0].v);
    out[1].v = _mm_add_epi64(_mm_mul_epu32(r[0].v, s[1].v), _mm_mul_epu32(r[1].v, s[0].v));
    r1_2 = _mm_slli_epi32(r[1].v, 1);
    out[2].v = _mm_add_epi64(_mm_mul_epu32(r[0].v, s[2].v), _mm_add_epi64(_mm_mul_epu32(r1_2 , s[1].v), _mm_mul_epu32(r[2].v, s[0].v)));
    out[3].v = _mm_add_epi64(_mm_mul_epu32(r[0].v, s[3].v), _mm_add_epi64(_mm_mul_epu32(r[1].v, s[2].v), _mm_add_epi64(_mm_mul_epu32(r[2].v, s[1].v), _mm_mul_epu32(r[3].v, s[0].v))));
    r3_2 = _mm_slli_epi32(r[3].v, 1);
    out[4].v = _mm_add_epi64(_mm_mul_epu32(r[0].v, s[4].v), _mm_add_epi64(_mm_mul_epu32(r1_2 , s[3].v), _mm_add_epi64(_mm_mul_epu32(r[2].v, s[2].v), _mm_add_epi64(_mm_mul_epu32(r3_2 , s[1].v), _mm_mul_epu32(r[4].v, s[0].v)))));
    out[5].v = _mm_add_epi64(_mm_mul_epu32(r[0].v, s[5].v), _mm_add_epi64(_mm_mul_epu32(r[1].v, s[4].v), _mm_add_epi64(_mm_mul_epu32(r[2].v, s[3].v), _mm_add_epi64(_mm_mul_epu32(r[3].v, s[2].v), _mm_add_epi64(_mm_mul_epu32(r[4].v, s[1].v), _mm_mul_epu32(r[5].v, s[0].v))))));
    r5_2 = _mm_slli_epi32(r[5].v, 1);
    out[6].v = _mm_add_epi64(_mm_mul_epu32(r[0].v, s[6].v), _mm_add_epi64(_mm_mul_epu32(r1_2 , s[5].v), _mm_add_epi64(_mm_mul_epu32(r[2].v, s[4].v), _mm_add_epi64(_mm_mul_epu32(r3_2 , s[3].v), _mm_add_epi64(_mm_mul_epu32(r[4].v, s[2].v), _mm_add_epi64(_mm_mul_epu32(r5_2 , s[1].v), _mm_mul_epu32(r[6].v, s[0].v)))))));
    out[7].v = _mm_add_epi64(_mm_mul_epu32(r[0].v, s[7].v), _mm_add_epi64(_mm_mul_epu32(r[1].v, s[6].v), _mm_add_epi64(_mm_mul_epu32(r[2].v, s[5].v), _mm_add_epi64(_mm_mul_epu32(r[3].v, s[4].v), _mm_add_epi64(_mm_mul_epu32(r[4].v, s[3].v), _mm_add_epi64(_mm_mul_epu32(r[5].v, s[2].v), _mm_add_epi64(_mm_mul_epu32(r[6].v, s[1].v), _mm_mul_epu32(r[7].v , s[0].v))))))));
    r7_2 = _mm_slli_epi32(r[7].v, 1);
    out[8].v = _mm_add_epi64(_mm_mul_epu32(r[0].v, s[8].v), _mm_add_epi64(_mm_mul_epu32(r1_2 , s[7].v), _mm_add_epi64(_mm_mul_epu32(r[2].v, s[6].v), _mm_add_epi64(_mm_mul_epu32(r3_2 , s[5].v), _mm_add_epi64(_mm_mul_epu32(r[4].v, s[4].v), _mm_add_epi64(_mm_mul_epu32(r5_2 , s[3].v), _mm_add_epi64(_mm_mul_epu32(r[6].v, s[2].v), _mm_add_epi64(_mm_mul_epu32(r7_2 , s[1].v), _mm_mul_epu32(r[8].v, s[0].v)))))))));
    out[9].v = _mm_add_epi64(_mm_mul_epu32(r[0].v, s[9].v), _mm_add_epi64(_mm_mul_epu32(r[1].v, s[8].v), _mm_add_epi64(_mm_mul_epu32(r[2].v, s[7].v), _mm_add_epi64(_mm_mul_epu32(r[3].v, s[6].v), _mm_add_epi64(_mm_mul_epu32(r[4].v, s[5].v), _mm_add_epi64(_mm_mul_epu32(r[5].v, s[4].v), _mm_add_epi64(_mm_mul_epu32(r[6].v, s[3].v), _mm_add_epi64(_mm_mul_epu32(r[7].v, s[2].v), _mm_add_epi64(_mm_mul_epu32(r[8].v, s[1].v), _mm_mul_epu32(r[9].v, s[0].v))))))))));
    r1 = _mm_mul_epu32(r[1].v, packednineteen.v);
    r2 = _mm_mul_epu32(r[2].v, packednineteen.v);
    r1_2 = _mm_slli_epi32(r1, 1);
    r3 = _mm_mul_epu32(r[3].v, packednineteen.v);
    r4 = _mm_mul_epu32(r[4].v, packednineteen.v);
    r3_2 = _mm_slli_epi32(r3, 1);
    r5 = _mm_mul_epu32(r[5].v, packednineteen.v);
    r6 = _mm_mul_epu32(r[6].v, packednineteen.v);
    r5_2 = _mm_slli_epi32(r5, 1);
    r7 = _mm_mul_epu32(r[7].v, packednineteen.v);
    r8 = _mm_mul_epu32(r[8].v, packednineteen.v);
    r7_2 = _mm_slli_epi32(r7, 1);
    r9 = _mm_mul_epu32(r[9].v, packednineteen.v);
    r9_2 = _mm_slli_epi32(r9, 1);
    out[0].v = _mm_add_epi64(out[0].v, _mm_add_epi64(_mm_mul_epu32(r9_2, s[1].v), _mm_add_epi64(_mm_mul_epu32(r8, s[2].v), _mm_add_epi64(_mm_mul_epu32(r7_2, s[3].v), _mm_add_epi64(_mm_mul_epu32(r6, s[4].v), _mm_add_epi64(_mm_mul_epu32(r5_2, s[5].v), _mm_add_epi64(_mm_mul_epu32(r4, s[6].v), _mm_add_epi64(_mm_mul_epu32(r3_2, s[7].v), _mm_add_epi64(_mm_mul_epu32(r2, s[8].v), _mm_mul_epu32(r1_2, s[9].v))))))))));
    out[1].v = _mm_add_epi64(out[1].v, _mm_add_epi64(_mm_mul_epu32(r9 , s[2].v), _mm_add_epi64(_mm_mul_epu32(r8, s[3].v), _mm_add_epi64(_mm_mul_epu32(r7 , s[4].v), _mm_add_epi64(_mm_mul_epu32(r6, s[5].v), _mm_add_epi64(_mm_mul_epu32(r5 , s[6].v), _mm_add_epi64(_mm_mul_epu32(r4, s[7].v), _mm_add_epi64(_mm_mul_epu32(r3 , s[8].v), _mm_mul_epu32(r2, s[9].v)))))))));
    out[2].v = _mm_add_epi64(out[2].v, _mm_add_epi64(_mm_mul_epu32(r9_2, s[3].v), _mm_add_epi64(_mm_mul_epu32(r8, s[4].v), _mm_add_epi64(_mm_mul_epu32(r7_2, s[5].v), _mm_add_epi64(_mm_mul_epu32(r6, s[6].v), _mm_add_epi64(_mm_mul_epu32(r5_2, s[7].v), _mm_add_epi64(_mm_mul_epu32(r4, s[8].v), _mm_mul_epu32(r3_2, s[9].v))))))));
    out[3].v = _mm_add_epi64(out[3].v, _mm_add_epi64(_mm_mul_epu32(r9 , s[4].v), _mm_add_epi64(_mm_mul_epu32(r8, s[5].v), _mm_add_epi64(_mm_mul_epu32(r7 , s[6].v), _mm_add_epi64(_mm_mul_epu32(r6, s[7].v), _mm_add_epi64(_mm_mul_epu32(r5 , s[8].v), _mm_mul_epu32(r4, s[9].v)))))));
    out[4].v = _mm_add_epi64(out[4].v, _mm_add_epi64(_mm_mul_epu32(r9_2, s[5].v), _mm_add_epi64(_mm_mul_epu32(r8, s[6].v), _mm_add_epi64(_mm_mul_epu32(r7_2, s[7].v), _mm_add_epi64(_mm_mul_epu32(r6, s[8].v), _mm_mul_epu32(r5_2, s[9].v))))));
    out[5].v = _mm_add_epi64(out[5].v, _mm_add_epi64(_mm_mul_epu32(r9 , s[6].v), _mm_add_epi64(_mm_mul_epu32(r8, s[7].v), _mm_add_epi64(_mm_mul_epu32(r7 , s[8].v), _mm_mul_epu32(r6, s[9].v)))));
    out[6].v = _mm_add_epi64(out[6].v, _mm_add_epi64(_mm_mul_epu32(r9_2, s[7].v), _mm_add_epi64(_mm_mul_epu32(r8, s[8].v), _mm_mul_epu32(r7_2, s[9].v))));
    out[7].v = _mm_add_epi64(out[7].v, _mm_add_epi64(_mm_mul_epu32(r9 , s[8].v), _mm_mul_epu32(r8, s[9].v)));
    out[8].v = _mm_add_epi64(out[8].v, _mm_mul_epu32(r9_2, s[9].v));
    c1 = _mm_srli_epi64(out[0].v, 26); c2 = _mm_srli_epi64(out[4].v, 26); out[0].v = _mm_and_si128(out[0].v, packedmask26.v); out[4].v = _mm_and_si128(out[4].v, packedmask26.v); out[1].v = _mm_add_epi64(out[1].v, c1); out[5].v = _mm_add_epi64(out[5].v, c2);
    c1 = _mm_srli_epi64(out[1].v, 25); c2 = _mm_srli_epi64(out[5].v, 25); out[1].v = _mm_and_si128(out[1].v, packedmask25.v); out[5].v = _mm_and_si128(out[5].v, packedmask25.v); out[2].v = _mm_add_epi64(out[2].v, c1); out[6].v = _mm_add_epi64(out[6].v, c2);
    c1 = _mm_srli_epi64(out[2].v, 26); c2 = _mm_srli_epi64(out[6].v, 26); out[2].v = _mm_and_si128(out[2].v, packedmask26.v); out[6].v = _mm_and_si128(out[6].v, packedmask26.v); out[3].v = _mm_add_epi64(out[3].v, c1); out[7].v = _mm_add_epi64(out[7].v, c2);
    c1 = _mm_srli_epi64(out[3].v, 25); c2 = _mm_srli_epi64(out[7].v, 25); out[3].v = _mm_and_si128(out[3].v, packedmask25.v); out[7].v = _mm_and_si128(out[7].v, packedmask25.v); out[4].v = _mm_add_epi64(out[4].v, c1); out[8].v = _mm_add_epi64(out[8].v, c2);
    c2 = _mm_srli_epi64(out[8].v, 26); out[8].v = _mm_and_si128(out[8].v, packedmask26.v); out[9].v = _mm_add_epi64(out[9].v, c2);
    c2 = _mm_srli_epi64(out[9].v, 25); out[9].v = _mm_and_si128(out[9].v, packedmask25.v); out[0].v = _mm_add_epi64(out[0].v, _mm_mul_epu32(c2, packednineteen.v));
    c1 = _mm_srli_epi64(out[0].v, 26); c2 = _mm_srli_epi64(out[4].v, 26); out[0].v = _mm_and_si128(out[0].v, packedmask26.v); out[4].v = _mm_and_si128(out[4].v, packedmask26.v); out[1].v = _mm_add_epi64(out[1].v, c1); out[5].v = _mm_add_epi64(out[5].v, c2);
}

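/* lane-wise square of a tangled element */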
  756. DONNA_INLINE static void
  757. curve25519_square_packed64(packedelem64 *out, const packedelem64 *r) {
  758. xmmi r0,r1,r2,r3;
  759. xmmi r1_2,r3_2,r4_2,r5_2,r6_2,r7_2;
  760. xmmi d5,d6,d7,d8,d9;
  761. xmmi c1,c2;
  762. r0 = r[0].v;
  763. r1 = r[1].v;
  764. r2 = r[2].v;
  765. r3 = r[3].v;
  766. out[0].v = _mm_mul_epu32(r0, r0);
  767. r0 = _mm_slli_epi32(r0, 1);
  768. out[1].v = _mm_mul_epu32(r0, r1);
  769. r1_2 = _mm_slli_epi32(r1, 1);
  770. out[2].v = _mm_add_epi64(_mm_mul_epu32(r0, r2 ), _mm_mul_epu32(r1, r1_2));
  771. r1 = r1_2;
  772. out[3].v = _mm_add_epi64(_mm_mul_epu32(r0, r3 ), _mm_mul_epu32(r1, r2 ));
  773. r3_2 = _mm_slli_epi32(r3, 1);
  774. out[4].v = _mm_add_epi64(_mm_mul_epu32(r0, r[4].v), _mm_add_epi64(_mm_mul_epu32(r1, r3_2 ), _mm_mul_epu32(r2, r2)));
  775. r2 = _mm_slli_epi32(r2, 1);
  776. out[5].v = _mm_add_epi64(_mm_mul_epu32(r0, r[5].v), _mm_add_epi64(_mm_mul_epu32(r1, r[4].v), _mm_mul_epu32(r2, r3)));
  777. r5_2 = _mm_slli_epi32(r[5].v, 1);
  778. out[6].v = _mm_add_epi64(_mm_mul_epu32(r0, r[6].v), _mm_add_epi64(_mm_mul_epu32(r1, r5_2 ), _mm_add_epi64(_mm_mul_epu32(r2, r[4].v), _mm_mul_epu32(r3, r3_2 ))));
  779. r3 = r3_2;
  780. out[7].v = _mm_add_epi64(_mm_mul_epu32(r0, r[7].v), _mm_add_epi64(_mm_mul_epu32(r1, r[6].v), _mm_add_epi64(_mm_mul_epu32(r2, r[5].v), _mm_mul_epu32(r3, r[4].v))));
  781. r7_2 = _mm_slli_epi32(r[7].v, 1);
  782. out[8].v = _mm_add_epi64(_mm_mul_epu32(r0, r[8].v), _mm_add_epi64(_mm_mul_epu32(r1, r7_2 ), _mm_add_epi64(_mm_mul_epu32(r2, r[6].v), _mm_add_epi64(_mm_mul_epu32(r3, r5_2 ), _mm_mul_epu32(r[4].v, r[4].v)))));
  783. out[9].v = _mm_add_epi64(_mm_mul_epu32(r0, r[9].v), _mm_add_epi64(_mm_mul_epu32(r1, r[8].v), _mm_add_epi64(_mm_mul_epu32(r2, r[7].v), _mm_add_epi64(_mm_mul_epu32(r3, r[6].v), _mm_mul_epu32(r[4].v, r5_2 )))));
    d5 = _mm_mul_epu32(r[5].v, packedthirtyeight.v);
    d6 = _mm_mul_epu32(r[6].v, packednineteen.v);
    d7 = _mm_mul_epu32(r[7].v, packedthirtyeight.v);
    d8 = _mm_mul_epu32(r[8].v, packednineteen.v);
    d9 = _mm_mul_epu32(r[9].v, packedthirtyeight.v);
    r4_2 = _mm_slli_epi32(r[4].v, 1);
    r6_2 = _mm_slli_epi32(r[6].v, 1);
    out[0].v = _mm_add_epi64(out[0].v, _mm_add_epi64(_mm_mul_epu32(d9, r1 ), _mm_add_epi64(_mm_mul_epu32(d8, r2 ), _mm_add_epi64(_mm_mul_epu32(d7, r3 ), _mm_add_epi64(_mm_mul_epu32(d6, r4_2), _mm_mul_epu32(d5, r[5].v))))));
    out[1].v = _mm_add_epi64(out[1].v, _mm_add_epi64(_mm_mul_epu32(d9, _mm_srli_epi32(r2, 1)), _mm_add_epi64(_mm_mul_epu32(d8, r3 ), _mm_add_epi64(_mm_mul_epu32(d7, r[4].v), _mm_mul_epu32(d6, r5_2 )))));
    out[2].v = _mm_add_epi64(out[2].v, _mm_add_epi64(_mm_mul_epu32(d9, r3 ), _mm_add_epi64(_mm_mul_epu32(d8, r4_2), _mm_add_epi64(_mm_mul_epu32(d7, r5_2 ), _mm_mul_epu32(d6, r[6].v)))));
    out[3].v = _mm_add_epi64(out[3].v, _mm_add_epi64(_mm_mul_epu32(d9, r[4].v ), _mm_add_epi64(_mm_mul_epu32(d8, r5_2), _mm_mul_epu32(d7, r[6].v))));
    out[4].v = _mm_add_epi64(out[4].v, _mm_add_epi64(_mm_mul_epu32(d9, r5_2 ), _mm_add_epi64(_mm_mul_epu32(d8, r6_2), _mm_mul_epu32(d7, r[7].v))));
    out[5].v = _mm_add_epi64(out[5].v, _mm_add_epi64(_mm_mul_epu32(d9, r[6].v ), _mm_mul_epu32(d8, r7_2 )));
    out[6].v = _mm_add_epi64(out[6].v, _mm_add_epi64(_mm_mul_epu32(d9, r7_2 ), _mm_mul_epu32(d8, r[8].v)));
    out[7].v = _mm_add_epi64(out[7].v, _mm_mul_epu32(d9, r[8].v));
    out[8].v = _mm_add_epi64(out[8].v, _mm_mul_epu32(d9, r[9].v));
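    /* carry pass: limbs i and i+4 are carried as a pair per step, alternating the
       26/25-bit masks; the carry out of limb 9 wraps back into limb 0 multiplied by 19 */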
    c1 = _mm_srli_epi64(out[0].v, 26); c2 = _mm_srli_epi64(out[4].v, 26); out[0].v = _mm_and_si128(out[0].v, packedmask26.v); out[4].v = _mm_and_si128(out[4].v, packedmask26.v); out[1].v = _mm_add_epi64(out[1].v, c1); out[5].v = _mm_add_epi64(out[5].v, c2);
    c1 = _mm_srli_epi64(out[1].v, 25); c2 = _mm_srli_epi64(out[5].v, 25); out[1].v = _mm_and_si128(out[1].v, packedmask25.v); out[5].v = _mm_and_si128(out[5].v, packedmask25.v); out[2].v = _mm_add_epi64(out[2].v, c1); out[6].v = _mm_add_epi64(out[6].v, c2);
    c1 = _mm_srli_epi64(out[2].v, 26); c2 = _mm_srli_epi64(out[6].v, 26); out[2].v = _mm_and_si128(out[2].v, packedmask26.v); out[6].v = _mm_and_si128(out[6].v, packedmask26.v); out[3].v = _mm_add_epi64(out[3].v, c1); out[7].v = _mm_add_epi64(out[7].v, c2);
    c1 = _mm_srli_epi64(out[3].v, 25); c2 = _mm_srli_epi64(out[7].v, 25); out[3].v = _mm_and_si128(out[3].v, packedmask25.v); out[7].v = _mm_and_si128(out[7].v, packedmask25.v); out[4].v = _mm_add_epi64(out[4].v, c1); out[8].v = _mm_add_epi64(out[8].v, c2);
    c2 = _mm_srli_epi64(out[8].v, 26); out[8].v = _mm_and_si128(out[8].v, packedmask26.v); out[9].v = _mm_add_epi64(out[9].v, c2);
    c2 = _mm_srli_epi64(out[9].v, 25); out[9].v = _mm_and_si128(out[9].v, packedmask25.v); out[0].v = _mm_add_epi64(out[0].v, _mm_mul_epu32(c2, packednineteen.v));
    c1 = _mm_srli_epi64(out[0].v, 26); c2 = _mm_srli_epi64(out[4].v, 26); out[0].v = _mm_and_si128(out[0].v, packedmask26.v); out[4].v = _mm_and_si128(out[4].v, packedmask26.v); out[1].v = _mm_add_epi64(out[1].v, c1); out[5].v = _mm_add_epi64(out[5].v, c2);
}
/* Take a little-endian, 32-byte number and expand it into polynomial form */
static void
curve25519_expand(bignum25519 out, const unsigned char in[32]) {
    uint32_t x0,x1,x2,x3,x4,x5,x6,x7;
    x0 = *(uint32_t *)(in + 0);
    x1 = *(uint32_t *)(in + 4);
    x2 = *(uint32_t *)(in + 8);
    x3 = *(uint32_t *)(in + 12);
    x4 = *(uint32_t *)(in + 16);
    x5 = *(uint32_t *)(in + 20);
    x6 = *(uint32_t *)(in + 24);
    x7 = *(uint32_t *)(in + 28);
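    /* split the value into ten limbs alternating between 26 and 25 bits; the top bit
       of in[31] is discarded by the final 25-bit mask */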
    out[0] = ( x0 ) & 0x3ffffff;
    out[1] = ((((uint64_t)x1 << 32) | x0) >> 26) & 0x1ffffff;
    out[2] = ((((uint64_t)x2 << 32) | x1) >> 19) & 0x3ffffff;
    out[3] = ((((uint64_t)x3 << 32) | x2) >> 13) & 0x1ffffff;
    out[4] = (( x3) >> 6) & 0x3ffffff;
    out[5] = ( x4 ) & 0x1ffffff;
    out[6] = ((((uint64_t)x5 << 32) | x4) >> 25) & 0x3ffffff;
    out[7] = ((((uint64_t)x6 << 32) | x5) >> 19) & 0x1ffffff;
    out[8] = ((((uint64_t)x7 << 32) | x6) >> 12) & 0x3ffffff;
    out[9] = (( x7) >> 6) & 0x1ffffff;
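    /* limbs 10 and 11 are zero padding so that a bignum25519 fills exactly three
       128-bit vectors */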
    out[10] = 0;
    out[11] = 0;
}
/* Take a fully reduced polynomial form number and contract it into a
 * little-endian, 32-byte array
 */
static void
curve25519_contract(unsigned char out[32], const bignum25519 in) {
    bignum25519 ALIGN(16) f;
    curve25519_copy(f, in);
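    /* the carry passes and the +19 / 2^255 bias additions below bring f to its
       unique representative modulo p = 2^255 - 19 without branching */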
    #define carry_pass() \
        f[1] += f[0] >> 26; f[0] &= 0x3ffffff; \
        f[2] += f[1] >> 25; f[1] &= 0x1ffffff; \
        f[3] += f[2] >> 26; f[2] &= 0x3ffffff; \
        f[4] += f[3] >> 25; f[3] &= 0x1ffffff; \
        f[5] += f[4] >> 26; f[4] &= 0x3ffffff; \
        f[6] += f[5] >> 25; f[5] &= 0x1ffffff; \
        f[7] += f[6] >> 26; f[6] &= 0x3ffffff; \
        f[8] += f[7] >> 25; f[7] &= 0x1ffffff; \
        f[9] += f[8] >> 26; f[8] &= 0x3ffffff;

    #define carry_pass_full() \
        carry_pass() \
        f[0] += 19 * (f[9] >> 25); f[9] &= 0x1ffffff;

    #define carry_pass_final() \
        carry_pass() \
        f[9] &= 0x1ffffff;
    carry_pass_full()
    carry_pass_full()
    /* now f is between 0 and 2^255-1, properly carried. */
    /* case 1: between 0 and 2^255-20. case 2: between 2^255-19 and 2^255-1. */
    f[0] += 19;
    carry_pass_full()
    /* now between 19 and 2^255-1 in both cases, and offset by 19. */
    f[0] += (1 << 26) - 19;
    f[1] += (1 << 25) - 1;
    f[2] += (1 << 26) - 1;
    f[3] += (1 << 25) - 1;
    f[4] += (1 << 26) - 1;
    f[5] += (1 << 25) - 1;
    f[6] += (1 << 26) - 1;
    f[7] += (1 << 25) - 1;
    f[8] += (1 << 26) - 1;
    f[9] += (1 << 25) - 1;
    /* now between 2^255 and 2^256-20, and offset by 2^255. */
    carry_pass_final()
    #undef carry_pass
    #undef carry_pass_full
    #undef carry_pass_final
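    /* shift each limb so its least significant bit lands on a byte boundary; the
       limbs are then stored four bytes at a time at offsets 0,3,6,9,12,16,19,22,25,28 */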
    f[1] <<= 2;
    f[2] <<= 3;
    f[3] <<= 5;
    f[4] <<= 6;
    f[6] <<= 1;
    f[7] <<= 3;
    f[8] <<= 4;
    f[9] <<= 6;
    #define F(i, s) \
        out[s+0] |= (unsigned char )(f[i] & 0xff); \
        out[s+1] = (unsigned char )((f[i] >> 8) & 0xff); \
        out[s+2] = (unsigned char )((f[i] >> 16) & 0xff); \
        out[s+3] = (unsigned char )((f[i] >> 24) & 0xff);
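    /* each F() ORs its first byte into whatever the previous limb left there;
       out[0] and out[16] have no preceding limb in their group, so clear them first */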
    out[0] = 0;
    out[16] = 0;
    F(0,0);
    F(1,3);
    F(2,6);
    F(3,9);
    F(4,12);
    F(5,16);
    F(6,19);
    F(7,22);
    F(8,25);
    F(9,28);
    #undef F
}
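/* Illustrative round-trip sketch (not part of the original code): for a canonical
   input, i.e. a value below 2^255 - 19, contract(expand(x)) returns the input bytes:

       bignum25519 ALIGN(16) t;
       unsigned char bytes[32];
       curve25519_expand(t, in);
       curve25519_contract(bytes, t);    // bytes == in when in encodes a value < p
*/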
/* if (iswap) swap(a, b) */
DONNA_INLINE static void
curve25519_swap_conditional(bignum25519 a, bignum25519 b, uint32_t iswap) {
    const uint32_t swap = (uint32_t)(-(int32_t)iswap);
    xmmi a0,a1,a2,b0,b1,b2,x0,x1,x2;
    xmmi mask = _mm_cvtsi32_si128(swap);
    mask = _mm_shuffle_epi32(mask, 0);
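    /* mask is all zeros (iswap == 0) or all ones (iswap == 1); the XOR/AND sequence
       below performs a branch-free conditional swap of a and b */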
    a0 = _mm_load_si128((xmmi *)a + 0);
    a1 = _mm_load_si128((xmmi *)a + 1);
    b0 = _mm_load_si128((xmmi *)b + 0);
    b1 = _mm_load_si128((xmmi *)b + 1);
    b0 = _mm_xor_si128(a0, b0);
    b1 = _mm_xor_si128(a1, b1);
    x0 = _mm_and_si128(b0, mask);
    x1 = _mm_and_si128(b1, mask);
    x0 = _mm_xor_si128(x0, a0);
    x1 = _mm_xor_si128(x1, a1);
    a0 = _mm_xor_si128(x0, b0);
    a1 = _mm_xor_si128(x1, b1);
    _mm_store_si128((xmmi *)a + 0, x0);
    _mm_store_si128((xmmi *)a + 1, x1);
    _mm_store_si128((xmmi *)b + 0, a0);
    _mm_store_si128((xmmi *)b + 1, a1);
    a2 = _mm_load_si128((xmmi *)a + 2);
    b2 = _mm_load_si128((xmmi *)b + 2);
    b2 = _mm_xor_si128(a2, b2);
    x2 = _mm_and_si128(b2, mask);
    x2 = _mm_xor_si128(x2, a2);
    a2 = _mm_xor_si128(x2, b2);
    _mm_store_si128((xmmi *)b + 2, a2);
    _mm_store_si128((xmmi *)a + 2, x2);
}
/* out = (flag) ? in : out */
DONNA_INLINE static void
curve25519_move_conditional_bytes(uint8_t out[96], const uint8_t in[96], uint32_t flag) {
    xmmi a0,a1,a2,a3,a4,a5,b0,b1,b2,b3,b4,b5;
    const uint32_t nb = flag - 1;
    xmmi masknb = _mm_shuffle_epi32(_mm_cvtsi32_si128(nb),0);
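    /* masknb is all ones when flag == 0 and all zeros when flag == 1, so the
       andnot/and/or select below copies in to out only when flag is set */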
    a0 = _mm_load_si128((xmmi *)in + 0);
    a1 = _mm_load_si128((xmmi *)in + 1);
    a2 = _mm_load_si128((xmmi *)in + 2);
    b0 = _mm_load_si128((xmmi *)out + 0);
    b1 = _mm_load_si128((xmmi *)out + 1);
    b2 = _mm_load_si128((xmmi *)out + 2);
    a0 = _mm_andnot_si128(masknb, a0);
    a1 = _mm_andnot_si128(masknb, a1);
    a2 = _mm_andnot_si128(masknb, a2);
    b0 = _mm_and_si128(masknb, b0);
    b1 = _mm_and_si128(masknb, b1);
    b2 = _mm_and_si128(masknb, b2);
    a0 = _mm_or_si128(a0, b0);
    a1 = _mm_or_si128(a1, b1);
    a2 = _mm_or_si128(a2, b2);
    _mm_store_si128((xmmi*)out + 0, a0);
    _mm_store_si128((xmmi*)out + 1, a1);
    _mm_store_si128((xmmi*)out + 2, a2);
    a3 = _mm_load_si128((xmmi *)in + 3);
    a4 = _mm_load_si128((xmmi *)in + 4);
    a5 = _mm_load_si128((xmmi *)in + 5);
    b3 = _mm_load_si128((xmmi *)out + 3);
    b4 = _mm_load_si128((xmmi *)out + 4);
    b5 = _mm_load_si128((xmmi *)out + 5);
    a3 = _mm_andnot_si128(masknb, a3);
    a4 = _mm_andnot_si128(masknb, a4);
    a5 = _mm_andnot_si128(masknb, a5);
    b3 = _mm_and_si128(masknb, b3);
    b4 = _mm_and_si128(masknb, b4);
    b5 = _mm_and_si128(masknb, b5);
    a3 = _mm_or_si128(a3, b3);
    a4 = _mm_or_si128(a4, b4);
    a5 = _mm_or_si128(a5, b5);
    _mm_store_si128((xmmi*)out + 3, a3);
    _mm_store_si128((xmmi*)out + 4, a4);
    _mm_store_si128((xmmi*)out + 5, a5);
}
#endif /* defined(ED25519_SSE2) */