1
0

modm-donna-64bit.h 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365
  1. /*
  2. Public domain by Andrew M. <liquidsun@gmail.com>
  3. */
  4. /* separate uint128 check for 64 bit sse2 */
  5. #if defined(HAVE_UINT128) && !defined(ED25519_FORCE_32BIT)
  6. /*
  7. Arithmetic modulo the group order n = 2^252 + 27742317777372353535851937790883648493 = 7237005577332262213973186563042994240857116359379907606001950938285454250989
  8. k = 32
  9. b = 1 << 8 = 256
  10. m = 2^252 + 27742317777372353535851937790883648493 = 0x1000000000000000000000000000000014def9dea2f79cd65812631a5cf5d3ed
  11. mu = floor( b^(k*2) / m ) = 0xfffffffffffffffffffffffffffffffeb2106215d086329a7ed9ce5a30a2c131b
  12. */
  13. #define bignum256modm_bits_per_limb 56
  14. #define bignum256modm_limb_size 5
  15. typedef uint64_t bignum256modm_element_t;
  16. typedef bignum256modm_element_t bignum256modm[5];
  17. static const bignum256modm modm_m = {
  18. 0x12631a5cf5d3ed,
  19. 0xf9dea2f79cd658,
  20. 0x000000000014de,
  21. 0x00000000000000,
  22. 0x00000010000000
  23. };
  24. static const bignum256modm modm_mu = {
  25. 0x9ce5a30a2c131b,
  26. 0x215d086329a7ed,
  27. 0xffffffffeb2106,
  28. 0xffffffffffffff,
  29. 0x00000fffffffff
  30. };
  31. static bignum256modm_element_t
  32. lt_modm(bignum256modm_element_t a, bignum256modm_element_t b) {
  33. return (a - b) >> 63;
  34. }
  35. static void
  36. reduce256_modm(bignum256modm r) {
  37. bignum256modm t;
  38. bignum256modm_element_t b = 0, pb, mask;
  39. /* t = r - m */
  40. pb = 0;
  41. pb += modm_m[0]; b = lt_modm(r[0], pb); t[0] = (r[0] - pb + (b << 56)); pb = b;
  42. pb += modm_m[1]; b = lt_modm(r[1], pb); t[1] = (r[1] - pb + (b << 56)); pb = b;
  43. pb += modm_m[2]; b = lt_modm(r[2], pb); t[2] = (r[2] - pb + (b << 56)); pb = b;
  44. pb += modm_m[3]; b = lt_modm(r[3], pb); t[3] = (r[3] - pb + (b << 56)); pb = b;
  45. pb += modm_m[4]; b = lt_modm(r[4], pb); t[4] = (r[4] - pb + (b << 32));
  46. /* keep r if r was smaller than m */
  47. mask = b - 1;
  48. r[0] ^= mask & (r[0] ^ t[0]);
  49. r[1] ^= mask & (r[1] ^ t[1]);
  50. r[2] ^= mask & (r[2] ^ t[2]);
  51. r[3] ^= mask & (r[3] ^ t[3]);
  52. r[4] ^= mask & (r[4] ^ t[4]);
  53. }
  54. static void
  55. barrett_reduce256_modm(bignum256modm r, const bignum256modm q1, const bignum256modm r1) {
  56. bignum256modm q3, r2;
  57. uint128_t c, mul;
  58. bignum256modm_element_t f, b, pb;
  59. /* q1 = x >> 248 = 264 bits = 5 56 bit elements
  60. q2 = mu * q1
  61. q3 = (q2 / 256(32+1)) = q2 / (2^8)^(32+1) = q2 >> 264 */
  62. mul64x64_128(c, modm_mu[0], q1[3]) mul64x64_128(mul, modm_mu[3], q1[0]) add128(c, mul) mul64x64_128(mul, modm_mu[1], q1[2]) add128(c, mul) mul64x64_128(mul, modm_mu[2], q1[1]) add128(c, mul) shr128(f, c, 56);
  63. mul64x64_128(c, modm_mu[0], q1[4]) add128_64(c, f) mul64x64_128(mul, modm_mu[4], q1[0]) add128(c, mul) mul64x64_128(mul, modm_mu[3], q1[1]) add128(c, mul) mul64x64_128(mul, modm_mu[1], q1[3]) add128(c, mul) mul64x64_128(mul, modm_mu[2], q1[2]) add128(c, mul)
  64. f = lo128(c); q3[0] = (f >> 40) & 0xffff; shr128(f, c, 56);
  65. mul64x64_128(c, modm_mu[4], q1[1]) add128_64(c, f) mul64x64_128(mul, modm_mu[1], q1[4]) add128(c, mul) mul64x64_128(mul, modm_mu[2], q1[3]) add128(c, mul) mul64x64_128(mul, modm_mu[3], q1[2]) add128(c, mul)
  66. f = lo128(c); q3[0] |= (f << 16) & 0xffffffffffffff; q3[1] = (f >> 40) & 0xffff; shr128(f, c, 56);
  67. mul64x64_128(c, modm_mu[4], q1[2]) add128_64(c, f) mul64x64_128(mul, modm_mu[2], q1[4]) add128(c, mul) mul64x64_128(mul, modm_mu[3], q1[3]) add128(c, mul)
  68. f = lo128(c); q3[1] |= (f << 16) & 0xffffffffffffff; q3[2] = (f >> 40) & 0xffff; shr128(f, c, 56);
  69. mul64x64_128(c, modm_mu[4], q1[3]) add128_64(c, f) mul64x64_128(mul, modm_mu[3], q1[4]) add128(c, mul)
  70. f = lo128(c); q3[2] |= (f << 16) & 0xffffffffffffff; q3[3] = (f >> 40) & 0xffff; shr128(f, c, 56);
  71. mul64x64_128(c, modm_mu[4], q1[4]) add128_64(c, f)
  72. f = lo128(c); q3[3] |= (f << 16) & 0xffffffffffffff; q3[4] = (f >> 40) & 0xffff; shr128(f, c, 56);
  73. q3[4] |= (f << 16);
  74. mul64x64_128(c, modm_m[0], q3[0])
  75. r2[0] = lo128(c) & 0xffffffffffffff; shr128(f, c, 56);
  76. mul64x64_128(c, modm_m[0], q3[1]) add128_64(c, f) mul64x64_128(mul, modm_m[1], q3[0]) add128(c, mul)
  77. r2[1] = lo128(c) & 0xffffffffffffff; shr128(f, c, 56);
  78. mul64x64_128(c, modm_m[0], q3[2]) add128_64(c, f) mul64x64_128(mul, modm_m[2], q3[0]) add128(c, mul) mul64x64_128(mul, modm_m[1], q3[1]) add128(c, mul)
  79. r2[2] = lo128(c) & 0xffffffffffffff; shr128(f, c, 56);
  80. mul64x64_128(c, modm_m[0], q3[3]) add128_64(c, f) mul64x64_128(mul, modm_m[3], q3[0]) add128(c, mul) mul64x64_128(mul, modm_m[1], q3[2]) add128(c, mul) mul64x64_128(mul, modm_m[2], q3[1]) add128(c, mul)
  81. r2[3] = lo128(c) & 0xffffffffffffff; shr128(f, c, 56);
  82. mul64x64_128(c, modm_m[0], q3[4]) add128_64(c, f) mul64x64_128(mul, modm_m[4], q3[0]) add128(c, mul) mul64x64_128(mul, modm_m[3], q3[1]) add128(c, mul) mul64x64_128(mul, modm_m[1], q3[3]) add128(c, mul) mul64x64_128(mul, modm_m[2], q3[2]) add128(c, mul)
  83. r2[4] = lo128(c) & 0x0000ffffffffff;
  84. pb = 0;
  85. pb += r2[0]; b = lt_modm(r1[0], pb); r[0] = (r1[0] - pb + (b << 56)); pb = b;
  86. pb += r2[1]; b = lt_modm(r1[1], pb); r[1] = (r1[1] - pb + (b << 56)); pb = b;
  87. pb += r2[2]; b = lt_modm(r1[2], pb); r[2] = (r1[2] - pb + (b << 56)); pb = b;
  88. pb += r2[3]; b = lt_modm(r1[3], pb); r[3] = (r1[3] - pb + (b << 56)); pb = b;
  89. pb += r2[4]; b = lt_modm(r1[4], pb); r[4] = (r1[4] - pb + (b << 40));
  90. reduce256_modm(r);
  91. reduce256_modm(r);
  92. }
  93. static void
  94. add256_modm(bignum256modm r, const bignum256modm x, const bignum256modm y) {
  95. bignum256modm_element_t c;
  96. c = x[0] + y[0]; r[0] = c & 0xffffffffffffff; c >>= 56;
  97. c += x[1] + y[1]; r[1] = c & 0xffffffffffffff; c >>= 56;
  98. c += x[2] + y[2]; r[2] = c & 0xffffffffffffff; c >>= 56;
  99. c += x[3] + y[3]; r[3] = c & 0xffffffffffffff; c >>= 56;
  100. c += x[4] + y[4]; r[4] = c;
  101. reduce256_modm(r);
  102. }
  103. static void
  104. mul256_modm(bignum256modm r, const bignum256modm x, const bignum256modm y) {
  105. bignum256modm q1, r1;
  106. uint128_t c, mul;
  107. bignum256modm_element_t f;
  108. mul64x64_128(c, x[0], y[0])
  109. f = lo128(c); r1[0] = f & 0xffffffffffffff; shr128(f, c, 56);
  110. mul64x64_128(c, x[0], y[1]) add128_64(c, f) mul64x64_128(mul, x[1], y[0]) add128(c, mul)
  111. f = lo128(c); r1[1] = f & 0xffffffffffffff; shr128(f, c, 56);
  112. mul64x64_128(c, x[0], y[2]) add128_64(c, f) mul64x64_128(mul, x[2], y[0]) add128(c, mul) mul64x64_128(mul, x[1], y[1]) add128(c, mul)
  113. f = lo128(c); r1[2] = f & 0xffffffffffffff; shr128(f, c, 56);
  114. mul64x64_128(c, x[0], y[3]) add128_64(c, f) mul64x64_128(mul, x[3], y[0]) add128(c, mul) mul64x64_128(mul, x[1], y[2]) add128(c, mul) mul64x64_128(mul, x[2], y[1]) add128(c, mul)
  115. f = lo128(c); r1[3] = f & 0xffffffffffffff; shr128(f, c, 56);
  116. mul64x64_128(c, x[0], y[4]) add128_64(c, f) mul64x64_128(mul, x[4], y[0]) add128(c, mul) mul64x64_128(mul, x[3], y[1]) add128(c, mul) mul64x64_128(mul, x[1], y[3]) add128(c, mul) mul64x64_128(mul, x[2], y[2]) add128(c, mul)
  117. f = lo128(c); r1[4] = f & 0x0000ffffffffff; q1[0] = (f >> 24) & 0xffffffff; shr128(f, c, 56);
  118. mul64x64_128(c, x[4], y[1]) add128_64(c, f) mul64x64_128(mul, x[1], y[4]) add128(c, mul) mul64x64_128(mul, x[2], y[3]) add128(c, mul) mul64x64_128(mul, x[3], y[2]) add128(c, mul)
  119. f = lo128(c); q1[0] |= (f << 32) & 0xffffffffffffff; q1[1] = (f >> 24) & 0xffffffff; shr128(f, c, 56);
  120. mul64x64_128(c, x[4], y[2]) add128_64(c, f) mul64x64_128(mul, x[2], y[4]) add128(c, mul) mul64x64_128(mul, x[3], y[3]) add128(c, mul)
  121. f = lo128(c); q1[1] |= (f << 32) & 0xffffffffffffff; q1[2] = (f >> 24) & 0xffffffff; shr128(f, c, 56);
  122. mul64x64_128(c, x[4], y[3]) add128_64(c, f) mul64x64_128(mul, x[3], y[4]) add128(c, mul)
  123. f = lo128(c); q1[2] |= (f << 32) & 0xffffffffffffff; q1[3] = (f >> 24) & 0xffffffff; shr128(f, c, 56);
  124. mul64x64_128(c, x[4], y[4]) add128_64(c, f)
  125. f = lo128(c); q1[3] |= (f << 32) & 0xffffffffffffff; q1[4] = (f >> 24) & 0xffffffff; shr128(f, c, 56);
  126. q1[4] |= (f << 32);
  127. barrett_reduce256_modm(r, q1, r1);
  128. }
  129. static void
  130. expand256_modm(bignum256modm out, const unsigned char *in, size_t len) {
  131. unsigned char work[64] = {0};
  132. bignum256modm_element_t x[16];
  133. bignum256modm q1;
  134. memcpy(work, in, len);
  135. x[0] = U8TO64_LE(work + 0);
  136. x[1] = U8TO64_LE(work + 8);
  137. x[2] = U8TO64_LE(work + 16);
  138. x[3] = U8TO64_LE(work + 24);
  139. x[4] = U8TO64_LE(work + 32);
  140. x[5] = U8TO64_LE(work + 40);
  141. x[6] = U8TO64_LE(work + 48);
  142. x[7] = U8TO64_LE(work + 56);
  143. /* r1 = (x mod 256^(32+1)) = x mod (2^8)(31+1) = x & ((1 << 264) - 1) */
  144. out[0] = ( x[0]) & 0xffffffffffffff;
  145. out[1] = ((x[ 0] >> 56) | (x[ 1] << 8)) & 0xffffffffffffff;
  146. out[2] = ((x[ 1] >> 48) | (x[ 2] << 16)) & 0xffffffffffffff;
  147. out[3] = ((x[ 2] >> 40) | (x[ 3] << 24)) & 0xffffffffffffff;
  148. out[4] = ((x[ 3] >> 32) | (x[ 4] << 32)) & 0x0000ffffffffff;
  149. /* under 252 bits, no need to reduce */
  150. if (len < 32)
  151. return;
  152. /* q1 = x >> 248 = 264 bits */
  153. q1[0] = ((x[ 3] >> 56) | (x[ 4] << 8)) & 0xffffffffffffff;
  154. q1[1] = ((x[ 4] >> 48) | (x[ 5] << 16)) & 0xffffffffffffff;
  155. q1[2] = ((x[ 5] >> 40) | (x[ 6] << 24)) & 0xffffffffffffff;
  156. q1[3] = ((x[ 6] >> 32) | (x[ 7] << 32)) & 0xffffffffffffff;
  157. q1[4] = ((x[ 7] >> 24) );
  158. barrett_reduce256_modm(out, q1, out);
  159. }
  160. static void
  161. expand_raw256_modm(bignum256modm out, const unsigned char in[32]) {
  162. bignum256modm_element_t x[4];
  163. x[0] = U8TO64_LE(in + 0);
  164. x[1] = U8TO64_LE(in + 8);
  165. x[2] = U8TO64_LE(in + 16);
  166. x[3] = U8TO64_LE(in + 24);
  167. out[0] = ( x[0]) & 0xffffffffffffff;
  168. out[1] = ((x[ 0] >> 56) | (x[ 1] << 8)) & 0xffffffffffffff;
  169. out[2] = ((x[ 1] >> 48) | (x[ 2] << 16)) & 0xffffffffffffff;
  170. out[3] = ((x[ 2] >> 40) | (x[ 3] << 24)) & 0xffffffffffffff;
  171. out[4] = ((x[ 3] >> 32) ) & 0x000000ffffffff;
  172. }
  173. static void
  174. contract256_modm(unsigned char out[32], const bignum256modm in) {
  175. U64TO8_LE(out + 0, (in[0] ) | (in[1] << 56));
  176. U64TO8_LE(out + 8, (in[1] >> 8) | (in[2] << 48));
  177. U64TO8_LE(out + 16, (in[2] >> 16) | (in[3] << 40));
  178. U64TO8_LE(out + 24, (in[3] >> 24) | (in[4] << 32));
  179. }
  180. static void
  181. contract256_window4_modm(signed char r[64], const bignum256modm in) {
  182. char carry;
  183. signed char *quads = r;
  184. bignum256modm_element_t i, j, v, m;
  185. for (i = 0; i < 5; i++) {
  186. v = in[i];
  187. m = (i == 4) ? 8 : 14;
  188. for (j = 0; j < m; j++) {
  189. *quads++ = (v & 15);
  190. v >>= 4;
  191. }
  192. }
  193. /* making it signed */
  194. carry = 0;
  195. for(i = 0; i < 63; i++) {
  196. r[i] += carry;
  197. r[i+1] += (r[i] >> 4);
  198. r[i] &= 15;
  199. carry = (r[i] >> 3);
  200. r[i] -= (carry << 4);
  201. }
  202. r[63] += carry;
  203. }
  204. static void
  205. contract256_slidingwindow_modm(signed char r[256], const bignum256modm s, int windowsize) {
  206. int i,j,k,b;
  207. int m = (1 << (windowsize - 1)) - 1, soplen = 256;
  208. signed char *bits = r;
  209. bignum256modm_element_t v;
  210. /* first put the binary expansion into r */
  211. for (i = 0; i < 4; i++) {
  212. v = s[i];
  213. for (j = 0; j < 56; j++, v >>= 1)
  214. *bits++ = (v & 1);
  215. }
  216. v = s[4];
  217. for (j = 0; j < 32; j++, v >>= 1)
  218. *bits++ = (v & 1);
  219. /* Making it sliding window */
  220. for (j = 0; j < soplen; j++) {
  221. if (!r[j])
  222. continue;
  223. for (b = 1; (b < (soplen - j)) && (b <= 6); b++) {
  224. if ((r[j] + (r[j + b] << b)) <= m) {
  225. r[j] += r[j + b] << b;
  226. r[j + b] = 0;
  227. } else if ((r[j] - (r[j + b] << b)) >= -m) {
  228. r[j] -= r[j + b] << b;
  229. for (k = j + b; k < soplen; k++) {
  230. if (!r[k]) {
  231. r[k] = 1;
  232. break;
  233. }
  234. r[k] = 0;
  235. }
  236. } else if (r[j + b]) {
  237. break;
  238. }
  239. }
  240. }
  241. }
  242. /*
  243. helpers for batch verifcation, are allowed to be vartime
  244. */
  245. /* out = a - b, a must be larger than b */
  246. static void
  247. sub256_modm_batch(bignum256modm out, const bignum256modm a, const bignum256modm b, size_t limbsize) {
  248. size_t i = 0;
  249. bignum256modm_element_t carry = 0;
  250. switch (limbsize) {
  251. case 4: out[i] = (a[i] - b[i]) ; carry = (out[i] >> 63); out[i] &= 0xffffffffffffff; i++;
  252. case 3: out[i] = (a[i] - b[i]) - carry; carry = (out[i] >> 63); out[i] &= 0xffffffffffffff; i++;
  253. case 2: out[i] = (a[i] - b[i]) - carry; carry = (out[i] >> 63); out[i] &= 0xffffffffffffff; i++;
  254. case 1: out[i] = (a[i] - b[i]) - carry; carry = (out[i] >> 63); out[i] &= 0xffffffffffffff; i++;
  255. case 0:
  256. default: out[i] = (a[i] - b[i]) - carry;
  257. }
  258. }
  259. /* is a < b */
  260. static int
  261. lt256_modm_batch(const bignum256modm a, const bignum256modm b, size_t limbsize) {
  262. size_t i = 0;
  263. bignum256modm_element_t t, carry = 0;
  264. switch (limbsize) {
  265. case 4: t = (a[i] - b[i]) ; carry = (t >> 63); i++;
  266. case 3: t = (a[i] - b[i]) - carry; carry = (t >> 63); i++;
  267. case 2: t = (a[i] - b[i]) - carry; carry = (t >> 63); i++;
  268. case 1: t = (a[i] - b[i]) - carry; carry = (t >> 63); i++;
  269. case 0: t = (a[i] - b[i]) - carry; carry = (t >> 63);
  270. }
  271. return (int)carry;
  272. }
  273. /* is a <= b */
  274. static int
  275. lte256_modm_batch(const bignum256modm a, const bignum256modm b, size_t limbsize) {
  276. size_t i = 0;
  277. bignum256modm_element_t t, carry = 0;
  278. switch (limbsize) {
  279. case 4: t = (b[i] - a[i]) ; carry = (t >> 63); i++;
  280. case 3: t = (b[i] - a[i]) - carry; carry = (t >> 63); i++;
  281. case 2: t = (b[i] - a[i]) - carry; carry = (t >> 63); i++;
  282. case 1: t = (b[i] - a[i]) - carry; carry = (t >> 63); i++;
  283. case 0: t = (b[i] - a[i]) - carry; carry = (t >> 63);
  284. }
  285. return (int)!carry;
  286. }
  287. /* is a == 0 */
  288. static int
  289. iszero256_modm_batch(const bignum256modm a) {
  290. size_t i;
  291. for (i = 0; i < 5; i++)
  292. if (a[i])
  293. return 0;
  294. return 1;
  295. }
  296. /* is a == 1 */
  297. static int
  298. isone256_modm_batch(const bignum256modm a) {
  299. size_t i;
  300. for (i = 0; i < 5; i++)
  301. if (a[i] != ((i) ? 0 : 1))
  302. return 0;
  303. return 1;
  304. }
  305. /* can a fit in to (at most) 128 bits */
  306. static int
  307. isatmost128bits256_modm_batch(const bignum256modm a) {
  308. uint64_t mask =
  309. ((a[4] ) | /* 32 */
  310. (a[3] ) | /* 88 */
  311. (a[2] & 0xffffffffff0000));
  312. return (mask == 0);
  313. }
  314. #endif /* defined(HAVE_UINT128) && !defined(ED25519_FORCE_32BIT) */