You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

695 lines
13 KiB

  1. // Code generated by goff (v0.2.0) DO NOT EDIT
  2. #include "textflag.h"
  // func MulAssignElement(res, y *Element)
  //
  // Montgomery multiplication of res by y; the result overwrites res.
  //
  // Arguments (16 bytes of arguments, zero-size local frame):
  //   res+0(FP)  pointer to the first operand x, which is also the destination
  //   y+8(FP)    pointer to the second operand
  // Each operand is accessed as four 64-bit words at offsets 0, 8, 16 and 24,
  // least-significant word first. res is only written at the very end, so the
  // words of x stay readable through every outer loop.
  //
  // Two code paths compute the same value:
  //   - when ·supportAdx is 1, a MULX/ADCX/ADOX version using two carry chains
  //   - otherwise (label no_adx) a MULQ/ADDQ/ADCQ version
  // Both leave t in CX, BX, R13, SI and finish in the shared "reduce" tail.
  //
  // Register roles:
  //   DI                res
  //   R8                y (reused as scratch inside reduce)
  //   R12               y[i] for the current outer loop
  //   CX, BX, R13, SI   t[0], t[1], t[2], t[3]
  //   R9                A, the running high word of the x[j]*y[i] row
  //   R10               C, the running high word of the m*q[j] row
  //   R11               m
  //   AX, DX            scratch (DX is the implicit MULX/MULQ operand)
  //
  // t[2] lives in R13 and not in BP: the frame size is 0, so the assembler
  // emits no save/restore of BP, and BP must keep the caller's frame pointer.
  //
  // The algorithm is described here:
  //   https://hackmd.io/@zkteam/modular_multiplication
  // With x the value behind res, the inner loops are split in two so that the
  // ADX path can drive independent carry chains:
  //   for i=0 to N-1
  //       for j=0 to N-1
  //           (A,t[j]) := t[j] + x[j]*y[i] + A
  //       m := t[0]*q'[0] mod W
  //       C,_ := t[0] + m*q[0]
  //       for j=1 to N-1
  //           (C,t[j-1]) := t[j] + m*q[j] + C
  //       t[N-1] = C + A
  //
  // Constants used below:
  //   q[0] = 0x43e1f593f0000001    q[1] = 0x2833e84879b97091
  //   q[2] = 0xb85045b68181585d    q[3] = 0x30644e72e131a029
  //   q'[0] = 0xc2e1f593efffffff
  TEXT ·MulAssignElement(SB), NOSPLIT, $0-16
      // load the two pointer arguments
      MOVQ res+0(FP), DI
      MOVQ y+8(FP), R8

      // take the MULX/ADCX/ADOX path only when supportAdx is set
      CMPB ·supportAdx(SB), $1
      JNE  no_adx

      // ---------------------------------------------------------------------------------------------
      // ADX path, outer loop 0
      // t is not initialised yet, so products are moved into t instead of added.

      // XOR clears both CF (used by ADCX) and OF (used by ADOX)
      XORQ R9, R9
      // R12 = y[0]
      MOVQ 0(R8), R12

      // for j=0 to N-1
      //     (A,t[j]) := x[j]*y[0] + A
      // DX = x[0]; CX = low word, R9 = high word
      MOVQ  0(DI), DX
      MULXQ R12, CX, R9
      // DX = x[1]
      MOVQ  8(DI), DX
      MOVQ  R9, BX
      MULXQ R12, AX, R9
      ADOXQ AX, BX
      // DX = x[2]
      MOVQ  16(DI), DX
      MOVQ  R9, R13
      MULXQ R12, AX, R9
      ADOXQ AX, R13
      // DX = x[3]
      MOVQ  24(DI), DX
      MOVQ  R9, SI
      MULXQ R12, AX, R9
      ADOXQ AX, SI
      // fold the pending CF and OF into A; MOVQ $0 is used because it leaves
      // the flags untouched (an XOR here would wipe the carries)
      MOVQ  $0, DX
      ADCXQ DX, R9
      ADOXQ DX, R9

      // m := t[0]*q'[0] mod W  (only the low word, R11, is kept)
      MOVQ  $0xc2e1f593efffffff, DX
      MULXQ CX, R11, DX
      // clear CF and OF before starting the m*q carry chains
      XORQ  DX, DX
      // C,_ := t[0] + m*q[0]  (the low word is discarded, only the carry is kept)
      MOVQ  $0x43e1f593f0000001, DX
      MULXQ R11, AX, R10
      ADCXQ CX, AX
      // for j=1 to N-1
      //     (C,t[j-1]) := t[j] + m*q[j] + C
      // j=1
      MOVQ  $0x2833e84879b97091, DX
      MULXQ R11, AX, DX
      ADCXQ BX, R10
      ADOXQ AX, R10
      MOVQ  R10, CX
      MOVQ  DX, R10
      // j=2
      MOVQ  $0xb85045b68181585d, DX
      MULXQ R11, AX, DX
      ADCXQ R13, R10
      ADOXQ AX, R10
      MOVQ  R10, BX
      MOVQ  DX, R10
      // j=3
      MOVQ  $0x30644e72e131a029, DX
      MULXQ R11, AX, DX
      ADCXQ SI, R10
      ADOXQ AX, R10
      MOVQ  R10, R13
      // t[N-1] = C + A, including the last CF and OF
      MOVQ  $0, AX
      ADCXQ AX, DX
      ADOXQ DX, R9
      MOVQ  R9, SI

      // ---------------------------------------------------------------------------------------------
      // ADX path, outer loop 1

      // clear CF and OF
      XORQ R9, R9
      // R12 = y[1]
      MOVQ 8(R8), R12

      // for j=0 to N-1
      //     (A,t[j]) := t[j] + x[j]*y[1] + A
      // low words are added on the OF chain, high words on the CF chain
      // DX = x[0]
      MOVQ  0(DI), DX
      MULXQ R12, AX, R9
      ADOXQ AX, CX
      // DX = x[1]
      MOVQ  8(DI), DX
      ADCXQ R9, BX
      MULXQ R12, AX, R9
      ADOXQ AX, BX
      // DX = x[2]
      MOVQ  16(DI), DX
      ADCXQ R9, R13
      MULXQ R12, AX, R9
      ADOXQ AX, R13
      // DX = x[3]
      MOVQ  24(DI), DX
      ADCXQ R9, SI
      MULXQ R12, AX, R9
      ADOXQ AX, SI
      // fold the pending CF and OF into A without disturbing the flags
      MOVQ  $0, DX
      ADCXQ DX, R9
      ADOXQ DX, R9

      // m := t[0]*q'[0] mod W
      MOVQ  $0xc2e1f593efffffff, DX
      MULXQ CX, R11, DX
      // clear CF and OF
      XORQ  DX, DX
      // C,_ := t[0] + m*q[0]
      MOVQ  $0x43e1f593f0000001, DX
      MULXQ R11, AX, R10
      ADCXQ CX, AX
      // for j=1 to N-1
      //     (C,t[j-1]) := t[j] + m*q[j] + C
      // j=1
      MOVQ  $0x2833e84879b97091, DX
      MULXQ R11, AX, DX
      ADCXQ BX, R10
      ADOXQ AX, R10
      MOVQ  R10, CX
      MOVQ  DX, R10
      // j=2
      MOVQ  $0xb85045b68181585d, DX
      MULXQ R11, AX, DX
      ADCXQ R13, R10
      ADOXQ AX, R10
      MOVQ  R10, BX
      MOVQ  DX, R10
      // j=3
      MOVQ  $0x30644e72e131a029, DX
      MULXQ R11, AX, DX
      ADCXQ SI, R10
      ADOXQ AX, R10
      MOVQ  R10, R13
      // t[N-1] = C + A
      MOVQ  $0, AX
      ADCXQ AX, DX
      ADOXQ DX, R9
      MOVQ  R9, SI

      // ---------------------------------------------------------------------------------------------
      // ADX path, outer loop 2

      // clear CF and OF
      XORQ R9, R9
      // R12 = y[2]
      MOVQ 16(R8), R12

      // for j=0 to N-1
      //     (A,t[j]) := t[j] + x[j]*y[2] + A
      // DX = x[0]
      MOVQ  0(DI), DX
      MULXQ R12, AX, R9
      ADOXQ AX, CX
      // DX = x[1]
      MOVQ  8(DI), DX
      ADCXQ R9, BX
      MULXQ R12, AX, R9
      ADOXQ AX, BX
      // DX = x[2]
      MOVQ  16(DI), DX
      ADCXQ R9, R13
      MULXQ R12, AX, R9
      ADOXQ AX, R13
      // DX = x[3]
      MOVQ  24(DI), DX
      ADCXQ R9, SI
      MULXQ R12, AX, R9
      ADOXQ AX, SI
      // fold the pending CF and OF into A without disturbing the flags
      MOVQ  $0, DX
      ADCXQ DX, R9
      ADOXQ DX, R9

      // m := t[0]*q'[0] mod W
      MOVQ  $0xc2e1f593efffffff, DX
      MULXQ CX, R11, DX
      // clear CF and OF
      XORQ  DX, DX
      // C,_ := t[0] + m*q[0]
      MOVQ  $0x43e1f593f0000001, DX
      MULXQ R11, AX, R10
      ADCXQ CX, AX
      // for j=1 to N-1
      //     (C,t[j-1]) := t[j] + m*q[j] + C
      // j=1
      MOVQ  $0x2833e84879b97091, DX
      MULXQ R11, AX, DX
      ADCXQ BX, R10
      ADOXQ AX, R10
      MOVQ  R10, CX
      MOVQ  DX, R10
      // j=2
      MOVQ  $0xb85045b68181585d, DX
      MULXQ R11, AX, DX
      ADCXQ R13, R10
      ADOXQ AX, R10
      MOVQ  R10, BX
      MOVQ  DX, R10
      // j=3
      MOVQ  $0x30644e72e131a029, DX
      MULXQ R11, AX, DX
      ADCXQ SI, R10
      ADOXQ AX, R10
      MOVQ  R10, R13
      // t[N-1] = C + A
      MOVQ  $0, AX
      ADCXQ AX, DX
      ADOXQ DX, R9
      MOVQ  R9, SI

      // ---------------------------------------------------------------------------------------------
      // ADX path, outer loop 3

      // clear CF and OF
      XORQ R9, R9
      // R12 = y[3]
      MOVQ 24(R8), R12

      // for j=0 to N-1
      //     (A,t[j]) := t[j] + x[j]*y[3] + A
      // DX = x[0]
      MOVQ  0(DI), DX
      MULXQ R12, AX, R9
      ADOXQ AX, CX
      // DX = x[1]
      MOVQ  8(DI), DX
      ADCXQ R9, BX
      MULXQ R12, AX, R9
      ADOXQ AX, BX
      // DX = x[2]
      MOVQ  16(DI), DX
      ADCXQ R9, R13
      MULXQ R12, AX, R9
      ADOXQ AX, R13
      // DX = x[3]
      MOVQ  24(DI), DX
      ADCXQ R9, SI
      MULXQ R12, AX, R9
      ADOXQ AX, SI
      // fold the pending CF and OF into A without disturbing the flags
      MOVQ  $0, DX
      ADCXQ DX, R9
      ADOXQ DX, R9

      // m := t[0]*q'[0] mod W
      MOVQ  $0xc2e1f593efffffff, DX
      MULXQ CX, R11, DX
      // clear CF and OF
      XORQ  DX, DX
      // C,_ := t[0] + m*q[0]
      MOVQ  $0x43e1f593f0000001, DX
      MULXQ R11, AX, R10
      ADCXQ CX, AX
      // for j=1 to N-1
      //     (C,t[j-1]) := t[j] + m*q[j] + C
      // j=1
      MOVQ  $0x2833e84879b97091, DX
      MULXQ R11, AX, DX
      ADCXQ BX, R10
      ADOXQ AX, R10
      MOVQ  R10, CX
      MOVQ  DX, R10
      // j=2
      MOVQ  $0xb85045b68181585d, DX
      MULXQ R11, AX, DX
      ADCXQ R13, R10
      ADOXQ AX, R10
      MOVQ  R10, BX
      MOVQ  DX, R10
      // j=3
      MOVQ  $0x30644e72e131a029, DX
      MULXQ R11, AX, DX
      ADCXQ SI, R10
      ADOXQ AX, R10
      MOVQ  R10, R13
      // t[N-1] = C + A
      MOVQ  $0, AX
      ADCXQ AX, DX
      ADOXQ DX, R9
      MOVQ  R9, SI

  reduce:
      // Final reduction, shared by both paths (no_adx jumps here).
      // Input: t in CX, BX, R13, SI. Compute u = t - q in a second register
      // set, because SUBQ/SBBQ overwrite their destination operand.
      MOVQ CX, DX
      MOVQ BX, R8
      MOVQ R13, R9
      MOVQ SI, R10
      // MOVQ of an immediate does not touch the flags, so the borrow chain
      // SUBQ -> SBBQ -> SBBQ -> SBBQ stays intact across the constant loads
      MOVQ $0x43e1f593f0000001, R11
      SUBQ R11, DX
      MOVQ $0x2833e84879b97091, R11
      SBBQ R11, R8
      MOVQ $0xb85045b68181585d, R11
      SBBQ R11, R9
      MOVQ $0x30644e72e131a029, R11
      SBBQ R11, R10
      // CF set   <=> the subtraction borrowed <=> t < q: keep t
      // CF clear <=> t >= q: keep u = t - q
      // Conditional moves are used so that no branch depends on the value of t.
      CMOVQCS CX, DX
      CMOVQCS BX, R8
      CMOVQCS R13, R9
      CMOVQCS SI, R10
      // store the selected value into res
      MOVQ DX, 0(DI)
      MOVQ R8, 8(DI)
      MOVQ R9, 16(DI)
      MOVQ R10, 24(DI)
      RET

  no_adx:
      // Fallback path: same algorithm with MULQ (DX:AX = AX * operand) and
      // explicit ADDQ/ADCQ carry propagation. Here the x*y and m*q steps are
      // interleaved per word:
      //   (A,t[j])   := t[j] + x[j]*y[i] + A
      //   (C,t[j-1]) := t[j] + m*q[j] + C

      // ---------------------------------------------------------------------------------------------
      // outer loop 0
      // (A,t[0]) := x[0]*y[0]
      MOVQ (DI), AX               // x[0]
      MOVQ 0(R8), R12             // y[0]
      MULQ R12                    // x[0] * y[0]
      MOVQ DX, R9
      MOVQ AX, CX
      // m := t[0]*q'[0] mod W  (IMULQ keeps the low 64 bits)
      MOVQ  $0xc2e1f593efffffff, R11
      IMULQ CX, R11
      // C,_ := t[0] + m*q[0]
      MOVQ $0x43e1f593f0000001, AX
      MULQ R11
      ADDQ CX, AX
      ADCQ $0, DX
      MOVQ DX, R10
      // j=1
      MOVQ 8(DI), AX
      MULQ R12                    // x[1] * y[0]
      MOVQ R9, BX
      ADDQ AX, BX
      ADCQ $0, DX
      MOVQ DX, R9
      MOVQ $0x2833e84879b97091, AX
      MULQ R11                    // m * q[1]
      ADDQ BX, R10
      ADCQ $0, DX
      ADDQ AX, R10
      ADCQ $0, DX
      MOVQ R10, CX
      MOVQ DX, R10
      // j=2
      MOVQ 16(DI), AX
      MULQ R12                    // x[2] * y[0]
      MOVQ R9, R13
      ADDQ AX, R13
      ADCQ $0, DX
      MOVQ DX, R9
      MOVQ $0xb85045b68181585d, AX
      MULQ R11                    // m * q[2]
      ADDQ R13, R10
      ADCQ $0, DX
      ADDQ AX, R10
      ADCQ $0, DX
      MOVQ R10, BX
      MOVQ DX, R10
      // j=3
      MOVQ 24(DI), AX
      MULQ R12                    // x[3] * y[0]
      MOVQ R9, SI
      ADDQ AX, SI
      ADCQ $0, DX
      MOVQ DX, R9
      MOVQ $0x30644e72e131a029, AX
      MULQ R11                    // m * q[3]
      ADDQ SI, R10
      ADCQ $0, DX
      ADDQ AX, R10
      ADCQ $0, DX
      MOVQ R10, R13
      MOVQ DX, R10
      // t[N-1] = C + A
      ADDQ R10, R9
      MOVQ R9, SI

      // ---------------------------------------------------------------------------------------------
      // outer loop 1
      // (A,t[0]) := t[0] + x[0]*y[1]
      MOVQ (DI), AX               // x[0]
      MOVQ 8(R8), R12             // y[1]
      MULQ R12                    // x[0] * y[1]
      ADDQ AX, CX
      ADCQ $0, DX
      MOVQ DX, R9
      // m := t[0]*q'[0] mod W
      MOVQ  $0xc2e1f593efffffff, R11
      IMULQ CX, R11
      // C,_ := t[0] + m*q[0]
      MOVQ $0x43e1f593f0000001, AX
      MULQ R11
      ADDQ CX, AX
      ADCQ $0, DX
      MOVQ DX, R10
      // j=1
      MOVQ 8(DI), AX
      MULQ R12                    // x[1] * y[1]
      ADDQ R9, BX
      ADCQ $0, DX
      ADDQ AX, BX
      ADCQ $0, DX
      MOVQ DX, R9
      MOVQ $0x2833e84879b97091, AX
      MULQ R11                    // m * q[1]
      ADDQ BX, R10
      ADCQ $0, DX
      ADDQ AX, R10
      ADCQ $0, DX
      MOVQ R10, CX
      MOVQ DX, R10
      // j=2
      MOVQ 16(DI), AX
      MULQ R12                    // x[2] * y[1]
      ADDQ R9, R13
      ADCQ $0, DX
      ADDQ AX, R13
      ADCQ $0, DX
      MOVQ DX, R9
      MOVQ $0xb85045b68181585d, AX
      MULQ R11                    // m * q[2]
      ADDQ R13, R10
      ADCQ $0, DX
      ADDQ AX, R10
      ADCQ $0, DX
      MOVQ R10, BX
      MOVQ DX, R10
      // j=3
      MOVQ 24(DI), AX
      MULQ R12                    // x[3] * y[1]
      ADDQ R9, SI
      ADCQ $0, DX
      ADDQ AX, SI
      ADCQ $0, DX
      MOVQ DX, R9
      MOVQ $0x30644e72e131a029, AX
      MULQ R11                    // m * q[3]
      ADDQ SI, R10
      ADCQ $0, DX
      ADDQ AX, R10
      ADCQ $0, DX
      MOVQ R10, R13
      MOVQ DX, R10
      // t[N-1] = C + A
      ADDQ R10, R9
      MOVQ R9, SI

      // ---------------------------------------------------------------------------------------------
      // outer loop 2
      // (A,t[0]) := t[0] + x[0]*y[2]
      MOVQ (DI), AX               // x[0]
      MOVQ 16(R8), R12            // y[2]
      MULQ R12                    // x[0] * y[2]
      ADDQ AX, CX
      ADCQ $0, DX
      MOVQ DX, R9
      // m := t[0]*q'[0] mod W
      MOVQ  $0xc2e1f593efffffff, R11
      IMULQ CX, R11
      // C,_ := t[0] + m*q[0]
      MOVQ $0x43e1f593f0000001, AX
      MULQ R11
      ADDQ CX, AX
      ADCQ $0, DX
      MOVQ DX, R10
      // j=1
      MOVQ 8(DI), AX
      MULQ R12                    // x[1] * y[2]
      ADDQ R9, BX
      ADCQ $0, DX
      ADDQ AX, BX
      ADCQ $0, DX
      MOVQ DX, R9
      MOVQ $0x2833e84879b97091, AX
      MULQ R11                    // m * q[1]
      ADDQ BX, R10
      ADCQ $0, DX
      ADDQ AX, R10
      ADCQ $0, DX
      MOVQ R10, CX
      MOVQ DX, R10
      // j=2
      MOVQ 16(DI), AX
      MULQ R12                    // x[2] * y[2]
      ADDQ R9, R13
      ADCQ $0, DX
      ADDQ AX, R13
      ADCQ $0, DX
      MOVQ DX, R9
      MOVQ $0xb85045b68181585d, AX
      MULQ R11                    // m * q[2]
      ADDQ R13, R10
      ADCQ $0, DX
      ADDQ AX, R10
      ADCQ $0, DX
      MOVQ R10, BX
      MOVQ DX, R10
      // j=3
      MOVQ 24(DI), AX
      MULQ R12                    // x[3] * y[2]
      ADDQ R9, SI
      ADCQ $0, DX
      ADDQ AX, SI
      ADCQ $0, DX
      MOVQ DX, R9
      MOVQ $0x30644e72e131a029, AX
      MULQ R11                    // m * q[3]
      ADDQ SI, R10
      ADCQ $0, DX
      ADDQ AX, R10
      ADCQ $0, DX
      MOVQ R10, R13
      MOVQ DX, R10
      // t[N-1] = C + A
      ADDQ R10, R9
      MOVQ R9, SI

      // ---------------------------------------------------------------------------------------------
      // outer loop 3
      // (A,t[0]) := t[0] + x[0]*y[3]
      MOVQ (DI), AX               // x[0]
      MOVQ 24(R8), R12            // y[3]
      MULQ R12                    // x[0] * y[3]
      ADDQ AX, CX
      ADCQ $0, DX
      MOVQ DX, R9
      // m := t[0]*q'[0] mod W
      MOVQ  $0xc2e1f593efffffff, R11
      IMULQ CX, R11
      // C,_ := t[0] + m*q[0]
      MOVQ $0x43e1f593f0000001, AX
      MULQ R11
      ADDQ CX, AX
      ADCQ $0, DX
      MOVQ DX, R10
      // j=1
      MOVQ 8(DI), AX
      MULQ R12                    // x[1] * y[3]
      ADDQ R9, BX
      ADCQ $0, DX
      ADDQ AX, BX
      ADCQ $0, DX
      MOVQ DX, R9
      MOVQ $0x2833e84879b97091, AX
      MULQ R11                    // m * q[1]
      ADDQ BX, R10
      ADCQ $0, DX
      ADDQ AX, R10
      ADCQ $0, DX
      MOVQ R10, CX
      MOVQ DX, R10
      // j=2
      MOVQ 16(DI), AX
      MULQ R12                    // x[2] * y[3]
      ADDQ R9, R13
      ADCQ $0, DX
      ADDQ AX, R13
      ADCQ $0, DX
      MOVQ DX, R9
      MOVQ $0xb85045b68181585d, AX
      MULQ R11                    // m * q[2]
      ADDQ R13, R10
      ADCQ $0, DX
      ADDQ AX, R10
      ADCQ $0, DX
      MOVQ R10, BX
      MOVQ DX, R10
      // j=3
      MOVQ 24(DI), AX
      MULQ R12                    // x[3] * y[3]
      ADDQ R9, SI
      ADCQ $0, DX
      ADDQ AX, SI
      ADCQ $0, DX
      MOVQ DX, R9
      MOVQ $0x30644e72e131a029, AX
      MULQ R11                    // m * q[3]
      ADDQ SI, R10
      ADCQ $0, DX
      ADDQ AX, R10
      ADCQ $0, DX
      MOVQ R10, R13
      MOVQ DX, R10
      // t[N-1] = C + A
      ADDQ R10, R9
      MOVQ R9, SI

      // t is complete in CX, BX, R13, SI: finish in the shared reduction tail
      JMP reduce