You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

5713 lines
105 KiB

  1. global Fr_copy ; symbols declared global are visible to the linker
  2. global Fr_copyn
  3. global Fr_add
  4. global Fr_sub
  5. global Fr_neg
  6. global Fr_mul
  7. global Fr_square
  8. global Fr_band
  9. global Fr_bor
  10. global Fr_bxor
  11. global Fr_bnot
  12. global Fr_eq
  13. global Fr_neq
  14. global Fr_lt
  15. global Fr_gt
  16. global Fr_leq
  17. global Fr_geq
  18. global Fr_land
  19. global Fr_lor
  20. global Fr_lnot
  21. global Fr_toNormal
  22. global Fr_toLongNormal
  23. global Fr_toMontgomery
  24. global Fr_q
  25. DEFAULT REL ; NASM: memory operands such as [q] are RIP-relative by default
  26. section .text ; the routines that follow are assembled into the code section
  ;;;;;;;;;;;;;;;;;;;;;;
  ; copy
  ;;;;;;;;;;;;;;;;;;;;;;
  ; Copies one element (five 64-bit words, 40 bytes) from src to dest.
  ; Words are moved in ascending address order, one load followed by one
  ; store per word.
  ; Params:
  ;   rsi <= the src
  ;   rdi <= the dest
  ;
  ; Modified registers:
  ;   rax
  ;;;;;;;;;;;;;;;;;;;;;;;
  Fr_copy:
  %assign Fr_copy_off 0
  %rep 5
          mov     rax, [rsi + Fr_copy_off]
          mov     [rdi + Fr_copy_off], rax
  %assign Fr_copy_off Fr_copy_off + 8
  %endrep
          ret
  ;;;;;;;;;;;;;;;;;;;;;;
  ; copy an array of elements
  ;;;;;;;;;;;;;;;;;;;;;;
  ; Copies rdx consecutive elements (five 64-bit words each) from src to
  ; dest with a forward `rep movsq`.
  ; Params:
  ;   rsi <= the src
  ;   rdi <= the dest
  ;   rdx <= number of elements to copy
  ;
  ; rsi, rdi and rdx hold their entry values on return.
  ; Modified registers:
  ;   rax (left as 5 * rdx), rcx, r8, r9; the direction flag is cleared
  ;;;;;;;;;;;;;;;;;;;;;;;
  Fr_copyn:
  Fr_copyn_loop:                            ; label kept from the original; not used as a loop target here
          mov     r8, rsi                   ; rep movsq advances rsi/rdi, so save them
          mov     r9, rdi
          lea     rax, [rdx + rdx*4]        ; rax = 5 * count; unlike `mul rdx` this leaves rdx intact
          mov     rcx, rax                  ; rcx = number of qwords to move
          cld                               ; copy towards higher addresses
          rep movsq
          mov     rsi, r8
          mov     rdi, r9
          ret
  ;;;;;;;;;;;;;;;;;;;;;;
  ; rawCopyS2L
  ;;;;;;;;;;;;;;;;;;;;;;
  ; Writes a signed 64-bit integer as a long-format, non-Montgomery element.
  ; The header word is set to 0x80 << 56. A non-negative value is stored in
  ; the first limb with the upper three limbs zeroed. A negative value is
  ; sign-extended to four limbs and q is added to it.
  ; Params:
  ;   rsi <= the integer
  ;   rdi <= Pointer to the overwritten element
  ;
  ; Modified registers:
  ;   rax; on the negative path rsi is also overwritten (left as -1)
  ;;;;;;;;;;;;;;;;;;;;;;;
  rawCopyS2L:
          mov     rax, 0x8000000000000000
          mov     [rdi], rax                ; header: LONG, not Montgomery
          test    rsi, rsi
          js      u64toLong_adjust_neg
          xor     eax, eax                  ; upper limbs of a non-negative value are zero
          mov     [rdi + 8], rsi
          mov     [rdi + 16], rax
          mov     [rdi + 24], rax
          mov     [rdi + 32], rax
          ret
  u64toLong_adjust_neg:
          add     rsi, [q]                  ; first limb; the carry feeds the adc chain below
          mov     [rdi + 8], rsi
          mov     rsi, -1                   ; sign extension of a negative value: all ones
  %assign rawCopyS2L_off 8
  %rep 3
          mov     rax, rsi                  ; mov leaves the carry flag untouched
          adc     rax, [q + rawCopyS2L_off]
          mov     [rdi + rawCopyS2L_off + 8], rax
  %assign rawCopyS2L_off rawCopyS2L_off + 8
  %endrep
          ret
  ;;;;;;;;;;;;;;;;;;;;;;
  ; rawMontgomeryMul
  ;;;;;;;;;;;;;;;;;;;;;;
  ; Multiplies two 4-limb elements in Montgomery form.
  ; Product columns and reduction steps are interleaved. r8, r9 and r10
  ; rotate as the (low, mid, high) words of the running column sum. Each
  ; reduction step derives a multiplier m = low * np, keeps it on the stack
  ; and adds m * q; the low word is then reused as the next high word
  ; without being cleared (it is expected to be zero after adding m * q[0]).
  ; At the end q is subtracted once if the carry word is non-zero or the
  ; 4-limb value is greater than or equal to q.
  ; Params:
  ;   rsi <= Pointer to the long data of element 1
  ;   rdx <= Pointer to the long data of element 2 (restored on return)
  ;   rdi <= Pointer to the long data of result
  ; rdi may be equal to rsi and/or rdx: every result limb is stored only
  ; after the input limbs at that offset have been read for the last time.
  ; Modified registers:
  ;   r8, r9, r10, r11, rax, rcx
  ; Stack:
  ;   32 bytes of scratch holding the four reduction multipliers
  ;;;;;;;;;;;;;;;;;;;;;;
  ; mmul_macc a, b, lo, mid, hi : (hi:mid:lo) += a * b
  %macro mmul_macc 5
          mov     rax, %1
          mul     qword %2
          add     %3, rax
          adc     %4, rdx
          adc     %5, 0x0
  %endmacro
  ; mmul_reduce slot, lo, mid, hi : m = lo * np; [rsp + slot] = m; (hi:mid:lo) += m * q[0]
  %macro mmul_reduce 4
          mov     rax, %2
          mul     r11
          mov     [rsp + %1], rax
          mul     qword [q]
          add     %2, rax
          adc     %3, rdx
          adc     %4, 0x0
  %endmacro
  rawMontgomeryMul:
          sub     rsp, 32                   ; scratch for the four multipliers
          mov     rcx, rdx                  ; mul overwrites rdx, so element 2 is addressed through rcx
          mov     r11, 0xc2e1f593efffffff   ; np
          xor     r8, r8
          xor     r9, r9
          xor     r10, r10
          ; column 0
          mmul_macc   [rsi + 0], [rcx + 0], r8, r9, r10
          mmul_reduce 0, r8, r9, r10
          ; column 1
          mmul_macc   [rsi + 0], [rcx + 8], r9, r10, r8
          mmul_macc   [rsi + 8], [rcx + 0], r9, r10, r8
          mmul_macc   [rsp + 0], [q + 8], r9, r10, r8
          mmul_reduce 8, r9, r10, r8
          ; column 2
          mmul_macc   [rsi + 0], [rcx + 16], r10, r8, r9
          mmul_macc   [rsi + 8], [rcx + 8], r10, r8, r9
          mmul_macc   [rsi + 16], [rcx + 0], r10, r8, r9
          mmul_macc   [rsp + 8], [q + 8], r10, r8, r9
          mmul_macc   [rsp + 0], [q + 16], r10, r8, r9
          mmul_reduce 16, r10, r8, r9
          ; column 3
          mmul_macc   [rsi + 0], [rcx + 24], r8, r9, r10
          mmul_macc   [rsi + 8], [rcx + 16], r8, r9, r10
          mmul_macc   [rsi + 16], [rcx + 8], r8, r9, r10
          mmul_macc   [rsi + 24], [rcx + 0], r8, r9, r10
          mmul_macc   [rsp + 16], [q + 8], r8, r9, r10
          mmul_macc   [rsp + 8], [q + 16], r8, r9, r10
          mmul_macc   [rsp + 0], [q + 24], r8, r9, r10
          mmul_reduce 24, r8, r9, r10
          ; column 4 -> result limb 0
          mmul_macc   [rsi + 8], [rcx + 24], r9, r10, r8
          mmul_macc   [rsi + 16], [rcx + 16], r9, r10, r8
          mmul_macc   [rsi + 24], [rcx + 8], r9, r10, r8
          mmul_macc   [rsp + 24], [q + 8], r9, r10, r8
          mmul_macc   [rsp + 16], [q + 16], r9, r10, r8
          mmul_macc   [rsp + 8], [q + 24], r9, r10, r8
          mov     [rdi + 0 ], r9
          xor     r9, r9
          ; column 5 -> result limb 1
          mmul_macc   [rsi + 16], [rcx + 24], r10, r8, r9
          mmul_macc   [rsi + 24], [rcx + 16], r10, r8, r9
          mmul_macc   [rsp + 24], [q + 16], r10, r8, r9
          mmul_macc   [rsp + 16], [q + 24], r10, r8, r9
          mov     [rdi + 8 ], r10
          xor     r10, r10
          ; column 6 -> result limbs 2 and 3, carry word in r10
          mmul_macc   [rsi + 24], [rcx + 24], r8, r9, r10
          mmul_macc   [rsp + 24], [q + 24], r8, r9, r10
          mov     [rdi + 16 ], r8
          xor     r8, r8
          mov     [rdi + 24 ], r9
          xor     r9, r9
          test    r10, r10
          jnz     rawMontgomeryMul_mulM_sq  ; carry out of the top limb: subtract q
          ; Compare with q from the most significant limb down
  %assign mmul_off 24
  %rep 4
          mov     rax, [rdi + mmul_off]
          cmp     rax, [q + mmul_off]
          jc      rawMontgomeryMul_mulM_done ; result limb is smaller: result < q
          jnz     rawMontgomeryMul_mulM_sq   ; result limb is larger: result > q
  %assign mmul_off mmul_off - 8
  %endrep
          ; All limbs equal: fall through and subtract q
  rawMontgomeryMul_mulM_sq:
          mov     rax, [q + 0]
          sub     [rdi + 0], rax
  %assign mmul_off 8
  %rep 3
          mov     rax, [q + mmul_off]
          sbb     [rdi + mmul_off], rax
  %assign mmul_off mmul_off + 8
  %endrep
  rawMontgomeryMul_mulM_done:
          mov     rdx, rcx                  ; give rdx back its entry value
          add     rsp, 32
          ret
  ;;;;;;;;;;;;;;;;;;;;;;
  ; rawMontgomerySquare
  ;;;;;;;;;;;;;;;;;;;;;;
  ; Squares a 4-limb element in Montgomery form.
  ; Same column-by-column scheme as a Montgomery multiplication, except
  ; that each cross product a[i] * a[j] (i != j) is computed once and
  ; accumulated twice. r8, r9 and r10 rotate as the (low, mid, high) words
  ; of the running column sum. At the end q is subtracted once if the carry
  ; word is non-zero or the 4-limb value is greater than or equal to q.
  ; Params:
  ;   rsi <= Pointer to the long data of element 1
  ;   rdi <= Pointer to the long data of result
  ; rdx is saved in rcx on entry and restored on return.
  ; Modified registers:
  ;   r8, r9, r10, r11, rax, rcx
  ; Stack:
  ;   32 bytes of scratch holding the four reduction multipliers
  ;;;;;;;;;;;;;;;;;;;;;;
  ; msq_sqr a, lo, mid, hi : (hi:mid:lo) += a * a
  %macro msq_sqr 4
          mov     rax, %1
          mul     rax
          add     %2, rax
          adc     %3, rdx
          adc     %4, 0x0
  %endmacro
  ; msq_macc2 a, b, lo, mid, hi : (hi:mid:lo) += 2 * a * b (product added twice)
  %macro msq_macc2 5
          mov     rax, %1
          mul     qword %2
          add     %3, rax
          adc     %4, rdx
          adc     %5, 0x0
          add     %3, rax
          adc     %4, rdx
          adc     %5, 0x0
  %endmacro
  ; msq_macc a, b, lo, mid, hi : (hi:mid:lo) += a * b
  %macro msq_macc 5
          mov     rax, %1
          mul     qword %2
          add     %3, rax
          adc     %4, rdx
          adc     %5, 0x0
  %endmacro
  ; msq_reduce slot, lo, mid, hi : m = lo * np; [rsp + slot] = m; (hi:mid:lo) += m * q[0]
  %macro msq_reduce 4
          mov     rax, %2
          mul     r11
          mov     [rsp + %1], rax
          mul     qword [q]
          add     %2, rax
          adc     %3, rdx
          adc     %4, 0x0
  %endmacro
  rawMontgomerySquare:
          sub     rsp, 32                   ; scratch for the four multipliers
          mov     rcx, rdx                  ; mul overwrites rdx; keep the caller's value
          mov     r11, 0xc2e1f593efffffff   ; np
          xor     r8, r8
          xor     r9, r9
          xor     r10, r10
          ; column 0
          msq_sqr     [rsi + 0], r8, r9, r10
          msq_reduce  0, r8, r9, r10
          ; column 1
          msq_macc2   [rsi + 0], [rsi + 8], r9, r10, r8
          msq_macc    [rsp + 0], [q + 8], r9, r10, r8
          msq_reduce  8, r9, r10, r8
          ; column 2
          msq_macc2   [rsi + 0], [rsi + 16], r10, r8, r9
          msq_sqr     [rsi + 8], r10, r8, r9
          msq_macc    [rsp + 8], [q + 8], r10, r8, r9
          msq_macc    [rsp + 0], [q + 16], r10, r8, r9
          msq_reduce  16, r10, r8, r9
          ; column 3
          msq_macc2   [rsi + 0], [rsi + 24], r8, r9, r10
          msq_macc2   [rsi + 8], [rsi + 16], r8, r9, r10
          msq_macc    [rsp + 16], [q + 8], r8, r9, r10
          msq_macc    [rsp + 8], [q + 16], r8, r9, r10
          msq_macc    [rsp + 0], [q + 24], r8, r9, r10
          msq_reduce  24, r8, r9, r10
          ; column 4 -> result limb 0
          msq_macc2   [rsi + 8], [rsi + 24], r9, r10, r8
          msq_sqr     [rsi + 16], r9, r10, r8
          msq_macc    [rsp + 24], [q + 8], r9, r10, r8
          msq_macc    [rsp + 16], [q + 16], r9, r10, r8
          msq_macc    [rsp + 8], [q + 24], r9, r10, r8
          mov     [rdi + 0 ], r9
          xor     r9, r9
          ; column 5 -> result limb 1
          msq_macc2   [rsi + 16], [rsi + 24], r10, r8, r9
          msq_macc    [rsp + 24], [q + 16], r10, r8, r9
          msq_macc    [rsp + 16], [q + 24], r10, r8, r9
          mov     [rdi + 8 ], r10
          xor     r10, r10
          ; column 6 -> result limbs 2 and 3, carry word in r10
          msq_sqr     [rsi + 24], r8, r9, r10
          msq_macc    [rsp + 24], [q + 24], r8, r9, r10
          mov     [rdi + 16 ], r8
          xor     r8, r8
          mov     [rdi + 24 ], r9
          xor     r9, r9
          test    r10, r10
          jnz     rawMontgomerySquare_mulM_sq ; carry out of the top limb: subtract q
          ; Compare with q from the most significant limb down
  %assign msq_off 24
  %rep 4
          mov     rax, [rdi + msq_off]
          cmp     rax, [q + msq_off]
          jc      rawMontgomerySquare_mulM_done ; result limb is smaller: result < q
          jnz     rawMontgomerySquare_mulM_sq   ; result limb is larger: result > q
  %assign msq_off msq_off - 8
  %endrep
          ; All limbs equal: fall through and subtract q
  rawMontgomerySquare_mulM_sq:
          mov     rax, [q + 0]
          sub     [rdi + 0], rax
  %assign msq_off 8
  %rep 3
          mov     rax, [q + msq_off]
          sbb     [rdi + msq_off], rax
  %assign msq_off msq_off + 8
  %endrep
  rawMontgomerySquare_mulM_done:
          mov     rdx, rcx                  ; give rdx back its entry value
          add     rsp, 32
          ret
  ;;;;;;;;;;;;;;;;;;;;;;
  ; rawMontgomeryMul1
  ;;;;;;;;;;;;;;;;;;;;;;
  ; Montgomery-multiplies a 4-limb element by a single 64-bit value.
  ; Same column-by-column scheme as the full Montgomery multiplication,
  ; with only one product term per column because the second operand has a
  ; single limb. At the end q is subtracted once if the carry word is
  ; non-zero or the 4-limb value is greater than or equal to q.
  ; Params:
  ;   rsi <= Pointer to the long data of element 1
  ;   rdx <= second operand (64-bit value, restored on return)
  ;   rdi <= Pointer to the long data of result
  ; Modified registers:
  ;   r8, r9, r10, r11, rax, rcx
  ; Stack:
  ;   32 bytes of scratch holding the four reduction multipliers
  ;;;;;;;;;;;;;;;;;;;;;;
  ; mmul1_muls a, lo, mid, hi : (hi:mid:lo) += a * rcx
  %macro mmul1_muls 4
          mov     rax, %1
          mul     rcx
          add     %2, rax
          adc     %3, rdx
          adc     %4, 0x0
  %endmacro
  ; mmul1_macc a, b, lo, mid, hi : (hi:mid:lo) += a * b
  %macro mmul1_macc 5
          mov     rax, %1
          mul     qword %2
          add     %3, rax
          adc     %4, rdx
          adc     %5, 0x0
  %endmacro
  ; mmul1_reduce slot, lo, mid, hi : m = lo * np; [rsp + slot] = m; (hi:mid:lo) += m * q[0]
  %macro mmul1_reduce 4
          mov     rax, %2
          mul     r11
          mov     [rsp + %1], rax
          mul     qword [q]
          add     %2, rax
          adc     %3, rdx
          adc     %4, 0x0
  %endmacro
  rawMontgomeryMul1:
          sub     rsp, 32                   ; scratch for the four multipliers
          mov     rcx, rdx                  ; mul overwrites rdx, so the scalar lives in rcx
          mov     r11, 0xc2e1f593efffffff   ; np
          xor     r8, r8
          xor     r9, r9
          xor     r10, r10
          ; column 0
          mmul1_muls   [rsi + 0], r8, r9, r10
          mmul1_reduce 0, r8, r9, r10
          ; column 1
          mmul1_muls   [rsi + 8], r9, r10, r8
          mmul1_macc   [rsp + 0], [q + 8], r9, r10, r8
          mmul1_reduce 8, r9, r10, r8
          ; column 2
          mmul1_muls   [rsi + 16], r10, r8, r9
          mmul1_macc   [rsp + 8], [q + 8], r10, r8, r9
          mmul1_macc   [rsp + 0], [q + 16], r10, r8, r9
          mmul1_reduce 16, r10, r8, r9
          ; column 3
          mmul1_muls   [rsi + 24], r8, r9, r10
          mmul1_macc   [rsp + 16], [q + 8], r8, r9, r10
          mmul1_macc   [rsp + 8], [q + 16], r8, r9, r10
          mmul1_macc   [rsp + 0], [q + 24], r8, r9, r10
          mmul1_reduce 24, r8, r9, r10
          ; column 4 -> result limb 0
          mmul1_macc   [rsp + 24], [q + 8], r9, r10, r8
          mmul1_macc   [rsp + 16], [q + 16], r9, r10, r8
          mmul1_macc   [rsp + 8], [q + 24], r9, r10, r8
          mov     [rdi + 0 ], r9
          xor     r9, r9
          ; column 5 -> result limb 1
          mmul1_macc   [rsp + 24], [q + 16], r10, r8, r9
          mmul1_macc   [rsp + 16], [q + 24], r10, r8, r9
          mov     [rdi + 8 ], r10
          xor     r10, r10
          ; column 6 -> result limbs 2 and 3, carry word in r10
          mmul1_macc   [rsp + 24], [q + 24], r8, r9, r10
          mov     [rdi + 16 ], r8
          xor     r8, r8
          mov     [rdi + 24 ], r9
          xor     r9, r9
          test    r10, r10
          jnz     rawMontgomeryMul1_mulM_sq ; carry out of the top limb: subtract q
          ; Compare with q from the most significant limb down
  %assign mmul1_off 24
  %rep 4
          mov     rax, [rdi + mmul1_off]
          cmp     rax, [q + mmul1_off]
          jc      rawMontgomeryMul1_mulM_done ; result limb is smaller: result < q
          jnz     rawMontgomeryMul1_mulM_sq   ; result limb is larger: result > q
  %assign mmul1_off mmul1_off - 8
  %endrep
          ; All limbs equal: fall through and subtract q
  rawMontgomeryMul1_mulM_sq:
          mov     rax, [q + 0]
          sub     [rdi + 0], rax
  %assign mmul1_off 8
  %rep 3
          mov     rax, [q + mmul1_off]
          sbb     [rdi + mmul1_off], rax
  %assign mmul1_off mmul1_off + 8
  %endrep
  rawMontgomeryMul1_mulM_done:
          mov     rdx, rcx                  ; give rdx back its entry value
          add     rsp, 32
          ret
  ;;;;;;;;;;;;;;;;;;;;;;
  ; rawFromMontgomery
  ;;;;;;;;;;;;;;;;;;;;;;
  ; Runs the Montgomery reduction on a 4-limb value, in place.
  ; The limbs at rdi are fed one per column into the same reduction scheme
  ; used by the Montgomery multiplication routines (no product terms).
  ; At the end q is subtracted once if the carry word is non-zero or the
  ; 4-limb value is greater than or equal to q.
  ; Params:
  ;   rdi <= Pointer to the long data; read as input and overwritten with
  ;          the result
  ; rsi is not accessed by this routine. rdx is saved in rcx on entry and
  ; restored on return.
  ; Modified registers:
  ;   r8, r9, r10, r11, rax, rcx
  ; Stack:
  ;   32 bytes of scratch holding the four reduction multipliers
  ;;;;;;;;;;;;;;;;;;;;;;
  ; mfm_load limb, lo, mid, hi : (hi:mid:lo) += limb
  %macro mfm_load 4
          add     %2, %1
          adc     %3, 0x0
          adc     %4, 0x0
  %endmacro
  ; mfm_macc a, b, lo, mid, hi : (hi:mid:lo) += a * b
  %macro mfm_macc 5
          mov     rax, %1
          mul     qword %2
          add     %3, rax
          adc     %4, rdx
          adc     %5, 0x0
  %endmacro
  ; mfm_reduce slot, lo, mid, hi : m = lo * np; [rsp + slot] = m; (hi:mid:lo) += m * q[0]
  %macro mfm_reduce 4
          mov     rax, %2
          mul     r11
          mov     [rsp + %1], rax
          mul     qword [q]
          add     %2, rax
          adc     %3, rdx
          adc     %4, 0x0
  %endmacro
  rawFromMontgomery:
          sub     rsp, 32                   ; scratch for the four multipliers
          mov     rcx, rdx                  ; mul overwrites rdx; keep the caller's value
          mov     r11, 0xc2e1f593efffffff   ; np
          xor     r8, r8
          xor     r9, r9
          xor     r10, r10
          ; column 0
          mfm_load    [rdi + 0], r8, r9, r10
          mfm_reduce  0, r8, r9, r10
          ; column 1
          mfm_load    [rdi + 8], r9, r10, r8
          mfm_macc    [rsp + 0], [q + 8], r9, r10, r8
          mfm_reduce  8, r9, r10, r8
          ; column 2
          mfm_load    [rdi + 16], r10, r8, r9
          mfm_macc    [rsp + 8], [q + 8], r10, r8, r9
          mfm_macc    [rsp + 0], [q + 16], r10, r8, r9
          mfm_reduce  16, r10, r8, r9
          ; column 3 (last read of the input limbs)
          mfm_load    [rdi + 24], r8, r9, r10
          mfm_macc    [rsp + 16], [q + 8], r8, r9, r10
          mfm_macc    [rsp + 8], [q + 16], r8, r9, r10
          mfm_macc    [rsp + 0], [q + 24], r8, r9, r10
          mfm_reduce  24, r8, r9, r10
          ; column 4 -> result limb 0
          mfm_macc    [rsp + 24], [q + 8], r9, r10, r8
          mfm_macc    [rsp + 16], [q + 16], r9, r10, r8
          mfm_macc    [rsp + 8], [q + 24], r9, r10, r8
          mov     [rdi + 0 ], r9
          xor     r9, r9
          ; column 5 -> result limb 1
          mfm_macc    [rsp + 24], [q + 16], r10, r8, r9
          mfm_macc    [rsp + 16], [q + 24], r10, r8, r9
          mov     [rdi + 8 ], r10
          xor     r10, r10
          ; column 6 -> result limbs 2 and 3, carry word in r10
          mfm_macc    [rsp + 24], [q + 24], r8, r9, r10
          mov     [rdi + 16 ], r8
          xor     r8, r8
          mov     [rdi + 24 ], r9
          xor     r9, r9
          test    r10, r10
          jnz     rawFromMontgomery_mulM_sq ; carry out of the top limb: subtract q
          ; Compare with q from the most significant limb down
  %assign mfm_off 24
  %rep 4
          mov     rax, [rdi + mfm_off]
          cmp     rax, [q + mfm_off]
          jc      rawFromMontgomery_mulM_done ; result limb is smaller: result < q
          jnz     rawFromMontgomery_mulM_sq   ; result limb is larger: result > q
  %assign mfm_off mfm_off - 8
  %endrep
          ; All limbs equal: fall through and subtract q
  rawFromMontgomery_mulM_sq:
          mov     rax, [q + 0]
          sub     [rdi + 0], rax
  %assign mfm_off 8
  %rep 3
          mov     rax, [q + mfm_off]
          sbb     [rdi + mfm_off], rax
  %assign mfm_off mfm_off + 8
  %endrep
  rawFromMontgomery_mulM_done:
          mov     rdx, rcx                  ; give rdx back its entry value
          add     rsp, 32
          ret
  ;;;;;;;;;;;;;;;;;;;;;;
  ; toMontgomery
  ;;;;;;;;;;;;;;;;;;;;;;
  ; Converts an element to long Montgomery form, in place.
  ; Header bit 62 is the Montgomery flag and bit 63 the long flag.
  ;   - Montgomery flag already set: nothing is done.
  ;   - short element: the signed 32-bit value in the low half of the
  ;     header is multiplied by R2 with rawMontgomeryMul1. A negative value
  ;     is multiplied as a positive number and the product is then passed
  ;     to rawNegL.
  ;   - long element: the data is multiplied by R2 with rawMontgomeryMul.
  ; In both converting cases the header is rewritten with bits 62 and 63 set.
  ; Params:
  ;   rdi <= Pointer element to convert
  ; rsi is saved and restored around the conversion.
  ; Modified registers:
  ;   r8, r9, r10, r11, rax, rcx, rdx
  ;   (rawNegL is defined outside this excerpt and may modify more)
  ;;;;;;;;;;;;;;;;;;;;
  Fr_toMontgomery:
          mov     rax, [rdi]
          bts     rax, 62                   ; CF = old Montgomery flag; flag is now set in rax
          jc      toMontgomery_doNothing
          bts     rax, 63                   ; CF = old long flag; flag is now set in rax
          jc      toMontgomeryLong
  toMontgomeryShort:
          mov     [rdi], rax                ; header now says long Montgomery
          add     rdi, 8                    ; rdi -> long data of the element
          push    rsi
          lea     rsi, [R2]
          movsx   rdx, eax                  ; the short value, sign-extended
          test    rdx, rdx
          jns     posMontgomeryShort
  negMontgomeryShort:
          neg     rdx                       ; Do the multiplication positive and then negate the result.
          call    rawMontgomeryMul1
          mov     rsi, rdi
          call    rawNegL
          pop     rsi
          sub     rdi, 8
          ret
  posMontgomeryShort:
          call    rawMontgomeryMul1
          pop     rsi
          sub     rdi, 8
          ret
  toMontgomeryLong:
          mov     [rdi], rax                ; header now says long Montgomery
          add     rdi, 8                    ; rdi -> long data of the element
          push    rsi
          mov     rdx, rdi                  ; multiply the data in place ...
          lea     rsi, [R2]                 ; ... by R2
          call    rawMontgomeryMul
          pop     rsi
          sub     rdi, 8
  toMontgomery_doNothing:
          ret
  ;;;;;;;;;;;;;;;;;;;;;;
  ; toNormal
  ;;;;;;;;;;;;;;;;;;;;;;
  ; Converts an element out of Montgomery form, in place.
  ; Only a long element with the Montgomery flag (header bit 62) set is
  ; changed: the flag is cleared in the header and the data is passed
  ; through rawFromMontgomery. Elements without the flag, and short
  ; elements, are left untouched.
  ; Params:
  ;   rdi <= Pointer element to convert
  ; Modified registers:
  ;   r8, r9, r10, r11, rax, rcx
  ;;;;;;;;;;;;;;;;;;;;
  Fr_toNormal:
          mov     rax, [rdi]
          btc     rax, 62                   ; CF = old Montgomery flag; the bit is flipped in rax
          jnc     toNormal_doNothing        ; flag was clear: not Montgomery
          bt      rax, 63                   ; long?
          jc      toNormalLong
          ret                               ; if short, it means it's converted
  toNormalLong:
          mov     [rdi], rax                ; header with the Montgomery flag cleared
          add     rdi, 8
          call    rawFromMontgomery
          sub     rdi, 8
  toNormal_doNothing:
          ret
  ;;;;;;;;;;;;;;;;;;;;;;
  ; toLongNormal
  ;;;;;;;;;;;;;;;;;;;;;;
  ; Converts an element to long, non-Montgomery form, in place.
  ;   - Montgomery flag (header bit 62) set: the data is passed through
  ;     rawFromMontgomery and the header is rewritten as long normal
  ;     (0x80 << 56), so that the element is not treated as Montgomery again.
  ;   - long flag (bit 63) set, Montgomery flag clear: nothing is done.
  ;   - short: the signed 32-bit value in the low half of the header is
  ;     expanded with rawCopyS2L.
  ; Params:
  ;   rdi <= Pointer element to convert
  ; rsi is preserved (kept in r8 while the short value is expanded).
  ; Modified registers:
  ;   r8, r9, r10, r11, rax, rcx
  ;;;;;;;;;;;;;;;;;;;;
  Fr_toLongNormal:
          mov     rax, [rdi]
          bt      rax, 62                   ; check if montgomery
          jc      toLongNormal_fromMontgomery
          bt      rax, 63                   ; check if long
          jnc     toLongNormal_fromShort
          ret                               ; It is already long
  toLongNormal_fromMontgomery:
          add     rdi, 8
          call    rawFromMontgomery
          sub     rdi, 8
          mov     r11b, 0x80                ; rawFromMontgomery only touches the data,
          shl     r11, 56                   ; so mark the element as long normal here
          mov     [rdi], r11
          ret
  toLongNormal_fromShort:
          mov     r8, rsi                   ; save rsi
          movsx   rsi, eax
          call    rawCopyS2L
          mov     rsi, r8                   ; recover rsi
          ret                               ; without this, execution ran on into Fr_add
  ;;;;;;;;;;;;;;;;;;;;;;
  ; add
  ;;;;;;;;;;;;;;;;;;;;;;
  ; Adds two elements of any kind.
  ; The path is chosen from the two header words: bit 63 is the long flag,
  ; bit 62 the Montgomery flag.
  ;   short + short       : 32-bit signed add; on signed overflow the sum
  ;                         is recomputed in 64 bits and stored through
  ;                         rawCopyS2L.
  ;   long normal + short : rawAddLS, or rawSubLS with the magnitude when
  ;                         the short value is negative; result long normal.
  ;   long + long         : rawAddLL. When exactly one operand is in
  ;                         Montgomery form the other one is first
  ;                         converted with Fr_toMontgomery; that conversion
  ;                         is done in place and modifies the operand.
  ; The result header is written only after any operand conversion, so the
  ; result may be the same element as either operand.
  ; Params:
  ;   rsi <= Pointer to element 1 (restored on return)
  ;   rdx <= Pointer to element 2 (not preserved)
  ;   rdi <= Pointer to result
  ; Modified Registers:
  ;   r8, r9, r10, r11, rax, rcx, rdx
  ;;;;;;;;;;;;;;;;;;;;;;
  ; Fr_add_longs type : store the header byte `type` (0x80 long normal,
  ; 0xC0 long Montgomery), add the two long data areas and return.
  %macro Fr_add_longs 1
          mov     r11b, %1
          shl     r11, 56
          mov     [rdi], r11
          add     rdi, 8
          add     rsi, 8
          add     rdx, 8
          call    rawAddLL
          sub     rdi, 8
          sub     rsi, 8
          ret
  %endmacro
  ; Convert element 2 (rdx) to Montgomery in place; rdi and rsi keep their values.
  %macro Fr_add_op2ToMontgomery 0
          push    rdi
          mov     rdi, rdx
          call    Fr_toMontgomery
          mov     rdx, rdi
          pop     rdi
  %endmacro
  ; Convert element 1 (rsi) to Montgomery in place; rdx is parked in rsi,
  ; which Fr_toMontgomery preserves.
  %macro Fr_add_op1ToMontgomery 0
          push    rdi
          mov     rdi, rsi
          mov     rsi, rdx
          call    Fr_toMontgomery
          mov     rdx, rsi
          mov     rsi, rdi
          pop     rdi
  %endmacro
  Fr_add:
          mov     rax, [rsi]
          mov     rcx, [rdx]
          bt      rax, 63                   ; Check if is short first operand
          jc      add_l1
          bt      rcx, 63                   ; Check if is short second operand
          jc      add_s1l2
  add_s1s2:                                 ; Both operands are short
          mov     edx, eax                  ; a 32-bit write also clears the upper half of rdx
          add     edx, ecx
          jo      add_manageOverflow
          mov     [rdi], rdx                ; not necessary to adjust so just save and return
          ret
  add_manageOverflow:                       ; Do the operation in 64 bits
          push    rsi
          movsx   rsi, eax
          movsx   rdx, ecx
          add     rsi, rdx
          call    rawCopyS2L
          pop     rsi
          ret
  add_l1:
          bt      rcx, 63                   ; Check if is short second operand
          jc      add_l1l2
  ;;;;;;;; first long, second short
  add_l1s2:
          bt      rax, 62                   ; check if montgomery first
          jc      add_l1ms2
  add_l1ns2:
          mov     r11b, 0x80
          shl     r11, 56
          mov     [rdi], r11                ; result is long normal
          add     rsi, 8
          movsx   rdx, ecx                  ; the short value, sign-extended
          add     rdi, 8
          cmp     rdx, 0
          jns     tmp_1
          neg     rdx                       ; negative: subtract the magnitude instead
          call    rawSubLS
          sub     rdi, 8
          sub     rsi, 8
          ret
  tmp_1:
          call    rawAddLS
          sub     rdi, 8
          sub     rsi, 8
          ret
  add_l1ms2:
          bt      rcx, 62                   ; check if montgomery second
          jc      add_l1ms2m
  add_l1ms2n:
          Fr_add_op2ToMontgomery
          Fr_add_longs 0xC0
  add_l1ms2m:
          Fr_add_longs 0xC0
  ;;;;;;;; first short, second long
  add_s1l2:
          bt      rcx, 62                   ; check if montgomery second
          jc      add_s1l2m
  add_s1l2n:
          mov     r11b, 0x80
          shl     r11, 56
          mov     [rdi], r11                ; result is long normal
          push    rsi                       ; rsi is reused for the long operand below
          lea     rsi, [rdx + 8]
          movsx   rdx, eax                  ; the short value, sign-extended
          add     rdi, 8
          cmp     rdx, 0
          jns     tmp_2
          neg     rdx                       ; negative: subtract the magnitude instead
          call    rawSubLS
          sub     rdi, 8
          pop     rsi
          ret
  tmp_2:
          call    rawAddLS
          sub     rdi, 8
          pop     rsi
          ret
  add_s1l2m:
          bt      rax, 62                   ; check if montgomery first
          jc      add_s1ml2m
  add_s1nl2m:
          Fr_add_op1ToMontgomery
          Fr_add_longs 0xC0
  add_s1ml2m:
          Fr_add_longs 0xC0
  ;;;;;;;; both long
  add_l1l2:
          bt      rax, 62                   ; check if montgomery first
          jc      add_l1ml2
  add_l1nl2:
          bt      rcx, 62                   ; check if montgomery second
          jc      add_l1nl2m
  add_l1nl2n:
          Fr_add_longs 0x80
  add_l1nl2m:
          Fr_add_op1ToMontgomery
          Fr_add_longs 0xC0
  add_l1ml2:
          bt      rcx, 62                   ; check if montgomery second
          jc      add_l1ml2m
  add_l1ml2n:
          Fr_add_op2ToMontgomery
          Fr_add_longs 0xC0
  add_l1ml2m:
          Fr_add_longs 0xC0
  ;;;;;;;;;;;;;;;;;;;;;;
  ; rawAddLL
  ;;;;;;;;;;;;;;;;;;;;;;
  ; Adds two 4-limb values limb by limb with carry. q is then subtracted
  ; once if the addition carried out of the top limb or the sum is greater
  ; than or equal to q.
  ; Params:
  ;   rsi <= Pointer to the long data of element 1
  ;   rdx <= Pointer to the long data of element 2
  ;   rdi <= Pointer to the long data of result
  ; Modified Registers:
  ;   rax
  ;;;;;;;;;;;;;;;;;;;;;;
  rawAddLL:
          mov     rax, [rsi + 0]
          add     rax, [rdx + 0]
          mov     [rdi + 0], rax
  %assign rawAddLL_off 8
  %rep 3
          mov     rax, [rsi + rawAddLL_off] ; mov does not disturb the carry flag
          adc     rax, [rdx + rawAddLL_off]
          mov     [rdi + rawAddLL_off], rax
  %assign rawAddLL_off rawAddLL_off + 8
  %endrep
          jc      rawAddLL_sq               ; carry out of the top limb: subtract q
          ; Compare with q, most significant limb first (rax still holds it)
          cmp     rax, [q + 24]
          jc      rawAddLL_done             ; sum limb is smaller: sum < q
          jnz     rawAddLL_sq               ; sum limb is larger: sum > q
  %assign rawAddLL_off 16
  %rep 3
          mov     rax, [rdi + rawAddLL_off]
          cmp     rax, [q + rawAddLL_off]
          jc      rawAddLL_done             ; sum limb is smaller: sum < q
          jnz     rawAddLL_sq               ; sum limb is larger: sum > q
  %assign rawAddLL_off rawAddLL_off - 8
  %endrep
          ; All limbs equal: fall through and subtract q
  rawAddLL_sq:
          mov     rax, [q + 0]
          sub     [rdi + 0], rax
  %assign rawAddLL_off 8
  %rep 3
          mov     rax, [q + rawAddLL_off]
          sbb     [rdi + rawAddLL_off], rax
  %assign rawAddLL_off rawAddLL_off + 8
  %endrep
  rawAddLL_done:
          ret
  ;;;;;;;;;;;;;;;;;;;;;;
  ; rawAddLS
  ;;;;;;;;;;;;;;;;;;;;;;
  ; Adds a 64-bit value to a 4-limb value, propagating the carry through
  ; the upper limbs. q is then subtracted once if the addition carried out
  ; of the top limb or the sum is greater than or equal to q.
  ; Params:
  ;   rdi <= Pointer to the long data of result
  ;   rsi <= Pointer to the long data of element 1
  ;   rdx <= Value to be added
  ; Modified Registers:
  ;   rax, rdx
  ;;;;;;;;;;;;;;;;;;;;;;
  rawAddLS:
          add     rdx, [rsi]
          mov     [rdi], rdx
  %assign rawAddLS_off 8
  %rep 3
          mov     rdx, 0                    ; mov, not xor: the carry flag must survive
          adc     rdx, [rsi + rawAddLS_off]
          mov     [rdi + rawAddLS_off], rdx
  %assign rawAddLS_off rawAddLS_off + 8
  %endrep
          jc      rawAddLS_sq               ; carry out of the top limb: subtract q
          ; Compare with q from the most significant limb down
  %assign rawAddLS_off 24
  %rep 4
          mov     rax, [rdi + rawAddLS_off]
          cmp     rax, [q + rawAddLS_off]
          jc      rawAddLS_done             ; sum limb is smaller: sum < q
          jnz     rawAddLS_sq               ; sum limb is larger: sum > q
  %assign rawAddLS_off rawAddLS_off - 8
  %endrep
          ; All limbs equal: fall through and subtract q
  rawAddLS_sq:
          mov     rax, [q + 0]
          sub     [rdi + 0], rax
  %assign rawAddLS_off 8
  %rep 3
          mov     rax, [q + rawAddLS_off]
          sbb     [rdi + rawAddLS_off], rax
  %assign rawAddLS_off rawAddLS_off + 8
  %endrep
  rawAddLS_done:
          ret
  ;;;;;;;;;;;;;;;;;;;;;;
  ; sub
  ;;;;;;;;;;;;;;;;;;;;;;
  ; Subtracts two field elements of any representation.
  ; The first qword of an element is its type word: bit 63 set means
  ; long (4 limbs of data start at offset 8), bit 62 set means
  ; Montgomery. A short element keeps a signed value in the low 32 bits.
  ; Params:
  ;   rsi <= Pointer to element 1
  ;   rdx <= Pointer to element 2
  ;   rdi <= Pointer to result
  ;
  ;   [rdi] = [rsi] - [rdx]
  ; Notes:
  ;   When only one operand is Montgomery, Fr_toMontgomery is called on
  ;   the other operand itself, so that input is presumably rewritten
  ;   in place (confirm at Fr_toMontgomery).
  ; Modified Registers:
  ;   rax, rcx, rdx, r11 in this routine; the called routines may touch
  ;   more (the original header lists r8, r9, r10, r11, rax, rcx).
  ;;;;;;;;;;;;;;;;;;;;;;

  ; Store (%1 << 56) as the result's type word. Only r11b is written,
  ; but the 56-bit shift pushes every stale bit of r11 out.
  %macro FrSub_setType 1
          mov     r11b, %1
          shl     r11, 56
          mov     [rdi], r11
  %endmacro

  ; Step all three pointers to the long data, subtract, restore rdi/rsi
  ; and return. rdx is left pointing at the data of element 2.
  %macro FrSub_rawSubLL_ret 0
          add     rdi, 8
          add     rsi, 8
          add     rdx, 8
          call    rawSubLL
          sub     rdi, 8
          sub     rsi, 8
          ret
  %endmacro

  ; Run Fr_toMontgomery on element 2 (its pointer travels in rdi).
  %macro FrSub_op2ToMontgomery 0
          push    rdi
          mov     rdi, rdx
          call    Fr_toMontgomery
          mov     rdx, rdi
          pop     rdi
  %endmacro

  ; Run Fr_toMontgomery on element 1; the pointer to element 2 is
  ; parked in rsi for the duration of the call.
  %macro FrSub_op1ToMontgomery 0
          push    rdi
          mov     rdi, rsi
          mov     rsi, rdx
          call    Fr_toMontgomery
          mov     rdx, rsi
          mov     rsi, rdi
          pop     rdi
  %endmacro

  Fr_sub:
          mov     rax, [rsi]                  ; rax = type word / short value of element 1
          mov     rcx, [rdx]                  ; rcx = type word / short value of element 2
          bt      rax, 63                     ; element 1 long?
          jc      sub_l1
          bt      rcx, 63                     ; element 2 long?
          jc      sub_s1l2

  ; ---- short - short ------------------------------------------------
  sub_s1s2:
          xor     rdx, rdx
          mov     edx, eax
          sub     edx, ecx
          jo      sub_manageOverflow          ; does not fit in 32 bits
          mov     [rdi], rdx                  ; fits: store as a short element
          ret

  sub_manageOverflow:                         ; redo the subtraction in 64 bits
          push    rsi
          movsx   rsi, eax
          movsx   rdx, ecx
          sub     rsi, rdx
          call    rawCopyS2L                  ; store the 64-bit value as a long element
          pop     rsi
          ret

  ; ---- element 1 long -------------------------------------------------
  sub_l1:
          bt      rcx, 63                     ; element 2 long as well?
          jc      sub_l1l2

  sub_l1s2:                                   ; long - short
          bt      rax, 62                     ; element 1 Montgomery?
          jc      sub_l1ms2

  sub_l1ns2:                                  ; long normal - short
          FrSub_setType 0x80
          add     rsi, 8
          movsx   rdx, ecx
          add     rdi, 8
          cmp     rdx, 0
          jns     tmp_3
          neg     rdx                         ; a - (-|b|) = a + |b|
          call    rawAddLS
          sub     rdi, 8
          sub     rsi, 8
          ret
  tmp_3:                                      ; element 2 is non-negative
          call    rawSubLS
          sub     rdi, 8
          sub     rsi, 8
          ret

  sub_l1ms2:                                  ; long Montgomery - short
          bt      rcx, 62                     ; element 2 Montgomery?
          jc      sub_l1ms2m
  sub_l1ms2n:
          FrSub_setType 0xC0
          FrSub_op2ToMontgomery
          FrSub_rawSubLL_ret
  sub_l1ms2m:
          FrSub_setType 0xC0
          FrSub_rawSubLL_ret

  ; ---- element 1 short, element 2 long --------------------------------
  sub_s1l2:
          bt      rcx, 62                     ; element 2 Montgomery?
          jc      sub_s1l2m
  sub_s1l2n:                                  ; short - long normal
          FrSub_setType 0x80
          cmp     eax, 0
          js      tmp_4
          ; element 1 is non-negative
          push    rsi
          add     rdi, 8
          movsx   rsi, eax
          add     rdx, 8
          call    rawSubSL
          sub     rdi, 8
          pop     rsi
          ret
  tmp_4:                                      ; element 1 is negative: -b - |a|
          push    rsi
          lea     rsi, [rdx + 8]
          movsx   rdx, eax
          add     rdi, 8
          neg     rdx
          call    rawNegLS
          sub     rdi, 8
          pop     rsi
          ret

  sub_s1l2m:                                  ; short - long Montgomery
          bt      rax, 62                     ; element 1 Montgomery?
          jc      sub_s1ml2m
  sub_s1nl2m:
          FrSub_setType 0xC0
          FrSub_op1ToMontgomery
          FrSub_rawSubLL_ret
  sub_s1ml2m:
          FrSub_setType 0xC0
          FrSub_rawSubLL_ret

  ; ---- long - long ----------------------------------------------------
  sub_l1l2:
          bt      rax, 62                     ; element 1 Montgomery?
          jc      sub_l1ml2
  sub_l1nl2:
          bt      rcx, 62                     ; element 2 Montgomery?
          jc      sub_l1nl2m
  sub_l1nl2n:                                 ; normal - normal
          FrSub_setType 0x80
          FrSub_rawSubLL_ret
  sub_l1nl2m:                                 ; normal - Montgomery
          FrSub_setType 0xC0
          FrSub_op1ToMontgomery
          FrSub_rawSubLL_ret
  sub_l1ml2:
          bt      rcx, 62                     ; element 2 Montgomery?
          jc      sub_l1ml2m
  sub_l1ml2n:                                 ; Montgomery - normal
          FrSub_setType 0xC0
          FrSub_op2ToMontgomery
          FrSub_rawSubLL_ret
  sub_l1ml2m:                                 ; Montgomery - Montgomery
          FrSub_setType 0xC0
          FrSub_rawSubLL_ret
  ;;;;;;;;;;;;;;;;;;;;;;
  ; rawSubLS
  ;;;;;;;;;;;;;;;;;;;;;;
  ; Subtracts a 64-bit value (treated as unsigned) from a 4-limb long
  ; value. q is added back once when the subtraction borrows.
  ; Params:
  ;   rdi <= Pointer to the long data of the result
  ;   rsi <= Pointer to the long data of the minuend
  ;   rdx <= Value to be subtracted
  ;
  ;   [rdi] = [rsi] - rdx
  ; Modified Registers:
  ;   rax, rdx (rdx is zeroed)
  ;;;;;;;;;;;;;;;;;;;;;;
  rawSubLS:
          ; Limb 0 loses the short value; limbs 1..3 only absorb the borrow.
          mov     rax, [rsi + 0]
          sub     rax, rdx
          mov     [rdi + 0], rax
          mov     rdx, 0                      ; mov keeps CF alive for the sbb chain
  %assign rawSubLS_off 8
  %rep 3
          mov     rax, [rsi + rawSubLS_off]
          sbb     rax, rdx
          mov     [rdi + rawSubLS_off], rax
  %assign rawSubLS_off rawSubLS_off + 8
  %endrep
          jnc     rawSubLS_done               ; no borrow: result already non-negative

  rawSubLS_aq:                                ; borrowed: add q back
          mov     rax, [q + 0]
          add     [rdi + 0], rax
  %assign rawSubLS_off 8
  %rep 3
          mov     rax, [q + rawSubLS_off]
          adc     [rdi + rawSubLS_off], rax
  %assign rawSubLS_off rawSubLS_off + 8
  %endrep
  %undef rawSubLS_off

  rawSubLS_done:
          ret
  ;;;;;;;;;;;;;;;;;;;;;;
  ; rawSubSL
  ;;;;;;;;;;;;;;;;;;;;;;
  ; Subtracts a 4-limb long value from a 64-bit value (treated as
  ; unsigned). q is added back once when the subtraction borrows.
  ; Params:
  ;   rdi <= Pointer to the long data of the result
  ;   rsi <= Value to subtract from
  ;   rdx <= Pointer to the long data of the value to be subtracted
  ;
  ;   [rdi] = rsi - [rdx]
  ; Modified Registers:
  ;   rax, rsi
  ;;;;;;;;;;;;;;;;;;;;;;
  rawSubSL:
          ; Limb 0 starts from the short value; limbs 1..3 start from zero.
          sub     rsi, [rdx + 0]
          mov     [rdi + 0], rsi
  %assign rawSubSL_off 8
  %rep 3
          mov     rax, 0                      ; mov keeps CF alive (xor would clear it)
          sbb     rax, [rdx + rawSubSL_off]
          mov     [rdi + rawSubSL_off], rax
  %assign rawSubSL_off rawSubSL_off + 8
  %endrep
          jnc     rawSubSL_done               ; no borrow: nothing to fix

  rawSubSL_aq:                                ; borrowed: add q back
          mov     rax, [q + 0]
          add     [rdi + 0], rax
  %assign rawSubSL_off 8
  %rep 3
          mov     rax, [q + rawSubSL_off]
          adc     [rdi + rawSubSL_off], rax
  %assign rawSubSL_off rawSubSL_off + 8
  %endrep
  %undef rawSubSL_off

  rawSubSL_done:
          ret
  ;;;;;;;;;;;;;;;;;;;;;;
  ; rawSubLL
  ;;;;;;;;;;;;;;;;;;;;;;
  ; Subtracts one 4-limb long value from another. q is added back once
  ; when the subtraction borrows.
  ; Params:
  ;   rdi <= Pointer to the long data of the result
  ;   rsi <= Pointer to the long data of the minuend
  ;   rdx <= Pointer to the long data of the value to be subtracted
  ;
  ;   [rdi] = [rsi] - [rdx]
  ; Modified Registers:
  ;   rax
  ;;;;;;;;;;;;;;;;;;;;;;
  rawSubLL:
          mov     rax, [rsi + 0]
          sub     rax, [rdx + 0]
          mov     [rdi + 0], rax
  %assign rawSubLL_off 8
  %rep 3
          mov     rax, [rsi + rawSubLL_off]   ; mov leaves the borrow in CF untouched
          sbb     rax, [rdx + rawSubLL_off]
          mov     [rdi + rawSubLL_off], rax
  %assign rawSubLL_off rawSubLL_off + 8
  %endrep
          jnc     rawSubLL_done               ; no borrow: nothing to fix

  rawSubLL_aq:                                ; borrowed: add q back
          mov     rax, [q + 0]
          add     [rdi + 0], rax
  %assign rawSubLL_off 8
  %rep 3
          mov     rax, [q + rawSubLL_off]
          adc     [rdi + rawSubLL_off], rax
  %assign rawSubLL_off rawSubLL_off + 8
  %endrep
  %undef rawSubLL_off

  rawSubLL_done:
          ret
  ;;;;;;;;;;;;;;;;;;;;;;
  ; rawNegLS
  ;;;;;;;;;;;;;;;;;;;;;;
  ; Negates a 4-limb long value and also subtracts a 64-bit value.
  ; Computed as (q - rdx) - [rsi]; q is added once more when either of
  ; the two subtractions borrows.
  ; Params:
  ;   rdi <= Pointer to the long data of the result
  ;   rsi <= Pointer to the long data of the value to be negated
  ;   rdx <= Short value to be subtracted as well
  ;
  ;   [rdi] = -[rsi] - rdx
  ; Modified Registers:
  ;   rax, rdx (dl and dh collect the two borrow flags)
  ;;;;;;;;;;;;;;;;;;;;;;
  rawNegLS:
          ; Step 1: [rdi] = q - rdx
          mov     rax, [q + 0]
          sub     rax, rdx
          mov     [rdi + 0], rax
  %assign rawNegLS_off 8
  %rep 3
          mov     rax, [q + rawNegLS_off]
          sbb     rax, 0
          mov     [rdi + rawNegLS_off], rax
  %assign rawNegLS_off rawNegLS_off + 8
  %endrep
          setc    dl                          ; remember the borrow of step 1

          ; Step 2: [rdi] = [rdi] - [rsi]
          mov     rax, [rdi + 0]
          sub     rax, [rsi + 0]
          mov     [rdi + 0], rax
  %assign rawNegLS_off 8
  %rep 3
          mov     rax, [rdi + rawNegLS_off]
          sbb     rax, [rsi + rawNegLS_off]
          mov     [rdi + rawNegLS_off], rax
  %assign rawNegLS_off rawNegLS_off + 8
  %endrep
          setc    dh                          ; remember the borrow of step 2

          or      dl, dh
          jz      rawNegSL_done               ; neither step borrowed

          ; The value went negative, so add q.
          mov     rax, [q + 0]
          add     [rdi + 0], rax
  %assign rawNegLS_off 8
  %rep 3
          mov     rax, [q + rawNegLS_off]
          adc     [rdi + rawNegLS_off], rax
  %assign rawNegLS_off rawNegLS_off + 8
  %endrep
  %undef rawNegLS_off

  rawNegSL_done:
          ret
  ;;;;;;;;;;;;;;;;;;;;;;
  ; neg
  ;;;;;;;;;;;;;;;;;;;;;;
  ; Negates a field element of any representation.
  ; Params:
  ;   rsi <= Pointer to the element to be negated
  ;   rdi <= Pointer to result
  ;
  ;   [rdi] = -[rsi]
  ; Modified Registers:
  ;   rax, plus whatever rawCopyS2L / rawNegL modify
  ;;;;;;;;;;;;;;;;;;;;;;
  Fr_neg:
          mov     rax, [rsi]                  ; type word / short value
          bt      rax, 63                     ; long element?
          jc      neg_l

  neg_s:                                      ; short element
          neg     eax                         ; 32-bit op, so the upper half of rax becomes 0
          jo      neg_manageOverflow          ; only 0x80000000 overflows
          mov     [rdi], rax                  ; result is again a short element
          ret

  neg_manageOverflow:                         ; redo the negation in 64 bits
          push    rsi
          movsx   rsi, eax                    ; eax is still 0x80000000, i.e. -2^31
          neg     rsi                         ; +2^31
          call    rawCopyS2L                  ; store as a long element
          pop     rsi
          ret

  neg_l:                                      ; long element
          mov     [rdi], rax                  ; result keeps the operand's type word
          add     rdi, 8
          add     rsi, 8
          call    rawNegL
          sub     rdi, 8
          sub     rsi, 8
          ret
  ;;;;;;;;;;;;;;;;;;;;;;
  ; rawNegL
  ;;;;;;;;;;;;;;;;;;;;;;
  ; Negates a 4-limb long value: zero stays zero, anything else becomes
  ; q - value. No reduction is applied afterwards.
  ; Params:
  ;   rdi <= Pointer to the long data of the result
  ;   rsi <= Pointer to the long data of element 1
  ;
  ;   [rdi] = - [rsi]
  ; Modified Registers:
  ;   rax
  ;;;;;;;;;;;;;;;;;;;;;;
  rawNegL:
          ; Any non-zero limb means a real negation is needed.
          xor     rax, rax
  %assign rawNegL_off 0
  %rep 4
          cmp     [rsi + rawNegL_off], rax
          jnz     doNegate
  %assign rawNegL_off rawNegL_off + 8
  %endrep

          ; Operand is zero: the result is zero too (rax is still 0).
  %assign rawNegL_off 0
  %rep 4
          mov     [rdi + rawNegL_off], rax
  %assign rawNegL_off rawNegL_off + 8
  %endrep
          ret

  doNegate:                                   ; [rdi] = q - [rsi]
          mov     rax, [q + 0]
          sub     rax, [rsi + 0]
          mov     [rdi + 0], rax
  %assign rawNegL_off 8
  %rep 3
          mov     rax, [q + rawNegL_off]
          sbb     rax, [rsi + rawNegL_off]
          mov     [rdi + rawNegL_off], rax
  %assign rawNegL_off rawNegL_off + 8
  %endrep
  %undef rawNegL_off
          ret
  ;;;;;;;;;;;;;;;;;;;;;;
  ; square
  ;;;;;;;;;;;;;;;;;;;;;;
  ; Squares a field element of any representation.
  ; The first qword of an element is its type word: bit 63 set means
  ; long (data starts at offset 8), bit 62 set means Montgomery. A
  ; short element keeps a signed value in the low 32 bits.
  ; Params:
  ;   rsi <= Pointer to element 1
  ;   rdi <= Pointer to result
  ;
  ;   [rdi] = [rsi] * [rsi]
  ; Result representation:
  ;   short            -> short when the square fits in 32 signed bits,
  ;                       otherwise long via rawCopyS2L
  ;   long (any kind)  -> long Montgomery
  ; Modified Registers:
  ;   rax, rdx, r8, r11, plus whatever the called raw routines modify
  ;   (the original header lists r8, r9, r10, r11, rax, rcx)
  ;;;;;;;;;;;;;;;;;;;;;;
  Fr_square:
          mov     r8, [rsi]                   ; type word / short value
          bt      r8, 63                      ; long element?
          jc      square_l1

  square_s1:                                  ; short element
          xor     rax, rax
          mov     eax, r8d
          imul    eax                         ; edx:eax = eax * eax, OF set if it overflows eax
          jo      square_manageOverflow
          mov     [rdi], rax                  ; fits: store as a short element
          ret                                 ; without this ret execution would fall into
                                              ; the 64-bit path below and overwrite the
                                              ; short result with a long one

  square_manageOverflow:                      ; redo the multiplication in 64 bits
          push    rsi
          movsx   rax, r8d
          imul    rax
          mov     rsi, rax
          call    rawCopyS2L                  ; store the 64-bit value as a long element
          pop     rsi
          ret

  square_l1:
          bt      r8, 62                      ; Montgomery?
          jc      square_l1m

  square_l1n:                                 ; long normal
          mov     r11b, 0xC0                  ; only r11b is written; the shift by 56
          shl     r11, 56                     ; pushes every stale bit of r11 out
          mov     [rdi], r11                  ; result type: long Montgomery
          add     rdi, 8
          add     rsi, 8
          call    rawMontgomerySquare
          sub     rdi, 8
          sub     rsi, 8

          ; Montgomery-multiply the result by R3 in place.
          ; NOTE(review): R3 is presumably R^3 mod q, which would bring the
          ; product of two normal-form inputs into Montgomery form - confirm
          ; at its definition.
          push    rsi
          add     rdi, 8
          mov     rsi, rdi
          lea     rdx, [R3]
          call    rawMontgomeryMul
          sub     rdi, 8
          pop     rsi
          ret

  square_l1m:                                 ; long Montgomery
          mov     r11b, 0xC0
          shl     r11, 56
          mov     [rdi], r11                  ; result type: long Montgomery
          add     rdi, 8
          add     rsi, 8
          call    rawMontgomerySquare
          sub     rdi, 8
          sub     rsi, 8
          ret
  ;;;;;;;;;;;;;;;;;;;;;;
  ; mul
  ;;;;;;;;;;;;;;;;;;;;;;
  ; Multiplies two field elements of any representation.
  ; The first qword of an element is its type word: bit 63 set means
  ; long (data starts at offset 8), bit 62 set means Montgomery. A
  ; short element keeps a signed value in the low 32 bits.
  ; Params:
  ;   rsi <= Pointer to element 1
  ;   rdx <= Pointer to element 2
  ;   rdi <= Pointer to result
  ;
  ;   [rdi] = [rsi] * [rdx]
  ; Result representation:
  ;   short * short  -> short when the product fits in 32 signed bits,
  ;                     otherwise long via rawCopyS2L
  ;   otherwise      -> long; the type word written on each path below
  ;                     says whether it is normal (0x80) or
  ;                     Montgomery (0xC0)
  ; Modified Registers:
  ;   rax, rcx, rdx, r8, r9, r11, plus whatever the called raw routines
  ;   modify (the original header lists r8, r9, r10, r11, rax, rcx)
  ;;;;;;;;;;;;;;;;;;;;;;

  ; Store (%1 << 56) as the result's type word. Only r11b is written,
  ; but the 56-bit shift pushes every stale bit of r11 out.
  %macro FrMul_setType 1
          mov     r11b, %1
          shl     r11, 56
          mov     [rdi], r11
  %endmacro

  ; Long * long on the data of both operands; rdi/rsi are restored.
  %macro FrMul_rawMMul 0
          add     rdi, 8
          add     rsi, 8
          add     rdx, 8
          call    rawMontgomeryMul
          sub     rdi, 8
          sub     rsi, 8
  %endmacro

  ; Montgomery-multiply the long result by R3 in place.
  ; NOTE(review): R3 is presumably R^3 mod q - confirm at its definition.
  %macro FrMul_timesR3 0
          push    rsi
          add     rdi, 8
          mov     rsi, rdi
          lea     rdx, [R3]
          call    rawMontgomeryMul
          sub     rdi, 8
          pop     rsi
  %endmacro

  ; Long * signed 64-bit value in rdx through rawMontgomeryMul1.
  ; On entry: the caller's rsi is on the stack, rsi -> long data,
  ; rdi -> result data. A negative factor is multiplied as |rdx| and the
  ; product negated afterwards. %1 / %2 are the labels of the
  ; non-negative branch and of the join point.
  %macro FrMul_byShort 2
          cmp     rdx, 0
          jns     %1
          neg     rdx
          call    rawMontgomeryMul1
          mov     rsi, rdi
          call    rawNegL                     ; negate the product in place
          sub     rdi, 8
          pop     rsi
          jmp     %2
  %1:
          call    rawMontgomeryMul1
          sub     rdi, 8
          pop     rsi
  %2:
  %endmacro

  Fr_mul:
          mov     r8, [rsi]                   ; type word / short value of element 1
          mov     r9, [rdx]                   ; type word / short value of element 2
          bt      r8, 63                      ; element 1 long?
          jc      mul_l1
          bt      r9, 63                      ; element 2 long?
          jc      mul_s1l2

  ; ---- short * short --------------------------------------------------
  mul_s1s2:
          xor     rax, rax
          mov     eax, r8d
          imul    r9d                         ; edx:eax = eax * r9d, OF set if it overflows eax
          jo      mul_manageOverflow
          mov     [rdi], rax                  ; fits: store as a short element
          ret                                 ; without this ret execution would fall into
                                              ; the 64-bit path below and overwrite the
                                              ; short result with a long one

  mul_manageOverflow:                         ; redo the multiplication in 64 bits
          push    rsi
          movsx   rax, r8d
          movsx   rcx, r9d
          imul    rcx
          mov     rsi, rax
          call    rawCopyS2L                  ; store the 64-bit value as a long element
          pop     rsi
          ret

  ; ---- element 1 long ---------------------------------------------------
  mul_l1:
          bt      r9, 63                      ; element 2 long as well?
          jc      mul_l1l2

  mul_l1s2:                                   ; long * short
          bt      r8, 62                      ; element 1 Montgomery?
          jc      mul_l1ms2
  mul_l1ns2:
          bt      r9, 62                      ; element 2 Montgomery?
          jc      mul_l1ns2m
  mul_l1ns2n:                                 ; long normal * short normal
          FrMul_setType 0xC0
          push    rsi
          add     rsi, 8
          movsx   rdx, r9d
          add     rdi, 8
          FrMul_byShort tmp_5, tmp_6
          FrMul_timesR3
          ret
  mul_l1ns2m:                                 ; long normal * short Montgomery
          FrMul_setType 0x80
          FrMul_rawMMul
          ret

  mul_l1ms2:
          bt      r9, 62                      ; element 2 Montgomery?
          jc      mul_l1ms2m
  mul_l1ms2n:                                 ; long Montgomery * short normal
          FrMul_setType 0x80
          push    rsi
          add     rsi, 8
          movsx   rdx, r9d
          add     rdi, 8
          FrMul_byShort tmp_7, tmp_8
          ret
  mul_l1ms2m:                                 ; long Montgomery * short Montgomery
          FrMul_setType 0xC0
          FrMul_rawMMul
          ret

  ; ---- element 1 short, element 2 long ----------------------------------
  mul_s1l2:
          bt      r8, 62                      ; element 1 Montgomery?
          jc      mul_s1ml2
  mul_s1nl2:
          bt      r9, 62                      ; element 2 Montgomery?
          jc      mul_s1nl2m
  mul_s1nl2n:                                 ; short normal * long normal
          FrMul_setType 0xC0
          push    rsi
          lea     rsi, [rdx + 8]
          movsx   rdx, r8d
          add     rdi, 8
          FrMul_byShort tmp_9, tmp_10
          FrMul_timesR3
          ret
  mul_s1nl2m:                                 ; short normal * long Montgomery
          FrMul_setType 0x80
          push    rsi
          lea     rsi, [rdx + 8]
          movsx   rdx, r8d
          add     rdi, 8
          FrMul_byShort tmp_11, tmp_12
          ret

  mul_s1ml2:
          bt      r9, 62                      ; element 2 Montgomery?
          jc      mul_s1ml2m
  mul_s1ml2n:                                 ; short Montgomery * long normal
          FrMul_setType 0x80
          FrMul_rawMMul
          ret
  mul_s1ml2m:                                 ; short Montgomery * long Montgomery
          FrMul_setType 0xC0
          FrMul_rawMMul
          ret

  ; ---- long * long --------------------------------------------------------
  mul_l1l2:
          bt      r8, 62                      ; element 1 Montgomery?
          jc      mul_l1ml2
  mul_l1nl2:
          bt      r9, 62                      ; element 2 Montgomery?
          jc      mul_l1nl2m
  mul_l1nl2n:                                 ; normal * normal
          FrMul_setType 0xC0
          FrMul_rawMMul
          FrMul_timesR3
          ret
  mul_l1nl2m:                                 ; normal * Montgomery
          FrMul_setType 0x80
          FrMul_rawMMul
          ret
  mul_l1ml2:
          bt      r9, 62                      ; element 2 Montgomery?
          jc      mul_l1ml2m
  mul_l1ml2n:                                 ; Montgomery * normal
          FrMul_setType 0x80
          FrMul_rawMMul
          ret
  mul_l1ml2m:                                 ; Montgomery * Montgomery
          FrMul_setType 0xC0
          FrMul_rawMMul
          ret
  ;;;;;;;;;;;;;;;;;;;;;;
  ; band
  ;;;;;;;;;;;;;;;;;;;;;;
  ; Bitwise AND of two field elements of any representation.
  ; The first qword of an element is its type word: bit 63 set means
  ; long (4 limbs of data at offsets 8..32), bit 62 set means
  ; Montgomery. A short element keeps a signed value in the low 32 bits.
  ; Params:
  ;   rsi <= Pointer to element 1
  ;   rdx <= Pointer to element 2
  ;   rdi <= Pointer to result
  ;
  ;   [rdi] = [rsi] & [rdx]
  ; Result representation:
  ;   two non-negative shorts -> short; every other case -> long normal,
  ;   with the top limb additionally ANDed with [lboMask].
  ; Notes:
  ;   Montgomery operands are passed to Fr_toNormal and negative shorts
  ;   to rawCopyS2L using the operand's own pointer, so those inputs are
  ;   presumably rewritten in place (confirm at those routines).
  ; Modified Registers:
  ;   rax, rdx, r8, r9, r11, plus whatever the called routines modify
  ;   (the original header lists r8, r9, r10, r11, rax, rcx)
  ;;;;;;;;;;;;;;;;;;;;;;

  ; Mark the result as long normal. Only r11b is written, but the
  ; 56-bit shift pushes every stale bit of r11 out.
  %macro FrBand_setLongNormal 0
          mov     r11b, 0x80
          shl     r11, 56
          mov     [rdi], r11
  %endmacro

  ; AND the long data of [rsi] and [rdx] into [rdi], then return.
  %macro FrBand_andLL_ret 0
          mov     rax, [rsi + 8]
          and     rax, [rdx + 8]
          mov     [rdi + 8], rax
          mov     rax, [rsi + 16]
          and     rax, [rdx + 16]
          mov     [rdi + 16], rax
          mov     rax, [rsi + 24]
          and     rax, [rdx + 24]
          mov     [rdi + 24], rax
          mov     rax, [rsi + 32]
          and     rax, [rdx + 32]
          and     rax, [lboMask]
          mov     [rdi + 32], rax
          ret
  %endmacro

  ; AND a non-negative short (32-bit register %1) with the long element
  ; at %2, then return. The short value only reaches the lowest limb;
  ; the other limbs are ANDed with zero.
  %macro FrBand_andSL_ret 2
          movsx   rax, %1
          and     rax, [%2 + 8]
          mov     [rdi + 8], rax
          xor     rax, rax
          and     rax, [%2 + 16]
          mov     [rdi + 16], rax
          xor     rax, rax
          and     rax, [%2 + 24]
          mov     [rdi + 24], rax
          xor     rax, rax
          and     rax, [%2 + 32]
          and     rax, [lboMask]
          mov     [rdi + 32], rax
          ret
  %endmacro

  ; rawCopyS2L on element 2, with its short value taken from r9d.
  %macro FrBand_op2ShortToLong 0
          push    rdi
          push    rsi
          mov     rdi, rdx
          movsx   rsi, r9d
          call    rawCopyS2L
          mov     rdx, rdi
          pop     rsi
          pop     rdi
  %endmacro

  ; rawCopyS2L on element 1, with its short value taken from r8d.
  %macro FrBand_op1ShortToLong 0
          push    rdi
          push    rdx
          mov     rdi, rsi
          movsx   rsi, r8d
          call    rawCopyS2L
          mov     rsi, rdi
          pop     rdx
          pop     rdi
  %endmacro

  ; Fr_toNormal on element 1; the pointer to element 2 is parked in rsi
  ; for the duration of the call.
  %macro FrBand_op1ToNormal 0
          push    rdi
          mov     rdi, rsi
          mov     rsi, rdx
          call    Fr_toNormal
          mov     rdx, rsi
          mov     rsi, rdi
          pop     rdi
  %endmacro

  ; Fr_toNormal on element 2.
  %macro FrBand_op2ToNormal 0
          push    rdi
          mov     rdi, rdx
          call    Fr_toNormal
          mov     rdx, rdi
          pop     rdi
  %endmacro

  Fr_band:
          mov     r8, [rsi]                   ; type word / short value of element 1
          mov     r9, [rdx]                   ; type word / short value of element 2
          bt      r8, 63                      ; element 1 long?
          jc      and_l1
          bt      r9, 63                      ; element 2 long?
          jc      and_s1l2

  ; ---- short & short ----------------------------------------------------
  and_s1s2:
          cmp     r8d, 0
          js      tmp_13
          cmp     r9d, 0
          js      tmp_13
          xor     rdx, rdx                    ; both non-negative: plain 32-bit AND
          mov     edx, r8d
          and     edx, r9d
          mov     [rdi], rdx                  ; result is a short element
          ret
  tmp_13:                                     ; a negative short: promote both to long
          FrBand_setLongNormal
          FrBand_op2ShortToLong
          FrBand_op1ShortToLong
          FrBand_andLL_ret

  ; ---- element 1 long -----------------------------------------------------
  and_l1:
          bt      r9, 63                      ; element 2 long as well?
          jc      and_l1l2

  and_l1s2:                                   ; long & short
          bt      r8, 62                      ; element 1 Montgomery?
          jc      and_l1ms2
  and_l1ns2:                                  ; long normal & short
          FrBand_setLongNormal
          cmp     r9d, 0
          js      tmp_14
          FrBand_andSL_ret r9d, rsi
  tmp_14:                                     ; negative short: promote it first
          FrBand_op2ShortToLong
          FrBand_setLongNormal
          FrBand_andLL_ret

  and_l1ms2:                                  ; long Montgomery & short
          FrBand_setLongNormal
          push    r9                          ; r9 is used in montgomery so we need to save it
          FrBand_op1ToNormal
          pop     r9
          cmp     r9d, 0
          js      tmp_15
          FrBand_andSL_ret r9d, rsi
  tmp_15:                                     ; negative short: promote it first
          FrBand_op2ShortToLong
          FrBand_setLongNormal
          FrBand_andLL_ret

  ; ---- element 1 short, element 2 long ------------------------------------
  and_s1l2:
          bt      r9, 62                      ; element 2 Montgomery?
          jc      and_s1l2m
  and_s1l2n:                                  ; short & long normal
          FrBand_setLongNormal
          cmp     r8d, 0
          js      tmp_16
          FrBand_andSL_ret r8d, rdx
  tmp_16:                                     ; negative short: promote it first
          FrBand_op1ShortToLong
          FrBand_setLongNormal
          FrBand_andLL_ret

  and_s1l2m:                                  ; short & long Montgomery
          FrBand_setLongNormal
          push    r8                          ; r8 is used in montgomery so we need to save it
          FrBand_op2ToNormal
          pop     r8
          cmp     r8d, 0
          js      tmp_17
          FrBand_andSL_ret r8d, rdx
  tmp_17:                                     ; negative short: promote it first
          FrBand_op1ShortToLong
          FrBand_setLongNormal
          FrBand_andLL_ret

  ; ---- long & long ----------------------------------------------------------
  and_l1l2:
          bt      r8, 62                      ; element 1 Montgomery?
          jc      and_l1ml2
          bt      r9, 62                      ; element 2 Montgomery?
          jc      and_l1nl2m
  and_l1nl2n:                                 ; normal & normal
          FrBand_setLongNormal
          FrBand_andLL_ret
  and_l1nl2m:                                 ; normal & Montgomery
          FrBand_setLongNormal
          FrBand_op2ToNormal
          FrBand_andLL_ret
  and_l1ml2:
          bt      r9, 62                      ; element 2 Montgomery?
          jc      and_l1ml2m
  and_l1ml2n:                                 ; Montgomery & normal
          FrBand_setLongNormal
          FrBand_op1ToNormal
          FrBand_andLL_ret
  and_l1ml2m:                                 ; Montgomery & Montgomery
          FrBand_setLongNormal
          FrBand_op1ToNormal
          FrBand_op2ToNormal
          FrBand_andLL_ret
  2449. ;;;;;;;;;;;;;;;;;;;;;;
  2450. ; bor
  2451. ;;;;;;;;;;;;;;;;;;;;;;
  2452. ; Adds two elements of any kind
  2453. ; Params:
  2454. ; rsi <= Pointer to element 1
  2455. ; rdx <= Pointer to element 2
  2456. ; rdi <= Pointer to result
  2457. ; Modified Registers:
  2458. ; r8, r9, 10, r11, rax, rcx
  2459. ;;;;;;;;;;;;;;;;;;;;;;
  2460. Fr_bor:
  2461. mov r8, [rsi]
  2462. mov r9, [rdx]
  2463. bt r8, 63 ; Check if is short first operand
  2464. jc or_l1
  2465. bt r9, 63 ; Check if is short second operand
  2466. jc or_s1l2
  2467. or_s1s2:
  2468. cmp r8d, 0
  2469. js tmp_18
  2470. cmp r9d, 0
  2471. js tmp_18
  2472. xor rdx, rdx ; both ops are positive so do the op and return
  2473. mov edx, r8d
  2474. or edx, r9d
  2475. mov [rdi], rdx ; not necessary to adjust so just save and return
  2476. ret
  2477. tmp_18:
  2478. mov r11b, 0x80
  2479. shl r11, 56
  2480. mov [rdi], r11
  2481. push rdi
  2482. push rsi
  2483. mov rdi, rdx
  2484. movsx rsi, r9d
  2485. call rawCopyS2L
  2486. mov rdx, rdi
  2487. pop rsi
  2488. pop rdi
  2489. push rdi
  2490. push rdx
  2491. mov rdi, rsi
  2492. movsx rsi, r8d
  2493. call rawCopyS2L
  2494. mov rsi, rdi
  2495. pop rdx
  2496. pop rdi
  2497. mov rax, [rsi + 8]
  2498. or rax, [rdx + 8]
  2499. mov [rdi + 8 ], rax
  2500. mov rax, [rsi + 16]
  2501. or rax, [rdx + 16]
  2502. mov [rdi + 16 ], rax
  2503. mov rax, [rsi + 24]
  2504. or rax, [rdx + 24]
  2505. mov [rdi + 24 ], rax
  2506. mov rax, [rsi + 32]
  2507. or rax, [rdx + 32]
  2508. and rax, [lboMask]
  2509. mov [rdi + 32 ], rax
  2510. ret
  2511. or_l1:
  2512. bt r9, 63 ; Check if is short second operand
  2513. jc or_l1l2
  2514. or_l1s2:
  2515. bt r8, 62 ; check if montgomery first
  2516. jc or_l1ms2
  2517. or_l1ns2:
  2518. mov r11b, 0x80
  2519. shl r11, 56
  2520. mov [rdi], r11
  2521. cmp r9d, 0
  2522. js tmp_19
  2523. movsx rax, r9d
  2524. or rax, [rsi +8]
  2525. mov [rdi+8], rax
  2526. xor rax, rax
  2527. or rax, [rsi + 16];
  2528. mov [rdi + 16 ], rax;
  2529. xor rax, rax
  2530. or rax, [rsi + 24];
  2531. mov [rdi + 24 ], rax;
  2532. xor rax, rax
  2533. or rax, [rsi + 32];
  2534. and rax, [lboMask] ;
  2535. mov [rdi + 32 ], rax;
  2536. ret
  2537. tmp_19:
  2538. push rdi
  2539. push rsi
  2540. mov rdi, rdx
  2541. movsx rsi, r9d
  2542. call rawCopyS2L
  2543. mov rdx, rdi
  2544. pop rsi
  2545. pop rdi
  2546. mov r11b, 0x80
  2547. shl r11, 56
  2548. mov [rdi], r11
  2549. mov rax, [rsi + 8]
  2550. or rax, [rdx + 8]
  2551. mov [rdi + 8 ], rax
  2552. mov rax, [rsi + 16]
  2553. or rax, [rdx + 16]
  2554. mov [rdi + 16 ], rax
  2555. mov rax, [rsi + 24]
  2556. or rax, [rdx + 24]
  2557. mov [rdi + 24 ], rax
  2558. mov rax, [rsi + 32]
  2559. or rax, [rdx + 32]
  2560. and rax, [lboMask]
  2561. mov [rdi + 32 ], rax
  2562. ret
  2563. or_l1ms2:
  2564. mov r11b, 0x80
  2565. shl r11, 56
  2566. mov [rdi], r11
  2567. push r9 ; r9 is used in montgomery so we need to save it
  2568. push rdi
  2569. mov rdi, rsi
  2570. mov rsi, rdx
  2571. call Fr_toNormal
  2572. mov rdx, rsi
  2573. mov rsi, rdi
  2574. pop rdi
  2575. pop r9
  2576. cmp r9d, 0
  2577. js tmp_20
  2578. movsx rax, r9d
  2579. or rax, [rsi +8]
  2580. mov [rdi+8], rax
  2581. xor rax, rax
  2582. or rax, [rsi + 16];
  2583. mov [rdi + 16 ], rax;
  2584. xor rax, rax
  2585. or rax, [rsi + 24];
  2586. mov [rdi + 24 ], rax;
  2587. xor rax, rax
  2588. or rax, [rsi + 32];
  2589. and rax, [lboMask] ;
  2590. mov [rdi + 32 ], rax;
  2591. ret
  2592. tmp_20:
  2593. push rdi
  2594. push rsi
  2595. mov rdi, rdx
  2596. movsx rsi, r9d
  2597. call rawCopyS2L
  2598. mov rdx, rdi
  2599. pop rsi
  2600. pop rdi
  2601. mov r11b, 0x80
  2602. shl r11, 56
  2603. mov [rdi], r11
  2604. mov rax, [rsi + 8]
  2605. or rax, [rdx + 8]
  2606. mov [rdi + 8 ], rax
  2607. mov rax, [rsi + 16]
  2608. or rax, [rdx + 16]
  2609. mov [rdi + 16 ], rax
  2610. mov rax, [rsi + 24]
  2611. or rax, [rdx + 24]
  2612. mov [rdi + 24 ], rax
  2613. mov rax, [rsi + 32]
  2614. or rax, [rdx + 32]
  2615. and rax, [lboMask]
  2616. mov [rdi + 32 ], rax
  2617. ret
  2618. or_s1l2:
  2619. bt r9, 62 ; check if montgomery first
  2620. jc or_s1l2m
  2621. or_s1l2n:
  2622. mov r11b, 0x80
  2623. shl r11, 56
  2624. mov [rdi], r11
  2625. cmp r8d, 0
  2626. js tmp_21
  2627. movsx rax, r8d
  2628. or rax, [rdx +8]
  2629. mov [rdi+8], rax
  2630. xor rax, rax
  2631. or rax, [rdx + 16]
  2632. mov [rdi + 16 ], rax
  2633. xor rax, rax
  2634. or rax, [rdx + 24]
  2635. mov [rdi + 24 ], rax
  2636. xor rax, rax
  2637. or rax, [rdx + 32]
  2638. and rax, [lboMask]
  2639. mov [rdi + 32 ], rax
  2640. ret
  2641. tmp_21:
  2642. push rdi
  2643. push rdx
  2644. mov rdi, rsi
  2645. movsx rsi, r8d
  2646. call rawCopyS2L
  2647. mov rsi, rdi
  2648. pop rdx
  2649. pop rdi
  2650. mov r11b, 0x80
  2651. shl r11, 56
  2652. mov [rdi], r11
  2653. mov rax, [rsi + 8]
  2654. or rax, [rdx + 8]
  2655. mov [rdi + 8 ], rax
  2656. mov rax, [rsi + 16]
  2657. or rax, [rdx + 16]
  2658. mov [rdi + 16 ], rax
  2659. mov rax, [rsi + 24]
  2660. or rax, [rdx + 24]
  2661. mov [rdi + 24 ], rax
  2662. mov rax, [rsi + 32]
  2663. or rax, [rdx + 32]
  2664. and rax, [lboMask]
  2665. mov [rdi + 32 ], rax
  2666. ret
  2667. or_s1l2m:
  2668. mov r11b, 0x80
  2669. shl r11, 56
  2670. mov [rdi], r11
  2671. push r8 ; r8 is used in montgomery so we need to save it
  2672. push rdi
  2673. mov rdi, rdx
  2674. call Fr_toNormal
  2675. mov rdx, rdi
  2676. pop rdi
  2677. pop r8
  2678. cmp r8d, 0
  2679. js tmp_22
  2680. movsx rax, r8d
  2681. or rax, [rdx +8]
  2682. mov [rdi+8], rax
  2683. xor rax, rax
  2684. or rax, [rdx + 16]
  2685. mov [rdi + 16 ], rax
  2686. xor rax, rax
  2687. or rax, [rdx + 24]
  2688. mov [rdi + 24 ], rax
  2689. xor rax, rax
  2690. or rax, [rdx + 32]
  2691. and rax, [lboMask]
  2692. mov [rdi + 32 ], rax
  2693. ret
  2694. tmp_22:
  2695. push rdi
  2696. push rdx
  2697. mov rdi, rsi
  2698. movsx rsi, r8d
  2699. call rawCopyS2L
  2700. mov rsi, rdi
  2701. pop rdx
  2702. pop rdi
  2703. mov r11b, 0x80
  2704. shl r11, 56
  2705. mov [rdi], r11
  2706. mov rax, [rsi + 8]
  2707. or rax, [rdx + 8]
  2708. mov [rdi + 8 ], rax
  2709. mov rax, [rsi + 16]
  2710. or rax, [rdx + 16]
  2711. mov [rdi + 16 ], rax
  2712. mov rax, [rsi + 24]
  2713. or rax, [rdx + 24]
  2714. mov [rdi + 24 ], rax
  2715. mov rax, [rsi + 32]
  2716. or rax, [rdx + 32]
  2717. and rax, [lboMask]
  2718. mov [rdi + 32 ], rax
  2719. ret
  2720. or_l1l2:
  2721. bt r8, 62 ; check if montgomery first
  2722. jc or_l1ml2
  2723. bt r9, 62 ; check if montgomery first
  2724. jc or_l1nl2m
  2725. or_l1nl2n:
  2726. mov r11b, 0x80
  2727. shl r11, 56
  2728. mov [rdi], r11
  2729. mov rax, [rsi + 8]
  2730. or rax, [rdx + 8]
  2731. mov [rdi + 8 ], rax
  2732. mov rax, [rsi + 16]
  2733. or rax, [rdx + 16]
  2734. mov [rdi + 16 ], rax
  2735. mov rax, [rsi + 24]
  2736. or rax, [rdx + 24]
  2737. mov [rdi + 24 ], rax
  2738. mov rax, [rsi + 32]
  2739. or rax, [rdx + 32]
  2740. and rax, [lboMask]
  2741. mov [rdi + 32 ], rax
  2742. ret
  2743. or_l1nl2m:
  2744. mov r11b, 0x80
  2745. shl r11, 56
  2746. mov [rdi], r11
  2747. push rdi
  2748. mov rdi, rdx
  2749. call Fr_toNormal
  2750. mov rdx, rdi
  2751. pop rdi
  2752. mov rax, [rsi + 8]
  2753. or rax, [rdx + 8]
  2754. mov [rdi + 8 ], rax
  2755. mov rax, [rsi + 16]
  2756. or rax, [rdx + 16]
  2757. mov [rdi + 16 ], rax
  2758. mov rax, [rsi + 24]
  2759. or rax, [rdx + 24]
  2760. mov [rdi + 24 ], rax
  2761. mov rax, [rsi + 32]
  2762. or rax, [rdx + 32]
  2763. and rax, [lboMask]
  2764. mov [rdi + 32 ], rax
  2765. ret
  2766. or_l1ml2:
  2767. bt r9, 62 ; check if montgomery first
  2768. jc or_l1ml2m
  2769. or_l1ml2n:
  2770. mov r11b, 0x80
  2771. shl r11, 56
  2772. mov [rdi], r11
  2773. push rdi
  2774. mov rdi, rsi
  2775. mov rsi, rdx
  2776. call Fr_toNormal
  2777. mov rdx, rsi
  2778. mov rsi, rdi
  2779. pop rdi
  2780. mov rax, [rsi + 8]
  2781. or rax, [rdx + 8]
  2782. mov [rdi + 8 ], rax
  2783. mov rax, [rsi + 16]
  2784. or rax, [rdx + 16]
  2785. mov [rdi + 16 ], rax
  2786. mov rax, [rsi + 24]
  2787. or rax, [rdx + 24]
  2788. mov [rdi + 24 ], rax
  2789. mov rax, [rsi + 32]
  2790. or rax, [rdx + 32]
  2791. and rax, [lboMask]
  2792. mov [rdi + 32 ], rax
  2793. ret
  2794. or_l1ml2m:
  2795. mov r11b, 0x80
  2796. shl r11, 56
  2797. mov [rdi], r11
  2798. push rdi
  2799. mov rdi, rsi
  2800. mov rsi, rdx
  2801. call Fr_toNormal
  2802. mov rdx, rsi
  2803. mov rsi, rdi
  2804. pop rdi
  2805. push rdi
  2806. mov rdi, rdx
  2807. call Fr_toNormal
  2808. mov rdx, rdi
  2809. pop rdi
  2810. mov rax, [rsi + 8]
  2811. or rax, [rdx + 8]
  2812. mov [rdi + 8 ], rax
  2813. mov rax, [rsi + 16]
  2814. or rax, [rdx + 16]
  2815. mov [rdi + 16 ], rax
  2816. mov rax, [rsi + 24]
  2817. or rax, [rdx + 24]
  2818. mov [rdi + 24 ], rax
  2819. mov rax, [rsi + 32]
  2820. or rax, [rdx + 32]
  2821. and rax, [lboMask]
  2822. mov [rdi + 32 ], rax
  2823. ret
  ;;;;;;;;;;;;;;;;;;;;;;
  ; bxor
  ;;;;;;;;;;;;;;;;;;;;;;
  ; Bitwise XOR of two field elements of any representation
  ; (short, long normal or long Montgomery).
  ; Params:
  ;   rsi <= Pointer to element 1
  ;   rdx <= Pointer to element 2
  ;   rdi <= Pointer to result
  ; Result:
  ;   Two non-negative shorts produce a short result (zero-extended
  ;   32-bit XOR).  Every other combination produces a long normal
  ;   result (type word 0x80 << 56) whose top limb is ANDed with
  ;   [lboMask].
  ; Side effects:
  ;   Operands may be rewritten in place: a negative short is widened
  ;   with rawCopyS2L and a Montgomery long is converted with
  ;   Fr_toNormal.
  ; Aliasing:
  ;   The result type word is stored only after every conversion has
  ;   run, so rdi may point at one of the operands without hiding that
  ;   operand's original type word from Fr_toNormal.
  ; Modified Registers:
  ;   r8, r9, r11, rax, plus whatever rawCopyS2L / Fr_toNormal modify
  ;   (the original header lists r10 and rcx as well).  On the
  ;   short/short fast path rdx is overwritten with the result.
  ; NOTE(review): the long result is masked but not compared against
  ;   the modulus in this routine - confirm callers accept that.
  ;;;;;;;;;;;;;;;;;;;;;;
  Fr_bxor:
          mov     r8, [rsi]               ; r8 = type/value word of element 1
          mov     r9, [rdx]               ; r9 = type/value word of element 2
          bt      r8, 63                  ; bit 63 set => element 1 is long
          jc      .e1_long
          bt      r9, 63                  ; bit 63 set => element 2 is long
          jc      .e1_short_e2_long

          ; ---- short ^ short ----
          mov     eax, r8d
          or      eax, r9d                ; SF = sign(e1) | sign(e2)
          js      .widen_both             ; any negative operand needs the long path
          mov     edx, r8d                ; 32-bit write zero-extends into rdx
          xor     edx, r9d
          mov     [rdi], rdx              ; still a non-negative short: store as is
          ret

  .widen_both:
          ; Widen element 2 in place, then fall through to widen element 1.
          ; r8 must survive this call, exactly as in the original sequence.
          push    rdi
          push    rsi
          mov     rdi, rdx
          movsxd  rsi, r9d
          call    rawCopyS2L
          mov     rdx, rdi
          pop     rsi
          pop     rdi

  .widen_e1:
          ; Widen the short element 1 in place (value is still in r8d).
          push    rdi
          push    rdx
          mov     rdi, rsi
          movsxd  rsi, r8d
          call    rawCopyS2L
          mov     rsi, rdi
          pop     rdx
          pop     rdi
          jmp     .xor_long

          ; ---- element 1 long ----
  .e1_long:
          bt      r9, 63
          jc      .both_long
          bt      r8, 62                  ; bit 62 set => element 1 is Montgomery
          jnc     .e1_normal_e2_short
          push    r9                      ; r9 is used in montgomery so we need to save it
          push    rdi
          mov     rdi, rsi
          mov     rsi, rdx                ; park the element 2 pointer in rsi across the call
          call    Fr_toNormal
          mov     rdx, rsi
          mov     rsi, rdi
          pop     rdi
          pop     r9

  .e1_normal_e2_short:
          test    r9d, r9d
          js      .widen_e2               ; negative short: widen it and use the long path
          mov     r11, 0x8000000000000000 ; long, normal
          mov     [rdi], r11
          mov     eax, r9d                ; non-negative, so zero-extension == sign-extension
          xor     rax, [rsi + 8]
          mov     [rdi + 8], rax
          mov     rax, [rsi + 16]         ; upper limbs of the short operand are zero:
          mov     [rdi + 16], rax         ; the XOR is a plain copy
          mov     rax, [rsi + 24]
          mov     [rdi + 24], rax
          mov     rax, [rsi + 32]
          and     rax, [lboMask]
          mov     [rdi + 32], rax
          ret

  .widen_e2:
          ; Widen the short element 2 in place (value is still in r9d).
          push    rdi
          push    rsi
          mov     rdi, rdx
          movsxd  rsi, r9d
          call    rawCopyS2L
          mov     rdx, rdi
          pop     rsi
          pop     rdi
          jmp     .xor_long

          ; ---- element 1 short, element 2 long ----
  .e1_short_e2_long:
          bt      r9, 62                  ; bit 62 set => element 2 is Montgomery
          jnc     .e1_short_e2_normal
          push    r8                      ; r8 is used in montgomery so we need to save it
          push    rdi
          mov     rdi, rdx
          call    Fr_toNormal
          mov     rdx, rdi
          pop     rdi
          pop     r8

  .e1_short_e2_normal:
          test    r8d, r8d
          js      .widen_e1               ; negative short: widen it and use the long path
          mov     r11, 0x8000000000000000 ; long, normal
          mov     [rdi], r11
          mov     eax, r8d                ; non-negative, so zero-extension == sign-extension
          xor     rax, [rdx + 8]
          mov     [rdi + 8], rax
          mov     rax, [rdx + 16]
          mov     [rdi + 16], rax
          mov     rax, [rdx + 24]
          mov     [rdi + 24], rax
          mov     rax, [rdx + 32]
          and     rax, [lboMask]
          mov     [rdi + 32], rax
          ret

          ; ---- both long ----
  .both_long:
          bt      r8, 62
          jnc     .e1_ready
          push    r9                      ; keep element 2's type word across the call
          push    rdi
          mov     rdi, rsi
          mov     rsi, rdx
          call    Fr_toNormal
          mov     rdx, rsi
          mov     rsi, rdi
          pop     rdi
          pop     r9

  .e1_ready:
          bt      r9, 62
          jnc     .xor_long
          push    rdi
          mov     rdi, rdx
          call    Fr_toNormal
          mov     rdx, rdi
          pop     rdi

          ; ---- shared tail: both operands are long normal here ----
  .xor_long:
          mov     r11, 0x8000000000000000 ; long, normal
          mov     [rdi], r11
          mov     rax, [rsi + 8]
          xor     rax, [rdx + 8]
          mov     [rdi + 8], rax
          mov     rax, [rsi + 16]
          xor     rax, [rdx + 16]
          mov     [rdi + 16], rax
          mov     rax, [rsi + 24]
          xor     rax, [rdx + 24]
          mov     [rdi + 24], rax
          mov     rax, [rsi + 32]
          xor     rax, [rdx + 32]
          and     rax, [lboMask]          ; drop the bits above the field width
          mov     [rdi + 32], rax
          ret
  ;;;;;;;;;;;;;;;;;;;;;;
  ; bnot
  ;;;;;;;;;;;;;;;;;;;;;;
  ; Bitwise NOT of a field element of any representation.
  ; Params:
  ;   rsi <= Pointer to element 1
  ;   rdi <= Pointer to result
  ; Result:
  ;   Always a long normal element (type word 0x80 << 56); the top
  ;   limb is ANDed with [lboMask].
  ; Side effects:
  ;   The operand may be rewritten in place: a short is widened with
  ;   rawCopyS2L, a Montgomery long is converted with Fr_toNormal.
  ; Aliasing:
  ;   The operand's type word is read before anything is stored and
  ;   the result type word is written last, so rdi == rsi is handled
  ;   (the previous ordering overwrote the operand's type word first
  ;   and then skipped the conversion).
  ; Modified Registers:
  ;   r8, r11, rax, plus whatever rawCopyS2L / Fr_toNormal modify
  ;   (the original header lists r9, r10 and rcx as well).
  ; NOTE(review): the result is masked but not compared against the
  ;   modulus in this routine - confirm callers accept that.
  ;;;;;;;;;;;;;;;;;;;;;;
  Fr_bnot:
          mov     r8, [rsi]               ; operand type word, read before any store
          bt      r8, 63                  ; bit 63 set => long operand
          jc      .is_long

          ; ---- short operand: widen it in place ----
          push    rdi
          push    rdx                     ; rdx is not an input; kept intact as before
          mov     rdi, rsi
          movsxd  rsi, r8d
          call    rawCopyS2L
          mov     rsi, rdi
          pop     rdx
          pop     rdi
          jmp     .negate

  .is_long:
          bt      r8, 62                  ; bit 62 set => Montgomery form
          jnc     .negate
          push    rdi
          mov     rdi, rsi
          mov     rsi, rdx                ; park rdx in rsi across the call
          call    Fr_toNormal
          mov     rdx, rsi
          mov     rsi, rdi
          pop     rdi

          ; ---- operand at [rsi] is long normal here ----
  .negate:
          mov     r11, 0x8000000000000000 ; long, normal
          mov     [rdi], r11
          mov     rax, [rsi + 8]
          not     rax
          mov     [rdi + 8], rax
          mov     rax, [rsi + 16]
          not     rax
          mov     [rdi + 16], rax
          mov     rax, [rsi + 24]
          not     rax
          mov     [rdi + 24], rax
          mov     rax, [rsi + 32]
          not     rax
          and     rax, [lboMask]          ; drop the bits above the field width
          mov     [rdi + 32], rax
          ret
  ;;;;;;;;;;;;;;;;;;;;;;
  ; eq
  ;;;;;;;;;;;;;;;;;;;;;;
  ; Equality test: stores 1 when element 1 - element 2 is zero,
  ; otherwise 0.
  ; Params:
  ;   rsi <= Pointer to element 1
  ;   rdx <= Pointer to element 2
  ;   rdi <= Pointer to result can be zero or one.
  ; Stack:
  ;   40 bytes of scratch hold the difference computed by Fr_sub and
  ;   normalised by Fr_toNormal.
  ; Modified Registers:
  ;   rax, plus whatever Fr_sub / Fr_toNormal modify
  ;   (documented as r8, r9, r10, r11, rax, rcx)
  ;;;;;;;;;;;;;;;;;;;;;;
  Fr_eq:
          sub     rsp, 40                 ; scratch element for the difference
          push    rdi                     ; keep the result pointer
          lea     rdi, [rsp + 8]          ; skip the slot just pushed
          call    Fr_sub                  ; scratch = e1 - e2
          call    Fr_toNormal             ; scratch out of Montgomery form
          pop     rdi
          mov     rax, [rsp]              ; type word of the difference
          bt      rax, 63
          jc      .long_diff
          test    eax, eax                ; short: ZF <=> difference is zero
          jmp     .store

  .long_diff:
          mov     rax, [rsp + 8]          ; long: fold the four limbs together,
          or      rax, [rsp + 16]         ; ZF <=> every limb is zero
          or      rax, [rsp + 24]
          or      rax, [rsp + 32]

  .store:
          sete    al
          movzx   eax, al                 ; rax = 1 when equal, else 0
          mov     [rdi], rax
          add     rsp, 40
          ret
  ;;;;;;;;;;;;;;;;;;;;;;
  ; neq
  ;;;;;;;;;;;;;;;;;;;;;;
  ; Inequality test: stores 1 when element 1 - element 2 is non-zero,
  ; otherwise 0.
  ; Params:
  ;   rsi <= Pointer to element 1
  ;   rdx <= Pointer to element 2
  ;   rdi <= Pointer to result can be zero or one.
  ; Stack:
  ;   40 bytes of scratch hold the difference computed by Fr_sub and
  ;   normalised by Fr_toNormal.
  ; Modified Registers:
  ;   rax, plus whatever Fr_sub / Fr_toNormal modify
  ;   (documented as r8, r9, r10, r11, rax, rcx)
  ;;;;;;;;;;;;;;;;;;;;;;
  Fr_neq:
          sub     rsp, 40                 ; scratch element for the difference
          push    rdi                     ; keep the result pointer
          lea     rdi, [rsp + 8]          ; skip the slot just pushed
          call    Fr_sub                  ; scratch = e1 - e2
          call    Fr_toNormal             ; scratch out of Montgomery form
          pop     rdi
          mov     rax, [rsp]              ; type word of the difference
          bt      rax, 63
          jc      .long_diff
          test    eax, eax                ; short: ZF <=> difference is zero
          jmp     .store

  .long_diff:
          mov     rax, [rsp + 8]          ; long: fold the four limbs together,
          or      rax, [rsp + 16]         ; ZF <=> every limb is zero
          or      rax, [rsp + 24]
          or      rax, [rsp + 32]

  .store:
          setne   al
          movzx   eax, al                 ; rax = 1 when different, else 0
          mov     [rdi], rax
          add     rsp, 40
          ret
  ;;;;;;;;;;;;;;;;;;;;;;
  ; lt
  ;;;;;;;;;;;;;;;;;;;;;;
  ; Less-than test, decided from the sign of element 1 - element 2.
  ; Params:
  ;   rsi <= Pointer to element 1
  ;   rdx <= Pointer to element 2
  ;   rdi <= Pointer to result can be zero or one.
  ; Method:
  ;   The difference is computed by Fr_sub and normalised by
  ;   Fr_toNormal into 40 bytes of stack scratch.
  ;     short difference: negative           => 1, otherwise 0
  ;     long difference : zero               => 0
  ;                       above [half]       => 1 (counts as negative)
  ;                       at or below [half] => 0
  ;   [half] is compared limb by limb, most significant first; the
  ;   original comments describe it as (q-1)/2.
  ; Modified Registers:
  ;   rax, plus whatever Fr_sub / Fr_toNormal modify
  ;   (documented as r8, r9, r10, r11, rax, rcx)
  ;;;;;;;;;;;;;;;;;;;;;;
  Fr_lt:
          sub     rsp, 40                 ; scratch element for the difference
          push    rdi                     ; keep the result pointer
          lea     rdi, [rsp + 8]          ; skip the slot just pushed
          call    Fr_sub                  ; scratch = e1 - e2
          call    Fr_toNormal             ; scratch out of Montgomery form
          pop     rdi
          mov     rax, [rsp]              ; type word of the difference
          bt      rax, 63
          jc      .long_diff
          test    eax, eax                ; short difference: the sign decides
          js      .is_less

  .not_less:
          mov     qword [rdi], 0
          add     rsp, 40
          ret

  .is_less:
          mov     qword [rdi], 1
          add     rsp, 40
          ret

  .long_diff:
          mov     rax, [rsp + 8]
          or      rax, [rsp + 16]
          or      rax, [rsp + 24]
          or      rax, [rsp + 32]
          jz      .not_less               ; zero difference: e1 == e2
          mov     rax, [rsp + 32]
          cmp     [half + 24], rax        ; compare with half, top limb first
          jb      .is_less                ; half < diff  => e1 - e2 negative
          ja      .not_less               ; half > diff  => e1 - e2 positive
          mov     rax, [rsp + 24]
          cmp     [half + 16], rax
          jb      .is_less
          ja      .not_less
          mov     rax, [rsp + 16]
          cmp     [half + 8], rax
          jb      .is_less
          ja      .not_less
          mov     rax, [rsp + 8]
          cmp     [half + 0], rax
          jb      .is_less
          jmp     .not_less               ; diff <= half counts as positive
  ;;;;;;;;;;;;;;;;;;;;;;
  ; gt
  ;;;;;;;;;;;;;;;;;;;;;;
  ; Greater-than test, decided from the sign of element 1 - element 2.
  ; Params:
  ;   rsi <= Pointer to element 1
  ;   rdx <= Pointer to element 2
  ;   rdi <= Pointer to result can be zero or one.
  ; Method:
  ;   The difference is computed by Fr_sub and normalised by
  ;   Fr_toNormal into 40 bytes of stack scratch.
  ;     short difference: strictly positive  => 1, otherwise 0
  ;     long difference : zero               => 0
  ;                       above [half]       => 0 (counts as negative)
  ;                       at or below [half] => 1
  ;   [half] is compared limb by limb, most significant first; the
  ;   original comments describe it as (q-1)/2.
  ; Modified Registers:
  ;   rax, plus whatever Fr_sub / Fr_toNormal modify
  ;   (documented as r8, r9, r10, r11, rax, rcx)
  ;;;;;;;;;;;;;;;;;;;;;;
  Fr_gt:
          sub     rsp, 40                 ; scratch element for the difference
          push    rdi                     ; keep the result pointer
          lea     rdi, [rsp + 8]          ; skip the slot just pushed
          call    Fr_sub                  ; scratch = e1 - e2
          call    Fr_toNormal             ; scratch out of Montgomery form
          pop     rdi
          mov     rax, [rsp]              ; type word of the difference
          bt      rax, 63
          jc      .long_diff
          test    eax, eax                ; short difference: signed compare with 0
          jg      .is_greater

  .not_greater:
          mov     qword [rdi], 0
          add     rsp, 40
          ret

  .is_greater:
          mov     qword [rdi], 1
          add     rsp, 40
          ret

  .long_diff:
          mov     rax, [rsp + 8]
          or      rax, [rsp + 16]
          or      rax, [rsp + 24]
          or      rax, [rsp + 32]
          jz      .not_greater            ; zero difference: e1 == e2
          mov     rax, [rsp + 32]
          cmp     [half + 24], rax        ; compare with half, top limb first
          jb      .not_greater            ; half < diff  => e1 - e2 negative
          ja      .is_greater             ; half > diff  => e1 - e2 positive
          mov     rax, [rsp + 24]
          cmp     [half + 16], rax
          jb      .not_greater
          ja      .is_greater
          mov     rax, [rsp + 16]
          cmp     [half + 8], rax
          jb      .not_greater
          ja      .is_greater
          mov     rax, [rsp + 8]
          cmp     [half + 0], rax
          jb      .not_greater
          jmp     .is_greater             ; diff <= half counts as positive
  ;;;;;;;;;;;;;;;;;;;;;;
  ; leq
  ;;;;;;;;;;;;;;;;;;;;;;
  ; Less-or-equal test, decided from the sign of element 1 - element 2.
  ; Params:
  ;   rsi <= Pointer to element 1
  ;   rdx <= Pointer to element 2
  ;   rdi <= Pointer to result can be zero or one.
  ; Method:
  ;   The difference is computed by Fr_sub and normalised by
  ;   Fr_toNormal into 40 bytes of stack scratch.
  ;     short difference: zero or negative   => 1, otherwise 0
  ;     long difference : zero               => 1
  ;                       above [half]       => 1 (counts as negative)
  ;                       at or below [half] => 0
  ;   [half] is compared limb by limb, most significant first; the
  ;   original comments describe it as (q-1)/2.
  ; Modified Registers:
  ;   rax, plus whatever Fr_sub / Fr_toNormal modify
  ;   (documented as r8, r9, r10, r11, rax, rcx)
  ;;;;;;;;;;;;;;;;;;;;;;
  Fr_leq:
          sub     rsp, 40                 ; scratch element for the difference
          push    rdi                     ; keep the result pointer
          lea     rdi, [rsp + 8]          ; skip the slot just pushed
          call    Fr_sub                  ; scratch = e1 - e2
          call    Fr_toNormal             ; scratch out of Montgomery form
          pop     rdi
          mov     rax, [rsp]              ; type word of the difference
          bt      rax, 63
          jc      .long_diff
          test    eax, eax                ; short difference: signed compare with 0
          jle     .is_leq

  .not_leq:
          mov     qword [rdi], 0
          add     rsp, 40
          ret

  .is_leq:
          mov     qword [rdi], 1
          add     rsp, 40
          ret

  .long_diff:
          mov     rax, [rsp + 8]
          or      rax, [rsp + 16]
          or      rax, [rsp + 24]
          or      rax, [rsp + 32]
          jz      .is_leq                 ; zero difference: e1 == e2
          mov     rax, [rsp + 32]
          cmp     [half + 24], rax        ; compare with half, top limb first
          jb      .is_leq                 ; half < diff  => e1 - e2 negative
          ja      .not_leq                ; half > diff  => e1 - e2 positive
          mov     rax, [rsp + 24]
          cmp     [half + 16], rax
          jb      .is_leq
          ja      .not_leq
          mov     rax, [rsp + 16]
          cmp     [half + 8], rax
          jb      .is_leq
          ja      .not_leq
          mov     rax, [rsp + 8]
          cmp     [half + 0], rax
          jb      .is_leq
          jmp     .not_leq                ; diff <= half counts as positive
  ;;;;;;;;;;;;;;;;;;;;;;
  ; geq
  ;;;;;;;;;;;;;;;;;;;;;;
  ; Greater-or-equal test, decided from the sign of
  ; element 1 - element 2.
  ; Params:
  ;   rsi <= Pointer to element 1
  ;   rdx <= Pointer to element 2
  ;   rdi <= Pointer to result can be zero or one.
  ; Method:
  ;   The difference is computed by Fr_sub and normalised by
  ;   Fr_toNormal into 40 bytes of stack scratch.
  ;     short difference: zero or positive   => 1, otherwise 0
  ;     long difference : zero               => 1
  ;                       above [half]       => 0 (counts as negative)
  ;                       at or below [half] => 1
  ;   [half] is compared limb by limb, most significant first; the
  ;   original comments describe it as (q-1)/2.
  ; Modified Registers:
  ;   rax, plus whatever Fr_sub / Fr_toNormal modify
  ;   (documented as r8, r9, r10, r11, rax, rcx)
  ;;;;;;;;;;;;;;;;;;;;;;
  Fr_geq:
          sub     rsp, 40                 ; scratch element for the difference
          push    rdi                     ; keep the result pointer
          lea     rdi, [rsp + 8]          ; skip the slot just pushed
          call    Fr_sub                  ; scratch = e1 - e2
          call    Fr_toNormal             ; scratch out of Montgomery form
          pop     rdi
          mov     rax, [rsp]              ; type word of the difference
          bt      rax, 63
          jc      .long_diff
          test    eax, eax                ; short difference: the sign decides
          jns     .is_geq

  .not_geq:
          mov     qword [rdi], 0
          add     rsp, 40
          ret

  .is_geq:
          mov     qword [rdi], 1
          add     rsp, 40
          ret

  .long_diff:
          mov     rax, [rsp + 8]
          or      rax, [rsp + 16]
          or      rax, [rsp + 24]
          or      rax, [rsp + 32]
          jz      .is_geq                 ; zero difference: e1 == e2
          mov     rax, [rsp + 32]
          cmp     [half + 24], rax        ; compare with half, top limb first
          jb      .not_geq                ; half < diff  => e1 - e2 negative
          ja      .is_geq                 ; half > diff  => e1 - e2 positive
          mov     rax, [rsp + 24]
          cmp     [half + 16], rax
          jb      .not_geq
          ja      .is_geq
          mov     rax, [rsp + 16]
          cmp     [half + 8], rax
          jb      .not_geq
          ja      .is_geq
          mov     rax, [rsp + 8]
          cmp     [half + 0], rax
          jb      .not_geq
          jmp     .is_geq                 ; diff <= half counts as positive
  3764. ;;;;;;;;;;;;;;;;;;;;;;
  3765. ; land
  3766. ;;;;;;;;;;;;;;;;;;;;;;
  3767. ; Logical and between two elements
  3768. ; Params:
  3769. ; rsi <= Pointer to element 1
  3770. ; rdx <= Pointer to element 2
  3771. ; rdi <= Pointer to result zero or one
  3772. ; Modified Registers:
  3773. ; rax, rcx, r8
  3774. ;;;;;;;;;;;;;;;;;;;;;;
  3775. Fr_land:
  3776. mov rax, [rsi]
  3777. bt rax, 63
  3778. jc tmp_44
  3779. test eax, eax
  3780. jz retZero_46
  3781. jmp retOne_45
  3782. tmp_44:
  3783. mov rax, [rsi + 8]
  3784. test rax, rax
  3785. jnz retOne_45
  3786. mov rax, [rsi + 16]
  3787. test rax, rax
  3788. jnz retOne_45
  3789. mov rax, [rsi + 24]
  3790. test rax, rax
  3791. jnz retOne_45
  3792. mov rax, [rsi + 32]
  3793. test rax, rax
  3794. jnz retOne_45
  3795. retZero_46:
  3796. mov qword r8, 0
  3797. jmp done_47
  3798. retOne_45:
  3799. mov qword r8, 1
  3800. done_47:
  3801. mov rax, [rdx]
  3802. bt rax, 63
  3803. jc tmp_48
  3804. test eax, eax
  3805. jz retZero_50
  3806. jmp retOne_49
  3807. tmp_48:
  3808. mov rax, [rdx + 8]
  3809. test rax, rax
  3810. jnz retOne_49
  3811. mov rax, [rdx + 16]
  3812. test rax, rax
  3813. jnz retOne_49
  3814. mov rax, [rdx + 24]
  3815. test rax, rax
  3816. jnz retOne_49
  3817. mov rax, [rdx + 32]
  3818. test rax, rax
  3819. jnz retOne_49
  3820. retZero_50:
  3821. mov qword rcx, 0
  3822. jmp done_51
  3823. retOne_49:
  3824. mov qword rcx, 1
  3825. done_51:
  3826. and rcx, r8
  3827. mov [rdi], rcx
  3828. ret
  3829. ;;;;;;;;;;;;;;;;;;;;;;
  3830. ; lor
  3831. ;;;;;;;;;;;;;;;;;;;;;;
  3832. ; Logical or between two elements
  3833. ; Params:
  3834. ; rsi <= Pointer to element 1
  3835. ; rdx <= Pointer to element 2
  3836. ; rdi <= Pointer to result zero or one
  3837. ; Modified Registers:
  3838. ; rax, rcx, r8
  3839. ;;;;;;;;;;;;;;;;;;;;;;
  3840. Fr_lor:
  3841. mov rax, [rsi]
  3842. bt rax, 63
  3843. jc tmp_52
  3844. test eax, eax
  3845. jz retZero_54
  3846. jmp retOne_53
  3847. tmp_52:
  3848. mov rax, [rsi + 8]
  3849. test rax, rax
  3850. jnz retOne_53
  3851. mov rax, [rsi + 16]
  3852. test rax, rax
  3853. jnz retOne_53
  3854. mov rax, [rsi + 24]
  3855. test rax, rax
  3856. jnz retOne_53
  3857. mov rax, [rsi + 32]
  3858. test rax, rax
  3859. jnz retOne_53
  3860. retZero_54:
  3861. mov qword r8, 0
  3862. jmp done_55
  3863. retOne_53:
  3864. mov qword r8, 1
  3865. done_55:
  3866. mov rax, [rdx]
  3867. bt rax, 63
  3868. jc tmp_56
  3869. test eax, eax
  3870. jz retZero_58
  3871. jmp retOne_57
  3872. tmp_56:
  3873. mov rax, [rdx + 8]
  3874. test rax, rax
  3875. jnz retOne_57
  3876. mov rax, [rdx + 16]
  3877. test rax, rax
  3878. jnz retOne_57
  3879. mov rax, [rdx + 24]
  3880. test rax, rax
  3881. jnz retOne_57
  3882. mov rax, [rdx + 32]
  3883. test rax, rax
  3884. jnz retOne_57
  3885. retZero_58:
  3886. mov qword rcx, 0
  3887. jmp done_59
  3888. retOne_57:
  3889. mov qword rcx, 1
  3890. done_59:
  3891. or rcx, r8
  3892. mov [rdi], rcx
  3893. ret
  3894. ;;;;;;;;;;;;;;;;;;;;;;
  3895. ; lnot
  3896. ;;;;;;;;;;;;;;;;;;;;;;
  3897. ; Do the logical not of an element
  3898. ; Params:
  3899. ; rsi <= Pointer to element to be tested
  3900. ; rdi <= Pointer to result one if element1 is zero and zero otherwise
  3901. ; Modified Registers:
  3902. ; rax, rax, r8
  3903. ;;;;;;;;;;;;;;;;;;;;;;
  3904. Fr_lnot:
  3905. mov rax, [rsi]
  3906. bt rax, 63
  3907. jc tmp_60
  3908. test eax, eax
  3909. jz retZero_62
  3910. jmp retOne_61
  3911. tmp_60:
  3912. mov rax, [rsi + 8]
  3913. test rax, rax
  3914. jnz retOne_61
  3915. mov rax, [rsi + 16]
  3916. test rax, rax
  3917. jnz retOne_61
  3918. mov rax, [rsi + 24]
  3919. test rax, rax
  3920. jnz retOne_61
  3921. mov rax, [rsi + 32]
  3922. test rax, rax
  3923. jnz retOne_61
  3924. retZero_62:
  3925. mov qword rcx, 0
  3926. jmp done_63
  3927. retOne_61:
  3928. mov qword rcx, 1
  3929. done_63:
  3930. test rcx, rcx
  3931. jz lnot_retOne
  3932. lnot_retZero:
  3933. mov qword [rdi], 0
  3934. ret
  3935. lnot_retOne:
  3936. mov qword [rdi], 1
  3937. ret
  3938. section .data
  3939. Fr_q:
  3940. dd 0
  3941. dd 0x80000000
  3942. q dq 0x43e1f593f0000001,0x2833e84879b97091,0xb85045b68181585d,0x30644e72e131a029
  3943. half dq 0xa1f0fac9f8000000,0x9419f4243cdcb848,0xdc2822db40c0ac2e,0x183227397098d014
  3944. R2 dq 0x1bb8e645ae216da7,0x53fe3ab1e35c59e3,0x8c49833d53bb8085,0x0216d0b17f4e44a5
  3945. R3 dq 0x5e94d8e1b4bf0040,0x2a489cbe1cfbb6b8,0x893cc664a19fcfed,0x0cf8594b7fcc657c
  3946. lboMask dq 0x1fffffffffffffff