You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

302 lines
6.2 KiB

  ; Symbols exported by the generated object file.
          global  <%=name%>_add
          global  <%=name%>_mul
          global  <%=name%>_q

  ; Address static data RIP-relative by default.
          default rel

          section .text
  ;;;;;;;;;;;;;;;;;;;;;;
  ; <%=name%>_add
  ;
  ; Adds two n64-limb values and conditionally subtracts q once.
  ;
  ; In:    rdi = destination limbs
  ;        rsi = first operand limbs
  ;        rdx = second operand limbs
  ;        (limbs are 64-bit words, least significant limb at offset 0)
  ; Out:   [rdi] = a + b, minus q when the sum carried out or is >= q
  ; Clobb: rax, flags
  ;;;;;;;;;;;;;;;;;;;;;;
  <%=name%>_add:
          ; Add limb by limb, propagating the carry.
  <% for (let i=0; i<n64; i++) { %>
          mov     rax, [rsi + <%=i*8%>]
          <%= i==0 ? "add" : "adc" %>     rax, [rdx + <%=i*8%>]
          mov     [rdi + <%=i*8%>], rax
  <% } %>
          jc      add_sq                  ; carry out of the top limb: subtract q

          ; Compare the sum with q from the most significant limb down.
          ; Limbs are unsigned, so the unsigned conditions (ja/jb) are required.
          ; On the first iteration rax still holds the top limb stored above.
  <% for (let i=0; i<n64; i++) { %>
  <% if (i>0) { %>
          mov     rax, [rdi + <%= (n64-i-1)*8 %>]
  <% } %>
          cmp     rax, [q + <%= (n64-i-1)*8 %>]
          ja      add_sq                  ; sum > q
          jb      add_done                ; sum < q, already reduced
  <% } %>

          ; All limbs equal (sum == q) falls through and subtracts q as well.
  add_sq:
  <% for (let i=0; i<n64; i++) { %>
          mov     rax, [q + <%=i*8%>]
          <%= i==0 ? "sub" : "sbb" %>     [rdi + <%=i*8%>], rax
  <% } %>
  add_done:
          ret
  ;;;;;;;;;;;;;;;;;;;;;;
  ; mulM - Montgomery multiplication of two n64-limb values
  ;
  ; In:    rdi = destination limbs
  ;        rsi = first operand limbs
  ;        rdx = second operand limbs
  ; Out:   [rdi] = Montgomery product of the operands (radix 2^64, n64 limbs),
  ;        with q subtracted once when the result overflowed or is >= q
  ; Clobb: rax, rcx, rdx, r8, r9, r10, r11, flags
  ; Stack: <%= n64*8 %> bytes of scratch for the per-column multipliers m[i]
  ;
  ; The product is built one 64-bit column at a time in a three-register
  ; accumulator (r0 = current column, r1/r2 = carries into the next columns).
  ; The registers rotate every column so no data has to be moved.
  ;;;;;;;;;;;;;;;;;;;;;;
  mulM:
  <%
  // Register rotation: column `step` accumulates in r0, carries go to r1, r2.
  const accRegs = ["r8", "r9", "r10"];
  let r0, r1, r2;
  function setR(step) {
      r0 = accRegs[step % 3];
      r1 = accRegs[(step + 1) % 3];
      r2 = accRegs[(step + 2) % 3];
  }
  const base = bigInt.one.shiftLeft(64);
  // np = -q^(-1) mod 2^64
  const np64 = base.minus(q.modInv(base));
  %>
          sub     rsp, <%= n64*8 %>               ; reserve space for the m[i] values
          mov     rcx, rdx                        ; mul clobbers rdx, keep operand pointer in rcx
          mov     r11, 0x<%= np64.toString(16) %> ; np
          xor     r8, r8
          xor     r9, r9
          xor     r10, r10
  <%
  // One iteration per output column.
  for (let i=0; i<n64*2; i++) {
      setR(i);
  %>
  <%
      // Operand partial products a[o1] * b[o2] with o1 + o2 == i.
      for (let o1=Math.max(0, i-n64+1); (o1<=i)&&(o1<n64); o1++) {
          const o2 = i-o1;
  %>
          mov     rax, [rsi + <%= 8*o1 %>]
          mul     qword [rcx + <%= 8*o2 %>]
          add     <%= r0 %>, rax
          adc     <%= r1 %>, rdx
          adc     <%= r2 %>, 0x0
  <%
      }
  %>
  <%
      // Reduction partial products m[j] * q[i-j] for the m values already computed.
      for (let j=i-1; j>=0; j--) {
          if (((i-j)<n64)&&(j<n64)) {
  %>
          mov     rax, [rsp + <%= j*8 %>]
          mul     qword [q + <%= (i-j)*8 %>]
          add     <%= r0 %>, rax
          adc     <%= r1 %>, rdx
          adc     <%= r2 %>, 0x0
  <%
          }
      }
  %>
  <%
      if (i<n64) {
  %>
          ; m[<%= i %>] = column * np; adding m * q[0] clears the column.
          mov     rax, <%= r0 %>
          mul     r11
          mov     [rsp + <%= i*8 %>], rax
          mul     qword [q]
          add     <%= r0 %>, rax
          adc     <%= r1 %>, rdx
          adc     <%= r2 %>, 0x0
  <%
      } else {
  %>
          ; Upper half: the column is a limb of the result.
          mov     [rdi + <%= (i-n64)*8 %>], <%= r0 %>
          xor     <%= r0 %>, <%= r0 %>
  <%
      }
  %>
  <%
  }
  %>
          ; A non-zero carry beyond the top limb means the result exceeds q.
          test    <%= r1 %>, <%= r1 %>
          jnz     mulM_sq

          ; Compare with q from the most significant limb down.
          ; Limbs are unsigned, so use ja/jb rather than jg/jl.
  <%
  for (let i=0; i<n64; i++) {
  %>
          mov     rax, [rdi + <%= (n64-i-1)*8 %>]
          cmp     rax, [q + <%= (n64-i-1)*8 %>]
          ja      mulM_sq
          jb      mulM_done
  <%
  }
  %>
          ; Equal to q falls through and subtracts q as well.
  mulM_sq:
  <%
  for (let i=0; i<n64; i++) {
  %>
          mov     rax, [q + <%= i*8 %>]
          <%= i==0 ? "sub" : "sbb" %>     [rdi + <%= i*8 %>], rax
  <%
  }
  %>
  mulM_done:
          add     rsp, <%= n64*8 %>               ; release the scratch area
          ret
  ;;;;;;;;;;;;;;;;;;;;;;
  ; mulSM - Montgomery multiplication of an n64-limb value by a single limb
  ;
  ; In:    rdi = destination limbs
  ;        rsi = long operand limbs
  ;        rdx = multiplier value (the callers pass a zero-extended short)
  ; Out:   [rdi] = result of mulM on the long operand and the multiplier
  ;        widened to n64 limbs
  ; Clobb: everything mulM clobbers
  ; Stack: <%= n64*8 %> bytes for the widened multiplier (plus mulM's own use)
  ;
  ; NOTE(review): the argument convention is inferred from the two call sites
  ; in this file; confirm it matches the intended design. A dedicated
  ; single-limb routine would be faster; this version favours correctness by
  ; delegating to mulM. Without a body, control fell through into the public
  ; multiplication entry point.
  ;;;;;;;;;;;;;;;;;;;;;;
  mulSM:
          sub     rsp, <%= n64*8 %>               ; room for the widened multiplier
          mov     [rsp], rdx                      ; limb 0 = multiplier
  <% if (n64 > 1) { %>
          xor     eax, eax
  <% for (let i=1; i<n64; i++) { %>
          mov     [rsp + <%= i*8 %>], rax         ; upper limbs = 0
  <% } %>
  <% } %>
          mov     rdx, rsp                        ; second operand = widened multiplier
          call    mulM
          add     rsp, <%= n64*8 %>
          ret
  ;;;;;;;;;;;;;;;;;;;;;;
  ; <%=name%>_mul
  ;
  ; Multiplies two elements, dispatching on their representation.
  ;
  ; In:    rdi = destination element
  ;        rsi = first operand element
  ;        rdx = second operand element
  ;
  ; Element layout as used by this routine:
  ;   qword 0 : header. Bit 63 set = long form, bit 62 set = Montgomery form.
  ;             For the short form the low 32 bits hold the value.
  ;   qword 1+: the n64 limbs of a long value.
  ;
  ; Clobb: rax, rcx, rdx, rsi, rdi, flags, plus whatever mulM / mulSM clobber.
  ;;;;;;;;;;;;;;;;;;;;;;
  <%=name%>_mul:
          mov     rax, [rsi]              ; header of the first operand
          bt      rax, 63
          jc      l1                      ; first operand is long
          mov     rcx, [rdx]              ; header of the second operand
          bt      rcx, 63
          jc      s1l2                    ; short * long

  s1s2:   ; short * short
          ; Signed multiply: the overflow path below handles negative products.
          imul    ecx                     ; edx:eax = eax * ecx, CF set if it needs > 32 bits
          jc      rs2l                    ; does not fit in 32 bits: convert to long
          ; The product fits. eax was zero-extended into rax, so the stored
          ; header has the long/Montgomery bits clear.
          mov     [rdi], rax
          ret

  rs2l:   ; The product does not fit in 32 bits; widen edx:eax into a long.
          shl     rdx, 32
          or      rdx, rax                ; rdx = edx:eax (upper half of rax is zero)
          xor     rax, rax
          bts     rax, 63                 ; header: long, not Montgomery
          mov     [rdi], rax
          test    rdx, rdx
          js      rs2ln                   ; negative product
          ; Non-negative: the value is the first limb, the rest are zero.
          mov     [rdi + 8], rdx
          xor     rax, rax
  <% for (let i=1; i<n64; i++) { %>
          mov     [rdi + <%= (i+1)*8 %>], rax
  <% } %>
          ret

  rs2ln:  ; Negative product: store q + value, with the value sign-extended.
          add     rdx, [q]                ; first limb, sets the carry for the chain below
          mov     [rdi + 8], rdx          ; mov does not disturb the carry
          mov     rdx, -1                 ; sign extension of the negative value
  <% for (let i=1; i<n64; i++) { %>
          mov     rax, rdx
          adc     rax, [q + <%= i*8 %>]
          mov     [rdi + <%= (i+1)*8 %>], rax
  <% } %>
          ret

  l1:     ; first operand is long
          mov     rcx, [rdx]              ; header of the second operand
          bt      rcx, 63
          jc      ll                      ; long * long

  l1s2:   ; long * short: rsi already points at the long operand
          ; NOTE(review): the short is zero-extended here although s1s2 treats
          ; shorts as signed; negative shorts look unhandled on this path - confirm.
          mov     edx, ecx                ; rdx = short value (32-bit write zero-extends)
          bt      rax, 62
          jc      lsM
          jmp     lsN

  s1l2:   ; short * long: swap roles so rsi points at the long operand
          mov     rsi, rdx
          mov     edx, eax                ; rdx = short value (32-bit write zero-extends)
          bt      rcx, 62
          jc      lsM
          ; long operand is not in Montgomery form: fall through to lsN

  lsN:    ; long (normal) * short -> long Montgomery
          mov     byte [rdi + 7], 0xC0    ; header top byte: long + Montgomery
          add     rsi, 8                  ; skip the headers
          add     rdi, 8
          call    mulSM                   ; expected to leave rdi unchanged, as mulM does
          mov     rsi, rdi                ; multiply the intermediate result ...
          lea     rdx, [rel R3]           ; ... by R3 to bring it into Montgomery form
          call    mulM
          ret

  lsM:    ; long (Montgomery) * short -> long normal
          mov     byte [rdi + 7], 0x80    ; header top byte: long, not Montgomery
          add     rsi, 8
          add     rdi, 8
          call    mulSM
          ret

  ll:     ; long * long: dispatch on the two Montgomery bits
          bt      rax, 62
          jc      lml
          bt      rcx, 62
          jc      lnlm

  lnln:   ; normal * normal -> long Montgomery
          mov     byte [rdi + 7], 0xC0    ; header top byte: long + Montgomery
          add     rsi, 8                  ; skip the headers
          add     rdi, 8
          add     rdx, 8
          call    mulM
          mov     rsi, rdi                ; multiply the intermediate result ...
          lea     rdx, [rel R3]           ; ... by R3 to bring it into Montgomery form
          call    mulM
          ret

  lml:    ; first operand is Montgomery
          bt      rcx, 62
          jc      lmlm
          ; second operand is normal: fall through

  lnlm:   ; exactly one operand in Montgomery form -> long normal
          mov     byte [rdi + 7], 0x80    ; header top byte: long, not Montgomery
          add     rsi, 8
          add     rdi, 8
          add     rdx, 8
          call    mulM
          ret

  lmlm:   ; Montgomery * Montgomery -> long Montgomery
          mov     byte [rdi + 7], 0xC0    ; header top byte: long + Montgomery
          add     rsi, 8
          add     rdi, 8
          add     rdx, 8
          call    mulM
          ret
  254. section .data
  255. <%=name%>_q:
  256. dd 0
  257. dd 0x80000000
  258. q dq <%= constantElement(q) %>
  259. R3 dq <%= constantElement(bigInt.one.shiftLeft(n64*64*3).mod(q)) %>