You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

251 lines
5.1 KiB

  ;;;;;;;;;;;;;;;;;;;;;;
  ; mulM - Montgomery multiplication, fully unrolled by the template
  ;
  ; In:    rdi = destination, n64 limbs (little-endian 64-bit words)
  ;        rsi = first operand, n64 limbs
  ;        rdx = second operand, n64 limbs
  ; Out:   [rdi] = Montgomery product of the operands, brought below q by at
  ;        most one subtraction of q.
  ;        NOTE(review): presumably a*b*2^(-64*n64) mod q for inputs < q - confirm.
  ; Uses:  rax, rcx, rdx, r8, r9, r10, r11, flags; n64*8 bytes of stack
  ; Needs: the symbol q (modulus limbs) defined elsewhere
  ;
  ; Method: the result is built one 64-bit column at a time. r8/r9/r10 act as a
  ; three-word accumulator (r2:r1:r0) whose roles rotate every column. For the
  ; first n64 columns a multiplier m[i] = r0 * np is stored on the stack and
  ; m[i]*q[0] is added to the column; the remaining columns are written to
  ; [rdi] as they complete.
  ;;;;;;;;;;;;;;;;;;;;;;
  mulM:
  <%
  let r0, r1, r2;
  // Select which physical registers play low (r0), middle (r1) and high (r2)
  // word of the accumulator for the given column.
  function setR(step) {
      const regs = ["r8", "r9", "r10"];
      r0 = regs[step % 3];
      r1 = regs[(step + 1) % 3];
      r2 = regs[(step + 2) % 3];
  }
  const base = bigInt.one.shiftLeft(64);
  const np64 = base.minus(q.modInv(base));    // -q^(-1) mod 2^64
  %>
          sub     rsp, <%= n64*8 %>           ; reserve n64 words for the multipliers m[]
          mov     rcx, rdx                    ; mul clobbers rdx, so keep the second operand pointer in rcx
          mov     r11, 0x<%= np64.toString(16) %> ; np = -q^(-1) mod 2^64
          xor     r8, r8
          xor     r9, r9
          xor     r10, r10
  <%
  // Main loop: one iteration per result column
  for (let i=0; i<n64*2; i++) {
      setR(i);
  %>
  <%
      // Operand products a[o1]*b[o2] with o1+o2 == i
      for (let o1=Math.max(0, i-n64+1); (o1<=i)&&(o1<n64); o1++) {
          const o2 = i-o1;
  %>
          mov     rax, [rsi + <%= 8*o1 %>]
          mul     qword [rcx + <%= 8*o2 %>]
          add     <%= r0 %>, rax
          adc     <%= r1 %>, rdx
          adc     <%= r2 %>, 0x0
  <%
      } // operand products
  %>
  <%
      // Reduction products m[j]*q[i-j] for the multipliers computed so far
      for (let j=i-1; j>=0; j--) {
          if (((i-j)<n64)&&(j<n64)) {
  %>
          mov     rax, [rsp + <%= j*8 %>]
          mul     qword [q + <%= (i-j)*8 %>]
          add     <%= r0 %>, rax
          adc     <%= r1 %>, rdx
          adc     <%= r2 %>, 0x0
  <%
          }
      } // reduction products
  %>
  <%
      if (i<n64) {
  %>
          mov     rax, <%= r0 %>
          mul     r11                         ; rax = m[i] = low word of r0 * np
          mov     [rsp + <%= i*8 %>], rax
          mul     qword [q]                   ; adding m[i]*q[0] clears the low accumulator word
          add     <%= r0 %>, rax
          adc     <%= r1 %>, rdx
          adc     <%= r2 %>, 0x0
  <%
      } else {
  %>
          mov     [rdi + <%= (i-n64)*8 %> ], <%= r0 %>
          xor     <%= r0 %>,<%= r0 %>
  <%
      }
  %>
  <%
  } // Main loop
  %>
          test    <%= r1 %>, <%= r1 %>        ; anything carried out of the top limb?
          jnz     mulM_sq                     ; yes: the value exceeds n64 limbs, subtract q
  ; Compare the result with q, most significant limb first.
  ; Limbs are unsigned, so the unsigned conditions (ja/jb) must be used.
  <%
  for (let i=0; i<n64; i++) {
  %>
          mov     rax, [rdi + <%= (n64-i-1)*8 %>]
          cmp     rax, [q + <%= (n64-i-1)*8 %>]
          ja      mulM_sq                     ; result > q
          jb      mulM_done                   ; result < q
  <%
  }
  %>
  ; All limbs equal (result == q): fall through and subtract q as well
  mulM_sq:
  <%
  for (let i=0; i<n64; i++) {
  %>
          mov     rax, [q + <%= i*8 %>]
          <%= i==0 ? "sub" : "sbb" %>     [rdi + <%= i*8 %>], rax
  <%
  }
  %>
  mulM_done:
          add     rsp, <%= n64*8 %>           ; release the m[] scratch area
          ret
  ;;;;;;;;;;;;;;;;;;;;;;
  ; mul MontgomeryShort
  ;;;;;;;;;;;;;;;;;;;;;;
  ; NOTE(review): mulSM has no body in this file section. The label falls
  ; straight through into the _mul entry point below, so the `call mulSM`
  ; sites in lsN/lsM re-enter the dispatcher instead of multiplying.
  ; Presumably a long-by-short Montgomery multiply is still to be written.
  mulSM:

  ;;;;;;;;;;;;;;;;;;;;;;
  ; mul - multiply two field elements, dispatching on their representation
  ;
  ; In:    rdi = destination element
  ;        rsi = first operand element
  ;        rdx = second operand element
  ; Uses:  rax, rcx, rdx, rsi, rdi, flags, plus everything mulM uses
  ;
  ; Element layout as used by this code:
  ;   qword 0 : header/value word
  ;               bit 63 clear -> short form, value in the low 32 bits
  ;               bit 63 set   -> long form, n64 limbs follow at offset 8
  ;               bit 62       -> set when the long value is in Montgomery form
  ;   qword 1.. : limbs (long form only)
  ;
  ; Needs: symbols q (modulus limbs) and R3 defined elsewhere.
  ;;;;;;;;;;;;;;;;;;;;;;
  <%=name%>_mul:
          mov     rax, [rsi]                  ; rax = header/value word of the first operand
          bt      rax, 63
          jc      l1                          ; first operand is long
          mov     rcx, [rdx]                  ; rcx = header/value word of the second operand
          bt      rcx, 63
          jc      s1l2                        ; first short, second long

  s1s2:                                       ; both operands short
          ; Signed multiply: the overflow path below tests the sign of the
          ; product, so the shorts must be treated as signed 32-bit values.
          imul    ecx                         ; edx:eax = eax * ecx; CF=OF=1 if it does not fit in eax
          jc      rs2l                        ; does not fit in 32 bits: convert the result to long
          ; Fits in 32 bits. Writing eax cleared the upper half of rax, so
          ; bit 63 is clear and the stored word is a short element.
          mov     [rdi], rax
          ret

  rs2l:                                       ; product does not fit in 32 bits
          ; Pack edx:eax into rdx. A 32-bit `mov edx, eax` would zero the high
          ; half just shifted in, so merge with OR (rax high half is zero).
          shl     rdx, 32
          or      rdx, rax
          xor     rax, rax
          bts     rax, 63                     ; header: long form, not Montgomery
          mov     [rdi], rax
          test    rdx, rdx
          js      rs2ln                       ; negative product
          ; Non-negative product: first limb = product, remaining limbs = 0
          mov     [rdi + 8], rdx
          xor     rax, rax
  <% for (let i=1; i<n64; i++) { %>
          mov     [rdi + <%= (i+1)*8 %>], rax
  <% } %>
          ret

  rs2ln:                                      ; negative product: store q + product
          add     rdx, [q]                    ; first limb; CF feeds the adc chain below
          mov     [rdi + 8], rdx
          mov     rdx, -1                     ; sign extension of the product (mov leaves CF intact)
  <% for (let i=1; i<n64; i++) { %>
          mov     rax, rdx
          adc     rax, [q + <%= i*8 %> ]
          mov     [rdi + <%= (i+1)*8 %>], rax
  <% } %>
          ret

  l1:                                         ; first operand is long
          mov     rcx, [rdx]
          bt      rcx, 63
          jc      ll                          ; both long
  l1s2:                                       ; first long, second short
          xor     rdx, rdx
          mov     edx, ecx                    ; rdx = low 32 bits of the short operand
          bt      rax, 62
          jc      lsM                         ; long operand is in Montgomery form
          jmp     lsN

  s1l2:                                       ; first short, second long
          mov     rsi, rdx                    ; rsi = the long operand
          xor     rdx, rdx
          mov     edx, eax                    ; rdx = low 32 bits of the short operand
          bt      rcx, 62
          jc      lsM                         ; long operand is in Montgomery form
          jmp     lsN

  lsN:                                        ; long (normal) * short
          mov     byte [rdi + 7], 0xC0        ; result header: long Montgomery
          add     rsi, 8                      ; skip the headers, point at the limbs
          add     rdi, 8
          call    mulSM
          mov     rsi, rdi                    ; second pass in place: multiply the result by R3
          lea     rdx, [R3]
          call    mulM
          ret

  lsM:                                        ; long (Montgomery) * short
          mov     byte [rdi + 7], 0x80        ; result header: long normal
          add     rsi, 8
          add     rdi, 8
          call    mulSM
          ret

  ll:                                         ; both operands long
          bt      rax, 62
          jc      lml                         ; first is Montgomery
          bt      rcx, 62
          jc      lnlm                        ; first normal, second Montgomery
  lnln:                                       ; both normal
          mov     byte [rdi + 7], 0xC0        ; result header: long Montgomery
          add     rsi, 8
          add     rdi, 8
          add     rdx, 8
          call    mulM
          mov     rsi, rdi                    ; second pass in place: multiply the result by R3
          lea     rdx, [R3]
          call    mulM
          ret

  lml:                                        ; first is Montgomery
          bt      rcx, 62
          jc      lmlm                        ; both Montgomery
          ; otherwise exactly one operand is Montgomery: fall through
  lnlm:                                       ; exactly one operand in Montgomery form
          mov     byte [rdi + 7], 0x80        ; result header: long normal
          add     rsi, 8
          add     rdi, 8
          add     rdx, 8
          call    mulM
          ret

  lmlm:                                       ; both Montgomery
          mov     byte [rdi + 7], 0xC0        ; result header: long Montgomery
          add     rsi, 8
          add     rdi, 8
          add     rdx, 8
          call    mulM
          ret