You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

488 lines
8.8 KiB

  1. // +build !amd64_adx
  2. // Copyright 2020 ConsenSys Software Inc.
  3. //
  4. // Licensed under the Apache License, Version 2.0 (the "License");
  5. // you may not use this file except in compliance with the License.
  6. // You may obtain a copy of the License at
  7. //
  8. // http://www.apache.org/licenses/LICENSE-2.0
  9. //
  10. // Unless required by applicable law or agreed to in writing, software
  11. // distributed under the License is distributed on an "AS IS" BASIS,
  12. // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. // See the License for the specific language governing permissions and
  14. // limitations under the License.
  15. #include "textflag.h"
  16. #include "funcdata.h"
  // Field modulus q, stored as four 64-bit limbs, least-significant limb at
  // offset 0 (REDUCE and the Montgomery rounds walk it from +0 up to +24).
  DATA q<>+0x00(SB)/8, $0x43e1f593f0000001
  DATA q<>+0x08(SB)/8, $0x2833e84879b97091
  DATA q<>+0x10(SB)/8, $0xb85045b68181585d
  DATA q<>+0x18(SB)/8, $0x30644e72e131a029

  // qInv0 is the single-word constant q'[0] used to derive the Montgomery
  // factor: m := t[0]*q'[0] mod W.
  // NOTE(review): presumably -q^{-1} mod 2^64 - confirm against the generator.
  DATA qInv0<>+0x00(SB)/8, $0xc2e1f593efffffff

  // Both symbols are file-local (<>), read-only and contain no pointers.
  GLOBL q<>(SB), (RODATA+NOPTR), $32
  GLOBL qInv0<>(SB), (RODATA+NOPTR), $8
  // REDUCE conditionally subtracts the modulus q from the 4-limb value held in
  // (ra0, ra1, ra2, ra3), least-significant limb first.
  //
  // Each limb is copied into its scratch register (rb0..rb3) right before q is
  // subtracted from it in place with a SUBQ/SBBQ borrow chain (MOVQ leaves the
  // flags untouched, so the copies do not disturb the chain). If the
  // subtraction borrowed (carry flag set, i.e. the value was smaller than q),
  // CMOVQCS moves the saved limbs back and the value is left unchanged.
  //
  // Clobbers rb0..rb3 and the flags; contains no branch.
  //
  // The last line intentionally has no trailing "; \": a continuation there
  // would make whatever source line follows the macro part of its body.
  #define REDUCE(ra0, ra1, ra2, ra3, rb0, rb1, rb2, rb3) \
      MOVQ    ra0, rb0;        \
      SUBQ    q<>(SB), ra0;    \
      MOVQ    ra1, rb1;        \
      SBBQ    q<>+8(SB), ra1;  \
      MOVQ    ra2, rb2;        \
      SBBQ    q<>+16(SB), ra2; \
      MOVQ    ra3, rb3;        \
      SBBQ    q<>+24(SB), ra3; \
      CMOVQCS rb0, ra0;        \
      CMOVQCS rb1, ra1;        \
      CMOVQCS rb2, ra2;        \
      CMOVQCS rb3, ra3
  // mul(res, x, y *Element)
  //
  // Montgomery multiplication of the 4-limb elements x and y; the reduced
  // product is stored in res.
  //
  // Fast path (·supportAdx == 1): fully unrolled MULX/ADCX/ADOX implementation.
  // Otherwise the three pointer arguments are copied to the outgoing argument
  // area at 0/8/16(SP) - hence the 24-byte frame - and ·_mulGeneric is called.
  //
  // Aliasing: all limbs of x are loaded into registers up front, y is read one
  // limb per round, and res is only written after the last round, so res may
  // point at the same element as x or y.
  //
  // Register map on the fast path:
  //   SI                x pointer (free once x is loaded)
  //   DI, R8, R9, R10   x[0..3]
  //   R11               y pointer
  //   R14, R15, CX, BX  t[0..3]
  //   R13               A (running high word of the multiplication pass)
  //   R12               scratch high limb of m*q[0]
  //   DX                implicit MULX multiplicand (y[i], then m)
  //   AX                scratch low limb / zero source for carry folding
  //
  // A is kept in R13 rather than BP so the frame pointer is never used as
  // scratch and frame-pointer based stack unwinding stays valid.
  TEXT ·mul(SB), $24-24
      // the algorithm is described here
      // https://hackmd.io/@zkteam/modular_multiplication
      // however, to benefit from the ADCX and ADOX carry chains
      // we split the inner loops in 2:
      // for i=0 to N-1
      //     for j=0 to N-1
      //         (A,t[j]) := t[j] + x[j]*y[i] + A
      //     m := t[0]*q'[0] mod W
      //     C,_ := t[0] + m*q[0]
      //     for j=1 to N-1
      //         (C,t[j-1]) := t[j] + m*q[j] + C
      //     t[N-1] = C + A
      NO_LOCAL_POINTERS

      // take the generic path unless ADX/BMI2 support was detected
      CMPB ·supportAdx(SB), $1
      JNE  l1

      // x[0] -> DI
      // x[1] -> R8
      // x[2] -> R9
      // x[3] -> R10
      MOVQ x+8(FP), SI
      MOVQ 0(SI), DI
      MOVQ 8(SI), R8
      MOVQ 16(SI), R9
      MOVQ 24(SI), R10
      MOVQ y+16(FP), R11

      // A    -> R13
      // t[0] -> R14
      // t[1] -> R15
      // t[2] -> CX
      // t[3] -> BX

      // ---------------------------------------------------------------
      // round 0: t := x*y[0], then one Montgomery reduction step
      // ---------------------------------------------------------------

      // clear the flags (XOR resets both CF and OF, the two carry chains)
      XORQ AX, AX
      MOVQ 0(R11), DX

      // (A,t[0]) := x[0]*y[0] + A
      MULXQ DI, R14, R15

      // (A,t[1]) := x[1]*y[0] + A
      MULXQ R8, AX, CX
      ADOXQ AX, R15

      // (A,t[2]) := x[2]*y[0] + A
      MULXQ R9, AX, BX
      ADOXQ AX, CX

      // (A,t[3]) := x[3]*y[0] + A
      MULXQ R10, AX, R13
      ADOXQ AX, BX

      // A += carry from the ADOXQ chain (t was empty, so the ADCXQ chain is
      // not used in this first pass); MOVQ $0 is used instead of XORQ because
      // it leaves the flags intact
      MOVQ  $0, AX
      ADOXQ AX, R13

      // m := t[0]*q'[0] mod W
      MOVQ  qInv0<>(SB), DX
      IMULQ R14, DX

      // clear the flags
      XORQ AX, AX

      // C,_ := t[0] + m*q[0]
      MULXQ q<>+0(SB), AX, R12
      ADCXQ R14, AX
      MOVQ  R12, R14

      // (C,t[0]) := t[1] + m*q[1] + C
      ADCXQ R15, R14
      MULXQ q<>+8(SB), AX, R15
      ADOXQ AX, R14

      // (C,t[1]) := t[2] + m*q[2] + C
      ADCXQ CX, R15
      MULXQ q<>+16(SB), AX, CX
      ADOXQ AX, R15

      // (C,t[2]) := t[3] + m*q[3] + C
      ADCXQ BX, CX
      MULXQ q<>+24(SB), AX, BX
      ADOXQ AX, CX

      // t[3] = C + A
      MOVQ  $0, AX
      ADCXQ AX, BX
      ADOXQ R13, BX

      // ---------------------------------------------------------------
      // round 1: t += x*y[1], then one Montgomery reduction step
      // ---------------------------------------------------------------

      // clear the flags
      XORQ AX, AX
      MOVQ 8(R11), DX

      // (A,t[0]) := t[0] + x[0]*y[1] + A
      MULXQ DI, AX, R13
      ADOXQ AX, R14

      // (A,t[1]) := t[1] + x[1]*y[1] + A
      ADCXQ R13, R15
      MULXQ R8, AX, R13
      ADOXQ AX, R15

      // (A,t[2]) := t[2] + x[2]*y[1] + A
      ADCXQ R13, CX
      MULXQ R9, AX, R13
      ADOXQ AX, CX

      // (A,t[3]) := t[3] + x[3]*y[1] + A
      ADCXQ R13, BX
      MULXQ R10, AX, R13
      ADOXQ AX, BX

      // A += carries from ADCXQ and ADOXQ
      MOVQ  $0, AX
      ADCXQ AX, R13
      ADOXQ AX, R13

      // m := t[0]*q'[0] mod W
      MOVQ  qInv0<>(SB), DX
      IMULQ R14, DX

      // clear the flags
      XORQ AX, AX

      // C,_ := t[0] + m*q[0]
      MULXQ q<>+0(SB), AX, R12
      ADCXQ R14, AX
      MOVQ  R12, R14

      // (C,t[0]) := t[1] + m*q[1] + C
      ADCXQ R15, R14
      MULXQ q<>+8(SB), AX, R15
      ADOXQ AX, R14

      // (C,t[1]) := t[2] + m*q[2] + C
      ADCXQ CX, R15
      MULXQ q<>+16(SB), AX, CX
      ADOXQ AX, R15

      // (C,t[2]) := t[3] + m*q[3] + C
      ADCXQ BX, CX
      MULXQ q<>+24(SB), AX, BX
      ADOXQ AX, CX

      // t[3] = C + A
      MOVQ  $0, AX
      ADCXQ AX, BX
      ADOXQ R13, BX

      // ---------------------------------------------------------------
      // round 2: t += x*y[2], then one Montgomery reduction step
      // ---------------------------------------------------------------

      // clear the flags
      XORQ AX, AX
      MOVQ 16(R11), DX

      // (A,t[0]) := t[0] + x[0]*y[2] + A
      MULXQ DI, AX, R13
      ADOXQ AX, R14

      // (A,t[1]) := t[1] + x[1]*y[2] + A
      ADCXQ R13, R15
      MULXQ R8, AX, R13
      ADOXQ AX, R15

      // (A,t[2]) := t[2] + x[2]*y[2] + A
      ADCXQ R13, CX
      MULXQ R9, AX, R13
      ADOXQ AX, CX

      // (A,t[3]) := t[3] + x[3]*y[2] + A
      ADCXQ R13, BX
      MULXQ R10, AX, R13
      ADOXQ AX, BX

      // A += carries from ADCXQ and ADOXQ
      MOVQ  $0, AX
      ADCXQ AX, R13
      ADOXQ AX, R13

      // m := t[0]*q'[0] mod W
      MOVQ  qInv0<>(SB), DX
      IMULQ R14, DX

      // clear the flags
      XORQ AX, AX

      // C,_ := t[0] + m*q[0]
      MULXQ q<>+0(SB), AX, R12
      ADCXQ R14, AX
      MOVQ  R12, R14

      // (C,t[0]) := t[1] + m*q[1] + C
      ADCXQ R15, R14
      MULXQ q<>+8(SB), AX, R15
      ADOXQ AX, R14

      // (C,t[1]) := t[2] + m*q[2] + C
      ADCXQ CX, R15
      MULXQ q<>+16(SB), AX, CX
      ADOXQ AX, R15

      // (C,t[2]) := t[3] + m*q[3] + C
      ADCXQ BX, CX
      MULXQ q<>+24(SB), AX, BX
      ADOXQ AX, CX

      // t[3] = C + A
      MOVQ  $0, AX
      ADCXQ AX, BX
      ADOXQ R13, BX

      // ---------------------------------------------------------------
      // round 3: t += x*y[3], then one Montgomery reduction step
      // ---------------------------------------------------------------

      // clear the flags
      XORQ AX, AX
      MOVQ 24(R11), DX

      // (A,t[0]) := t[0] + x[0]*y[3] + A
      MULXQ DI, AX, R13
      ADOXQ AX, R14

      // (A,t[1]) := t[1] + x[1]*y[3] + A
      ADCXQ R13, R15
      MULXQ R8, AX, R13
      ADOXQ AX, R15

      // (A,t[2]) := t[2] + x[2]*y[3] + A
      ADCXQ R13, CX
      MULXQ R9, AX, R13
      ADOXQ AX, CX

      // (A,t[3]) := t[3] + x[3]*y[3] + A
      ADCXQ R13, BX
      MULXQ R10, AX, R13
      ADOXQ AX, BX

      // A += carries from ADCXQ and ADOXQ
      MOVQ  $0, AX
      ADCXQ AX, R13
      ADOXQ AX, R13

      // m := t[0]*q'[0] mod W
      MOVQ  qInv0<>(SB), DX
      IMULQ R14, DX

      // clear the flags
      XORQ AX, AX

      // C,_ := t[0] + m*q[0]
      MULXQ q<>+0(SB), AX, R12
      ADCXQ R14, AX
      MOVQ  R12, R14

      // (C,t[0]) := t[1] + m*q[1] + C
      ADCXQ R15, R14
      MULXQ q<>+8(SB), AX, R15
      ADOXQ AX, R14

      // (C,t[1]) := t[2] + m*q[2] + C
      ADCXQ CX, R15
      MULXQ q<>+16(SB), AX, CX
      ADOXQ AX, R15

      // (C,t[2]) := t[3] + m*q[3] + C
      ADCXQ BX, CX
      MULXQ q<>+24(SB), AX, BX
      ADOXQ AX, CX

      // t[3] = C + A
      MOVQ  $0, AX
      ADCXQ AX, BX
      ADOXQ R13, BX

      // reduce element(R14,R15,CX,BX) using temp registers (R13,SI,R12,R11);
      // A, the x pointer and the y pointer are all dead at this point
      REDUCE(R14,R15,CX,BX,R13,SI,R12,R11)

      // store the result limbs, least-significant first
      MOVQ res+0(FP), AX
      MOVQ R14, 0(AX)
      MOVQ R15, 8(AX)
      MOVQ CX, 16(AX)
      MOVQ BX, 24(AX)
      RET

  l1:
      // no ADX support: forward (res, x, y) unchanged to the generic routine
      MOVQ res+0(FP), AX
      MOVQ AX, (SP)
      MOVQ x+8(FP), AX
      MOVQ AX, 8(SP)
      MOVQ y+16(FP), AX
      MOVQ AX, 16(SP)
      CALL ·_mulGeneric(SB)
      RET
  // fromMont(res *Element)
  //
  // Runs the Montgomery reduction loop of the multiplication algorithm with
  // y = 1 on the 4-limb element pointed to by res, in place: the limbs are
  // loaded from res, reduced over four rounds, conditionally reduced below q
  // with REDUCE and written back to res.
  //
  // Fast path (·supportAdx == 1): unrolled MULX/ADCX/ADOX implementation.
  // Otherwise res is copied to the outgoing argument slot at 0(SP) - hence the
  // 8-byte frame - and ·_fromMontGeneric is called.
  //
  // Register map on the fast path:
  //   R14, R15, CX, BX  t[0..3]
  //   DX                res pointer while loading, then m (implicit MULX operand)
  //   R12               scratch high limb of m*q[0]
  //   AX                scratch low limb / zero source for carry folding
  //   SI, DI, R8, R9    REDUCE temporaries
  //
  // BP is deliberately not used as scratch so frame-pointer based stack
  // unwinding stays valid.
  TEXT ·fromMont(SB), $8-8
      NO_LOCAL_POINTERS

      // the algorithm is described here
      // https://hackmd.io/@zkteam/modular_multiplication
      // when y = 1 we have:
      // for i=0 to N-1
      //     t[i] = x[i]
      // for i=0 to N-1
      //     m := t[0]*q'[0] mod W
      //     C,_ := t[0] + m*q[0]
      //     for j=1 to N-1
      //         (C,t[j-1]) := t[j] + m*q[j] + C
      //     t[N-1] = C

      // take the generic path unless ADX/BMI2 support was detected
      CMPB ·supportAdx(SB), $1
      JNE  l2

      // t := *res
      MOVQ res+0(FP), DX
      MOVQ 0(DX), R14
      MOVQ 8(DX), R15
      MOVQ 16(DX), CX
      MOVQ 24(DX), BX

      // ---------------------------------------------------------------
      // round 0
      // ---------------------------------------------------------------

      // m := t[0]*q'[0] mod W
      MOVQ  qInv0<>(SB), DX
      IMULQ R14, DX

      // clear the flags (XOR resets both CF and OF, the two carry chains)
      XORQ AX, AX

      // C,_ := t[0] + m*q[0]
      MULXQ q<>+0(SB), AX, R12
      ADCXQ R14, AX
      MOVQ  R12, R14

      // (C,t[0]) := t[1] + m*q[1] + C
      ADCXQ R15, R14
      MULXQ q<>+8(SB), AX, R15
      ADOXQ AX, R14

      // (C,t[1]) := t[2] + m*q[2] + C
      ADCXQ CX, R15
      MULXQ q<>+16(SB), AX, CX
      ADOXQ AX, R15

      // (C,t[2]) := t[3] + m*q[3] + C
      ADCXQ BX, CX
      MULXQ q<>+24(SB), AX, BX
      ADOXQ AX, CX

      // t[3] = C (fold both carry chains; MOVQ $0 leaves the flags intact)
      MOVQ  $0, AX
      ADCXQ AX, BX
      ADOXQ AX, BX

      // ---------------------------------------------------------------
      // round 1
      // ---------------------------------------------------------------

      // m := t[0]*q'[0] mod W
      MOVQ  qInv0<>(SB), DX
      IMULQ R14, DX

      // clear the flags
      XORQ AX, AX

      // C,_ := t[0] + m*q[0]
      MULXQ q<>+0(SB), AX, R12
      ADCXQ R14, AX
      MOVQ  R12, R14

      // (C,t[0]) := t[1] + m*q[1] + C
      ADCXQ R15, R14
      MULXQ q<>+8(SB), AX, R15
      ADOXQ AX, R14

      // (C,t[1]) := t[2] + m*q[2] + C
      ADCXQ CX, R15
      MULXQ q<>+16(SB), AX, CX
      ADOXQ AX, R15

      // (C,t[2]) := t[3] + m*q[3] + C
      ADCXQ BX, CX
      MULXQ q<>+24(SB), AX, BX
      ADOXQ AX, CX

      // t[3] = C
      MOVQ  $0, AX
      ADCXQ AX, BX
      ADOXQ AX, BX

      // ---------------------------------------------------------------
      // round 2
      // ---------------------------------------------------------------

      // m := t[0]*q'[0] mod W
      MOVQ  qInv0<>(SB), DX
      IMULQ R14, DX

      // clear the flags
      XORQ AX, AX

      // C,_ := t[0] + m*q[0]
      MULXQ q<>+0(SB), AX, R12
      ADCXQ R14, AX
      MOVQ  R12, R14

      // (C,t[0]) := t[1] + m*q[1] + C
      ADCXQ R15, R14
      MULXQ q<>+8(SB), AX, R15
      ADOXQ AX, R14

      // (C,t[1]) := t[2] + m*q[2] + C
      ADCXQ CX, R15
      MULXQ q<>+16(SB), AX, CX
      ADOXQ AX, R15

      // (C,t[2]) := t[3] + m*q[3] + C
      ADCXQ BX, CX
      MULXQ q<>+24(SB), AX, BX
      ADOXQ AX, CX

      // t[3] = C
      MOVQ  $0, AX
      ADCXQ AX, BX
      ADOXQ AX, BX

      // ---------------------------------------------------------------
      // round 3
      // ---------------------------------------------------------------

      // m := t[0]*q'[0] mod W
      MOVQ  qInv0<>(SB), DX
      IMULQ R14, DX

      // clear the flags
      XORQ AX, AX

      // C,_ := t[0] + m*q[0]
      MULXQ q<>+0(SB), AX, R12
      ADCXQ R14, AX
      MOVQ  R12, R14

      // (C,t[0]) := t[1] + m*q[1] + C
      ADCXQ R15, R14
      MULXQ q<>+8(SB), AX, R15
      ADOXQ AX, R14

      // (C,t[1]) := t[2] + m*q[2] + C
      ADCXQ CX, R15
      MULXQ q<>+16(SB), AX, CX
      ADOXQ AX, R15

      // (C,t[2]) := t[3] + m*q[3] + C
      ADCXQ BX, CX
      MULXQ q<>+24(SB), AX, BX
      ADOXQ AX, CX

      // t[3] = C
      MOVQ  $0, AX
      ADCXQ AX, BX
      ADOXQ AX, BX

      // reduce element(R14,R15,CX,BX) using temp registers (SI,DI,R8,R9)
      REDUCE(R14,R15,CX,BX,SI,DI,R8,R9)

      // write the limbs back to res, least-significant first
      MOVQ res+0(FP), AX
      MOVQ R14, 0(AX)
      MOVQ R15, 8(AX)
      MOVQ CX, 16(AX)
      MOVQ BX, 24(AX)
      RET

  l2:
      // no ADX support: forward res unchanged to the generic routine
      MOVQ res+0(FP), AX
      MOVQ AX, (SP)
      CALL ·_fromMontGeneric(SB)
      RET