466 lines
8.5 KiB

  1. // +build amd64_adx
  2. // Copyright 2020 ConsenSys Software Inc.
  3. //
  4. // Licensed under the Apache License, Version 2.0 (the "License");
  5. // you may not use this file except in compliance with the License.
  6. // You may obtain a copy of the License at
  7. //
  8. // http://www.apache.org/licenses/LICENSE-2.0
  9. //
  10. // Unless required by applicable law or agreed to in writing, software
  11. // distributed under the License is distributed on an "AS IS" BASIS,
  12. // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. // See the License for the specific language governing permissions and
  14. // limitations under the License.
  15. #include "textflag.h"
  16. #include "funcdata.h"
  17. // modulus q
  18. DATA q<>+0(SB)/8, $0x43e1f593f0000001
  19. DATA q<>+8(SB)/8, $0x2833e84879b97091
  20. DATA q<>+16(SB)/8, $0xb85045b68181585d
  21. DATA q<>+24(SB)/8, $0x30644e72e131a029
  22. GLOBL q<>(SB), (RODATA+NOPTR), $32
  23. // qInv0 q'[0]
  24. DATA qInv0<>(SB)/8, $0xc2e1f593efffffff
  25. GLOBL qInv0<>(SB), (RODATA+NOPTR), $8
  26. #define REDUCE(ra0, ra1, ra2, ra3, rb0, rb1, rb2, rb3) \
  27. MOVQ ra0, rb0; \
  28. SUBQ q<>(SB), ra0; \
  29. MOVQ ra1, rb1; \
  30. SBBQ q<>+8(SB), ra1; \
  31. MOVQ ra2, rb2; \
  32. SBBQ q<>+16(SB), ra2; \
  33. MOVQ ra3, rb3; \
  34. SBBQ q<>+24(SB), ra3; \
  35. CMOVQCS rb0, ra0; \
  36. CMOVQCS rb1, ra1; \
  37. CMOVQCS rb2, ra2; \
  38. CMOVQCS rb3, ra3; \
  39. // mul(res, x, y *Element)
  40. TEXT ·mul(SB), NOSPLIT, $0-24
  41. // the algorithm is described here
  42. // https://hackmd.io/@zkteam/modular_multiplication
  43. // however, to benefit from the ADCX and ADOX carry chains
  44. // we split the inner loops in 2:
  45. // for i=0 to N-1
  46. // for j=0 to N-1
  47. // (A,t[j]) := t[j] + x[j]*y[i] + A
  48. // m := t[0]*q'[0] mod W
  49. // C,_ := t[0] + m*q[0]
  50. // for j=1 to N-1
  51. // (C,t[j-1]) := t[j] + m*q[j] + C
  52. // t[N-1] = C + A
  53. MOVQ x+8(FP), SI
  54. // x[0] -> DI
  55. // x[1] -> R8
  56. // x[2] -> R9
  57. // x[3] -> R10
  58. MOVQ 0(SI), DI
  59. MOVQ 8(SI), R8
  60. MOVQ 16(SI), R9
  61. MOVQ 24(SI), R10
  62. MOVQ y+16(FP), R11
  63. // A -> BP
  64. // t[0] -> R14
  65. // t[1] -> R15
  66. // t[2] -> CX
  67. // t[3] -> BX
  68. // clear the flags
  69. XORQ AX, AX
  70. MOVQ 0(R11), DX
  71. // (A,t[0]) := x[0]*y[0] + A
  72. MULXQ DI, R14, R15
  73. // (A,t[1]) := x[1]*y[0] + A
  74. MULXQ R8, AX, CX
  75. ADOXQ AX, R15
  76. // (A,t[2]) := x[2]*y[0] + A
  77. MULXQ R9, AX, BX
  78. ADOXQ AX, CX
  79. // (A,t[3]) := x[3]*y[0] + A
  80. MULXQ R10, AX, BP
  81. ADOXQ AX, BX
  82. // A += carries from ADCXQ and ADOXQ
  83. MOVQ $0, AX
  84. ADOXQ AX, BP
  85. // m := t[0]*q'[0] mod W
  86. MOVQ qInv0<>(SB), DX
  87. IMULQ R14, DX
  88. // clear the flags
  89. XORQ AX, AX
  90. // C,_ := t[0] + m*q[0]
  91. MULXQ q<>+0(SB), AX, R12
  92. ADCXQ R14, AX
  93. MOVQ R12, R14
  94. // (C,t[0]) := t[1] + m*q[1] + C
  95. ADCXQ R15, R14
  96. MULXQ q<>+8(SB), AX, R15
  97. ADOXQ AX, R14
  98. // (C,t[1]) := t[2] + m*q[2] + C
  99. ADCXQ CX, R15
  100. MULXQ q<>+16(SB), AX, CX
  101. ADOXQ AX, R15
  102. // (C,t[2]) := t[3] + m*q[3] + C
  103. ADCXQ BX, CX
  104. MULXQ q<>+24(SB), AX, BX
  105. ADOXQ AX, CX
  106. // t[3] = C + A
  107. MOVQ $0, AX
  108. ADCXQ AX, BX
  109. ADOXQ BP, BX
  110. // clear the flags
  111. XORQ AX, AX
  112. MOVQ 8(R11), DX
  113. // (A,t[0]) := t[0] + x[0]*y[1] + A
  114. MULXQ DI, AX, BP
  115. ADOXQ AX, R14
  116. // (A,t[1]) := t[1] + x[1]*y[1] + A
  117. ADCXQ BP, R15
  118. MULXQ R8, AX, BP
  119. ADOXQ AX, R15
  120. // (A,t[2]) := t[2] + x[2]*y[1] + A
  121. ADCXQ BP, CX
  122. MULXQ R9, AX, BP
  123. ADOXQ AX, CX
  124. // (A,t[3]) := t[3] + x[3]*y[1] + A
  125. ADCXQ BP, BX
  126. MULXQ R10, AX, BP
  127. ADOXQ AX, BX
  128. // A += carries from ADCXQ and ADOXQ
  129. MOVQ $0, AX
  130. ADCXQ AX, BP
  131. ADOXQ AX, BP
  132. // m := t[0]*q'[0] mod W
  133. MOVQ qInv0<>(SB), DX
  134. IMULQ R14, DX
  135. // clear the flags
  136. XORQ AX, AX
  137. // C,_ := t[0] + m*q[0]
  138. MULXQ q<>+0(SB), AX, R12
  139. ADCXQ R14, AX
  140. MOVQ R12, R14
  141. // (C,t[0]) := t[1] + m*q[1] + C
  142. ADCXQ R15, R14
  143. MULXQ q<>+8(SB), AX, R15
  144. ADOXQ AX, R14
  145. // (C,t[1]) := t[2] + m*q[2] + C
  146. ADCXQ CX, R15
  147. MULXQ q<>+16(SB), AX, CX
  148. ADOXQ AX, R15
  149. // (C,t[2]) := t[3] + m*q[3] + C
  150. ADCXQ BX, CX
  151. MULXQ q<>+24(SB), AX, BX
  152. ADOXQ AX, CX
  153. // t[3] = C + A
  154. MOVQ $0, AX
  155. ADCXQ AX, BX
  156. ADOXQ BP, BX
  157. // clear the flags
  158. XORQ AX, AX
  159. MOVQ 16(R11), DX
  160. // (A,t[0]) := t[0] + x[0]*y[2] + A
  161. MULXQ DI, AX, BP
  162. ADOXQ AX, R14
  163. // (A,t[1]) := t[1] + x[1]*y[2] + A
  164. ADCXQ BP, R15
  165. MULXQ R8, AX, BP
  166. ADOXQ AX, R15
  167. // (A,t[2]) := t[2] + x[2]*y[2] + A
  168. ADCXQ BP, CX
  169. MULXQ R9, AX, BP
  170. ADOXQ AX, CX
  171. // (A,t[3]) := t[3] + x[3]*y[2] + A
  172. ADCXQ BP, BX
  173. MULXQ R10, AX, BP
  174. ADOXQ AX, BX
  175. // A += carries from ADCXQ and ADOXQ
  176. MOVQ $0, AX
  177. ADCXQ AX, BP
  178. ADOXQ AX, BP
  179. // m := t[0]*q'[0] mod W
  180. MOVQ qInv0<>(SB), DX
  181. IMULQ R14, DX
  182. // clear the flags
  183. XORQ AX, AX
  184. // C,_ := t[0] + m*q[0]
  185. MULXQ q<>+0(SB), AX, R12
  186. ADCXQ R14, AX
  187. MOVQ R12, R14
  188. // (C,t[0]) := t[1] + m*q[1] + C
  189. ADCXQ R15, R14
  190. MULXQ q<>+8(SB), AX, R15
  191. ADOXQ AX, R14
  192. // (C,t[1]) := t[2] + m*q[2] + C
  193. ADCXQ CX, R15
  194. MULXQ q<>+16(SB), AX, CX
  195. ADOXQ AX, R15
  196. // (C,t[2]) := t[3] + m*q[3] + C
  197. ADCXQ BX, CX
  198. MULXQ q<>+24(SB), AX, BX
  199. ADOXQ AX, CX
  200. // t[3] = C + A
  201. MOVQ $0, AX
  202. ADCXQ AX, BX
  203. ADOXQ BP, BX
  204. // clear the flags
  205. XORQ AX, AX
  206. MOVQ 24(R11), DX
  207. // (A,t[0]) := t[0] + x[0]*y[3] + A
  208. MULXQ DI, AX, BP
  209. ADOXQ AX, R14
  210. // (A,t[1]) := t[1] + x[1]*y[3] + A
  211. ADCXQ BP, R15
  212. MULXQ R8, AX, BP
  213. ADOXQ AX, R15
  214. // (A,t[2]) := t[2] + x[2]*y[3] + A
  215. ADCXQ BP, CX
  216. MULXQ R9, AX, BP
  217. ADOXQ AX, CX
  218. // (A,t[3]) := t[3] + x[3]*y[3] + A
  219. ADCXQ BP, BX
  220. MULXQ R10, AX, BP
  221. ADOXQ AX, BX
  222. // A += carries from ADCXQ and ADOXQ
  223. MOVQ $0, AX
  224. ADCXQ AX, BP
  225. ADOXQ AX, BP
  226. // m := t[0]*q'[0] mod W
  227. MOVQ qInv0<>(SB), DX
  228. IMULQ R14, DX
  229. // clear the flags
  230. XORQ AX, AX
  231. // C,_ := t[0] + m*q[0]
  232. MULXQ q<>+0(SB), AX, R12
  233. ADCXQ R14, AX
  234. MOVQ R12, R14
  235. // (C,t[0]) := t[1] + m*q[1] + C
  236. ADCXQ R15, R14
  237. MULXQ q<>+8(SB), AX, R15
  238. ADOXQ AX, R14
  239. // (C,t[1]) := t[2] + m*q[2] + C
  240. ADCXQ CX, R15
  241. MULXQ q<>+16(SB), AX, CX
  242. ADOXQ AX, R15
  243. // (C,t[2]) := t[3] + m*q[3] + C
  244. ADCXQ BX, CX
  245. MULXQ q<>+24(SB), AX, BX
  246. ADOXQ AX, CX
  247. // t[3] = C + A
  248. MOVQ $0, AX
  249. ADCXQ AX, BX
  250. ADOXQ BP, BX
  251. // reduce element(R14,R15,CX,BX) using temp registers (R13,SI,R12,R11)
  252. REDUCE(R14,R15,CX,BX,R13,SI,R12,R11)
  253. MOVQ res+0(FP), AX
  254. MOVQ R14, 0(AX)
  255. MOVQ R15, 8(AX)
  256. MOVQ CX, 16(AX)
  257. MOVQ BX, 24(AX)
  258. RET
  259. TEXT ·fromMont(SB), NOSPLIT, $0-8
  260. // the algorithm is described here
  261. // https://hackmd.io/@zkteam/modular_multiplication
  262. // when y = 1 we have:
  263. // for i=0 to N-1
  264. // t[i] = x[i]
  265. // for i=0 to N-1
  266. // m := t[0]*q'[0] mod W
  267. // C,_ := t[0] + m*q[0]
  268. // for j=1 to N-1
  269. // (C,t[j-1]) := t[j] + m*q[j] + C
  270. // t[N-1] = C
  271. MOVQ res+0(FP), DX
  272. MOVQ 0(DX), R14
  273. MOVQ 8(DX), R15
  274. MOVQ 16(DX), CX
  275. MOVQ 24(DX), BX
  276. XORQ DX, DX
  277. // m := t[0]*q'[0] mod W
  278. MOVQ qInv0<>(SB), DX
  279. IMULQ R14, DX
  280. XORQ AX, AX
  281. // C,_ := t[0] + m*q[0]
  282. MULXQ q<>+0(SB), AX, BP
  283. ADCXQ R14, AX
  284. MOVQ BP, R14
  285. // (C,t[0]) := t[1] + m*q[1] + C
  286. ADCXQ R15, R14
  287. MULXQ q<>+8(SB), AX, R15
  288. ADOXQ AX, R14
  289. // (C,t[1]) := t[2] + m*q[2] + C
  290. ADCXQ CX, R15
  291. MULXQ q<>+16(SB), AX, CX
  292. ADOXQ AX, R15
  293. // (C,t[2]) := t[3] + m*q[3] + C
  294. ADCXQ BX, CX
  295. MULXQ q<>+24(SB), AX, BX
  296. ADOXQ AX, CX
  297. MOVQ $0, AX
  298. ADCXQ AX, BX
  299. ADOXQ AX, BX
  300. XORQ DX, DX
  301. // m := t[0]*q'[0] mod W
  302. MOVQ qInv0<>(SB), DX
  303. IMULQ R14, DX
  304. XORQ AX, AX
  305. // C,_ := t[0] + m*q[0]
  306. MULXQ q<>+0(SB), AX, BP
  307. ADCXQ R14, AX
  308. MOVQ BP, R14
  309. // (C,t[0]) := t[1] + m*q[1] + C
  310. ADCXQ R15, R14
  311. MULXQ q<>+8(SB), AX, R15
  312. ADOXQ AX, R14
  313. // (C,t[1]) := t[2] + m*q[2] + C
  314. ADCXQ CX, R15
  315. MULXQ q<>+16(SB), AX, CX
  316. ADOXQ AX, R15
  317. // (C,t[2]) := t[3] + m*q[3] + C
  318. ADCXQ BX, CX
  319. MULXQ q<>+24(SB), AX, BX
  320. ADOXQ AX, CX
  321. MOVQ $0, AX
  322. ADCXQ AX, BX
  323. ADOXQ AX, BX
  324. XORQ DX, DX
  325. // m := t[0]*q'[0] mod W
  326. MOVQ qInv0<>(SB), DX
  327. IMULQ R14, DX
  328. XORQ AX, AX
  329. // C,_ := t[0] + m*q[0]
  330. MULXQ q<>+0(SB), AX, BP
  331. ADCXQ R14, AX
  332. MOVQ BP, R14
  333. // (C,t[0]) := t[1] + m*q[1] + C
  334. ADCXQ R15, R14
  335. MULXQ q<>+8(SB), AX, R15
  336. ADOXQ AX, R14
  337. // (C,t[1]) := t[2] + m*q[2] + C
  338. ADCXQ CX, R15
  339. MULXQ q<>+16(SB), AX, CX
  340. ADOXQ AX, R15
  341. // (C,t[2]) := t[3] + m*q[3] + C
  342. ADCXQ BX, CX
  343. MULXQ q<>+24(SB), AX, BX
  344. ADOXQ AX, CX
  345. MOVQ $0, AX
  346. ADCXQ AX, BX
  347. ADOXQ AX, BX
  348. XORQ DX, DX
  349. // m := t[0]*q'[0] mod W
  350. MOVQ qInv0<>(SB), DX
  351. IMULQ R14, DX
  352. XORQ AX, AX
  353. // C,_ := t[0] + m*q[0]
  354. MULXQ q<>+0(SB), AX, BP
  355. ADCXQ R14, AX
  356. MOVQ BP, R14
  357. // (C,t[0]) := t[1] + m*q[1] + C
  358. ADCXQ R15, R14
  359. MULXQ q<>+8(SB), AX, R15
  360. ADOXQ AX, R14
  361. // (C,t[1]) := t[2] + m*q[2] + C
  362. ADCXQ CX, R15
  363. MULXQ q<>+16(SB), AX, CX
  364. ADOXQ AX, R15
  365. // (C,t[2]) := t[3] + m*q[3] + C
  366. ADCXQ BX, CX
  367. MULXQ q<>+24(SB), AX, BX
  368. ADOXQ AX, CX
  369. MOVQ $0, AX
  370. ADCXQ AX, BX
  371. ADOXQ AX, BX
  372. // reduce element(R14,R15,CX,BX) using temp registers (SI,DI,R8,R9)
  373. REDUCE(R14,R15,CX,BX,SI,DI,R8,R9)
  374. MOVQ res+0(FP), AX
  375. MOVQ R14, 0(AX)
  376. MOVQ R15, 8(AX)
  377. MOVQ CX, 16(AX)
  378. MOVQ BX, 24(AX)
  379. RET