You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

834 lines
21 KiB

  1. // Copyright 2013 The Go Authors. All rights reserved.
  2. // Use of this source code is governed by a BSD-style
  3. // license that can be found in the LICENSE file.
  4. // We have an implementation in amd64 assembly so this code is only run on
  5. // non-amd64 platforms. The amd64 assembly does not support gccgo.
  6. // +build !amd64 gccgo appengine
  7. package curve25519
  8. import (
  9. "encoding/binary"
  10. )
  11. // This code is a port of the public domain, "ref10" implementation of
  12. // curve25519 from SUPERCOP 20130419 by D. J. Bernstein.
// fieldElement represents an element of the field GF(2^255 - 19). An element
// t, entries t[0]...t[9], represents the integer
//
//	t[0] + 2^26 t[1] + 2^51 t[2] + 2^77 t[3] + 2^102 t[4] + ... + 2^230 t[9],
//
// so the limb weights advance alternately by 26 and 25 bits. Limbs are signed,
// and the bounds on each t[i] vary depending on context; see the pre- and
// postconditions documented on the individual operations.
type fieldElement [10]int32
  18. func feZero(fe *fieldElement) {
  19. for i := range fe {
  20. fe[i] = 0
  21. }
  22. }
  23. func feOne(fe *fieldElement) {
  24. feZero(fe)
  25. fe[0] = 1
  26. }
  27. func feAdd(dst, a, b *fieldElement) {
  28. for i := range dst {
  29. dst[i] = a[i] + b[i]
  30. }
  31. }
  32. func feSub(dst, a, b *fieldElement) {
  33. for i := range dst {
  34. dst[i] = a[i] - b[i]
  35. }
  36. }
  37. func feCopy(dst, src *fieldElement) {
  38. for i := range dst {
  39. dst[i] = src[i]
  40. }
  41. }
  42. // feCSwap replaces (f,g) with (g,f) if b == 1; replaces (f,g) with (f,g) if b == 0.
  43. //
  44. // Preconditions: b in {0,1}.
  45. func feCSwap(f, g *fieldElement, b int32) {
  46. b = -b
  47. for i := range f {
  48. t := b & (f[i] ^ g[i])
  49. f[i] ^= t
  50. g[i] ^= t
  51. }
  52. }
  53. // load3 reads a 24-bit, little-endian value from in.
  54. func load3(in []byte) int64 {
  55. var r int64
  56. r = int64(in[0])
  57. r |= int64(in[1]) << 8
  58. r |= int64(in[2]) << 16
  59. return r
  60. }
  61. // load4 reads a 32-bit, little-endian value from in.
  62. func load4(in []byte) int64 {
  63. return int64(binary.LittleEndian.Uint32(in))
  64. }
  65. func feFromBytes(dst *fieldElement, src *[32]byte) {
  66. h0 := load4(src[:])
  67. h1 := load3(src[4:]) << 6
  68. h2 := load3(src[7:]) << 5
  69. h3 := load3(src[10:]) << 3
  70. h4 := load3(src[13:]) << 2
  71. h5 := load4(src[16:])
  72. h6 := load3(src[20:]) << 7
  73. h7 := load3(src[23:]) << 5
  74. h8 := load3(src[26:]) << 4
  75. h9 := load3(src[29:]) << 2
  76. var carry [10]int64
  77. carry[9] = (h9 + 1<<24) >> 25
  78. h0 += carry[9] * 19
  79. h9 -= carry[9] << 25
  80. carry[1] = (h1 + 1<<24) >> 25
  81. h2 += carry[1]
  82. h1 -= carry[1] << 25
  83. carry[3] = (h3 + 1<<24) >> 25
  84. h4 += carry[3]
  85. h3 -= carry[3] << 25
  86. carry[5] = (h5 + 1<<24) >> 25
  87. h6 += carry[5]
  88. h5 -= carry[5] << 25
  89. carry[7] = (h7 + 1<<24) >> 25
  90. h8 += carry[7]
  91. h7 -= carry[7] << 25
  92. carry[0] = (h0 + 1<<25) >> 26
  93. h1 += carry[0]
  94. h0 -= carry[0] << 26
  95. carry[2] = (h2 + 1<<25) >> 26
  96. h3 += carry[2]
  97. h2 -= carry[2] << 26
  98. carry[4] = (h4 + 1<<25) >> 26
  99. h5 += carry[4]
  100. h4 -= carry[4] << 26
  101. carry[6] = (h6 + 1<<25) >> 26
  102. h7 += carry[6]
  103. h6 -= carry[6] << 26
  104. carry[8] = (h8 + 1<<25) >> 26
  105. h9 += carry[8]
  106. h8 -= carry[8] << 26
  107. dst[0] = int32(h0)
  108. dst[1] = int32(h1)
  109. dst[2] = int32(h2)
  110. dst[3] = int32(h3)
  111. dst[4] = int32(h4)
  112. dst[5] = int32(h5)
  113. dst[6] = int32(h6)
  114. dst[7] = int32(h7)
  115. dst[8] = int32(h8)
  116. dst[9] = int32(h9)
  117. }
  118. // feToBytes marshals h to s.
  119. // Preconditions:
  120. // |h| bounded by 1.1*2^25,1.1*2^24,1.1*2^25,1.1*2^24,etc.
  121. //
  122. // Write p=2^255-19; q=floor(h/p).
  123. // Basic claim: q = floor(2^(-255)(h + 19 2^(-25)h9 + 2^(-1))).
  124. //
  125. // Proof:
  126. // Have |h|<=p so |q|<=1 so |19^2 2^(-255) q|<1/4.
  127. // Also have |h-2^230 h9|<2^230 so |19 2^(-255)(h-2^230 h9)|<1/4.
  128. //
  129. // Write y=2^(-1)-19^2 2^(-255)q-19 2^(-255)(h-2^230 h9).
  130. // Then 0<y<1.
  131. //
  132. // Write r=h-pq.
  133. // Have 0<=r<=p-1=2^255-20.
  134. // Thus 0<=r+19(2^-255)r<r+19(2^-255)2^255<=2^255-1.
  135. //
  136. // Write x=r+19(2^-255)r+y.
  137. // Then 0<x<2^255 so floor(2^(-255)x) = 0 so floor(q+2^(-255)x) = q.
  138. //
  139. // Have q+2^(-255)x = 2^(-255)(h + 19 2^(-25) h9 + 2^(-1))
  140. // so floor(2^(-255)(h + 19 2^(-25) h9 + 2^(-1))) = q.
  141. func feToBytes(s *[32]byte, h *fieldElement) {
  142. var carry [10]int32
  143. q := (19*h[9] + (1 << 24)) >> 25
  144. q = (h[0] + q) >> 26
  145. q = (h[1] + q) >> 25
  146. q = (h[2] + q) >> 26
  147. q = (h[3] + q) >> 25
  148. q = (h[4] + q) >> 26
  149. q = (h[5] + q) >> 25
  150. q = (h[6] + q) >> 26
  151. q = (h[7] + q) >> 25
  152. q = (h[8] + q) >> 26
  153. q = (h[9] + q) >> 25
  154. // Goal: Output h-(2^255-19)q, which is between 0 and 2^255-20.
  155. h[0] += 19 * q
  156. // Goal: Output h-2^255 q, which is between 0 and 2^255-20.
  157. carry[0] = h[0] >> 26
  158. h[1] += carry[0]
  159. h[0] -= carry[0] << 26
  160. carry[1] = h[1] >> 25
  161. h[2] += carry[1]
  162. h[1] -= carry[1] << 25
  163. carry[2] = h[2] >> 26
  164. h[3] += carry[2]
  165. h[2] -= carry[2] << 26
  166. carry[3] = h[3] >> 25
  167. h[4] += carry[3]
  168. h[3] -= carry[3] << 25
  169. carry[4] = h[4] >> 26
  170. h[5] += carry[4]
  171. h[4] -= carry[4] << 26
  172. carry[5] = h[5] >> 25
  173. h[6] += carry[5]
  174. h[5] -= carry[5] << 25
  175. carry[6] = h[6] >> 26
  176. h[7] += carry[6]
  177. h[6] -= carry[6] << 26
  178. carry[7] = h[7] >> 25
  179. h[8] += carry[7]
  180. h[7] -= carry[7] << 25
  181. carry[8] = h[8] >> 26
  182. h[9] += carry[8]
  183. h[8] -= carry[8] << 26
  184. carry[9] = h[9] >> 25
  185. h[9] -= carry[9] << 25
  186. // h10 = carry9
  187. // Goal: Output h[0]+...+2^255 h10-2^255 q, which is between 0 and 2^255-20.
  188. // Have h[0]+...+2^230 h[9] between 0 and 2^255-1;
  189. // evidently 2^255 h10-2^255 q = 0.
  190. // Goal: Output h[0]+...+2^230 h[9].
  191. s[0] = byte(h[0] >> 0)
  192. s[1] = byte(h[0] >> 8)
  193. s[2] = byte(h[0] >> 16)
  194. s[3] = byte((h[0] >> 24) | (h[1] << 2))
  195. s[4] = byte(h[1] >> 6)
  196. s[5] = byte(h[1] >> 14)
  197. s[6] = byte((h[1] >> 22) | (h[2] << 3))
  198. s[7] = byte(h[2] >> 5)
  199. s[8] = byte(h[2] >> 13)
  200. s[9] = byte((h[2] >> 21) | (h[3] << 5))
  201. s[10] = byte(h[3] >> 3)
  202. s[11] = byte(h[3] >> 11)
  203. s[12] = byte((h[3] >> 19) | (h[4] << 6))
  204. s[13] = byte(h[4] >> 2)
  205. s[14] = byte(h[4] >> 10)
  206. s[15] = byte(h[4] >> 18)
  207. s[16] = byte(h[5] >> 0)
  208. s[17] = byte(h[5] >> 8)
  209. s[18] = byte(h[5] >> 16)
  210. s[19] = byte((h[5] >> 24) | (h[6] << 1))
  211. s[20] = byte(h[6] >> 7)
  212. s[21] = byte(h[6] >> 15)
  213. s[22] = byte((h[6] >> 23) | (h[7] << 3))
  214. s[23] = byte(h[7] >> 5)
  215. s[24] = byte(h[7] >> 13)
  216. s[25] = byte((h[7] >> 21) | (h[8] << 4))
  217. s[26] = byte(h[8] >> 4)
  218. s[27] = byte(h[8] >> 12)
  219. s[28] = byte((h[8] >> 20) | (h[9] << 6))
  220. s[29] = byte(h[9] >> 2)
  221. s[30] = byte(h[9] >> 10)
  222. s[31] = byte(h[9] >> 18)
  223. }
  224. // feMul calculates h = f * g
  225. // Can overlap h with f or g.
  226. //
  227. // Preconditions:
  228. // |f| bounded by 1.1*2^26,1.1*2^25,1.1*2^26,1.1*2^25,etc.
  229. // |g| bounded by 1.1*2^26,1.1*2^25,1.1*2^26,1.1*2^25,etc.
  230. //
  231. // Postconditions:
  232. // |h| bounded by 1.1*2^25,1.1*2^24,1.1*2^25,1.1*2^24,etc.
  233. //
  234. // Notes on implementation strategy:
  235. //
  236. // Using schoolbook multiplication.
  237. // Karatsuba would save a little in some cost models.
  238. //
  239. // Most multiplications by 2 and 19 are 32-bit precomputations;
  240. // cheaper than 64-bit postcomputations.
  241. //
  242. // There is one remaining multiplication by 19 in the carry chain;
  243. // one *19 precomputation can be merged into this,
  244. // but the resulting data flow is considerably less clean.
  245. //
  246. // There are 12 carries below.
  247. // 10 of them are 2-way parallelizable and vectorizable.
  248. // Can get away with 11 carries, but then data flow is much deeper.
  249. //
  250. // With tighter constraints on inputs can squeeze carries into int32.
  251. func feMul(h, f, g *fieldElement) {
  252. f0 := f[0]
  253. f1 := f[1]
  254. f2 := f[2]
  255. f3 := f[3]
  256. f4 := f[4]
  257. f5 := f[5]
  258. f6 := f[6]
  259. f7 := f[7]
  260. f8 := f[8]
  261. f9 := f[9]
  262. g0 := g[0]
  263. g1 := g[1]
  264. g2 := g[2]
  265. g3 := g[3]
  266. g4 := g[4]
  267. g5 := g[5]
  268. g6 := g[6]
  269. g7 := g[7]
  270. g8 := g[8]
  271. g9 := g[9]
  272. g1_19 := 19 * g1 // 1.4*2^29
  273. g2_19 := 19 * g2 // 1.4*2^30; still ok
  274. g3_19 := 19 * g3
  275. g4_19 := 19 * g4
  276. g5_19 := 19 * g5
  277. g6_19 := 19 * g6
  278. g7_19 := 19 * g7
  279. g8_19 := 19 * g8
  280. g9_19 := 19 * g9
  281. f1_2 := 2 * f1
  282. f3_2 := 2 * f3
  283. f5_2 := 2 * f5
  284. f7_2 := 2 * f7
  285. f9_2 := 2 * f9
  286. f0g0 := int64(f0) * int64(g0)
  287. f0g1 := int64(f0) * int64(g1)
  288. f0g2 := int64(f0) * int64(g2)
  289. f0g3 := int64(f0) * int64(g3)
  290. f0g4 := int64(f0) * int64(g4)
  291. f0g5 := int64(f0) * int64(g5)
  292. f0g6 := int64(f0) * int64(g6)
  293. f0g7 := int64(f0) * int64(g7)
  294. f0g8 := int64(f0) * int64(g8)
  295. f0g9 := int64(f0) * int64(g9)
  296. f1g0 := int64(f1) * int64(g0)
  297. f1g1_2 := int64(f1_2) * int64(g1)
  298. f1g2 := int64(f1) * int64(g2)
  299. f1g3_2 := int64(f1_2) * int64(g3)
  300. f1g4 := int64(f1) * int64(g4)
  301. f1g5_2 := int64(f1_2) * int64(g5)
  302. f1g6 := int64(f1) * int64(g6)
  303. f1g7_2 := int64(f1_2) * int64(g7)
  304. f1g8 := int64(f1) * int64(g8)
  305. f1g9_38 := int64(f1_2) * int64(g9_19)
  306. f2g0 := int64(f2) * int64(g0)
  307. f2g1 := int64(f2) * int64(g1)
  308. f2g2 := int64(f2) * int64(g2)
  309. f2g3 := int64(f2) * int64(g3)
  310. f2g4 := int64(f2) * int64(g4)
  311. f2g5 := int64(f2) * int64(g5)
  312. f2g6 := int64(f2) * int64(g6)
  313. f2g7 := int64(f2) * int64(g7)
  314. f2g8_19 := int64(f2) * int64(g8_19)
  315. f2g9_19 := int64(f2) * int64(g9_19)
  316. f3g0 := int64(f3) * int64(g0)
  317. f3g1_2 := int64(f3_2) * int64(g1)
  318. f3g2 := int64(f3) * int64(g2)
  319. f3g3_2 := int64(f3_2) * int64(g3)
  320. f3g4 := int64(f3) * int64(g4)
  321. f3g5_2 := int64(f3_2) * int64(g5)
  322. f3g6 := int64(f3) * int64(g6)
  323. f3g7_38 := int64(f3_2) * int64(g7_19)
  324. f3g8_19 := int64(f3) * int64(g8_19)
  325. f3g9_38 := int64(f3_2) * int64(g9_19)
  326. f4g0 := int64(f4) * int64(g0)
  327. f4g1 := int64(f4) * int64(g1)
  328. f4g2 := int64(f4) * int64(g2)
  329. f4g3 := int64(f4) * int64(g3)
  330. f4g4 := int64(f4) * int64(g4)
  331. f4g5 := int64(f4) * int64(g5)
  332. f4g6_19 := int64(f4) * int64(g6_19)
  333. f4g7_19 := int64(f4) * int64(g7_19)
  334. f4g8_19 := int64(f4) * int64(g8_19)
  335. f4g9_19 := int64(f4) * int64(g9_19)
  336. f5g0 := int64(f5) * int64(g0)
  337. f5g1_2 := int64(f5_2) * int64(g1)
  338. f5g2 := int64(f5) * int64(g2)
  339. f5g3_2 := int64(f5_2) * int64(g3)
  340. f5g4 := int64(f5) * int64(g4)
  341. f5g5_38 := int64(f5_2) * int64(g5_19)
  342. f5g6_19 := int64(f5) * int64(g6_19)
  343. f5g7_38 := int64(f5_2) * int64(g7_19)
  344. f5g8_19 := int64(f5) * int64(g8_19)
  345. f5g9_38 := int64(f5_2) * int64(g9_19)
  346. f6g0 := int64(f6) * int64(g0)
  347. f6g1 := int64(f6) * int64(g1)
  348. f6g2 := int64(f6) * int64(g2)
  349. f6g3 := int64(f6) * int64(g3)
  350. f6g4_19 := int64(f6) * int64(g4_19)
  351. f6g5_19 := int64(f6) * int64(g5_19)
  352. f6g6_19 := int64(f6) * int64(g6_19)
  353. f6g7_19 := int64(f6) * int64(g7_19)
  354. f6g8_19 := int64(f6) * int64(g8_19)
  355. f6g9_19 := int64(f6) * int64(g9_19)
  356. f7g0 := int64(f7) * int64(g0)
  357. f7g1_2 := int64(f7_2) * int64(g1)
  358. f7g2 := int64(f7) * int64(g2)
  359. f7g3_38 := int64(f7_2) * int64(g3_19)
  360. f7g4_19 := int64(f7) * int64(g4_19)
  361. f7g5_38 := int64(f7_2) * int64(g5_19)
  362. f7g6_19 := int64(f7) * int64(g6_19)
  363. f7g7_38 := int64(f7_2) * int64(g7_19)
  364. f7g8_19 := int64(f7) * int64(g8_19)
  365. f7g9_38 := int64(f7_2) * int64(g9_19)
  366. f8g0 := int64(f8) * int64(g0)
  367. f8g1 := int64(f8) * int64(g1)
  368. f8g2_19 := int64(f8) * int64(g2_19)
  369. f8g3_19 := int64(f8) * int64(g3_19)
  370. f8g4_19 := int64(f8) * int64(g4_19)
  371. f8g5_19 := int64(f8) * int64(g5_19)
  372. f8g6_19 := int64(f8) * int64(g6_19)
  373. f8g7_19 := int64(f8) * int64(g7_19)
  374. f8g8_19 := int64(f8) * int64(g8_19)
  375. f8g9_19 := int64(f8) * int64(g9_19)
  376. f9g0 := int64(f9) * int64(g0)
  377. f9g1_38 := int64(f9_2) * int64(g1_19)
  378. f9g2_19 := int64(f9) * int64(g2_19)
  379. f9g3_38 := int64(f9_2) * int64(g3_19)
  380. f9g4_19 := int64(f9) * int64(g4_19)
  381. f9g5_38 := int64(f9_2) * int64(g5_19)
  382. f9g6_19 := int64(f9) * int64(g6_19)
  383. f9g7_38 := int64(f9_2) * int64(g7_19)
  384. f9g8_19 := int64(f9) * int64(g8_19)
  385. f9g9_38 := int64(f9_2) * int64(g9_19)
  386. h0 := f0g0 + f1g9_38 + f2g8_19 + f3g7_38 + f4g6_19 + f5g5_38 + f6g4_19 + f7g3_38 + f8g2_19 + f9g1_38
  387. h1 := f0g1 + f1g0 + f2g9_19 + f3g8_19 + f4g7_19 + f5g6_19 + f6g5_19 + f7g4_19 + f8g3_19 + f9g2_19
  388. h2 := f0g2 + f1g1_2 + f2g0 + f3g9_38 + f4g8_19 + f5g7_38 + f6g6_19 + f7g5_38 + f8g4_19 + f9g3_38
  389. h3 := f0g3 + f1g2 + f2g1 + f3g0 + f4g9_19 + f5g8_19 + f6g7_19 + f7g6_19 + f8g5_19 + f9g4_19
  390. h4 := f0g4 + f1g3_2 + f2g2 + f3g1_2 + f4g0 + f5g9_38 + f6g8_19 + f7g7_38 + f8g6_19 + f9g5_38
  391. h5 := f0g5 + f1g4 + f2g3 + f3g2 + f4g1 + f5g0 + f6g9_19 + f7g8_19 + f8g7_19 + f9g6_19
  392. h6 := f0g6 + f1g5_2 + f2g4 + f3g3_2 + f4g2 + f5g1_2 + f6g0 + f7g9_38 + f8g8_19 + f9g7_38
  393. h7 := f0g7 + f1g6 + f2g5 + f3g4 + f4g3 + f5g2 + f6g1 + f7g0 + f8g9_19 + f9g8_19
  394. h8 := f0g8 + f1g7_2 + f2g6 + f3g5_2 + f4g4 + f5g3_2 + f6g2 + f7g1_2 + f8g0 + f9g9_38
  395. h9 := f0g9 + f1g8 + f2g7 + f3g6 + f4g5 + f5g4 + f6g3 + f7g2 + f8g1 + f9g0
  396. var carry [10]int64
  397. // |h0| <= (1.1*1.1*2^52*(1+19+19+19+19)+1.1*1.1*2^50*(38+38+38+38+38))
  398. // i.e. |h0| <= 1.2*2^59; narrower ranges for h2, h4, h6, h8
  399. // |h1| <= (1.1*1.1*2^51*(1+1+19+19+19+19+19+19+19+19))
  400. // i.e. |h1| <= 1.5*2^58; narrower ranges for h3, h5, h7, h9
  401. carry[0] = (h0 + (1 << 25)) >> 26
  402. h1 += carry[0]
  403. h0 -= carry[0] << 26
  404. carry[4] = (h4 + (1 << 25)) >> 26
  405. h5 += carry[4]
  406. h4 -= carry[4] << 26
  407. // |h0| <= 2^25
  408. // |h4| <= 2^25
  409. // |h1| <= 1.51*2^58
  410. // |h5| <= 1.51*2^58
  411. carry[1] = (h1 + (1 << 24)) >> 25
  412. h2 += carry[1]
  413. h1 -= carry[1] << 25
  414. carry[5] = (h5 + (1 << 24)) >> 25
  415. h6 += carry[5]
  416. h5 -= carry[5] << 25
  417. // |h1| <= 2^24; from now on fits into int32
  418. // |h5| <= 2^24; from now on fits into int32
  419. // |h2| <= 1.21*2^59
  420. // |h6| <= 1.21*2^59
  421. carry[2] = (h2 + (1 << 25)) >> 26
  422. h3 += carry[2]
  423. h2 -= carry[2] << 26
  424. carry[6] = (h6 + (1 << 25)) >> 26
  425. h7 += carry[6]
  426. h6 -= carry[6] << 26
  427. // |h2| <= 2^25; from now on fits into int32 unchanged
  428. // |h6| <= 2^25; from now on fits into int32 unchanged
  429. // |h3| <= 1.51*2^58
  430. // |h7| <= 1.51*2^58
  431. carry[3] = (h3 + (1 << 24)) >> 25
  432. h4 += carry[3]
  433. h3 -= carry[3] << 25
  434. carry[7] = (h7 + (1 << 24)) >> 25
  435. h8 += carry[7]
  436. h7 -= carry[7] << 25
  437. // |h3| <= 2^24; from now on fits into int32 unchanged
  438. // |h7| <= 2^24; from now on fits into int32 unchanged
  439. // |h4| <= 1.52*2^33
  440. // |h8| <= 1.52*2^33
  441. carry[4] = (h4 + (1 << 25)) >> 26
  442. h5 += carry[4]
  443. h4 -= carry[4] << 26
  444. carry[8] = (h8 + (1 << 25)) >> 26
  445. h9 += carry[8]
  446. h8 -= carry[8] << 26
  447. // |h4| <= 2^25; from now on fits into int32 unchanged
  448. // |h8| <= 2^25; from now on fits into int32 unchanged
  449. // |h5| <= 1.01*2^24
  450. // |h9| <= 1.51*2^58
  451. carry[9] = (h9 + (1 << 24)) >> 25
  452. h0 += carry[9] * 19
  453. h9 -= carry[9] << 25
  454. // |h9| <= 2^24; from now on fits into int32 unchanged
  455. // |h0| <= 1.8*2^37
  456. carry[0] = (h0 + (1 << 25)) >> 26
  457. h1 += carry[0]
  458. h0 -= carry[0] << 26
  459. // |h0| <= 2^25; from now on fits into int32 unchanged
  460. // |h1| <= 1.01*2^24
  461. h[0] = int32(h0)
  462. h[1] = int32(h1)
  463. h[2] = int32(h2)
  464. h[3] = int32(h3)
  465. h[4] = int32(h4)
  466. h[5] = int32(h5)
  467. h[6] = int32(h6)
  468. h[7] = int32(h7)
  469. h[8] = int32(h8)
  470. h[9] = int32(h9)
  471. }
  472. // feSquare calculates h = f*f. Can overlap h with f.
  473. //
  474. // Preconditions:
  475. // |f| bounded by 1.1*2^26,1.1*2^25,1.1*2^26,1.1*2^25,etc.
  476. //
  477. // Postconditions:
  478. // |h| bounded by 1.1*2^25,1.1*2^24,1.1*2^25,1.1*2^24,etc.
  479. func feSquare(h, f *fieldElement) {
  480. f0 := f[0]
  481. f1 := f[1]
  482. f2 := f[2]
  483. f3 := f[3]
  484. f4 := f[4]
  485. f5 := f[5]
  486. f6 := f[6]
  487. f7 := f[7]
  488. f8 := f[8]
  489. f9 := f[9]
  490. f0_2 := 2 * f0
  491. f1_2 := 2 * f1
  492. f2_2 := 2 * f2
  493. f3_2 := 2 * f3
  494. f4_2 := 2 * f4
  495. f5_2 := 2 * f5
  496. f6_2 := 2 * f6
  497. f7_2 := 2 * f7
  498. f5_38 := 38 * f5 // 1.31*2^30
  499. f6_19 := 19 * f6 // 1.31*2^30
  500. f7_38 := 38 * f7 // 1.31*2^30
  501. f8_19 := 19 * f8 // 1.31*2^30
  502. f9_38 := 38 * f9 // 1.31*2^30
  503. f0f0 := int64(f0) * int64(f0)
  504. f0f1_2 := int64(f0_2) * int64(f1)
  505. f0f2_2 := int64(f0_2) * int64(f2)
  506. f0f3_2 := int64(f0_2) * int64(f3)
  507. f0f4_2 := int64(f0_2) * int64(f4)
  508. f0f5_2 := int64(f0_2) * int64(f5)
  509. f0f6_2 := int64(f0_2) * int64(f6)
  510. f0f7_2 := int64(f0_2) * int64(f7)
  511. f0f8_2 := int64(f0_2) * int64(f8)
  512. f0f9_2 := int64(f0_2) * int64(f9)
  513. f1f1_2 := int64(f1_2) * int64(f1)
  514. f1f2_2 := int64(f1_2) * int64(f2)
  515. f1f3_4 := int64(f1_2) * int64(f3_2)
  516. f1f4_2 := int64(f1_2) * int64(f4)
  517. f1f5_4 := int64(f1_2) * int64(f5_2)
  518. f1f6_2 := int64(f1_2) * int64(f6)
  519. f1f7_4 := int64(f1_2) * int64(f7_2)
  520. f1f8_2 := int64(f1_2) * int64(f8)
  521. f1f9_76 := int64(f1_2) * int64(f9_38)
  522. f2f2 := int64(f2) * int64(f2)
  523. f2f3_2 := int64(f2_2) * int64(f3)
  524. f2f4_2 := int64(f2_2) * int64(f4)
  525. f2f5_2 := int64(f2_2) * int64(f5)
  526. f2f6_2 := int64(f2_2) * int64(f6)
  527. f2f7_2 := int64(f2_2) * int64(f7)
  528. f2f8_38 := int64(f2_2) * int64(f8_19)
  529. f2f9_38 := int64(f2) * int64(f9_38)
  530. f3f3_2 := int64(f3_2) * int64(f3)
  531. f3f4_2 := int64(f3_2) * int64(f4)
  532. f3f5_4 := int64(f3_2) * int64(f5_2)
  533. f3f6_2 := int64(f3_2) * int64(f6)
  534. f3f7_76 := int64(f3_2) * int64(f7_38)
  535. f3f8_38 := int64(f3_2) * int64(f8_19)
  536. f3f9_76 := int64(f3_2) * int64(f9_38)
  537. f4f4 := int64(f4) * int64(f4)
  538. f4f5_2 := int64(f4_2) * int64(f5)
  539. f4f6_38 := int64(f4_2) * int64(f6_19)
  540. f4f7_38 := int64(f4) * int64(f7_38)
  541. f4f8_38 := int64(f4_2) * int64(f8_19)
  542. f4f9_38 := int64(f4) * int64(f9_38)
  543. f5f5_38 := int64(f5) * int64(f5_38)
  544. f5f6_38 := int64(f5_2) * int64(f6_19)
  545. f5f7_76 := int64(f5_2) * int64(f7_38)
  546. f5f8_38 := int64(f5_2) * int64(f8_19)
  547. f5f9_76 := int64(f5_2) * int64(f9_38)
  548. f6f6_19 := int64(f6) * int64(f6_19)
  549. f6f7_38 := int64(f6) * int64(f7_38)
  550. f6f8_38 := int64(f6_2) * int64(f8_19)
  551. f6f9_38 := int64(f6) * int64(f9_38)
  552. f7f7_38 := int64(f7) * int64(f7_38)
  553. f7f8_38 := int64(f7_2) * int64(f8_19)
  554. f7f9_76 := int64(f7_2) * int64(f9_38)
  555. f8f8_19 := int64(f8) * int64(f8_19)
  556. f8f9_38 := int64(f8) * int64(f9_38)
  557. f9f9_38 := int64(f9) * int64(f9_38)
  558. h0 := f0f0 + f1f9_76 + f2f8_38 + f3f7_76 + f4f6_38 + f5f5_38
  559. h1 := f0f1_2 + f2f9_38 + f3f8_38 + f4f7_38 + f5f6_38
  560. h2 := f0f2_2 + f1f1_2 + f3f9_76 + f4f8_38 + f5f7_76 + f6f6_19
  561. h3 := f0f3_2 + f1f2_2 + f4f9_38 + f5f8_38 + f6f7_38
  562. h4 := f0f4_2 + f1f3_4 + f2f2 + f5f9_76 + f6f8_38 + f7f7_38
  563. h5 := f0f5_2 + f1f4_2 + f2f3_2 + f6f9_38 + f7f8_38
  564. h6 := f0f6_2 + f1f5_4 + f2f4_2 + f3f3_2 + f7f9_76 + f8f8_19
  565. h7 := f0f7_2 + f1f6_2 + f2f5_2 + f3f4_2 + f8f9_38
  566. h8 := f0f8_2 + f1f7_4 + f2f6_2 + f3f5_4 + f4f4 + f9f9_38
  567. h9 := f0f9_2 + f1f8_2 + f2f7_2 + f3f6_2 + f4f5_2
  568. var carry [10]int64
  569. carry[0] = (h0 + (1 << 25)) >> 26
  570. h1 += carry[0]
  571. h0 -= carry[0] << 26
  572. carry[4] = (h4 + (1 << 25)) >> 26
  573. h5 += carry[4]
  574. h4 -= carry[4] << 26
  575. carry[1] = (h1 + (1 << 24)) >> 25
  576. h2 += carry[1]
  577. h1 -= carry[1] << 25
  578. carry[5] = (h5 + (1 << 24)) >> 25
  579. h6 += carry[5]
  580. h5 -= carry[5] << 25
  581. carry[2] = (h2 + (1 << 25)) >> 26
  582. h3 += carry[2]
  583. h2 -= carry[2] << 26
  584. carry[6] = (h6 + (1 << 25)) >> 26
  585. h7 += carry[6]
  586. h6 -= carry[6] << 26
  587. carry[3] = (h3 + (1 << 24)) >> 25
  588. h4 += carry[3]
  589. h3 -= carry[3] << 25
  590. carry[7] = (h7 + (1 << 24)) >> 25
  591. h8 += carry[7]
  592. h7 -= carry[7] << 25
  593. carry[4] = (h4 + (1 << 25)) >> 26
  594. h5 += carry[4]
  595. h4 -= carry[4] << 26
  596. carry[8] = (h8 + (1 << 25)) >> 26
  597. h9 += carry[8]
  598. h8 -= carry[8] << 26
  599. carry[9] = (h9 + (1 << 24)) >> 25
  600. h0 += carry[9] * 19
  601. h9 -= carry[9] << 25
  602. carry[0] = (h0 + (1 << 25)) >> 26
  603. h1 += carry[0]
  604. h0 -= carry[0] << 26
  605. h[0] = int32(h0)
  606. h[1] = int32(h1)
  607. h[2] = int32(h2)
  608. h[3] = int32(h3)
  609. h[4] = int32(h4)
  610. h[5] = int32(h5)
  611. h[6] = int32(h6)
  612. h[7] = int32(h7)
  613. h[8] = int32(h8)
  614. h[9] = int32(h9)
  615. }
  616. // feMul121666 calculates h = f * 121666. Can overlap h with f.
  617. //
  618. // Preconditions:
  619. // |f| bounded by 1.1*2^26,1.1*2^25,1.1*2^26,1.1*2^25,etc.
  620. //
  621. // Postconditions:
  622. // |h| bounded by 1.1*2^25,1.1*2^24,1.1*2^25,1.1*2^24,etc.
  623. func feMul121666(h, f *fieldElement) {
  624. h0 := int64(f[0]) * 121666
  625. h1 := int64(f[1]) * 121666
  626. h2 := int64(f[2]) * 121666
  627. h3 := int64(f[3]) * 121666
  628. h4 := int64(f[4]) * 121666
  629. h5 := int64(f[5]) * 121666
  630. h6 := int64(f[6]) * 121666
  631. h7 := int64(f[7]) * 121666
  632. h8 := int64(f[8]) * 121666
  633. h9 := int64(f[9]) * 121666
  634. var carry [10]int64
  635. carry[9] = (h9 + (1 << 24)) >> 25
  636. h0 += carry[9] * 19
  637. h9 -= carry[9] << 25
  638. carry[1] = (h1 + (1 << 24)) >> 25
  639. h2 += carry[1]
  640. h1 -= carry[1] << 25
  641. carry[3] = (h3 + (1 << 24)) >> 25
  642. h4 += carry[3]
  643. h3 -= carry[3] << 25
  644. carry[5] = (h5 + (1 << 24)) >> 25
  645. h6 += carry[5]
  646. h5 -= carry[5] << 25
  647. carry[7] = (h7 + (1 << 24)) >> 25
  648. h8 += carry[7]
  649. h7 -= carry[7] << 25
  650. carry[0] = (h0 + (1 << 25)) >> 26
  651. h1 += carry[0]
  652. h0 -= carry[0] << 26
  653. carry[2] = (h2 + (1 << 25)) >> 26
  654. h3 += carry[2]
  655. h2 -= carry[2] << 26
  656. carry[4] = (h4 + (1 << 25)) >> 26
  657. h5 += carry[4]
  658. h4 -= carry[4] << 26
  659. carry[6] = (h6 + (1 << 25)) >> 26
  660. h7 += carry[6]
  661. h6 -= carry[6] << 26
  662. carry[8] = (h8 + (1 << 25)) >> 26
  663. h9 += carry[8]
  664. h8 -= carry[8] << 26
  665. h[0] = int32(h0)
  666. h[1] = int32(h1)
  667. h[2] = int32(h2)
  668. h[3] = int32(h3)
  669. h[4] = int32(h4)
  670. h[5] = int32(h5)
  671. h[6] = int32(h6)
  672. h[7] = int32(h7)
  673. h[8] = int32(h8)
  674. h[9] = int32(h9)
  675. }
  676. // feInvert sets out = z^-1.
  677. func feInvert(out, z *fieldElement) {
  678. var t0, t1, t2, t3 fieldElement
  679. var i int
  680. feSquare(&t0, z)
  681. for i = 1; i < 1; i++ {
  682. feSquare(&t0, &t0)
  683. }
  684. feSquare(&t1, &t0)
  685. for i = 1; i < 2; i++ {
  686. feSquare(&t1, &t1)
  687. }
  688. feMul(&t1, z, &t1)
  689. feMul(&t0, &t0, &t1)
  690. feSquare(&t2, &t0)
  691. for i = 1; i < 1; i++ {
  692. feSquare(&t2, &t2)
  693. }
  694. feMul(&t1, &t1, &t2)
  695. feSquare(&t2, &t1)
  696. for i = 1; i < 5; i++ {
  697. feSquare(&t2, &t2)
  698. }
  699. feMul(&t1, &t2, &t1)
  700. feSquare(&t2, &t1)
  701. for i = 1; i < 10; i++ {
  702. feSquare(&t2, &t2)
  703. }
  704. feMul(&t2, &t2, &t1)
  705. feSquare(&t3, &t2)
  706. for i = 1; i < 20; i++ {
  707. feSquare(&t3, &t3)
  708. }
  709. feMul(&t2, &t3, &t2)
  710. feSquare(&t2, &t2)
  711. for i = 1; i < 10; i++ {
  712. feSquare(&t2, &t2)
  713. }
  714. feMul(&t1, &t2, &t1)
  715. feSquare(&t2, &t1)
  716. for i = 1; i < 50; i++ {
  717. feSquare(&t2, &t2)
  718. }
  719. feMul(&t2, &t2, &t1)
  720. feSquare(&t3, &t2)
  721. for i = 1; i < 100; i++ {
  722. feSquare(&t3, &t3)
  723. }
  724. feMul(&t2, &t3, &t2)
  725. feSquare(&t2, &t2)
  726. for i = 1; i < 50; i++ {
  727. feSquare(&t2, &t2)
  728. }
  729. feMul(&t1, &t2, &t1)
  730. feSquare(&t1, &t1)
  731. for i = 1; i < 5; i++ {
  732. feSquare(&t1, &t1)
  733. }
  734. feMul(out, &t1, &t0)
  735. }
  736. func scalarMult(out, in, base *[32]byte) {
  737. var e [32]byte
  738. copy(e[:], in[:])
  739. e[0] &= 248
  740. e[31] &= 127
  741. e[31] |= 64
  742. var x1, x2, z2, x3, z3, tmp0, tmp1 fieldElement
  743. feFromBytes(&x1, base)
  744. feOne(&x2)
  745. feCopy(&x3, &x1)
  746. feOne(&z3)
  747. swap := int32(0)
  748. for pos := 254; pos >= 0; pos-- {
  749. b := e[pos/8] >> uint(pos&7)
  750. b &= 1
  751. swap ^= int32(b)
  752. feCSwap(&x2, &x3, swap)
  753. feCSwap(&z2, &z3, swap)
  754. swap = int32(b)
  755. feSub(&tmp0, &x3, &z3)
  756. feSub(&tmp1, &x2, &z2)
  757. feAdd(&x2, &x2, &z2)
  758. feAdd(&z2, &x3, &z3)
  759. feMul(&z3, &tmp0, &x2)
  760. feMul(&z2, &z2, &tmp1)
  761. feSquare(&tmp0, &tmp1)
  762. feSquare(&tmp1, &x2)
  763. feAdd(&x3, &z3, &z2)
  764. feSub(&z2, &z3, &z2)
  765. feMul(&x2, &tmp1, &tmp0)
  766. feSub(&tmp1, &tmp1, &tmp0)
  767. feSquare(&z2, &z2)
  768. feMul121666(&z3, &tmp1)
  769. feSquare(&x3, &x3)
  770. feAdd(&tmp0, &tmp0, &z3)
  771. feMul(&z3, &x1, &z2)
  772. feMul(&z2, &tmp1, &tmp0)
  773. }
  774. feCSwap(&x2, &x3, swap)
  775. feCSwap(&z2, &z3, swap)
  776. feInvert(&z2, &z2)
  777. feMul(&x2, &x2, &z2)
  778. feToBytes(out, &x2)
  779. }