diff --git a/c/buildasm/add.asm.ejs b/c/buildasm/add.asm.ejs new file mode 100644 index 0000000..292bcb0 --- /dev/null +++ b/c/buildasm/add.asm.ejs @@ -0,0 +1,245 @@ +<% function addS1S2() { %> + xor rdx, rdx + mov edx, eax + add edx, ecx + jo add_manageOverflow ; rsi already is the 64bits result + + mov [rdi], rdx ; not necessary to adjust so just save and return + ret + +add_manageOverflow: ; Do the operation in 64 bits + push rsi + movsx rsi, eax + movsx rdx, ecx + add rsi, rdx + call rawCopyS2L + pop rsi + ret +<% } %> + + + +<% function addL1S2() { %> + add rsi, 8 + movsx rdx, ecx + add rdi, 8 + cmp rdx, 0 + <% const rawAddLabel = global.tmpLabel() %> + jns <%= rawAddLabel %> + neg rdx + call rawSubLS + sub rdi, 8 + sub rsi, 8 + ret +<%= rawAddLabel %>: + call rawAddLS + sub rdi, 8 + sub rsi, 8 + ret + +<% } %> + +<% function addS1L2() { %> + lea rsi, [rdx + 8] + movsx rdx, eax + add rdi, 8 + cmp rdx, 0 + <% const rawAddLabel = global.tmpLabel() %> + jns <%= rawAddLabel %> + neg rdx + call rawSubLS + sub rdi, 8 + sub rsi, 8 + ret +<%= rawAddLabel %>: + call rawAddLS + sub rdi, 8 + sub rsi, 8 + ret +<% } %> + +<% function addL1L2() { %> + add rdi, 8 + add rsi, 8 + add rdx, 8 + call rawAddLL + sub rdi, 8 + sub rsi, 8 + ret +<% } %> + +;;;;;;;;;;;;;;;;;;;;;; +; add +;;;;;;;;;;;;;;;;;;;;;; +; Adds two elements of any kind +; Params: +; rsi <= Pointer to element 1 +; rdx <= Pointer to element 2 +; rdi <= Pointer to result +; Modified Registers: +; r8, r9, 10, r11, rax, rcx +;;;;;;;;;;;;;;;;;;;;;; +<%=name%>_add: + mov rax, [rsi] + mov rcx, [rdx] + bt rax, 63 ; Check if is short first operand + jc add_l1 + bt rcx, 63 ; Check if is short second operand + jc add_s1l2 + +add_s1s2: ; Both operands are short +<%= addS1S2() %> +add_l1: + bt rcx, 63 ; Check if is short second operand + jc add_l1l2 + +;;;;;;;; +add_l1s2: + bt rax, 62 ; check if montgomery first + jc add_l1ms2 +add_l1ns2: +<%= global.setTypeDest("0x80"); %> +<%= addL1S2(); %> + +add_l1ms2: + bt rcx, 62 ; check if montgomery second + jc add_l1ms2m +add_l1ms2n: +<%= global.setTypeDest("0xC0"); %> +<%= global.toMont_b() %> +<%= addL1L2() %> + +add_l1ms2m: +<%= global.setTypeDest("0xC0"); %> +<%= addL1L2() %> + + +;;;;;;;; +add_s1l2: + bt rcx, 62 ; check if montgomery first + jc add_s1l2m +add_s1l2n: +<%= global.setTypeDest("0x80"); %> +<%= addS1L2(); %> + +add_s1l2m: + bt rax, 62 ; check if montgomery second + jc add_s1ml2m +add_s1nl2m: +<%= global.setTypeDest("0xC0"); %> +<%= global.toMont_a() %> +<%= addL1L2() %> + +add_s1ml2m: +<%= global.setTypeDest("0xC0"); %> +<%= addL1L2() %> + +;;;; +add_l1l2: + bt rax, 62 ; check if montgomery first + jc add_l1ml2 +add_l1nl2: + bt rcx, 62 ; check if montgomery second + jc add_l1nl2m +add_l1nl2n: +<%= global.setTypeDest("0x80"); %> +<%= addL1L2() %> + +add_l1nl2m: +<%= global.setTypeDest("0xC0"); %> +<%= global.toMont_a(); %> +<%= addL1L2() %> + +add_l1ml2: + bt rcx, 62 ; check if montgomery seconf + jc add_l1ml2m +add_l1ml2n: +<%= global.setTypeDest("0xC0"); %> +<%= global.toMont_b(); %> +<%= addL1L2() %> + +add_l1ml2m: +<%= global.setTypeDest("0xC0"); %> +<%= addL1L2() %> + + + +;;;;;;;;;;;;;;;;;;;;;; +; rawAddLL +;;;;;;;;;;;;;;;;;;;;;; +; Adds two elements of type long +; Params: +; rsi <= Pointer to the long data of element 1 +; rdx <= Pointer to the long data of element 2 +; rdi <= Pointer to the long data of result +; Modified Registers: +; rax +;;;;;;;;;;;;;;;;;;;;;; +rawAddLL: + ; Add component by component with carry +<% for (let i=0; i + mov rax, [rsi + <%=i*8%>] + <%= i==0 ? "add" : "adc" %> rax, [rdx + <%=i*8%>] + mov [rdi + <%=i*8%>], rax +<% } %> + jc rawAddLL_sq ; if overflow, substract q + + ; Compare with q +<% for (let i=0; i +<% if (i>0) { %> + mov rax, [rdi + <%= (n64-i-1)*8 %>] +<% } %> + cmp rax, [q + <%= (n64-i-1)*8 %>] + jc rawAddLL_done ; q is bigget so done. + jnz rawAddLL_sq ; q is lower +<% } %> + ; If equal substract q +rawAddLL_sq: +<% for (let i=0; i + mov rax, [q + <%=i*8%>] + <%= i==0 ? "sub" : "sbb" %> [rdi + <%=i*8%>], rax +<% } %> +rawAddLL_done: + ret + + +;;;;;;;;;;;;;;;;;;;;;; +; rawAddLS +;;;;;;;;;;;;;;;;;;;;;; +; Adds two elements of type long +; Params: +; rdi <= Pointer to the long data of result +; rsi <= Pointer to the long data of element 1 +; rdx <= Value to be added +;;;;;;;;;;;;;;;;;;;;;; +rawAddLS: + ; Add component by component with carry + + add rdx, [rsi] + mov [rdi] ,rdx +<% for (let i=1; i + mov rdx, 0 + adc rdx, [rsi + <%=i*8%>] + mov [rdi + <%=i*8%>], rdx +<% } %> + jc rawAddLS_sq ; if overflow, substract q + + ; Compare with q +<% for (let i=0; i + mov rax, [rdi + <%= (n64-i-1)*8 %>] + cmp rax, [q + <%= (n64-i-1)*8 %>] + jc rawAddLS_done ; q is bigget so done. + jnz rawAddLS_sq ; q is lower +<% } %> + ; If equal substract q +rawAddLS_sq: +<% for (let i=0; i + mov rax, [q + <%=i*8%>] + <%= i==0 ? "sub" : "sbb" %> [rdi + <%=i*8%>], rax +<% } %> +rawAddLS_done: + ret + + + + diff --git a/c/buildasm/binops.asm.ejs b/c/buildasm/binops.asm.ejs new file mode 100644 index 0000000..1ca5068 --- /dev/null +++ b/c/buildasm/binops.asm.ejs @@ -0,0 +1,178 @@ +<% function binOpS1S2(op) { %> + cmp r8d, 0 + <% const s1s2_solveNeg = global.tmpLabel() %> + js <%=s1s2_solveNeg%> + + cmp r9d, 0 + js <%=s1s2_solveNeg%> + xor rdx, rdx ; both ops are positive so do the op and return + mov edx, r8d + <%=op%> edx, r9d + mov [rdi], rdx ; not necessary to adjust so just save and return + ret + +<%=s1s2_solveNeg%>: +<%= global.setTypeDest("0x80"); %> +<%= global.toLong_b() %> +<%= global.toLong_a() %> +<%= binOpL1L2(op) %> + + +<% } %> + +<% function binOpS1L2(op) { %> + cmp r8d, 0 + <% const s1l2_solveNeg = global.tmpLabel() %> + js <%=s1l2_solveNeg%> + movsx rax, r8d + <%=op%> rax, [rdx +8] + mov [rdi+8], rax +<% for (let i=1; i + xor rax, rax + <%=op%> rax, [rdx + <%= (i*8)+8 %>] +<% if (i== n64-1) { %> + and rax, [lboMask] +<% } %> + mov [rdi + <%= (i*8)+8 %> ], rax +<% } %> + ret + +<%=s1l2_solveNeg%>: +<%= global.toLong_a() %> +<%= global.setTypeDest("0x80"); %> +<%= binOpL1L2(op) %> + +<% } %> + +<% function binOpL1S2(op) { %> + cmp r9d, 0 + <% const l1s2_solveNeg = global.tmpLabel() %> + js <%=l1s2_solveNeg%> + movsx rax, r9d + <%=op%> rax, [rsi +8] + mov [rdi+8], rax +<% for (let i=1; i + xor rax, rax + <%=op%> rax, [rsi + <%= (i*8)+8 %>]; +<% if (i== n64-1) { %> + and rax, [lboMask] ; +<% } %> + mov [rdi + <%= (i*8)+8 %> ], rax; +<% } %> + ret + +<%=l1s2_solveNeg%>: +<%= global.toLong_b() %> +<%= global.setTypeDest("0x80"); %> +<%= binOpL1L2(op) %> + +<% } %> + +<% function binOpL1L2(op) { %> +<% for (let i=0; i + mov rax, [rsi + <%= (i*8)+8 %>] + <%=op%> rax, [rdx + <%= (i*8)+8 %>] +<% if (i== n64-1) { %> + and rax, [lboMask] +<% } %> + mov [rdi + <%= (i*8)+8 %> ], rax +<% } %> + ret +<% } %> + + + + +<% function binOp(op) { %> +;;;;;;;;;;;;;;;;;;;;;; +; <%= op %> +;;;;;;;;;;;;;;;;;;;;;; +; Adds two elements of any kind +; Params: +; rsi <= Pointer to element 1 +; rdx <= Pointer to element 2 +; rdi <= Pointer to result +; Modified Registers: +; r8, r9, 10, r11, rax, rcx +;;;;;;;;;;;;;;;;;;;;;; +<%=name%>_b<%=op%>: + mov r8, [rsi] + mov r9, [rdx] + bt r8, 63 ; Check if is short first operand + jc <%=op%>_l1 + bt r9, 63 ; Check if is short second operand + jc <%=op%>_s1l2 + +<%=op%>_s1s2: +<%= binOpS1S2(op) %> + + +<%=op%>_l1: + bt r9, 63 ; Check if is short second operand + jc <%=op%>_l1l2 + + +<%=op%>_l1s2: + bt r8, 62 ; check if montgomery first + jc <%=op%>_l1ms2 +<%=op%>_l1ns2: +<%= global.setTypeDest("0x80"); %> +<%= binOpL1S2(op) %> + +<%=op%>_l1ms2: +<%= global.setTypeDest("0x80"); %> + push r9 ; r9 is used in montgomery so we need to save it +<%= global.fromMont_a() %> + pop r9 +<%= binOpL1S2(op) %> + + +<%=op%>_s1l2: + bt r9, 62 ; check if montgomery first + jc <%=op%>_s1l2m +<%=op%>_s1l2n: +<%= global.setTypeDest("0x80"); %> +<%= binOpS1L2(op) %> + +<%=op%>_s1l2m: +<%= global.setTypeDest("0x80"); %> + push r8 ; r8 is used in montgomery so we need to save it +<%= global.fromMont_b() %> + pop r8 +<%= binOpS1L2(op) %> + + +<%=op%>_l1l2: + bt r8, 62 ; check if montgomery first + jc <%=op%>_l1ml2 + bt r9, 62 ; check if montgomery first + jc <%=op%>_l1nl2m +<%=op%>_l1nl2n: +<%= global.setTypeDest("0x80"); %> +<%= binOpL1L2(op) %> + +<%=op%>_l1nl2m: +<%= global.setTypeDest("0x80"); %> +<%= global.fromMont_b() %> +<%= binOpL1L2(op) %> + +<%=op%>_l1ml2: + bt r9, 62 ; check if montgomery first + jc <%=op%>_l1ml2m +<%=op%>_l1ml2n: +<%= global.setTypeDest("0x80"); %> +<%= global.fromMont_a() %> +<%= binOpL1L2(op) %> + +<%=op%>_l1ml2m: +<%= global.setTypeDest("0x80"); %> +<%= global.fromMont_a() %> +<%= global.fromMont_b() %> +<%= binOpL1L2(op) %> +<% } %> + +<%= binOp("and") %> +<%= binOp("or") %> +<%= binOp("xor") %> + + diff --git a/c/buildasm/buildzqfield.js b/c/buildasm/buildzqfield.js new file mode 100644 index 0000000..9a2aeaf --- /dev/null +++ b/c/buildasm/buildzqfield.js @@ -0,0 +1,71 @@ +const bigInt=require("big-integer"); +const path = require("path"); +const util = require("util"); +const renderFile = util.promisify(require("ejs").renderFile); + +const runningAsScript = !module.parent; + + +class ZqBuilder { + constructor(q, name) { + const self = this; + this.q=bigInt(q); + this.n64 = Math.floor((this.q.bitLength() - 1) / 64)+1; + this.name = name; + this.bigInt = bigInt; + this.lastTmp=0; + this.global = {}; + this.global.tmpLabel = function() { + self.lastTmp++; + return "tmp"+self.lastTmp; + }; + } + + constantElement(v) { + let S = ""; + const mask = bigInt("FFFFFFFFFFFFFFFF", 16); + for (let i=0; i0) S = S+","; + let shex = v.shiftRight(i*64).and(mask).toString(16); + while (shex.length <16) shex = "0" + shex; + S = S + "0x" + shex; + } + return S; + } + +} + +async function buildField(q, name) { + const builder = new ZqBuilder(q, name); + + const asm = await renderFile(path.join(__dirname, "fr.asm.ejs"), builder); + const c = await renderFile(path.join(__dirname, "fr.c.ejs"), builder); + const h = await renderFile(path.join(__dirname, "fr.h.ejs"), builder); + + return {asm: asm, h: h, c: c}; +} + +if (runningAsScript) { + const fs = require("fs"); + var argv = require("yargs") + .usage("Usage: $0 -q [primeNum] -n [name] -oc [out .c file] -oh [out .h file]") + .demandOption(["q","n"]) + .alias("q", "prime") + .alias("n", "name") + .argv; + + const q = bigInt(argv.q); + + const asmFileName = (argv.oc) ? argv.oc : argv.name.toLowerCase() + ".asm"; + const hFileName = (argv.oc) ? argv.oc : argv.name.toLowerCase() + ".h"; + const cFileName = (argv.oc) ? argv.oc : argv.name.toLowerCase() + ".c"; + + buildField(q, argv.name).then( (res) => { + fs.writeFileSync(asmFileName, res.asm, "utf8"); + fs.writeFileSync(hFileName, res.h, "utf8"); + fs.writeFileSync(cFileName, res.c, "utf8"); + }); + +} else { + module.exports = buildField; +} diff --git a/c/buildasm/buildzqfieldtester.js b/c/buildasm/buildzqfieldtester.js new file mode 100644 index 0000000..8fdb1d6 --- /dev/null +++ b/c/buildasm/buildzqfieldtester.js @@ -0,0 +1,75 @@ +const chai = require("chai"); +const assert = chai.assert; + +const fs = require("fs"); +var tmp = require("tmp-promise"); +const path = require("path"); +const util = require("util"); +const exec = util.promisify(require("child_process").exec); + +const BuildZqField = require("./buildzqfield"); + +module.exports = testField; + +async function testField(prime, test) { + tmp.setGracefulCleanup(); + + const dir = await tmp.dir({prefix: "circom_", unsafeCleanup: true }); + + const source = await BuildZqField(prime, "Fr"); + + // console.log(dir.path); + + await fs.promises.writeFile(path.join(dir.path, "fr.asm"), source.asm, "utf8"); + await fs.promises.writeFile(path.join(dir.path, "fr.h"), source.h, "utf8"); + await fs.promises.writeFile(path.join(dir.path, "fr.c"), source.c, "utf8"); + + await exec(`cp ${path.join(__dirname, "tester.cpp")} ${dir.path}`); + + await exec("nasm -fmacho64 --prefix _ " + + ` ${path.join(dir.path, "fr.asm")}` + ); + + await exec("g++" + + ` ${path.join(dir.path, "tester.cpp")}` + + ` ${path.join(dir.path, "fr.o")}` + + ` ${path.join(dir.path, "fr.c")}` + + ` -o ${path.join(dir.path, "tester")}` + + " -lgmp" + ); + + const inLines = []; + for (let i=0; i${path.join(dir.path, "out.tst")}`); + + const res = await fs.promises.readFile(path.join(dir.path, "out.tst"), "utf8"); + const resLines = res.split("\n"); + + for (let i=0; i + mov qword [rdi], 1 + add rsp, <%= (n64+1)*8 %> + ret +<% } %> + +<% function retZero() { %> + mov qword [rdi], 0 + add rsp, <%= (n64+1)*8 %> + ret +<% } %> + +<% function cmpLong(op, eq) { %> + +<% + if (eq==true) { + if (["leq","geq"].indexOf(op) >= 0) retOne(); + if (["lt","gt"].indexOf(op) >= 0) retZero(); + } +%> + + +<% const label_gt = global.tmpLabel() %> +<% const label_lt = global.tmpLabel() %> +<% for (let i=n64-1; i>=0; i--) { %> + mov rax, [rsp + <%= 8+(i*8) %>] + cmp [half + <%= (i*8) %>], rax ; comare with (q-1)/2 + jc <%=label_lt%> ; half e1-e2 is neg => e1 < e2 + jnz <%=label_gt%> ; half>rax => e1 -e2 is pos => e1 > e2 +<% } %> + ; half == rax => e1-e2 is pos => e1 > e2 +<%=label_gt%>: +<% if (["geq","gt"].indexOf(op) >= 0) retOne(); else retZero(); %> +<%=label_lt%>: +<% if (["leq","lt"].indexOf(op) >= 0) retOne(); else retZero(); %> +<% } // cmpLong%> + +<% function cmpOp(op) { %> +;;;;;;;;;;;;;;;;;;;;;; +; <%= op %> +;;;;;;;;;;;;;;;;;;;;;; +; Adds two elements of any kind +; Params: +; rsi <= Pointer to element 1 +; rdx <= Pointer to element 2 +; rdi <= Pointer to result can be zero or one. +; Modified Registers: +; r8, r9, 10, r11, rax, rcx +;;;;;;;;;;;;;;;;;;;;;; +<%=name%>_<%=op%>: + sub rsp, <%= (n64+1)*8 %> ; Save space for the result of the substraction + push rdi ; Save rdi + lea rdi, [rsp+8] ; We pushed rdi so we need to add 8 + call <%=name%>_sub ; Do a substraction + call <%=name%>_toNormal ; Convert it to normal + pop rdi + + mov rax, [rsp] ; We already poped do no need to add 8 + bt rax, 63 ; check is result is long + jc <%=op%>_longCmp + +<%=op%>_shortCmp: + cmp eax, 0 + je <%=op%>_s_eq + js <%=op%>_s_lt +<%=op%>_s_gt: +<% if (["geq","gt", "neq"].indexOf(op) >= 0) retOne(); else retZero(); %> +<%=op%>_s_lt: +<% if (["leq","lt", "neq"].indexOf(op) >= 0) retOne(); else retZero(); %> +<%=op%>_s_eq: +<% if (["eq","geq", "leq"].indexOf(op) >= 0) retOne(); else retZero(); %> + +<%=op%>_longCmp: + +<% for (let i=n64-1; i>=0; i--) { %> + cmp qword [rsp + <%= 8+(i*8) %>], 0 + jnz <%=op%>_neq +<% } %> +<%=op%>_eq: +<% if (op == "eq") { + retOne(); + } else if (op == "neq") { + retZero(); + } else { + cmpLong(op, true); + } +%> +<%=op%>_neq: +<% if (op == "neq") { + retOne(); + } else if (op == "eq") { + retZero(); + } else { + cmpLong(op, false); + } +%> + + +<% } %> + +<%= cmpOp("eq") %> +<%= cmpOp("neq") %> +<%= cmpOp("lt") %> +<%= cmpOp("gt") %> +<%= cmpOp("leq") %> +<%= cmpOp("geq") %> + diff --git a/c/buildasm/copy.asm.ejs b/c/buildasm/copy.asm.ejs new file mode 100644 index 0000000..d8e3abe --- /dev/null +++ b/c/buildasm/copy.asm.ejs @@ -0,0 +1,39 @@ + +;;;;;;;;;;;;;;;;;;;;;; +; rawCopyS2L +;;;;;;;;;;;;;;;;;;;;;; +; Convert a 64 bit integer to a long format field element +; Params: +; rsi <= the integer +; rdi <= Pointer to the overwritted element +; +; Nidified registers: +; rax +;;;;;;;;;;;;;;;;;;;;;;; + +rawCopyS2L: + mov al, 0x80 + shl rax, 56 + mov [rdi], rax ; set the result to LONG normal + + cmp rsi, 0 + js u64toLong_adjust_neg + + mov [rdi + 8], rsi + xor rax, rax +<% for (let i=1; i + mov [rdi + <%= 8+i*8 %>], rax +<% } %> + ret + +u64toLong_adjust_neg: + add rsi, [q] ; Set the first digit + mov [rdi + 8], rsi ; + + mov rsi, -1 ; all ones +<% for (let i=1; i + mov rax, rsi ; Add to q + adc rax, [q + <%= i*8 %> ] + mov [rdi + <%= (i+1)*8 %>], rax +<% } %> + ret diff --git a/c/buildasm/fr.asm b/c/buildasm/fr.asm new file mode 100644 index 0000000..201a730 --- /dev/null +++ b/c/buildasm/fr.asm @@ -0,0 +1,4854 @@ + + + global Fr_add + global Fr_sub + global Fr_neg + global Fr_mul + global Fr_band + global Fr_bor + global Fr_bxor + global Fr_eq + global Fr_neq + global Fr_lt + global Fr_gt + global Fr_leq + global Fr_geq + global Fr_toNormal + global Fr_toMontgomery + global Fr_q + DEFAULT REL + + section .text + + + + + + + + + + + + + + + + + + +;;;;;;;;;;;;;;;;;;;;;; +; rawCopyS2L +;;;;;;;;;;;;;;;;;;;;;; +; Convert a 64 bit integer to a long format field element +; Params: +; rsi <= the integer +; rdi <= Pointer to the overwritted element +; +; Nidified registers: +; rax +;;;;;;;;;;;;;;;;;;;;;;; + +rawCopyS2L: + mov al, 0x80 + shl rax, 56 + mov [rdi], rax ; set the result to LONG normal + + cmp rsi, 0 + js u64toLong_adjust_neg + + mov [rdi + 8], rsi + xor rax, rax + + mov [rdi + 16], rax + + mov [rdi + 24], rax + + mov [rdi + 32], rax + + ret + +u64toLong_adjust_neg: + add rsi, [q] ; Set the first digit + mov [rdi + 8], rsi ; + + mov rsi, -1 ; all ones + + mov rax, rsi ; Add to q + adc rax, [q + 8 ] + mov [rdi + 16], rax + + mov rax, rsi ; Add to q + adc rax, [q + 16 ] + mov [rdi + 24], rax + + mov rax, rsi ; Add to q + adc rax, [q + 24 ] + mov [rdi + 32], rax + + ret + + + + + + +;;;;;;;;;;;;;;;;;;;;;; +; rawMontgomeryMul +;;;;;;;;;;;;;;;;;;;;;; +; Multiply two elements in montgomery form +; Params: +; rsi <= Pointer to the long data of element 1 +; rdx <= Pointer to the long data of element 2 +; rdi <= Pointer to the long data of result +; Modified registers: +; r8, r9, 10, r11, rax, rcx +;;;;;;;;;;;;;;;;;;;;;; + +rawMontgomeryMul: + sub rsp, 32 ; Reserve space for ms + mov rcx, rdx ; rdx is needed for multiplications so keep it in cx + mov r11, 0xc2e1f593efffffff ; np + xor r8,r8 + xor r9,r9 + xor r10,r10 + + mov rax, [rsi + 0] + mul qword [rcx + 0] + add r8, rax + adc r9, rdx + adc r10, 0x0 + + + + + + mov rax, r8 + mul r11 + mov [rsp + 0], rax + mul qword [q] + add r8, rax + adc r9, rdx + adc r10, 0x0 + + + + mov rax, [rsi + 0] + mul qword [rcx + 8] + add r9, rax + adc r10, rdx + adc r8, 0x0 + + mov rax, [rsi + 8] + mul qword [rcx + 0] + add r9, rax + adc r10, rdx + adc r8, 0x0 + + + + mov rax, [rsp + 0] + mul qword [q + 8] + add r9, rax + adc r10, rdx + adc r8, 0x0 + + + + mov rax, r9 + mul r11 + mov [rsp + 8], rax + mul qword [q] + add r9, rax + adc r10, rdx + adc r8, 0x0 + + + + mov rax, [rsi + 0] + mul qword [rcx + 16] + add r10, rax + adc r8, rdx + adc r9, 0x0 + + mov rax, [rsi + 8] + mul qword [rcx + 8] + add r10, rax + adc r8, rdx + adc r9, 0x0 + + mov rax, [rsi + 16] + mul qword [rcx + 0] + add r10, rax + adc r8, rdx + adc r9, 0x0 + + + + mov rax, [rsp + 8] + mul qword [q + 8] + add r10, rax + adc r8, rdx + adc r9, 0x0 + + mov rax, [rsp + 0] + mul qword [q + 16] + add r10, rax + adc r8, rdx + adc r9, 0x0 + + + + mov rax, r10 + mul r11 + mov [rsp + 16], rax + mul qword [q] + add r10, rax + adc r8, rdx + adc r9, 0x0 + + + + mov rax, [rsi + 0] + mul qword [rcx + 24] + add r8, rax + adc r9, rdx + adc r10, 0x0 + + mov rax, [rsi + 8] + mul qword [rcx + 16] + add r8, rax + adc r9, rdx + adc r10, 0x0 + + mov rax, [rsi + 16] + mul qword [rcx + 8] + add r8, rax + adc r9, rdx + adc r10, 0x0 + + mov rax, [rsi + 24] + mul qword [rcx + 0] + add r8, rax + adc r9, rdx + adc r10, 0x0 + + + + mov rax, [rsp + 16] + mul qword [q + 8] + add r8, rax + adc r9, rdx + adc r10, 0x0 + + mov rax, [rsp + 8] + mul qword [q + 16] + add r8, rax + adc r9, rdx + adc r10, 0x0 + + mov rax, [rsp + 0] + mul qword [q + 24] + add r8, rax + adc r9, rdx + adc r10, 0x0 + + + + mov rax, r8 + mul r11 + mov [rsp + 24], rax + mul qword [q] + add r8, rax + adc r9, rdx + adc r10, 0x0 + + + + mov rax, [rsi + 8] + mul qword [rcx + 24] + add r9, rax + adc r10, rdx + adc r8, 0x0 + + mov rax, [rsi + 16] + mul qword [rcx + 16] + add r9, rax + adc r10, rdx + adc r8, 0x0 + + mov rax, [rsi + 24] + mul qword [rcx + 8] + add r9, rax + adc r10, rdx + adc r8, 0x0 + + + + mov rax, [rsp + 24] + mul qword [q + 8] + add r9, rax + adc r10, rdx + adc r8, 0x0 + + mov rax, [rsp + 16] + mul qword [q + 16] + add r9, rax + adc r10, rdx + adc r8, 0x0 + + mov rax, [rsp + 8] + mul qword [q + 24] + add r9, rax + adc r10, rdx + adc r8, 0x0 + + + + mov [rdi + 0 ], r9 + xor r9,r9 + + + + mov rax, [rsi + 16] + mul qword [rcx + 24] + add r10, rax + adc r8, rdx + adc r9, 0x0 + + mov rax, [rsi + 24] + mul qword [rcx + 16] + add r10, rax + adc r8, rdx + adc r9, 0x0 + + + + mov rax, [rsp + 24] + mul qword [q + 16] + add r10, rax + adc r8, rdx + adc r9, 0x0 + + mov rax, [rsp + 16] + mul qword [q + 24] + add r10, rax + adc r8, rdx + adc r9, 0x0 + + + + mov [rdi + 8 ], r10 + xor r10,r10 + + + + mov rax, [rsi + 24] + mul qword [rcx + 24] + add r8, rax + adc r9, rdx + adc r10, 0x0 + + + + mov rax, [rsp + 24] + mul qword [q + 24] + add r8, rax + adc r9, rdx + adc r10, 0x0 + + + + mov [rdi + 16 ], r8 + xor r8,r8 + + + + + + + + mov [rdi + 24 ], r9 + xor r9,r9 + + + + test r10, r10 + jnz rawMontgomeryMul_mulM_sq + ; Compare with q + + mov rax, [rdi + 24] + cmp rax, [q + 24] + jc rawMontgomeryMul_mulM_done ; q is bigget so done. + jnz rawMontgomeryMul_mulM_sq ; q is lower + + mov rax, [rdi + 16] + cmp rax, [q + 16] + jc rawMontgomeryMul_mulM_done ; q is bigget so done. + jnz rawMontgomeryMul_mulM_sq ; q is lower + + mov rax, [rdi + 8] + cmp rax, [q + 8] + jc rawMontgomeryMul_mulM_done ; q is bigget so done. + jnz rawMontgomeryMul_mulM_sq ; q is lower + + mov rax, [rdi + 0] + cmp rax, [q + 0] + jc rawMontgomeryMul_mulM_done ; q is bigget so done. + jnz rawMontgomeryMul_mulM_sq ; q is lower + + ; If equal substract q + +rawMontgomeryMul_mulM_sq: + + mov rax, [q + 0] + sub [rdi + 0], rax + + mov rax, [q + 8] + sbb [rdi + 8], rax + + mov rax, [q + 16] + sbb [rdi + 16], rax + + mov rax, [q + 24] + sbb [rdi + 24], rax + + +rawMontgomeryMul_mulM_done: + mov rdx, rcx ; recover rdx to its original place. + add rsp, 32 ; recover rsp + ret + + + +;;;;;;;;;;;;;;;;;;;;;; +; rawMontgomeryMul1 +;;;;;;;;;;;;;;;;;;;;;; +; Multiply two elements in montgomery form +; Params: +; rsi <= Pointer to the long data of element 1 +; rdx <= second operand +; rdi <= Pointer to the long data of result +; Modified registers: +; r8, r9, 10, r11, rax, rcx +;;;;;;;;;;;;;;;;;;;;;; + +rawMontgomeryMul1: + sub rsp, 32 ; Reserve space for ms + mov rcx, rdx ; rdx is needed for multiplications so keep it in cx + mov r11, 0xc2e1f593efffffff ; np + xor r8,r8 + xor r9,r9 + xor r10,r10 + + mov rax, [rsi + 0] + mul rcx + add r8, rax + adc r9, rdx + adc r10, 0x0 + + + + + + mov rax, r8 + mul r11 + mov [rsp + 0], rax + mul qword [q] + add r8, rax + adc r9, rdx + adc r10, 0x0 + + + + mov rax, [rsi + 8] + mul rcx + add r9, rax + adc r10, rdx + adc r8, 0x0 + + + + mov rax, [rsp + 0] + mul qword [q + 8] + add r9, rax + adc r10, rdx + adc r8, 0x0 + + + + mov rax, r9 + mul r11 + mov [rsp + 8], rax + mul qword [q] + add r9, rax + adc r10, rdx + adc r8, 0x0 + + + + mov rax, [rsi + 16] + mul rcx + add r10, rax + adc r8, rdx + adc r9, 0x0 + + + + mov rax, [rsp + 8] + mul qword [q + 8] + add r10, rax + adc r8, rdx + adc r9, 0x0 + + mov rax, [rsp + 0] + mul qword [q + 16] + add r10, rax + adc r8, rdx + adc r9, 0x0 + + + + mov rax, r10 + mul r11 + mov [rsp + 16], rax + mul qword [q] + add r10, rax + adc r8, rdx + adc r9, 0x0 + + + + mov rax, [rsi + 24] + mul rcx + add r8, rax + adc r9, rdx + adc r10, 0x0 + + + + mov rax, [rsp + 16] + mul qword [q + 8] + add r8, rax + adc r9, rdx + adc r10, 0x0 + + mov rax, [rsp + 8] + mul qword [q + 16] + add r8, rax + adc r9, rdx + adc r10, 0x0 + + mov rax, [rsp + 0] + mul qword [q + 24] + add r8, rax + adc r9, rdx + adc r10, 0x0 + + + + mov rax, r8 + mul r11 + mov [rsp + 24], rax + mul qword [q] + add r8, rax + adc r9, rdx + adc r10, 0x0 + + + + + + mov rax, [rsp + 24] + mul qword [q + 8] + add r9, rax + adc r10, rdx + adc r8, 0x0 + + mov rax, [rsp + 16] + mul qword [q + 16] + add r9, rax + adc r10, rdx + adc r8, 0x0 + + mov rax, [rsp + 8] + mul qword [q + 24] + add r9, rax + adc r10, rdx + adc r8, 0x0 + + + + mov [rdi + 0 ], r9 + xor r9,r9 + + + + + + mov rax, [rsp + 24] + mul qword [q + 16] + add r10, rax + adc r8, rdx + adc r9, 0x0 + + mov rax, [rsp + 16] + mul qword [q + 24] + add r10, rax + adc r8, rdx + adc r9, 0x0 + + + + mov [rdi + 8 ], r10 + xor r10,r10 + + + + + + mov rax, [rsp + 24] + mul qword [q + 24] + add r8, rax + adc r9, rdx + adc r10, 0x0 + + + + mov [rdi + 16 ], r8 + xor r8,r8 + + + + + + + + mov [rdi + 24 ], r9 + xor r9,r9 + + + + test r10, r10 + jnz rawMontgomeryMul1_mulM_sq + ; Compare with q + + mov rax, [rdi + 24] + cmp rax, [q + 24] + jc rawMontgomeryMul1_mulM_done ; q is bigget so done. + jnz rawMontgomeryMul1_mulM_sq ; q is lower + + mov rax, [rdi + 16] + cmp rax, [q + 16] + jc rawMontgomeryMul1_mulM_done ; q is bigget so done. + jnz rawMontgomeryMul1_mulM_sq ; q is lower + + mov rax, [rdi + 8] + cmp rax, [q + 8] + jc rawMontgomeryMul1_mulM_done ; q is bigget so done. + jnz rawMontgomeryMul1_mulM_sq ; q is lower + + mov rax, [rdi + 0] + cmp rax, [q + 0] + jc rawMontgomeryMul1_mulM_done ; q is bigget so done. + jnz rawMontgomeryMul1_mulM_sq ; q is lower + + ; If equal substract q + +rawMontgomeryMul1_mulM_sq: + + mov rax, [q + 0] + sub [rdi + 0], rax + + mov rax, [q + 8] + sbb [rdi + 8], rax + + mov rax, [q + 16] + sbb [rdi + 16], rax + + mov rax, [q + 24] + sbb [rdi + 24], rax + + +rawMontgomeryMul1_mulM_done: + mov rdx, rcx ; recover rdx to its original place. + add rsp, 32 ; recover rsp + ret + + + + +;;;;;;;;;;;;;;;;;;;;;; +; rawFromMontgomery +;;;;;;;;;;;;;;;;;;;;;; +; Multiply two elements in montgomery form +; Params: +; rsi <= Pointer to the long data of element 1 +; rdi <= Pointer to the long data of result +; Modified registers: +; r8, r9, 10, r11, rax, rcx +;;;;;;;;;;;;;;;;;;;;;; + +rawFromMontgomery: + sub rsp, 32 ; Reserve space for ms + mov rcx, rdx ; rdx is needed for multiplications so keep it in cx + mov r11, 0xc2e1f593efffffff ; np + xor r8,r8 + xor r9,r9 + xor r10,r10 + + add r8, [rdi + 0] + adc r9, 0x0 + adc r10, 0x0 + + + + + + mov rax, r8 + mul r11 + mov [rsp + 0], rax + mul qword [q] + add r8, rax + adc r9, rdx + adc r10, 0x0 + + + + add r9, [rdi + 8] + adc r10, 0x0 + adc r8, 0x0 + + + + mov rax, [rsp + 0] + mul qword [q + 8] + add r9, rax + adc r10, rdx + adc r8, 0x0 + + + + mov rax, r9 + mul r11 + mov [rsp + 8], rax + mul qword [q] + add r9, rax + adc r10, rdx + adc r8, 0x0 + + + + add r10, [rdi + 16] + adc r8, 0x0 + adc r9, 0x0 + + + + mov rax, [rsp + 8] + mul qword [q + 8] + add r10, rax + adc r8, rdx + adc r9, 0x0 + + mov rax, [rsp + 0] + mul qword [q + 16] + add r10, rax + adc r8, rdx + adc r9, 0x0 + + + + mov rax, r10 + mul r11 + mov [rsp + 16], rax + mul qword [q] + add r10, rax + adc r8, rdx + adc r9, 0x0 + + + + add r8, [rdi + 24] + adc r9, 0x0 + adc r10, 0x0 + + + + mov rax, [rsp + 16] + mul qword [q + 8] + add r8, rax + adc r9, rdx + adc r10, 0x0 + + mov rax, [rsp + 8] + mul qword [q + 16] + add r8, rax + adc r9, rdx + adc r10, 0x0 + + mov rax, [rsp + 0] + mul qword [q + 24] + add r8, rax + adc r9, rdx + adc r10, 0x0 + + + + mov rax, r8 + mul r11 + mov [rsp + 24], rax + mul qword [q] + add r8, rax + adc r9, rdx + adc r10, 0x0 + + + + + + mov rax, [rsp + 24] + mul qword [q + 8] + add r9, rax + adc r10, rdx + adc r8, 0x0 + + mov rax, [rsp + 16] + mul qword [q + 16] + add r9, rax + adc r10, rdx + adc r8, 0x0 + + mov rax, [rsp + 8] + mul qword [q + 24] + add r9, rax + adc r10, rdx + adc r8, 0x0 + + + + mov [rdi + 0 ], r9 + xor r9,r9 + + + + + + mov rax, [rsp + 24] + mul qword [q + 16] + add r10, rax + adc r8, rdx + adc r9, 0x0 + + mov rax, [rsp + 16] + mul qword [q + 24] + add r10, rax + adc r8, rdx + adc r9, 0x0 + + + + mov [rdi + 8 ], r10 + xor r10,r10 + + + + + + mov rax, [rsp + 24] + mul qword [q + 24] + add r8, rax + adc r9, rdx + adc r10, 0x0 + + + + mov [rdi + 16 ], r8 + xor r8,r8 + + + + + + + + mov [rdi + 24 ], r9 + xor r9,r9 + + + + test r10, r10 + jnz rawFromMontgomery_mulM_sq + ; Compare with q + + mov rax, [rdi + 24] + cmp rax, [q + 24] + jc rawFromMontgomery_mulM_done ; q is bigget so done. + jnz rawFromMontgomery_mulM_sq ; q is lower + + mov rax, [rdi + 16] + cmp rax, [q + 16] + jc rawFromMontgomery_mulM_done ; q is bigget so done. + jnz rawFromMontgomery_mulM_sq ; q is lower + + mov rax, [rdi + 8] + cmp rax, [q + 8] + jc rawFromMontgomery_mulM_done ; q is bigget so done. + jnz rawFromMontgomery_mulM_sq ; q is lower + + mov rax, [rdi + 0] + cmp rax, [q + 0] + jc rawFromMontgomery_mulM_done ; q is bigget so done. + jnz rawFromMontgomery_mulM_sq ; q is lower + + ; If equal substract q + +rawFromMontgomery_mulM_sq: + + mov rax, [q + 0] + sub [rdi + 0], rax + + mov rax, [q + 8] + sbb [rdi + 8], rax + + mov rax, [q + 16] + sbb [rdi + 16], rax + + mov rax, [q + 24] + sbb [rdi + 24], rax + + +rawFromMontgomery_mulM_done: + mov rdx, rcx ; recover rdx to its original place. + add rsp, 32 ; recover rsp + ret + + + +;;;;;;;;;;;;;;;;;;;;;; +; toMontgomery +;;;;;;;;;;;;;;;;;;;;;; +; Convert a number to Montgomery +; rdi <= Pointer element to convert +; Modified registers: +; r8, r9, 10, r11, rax, rcx +;;;;;;;;;;;;;;;;;;;; +Fr_toMontgomery: + mov rax, [rdi] + bts rax, 62 ; check if montgomery + jc toMontgomery_doNothing + bts rax, 63 + jc toMontgomeryLong + +toMontgomeryShort: + mov [rdi], rax + add rdi, 8 + push rsi + lea rsi, [R2] + movsx rdx, eax + cmp rdx, 0 + js negMontgomeryShort +posMontgomeryShort: + call rawMontgomeryMul1 + pop rsi + sub rdi, 8 + ret + +negMontgomeryShort: + neg rdx ; Do the multiplication positive and then negate the result. + call rawMontgomeryMul1 + mov rsi, rdi + call rawNegL + pop rsi + sub rdi, 8 + ret + + +toMontgomeryLong: + mov [rdi], rax + add rdi, 8 + push rsi + mov rdx, rdi + lea rsi, [R2] + call rawMontgomeryMul + pop rsi + sub rdi, 8 + +toMontgomery_doNothing: + ret + +;;;;;;;;;;;;;;;;;;;;;; +; toNormal +;;;;;;;;;;;;;;;;;;;;;; +; Convert a number from Montgomery +; rdi <= Pointer element to convert +; Modified registers: +; r8, r9, 10, r11, rax, rcx +;;;;;;;;;;;;;;;;;;;; +Fr_toNormal: + mov rax, [rdi] + btc rax, 62 ; check if montgomery + jnc fromMontgomery_doNothing + bt rax, 63 ; if short, it means it's converted + jnc fromMontgomery_doNothing + +fromMontgomeryLong: + mov [rdi], rax + add rdi, 8 + call rawFromMontgomery + sub rdi, 8 + +fromMontgomery_doNothing: + ret + + + + + + + + + + + + + +;;;;;;;;;;;;;;;;;;;;;; +; add +;;;;;;;;;;;;;;;;;;;;;; +; Adds two elements of any kind +; Params: +; rsi <= Pointer to element 1 +; rdx <= Pointer to element 2 +; rdi <= Pointer to result +; Modified Registers: +; r8, r9, 10, r11, rax, rcx +;;;;;;;;;;;;;;;;;;;;;; +Fr_add: + mov rax, [rsi] + mov rcx, [rdx] + bt rax, 63 ; Check if is short first operand + jc add_l1 + bt rcx, 63 ; Check if is short second operand + jc add_s1l2 + +add_s1s2: ; Both operands are short + + xor rdx, rdx + mov edx, eax + add edx, ecx + jo add_manageOverflow ; rsi already is the 64bits result + + mov [rdi], rdx ; not necessary to adjust so just save and return + ret + +add_manageOverflow: ; Do the operation in 64 bits + push rsi + movsx rsi, eax + movsx rdx, ecx + add rsi, rdx + call rawCopyS2L + pop rsi + ret + +add_l1: + bt rcx, 63 ; Check if is short second operand + jc add_l1l2 + +;;;;;;;; +add_l1s2: + bt rax, 62 ; check if montgomery first + jc add_l1ms2 +add_l1ns2: + mov r11b, 0x80 + shl r11, 56 + mov [rdi], r11 + + add rsi, 8 + movsx rdx, ecx + add rdi, 8 + cmp rdx, 0 + + jns tmp1 + neg rdx + call rawSubLS + sub rdi, 8 + sub rsi, 8 + ret +tmp1: + call rawAddLS + sub rdi, 8 + sub rsi, 8 + ret + + + +add_l1ms2: + bt rcx, 62 ; check if montgomery second + jc add_l1ms2m +add_l1ms2n: + mov r11b, 0xC0 + shl r11, 56 + mov [rdi], r11 + push rdi + mov rdi, rdx + call Fr_toMontgomery + mov rdx, rdi + pop rdi + + add rdi, 8 + add rsi, 8 + add rdx, 8 + call rawAddLL + sub rdi, 8 + sub rsi, 8 + ret + + +add_l1ms2m: + mov r11b, 0xC0 + shl r11, 56 + mov [rdi], r11 + + add rdi, 8 + add rsi, 8 + add rdx, 8 + call rawAddLL + sub rdi, 8 + sub rsi, 8 + ret + + + +;;;;;;;; +add_s1l2: + bt rcx, 62 ; check if montgomery first + jc add_s1l2m +add_s1l2n: + mov r11b, 0x80 + shl r11, 56 + mov [rdi], r11 + + lea rsi, [rdx + 8] + movsx rdx, eax + add rdi, 8 + cmp rdx, 0 + + jns tmp2 + neg rdx + call rawSubLS + sub rdi, 8 + sub rsi, 8 + ret +tmp2: + call rawAddLS + sub rdi, 8 + sub rsi, 8 + ret + + +add_s1l2m: + bt rax, 62 ; check if montgomery second + jc add_s1ml2m +add_s1nl2m: + mov r11b, 0xC0 + shl r11, 56 + mov [rdi], r11 + push rdi + mov rdi, rsi + mov rsi, rdx + call Fr_toMontgomery + mov rdx, rsi + mov rsi, rdi + pop rdi + + add rdi, 8 + add rsi, 8 + add rdx, 8 + call rawAddLL + sub rdi, 8 + sub rsi, 8 + ret + + +add_s1ml2m: + mov r11b, 0xC0 + shl r11, 56 + mov [rdi], r11 + + add rdi, 8 + add rsi, 8 + add rdx, 8 + call rawAddLL + sub rdi, 8 + sub rsi, 8 + ret + + +;;;; +add_l1l2: + bt rax, 62 ; check if montgomery first + jc add_l1ml2 +add_l1nl2: + bt rcx, 62 ; check if montgomery second + jc add_l1nl2m +add_l1nl2n: + mov r11b, 0x80 + shl r11, 56 + mov [rdi], r11 + + add rdi, 8 + add rsi, 8 + add rdx, 8 + call rawAddLL + sub rdi, 8 + sub rsi, 8 + ret + + +add_l1nl2m: + mov r11b, 0xC0 + shl r11, 56 + mov [rdi], r11 + push rdi + mov rdi, rsi + mov rsi, rdx + call Fr_toMontgomery + mov rdx, rsi + mov rsi, rdi + pop rdi + + add rdi, 8 + add rsi, 8 + add rdx, 8 + call rawAddLL + sub rdi, 8 + sub rsi, 8 + ret + + +add_l1ml2: + bt rcx, 62 ; check if montgomery seconf + jc add_l1ml2m +add_l1ml2n: + mov r11b, 0xC0 + shl r11, 56 + mov [rdi], r11 + push rdi + mov rdi, rdx + call Fr_toMontgomery + mov rdx, rdi + pop rdi + + add rdi, 8 + add rsi, 8 + add rdx, 8 + call rawAddLL + sub rdi, 8 + sub rsi, 8 + ret + + +add_l1ml2m: + mov r11b, 0xC0 + shl r11, 56 + mov [rdi], r11 + + add rdi, 8 + add rsi, 8 + add rdx, 8 + call rawAddLL + sub rdi, 8 + sub rsi, 8 + ret + + + + +;;;;;;;;;;;;;;;;;;;;;; +; rawAddLL +;;;;;;;;;;;;;;;;;;;;;; +; Adds two elements of type long +; Params: +; rsi <= Pointer to the long data of element 1 +; rdx <= Pointer to the long data of element 2 +; rdi <= Pointer to the long data of result +; Modified Registers: +; rax +;;;;;;;;;;;;;;;;;;;;;; +rawAddLL: + ; Add component by component with carry + + mov rax, [rsi + 0] + add rax, [rdx + 0] + mov [rdi + 0], rax + + mov rax, [rsi + 8] + adc rax, [rdx + 8] + mov [rdi + 8], rax + + mov rax, [rsi + 16] + adc rax, [rdx + 16] + mov [rdi + 16], rax + + mov rax, [rsi + 24] + adc rax, [rdx + 24] + mov [rdi + 24], rax + + jc rawAddLL_sq ; if overflow, substract q + + ; Compare with q + + + cmp rax, [q + 24] + jc rawAddLL_done ; q is bigget so done. + jnz rawAddLL_sq ; q is lower + + + mov rax, [rdi + 16] + + cmp rax, [q + 16] + jc rawAddLL_done ; q is bigget so done. + jnz rawAddLL_sq ; q is lower + + + mov rax, [rdi + 8] + + cmp rax, [q + 8] + jc rawAddLL_done ; q is bigget so done. + jnz rawAddLL_sq ; q is lower + + + mov rax, [rdi + 0] + + cmp rax, [q + 0] + jc rawAddLL_done ; q is bigget so done. + jnz rawAddLL_sq ; q is lower + + ; If equal substract q +rawAddLL_sq: + + mov rax, [q + 0] + sub [rdi + 0], rax + + mov rax, [q + 8] + sbb [rdi + 8], rax + + mov rax, [q + 16] + sbb [rdi + 16], rax + + mov rax, [q + 24] + sbb [rdi + 24], rax + +rawAddLL_done: + ret + + +;;;;;;;;;;;;;;;;;;;;;; +; rawAddLS +;;;;;;;;;;;;;;;;;;;;;; +; Adds two elements of type long +; Params: +; rdi <= Pointer to the long data of result +; rsi <= Pointer to the long data of element 1 +; rdx <= Value to be added +;;;;;;;;;;;;;;;;;;;;;; +rawAddLS: + ; Add component by component with carry + + add rdx, [rsi] + mov [rdi] ,rdx + + mov rdx, 0 + adc rdx, [rsi + 8] + mov [rdi + 8], rdx + + mov rdx, 0 + adc rdx, [rsi + 16] + mov [rdi + 16], rdx + + mov rdx, 0 + adc rdx, [rsi + 24] + mov [rdi + 24], rdx + + jc rawAddLS_sq ; if overflow, substract q + + ; Compare with q + + mov rax, [rdi + 24] + cmp rax, [q + 24] + jc rawAddLS_done ; q is bigget so done. + jnz rawAddLS_sq ; q is lower + + mov rax, [rdi + 16] + cmp rax, [q + 16] + jc rawAddLS_done ; q is bigget so done. + jnz rawAddLS_sq ; q is lower + + mov rax, [rdi + 8] + cmp rax, [q + 8] + jc rawAddLS_done ; q is bigget so done. + jnz rawAddLS_sq ; q is lower + + mov rax, [rdi + 0] + cmp rax, [q + 0] + jc rawAddLS_done ; q is bigget so done. + jnz rawAddLS_sq ; q is lower + + ; If equal substract q +rawAddLS_sq: + + mov rax, [q + 0] + sub [rdi + 0], rax + + mov rax, [q + 8] + sbb [rdi + 8], rax + + mov rax, [q + 16] + sbb [rdi + 16], rax + + mov rax, [q + 24] + sbb [rdi + 24], rax + +rawAddLS_done: + ret + + + + + + + + + + + + + + + +;;;;;;;;;;;;;;;;;;;;;; +; sub +;;;;;;;;;;;;;;;;;;;;;; +; Substracts two elements of any kind +; Params: +; rsi <= Pointer to element 1 +; rdx <= Pointer to element 2 +; rdi <= Pointer to result +; Modified Registers: +; r8, r9, 10, r11, rax, rcx +;;;;;;;;;;;;;;;;;;;;;; +Fr_sub: + mov rax, [rsi] + mov rcx, [rdx] + bt rax, 63 ; Check if is long first operand + jc sub_l1 + bt rcx, 63 ; Check if is long second operand + jc sub_s1l2 + +sub_s1s2: ; Both operands are short + + xor rdx, rdx + mov edx, eax + sub edx, ecx + jo sub_manageOverflow ; rsi already is the 64bits result + + mov [rdi], rdx ; not necessary to adjust so just save and return + ret + +sub_manageOverflow: ; Do the operation in 64 bits + push rsi + movsx rsi, eax + movsx rdx, ecx + sub rsi, rdx + call rawCopyS2L + pop rsi + ret + +sub_l1: + bt rcx, 63 ; Check if is short second operand + jc sub_l1l2 + +;;;;;;;; +sub_l1s2: + bt rax, 62 ; check if montgomery first + jc sub_l1ms2 +sub_l1ns2: + mov r11b, 0x80 + shl r11, 56 + mov [rdi], r11 + + add rsi, 8 + movsx rdx, ecx + add rdi, 8 + cmp rdx, 0 + + jns tmp3 + neg rdx + call rawAddLS + sub rdi, 8 + sub rsi, 8 + ret +tmp3: + call rawSubLS + sub rdi, 8 + sub rsi, 8 + ret + + +sub_l1ms2: + bt rcx, 62 ; check if montgomery second + jc sub_l1ms2m +sub_l1ms2n: + mov r11b, 0xC0 + shl r11, 56 + mov [rdi], r11 + push rdi + mov rdi, rdx + call Fr_toMontgomery + mov rdx, rdi + pop rdi + + add rdi, 8 + add rsi, 8 + add rdx, 8 + call rawSubLL + sub rdi, 8 + sub rsi, 8 + ret + + +sub_l1ms2m: + mov r11b, 0xC0 + shl r11, 56 + mov [rdi], r11 + + add rdi, 8 + add rsi, 8 + add rdx, 8 + call rawSubLL + sub rdi, 8 + sub rsi, 8 + ret + + + +;;;;;;;; +sub_s1l2: + bt rcx, 62 ; check if montgomery first + jc sub_s1l2m +sub_s1l2n: + mov r11b, 0x80 + shl r11, 56 + mov [rdi], r11 + + cmp eax, 0 + + js tmp4 + + ; First Operand is positive + push rsi + add rdi, 8 + movsx rsi, eax + add rdx, 8 + call rawSubSL + sub rdi, 8 + pop rsi + ret + +tmp4: ; First operand is negative + push rsi + lea rsi, [rdx + 8] + movsx rdx, eax + add rdi, 8 + neg rdx + call rawNegLS + sub rdi, 8 + pop rsi + ret + + +sub_s1l2m: + bt rax, 62 ; check if montgomery second + jc sub_s1ml2m +sub_s1nl2m: + mov r11b, 0xC0 + shl r11, 56 + mov [rdi], r11 + push rdi + mov rdi, rsi + mov rsi, rdx + call Fr_toMontgomery + mov rdx, rsi + mov rsi, rdi + pop rdi + + add rdi, 8 + add rsi, 8 + add rdx, 8 + call rawSubLL + sub rdi, 8 + sub rsi, 8 + ret + + +sub_s1ml2m: + mov r11b, 0xC0 + shl r11, 56 + mov [rdi], r11 + + add rdi, 8 + add rsi, 8 + add rdx, 8 + call rawSubLL + sub rdi, 8 + sub rsi, 8 + ret + + +;;;; +sub_l1l2: + bt rax, 62 ; check if montgomery first + jc sub_l1ml2 +sub_l1nl2: + bt rcx, 62 ; check if montgomery second + jc sub_l1nl2m +sub_l1nl2n: + mov r11b, 0x80 + shl r11, 56 + mov [rdi], r11 + + add rdi, 8 + add rsi, 8 + add rdx, 8 + call rawSubLL + sub rdi, 8 + sub rsi, 8 + ret + + +sub_l1nl2m: + mov r11b, 0xC0 + shl r11, 56 + mov [rdi], r11 + push rdi + mov rdi, rsi + mov rsi, rdx + call Fr_toMontgomery + mov rdx, rsi + mov rsi, rdi + pop rdi + + add rdi, 8 + add rsi, 8 + add rdx, 8 + call rawSubLL + sub rdi, 8 + sub rsi, 8 + ret + + +sub_l1ml2: + bt rcx, 62 ; check if montgomery seconf + jc sub_l1ml2m +sub_l1ml2n: + mov r11b, 0xC0 + shl r11, 56 + mov [rdi], r11 + push rdi + mov rdi, rdx + call Fr_toMontgomery + mov rdx, rdi + pop rdi + + add rdi, 8 + add rsi, 8 + add rdx, 8 + call rawSubLL + sub rdi, 8 + sub rsi, 8 + ret + + +sub_l1ml2m: + mov r11b, 0xC0 + shl r11, 56 + mov [rdi], r11 + + add rdi, 8 + add rsi, 8 + add rdx, 8 + call rawSubLL + sub rdi, 8 + sub rsi, 8 + ret + + + +;;;;;;;;;;;;;;;;;;;;;; +; rawSubLS +;;;;;;;;;;;;;;;;;;;;;; +; Substracts a short element from the long element +; Params: +; rdi <= Pointer to the long data of result +; rsi <= Pointer to the long data of element 1 where will be substracted +; rdx <= Value to be substracted +; [rdi] = [rsi] - rdx +; Modified Registers: +; rax +;;;;;;;;;;;;;;;;;;;;;; +rawSubLS: + ; Substract first digit + + mov rax, [rsi] + sub rax, rdx + mov [rdi] ,rax + mov rdx, 0 + + mov rax, [rsi + 8] + sbb rax, rdx + mov [rdi + 8], rax + + mov rax, [rsi + 16] + sbb rax, rdx + mov [rdi + 16], rax + + mov rax, [rsi + 24] + sbb rax, rdx + mov [rdi + 24], rax + + jnc rawSubLS_done ; if overflow, add q + + ; Add q +rawSubLS_aq: + + mov rax, [q + 0] + add [rdi + 0], rax + + mov rax, [q + 8] + adc [rdi + 8], rax + + mov rax, [q + 16] + adc [rdi + 16], rax + + mov rax, [q + 24] + adc [rdi + 24], rax + +rawSubLS_done: + ret + + +;;;;;;;;;;;;;;;;;;;;;; +; rawSubSL +;;;;;;;;;;;;;;;;;;;;;; +; Substracts a long element from a short element +; Params: +; rdi <= Pointer to the long data of result +; rsi <= Value from where will bo substracted +; rdx <= Pointer to long of the value to be substracted +; +; [rdi] = rsi - [rdx] +; Modified Registers: +; rax +;;;;;;;;;;;;;;;;;;;;;; +rawSubSL: + ; Substract first digit + sub rsi, [rdx] + mov [rdi] ,rsi + + + mov rax, 0 + sbb rax, [rdx + 8] + mov [rdi + 8], rax + + mov rax, 0 + sbb rax, [rdx + 16] + mov [rdi + 16], rax + + mov rax, 0 + sbb rax, [rdx + 24] + mov [rdi + 24], rax + + jnc rawSubSL_done ; if overflow, add q + + ; Add q +rawSubSL_aq: + + mov rax, [q + 0] + add [rdi + 0], rax + + mov rax, [q + 8] + adc [rdi + 8], rax + + mov rax, [q + 16] + adc [rdi + 16], rax + + mov rax, [q + 24] + adc [rdi + 24], rax + +rawSubSL_done: + ret + +;;;;;;;;;;;;;;;;;;;;;; +; rawSubLL +;;;;;;;;;;;;;;;;;;;;;; +; Substracts a long element from a short element +; Params: +; rdi <= Pointer to the long data of result +; rsi <= Pointer to long from where substracted +; rdx <= Pointer to long of the value to be substracted +; +; [rdi] = [rsi] - [rdx] +; Modified Registers: +; rax +;;;;;;;;;;;;;;;;;;;;;; +rawSubLL: + ; Substract first digit + + mov rax, [rsi + 0] + sub rax, [rdx + 0] + mov [rdi + 0], rax + + mov rax, [rsi + 8] + sbb rax, [rdx + 8] + mov [rdi + 8], rax + + mov rax, [rsi + 16] + sbb rax, [rdx + 16] + mov [rdi + 16], rax + + mov rax, [rsi + 24] + sbb rax, [rdx + 24] + mov [rdi + 24], rax + + jnc rawSubLL_done ; if overflow, add q + + ; Add q +rawSubLL_aq: + + mov rax, [q + 0] + add [rdi + 0], rax + + mov rax, [q + 8] + adc [rdi + 8], rax + + mov rax, [q + 16] + adc [rdi + 16], rax + + mov rax, [q + 24] + adc [rdi + 24], rax + +rawSubLL_done: + ret + +;;;;;;;;;;;;;;;;;;;;;; +; rawNegLS +;;;;;;;;;;;;;;;;;;;;;; +; Substracts a long element and a short element form 0 +; Params: +; rdi <= Pointer to the long data of result +; rsi <= Pointer to long from where substracted +; rdx <= short value to be substracted too +; +; [rdi] = -[rsi] - rdx +; Modified Registers: +; rax +;;;;;;;;;;;;;;;;;;;;;; +rawNegLS: + mov rax, [q] + sub rax, rdx + mov [rdi], rax + + mov rax, [q + 8 ] + sbb rax, 0 + mov [rdi + 8], rax + + mov rax, [q + 16 ] + sbb rax, 0 + mov [rdi + 16], rax + + mov rax, [q + 24 ] + sbb rax, 0 + mov [rdi + 24], rax + + setc dl + + + mov rax, [rdi + 0 ] + sub rax, [rsi + 0] + mov [rdi + 0], rax + + mov rax, [rdi + 8 ] + sbb rax, [rsi + 8] + mov [rdi + 8], rax + + mov rax, [rdi + 16 ] + sbb rax, [rsi + 16] + mov [rdi + 16], rax + + mov rax, [rdi + 24 ] + sbb rax, [rsi + 24] + mov [rdi + 24], rax + + + setc dh + or dl, dh + jz rawNegSL_done + + ; it is a negative value, so add q + + mov rax, [q + 0] + add [rdi + 0], rax + + mov rax, [q + 8] + adc [rdi + 8], rax + + mov rax, [q + 16] + adc [rdi + 16], rax + + mov rax, [q + 24] + adc [rdi + 24], rax + + +rawNegSL_done: + ret + + + + + + + +;;;;;;;;;;;;;;;;;;;;;; +; neg +;;;;;;;;;;;;;;;;;;;;;; +; Adds two elements of any kind +; Params: +; rsi <= Pointer to element to be negated +; rdi <= Pointer to result +; [rdi] = -[rsi] +;;;;;;;;;;;;;;;;;;;;;; +Fr_neg: + mov rax, [rsi] + bt rax, 63 ; Check if is short first operand + jc neg_l + +neg_s: ; Operand is short + + neg eax + jo neg_manageOverflow ; Check if overflow. (0x80000000 is the only case) + + mov [rdi], rax ; not necessary to adjust so just save and return + ret + +neg_manageOverflow: ; Do the operation in 64 bits + push rsi + movsx rsi, eax + neg rsi + call rawCopyS2L + pop rsi + ret + + + +neg_l: + mov [rdi], rax ; Copy the type + + add rdi, 8 + add rsi, 8 + call rawNegL + sub rdi, 8 + sub rsi, 8 + ret + + + +;;;;;;;;;;;;;;;;;;;;;; +; rawNeg +;;;;;;;;;;;;;;;;;;;;;; +; Negates a value +; Params: +; rdi <= Pointer to the long data of result +; rsi <= Pointer to the long data of element 1 +; +; [rdi] = - [rsi] +;;;;;;;;;;;;;;;;;;;;;; +rawNegL: + ; Compare is zero + + xor rax, rax + + cmp [rsi + 0], rax + jnz doNegate + + cmp [rsi + 8], rax + jnz doNegate + + cmp [rsi + 16], rax + jnz doNegate + + cmp [rsi + 24], rax + jnz doNegate + + ; it's zero so just set to zero + + mov [rdi + 0], rax + + mov [rdi + 8], rax + + mov [rdi + 16], rax + + mov [rdi + 24], rax + + ret +doNegate: + + mov rax, [q + 0] + sub rax, [rsi + 0] + mov [rdi + 0], rax + + mov rax, [q + 8] + sbb rax, [rsi + 8] + mov [rdi + 8], rax + + mov rax, [q + 16] + sbb rax, [rsi + 16] + mov [rdi + 16], rax + + mov rax, [q + 24] + sbb rax, [rsi + 24] + mov [rdi + 24], rax + + ret + + + + + + + + + + + + +;;;;;;;;;;;;;;;;;;;;;; +; mul +;;;;;;;;;;;;;;;;;;;;;; +; Multiplies two elements of any kind +; Params: +; rsi <= Pointer to element 1 +; rdx <= Pointer to element 2 +; rdi <= Pointer to result +; [rdi] = [rsi] * [rdi] +; Modified Registers: +; r8, r9, 10, r11, rax, rcx +;;;;;;;;;;;;;;;;;;;;;; +Fr_mul: + mov r8, [rsi] + mov r9, [rdx] + bt r8, 63 ; Check if is short first operand + jc mul_l1 + bt r9, 63 ; Check if is short second operand + jc mul_s1l2 + +mul_s1s2: ; Both operands are short + + xor rax, rax + mov eax, r8d + imul r9d + jo mul_manageOverflow ; rsi already is the 64bits result + + mov [rdi], rax ; not necessary to adjust so just save and return + +mul_manageOverflow: ; Do the operation in 64 bits + push rsi + movsx rax, r8d + movsx rcx, r9d + imul rcx + mov rsi, rax + call rawCopyS2L + pop rsi + + ret + +mul_l1: + bt r9, 63 ; Check if is short second operand + jc mul_l1l2 + +;;;;;;;; +mul_l1s2: + bt r8, 62 ; check if montgomery first + jc mul_l1ms2 +mul_l1ns2: + bt r9, 62 ; check if montgomery first + jc mul_l1ns2m +mul_l1ns2n: + mov r11b, 0xC0 + shl r11, 56 + mov [rdi], r11 + + push rsi + add rsi, 8 + movsx rdx, r9d + add rdi, 8 + cmp rdx, 0 + + jns tmp5 + neg rdx + call rawMontgomeryMul1 + mov rsi, rdi + call rawNegL + sub rdi, 8 + pop rsi + + jmp tmp6 +tmp5: + call rawMontgomeryMul1 + sub rdi, 8 + pop rsi +tmp6: + + + + push rsi + add rdi, 8 + mov rsi, rdi + lea rdx, [R3] + call rawMontgomeryMul + sub rdi, 8 + pop rsi + + ret + + +mul_l1ns2m: + mov r11b, 0x80 + shl r11, 56 + mov [rdi], r11 + + add rdi, 8 + add rsi, 8 + add rdx, 8 + call rawMontgomeryMul + sub rdi, 8 + sub rsi, 8 + + ret + + +mul_l1ms2: + bt r9, 62 ; check if montgomery second + jc mul_l1ms2m +mul_l1ms2n: + mov r11b, 0x80 + shl r11, 56 + mov [rdi], r11 + + push rsi + add rsi, 8 + movsx rdx, r9d + add rdi, 8 + cmp rdx, 0 + + jns tmp7 + neg rdx + call rawMontgomeryMul1 + mov rsi, rdi + call rawNegL + sub rdi, 8 + pop rsi + + jmp tmp8 +tmp7: + call rawMontgomeryMul1 + sub rdi, 8 + pop rsi +tmp8: + + + ret + +mul_l1ms2m: + mov r11b, 0xC0 + shl r11, 56 + mov [rdi], r11 + + add rdi, 8 + add rsi, 8 + add rdx, 8 + call rawMontgomeryMul + sub rdi, 8 + sub rsi, 8 + + ret + + +;;;;;;;; +mul_s1l2: + bt r8, 62 ; check if montgomery first + jc mul_s1ml2 +mul_s1nl2: + bt r9, 62 ; check if montgomery first + jc mul_s1nl2m +mul_s1nl2n: + mov r11b, 0xC0 + shl r11, 56 + mov [rdi], r11 + + push rsi + lea rsi, [rdx + 8] + movsx rdx, r8d + add rdi, 8 + cmp rdx, 0 + + jns tmp9 + neg rdx + call rawMontgomeryMul1 + mov rsi, rdi + call rawNegL + sub rdi, 8 + pop rsi + + jmp tmp10 +tmp9: + call rawMontgomeryMul1 + sub rdi, 8 + pop rsi +tmp10: + + + + push rsi + add rdi, 8 + mov rsi, rdi + lea rdx, [R3] + call rawMontgomeryMul + sub rdi, 8 + pop rsi + + ret + +mul_s1nl2m: + mov r11b, 0x80 + shl r11, 56 + mov [rdi], r11 + + push rsi + lea rsi, [rdx + 8] + movsx rdx, r8d + add rdi, 8 + cmp rdx, 0 + + jns tmp11 + neg rdx + call rawMontgomeryMul1 + mov rsi, rdi + call rawNegL + sub rdi, 8 + pop rsi + + jmp tmp12 +tmp11: + call rawMontgomeryMul1 + sub rdi, 8 + pop rsi +tmp12: + + + ret + +mul_s1ml2: + bt r9, 62 ; check if montgomery first + jc mul_s1ml2m +mul_s1ml2n: + mov r11b, 0x80 + shl r11, 56 + mov [rdi], r11 + + add rdi, 8 + add rsi, 8 + add rdx, 8 + call rawMontgomeryMul + sub rdi, 8 + sub rsi, 8 + + ret + +mul_s1ml2m: + mov r11b, 0xC0 + shl r11, 56 + mov [rdi], r11 + + add rdi, 8 + add rsi, 8 + add rdx, 8 + call rawMontgomeryMul + sub rdi, 8 + sub rsi, 8 + + ret + +;;;; +mul_l1l2: + bt r8, 62 ; check if montgomery first + jc mul_l1ml2 +mul_l1nl2: + bt r9, 62 ; check if montgomery second + jc mul_l1nl2m +mul_l1nl2n: + mov r11b, 0xC0 + shl r11, 56 + mov [rdi], r11 + + add rdi, 8 + add rsi, 8 + add rdx, 8 + call rawMontgomeryMul + sub rdi, 8 + sub rsi, 8 + + + push rsi + add rdi, 8 + mov rsi, rdi + lea rdx, [R3] + call rawMontgomeryMul + sub rdi, 8 + pop rsi + + ret + +mul_l1nl2m: + mov r11b, 0x80 + shl r11, 56 + mov [rdi], r11 + + add rdi, 8 + add rsi, 8 + add rdx, 8 + call rawMontgomeryMul + sub rdi, 8 + sub rsi, 8 + + ret + +mul_l1ml2: + bt r9, 62 ; check if montgomery seconf + jc mul_l1ml2m +mul_l1ml2n: + mov r11b, 0x80 + shl r11, 56 + mov [rdi], r11 + + add rdi, 8 + add rsi, 8 + add rdx, 8 + call rawMontgomeryMul + sub rdi, 8 + sub rsi, 8 + + ret + +mul_l1ml2m: + mov r11b, 0xC0 + shl r11, 56 + mov [rdi], r11 + + add rdi, 8 + add rsi, 8 + add rdx, 8 + call rawMontgomeryMul + sub rdi, 8 + sub rsi, 8 + + ret + + + + + + + + + + + + + + + + + +;;;;;;;;;;;;;;;;;;;;;; +; and +;;;;;;;;;;;;;;;;;;;;;; +; Adds two elements of any kind +; Params: +; rsi <= Pointer to element 1 +; rdx <= Pointer to element 2 +; rdi <= Pointer to result +; Modified Registers: +; r8, r9, 10, r11, rax, rcx +;;;;;;;;;;;;;;;;;;;;;; +Fr_band: + mov r8, [rsi] + mov r9, [rdx] + bt r8, 63 ; Check if is short first operand + jc and_l1 + bt r9, 63 ; Check if is short second operand + jc and_s1l2 + +and_s1s2: + + cmp r8d, 0 + + js tmp13 + + cmp r9d, 0 + js tmp13 + xor rdx, rdx ; both ops are positive so do the op and return + mov edx, r8d + and edx, r9d + mov [rdi], rdx ; not necessary to adjust so just save and return + ret + +tmp13: + mov r11b, 0x80 + shl r11, 56 + mov [rdi], r11 + push rdi + push rsi + mov rdi, rdx + movsx rsi, r9d + call rawCopyS2L + mov rdx, rdi + pop rsi + pop rdi + push rdi + push rdx + mov rdi, rsi + movsx rsi, r8d + call rawCopyS2L + mov rsi, rdi + pop rdx + pop rdi + + + mov rax, [rsi + 8] + and rax, [rdx + 8] + + mov [rdi + 8 ], rax + + mov rax, [rsi + 16] + and rax, [rdx + 16] + + mov [rdi + 16 ], rax + + mov rax, [rsi + 24] + and rax, [rdx + 24] + + mov [rdi + 24 ], rax + + mov rax, [rsi + 32] + and rax, [rdx + 32] + + and rax, [lboMask] + + mov [rdi + 32 ], rax + + ret + + + + + + +and_l1: + bt r9, 63 ; Check if is short second operand + jc and_l1l2 + + +and_l1s2: + bt r8, 62 ; check if montgomery first + jc and_l1ms2 +and_l1ns2: + mov r11b, 0x80 + shl r11, 56 + mov [rdi], r11 + + cmp r9d, 0 + + js tmp14 + movsx rax, r9d + and rax, [rsi +8] + mov [rdi+8], rax + + xor rax, rax + and rax, [rsi + 16]; + + mov [rdi + 16 ], rax; + + xor rax, rax + and rax, [rsi + 24]; + + mov [rdi + 24 ], rax; + + xor rax, rax + and rax, [rsi + 32]; + + and rax, [lboMask] ; + + mov [rdi + 32 ], rax; + + ret + +tmp14: + push rdi + push rsi + mov rdi, rdx + movsx rsi, r9d + call rawCopyS2L + mov rdx, rdi + pop rsi + pop rdi + mov r11b, 0x80 + shl r11, 56 + mov [rdi], r11 + + + mov rax, [rsi + 8] + and rax, [rdx + 8] + + mov [rdi + 8 ], rax + + mov rax, [rsi + 16] + and rax, [rdx + 16] + + mov [rdi + 16 ], rax + + mov rax, [rsi + 24] + and rax, [rdx + 24] + + mov [rdi + 24 ], rax + + mov rax, [rsi + 32] + and rax, [rdx + 32] + + and rax, [lboMask] + + mov [rdi + 32 ], rax + + ret + + + + +and_l1ms2: + mov r11b, 0x80 + shl r11, 56 + mov [rdi], r11 + push r9 ; r9 is used in montgomery so we need to save it + push rdi + mov rdi, rsi + mov rsi, rdx + call Fr_toNormal + mov rdx, rsi + mov rsi, rdi + pop rdi + pop r9 + + cmp r9d, 0 + + js tmp15 + movsx rax, r9d + and rax, [rsi +8] + mov [rdi+8], rax + + xor rax, rax + and rax, [rsi + 16]; + + mov [rdi + 16 ], rax; + + xor rax, rax + and rax, [rsi + 24]; + + mov [rdi + 24 ], rax; + + xor rax, rax + and rax, [rsi + 32]; + + and rax, [lboMask] ; + + mov [rdi + 32 ], rax; + + ret + +tmp15: + push rdi + push rsi + mov rdi, rdx + movsx rsi, r9d + call rawCopyS2L + mov rdx, rdi + pop rsi + pop rdi + mov r11b, 0x80 + shl r11, 56 + mov [rdi], r11 + + + mov rax, [rsi + 8] + and rax, [rdx + 8] + + mov [rdi + 8 ], rax + + mov rax, [rsi + 16] + and rax, [rdx + 16] + + mov [rdi + 16 ], rax + + mov rax, [rsi + 24] + and rax, [rdx + 24] + + mov [rdi + 24 ], rax + + mov rax, [rsi + 32] + and rax, [rdx + 32] + + and rax, [lboMask] + + mov [rdi + 32 ], rax + + ret + + + + + +and_s1l2: + bt r9, 62 ; check if montgomery first + jc and_s1l2m +and_s1l2n: + mov r11b, 0x80 + shl r11, 56 + mov [rdi], r11 + + cmp r8d, 0 + + js tmp16 + movsx rax, r8d + and rax, [rdx +8] + mov [rdi+8], rax + + xor rax, rax + and rax, [rdx + 16] + + mov [rdi + 16 ], rax + + xor rax, rax + and rax, [rdx + 24] + + mov [rdi + 24 ], rax + + xor rax, rax + and rax, [rdx + 32] + + and rax, [lboMask] + + mov [rdi + 32 ], rax + + ret + +tmp16: + push rdi + push rdx + mov rdi, rsi + movsx rsi, r8d + call rawCopyS2L + mov rsi, rdi + pop rdx + pop rdi + mov r11b, 0x80 + shl r11, 56 + mov [rdi], r11 + + + mov rax, [rsi + 8] + and rax, [rdx + 8] + + mov [rdi + 8 ], rax + + mov rax, [rsi + 16] + and rax, [rdx + 16] + + mov [rdi + 16 ], rax + + mov rax, [rsi + 24] + and rax, [rdx + 24] + + mov [rdi + 24 ], rax + + mov rax, [rsi + 32] + and rax, [rdx + 32] + + and rax, [lboMask] + + mov [rdi + 32 ], rax + + ret + + + + +and_s1l2m: + mov r11b, 0x80 + shl r11, 56 + mov [rdi], r11 + push r8 ; r8 is used in montgomery so we need to save it + push rdi + mov rdi, rdx + call Fr_toNormal + mov rdx, rdi + pop rdi + pop r8 + + cmp r8d, 0 + + js tmp17 + movsx rax, r8d + and rax, [rdx +8] + mov [rdi+8], rax + + xor rax, rax + and rax, [rdx + 16] + + mov [rdi + 16 ], rax + + xor rax, rax + and rax, [rdx + 24] + + mov [rdi + 24 ], rax + + xor rax, rax + and rax, [rdx + 32] + + and rax, [lboMask] + + mov [rdi + 32 ], rax + + ret + +tmp17: + push rdi + push rdx + mov rdi, rsi + movsx rsi, r8d + call rawCopyS2L + mov rsi, rdi + pop rdx + pop rdi + mov r11b, 0x80 + shl r11, 56 + mov [rdi], r11 + + + mov rax, [rsi + 8] + and rax, [rdx + 8] + + mov [rdi + 8 ], rax + + mov rax, [rsi + 16] + and rax, [rdx + 16] + + mov [rdi + 16 ], rax + + mov rax, [rsi + 24] + and rax, [rdx + 24] + + mov [rdi + 24 ], rax + + mov rax, [rsi + 32] + and rax, [rdx + 32] + + and rax, [lboMask] + + mov [rdi + 32 ], rax + + ret + + + + + +and_l1l2: + bt r8, 62 ; check if montgomery first + jc and_l1ml2 + bt r9, 62 ; check if montgomery first + jc and_l1nl2m +and_l1nl2n: + mov r11b, 0x80 + shl r11, 56 + mov [rdi], r11 + + + mov rax, [rsi + 8] + and rax, [rdx + 8] + + mov [rdi + 8 ], rax + + mov rax, [rsi + 16] + and rax, [rdx + 16] + + mov [rdi + 16 ], rax + + mov rax, [rsi + 24] + and rax, [rdx + 24] + + mov [rdi + 24 ], rax + + mov rax, [rsi + 32] + and rax, [rdx + 32] + + and rax, [lboMask] + + mov [rdi + 32 ], rax + + ret + + +and_l1nl2m: + mov r11b, 0x80 + shl r11, 56 + mov [rdi], r11 + push rdi + mov rdi, rdx + call Fr_toNormal + mov rdx, rdi + pop rdi + + + mov rax, [rsi + 8] + and rax, [rdx + 8] + + mov [rdi + 8 ], rax + + mov rax, [rsi + 16] + and rax, [rdx + 16] + + mov [rdi + 16 ], rax + + mov rax, [rsi + 24] + and rax, [rdx + 24] + + mov [rdi + 24 ], rax + + mov rax, [rsi + 32] + and rax, [rdx + 32] + + and rax, [lboMask] + + mov [rdi + 32 ], rax + + ret + + +and_l1ml2: + bt r9, 62 ; check if montgomery first + jc and_l1ml2m +and_l1ml2n: + mov r11b, 0x80 + shl r11, 56 + mov [rdi], r11 + push rdi + mov rdi, rsi + mov rsi, rdx + call Fr_toNormal + mov rdx, rsi + mov rsi, rdi + pop rdi + + + mov rax, [rsi + 8] + and rax, [rdx + 8] + + mov [rdi + 8 ], rax + + mov rax, [rsi + 16] + and rax, [rdx + 16] + + mov [rdi + 16 ], rax + + mov rax, [rsi + 24] + and rax, [rdx + 24] + + mov [rdi + 24 ], rax + + mov rax, [rsi + 32] + and rax, [rdx + 32] + + and rax, [lboMask] + + mov [rdi + 32 ], rax + + ret + + +and_l1ml2m: + mov r11b, 0x80 + shl r11, 56 + mov [rdi], r11 + push rdi + mov rdi, rsi + mov rsi, rdx + call Fr_toNormal + mov rdx, rsi + mov rsi, rdi + pop rdi + push rdi + mov rdi, rdx + call Fr_toNormal + mov rdx, rdi + pop rdi + + + mov rax, [rsi + 8] + and rax, [rdx + 8] + + mov [rdi + 8 ], rax + + mov rax, [rsi + 16] + and rax, [rdx + 16] + + mov [rdi + 16 ], rax + + mov rax, [rsi + 24] + and rax, [rdx + 24] + + mov [rdi + 24 ], rax + + mov rax, [rsi + 32] + and rax, [rdx + 32] + + and rax, [lboMask] + + mov [rdi + 32 ], rax + + ret + + + +;;;;;;;;;;;;;;;;;;;;;; +; or +;;;;;;;;;;;;;;;;;;;;;; +; Adds two elements of any kind +; Params: +; rsi <= Pointer to element 1 +; rdx <= Pointer to element 2 +; rdi <= Pointer to result +; Modified Registers: +; r8, r9, 10, r11, rax, rcx +;;;;;;;;;;;;;;;;;;;;;; +Fr_bor: + mov r8, [rsi] + mov r9, [rdx] + bt r8, 63 ; Check if is short first operand + jc or_l1 + bt r9, 63 ; Check if is short second operand + jc or_s1l2 + +or_s1s2: + + cmp r8d, 0 + + js tmp18 + + cmp r9d, 0 + js tmp18 + xor rdx, rdx ; both ops are positive so do the op and return + mov edx, r8d + or edx, r9d + mov [rdi], rdx ; not necessary to adjust so just save and return + ret + +tmp18: + mov r11b, 0x80 + shl r11, 56 + mov [rdi], r11 + push rdi + push rsi + mov rdi, rdx + movsx rsi, r9d + call rawCopyS2L + mov rdx, rdi + pop rsi + pop rdi + push rdi + push rdx + mov rdi, rsi + movsx rsi, r8d + call rawCopyS2L + mov rsi, rdi + pop rdx + pop rdi + + + mov rax, [rsi + 8] + or rax, [rdx + 8] + + mov [rdi + 8 ], rax + + mov rax, [rsi + 16] + or rax, [rdx + 16] + + mov [rdi + 16 ], rax + + mov rax, [rsi + 24] + or rax, [rdx + 24] + + mov [rdi + 24 ], rax + + mov rax, [rsi + 32] + or rax, [rdx + 32] + + and rax, [lboMask] + + mov [rdi + 32 ], rax + + ret + + + + + + +or_l1: + bt r9, 63 ; Check if is short second operand + jc or_l1l2 + + +or_l1s2: + bt r8, 62 ; check if montgomery first + jc or_l1ms2 +or_l1ns2: + mov r11b, 0x80 + shl r11, 56 + mov [rdi], r11 + + cmp r9d, 0 + + js tmp19 + movsx rax, r9d + or rax, [rsi +8] + mov [rdi+8], rax + + xor rax, rax + or rax, [rsi + 16]; + + mov [rdi + 16 ], rax; + + xor rax, rax + or rax, [rsi + 24]; + + mov [rdi + 24 ], rax; + + xor rax, rax + or rax, [rsi + 32]; + + and rax, [lboMask] ; + + mov [rdi + 32 ], rax; + + ret + +tmp19: + push rdi + push rsi + mov rdi, rdx + movsx rsi, r9d + call rawCopyS2L + mov rdx, rdi + pop rsi + pop rdi + mov r11b, 0x80 + shl r11, 56 + mov [rdi], r11 + + + mov rax, [rsi + 8] + or rax, [rdx + 8] + + mov [rdi + 8 ], rax + + mov rax, [rsi + 16] + or rax, [rdx + 16] + + mov [rdi + 16 ], rax + + mov rax, [rsi + 24] + or rax, [rdx + 24] + + mov [rdi + 24 ], rax + + mov rax, [rsi + 32] + or rax, [rdx + 32] + + and rax, [lboMask] + + mov [rdi + 32 ], rax + + ret + + + + +or_l1ms2: + mov r11b, 0x80 + shl r11, 56 + mov [rdi], r11 + push r9 ; r9 is used in montgomery so we need to save it + push rdi + mov rdi, rsi + mov rsi, rdx + call Fr_toNormal + mov rdx, rsi + mov rsi, rdi + pop rdi + pop r9 + + cmp r9d, 0 + + js tmp20 + movsx rax, r9d + or rax, [rsi +8] + mov [rdi+8], rax + + xor rax, rax + or rax, [rsi + 16]; + + mov [rdi + 16 ], rax; + + xor rax, rax + or rax, [rsi + 24]; + + mov [rdi + 24 ], rax; + + xor rax, rax + or rax, [rsi + 32]; + + and rax, [lboMask] ; + + mov [rdi + 32 ], rax; + + ret + +tmp20: + push rdi + push rsi + mov rdi, rdx + movsx rsi, r9d + call rawCopyS2L + mov rdx, rdi + pop rsi + pop rdi + mov r11b, 0x80 + shl r11, 56 + mov [rdi], r11 + + + mov rax, [rsi + 8] + or rax, [rdx + 8] + + mov [rdi + 8 ], rax + + mov rax, [rsi + 16] + or rax, [rdx + 16] + + mov [rdi + 16 ], rax + + mov rax, [rsi + 24] + or rax, [rdx + 24] + + mov [rdi + 24 ], rax + + mov rax, [rsi + 32] + or rax, [rdx + 32] + + and rax, [lboMask] + + mov [rdi + 32 ], rax + + ret + + + + + +or_s1l2: + bt r9, 62 ; check if montgomery first + jc or_s1l2m +or_s1l2n: + mov r11b, 0x80 + shl r11, 56 + mov [rdi], r11 + + cmp r8d, 0 + + js tmp21 + movsx rax, r8d + or rax, [rdx +8] + mov [rdi+8], rax + + xor rax, rax + or rax, [rdx + 16] + + mov [rdi + 16 ], rax + + xor rax, rax + or rax, [rdx + 24] + + mov [rdi + 24 ], rax + + xor rax, rax + or rax, [rdx + 32] + + and rax, [lboMask] + + mov [rdi + 32 ], rax + + ret + +tmp21: + push rdi + push rdx + mov rdi, rsi + movsx rsi, r8d + call rawCopyS2L + mov rsi, rdi + pop rdx + pop rdi + mov r11b, 0x80 + shl r11, 56 + mov [rdi], r11 + + + mov rax, [rsi + 8] + or rax, [rdx + 8] + + mov [rdi + 8 ], rax + + mov rax, [rsi + 16] + or rax, [rdx + 16] + + mov [rdi + 16 ], rax + + mov rax, [rsi + 24] + or rax, [rdx + 24] + + mov [rdi + 24 ], rax + + mov rax, [rsi + 32] + or rax, [rdx + 32] + + and rax, [lboMask] + + mov [rdi + 32 ], rax + + ret + + + + +or_s1l2m: + mov r11b, 0x80 + shl r11, 56 + mov [rdi], r11 + push r8 ; r8 is used in montgomery so we need to save it + push rdi + mov rdi, rdx + call Fr_toNormal + mov rdx, rdi + pop rdi + pop r8 + + cmp r8d, 0 + + js tmp22 + movsx rax, r8d + or rax, [rdx +8] + mov [rdi+8], rax + + xor rax, rax + or rax, [rdx + 16] + + mov [rdi + 16 ], rax + + xor rax, rax + or rax, [rdx + 24] + + mov [rdi + 24 ], rax + + xor rax, rax + or rax, [rdx + 32] + + and rax, [lboMask] + + mov [rdi + 32 ], rax + + ret + +tmp22: + push rdi + push rdx + mov rdi, rsi + movsx rsi, r8d + call rawCopyS2L + mov rsi, rdi + pop rdx + pop rdi + mov r11b, 0x80 + shl r11, 56 + mov [rdi], r11 + + + mov rax, [rsi + 8] + or rax, [rdx + 8] + + mov [rdi + 8 ], rax + + mov rax, [rsi + 16] + or rax, [rdx + 16] + + mov [rdi + 16 ], rax + + mov rax, [rsi + 24] + or rax, [rdx + 24] + + mov [rdi + 24 ], rax + + mov rax, [rsi + 32] + or rax, [rdx + 32] + + and rax, [lboMask] + + mov [rdi + 32 ], rax + + ret + + + + + +or_l1l2: + bt r8, 62 ; check if montgomery first + jc or_l1ml2 + bt r9, 62 ; check if montgomery first + jc or_l1nl2m +or_l1nl2n: + mov r11b, 0x80 + shl r11, 56 + mov [rdi], r11 + + + mov rax, [rsi + 8] + or rax, [rdx + 8] + + mov [rdi + 8 ], rax + + mov rax, [rsi + 16] + or rax, [rdx + 16] + + mov [rdi + 16 ], rax + + mov rax, [rsi + 24] + or rax, [rdx + 24] + + mov [rdi + 24 ], rax + + mov rax, [rsi + 32] + or rax, [rdx + 32] + + and rax, [lboMask] + + mov [rdi + 32 ], rax + + ret + + +or_l1nl2m: + mov r11b, 0x80 + shl r11, 56 + mov [rdi], r11 + push rdi + mov rdi, rdx + call Fr_toNormal + mov rdx, rdi + pop rdi + + + mov rax, [rsi + 8] + or rax, [rdx + 8] + + mov [rdi + 8 ], rax + + mov rax, [rsi + 16] + or rax, [rdx + 16] + + mov [rdi + 16 ], rax + + mov rax, [rsi + 24] + or rax, [rdx + 24] + + mov [rdi + 24 ], rax + + mov rax, [rsi + 32] + or rax, [rdx + 32] + + and rax, [lboMask] + + mov [rdi + 32 ], rax + + ret + + +or_l1ml2: + bt r9, 62 ; check if montgomery first + jc or_l1ml2m +or_l1ml2n: + mov r11b, 0x80 + shl r11, 56 + mov [rdi], r11 + push rdi + mov rdi, rsi + mov rsi, rdx + call Fr_toNormal + mov rdx, rsi + mov rsi, rdi + pop rdi + + + mov rax, [rsi + 8] + or rax, [rdx + 8] + + mov [rdi + 8 ], rax + + mov rax, [rsi + 16] + or rax, [rdx + 16] + + mov [rdi + 16 ], rax + + mov rax, [rsi + 24] + or rax, [rdx + 24] + + mov [rdi + 24 ], rax + + mov rax, [rsi + 32] + or rax, [rdx + 32] + + and rax, [lboMask] + + mov [rdi + 32 ], rax + + ret + + +or_l1ml2m: + mov r11b, 0x80 + shl r11, 56 + mov [rdi], r11 + push rdi + mov rdi, rsi + mov rsi, rdx + call Fr_toNormal + mov rdx, rsi + mov rsi, rdi + pop rdi + push rdi + mov rdi, rdx + call Fr_toNormal + mov rdx, rdi + pop rdi + + + mov rax, [rsi + 8] + or rax, [rdx + 8] + + mov [rdi + 8 ], rax + + mov rax, [rsi + 16] + or rax, [rdx + 16] + + mov [rdi + 16 ], rax + + mov rax, [rsi + 24] + or rax, [rdx + 24] + + mov [rdi + 24 ], rax + + mov rax, [rsi + 32] + or rax, [rdx + 32] + + and rax, [lboMask] + + mov [rdi + 32 ], rax + + ret + + + +;;;;;;;;;;;;;;;;;;;;;; +; xor +;;;;;;;;;;;;;;;;;;;;;; +; Adds two elements of any kind +; Params: +; rsi <= Pointer to element 1 +; rdx <= Pointer to element 2 +; rdi <= Pointer to result +; Modified Registers: +; r8, r9, 10, r11, rax, rcx +;;;;;;;;;;;;;;;;;;;;;; +Fr_bxor: + mov r8, [rsi] + mov r9, [rdx] + bt r8, 63 ; Check if is short first operand + jc xor_l1 + bt r9, 63 ; Check if is short second operand + jc xor_s1l2 + +xor_s1s2: + + cmp r8d, 0 + + js tmp23 + + cmp r9d, 0 + js tmp23 + xor rdx, rdx ; both ops are positive so do the op and return + mov edx, r8d + xor edx, r9d + mov [rdi], rdx ; not necessary to adjust so just save and return + ret + +tmp23: + mov r11b, 0x80 + shl r11, 56 + mov [rdi], r11 + push rdi + push rsi + mov rdi, rdx + movsx rsi, r9d + call rawCopyS2L + mov rdx, rdi + pop rsi + pop rdi + push rdi + push rdx + mov rdi, rsi + movsx rsi, r8d + call rawCopyS2L + mov rsi, rdi + pop rdx + pop rdi + + + mov rax, [rsi + 8] + xor rax, [rdx + 8] + + mov [rdi + 8 ], rax + + mov rax, [rsi + 16] + xor rax, [rdx + 16] + + mov [rdi + 16 ], rax + + mov rax, [rsi + 24] + xor rax, [rdx + 24] + + mov [rdi + 24 ], rax + + mov rax, [rsi + 32] + xor rax, [rdx + 32] + + and rax, [lboMask] + + mov [rdi + 32 ], rax + + ret + + + + + + +xor_l1: + bt r9, 63 ; Check if is short second operand + jc xor_l1l2 + + +xor_l1s2: + bt r8, 62 ; check if montgomery first + jc xor_l1ms2 +xor_l1ns2: + mov r11b, 0x80 + shl r11, 56 + mov [rdi], r11 + + cmp r9d, 0 + + js tmp24 + movsx rax, r9d + xor rax, [rsi +8] + mov [rdi+8], rax + + xor rax, rax + xor rax, [rsi + 16]; + + mov [rdi + 16 ], rax; + + xor rax, rax + xor rax, [rsi + 24]; + + mov [rdi + 24 ], rax; + + xor rax, rax + xor rax, [rsi + 32]; + + and rax, [lboMask] ; + + mov [rdi + 32 ], rax; + + ret + +tmp24: + push rdi + push rsi + mov rdi, rdx + movsx rsi, r9d + call rawCopyS2L + mov rdx, rdi + pop rsi + pop rdi + mov r11b, 0x80 + shl r11, 56 + mov [rdi], r11 + + + mov rax, [rsi + 8] + xor rax, [rdx + 8] + + mov [rdi + 8 ], rax + + mov rax, [rsi + 16] + xor rax, [rdx + 16] + + mov [rdi + 16 ], rax + + mov rax, [rsi + 24] + xor rax, [rdx + 24] + + mov [rdi + 24 ], rax + + mov rax, [rsi + 32] + xor rax, [rdx + 32] + + and rax, [lboMask] + + mov [rdi + 32 ], rax + + ret + + + + +xor_l1ms2: + mov r11b, 0x80 + shl r11, 56 + mov [rdi], r11 + push r9 ; r9 is used in montgomery so we need to save it + push rdi + mov rdi, rsi + mov rsi, rdx + call Fr_toNormal + mov rdx, rsi + mov rsi, rdi + pop rdi + pop r9 + + cmp r9d, 0 + + js tmp25 + movsx rax, r9d + xor rax, [rsi +8] + mov [rdi+8], rax + + xor rax, rax + xor rax, [rsi + 16]; + + mov [rdi + 16 ], rax; + + xor rax, rax + xor rax, [rsi + 24]; + + mov [rdi + 24 ], rax; + + xor rax, rax + xor rax, [rsi + 32]; + + and rax, [lboMask] ; + + mov [rdi + 32 ], rax; + + ret + +tmp25: + push rdi + push rsi + mov rdi, rdx + movsx rsi, r9d + call rawCopyS2L + mov rdx, rdi + pop rsi + pop rdi + mov r11b, 0x80 + shl r11, 56 + mov [rdi], r11 + + + mov rax, [rsi + 8] + xor rax, [rdx + 8] + + mov [rdi + 8 ], rax + + mov rax, [rsi + 16] + xor rax, [rdx + 16] + + mov [rdi + 16 ], rax + + mov rax, [rsi + 24] + xor rax, [rdx + 24] + + mov [rdi + 24 ], rax + + mov rax, [rsi + 32] + xor rax, [rdx + 32] + + and rax, [lboMask] + + mov [rdi + 32 ], rax + + ret + + + + + +xor_s1l2: + bt r9, 62 ; check if montgomery first + jc xor_s1l2m +xor_s1l2n: + mov r11b, 0x80 + shl r11, 56 + mov [rdi], r11 + + cmp r8d, 0 + + js tmp26 + movsx rax, r8d + xor rax, [rdx +8] + mov [rdi+8], rax + + xor rax, rax + xor rax, [rdx + 16] + + mov [rdi + 16 ], rax + + xor rax, rax + xor rax, [rdx + 24] + + mov [rdi + 24 ], rax + + xor rax, rax + xor rax, [rdx + 32] + + and rax, [lboMask] + + mov [rdi + 32 ], rax + + ret + +tmp26: + push rdi + push rdx + mov rdi, rsi + movsx rsi, r8d + call rawCopyS2L + mov rsi, rdi + pop rdx + pop rdi + mov r11b, 0x80 + shl r11, 56 + mov [rdi], r11 + + + mov rax, [rsi + 8] + xor rax, [rdx + 8] + + mov [rdi + 8 ], rax + + mov rax, [rsi + 16] + xor rax, [rdx + 16] + + mov [rdi + 16 ], rax + + mov rax, [rsi + 24] + xor rax, [rdx + 24] + + mov [rdi + 24 ], rax + + mov rax, [rsi + 32] + xor rax, [rdx + 32] + + and rax, [lboMask] + + mov [rdi + 32 ], rax + + ret + + + + +xor_s1l2m: + mov r11b, 0x80 + shl r11, 56 + mov [rdi], r11 + push r8 ; r8 is used in montgomery so we need to save it + push rdi + mov rdi, rdx + call Fr_toNormal + mov rdx, rdi + pop rdi + pop r8 + + cmp r8d, 0 + + js tmp27 + movsx rax, r8d + xor rax, [rdx +8] + mov [rdi+8], rax + + xor rax, rax + xor rax, [rdx + 16] + + mov [rdi + 16 ], rax + + xor rax, rax + xor rax, [rdx + 24] + + mov [rdi + 24 ], rax + + xor rax, rax + xor rax, [rdx + 32] + + and rax, [lboMask] + + mov [rdi + 32 ], rax + + ret + +tmp27: + push rdi + push rdx + mov rdi, rsi + movsx rsi, r8d + call rawCopyS2L + mov rsi, rdi + pop rdx + pop rdi + mov r11b, 0x80 + shl r11, 56 + mov [rdi], r11 + + + mov rax, [rsi + 8] + xor rax, [rdx + 8] + + mov [rdi + 8 ], rax + + mov rax, [rsi + 16] + xor rax, [rdx + 16] + + mov [rdi + 16 ], rax + + mov rax, [rsi + 24] + xor rax, [rdx + 24] + + mov [rdi + 24 ], rax + + mov rax, [rsi + 32] + xor rax, [rdx + 32] + + and rax, [lboMask] + + mov [rdi + 32 ], rax + + ret + + + + + +xor_l1l2: + bt r8, 62 ; check if montgomery first + jc xor_l1ml2 + bt r9, 62 ; check if montgomery first + jc xor_l1nl2m +xor_l1nl2n: + mov r11b, 0x80 + shl r11, 56 + mov [rdi], r11 + + + mov rax, [rsi + 8] + xor rax, [rdx + 8] + + mov [rdi + 8 ], rax + + mov rax, [rsi + 16] + xor rax, [rdx + 16] + + mov [rdi + 16 ], rax + + mov rax, [rsi + 24] + xor rax, [rdx + 24] + + mov [rdi + 24 ], rax + + mov rax, [rsi + 32] + xor rax, [rdx + 32] + + and rax, [lboMask] + + mov [rdi + 32 ], rax + + ret + + +xor_l1nl2m: + mov r11b, 0x80 + shl r11, 56 + mov [rdi], r11 + push rdi + mov rdi, rdx + call Fr_toNormal + mov rdx, rdi + pop rdi + + + mov rax, [rsi + 8] + xor rax, [rdx + 8] + + mov [rdi + 8 ], rax + + mov rax, [rsi + 16] + xor rax, [rdx + 16] + + mov [rdi + 16 ], rax + + mov rax, [rsi + 24] + xor rax, [rdx + 24] + + mov [rdi + 24 ], rax + + mov rax, [rsi + 32] + xor rax, [rdx + 32] + + and rax, [lboMask] + + mov [rdi + 32 ], rax + + ret + + +xor_l1ml2: + bt r9, 62 ; check if montgomery first + jc xor_l1ml2m +xor_l1ml2n: + mov r11b, 0x80 + shl r11, 56 + mov [rdi], r11 + push rdi + mov rdi, rsi + mov rsi, rdx + call Fr_toNormal + mov rdx, rsi + mov rsi, rdi + pop rdi + + + mov rax, [rsi + 8] + xor rax, [rdx + 8] + + mov [rdi + 8 ], rax + + mov rax, [rsi + 16] + xor rax, [rdx + 16] + + mov [rdi + 16 ], rax + + mov rax, [rsi + 24] + xor rax, [rdx + 24] + + mov [rdi + 24 ], rax + + mov rax, [rsi + 32] + xor rax, [rdx + 32] + + and rax, [lboMask] + + mov [rdi + 32 ], rax + + ret + + +xor_l1ml2m: + mov r11b, 0x80 + shl r11, 56 + mov [rdi], r11 + push rdi + mov rdi, rsi + mov rsi, rdx + call Fr_toNormal + mov rdx, rsi + mov rsi, rdi + pop rdi + push rdi + mov rdi, rdx + call Fr_toNormal + mov rdx, rdi + pop rdi + + + mov rax, [rsi + 8] + xor rax, [rdx + 8] + + mov [rdi + 8 ], rax + + mov rax, [rsi + 16] + xor rax, [rdx + 16] + + mov [rdi + 16 ], rax + + mov rax, [rsi + 24] + xor rax, [rdx + 24] + + mov [rdi + 24 ], rax + + mov rax, [rsi + 32] + xor rax, [rdx + 32] + + and rax, [lboMask] + + mov [rdi + 32 ], rax + + ret + + + + + + + + + + + + + + + +;;;;;;;;;;;;;;;;;;;;;; +; eq +;;;;;;;;;;;;;;;;;;;;;; +; Adds two elements of any kind +; Params: +; rsi <= Pointer to element 1 +; rdx <= Pointer to element 2 +; rdi <= Pointer to result can be zero or one. +; Modified Registers: +; r8, r9, 10, r11, rax, rcx +;;;;;;;;;;;;;;;;;;;;;; +Fr_eq: + sub rsp, 40 ; Save space for the result of the substraction + push rdi ; Save rdi + lea rdi, [rsp+8] ; We pushed rdi so we need to add 8 + call Fr_sub ; Do a substraction + call Fr_toNormal ; Convert it to normal + pop rdi + + mov rax, [rsp] ; We already poped do no need to add 8 + bt rax, 63 ; check is result is long + jc eq_longCmp + +eq_shortCmp: + cmp eax, 0 + je eq_s_eq + js eq_s_lt +eq_s_gt: + + mov qword [rdi], 0 + add rsp, 40 + ret + +eq_s_lt: + + mov qword [rdi], 0 + add rsp, 40 + ret + +eq_s_eq: + + mov qword [rdi], 1 + add rsp, 40 + ret + + +eq_longCmp: + + + cmp qword [rsp + 32], 0 + jnz eq_neq + + cmp qword [rsp + 24], 0 + jnz eq_neq + + cmp qword [rsp + 16], 0 + jnz eq_neq + + cmp qword [rsp + 8], 0 + jnz eq_neq + +eq_eq: + + mov qword [rdi], 1 + add rsp, 40 + ret + +eq_neq: + + mov qword [rdi], 0 + add rsp, 40 + ret + + + + + +;;;;;;;;;;;;;;;;;;;;;; +; neq +;;;;;;;;;;;;;;;;;;;;;; +; Adds two elements of any kind +; Params: +; rsi <= Pointer to element 1 +; rdx <= Pointer to element 2 +; rdi <= Pointer to result can be zero or one. +; Modified Registers: +; r8, r9, 10, r11, rax, rcx +;;;;;;;;;;;;;;;;;;;;;; +Fr_neq: + sub rsp, 40 ; Save space for the result of the substraction + push rdi ; Save rdi + lea rdi, [rsp+8] ; We pushed rdi so we need to add 8 + call Fr_sub ; Do a substraction + call Fr_toNormal ; Convert it to normal + pop rdi + + mov rax, [rsp] ; We already poped do no need to add 8 + bt rax, 63 ; check is result is long + jc neq_longCmp + +neq_shortCmp: + cmp eax, 0 + je neq_s_eq + js neq_s_lt +neq_s_gt: + + mov qword [rdi], 1 + add rsp, 40 + ret + +neq_s_lt: + + mov qword [rdi], 1 + add rsp, 40 + ret + +neq_s_eq: + + mov qword [rdi], 0 + add rsp, 40 + ret + + +neq_longCmp: + + + cmp qword [rsp + 32], 0 + jnz neq_neq + + cmp qword [rsp + 24], 0 + jnz neq_neq + + cmp qword [rsp + 16], 0 + jnz neq_neq + + cmp qword [rsp + 8], 0 + jnz neq_neq + +neq_eq: + + mov qword [rdi], 0 + add rsp, 40 + ret + +neq_neq: + + mov qword [rdi], 1 + add rsp, 40 + ret + + + + + +;;;;;;;;;;;;;;;;;;;;;; +; lt +;;;;;;;;;;;;;;;;;;;;;; +; Adds two elements of any kind +; Params: +; rsi <= Pointer to element 1 +; rdx <= Pointer to element 2 +; rdi <= Pointer to result can be zero or one. +; Modified Registers: +; r8, r9, 10, r11, rax, rcx +;;;;;;;;;;;;;;;;;;;;;; +Fr_lt: + sub rsp, 40 ; Save space for the result of the substraction + push rdi ; Save rdi + lea rdi, [rsp+8] ; We pushed rdi so we need to add 8 + call Fr_sub ; Do a substraction + call Fr_toNormal ; Convert it to normal + pop rdi + + mov rax, [rsp] ; We already poped do no need to add 8 + bt rax, 63 ; check is result is long + jc lt_longCmp + +lt_shortCmp: + cmp eax, 0 + je lt_s_eq + js lt_s_lt +lt_s_gt: + + mov qword [rdi], 0 + add rsp, 40 + ret + +lt_s_lt: + + mov qword [rdi], 1 + add rsp, 40 + ret + +lt_s_eq: + + mov qword [rdi], 0 + add rsp, 40 + ret + + +lt_longCmp: + + + cmp qword [rsp + 32], 0 + jnz lt_neq + + cmp qword [rsp + 24], 0 + jnz lt_neq + + cmp qword [rsp + 16], 0 + jnz lt_neq + + cmp qword [rsp + 8], 0 + jnz lt_neq + +lt_eq: + + + + mov qword [rdi], 0 + add rsp, 40 + ret + + + + + + + mov rax, [rsp + 32] + cmp [half + 24], rax ; comare with (q-1)/2 + jc tmp29 ; half e1-e2 is neg => e1 < e2 + jnz tmp28 ; half>rax => e1 -e2 is pos => e1 > e2 + + mov rax, [rsp + 24] + cmp [half + 16], rax ; comare with (q-1)/2 + jc tmp29 ; half e1-e2 is neg => e1 < e2 + jnz tmp28 ; half>rax => e1 -e2 is pos => e1 > e2 + + mov rax, [rsp + 16] + cmp [half + 8], rax ; comare with (q-1)/2 + jc tmp29 ; half e1-e2 is neg => e1 < e2 + jnz tmp28 ; half>rax => e1 -e2 is pos => e1 > e2 + + mov rax, [rsp + 8] + cmp [half + 0], rax ; comare with (q-1)/2 + jc tmp29 ; half e1-e2 is neg => e1 < e2 + jnz tmp28 ; half>rax => e1 -e2 is pos => e1 > e2 + + ; half == rax => e1-e2 is pos => e1 > e2 +tmp28: + + mov qword [rdi], 0 + add rsp, 40 + ret + +tmp29: + + mov qword [rdi], 1 + add rsp, 40 + ret + + +lt_neq: + + + + + + + + + mov rax, [rsp + 32] + cmp [half + 24], rax ; comare with (q-1)/2 + jc tmp31 ; half e1-e2 is neg => e1 < e2 + jnz tmp30 ; half>rax => e1 -e2 is pos => e1 > e2 + + mov rax, [rsp + 24] + cmp [half + 16], rax ; comare with (q-1)/2 + jc tmp31 ; half e1-e2 is neg => e1 < e2 + jnz tmp30 ; half>rax => e1 -e2 is pos => e1 > e2 + + mov rax, [rsp + 16] + cmp [half + 8], rax ; comare with (q-1)/2 + jc tmp31 ; half e1-e2 is neg => e1 < e2 + jnz tmp30 ; half>rax => e1 -e2 is pos => e1 > e2 + + mov rax, [rsp + 8] + cmp [half + 0], rax ; comare with (q-1)/2 + jc tmp31 ; half e1-e2 is neg => e1 < e2 + jnz tmp30 ; half>rax => e1 -e2 is pos => e1 > e2 + + ; half == rax => e1-e2 is pos => e1 > e2 +tmp30: + + mov qword [rdi], 0 + add rsp, 40 + ret + +tmp31: + + mov qword [rdi], 1 + add rsp, 40 + ret + + + + + + +;;;;;;;;;;;;;;;;;;;;;; +; gt +;;;;;;;;;;;;;;;;;;;;;; +; Adds two elements of any kind +; Params: +; rsi <= Pointer to element 1 +; rdx <= Pointer to element 2 +; rdi <= Pointer to result can be zero or one. +; Modified Registers: +; r8, r9, 10, r11, rax, rcx +;;;;;;;;;;;;;;;;;;;;;; +Fr_gt: + sub rsp, 40 ; Save space for the result of the substraction + push rdi ; Save rdi + lea rdi, [rsp+8] ; We pushed rdi so we need to add 8 + call Fr_sub ; Do a substraction + call Fr_toNormal ; Convert it to normal + pop rdi + + mov rax, [rsp] ; We already poped do no need to add 8 + bt rax, 63 ; check is result is long + jc gt_longCmp + +gt_shortCmp: + cmp eax, 0 + je gt_s_eq + js gt_s_lt +gt_s_gt: + + mov qword [rdi], 1 + add rsp, 40 + ret + +gt_s_lt: + + mov qword [rdi], 0 + add rsp, 40 + ret + +gt_s_eq: + + mov qword [rdi], 0 + add rsp, 40 + ret + + +gt_longCmp: + + + cmp qword [rsp + 32], 0 + jnz gt_neq + + cmp qword [rsp + 24], 0 + jnz gt_neq + + cmp qword [rsp + 16], 0 + jnz gt_neq + + cmp qword [rsp + 8], 0 + jnz gt_neq + +gt_eq: + + + + mov qword [rdi], 0 + add rsp, 40 + ret + + + + + + + mov rax, [rsp + 32] + cmp [half + 24], rax ; comare with (q-1)/2 + jc tmp33 ; half e1-e2 is neg => e1 < e2 + jnz tmp32 ; half>rax => e1 -e2 is pos => e1 > e2 + + mov rax, [rsp + 24] + cmp [half + 16], rax ; comare with (q-1)/2 + jc tmp33 ; half e1-e2 is neg => e1 < e2 + jnz tmp32 ; half>rax => e1 -e2 is pos => e1 > e2 + + mov rax, [rsp + 16] + cmp [half + 8], rax ; comare with (q-1)/2 + jc tmp33 ; half e1-e2 is neg => e1 < e2 + jnz tmp32 ; half>rax => e1 -e2 is pos => e1 > e2 + + mov rax, [rsp + 8] + cmp [half + 0], rax ; comare with (q-1)/2 + jc tmp33 ; half e1-e2 is neg => e1 < e2 + jnz tmp32 ; half>rax => e1 -e2 is pos => e1 > e2 + + ; half == rax => e1-e2 is pos => e1 > e2 +tmp32: + + mov qword [rdi], 1 + add rsp, 40 + ret + +tmp33: + + mov qword [rdi], 0 + add rsp, 40 + ret + + +gt_neq: + + + + + + + + + mov rax, [rsp + 32] + cmp [half + 24], rax ; comare with (q-1)/2 + jc tmp35 ; half e1-e2 is neg => e1 < e2 + jnz tmp34 ; half>rax => e1 -e2 is pos => e1 > e2 + + mov rax, [rsp + 24] + cmp [half + 16], rax ; comare with (q-1)/2 + jc tmp35 ; half e1-e2 is neg => e1 < e2 + jnz tmp34 ; half>rax => e1 -e2 is pos => e1 > e2 + + mov rax, [rsp + 16] + cmp [half + 8], rax ; comare with (q-1)/2 + jc tmp35 ; half e1-e2 is neg => e1 < e2 + jnz tmp34 ; half>rax => e1 -e2 is pos => e1 > e2 + + mov rax, [rsp + 8] + cmp [half + 0], rax ; comare with (q-1)/2 + jc tmp35 ; half e1-e2 is neg => e1 < e2 + jnz tmp34 ; half>rax => e1 -e2 is pos => e1 > e2 + + ; half == rax => e1-e2 is pos => e1 > e2 +tmp34: + + mov qword [rdi], 1 + add rsp, 40 + ret + +tmp35: + + mov qword [rdi], 0 + add rsp, 40 + ret + + + + + + +;;;;;;;;;;;;;;;;;;;;;; +; leq +;;;;;;;;;;;;;;;;;;;;;; +; Adds two elements of any kind +; Params: +; rsi <= Pointer to element 1 +; rdx <= Pointer to element 2 +; rdi <= Pointer to result can be zero or one. +; Modified Registers: +; r8, r9, 10, r11, rax, rcx +;;;;;;;;;;;;;;;;;;;;;; +Fr_leq: + sub rsp, 40 ; Save space for the result of the substraction + push rdi ; Save rdi + lea rdi, [rsp+8] ; We pushed rdi so we need to add 8 + call Fr_sub ; Do a substraction + call Fr_toNormal ; Convert it to normal + pop rdi + + mov rax, [rsp] ; We already poped do no need to add 8 + bt rax, 63 ; check is result is long + jc leq_longCmp + +leq_shortCmp: + cmp eax, 0 + je leq_s_eq + js leq_s_lt +leq_s_gt: + + mov qword [rdi], 0 + add rsp, 40 + ret + +leq_s_lt: + + mov qword [rdi], 1 + add rsp, 40 + ret + +leq_s_eq: + + mov qword [rdi], 1 + add rsp, 40 + ret + + +leq_longCmp: + + + cmp qword [rsp + 32], 0 + jnz leq_neq + + cmp qword [rsp + 24], 0 + jnz leq_neq + + cmp qword [rsp + 16], 0 + jnz leq_neq + + cmp qword [rsp + 8], 0 + jnz leq_neq + +leq_eq: + + + + mov qword [rdi], 1 + add rsp, 40 + ret + + + + + + + mov rax, [rsp + 32] + cmp [half + 24], rax ; comare with (q-1)/2 + jc tmp37 ; half e1-e2 is neg => e1 < e2 + jnz tmp36 ; half>rax => e1 -e2 is pos => e1 > e2 + + mov rax, [rsp + 24] + cmp [half + 16], rax ; comare with (q-1)/2 + jc tmp37 ; half e1-e2 is neg => e1 < e2 + jnz tmp36 ; half>rax => e1 -e2 is pos => e1 > e2 + + mov rax, [rsp + 16] + cmp [half + 8], rax ; comare with (q-1)/2 + jc tmp37 ; half e1-e2 is neg => e1 < e2 + jnz tmp36 ; half>rax => e1 -e2 is pos => e1 > e2 + + mov rax, [rsp + 8] + cmp [half + 0], rax ; comare with (q-1)/2 + jc tmp37 ; half e1-e2 is neg => e1 < e2 + jnz tmp36 ; half>rax => e1 -e2 is pos => e1 > e2 + + ; half == rax => e1-e2 is pos => e1 > e2 +tmp36: + + mov qword [rdi], 0 + add rsp, 40 + ret + +tmp37: + + mov qword [rdi], 1 + add rsp, 40 + ret + + +leq_neq: + + + + + + + + + mov rax, [rsp + 32] + cmp [half + 24], rax ; comare with (q-1)/2 + jc tmp39 ; half e1-e2 is neg => e1 < e2 + jnz tmp38 ; half>rax => e1 -e2 is pos => e1 > e2 + + mov rax, [rsp + 24] + cmp [half + 16], rax ; comare with (q-1)/2 + jc tmp39 ; half e1-e2 is neg => e1 < e2 + jnz tmp38 ; half>rax => e1 -e2 is pos => e1 > e2 + + mov rax, [rsp + 16] + cmp [half + 8], rax ; comare with (q-1)/2 + jc tmp39 ; half e1-e2 is neg => e1 < e2 + jnz tmp38 ; half>rax => e1 -e2 is pos => e1 > e2 + + mov rax, [rsp + 8] + cmp [half + 0], rax ; comare with (q-1)/2 + jc tmp39 ; half e1-e2 is neg => e1 < e2 + jnz tmp38 ; half>rax => e1 -e2 is pos => e1 > e2 + + ; half == rax => e1-e2 is pos => e1 > e2 +tmp38: + + mov qword [rdi], 0 + add rsp, 40 + ret + +tmp39: + + mov qword [rdi], 1 + add rsp, 40 + ret + + + + + + +;;;;;;;;;;;;;;;;;;;;;; +; geq +;;;;;;;;;;;;;;;;;;;;;; +; Adds two elements of any kind +; Params: +; rsi <= Pointer to element 1 +; rdx <= Pointer to element 2 +; rdi <= Pointer to result can be zero or one. +; Modified Registers: +; r8, r9, 10, r11, rax, rcx +;;;;;;;;;;;;;;;;;;;;;; +Fr_geq: + sub rsp, 40 ; Save space for the result of the substraction + push rdi ; Save rdi + lea rdi, [rsp+8] ; We pushed rdi so we need to add 8 + call Fr_sub ; Do a substraction + call Fr_toNormal ; Convert it to normal + pop rdi + + mov rax, [rsp] ; We already poped do no need to add 8 + bt rax, 63 ; check is result is long + jc geq_longCmp + +geq_shortCmp: + cmp eax, 0 + je geq_s_eq + js geq_s_lt +geq_s_gt: + + mov qword [rdi], 1 + add rsp, 40 + ret + +geq_s_lt: + + mov qword [rdi], 0 + add rsp, 40 + ret + +geq_s_eq: + + mov qword [rdi], 1 + add rsp, 40 + ret + + +geq_longCmp: + + + cmp qword [rsp + 32], 0 + jnz geq_neq + + cmp qword [rsp + 24], 0 + jnz geq_neq + + cmp qword [rsp + 16], 0 + jnz geq_neq + + cmp qword [rsp + 8], 0 + jnz geq_neq + +geq_eq: + + + + mov qword [rdi], 1 + add rsp, 40 + ret + + + + + + + mov rax, [rsp + 32] + cmp [half + 24], rax ; comare with (q-1)/2 + jc tmp41 ; half e1-e2 is neg => e1 < e2 + jnz tmp40 ; half>rax => e1 -e2 is pos => e1 > e2 + + mov rax, [rsp + 24] + cmp [half + 16], rax ; comare with (q-1)/2 + jc tmp41 ; half e1-e2 is neg => e1 < e2 + jnz tmp40 ; half>rax => e1 -e2 is pos => e1 > e2 + + mov rax, [rsp + 16] + cmp [half + 8], rax ; comare with (q-1)/2 + jc tmp41 ; half e1-e2 is neg => e1 < e2 + jnz tmp40 ; half>rax => e1 -e2 is pos => e1 > e2 + + mov rax, [rsp + 8] + cmp [half + 0], rax ; comare with (q-1)/2 + jc tmp41 ; half e1-e2 is neg => e1 < e2 + jnz tmp40 ; half>rax => e1 -e2 is pos => e1 > e2 + + ; half == rax => e1-e2 is pos => e1 > e2 +tmp40: + + mov qword [rdi], 1 + add rsp, 40 + ret + +tmp41: + + mov qword [rdi], 0 + add rsp, 40 + ret + + +geq_neq: + + + + + + + + + mov rax, [rsp + 32] + cmp [half + 24], rax ; comare with (q-1)/2 + jc tmp43 ; half e1-e2 is neg => e1 < e2 + jnz tmp42 ; half>rax => e1 -e2 is pos => e1 > e2 + + mov rax, [rsp + 24] + cmp [half + 16], rax ; comare with (q-1)/2 + jc tmp43 ; half e1-e2 is neg => e1 < e2 + jnz tmp42 ; half>rax => e1 -e2 is pos => e1 > e2 + + mov rax, [rsp + 16] + cmp [half + 8], rax ; comare with (q-1)/2 + jc tmp43 ; half e1-e2 is neg => e1 < e2 + jnz tmp42 ; half>rax => e1 -e2 is pos => e1 > e2 + + mov rax, [rsp + 8] + cmp [half + 0], rax ; comare with (q-1)/2 + jc tmp43 ; half e1-e2 is neg => e1 < e2 + jnz tmp42 ; half>rax => e1 -e2 is pos => e1 > e2 + + ; half == rax => e1-e2 is pos => e1 > e2 +tmp42: + + mov qword [rdi], 1 + add rsp, 40 + ret + +tmp43: + + mov qword [rdi], 0 + add rsp, 40 + ret + + + + + + + + + section .data +Fr_q: + dd 0 + dd 0x80000000 +q dq 0x43e1f593f0000001,0x2833e84879b97091,0xb85045b68181585d,0x30644e72e131a029 +half dq 0xa1f0fac9f8000000,0x9419f4243cdcb848,0xdc2822db40c0ac2e,0x183227397098d014 +R2 dq 0x1bb8e645ae216da7,0x53fe3ab1e35c59e3,0x8c49833d53bb8085,0x0216d0b17f4e44a5 +R3 dq 0x5e94d8e1b4bf0040,0x2a489cbe1cfbb6b8,0x893cc664a19fcfed,0x0cf8594b7fcc657c +lboMask dq 0x1fffffffffffffff + diff --git a/c/buildasm/fr.asm.ejs b/c/buildasm/fr.asm.ejs new file mode 100644 index 0000000..b46b618 --- /dev/null +++ b/c/buildasm/fr.asm.ejs @@ -0,0 +1,41 @@ + + + global <%=name%>_add + global <%=name%>_sub + global <%=name%>_neg + global <%=name%>_mul + global <%=name%>_band + global <%=name%>_bor + global <%=name%>_bxor + global <%=name%>_eq + global <%=name%>_neq + global <%=name%>_lt + global <%=name%>_gt + global <%=name%>_leq + global <%=name%>_geq + global <%=name%>_toNormal + global <%=name%>_toMontgomery + global <%=name%>_q + DEFAULT REL + + section .text +<%- include('utils.asm.ejs'); %> +<%- include('copy.asm.ejs'); %> +<%- include('montgomery.asm.ejs'); %> +<%- include('add.asm.ejs'); %> +<%- include('sub.asm.ejs'); %> +<%- include('neg.asm.ejs'); %> +<%- include('mul.asm.ejs'); %> +<%- include('binops.asm.ejs'); %> +<%- include('cmpops.asm.ejs'); %> + + section .data +<%=name%>_q: + dd 0 + dd 0x80000000 +q dq <%= constantElement(q) %> +half dq <%= constantElement(q.shiftRight(1)) %> +R2 dq <%= constantElement(bigInt.one.shiftLeft(n64*64*2).mod(q)) %> +R3 dq <%= constantElement(bigInt.one.shiftLeft(n64*64*3).mod(q)) %> +lboMask dq 0x<%= bigInt("8000000000000000",16).shiftRight(n64*64 - q.bitLength()).minus(bigInt.one).toString(16) %> + diff --git a/c/buildasm/fr.c b/c/buildasm/fr.c new file mode 100644 index 0000000..bfd54cb --- /dev/null +++ b/c/buildasm/fr.c @@ -0,0 +1,39 @@ +#include "fr.h" +#include +#include +#include + +void Fr_str2element(PFrElement pE, char *s) { + mpz_t r; + mpz_init(r); + mpz_set_str(r, s, 10); + pE->type = Fr_LONG; + for (int i=0; ilongVal[i] = 0; + mpz_export((void *)pE->longVal, NULL, -1, 8, -1, 0, r); +} + +char *Fr_element2str(PFrElement pE) { + mpz_t r; + mpz_t q; + if (pE->type == Fr_SHORT) { + if (pE->shortVal>=0) { + char *r = new char[32]; + sprintf(r, "%d", pE->shortVal); + return r; + } else { + mpz_init(q); + mpz_import(q, Fr_N64, -1, 8, -1, 0, (const void *)Fr_q.longVal); + mpz_init_set_si(r, pE->shortVal); + mpz_add(r, r, q); + mpz_clear(q); + } + } else { + Fr_toNormal(pE); + mpz_init(r); + mpz_import(r, Fr_N64, -1, 8, -1, 0, (const void *)pE->longVal); + } + char *res = mpz_get_str (0, 10, r); + mpz_clear(r); + return res; +} + diff --git a/c/buildasm/fr.c.ejs b/c/buildasm/fr.c.ejs new file mode 100644 index 0000000..8149ba8 --- /dev/null +++ b/c/buildasm/fr.c.ejs @@ -0,0 +1,39 @@ +#include "<%=name.toLowerCase()+".h"%>" +#include +#include +#include + +void <%=name%>_str2element(P<%=name%>Element pE, char *s) { + mpz_t r; + mpz_init(r); + mpz_set_str(r, s, 10); + pE->type = <%=name%>_LONG; + for (int i=0; i<<%=name%>_N64; i++) pE->longVal[i] = 0; + mpz_export((void *)pE->longVal, NULL, -1, 8, -1, 0, r); +} + +char *<%=name%>_element2str(P<%=name%>Element pE) { + mpz_t r; + mpz_t q; + if (pE->type == <%=name%>_SHORT) { + if (pE->shortVal>=0) { + char *r = new char[32]; + sprintf(r, "%d", pE->shortVal); + return r; + } else { + mpz_init(q); + mpz_import(q, <%=name%>_N64, -1, 8, -1, 0, (const void *)Fr_q.longVal); + mpz_init_set_si(r, pE->shortVal); + mpz_add(r, r, q); + mpz_clear(q); + } + } else { + <%=name%>_toNormal(pE); + mpz_init(r); + mpz_import(r, <%=name%>_N64, -1, 8, -1, 0, (const void *)pE->longVal); + } + char *res = mpz_get_str (0, 10, r); + mpz_clear(r); + return res; +} + diff --git a/c/buildasm/fr.h.ejs b/c/buildasm/fr.h.ejs new file mode 100644 index 0000000..eda5d73 --- /dev/null +++ b/c/buildasm/fr.h.ejs @@ -0,0 +1,31 @@ +#include +#define <%=name%>_N64 <%= n64 %> +#define <%=name%>_SHORT 0x00000000 +#define <%=name%>_LONG 0x80000000 +#define <%=name%>_LONGMONTGOMERY 0xC0000000 +typedef struct __attribute__((__packed__)) { + int32_t shortVal; + u_int32_t type; + u_int64_t longVal[<%=name%>_N64]; +} <%=name%>Element; +typedef <%=name%>Element *P<%=name%>Element; +extern <%=name%>Element <%=name%>_q; +extern "C" void <%=name%>_add(P<%=name%>Element r, P<%=name%>Element a, P<%=name%>Element b); +extern "C" void <%=name%>_sub(P<%=name%>Element r, P<%=name%>Element a, P<%=name%>Element b); +extern "C" void <%=name%>_neg(P<%=name%>Element r, P<%=name%>Element a); +extern "C" void <%=name%>_mul(P<%=name%>Element r, P<%=name%>Element a, P<%=name%>Element b); +extern "C" void <%=name%>_band(P<%=name%>Element r, P<%=name%>Element a, P<%=name%>Element b); +extern "C" void <%=name%>_bor(P<%=name%>Element r, P<%=name%>Element a, P<%=name%>Element b); +extern "C" void <%=name%>_bxor(P<%=name%>Element r, P<%=name%>Element a, P<%=name%>Element b); +extern "C" void <%=name%>_eq(P<%=name%>Element r, P<%=name%>Element a, P<%=name%>Element b); +extern "C" void <%=name%>_neq(P<%=name%>Element r, P<%=name%>Element a, P<%=name%>Element b); +extern "C" void <%=name%>_lt(P<%=name%>Element r, P<%=name%>Element a, P<%=name%>Element b); +extern "C" void <%=name%>_gt(P<%=name%>Element r, P<%=name%>Element a, P<%=name%>Element b); +extern "C" void <%=name%>_leq(P<%=name%>Element r, P<%=name%>Element a, P<%=name%>Element b); +extern "C" void <%=name%>_geq(P<%=name%>Element r, P<%=name%>Element a, P<%=name%>Element b); +extern "C" void <%=name%>_toNormal(P<%=name%>Element pE); +extern "C" void <%=name%>_toMontgomery(P<%=name%>Element pE); +void <%=name%>_str2element(P<%=name%>Element pE, char *s); +char *<%=name%>_element2str(P<%=name%>Element pE); +extern <%=name%>Element <%=name%>_q; + diff --git a/c/buildasm/fr.o b/c/buildasm/fr.o new file mode 100644 index 0000000..3b81456 Binary files /dev/null and b/c/buildasm/fr.o differ diff --git a/c/buildasm/main b/c/buildasm/main new file mode 100755 index 0000000..129bb83 Binary files /dev/null and b/c/buildasm/main differ diff --git a/c/buildasm/main.c b/c/buildasm/main.c new file mode 100644 index 0000000..62d3123 --- /dev/null +++ b/c/buildasm/main.c @@ -0,0 +1,24 @@ +#include "stdio.h" +#include "fr.h" + +int main() { + + FrElement a = { 0, Fr_LONGMONTGOMERY, {1,1,1,1}}; + FrElement b = { 0, Fr_LONGMONTGOMERY, {2,2,2,2}}; + +/* + FrElement a={0x43e1f593f0000000ULL,0x2833e84879b97091ULL,0xb85045b68181585dULL,0x30644e72e131a029ULL}; + FrElement b = {3,0,0,0}; +*/ + FrElement c; + +// Fr_add(&(c[0]), a, a); +// Fr_add(&(c[0]), c, b); + + for (int i=0; i<1000000000; i++) { + Fr_mul(&c, &a, &b); + } + + Fr_mul(&c,&a, &b); + printf("%llu, %llu, %llu, %llu\n", c.longVal[0], c.longVal[1], c.longVal[2], c.longVal[3]); +} diff --git a/c/buildasm/main.dSYM/Contents/Info.plist b/c/buildasm/main.dSYM/Contents/Info.plist new file mode 100644 index 0000000..fe7fecd --- /dev/null +++ b/c/buildasm/main.dSYM/Contents/Info.plist @@ -0,0 +1,20 @@ + + + + + CFBundleDevelopmentRegion + English + CFBundleIdentifier + com.apple.xcode.dsym.main + CFBundleInfoDictionaryVersion + 6.0 + CFBundlePackageType + dSYM + CFBundleSignature + ???? + CFBundleShortVersionString + 1.0 + CFBundleVersion + 1 + + diff --git a/c/buildasm/main.dSYM/Contents/Resources/DWARF/main b/c/buildasm/main.dSYM/Contents/Resources/DWARF/main new file mode 100644 index 0000000..c6a803b Binary files /dev/null and b/c/buildasm/main.dSYM/Contents/Resources/DWARF/main differ diff --git a/c/buildasm/montgomery.asm.ejs b/c/buildasm/montgomery.asm.ejs new file mode 100644 index 0000000..1652c7a --- /dev/null +++ b/c/buildasm/montgomery.asm.ejs @@ -0,0 +1,273 @@ + + + +<% +////////////////////// +// montgomeryTemplate +////////////////////// +// This function creates functions with the montgomery transformation +// applied +// the round hook allows to add diferent code in the iteration +// +// All the montgomery functions modifies: +// r8, r9, 10, r11, rax, rcx +////////////////////// +function montgomeryTemplate(fnName, round) { + let r0, r1, r2; + function setR(step) { + if ((step % 3) == 0) { + r0 = "r8"; + r1 = "r9"; + r2 = "r10"; + } else if ((step % 3) == 1) { + r0 = "r9"; + r1 = "r10"; + r2 = "r8"; + } else { + r0 = "r10"; + r1 = "r8"; + r2 = "r9"; + } + } + + const base = bigInt.one.shiftLeft(64); + const np64 = base.minus(q.modInv(base)); +%> +<%=fnName%>: + sub rsp, <%= n64*8 %> ; Reserve space for ms + mov rcx, rdx ; rdx is needed for multiplications so keep it in cx + mov r11, 0x<%= np64.toString(16) %> ; np + xor r8,r8 + xor r9,r9 + xor r10,r10 +<% + // Main loop + for (let i=0; i + +<% + for (let j=i-1; j>=0; j--) { // All ms + if (((i-j) + mov rax, [rsp + <%= j*8 %>] + mul qword [q + <%= (i-j)*8 %>] + add <%= r0 %>, rax + adc <%= r1 %>, rdx + adc <%= r2 %>, 0x0 +<% + } + } // ms +%> + +<% + if (i + mov rax, <%= r0 %> + mul r11 + mov [rsp + <%= i*8 %>], rax + mul qword [q] + add <%= r0 %>, rax + adc <%= r1 %>, rdx + adc <%= r2 %>, 0x0 +<% + } else { +%> + mov [rdi + <%= (i-n64)*8 %> ], <%= r0 %> + xor <%= r0 %>,<%= r0 %> +<% + } +%> + +<% + } // Main Loop +%> + test <%= r1 %>, <%= r1 %> + jnz <%=fnName%>_mulM_sq + ; Compare with q +<% + for (let i=0; i + mov rax, [rdi + <%= (n64-i-1)*8 %>] + cmp rax, [q + <%= (n64-i-1)*8 %>] + jc <%=fnName%>_mulM_done ; q is bigget so done. + jnz <%=fnName%>_mulM_sq ; q is lower +<% + } +%> + ; If equal substract q + +<%=fnName%>_mulM_sq: +<% + for (let i=0; i + mov rax, [q + <%= i*8 %>] + <%= i==0 ? "sub" : "sbb" %> [rdi + <%= i*8 %>], rax +<% + } +%> + +<%=fnName%>_mulM_done: + mov rdx, rcx ; recover rdx to its original place. + add rsp, <%= n64*8 %> ; recover rsp + ret + +<% +} // Template +%> + +;;;;;;;;;;;;;;;;;;;;;; +; rawMontgomeryMul +;;;;;;;;;;;;;;;;;;;;;; +; Multiply two elements in montgomery form +; Params: +; rsi <= Pointer to the long data of element 1 +; rdx <= Pointer to the long data of element 2 +; rdi <= Pointer to the long data of result +; Modified registers: +; r8, r9, 10, r11, rax, rcx +;;;;;;;;;;;;;;;;;;;;;; +<% +montgomeryTemplate("rawMontgomeryMul", function(i, r0, r1, r2) { + // Same Digit + for (let o1=Math.max(0, i-n64+1); (o1<=i)&&(o1 + mov rax, [rsi + <%= 8*o1 %>] + mul qword [rcx + <%= 8*o2 %>] + add <%= r0 %>, rax + adc <%= r1 %>, rdx + adc <%= r2 %>, 0x0 +<% + } // Same digit +}) +%> + +;;;;;;;;;;;;;;;;;;;;;; +; rawMontgomeryMul1 +;;;;;;;;;;;;;;;;;;;;;; +; Multiply two elements in montgomery form +; Params: +; rsi <= Pointer to the long data of element 1 +; rdx <= second operand +; rdi <= Pointer to the long data of result +; Modified registers: +; r8, r9, 10, r11, rax, rcx +;;;;;;;;;;;;;;;;;;;;;; +<% +montgomeryTemplate("rawMontgomeryMul1", function(i, r0, r1, r2) { + // Same Digit + if (i + mov rax, [rsi + <%= 8*i %>] + mul rcx + add <%= r0 %>, rax + adc <%= r1 %>, rdx + adc <%= r2 %>, 0x0 +<% + } // Same digit +}) +%> + + +;;;;;;;;;;;;;;;;;;;;;; +; rawFromMontgomery +;;;;;;;;;;;;;;;;;;;;;; +; Multiply two elements in montgomery form +; Params: +; rsi <= Pointer to the long data of element 1 +; rdi <= Pointer to the long data of result +; Modified registers: +; r8, r9, 10, r11, rax, rcx +;;;;;;;;;;;;;;;;;;;;;; +<% +montgomeryTemplate("rawFromMontgomery", function(i, r0, r1, r2) { + // Same Digit + if (i + add <%= r0 %>, [rdi + <%= 8*i %>] + adc <%= r1 %>, 0x0 + adc <%= r2 %>, 0x0 +<% + } // Same digit +}) +%> + +;;;;;;;;;;;;;;;;;;;;;; +; toMontgomery +;;;;;;;;;;;;;;;;;;;;;; +; Convert a number to Montgomery +; rdi <= Pointer element to convert +; Modified registers: +; r8, r9, 10, r11, rax, rcx +;;;;;;;;;;;;;;;;;;;; +<%=name%>_toMontgomery: + mov rax, [rdi] + bts rax, 62 ; check if montgomery + jc toMontgomery_doNothing + bts rax, 63 + jc toMontgomeryLong + +toMontgomeryShort: + mov [rdi], rax + add rdi, 8 + push rsi + lea rsi, [R2] + movsx rdx, eax + cmp rdx, 0 + js negMontgomeryShort +posMontgomeryShort: + call rawMontgomeryMul1 + pop rsi + sub rdi, 8 + ret + +negMontgomeryShort: + neg rdx ; Do the multiplication positive and then negate the result. + call rawMontgomeryMul1 + mov rsi, rdi + call rawNegL + pop rsi + sub rdi, 8 + ret + + +toMontgomeryLong: + mov [rdi], rax + add rdi, 8 + push rsi + mov rdx, rdi + lea rsi, [R2] + call rawMontgomeryMul + pop rsi + sub rdi, 8 + +toMontgomery_doNothing: + ret + +;;;;;;;;;;;;;;;;;;;;;; +; toNormal +;;;;;;;;;;;;;;;;;;;;;; +; Convert a number from Montgomery +; rdi <= Pointer element to convert +; Modified registers: +; r8, r9, 10, r11, rax, rcx +;;;;;;;;;;;;;;;;;;;; +<%=name%>_toNormal: + mov rax, [rdi] + btc rax, 62 ; check if montgomery + jnc fromMontgomery_doNothing + bt rax, 63 ; if short, it means it's converted + jnc fromMontgomery_doNothing + +fromMontgomeryLong: + mov [rdi], rax + add rdi, 8 + call rawFromMontgomery + sub rdi, 8 + +fromMontgomery_doNothing: + ret + + diff --git a/c/buildasm/mul.asm.ejs b/c/buildasm/mul.asm.ejs new file mode 100644 index 0000000..88ecd48 --- /dev/null +++ b/c/buildasm/mul.asm.ejs @@ -0,0 +1,211 @@ +<% function mulS1S2() { %> + xor rax, rax + mov eax, r8d + imul r9d + jo mul_manageOverflow ; rsi already is the 64bits result + + mov [rdi], rax ; not necessary to adjust so just save and return + +mul_manageOverflow: ; Do the operation in 64 bits + push rsi + movsx rax, r8d + movsx rcx, r9d + imul rcx + mov rsi, rax + call rawCopyS2L + pop rsi +<% } %> + +<% function mulL1S2(t) { %> + push rsi + add rsi, 8 + movsx rdx, r9d + add rdi, 8 + cmp rdx, 0 + <% const rawPositiveLabel = global.tmpLabel() %> + jns <%= rawPositiveLabel %> + neg rdx + call rawMontgomeryMul1 + mov rsi, rdi + call rawNegL + sub rdi, 8 + pop rsi + <% const done = global.tmpLabel() %> + jmp <%= done %> +<%= rawPositiveLabel %>: + call rawMontgomeryMul1 + sub rdi, 8 + pop rsi +<%= done %>: + +<% } %> + +<% function mulS1L2() { %> + push rsi + lea rsi, [rdx + 8] + movsx rdx, r8d + add rdi, 8 + cmp rdx, 0 + <% const rawPositiveLabel = global.tmpLabel() %> + jns <%= rawPositiveLabel %> + neg rdx + call rawMontgomeryMul1 + mov rsi, rdi + call rawNegL + sub rdi, 8 + pop rsi + <% const done = global.tmpLabel() %> + jmp <%= done %> +<%= rawPositiveLabel %>: + call rawMontgomeryMul1 + sub rdi, 8 + pop rsi +<%= done %>: + +<% } %> + +<% function mulL1L2() { %> + add rdi, 8 + add rsi, 8 + add rdx, 8 + call rawMontgomeryMul + sub rdi, 8 + sub rsi, 8 +<% } %> + +<% function mulR3() { %> + push rsi + add rdi, 8 + mov rsi, rdi + lea rdx, [R3] + call rawMontgomeryMul + sub rdi, 8 + pop rsi +<% } %> + + +;;;;;;;;;;;;;;;;;;;;;; +; mul +;;;;;;;;;;;;;;;;;;;;;; +; Multiplies two elements of any kind +; Params: +; rsi <= Pointer to element 1 +; rdx <= Pointer to element 2 +; rdi <= Pointer to result +; [rdi] = [rsi] * [rdi] +; Modified Registers: +; r8, r9, 10, r11, rax, rcx +;;;;;;;;;;;;;;;;;;;;;; +<%=name%>_mul: + mov r8, [rsi] + mov r9, [rdx] + bt r8, 63 ; Check if is short first operand + jc mul_l1 + bt r9, 63 ; Check if is short second operand + jc mul_s1l2 + +mul_s1s2: ; Both operands are short +<%= mulS1S2() %> + ret + +mul_l1: + bt r9, 63 ; Check if is short second operand + jc mul_l1l2 + +;;;;;;;; +mul_l1s2: + bt r8, 62 ; check if montgomery first + jc mul_l1ms2 +mul_l1ns2: + bt r9, 62 ; check if montgomery first + jc mul_l1ns2m +mul_l1ns2n: +<%= global.setTypeDest("0xC0"); %> +<%= mulL1S2() %> +<%= mulR3() %> + ret + + +mul_l1ns2m: +<%= global.setTypeDest("0x80"); %> +<%= mulL1L2() %> + ret + + +mul_l1ms2: + bt r9, 62 ; check if montgomery second + jc mul_l1ms2m +mul_l1ms2n: +<%= global.setTypeDest("0x80"); %> +<%= mulL1S2() %> + ret + +mul_l1ms2m: +<%= global.setTypeDest("0xC0"); %> +<%= mulL1L2() %> + ret + + +;;;;;;;; +mul_s1l2: + bt r8, 62 ; check if montgomery first + jc mul_s1ml2 +mul_s1nl2: + bt r9, 62 ; check if montgomery first + jc mul_s1nl2m +mul_s1nl2n: +<%= global.setTypeDest("0xC0"); %> +<%= mulS1L2() %> +<%= mulR3() %> + ret + +mul_s1nl2m: +<%= global.setTypeDest("0x80"); %> +<%= mulS1L2(); %> + ret + +mul_s1ml2: + bt r9, 62 ; check if montgomery first + jc mul_s1ml2m +mul_s1ml2n: +<%= global.setTypeDest("0x80"); %> +<%= mulL1L2() %> + ret + +mul_s1ml2m: +<%= global.setTypeDest("0xC0"); %> +<%= mulL1L2() %> + ret + +;;;; +mul_l1l2: + bt r8, 62 ; check if montgomery first + jc mul_l1ml2 +mul_l1nl2: + bt r9, 62 ; check if montgomery second + jc mul_l1nl2m +mul_l1nl2n: +<%= global.setTypeDest("0xC0"); %> +<%= mulL1L2() %> +<%= mulR3() %> + ret + +mul_l1nl2m: +<%= global.setTypeDest("0x80"); %> +<%= mulL1L2() %> + ret + +mul_l1ml2: + bt r9, 62 ; check if montgomery seconf + jc mul_l1ml2m +mul_l1ml2n: +<%= global.setTypeDest("0x80"); %> +<%= mulL1L2() %> + ret + +mul_l1ml2m: +<%= global.setTypeDest("0xC0"); %> +<%= mulL1L2() %> + ret + + diff --git a/c/buildasm/neg.asm.ejs b/c/buildasm/neg.asm.ejs new file mode 100644 index 0000000..d0796dc --- /dev/null +++ b/c/buildasm/neg.asm.ejs @@ -0,0 +1,78 @@ +<% function negS() { %> + neg eax + jo neg_manageOverflow ; Check if overflow. (0x80000000 is the only case) + + mov [rdi], rax ; not necessary to adjust so just save and return + ret + +neg_manageOverflow: ; Do the operation in 64 bits + push rsi + movsx rsi, eax + neg rsi + call rawCopyS2L + pop rsi + ret +<% } %> + +<% function negL() { %> + add rdi, 8 + add rsi, 8 + call rawNegL + sub rdi, 8 + sub rsi, 8 + ret +<% } %> + +;;;;;;;;;;;;;;;;;;;;;; +; neg +;;;;;;;;;;;;;;;;;;;;;; +; Adds two elements of any kind +; Params: +; rsi <= Pointer to element to be negated +; rdi <= Pointer to result +; [rdi] = -[rsi] +;;;;;;;;;;;;;;;;;;;;;; +<%=name%>_neg: + mov rax, [rsi] + bt rax, 63 ; Check if is short first operand + jc neg_l + +neg_s: ; Operand is short +<%= negS() %> + + +neg_l: + mov [rdi], rax ; Copy the type +<%= negL() %> + + +;;;;;;;;;;;;;;;;;;;;;; +; rawNeg +;;;;;;;;;;;;;;;;;;;;;; +; Negates a value +; Params: +; rdi <= Pointer to the long data of result +; rsi <= Pointer to the long data of element 1 +; +; [rdi] = - [rsi] +;;;;;;;;;;;;;;;;;;;;;; +rawNegL: + ; Compare is zero + + xor rax, rax +<% for (let i=0; i + cmp [rsi + <%=i*8%>], rax + jnz doNegate +<% } %> + ; it's zero so just set to zero +<% for (let i=0; i + mov [rdi + <%=i*8%>], rax +<% } %> + ret +doNegate: +<% for (let i=0; i + mov rax, [q + <%=i*8%>] + <%= i==0 ? "sub" : "sbb" %> rax, [rsi + <%=i*8%>] + mov [rdi + <%=i*8%>], rax +<% } %> + ret diff --git a/c/buildasm/old/buildfieldasm.js b/c/buildasm/old/buildfieldasm.js new file mode 100644 index 0000000..fc48e4c --- /dev/null +++ b/c/buildasm/old/buildfieldasm.js @@ -0,0 +1,33 @@ +const tester = require("../c/buildasm/buildzqfieldtester2.js"); + +const bigInt = require("big-integer"); + +const __P__ = new bigInt("21888242871839275222246405745257275088548364400416034343698204186575808495617"); + + +describe("basic cases", function () { + this.timeout(100000); + it("should do basic tests", async () => { + await tester(__P__, [ + ["add", 0, 0], + ["add", 0, 1], + ["add", 1, 0], + ["add", 1, 1], + ["add", 2, 1], + ["add", 2, 10], + ["add", -1, -1], + ["add", -20, -10], + ["add", "10604728079509999371218483608188593244163417117449316147628604036713980815027", "10604728079509999371218483608188593244163417117449316147628604036713980815027"], + + ["mul", 0, 0], + ["mul", 0, 1], + ["mul", 1, 0], + ["mul", 1, 1], + ["mul", 2, 1], + ["mul", 2, 10], + ["mul", -1, -1], + ["mul", -20, -10], + ["mul", "10604728079509999371218483608188593244163417117449316147628604036713980815027", "10604728079509999371218483608188593244163417117449316147628604036713980815027"], + ]); + }); +}); diff --git a/c/buildasm/old/buildzqfield.js b/c/buildasm/old/buildzqfield.js new file mode 100644 index 0000000..72e5284 --- /dev/null +++ b/c/buildasm/old/buildzqfield.js @@ -0,0 +1,209 @@ +const bigInt=require("big-integer"); + + + + + +class ZqBuilder { + constructor(q, name) { + this.q=bigInt(q); + this.h = []; + this.c = []; + this.name = name; + } + + build() { + this._buildHeaders(); + this._buildAdd(); + this._buildMul(); + + this.c.push(""); this.h.push(""); + return [this.h.join("\n"), this.c.join("\n")]; + } + + _buildHeaders() { + this.n64 = Math.floor((this.q.bitLength() - 1) / 64)+1; + this.h.push("typedef unsigned long long u64;"); + this.h.push(`typedef u64 ${this.name}Element[${this.n64}];`); + this.h.push(`typedef u64 *P${this.name}Element;`); + this.h.push(`extern ${this.name}Element ${this.name}_q;`); + this.h.push(`#define ${this.name}_N64 ${this.n64}`); + this.c.push(`#include "${this.name.toLowerCase()}.h"`); + this._defineConstant(`${this.name}_q`, this.q); + this.c.push(""); this.h.push(""); + } + + _defineConstant(n, v) { + let S = `${this.name}Element ${n}={`; + const mask = bigInt("FFFFFFFFFFFFFFFF", 16); + for (let i=0; i0) S = S+","; + let shex = v.shiftRight(i*64).and(mask).toString(16); + while (shex <16) shex = "0" + shex; + S = S + "0x" + shex + "ULL"; + } + S += "};"; + this.c.push(S); + } + + _buildAdd() { + this.h.push(`void ${this.name}_add(P${this.name}Element r, P${this.name}Element a, P${this.name}Element b);`); + this.c.push(`void ${this.name}_add(P${this.name}Element r, P${this.name}Element a, P${this.name}Element b) {`); + this.c.push(" __asm__ __volatile__ ("); + for (let i=0; i0) { + this.c.push(` "movq ${(this.n64 - i-1)*8}(%0), %%rax;"`); + } + this.c.push(` "cmp ${(this.n64 - i-1)*8}(%3), %%rax;"`); + this.c.push(" \"jg SQ;\""); + this.c.push(" \"jl DONE;\""); + } + this.c.push(" \"SQ:\""); + for (let i=0; i=0; j--) { + if (((i-j)_add + global <%=name%>_mul + global <%=name%>_q + DEFAULT REL + + section .text + +;;;;;;;;;;;;;;;;;;;;;; +; add +;;;;;;;;;;;;;;;;;;;;;; +<%=name%>_add: + ; Add component by component with carry +<% for (let i=0; i + mov rax, [rsi + <%=i*8%>] + <%= i==0 ? "add" : "adc" %> rax, [rdx + <%=i*8%>] + mov [rdi + <%=i*8%>], rax +<% } %> + jc add_sq ; if overflow, substract q + + ; Compare with q +<% for (let i=0; i +<% if (i>0) { %> + mov rax, [rdi + <%= (n64-i-1)*8 %>] +<% } %> + cmp rax, [q + <%= (n64-i-1)*8 %>] + jg add_sq + jl add_done +<% } %> + ; If equal substract q +add_sq: +<% for (let i=0; i + mov rax, [q + <%=i*8%>] + <%= i==0 ? "sub" : "sbb" %> [rdi + <%=i*8%>], rax + mov [rdx + <%=i*8%>], rax +<% } %> + +add_done: + ret + + +;;;;;;;;;;;;;;;;;;;;;; +; mul Montgomery +;;;;;;;;;;;;;;;;;;;;;; +mulM: +<% +let r0, r1, r2; +function setR(step) { + if ((step % 3) == 0) { + r0 = "r8"; + r1 = "r9"; + r2 = "r10"; + } else if ((step % 3) == 1) { + r0 = "r9"; + r1 = "r10"; + r2 = "r8"; + } else { + r0 = "r10"; + r1 = "r8"; + r2 = "r9"; + } +} + +const base = bigInt.one.shiftLeft(64); +const np64 = base.minus(q.modInv(base)); +%> + sub rsp, <%= n64*8 %> ; Reserve space for ms + mov rcx, rdx ; rdx is needed for multiplications so keep it in cx + mov r11, 0x<%= np64.toString(16) %> ; np + xor r8,r8 + xor r9,r9 + xor r10,r10 +<% +// Main loop +for (let i=0; i +<% + // Same Digit + for (let o1=Math.max(0, i-n64+1); (o1<=i)&&(o1 + mov rax, [rsi + <%= 8*o1 %>] + mul qword [rcx + <%= 8*o2 %>] + add <%= r0 %>, rax + adc <%= r1 %>, rdx + adc <%= r2 %>, 0x0 +<% + } // Same digit +%> + + +<% + for (let j=i-1; j>=0; j--) { // All ms + if (((i-j) + mov rax, [rsp + <%= j*8 %>] + mul qword [q + <%= (i-j)*8 %>] + add <%= r0 %>, rax + adc <%= r1 %>, rdx + adc <%= r2 %>, 0x0 +<% + } + } // ms +%> + +<% + if (i + mov rax, <%= r0 %> + mul r11 + mov [rsp + <%= i*8 %>], rax + mul qword [q] + add <%= r0 %>, rax + adc <%= r1 %>, rdx + adc <%= r2 %>, 0x0 +<% + } else { +%> + mov [rdi + <%= (i-n64)*8 %> ], <%= r0 %> + xor <%= r0 %>,<%= r0 %> +<% + } +%> + +<% +} // Main Loop +%> + cmp <%= r1 %>, 0x0 + jne mulM_sq + ; Compare with q +<% +for (let i=0; i + mov rax, [rdi + <%= (n64-i-1)*8 %>] + cmp rax, [q + <%= (n64-i-1)*8 %>] + jg mulM_sq + jl mulM_done +<% +} +%> + ; If equal substract q + +mulM_sq: +<% +for (let i=0; i + mov rax, [q + <%= i*8 %>] + <%= i==0 ? "sub" : "sbb" %> [rdi + <%= i*8 %>], rax + mov [rdx + <%= i*8 %>], rax +<% +} +%> + +mulM_done: + add rsp, <%= n64*8 %> ; recover rsp + ret + +;;;;;;;;;;;;;;;;;;;;;; +; mul MontgomeryShort +;;;;;;;;;;;;;;;;;;;;;; +mulSM: + +;;;;;;;;;;;;;;;;;;;;;; +; mul +;;;;;;;;;;;;;;;;;;;;;; +<%=name%>_mul: + mov rax, [rsi] + bt rax, 63 + jc l1 + mov rcx, [rdx] + bt rcx, 63 + jc s1l2 +s1s2: ; short first and second + mul ecx + jc rs2l ; If if doesn't feed in 32 bits convert the result to long + + ; The shorts multiplication is done. copy the val to destination and return + mov [rdi], rax + ret + +rs2l: ; The result in the multiplication doen't feed + ; we have the result in edx:eax we need to convert it to long + shl rdx, 32 + mov edx, eax ; pack edx:eax to rdx + + xor rax, rax ; Set the format to long + bts rax, 63 + mov [rdi], rax ; move the first digit + + cmp rdx, 0 ; check if redx is negative. + jl rs2ln + + ; edx is positive. + mov [rdi + 8], rdx ; Set the firs digit + + xor rax, rax ; Set the remaining digits to 0 +<% for (let i=1; i + mov [rdi + <%= (i+1)*8 %>], rax +<% } %> + ret + + ; edx is negative. +rs2ln: + + add rdx, [q] ; Set the firs digit + mov [rdi + 8], rdx ; + + mov rdx, -1 ; all ones +<% for (let i=1; i + mov rax, rdx ; Add to q + adc rax, [q + <%= i*8 %> ] + mov [rdi + <%= (i+1)*8 %>], rax +<% } %> + ret + +l1: + mov rcx, [rdx] + bt rcx, 63 + jc ll + +l1s2: + xor rdx, rdx + mov edx, ecx + bt rax, 62 + jc lsM + jmp lsN + +s1l2: + mov rsi, rdx + xor rdx, rdx + mov edx, eax + bt rcx, 62 + jc lsM + jmp lsN + + +lsN: + mov byte [rdi + 3], 0xC0 ; set the result to montgomery + add rsi, 8 + add rdi, 8 + call mulSM + mov rdx, R3 + call mulM + ret + +lsM: + mov byte [rdi + 3], 0x80 ; set the result to long normal + add rsi, 8 + add rdi, 8 + call mulSM + ret + + +ll: + + bt rax, 62 + jc lml + bt rcx, 62 + jc lnlm + +lnln: + mov byte [rdi + 3], 0xC0 ; set the result to long montgomery + add rsi, 8 + add rdi, 8 + add rdx, 8 + call mulM + mov rdi, rsi + mov rdx, R3 + call mulM + ret + +lml: + bt rcx, 62 + jc lmlm + +lnlm: + mov byte [rdi + 3], 0x80 ; set the result to long normal + add rsi, 8 + add rdi, 8 + add rdx, 8 + call mulM + ret + +lmlm: + mov byte [rdi + 3], 0xC0 ; set the result to long montgomery + add rsi, 8 + add rdi, 8 + add rdx, 8 + call mulM + ret + + + section .data +<%=name%>_q: + dd 0 + dd 0x80000000 +q dq <%= constantElement(q) %> +R3 dq <%= constantElement(bigInt.one.shiftLeft(n64*64*3).mod(q)) %> + + diff --git a/c/buildasm/old/mul.asm.ejs b/c/buildasm/old/mul.asm.ejs new file mode 100644 index 0000000..f6b537e --- /dev/null +++ b/c/buildasm/old/mul.asm.ejs @@ -0,0 +1,251 @@ + +;;;;;;;;;;;;;;;;;;;;;; +; mul Montgomery +;;;;;;;;;;;;;;;;;;;;;; +mulM: +<% +let r0, r1, r2; +function setR(step) { + if ((step % 3) == 0) { + r0 = "r8"; + r1 = "r9"; + r2 = "r10"; + } else if ((step % 3) == 1) { + r0 = "r9"; + r1 = "r10"; + r2 = "r8"; + } else { + r0 = "r10"; + r1 = "r8"; + r2 = "r9"; + } +} + +const base = bigInt.one.shiftLeft(64); +const np64 = base.minus(q.modInv(base)); +%> + sub rsp, <%= n64*8 %> ; Reserve space for ms + mov rcx, rdx ; rdx is needed for multiplications so keep it in cx + mov r11, 0x<%= np64.toString(16) %> ; np + xor r8,r8 + xor r9,r9 + xor r10,r10 +<% +// Main loop +for (let i=0; i +<% + // Same Digit + for (let o1=Math.max(0, i-n64+1); (o1<=i)&&(o1 + mov rax, [rsi + <%= 8*o1 %>] + mul qword [rcx + <%= 8*o2 %>] + add <%= r0 %>, rax + adc <%= r1 %>, rdx + adc <%= r2 %>, 0x0 +<% + } // Same digit +%> + + +<% + for (let j=i-1; j>=0; j--) { // All ms + if (((i-j) + mov rax, [rsp + <%= j*8 %>] + mul qword [q + <%= (i-j)*8 %>] + add <%= r0 %>, rax + adc <%= r1 %>, rdx + adc <%= r2 %>, 0x0 +<% + } + } // ms +%> + +<% + if (i + mov rax, <%= r0 %> + mul r11 + mov [rsp + <%= i*8 %>], rax + mul qword [q] + add <%= r0 %>, rax + adc <%= r1 %>, rdx + adc <%= r2 %>, 0x0 +<% + } else { +%> + mov [rdi + <%= (i-n64)*8 %> ], <%= r0 %> + xor <%= r0 %>,<%= r0 %> +<% + } +%> + +<% +} // Main Loop +%> + cmp <%= r1 %>, 0x0 + jne mulM_sq + ; Compare with q +<% +for (let i=0; i + mov rax, [rdi + <%= (n64-i-1)*8 %>] + cmp rax, [q + <%= (n64-i-1)*8 %>] + jg mulM_sq + jl mulM_done +<% +} +%> + ; If equal substract q + +mulM_sq: +<% +for (let i=0; i + mov rax, [q + <%= i*8 %>] + <%= i==0 ? "sub" : "sbb" %> [rdi + <%= i*8 %>], rax +<% +} +%> + +mulM_done: + add rsp, <%= n64*8 %> ; recover rsp + ret + +;;;;;;;;;;;;;;;;;;;;;; +; mul MontgomeryShort +;;;;;;;;;;;;;;;;;;;;;; +mulSM: + +;;;;;;;;;;;;;;;;;;;;;; +; mul +;;;;;;;;;;;;;;;;;;;;;; +<%=name%>_mul: + mov rax, [rsi] + bt rax, 63 + jc l1 + mov rcx, [rdx] + bt rcx, 63 + jc s1l2 +s1s2: ; short first and second + mul ecx + jc rs2l ; If if doesn't feed in 32 bits convert the result to long + + ; The shorts multiplication is done. copy the val to destination and return + mov [rdi], rax + ret + +rs2l: ; The result in the multiplication doen't feed + ; we have the result in edx:eax we need to convert it to long + shl rdx, 32 + mov edx, eax ; pack edx:eax to rdx + + xor rax, rax ; Set the format to long + bts rax, 63 + mov [rdi], rax ; move the first digit + + cmp rdx, 0 ; check if redx is negative. + jl rs2ln + + ; edx is positive. + mov [rdi + 8], rdx ; Set the firs digit + + xor rax, rax ; Set the remaining digits to 0 +<% for (let i=1; i + mov [rdi + <%= (i+1)*8 %>], rax +<% } %> + ret + + ; edx is negative. +rs2ln: + + add rdx, [q] ; Set the firs digit + mov [rdi + 8], rdx ; + + mov rdx, -1 ; all ones +<% for (let i=1; i + mov rax, rdx ; Add to q + adc rax, [q + <%= i*8 %> ] + mov [rdi + <%= (i+1)*8 %>], rax +<% } %> + ret + +l1: + mov rcx, [rdx] + bt rcx, 63 + jc ll + +l1s2: + xor rdx, rdx + mov edx, ecx + bt rax, 62 + jc lsM + jmp lsN + +s1l2: + mov rsi, rdx + xor rdx, rdx + mov edx, eax + bt rcx, 62 + jc lsM + jmp lsN + + +lsN: + mov byte [rdi + 7], 0xC0 ; set the result to montgomery + add rsi, 8 + add rdi, 8 + call mulSM + mov rsi, rdi + lea rdx, [R3] + call mulM + ret + +lsM: + mov byte [rdi + 7], 0x80 ; set the result to long normal + add rsi, 8 + add rdi, 8 + call mulSM + ret + + +ll: + + bt rax, 62 + jc lml + bt rcx, 62 + jc lnlm + +lnln: + mov byte [rdi + 7], 0xC0 ; set the result to long montgomery + add rsi, 8 + add rdi, 8 + add rdx, 8 + call mulM + mov rsi, rdi + lea rdx, [R3] + call mulM + ret + +lml: + bt rcx, 62 + jc lmlm + +lnlm: + mov byte [rdi + 7], 0x80 ; set the result to long normal + add rsi, 8 + add rdi, 8 + add rdx, 8 + call mulM + ret + +lmlm: + mov byte [rdi + 7], 0xC0 ; set the result to long montgomery + add rsi, 8 + add rdi, 8 + add rdx, 8 + call mulM + ret diff --git a/c/buildasm/out.map b/c/buildasm/out.map new file mode 100644 index 0000000..0792914 --- /dev/null +++ b/c/buildasm/out.map @@ -0,0 +1,219 @@ +# Path: main +# Arch: x86_64 +# Object files: +[ 0] linker synthesized +[ 1] /var/folders/g_/74y0ll3503d4sm0c64jw432r0000gn/T//cczqYl2H.o +[ 2] fr.o +[ 3] /var/folders/g_/74y0ll3503d4sm0c64jw432r0000gn/T//cc5nHggh.o +[ 4] /usr/local/lib/libgmp.dylib +[ 5] /usr/local/Cellar/gcc/9.2.0_2/lib/gcc/9/libstdc++.dylib +[ 6] /Library/Developer/CommandLineTools/SDKs/MacOSX10.15.sdk/usr/lib/libSystem.tbd +# Sections: +# Address Size Segment Section +0x1000011BD 0x00001C99 __TEXT __text +0x100002E56 0x00000042 __TEXT __stubs +0x100002E98 0x00000074 __TEXT __stub_helper +0x100002F0C 0x0000001B __TEXT __cstring +0x100002F28 0x000000D8 __TEXT __eh_frame +0x100003000 0x00000008 __DATA_CONST __got +0x100004000 0x00000058 __DATA __la_symbol_ptr +0x100004058 0x00000078 __DATA __data +# Symbols: +# Address Size File Name +0x1000011BD 0x000000E2 [ 1] _main +0x10000129F 0x00000023 [ 2] rawCopyS2L +0x1000012C2 0x0000003D [ 2] u64toLong_adjust_neg +0x1000012FF 0x00000301 [ 2] rawMontgomeryMul +0x100001600 0x0000002B [ 2] rawMontgomeryMul_mulM_sq +0x10000162B 0x00000005 [ 2] rawMontgomeryMul_mulM_done +0x100001630 0x0000022C [ 2] rawMontgomeryMul1 +0x10000185C 0x0000002B [ 2] rawMontgomeryMul1_mulM_sq +0x100001887 0x00000005 [ 2] rawMontgomeryMul1_mulM_done +0x10000188C 0x00000218 [ 2] rawFromMontgomery +0x100001AA4 0x0000002B [ 2] rawFromMontgomery_mulM_sq +0x100001ACF 0x00000005 [ 2] rawFromMontgomery_mulM_done +0x100001AD4 0x00000011 [ 2] _Fr_toMontgomery +0x100001AE5 0x00000018 [ 2] toMontgomeryShort +0x100001AFD 0x0000000B [ 2] posMontgomeryShort +0x100001B08 0x00000016 [ 2] negMontgomeryShort +0x100001B1E 0x0000001C [ 2] toMontgomeryLong +0x100001B3A 0x00000001 [ 2] toMontgomery_doNothing +0x100001B3B 0x00000011 [ 2] _Fr_toNormal +0x100001B4C 0x00000010 [ 2] fromMontgomeryLong +0x100001B5C 0x00000001 [ 2] fromMontgomery_doNothing +0x100001B5D 0x00000018 [ 2] _Fr_add +0x100001B75 0x0000000D [ 2] add_s1s2 +0x100001B82 0x00000011 [ 2] add_manageOverflow +0x100001B93 0x0000000B [ 2] add_l1 +0x100001B9E 0x00000007 [ 2] add_l1s2 +0x100001BA5 0x0000002C [ 2] add_l1ns2 +0x100001BD1 0x0000000E [ 2] tmp1 +0x100001BDF 0x00000007 [ 2] add_l1ms2 +0x100001BE6 0x00000031 [ 2] add_l1ms2n +0x100001C17 0x00000024 [ 2] add_l1ms2m +0x100001C3B 0x00000007 [ 2] add_s1l2 +0x100001C42 0x0000002C [ 2] add_s1l2n +0x100001C6E 0x0000000E [ 2] tmp2 +0x100001C7C 0x00000007 [ 2] add_s1l2m +0x100001C83 0x00000037 [ 2] add_s1nl2m +0x100001CBA 0x00000024 [ 2] add_s1ml2m +0x100001CDE 0x00000007 [ 2] add_l1l2 +0x100001CE5 0x00000007 [ 2] add_l1nl2 +0x100001CEC 0x00000024 [ 2] add_l1nl2n +0x100001D10 0x00000037 [ 2] add_l1nl2m +0x100001D47 0x00000007 [ 2] add_l1ml2 +0x100001D4E 0x00000031 [ 2] add_l1ml2n +0x100001D7F 0x00000024 [ 2] add_l1ml2m +0x100001DA3 0x00000066 [ 2] rawAddLL +0x100001E09 0x0000002B [ 2] rawAddLL_sq +0x100001E34 0x00000001 [ 2] rawAddLL_done +0x100001E35 0x0000006A [ 2] rawAddLS +0x100001E9F 0x0000002B [ 2] rawAddLS_sq +0x100001ECA 0x00000001 [ 2] rawAddLS_done +0x100001ECB 0x00000018 [ 2] _Fr_sub +0x100001EE3 0x0000000D [ 2] sub_s1s2 +0x100001EF0 0x00000011 [ 2] sub_manageOverflow +0x100001F01 0x0000000B [ 2] sub_l1 +0x100001F0C 0x00000007 [ 2] sub_l1s2 +0x100001F13 0x0000002C [ 2] sub_l1ns2 +0x100001F3F 0x0000000E [ 2] tmp3 +0x100001F4D 0x00000007 [ 2] sub_l1ms2 +0x100001F54 0x00000031 [ 2] sub_l1ms2n +0x100001F85 0x00000024 [ 2] sub_l1ms2m +0x100001FA9 0x00000007 [ 2] sub_s1l2 +0x100001FB0 0x00000026 [ 2] sub_s1l2n +0x100001FD6 0x0000001A [ 2] tmp4 +0x100001FF0 0x00000007 [ 2] sub_s1l2m +0x100001FF7 0x00000037 [ 2] sub_s1nl2m +0x10000202E 0x00000024 [ 2] sub_s1ml2m +0x100002052 0x00000007 [ 2] sub_l1l2 +0x100002059 0x00000007 [ 2] sub_l1nl2 +0x100002060 0x00000024 [ 2] sub_l1nl2n +0x100002084 0x00000037 [ 2] sub_l1nl2m +0x1000020BB 0x00000007 [ 2] sub_l1ml2 +0x1000020C2 0x00000031 [ 2] sub_l1ml2n +0x1000020F3 0x00000024 [ 2] sub_l1ml2m +0x100002117 0x00000031 [ 2] rawSubLS +0x100002148 0x0000002B [ 2] rawSubLS_aq +0x100002173 0x00000001 [ 2] rawSubLS_done +0x100002174 0x0000002F [ 2] rawSubSL +0x1000021A3 0x0000002B [ 2] rawSubSL_aq +0x1000021CE 0x00000001 [ 2] rawSubSL_done +0x1000021CF 0x0000002F [ 2] rawSubLL +0x1000021FE 0x0000002B [ 2] rawSubLL_aq +0x100002229 0x00000001 [ 2] rawSubLL_done +0x10000222A 0x0000009C [ 2] rawNegLS +0x1000022C6 0x00000001 [ 2] rawNegSL_done +0x1000022C7 0x0000000A [ 2] _Fr_neg +0x1000022D1 0x00000008 [ 2] neg_s +0x1000022D9 0x0000000E [ 2] neg_manageOverflow +0x1000022E7 0x00000019 [ 2] neg_l +0x100002300 0x0000002A [ 2] rawNegL +0x10000232A 0x0000003B [ 2] doNegate +0x100002365 0x00000018 [ 2] _Fr_mul +0x10000237D 0x0000000E [ 2] mul_s1s2 +0x10000238B 0x00000014 [ 2] mul_manageOverflow +0x10000239F 0x0000000B [ 2] mul_l1 +0x1000023AA 0x0000000B [ 2] mul_l1s2 +0x1000023B5 0x00000007 [ 2] mul_l1ns2 +0x1000023BC 0x00000033 [ 2] mul_l1ns2n +0x1000023EF 0x0000000A [ 2] tmp5 +0x1000023F9 0x0000001A [ 2] tmp6 +0x100002413 0x00000024 [ 2] mul_l1ns2m +0x100002437 0x00000007 [ 2] mul_l1ms2 +0x10000243E 0x00000033 [ 2] mul_l1ms2n +0x100002471 0x0000000A [ 2] tmp7 +0x10000247B 0x00000001 [ 2] tmp8 +0x10000247C 0x00000024 [ 2] mul_l1ms2m +0x1000024A0 0x0000000B [ 2] mul_s1l2 +0x1000024AB 0x00000007 [ 2] mul_s1nl2 +0x1000024B2 0x00000033 [ 2] mul_s1nl2n +0x1000024E5 0x0000000A [ 2] tmp9 +0x1000024EF 0x0000001A [ 2] tmp10 +0x100002509 0x00000033 [ 2] mul_s1nl2m +0x10000253C 0x0000000A [ 2] tmp11 +0x100002546 0x00000001 [ 2] tmp12 +0x100002547 0x00000007 [ 2] mul_s1ml2 +0x10000254E 0x00000024 [ 2] mul_s1ml2n +0x100002572 0x00000024 [ 2] mul_s1ml2m +0x100002596 0x00000007 [ 2] mul_l1l2 +0x10000259D 0x00000007 [ 2] mul_l1nl2 +0x1000025A4 0x0000003D [ 2] mul_l1nl2n +0x1000025E1 0x00000024 [ 2] mul_l1nl2m +0x100002605 0x00000007 [ 2] mul_l1ml2 +0x10000260C 0x00000024 [ 2] mul_l1ml2n +0x100002630 0x00000024 [ 2] mul_l1ml2m +0x100002654 0x0000001C [ 2] _Fr_band +0x100002670 0x00000019 [ 2] and_s1s2 +0x100002689 0x00000012 [ 2] tmp13 +0x10000269B 0x00000054 [ 2] tmp14 +0x1000026EF 0x0000000B [ 2] and_l1 +0x1000026FA 0x0000000B [ 2] and_l1s2 +0x100002705 0x00000044 [ 2] and_l1ns2 +0x100002749 0x00000054 [ 2] tmp15 +0x10000279D 0x00000059 [ 2] and_l1ms2 +0x1000027F6 0x00000054 [ 2] tmp16 +0x10000284A 0x0000000B [ 2] and_s1l2 +0x100002855 0x00000044 [ 2] and_s1l2n +0x100002899 0x00000054 [ 2] tmp17 +0x1000028ED 0x00000053 [ 2] and_s1l2m +0x100002940 0x00000054 [ 2] tmp18 +0x100002994 0x00000016 [ 2] and_l1l2 +0x1000029AA 0x00000044 [ 2] and_l1nl2n +0x1000029EE 0x00000054 [ 2] tmp19 +0x100002A42 0x00000053 [ 2] and_l1nl2m +0x100002A95 0x00000054 [ 2] tmp20 +0x100002AE9 0x0000000B [ 2] and_l1ml2 +0x100002AF4 0x00000059 [ 2] and_l1ml2n +0x100002B4D 0x00000054 [ 2] tmp21 +0x100002BA1 0x00000068 [ 2] and_l1ml2m +0x100002C09 0x00000054 [ 2] tmp22 +0x100002C5D 0x0000009F [ 3] __Z14Fr_str2elementP9FrElementPc +0x100002CFC 0x0000015A [ 3] __Z14Fr_element2strP9FrElement +0x100002E56 0x00000006 [ 5] __Znam +0x100002E5C 0x00000006 [ 4] ___gmpz_add +0x100002E62 0x00000006 [ 4] ___gmpz_clear +0x100002E68 0x00000006 [ 4] ___gmpz_export +0x100002E6E 0x00000006 [ 4] ___gmpz_get_str +0x100002E74 0x00000006 [ 4] ___gmpz_import +0x100002E7A 0x00000006 [ 4] ___gmpz_init +0x100002E80 0x00000006 [ 4] ___gmpz_init_set_si +0x100002E86 0x00000006 [ 4] ___gmpz_set_str +0x100002E8C 0x00000006 [ 6] _printf +0x100002E92 0x00000006 [ 6] _sprintf +0x100002E98 0x00000010 [ 0] helper helper +0x100002EA8 0x0000000A [ 4] ___gmpz_add +0x100002EB2 0x0000000A [ 4] ___gmpz_clear +0x100002EBC 0x0000000A [ 4] ___gmpz_export +0x100002EC6 0x0000000A [ 4] ___gmpz_get_str +0x100002ED0 0x0000000A [ 4] ___gmpz_import +0x100002EDA 0x0000000A [ 4] ___gmpz_init +0x100002EE4 0x0000000A [ 4] ___gmpz_init_set_si +0x100002EEE 0x0000000A [ 4] ___gmpz_set_str +0x100002EF8 0x0000000A [ 6] _printf +0x100002F02 0x0000000A [ 6] _sprintf +0x100002F0C 0x00000018 [ 1] literal string: %llu, %llu, %llu, %llu\n +0x100002F24 0x00000003 [ 3] literal string: %d +0x100002F28 0x00000018 [ 1] CIE +0x100002F40 0x00000038 [ 1] FDE for: _main +0x100002F78 0x00000018 [ 3] CIE +0x100002F90 0x00000038 [ 3] FDE for: __Z14Fr_str2elementP9FrElementPc +0x100002FC8 0x00000038 [ 3] FDE for: __Z14Fr_element2strP9FrElement +0x100003000 0x00000008 [ 0] non-lazy-pointer-to-local: dyld_stub_binder +0x100004000 0x00000008 [ 5] __Znam +0x100004008 0x00000008 [ 4] ___gmpz_add +0x100004010 0x00000008 [ 4] ___gmpz_clear +0x100004018 0x00000008 [ 4] ___gmpz_export +0x100004020 0x00000008 [ 4] ___gmpz_get_str +0x100004028 0x00000008 [ 4] ___gmpz_import +0x100004030 0x00000008 [ 4] ___gmpz_init +0x100004038 0x00000008 [ 4] ___gmpz_init_set_si +0x100004040 0x00000008 [ 4] ___gmpz_set_str +0x100004048 0x00000008 [ 6] _printf +0x100004050 0x00000008 [ 6] _sprintf +0x100004058 0x00000008 [ 0] __dyld_private +0x100004060 0x00000008 [ 2] _Fr_q +0x100004068 0x00000020 [ 2] q +0x100004088 0x00000020 [ 2] R2 +0x1000040A8 0x00000020 [ 2] R3 +0x1000040C8 0x00000008 [ 2] lboMask diff --git a/c/buildasm/sub.asm.ejs b/c/buildasm/sub.asm.ejs new file mode 100644 index 0000000..5a8d199 --- /dev/null +++ b/c/buildasm/sub.asm.ejs @@ -0,0 +1,317 @@ +<% function subS1S2() { %> + xor rdx, rdx + mov edx, eax + sub edx, ecx + jo sub_manageOverflow ; rsi already is the 64bits result + + mov [rdi], rdx ; not necessary to adjust so just save and return + ret + +sub_manageOverflow: ; Do the operation in 64 bits + push rsi + movsx rsi, eax + movsx rdx, ecx + sub rsi, rdx + call rawCopyS2L + pop rsi + ret +<% } %> + +<% function subL1S2(t) { %> + add rsi, 8 + movsx rdx, ecx + add rdi, 8 + cmp rdx, 0 + <% const rawSubLabel = global.tmpLabel() %> + jns <%= rawSubLabel %> + neg rdx + call rawAddLS + sub rdi, 8 + sub rsi, 8 + ret +<%= rawSubLabel %>: + call rawSubLS + sub rdi, 8 + sub rsi, 8 + ret +<% } %> + + +<% function subS1L2(t) { %> + cmp eax, 0 + <% const s1NegLabel = global.tmpLabel() %> + js <%= s1NegLabel %> + + ; First Operand is positive + push rsi + add rdi, 8 + movsx rsi, eax + add rdx, 8 + call rawSubSL + sub rdi, 8 + pop rsi + ret + +<%= s1NegLabel %>: ; First operand is negative + push rsi + lea rsi, [rdx + 8] + movsx rdx, eax + add rdi, 8 + neg rdx + call rawNegLS + sub rdi, 8 + pop rsi + ret +<% } %> + + +<% function subL1L2(t) { %> + add rdi, 8 + add rsi, 8 + add rdx, 8 + call rawSubLL + sub rdi, 8 + sub rsi, 8 + ret +<% } %> + +;;;;;;;;;;;;;;;;;;;;;; +; sub +;;;;;;;;;;;;;;;;;;;;;; +; Substracts two elements of any kind +; Params: +; rsi <= Pointer to element 1 +; rdx <= Pointer to element 2 +; rdi <= Pointer to result +; Modified Registers: +; r8, r9, 10, r11, rax, rcx +;;;;;;;;;;;;;;;;;;;;;; +<%=name%>_sub: + mov rax, [rsi] + mov rcx, [rdx] + bt rax, 63 ; Check if is long first operand + jc sub_l1 + bt rcx, 63 ; Check if is long second operand + jc sub_s1l2 + +sub_s1s2: ; Both operands are short +<%= subS1S2() %> +sub_l1: + bt rcx, 63 ; Check if is short second operand + jc sub_l1l2 + +;;;;;;;; +sub_l1s2: + bt rax, 62 ; check if montgomery first + jc sub_l1ms2 +sub_l1ns2: +<%= global.setTypeDest("0x80"); %> +<%= subL1S2(); %> + +sub_l1ms2: + bt rcx, 62 ; check if montgomery second + jc sub_l1ms2m +sub_l1ms2n: +<%= global.setTypeDest("0xC0"); %> +<%= global.toMont_b() %> +<%= subL1L2() %> + +sub_l1ms2m: +<%= global.setTypeDest("0xC0"); %> +<%= subL1L2() %> + + +;;;;;;;; +sub_s1l2: + bt rcx, 62 ; check if montgomery first + jc sub_s1l2m +sub_s1l2n: +<%= global.setTypeDest("0x80"); %> +<%= subS1L2(); %> + +sub_s1l2m: + bt rax, 62 ; check if montgomery second + jc sub_s1ml2m +sub_s1nl2m: +<%= global.setTypeDest("0xC0"); %> +<%= global.toMont_a() %> +<%= subL1L2() %> + +sub_s1ml2m: +<%= global.setTypeDest("0xC0"); %> +<%= subL1L2() %> + +;;;; +sub_l1l2: + bt rax, 62 ; check if montgomery first + jc sub_l1ml2 +sub_l1nl2: + bt rcx, 62 ; check if montgomery second + jc sub_l1nl2m +sub_l1nl2n: +<%= global.setTypeDest("0x80"); %> +<%= subL1L2() %> + +sub_l1nl2m: +<%= global.setTypeDest("0xC0"); %> +<%= global.toMont_a(); %> +<%= subL1L2() %> + +sub_l1ml2: + bt rcx, 62 ; check if montgomery seconf + jc sub_l1ml2m +sub_l1ml2n: +<%= global.setTypeDest("0xC0"); %> +<%= global.toMont_b(); %> +<%= subL1L2() %> + +sub_l1ml2m: +<%= global.setTypeDest("0xC0"); %> +<%= subL1L2() %> + + +;;;;;;;;;;;;;;;;;;;;;; +; rawSubLS +;;;;;;;;;;;;;;;;;;;;;; +; Substracts a short element from the long element +; Params: +; rdi <= Pointer to the long data of result +; rsi <= Pointer to the long data of element 1 where will be substracted +; rdx <= Value to be substracted +; [rdi] = [rsi] - rdx +; Modified Registers: +; rax +;;;;;;;;;;;;;;;;;;;;;; +rawSubLS: + ; Substract first digit + + mov rax, [rsi] + sub rax, rdx + mov [rdi] ,rax + mov rdx, 0 +<% for (let i=1; i + mov rax, [rsi + <%=i*8%>] + sbb rax, rdx + mov [rdi + <%=i*8%>], rax +<% } %> + jnc rawSubLS_done ; if overflow, add q + + ; Add q +rawSubLS_aq: +<% for (let i=0; i + mov rax, [q + <%=i*8%>] + <%= i==0 ? "add" : "adc" %> [rdi + <%=i*8%>], rax +<% } %> +rawSubLS_done: + ret + + +;;;;;;;;;;;;;;;;;;;;;; +; rawSubSL +;;;;;;;;;;;;;;;;;;;;;; +; Substracts a long element from a short element +; Params: +; rdi <= Pointer to the long data of result +; rsi <= Value from where will bo substracted +; rdx <= Pointer to long of the value to be substracted +; +; [rdi] = rsi - [rdx] +; Modified Registers: +; rax +;;;;;;;;;;;;;;;;;;;;;; +rawSubSL: + ; Substract first digit + sub rsi, [rdx] + mov [rdi] ,rsi + +<% for (let i=1; i + mov rax, 0 + sbb rax, [rdx + <%=i*8%>] + mov [rdi + <%=i*8%>], rax +<% } %> + jnc rawSubSL_done ; if overflow, add q + + ; Add q +rawSubSL_aq: +<% for (let i=0; i + mov rax, [q + <%=i*8%>] + <%= i==0 ? "add" : "adc" %> [rdi + <%=i*8%>], rax +<% } %> +rawSubSL_done: + ret + +;;;;;;;;;;;;;;;;;;;;;; +; rawSubLL +;;;;;;;;;;;;;;;;;;;;;; +; Substracts a long element from a short element +; Params: +; rdi <= Pointer to the long data of result +; rsi <= Pointer to long from where substracted +; rdx <= Pointer to long of the value to be substracted +; +; [rdi] = [rsi] - [rdx] +; Modified Registers: +; rax +;;;;;;;;;;;;;;;;;;;;;; +rawSubLL: + ; Substract first digit +<% for (let i=0; i + mov rax, [rsi + <%=i*8%>] + <%= i==0 ? "sub" : "sbb" %> rax, [rdx + <%=i*8%>] + mov [rdi + <%=i*8%>], rax +<% } %> + jnc rawSubLL_done ; if overflow, add q + + ; Add q +rawSubLL_aq: +<% for (let i=0; i + mov rax, [q + <%=i*8%>] + <%= i==0 ? "add" : "adc" %> [rdi + <%=i*8%>], rax +<% } %> +rawSubLL_done: + ret + +;;;;;;;;;;;;;;;;;;;;;; +; rawNegLS +;;;;;;;;;;;;;;;;;;;;;; +; Substracts a long element and a short element form 0 +; Params: +; rdi <= Pointer to the long data of result +; rsi <= Pointer to long from where substracted +; rdx <= short value to be substracted too +; +; [rdi] = -[rsi] - rdx +; Modified Registers: +; rax +;;;;;;;;;;;;;;;;;;;;;; +rawNegLS: + mov rax, [q] + sub rax, rdx + mov [rdi], rax +<% for (let i=1; i + mov rax, [q + <%=i*8%> ] + sbb rax, 0 + mov [rdi + <%=i*8%>], rax +<% } %> + setc dl + +<% for (let i=0; i + mov rax, [rdi + <%=i*8%> ] + <%= i==0 ? "sub" : "sbb" %> rax, [rsi + <%=i*8%>] + mov [rdi + <%=i*8%>], rax +<% } %> + + setc dh + or dl, dh + jz rawNegSL_done + + ; it is a negative value, so add q +<% for (let i=0; i + mov rax, [q + <%=i*8%>] + <%= i==0 ? "add" : "adc" %> [rdi + <%=i*8%>], rax +<% } %> + +rawNegSL_done: + ret + + diff --git a/c/buildasm/tester b/c/buildasm/tester new file mode 100755 index 0000000..93428e6 Binary files /dev/null and b/c/buildasm/tester differ diff --git a/c/buildasm/tester.cpp b/c/buildasm/tester.cpp new file mode 100644 index 0000000..04d2d83 --- /dev/null +++ b/c/buildasm/tester.cpp @@ -0,0 +1,209 @@ +#include +#include + +#include +#include +#include +#include +#include + +#include /* printf, NULL */ +#include +#include + + +#include "fr.h" + + +typedef void (*Func1)(PFrElement, PFrElement); +typedef void (*Func2)(PFrElement, PFrElement, PFrElement); +typedef void *FuncAny; + +typedef struct { + FuncAny fn; + int nOps; +} FunctionSpec; + +std::map functions; +std::vector stack; + +void addFunction(std::string name, FuncAny f, int nOps) { + FunctionSpec fs; + fs.fn = f; + fs.nOps = nOps; + functions[name] = fs; +} + +void fillMap() { + addFunction("add", (FuncAny)Fr_add, 2); + addFunction("sub", (FuncAny)Fr_sub, 2); + addFunction("neg", (FuncAny)Fr_neg, 1); + addFunction("mul", (FuncAny)Fr_mul, 2); + addFunction("band", (FuncAny)Fr_band, 2); + addFunction("bor", (FuncAny)Fr_bor, 2); + addFunction("bxor", (FuncAny)Fr_bxor, 2); + addFunction("eq", (FuncAny)Fr_eq, 2); + addFunction("neq", (FuncAny)Fr_neq, 2); + addFunction("lt", (FuncAny)Fr_lt, 2); + addFunction("gt", (FuncAny)Fr_gt, 2); + addFunction("leq", (FuncAny)Fr_leq, 2); + addFunction("geq", (FuncAny)Fr_geq, 2); +} + +u_int64_t readInt(std::string &s) { + if (s.rfind("0x", 0) == 0) { + return std::stoull(s.substr(2), 0, 16); + } else { + return std::stoull(s, 0, 10); + } +} + +void pushNumber(std::vector &v) { + u_int64_t a; + if ((v.size()<1) || (v.size() > (Fr_N64+1))) { + printf("Invalid Size: %d - %d \n", v.size(), Fr_N64); + throw std::runtime_error("Invalid number of parameters for number"); + } + FrElement e; + a = readInt(v[0]); + *(u_int64_t *)(&e) = a; + for (int i=0; i tokens; + + std::copy(begin, end, std::back_inserter(tokens)); + + // Remove initial empty tokens + while ((tokens.size() > 0)&&(tokens[0] == "")) { + tokens.erase(tokens.begin()); + } + + // Empty lines are valid but are not processed + if (tokens.size() == 0) return; + + auto search = functions.find(tokens[0]); + if (search == functions.end()) { + pushNumber(tokens); + } else { + if (tokens.size() != 1) { + throw std::runtime_error("Functions does not accept parameters"); + } + callFunction(search->second); + } +} + +int main(void) +{ + fillMap(); + std::string line; + int i=0; + while (std::getline(std::cin, line)) { + processLine(line); + // if (i%1000 == 0) printf("%d\n", i); + // printf("%d\n", i); + i++; + } + // Print the elements in the stack + // + for (int i=0; i +#include +#include "fr.h" + +typedef void (*Func2)(PFrElement, PFrElement, PFrElement); + +typedef struct { + const char *fnName; + Func2 fn; +} FN; + + +#define NFN 2 +FN fns[NFN] = { + {"add", Fr_add}, + {"mul", Fr_mul}, +}; + +int main(int argc, char **argv) { + + if (argc <= 1) { + fprintf( stderr, "invalid number of parameters"); + return 1; + } + + for (int i=0; i< NFN;i++) { + if (strcmp(argv[1], fns[i].fnName) == 0) { + if (argc != 4) { + fprintf( stderr, "invalid number of parameters"); + return 1; + } + FrElement a; + FrElement b; + + Fr_str2element(&a, argv[2]); + Fr_str2element(&b, argv[3]); + FrElement c; + fns[i].fn(&c, &a, &b); + + char *s; + s = Fr_element2str(&c); + printf("%s", s); + free(s); + return 0; + } + } + fprintf( stderr, "invalid operation %s", argv[1]); + return 1; +} + +*/ diff --git a/c/buildasm/tester.dSYM/Contents/Info.plist b/c/buildasm/tester.dSYM/Contents/Info.plist new file mode 100644 index 0000000..c78a483 --- /dev/null +++ b/c/buildasm/tester.dSYM/Contents/Info.plist @@ -0,0 +1,20 @@ + + + + + CFBundleDevelopmentRegion + English + CFBundleIdentifier + com.apple.xcode.dsym.tester + CFBundleInfoDictionaryVersion + 6.0 + CFBundlePackageType + dSYM + CFBundleSignature + ???? + CFBundleShortVersionString + 1.0 + CFBundleVersion + 1 + + diff --git a/c/buildasm/tester.dSYM/Contents/Resources/DWARF/tester b/c/buildasm/tester.dSYM/Contents/Resources/DWARF/tester new file mode 100644 index 0000000..1f72fd4 Binary files /dev/null and b/c/buildasm/tester.dSYM/Contents/Resources/DWARF/tester differ diff --git a/c/buildasm/utils.asm.ejs b/c/buildasm/utils.asm.ejs new file mode 100644 index 0000000..24a59d0 --- /dev/null +++ b/c/buildasm/utils.asm.ejs @@ -0,0 +1,73 @@ +<% global.setTypeDest = function (t) { +return ( +` mov r11b, ${t} + shl r11, 56 + mov [rdi], r11`); +} %> + + +<% global.toMont_a = function () { +return ( +` push rdi + mov rdi, rsi + mov rsi, rdx + call ${name}_toMontgomery + mov rdx, rsi + mov rsi, rdi + pop rdi`); +} %> + +<% global.toMont_b = function() { +return ( +` push rdi + mov rdi, rdx + call ${name}_toMontgomery + mov rdx, rdi + pop rdi`); +} %> + +<% global.fromMont_a = function () { +return ( +` push rdi + mov rdi, rsi + mov rsi, rdx + call ${name}_toNormal + mov rdx, rsi + mov rsi, rdi + pop rdi`); +} %> + +<% global.fromMont_b = function() { +return ( +` push rdi + mov rdi, rdx + call ${name}_toNormal + mov rdx, rdi + pop rdi`); +} %> + +<% global.toLong_a = function () { +return ( +` push rdi + push rdx + mov rdi, rsi + movsx rsi, r8d + call rawCopyS2L + mov rsi, rdi + pop rdx + pop rdi`); +} %> + +<% global.toLong_b = function() { +return ( +` push rdi + push rsi + mov rdi, rdx + movsx rsi, r9d + call rawCopyS2L + mov rdx, rdi + pop rsi + pop rdi`); +} %> + + diff --git a/c/calcwit.cpp b/c/calcwit.cpp index a85af31..0c50fe5 100644 --- a/c/calcwit.cpp +++ b/c/calcwit.cpp @@ -141,7 +141,7 @@ void Circom_CalcWit::freeBigInts(PBigInt bi, int n) { void Circom_CalcWit::getSignal(int currentComponentIdx, int cIdx, int sIdx, PBigInt value) { // syncPrintf("getSignal: %d\n", sIdx); - if (currentComponentIdx != cIdx) { + if ((circuit->components[cIdx].newThread)&&(currentComponentIdx != cIdx)) { std::unique_lock lk(mutexes[cIdx % NMUTEXES]); while (inputSignalsToTrigger[cIdx] != -1) { cvs[cIdx % NMUTEXES].wait(lk); diff --git a/c/main.cpp b/c/main.cpp index 1f77b34..55ef180 100644 --- a/c/main.cpp +++ b/c/main.cpp @@ -189,6 +189,8 @@ int main(int argc, char *argv[]) { ctx->join(); + printf("Finished!\n"); + std::string outfilename = argv[2]; if (hasEnding(outfilename, std::string(".bin"))) { diff --git a/cli.js b/cli.js index 937db48..e2d8528 100755 --- a/cli.js +++ b/cli.js @@ -35,6 +35,7 @@ const argv = require("yargs") .alias("c", "csource") .alias("s", "sym") .alias("r", "r1cs") + .alias("n", "newThreadTemplates") .help("h") .alias("h", "help") .option("verbose", { @@ -83,6 +84,9 @@ if (argv.r1cs) { if (argv.sym) { options.symWriteStream = fs.createWriteStream(symName); } +if (argv.newThreadTemplates) { + options.newThreadTemplates = new RegExp(argv.newThreadTemplates); +} compiler(fullFileName, options).then( () => { let cSourceDone = false; diff --git a/package-lock.json b/package-lock.json index a0a0f41..c61bcc0 100644 --- a/package-lock.json +++ b/package-lock.json @@ -305,6 +305,11 @@ "integrity": "sha1-zR9rpHfFY4xAyX7ZtXLbW6tdgzE=", "dev": true }, + "ejs": { + "version": "3.0.1", + "resolved": "https://registry.npmjs.org/ejs/-/ejs-3.0.1.tgz", + "integrity": "sha512-cuIMtJwxvzumSAkqaaoGY/L6Fc/t6YvoP9/VIaK0V/CyqKLEQ8sqODmYfy/cjXEdZ9+OOL8TecbJu+1RsofGDw==" + }, "emoji-regex": { "version": "7.0.3", "resolved": "https://registry.npmjs.org/emoji-regex/-/emoji-regex-7.0.3.tgz", @@ -533,6 +538,21 @@ "integrity": "sha1-PYpcZog6FqMMqGQ+hR8Zuqd5eRc=", "dev": true }, + "fflib": { + "version": "0.0.2", + "resolved": "https://registry.npmjs.org/fflib/-/fflib-0.0.2.tgz", + "integrity": "sha512-TvQ3nQjJwdyrFBZAz+GTWz1mv9hHwRiQmZX3T2G0P+cBAlyw42lm4W62i+Ofj1ZOW1XNrEEhFhxHPnutTsJBwA==", + "requires": { + "big-integer": "^1.6.48" + }, + "dependencies": { + "big-integer": { + "version": "1.6.48", + "resolved": "https://registry.npmjs.org/big-integer/-/big-integer-1.6.48.tgz", + "integrity": "sha512-j51egjPa7/i+RdiRuJbPdJ2FIUYYPhvYLjzoYbcMMm62ooO6F94fETG4MTs46zPAF9Brs04OajboA/qTGuz78w==" + } + } + }, "figures": { "version": "2.0.0", "resolved": "https://registry.npmjs.org/figures/-/figures-2.0.0.tgz", diff --git a/package.json b/package.json index b5af9ab..25ae4ff 100644 --- a/package.json +++ b/package.json @@ -30,6 +30,8 @@ }, "dependencies": { "big-integer": "^1.6.32", + "ejs": "^3.0.1", + "fflib": "0.0.2", "fnv-plus": "^1.3.1", "optimist": "^0.6.1", "rimraf-promise": "^2.0.0", diff --git a/src/c_build.js b/src/c_build.js index da2f27d..d302909 100644 --- a/src/c_build.js +++ b/src/c_build.js @@ -204,8 +204,18 @@ function buildComponentsArray(ctx) { const ccodes = []; ccodes.push(`Circom_Component _components[${ctx.components.length}] = {\n`); for (let i=0; i< ctx.components.length; i++) { + let newThread; + if (ctx.newThreadTemplates) { + if (ctx.newThreadTemplates.test(ctx.components[i].template)) { + newThread = "true"; + } else { + newThread = "false"; + } + } else { + newThread = "false"; + } ccodes.push(i>0 ? " ," : " "); - ccodes.push(`{${ctx.components[i].htName},${ctx.components[i].etName},${ctx.components[i].fnName}, ${ctx.components[i].nInSignals}, true}\n`); + ccodes.push(`{${ctx.components[i].htName},${ctx.components[i].etName},${ctx.components[i].fnName}, ${ctx.components[i].nInSignals}, ${newThread}}\n`); } ccodes.push("};\n"); const codeComponents = ccodes.join(""); diff --git a/src/compiler.js b/src/compiler.js index b1e64f1..6714d8e 100644 --- a/src/compiler.js +++ b/src/compiler.js @@ -41,7 +41,7 @@ async function compile(srcFile, options) { ctx.field = new ZqField(options.p); ctx.verbose= options.verbose || false; ctx.mainComponent = options.mainComponent || "main"; - + ctx.newThreadTemplates = options.newThreadTemplates; constructionPhase(ctx, srcFile); diff --git a/src/construction_phase.js b/src/construction_phase.js index 43d445f..1d33da6 100644 --- a/src/construction_phase.js +++ b/src/construction_phase.js @@ -558,7 +558,7 @@ function execFunctionCall(ctx, ast) { if (ast.name == "log") { const v = exec(ctx, ast.params[0]); const ev = val(ctx, v, ast); - console.log(ev.toString()); + console.log(ev.v.toString()); return; } if (ast.name == "assert") { @@ -691,7 +691,7 @@ function execPin(ctx, ast) { } const sIdx = ctx.components[cIdx].names.getSignalIdx(ast.pin.name, selsP); - if (sIdx<0) ctx.throwError(ast, "Signal not defined:" + buildFullName() ); + if (sIdx<0) return ctx.throwError(ast, "Signal not defined:" + buildFullName() ); return { t: "S", sIdx: sIdx, diff --git a/test/fieldasm.js b/test/fieldasm.js new file mode 100644 index 0000000..2f9437b --- /dev/null +++ b/test/fieldasm.js @@ -0,0 +1,322 @@ +const tester = require("../c/buildasm/buildzqfieldtester.js"); + +const ZqField = require("fflib").ZqField; + +const bigInt = require("big-integer"); + +const bn128q = new bigInt("21888242871839275222246405745257275088696311157297823662689037894645226208583"); +const bn128r = new bigInt("21888242871839275222246405745257275088548364400416034343698204186575808495617"); +const secp256k1q = new bigInt("FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFEFFFFFC2F", 16); +const secp256k1r = new bigInt("FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFEBAAEDCE6AF48A03BBFD25E8CD0364141", 16); +const mnt6753q = new bigInt("41898490967918953402344214791240637128170709919953949071783502921025352812571106773058893763790338921418070971888458477323173057491593855069696241854796396165721416325350064441470418137846398469611935719059908164220784476160001"); +const mnt6753r = new bigInt("41898490967918953402344214791240637128170709919953949071783502921025352812571106773058893763790338921418070971888253786114353726529584385201591605722013126468931404347949840543007986327743462853720628051692141265303114721689601"); + +describe("field asm test", function () { + this.timeout(1000000000); +/* + it("bn128r add", async () => { + const tv = buildTestVector2(bn128r, "add"); + await tester(bn128r, tv); + }); + it("secp256k1q add", async () => { + const tv = buildTestVector2(secp256k1q, "add"); + await tester(secp256k1q, tv); + }); + it("mnt6753q add", async () => { + const tv = buildTestVector2(mnt6753q, "add"); + await tester(mnt6753q, tv); + }); + it("bn128r sub", async () => { + const tv = buildTestVector2(bn128r, "sub"); + await tester(bn128r, tv); + }); + it("secp256k1q sub", async () => { + const tv = buildTestVector2(secp256k1q, "sub"); + await tester(secp256k1q, tv); + }); + it("mnt6753q sub", async () => { + const tv = buildTestVector2(mnt6753q, "sub"); + await tester(mnt6753q, tv); + }); + + it("bn128r neg", async () => { + const tv = buildTestVector1(bn128r, "neg"); + await tester(bn128r, tv); + }); + it("secp256k1q neg", async () => { + const tv = buildTestVector1(secp256k1q, "neg"); + await tester(secp256k1q, tv); + }); + it("mnt6753q neg", async () => { + const tv = buildTestVector1(mnt6753q, "neg"); + await tester(mnt6753q, tv); + }); + it("bn128r mul", async () => { + const tv = buildTestVector2(bn128r, "mul"); + await tester(bn128r, tv); + }); + it("secp256k1q mul", async () => { + const tv = buildTestVector2(secp256k1q, "mul"); + await tester(secp256k1q, tv); + }); + it("mnt6753q mul", async () => { + const tv = buildTestVector2(mnt6753q, "mul"); + await tester(mnt6753q, tv); + }); + it("bn128r binary and", async () => { + const tv = buildTestVector2(bn128r, "band"); + await tester(bn128r, tv); + }); + it("secp256k1q binary and", async () => { + const tv = buildTestVector2(secp256k1q, "band"); + await tester(secp256k1q, tv); + }); + it("mnt6753q binary and", async () => { + const tv = buildTestVector2(mnt6753q, "band"); + await tester(mnt6753q, tv); + }); + it("bn128r binary or", async () => { + const tv = buildTestVector2(bn128r, "bor"); + await tester(bn128r, tv); + }); + + it("secp256k1q binary or", async () => { + const tv = buildTestVector2(secp256k1q, "bor"); + await tester(secp256k1q, tv); + }); + it("mnt6753q binary or", async () => { + const tv = buildTestVector2(mnt6753q, "bor"); + await tester(mnt6753q, tv); + }); + it("bn128r binary xor", async () => { + const tv = buildTestVector2(bn128r, "bxor"); + await tester(bn128r, tv); + }); + it("secp256k1q binary xor", async () => { + const tv = buildTestVector2(secp256k1q, "bxor"); + await tester(secp256k1q, tv); + }); + it("mnt6753q binary xor", async () => { + const tv = buildTestVector2(mnt6753q, "bxor"); + await tester(mnt6753q, tv); + }); + it("bn128r eq", async () => { + const tv = buildTestVector2(bn128r, "eq"); + await tester(bn128r, tv); + }); + it("secp256k1q eq", async () => { + const tv = buildTestVector2(secp256k1q, "eq"); + await tester(secp256k1q, tv); + }); +*/ + it("mnt6753q eq", async () => { + const tv = buildTestVector2(mnt6753q, "eq"); + await tester(mnt6753q, tv); + }); +/* + it("bn128r neq", async () => { + const tv = buildTestVector2(bn128r, "neq"); + await tester(bn128r, tv); + }); + it("secp256k1q neq", async () => { + const tv = buildTestVector2(secp256k1q, "neq"); + await tester(secp256k1q, tv); + }); +*/ + it("mnt6753q neq", async () => { + const tv = buildTestVector2(mnt6753q, "neq"); + await tester(mnt6753q, tv); + }); +/* + it("bn128r lt", async () => { + const tv = buildTestVector2(bn128r, "lt"); + await tester(bn128r, tv); + }); + it("secp256k1q lt", async () => { + const tv = buildTestVector2(secp256k1q, "lt"); + await tester(secp256k1q, tv); + }); +*/ + it("mnt6753q lt", async () => { + const tv = buildTestVector2(mnt6753q, "lt"); + await tester(mnt6753q, tv); + }); +/* + it("bn128r gt", async () => { + const tv = buildTestVector2(bn128r, "gt"); + await tester(bn128r, tv); + }); + it("secp256k1q gt", async () => { + const tv = buildTestVector2(secp256k1q, "gt"); + await tester(secp256k1q, tv); + }); +*/ + it("mnt6753q gt", async () => { + const tv = buildTestVector2(mnt6753q, "gt"); + await tester(mnt6753q, tv); + }); +/* + it("bn128r leq", async () => { + const tv = buildTestVector2(bn128r, "leq"); + await tester(bn128r, tv); + }); + it("secp256k1q leq", async () => { + const tv = buildTestVector2(secp256k1q, "leq"); + await tester(secp256k1q, tv); + }); +*/ + it("mnt6753q leq", async () => { + const tv = buildTestVector2(mnt6753q, "leq"); + await tester(mnt6753q, tv); + }); +/* + it("bn128r geq", async () => { + const tv = buildTestVector2(bn128r, "geq"); + await tester(bn128r, tv); + }); + it("secp256k1q geq", async () => { + const tv = buildTestVector2(secp256k1q, "geq"); + await tester(secp256k1q, tv); + }); +*/ + it("mnt6753q geq", async () => { + const tv = buildTestVector2(mnt6753q, "geq"); + await tester(mnt6753q, tv); + }); +}); + +function buildTestVector2(p, op) { + const F = new ZqField(p); + const tv = []; + const nums = getCriticalNumbers(p, 2); + + const excludeZero = ["div", "mod"].indexOf(op) >= 0; + + for (let i=0; i= 0; + + for (let i=0; i