Begining of wasm

This commit is contained in:
Jordi Baylina
2020-03-09 21:16:56 +01:00
parent 6c1a3e7687
commit 8f63d18ff4
70 changed files with 4135 additions and 1353 deletions

View File

@@ -0,0 +1,33 @@
const tester = require("../c/buildasm/buildzqfieldtester2.js");
const bigInt = require("big-integer");
const __P__ = new bigInt("21888242871839275222246405745257275088548364400416034343698204186575808495617");
describe("basic cases", function () {
this.timeout(100000);
it("should do basic tests", async () => {
await tester(__P__, [
["add", 0, 0],
["add", 0, 1],
["add", 1, 0],
["add", 1, 1],
["add", 2, 1],
["add", 2, 10],
["add", -1, -1],
["add", -20, -10],
["add", "10604728079509999371218483608188593244163417117449316147628604036713980815027", "10604728079509999371218483608188593244163417117449316147628604036713980815027"],
["mul", 0, 0],
["mul", 0, 1],
["mul", 1, 0],
["mul", 1, 1],
["mul", 2, 1],
["mul", 2, 10],
["mul", -1, -1],
["mul", -20, -10],
["mul", "10604728079509999371218483608188593244163417117449316147628604036713980815027", "10604728079509999371218483608188593244163417117449316147628604036713980815027"],
]);
});
});

View File

@@ -0,0 +1,209 @@
const bigInt=require("big-integer");
class ZqBuilder {
constructor(q, name) {
this.q=bigInt(q);
this.h = [];
this.c = [];
this.name = name;
}
build() {
this._buildHeaders();
this._buildAdd();
this._buildMul();
this.c.push(""); this.h.push("");
return [this.h.join("\n"), this.c.join("\n")];
}
_buildHeaders() {
this.n64 = Math.floor((this.q.bitLength() - 1) / 64)+1;
this.h.push("typedef unsigned long long u64;");
this.h.push(`typedef u64 ${this.name}Element[${this.n64}];`);
this.h.push(`typedef u64 *P${this.name}Element;`);
this.h.push(`extern ${this.name}Element ${this.name}_q;`);
this.h.push(`#define ${this.name}_N64 ${this.n64}`);
this.c.push(`#include "${this.name.toLowerCase()}.h"`);
this._defineConstant(`${this.name}_q`, this.q);
this.c.push(""); this.h.push("");
}
_defineConstant(n, v) {
let S = `${this.name}Element ${n}={`;
const mask = bigInt("FFFFFFFFFFFFFFFF", 16);
for (let i=0; i<this.n64; i++) {
if (i>0) S = S+",";
let shex = v.shiftRight(i*64).and(mask).toString(16);
while (shex <16) shex = "0" + shex;
S = S + "0x" + shex + "ULL";
}
S += "};";
this.c.push(S);
}
_buildAdd() {
this.h.push(`void ${this.name}_add(P${this.name}Element r, P${this.name}Element a, P${this.name}Element b);`);
this.c.push(`void ${this.name}_add(P${this.name}Element r, P${this.name}Element a, P${this.name}Element b) {`);
this.c.push(" __asm__ __volatile__ (");
for (let i=0; i<this.n64; i++) {
this.c.push(` "movq ${i*8}(%2), %%rax;"`);
this.c.push(` "${i==0 ? "addq" : "adcq"} ${i*8}(%1), %%rax;"`);
this.c.push(` "movq %%rax, ${i*8}(%0);"`);
}
this.c.push(" \"jc SQ;\"");
for (let i=0; i<this.n64; i++) {
if (i>0) {
this.c.push(` "movq ${(this.n64 - i-1)*8}(%0), %%rax;"`);
}
this.c.push(` "cmp ${(this.n64 - i-1)*8}(%3), %%rax;"`);
this.c.push(" \"jg SQ;\"");
this.c.push(" \"jl DONE;\"");
}
this.c.push(" \"SQ:\"");
for (let i=0; i<this.n64; i++) {
this.c.push(` "movq ${i*8}(%3), %%rax;"`);
this.c.push(` "${i==0 ? "subq" : "sbbq"} %%rax, ${i*8}(%0);"`);
}
this.c.push(" \"DONE:\"");
this.c.push(` :: "r" (r), "r" (a), "r" (b), "r" (${this.name}_q) : "%rax", "memory");`);
this.c.push("}\n");
}
_buildMul() {
let r0, r1, r2;
function setR(step) {
if ((step % 3) == 0) {
r0 = "%%r8";
r1 = "%%r9";
r2 = "%%r10";
} else if ((step % 3) == 1) {
r0 = "%%r9";
r1 = "%%r10";
r2 = "%%r8";
} else {
r0 = "%%r10";
r1 = "%%r8";
r2 = "%%r9";
}
}
const base = bigInt.one.shiftLeft(64);
const np64 = base.minus(this.q.modInv(base));
this.h.push(`void ${this.name}_mul(P${this.name}Element r, P${this.name}Element a, P${this.name}Element b);`);
this.c.push(`void ${this.name}_mul(P${this.name}Element r, P${this.name}Element a, P${this.name}Element b) {`);
this.c.push(" __asm__ __volatile__ (");
this.c.push(` "subq $${this.n64*8}, %%rsp;"`);
this.c.push(` "movq $0x${np64.toString(16)}, %%r11;"`);
this.c.push(" \"movq $0x0, %%r8;\"");
this.c.push(" \"movq $0x0, %%r9;\"");
this.c.push(" \"movq $0x0, %%r10;\"");
for (let i=0; i<this.n64*2; i++) {
setR(i);
for (let o1=Math.max(0, i-this.n64+1); (o1<=i)&&(o1<this.n64); o1++) {
const o2= i-o1;
this.c.push(` "movq ${o1*8}(%1), %%rax;"`);
this.c.push(` "mulq ${o2*8}(%2);"`);
this.c.push(` "addq %%rax, ${r0};"`);
this.c.push(` "adcq %%rdx, ${r1};"`);
this.c.push(` "adcq $0x0, ${r2};"`);
}
for (let j=i-1; j>=0; j--) {
if (((i-j)<this.n64)&&(j<this.n64)) {
this.c.push(` "movq ${j*8}(%%rsp), %%rax;"`);
this.c.push(` "mulq ${(i-j)*8}(%3);"`);
this.c.push(` "addq %%rax, ${r0};"`);
this.c.push(` "adcq %%rdx, ${r1};"`);
this.c.push(` "adcq $0x0, ${r2};"`);
}
}
if (i<this.n64) {
this.c.push(` "movq ${r0}, %%rax;"`);
this.c.push(" \"mulq %%r11;\"");
this.c.push(` "movq %%rax, ${i*8}(%%rsp);"`);
this.c.push(" \"mulq (%3);\"");
this.c.push(` "addq %%rax, ${r0};"`);
this.c.push(` "adcq %%rdx, ${r1};"`);
this.c.push(` "adcq $0x0, ${r2};"`);
} else {
this.c.push(` "movq ${r0}, ${(i-this.n64)*8}(%0);"`);
this.c.push(` "movq $0, ${r0};"`);
}
}
this.c.push(` "cmp $0, ${r1};"`);
this.c.push(" \"jne SQ2;\"");
for (let i=0; i<this.n64; i++) {
this.c.push(` "movq ${(this.n64 - i-1)*8}(%0), %%rax;"`);
this.c.push(` "cmp ${(this.n64 - i-1)*8}(%3), %%rax;"`);
this.c.push(" \"jg SQ2;\"");
this.c.push(" \"jl DONE2;\"");
}
this.c.push(" \"SQ2:\"");
for (let i=0; i<this.n64; i++) {
this.c.push(` "movq ${i*8}(%3), %%rax;"`);
this.c.push(` "${i==0 ? "subq" : "sbbq"} %%rax, ${i*8}(%0);"`);
}
this.c.push(" \"DONE2:\"");
this.c.push(` "addq $${this.n64*8}, %%rsp;"`);
this.c.push(` :: "r" (r), "r" (a), "r" (b), "r" (${this.name}_q) : "%rax", "%rdx", "%r8", "%r9", "%r10", "%r11", "memory");`);
this.c.push("}\n");
}
_buildIDiv() {
this.h.push(`void ${this.name}_idiv(P${this.name}Element r, P${this.name}Element a, P${this.name}Element b);`);
this.c.push(`void ${this.name}_idiv(P${this.name}Element r, P${this.name}Element a, P${this.name}Element b) {`);
this.c.push(" __asm__ __volatile__ (");
this.c.push(" \"pxor %%xmm0, %%xmm0;\""); // Comparison Register
if (this.n64 == 1) {
this.c.push(` "mov %%rax, $${this.n64 - 8};"`);
} else {
this.c.push(` "mov %%rax, $${this.n64 -16};"`);
}
this.c.push(` :: "r" (r), "r" (a), "r" (b), "r" (${this.name}_q) : "%rax", "%rdx", "%r8", "%r9", "%r10", "%r11", "memory");`);
this.c.push("}\n");
}
}
var runningAsScript = !module.parent;
if (runningAsScript) {
const fs = require("fs");
var argv = require("yargs")
.usage("Usage: $0 -q [primeNum] -n [name] -oc [out .c file] -oh [out .h file]")
.demandOption(["q","n"])
.alias("q", "prime")
.alias("n", "name")
.argv;
const q = bigInt(argv.q);
const cFileName = (argv.oc) ? argv.oc : argv.name.toLowerCase() + ".c";
const hFileName = (argv.oh) ? argv.oh : argv.name.toLowerCase() + ".h";
const builder = new ZqBuilder(q, argv.name);
const res = builder.build();
fs.writeFileSync(hFileName, res[0], "utf8");
fs.writeFileSync(cFileName, res[1], "utf8");
} else {
module.exports = function(q, name) {
const builder = new ZqBuilder(q, name);
return builder.build();
};
}

View File

@@ -0,0 +1,68 @@
const chai = require("chai");
const assert = chai.assert;
const fs = require("fs");
var tmp = require("tmp-promise");
const path = require("path");
const util = require("util");
const exec = util.promisify(require("child_process").exec);
const bigInt = require("big-integer");
const BuildZqField = require("./buildzqfield");
const ZqField = require("fflib").ZqField;
module.exports = testField;
function toMontgomeryStr(a, prime) {
const n64 = Math.floor((prime.bitLength() - 1) / 64)+1;
return a.shiftLeft(n64*64).mod(prime).toString(10);
}
function fromMontgomeryStr(a, prime) {
const n64 = Math.floor((prime.bitLength() - 1) / 64)+1;
const R = bigInt.one.shiftLeft(n64*64).mod(prime);
const RI = R.modInv(prime);
return bigInt(a).times(RI).mod(prime);
}
async function testField(prime, test) {
tmp.setGracefulCleanup();
const F = new ZqField(prime);
const dir = await tmp.dir({prefix: "circom_", unsafeCleanup: true });
const [hSource, cSource] = BuildZqField(prime, "Fr");
await fs.promises.writeFile(path.join(dir.path, "fr.h"), hSource, "utf8");
await fs.promises.writeFile(path.join(dir.path, "fr.c"), cSource, "utf8");
await exec("g++" +
` ${path.join(__dirname, "tester.c")}` +
` ${path.join(dir.path, "fr.c")}` +
` -o ${path.join(dir.path, "tester")}` +
" -lgmp"
);
for (let i=0; i<test.length; i++) {
let a = bigInt(test[i][1]).mod(prime);
if (a.isNegative()) a = prime.add(a);
let b = bigInt(test[i][2]).mod(prime);
if (b.isNegative()) b = prime.add(b);
const ec = F[test[i][0]](a,b);
// console.log(toMontgomeryStr(a, prime));
// console.log(toMontgomeryStr(b, prime));
const res = await exec(`${path.join(dir.path, "tester")}` +
` ${test[i][0]}` +
` ${toMontgomeryStr(a, prime)}` +
` ${toMontgomeryStr(b, prime)}`
);
// console.log(res.stdout);
const c=fromMontgomeryStr(res.stdout, prime);
assert.equal(ec.toString(), c.toString());
}
}

View File

@@ -0,0 +1,302 @@
global <%=name%>_add
global <%=name%>_mul
global <%=name%>_q
DEFAULT REL
section .text
;;;;;;;;;;;;;;;;;;;;;;
; add
;;;;;;;;;;;;;;;;;;;;;;
<%=name%>_add:
; Add component by component with carry
<% for (let i=0; i<n64; i++) { %>
mov rax, [rsi + <%=i*8%>]
<%= i==0 ? "add" : "adc" %> rax, [rdx + <%=i*8%>]
mov [rdi + <%=i*8%>], rax
<% } %>
jc add_sq ; if overflow, substract q
; Compare with q
<% for (let i=0; i<n64; i++) { %>
<% if (i>0) { %>
mov rax, [rdi + <%= (n64-i-1)*8 %>]
<% } %>
cmp rax, [q + <%= (n64-i-1)*8 %>]
jg add_sq
jl add_done
<% } %>
; If equal substract q
add_sq:
<% for (let i=0; i<n64; i++) { %>
mov rax, [q + <%=i*8%>]
<%= i==0 ? "sub" : "sbb" %> [rdi + <%=i*8%>], rax
mov [rdx + <%=i*8%>], rax
<% } %>
add_done:
ret
;;;;;;;;;;;;;;;;;;;;;;
; mul Montgomery
;;;;;;;;;;;;;;;;;;;;;;
mulM:
<%
let r0, r1, r2;
function setR(step) {
if ((step % 3) == 0) {
r0 = "r8";
r1 = "r9";
r2 = "r10";
} else if ((step % 3) == 1) {
r0 = "r9";
r1 = "r10";
r2 = "r8";
} else {
r0 = "r10";
r1 = "r8";
r2 = "r9";
}
}
const base = bigInt.one.shiftLeft(64);
const np64 = base.minus(q.modInv(base));
%>
sub rsp, <%= n64*8 %> ; Reserve space for ms
mov rcx, rdx ; rdx is needed for multiplications so keep it in cx
mov r11, 0x<%= np64.toString(16) %> ; np
xor r8,r8
xor r9,r9
xor r10,r10
<%
// Main loop
for (let i=0; i<n64*2; i++) {
setR(i);
%>
<%
// Same Digit
for (let o1=Math.max(0, i-n64+1); (o1<=i)&&(o1<n64); o1++) {
const o2= i-o1;
%>
mov rax, [rsi + <%= 8*o1 %>]
mul qword [rcx + <%= 8*o2 %>]
add <%= r0 %>, rax
adc <%= r1 %>, rdx
adc <%= r2 %>, 0x0
<%
} // Same digit
%>
<%
for (let j=i-1; j>=0; j--) { // All ms
if (((i-j)<n64)&&(j<n64)) {
%>
mov rax, [rsp + <%= j*8 %>]
mul qword [q + <%= (i-j)*8 %>]
add <%= r0 %>, rax
adc <%= r1 %>, rdx
adc <%= r2 %>, 0x0
<%
}
} // ms
%>
<%
if (i<n64) {
%>
mov rax, <%= r0 %>
mul r11
mov [rsp + <%= i*8 %>], rax
mul qword [q]
add <%= r0 %>, rax
adc <%= r1 %>, rdx
adc <%= r2 %>, 0x0
<%
} else {
%>
mov [rdi + <%= (i-n64)*8 %> ], <%= r0 %>
xor <%= r0 %>,<%= r0 %>
<%
}
%>
<%
} // Main Loop
%>
cmp <%= r1 %>, 0x0
jne mulM_sq
; Compare with q
<%
for (let i=0; i<n64; i++) {
%>
mov rax, [rdi + <%= (n64-i-1)*8 %>]
cmp rax, [q + <%= (n64-i-1)*8 %>]
jg mulM_sq
jl mulM_done
<%
}
%>
; If equal substract q
mulM_sq:
<%
for (let i=0; i<n64; i++) {
%>
mov rax, [q + <%= i*8 %>]
<%= i==0 ? "sub" : "sbb" %> [rdi + <%= i*8 %>], rax
mov [rdx + <%= i*8 %>], rax
<%
}
%>
mulM_done:
add rsp, <%= n64*8 %> ; recover rsp
ret
;;;;;;;;;;;;;;;;;;;;;;
; mul MontgomeryShort
;;;;;;;;;;;;;;;;;;;;;;
mulSM:
;;;;;;;;;;;;;;;;;;;;;;
; mul
;;;;;;;;;;;;;;;;;;;;;;
<%=name%>_mul:
mov rax, [rsi]
bt rax, 63
jc l1
mov rcx, [rdx]
bt rcx, 63
jc s1l2
s1s2: ; short first and second
mul ecx
jc rs2l ; If if doesn't feed in 32 bits convert the result to long
; The shorts multiplication is done. copy the val to destination and return
mov [rdi], rax
ret
rs2l: ; The result in the multiplication doen't feed
; we have the result in edx:eax we need to convert it to long
shl rdx, 32
mov edx, eax ; pack edx:eax to rdx
xor rax, rax ; Set the format to long
bts rax, 63
mov [rdi], rax ; move the first digit
cmp rdx, 0 ; check if redx is negative.
jl rs2ln
; edx is positive.
mov [rdi + 8], rdx ; Set the firs digit
xor rax, rax ; Set the remaining digits to 0
<% for (let i=1; i<n64; i++) { %>
mov [rdi + <%= (i+1)*8 %>], rax
<% } %>
ret
; edx is negative.
rs2ln:
add rdx, [q] ; Set the firs digit
mov [rdi + 8], rdx ;
mov rdx, -1 ; all ones
<% for (let i=1; i<n64; i++) { %>
mov rax, rdx ; Add to q
adc rax, [q + <%= i*8 %> ]
mov [rdi + <%= (i+1)*8 %>], rax
<% } %>
ret
l1:
mov rcx, [rdx]
bt rcx, 63
jc ll
l1s2:
xor rdx, rdx
mov edx, ecx
bt rax, 62
jc lsM
jmp lsN
s1l2:
mov rsi, rdx
xor rdx, rdx
mov edx, eax
bt rcx, 62
jc lsM
jmp lsN
lsN:
mov byte [rdi + 3], 0xC0 ; set the result to montgomery
add rsi, 8
add rdi, 8
call mulSM
mov rdx, R3
call mulM
ret
lsM:
mov byte [rdi + 3], 0x80 ; set the result to long normal
add rsi, 8
add rdi, 8
call mulSM
ret
ll:
bt rax, 62
jc lml
bt rcx, 62
jc lnlm
lnln:
mov byte [rdi + 3], 0xC0 ; set the result to long montgomery
add rsi, 8
add rdi, 8
add rdx, 8
call mulM
mov rdi, rsi
mov rdx, R3
call mulM
ret
lml:
bt rcx, 62
jc lmlm
lnlm:
mov byte [rdi + 3], 0x80 ; set the result to long normal
add rsi, 8
add rdi, 8
add rdx, 8
call mulM
ret
lmlm:
mov byte [rdi + 3], 0xC0 ; set the result to long montgomery
add rsi, 8
add rdi, 8
add rdx, 8
call mulM
ret
section .data
<%=name%>_q:
dd 0
dd 0x80000000
q dq <%= constantElement(q) %>
R3 dq <%= constantElement(bigInt.one.shiftLeft(n64*64*3).mod(q)) %>

View File

@@ -0,0 +1,251 @@
;;;;;;;;;;;;;;;;;;;;;;
; mul Montgomery
;;;;;;;;;;;;;;;;;;;;;;
mulM:
<%
let r0, r1, r2;
function setR(step) {
if ((step % 3) == 0) {
r0 = "r8";
r1 = "r9";
r2 = "r10";
} else if ((step % 3) == 1) {
r0 = "r9";
r1 = "r10";
r2 = "r8";
} else {
r0 = "r10";
r1 = "r8";
r2 = "r9";
}
}
const base = bigInt.one.shiftLeft(64);
const np64 = base.minus(q.modInv(base));
%>
sub rsp, <%= n64*8 %> ; Reserve space for ms
mov rcx, rdx ; rdx is needed for multiplications so keep it in cx
mov r11, 0x<%= np64.toString(16) %> ; np
xor r8,r8
xor r9,r9
xor r10,r10
<%
// Main loop
for (let i=0; i<n64*2; i++) {
setR(i);
%>
<%
// Same Digit
for (let o1=Math.max(0, i-n64+1); (o1<=i)&&(o1<n64); o1++) {
const o2= i-o1;
%>
mov rax, [rsi + <%= 8*o1 %>]
mul qword [rcx + <%= 8*o2 %>]
add <%= r0 %>, rax
adc <%= r1 %>, rdx
adc <%= r2 %>, 0x0
<%
} // Same digit
%>
<%
for (let j=i-1; j>=0; j--) { // All ms
if (((i-j)<n64)&&(j<n64)) {
%>
mov rax, [rsp + <%= j*8 %>]
mul qword [q + <%= (i-j)*8 %>]
add <%= r0 %>, rax
adc <%= r1 %>, rdx
adc <%= r2 %>, 0x0
<%
}
} // ms
%>
<%
if (i<n64) {
%>
mov rax, <%= r0 %>
mul r11
mov [rsp + <%= i*8 %>], rax
mul qword [q]
add <%= r0 %>, rax
adc <%= r1 %>, rdx
adc <%= r2 %>, 0x0
<%
} else {
%>
mov [rdi + <%= (i-n64)*8 %> ], <%= r0 %>
xor <%= r0 %>,<%= r0 %>
<%
}
%>
<%
} // Main Loop
%>
cmp <%= r1 %>, 0x0
jne mulM_sq
; Compare with q
<%
for (let i=0; i<n64; i++) {
%>
mov rax, [rdi + <%= (n64-i-1)*8 %>]
cmp rax, [q + <%= (n64-i-1)*8 %>]
jg mulM_sq
jl mulM_done
<%
}
%>
; If equal substract q
mulM_sq:
<%
for (let i=0; i<n64; i++) {
%>
mov rax, [q + <%= i*8 %>]
<%= i==0 ? "sub" : "sbb" %> [rdi + <%= i*8 %>], rax
<%
}
%>
mulM_done:
add rsp, <%= n64*8 %> ; recover rsp
ret
;;;;;;;;;;;;;;;;;;;;;;
; mul MontgomeryShort
;;;;;;;;;;;;;;;;;;;;;;
mulSM:
;;;;;;;;;;;;;;;;;;;;;;
; mul
;;;;;;;;;;;;;;;;;;;;;;
<%=name%>_mul:
mov rax, [rsi]
bt rax, 63
jc l1
mov rcx, [rdx]
bt rcx, 63
jc s1l2
s1s2: ; short first and second
mul ecx
jc rs2l ; If if doesn't feed in 32 bits convert the result to long
; The shorts multiplication is done. copy the val to destination and return
mov [rdi], rax
ret
rs2l: ; The result in the multiplication doen't feed
; we have the result in edx:eax we need to convert it to long
shl rdx, 32
mov edx, eax ; pack edx:eax to rdx
xor rax, rax ; Set the format to long
bts rax, 63
mov [rdi], rax ; move the first digit
cmp rdx, 0 ; check if redx is negative.
jl rs2ln
; edx is positive.
mov [rdi + 8], rdx ; Set the firs digit
xor rax, rax ; Set the remaining digits to 0
<% for (let i=1; i<n64; i++) { %>
mov [rdi + <%= (i+1)*8 %>], rax
<% } %>
ret
; edx is negative.
rs2ln:
add rdx, [q] ; Set the firs digit
mov [rdi + 8], rdx ;
mov rdx, -1 ; all ones
<% for (let i=1; i<n64; i++) { %>
mov rax, rdx ; Add to q
adc rax, [q + <%= i*8 %> ]
mov [rdi + <%= (i+1)*8 %>], rax
<% } %>
ret
l1:
mov rcx, [rdx]
bt rcx, 63
jc ll
l1s2:
xor rdx, rdx
mov edx, ecx
bt rax, 62
jc lsM
jmp lsN
s1l2:
mov rsi, rdx
xor rdx, rdx
mov edx, eax
bt rcx, 62
jc lsM
jmp lsN
lsN:
mov byte [rdi + 7], 0xC0 ; set the result to montgomery
add rsi, 8
add rdi, 8
call mulSM
mov rsi, rdi
lea rdx, [R3]
call mulM
ret
lsM:
mov byte [rdi + 7], 0x80 ; set the result to long normal
add rsi, 8
add rdi, 8
call mulSM
ret
ll:
bt rax, 62
jc lml
bt rcx, 62
jc lnlm
lnln:
mov byte [rdi + 7], 0xC0 ; set the result to long montgomery
add rsi, 8
add rdi, 8
add rdx, 8
call mulM
mov rsi, rdi
lea rdx, [R3]
call mulM
ret
lml:
bt rcx, 62
jc lmlm
lnlm:
mov byte [rdi + 7], 0x80 ; set the result to long normal
add rsi, 8
add rdi, 8
add rdx, 8
call mulM
ret
lmlm:
mov byte [rdi + 7], 0xC0 ; set the result to long montgomery
add rsi, 8
add rdi, 8
add rdx, 8
call mulM
ret