; -----------------------------------------------------------------------------
; Prime-field arithmetic (EJS template -> NASM, x86-64).
;
; Template inputs used below: name (symbol prefix), n64 (number of 64-bit limbs),
; q (modulus, big integer), bigInt, constantElement().
;
; Register convention used by every routine in this file:
;   rdi = destination pointer, rsi = first operand pointer, rdx = second operand
;
; Element layout as used by <%=name%>_mul: one 64-bit header followed by n64
; limbs. Header bit 63 set = "long" (limbs are valid); bit 62 set = Montgomery
; form. With bit 63 clear the value is a "short" held in the low 32 bits.
; -----------------------------------------------------------------------------

        global <%=name%>_add
        global <%=name%>_mul
        global <%=name%>_q
        DEFAULT REL

        section .text

;;;;;;;;;;;;;;;;;;;;;;
; add
;;;;;;;;;;;;;;;;;;;;;;
; [rdi] = ([rsi] + [rdx]) reduced by at most one subtraction of q.
; All three pointers address raw n64-limb little-endian numbers.
; Clobbers: rax, flags.
<%=name%>_add:
        ; Add limb by limb, propagating the carry
<% for (let i=0; i<n64; i++) { %>
        mov     rax, [rsi + <%=i*8%>]
        <%= i==0 ? "add" : "adc" %>     rax, [rdx + <%=i*8%>]
        mov     [rdi + <%=i*8%>], rax
<% } %>
        jc      add_sq                  ; carry out of the top limb: subtract q

        ; Compare the sum with q, most significant limb first.
        ; rax still holds the top limb, so the first load is skipped.
        ; Limbs are unsigned, hence ja/jb rather than jg/jl.
<% for (let i=0; i<n64; i++) { %>
<%   if (i>0) { %>
        mov     rax, [rdi + <%= (n64-i-1)*8 %>]
<%   } %>
        cmp     rax, [q + <%= (n64-i-1)*8 %>]
        ja      add_sq
        jb      add_done
<% } %>
        ; All limbs equal: the sum is exactly q, so subtract q as well

add_sq:
<% for (let i=0; i<n64; i++) { %>
        mov     rax, [q + <%=i*8%>]
        <%= i==0 ? "sub" : "sbb" %>     [rdi + <%=i*8%>], rax
<% } %>

add_done:
        ret

;;;;;;;;;;;;;;;;;;;;;;
; mul Montgomery
;;;;;;;;;;;;;;;;;;;;;;
; [rdi] = [rsi] * [rdx] * 2^(-64*n64) mod q, computed column by column with a
; three-register rotating accumulator (r8, r9, r10).
; rdi may alias rsi: limb k of the result is stored only after the last read of
; limb k of the operand.
; Uses n64*8 bytes of stack for the per-column reduction multipliers m[i].
; Clobbers: rax, rcx, rdx, r8, r9, r10, r11, flags.
mulM:
<%
let r0, r1, r2;
// Rotate the accumulator roles: r0 = current column, r1 = next, r2 = next+1.
function setR(step) {
    if ((step % 3) == 0) {
        r0 = "r8";  r1 = "r9";  r2 = "r10";
    } else if ((step % 3) == 1) {
        r0 = "r9";  r1 = "r10"; r2 = "r8";
    } else {
        r0 = "r10"; r1 = "r8";  r2 = "r9";
    }
}

const base = bigInt.one.shiftLeft(64);
const np64 = base.minus(q.modInv(base));    // -q^(-1) mod 2^64
%>
        sub     rsp, <%= n64*8 %>       ; reserve space for the m[i] values
        mov     rcx, rdx                ; mul clobbers rdx, keep operand b in rcx
        mov     r11, 0x<%= np64.toString(16) %> ; np
        xor     r8, r8
        xor     r9, r9
        xor     r10, r10

<%
// Main loop: one iteration per output column of the 2*n64-limb product.
for (let i=0; i<n64*2; i++) {
    setR(i);

    // Same digit: every a[o1]*b[o2] partial product with o1+o2 == i
    for (let o1=Math.max(0, i-n64+1); (o1<=i)&&(o1<n64); o1++) {
        const o2 = i-o1;
%>
        mov     rax, [rsi + <%= 8*o1 %>]
        mul     qword [rcx + <%= 8*o2 %>]
        add     <%= r0 %>, rax
        adc     <%= r1 %>, rdx
        adc     <%= r2 %>, 0x0
<%
    } // Same digit

    // All ms: add m[j]*q[i-j] for the multipliers already computed.
    // Only n64 multipliers and n64 limbs of q exist, so both indices are bounded.
    for (let j=i-1; j>=0; j--) {
        if (((i-j)<n64) && (j<n64)) {
%>
        mov     rax, [rsp + <%= j*8 %>]
        mul     qword [q + <%= (i-j)*8 %>]
        add     <%= r0 %>, rax
        adc     <%= r1 %>, rdx
        adc     <%= r2 %>, 0x0
<%
        }
    } // ms

    if (i<n64) {
        // Low half: choose m[i] so that adding m[i]*q[0] zeroes this column.
%>
        mov     rax, <%= r0 %>
        mul     r11
        mov     [rsp + <%= i*8 %>], rax
        mul     qword [q]
        add     <%= r0 %>, rax
        adc     <%= r1 %>, rdx
        adc     <%= r2 %>, 0x0
<%
    } else {
        // High half: the column is final, emit it as a result limb.
%>
        mov     [rdi + <%= (i-n64)*8 %>], <%= r0 %>
        xor     <%= r0 %>, <%= r0 %>
<%
    }
} // Main loop
%>
        cmp     <%= r1 %>, 0x0          ; anything carried past the top limb?
        jne     mulM_sq

        ; Compare the result with q, most significant limb first (unsigned)
<% for (let i=0; i<n64; i++) { %>
        mov     rax, [rdi + <%= (n64-i-1)*8 %>]
        cmp     rax, [q + <%= (n64-i-1)*8 %>]
        ja      mulM_sq
        jb      mulM_done
<% } %>
        ; All limbs equal: the result is exactly q, so subtract q as well

mulM_sq:
        ; rdx was clobbered by mul above and must not be used as a pointer here
<% for (let i=0; i<n64; i++) { %>
        mov     rax, [q + <%= i*8 %>]
        <%= i==0 ? "sub" : "sbb" %>     [rdi + <%= i*8 %>], rax
<% } %>

mulM_done:
        add     rsp, <%= n64*8 %>       ; release the m[i] scratch space
        ret

;;;;;;;;;;;;;;;;;;;;;;
; mul MontgomeryShort
;;;;;;;;;;;;;;;;;;;;;;
; NOTE(review): mulSM has no body. Execution falls through into <%=name%>_mul,
; so the `call mulSM` sites below do not perform a short*long multiplication.
; This routine still has to be implemented.
mulSM:

;;;;;;;;;;;;;;;;;;;;;;
; mul
;;;;;;;;;;;;;;;;;;;;;;
; [rdi] = [rsi] * [rdx] for tagged elements (header + n64 limbs).
; Dispatches on the header bits of both operands: short/long (bit 63) and
; normal/Montgomery (bit 62).
<%=name%>_mul:
        mov     rax, [rsi]
        bt      rax, 63
        jc      l1                      ; first operand is long
        mov     rcx, [rdx]
        bt      rcx, 63
        jc      s1l2                    ; first short, second long

s1s2:                                   ; both operands are short
        ; NOTE(review): this is an unsigned 32x32 multiply, yet the code below
        ; tests the product for negativity; confirm whether shorts are signed.
        mul     ecx
        jc      rs2l                    ; product does not fit in 32 bits: convert to long

        ; The short multiplication is done: store the value and return
        mov     [rdi], rax
        ret

rs2l:                                   ; the product does not fit in a short
        ; The product is in edx:eax; pack it into rdx.
        ; `mul ecx` left the upper halves of rax and rdx zeroed, so shift+or is
        ; enough. (`mov edx, eax` would zero-extend and discard the high half.)
        shl     rdx, 32
        or      rdx, rax
        xor     rax, rax                ; header: long, normal form
        bts     rax, 63
        mov     [rdi], rax
        cmp     rdx, 0                  ; is the packed value negative?
        jl      rs2ln

        ; Non-negative value
        mov     [rdi + 8], rdx          ; first limb
        xor     rax, rax                ; remaining limbs are zero
<% for (let i=1; i<n64; i++) { %>
        mov     [rdi + <%= (i+1)*8 %>], rax
<% } %>
        ret

rs2ln:                                  ; negative value: store value + q
        add     rdx, [q]                ; first limb
        mov     [rdi + 8], rdx
        mov     rdx, -1                 ; sign extension (all ones); mov keeps CF
<% for (let i=1; i<n64; i++) { %>
        mov     rax, rdx
        adc     rax, [q + <%= i*8 %>]
        mov     [rdi + <%= (i+1)*8 %>], rax
<% } %>
        ret

l1:                                     ; first operand is long
        mov     rcx, [rdx]
        bt      rcx, 63
        jc      ll                      ; both operands are long

l1s2:                                   ; first long, second short
        xor     rdx, rdx
        mov     edx, ecx                ; rdx = short value of the second operand
        bt      rax, 62
        jc      lsM
        jmp     lsN

s1l2:                                   ; first short, second long
        mov     rsi, rdx                ; rsi = the long operand
        xor     rdx, rdx
        mov     edx, eax                ; rdx = short value of the first operand
        bt      rcx, 62
        jc      lsM
        ; fall through to lsN

lsN:                                    ; long operand is in normal form
        mov     byte [rdi + 7], 0xC0    ; result header: long, Montgomery (bits 63 and 62)
        add     rsi, 8                  ; skip the headers
        add     rdi, 8
        call    mulSM
        ; NOTE(review): rsi still points at the original operand here, so this
        ; multiplies the operand (not the mulSM result) by R3. Revisit once
        ; mulSM is implemented and its register contract is known.
        mov     rdx, R3
        call    mulM
        ret

lsM:                                    ; long operand is in Montgomery form
        mov     byte [rdi + 7], 0x80    ; result header: long, normal (bit 63)
        add     rsi, 8
        add     rdi, 8
        call    mulSM
        ret

ll:                                     ; both operands are long
        bt      rax, 62
        jc      lml
        bt      rcx, 62
        jc      lnlm

lnln:                                   ; normal * normal
        mov     byte [rdi + 7], 0xC0    ; result header: long, Montgomery
        add     rsi, 8
        add     rdi, 8
        add     rdx, 8
        call    mulM                    ; [rdi] = a*b*R^-1
        mov     rsi, rdi                ; multiply the result, in place, ...
        mov     rdx, R3                 ; ... by R^3 mod q
        call    mulM                    ; [rdi] = a*b*R
        ret

lml:                                    ; first operand is Montgomery
        bt      rcx, 62
        jc      lmlm

lnlm:                                   ; exactly one operand is Montgomery
        mov     byte [rdi + 7], 0x80    ; result header: long, normal
        add     rsi, 8
        add     rdi, 8
        add     rdx, 8
        call    mulM
        ret

lmlm:                                   ; Montgomery * Montgomery
        mov     byte [rdi + 7], 0xC0    ; result header: long, Montgomery
        add     rsi, 8
        add     rdi, 8
        add     rdx, 8
        call    mulM
        ret

        section .data

; The modulus as a tagged element: header (long, normal form) followed by the
; limbs at label q.
<%=name%>_q:
        dd      0
        dd      0x80000000
q       dq      <%= constantElement(q) %>
; 2^(3*64*n64) mod q, used to bring a mulM result into Montgomery form
R3      dq      <%= constantElement(bigInt.one.shiftLeft(n64*64*3).mod(q)) %>