|
|
; NASM-syntax x86-64 source template. The template tags (name, n64, q, ...)
; are expanded by a template engine before the result is assembled.
; The routines below take their pointer arguments in rdi, rsi, rdx.

; Symbols exported to the linker.
global <%=name%>_add
global <%=name%>_mul
global <%=name%>_q

; Make memory references without an explicit base register RIP-relative.
DEFAULT REL

section .text
;;;;;;;;;;;;;;;;;;;;;;
; add
;;;;;;;;;;;;;;;;;;;;;;
; Modular addition of two multi-limb unsigned integers (n64 qwords each,
; least significant limb first).
;   In:    rdi = pointer to the result limbs
;          rsi = pointer to the first operand limbs
;          rdx = pointer to the second operand limbs
;   Out:   [rdi] = a + b, with q subtracted once when the sum carried out
;          of the top limb or is >= q
;   Clobb: rax, flags
; The operands are only read; rdi may alias either of them because each
; limb is loaded before the matching result limb is stored.
<%=name%>_add:
        ; Add limb by limb, propagating the carry
<% for (let i=0; i<n64; i++) { %>
        mov     rax, [rsi + <%=i*8%>]
        <%= i==0 ? "add" : "adc" %>     rax, [rdx + <%=i*8%>]
        mov     [rdi + <%=i*8%>], rax
<% } %>
        jc      add_sq                  ; carry out of the top limb: subtract q

        ; Compare the sum with q, most significant limb first.
        ; Limbs are unsigned, so the unsigned conditions (ja/jb) are required.
        ; On the first iteration rax still holds the top limb stored above.
<% for (let i=0; i<n64; i++) { %>
<% if (i>0) { %>
        mov     rax, [rdi + <%= (n64-i-1)*8 %>]
<% } %>
        cmp     rax, [q + <%= (n64-i-1)*8 %>]
        ja      add_sq                  ; sum > q
        jb      add_done                ; sum < q, already reduced
<% } %>
        ; All limbs equal: the sum is exactly q, fall through and subtract

add_sq:
        ; result -= q (mov does not touch CF, so the borrow chain is intact)
<% for (let i=0; i<n64; i++) { %>
        mov     rax, [q + <%=i*8%>]
        <%= i==0 ? "sub" : "sbb" %>     [rdi + <%=i*8%>], rax
<% } %>

add_done:
        ret
;;;;;;;;;;;;;;;;;;;;;;
; mul Montgomery
;;;;;;;;;;;;;;;;;;;;;;
; Montgomery multiplication of two n64-limb unsigned integers
; (least significant limb first), generated fully unrolled.
;   In:    rdi = pointer to the result limbs
;          rsi = pointer to the first operand limbs
;          rdx = pointer to the second operand limbs
;   Out:   [rdi] = reduced result; q is subtracted once when the final
;          carry is non-zero or the value is >= q
;   Clobb: rax, rcx, rdx, r8, r9, r10, r11, flags
;   Stack: n64*8 bytes, used for the per-column reduction factors m[i]
; The result is built one column (limb position) at a time in a three
; register accumulator whose roles rotate every column (see setR).
mulM:
<%
let r0, r1, r2;

// Rotate the accumulator registers: after each column the middle word
// becomes the new low word.
function setR(step) {
    if ((step % 3) == 0) {
        r0 = "r8";
        r1 = "r9";
        r2 = "r10";
    } else if ((step % 3) == 1) {
        r0 = "r9";
        r1 = "r10";
        r2 = "r8";
    } else {
        r0 = "r10";
        r1 = "r8";
        r2 = "r9";
    }
}

const base = bigInt.one.shiftLeft(64);
const np64 = base.minus(q.modInv(base));
%>
        sub     rsp, <%= n64*8 %>       ; Reserve space for ms
        mov     rcx, rdx                ; rdx is needed for multiplications so keep it in cx
        mov     r11, 0x<%= np64.toString(16) %>  ; np = 2^64 - (q^-1 mod 2^64)
        xor     r8, r8
        xor     r9, r9
        xor     r10, r10
<%
// Main loop: one iteration per column
for (let i=0; i<n64*2; i++) {
    setR(i);
%>
<%
    // Same digit: every a[o1]*b[o2] with o1 + o2 == i
    for (let o1=Math.max(0, i-n64+1); (o1<=i)&&(o1<n64); o1++) {
        const o2 = i-o1;
%>
        mov     rax, [rsi + <%= 8*o1 %>]
        mul     qword [rcx + <%= 8*o2 %>]
        add     <%= r0 %>, rax
        adc     <%= r1 %>, rdx
        adc     <%= r2 %>, 0x0
<%
    } // Same digit
%>
<%
    // All ms: every m[j]*q[i-j] for the factors computed so far
    for (let j=i-1; j>=0; j--) {
        if (((i-j)<n64)&&(j<n64)) {
%>
        mov     rax, [rsp + <%= j*8 %>]
        mul     qword [q + <%= (i-j)*8 %>]
        add     <%= r0 %>, rax
        adc     <%= r1 %>, rdx
        adc     <%= r2 %>, 0x0
<%
        }
    } // ms
%>
<% if (i<n64) { %>
        ; m[<%= i %>] = low word * np; adding m*q[0] clears the low word
        mov     rax, <%= r0 %>
        mul     r11
        mov     [rsp + <%= i*8 %>], rax
        mul     qword [q]
        add     <%= r0 %>, rax
        adc     <%= r1 %>, rdx
        adc     <%= r2 %>, 0x0
<% } else { %>
        ; Upper columns: the low word is a finished result limb
        mov     [rdi + <%= (i-n64)*8 %> ], <%= r0 %>
        xor     <%= r0 %>,<%= r0 %>
<% } %>
<%
} // Main loop
%>
        ; A non-zero word left in the accumulator means the result overflowed
        cmp     <%= r1 %>, 0x0
        jne     mulM_sq

        ; Compare with q, most significant limb first.
        ; Limbs are unsigned, so the unsigned conditions (ja/jb) are required.
<% for (let i=0; i<n64; i++) { %>
        mov     rax, [rdi + <%= (n64-i-1)*8 %>]
        cmp     rax, [q + <%= (n64-i-1)*8 %>]
        ja      mulM_sq
        jb      mulM_done
<% } %>
        ; If equal, subtract q

mulM_sq:
        ; result -= q. Nothing is stored through rdx here: at this point rdx
        ; holds the high half of the last mul, not a pointer.
<% for (let i=0; i<n64; i++) { %>
        mov     rax, [q + <%= i*8 %>]
        <%= i==0 ? "sub" : "sbb" %>     [rdi + <%= i*8 %>], rax
<% } %>

mulM_done:
        add     rsp, <%= n64*8 %>       ; recover rsp
        ret
;;;;;;;;;;;;;;;;;;;;;;
; mul MontgomeryShort
;;;;;;;;;;;;;;;;;;;;;;
; NOTE(review): no instructions follow this label in the visible source, so
; a `call mulSM` continues straight into whatever code is emitted next.
; The callers in this file (lsN, lsM) advance rsi and rdi by 8 and place a
; 32-bit value in edx before calling, and afterwards expect a result at
; [rdi]; presumably this is meant to multiply a long element by a short
; one. TODO: implement, or confirm the body is supplied elsewhere.
mulSM:
;;;;;;;;;;;;;;;;;;;;;;
; mul
;;;;;;;;;;;;;;;;;;;;;;
; Multiply two field elements, dispatching on their representation.
;   In:    rdi = pointer to the result element
;          rsi = pointer to the first operand element
;          rdx = pointer to the second operand element
;   Clobb: rax, rcx, rdx, rsi, rdi, flags, plus whatever mulM/mulSM clobber
; Element layout as used by this routine: one qword header followed by the
; limbs at offset 8.
;   header bit 63 clear -> short: signed 32-bit value in the low dword
;   header bit 63 set   -> long:  limbs follow
;   header bit 62 set   -> long value is in Montgomery form
; Bits 63/62 live in byte 7 of the header, so the result type is written
; to [rdi + 7].
<%=name%>_mul:
        mov     rax, [rsi]              ; rax = header of the first operand
        bt      rax, 63
        jc      l1                      ; first operand is long
        mov     rcx, [rdx]              ; rcx = header of the second operand
        bt      rcx, 63
        jc      s1l2                    ; first short, second long

s1s2:                                   ; short first and second
        imul    ecx                     ; signed: edx:eax = eax * ecx
        jo      rs2l                    ; if it doesn't fit in 32 bits convert the result to long

        ; The short multiplication is done. Copy the value to the
        ; destination and return (writing eax cleared the upper half of
        ; rax, so the header flags are both zero).
        mov     [rdi], rax
        ret

rs2l:
        ; The product does not fit in 32 bits. It is in edx:eax and has to
        ; be packed into rdx. The low half must be OR-ed in: a 32-bit mov
        ; into edx would zero the upper half that was just shifted in.
        shl     rdx, 32
        mov     eax, eax                ; zero-extend eax into rax
        or      rdx, rax                ; rdx = signed 64-bit product

        xor     rax, rax                ; Set the format to long
        bts     rax, 63
        mov     [rdi], rax              ; store the header

        test    rdx, rdx                ; check if the product is negative
        js      rs2ln

        ; The product is positive.
        mov     [rdi + 8], rdx          ; Set the first limb

        xor     rax, rax                ; Set the remaining limbs to 0
<% for (let i=1; i<n64; i++) { %>
        mov     [rdi + <%= (i+1)*8 %>], rax
<% } %>
        ret

        ; The product is negative: store (sign-extended product) + q.
rs2ln:
        add     rdx, [q]                ; Set the first limb
        mov     [rdi + 8], rdx

        mov     rdx, -1                 ; all ones = sign extension (mov keeps CF)
<% for (let i=1; i<n64; i++) { %>
        mov     rax, rdx                ; Add to q
        adc     rax, [q + <%= i*8 %> ]
        mov     [rdi + <%= (i+1)*8 %>], rax
<% } %>
        ret

l1:                                     ; first operand is long
        mov     rcx, [rdx]              ; rcx = header of the second operand
        bt      rcx, 63
        jc      ll

l1s2:                                   ; long first, short second
        xor     rdx, rdx
        mov     edx, ecx                ; edx = short value
        bt      rax, 62
        jc      lsM
        jmp     lsN

s1l2:                                   ; short first, long second: swap roles
        mov     rsi, rdx                ; rsi = the long operand
        xor     rdx, rdx
        mov     edx, eax                ; edx = short value
        bt      rcx, 62
        jc      lsM
        jmp     lsN

lsN:                                    ; long (normal) * short
        mov     byte [rdi + 7], 0xC0    ; set the result to long montgomery
        add     rsi, 8
        add     rdi, 8
        call    mulSM
        ; Second pass multiplies the intermediate result by R3.
        ; NOTE(review): assumes mulSM leaves its product at [rdi] and
        ; preserves rdi; confirm once mulSM is implemented.
        mov     rsi, rdi
        lea     rdx, [rel R3]
        call    mulM
        ret

lsM:                                    ; long (montgomery) * short
        mov     byte [rdi + 7], 0x80    ; set the result to long normal
        add     rsi, 8
        add     rdi, 8
        call    mulSM
        ret

ll:                                     ; both operands are long
        bt      rax, 62
        jc      lml
        bt      rcx, 62
        jc      lnlm

lnln:                                   ; normal * normal
        mov     byte [rdi + 7], 0xC0    ; set the result to long montgomery
        add     rsi, 8
        add     rdi, 8
        add     rdx, 8
        call    mulM
        ; Multiply the intermediate result (now at [rdi]) by R3. The
        ; destination pointer must stay in rdi; only the source changes.
        mov     rsi, rdi
        lea     rdx, [rel R3]
        call    mulM
        ret

lml:                                    ; first is montgomery
        bt      rcx, 62
        jc      lmlm

lnlm:                                   ; exactly one operand is montgomery
        mov     byte [rdi + 7], 0x80    ; set the result to long normal
        add     rsi, 8
        add     rdi, 8
        add     rdx, 8
        call    mulM
        ret

lmlm:                                   ; montgomery * montgomery
        mov     byte [rdi + 7], 0xC0    ; set the result to long montgomery
        add     rsi, 8
        add     rdi, 8
        add     rdx, 8
        call    mulM
        ret
section .data

; Exported element holding q: a qword header with only bit 63 set,
; immediately followed by the limbs of q.
<%=name%>_q:
        dq      0x8000000000000000

; Limbs of q, referenced directly by the arithmetic routines.
q       dq      <%= constantElement(q) %>

; 2^(3*64*n64) mod q
R3      dq      <%= constantElement(bigInt.one.shiftLeft(n64*64*3).mod(q)) %>
|