// +build amd64_adx // Copyright 2020 ConsenSys Software Inc. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. #include "textflag.h" #include "funcdata.h" // modulus q DATA q<>+0(SB)/8, $0x43e1f593f0000001 DATA q<>+8(SB)/8, $0x2833e84879b97091 DATA q<>+16(SB)/8, $0xb85045b68181585d DATA q<>+24(SB)/8, $0x30644e72e131a029 GLOBL q<>(SB), (RODATA+NOPTR), $32 // qInv0 q'[0] DATA qInv0<>(SB)/8, $0xc2e1f593efffffff GLOBL qInv0<>(SB), (RODATA+NOPTR), $8 #define REDUCE(ra0, ra1, ra2, ra3, rb0, rb1, rb2, rb3) \ MOVQ ra0, rb0; \ SUBQ q<>(SB), ra0; \ MOVQ ra1, rb1; \ SBBQ q<>+8(SB), ra1; \ MOVQ ra2, rb2; \ SBBQ q<>+16(SB), ra2; \ MOVQ ra3, rb3; \ SBBQ q<>+24(SB), ra3; \ CMOVQCS rb0, ra0; \ CMOVQCS rb1, ra1; \ CMOVQCS rb2, ra2; \ CMOVQCS rb3, ra3; \ // mul(res, x, y *Element) TEXT ·mul(SB), NOSPLIT, $0-24 // the algorithm is described here // https://hackmd.io/@zkteam/modular_multiplication // however, to benefit from the ADCX and ADOX carry chains // we split the inner loops in 2: // for i=0 to N-1 // for j=0 to N-1 // (A,t[j]) := t[j] + x[j]*y[i] + A // m := t[0]*q'[0] mod W // C,_ := t[0] + m*q[0] // for j=1 to N-1 // (C,t[j-1]) := t[j] + m*q[j] + C // t[N-1] = C + A MOVQ x+8(FP), SI // x[0] -> DI // x[1] -> R8 // x[2] -> R9 // x[3] -> R10 MOVQ 0(SI), DI MOVQ 8(SI), R8 MOVQ 16(SI), R9 MOVQ 24(SI), R10 MOVQ y+16(FP), R11 // A -> BP // t[0] -> R14 // t[1] -> R15 // t[2] -> CX // t[3] -> BX // clear the flags XORQ AX, AX MOVQ 0(R11), DX // (A,t[0]) := x[0]*y[0] + A MULXQ DI, R14, R15 // (A,t[1]) := x[1]*y[0] + A MULXQ R8, AX, CX ADOXQ AX, R15 // (A,t[2]) := x[2]*y[0] + A MULXQ R9, AX, BX ADOXQ AX, CX // (A,t[3]) := x[3]*y[0] + A MULXQ R10, AX, BP ADOXQ AX, BX // A += carries from ADCXQ and ADOXQ MOVQ $0, AX ADOXQ AX, BP // m := t[0]*q'[0] mod W MOVQ qInv0<>(SB), DX IMULQ R14, DX // clear the flags XORQ AX, AX // C,_ := t[0] + m*q[0] MULXQ q<>+0(SB), AX, R12 ADCXQ R14, AX MOVQ R12, R14 // (C,t[0]) := t[1] + m*q[1] + C ADCXQ R15, R14 MULXQ q<>+8(SB), AX, R15 ADOXQ AX, R14 // (C,t[1]) := t[2] + m*q[2] + C ADCXQ CX, R15 MULXQ q<>+16(SB), AX, CX ADOXQ AX, R15 // (C,t[2]) := t[3] + m*q[3] + C ADCXQ BX, CX MULXQ q<>+24(SB), AX, BX ADOXQ AX, CX // t[3] = C + A MOVQ $0, AX ADCXQ AX, BX ADOXQ BP, BX // clear the flags XORQ AX, AX MOVQ 8(R11), DX // (A,t[0]) := t[0] + x[0]*y[1] + A MULXQ DI, AX, BP ADOXQ AX, R14 // (A,t[1]) := t[1] + x[1]*y[1] + A ADCXQ BP, R15 MULXQ R8, AX, BP ADOXQ AX, R15 // (A,t[2]) := t[2] + x[2]*y[1] + A ADCXQ BP, CX MULXQ R9, AX, BP ADOXQ AX, CX // (A,t[3]) := t[3] + x[3]*y[1] + A ADCXQ BP, BX MULXQ R10, AX, BP ADOXQ AX, BX // A += carries from ADCXQ and ADOXQ MOVQ $0, AX ADCXQ AX, BP ADOXQ AX, BP // m := t[0]*q'[0] mod W MOVQ qInv0<>(SB), DX IMULQ R14, DX // clear the flags XORQ AX, AX // C,_ := t[0] + m*q[0] MULXQ q<>+0(SB), AX, R12 ADCXQ R14, AX MOVQ R12, R14 // (C,t[0]) := t[1] + m*q[1] + C ADCXQ R15, R14 MULXQ q<>+8(SB), AX, R15 ADOXQ AX, R14 // (C,t[1]) := t[2] + m*q[2] + C ADCXQ CX, R15 MULXQ q<>+16(SB), AX, CX ADOXQ AX, R15 // (C,t[2]) := t[3] + m*q[3] + C ADCXQ BX, CX MULXQ q<>+24(SB), AX, BX ADOXQ AX, CX // t[3] = C + A MOVQ $0, AX ADCXQ AX, BX ADOXQ BP, BX // clear the flags XORQ AX, AX MOVQ 16(R11), DX // (A,t[0]) := t[0] + x[0]*y[2] + A MULXQ DI, AX, BP ADOXQ AX, R14 // (A,t[1]) := t[1] + x[1]*y[2] + A ADCXQ BP, R15 MULXQ R8, AX, BP ADOXQ AX, R15 // (A,t[2]) := t[2] + x[2]*y[2] + A ADCXQ BP, CX MULXQ R9, AX, BP ADOXQ AX, CX // (A,t[3]) := t[3] + x[3]*y[2] + A ADCXQ BP, BX MULXQ R10, AX, BP ADOXQ AX, BX // A += carries from ADCXQ and ADOXQ MOVQ $0, AX ADCXQ AX, BP ADOXQ AX, BP // m := t[0]*q'[0] mod W MOVQ qInv0<>(SB), DX IMULQ R14, DX // clear the flags XORQ AX, AX // C,_ := t[0] + m*q[0] MULXQ q<>+0(SB), AX, R12 ADCXQ R14, AX MOVQ R12, R14 // (C,t[0]) := t[1] + m*q[1] + C ADCXQ R15, R14 MULXQ q<>+8(SB), AX, R15 ADOXQ AX, R14 // (C,t[1]) := t[2] + m*q[2] + C ADCXQ CX, R15 MULXQ q<>+16(SB), AX, CX ADOXQ AX, R15 // (C,t[2]) := t[3] + m*q[3] + C ADCXQ BX, CX MULXQ q<>+24(SB), AX, BX ADOXQ AX, CX // t[3] = C + A MOVQ $0, AX ADCXQ AX, BX ADOXQ BP, BX // clear the flags XORQ AX, AX MOVQ 24(R11), DX // (A,t[0]) := t[0] + x[0]*y[3] + A MULXQ DI, AX, BP ADOXQ AX, R14 // (A,t[1]) := t[1] + x[1]*y[3] + A ADCXQ BP, R15 MULXQ R8, AX, BP ADOXQ AX, R15 // (A,t[2]) := t[2] + x[2]*y[3] + A ADCXQ BP, CX MULXQ R9, AX, BP ADOXQ AX, CX // (A,t[3]) := t[3] + x[3]*y[3] + A ADCXQ BP, BX MULXQ R10, AX, BP ADOXQ AX, BX // A += carries from ADCXQ and ADOXQ MOVQ $0, AX ADCXQ AX, BP ADOXQ AX, BP // m := t[0]*q'[0] mod W MOVQ qInv0<>(SB), DX IMULQ R14, DX // clear the flags XORQ AX, AX // C,_ := t[0] + m*q[0] MULXQ q<>+0(SB), AX, R12 ADCXQ R14, AX MOVQ R12, R14 // (C,t[0]) := t[1] + m*q[1] + C ADCXQ R15, R14 MULXQ q<>+8(SB), AX, R15 ADOXQ AX, R14 // (C,t[1]) := t[2] + m*q[2] + C ADCXQ CX, R15 MULXQ q<>+16(SB), AX, CX ADOXQ AX, R15 // (C,t[2]) := t[3] + m*q[3] + C ADCXQ BX, CX MULXQ q<>+24(SB), AX, BX ADOXQ AX, CX // t[3] = C + A MOVQ $0, AX ADCXQ AX, BX ADOXQ BP, BX // reduce element(R14,R15,CX,BX) using temp registers (R13,SI,R12,R11) REDUCE(R14,R15,CX,BX,R13,SI,R12,R11) MOVQ res+0(FP), AX MOVQ R14, 0(AX) MOVQ R15, 8(AX) MOVQ CX, 16(AX) MOVQ BX, 24(AX) RET TEXT ·fromMont(SB), NOSPLIT, $0-8 // the algorithm is described here // https://hackmd.io/@zkteam/modular_multiplication // when y = 1 we have: // for i=0 to N-1 // t[i] = x[i] // for i=0 to N-1 // m := t[0]*q'[0] mod W // C,_ := t[0] + m*q[0] // for j=1 to N-1 // (C,t[j-1]) := t[j] + m*q[j] + C // t[N-1] = C MOVQ res+0(FP), DX MOVQ 0(DX), R14 MOVQ 8(DX), R15 MOVQ 16(DX), CX MOVQ 24(DX), BX XORQ DX, DX // m := t[0]*q'[0] mod W MOVQ qInv0<>(SB), DX IMULQ R14, DX XORQ AX, AX // C,_ := t[0] + m*q[0] MULXQ q<>+0(SB), AX, BP ADCXQ R14, AX MOVQ BP, R14 // (C,t[0]) := t[1] + m*q[1] + C ADCXQ R15, R14 MULXQ q<>+8(SB), AX, R15 ADOXQ AX, R14 // (C,t[1]) := t[2] + m*q[2] + C ADCXQ CX, R15 MULXQ q<>+16(SB), AX, CX ADOXQ AX, R15 // (C,t[2]) := t[3] + m*q[3] + C ADCXQ BX, CX MULXQ q<>+24(SB), AX, BX ADOXQ AX, CX MOVQ $0, AX ADCXQ AX, BX ADOXQ AX, BX XORQ DX, DX // m := t[0]*q'[0] mod W MOVQ qInv0<>(SB), DX IMULQ R14, DX XORQ AX, AX // C,_ := t[0] + m*q[0] MULXQ q<>+0(SB), AX, BP ADCXQ R14, AX MOVQ BP, R14 // (C,t[0]) := t[1] + m*q[1] + C ADCXQ R15, R14 MULXQ q<>+8(SB), AX, R15 ADOXQ AX, R14 // (C,t[1]) := t[2] + m*q[2] + C ADCXQ CX, R15 MULXQ q<>+16(SB), AX, CX ADOXQ AX, R15 // (C,t[2]) := t[3] + m*q[3] + C ADCXQ BX, CX MULXQ q<>+24(SB), AX, BX ADOXQ AX, CX MOVQ $0, AX ADCXQ AX, BX ADOXQ AX, BX XORQ DX, DX // m := t[0]*q'[0] mod W MOVQ qInv0<>(SB), DX IMULQ R14, DX XORQ AX, AX // C,_ := t[0] + m*q[0] MULXQ q<>+0(SB), AX, BP ADCXQ R14, AX MOVQ BP, R14 // (C,t[0]) := t[1] + m*q[1] + C ADCXQ R15, R14 MULXQ q<>+8(SB), AX, R15 ADOXQ AX, R14 // (C,t[1]) := t[2] + m*q[2] + C ADCXQ CX, R15 MULXQ q<>+16(SB), AX, CX ADOXQ AX, R15 // (C,t[2]) := t[3] + m*q[3] + C ADCXQ BX, CX MULXQ q<>+24(SB), AX, BX ADOXQ AX, CX MOVQ $0, AX ADCXQ AX, BX ADOXQ AX, BX XORQ DX, DX // m := t[0]*q'[0] mod W MOVQ qInv0<>(SB), DX IMULQ R14, DX XORQ AX, AX // C,_ := t[0] + m*q[0] MULXQ q<>+0(SB), AX, BP ADCXQ R14, AX MOVQ BP, R14 // (C,t[0]) := t[1] + m*q[1] + C ADCXQ R15, R14 MULXQ q<>+8(SB), AX, R15 ADOXQ AX, R14 // (C,t[1]) := t[2] + m*q[2] + C ADCXQ CX, R15 MULXQ q<>+16(SB), AX, CX ADOXQ AX, R15 // (C,t[2]) := t[3] + m*q[3] + C ADCXQ BX, CX MULXQ q<>+24(SB), AX, BX ADOXQ AX, CX MOVQ $0, AX ADCXQ AX, BX ADOXQ AX, BX // reduce element(R14,R15,CX,BX) using temp registers (SI,DI,R8,R9) REDUCE(R14,R15,CX,BX,SI,DI,R8,R9) MOVQ res+0(FP), AX MOVQ R14, 0(AX) MOVQ R15, 8(AX) MOVQ CX, 16(AX) MOVQ BX, 24(AX) RET