You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

695 lines
13 KiB

// Code generated by goff (v0.2.0) DO NOT EDIT
#include "textflag.h"
// func MulAssignElement(res,y *Element)
// montgomery multiplication of res by y
// stores the result in res
TEXT ·MulAssignElement(SB), NOSPLIT, $0-16
// dereference our parameters
MOVQ res+0(FP), DI
MOVQ y+8(FP), R8
// check if we support adx and mulx
CMPB ·supportAdx(SB), $1
JNE no_adx
// the algorithm is described here
// https://hackmd.io/@zkteam/modular_multiplication
// however, to benefit from the ADCX and ADOX carry chains
// we split the inner loops in 2:
// for i=0 to N-1
// for j=0 to N-1
// (A,t[j]) := t[j] + a[j]*b[i] + A
// m := t[0]*q'[0] mod W
// C,_ := t[0] + m*q[0]
// for j=1 to N-1
// (C,t[j-1]) := t[j] + m*q[j] + C
// t[N-1] = C + A
// ---------------------------------------------------------------------------------------------
// outter loop 0
// clear up the carry flags
XORQ R9 , R9
// R12 = y[0]
MOVQ 0(R8), R12
// for j=0 to N-1
// (A,t[j]) := t[j] + x[j]*y[i] + A
// DX = res[0]
MOVQ 0(DI), DX
MULXQ R12, CX , R9
// DX = res[1]
MOVQ 8(DI), DX
MOVQ R9, BX
MULXQ R12, AX, R9
ADOXQ AX, BX
// DX = res[2]
MOVQ 16(DI), DX
MOVQ R9, BP
MULXQ R12, AX, R9
ADOXQ AX, BP
// DX = res[3]
MOVQ 24(DI), DX
MOVQ R9, SI
MULXQ R12, AX, R9
ADOXQ AX, SI
// add the last carries to R9
MOVQ $0, DX
ADCXQ DX, R9
ADOXQ DX, R9
// m := t[0]*q'[0] mod W
MOVQ $0xc2e1f593efffffff, DX
MULXQ CX,R11, DX
// clear the carry flags
XORQ DX, DX
// C,_ := t[0] + m*q[0]
MOVQ $0x43e1f593f0000001, DX
MULXQ R11, AX, R10
ADCXQ CX ,AX
// for j=1 to N-1
// (C,t[j-1]) := t[j] + m*q[j] + C
MOVQ $0x2833e84879b97091, DX
MULXQ R11, AX, DX
ADCXQ BX, R10
ADOXQ AX, R10
MOVQ R10, CX
MOVQ DX, R10
MOVQ $0xb85045b68181585d, DX
MULXQ R11, AX, DX
ADCXQ BP, R10
ADOXQ AX, R10
MOVQ R10, BX
MOVQ DX, R10
MOVQ $0x30644e72e131a029, DX
MULXQ R11, AX, DX
ADCXQ SI, R10
ADOXQ AX, R10
MOVQ R10, BP
MOVQ $0, AX
ADCXQ AX, DX
ADOXQ DX, R9
MOVQ R9, SI
// ---------------------------------------------------------------------------------------------
// outter loop 1
// clear up the carry flags
XORQ R9 , R9
// R12 = y[1]
MOVQ 8(R8), R12
// for j=0 to N-1
// (A,t[j]) := t[j] + x[j]*y[i] + A
// DX = res[0]
MOVQ 0(DI), DX
MULXQ R12, AX, R9
ADOXQ AX, CX
// DX = res[1]
MOVQ 8(DI), DX
ADCXQ R9, BX
MULXQ R12, AX, R9
ADOXQ AX, BX
// DX = res[2]
MOVQ 16(DI), DX
ADCXQ R9, BP
MULXQ R12, AX, R9
ADOXQ AX, BP
// DX = res[3]
MOVQ 24(DI), DX
ADCXQ R9, SI
MULXQ R12, AX, R9
ADOXQ AX, SI
// add the last carries to R9
MOVQ $0, DX
ADCXQ DX, R9
ADOXQ DX, R9
// m := t[0]*q'[0] mod W
MOVQ $0xc2e1f593efffffff, DX
MULXQ CX,R11, DX
// clear the carry flags
XORQ DX, DX
// C,_ := t[0] + m*q[0]
MOVQ $0x43e1f593f0000001, DX
MULXQ R11, AX, R10
ADCXQ CX ,AX
// for j=1 to N-1
// (C,t[j-1]) := t[j] + m*q[j] + C
MOVQ $0x2833e84879b97091, DX
MULXQ R11, AX, DX
ADCXQ BX, R10
ADOXQ AX, R10
MOVQ R10, CX
MOVQ DX, R10
MOVQ $0xb85045b68181585d, DX
MULXQ R11, AX, DX
ADCXQ BP, R10
ADOXQ AX, R10
MOVQ R10, BX
MOVQ DX, R10
MOVQ $0x30644e72e131a029, DX
MULXQ R11, AX, DX
ADCXQ SI, R10
ADOXQ AX, R10
MOVQ R10, BP
MOVQ $0, AX
ADCXQ AX, DX
ADOXQ DX, R9
MOVQ R9, SI
// ---------------------------------------------------------------------------------------------
// outter loop 2
// clear up the carry flags
XORQ R9 , R9
// R12 = y[2]
MOVQ 16(R8), R12
// for j=0 to N-1
// (A,t[j]) := t[j] + x[j]*y[i] + A
// DX = res[0]
MOVQ 0(DI), DX
MULXQ R12, AX, R9
ADOXQ AX, CX
// DX = res[1]
MOVQ 8(DI), DX
ADCXQ R9, BX
MULXQ R12, AX, R9
ADOXQ AX, BX
// DX = res[2]
MOVQ 16(DI), DX
ADCXQ R9, BP
MULXQ R12, AX, R9
ADOXQ AX, BP
// DX = res[3]
MOVQ 24(DI), DX
ADCXQ R9, SI
MULXQ R12, AX, R9
ADOXQ AX, SI
// add the last carries to R9
MOVQ $0, DX
ADCXQ DX, R9
ADOXQ DX, R9
// m := t[0]*q'[0] mod W
MOVQ $0xc2e1f593efffffff, DX
MULXQ CX,R11, DX
// clear the carry flags
XORQ DX, DX
// C,_ := t[0] + m*q[0]
MOVQ $0x43e1f593f0000001, DX
MULXQ R11, AX, R10
ADCXQ CX ,AX
// for j=1 to N-1
// (C,t[j-1]) := t[j] + m*q[j] + C
MOVQ $0x2833e84879b97091, DX
MULXQ R11, AX, DX
ADCXQ BX, R10
ADOXQ AX, R10
MOVQ R10, CX
MOVQ DX, R10
MOVQ $0xb85045b68181585d, DX
MULXQ R11, AX, DX
ADCXQ BP, R10
ADOXQ AX, R10
MOVQ R10, BX
MOVQ DX, R10
MOVQ $0x30644e72e131a029, DX
MULXQ R11, AX, DX
ADCXQ SI, R10
ADOXQ AX, R10
MOVQ R10, BP
MOVQ $0, AX
ADCXQ AX, DX
ADOXQ DX, R9
MOVQ R9, SI
// ---------------------------------------------------------------------------------------------
// outter loop 3
// clear up the carry flags
XORQ R9 , R9
// R12 = y[3]
MOVQ 24(R8), R12
// for j=0 to N-1
// (A,t[j]) := t[j] + x[j]*y[i] + A
// DX = res[0]
MOVQ 0(DI), DX
MULXQ R12, AX, R9
ADOXQ AX, CX
// DX = res[1]
MOVQ 8(DI), DX
ADCXQ R9, BX
MULXQ R12, AX, R9
ADOXQ AX, BX
// DX = res[2]
MOVQ 16(DI), DX
ADCXQ R9, BP
MULXQ R12, AX, R9
ADOXQ AX, BP
// DX = res[3]
MOVQ 24(DI), DX
ADCXQ R9, SI
MULXQ R12, AX, R9
ADOXQ AX, SI
// add the last carries to R9
MOVQ $0, DX
ADCXQ DX, R9
ADOXQ DX, R9
// m := t[0]*q'[0] mod W
MOVQ $0xc2e1f593efffffff, DX
MULXQ CX,R11, DX
// clear the carry flags
XORQ DX, DX
// C,_ := t[0] + m*q[0]
MOVQ $0x43e1f593f0000001, DX
MULXQ R11, AX, R10
ADCXQ CX ,AX
// for j=1 to N-1
// (C,t[j-1]) := t[j] + m*q[j] + C
MOVQ $0x2833e84879b97091, DX
MULXQ R11, AX, DX
ADCXQ BX, R10
ADOXQ AX, R10
MOVQ R10, CX
MOVQ DX, R10
MOVQ $0xb85045b68181585d, DX
MULXQ R11, AX, DX
ADCXQ BP, R10
ADOXQ AX, R10
MOVQ R10, BX
MOVQ DX, R10
MOVQ $0x30644e72e131a029, DX
MULXQ R11, AX, DX
ADCXQ SI, R10
ADOXQ AX, R10
MOVQ R10, BP
MOVQ $0, AX
ADCXQ AX, DX
ADOXQ DX, R9
MOVQ R9, SI
reduce:
// reduce, constant time version
// first we copy registers storing t in a separate set of registers
// as SUBQ modifies the 2nd operand
MOVQ CX, DX
MOVQ BX, R8
MOVQ BP, R9
MOVQ SI, R10
MOVQ $0x43e1f593f0000001, R11
SUBQ R11, DX
MOVQ $0x2833e84879b97091, R11
SBBQ R11, R8
MOVQ $0xb85045b68181585d, R11
SBBQ R11, R9
MOVQ $0x30644e72e131a029, R11
SBBQ R11, R10
JCS t_is_smaller // no borrow, we return t
// borrow is set, we return u
MOVQ DX, (DI)
MOVQ R8, 8(DI)
MOVQ R9, 16(DI)
MOVQ R10, 24(DI)
RET
t_is_smaller:
MOVQ CX, 0(DI)
MOVQ BX, 8(DI)
MOVQ BP, 16(DI)
MOVQ SI, 24(DI)
RET
no_adx:
// ---------------------------------------------------------------------------------------------
// outter loop 0
// (A,t[0]) := t[0] + x[0]*y[0]
MOVQ (DI), AX // x[0]
MOVQ 0(R8), R12
MULQ R12 // x[0] * y[0]
MOVQ DX, R9
MOVQ AX, CX
// m := t[0]*q'[0] mod W
MOVQ $0xc2e1f593efffffff, R11
IMULQ CX , R11
// C,_ := t[0] + m*q[0]
MOVQ $0x43e1f593f0000001, AX
MULQ R11
ADDQ CX ,AX
ADCQ $0, DX
MOVQ DX, R10
// for j=1 to N-1
// (A,t[j]) := t[j] + x[j]*y[i] + A
// (C,t[j-1]) := t[j] + m*q[j] + C
MOVQ 8(DI), AX
MULQ R12 // x[1] * y[0]
MOVQ R9, BX
ADDQ AX, BX
ADCQ $0, DX
MOVQ DX, R9
MOVQ $0x2833e84879b97091, AX
MULQ R11
ADDQ BX, R10
ADCQ $0, DX
ADDQ AX, R10
ADCQ $0, DX
MOVQ R10, CX
MOVQ DX, R10
MOVQ 16(DI), AX
MULQ R12 // x[2] * y[0]
MOVQ R9, BP
ADDQ AX, BP
ADCQ $0, DX
MOVQ DX, R9
MOVQ $0xb85045b68181585d, AX
MULQ R11
ADDQ BP, R10
ADCQ $0, DX
ADDQ AX, R10
ADCQ $0, DX
MOVQ R10, BX
MOVQ DX, R10
MOVQ 24(DI), AX
MULQ R12 // x[3] * y[0]
MOVQ R9, SI
ADDQ AX, SI
ADCQ $0, DX
MOVQ DX, R9
MOVQ $0x30644e72e131a029, AX
MULQ R11
ADDQ SI, R10
ADCQ $0, DX
ADDQ AX, R10
ADCQ $0, DX
MOVQ R10, BP
MOVQ DX, R10
ADDQ R10, R9
MOVQ R9, SI
// ---------------------------------------------------------------------------------------------
// outter loop 1
// (A,t[0]) := t[0] + x[0]*y[1]
MOVQ (DI), AX // x[0]
MOVQ 8(R8), R12
MULQ R12 // x[0] * y[1]
ADDQ AX, CX
ADCQ $0, DX
MOVQ DX, R9
// m := t[0]*q'[0] mod W
MOVQ $0xc2e1f593efffffff, R11
IMULQ CX , R11
// C,_ := t[0] + m*q[0]
MOVQ $0x43e1f593f0000001, AX
MULQ R11
ADDQ CX ,AX
ADCQ $0, DX
MOVQ DX, R10
// for j=1 to N-1
// (A,t[j]) := t[j] + x[j]*y[i] + A
// (C,t[j-1]) := t[j] + m*q[j] + C
MOVQ 8(DI), AX
MULQ R12 // x[1] * y[1]
ADDQ R9, BX
ADCQ $0, DX
ADDQ AX, BX
ADCQ $0, DX
MOVQ DX, R9
MOVQ $0x2833e84879b97091, AX
MULQ R11
ADDQ BX, R10
ADCQ $0, DX
ADDQ AX, R10
ADCQ $0, DX
MOVQ R10, CX
MOVQ DX, R10
MOVQ 16(DI), AX
MULQ R12 // x[2] * y[1]
ADDQ R9, BP
ADCQ $0, DX
ADDQ AX, BP
ADCQ $0, DX
MOVQ DX, R9
MOVQ $0xb85045b68181585d, AX
MULQ R11
ADDQ BP, R10
ADCQ $0, DX
ADDQ AX, R10
ADCQ $0, DX
MOVQ R10, BX
MOVQ DX, R10
MOVQ 24(DI), AX
MULQ R12 // x[3] * y[1]
ADDQ R9, SI
ADCQ $0, DX
ADDQ AX, SI
ADCQ $0, DX
MOVQ DX, R9
MOVQ $0x30644e72e131a029, AX
MULQ R11
ADDQ SI, R10
ADCQ $0, DX
ADDQ AX, R10
ADCQ $0, DX
MOVQ R10, BP
MOVQ DX, R10
ADDQ R10, R9
MOVQ R9, SI
// ---------------------------------------------------------------------------------------------
// outter loop 2
// (A,t[0]) := t[0] + x[0]*y[2]
MOVQ (DI), AX // x[0]
MOVQ 16(R8), R12
MULQ R12 // x[0] * y[2]
ADDQ AX, CX
ADCQ $0, DX
MOVQ DX, R9
// m := t[0]*q'[0] mod W
MOVQ $0xc2e1f593efffffff, R11
IMULQ CX , R11
// C,_ := t[0] + m*q[0]
MOVQ $0x43e1f593f0000001, AX
MULQ R11
ADDQ CX ,AX
ADCQ $0, DX
MOVQ DX, R10
// for j=1 to N-1
// (A,t[j]) := t[j] + x[j]*y[i] + A
// (C,t[j-1]) := t[j] + m*q[j] + C
MOVQ 8(DI), AX
MULQ R12 // x[1] * y[2]
ADDQ R9, BX
ADCQ $0, DX
ADDQ AX, BX
ADCQ $0, DX
MOVQ DX, R9
MOVQ $0x2833e84879b97091, AX
MULQ R11
ADDQ BX, R10
ADCQ $0, DX
ADDQ AX, R10
ADCQ $0, DX
MOVQ R10, CX
MOVQ DX, R10
MOVQ 16(DI), AX
MULQ R12 // x[2] * y[2]
ADDQ R9, BP
ADCQ $0, DX
ADDQ AX, BP
ADCQ $0, DX
MOVQ DX, R9
MOVQ $0xb85045b68181585d, AX
MULQ R11
ADDQ BP, R10
ADCQ $0, DX
ADDQ AX, R10
ADCQ $0, DX
MOVQ R10, BX
MOVQ DX, R10
MOVQ 24(DI), AX
MULQ R12 // x[3] * y[2]
ADDQ R9, SI
ADCQ $0, DX
ADDQ AX, SI
ADCQ $0, DX
MOVQ DX, R9
MOVQ $0x30644e72e131a029, AX
MULQ R11
ADDQ SI, R10
ADCQ $0, DX
ADDQ AX, R10
ADCQ $0, DX
MOVQ R10, BP
MOVQ DX, R10
ADDQ R10, R9
MOVQ R9, SI
// ---------------------------------------------------------------------------------------------
// outter loop 3
// (A,t[0]) := t[0] + x[0]*y[3]
MOVQ (DI), AX // x[0]
MOVQ 24(R8), R12
MULQ R12 // x[0] * y[3]
ADDQ AX, CX
ADCQ $0, DX
MOVQ DX, R9
// m := t[0]*q'[0] mod W
MOVQ $0xc2e1f593efffffff, R11
IMULQ CX , R11
// C,_ := t[0] + m*q[0]
MOVQ $0x43e1f593f0000001, AX
MULQ R11
ADDQ CX ,AX
ADCQ $0, DX
MOVQ DX, R10
// for j=1 to N-1
// (A,t[j]) := t[j] + x[j]*y[i] + A
// (C,t[j-1]) := t[j] + m*q[j] + C
MOVQ 8(DI), AX
MULQ R12 // x[1] * y[3]
ADDQ R9, BX
ADCQ $0, DX
ADDQ AX, BX
ADCQ $0, DX
MOVQ DX, R9
MOVQ $0x2833e84879b97091, AX
MULQ R11
ADDQ BX, R10
ADCQ $0, DX
ADDQ AX, R10
ADCQ $0, DX
MOVQ R10, CX
MOVQ DX, R10
MOVQ 16(DI), AX
MULQ R12 // x[2] * y[3]
ADDQ R9, BP
ADCQ $0, DX
ADDQ AX, BP
ADCQ $0, DX
MOVQ DX, R9
MOVQ $0xb85045b68181585d, AX
MULQ R11
ADDQ BP, R10
ADCQ $0, DX
ADDQ AX, R10
ADCQ $0, DX
MOVQ R10, BX
MOVQ DX, R10
MOVQ 24(DI), AX
MULQ R12 // x[3] * y[3]
ADDQ R9, SI
ADCQ $0, DX
ADDQ AX, SI
ADCQ $0, DX
MOVQ DX, R9
MOVQ $0x30644e72e131a029, AX
MULQ R11
ADDQ SI, R10
ADCQ $0, DX
ADDQ AX, R10
ADCQ $0, DX
MOVQ R10, BP
MOVQ DX, R10
ADDQ R10, R9
MOVQ R9, SI
JMP reduce