mirror of
https://github.com/arnaucube/poulpy.git
synced 2026-02-10 05:06:44 +01:00
181 lines
6.7 KiB
ArmAsm
181 lines
6.7 KiB
ArmAsm
# ----------------------------------------------------------------------
# This kernel is a direct port of the IFFT16 routine from spqlios-arithmetic
# (https://github.com/tfhe/spqlios-arithmetic)
# ----------------------------------------------------------------------
#
# ifft16_avx2_fma_asm
#
# Syntax : GNU as, AT&T operand order (op src..., dst), ELF target.
# ISA    : AVX + FMA3 instructions (vperm2f128, vinsertf128, vfmadd231pd, ...).
#
# In     : rdi -> 16 doubles, loaded as 4 x 256-bit ("ra0".."ra12"), read/write
#          rsi -> 16 doubles, loaded as 4 x 256-bit ("ia0".."ia12"), read/write
#          rdx -> 128 bytes of twiddle data, read only:
#                   0x00..0x3f  two 256-bit vectors   (stage 1)
#                   0x40..0x5f  one 256-bit vector    (stage 2)
#                   0x60..0x6f  one 128-bit pair      (stage 3)
#                   0x70..0x7f  one 128-bit pair      (stage 4)
# Out    : the 16 doubles at rdi and the 16 doubles at rsi are overwritten
#          in place with the transformed values.
# Clobb  : ymm0-ymm13 (upper halves cleared by vzeroupper before ret).
#          No general-purpose register is written; the stack is not used.
# Align  : all memory accesses use vmovupd, so no alignment is required.
#
# NOTE(review): rdi/rsi/rdx are the first three integer argument registers
# of the System V AMD64 ABI, so this is presumably called as
#   void ifft16_avx2_fma_asm(double *re, double *im, const double *omega);
# confirm against the caller's declaration.
#
# The numeric labels 1: .. 5: below only mark the stages; nothing in this
# file branches to them.
# ----------------------------------------------------------------------

        .text
        .globl  ifft16_avx2_fma_asm
        .hidden ifft16_avx2_fma_asm
        .p2align 4, 0x90
        .type   ifft16_avx2_fma_asm,@function
ifft16_avx2_fma_asm:
        .att_syntax prefix

        # Load the 16 values behind rdi and the 16 values behind rsi.
        vmovupd     (%rdi),%ymm0                    # ra0
        vmovupd     0x20(%rdi),%ymm1                # ra4
        vmovupd     0x40(%rdi),%ymm2                # ra8
        vmovupd     0x60(%rdi),%ymm3                # ra12
        vmovupd     (%rsi),%ymm4                    # ia0
        vmovupd     0x20(%rsi),%ymm5                # ia4
        vmovupd     0x40(%rsi),%ymm6                # ia8
        vmovupd     0x60(%rsi),%ymm7                # ia12

1:
        # Stage 1: four independent twiddle values per vector.
        vmovupd     0x00(%rdx),%ymm12
        vmovupd     0x20(%rdx),%ymm13

        # Regroup 128-bit halves: $0x31 keeps the two high halves,
        # $0x20 keeps the two low halves of the source pair.
        vperm2f128  $0x31,%ymm2,%ymm0,%ymm8         # ymm8 contains re to mul (tw)
        vperm2f128  $0x31,%ymm3,%ymm1,%ymm9         # ymm9 contains re to mul (itw)
        vperm2f128  $0x31,%ymm6,%ymm4,%ymm10        # ymm10 contains im to mul (tw)
        vperm2f128  $0x31,%ymm7,%ymm5,%ymm11        # ymm11 contains im to mul (itw)
        vperm2f128  $0x20,%ymm2,%ymm0,%ymm0         # ymm0 contains re to add (tw)
        vperm2f128  $0x20,%ymm3,%ymm1,%ymm1         # ymm1 contains re to add (itw)
        vperm2f128  $0x20,%ymm6,%ymm4,%ymm2         # ymm2 contains im to add (tw)
        vperm2f128  $0x20,%ymm7,%ymm5,%ymm3         # ymm3 contains im to add (itw)

        # Separate odd-indexed (unpckh) from even-indexed (unpckl) lanes.
        vunpckhpd   %ymm1,%ymm0,%ymm4               # (0,1) -> (0,4)
        vunpckhpd   %ymm3,%ymm2,%ymm6               # (2,3) -> (2,6)
        vunpckhpd   %ymm9,%ymm8,%ymm5               # (8,9) -> (1,5)
        vunpckhpd   %ymm11,%ymm10,%ymm7             # (10,11) -> (3,7)
        vunpcklpd   %ymm1,%ymm0,%ymm0
        vunpcklpd   %ymm3,%ymm2,%ymm2
        vunpcklpd   %ymm9,%ymm8,%ymm1
        vunpcklpd   %ymm11,%ymm10,%ymm3

        # invctwiddle  Re:(ymm0,ymm4) and Im:(ymm2,ymm6) with omega=(ymm12,ymm13)
        # invcitwiddle Re:(ymm1,ymm5) and Im:(ymm3,ymm7) with omega=(ymm12,ymm13)
        vsubpd      %ymm4,%ymm0,%ymm8               # retw
        vsubpd      %ymm5,%ymm1,%ymm9               # reitw
        vsubpd      %ymm6,%ymm2,%ymm10              # imtw
        vsubpd      %ymm7,%ymm3,%ymm11              # imitw
        vaddpd      %ymm4,%ymm0,%ymm0
        vaddpd      %ymm5,%ymm1,%ymm1
        vaddpd      %ymm6,%ymm2,%ymm2
        vaddpd      %ymm7,%ymm3,%ymm3
        # multiply 8,9,10,11 by 12,13, result to: 4,5,6,7
        # twiddles use reom=ymm12, imom=ymm13
        # invtwiddles use reom=ymm13, imom=-ymm12
        #   ymm4 = ymm12*ymm8  - ymm13*ymm10
        #   ymm5 = ymm13*ymm9  + ymm12*ymm11
        #   ymm6 = ymm12*ymm10 + ymm13*ymm8
        #   ymm7 = ymm13*ymm11 - ymm12*ymm9
        vmulpd      %ymm10,%ymm13,%ymm4             # imtw.omai (tw)
        vmulpd      %ymm11,%ymm12,%ymm5             # imitw.omar (itw)
        vmulpd      %ymm8,%ymm13,%ymm6              # retw.omai (tw)
        vmulpd      %ymm9,%ymm12,%ymm7              # reitw.omar (itw)
        vfmsub231pd %ymm8,%ymm12,%ymm4              # rprod0 (tw)
        vfmadd231pd %ymm9,%ymm13,%ymm5              # rprod4 (itw)
        vfmadd231pd %ymm10,%ymm12,%ymm6             # iprod0 (tw)
        vfmsub231pd %ymm11,%ymm13,%ymm7             # iprod4 (itw)

        # Interleave sums and products back together for the next stage.
        vunpckhpd   %ymm7,%ymm3,%ymm11              # (0,4) -> (0,1)
        vunpckhpd   %ymm5,%ymm1,%ymm9               # (2,6) -> (2,3)
        vunpcklpd   %ymm7,%ymm3,%ymm10
        vunpcklpd   %ymm5,%ymm1,%ymm8
        vunpckhpd   %ymm6,%ymm2,%ymm3               # (1,5) -> (8,9)
        vunpckhpd   %ymm4,%ymm0,%ymm1               # (3,7) -> (10,11)
        vunpcklpd   %ymm6,%ymm2,%ymm2
        vunpcklpd   %ymm4,%ymm0,%ymm0

2:
        # Stage 2: one 256-bit twiddle vector holding (re,im) pairs; split it
        # into a vector of the odd elements and a vector of the even elements.
        vmovupd     0x40(%rdx),%ymm12
        vshufpd     $15, %ymm12, %ymm12, %ymm13     # ymm13: omaiii'i'
        vshufpd     $0, %ymm12, %ymm12, %ymm12      # ymm12: omarrr'r'

        # invctwiddle  Re:(ymm0,ymm8) and Im:(ymm2,ymm10) with omega=(ymm12,ymm13)
        # invcitwiddle Re:(ymm1,ymm9) and Im:(ymm3,ymm11) with omega=(ymm12,ymm13)
        vsubpd      %ymm8,%ymm0,%ymm4               # retw
        vsubpd      %ymm9,%ymm1,%ymm5               # reitw
        vsubpd      %ymm10,%ymm2,%ymm6              # imtw
        vsubpd      %ymm11,%ymm3,%ymm7              # imitw
        vaddpd      %ymm8,%ymm0,%ymm0
        vaddpd      %ymm9,%ymm1,%ymm1
        vaddpd      %ymm10,%ymm2,%ymm2
        vaddpd      %ymm11,%ymm3,%ymm3
        # multiply 4,5,6,7 by 12,13, result to 8,9,10,11
        # twiddles use reom=ymm12, imom=ymm13
        # invtwiddles use reom=ymm13, imom=-ymm12
        vmulpd      %ymm6,%ymm13,%ymm8              # imtw.omai (tw)
        vmulpd      %ymm7,%ymm12,%ymm9              # imitw.omar (itw)
        vmulpd      %ymm4,%ymm13,%ymm10             # retw.omai (tw)
        vmulpd      %ymm5,%ymm12,%ymm11             # reitw.omar (itw)
        vfmsub231pd %ymm4,%ymm12,%ymm8              # rprod0 (tw)
        vfmadd231pd %ymm5,%ymm13,%ymm9              # rprod4 (itw)
        vfmadd231pd %ymm6,%ymm12,%ymm10             # iprod0 (tw)
        vfmsub231pd %ymm7,%ymm13,%ymm11             # iprod4 (itw)

        # Regroup 128-bit halves of (sum, product) pairs for stage 3.
        vperm2f128  $0x31,%ymm10,%ymm2,%ymm6
        vperm2f128  $0x31,%ymm11,%ymm3,%ymm7
        vperm2f128  $0x20,%ymm10,%ymm2,%ymm4
        vperm2f128  $0x20,%ymm11,%ymm3,%ymm5
        vperm2f128  $0x31,%ymm8,%ymm0,%ymm2
        vperm2f128  $0x31,%ymm9,%ymm1,%ymm3
        vperm2f128  $0x20,%ymm8,%ymm0,%ymm0
        vperm2f128  $0x20,%ymm9,%ymm1,%ymm1

3:
        # Stage 3: a single (re,im) twiddle pair, copied into both 128-bit
        # halves and then split into all-even / all-odd element vectors.
        vmovupd     0x60(%rdx),%xmm12
        vinsertf128 $1, %xmm12, %ymm12, %ymm12      # omriri
        vshufpd     $15, %ymm12, %ymm12, %ymm13     # ymm13: omai
        vshufpd     $0, %ymm12, %ymm12, %ymm12      # ymm12: omar

        # invctwiddle  Re:(ymm0,ymm1) and Im:(ymm4,ymm5) with omega=(ymm12,ymm13)
        # invcitwiddle Re:(ymm2,ymm3) and Im:(ymm6,ymm7) with omega=(ymm12,ymm13)
        vsubpd      %ymm1,%ymm0,%ymm8               # retw
        vsubpd      %ymm3,%ymm2,%ymm9               # reitw
        vsubpd      %ymm5,%ymm4,%ymm10              # imtw
        vsubpd      %ymm7,%ymm6,%ymm11              # imitw
        vaddpd      %ymm1,%ymm0,%ymm0
        vaddpd      %ymm3,%ymm2,%ymm2
        vaddpd      %ymm5,%ymm4,%ymm4
        vaddpd      %ymm7,%ymm6,%ymm6
        # multiply 8,9,10,11 by 12,13, result to 1,3,5,7
        # twiddles use reom=ymm12, imom=ymm13
        # invtwiddles use reom=ymm13, imom=-ymm12
        vmulpd      %ymm10,%ymm13,%ymm1             # imtw.omai (tw)
        vmulpd      %ymm11,%ymm12,%ymm3             # imitw.omar (itw)
        vmulpd      %ymm8,%ymm13,%ymm5              # retw.omai (tw)
        vmulpd      %ymm9,%ymm12,%ymm7              # reitw.omar (itw)
        vfmsub231pd %ymm8,%ymm12,%ymm1              # rprod0 (tw)
        vfmadd231pd %ymm9,%ymm13,%ymm3              # rprod4 (itw)
        vfmadd231pd %ymm10,%ymm12,%ymm5             # iprod0 (tw)
        vfmsub231pd %ymm11,%ymm13,%ymm7             # iprod4 (itw)

4:
        # Stage 4: a single (re,im) twiddle pair applied to both register
        # groups with the same (non-inverted) formula.
        vmovupd     0x70(%rdx),%xmm12
        vinsertf128 $1, %xmm12, %ymm12, %ymm12      # omriri
        vshufpd     $15, %ymm12, %ymm12, %ymm13     # ymm13: omai
        vshufpd     $0, %ymm12, %ymm12, %ymm12      # ymm12: omar

        # invctwiddle Re:(ymm0,ymm2) and Im:(ymm4,ymm6) with omega=(ymm12,ymm13)
        # invctwiddle Re:(ymm1,ymm3) and Im:(ymm5,ymm7) with omega=(ymm12,ymm13)
        vsubpd      %ymm2,%ymm0,%ymm8               # retw1
        vsubpd      %ymm3,%ymm1,%ymm9               # retw2
        vsubpd      %ymm6,%ymm4,%ymm10              # imtw1
        vsubpd      %ymm7,%ymm5,%ymm11              # imtw2
        vaddpd      %ymm2,%ymm0,%ymm0
        vaddpd      %ymm3,%ymm1,%ymm1
        vaddpd      %ymm6,%ymm4,%ymm4
        vaddpd      %ymm7,%ymm5,%ymm5
        # multiply 8,9,10,11 by 12,13, result to 2,3,6,7
        # twiddles use reom=ymm12, imom=ymm13
        vmulpd      %ymm10,%ymm13,%ymm2             # imtw1.omai
        vmulpd      %ymm11,%ymm13,%ymm3             # imtw2.omai
        vmulpd      %ymm8,%ymm13,%ymm6              # retw1.omai
        vmulpd      %ymm9,%ymm13,%ymm7              # retw2.omai
        vfmsub231pd %ymm8,%ymm12,%ymm2              # rprod0
        vfmsub231pd %ymm9,%ymm12,%ymm3              # rprod4
        vfmadd231pd %ymm10,%ymm12,%ymm6             # iprod0
        vfmadd231pd %ymm11,%ymm12,%ymm7             # iprod4

5:
        # Store the results back over the inputs.
        vmovupd     %ymm0,(%rdi)                    # ra0
        vmovupd     %ymm1,0x20(%rdi)                # ra4
        vmovupd     %ymm2,0x40(%rdi)                # ra8
        vmovupd     %ymm3,0x60(%rdi)                # ra12
        vmovupd     %ymm4,(%rsi)                    # ia0
        vmovupd     %ymm5,0x20(%rsi)                # ia4
        vmovupd     %ymm6,0x40(%rsi)                # ia8
        vmovupd     %ymm7,0x60(%rsi)                # ia12
        vzeroupper                                  # clear upper ymm state before returning
        ret

        # The symbol named here must be the label defined above; the ported
        # source used the upstream name, which does not exist in this file.
        .size   ifft16_avx2_fma_asm, .-ifft16_avx2_fma_asm
        .section .note.GNU-stack,"",@progbits