poulpy/spqlios/lib/spqlios/ext/neon_accel/macrofx4.h
/*
* This file is extracted from the implementation of the FFT on Arm64/Neon
* available in https://github.com/cothan/Falcon-Arm (neon/macrof.h).
* =============================================================================
* Copyright (c) 2022 by Cryptographic Engineering Research Group (CERG)
* ECE Department, George Mason University
* Fairfax, VA, U.S.A.
* @author: Duc Tri Nguyen dnguye69@gmu.edu, cothannguyen@gmail.com
* Licensed under the Apache License, Version 2.0 (the "License");
* =============================================================================
*
* These 64-bit floating-point NEON x4 macros have not been modified and are provided as is.
*/
#ifndef MACROFX4_H
#define MACROFX4_H
#include <arm_neon.h>
#include "macrof.h"
// load/store 8 consecutive doubles as a float64x2x4_t
#define vloadx4(c, addr) c = vld1q_f64_x4(addr);
#define vstorex4(addr, c) vst1q_f64_x4(addr, c);
#define vfdupx4(c, constant) \
c.val[0] = vdupq_n_f64(constant); \
c.val[1] = vdupq_n_f64(constant); \
c.val[2] = vdupq_n_f64(constant); \
c.val[3] = vdupq_n_f64(constant);
#define vfnegx4(c, a) \
c.val[0] = vnegq_f64(a.val[0]); \
c.val[1] = vnegq_f64(a.val[1]); \
c.val[2] = vnegq_f64(a.val[2]); \
c.val[3] = vnegq_f64(a.val[3]);
#define vfmulnx4(c, a, n) \
c.val[0] = vmulq_n_f64(a.val[0], n); \
c.val[1] = vmulq_n_f64(a.val[1], n); \
c.val[2] = vmulq_n_f64(a.val[2], n); \
c.val[3] = vmulq_n_f64(a.val[3], n);
// c = a - b
#define vfsubx4(c, a, b) \
c.val[0] = vsubq_f64(a.val[0], b.val[0]); \
c.val[1] = vsubq_f64(a.val[1], b.val[1]); \
c.val[2] = vsubq_f64(a.val[2], b.val[2]); \
c.val[3] = vsubq_f64(a.val[3], b.val[3]);
// c = a + b
#define vfaddx4(c, a, b) \
c.val[0] = vaddq_f64(a.val[0], b.val[0]); \
c.val[1] = vaddq_f64(a.val[1], b.val[1]); \
c.val[2] = vaddq_f64(a.val[2], b.val[2]); \
c.val[3] = vaddq_f64(a.val[3], b.val[3]);
#define vfmulx4(c, a, b) \
c.val[0] = vmulq_f64(a.val[0], b.val[0]); \
c.val[1] = vmulq_f64(a.val[1], b.val[1]); \
c.val[2] = vmulq_f64(a.val[2], b.val[2]); \
c.val[3] = vmulq_f64(a.val[3], b.val[3]);
#define vfmulx4_i(c, a, b) \
c.val[0] = vmulq_f64(a.val[0], b); \
c.val[1] = vmulq_f64(a.val[1], b); \
c.val[2] = vmulq_f64(a.val[2], b); \
c.val[3] = vmulq_f64(a.val[3], b);
#define vfinvx4(c, a) \
c.val[0] = vdivq_f64(vdupq_n_f64(1.0), a.val[0]); \
c.val[1] = vdivq_f64(vdupq_n_f64(1.0), a.val[1]); \
c.val[2] = vdivq_f64(vdupq_n_f64(1.0), a.val[2]); \
c.val[3] = vdivq_f64(vdupq_n_f64(1.0), a.val[3]);
// convert four pairs of int64 lanes to double
#define vfcvtx4(c, a) \
c.val[0] = vcvtq_f64_s64(a.val[0]); \
c.val[1] = vcvtq_f64_s64(a.val[1]); \
c.val[2] = vcvtq_f64_s64(a.val[2]); \
c.val[3] = vcvtq_f64_s64(a.val[3]);
#define vfmlax4(d, c, a, b) \
vfmla(d.val[0], c.val[0], a.val[0], b.val[0]); \
vfmla(d.val[1], c.val[1], a.val[1], b.val[1]); \
vfmla(d.val[2], c.val[2], a.val[2], b.val[2]); \
vfmla(d.val[3], c.val[3], a.val[3], b.val[3]);
#define vfmlsx4(d, c, a, b) \
vfmls(d.val[0], c.val[0], a.val[0], b.val[0]); \
vfmls(d.val[1], c.val[1], a.val[1], b.val[1]); \
vfmls(d.val[2], c.val[2], a.val[2], b.val[2]); \
vfmls(d.val[3], c.val[3], a.val[3], b.val[3]);
// round to nearest and convert double lanes to int64
#define vfrintx4(c, a) \
c.val[0] = vcvtnq_s64_f64(a.val[0]); \
c.val[1] = vcvtnq_s64_f64(a.val[1]); \
c.val[2] = vcvtnq_s64_f64(a.val[2]); \
c.val[3] = vcvtnq_s64_f64(a.val[3]);
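/*
 * Illustrative usage sketch (compiled out, not part of the original
 * Falcon-Arm macrof.h): the x4 macros operate on float64x2x4_t values,
 * i.e. 8 doubles at a time. A hypothetical helper computing r = a + b
 * over 8 doubles could look as follows.
 */
#if 0
static inline void add8_f64_sketch(double *r, const double *a, const double *b) {
    float64x2x4_t va, vb, vc;
    vloadx4(va, a);      // load a[0..7]
    vloadx4(vb, b);      // load b[0..7]
    vfaddx4(vc, va, vb); // vc = va + vb, lane by lane
    vstorex4(r, vc);     // store r[0..7]
}
#endif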
/*
 * Wrappers for the FFT, split/merge and poly_float.c routines.
 */
// d = a * b (complex): d_re = a_re*b_re - a_im*b_im; d_im = a_re*b_im + a_im*b_re
#define FPC_MUL(d_re, d_im, a_re, a_im, b_re, b_im) \
vfmul(d_re, a_re, b_re); \
vfmls(d_re, d_re, a_im, b_im); \
vfmul(d_im, a_re, b_im); \
vfmla(d_im, d_im, a_im, b_re);
#define FPC_MULx2(d_re, d_im, a_re, a_im, b_re, b_im) \
vfmul(d_re.val[0], a_re.val[0], b_re.val[0]); \
vfmls(d_re.val[0], d_re.val[0], a_im.val[0], b_im.val[0]); \
vfmul(d_re.val[1], a_re.val[1], b_re.val[1]); \
vfmls(d_re.val[1], d_re.val[1], a_im.val[1], b_im.val[1]); \
vfmul(d_im.val[0], a_re.val[0], b_im.val[0]); \
vfmla(d_im.val[0], d_im.val[0], a_im.val[0], b_re.val[0]); \
vfmul(d_im.val[1], a_re.val[1], b_im.val[1]); \
vfmla(d_im.val[1], d_im.val[1], a_im.val[1], b_re.val[1]);
#define FPC_MULx4(d_re, d_im, a_re, a_im, b_re, b_im) \
vfmul(d_re.val[0], a_re.val[0], b_re.val[0]); \
vfmls(d_re.val[0], d_re.val[0], a_im.val[0], b_im.val[0]); \
vfmul(d_re.val[1], a_re.val[1], b_re.val[1]); \
vfmls(d_re.val[1], d_re.val[1], a_im.val[1], b_im.val[1]); \
vfmul(d_re.val[2], a_re.val[2], b_re.val[2]); \
vfmls(d_re.val[2], d_re.val[2], a_im.val[2], b_im.val[2]); \
vfmul(d_re.val[3], a_re.val[3], b_re.val[3]); \
vfmls(d_re.val[3], d_re.val[3], a_im.val[3], b_im.val[3]); \
vfmul(d_im.val[0], a_re.val[0], b_im.val[0]); \
vfmla(d_im.val[0], d_im.val[0], a_im.val[0], b_re.val[0]); \
vfmul(d_im.val[1], a_re.val[1], b_im.val[1]); \
vfmla(d_im.val[1], d_im.val[1], a_im.val[1], b_re.val[1]); \
vfmul(d_im.val[2], a_re.val[2], b_im.val[2]); \
vfmla(d_im.val[2], d_im.val[2], a_im.val[2], b_re.val[2]); \
vfmul(d_im.val[3], a_re.val[3], b_im.val[3]); \
vfmla(d_im.val[3], d_im.val[3], a_im.val[3], b_re.val[3]);
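/*
 * Reference sketch (compiled out, illustrative only): FPC_MUL and its x2/x4
 * variants perform a plain complex multiplication d = a * b per lane. The
 * scalar equivalent, with hypothetical names, is:
 */
#if 0
static inline void fpc_mul_scalar_sketch(double *d_re, double *d_im,
                                         double a_re, double a_im,
                                         double b_re, double b_im) {
    *d_re = a_re * b_re - a_im * b_im; // real part of a * b
    *d_im = a_re * b_im + a_im * b_re; // imaginary part of a * b
}
#endif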
// d = d + a * b (complex multiply-accumulate)
#define FPC_MLA(d_re, d_im, a_re, a_im, b_re, b_im) \
vfmla(d_re, d_re, a_re, b_re); \
vfmls(d_re, d_re, a_im, b_im); \
vfmla(d_im, d_im, a_re, b_im); \
vfmla(d_im, d_im, a_im, b_re);
#define FPC_MLAx2(d_re, d_im, a_re, a_im, b_re, b_im) \
vfmla(d_re.val[0], d_re.val[0], a_re.val[0], b_re.val[0]); \
vfmls(d_re.val[0], d_re.val[0], a_im.val[0], b_im.val[0]); \
vfmla(d_re.val[1], d_re.val[1], a_re.val[1], b_re.val[1]); \
vfmls(d_re.val[1], d_re.val[1], a_im.val[1], b_im.val[1]); \
vfmla(d_im.val[0], d_im.val[0], a_re.val[0], b_im.val[0]); \
vfmla(d_im.val[0], d_im.val[0], a_im.val[0], b_re.val[0]); \
vfmla(d_im.val[1], d_im.val[1], a_re.val[1], b_im.val[1]); \
vfmla(d_im.val[1], d_im.val[1], a_im.val[1], b_re.val[1]);
#define FPC_MLAx4(d_re, d_im, a_re, a_im, b_re, b_im) \
vfmla(d_re.val[0], d_re.val[0], a_re.val[0], b_re.val[0]); \
vfmls(d_re.val[0], d_re.val[0], a_im.val[0], b_im.val[0]); \
vfmla(d_re.val[1], d_re.val[1], a_re.val[1], b_re.val[1]); \
vfmls(d_re.val[1], d_re.val[1], a_im.val[1], b_im.val[1]); \
vfmla(d_re.val[2], d_re.val[2], a_re.val[2], b_re.val[2]); \
vfmls(d_re.val[2], d_re.val[2], a_im.val[2], b_im.val[2]); \
vfmla(d_re.val[3], d_re.val[3], a_re.val[3], b_re.val[3]); \
vfmls(d_re.val[3], d_re.val[3], a_im.val[3], b_im.val[3]); \
vfmla(d_im.val[0], d_im.val[0], a_re.val[0], b_im.val[0]); \
vfmla(d_im.val[0], d_im.val[0], a_im.val[0], b_re.val[0]); \
vfmla(d_im.val[1], d_im.val[1], a_re.val[1], b_im.val[1]); \
vfmla(d_im.val[1], d_im.val[1], a_im.val[1], b_re.val[1]); \
vfmla(d_im.val[2], d_im.val[2], a_re.val[2], b_im.val[2]); \
vfmla(d_im.val[2], d_im.val[2], a_im.val[2], b_re.val[2]); \
vfmla(d_im.val[3], d_im.val[3], a_re.val[3], b_im.val[3]); \
vfmla(d_im.val[3], d_im.val[3], a_im.val[3], b_re.val[3]);
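/*
 * Reference sketch (compiled out, illustrative only): FPC_MLA and its x2/x4
 * variants accumulate the complex product into d, i.e. d += a * b:
 */
#if 0
static inline void fpc_mla_scalar_sketch(double *d_re, double *d_im,
                                         double a_re, double a_im,
                                         double b_re, double b_im) {
    *d_re += a_re * b_re - a_im * b_im;
    *d_im += a_re * b_im + a_im * b_re;
}
#endif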
// d = a * conj(b): d_re = a_re*b_re + a_im*b_im; d_im = a_im*b_re - a_re*b_im
#define FPC_MUL_CONJx4(d_re, d_im, a_re, a_im, b_re, b_im) \
vfmul(d_re.val[0], b_im.val[0], a_im.val[0]); \
vfmla(d_re.val[0], d_re.val[0], a_re.val[0], b_re.val[0]); \
vfmul(d_re.val[1], b_im.val[1], a_im.val[1]); \
vfmla(d_re.val[1], d_re.val[1], a_re.val[1], b_re.val[1]); \
vfmul(d_re.val[2], b_im.val[2], a_im.val[2]); \
vfmla(d_re.val[2], d_re.val[2], a_re.val[2], b_re.val[2]); \
vfmul(d_re.val[3], b_im.val[3], a_im.val[3]); \
vfmla(d_re.val[3], d_re.val[3], a_re.val[3], b_re.val[3]); \
vfmul(d_im.val[0], b_re.val[0], a_im.val[0]); \
vfmls(d_im.val[0], d_im.val[0], a_re.val[0], b_im.val[0]); \
vfmul(d_im.val[1], b_re.val[1], a_im.val[1]); \
vfmls(d_im.val[1], d_im.val[1], a_re.val[1], b_im.val[1]); \
vfmul(d_im.val[2], b_re.val[2], a_im.val[2]); \
vfmls(d_im.val[2], d_im.val[2], a_re.val[2], b_im.val[2]); \
vfmul(d_im.val[3], b_re.val[3], a_im.val[3]); \
vfmls(d_im.val[3], d_im.val[3], a_re.val[3], b_im.val[3]);
// d = d + a * conj(b)
#define FPC_MLA_CONJx4(d_re, d_im, a_re, a_im, b_re, b_im) \
vfmla(d_re.val[0], d_re.val[0], b_im.val[0], a_im.val[0]); \
vfmla(d_re.val[0], d_re.val[0], a_re.val[0], b_re.val[0]); \
vfmla(d_re.val[1], d_re.val[1], b_im.val[1], a_im.val[1]); \
vfmla(d_re.val[1], d_re.val[1], a_re.val[1], b_re.val[1]); \
vfmla(d_re.val[2], d_re.val[2], b_im.val[2], a_im.val[2]); \
vfmla(d_re.val[2], d_re.val[2], a_re.val[2], b_re.val[2]); \
vfmla(d_re.val[3], d_re.val[3], b_im.val[3], a_im.val[3]); \
vfmla(d_re.val[3], d_re.val[3], a_re.val[3], b_re.val[3]); \
vfmla(d_im.val[0], d_im.val[0], b_re.val[0], a_im.val[0]); \
vfmls(d_im.val[0], d_im.val[0], a_re.val[0], b_im.val[0]); \
vfmla(d_im.val[1], d_im.val[1], b_re.val[1], a_im.val[1]); \
vfmls(d_im.val[1], d_im.val[1], a_re.val[1], b_im.val[1]); \
vfmla(d_im.val[2], d_im.val[2], b_re.val[2], a_im.val[2]); \
vfmls(d_im.val[2], d_im.val[2], a_re.val[2], b_im.val[2]); \
vfmla(d_im.val[3], d_im.val[3], b_re.val[3], a_im.val[3]); \
vfmls(d_im.val[3], d_im.val[3], a_re.val[3], b_im.val[3]);
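/*
 * Reference sketch (compiled out, illustrative only): the *_CONJx4 macros use
 * the conjugate of b, i.e. FPC_MUL_CONJx4 computes d = a * conj(b) and
 * FPC_MLA_CONJx4 computes d += a * conj(b):
 */
#if 0
static inline void fpc_mul_conj_scalar_sketch(double *d_re, double *d_im,
                                              double a_re, double a_im,
                                              double b_re, double b_im) {
    *d_re = a_re * b_re + a_im * b_im; // Re(a * conj(b))
    *d_im = a_im * b_re - a_re * b_im; // Im(a * conj(b))
}
#endif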
// d = a * zeta, with zeta packed as {re, im} in lanes 0/1 of b_re_im
#define FPC_MUL_LANE(d_re, d_im, a_re, a_im, b_re_im) \
vfmul_lane(d_re, a_re, b_re_im, 0); \
vfmls_lane(d_re, d_re, a_im, b_re_im, 1); \
vfmul_lane(d_im, a_re, b_re_im, 1); \
vfmla_lane(d_im, d_im, a_im, b_re_im, 0);
#define FPC_MUL_LANEx4(d_re, d_im, a_re, a_im, b_re_im) \
vfmul_lane(d_re.val[0], a_re.val[0], b_re_im, 0); \
vfmls_lane(d_re.val[0], d_re.val[0], a_im.val[0], b_re_im, 1); \
vfmul_lane(d_re.val[1], a_re.val[1], b_re_im, 0); \
vfmls_lane(d_re.val[1], d_re.val[1], a_im.val[1], b_re_im, 1); \
vfmul_lane(d_re.val[2], a_re.val[2], b_re_im, 0); \
vfmls_lane(d_re.val[2], d_re.val[2], a_im.val[2], b_re_im, 1); \
vfmul_lane(d_re.val[3], a_re.val[3], b_re_im, 0); \
vfmls_lane(d_re.val[3], d_re.val[3], a_im.val[3], b_re_im, 1); \
vfmul_lane(d_im.val[0], a_re.val[0], b_re_im, 1); \
vfmla_lane(d_im.val[0], d_im.val[0], a_im.val[0], b_re_im, 0); \
vfmul_lane(d_im.val[1], a_re.val[1], b_re_im, 1); \
vfmla_lane(d_im.val[1], d_im.val[1], a_im.val[1], b_re_im, 0); \
vfmul_lane(d_im.val[2], a_re.val[2], b_re_im, 1); \
vfmla_lane(d_im.val[2], d_im.val[2], a_im.val[2], b_re_im, 0); \
vfmul_lane(d_im.val[3], a_re.val[3], b_re_im, 1); \
vfmla_lane(d_im.val[3], d_im.val[3], a_im.val[3], b_re_im, 0);
// forward butterfly, top half: t = b * zeta
#define FWD_TOP(t_re, t_im, b_re, b_im, zeta_re, zeta_im) \
FPC_MUL(t_re, t_im, b_re, b_im, zeta_re, zeta_im);
#define FWD_TOP_LANE(t_re, t_im, b_re, b_im, zeta) \
FPC_MUL_LANE(t_re, t_im, b_re, b_im, zeta);
#define FWD_TOP_LANEx4(t_re, t_im, b_re, b_im, zeta) \
FPC_MUL_LANEx4(t_re, t_im, b_re, b_im, zeta);
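/*
 * Usage sketch (compiled out, illustrative only): the *_LANE variants take a
 * single float64x2_t whose lane 0 holds the real part and lane 1 the
 * imaginary part of the twiddle factor. FWD_TOP* compute t = b * zeta ahead
 * of the butterfly. The pointer layout below is an assumption made for the
 * example only.
 */
#if 0
static inline void fwd_top_lane_sketch(double *out_re, double *out_im,
                                       float64x2_t b_re, float64x2_t b_im,
                                       const double *zeta_ptr) {
    float64x2_t zeta = vld1q_f64(zeta_ptr); // zeta = {re, im} in lanes 0/1
    float64x2_t t_re, t_im;
    FWD_TOP_LANE(t_re, t_im, b_re, b_im, zeta); // t = b * zeta
    vst1q_f64(out_re, t_re);
    vst1q_f64(out_im, t_im);
}
#endif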
/*
 * FPC: floating-point complex add/sub and forward butterflies
 */
#define FPC_SUB(d_re, d_im, a_re, a_im, b_re, b_im) \
d_re = vsubq_f64(a_re, b_re); \
d_im = vsubq_f64(a_im, b_im);
#define FPC_SUBx4(d_re, d_im, a_re, a_im, b_re, b_im) \
d_re.val[0] = vsubq_f64(a_re.val[0], b_re.val[0]); \
d_im.val[0] = vsubq_f64(a_im.val[0], b_im.val[0]); \
d_re.val[1] = vsubq_f64(a_re.val[1], b_re.val[1]); \
d_im.val[1] = vsubq_f64(a_im.val[1], b_im.val[1]); \
d_re.val[2] = vsubq_f64(a_re.val[2], b_re.val[2]); \
d_im.val[2] = vsubq_f64(a_im.val[2], b_im.val[2]); \
d_re.val[3] = vsubq_f64(a_re.val[3], b_re.val[3]); \
d_im.val[3] = vsubq_f64(a_im.val[3], b_im.val[3]);
#define FPC_ADD(d_re, d_im, a_re, a_im, b_re, b_im) \
d_re = vaddq_f64(a_re, b_re); \
d_im = vaddq_f64(a_im, b_im);
#define FPC_ADDx4(d_re, d_im, a_re, a_im, b_re, b_im) \
d_re.val[0] = vaddq_f64(a_re.val[0], b_re.val[0]); \
d_im.val[0] = vaddq_f64(a_im.val[0], b_im.val[0]); \
d_re.val[1] = vaddq_f64(a_re.val[1], b_re.val[1]); \
d_im.val[1] = vaddq_f64(a_im.val[1], b_im.val[1]); \
d_re.val[2] = vaddq_f64(a_re.val[2], b_re.val[2]); \
d_im.val[2] = vaddq_f64(a_im.val[2], b_im.val[2]); \
d_re.val[3] = vaddq_f64(a_re.val[3], b_re.val[3]); \
d_im.val[3] = vaddq_f64(a_im.val[3], b_im.val[3]);
// forward butterfly, bottom half: b = a - t; a = a + t
#define FWD_BOT(a_re, a_im, b_re, b_im, t_re, t_im) \
FPC_SUB(b_re, b_im, a_re, a_im, t_re, t_im); \
FPC_ADD(a_re, a_im, a_re, a_im, t_re, t_im);
#define FWD_BOTx4(a_re, a_im, b_re, b_im, t_re, t_im) \
FPC_SUBx4(b_re, b_im, a_re, a_im, t_re, t_im); \
FPC_ADDx4(a_re, a_im, a_re, a_im, t_re, t_im);
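/*
 * Reference sketch (compiled out, illustrative only): FWD_BOT is the bottom
 * half of a forward FFT butterfly. Given t = b * zeta from FWD_TOP, it
 * overwrites (a, b) with (a + t, a - t). Scalar equivalent on one complex
 * pair, with hypothetical names:
 */
#if 0
static inline void fwd_bot_scalar_sketch(double *a_re, double *a_im,
                                         double *b_re, double *b_im,
                                         double t_re, double t_im) {
    *b_re = *a_re - t_re; // b = a - t (uses the original a)
    *b_im = *a_im - t_im;
    *a_re = *a_re + t_re; // a = a + t
    *a_im = *a_im + t_im;
}
#endif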
/*
 * FPC_J: add/sub with the second operand multiplied by i
 */
#define FPC_ADDJ(d_re, d_im, a_re, a_im, b_re, b_im) \
d_re = vsubq_f64(a_re, b_im); \
d_im = vaddq_f64(a_im, b_re);
#define FPC_ADDJx4(d_re, d_im, a_re, a_im, b_re, b_im) \
d_re.val[0] = vsubq_f64(a_re.val[0], b_im.val[0]); \
d_im.val[0] = vaddq_f64(a_im.val[0], b_re.val[0]); \
d_re.val[1] = vsubq_f64(a_re.val[1], b_im.val[1]); \
d_im.val[1] = vaddq_f64(a_im.val[1], b_re.val[1]); \
d_re.val[2] = vsubq_f64(a_re.val[2], b_im.val[2]); \
d_im.val[2] = vaddq_f64(a_im.val[2], b_re.val[2]); \
d_re.val[3] = vsubq_f64(a_re.val[3], b_im.val[3]); \
d_im.val[3] = vaddq_f64(a_im.val[3], b_re.val[3]);
#define FPC_SUBJ(d_re, d_im, a_re, a_im, b_re, b_im) \
d_re = vaddq_f64(a_re, b_im); \
d_im = vsubq_f64(a_im, b_re);
#define FPC_SUBJx4(d_re, d_im, a_re, a_im, b_re, b_im) \
d_re.val[0] = vaddq_f64(a_re.val[0], b_im.val[0]); \
d_im.val[0] = vsubq_f64(a_im.val[0], b_re.val[0]); \
d_re.val[1] = vaddq_f64(a_re.val[1], b_im.val[1]); \
d_im.val[1] = vsubq_f64(a_im.val[1], b_re.val[1]); \
d_re.val[2] = vaddq_f64(a_re.val[2], b_im.val[2]); \
d_im.val[2] = vsubq_f64(a_im.val[2], b_re.val[2]); \
d_re.val[3] = vaddq_f64(a_re.val[3], b_im.val[3]); \
d_im.val[3] = vsubq_f64(a_im.val[3], b_re.val[3]);
// forward butterfly with twiddle i folded in: b = a - i*t; a = a + i*t
#define FWD_BOTJ(a_re, a_im, b_re, b_im, t_re, t_im) \
FPC_SUBJ(b_re, b_im, a_re, a_im, t_re, t_im); \
FPC_ADDJ(a_re, a_im, a_re, a_im, t_re, t_im);
#define FWD_BOTJx4(a_re, a_im, b_re, b_im, t_re, t_im) \
FPC_SUBJx4(b_re, b_im, a_re, a_im, t_re, t_im); \
FPC_ADDJx4(a_re, a_im, a_re, a_im, t_re, t_im);
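/*
 * Reference sketch (compiled out, illustrative only): the *J variants fold a
 * multiplication of t by the imaginary unit into the butterfly, so FWD_BOTJ
 * overwrites (a, b) with (a + i*t, a - i*t):
 */
#if 0
static inline void fwd_botj_scalar_sketch(double *a_re, double *a_im,
                                          double *b_re, double *b_im,
                                          double t_re, double t_im) {
    *b_re = *a_re + t_im; // b = a - i*t (uses the original a)
    *b_im = *a_im - t_re;
    *a_re = *a_re - t_im; // a = a + i*t
    *a_im = *a_im + t_re;
}
#endif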
//============== Inverse FFT
/*
* FPC_J
* a * conj(b)
* Original (without swap):
* d_re = b_im * a_im + a_re * b_re;
* d_im = b_re * a_im - a_re * b_im;
*/
#define FPC_MUL_BOTJ_LANE(d_re, d_im, a_re, a_im, b_re_im) \
vfmul_lane(d_re, a_re, b_re_im, 0); \
vfmla_lane(d_re, d_re, a_im, b_re_im, 1); \
vfmul_lane(d_im, a_im, b_re_im, 0); \
vfmls_lane(d_im, d_im, a_re, b_re_im, 1);
#define FPC_MUL_BOTJ_LANEx4(d_re, d_im, a_re, a_im, b_re_im) \
vfmul_lane(d_re.val[0], a_re.val[0], b_re_im, 0); \
vfmla_lane(d_re.val[0], d_re.val[0], a_im.val[0], b_re_im, 1); \
vfmul_lane(d_im.val[0], a_im.val[0], b_re_im, 0); \
vfmls_lane(d_im.val[0], d_im.val[0], a_re.val[0], b_re_im, 1); \
vfmul_lane(d_re.val[1], a_re.val[1], b_re_im, 0); \
vfmla_lane(d_re.val[1], d_re.val[1], a_im.val[1], b_re_im, 1); \
vfmul_lane(d_im.val[1], a_im.val[1], b_re_im, 0); \
vfmls_lane(d_im.val[1], d_im.val[1], a_re.val[1], b_re_im, 1); \
vfmul_lane(d_re.val[2], a_re.val[2], b_re_im, 0); \
vfmla_lane(d_re.val[2], d_re.val[2], a_im.val[2], b_re_im, 1); \
vfmul_lane(d_im.val[2], a_im.val[2], b_re_im, 0); \
vfmls_lane(d_im.val[2], d_im.val[2], a_re.val[2], b_re_im, 1); \
vfmul_lane(d_re.val[3], a_re.val[3], b_re_im, 0); \
vfmla_lane(d_re.val[3], d_re.val[3], a_im.val[3], b_re_im, 1); \
vfmul_lane(d_im.val[3], a_im.val[3], b_re_im, 0); \
vfmls_lane(d_im.val[3], d_im.val[3], a_re.val[3], b_re_im, 1);
#define FPC_MUL_BOTJ(d_re, d_im, a_re, a_im, b_re, b_im) \
vfmul(d_re, b_im, a_im); \
vfmla(d_re, d_re, a_re, b_re); \
vfmul(d_im, b_re, a_im); \
vfmls(d_im, d_im, a_re, b_im);
// inverse butterfly, top half: t = a - b; a = a + b
#define INV_TOPJ(t_re, t_im, a_re, a_im, b_re, b_im) \
FPC_SUB(t_re, t_im, a_re, a_im, b_re, b_im); \
FPC_ADD(a_re, a_im, a_re, a_im, b_re, b_im);
#define INV_TOPJx4(t_re, t_im, a_re, a_im, b_re, b_im) \
FPC_SUBx4(t_re, t_im, a_re, a_im, b_re, b_im); \
FPC_ADDx4(a_re, a_im, a_re, a_im, b_re, b_im);
// inverse butterfly, bottom half: b = t * conj(zeta)
#define INV_BOTJ(b_re, b_im, t_re, t_im, zeta_re, zeta_im) \
FPC_MUL_BOTJ(b_re, b_im, t_re, t_im, zeta_re, zeta_im);
#define INV_BOTJ_LANE(b_re, b_im, t_re, t_im, zeta) \
FPC_MUL_BOTJ_LANE(b_re, b_im, t_re, t_im, zeta);
#define INV_BOTJ_LANEx4(b_re, b_im, t_re, t_im, zeta) \
FPC_MUL_BOTJ_LANEx4(b_re, b_im, t_re, t_im, zeta);
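/*
 * Reference sketch (compiled out, illustrative only): the inverse butterfly
 * first forms t = a - b and a = a + b (INV_TOPJ), then multiplies t by the
 * conjugate twiddle, b = t * conj(zeta) (INV_BOTJ / FPC_MUL_BOTJ). Scalar
 * equivalent of the bottom half, with hypothetical names:
 */
#if 0
static inline void inv_botj_scalar_sketch(double *b_re, double *b_im,
                                          double t_re, double t_im,
                                          double zeta_re, double zeta_im) {
    *b_re = t_re * zeta_re + t_im * zeta_im; // Re(t * conj(zeta))
    *b_im = t_im * zeta_re - t_re * zeta_im; // Im(t * conj(zeta))
}
#endif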
/*
* FPC_Jm
* a * -conj(b)
* d_re = a_re * b_im - a_im * b_re;
* d_im = a_im * b_im + a_re * b_re;
*/
#define FPC_MUL_BOTJm_LANE(d_re, d_im, a_re, a_im, b_re_im) \
vfmul_lane(d_re, a_re, b_re_im, 1); \
vfmls_lane(d_re, d_re, a_im, b_re_im, 0); \
vfmul_lane(d_im, a_re, b_re_im, 0); \
vfmla_lane(d_im, d_im, a_im, b_re_im, 1);
#define FPC_MUL_BOTJm_LANEx4(d_re, d_im, a_re, a_im, b_re_im) \
vfmul_lane(d_re.val[0], a_re.val[0], b_re_im, 1); \
vfmls_lane(d_re.val[0], d_re.val[0], a_im.val[0], b_re_im, 0); \
vfmul_lane(d_im.val[0], a_re.val[0], b_re_im, 0); \
vfmla_lane(d_im.val[0], d_im.val[0], a_im.val[0], b_re_im, 1); \
vfmul_lane(d_re.val[1], a_re.val[1], b_re_im, 1); \
vfmls_lane(d_re.val[1], d_re.val[1], a_im.val[1], b_re_im, 0); \
vfmul_lane(d_im.val[1], a_re.val[1], b_re_im, 0); \
vfmla_lane(d_im.val[1], d_im.val[1], a_im.val[1], b_re_im, 1); \
vfmul_lane(d_re.val[2], a_re.val[2], b_re_im, 1); \
vfmls_lane(d_re.val[2], d_re.val[2], a_im.val[2], b_re_im, 0); \
vfmul_lane(d_im.val[2], a_re.val[2], b_re_im, 0); \
vfmla_lane(d_im.val[2], d_im.val[2], a_im.val[2], b_re_im, 1); \
vfmul_lane(d_re.val[3], a_re.val[3], b_re_im, 1); \
vfmls_lane(d_re.val[3], d_re.val[3], a_im.val[3], b_re_im, 0); \
vfmul_lane(d_im.val[3], a_re.val[3], b_re_im, 0); \
vfmla_lane(d_im.val[3], d_im.val[3], a_im.val[3], b_re_im, 1);
#define FPC_MUL_BOTJm(d_re, d_im, a_re, a_im, b_re, b_im) \
vfmul(d_re, a_re, b_im); \
vfmls(d_re, d_re, a_im, b_re); \
vfmul(d_im, a_im, b_im); \
vfmla(d_im, d_im, a_re, b_re);
// inverse butterfly, top half (Jm variant): t = b - a; a = a + b
#define INV_TOPJm(t_re, t_im, a_re, a_im, b_re, b_im) \
FPC_SUB(t_re, t_im, b_re, b_im, a_re, a_im); \
FPC_ADD(a_re, a_im, a_re, a_im, b_re, b_im);
#define INV_TOPJmx4(t_re, t_im, a_re, a_im, b_re, b_im) \
FPC_SUBx4(t_re, t_im, b_re, b_im, a_re, a_im); \
FPC_ADDx4(a_re, a_im, a_re, a_im, b_re, b_im);
#define INV_BOTJm(b_re, b_im, t_re, t_im, zeta_re, zeta_im) \
FPC_MUL_BOTJm(b_re, b_im, t_re, t_im, zeta_re, zeta_im);
#define INV_BOTJm_LANE(b_re, b_im, t_re, t_im, zeta) \
FPC_MUL_BOTJm_LANE(b_re, b_im, t_re, t_im, zeta);
#define INV_BOTJm_LANEx4(b_re, b_im, t_re, t_im, zeta) \
FPC_MUL_BOTJm_LANEx4(b_re, b_im, t_re, t_im, zeta);
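/*
 * Reference sketch (compiled out, illustrative only): FPC_MUL_BOTJm applies
 * the component-wise formula quoted in the FPC_Jm comment above, lane by
 * lane. Scalar equivalent, with hypothetical names:
 */
#if 0
static inline void fpc_mul_botjm_scalar_sketch(double *d_re, double *d_im,
                                               double a_re, double a_im,
                                               double b_re, double b_im) {
    *d_re = a_re * b_im - a_im * b_re;
    *d_im = a_im * b_im + a_re * b_re;
}
#endif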
#endif /* MACROFX4_H */