mirror of
https://github.com/arnaucube/poulpy.git
synced 2026-02-10 13:16:44 +01:00
497 lines
16 KiB
C++
497 lines
16 KiB
C++
#include <cmath>
|
|
|
|
#include "gtest/gtest.h"
|
|
#include "spqlios/commons_private.h"
|
|
#include "spqlios/cplx/cplx_fft.h"
|
|
#include "spqlios/cplx/cplx_fft_internal.h"
|
|
#include "spqlios/cplx/cplx_fft_private.h"
|
|
|
|
#ifdef __x86_64__
|
|
TEST(fft, ifft16_fma_vs_ref) {
|
|
CPLX data[16];
|
|
CPLX omega[8];
|
|
for (uint64_t i = 0; i < 32; ++i) ((double*)data)[i] = 2 * i + 1; //(rand()%100)-50;
|
|
for (uint64_t i = 0; i < 16; ++i) ((double*)omega)[i] = i + 1; //(rand()%100)-50;
|
|
CPLX copydata[16];
|
|
CPLX copyomega[8];
|
|
memcpy(copydata, data, sizeof(copydata));
|
|
memcpy(copyomega, omega, sizeof(copyomega));
|
|
cplx_ifft16_avx_fma(data, omega);
|
|
cplx_ifft16_ref(copydata, copyomega);
|
|
double distance = 0;
|
|
for (uint64_t i = 0; i < 16; ++i) {
|
|
double d1 = fabs(data[i][0] - copydata[i][0]);
|
|
double d2 = fabs(data[i][0] - copydata[i][0]);
|
|
if (d1 > distance) distance = d1;
|
|
if (d2 > distance) distance = d2;
|
|
}
|
|
/*
|
|
printf("data:\n");
|
|
for (uint64_t i=0; i<4; ++i) {
|
|
for (uint64_t j=0; j<8; ++j) {
|
|
printf("%.5lf ", data[4 * i + j / 2][j % 2]);
|
|
}
|
|
printf("\n");
|
|
}
|
|
printf("copydata:\n");
|
|
for (uint64_t i=0; i<4; ++i) {
|
|
for (uint64_t j=0; j<8; ++j) {
|
|
printf("%5.5lf ", copydata[4 * i + j / 2][j % 2]);
|
|
}
|
|
printf("\n");
|
|
}
|
|
*/
|
|
ASSERT_EQ(distance, 0);
|
|
}
|
|
|
|
#endif
|
|
|
|
void cplx_zero(CPLX r) { r[0] = r[1] = 0; }
|
|
void cplx_addmul(CPLX r, const CPLX a, const CPLX b) {
|
|
double re = r[0] + a[0] * b[0] - a[1] * b[1];
|
|
double im = r[1] + a[0] * b[1] + a[1] * b[0];
|
|
r[0] = re;
|
|
r[1] = im;
|
|
}
|
|
|
|
void halfcfft_eval(CPLX res, uint32_t nn, uint32_t k, const CPLX* coeffs, const CPLX* powomegas) {
|
|
const uint32_t N = nn / 2;
|
|
cplx_zero(res);
|
|
for (uint64_t i = 0; i < N; ++i) {
|
|
cplx_addmul(res, coeffs[i], powomegas[(k * i) % (2 * nn)]);
|
|
}
|
|
}
|
|
void halfcfft_naive(uint32_t nn, CPLX* data) {
|
|
const uint32_t N = nn / 2;
|
|
CPLX* in = (CPLX*)malloc(N * sizeof(CPLX));
|
|
CPLX* powomega = (CPLX*)malloc(2 * nn * sizeof(CPLX));
|
|
for (uint64_t i = 0; i < (2 * nn); ++i) {
|
|
powomega[i][0] = m_accurate_cos((M_PI * i) / nn);
|
|
powomega[i][1] = m_accurate_sin((M_PI * i) / nn);
|
|
}
|
|
memcpy(in, data, N * sizeof(CPLX));
|
|
for (uint64_t j = 0; j < N; ++j) {
|
|
uint64_t p = rint(log2(N)) + 2;
|
|
uint64_t k = revbits(p, j) + 1;
|
|
halfcfft_eval(data[j], nn, k, in, powomega);
|
|
}
|
|
free(powomega);
|
|
free(in);
|
|
}
|
|
|
|
#ifdef __x86_64__
|
|
TEST(fft, fft16_fma_vs_ref) {
|
|
CPLX data[16];
|
|
CPLX omega[8];
|
|
for (uint64_t i = 0; i < 32; ++i) ((double*)data)[i] = rand() % 1000;
|
|
for (uint64_t i = 0; i < 16; ++i) ((double*)omega)[i] = rand() % 1000;
|
|
CPLX copydata[16];
|
|
CPLX copyomega[8];
|
|
memcpy(copydata, data, sizeof(copydata));
|
|
memcpy(copyomega, omega, sizeof(copyomega));
|
|
cplx_fft16_avx_fma(data, omega);
|
|
cplx_fft16_ref(copydata, omega);
|
|
double distance = 0;
|
|
for (uint64_t i = 0; i < 16; ++i) {
|
|
double d1 = fabs(data[i][0] - copydata[i][0]);
|
|
double d2 = fabs(data[i][0] - copydata[i][0]);
|
|
if (d1 > distance) distance = d1;
|
|
if (d2 > distance) distance = d2;
|
|
}
|
|
ASSERT_EQ(distance, 0);
|
|
}
|
|
#endif
|
|
|
|
TEST(fft, citwiddle_then_invcitwiddle) {
|
|
CPLX om;
|
|
CPLX ombar;
|
|
CPLX data[2];
|
|
CPLX copydata[2];
|
|
om[0] = cos(3);
|
|
om[1] = sin(3);
|
|
ombar[0] = om[0];
|
|
ombar[1] = -om[1];
|
|
data[0][0] = 47;
|
|
data[0][1] = 23;
|
|
data[1][0] = -12;
|
|
data[1][1] = -9;
|
|
memcpy(copydata, data, sizeof(copydata));
|
|
citwiddle(data[0], data[1], om);
|
|
invcitwiddle(data[0], data[1], ombar);
|
|
double distance = 0;
|
|
for (uint64_t i = 0; i < 2; ++i) {
|
|
double d1 = fabs(data[i][0] - 2 * copydata[i][0]);
|
|
double d2 = fabs(data[i][1] - 2 * copydata[i][1]);
|
|
if (d1 > distance) distance = d1;
|
|
if (d2 > distance) distance = d2;
|
|
}
|
|
ASSERT_LE(distance, 1e-9);
|
|
}
|
|
|
|
TEST(fft, ctwiddle_then_invctwiddle) {
|
|
CPLX om;
|
|
CPLX ombar;
|
|
CPLX data[2];
|
|
CPLX copydata[2];
|
|
om[0] = cos(3);
|
|
om[1] = sin(3);
|
|
ombar[0] = om[0];
|
|
ombar[1] = -om[1];
|
|
data[0][0] = 47;
|
|
data[0][1] = 23;
|
|
data[1][0] = -12;
|
|
data[1][1] = -9;
|
|
memcpy(copydata, data, sizeof(copydata));
|
|
ctwiddle(data[0], data[1], om);
|
|
invctwiddle(data[0], data[1], ombar);
|
|
double distance = 0;
|
|
for (uint64_t i = 0; i < 2; ++i) {
|
|
double d1 = fabs(data[i][0] - 2 * copydata[i][0]);
|
|
double d2 = fabs(data[i][1] - 2 * copydata[i][1]);
|
|
if (d1 > distance) distance = d1;
|
|
if (d2 > distance) distance = d2;
|
|
}
|
|
ASSERT_LE(distance, 1e-9);
|
|
}
|
|
|
|
TEST(fft, fft16_then_ifft16_ref) {
|
|
CPLX full_omegas[64];
|
|
CPLX full_omegabars[64];
|
|
for (uint64_t i = 0; i < 64; ++i) {
|
|
full_omegas[i][0] = cos(M_PI * i / 32.);
|
|
full_omegas[i][1] = sin(M_PI * i / 32.);
|
|
full_omegabars[i][0] = full_omegas[i][0];
|
|
full_omegabars[i][1] = -full_omegas[i][1];
|
|
}
|
|
CPLX omega[8];
|
|
CPLX omegabar[8];
|
|
cplx_set(omega[0], full_omegas[8]); // j
|
|
cplx_set(omega[1], full_omegas[4]); // k
|
|
cplx_set(omega[2], full_omegas[2]); // l
|
|
cplx_set(omega[3], full_omegas[10]); // lj
|
|
cplx_set(omega[4], full_omegas[1]); // n
|
|
cplx_set(omega[5], full_omegas[9]); // nj
|
|
cplx_set(omega[6], full_omegas[5]); // nk
|
|
cplx_set(omega[7], full_omegas[13]); // njk
|
|
cplx_set(omegabar[0], full_omegabars[1]); // n
|
|
cplx_set(omegabar[1], full_omegabars[9]); // nj
|
|
cplx_set(omegabar[2], full_omegabars[5]); // nk
|
|
cplx_set(omegabar[3], full_omegabars[13]); // njk
|
|
cplx_set(omegabar[4], full_omegabars[2]); // l
|
|
cplx_set(omegabar[5], full_omegabars[10]); // lj
|
|
cplx_set(omegabar[6], full_omegabars[4]); // k
|
|
cplx_set(omegabar[7], full_omegabars[8]); // j
|
|
CPLX data[16];
|
|
CPLX copydata[16];
|
|
for (uint64_t i = 0; i < 32; ++i) ((double*)data)[i] = rand() % 1000;
|
|
memcpy(copydata, data, sizeof(copydata));
|
|
cplx_fft16_ref(data, omega);
|
|
cplx_ifft16_ref(data, omegabar);
|
|
double distance = 0;
|
|
for (uint64_t i = 0; i < 16; ++i) {
|
|
double d1 = fabs(data[i][0] - 16 * copydata[i][0]);
|
|
double d2 = fabs(data[i][0] - 16 * copydata[i][0]);
|
|
if (d1 > distance) distance = d1;
|
|
if (d2 > distance) distance = d2;
|
|
}
|
|
ASSERT_LE(distance, 1e-9);
|
|
}
|
|
|
|
TEST(fft, halfcfft_ref_vs_naive) {
|
|
for (uint64_t nn : {4, 8, 16, 64, 256, 8192}) {
|
|
uint64_t m = nn / 2;
|
|
CPLX_FFT_PRECOMP* tables = new_cplx_fft_precomp(m, 0);
|
|
CPLX* a = (CPLX*)spqlios_alloc_custom_align(32, m * sizeof(CPLX));
|
|
CPLX* a1 = (CPLX*)spqlios_alloc_custom_align(32, m * sizeof(CPLX));
|
|
CPLX* a2 = (CPLX*)spqlios_alloc_custom_align(32, m * sizeof(CPLX));
|
|
int64_t p = 1 << 16;
|
|
for (uint32_t i = 0; i < m; i++) {
|
|
a[i][0] = (rand() % p) - p / 2; // between -p/2 and p/2
|
|
a[i][1] = (rand() % p) - p / 2;
|
|
}
|
|
memcpy(a1, a, m * sizeof(CPLX));
|
|
memcpy(a2, a, m * sizeof(CPLX));
|
|
|
|
halfcfft_naive(nn, a1);
|
|
cplx_fft_naive(m, 0.25, a2);
|
|
|
|
double d = 0;
|
|
for (uint32_t i = 0; i < nn / 2; i++) {
|
|
double dre = fabs(a1[i][0] - a2[i][0]);
|
|
double dim = fabs(a1[i][1] - a2[i][1]);
|
|
if (dre > d) d = dre;
|
|
if (dim > d) d = dim;
|
|
}
|
|
ASSERT_LE(d, nn * 1e-10) << nn;
|
|
spqlios_free(a);
|
|
spqlios_free(a1);
|
|
spqlios_free(a2);
|
|
delete_cplx_fft_precomp(tables);
|
|
}
|
|
}
|
|
|
|
#ifdef __x86_64__
|
|
TEST(fft, halfcfft_fma_vs_ref) {
|
|
typedef void (*FFTF)(const CPLX_FFT_PRECOMP*, void* data);
|
|
for (FFTF fft : {cplx_fft_ref, cplx_fft_avx2_fma}) {
|
|
for (uint64_t nn : {8, 16, 32, 64, 1024, 8192, 65536}) {
|
|
uint64_t m = nn / 2;
|
|
CPLX_FFT_PRECOMP* tables = new_cplx_fft_precomp(m, 0);
|
|
CPLX* a = (CPLX*)spqlios_alloc_custom_align(32, nn / 2 * sizeof(CPLX));
|
|
CPLX* a1 = (CPLX*)spqlios_alloc_custom_align(32, nn / 2 * sizeof(CPLX));
|
|
CPLX* a2 = (CPLX*)spqlios_alloc_custom_align(32, nn / 2 * sizeof(CPLX));
|
|
int64_t p = 1 << 16;
|
|
for (uint32_t i = 0; i < nn / 2; i++) {
|
|
a[i][0] = (rand() % p) - p / 2; // between -p/2 and p/2
|
|
a[i][1] = (rand() % p) - p / 2;
|
|
}
|
|
memcpy(a1, a, nn / 2 * sizeof(CPLX));
|
|
memcpy(a2, a, nn / 2 * sizeof(CPLX));
|
|
cplx_fft_naive(m, 0.25, a2);
|
|
fft(tables, a1);
|
|
double d = 0;
|
|
for (uint32_t i = 0; i < nn / 2; i++) {
|
|
double dre = fabs(a1[i][0] - a2[i][0]);
|
|
double dim = fabs(a1[i][1] - a2[i][1]);
|
|
if (dre > d) d = dre;
|
|
if (dim > d) d = dim;
|
|
}
|
|
ASSERT_LE(d, nn * 1e-10) << nn;
|
|
spqlios_free(a);
|
|
spqlios_free(a1);
|
|
spqlios_free(a2);
|
|
delete_cplx_fft_precomp(tables);
|
|
}
|
|
}
|
|
}
|
|
#endif
|
|
|
|
TEST(fft, halfcfft_then_ifft_ref) {
|
|
for (uint64_t nn : {4, 8, 16, 32, 64, 1024, 8192, 65536}) {
|
|
uint64_t m = nn / 2;
|
|
CPLX_FFT_PRECOMP* tables = new_cplx_fft_precomp(m, 0);
|
|
CPLX_IFFT_PRECOMP* itables = new_cplx_ifft_precomp(m, 0);
|
|
CPLX* a = (CPLX*)spqlios_alloc_custom_align(32, nn / 2 * sizeof(CPLX));
|
|
CPLX* a1 = (CPLX*)spqlios_alloc_custom_align(32, nn / 2 * sizeof(CPLX));
|
|
int64_t p = 1 << 16;
|
|
for (uint32_t i = 0; i < nn / 2; i++) {
|
|
a[i][0] = (rand() % p) - p / 2; // between -p/2 and p/2
|
|
a[i][1] = (rand() % p) - p / 2;
|
|
}
|
|
memcpy(a1, a, nn / 2 * sizeof(CPLX));
|
|
cplx_fft_ref(tables, a1);
|
|
cplx_ifft_ref(itables, a1);
|
|
double d = 0;
|
|
for (uint32_t i = 0; i < nn / 2; i++) {
|
|
double dre = fabs(a[i][0] - a1[i][0] / (nn / 2));
|
|
double dim = fabs(a[i][1] - a1[i][1] / (nn / 2));
|
|
if (dre > d) d = dre;
|
|
if (dim > d) d = dim;
|
|
}
|
|
ASSERT_LE(d, 1e-8);
|
|
spqlios_free(a);
|
|
spqlios_free(a1);
|
|
delete_cplx_fft_precomp(tables);
|
|
delete_cplx_ifft_precomp(itables);
|
|
}
|
|
}
|
|
|
|
#ifdef __x86_64__
|
|
TEST(fft, halfcfft_ifft_fma_vs_ref) {
|
|
for (IFFT_FUNCTION ifft : {cplx_ifft_ref, cplx_ifft_avx2_fma}) {
|
|
for (uint64_t nn : {8, 16, 32, 1024, 4096, 8192, 65536}) {
|
|
uint64_t m = nn / 2;
|
|
CPLX_IFFT_PRECOMP* itables = new_cplx_ifft_precomp(m, 0);
|
|
CPLX* a = (CPLX*)spqlios_alloc_custom_align(32, nn / 2 * sizeof(CPLX));
|
|
CPLX* a1 = (CPLX*)spqlios_alloc_custom_align(32, nn / 2 * sizeof(CPLX));
|
|
CPLX* a2 = (CPLX*)spqlios_alloc_custom_align(32, nn / 2 * sizeof(CPLX));
|
|
int64_t p = 1 << 16;
|
|
for (uint32_t i = 0; i < nn / 2; i++) {
|
|
a[i][0] = (rand() % p) - p / 2; // between -p/2 and p/2
|
|
a[i][1] = (rand() % p) - p / 2;
|
|
}
|
|
memcpy(a1, a, nn / 2 * sizeof(CPLX));
|
|
memcpy(a2, a, nn / 2 * sizeof(CPLX));
|
|
cplx_ifft_naive(m, 0.25, a2);
|
|
ifft(itables, a1);
|
|
double d = 0;
|
|
for (uint32_t i = 0; i < nn / 2; i++) {
|
|
double dre = fabs(a1[i][0] - a2[i][0]);
|
|
double dim = fabs(a1[i][1] - a2[i][1]);
|
|
if (dre > d) d = dre;
|
|
if (dim > d) d = dim;
|
|
}
|
|
ASSERT_LE(d, 1e-8);
|
|
spqlios_free(a);
|
|
spqlios_free(a1);
|
|
spqlios_free(a2);
|
|
delete_cplx_ifft_precomp(itables);
|
|
}
|
|
}
|
|
}
|
|
#endif
|
|
|
|
// test the reference and simple implementations of mul on all dimensions
|
|
TEST(fftvec, cplx_fftvec_mul_ref) {
|
|
for (uint64_t nn : {2, 4, 8, 16, 32, 1024, 4096, 8192, 65536}) {
|
|
uint64_t m = nn / 2;
|
|
CPLX_FFTVEC_MUL_PRECOMP* precomp = new_cplx_fftvec_mul_precomp(m);
|
|
CPLX* a = new CPLX[m];
|
|
CPLX* b = new CPLX[m];
|
|
CPLX* r0 = new CPLX[m];
|
|
CPLX* r1 = new CPLX[m];
|
|
CPLX* r2 = new CPLX[m];
|
|
int64_t p = 1 << 16;
|
|
for (uint32_t i = 0; i < m; i++) {
|
|
a[i][0] = (rand() % p) - p / 2; // between -p/2 and p/2
|
|
a[i][1] = (rand() % p) - p / 2;
|
|
b[i][0] = (rand() % p) - p / 2; // between -p/2 and p/2
|
|
b[i][1] = (rand() % p) - p / 2;
|
|
r2[i][0] = r1[i][0] = r0[i][0] = (rand() % p) - p / 2; // between -p/2 and p/2
|
|
r2[i][1] = r1[i][1] = r0[i][1] = (rand() % p) - p / 2;
|
|
}
|
|
cplx_fftvec_mul_simple(m, r0, a, b);
|
|
cplx_fftvec_mul_ref(precomp, r1, a, b);
|
|
for (uint32_t i = 0; i < m; i++) {
|
|
r2[i][0] = a[i][0] * b[i][0] - a[i][1] * b[i][1];
|
|
r2[i][1] = a[i][0] * b[i][1] + a[i][1] * b[i][0];
|
|
ASSERT_LE(fabs(r1[i][0] - r2[i][0]) + fabs(r1[i][1] - r2[i][1]), 1e-8);
|
|
ASSERT_LE(fabs(r0[i][0] - r2[i][0]) + fabs(r0[i][1] - r2[i][1]), 1e-8);
|
|
}
|
|
delete[] a;
|
|
delete[] b;
|
|
delete[] r0;
|
|
delete[] r1;
|
|
delete[] r2;
|
|
delete_cplx_fftvec_mul_precomp(precomp);
|
|
}
|
|
}
|
|
|
|
// test the reference and simple implementations of addmul on all dimensions
|
|
TEST(fftvec, cplx_fftvec_addmul_ref) {
|
|
for (uint64_t nn : {2, 4, 8, 16, 32, 1024, 4096, 8192, 65536}) {
|
|
uint64_t m = nn / 2;
|
|
CPLX_FFTVEC_ADDMUL_PRECOMP* precomp = new_cplx_fftvec_addmul_precomp(m);
|
|
CPLX* a = new CPLX[m];
|
|
CPLX* b = new CPLX[m];
|
|
CPLX* r0 = new CPLX[m];
|
|
CPLX* r1 = new CPLX[m];
|
|
CPLX* r2 = new CPLX[m];
|
|
int64_t p = 1 << 16;
|
|
for (uint32_t i = 0; i < m; i++) {
|
|
a[i][0] = (rand() % p) - p / 2; // between -p/2 and p/2
|
|
a[i][1] = (rand() % p) - p / 2;
|
|
b[i][0] = (rand() % p) - p / 2; // between -p/2 and p/2
|
|
b[i][1] = (rand() % p) - p / 2;
|
|
r2[i][0] = r1[i][0] = r0[i][0] = (rand() % p) - p / 2; // between -p/2 and p/2
|
|
r2[i][1] = r1[i][1] = r0[i][1] = (rand() % p) - p / 2;
|
|
}
|
|
cplx_fftvec_addmul_simple(m, r0, a, b);
|
|
cplx_fftvec_addmul_ref(precomp, r1, a, b);
|
|
for (uint32_t i = 0; i < m; i++) {
|
|
r2[i][0] += a[i][0] * b[i][0] - a[i][1] * b[i][1];
|
|
r2[i][1] += a[i][0] * b[i][1] + a[i][1] * b[i][0];
|
|
ASSERT_LE(fabs(r1[i][0] - r2[i][0]) + fabs(r1[i][1] - r2[i][1]), 1e-8);
|
|
ASSERT_LE(fabs(r0[i][0] - r2[i][0]) + fabs(r0[i][1] - r2[i][1]), 1e-8);
|
|
}
|
|
delete[] a;
|
|
delete[] b;
|
|
delete[] r0;
|
|
delete[] r1;
|
|
delete[] r2;
|
|
delete_cplx_fftvec_addmul_precomp(precomp);
|
|
}
|
|
}
|
|
|
|
// comparative tests between mul ref vs. optimized (only relevant dimensions)
|
|
TEST(fftvec, cplx_fftvec_mul_ref_vs_optim) {
|
|
struct totest {
|
|
FFTVEC_MUL_FUNCTION f;
|
|
uint64_t min_m;
|
|
totest(FFTVEC_MUL_FUNCTION f, uint64_t min_m) : f(f), min_m(min_m) {}
|
|
};
|
|
std::vector<totest> totestset;
|
|
totestset.emplace_back(cplx_fftvec_mul, 1);
|
|
#ifdef __x86_64__
|
|
totestset.emplace_back(cplx_fftvec_mul_fma, 8);
|
|
#endif
|
|
for (uint64_t m : {1, 2, 4, 8, 16, 1024, 4096, 8192, 65536}) {
|
|
CPLX_FFTVEC_MUL_PRECOMP* precomp = new_cplx_fftvec_mul_precomp(m);
|
|
for (const totest& t : totestset) {
|
|
if (t.min_m > m) continue;
|
|
CPLX* a = new CPLX[m];
|
|
CPLX* b = new CPLX[m];
|
|
CPLX* r1 = new CPLX[m];
|
|
CPLX* r2 = new CPLX[m];
|
|
int64_t p = 1 << 16;
|
|
for (uint32_t i = 0; i < m; i++) {
|
|
a[i][0] = (rand() % p) - p / 2; // between -p/2 and p/2
|
|
a[i][1] = (rand() % p) - p / 2;
|
|
b[i][0] = (rand() % p) - p / 2; // between -p/2 and p/2
|
|
b[i][1] = (rand() % p) - p / 2;
|
|
r2[i][0] = r1[i][0] = (rand() % p) - p / 2; // between -p/2 and p/2
|
|
r2[i][1] = r1[i][1] = (rand() % p) - p / 2;
|
|
}
|
|
t.f(precomp, r1, a, b);
|
|
cplx_fftvec_mul_ref(precomp, r2, a, b);
|
|
for (uint32_t i = 0; i < m; i++) {
|
|
double dre = fabs(r1[i][0] - r2[i][0]);
|
|
double dim = fabs(r1[i][1] - r2[i][1]);
|
|
ASSERT_LE(dre, 1e-8);
|
|
ASSERT_LE(dim, 1e-8);
|
|
}
|
|
delete[] a;
|
|
delete[] b;
|
|
delete[] r1;
|
|
delete[] r2;
|
|
}
|
|
delete_cplx_fftvec_mul_precomp(precomp);
|
|
}
|
|
}
|
|
|
|
// comparative tests between addmul ref vs. optimized (only relevant dimensions)
|
|
TEST(fftvec, cplx_fftvec_addmul_ref_vs_optim) {
|
|
struct totest {
|
|
FFTVEC_ADDMUL_FUNCTION f;
|
|
uint64_t min_m;
|
|
totest(FFTVEC_ADDMUL_FUNCTION f, uint64_t min_m) : f(f), min_m(min_m) {}
|
|
};
|
|
std::vector<totest> totestset;
|
|
totestset.emplace_back(cplx_fftvec_addmul, 1);
|
|
#ifdef __x86_64__
|
|
totestset.emplace_back(cplx_fftvec_addmul_fma, 8);
|
|
#endif
|
|
for (uint64_t m : {1, 2, 4, 8, 16, 1024, 4096, 8192, 65536}) {
|
|
CPLX_FFTVEC_ADDMUL_PRECOMP* precomp = new_cplx_fftvec_addmul_precomp(m);
|
|
for (const totest& t : totestset) {
|
|
if (t.min_m > m) continue;
|
|
CPLX* a = new CPLX[m];
|
|
CPLX* b = new CPLX[m];
|
|
CPLX* r1 = new CPLX[m];
|
|
CPLX* r2 = new CPLX[m];
|
|
int64_t p = 1 << 16;
|
|
for (uint32_t i = 0; i < m; i++) {
|
|
a[i][0] = (rand() % p) - p / 2; // between -p/2 and p/2
|
|
a[i][1] = (rand() % p) - p / 2;
|
|
b[i][0] = (rand() % p) - p / 2; // between -p/2 and p/2
|
|
b[i][1] = (rand() % p) - p / 2;
|
|
r2[i][0] = r1[i][0] = (rand() % p) - p / 2; // between -p/2 and p/2
|
|
r2[i][1] = r1[i][1] = (rand() % p) - p / 2;
|
|
}
|
|
t.f(precomp, r1, a, b);
|
|
cplx_fftvec_addmul_ref(precomp, r2, a, b);
|
|
for (uint32_t i = 0; i < m; i++) {
|
|
double dre = fabs(r1[i][0] - r2[i][0]);
|
|
double dim = fabs(r1[i][1] - r2[i][1]);
|
|
ASSERT_LE(dre, 1e-8);
|
|
ASSERT_LE(dim, 1e-8);
|
|
}
|
|
delete[] a;
|
|
delete[] b;
|
|
delete[] r1;
|
|
delete[] r2;
|
|
}
|
|
delete_cplx_fftvec_addmul_precomp(precomp);
|
|
}
|
|
}
|