poulpy/spqlios/lib/test/spqlios_cplx_test.cpp

#include <cmath>

#include "gtest/gtest.h"
#include "spqlios/commons_private.h"
#include "spqlios/cplx/cplx_fft.h"
#include "spqlios/cplx/cplx_fft_internal.h"
#include "spqlios/cplx/cplx_fft_private.h"

#ifdef __x86_64__
TEST(fft, ifft16_fma_vs_ref) {
  CPLX data[16];
  CPLX omega[8];
  for (uint64_t i = 0; i < 32; ++i) ((double*)data)[i] = 2 * i + 1;  //(rand()%100)-50;
  for (uint64_t i = 0; i < 16; ++i) ((double*)omega)[i] = i + 1;     //(rand()%100)-50;
  CPLX copydata[16];
  CPLX copyomega[8];
  memcpy(copydata, data, sizeof(copydata));
  memcpy(copyomega, omega, sizeof(copyomega));
  cplx_ifft16_avx_fma(data, omega);
  cplx_ifft16_ref(copydata, copyomega);
  double distance = 0;
  for (uint64_t i = 0; i < 16; ++i) {
    double d1 = fabs(data[i][0] - copydata[i][0]);
    double d2 = fabs(data[i][0] - copydata[i][0]);
    if (d1 > distance) distance = d1;
    if (d2 > distance) distance = d2;
  }
  /*
  printf("data:\n");
  for (uint64_t i=0; i<4; ++i) {
    for (uint64_t j=0; j<8; ++j) {
      printf("%.5lf ", data[4 * i + j / 2][j % 2]);
    }
    printf("\n");
  }
  printf("copydata:\n");
  for (uint64_t i=0; i<4; ++i) {
    for (uint64_t j=0; j<8; ++j) {
      printf("%5.5lf ", copydata[4 * i + j / 2][j % 2]);
    }
    printf("\n");
  }
  */
  ASSERT_EQ(distance, 0);
}

#endif

void cplx_zero(CPLX r) { r[0] = r[1] = 0; }
void cplx_addmul(CPLX r, const CPLX a, const CPLX b) {
  double re = r[0] + a[0] * b[0] - a[1] * b[1];
  double im = r[1] + a[0] * b[1] + a[1] * b[0];
  r[0] = re;
  r[1] = im;
}

void halfcfft_eval(CPLX res, uint32_t nn, uint32_t k, const CPLX* coeffs, const CPLX* powomegas) {
  const uint32_t N = nn / 2;
  cplx_zero(res);
  for (uint64_t i = 0; i < N; ++i) {
    cplx_addmul(res, coeffs[i], powomegas[(k * i) % (2 * nn)]);
  }
}
void halfcfft_naive(uint32_t nn, CPLX* data) {
  const uint32_t N = nn / 2;
  CPLX* in = (CPLX*)malloc(N * sizeof(CPLX));
  CPLX* powomega = (CPLX*)malloc(2 * nn * sizeof(CPLX));
  for (uint64_t i = 0; i < (2 * nn); ++i) {
    powomega[i][0] = m_accurate_cos((M_PI * i) / nn);
    powomega[i][1] = m_accurate_sin((M_PI * i) / nn);
  }
  memcpy(in, data, N * sizeof(CPLX));
  for (uint64_t j = 0; j < N; ++j) {
    uint64_t p = rint(log2(N)) + 2;
    uint64_t k = revbits(p, j) + 1;
    halfcfft_eval(data[j], nn, k, in, powomega);
  }
  free(powomega);
  free(in);
}

#ifdef __x86_64__
TEST(fft, fft16_fma_vs_ref) {
  CPLX data[16];
  CPLX omega[8];
  for (uint64_t i = 0; i < 32; ++i) ((double*)data)[i] = rand() % 1000;
  for (uint64_t i = 0; i < 16; ++i) ((double*)omega)[i] = rand() % 1000;
  CPLX copydata[16];
  CPLX copyomega[8];
  memcpy(copydata, data, sizeof(copydata));
  memcpy(copyomega, omega, sizeof(copyomega));
  cplx_fft16_avx_fma(data, omega);
  cplx_fft16_ref(copydata, omega);
  double distance = 0;
  for (uint64_t i = 0; i < 16; ++i) {
    double d1 = fabs(data[i][0] - copydata[i][0]);
    double d2 = fabs(data[i][0] - copydata[i][0]);
    if (d1 > distance) distance = d1;
    if (d2 > distance) distance = d2;
  }
  ASSERT_EQ(distance, 0);
}
#endif

TEST(fft, citwiddle_then_invcitwiddle) {
  CPLX om;
  CPLX ombar;
  CPLX data[2];
  CPLX copydata[2];
  om[0] = cos(3);
  om[1] = sin(3);
  ombar[0] = om[0];
  ombar[1] = -om[1];
  data[0][0] = 47;
  data[0][1] = 23;
  data[1][0] = -12;
  data[1][1] = -9;
  memcpy(copydata, data, sizeof(copydata));
  citwiddle(data[0], data[1], om);
  invcitwiddle(data[0], data[1], ombar);
  double distance = 0;
  for (uint64_t i = 0; i < 2; ++i) {
    double d1 = fabs(data[i][0] - 2 * copydata[i][0]);
    double d2 = fabs(data[i][1] - 2 * copydata[i][1]);
    if (d1 > distance) distance = d1;
    if (d2 > distance) distance = d2;
  }
  ASSERT_LE(distance, 1e-9);
}

TEST(fft, ctwiddle_then_invctwiddle) {
  CPLX om;
  CPLX ombar;
  CPLX data[2];
  CPLX copydata[2];
  om[0] = cos(3);
  om[1] = sin(3);
  ombar[0] = om[0];
  ombar[1] = -om[1];
  data[0][0] = 47;
  data[0][1] = 23;
  data[1][0] = -12;
  data[1][1] = -9;
  memcpy(copydata, data, sizeof(copydata));
  ctwiddle(data[0], data[1], om);
  invctwiddle(data[0], data[1], ombar);
  double distance = 0;
  for (uint64_t i = 0; i < 2; ++i) {
    double d1 = fabs(data[i][0] - 2 * copydata[i][0]);
    double d2 = fabs(data[i][1] - 2 * copydata[i][1]);
    if (d1 > distance) distance = d1;
    if (d2 > distance) distance = d2;
  }
  ASSERT_LE(distance, 1e-9);
}

TEST(fft, fft16_then_ifft16_ref) {
  CPLX full_omegas[64];
  CPLX full_omegabars[64];
  for (uint64_t i = 0; i < 64; ++i) {
    full_omegas[i][0] = cos(M_PI * i / 32.);
    full_omegas[i][1] = sin(M_PI * i / 32.);
    full_omegabars[i][0] = full_omegas[i][0];
    full_omegabars[i][1] = -full_omegas[i][1];
  }
  CPLX omega[8];
  CPLX omegabar[8];
  cplx_set(omega[0], full_omegas[8]);         // j
  cplx_set(omega[1], full_omegas[4]);         // k
  cplx_set(omega[2], full_omegas[2]);         // l
  cplx_set(omega[3], full_omegas[10]);        // lj
  cplx_set(omega[4], full_omegas[1]);         // n
  cplx_set(omega[5], full_omegas[9]);         // nj
  cplx_set(omega[6], full_omegas[5]);         // nk
  cplx_set(omega[7], full_omegas[13]);        // njk
  cplx_set(omegabar[0], full_omegabars[1]);   // n
  cplx_set(omegabar[1], full_omegabars[9]);   // nj
  cplx_set(omegabar[2], full_omegabars[5]);   // nk
  cplx_set(omegabar[3], full_omegabars[13]);  // njk
  cplx_set(omegabar[4], full_omegabars[2]);   // l
  cplx_set(omegabar[5], full_omegabars[10]);  // lj
  cplx_set(omegabar[6], full_omegabars[4]);   // k
  cplx_set(omegabar[7], full_omegabars[8]);   // j
  CPLX data[16];
  CPLX copydata[16];
  for (uint64_t i = 0; i < 32; ++i) ((double*)data)[i] = rand() % 1000;
  memcpy(copydata, data, sizeof(copydata));
  cplx_fft16_ref(data, omega);
  cplx_ifft16_ref(data, omegabar);
  double distance = 0;
  for (uint64_t i = 0; i < 16; ++i) {
    double d1 = fabs(data[i][0] - 16 * copydata[i][0]);
    double d2 = fabs(data[i][0] - 16 * copydata[i][0]);
    if (d1 > distance) distance = d1;
    if (d2 > distance) distance = d2;
  }
  ASSERT_LE(distance, 1e-9);
}

TEST(fft, halfcfft_ref_vs_naive) {
  for (uint64_t nn : {4, 8, 16, 64, 256, 8192}) {
    uint64_t m = nn / 2;
    CPLX_FFT_PRECOMP* tables = new_cplx_fft_precomp(m, 0);
    CPLX* a = (CPLX*)spqlios_alloc_custom_align(32, m * sizeof(CPLX));
    CPLX* a1 = (CPLX*)spqlios_alloc_custom_align(32, m * sizeof(CPLX));
    CPLX* a2 = (CPLX*)spqlios_alloc_custom_align(32, m * sizeof(CPLX));
    int64_t p = 1 << 16;
    for (uint32_t i = 0; i < m; i++) {
      a[i][0] = (rand() % p) - p / 2;  // between -p/2 and p/2
      a[i][1] = (rand() % p) - p / 2;
    }
    memcpy(a1, a, m * sizeof(CPLX));
    memcpy(a2, a, m * sizeof(CPLX));

    halfcfft_naive(nn, a1);
    cplx_fft_naive(m, 0.25, a2);

    double d = 0;
    for (uint32_t i = 0; i < nn / 2; i++) {
      double dre = fabs(a1[i][0] - a2[i][0]);
      double dim = fabs(a1[i][1] - a2[i][1]);
      if (dre > d) d = dre;
      if (dim > d) d = dim;
    }
    ASSERT_LE(d, nn * 1e-10) << nn;
    spqlios_free(a);
    spqlios_free(a1);
    spqlios_free(a2);
    delete_cplx_fft_precomp(tables);
  }
}

#ifdef __x86_64__
TEST(fft, halfcfft_fma_vs_ref) {
  typedef void (*FFTF)(const CPLX_FFT_PRECOMP*, void* data);
  for (FFTF fft : {cplx_fft_ref, cplx_fft_avx2_fma}) {
    for (uint64_t nn : {8, 16, 32, 64, 1024, 8192, 65536}) {
      uint64_t m = nn / 2;
      CPLX_FFT_PRECOMP* tables = new_cplx_fft_precomp(m, 0);
      CPLX* a = (CPLX*)spqlios_alloc_custom_align(32, nn / 2 * sizeof(CPLX));
      CPLX* a1 = (CPLX*)spqlios_alloc_custom_align(32, nn / 2 * sizeof(CPLX));
      CPLX* a2 = (CPLX*)spqlios_alloc_custom_align(32, nn / 2 * sizeof(CPLX));
      int64_t p = 1 << 16;
      for (uint32_t i = 0; i < nn / 2; i++) {
        a[i][0] = (rand() % p) - p / 2;  // between -p/2 and p/2
        a[i][1] = (rand() % p) - p / 2;
      }
      memcpy(a1, a, nn / 2 * sizeof(CPLX));
      memcpy(a2, a, nn / 2 * sizeof(CPLX));
      cplx_fft_naive(m, 0.25, a2);
      fft(tables, a1);
      double d = 0;
      for (uint32_t i = 0; i < nn / 2; i++) {
        double dre = fabs(a1[i][0] - a2[i][0]);
        double dim = fabs(a1[i][1] - a2[i][1]);
        if (dre > d) d = dre;
        if (dim > d) d = dim;
      }
      ASSERT_LE(d, nn * 1e-10) << nn;
      spqlios_free(a);
      spqlios_free(a1);
      spqlios_free(a2);
      delete_cplx_fft_precomp(tables);
    }
  }
}
#endif

TEST(fft, halfcfft_then_ifft_ref) {
  for (uint64_t nn : {4, 8, 16, 32, 64, 1024, 8192, 65536}) {
    uint64_t m = nn / 2;
    CPLX_FFT_PRECOMP* tables = new_cplx_fft_precomp(m, 0);
    CPLX_IFFT_PRECOMP* itables = new_cplx_ifft_precomp(m, 0);
    CPLX* a = (CPLX*)spqlios_alloc_custom_align(32, nn / 2 * sizeof(CPLX));
    CPLX* a1 = (CPLX*)spqlios_alloc_custom_align(32, nn / 2 * sizeof(CPLX));
    int64_t p = 1 << 16;
    for (uint32_t i = 0; i < nn / 2; i++) {
      a[i][0] = (rand() % p) - p / 2;  // between -p/2 and p/2
      a[i][1] = (rand() % p) - p / 2;
    }
    memcpy(a1, a, nn / 2 * sizeof(CPLX));
    cplx_fft_ref(tables, a1);
    cplx_ifft_ref(itables, a1);
    double d = 0;
    for (uint32_t i = 0; i < nn / 2; i++) {
      double dre = fabs(a[i][0] - a1[i][0] / (nn / 2));
      double dim = fabs(a[i][1] - a1[i][1] / (nn / 2));
      if (dre > d) d = dre;
      if (dim > d) d = dim;
    }
    ASSERT_LE(d, 1e-8);
    spqlios_free(a);
    spqlios_free(a1);
    delete_cplx_fft_precomp(tables);
    delete_cplx_ifft_precomp(itables);
  }
}

#ifdef __x86_64__
TEST(fft, halfcfft_ifft_fma_vs_ref) {
  for (IFFT_FUNCTION ifft : {cplx_ifft_ref, cplx_ifft_avx2_fma}) {
    for (uint64_t nn : {8, 16, 32, 1024, 4096, 8192, 65536}) {
      uint64_t m = nn / 2;
      CPLX_IFFT_PRECOMP* itables = new_cplx_ifft_precomp(m, 0);
      CPLX* a = (CPLX*)spqlios_alloc_custom_align(32, nn / 2 * sizeof(CPLX));
      CPLX* a1 = (CPLX*)spqlios_alloc_custom_align(32, nn / 2 * sizeof(CPLX));
      CPLX* a2 = (CPLX*)spqlios_alloc_custom_align(32, nn / 2 * sizeof(CPLX));
      int64_t p = 1 << 16;
      for (uint32_t i = 0; i < nn / 2; i++) {
        a[i][0] = (rand() % p) - p / 2;  // between -p/2 and p/2
        a[i][1] = (rand() % p) - p / 2;
      }
      memcpy(a1, a, nn / 2 * sizeof(CPLX));
      memcpy(a2, a, nn / 2 * sizeof(CPLX));
      cplx_ifft_naive(m, 0.25, a2);
      ifft(itables, a1);
      double d = 0;
      for (uint32_t i = 0; i < nn / 2; i++) {
        double dre = fabs(a1[i][0] - a2[i][0]);
        double dim = fabs(a1[i][1] - a2[i][1]);
        if (dre > d) d = dre;
        if (dim > d) d = dim;
      }
      ASSERT_LE(d, 1e-8);
      spqlios_free(a);
      spqlios_free(a1);
      spqlios_free(a2);
      delete_cplx_ifft_precomp(itables);
    }
  }
}
#endif

// test the reference and simple implementations of mul on all dimensions
TEST(fftvec, cplx_fftvec_mul_ref) {
  for (uint64_t nn : {2, 4, 8, 16, 32, 1024, 4096, 8192, 65536}) {
    uint64_t m = nn / 2;
    CPLX_FFTVEC_MUL_PRECOMP* precomp = new_cplx_fftvec_mul_precomp(m);
    CPLX* a = new CPLX[m];
    CPLX* b = new CPLX[m];
    CPLX* r0 = new CPLX[m];
    CPLX* r1 = new CPLX[m];
    CPLX* r2 = new CPLX[m];
    int64_t p = 1 << 16;
    for (uint32_t i = 0; i < m; i++) {
      a[i][0] = (rand() % p) - p / 2;  // between -p/2 and p/2
      a[i][1] = (rand() % p) - p / 2;
      b[i][0] = (rand() % p) - p / 2;  // between -p/2 and p/2
      b[i][1] = (rand() % p) - p / 2;
      r2[i][0] = r1[i][0] = r0[i][0] = (rand() % p) - p / 2;  // between -p/2 and p/2
      r2[i][1] = r1[i][1] = r0[i][1] = (rand() % p) - p / 2;
    }
    cplx_fftvec_mul_simple(m, r0, a, b);
    cplx_fftvec_mul_ref(precomp, r1, a, b);
    for (uint32_t i = 0; i < m; i++) {
      r2[i][0] = a[i][0] * b[i][0] - a[i][1] * b[i][1];
      r2[i][1] = a[i][0] * b[i][1] + a[i][1] * b[i][0];
      ASSERT_LE(fabs(r1[i][0] - r2[i][0]) + fabs(r1[i][1] - r2[i][1]), 1e-8);
      ASSERT_LE(fabs(r0[i][0] - r2[i][0]) + fabs(r0[i][1] - r2[i][1]), 1e-8);
    }
    delete[] a;
    delete[] b;
    delete[] r0;
    delete[] r1;
    delete[] r2;
    delete_cplx_fftvec_mul_precomp(precomp);
  }
}

// test the reference and simple implementations of addmul on all dimensions
TEST(fftvec, cplx_fftvec_addmul_ref) {
  for (uint64_t nn : {2, 4, 8, 16, 32, 1024, 4096, 8192, 65536}) {
    uint64_t m = nn / 2;
    CPLX_FFTVEC_ADDMUL_PRECOMP* precomp = new_cplx_fftvec_addmul_precomp(m);
    CPLX* a = new CPLX[m];
    CPLX* b = new CPLX[m];
    CPLX* r0 = new CPLX[m];
    CPLX* r1 = new CPLX[m];
    CPLX* r2 = new CPLX[m];
    int64_t p = 1 << 16;
    for (uint32_t i = 0; i < m; i++) {
      a[i][0] = (rand() % p) - p / 2;  // between -p/2 and p/2
      a[i][1] = (rand() % p) - p / 2;
      b[i][0] = (rand() % p) - p / 2;  // between -p/2 and p/2
      b[i][1] = (rand() % p) - p / 2;
      r2[i][0] = r1[i][0] = r0[i][0] = (rand() % p) - p / 2;  // between -p/2 and p/2
      r2[i][1] = r1[i][1] = r0[i][1] = (rand() % p) - p / 2;
    }
    cplx_fftvec_addmul_simple(m, r0, a, b);
    cplx_fftvec_addmul_ref(precomp, r1, a, b);
    for (uint32_t i = 0; i < m; i++) {
      r2[i][0] += a[i][0] * b[i][0] - a[i][1] * b[i][1];
      r2[i][1] += a[i][0] * b[i][1] + a[i][1] * b[i][0];
      ASSERT_LE(fabs(r1[i][0] - r2[i][0]) + fabs(r1[i][1] - r2[i][1]), 1e-8);
      ASSERT_LE(fabs(r0[i][0] - r2[i][0]) + fabs(r0[i][1] - r2[i][1]), 1e-8);
    }
    delete[] a;
    delete[] b;
    delete[] r0;
    delete[] r1;
    delete[] r2;
    delete_cplx_fftvec_addmul_precomp(precomp);
  }
}

// comparative tests between mul ref vs. optimized (only relevant dimensions)
TEST(fftvec, cplx_fftvec_mul_ref_vs_optim) {
  struct totest {
    FFTVEC_MUL_FUNCTION f;
    uint64_t min_m;
    totest(FFTVEC_MUL_FUNCTION f, uint64_t min_m) : f(f), min_m(min_m) {}
  };
  std::vector<totest> totestset;
  totestset.emplace_back(cplx_fftvec_mul, 1);
#ifdef __x86_64__
  totestset.emplace_back(cplx_fftvec_mul_fma, 8);
#endif
  for (uint64_t m : {1, 2, 4, 8, 16, 1024, 4096, 8192, 65536}) {
    CPLX_FFTVEC_MUL_PRECOMP* precomp = new_cplx_fftvec_mul_precomp(m);
    for (const totest& t : totestset) {
      if (t.min_m > m) continue;
      CPLX* a = new CPLX[m];
      CPLX* b = new CPLX[m];
      CPLX* r1 = new CPLX[m];
      CPLX* r2 = new CPLX[m];
      int64_t p = 1 << 16;
      for (uint32_t i = 0; i < m; i++) {
        a[i][0] = (rand() % p) - p / 2;  // between -p/2 and p/2
        a[i][1] = (rand() % p) - p / 2;
        b[i][0] = (rand() % p) - p / 2;  // between -p/2 and p/2
        b[i][1] = (rand() % p) - p / 2;
        r2[i][0] = r1[i][0] = (rand() % p) - p / 2;  // between -p/2 and p/2
        r2[i][1] = r1[i][1] = (rand() % p) - p / 2;
      }
      t.f(precomp, r1, a, b);
      cplx_fftvec_mul_ref(precomp, r2, a, b);
      for (uint32_t i = 0; i < m; i++) {
        double dre = fabs(r1[i][0] - r2[i][0]);
        double dim = fabs(r1[i][1] - r2[i][1]);
        ASSERT_LE(dre, 1e-8);
        ASSERT_LE(dim, 1e-8);
      }
      delete[] a;
      delete[] b;
      delete[] r1;
      delete[] r2;
    }
    delete_cplx_fftvec_mul_precomp(precomp);
  }
}

// comparative tests between addmul ref vs. optimized (only relevant dimensions)
TEST(fftvec, cplx_fftvec_addmul_ref_vs_optim) {
  struct totest {
    FFTVEC_ADDMUL_FUNCTION f;
    uint64_t min_m;
    totest(FFTVEC_ADDMUL_FUNCTION f, uint64_t min_m) : f(f), min_m(min_m) {}
  };
  std::vector<totest> totestset;
  totestset.emplace_back(cplx_fftvec_addmul, 1);
#ifdef __x86_64__
  totestset.emplace_back(cplx_fftvec_addmul_fma, 8);
#endif
  for (uint64_t m : {1, 2, 4, 8, 16, 1024, 4096, 8192, 65536}) {
    CPLX_FFTVEC_ADDMUL_PRECOMP* precomp = new_cplx_fftvec_addmul_precomp(m);
    for (const totest& t : totestset) {
      if (t.min_m > m) continue;
      CPLX* a = new CPLX[m];
      CPLX* b = new CPLX[m];
      CPLX* r1 = new CPLX[m];
      CPLX* r2 = new CPLX[m];
      int64_t p = 1 << 16;
      for (uint32_t i = 0; i < m; i++) {
        a[i][0] = (rand() % p) - p / 2;  // between -p/2 and p/2
        a[i][1] = (rand() % p) - p / 2;
        b[i][0] = (rand() % p) - p / 2;  // between -p/2 and p/2
        b[i][1] = (rand() % p) - p / 2;
        r2[i][0] = r1[i][0] = (rand() % p) - p / 2;  // between -p/2 and p/2
        r2[i][1] = r1[i][1] = (rand() % p) - p / 2;
      }
      t.f(precomp, r1, a, b);
      cplx_fftvec_addmul_ref(precomp, r2, a, b);
      for (uint32_t i = 0; i < m; i++) {
        double dre = fabs(r1[i][0] - r2[i][0]);
        double dim = fabs(r1[i][1] - r2[i][1]);
        ASSERT_LE(dre, 1e-8);
        ASSERT_LE(dim, 1e-8);
      }
      delete[] a;
      delete[] b;
      delete[] r1;
      delete[] r2;
    }
    delete_cplx_fftvec_addmul_precomp(precomp);
  }
}