/*
  This file is part of CDO. CDO is a collection of Operators to
  manipulate and analyse Climate model Data.

  Copyright (C) 2003-2020 Uwe Schulzweida, <uwe.schulzweida AT mpimet.mpg.de>
  See COPYING file for copying and redistribution conditions.

  This program is free software; you can redistribute it and/or modify
  it under the terms of the GNU General Public License as published by
  the Free Software Foundation; version 2 of the License.

  This program is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  GNU General Public License for more details.
*/
/* This source code is copied from PINGO version 1.5 */

/* ********************************** */
/* HEADER FOR PARALLEL EIGEN SOLUTION */
/*  -->SEE END OF ROUTINE             */
/* ********************************** */
#ifdef HAVE_CONFIG_H
#include "config.h"
#endif

#include <utility>
#include <cfloat>

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>

#include "cdo_options.h"
#include "process_int.h"
#include "statistic.h"

#ifndef M_2_SQRTPI
#define M_2_SQRTPI 1.12837916709551257390 /* 2/std::sqrt(pi) */
#endif
#ifndef M_SQRT2
#define M_SQRT2 1.41421356237309504880 /* std::sqrt(2) */
#endif

constexpr double FNORM_PRECISION = 1e-12;
constexpr int MAX_JACOBI_ITER = 12;

// global variables to handle environment settings
static double fnorm_precision;
static int max_jacobi_iter;

static size_t n_finished;

namespace cdo
{

static void
heap_sort(Varray<double> &eig_val, Varray2D<double> &a, long n)
{
  long j_next;

  // First part of heap sort:
  for (long i = n / 2 - 1; i >= 0; i--)
    {
      for (long j = i; 2 * j + 1 < n; j = j_next)
        {
          auto k1 = 2 * j + 1;
          auto k2 = 2 * j + 2;
          j_next = j;
          if (eig_val[k1] < eig_val[j_next]) j_next = k1;
          if (k2 < n && eig_val[k2] < eig_val[j_next]) j_next = k2;
          if (j == j_next) break;
          std::swap(eig_val[j], eig_val[j_next]);
          std::swap(a[j], a[j_next]);
        }
    }
  // Second part of head sort:
  for (long i = n - 1; i > 0; i--)
    {
      std::swap(eig_val[0], eig_val[i]);
      std::swap(a[0], a[i]);
      for (long j = 0; 2 * j + 1 < i; j = j_next)
        {
          auto k1 = 2 * j + 1;
          auto k2 = 2 * j + 2;
          j_next = j;
          if (eig_val[k1] < eig_val[j_next]) j_next = k1;
          if (k2 < i && eig_val[k2] < eig_val[j_next]) j_next = k2;
          if (j == j_next) break;
          std::swap(eig_val[j], eig_val[j_next]);
          std::swap(a[j], a[j_next]);
        }
    }
}

static void
make_symmetric_matrix_triangular(Varray2D<double> &a, long n, Varray<double> &d, Varray<double> &e)
{
  long i, j, k;
  double f, g, h, hh, scale;

  for (i = n - 1; i >= 1; i--)
    {
      h = scale = 0;
      if (i > 1)
        {
          for (k = 0; k < i; k++) scale += std::fabs(a[i][k]);
          if (DBL_IS_EQUAL(scale, 0.))
            e[i] = a[i][i - 1];
          else
            {
              for (k = 0; k < i; k++)
                {
                  a[i][k] /= scale;
                  h += a[i][k] * a[i][k];
                }
              f = a[i][i - 1];
              g = f >= 0 ? -sqrt(h) : std::sqrt(h);
              e[i] = scale * g;
              h -= f * g;
              a[i][i - 1] = f - g;
              f = 0;
              for (j = 0; j < i; j++)
                {
                  a[j][i] = a[i][j] / h;
                  g = 0;
                  for (k = 0; k <= j; k++) g += a[j][k] * a[i][k];
                  for (k = j + 1; k < i; k++) g += a[k][j] * a[i][k];
                  e[j] = g / h;
                  f += e[j] * a[i][j];
                }
              hh = f / (2 * h);
              for (j = 0; j < i; j++)
                {
                  f = a[i][j];
                  e[j] = g = e[j] - hh * f;
                  for (k = 0; k <= j; k++) a[j][k] -= f * e[k] + g * a[i][k];
                }
            }
        }
      else
        e[i] = a[i][i - 1];

      d[i] = h;
    }

  d[0] = e[0] = 0;
  for (i = 0; i < n; i++)
    {
      if (std::fabs(d[i]) > 0)
        {
          for (j = 0; j < i; j++)
            {
              g = 0;
              for (k = 0; k < i; k++) g += a[i][k] * a[k][j];
              for (k = 0; k < i; k++) a[k][j] -= g * a[k][i];
            }
        }
      d[i] = a[i][i];
      a[i][i] = 1;
      for (j = 0; j < i; j++) a[j][i] = a[i][j] = 0;
    }
}

static double
pythagoras(double a, double b)
{
  const auto abs_a = std::fabs(a);
  const auto abs_b = std::fabs(b);
  if (abs_a > abs_b)
    {
      auto sqr = abs_b / abs_a;
      sqr *= sqr;
      return abs_a * std::sqrt(1.0 + sqr);
    }
  else if (abs_b > abs_a)
    {
      auto sqr = abs_a / abs_b;
      sqr *= sqr;
      return abs_b * std::sqrt(1.0 + sqr);
    }
  else
    return M_SQRT2 * abs_a;
}

static void
eigen_solution_of_triangular_matrix(Varray<double> &d, Varray<double> &e, long n, Varray2D<double> &a, const char *prompt)
{
  constexpr double eps = 1.e-6;
  constexpr long MAX_ITER = 1000;
  long i, k, l, m, iter;
  double b, c, f, g, p, r, s;

  for (i = 1; i < n; i++) e[i - 1] = e[i];

  e[n - 1] = 0.0;
  for (l = 0; l < n; l++)
    {
      iter = 0;
      while (1)
        {
          for (m = l; m < n - 1; m++)
            if (std::fabs(e[m]) <= eps * (std::fabs(d[m]) + std::fabs(d[m + 1]))) break;
          if (m == l)
            {
              // printf("found solution after %i Iteration\n", iter++);
              break;
            }
          iter++;
          if (iter == MAX_ITER)
            {
              fprintf(stderr, "%s: ERROR! Too many iterations while determining the eigensolution!\n", prompt);
              exit(1);
            }
          g = (d[l + 1] - d[l]) / (2.0 * e[l]);
          r = pythagoras(g, 1);
          g = d[m] - d[l] + e[l] / (g + (std::fabs(g) > 0 ? (g >= 0 ? std::fabs(r) : -std::fabs(r)) : r));
          s = c = 1;
          p = 0;
          for (i = m - 1; i >= l; i--)
            {
              f = s * e[i];
              b = c * e[i];
              e[i + 1] = r = pythagoras(f, g);

              if (DBL_IS_EQUAL(r, 0.0))
                {
                  d[i + 1] -= p;
                  e[m] = 0.0;
                  break;
                }

              s = f / r;
              c = g / r;
              g = d[i + 1] - p;
              r = (d[i] - g) * s + 2.0 * c * b;
              p = s * r;
              d[i + 1] = g + p;
              g = c * r - b;
              for (k = 0; k < n; k++)
                {
                  f = a[k][i + 1];
                  a[k][i + 1] = s * a[k][i] + c * f;
                  a[k][i] = c * a[k][i] - s * f;
                }
            }

          if (DBL_IS_EQUAL(r, 0.0) && i >= l) continue;

          d[l] -= p;
          e[l] = g;
          e[m] = 0.0;
        }
    }
}

void
eigen_solution_of_symmetric_matrix(Varray2D<double> &a, Varray<double> &eig_val, size_t n, const char *prompt)
// After return the rows (!!!) of a are the eigenvectors
{
  {
    std::vector<double> e(n);
    make_symmetric_matrix_triangular(a, n, eig_val, e);
    eigen_solution_of_triangular_matrix(eig_val, e, n, a, prompt);
  }

  for (size_t i = 0; i < n; i++)
    for (size_t j = i + 1; j < n; j++) std::swap(a[i][j], a[j][i]);

  heap_sort(eig_val, a, n);
}
/*
static int
lu_decomposition(double **a, int n, int *index, int *sign)
{
  int i, imax = 0, j, k;
  double big, sum, temp;

  std::vector<double> v(n);
  *sign = 1;
  for (i = 0; i < n; i++)
    {
      big = 0;
      for (j = 0; j < n; j++)
        if ((temp = std::fabs(a[i][j])) > big) big = temp;

      if (DBL_IS_EQUAL(big, 0.)) return 0;

      v[i] = 1 / big;
    }
  for (j = 0; j < n; j++)
    {
      for (i = 0; i < j; i++)
        {
          sum = a[i][j];
          for (k = 0; k < i; k++) sum -= a[i][k] * a[k][j];
          a[i][j] = sum;
        }
      big = 0;
      for (i = j; i < n; i++)
        {
          sum = a[i][j];
          for (k = 0; k < j; k++) sum -= a[i][k] * a[k][j];
          a[i][j] = sum;
          if ((temp = v[i] * std::fabs(sum)) >= big)
            {
              big = temp;
              imax = i;
            }
        }
      if (j != imax)
        {
          for (k = 0; k < n; k++)
            {
              temp = a[imax][k];
              a[imax][k] = a[j][k];
              a[j][k] = temp;
            }
          *sign = -*sign;
          v[imax] = v[j];
        }
      index[j] = imax;

      if (DBL_IS_EQUAL(a[j][j], 0.)) return 0;

      if (j != n)
        {
          temp = 1 / a[j][j];
          for (i = j + 1; i < n; i++) a[i][j] *= temp;
        }
    }

  return 1;
}

static void
lu_backsubstitution(double **a, int n, int *index, double *b)
{
  int i, ii, ip, j;
  double sum;

  ii = 0;
  for (i = 0; i < n; i++)
    {
      ip = index[i];
      sum = b[ip];
      b[ip] = b[i];

      if (ii)
        for (j = ii; j < i; j++) sum -= a[i][j] * b[j];
      else if (std::fabs(sum) > 0)
        ii = i;

      b[i] = sum;
    }
  for (i = n - 1; i >= 0; i--)
    {
      sum = b[i];
      for (j = i + 1; j < n; j++) sum -= a[i][j] * b[j];
      b[i] = sum / a[i][i];
    }
}

static int
solution_of_linear_equation(double **a, double *b, int n)
{
  std::vector<int> index(n);

  int sign;
  int not_singular = lu_decomposition(a, n, index.data(), &sign);

  if (not_singular) lu_backsubstitution(a, n, index.data(), b);

  return not_singular;
}

static int
inverse_of_matrix(double **a, double **b, int n)
{
  int sign;
  int i, j;

  std::vector<int> index(n);
  std::vector<double> col(n);

  int not_singular = lu_decomposition(a, n, index.data(), &sign);

  if (not_singular)
    {
      for (i = 0; i < n; i++)
        {
          for (j = 0; j < n; j++) col[j] = 0;
          col[i] = 1;
          lu_backsubstitution(a, n, index.data(), col.data());
          for (j = 0; j < n; j++) b[j][i] = col[j];
        }
    }

  return not_singular;
}
*/
void
fft(double *restrict real, double *restrict imag, int n, int sign)
{
  // n must be a power of 2
  // sign should be 1 (FT) or -1 (reverse FT)
  int j, j1, j2;
  int bit;

  // Bit reversal part
  for (int i = j = 0; i < n; i++)  // The bit pattern of i and j are reverse
    {
      if (i > j) std::swap(real[i], real[j]);
      if (i > j) std::swap(imag[i], imag[j]);

      for (bit = n >> 1; j & bit; bit >>= 1) j ^= bit;
      j |= bit;
    }

  // Danielson-Lanczos Part
  for (int step = 1; step < n; step <<= 1)
    {
      const auto w_r = std::cos(M_PI / step);
      const auto w_i = std::sin(M_PI / step) * sign;
      double ww_r = 1.0;
      double ww_i = 0.0;
      for (int i = 0; i < step; i++)
        {
          double temp_r, temp_i;
          for (j1 = i, j2 = i + step; j2 < n; j1 += 2 * step, j2 += 2 * step)
            {
              temp_r = ww_r * real[j2] - ww_i * imag[j2];
              temp_i = ww_r * imag[j2] + ww_i * real[j2];
              real[j2] = real[j1] - temp_r;
              imag[j2] = imag[j1] - temp_i;
              real[j1] += temp_r;
              imag[j1] += temp_i;
            }
          temp_r = ww_r;
          ww_r = ww_r * w_r - ww_i * w_i;
          ww_i = temp_r * w_i + ww_i * w_r;
        }
    }

  const auto norm = 1.0 / std::sqrt(n);
  for (int i = 0; i < n; i++) real[i] *= norm;
  for (int i = 0; i < n; i++) imag[i] *= norm;
}

void
ft(double *real, double *imag, int n, int sign)
{
  // sign should be 1 (FT) or -1 (reverse FT)
  static double *work_r = 0, *work_i = 0;
  double temp_r;

  if (!work_r)
    {
      work_r = (double *) malloc(n * sizeof(double));
      /* free_at_exit (work_r); */
    }
  if (!work_i)
    {
      work_i = (double *) malloc(n * sizeof(double));
      /* free_at_exit (work_i); */
    }

  for (int k = 0; k < n; k++)
    {
      const auto w_r = std::cos(2 * M_PI * k / n);
      const auto w_i = std::sin(2 * M_PI * k / n) * sign;
      double ww_r = 1.0;
      double ww_i = 0.0;
      double sum_r = 0.0;
      double sum_i = 0.0;
      for (int j = 0; j < n; j++)
        {
          sum_r += real[j] * ww_r - imag[j] * ww_i;
          sum_i += real[j] * ww_i + imag[j] * ww_r;
          temp_r = ww_r;
          ww_r = ww_r * w_r - ww_i * w_i;
          ww_i = temp_r * w_i + ww_i * w_r;
        }
      work_r[k] = sum_r;
      work_i[k] = sum_i;
    }

  const auto norm = 1. / std::sqrt(n);
  for (int k = 0; k < n; k++) real[k] = work_r[k] * norm;
  for (int k = 0; k < n; k++) imag[k] = work_i[k] * norm;
}

// reentrant version of ft
void
ft_r(double *restrict real, double *restrict imag, int n, int sign, double *restrict work_r, double *restrict work_i)
{
  // sign should be 1 (FT) or -1 (reverse FT)
  double temp_r;

  for (int k = 0; k < n; k++)
    {
      const auto w_r = std::cos(2 * M_PI * k / n);
      const auto w_i = std::sin(2 * M_PI * k / n) * sign;
      double ww_r = 1.0;
      double ww_i = 0.0;
      double sum_r = 0.0;
      double sum_i = 0.0;
      for (int j = 0; j < n; j++)
        {
          sum_r += real[j] * ww_r - imag[j] * ww_i;
          sum_i += real[j] * ww_i + imag[j] * ww_r;
          temp_r = ww_r;
          ww_r = ww_r * w_r - ww_i * w_i;
          ww_i = temp_r * w_i + ww_i * w_r;
        }
      work_r[k] = sum_r;
      work_i[k] = sum_i;
    }

  const auto norm = 1.0 / std::sqrt(n);
  for (int k = 0; k < n; k++) real[k] = work_r[k] * norm;
  for (int k = 0; k < n; k++) imag[k] = work_i[k] * norm;
}

double
lngamma(double x)
{
  static const double cof[6] = { 76.18009172947146,  -86.50532032941677,    24.01409824083091,
                                 -1.231739572450155, 0.1208650973866179e-2, -0.5395239384953e-5 };

  double b, a = b = x;
  double temp = a + 5.5;
  temp -= (a + 0.5) * std::log(temp);
  double ser = 1.000000000190015;
  for (int j = 0; j <= 5; j++) ser += cof[j] / ++b;
  return -temp + std::log(2.5066282746310005 * ser / a);
}

static double
gamma_help_1(double a, double x, const char *prompt)
{
  constexpr double eps = 1.e-20;

  double gln = lngamma(a);
  double ap = a;
  double sum, del = sum = 1.0 / a;

  for (int i = 1; i <= 100; i++)
    {
      ap++;
      del *= x / ap;
      sum += del;
      if (std::fabs(del) < std::fabs(sum) * eps) return sum * std::exp(-x + a * std::log(x) - (gln));
    }

  fprintf(stderr, "%s: ERROR! Too many iterations in routine \"gamma_help_1\"!\n", prompt);
  exit(1);

  return 0;
}

static double
gamma_help_2(double a, double x, const char *prompt)
{
  constexpr double eps = 1.e-20;
  double an, del;
  double const very_small = 1000 * DBL_MIN;

  double gln = lngamma(a);
  double b = x + 1 - a;
  double c = 1 / very_small;
  double d = 1 / b;
  double h = d;

  for (int i = 1; i <= 100; i++)
    {
      an = -i * (i - a);
      b += 2.0;
      d = an * d + b;
      if (std::fabs(d) < very_small) d = very_small;
      c = b + an / c;
      if (std::fabs(c) < very_small) c = very_small;
      d = 1 / d;
      del = d * c;
      h *= del;
      if (std::fabs(del - 1) < eps) return std::exp(-x + a * std::log(x) - gln) * h;
    }

  fprintf(stderr, "%s: ERROR! Too many iterations in routine \"gamma_help_2\"!\n", prompt);
  exit(1);

  return -1;
}

double
incomplete_gamma(double a, double x, const char *prompt)
{
  if (x < 0.0 || a <= 0.0)
    {
      fprintf(stderr, "%s: IMPLEMENTATION ERROR! (Invalid argument in function \"incomplete_gamma\")\n", prompt);
      exit(4);
    }
  if (x < (a + 1.0))
    return gamma_help_1(a, x, prompt);
  else
    return 1.0 - gamma_help_2(a, x, prompt);
}

double
beta(double a, double b, const char *prompt)
{
  if (a <= 0.0 || b <= 0.0)
    {
      fprintf(stderr, "%s: IMPLEMENTATION ERROR! (Invalid argument in function \"beta\")\n", prompt);
      exit(4);
    }

  return std::exp(lngamma(a) + lngamma(b) - lngamma(a + b));
}

static double
beta_help(double a, double b, double x, const char *prompt)
{
  constexpr double very_small = 1000.0 * DBL_MIN;
  constexpr double eps = 3.e-07;
  double aa, del;

  const auto qab = a + b;
  const auto qap = a + 1;
  const auto qam = a - 1;
  double c = 1.0;
  double d = 1.0 - qab * x / qap;
  if (std::fabs(d) < very_small) d = very_small;
  d = 1 / d;
  double h = d;
  for (int m = 1; m <= 100; m++)
    {
      int m2 = 2 * m;
      aa = m * (b - m) * x / ((qam + m2) * (a + m2));
      d = 1 + aa * d;
      if (std::fabs(d) < very_small) d = very_small;
      c = 1.0 + aa / c;
      if (std::fabs(c) < very_small) c = very_small;
      d = 1.0 / d;
      h *= d * c;
      aa = -(a + m) * (qab + m) * x / ((a + m2) * (qap + m2));
      d = 1.0 + aa * d;
      if (std::fabs(d) < very_small) d = very_small;
      c = 1.0 + aa / c;
      if (std::fabs(c) < very_small) c = very_small;
      d = 1.0 / d;
      del = d * c;
      h *= del;
      if (std::fabs(del - 1.0) < eps) return h;
    }

  fprintf(stderr, "%s: ERROR! Too many iterations in routine \"beta_help\"!\n", prompt);
  exit(1);

  return -1;
}

double
incomplete_beta(double a, double b, double x, const char *prompt)
{
  if (a <= 0.0 || b <= 0.0)
    {
      fprintf(stderr, "%s: IMPLEMENTATION ERROR! (Invalid argument in function \"incomplete_beta\")\n", prompt);
      exit(4);
    }

  if (x < 0.0 || x > 1.0)
    {
      fprintf(stderr, "%s: Value out of range (0-1)!\n", prompt);
      exit(4);
    }

  double c = (DBL_IS_EQUAL(x, 0.) || DBL_IS_EQUAL(x, 1.))
                 ? 0.0
                 : std::exp(lngamma(a + b) - lngamma(a) - lngamma(b) + a * std::log(x) + b * std::log(1 - x));

  if (x < (a + 1) / (a + b + 2.0))
    return c * beta_help(a, b, x, prompt) / a;
  else
    return 1.0 - c * beta_help(b, a, 1.0 - x, prompt) / b;
}

double
normal_density(double x)
{
  return M_2_SQRTPI / 2.0 / M_SQRT2 * std::exp(-x * x / 2.0);
}

double
normal(double x, const char *prompt)
{
  return x > 0.0 ? 0.5 * (1.0 + incomplete_gamma(0.5, x * x / 2.0, prompt))
               : x < 0.0 ? 0.5 * (1 - incomplete_gamma(0.5, x * x / 2.0, prompt)) : 0.5;
}

double
normal_inv(double p, const char *prompt)
{
  constexpr double eps = 1.e-10;
  static double last_p = 0.5, last_x = 0.0;
  double x, xx;

  if (p <= 0.0 || p >= 1.0)
    {
      fprintf(stderr, "%s: IMPLEMENTATION ERROR! (Invalid argument in function \"normal_inv\")\n", prompt);
      exit(4);
    }

  if (DBL_IS_EQUAL(p, last_p)) return last_x;
  if (DBL_IS_EQUAL(p, 0.5))
    return 0;
  else if (p < 0.5)
    return -normal_inv(1 - p, prompt);
  else
    {
      x = 0.0;
      while (true)
        {
          xx = x - (normal(x, prompt) - p) / normal_density(x);
          if (std::fabs(xx - x) < x * eps) break;
          x = xx;
        }
      last_p = p;
      last_x = x;
      return x;
    }
}

double
student_t_density(double n, double x, const char *prompt)
{
  if (n <= 0.0)
    {
      fprintf(stderr, "%s: IMPLEMENTATION ERROR! (Invalid argument in function \"student_t_density\")\n", prompt);
      exit(4);
    }

  return std::exp(lngamma((n + 1) / 2) - lngamma(n / 2)) / std::sqrt(n / 2) * std::pow((1 + x * x / n), -(n + 1) / 2);
}

double
student_t(double n, double x, const char *prompt)
{
  if (n <= 0.0)
    {
      fprintf(stderr, "%s: IMPLEMENTATION ERROR! (Invalid argument in function \"student_t\")\n", prompt);
      exit(4);
    }

  if (x > 0)
    return 1.0 - 0.5 * incomplete_beta(n / 2.0, 0.5, n / (n + x * x), prompt);
  else if (x < 0)
    return 0.5 * incomplete_beta(n / 2.0, 0.5, n / (n + x * x), prompt);
  else
    return 0.5;
}

double
student_t_inv(double n, double p, const char *prompt)
{
  constexpr double eps = 1.e-10;
  static double last_n = 1, last_p = 0.5, last_x = 0.0;
  double x, xx;

  if (n <= 0.0 || p <= 0.0 || p >= 1.0)
    {
      fprintf(stderr, "%s: IMPLEMENTATION ERROR! (Invalid argument in function \"student_t_inv\")\n", prompt);
      exit(4);
    }

  if (DBL_IS_EQUAL(n, last_n) && DBL_IS_EQUAL(p, last_p)) return last_x;
  if (DBL_IS_EQUAL(p, 0.5))
    return 0.0;
  else if (p < 0.5)
    return -student_t_inv(n, 1.0 - p, prompt);
  else
    {
      x = 0.0;
      while (true)
        {
          xx = x - (student_t(n, x, prompt) - p) / student_t_density(n, x, prompt);
          if (std::fabs(xx - x) < x * eps) break;
          x = xx;
        }
      last_n = n;
      last_p = p;
      last_x = x;
      return x;
    }
}

double
chi_square_density(double n, double x, const char *prompt)
{
  if (n <= 0.0)
    {
      fprintf(stderr, "%s: IMPLEMENTATION ERROR! (Invalid argument in function \"chi_square_density\")\n", prompt);
      exit(4);
    }

  return x <= 0 ? 0 : std::pow(2, -n / 2) * std::pow(x, n / 2 - 1) * std::exp(-x / 2 - lngamma(n / 2));
}

double
chi_square(double n, double x, const char *prompt)
{
  if (n <= 0.0)
    {
      fprintf(stderr, "%s: IMPLEMENTATION ERROR! (Invalid argument in function \"chi_square\")\n", prompt);
      exit(4);
    }

  return x <= 0.0 ? 0.0 : incomplete_gamma(n / 2.0, x / 2.0, prompt);
}

double
chi_square_inv(double n, double p, const char *prompt)
{
  constexpr double eps = 1.e-10;
  static double last_n = -1.0, last_p = -1.0, last_x = -1.0;
  static double last_last_n = -1.0, last_last_p = -1.0, last_last_x = -1.0;
  double x, xx;

  if (n <= 0.0 || p <= 0.0 || p >= 1.0)
    {
      fprintf(stderr, "%s: IMPLEMENTATION ERROR! (Invalid argument in function \"chi_square_inv\")\n", prompt);
      exit(4);
    }

  if (DBL_IS_EQUAL(n, last_n) && DBL_IS_EQUAL(p, last_p)) return last_x;

  if (DBL_IS_EQUAL(n, last_last_n) && DBL_IS_EQUAL(p, last_last_p)) return last_last_x;

  x = n;
  while (true)
    {
      xx = x - (chi_square(n, x, prompt) - p) / chi_square_density(n, x, prompt);
      if (std::fabs(xx - x) < x * eps) break;
      if (xx < 0)
        x /= 2.0;
      else
        x = xx;
    }

  last_last_n = last_n;
  last_last_p = last_p;
  last_last_x = last_x;
  last_n = n;
  last_p = p;
  last_x = x;

  return x;
}

void
chi_square_constants(double n, double p, double *c1, double *c2, const char *prompt)
{
  constexpr double eps = 1.e-10;
  static double last_n, last_p, last_c1, last_c2;

  if (n <= 0.0 || p <= 0.0 || p >= 1.0)
    {
      fprintf(stderr, "%s: IMPLEMENTATION ERROR! (Invalid argument in function \"chi_square_constants\")\n", prompt);
      exit(4);
    }

  if (DBL_IS_EQUAL(n, last_n) && DBL_IS_EQUAL(p, last_p))
    {
      *c1 = last_c1;
      *c2 = last_c2;
      return;
    }

  *c1 = n;
  *c2 = n;

  while (true)
    {
      const auto a11 = -chi_square_density(n, *c1, prompt);
      const auto a12 = chi_square_density(n, *c2, prompt);
      const auto a21 = -chi_square_density(n + 2, *c1, prompt);
      const auto a22 = chi_square_density(n + 2, *c2, prompt);
      const auto b1 = p + chi_square(n, *c1, prompt) - chi_square(n, *c2, prompt);
      const auto b2 = p + chi_square(n + 2, *c1, prompt) - chi_square(n + 2, *c2, prompt);
      /* Solve ((a11,a12),(a21,a22))*(delta_c1,delta_c2)==(b1,b2) */
      const auto det = a11 * a22 - a12 * a21;
      const auto delta_c1 = (b1 * a22 - b2 * a12) / det;
      const auto delta_c2 = (b2 * a11 - b1 * a21) / det;
      if (std::fabs(delta_c1) < *c1 * eps && std::fabs(delta_c2) < *c2 * eps) break;
      if (*c1 + delta_c1 >= n)
        *c1 = (n + *c1) / 2.0;
      else if (*c1 + delta_c1 <= 0)
        *c1 /= 2.0;
      else
        *c1 += delta_c1;
      if (*c2 + delta_c2 <= n)
        *c2 = (n + *c2) / 2.0;
      else
        *c2 += delta_c2;
    }

  last_n = n;
  last_p = p;
  last_c1 = *c1;
  last_c2 = *c2;
}

double
beta_distr_density(double a, double b, double x, const char *prompt)
{
  if (a <= 0.0 || b <= 0.0)
    {
      fprintf(stderr, "%s: IMPLEMENTATION ERROR! (Invalid argument in function \"beta_distr_density\")\n", prompt);
      exit(4);
    }

  return x <= 0 ? 0 : x >= 1 ? 1 : std::pow(x, a - 1) * std::pow(1 - x, b - 1) / beta(a, b, prompt);
}

double
beta_distr(double a, double b, double x, const char *prompt)
{
  return incomplete_beta(a, b, x, prompt);
}

double
beta_distr_inv(double a, double b, double p, const char *prompt)
{
  constexpr double eps = 1.e-10;
  static double last_a = -1.0, last_b, last_p = -1.0, last_x = -1.0;
  static double last_last_a = -1.0, last_last_b = -1.0, last_last_p = -1.0, last_last_x = -1.0;
  double xx, x;

  if (a <= 0.0 || b <= 0.0 || p <= 0.0 || p >= 1.0)
    {
      fprintf(stderr, "%s: IMPLEMENTATION ERROR! (Invalid argument in function \"beta_distr_inv\")\n", prompt);
      exit(4);
    }

  if (DBL_IS_EQUAL(a, last_a) && DBL_IS_EQUAL(b, last_b) && DBL_IS_EQUAL(p, last_p)) return last_x;

  if (DBL_IS_EQUAL(a, last_last_a) && DBL_IS_EQUAL(b, last_last_b) && DBL_IS_EQUAL(p, last_last_p)) return last_last_x;

  x = a / (a + b);
  while (true)
    {
      xx = x - (beta_distr(a, b, x, prompt) - p) / beta_distr_density(a, b, x, prompt);
      if (std::fabs(xx - x) < x * eps) break;
      if (xx <= 0.0)
        x /= 2;
      else if (xx >= 1.0)
        x = (1.0 + x) / 2.0;
      else
        x = xx;
    }
#if 0
  for (x_l = 0, x_r = 1; fabs (x_l - x_r) > eps;
       x = (x_l+x_r) / 2.0, beta_distr (a, b, x, prompt) < p ? (x_l=x):(x_r=x));
#endif

  last_last_a = last_a;
  last_last_b = last_b;
  last_last_p = last_p;
  last_last_x = last_x;
  last_a = a;
  last_b = b;
  last_p = p;
  last_x = x;

  return x;
}

void
beta_distr_constants(double a, double b, double p, double *c1, double *c2, const char *prompt)
{
  constexpr double eps = 1.e-10;
  static double last_a, last_b, last_p, last_c1, last_c2;

  if (a <= 0.0 || b <= 0.0 || p <= 0.0 || p >= 1.0)
    {
      fprintf(stderr, "%s: IMPLEMENTATION ERROR! (Invalid argument in function \"beta_distr_constants\")\n", prompt);
      exit(4);
    }

  if (DBL_IS_EQUAL(a, last_a) && DBL_IS_EQUAL(b, last_b) && DBL_IS_EQUAL(p, last_p))
    {
      *c1 = last_c1;
      *c2 = last_c2;
      return;
    }

#if 0
  *c1 = a / (a + b);
  *c2 = a / (a + b);
#endif
  *c1 = beta_distr_inv(a, b, p / 2.0, prompt);
  *c2 = beta_distr_inv(a, b, 1.0 - p / 2.0, prompt);

  while (true)
    {
      const auto a11 = -beta_distr_density(a, b, *c1, prompt);
      const auto a12 = beta_distr_density(a, b, *c2, prompt);
      const auto a21 = -beta_distr_density(a + 1, b, *c1, prompt);
      const auto a22 = beta_distr_density(a + 1, b, *c2, prompt);
      const auto b1 = p + beta_distr(a, b, *c1, prompt) - beta_distr(a, b, *c2, prompt);
      const auto b2 = p + beta_distr(a + 1, b, *c1, prompt) - beta_distr(a + 1, b, *c2, prompt);
      // Solve ((a11,a12),(a21,a22))*(delta_c1,delta_c2)==(b1,b2)
      const auto det = a11 * a22 - a12 * a21;
      const auto delta_c1 = (b1 * a22 - b2 * a12) / det;
      const auto delta_c2 = (b2 * a11 - b1 * a21) / det;
      if (std::fabs(delta_c1) < *c1 * eps && std::fabs(delta_c2) < *c2 * eps) break;
      if (*c1 + delta_c1 >= a / (a + b))
        *c1 = (a / (a + b) + *c1) / 2.0;
      else if (*c1 + delta_c1 <= 0)
        *c1 /= 2.0;
      else
        *c1 += delta_c1;
      if (*c2 + delta_c2 >= 1.0)
        *c2 = (1.0 + *c2) / 2.0;
      else if (*c2 + delta_c2 <= a / (a + b))
        *c2 = (a / (a + b) + *c2) / 2.0;
      else
        *c2 += delta_c2;
    }

  last_a = a;
  last_b = b;
  last_p = p;
  last_c1 = *c1;
  last_c2 = *c2;
}

double
fisher(double m, double n, double x, const char *prompt)
{
  if (m <= 0.0 || n <= 0.0)
    {
      fprintf(stderr, "%s: IMPLEMENTATION ERROR! (Invalid argument in function \"fisher\")\n", prompt);
      exit(4);
    }

  return incomplete_beta(m / 2.0, n / 2.0, n / (n + m * x), prompt);
}

/* ******************************************************************************** */
/* This routine rotates columns/rows i and j of a symmetric Matrix M in a fashion,  */
/* thus that the dot product of columns i and j 0 afterwards                        */
/*                                                                                  */
/* As this is done by a right-multiplication with a rotation matrix, which only     */
/* changes columns i and j, this can be carried out for n/2 pairs of columns at     */
/* the same time.                                                                   */
/* ******************************************************************************** */
static void
annihilate_1side(Varray2D<double> &M, size_t i, size_t j, size_t n)
{
  i--;
  j--;

  if (j < i) std::swap(i, j);

  auto &Mi = M[i];
  auto &Mj = M[j];

  double alpha = 0.0, beta = 0.0, gamma = 0.0;
#ifdef HAVE_OPENMP4
#pragma omp simd reduction(+ : alpha) reduction(+ : beta) reduction(+ : gamma)
#endif
  for (size_t r = 0; r < n; r++)
    {
      alpha += Mj[r] * Mj[r];
      beta += Mi[r] * Mi[r];
      gamma += Mi[r] * Mj[r];
    }

  // 2011-08-15 Cedrick Ansorge: bug fix
  //  tmp = std::fabs(gamma/sqrt(alpha/beta));
  const auto tmp = std::fabs(gamma / std::sqrt(alpha * beta));

  if (tmp < fnorm_precision)
    {
#ifdef _OPENMP
#pragma omp atomic
#endif
      n_finished++;

      return;
    }

  const auto zeta = (beta - alpha) / (2.0 * gamma);  // tan(2*theta)
  auto tk = 1.0 / (std::fabs(zeta) + std::sqrt(1. + zeta * zeta));
  tk = zeta > 0 ? tk : -tk;                      // = cot(2*theta)
  const auto ck = 1.0 / std::sqrt(1. + tk * tk);  // = cos(theta)
  const auto sk = ck * tk;                       // = sin(theta)

  // calculate a_i,j - tilde
  for (size_t r = 0; r < n; r++)
    {
      const auto mi = Mi[r];
      const auto mj = Mj[r];
      Mi[r] = ck * mi + sk * mj;
      Mj[r] = -sk * mi + ck * mj;
    }
}

static int
jacobi_1side(Varray2D<double> &M, Varray<double> &A, size_t n)
{
  Varray2D<int> annihilations(2);
  annihilations[0].resize(n * n);
  annihilations[1].resize(n * n);

  size_t count = 0;
  for (size_t k = 1; k < n + 1; k++)
    {
      if (k < n)
        {
          {
            const auto nmax = (size_t) std::ceil(1. / 2. * (n - k));
            for (size_t i = 1; i <= nmax; i++)
              {
                const auto j = n - k + 2 - i;
                annihilations[0][count] = i;
                annihilations[1][count] = j;
                count++;
              }
          }
          if (k > 2)
            {
              const auto nmax = n - (size_t) std::floor(1. / 2. * k);
              for (size_t i = n - k + 2; i <= nmax; i++)
                {
                  const auto j = 2 * n - k + 2 - i;
                  annihilations[0][count] = i;
                  annihilations[1][count] = j;
                  count++;
                }
            }
        }
      else if (k == n)
        {
          const auto nmax = (size_t) std::ceil(1. / 2. * n);
          for (size_t i = 2; i <= nmax; i++)
            {
              const auto j = n + 2 - i;
              annihilations[0][count] = i;
              annihilations[1][count] = j;
              count++;
            }
        }
    }

  //  fprintf(stderr, "%d annihilations per sweep\n", count);

  n_finished = 0;

  //  override global openmp settings works
  //  omp_set_num_threads(2);

  int n_iter = 0;
  while (n_iter < max_jacobi_iter && n_finished < count)
    {
      n_finished = 0;
      if (n % 2 == 1)
        {
          for (size_t m = 0; m < n; m++)
            {
#ifdef _OPENMP
#pragma omp parallel for shared(M, annihilations, n)
#endif
              for (size_t i = 0; i < n / 2; i++)
                {
                  const auto idx = m * (n / 2) + i;
                  const auto i_ann = annihilations[0][idx];
                  const auto j_ann = annihilations[1][idx];
                  if (i_ann != j_ann && i_ann && j_ann) annihilate_1side(M, i_ann, j_ann, n);
                }
            }
        }
      else
        {  // n%2 == 0
          for (size_t m = 0; m < n; m++)
            {
#ifdef _OPENMP
#pragma omp parallel for shared(M, annihilations, n)
#endif
              for (size_t i = 0; i < n / 2 - (m % 2); i++)
                {
                  auto idx = m / 2 * (n / 2 + n / 2 - 1);
                  if (m % 2) idx += n / 2;
                  const auto i_ann = annihilations[0][idx + i];
                  const auto j_ann = annihilations[1][idx + i];
                  if (i_ann && j_ann && i_ann != j_ann) annihilate_1side(M, i_ann, j_ann, n);
                }
            }
        }
      n_iter++;
    }

  if (Options::cdoVerbose) cdoPrint("Finished one-sided jacobi scheme for eigenvalue computation after %i iterations", n_iter);

  //  fprintf(stderr,"finished after %i sweeps (n_finished %i)\n",n_iter,n_finished);

  if (n_iter == max_jacobi_iter && n_finished < count)
    {
      fprintf(stderr,
              "statistics-module (Warning): Eigenvalue computation with one-sided jacobi scheme\n"
              "                             did not converge properly. %ld of %ld pairs of columns did\n"
              "                             not achieve requested orthogonality of %10.6g\n",
              count - n_finished, count, fnorm_precision);

      if (n_finished == 0)
        {
          //	Do not overwrite results in case of insufficient convergence
          cdoWarning("Setting Matrix and Eigenvalues to 0 before return");
          for (size_t i = 0; i < n; i++)
            {
              memset(M[i].data(), 0, n * sizeof(double));
              memset(A.data(), 0, n * sizeof(double));
            }
          return -1;
        }
    }
  // calculate  eigen values as std::sqrt(||m_i||)
  for (size_t i = 0; i < n; i++)
    {
      A[i] = 0.0;
      for (size_t r = 0; r < n; r++) A[i] += M[i][r] * M[i][r];
      A[i] = std::sqrt(A[i]);
      for (size_t r = 0; r < n; r++) M[i][r] /= A[i];
    }

  heap_sort(A, M, n);

  return n_iter;
}

/* ******************************************************************************** */
/*                                                                                  */
/*   P A R A L L E L   S O L U T I O N   O F   T H E   E I G E N   P R O B L E M    */
/*                     WITH ONE SIDED JACOBI ALGORITHM                              */
/*                                                                                  */
/* ******************************************************************************** */

void
parallel_eigen_solution_of_symmetric_matrix(Varray2D<double> &M, Varray<double> &A, size_t n, const char func[])
{
  (void) (func);  // CDO_UNUSED

  char *envstr;
  /* Get Environment variables if set */
  envstr = getenv("MAX_JACOBI_ITER");
  max_jacobi_iter = envstr ? atoi(envstr) : MAX_JACOBI_ITER;
  if (Options::cdoVerbose) cdoPrint("Using MAX_JACOBI_ITER %i from %s", max_jacobi_iter, envstr ? "Environment" : "default");

  envstr = getenv("FNORM_PRECISION");
  fnorm_precision = envstr ? strtod(envstr, nullptr) : FNORM_PRECISION;
  if (Options::cdoVerbose) cdoPrint("Using FNORM_PRECISION %g from %s", fnorm_precision, envstr ? "Environment" : "default");

  // eigen_solution_of_symmetric_matrix(M, A, n, func);
  jacobi_1side(M, A, n);

  return;
}

}  // namespace cdo
