#include "Python.h"
#include "Numeric/arrayobject.h"

#include <math.h>
#include <limits.h>
#include <malloc.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <float.h>
#include <time.h>
// #include <ieeefp.h>

#define READ_BUFFER_SIZE 100000
#define INFO_INTERVAL 30
#define pi 3.14159265358979

int n_input_dim;                               // input dimensionality
int n_classes;                                 // number of classes

int n_parzen_points;                           // total number of Parzen centers
double ** parzen_points = NULL;                // Parzen centers (input-space coordinates)

// Slot indices into the per-point vector table filled by transform_points():
//   PROJECTED_POINTS            output coordinates (n_output_dim values)
//   PARTIALLY_PROJECTED_POINTS  inputs multiplied by matrix A (n_input_dim values)
//   RBF_ACTIVATIONS             left NULL by transform_points()
#define PROJECTED_POINTS 2
#define PARTIALLY_PROJECTED_POINTS 1
#define RBF_ACTIVATIONS 0
double *** projected_parzen_points = NULL;     // Transformed Parzen centers, indexed [point][slot][coordinate]
double ** parzen_classes = NULL;               // classification vector of each Parzen point: (n_points,n_classes), sum=1 over classes
double pdf_sigma;                              // width of Gaussian centers

double * empirical_class_probabilities = NULL; // Number of points in class divided by total n. of points
double empirical_class_norm;                   // sum(empirical_class_probabilities.^2)

int n_output_dim;                              // output dimensionality
double ** W = NULL;                            // projection matrix for output layer: n_input_dim by n_output_dim
double ** A = NULL;                            // Fixed linear transformation before linear layer (usually, scaling + LDA rotation) (n_input_dim by n_input_dim)
int n_parameters;                              // total number of parameters

#define DCA_ALGORITHM 0
int algorithm_used;                            // 0=DCA algorithm

// When nonzero, W is not optimized directly: it is rebuilt from initial_W,
// W_lengths and W_angles by construct_projection_from_angles().
int use_angle_reparameterization;
double ** initial_W = NULL;                    // starting projection (n_input_dim by n_output_dim) that the rotations are applied to
double * W_angles = NULL;                      // rotation angles, n_output_dim*(n_input_dim-n_output_dim) entries
double * W_lengths = NULL;                     // per-output-column scale applied to initial_W (n_output_dim entries)












/*
  Transform n_x points through the fixed matrix A and then the projection W.

  x      : n_x input points, each with n_input_dim coordinates.
  result : table to fill, indexed [point][slot][coordinate].  When NULL, a
           new table for n_x points is allocated here (allocation results
           are not checked) and ownership passes to the caller.

  For every point the PARTIALLY_PROJECTED_POINTS slot receives the row
  vector x[i] multiplied by A, and the PROJECTED_POINTS slot receives that
  intermediate vector multiplied by W.  The RBF_ACTIVATIONS slot of a newly
  allocated table is set to NULL and is never touched afterwards.

  Returns the (possibly newly allocated) table.
 */
double *** transform_points(int n_x, double ** x, double ***result)
{
  int i, k, l;
  double temp;
  double * temp_x;

  if (!result)  
  {
    // Allocate return array
    result = (double ***)malloc(n_x*sizeof(double **));
    for (i = n_x-1; i >= 0; i--)
    {
      result[i] = (double **)malloc(3*sizeof(double *));
      result[i][RBF_ACTIVATIONS] = NULL;   // used to be RBF kernel activations, not used now
      result[i][PARTIALLY_PROJECTED_POINTS] = (double *)malloc(n_input_dim*sizeof(double)); // Inputs, multiplied by matrix A
      result[i][PROJECTED_POINTS] = (double *)malloc(n_output_dim*sizeof(double)); // Output coordinates
    }
  }

  for (i = n_x-1; i >= 0; i--)
  {
    // Partially transformed points: x[i] * A
    temp_x = result[i][PARTIALLY_PROJECTED_POINTS];
    for (k = n_input_dim - 1; k >= 0; k--)
    {
      temp = 0;
      for (l = n_input_dim-1; l >= 0; l--)
        temp += A[l][k]*x[i][l];
      temp_x[k] = temp;
    }

    // Output layer: (x[i] * A) * W
    for (k = n_output_dim-1; k >= 0; k--)
    {
      temp = 0;
      for (l = n_input_dim-1; l >= 0; l--)
        temp += W[l][k]*temp_x[l];
      result[i][PROJECTED_POINTS][k] = temp;
    }
  }

  return(result);
}


/*
  Convert a gradient with respect to the entries of W into a gradient with
  respect to the rotation angles W_angles.

  W_gradient : n_input_dim by n_output_dim matrix of derivatives w.r.t. W.
  result     : output, n_output_dim*(n_input_dim-n_output_dim) entries;
               result[k] is the sum over all (i,j) of
               W_gradient[i][j] * dW[i][j]/dangle[k].

  For each angle i the derivative matrix is built the same way
  construct_projection_from_angles() builds W, except that the rotation
  belonging to angle i is replaced by its derivative with respect to that
  angle.

  NOTE(review): W_derivatives is allocated once, on the first call, using
  the dimensions current at that time; it is not resized if n_input_dim or
  n_output_dim change later.
 */
void calculate_rotation_gradient
(
  double ** W_gradient, double *result
)
{
  int i, j, k, l;
  int d1, d2;
  static double *** W_derivatives = NULL;
  double cosine, sine;
  double temp1, temp2;
  int n_W_input_dim, n_W_output_dim;

  n_W_input_dim = n_input_dim;
  n_W_output_dim = n_output_dim;

  // Calculate derivatives of W with respect to the angles
  // (one n_W_input_dim by n_W_output_dim matrix per angle, allocated on first use)
  if (!W_derivatives)
  {
    W_derivatives = (double ***)malloc(n_W_output_dim*(n_W_input_dim-n_W_output_dim)*sizeof(double **));
    for (i = n_W_output_dim*(n_W_input_dim-n_W_output_dim) - 1; i >= 0; i--)
    {
      W_derivatives[i] = (double **)malloc(n_W_input_dim*sizeof(double *));
      for (j = n_W_input_dim-1; j >= 0; j--)
        W_derivatives[i][j] = (double *)malloc(n_W_output_dim*sizeof(double));
    }
  }

  for (i = n_W_output_dim*(n_W_input_dim-n_W_output_dim) - 1; i >= 0; i--)
  {
    // Start with the original orthonormal matrix, and scale by vector lengths
    for (j = n_W_output_dim-1; j >= 0; j--)
      for (k = n_W_input_dim-1; k >= 0; k--)
         W_derivatives[i][k][j] = initial_W[k][j]*W_lengths[j];

    // Rotate along planes from current+1 to maximum
    for (j = n_W_output_dim*(n_W_input_dim-n_W_output_dim) - 1; j > i; j--)
    {
      // Angle j rotates rows d1 and d2 of the matrix
      d1 = j/(n_W_input_dim-n_W_output_dim); // 0 <= d1 < n_W_output_dim
      d2 = (j % (n_W_input_dim-n_W_output_dim)) + n_W_output_dim;
      cosine = cos(W_angles[j]);
      sine = sin(W_angles[j]);
      
      for (k = n_W_output_dim-1; k >= 0; k--)
      {
        temp1 = cosine*W_derivatives[i][d1][k] +   sine*W_derivatives[i][d2][k];
        temp2 =  -sine*W_derivatives[i][d1][k] + cosine*W_derivatives[i][d2][k];
        W_derivatives[i][d1][k] = temp1;
        W_derivatives[i][d2][k] = temp2;
      }
    }

    // Apply the derived transformation: the rotation for angle i is replaced
    // by its derivative w.r.t. the angle, so rows d1 and d2 get the
    // differentiated combination and every other row becomes zero.
    d1 = i/(n_W_input_dim-n_W_output_dim);
    d2 = (i % (n_W_input_dim-n_W_output_dim)) + n_W_output_dim;
    cosine = cos(W_angles[i]);
    sine = sin(W_angles[i]);
    for (k = n_W_output_dim-1; k >= 0; k--)
    {
      // Both values are computed before the column is cleared
      temp1 =  -   sine*W_derivatives[i][d1][k] + cosine*W_derivatives[i][d2][k];
      temp2 =  - cosine*W_derivatives[i][d1][k] -   sine*W_derivatives[i][d2][k];
      for (l = n_W_input_dim-1; l >= 0; l--)
        W_derivatives[i][l][k] = 0;
      W_derivatives[i][d1][k] = temp1;
      W_derivatives[i][d2][k] = temp2;      
    }

    // Rotate along the rest of the dimensions
    for (j = i-1; j >= 0; j--)
    {
      d1 = j/(n_W_input_dim-n_W_output_dim);
      d2 = (j % (n_W_input_dim-n_W_output_dim)) + n_W_output_dim;
      cosine = cos(W_angles[j]);
      sine = sin(W_angles[j]);
      
      for (k = n_W_output_dim-1; k >= 0; k--)
      {
        temp1 = cosine*W_derivatives[i][d1][k] +   sine*W_derivatives[i][d2][k];
        temp2 =  -sine*W_derivatives[i][d1][k] + cosine*W_derivatives[i][d2][k];
        W_derivatives[i][d1][k] = temp1;
        W_derivatives[i][d2][k] = temp2;
      }
    }    
  }

  // Calculate the angle gradients, using the above matrices and the W gradient
  for (k = n_W_output_dim*(n_W_input_dim-n_W_output_dim) - 1; k >= 0; k--)
  {
    // temp1 tracks the largest absolute derivative entry; it is only
    // consumed by the disabled debug print below.
    temp1 = 0;   

    result[k] = 0;
    for (i = n_W_input_dim-1; i >= 0; i--)
      for (j = n_W_output_dim-1; j >= 0; j--)
      {
  	temp2 = fabs(W_derivatives[k][i][j]); 
        if (temp2 > temp1) temp1 = temp2;
        result[k] += W_gradient[i][j]*W_derivatives[k][i][j];
      }
    // printf("Angle[%d]=%e, gradient=%e, maxderiv=%e\n", k, W_angles[k], result[k], temp1);
  }
}


void construct_projection_from_angles()
{
  int i, j, k, l;
  int d1, d2;
  double cosine, sine;
  double temp1, temp2;
  int n_W_input_dim, n_W_output_dim;

  n_W_output_dim = n_output_dim;
  n_W_input_dim = n_input_dim;

  // printf("1\n");

  // Start with the original orthonormal matrix, and scale by vector lengths
  for (j = n_W_output_dim-1; j >= 0; j--)
    for (k = n_W_input_dim-1; k >= 0; k--) 
    {
      // printf("1a\n");
      temp1 = initial_W[k][j];
      // printf("1b, temp1=%e\n", temp1);
      temp2 = W_lengths[j];
      // printf("1b2, temp2=%e\n", temp2);
      temp1 *= temp2;
      // printf("1c\n");
      W[k][j] = temp1; // initial_W[k][j]*W_lengths[j];
    }

  // printf("2\n");

  // Rotate along planes
  for (j = n_W_output_dim*(n_W_input_dim-n_W_output_dim) - 1; j >= 0; j--)
  {
    // printf("3, j=%d\n", j);
    d1 = j/(n_W_input_dim-n_W_output_dim); // 0 <= d1 < n_W_output_dim
    d2 = (j % (n_W_input_dim-n_W_output_dim)) + n_W_output_dim;

    // printf("angle=%e\n", W_angles[j]);
    cosine = cos(W_angles[j]);
    sine = sin(W_angles[j]);

    // printf("3b\n");
    for (k = n_W_output_dim-1; k >= 0; k--)
    {
      // printf("3b1\n");
      temp1 = cosine*W[d1][k] +   sine*W[d2][k];
      temp2 =  -sine*W[d1][k] + cosine*W[d2][k];
      // printf("3b2\n");
      W[d1][k] = temp1;
      W[d2][k] = temp2;
      /*
      if ((d1 == 1) || (d2 == 1))
      {
        printf("Projected dimensions %d, %d: W[%d][%d]=%e W[%d][%d]=%e\n", 
	       d1, d2, d1, k, W[d1][k], d2, k, W[d2][k]);
      }
      */
    }
    // printf("4\n");
  }  
  // printf("Projection constructed.\n");
}


/*
  Leave-one-out log conditional probability of class x_class at Parzen
  point x_index, evaluated in the projected (output) space.

  Uses projected_parzen_points, so transform_points() must have been run
  for the current W.  Every other Parzen point i contributes a Gaussian
  kernel exp(-|y_x - y_i|^2 / (2*pdf_sigma^2)); the result is

      log( sum_i kernel_i * parzen_classes[i][x_class] ) - log( sum_i kernel_i )

  computed with the largest exponent factored out of each sum to avoid
  underflow.

  The scratch buffer is kept between calls and is re-allocated whenever
  n_parzen_points exceeds its current capacity (the allocation result is
  not checked).
 */
double parzen_conditional_log_probability(int x_index, int x_class)
{
  double scaling, scaling_xclass;
  static double * tempprobs = NULL;
  static int tempprobs_capacity = 0;
  double * temp_x;
  double * temp_m;

  double temp, temp2;
  double prob1, prob2;
  int i, j;
   
  // Grow the scratch buffer if the data set became larger since it was allocated
  if (!tempprobs || (tempprobs_capacity < n_parzen_points))
  {
    free(tempprobs);
    tempprobs = (double *)malloc(n_parzen_points*sizeof(double));
    tempprobs_capacity = n_parzen_points;
  }

  temp_x = projected_parzen_points[x_index][PROJECTED_POINTS];

  // Find scalings: the largest exponent overall and among points with
  // nonzero membership in x_class
  scaling = -DBL_MAX;
  scaling_xclass = -DBL_MAX;
  for (i = n_parzen_points-1; i >= 0; i--)
  {
    if (i != x_index) // Sort of implicit 'leave-one out validation'
    {
      temp_m = projected_parzen_points[i][PROJECTED_POINTS];

      temp = 0;
      for (j = n_output_dim-1; j >= 0; j--)
      {
        temp2 = temp_x[j]-temp_m[j];
        temp += temp2*temp2;
      }
      tempprobs[i] = -temp/(2*pdf_sigma*pdf_sigma);
      if (tempprobs[i] > scaling) scaling = tempprobs[i];
      if ((parzen_classes[i][x_class]) && (tempprobs[i] > scaling_xclass)) 
	scaling_xclass = tempprobs[i];
    }
  }

  // Calculate conditional probability
  prob1 = 0;   // class-independent sum
  prob2 = 0;   // sum weighted by membership in x_class
  for (i = n_parzen_points-1; i >= 0; i--)
  {
    if (i != x_index)
    {
      temp = exp(tempprobs[i] - scaling);
      prob1 += temp;
      if (parzen_classes[i][x_class])
      {
	temp = exp(tempprobs[i] - scaling_xclass)*parzen_classes[i][x_class];
	prob2 += temp;
      }
    }
  }

  temp = log(prob2) + scaling_xclass - log(prob1) - scaling;
  return(temp);
}




void calculate_dca_gradient
(
  int x_index, int x_class, 
  double **W_result
)
{
  int i, j, k, l;
  int layer;

  static double * scalings = NULL;
  double allscaling;
  double denominator1, denominator2;
  double * temp_vector1 = NULL, * temp_vector2 = NULL, * temp_vector3 = NULL, * temp_vector4 = NULL;
  static double * tempprobs = NULL;
  static double *** parzen_gradients = NULL;
  double temp, temp2, temp3;
  static double time1 = 0, time2 = 0, time3 = 0, time4 = 0;
  clock_t temp_clock1, temp_clock2;

  if (!tempprobs)
  {
    scalings = (double *)malloc(n_classes*sizeof(double));
    tempprobs = (double *)malloc(n_parzen_points*sizeof(double));

    parzen_gradients = (double ***)malloc(n_parzen_points*sizeof(double **));
    for (i = n_parzen_points-1; i >= 0; i--)
    {
      parzen_gradients[i] = (double **)malloc(3*sizeof(double *));
      parzen_gradients[i][RBF_ACTIVATIONS] = NULL;
      parzen_gradients[i][PARTIALLY_PROJECTED_POINTS] = (double *)malloc(n_input_dim*sizeof(double));
      parzen_gradients[i][PROJECTED_POINTS] = (double *)malloc(n_output_dim*sizeof(double));
    }
  }

  temp_clock1 = clock();
  if (algorithm_used == DCA_ALGORITHM)
  {   
    /*--------------------------------------------------
     * Weighting of Parzen kernels
     *--------------------------------------------------*/  
    // Find scalings
    // printf("Initializing scaling factors, %d classes\n", n_classes);
    for (i = n_classes-1; i >= 0; i--) scalings[i] = -DBL_MAX;

    // printf("Projecting x (index %d, class %d)\n", x_index, x_class);
    temp_vector1 = projected_parzen_points[x_index][PROJECTED_POINTS];

    // printf("Finding scaling factors\n");
    for (i = n_parzen_points-1; i >= 0; i--)
    {
      // printf("i=%d\n", i);
      // printf("class %d\n", parzen_classes[i]);
      if (i != x_index)
      {
        temp_vector2 = projected_parzen_points[i][PROJECTED_POINTS];
    
        temp = 0;
        for (j = n_output_dim-1; j >= 0; j--)
        {
          temp2 = temp_vector1[j]-temp_vector2[j]; 
          temp += temp2*temp2;
        }
        temp = -temp/(2*pdf_sigma*pdf_sigma);
  
        tempprobs[i] = temp;      
	for (j = n_classes-1; j >= 0; j--)
	  if (parzen_classes[i][j])
	    if (temp > scalings[j]) scalings[j] = temp;
      }
    }
  
    // Find global scaling
    allscaling = -DBL_MAX;
    for (i = n_classes-1; i >= 0; i--)
      if (scalings[i] > allscaling) allscaling = scalings[i];
  
    // calculate denominators
    // printf("Calculating denominators\n");
    denominator1 = 0;  // Class-independent denominator
    denominator2 = 0;  // Class-dependent denominator
    for (i = n_parzen_points-1; i >= 0; i--)
    {
      if (i != x_index)
      {
        denominator1 += exp(tempprobs[i] - allscaling);
        if (parzen_classes[i][x_class]) 
	  denominator2 += exp(tempprobs[i] - scalings[x_class])*parzen_classes[i][x_class];
      }
    }
    if ((denominator1 == 0) || (denominator2 == 0))
    {
      printf("Scaling problem detected, denominator1: %e, denominator2: %e, allscaling: %e, scalings[x_class]: %e, x_class: %d\n",
  	     denominator1, denominator2, allscaling, scalings[x_class], x_class);
    }
       
    /*--------------------------------------------------
     * Gradients w.resp. to outputs (transformed points)
     *--------------------------------------------------*/  
    // printf("Output layer\n");
    temp_vector1 = projected_parzen_points[x_index][PROJECTED_POINTS];
    temp_vector3 = parzen_gradients[x_index][PROJECTED_POINTS];
    for (j = n_output_dim-1; j >= 0; j--)
      temp_vector3[j] = 0;

    for (i = n_parzen_points-1; i >= 0; i--)
    {
      // printf("i=%d\n", i);    
      if (i != x_index)
      {
        temp = exp(tempprobs[i] - allscaling)/denominator1;
        if (parzen_classes[i][x_class]) 
          temp -= exp(tempprobs[i] - scalings[x_class])*parzen_classes[i][x_class]/denominator2;
        temp *= -1.0/(pdf_sigma*pdf_sigma);
  
        temp_vector2 = projected_parzen_points[i][PROJECTED_POINTS];
        temp_vector4 = parzen_gradients[i][PROJECTED_POINTS];
        for (j = n_output_dim-1; j >= 0; j--)
        {
          temp2 = (temp_vector1[j] - temp_vector2[j])*temp;
          temp_vector4[j] = temp2;
  	  temp_vector3[j] -= temp2;
        }
      }
    }
  }


  temp_clock2 = clock();
  time2 += (temp_clock2-temp_clock1)/(double)(CLOCKS_PER_SEC);


  /*--------------------------------------------------
   * Gradients w.resp. to parameters
   *--------------------------------------------------*/  
  // printf("Parameters\n");

  for (i = n_input_dim-1; i >= 0; i--)
    for (j = n_output_dim-1; j >= 0; j--)
      W_result[i][j] = 0;


  temp_clock1 = clock();
  for (k = n_parzen_points-1; k >= 0; k--) // This sum includes x_index!
  {
    // Linear projection parameters
    temp_vector1 = parzen_gradients[k][PROJECTED_POINTS];    
    temp_vector2 = projected_parzen_points[k][PARTIALLY_PROJECTED_POINTS];
    for (i = n_input_dim-1; i >= 0; i--)
    {
      temp = temp_vector2[i];
      temp_vector4 = W_result[i];
      for (j = n_output_dim-1; j >= 0; j--)
	temp_vector4[j] += temp*temp_vector1[j];
    }
  }
  temp_clock2 = clock();
  time3 += (temp_clock2-temp_clock1)/(double)(CLOCKS_PER_SEC); 

  // if ((x_index % 100) == 0) printf("time1=%e, time2=%e, time3=%e, time4=%e\n", time1, time2, time3, time4);
}




/*
  Append the current projection matrix W to a file.

  filename : file to append to (opened in "ab" mode).

  Writes one header line "n_input_dim n_output_dim" followed by
  n_input_dim lines, each holding the n_output_dim entries of one row of W
  in "%e " format.  If the file cannot be opened, the reason is reported
  on stderr and nothing is written.
 */
void print_projection(char * filename)
{
  FILE * f;
  int i, j;

  f = fopen(filename, "ab");
  if (!f)
  {
    perror(filename);
    return;
  }

  // Terminate the header line so it does not run into the first matrix entry
  fprintf(f, "%d %d\n", n_input_dim, n_output_dim);

  // Output projection
  for (i = 0; i < n_input_dim; i++)
  {
    for (j = 0; j < n_output_dim; j++)
      fprintf(f, "%e ", W[i][j]);
    fprintf(f, "\n");
  }

  fclose(f);
}



/*
  DCA cost: the membership-weighted sum, over all Parzen points and
  classes, of the leave-one-out log conditional probability returned by
  parzen_conditional_log_probability().

  A (point, class) term is skipped when the point has zero membership in
  the class, or when the empirical class probability does not exceed the
  point's own share parzen_classes[point][class]/n_parzen_points.
 */
double calculate_dca_cost_function()
{
  double total = 0;
  int point, cls;

  for (point = n_parzen_points-1; point >= 0; point--)
  {
    const double * membership = parzen_classes[point];

    for (cls = n_classes-1; cls >= 0; cls--)
    {
      if (!membership[cls])
        continue;

      // If there is more than one point in this class, include this term in the overall cost
      if (!(empirical_class_probabilities[cls] > membership[cls]/n_parzen_points))
        continue;

      total += parzen_conditional_log_probability(point, cls)*membership[cls];
    }
  }

  return(total);
}




/*
  Cost function of the selected algorithm.  Returns the DCA cost when
  algorithm_used is DCA_ALGORITHM, and -1 for any other value.
 */
double calculate_cost_function()
{
  if (algorithm_used != DCA_ALGORITHM)
    return(-1);

  return(calculate_dca_cost_function());
}


/*
  Write the current projection matrix W to a file, replacing its contents.

  out_file : destination path (opened in "wb" mode).

  The file receives n_input_dim lines, each holding the n_output_dim
  entries of one row of W in "%e " format; no header line is written.
  If the file cannot be opened, the reason is reported on stderr and
  nothing is written.
 */
void save_projection(char * out_file)
{
  int i, j;
  FILE * f;

  printf("Saving projection to [%s]\n", out_file);
  f = fopen(out_file, "wb");
  if (!f)
  {
    perror(out_file);
    return;
  }

  // Output projection
  for (i = 0; i < n_input_dim; i++)
  {
    for (j = 0; j < n_output_dim; j++)
      fprintf(f, "%e ", W[i][j]);
    fprintf(f, "\n");
  }

  fclose(f);
}



/*
  Allocate the projection state and fill it from two Numeric arrays.

  py_A         : source for the fixed transformation A; the leading
                 n_input_dim by n_input_dim elements are read as doubles
                 through the array's strides.
  py_initial_W : source for the starting projection; the leading
                 n_input_dim by n_output_dim elements are read the same way.

  Sets n_parameters to the number of free parameters: the number of
  rotation angles when use_angle_reparameterization is nonzero, otherwise
  the number of entries of W.  In the angle case all angles start at 0,
  all lengths at 1, and W is built by construct_projection_from_angles();
  otherwise W is copied directly from py_initial_W.

  n_input_dim, n_output_dim and use_angle_reparameterization must already
  be set.  Allocation results are not checked.
 */
void initialize_projection
(
  PyArrayObject * py_A,
  PyArrayObject * py_initial_W
)
{
  const int n_angles = (n_output_dim)*(n_input_dim-n_output_dim);
  int row, col;
  const char * row_base;

  A = (double **)malloc(n_input_dim*sizeof(double *));
  W = (double **)malloc(n_input_dim*sizeof(double *));
  for (row = 0; row < n_input_dim; row++)
  {
    A[row] = (double *)malloc(n_input_dim*sizeof(double));
    W[row] = (double *)malloc(n_output_dim*sizeof(double));
  }

  n_parameters = use_angle_reparameterization ? n_angles : n_input_dim*n_output_dim;

  printf("Initializing projection with dimensions: %d %d\n", n_input_dim, n_output_dim);

  // Copy the fixed transformation
  printf("A\n");
  for (row = 0; row < n_input_dim; row++)
  {
    row_base = py_A->data + row*py_A->strides[0];
    for (col = 0; col < n_input_dim; col++)
      A[row][col] = *(const double *)(row_base + col*py_A->strides[1]);
  }

  if (!use_angle_reparameterization)
  {
    // W is the parameter set itself: copy it straight from the array
    for (row = 0; row < n_input_dim; row++)
    {
      row_base = py_initial_W->data + row*py_initial_W->strides[0];
      for (col = 0; col < n_output_dim; col++)
        W[row][col] = *(const double *)(row_base + col*py_initial_W->strides[1]);
    }
  }
  else
  {
    // W is derived from a starting matrix, column lengths and rotation angles
    W_angles = (double *)malloc(n_angles*sizeof(double));
    W_lengths = (double *)malloc(n_output_dim*sizeof(double));
    initial_W = (double **)malloc(n_input_dim*sizeof(double *));
    for (row = 0; row < n_input_dim; row++)
      initial_W[row] = (double *)malloc(n_output_dim*sizeof(double));

    for (row = 0; row < n_input_dim; row++)
    {
      row_base = py_initial_W->data + row*py_initial_W->strides[0];
      for (col = 0; col < n_output_dim; col++)
      {
        initial_W[row][col] = *(const double *)(row_base + col*py_initial_W->strides[1]);
        printf("initial_W[%d][%d] = %e\n", row, col, initial_W[row][col]);
      }
    }

    for (col = 0; col < n_angles; col++)
      W_angles[col] = 0;

    for (col = 0; col < n_output_dim; col++)
      W_lengths[col] = 1;

    construct_projection_from_angles();
  }

  printf("Projection initialized\n");
}



/*
  Batch gradient for the DCA algorithm and the pre-given data set.
  This version uses internal parameterization for an internal conjugate
  gradient function: parameters come in and the gradient goes out through
  plain C arrays instead of Python objects.

  ntotalparams                : not used by this function.
  currentparams               : current parameter vector.  With angle
                                reparameterization it holds the angles,
                                otherwise the entries of W; in both cases
                                stored in descending index order.
  writegradienthere           : output, same layout as currentparams.
                                Receives the NEGATED gradient of the
                                log-likelihood, so that a minimizer
                                maximizes the likelihood.
  whethertoprint_costfunction : when nonzero, the current W and the cost
                                function value are printed to stdout.

  Side effects: overwrites W (and W_angles when angles are used) and
  refreshes projected_parzen_points.

  NOTE(review): the static gradient buffers are allocated once, using the
  dimensions current at the first call; they are not resized afterwards.
 */
void dca_lib_gradient_internalparams
(
  int ntotalparams, 
  double *currentparams, 
  double *writegradienthere, 
  int whethertoprint_costfunction
)
{ 
  static double **W_gradient = NULL;
  static double **W_batch_gradient = NULL;
  static double *W_angle_gradient = NULL;

  int point_index;
  int i, j, k;
  int result_index;
  double temp_costfunction;

  if (!W_gradient)
  {
    // Allocate gradients
    W_gradient = (double **)malloc(n_input_dim*sizeof(double *));
    for (j = 0; j < n_input_dim; j++)
      W_gradient[j] = (double *)malloc(n_output_dim*sizeof(double));

    W_batch_gradient = (double **)malloc(n_input_dim*sizeof(double *));
    for (j = 0; j < n_input_dim; j++)
      W_batch_gradient[j] = (double *)malloc(n_output_dim*sizeof(double));

    if (use_angle_reparameterization)
      W_angle_gradient = (double *)malloc(n_output_dim*(n_input_dim-n_output_dim)*sizeof(double));
  }

  // Load the parameter vector into the projection
  result_index = 0;
  if (use_angle_reparameterization)
  {
    // Retrieve the W angles
    for (j = n_output_dim*(n_input_dim-n_output_dim)-1; j >= 0; j--)
    {
      W_angles[j] = currentparams[result_index];
      result_index++;
    }
    construct_projection_from_angles();
  }
  else
  {
    // Retrieve the W parameters
    for (j = n_input_dim-1; j >= 0; j--)
      for (k = n_output_dim-1; k >= 0; k--)
      {
        W[j][k] = currentparams[result_index];
	result_index++;
      }
  }

  if (whethertoprint_costfunction)
  {
    printf("Current W:\n");
    for (j = 0; j < n_input_dim; j++)
    {
      for (k = 0; k < n_output_dim; k++)
      {
        printf("%e ", W[j][k]);
      }
      printf("\n");
    }
  }

  // Initialize W-batch-gradient
  for (j = n_input_dim-1; j >= 0; j--)
    for (k = n_output_dim-1; k >= 0; k--)
      W_batch_gradient[j][k] = 0;    

  // Transform points
  projected_parzen_points = transform_points(n_parzen_points, parzen_points, projected_parzen_points);

  if (whethertoprint_costfunction)
  {
    temp_costfunction = calculate_cost_function();
    printf("Cost function: %e\n", temp_costfunction);
  }

  // Sum over gradients of individual point likelihoods
  for (point_index = n_parzen_points-1; point_index >= 0; point_index--)
  {
    for (i = n_classes-1; i >= 0; i--)
      if (parzen_classes[point_index][i])
      {
        // If there is more than one point in this class, include this term in the overall cost
        if ((algorithm_used == DCA_ALGORITHM) && (empirical_class_probabilities[i] > parzen_classes[point_index][i]/n_parzen_points))
	{
  	  calculate_dca_gradient(point_index, i, W_gradient);

	  // Add to W-batch-gradient, weighted by the class membership
	  for (j = n_input_dim-1; j >= 0; j--)
	    for (k = n_output_dim-1; k >= 0; k--)
	      W_batch_gradient[j][k] += W_gradient[j][k]*parzen_classes[point_index][i];
	}
      }
  }

  if (use_angle_reparameterization)
  {
    calculate_rotation_gradient(W_batch_gradient, W_angle_gradient);
  }

  // Return the opposite of the gradient (because we want to maximize the 
  // value of the log-likelihood, not minimize it)
#define GRADIENT_FLIPSIGN (-1)
  result_index = 0; 
  if (use_angle_reparameterization)
  {
    // Return the W angle gradient
    for (j = n_output_dim*(n_input_dim-n_output_dim)-1; j >= 0; j--)
    {
      writegradienthere[result_index] = GRADIENT_FLIPSIGN*W_angle_gradient[j];
      result_index++;
    }
  }
  else
  {
    // Return the W batch gradient
    for (j = n_input_dim-1; j >= 0; j--)
      for (k = n_output_dim-1; k >= 0; k--)
      {
	writegradienthere[result_index] = GRADIENT_FLIPSIGN*W_batch_gradient[j][k];
        result_index++;
      }
  }

}





/*
  Initialize Discriminative Component Analysis algorithm for a data set

  Python arguments (all required, in this order):
    x            Numeric double array; rows are data points.
    x_classes    Numeric double array; row i holds the class membership
                 weights of point i.
    n_classes    int, number of classes.
    A            Numeric double array, fixed input transformation.
    initial_W    Numeric double array, starting projection; its column
                 count defines the output dimensionality.
    pdf_sigma    float, kernel width.
    use_angle_reparameterization  int flag.

  The arrays are validated (rank 2, double elements, large enough for the
  dimensions implied by x, initial_W and n_classes) before any global
  state is modified; a TypeError or ValueError is raised otherwise.

  Copies the data set into internal arrays, computes the empirical class
  probabilities, initializes the projection, and returns the initial
  parameter vector as a 1-D double array (angles or entries of W, in
  descending index order).

  NOTE(review): buffers from an earlier initialization are not released
  here, and caches sized elsewhere from the previous dimensions (for
  example projected_parzen_points) are not reset.
 */
static PyObject * dca_lib_initialize(PyObject *self, PyObject *args)
{ 
  PyArrayObject * py_x;
  PyArrayObject * py_x_classes;
  PyObject * py_nclasses;
  PyArrayObject * py_A;
  PyArrayObject * py_initial_W;
  PyObject * py_pdf_sigma;
  PyObject * py_use_angle_reparameterization;

  int i, j, k;
  int new_n_points, new_n_input_dim, new_n_output_dim, new_n_classes, new_use_angles;

  int resultdim[2];
  PyArrayObject * py_result;
  int result_index;

  // Parse the arguments
  if (!PyArg_ParseTuple(args, "O!O!O!O!O!O!O!", 
      &PyArray_Type, &py_x, &PyArray_Type, &py_x_classes, &PyInt_Type, &py_nclasses,      
      &PyArray_Type, &py_A, &PyArray_Type, &py_initial_W, &PyFloat_Type, &py_pdf_sigma,
      &PyInt_Type, &py_use_angle_reparameterization))
    return(NULL);

  // The copies below read elements as doubles through strides[0] and
  // strides[1], so every array has to be a 2-D array of doubles.
  if ((py_x->nd != 2) || (py_x_classes->nd != 2) || (py_A->nd != 2) || (py_initial_W->nd != 2))
  {
    PyErr_SetString(PyExc_ValueError, "x, x_classes, A and initial_W must be 2-dimensional arrays");
    return(NULL);
  }
  if ((py_x->descr->type_num != PyArray_DOUBLE) || (py_x_classes->descr->type_num != PyArray_DOUBLE) ||
      (py_A->descr->type_num != PyArray_DOUBLE) || (py_initial_W->descr->type_num != PyArray_DOUBLE))
  {
    PyErr_SetString(PyExc_TypeError, "x, x_classes, A and initial_W must be arrays of doubles");
    return(NULL);
  }

  // Read dimensionalities and check consistency between arrays
  new_n_input_dim = py_x->dimensions[1];
  new_n_points = py_x->dimensions[0];
  new_n_classes = PyInt_AsLong(py_nclasses);
  new_n_output_dim = py_initial_W->dimensions[1];
  new_use_angles = PyInt_AsLong(py_use_angle_reparameterization);

  if (new_n_classes < 0)
  {
    PyErr_SetString(PyExc_ValueError, "n_classes must not be negative");
    return(NULL);
  }
  if ((py_x_classes->dimensions[0] < new_n_points) || (py_x_classes->dimensions[1] < new_n_classes))
  {
    PyErr_SetString(PyExc_ValueError, "x_classes must have one row per data point and n_classes columns");
    return(NULL);
  }
  if ((py_A->dimensions[0] < new_n_input_dim) || (py_A->dimensions[1] < new_n_input_dim))
  {
    PyErr_SetString(PyExc_ValueError, "A must be n_input_dim by n_input_dim");
    return(NULL);
  }
  if (py_initial_W->dimensions[0] < new_n_input_dim)
  {
    PyErr_SetString(PyExc_ValueError, "initial_W must have n_input_dim rows");
    return(NULL);
  }
  // The number of angles is n_output_dim*(n_input_dim-n_output_dim) and must not be negative
  if (new_use_angles && (new_n_output_dim > new_n_input_dim))
  {
    PyErr_SetString(PyExc_ValueError, "angle reparameterization requires n_output_dim <= n_input_dim");
    return(NULL);
  }

  n_input_dim = new_n_input_dim;
  n_parzen_points = new_n_points;
  n_classes = new_n_classes;
  n_output_dim = new_n_output_dim;
  pdf_sigma = PyFloat_AsDouble(py_pdf_sigma);
  use_angle_reparameterization = new_use_angles;

  algorithm_used = DCA_ALGORITHM;

  printf("Initializing DCA, n_input_dim=%d, n_parzen_points=%d, n_classes=%d, n_output_dim=%d, pdf_sigma=%e, use_angle_reparameterization=%d\n", n_input_dim, n_parzen_points, n_classes, n_output_dim, pdf_sigma, use_angle_reparameterization);

  // Copy dataset to internal arrays
  parzen_points = (double **)malloc(n_parzen_points*sizeof(double *));
  parzen_classes = (double **)malloc(n_parzen_points*sizeof(double *));
  for (i = 0; i < n_parzen_points; i++)
  {
    parzen_points[i] = (double *)malloc(n_input_dim*sizeof(double));
    parzen_classes[i] = (double *)malloc(n_classes*sizeof(double));
    for (j = 0; j < n_input_dim; j++)
    {
      parzen_points[i][j] = *(double *)(py_x->data + i*py_x->strides[0] + j*py_x->strides[1]);
    }
    for (j = 0; j < n_classes; j++)
    {
      parzen_classes[i][j] = *(double *)(py_x_classes->data + i*py_x_classes->strides[0] + j*py_x_classes->strides[1]);
    }
  }


  // Calculate empirical class probabilities and empirical class norm
  empirical_class_probabilities = (double *)malloc(n_classes*sizeof(double));
  for (i = 0; i < n_classes; i++)
    empirical_class_probabilities[i] = 0;
  for (i = 0; i < n_parzen_points; i++)
    for (j = 0; j < n_classes; j++)
      empirical_class_probabilities[j] += parzen_classes[i][j];
  for (i = 0; i < n_classes; i++)
    empirical_class_probabilities[i] /= n_parzen_points;

  empirical_class_norm = 0;
  for (i = 0; i < n_classes; i++)
    empirical_class_norm += empirical_class_probabilities[i]*empirical_class_probabilities[i];

  initialize_projection(py_A, py_initial_W);


  // Create the return array (1-D; only resultdim[0] is consulted)
  resultdim[0] = n_parameters;
  resultdim[1] = 1;
  py_result = (PyArrayObject *)PyArray_FromDims(1,resultdim,PyArray_DOUBLE);
  if (!py_result)
    return(NULL);

  result_index = 0;
  if (use_angle_reparameterization)
  {
    // Return the W angles
    for (j = n_output_dim*(n_input_dim-n_output_dim)-1; j >= 0; j--)
    {
      *(double *)(py_result->data + result_index*py_result->strides[0]) = W_angles[j];
      result_index++;
    }
  }
  else
  {
    // Return the W parameters
    for (j = n_input_dim-1; j >= 0; j--)
      for (k = n_output_dim-1; k >= 0; k--)
      {
	*(double *)(py_result->data + result_index*py_result->strides[0]) = W[j][k];
	result_index++;
      }
  }

  return((PyObject *)PyArray_Return(py_result));
}


/*
  Single-point gradient for nonlinear DCA algorithm and pre-given data set

  Python arguments:
    params       Numeric double array holding the current parameter vector
                 (angles or entries of W, in descending index order); at
                 least n_parameters elements are required.
    point_index  int, index of the Parzen point, 0 <= index < n_parzen_points.

  Returns a 1-D double array with the gradient of the point's
  log-likelihood terms, summed over every class in which the point has
  nonzero membership and weighted by that membership.  The sign is not
  flipped, and the single-member-class exclusion used by
  dca_lib_gradient_internalparams() is not applied here.

  Raises RuntimeError when the library has not been initialized,
  TypeError/ValueError for an unusable parameter array, and IndexError for
  an out-of-range point index.  The arguments are validated before W is
  modified.

  NOTE(review): the static gradient buffers are allocated once, using the
  dimensions current at the first call; they are not resized afterwards.
 */
static PyObject * dca_lib_singlepoint_gradient(PyObject *self, PyObject *args)
{ 
  static double **W_gradient = NULL;
  static double **W_batch_gradient = NULL;
  static double *W_angle_gradient = NULL;

  PyArrayObject * py_MW_values;
  PyObject * py_point_index;

  long requested_index;
  int point_index;
  int i, j, k;

  int resultdim[2];
  PyArrayObject * py_result;
  int result_index;

  // Parse the arguments
  if (!PyArg_ParseTuple(args, "O!O!", &PyArray_Type, &py_MW_values, &PyInt_Type, &py_point_index))
    return(NULL);

  // Everything below dereferences state that only initialization creates
  if (!W || !parzen_points || !parzen_classes || (use_angle_reparameterization && !W_angles))
  {
    PyErr_SetString(PyExc_RuntimeError, "DCA has not been initialized");
    return(NULL);
  }

  // The parameters are read as doubles through strides[0]
  if (py_MW_values->descr->type_num != PyArray_DOUBLE)
  {
    PyErr_SetString(PyExc_TypeError, "parameter array must be an array of doubles");
    return(NULL);
  }
  if ((py_MW_values->nd < 1) || (py_MW_values->dimensions[0] < n_parameters))
  {
    PyErr_SetString(PyExc_ValueError, "parameter array has fewer elements than the number of parameters");
    return(NULL);
  }

  requested_index = PyInt_AsLong(py_point_index);
  if ((requested_index < 0) || (requested_index >= n_parzen_points))
  {
    PyErr_SetString(PyExc_IndexError, "point index out of range");
    return(NULL);
  }
  point_index = (int)requested_index;

  if (!W_gradient)
  {
    // Allocate gradients
    W_gradient = (double **)malloc(n_input_dim*sizeof(double *));
    for (j = 0; j < n_input_dim; j++)
      W_gradient[j] = (double *)malloc(n_output_dim*sizeof(double));

    W_batch_gradient = (double **)malloc(n_input_dim*sizeof(double *));
    for (j = 0; j < n_input_dim; j++)
      W_batch_gradient[j] = (double *)malloc(n_output_dim*sizeof(double));

    if (use_angle_reparameterization)
      W_angle_gradient = (double *)malloc(n_output_dim*(n_input_dim-n_output_dim)*sizeof(double));
  }

  // Load the parameter vector into the projection
  result_index = 0;
  if (use_angle_reparameterization)
  {
    // Retrieve the W angles
    for (j = n_output_dim*(n_input_dim-n_output_dim)-1; j >= 0; j--)
    {
      W_angles[j] = *(double *)(py_MW_values->data + result_index*py_MW_values->strides[0]);
      result_index++;
    }
    construct_projection_from_angles();
  }
  else
  {
    // Retrieve the W parameters
    for (j = n_input_dim-1; j >= 0; j--)
      for (k = n_output_dim-1; k >= 0; k--)
      {
	W[j][k] = *(double *)(py_MW_values->data + result_index*py_MW_values->strides[0]);
	result_index++;
      }
  }

  // Initialize W-batch-gradient
  for (j = n_input_dim-1; j >= 0; j--)
    for (k = n_output_dim-1; k >= 0; k--)
      W_batch_gradient[j][k] = 0;    

  // Transform points
  projected_parzen_points = transform_points(n_parzen_points, parzen_points, projected_parzen_points);

  // Sum over gradients of point likelihoods for each class
  for (i = n_classes-1; i >= 0; i--)
    if (parzen_classes[point_index][i])
    {
      calculate_dca_gradient(point_index, i, W_gradient);

      // Add to W-batch-gradient, weighted by the class membership
      for (j = n_input_dim-1; j >= 0; j--)
	for (k = n_output_dim-1; k >= 0; k--)
	  W_batch_gradient[j][k] += W_gradient[j][k]*parzen_classes[point_index][i];
    } 

  if (use_angle_reparameterization)
  {
    calculate_rotation_gradient(W_batch_gradient, W_angle_gradient);
  }

  // Create the return array (1-D; only resultdim[0] is consulted)
  resultdim[0] = n_parameters;
  resultdim[1] = 1;
  py_result = (PyArrayObject *)PyArray_FromDims(1,resultdim,PyArray_DOUBLE);
  if (!py_result)
    return(NULL);

  result_index = 0; 
  if (use_angle_reparameterization)
  {
    // Return the W angle gradient
    for (j = n_output_dim*(n_input_dim-n_output_dim)-1; j >= 0; j--)
    {
      *(double *)(py_result->data + result_index*py_result->strides[0]) = W_angle_gradient[j];
      result_index++;
    }
  }
  else
  {
    // Return the W batch gradient
    for (j = n_input_dim-1; j >= 0; j--)
      for (k = n_output_dim-1; k >= 0; k--)
      {
        *(double *)(py_result->data + result_index*py_result->strides[0]) = W_batch_gradient[j][k];
        result_index++;
      }
  }

  return((PyObject *)PyArray_Return(py_result));
}



/* Free a row-pointer matrix built by dca_gradient_alloc_matrix (NULL is accepted). */
static void dca_gradient_free_matrix(double **matrix, int n_rows)
{
  int row;

  if (!matrix)
    return;
  for (row = 0; row < n_rows; row++)
    free(matrix[row]);
  free(matrix);
}

/* Allocate an n_rows by n_cols matrix of doubles as an array of row pointers; NULL if any allocation fails. */
static double **dca_gradient_alloc_matrix(int n_rows, int n_cols)
{
  double **matrix;
  int row;

  /* Never request zero bytes: malloc(0)/calloc(0, ..) may legitimately return NULL. */
  matrix = (double **)calloc((n_rows > 0) ? n_rows : 1, sizeof(double *));
  if (!matrix)
    return(NULL);

  for (row = 0; row < n_rows; row++)
  {
    matrix[row] = (double *)malloc(((n_cols > 0) ? n_cols : 1)*sizeof(double));
    if (!matrix[row])
    {
      /* Only the first `row` rows exist at this point. */
      dca_gradient_free_matrix(matrix, row);
      return(NULL);
    }
  }
  return(matrix);
}

/*
  Batch gradient for nonlinear DCA algorithm and pre-given data set

  Python arguments: one Numeric array of doubles holding the projection
  parameters. With angle reparameterization it must hold at least
  n_output_dim*(n_input_dim-n_output_dim) angles, otherwise at least
  n_input_dim*n_output_dim entries of W. Values are consumed in reverse
  index order: the first array element becomes the last angle, or
  W[n_input_dim-1][n_output_dim-1].

  Side effects: overwrites the global W (and W_angles when angles are used)
  and recomputes projected_parzen_points.

  Returns a new 1-D double array of length n_parameters containing the
  gradient in the same element order as the input, or NULL with a Python
  exception set (TypeError from argument parsing, ValueError for an
  unsuitable array, MemoryError if a buffer cannot be allocated).
 */
static PyObject * dca_lib_gradient(PyObject *self, PyObject *args)
{ 
  /* Scratch buffers are kept between calls to avoid reallocating on every evaluation. */
  static double **W_gradient = NULL;
  static double **W_batch_gradient = NULL;
  static double *W_angle_gradient = NULL;
  static int buffer_input_dim = 0;     /* dimensions the scratch buffers were sized for */
  static int buffer_output_dim = 0;

  PyArrayObject * py_MW_values;
  PyArrayObject * py_result;
  int resultdim[2];

  int n_angles;
  int n_values;
  int point_index;
  int result_index;
  int i, j, k;

  // Parse the arguments
  if (!PyArg_ParseTuple(args, "O!", &PyArray_Type, &py_MW_values))
    return(NULL);

  n_angles = n_output_dim*(n_input_dim-n_output_dim);
  n_values = use_angle_reparameterization ? n_angles : n_input_dim*n_output_dim;

  // The array is read below as raw doubles through data/strides[0], so its
  // rank, element type and length have to be verified first.
  if ((py_MW_values->nd < 1) ||
      (py_MW_values->descr->type_num != PyArray_DOUBLE) ||
      (py_MW_values->dimensions[0] < n_values))
    return(PyErr_Format(PyExc_ValueError,
                        "gradient: expected an array of doubles with at least %d values",
                        n_values));

  // (Re)allocate the gradient buffers when missing or sized for other dimensions
  if (!W_gradient || !W_batch_gradient ||
      (buffer_input_dim != n_input_dim) || (buffer_output_dim != n_output_dim))
  {
    dca_gradient_free_matrix(W_gradient, buffer_input_dim);
    dca_gradient_free_matrix(W_batch_gradient, buffer_input_dim);
    free(W_angle_gradient);
    W_angle_gradient = NULL;

    W_gradient = dca_gradient_alloc_matrix(n_input_dim, n_output_dim);
    W_batch_gradient = dca_gradient_alloc_matrix(n_input_dim, n_output_dim);
    if (!W_gradient || !W_batch_gradient)
    {
      dca_gradient_free_matrix(W_gradient, n_input_dim);
      dca_gradient_free_matrix(W_batch_gradient, n_input_dim);
      W_gradient = NULL;
      W_batch_gradient = NULL;
      return(PyErr_NoMemory());
    }
    buffer_input_dim = n_input_dim;
    buffer_output_dim = n_output_dim;
  }

  // The angle buffer is created on first need, so it also exists when the
  // reparameterization flag was off during the first call.
  if (use_angle_reparameterization && !W_angle_gradient)
  {
    W_angle_gradient = (double *)malloc(((n_angles > 0) ? n_angles : 1)*sizeof(double));
    if (!W_angle_gradient)
      return(PyErr_NoMemory());
  }

  result_index = 0;
  if (use_angle_reparameterization)
  {
    // Retrieve the W angles
    for (j = n_angles-1; j >= 0; j--)
    {
      W_angles[j] = *(double *)(py_MW_values->data + result_index*py_MW_values->strides[0]);
      result_index++;
    }
    construct_projection_from_angles();
  }
  else
  {
    // Retrieve the W parameters
    for (j = n_input_dim-1; j >= 0; j--)
      for (k = n_output_dim-1; k >= 0; k--)
      {
        W[j][k] = *(double *)(py_MW_values->data + result_index*py_MW_values->strides[0]);
        result_index++;
      }
  }

  // Initialize W-batch-gradient
  for (j = n_input_dim-1; j >= 0; j--)
    for (k = n_output_dim-1; k >= 0; k--)
      W_batch_gradient[j][k] = 0;    

  // Transform points
  projected_parzen_points = transform_points(n_parzen_points, parzen_points, projected_parzen_points);

  // Sum over gradients of individual point likelihoods
  for (point_index = n_parzen_points-1; point_index >= 0; point_index--)
  {
    for (i = n_classes-1; i >= 0; i--)
      if (parzen_classes[point_index][i])
      {
        // If there is more than one point in this class, include this term in the overall cost
        if ((algorithm_used == DCA_ALGORITHM) && (empirical_class_probabilities[i] > parzen_classes[point_index][i]/n_parzen_points))
        {
          calculate_dca_gradient(point_index, i, W_gradient);

          // Add to W-batch-gradient, weighted by the point's membership in class i
          for (j = n_input_dim-1; j >= 0; j--)
            for (k = n_output_dim-1; k >= 0; k--)
              W_batch_gradient[j][k] += W_gradient[j][k]*parzen_classes[point_index][i];
        }
      }
  }

  if (use_angle_reparameterization)
    calculate_rotation_gradient(W_batch_gradient, W_angle_gradient);

  // Create the return array
  resultdim[0] = n_parameters;
  resultdim[1] = 1;
  py_result = (PyArrayObject *)PyArray_FromDims(1,resultdim,PyArray_DOUBLE);
  if (!py_result)
    return(NULL);

  result_index = 0; 
  if (use_angle_reparameterization)
  {
    // Return the W angle gradient
    for (j = n_angles-1; j >= 0; j--)
    {
      *(double *)(py_result->data + result_index*py_result->strides[0]) = W_angle_gradient[j];
      result_index++;
    }
  }
  else
  {
    // Return the W batch gradient
    for (j = n_input_dim-1; j >= 0; j--)
      for (k = n_output_dim-1; k >= 0; k--)
      {
        *(double *)(py_result->data + result_index*py_result->strides[0]) = W_batch_gradient[j][k];
        result_index++;
      }
  }

  return((PyObject *)PyArray_Return(py_result));
}


/*
  Cost function for nonlinear DCA algorithm and pre-given data set
  Version using internal conjugate gradient parameterization

  currentparams is a plain C vector of parameters. Its values are consumed
  front to back and stored in reverse index order: the first value becomes
  the last angle (angle reparameterization) or W[n_input_dim-1][n_output_dim-1].
  ntotalparams is accepted for interface symmetry but is not read here.

  Side effects: overwrites the global W (and W_angles when angles are used)
  and recomputes projected_parzen_points.

  Returns the value of calculate_cost_function() for the loaded parameters.
 */
double dca_lib_costfunction_internalparams
(
  int ntotalparams, 
  double *currentparams
)
{
  const double * next_value = currentparams;  /* next parameter to consume */
  int angle_index;
  int row, col;

  if (!use_angle_reparameterization)
  {
    /* Fill W directly, last row / last column first */
    for (row = n_input_dim; row-- > 0; )
      for (col = n_output_dim; col-- > 0; )
        W[row][col] = *next_value++;
  }
  else
  {
    /* Fill the angles from the last one down, then rebuild W from them */
    for (angle_index = n_output_dim*(n_input_dim-n_output_dim); angle_index-- > 0; )
      W_angles[angle_index] = *next_value++;

    construct_projection_from_angles();
  }

  /* Re-project the Parzen points with the new projection before evaluating the cost */
  projected_parzen_points = transform_points(n_parzen_points, parzen_points, projected_parzen_points);

  return(calculate_cost_function());
}



/*
  Cost function for nonlinear DCA algorithm and pre-given data set

  Python arguments: one Numeric array of doubles holding the projection
  parameters. With angle reparameterization it must hold at least
  n_output_dim*(n_input_dim-n_output_dim) angles, otherwise at least
  n_input_dim*n_output_dim entries of W. Values are consumed in reverse
  index order: the first array element becomes the last angle, or
  W[n_input_dim-1][n_output_dim-1].

  Side effects: overwrites the global W (and W_angles when angles are used),
  recomputes projected_parzen_points and prints the cost to stdout.

  Returns the cost as a Python float, or NULL with an exception set
  (TypeError from argument parsing, ValueError for an unsuitable array).
 */
static PyObject * dca_lib_cost(PyObject *self, PyObject *args)
{
  PyArrayObject * py_MW_values;
  double result_cost;
  int n_values;
  int result_index;
  int j, k;

  // Parse the arguments
  if (!PyArg_ParseTuple(args, "O!", &PyArray_Type, &py_MW_values))
    return(NULL);

  if (use_angle_reparameterization)
    n_values = n_output_dim*(n_input_dim-n_output_dim);
  else
    n_values = n_input_dim*n_output_dim;

  // The array is read below as raw doubles through data/strides[0], so its
  // rank, element type and length have to be verified first.
  if ((py_MW_values->nd < 1) ||
      (py_MW_values->descr->type_num != PyArray_DOUBLE) ||
      (py_MW_values->dimensions[0] < n_values))
    return(PyErr_Format(PyExc_ValueError,
                        "cost: expected an array of doubles with at least %d values",
                        n_values));

  result_index = 0;
  if (use_angle_reparameterization)
  {
    // Retrieve the W angles
    for (j = n_values-1; j >= 0; j--)
    {
      W_angles[j] = *(double *)(py_MW_values->data + result_index*py_MW_values->strides[0]);
      result_index++;
    }
    construct_projection_from_angles();
  }
  else
  {
    // Retrieve the W parameters
    for (j = n_input_dim-1; j >= 0; j--)
      for (k = n_output_dim-1; k >= 0; k--)
      {
        W[j][k] = *(double *)(py_MW_values->data + result_index*py_MW_values->strides[0]);
        result_index++;
      }
  }

  // Transform points
  projected_parzen_points = transform_points(n_parzen_points, parzen_points, projected_parzen_points);
  result_cost = calculate_cost_function();

  printf("Cost=%e\n", result_cost);
  return(Py_BuildValue("d",result_cost));
}


/*
  Save projection to a text file

  Python arguments: a Numeric array of doubles holding the projection
  parameters, followed by the output file name (string). With angle
  reparameterization the array must hold at least
  n_output_dim*(n_input_dim-n_output_dim) angles, otherwise at least
  n_input_dim*n_output_dim entries of W. Values are consumed in reverse
  index order: the first array element becomes the last angle, or
  W[n_input_dim-1][n_output_dim-1].

  Side effects: overwrites the global W (and W_angles when angles are used)
  and calls save_projection(filename).

  Returns the Python integer 1, or NULL with an exception set
  (TypeError from argument parsing, ValueError for an unsuitable array).
 */
static PyObject * dca_lib_save_projection(PyObject *self, PyObject *args)
{
  PyArrayObject * py_MW_values;
  char * filename;
  int n_values;
  int result_index;
  int j, k;

  // Parse the arguments
  if (!PyArg_ParseTuple(args, "O!s", &PyArray_Type, &py_MW_values, &filename))
    return(NULL);

  if (use_angle_reparameterization)
    n_values = n_output_dim*(n_input_dim-n_output_dim);
  else
    n_values = n_input_dim*n_output_dim;

  // The array is read below as raw doubles through data/strides[0], so its
  // rank, element type and length have to be verified first.
  if ((py_MW_values->nd < 1) ||
      (py_MW_values->descr->type_num != PyArray_DOUBLE) ||
      (py_MW_values->dimensions[0] < n_values))
    return(PyErr_Format(PyExc_ValueError,
                        "save_projection: expected an array of doubles with at least %d values",
                        n_values));

  result_index = 0;
  if (use_angle_reparameterization)
  {
    // Retrieve the W angles
    for (j = n_values-1; j >= 0; j--)
    {
      W_angles[j] = *(double *)(py_MW_values->data + result_index*py_MW_values->strides[0]);
      result_index++;
    }
    construct_projection_from_angles();
  }
  else
  {
    // Retrieve the W parameters
    for (j = n_input_dim-1; j >= 0; j--)
      for (k = n_output_dim-1; k >= 0; k--)
      {
        W[j][k] = *(double *)(py_MW_values->data + result_index*py_MW_values->strides[0]);
        result_index++;
      }
  }

  save_projection(filename);

  return Py_BuildValue("i", 1);
}





/*-------------------conjugate gradient code starts---------------*/

/*
%
% Conjugate gradient (Polak-Ribiere variant with restarts)
% according to the description in J. Shewchuk, An Introduction 
% to the Conjugate Gradient Method Without the Agonizing Pain, 1994.
%
% Implemented by Jaakko Peltonen 2007, with additional modifications:
% - prevent too large steps in Secant search
% - prevent searching for the maximum in Secant search!
% Converted to C in 2009.
%
% Parameters:
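% - gradhandle: pointer to a function that computes the gradient of
%   the function to be minimized. It is called as
%   gradhandle(n, params, gradient_out, print_cost): it reads the n
%   parameter values from params and writes the n gradient components
%   into gradient_out; print_cost tells it whether to print the cost.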
% - n: number of parameters
% - initparams: initial starting point for the search. Must be a
%   vector of n values.
% - niters: number of iterations of conjugate gradient
% - nsecantiters: number of Secant iterations (which minimize 
%   along a line)
% - finalparams: vector where the optimized parameters are written
% - finalcost: pointer where the cost function value is written;
%   currently it is simply set to zero instead of an actual value.
*/


/*
  Dot product of the first vectorlength elements of vector1 and vector2.
  Terms are accumulated from the last index down to index 0; a length of
  zero or less yields 0.
 */
double innerproduct(double* vector1, double* vector2, int vectorlength)
{
  double total = 0;
  int remaining = vectorlength;

  while (remaining > 0)
  {
    remaining--;
    total += vector1[remaining]*vector2[remaining];
  }

  return(total);
}


/* Sign of x as an integer: 1 when x >= 0 (zero counts as positive), -1 otherwise. */
int mysign(double x)
{
  return((x >= 0) ? 1 : -1);
}


/*
  Nonlinear conjugate gradient using only gradient evaluations.

  Each of the niters outer iterations restarts from the negative gradient
  and then takes n conjugate steps. Every step picks its length alpha with
  up to nsecantiters secant iterations along the direction d (step growth
  is limited to a factor of 3 per iteration), and the next direction uses
  beta = max(0, (r.r - r.r_old)/(r_old.r_old)).

  gradhandle(n, params, gradient_out, print_cost) must write the n gradient
  components for params into gradient_out. initparams holds the n starting
  values and is not modified. finalparams receives the n optimized values
  (it may be the same buffer as initparams). *finalcost is always set to 0.

  If n <= 0 nothing is evaluated. If the work arrays cannot be allocated a
  message is printed to stderr and finalparams receives initparams unchanged.
 */
void conjugate_gradient2
(
  /* arguments: nparams, paramvalues, outputgradient, whethertoprintthecost */
  void (*gradhandle)(int, double *, double *, int),

  int n, 
  double *initparams,  /* n initial parameter values, given together as a vector */

  int niters,
  int nsecantiters,

  double * finalparams,
  double * finalcost
)
{
  double * x = NULL;          /* current parameter vector */
  double * temp_x = NULL;     /* trial point along the search direction */
  double * r = NULL;          /* negative gradient at x */
  double * r_old = NULL;      /* negative gradient at the previous x */
  double * d = NULL;          /* search direction */
  double * grad = NULL;       /* gradient at x, used by the secant search */
  double * gradsigma = NULL;  /* gradient at the trial point */
  double alpha;
  double alphamult;
  double sigma;
  double beta;
  double polak_ribiere;
  double innerproduct_gradsigma_d;
  double innerproduct_grad_d;
  double innerproduct_r_r;
  double innerproduct_r_rold;
  double innerproduct_rold_rold;

#ifdef SIMPLE_LINESEARCH
  double alpha_step;
  double cur_alpha;
  double cur_cost;
  double best_alpha;
  double best_cost;
#endif

  int i, j, k, l;

  *finalcost = 0;
  if (n <= 0)
    return;

  x = (double *)malloc(n*sizeof(double));
  temp_x = (double *)malloc(n*sizeof(double));
  d = (double *)malloc(n*sizeof(double));
  r = (double *)malloc(n*sizeof(double));
  r_old = (double *)malloc(n*sizeof(double));
  grad = (double *)malloc(n*sizeof(double));
  gradsigma = (double *)malloc(n*sizeof(double));

  if (!x || !temp_x || !d || !r || !r_old || !grad || !gradsigma)
  {
    fprintf(stderr, "conjugate_gradient2: out of memory, parameters left unchanged\n");
    for (l = 0; l < n; l++)
      finalparams[l] = initparams[l];
    goto cleanup;
  }

  for (l = 0; l < n; l++)
    x[l] = initparams[l];

  for (i=1; i <=niters; i++)
  {
    printf("Conjugate gradient iteration %d\n", i);

    /* Restart: the first direction of every outer iteration is the negative gradient */
    gradhandle(n,x,d,1);
    for (l = 0; l < n; l++)
      d[l] = -d[l];
  
    for (l = 0; l < n; l++)
      r[l] = d[l];
    
    for (j=1; j <=n; j++)
    {
#ifdef SIMPLE_LINESEARCH
      /* Use a really simple line search... */
      /* NOTE(review): a trial step is accepted when the cost does not decrease,
         and the result is overwritten below whenever OLD_CODE is defined -- confirm intent. */
      best_alpha = 0;
      for (l = 0; l < n; l++)
        temp_x[l] = x[l] + best_alpha*d[l];
      best_cost = dca_lib_costfunction_internalparams(n,temp_x);

      alpha_step = 1e-5;
      cur_alpha = best_alpha + alpha_step;
      for (k=1; k <=nsecantiters; k++)
      {
        for (l = 0; l < n; l++)
          temp_x[l] = x[l] + cur_alpha*d[l];
        cur_cost = dca_lib_costfunction_internalparams(n,temp_x);
        if (cur_cost >= best_cost)
        {
          best_alpha = cur_alpha;
          best_cost = cur_cost;
          alpha_step = alpha_step*2;
          cur_alpha = best_alpha + alpha_step;
        }
        else
        {
          alpha_step = alpha_step/2;
          cur_alpha = best_alpha + alpha_step;
        }
      }      
      alpha = best_alpha;
#endif

#define OLD_CODE
#ifdef OLD_CODE
      /*
        Minimize the function along d, with respect to length alpha, 
        using either the Secant method or a basic line search.
      */
      alpha=1e-5;
      gradhandle(n,x,grad,0);
      /* grad and d stay fixed during the line minimization, so this product does too */
      innerproduct_grad_d = innerproduct(grad,d,n);
  
      for (k=1; k <=nsecantiters; k++)
      {
        sigma = alpha;
        for (l = 0; l < n; l++)
          temp_x[l] = x[l] + sigma*d[l];

        gradhandle(n,temp_x,gradsigma,0);
        innerproduct_gradsigma_d = innerproduct(gradsigma,d,n);
      
        /* The directional derivative at the trial point is practically zero:
           close enough to the line minimum, keep the current alpha. */
        if (fabs(innerproduct_gradsigma_d) < 1e-10)
          break;

        if (mysign(sigma) == mysign(innerproduct_gradsigma_d - innerproduct_grad_d))
        {
          /* The directional derivative changes in the direction of the step,
             so the Secant estimate of its zero can be used. */
          alphamult = -(innerproduct_grad_d)/(innerproduct_gradsigma_d-innerproduct_grad_d);
        }
        else
        {
          /* Secant would head for a maximum; just double the step instead. */
          alphamult = 2;
        }
        
        /* ensure that the step is not too large */
        if (fabs(alphamult) > 3)
          alphamult = 3*mysign(alphamult);        
        alpha = alphamult*sigma ;
      }
#endif

      /* Take the step; all vectors have exactly n elements (indices 0..n-1) */
      for (l = 0; l < n; l++)      
        x[l] = x[l] + alpha*d[l];

      for (l = 0; l < n; l++)      
        r_old[l] = r[l];

      gradhandle(n,x,r,0);
      for (l = 0; l < n; l++)      
        r[l] = -r[l];

      innerproduct_r_r = innerproduct(r,r,n);
      innerproduct_r_rold = innerproduct(r,r_old,n);
      innerproduct_rold_rold = innerproduct(r_old,r_old,n);

      /* Polak-Ribiere coefficient clamped at zero; a zero previous gradient
         gives beta = 0 (plain steepest descent) instead of dividing by zero. */
      beta = 0;
      if (innerproduct_rold_rold > 0)
      {
        polak_ribiere = (innerproduct_r_r - innerproduct_r_rold)/(innerproduct_rold_rold);
        if (polak_ribiere > 0)
          beta = polak_ribiere;
      }

      for (l = 0; l < n; l++)
        d[l] = r[l] + beta*d[l];
    }
  }    

  for (l = 0; l < n; l++)
    finalparams[l]=x[l];

cleanup:
  free(x);
  free(temp_x);
  free(d);
  free(r);
  free(r_old);
  free(grad);
  free(gradsigma);
}

/*-------------------conjugate gradient code ends-----------------*/


/*
  Optimize Discriminative Component Analysis algorithm by conjugate gradient

  Python arguments: a Numeric array of doubles with the initial parameters
  (its first dimension gives the number of parameters), the number of
  conjugate gradient iterations (int) and the number of secant iterations
  per line minimization (int).

  Runs conjugate_gradient2 with dca_lib_gradient_internalparams as the
  gradient routine and prints progress messages to stdout.

  Returns a new 1-D double array with the optimized parameters in the same
  element order as the input, or NULL with an exception set (TypeError from
  argument parsing, ValueError for an unsuitable array, MemoryError if a
  buffer cannot be allocated).
 */
static PyObject * dca_lib_optimize(PyObject *self, PyObject *args)
{ 
  PyObject * py_niters;
  PyObject * py_nsecantiters;
  PyArrayObject * py_MW_values;
  PyArrayObject * py_result;

  int niters, nsecantiters;
  int ntotalparams;
  double * initparameters;
  double * finalparameters;
  double finalcost;

  int resultdim[2];
  int j;


  printf("Starting optimize\n");

  // Parse the arguments
  if (!PyArg_ParseTuple(args, "O!O!O!", 
      &PyArray_Type, &py_MW_values, 
      &PyInt_Type, &py_niters, 
      &PyInt_Type, &py_nsecantiters))
    return(NULL);

  // The array is read below as raw doubles through data/strides[0], so its
  // rank and element type have to be verified before touching dimensions[0].
  if ((py_MW_values->nd < 1) ||
      (py_MW_values->descr->type_num != PyArray_DOUBLE) ||
      (py_MW_values->dimensions[0] < 1))
  {
    PyErr_SetString(PyExc_ValueError,
                    "optimize: expected a non-empty array of doubles as initial parameters");
    return(NULL);
  }

  ntotalparams = py_MW_values->dimensions[0];
  printf("%d total parameters (initialized earlier with %d input dims, %d output dims)\n", ntotalparams, n_input_dim, n_output_dim);

  niters = PyInt_AsLong(py_niters);
  nsecantiters = PyInt_AsLong(py_nsecantiters);
  printf("%d iters, %d secant iters\n", niters, nsecantiters);


  // Allocate the parameter vectors. finalparameters carries one spare
  // element as a guard against a write one past its last used index.
  initparameters = (double *)malloc((size_t)ntotalparams*sizeof(double));
  finalparameters = (double *)malloc(((size_t)ntotalparams+1)*sizeof(double));
  if (!initparameters || !finalparameters)
  {
    free(initparameters);
    free(finalparameters);
    return(PyErr_NoMemory());
  }

  // Retrieve the initial parameters
  for (j = 0; j < ntotalparams; j++)
    initparameters[j] = *(double *)(py_MW_values->data + j*py_MW_values->strides[0]);

  printf("Starting conjugate gradient\n");

  conjugate_gradient2
  (
    /* arguments: nparams, paramvalues, outputgradient, whethertoprintthecost */
    dca_lib_gradient_internalparams,

    ntotalparams, 
    initparameters,  /* n initial parameter values, given together as a vector */

    niters,
    nsecantiters,

    finalparameters,
    &finalcost
  );


  printf("Returning final parameters\n");

  // Create the return array
  resultdim[0] = ntotalparams;
  resultdim[1] = 1;
  py_result = (PyArrayObject *)PyArray_FromDims(1,resultdim,PyArray_DOUBLE);

  if (py_result)
  {
    for (j = 0; j < ntotalparams; j++)
      *(double *)(py_result->data + j*py_result->strides[0]) = finalparameters[j];
  }

  // The C-side vectors are no longer needed once the values are copied out
  free(initparameters);
  free(finalparameters);

  if (!py_result)
    return(NULL);

  return((PyObject *)PyArray_Return(py_result));
}





/* The method table */
static PyMethodDef dca_libMethods[] = {
  {"gradient", dca_lib_gradient, METH_VARARGS, "computes overall gradient"},
  {"singlepoint_gradient", dca_lib_singlepoint_gradient, METH_VARARGS, "computes gradient contribution of one point"},
  {"cost", dca_lib_cost, METH_VARARGS, "computes the cost function"},
  {"initialize", dca_lib_initialize, METH_VARARGS, "initializes the cost function"},
  {"optimize", dca_lib_optimize, METH_VARARGS, "optimizes the cost function"},
  {"save_projection", dca_lib_save_projection, METH_VARARGS, "saves the projection to a file"},
  {NULL,NULL,0,NULL}
};


/* Initialization function */
void initdca_lib(void) {
  (void)Py_InitModule("dca_lib",dca_libMethods);

  import_array();
}


/*
  Main: stand-alone entry point. Passes the program name to the Python
  runtime, starts the interpreter, registers the dca_lib module and exits
  with status 0.
 */
int main(int argc, char **argv)
{
  char * program_name = argv[0];

  (void)argc;  /* only the program name is used */

  Py_SetProgramName(program_name);
  Py_Initialize();
  initdca_lib();

  return 0;
}

