#include "ELMClassifier.hpp"
#include "math.hpp"
#include "random.hpp"
#include "util.hpp"
#include "AdvancedString.hpp"
#include <set>


using std::vector;
using std::string;
using std::cout;
using std::endl;

namespace slmotion {
 void ELMClassifier::train(const std::vector<std::vector<float> > &negSamples,
			   const std::vector<std::vector<float> > &posSamples,
			   const vector<bool> *dimMask,
			   const vector<float>* /*negWeights*/,
			   const vector<float>* /*posWeights*/){


   // construct column major data matrix and call train() for that

   floatMatrix m;


   size_t dim=posSamples[0].size();
   vector<size_t> activeDim;
   
   // for(size_t d=0;d<dim;d++)
   for(size_t d = 0; d < dim; ++d)
     if(dimMask==NULL || (*dimMask)[d]) activeDim.push_back(d);
   
   dim=activeDim.size();

   int nsamples=negSamples.size()+posSamples.size();

   // construct augmented input matrix from the training samples
   // each row = one sample
   // last column = target response value using -1/+1 coding for the classes

   m.rows=nsamples;
   m.cols=dim+1;

   m.allocate();

   int currentsample=0;

   for (size_t i=0; i < posSamples.size(); i++, currentsample++) {
     size_t d;
     for(d=0;d<dim;d++){
       m.dptr[currentsample+d*nsamples]= posSamples[i][activeDim[d]];
     }
     m.dptr[currentsample+d*nsamples]=1.0;
   }

   for (size_t i=0; i < negSamples.size(); i++, currentsample++) {
     size_t d;
     for(d=0;d<dim;d++)
       m.dptr[currentsample+d*nsamples]= negSamples[i][activeDim[d]];
     m.dptr[currentsample+d*nsamples]=-1.0;
   }

   train(m);
 }

  void ELMClassifier::train(const floatMatrix &datamat){

    trainedparam.clear();

    if(Nneur_sigmoid+Nneur_RBF+Nneur_chisq==0) return;

    const floatMatrix *norm=&datamat;

    int dim=datamat.cols-1;

    floatMatrix mcopy;

    if(useznorm){
      
      mcopy=datamat;
      mcopy.allocate();

      estimateZNorm(datamat);

   // pre-calculate the inverses of sqrt(var)

      invsqrtvariance.resize(variance.size());
      for(size_t d=0;d<variance.size();d++)
	invsqrtvariance[d] = variance[d]>0 ? 1.0/sqrt(variance[d]):0;
   
      for(int d=0;d<datamat.cols-1;d++) // leave last column=class label intact
	for(int s=0;s<datamat.rows;s++){
	  int idx=s+d*datamat.rows;
	  mcopy.dptr[idx]=(datamat.dptr[idx]-mean[d])*invsqrtvariance[d];
	}
    
      norm=&mcopy;
    }

   float *tgtM=new float[datamat.rows];

   for(int s=0;s<datamat.rows;s++){
     tgtM[s]=datamat.dptr[s+datamat.rows*(datamat.cols-1)];
     norm->dptr[s+datamat.rows*(datamat.cols-1)]=1; // replace last column with constant bias
   }


   // now *norm is the input matrix augmented with constant bias,
   // tgtM is vector of target values in {-1,+1}

   int Nneur=Nneur_sigmoid;

   // generate random input weights for each neuron 
   // row = neuron index
   // column = dim


   float *weightM=NULL;

   int m,n,ld; // dummy args for the known output matrix size
   float *H=NULL,*beta;

   int dim1=dim+1;
   int nsamples=norm->rows;
   


   if(Nneur_sigmoid>0){

     weightM=new float[Nneur*(dim+1)];
   
     float mul=6.0/RAND_MAX;

     for(int i=0;i<Nneur*(dim+1);i++){
       weightM[i]=mul*rand();
       //     weightM[i] = (i%7)*3.0/7.0+(i%15)*3.0/15.0;
       // make the weight matrix deterministic for the purpose of testing
       weightM[i] -= 3.0;
       
     }






   // all entries now uniform random numbers in [-3,+3]

   //   print_matrix("weight matrix", Nneur, dim+1, weightM, Nneur );


   // prepare the mapping matrix H
   // row = sample
   // column=neuron


   // perform first linear mapping H = inputM * transpose(weightM)

    multiplymat(nsamples,dim1, norm->dptr, nsamples,
		Nneur, dim1, weightM,Nneur,
	        false, true,
		m, n, &H,ld);


    //    print_matrix("linear mapping", m,n,H,ld );

    // now apply the (nonlinear) transfer function to each entry of H

    transferfunction=&tanhtransfer;

    for(int i=0;i<nsamples*Nneur;i++)
      H[i]=(*transferfunction)(H[i]);

   }


    // outputs of sigmoidal formed, now add some RBF neurons to the mix

    // use the normalised vectors if normalisation is specified

    float *RBFcenters=NULL;
    float *RBFwidths=NULL;

    if(Nneur_RBF>0){
      
      // find the distance distribution within the training samples

      int distance_samples=5000;

      vector<float> distances;

      for(int i=0;i<distance_samples;i++){
	int i1=uniform(nsamples-1);
	int i2=uniform(nsamples-1);
	distances.push_back(evalEuclDist(norm->dptr,i1,nsamples,norm->dptr,i2,nsamples,norm->cols-1));
      }
      
      // determine the 20% and 80% percentiles
      
      sort(distances.begin(),distances.end());
      
      float minWidth=distances[0.2*distances.size()];
      float maxWidth=distances[0.8*distances.size()];
      

      RBFcenters=new float[Nneur_RBF*norm->cols-1]; // store in column major order
      RBFwidths=new float[Nneur_RBF];
      
      for(int i=0;i<Nneur_RBF;i++){
	int r=uniform(nsamples-1);
	for(int d=0;d<norm->cols-1;d++)
	  RBFcenters[i+d*Nneur_RBF]=norm->dptr[r+d*nsamples];
	
	RBFwidths[i]=unifloat()*(maxWidth-minWidth)+minWidth;
	
      }
     
      // RBF kernels now randomly selected
      // next, project training data to these kernels

      float *respMat=new float[Nneur_RBF*nsamples]; // stored in column major order
      // row -> training sample
      // column -> kernel

      for(int s=0;s<nsamples;s++)
	for(int k=0;k<Nneur_RBF;k++){
	  float d=evalEuclDist(RBFcenters,k,Nneur_RBF,norm->dptr,s,nsamples,norm->cols-1);
	  d /= RBFwidths[k];
	  respMat[s+k*nsamples]=exp(-d*d);
	}
      
      float *oldH=H;

      H=new float[(Nneur_sigmoid+Nneur_RBF)*nsamples];
      if(Nneur_sigmoid>0)
	memcpy(H,oldH,Nneur_sigmoid*nsamples*sizeof(float));

      memcpy(H+Nneur_sigmoid*nsamples,respMat,Nneur_RBF*nsamples*sizeof(float));

      delete [] oldH;
      delete [] respMat;
      
    }

    //    print_matrix("nonlinear mapping", m,n,H,ld );


    Nneur += Nneur_RBF;

    // outputs of RBF neurons formed, now add some RBF neurons to the mix

    // for chisq neurons, ignore the normalisation even if it's specified

    float *chisqcenters=NULL;
    float *chisqwidths=NULL;

    if(Nneur_chisq>0){

      // check that training samples are all non-negative
      // otherwise chisq distance would not make sense

      for(int s=0;s<datamat.rows;s++)
	for(int d=0;d<datamat.cols-1;d++)
	  if(datamat.dptr[s+d*datamat.rows]<0)
	    throw string("chisq neurons require non-negative training data");


      // find the distance distribution within the training samples

      int distance_samples=5000;

      vector<float> distances;

      for(int i=0;i<distance_samples;i++){
	int i1=uniform(nsamples-1);
	int i2=uniform(nsamples-1);
	distances.push_back(evalChisqDist(datamat.dptr,i1,nsamples,datamat.dptr,i2,nsamples,datamat.cols-1));
      }
      
      // determine the 20% and 80% percentiles
      
      sort(distances.begin(),distances.end());
      
      float minWidth=distances[0.2*distances.size()];
      float maxWidth=distances[0.8*distances.size()];
      

      chisqcenters=new float[Nneur_chisq*datamat.cols-1]; // store in column major order
      chisqwidths=new float[Nneur_chisq];
      
      for(int i=0;i<Nneur_chisq;i++){
	int r=uniform(nsamples-1);
	for(int d=0;d<datamat.cols-1;d++)
	  chisqcenters[i+d*Nneur_chisq]=datamat.dptr[r+d*nsamples];
	
	chisqwidths[i]=unifloat()*(maxWidth-minWidth)+minWidth;
	
      }
     
      // chisq kernels now randomly selected
      // next, project training data to these kernels

      float *respMat=new float[Nneur_chisq*nsamples]; // stored in column major order
      // row -> training sample
      // column -> kernel

      for(int s=0;s<nsamples;s++)
	for(int k=0;k<Nneur_chisq;k++){
	  float d=evalEuclDist(chisqcenters,k,Nneur_chisq,datamat.dptr,s,nsamples,datamat.cols-1);
	  d /= chisqwidths[k];
	  respMat[s+k*nsamples]=exp(-d*d);
	}
      
      float *oldH=H;

      H=new float[(Nneur_sigmoid+Nneur_RBF+Nneur_chisq)*nsamples];
      if(Nneur_sigmoid+Nneur_RBF>0)
	memcpy(H,oldH,(Nneur_sigmoid+Nneur_RBF)*nsamples*sizeof(float));

      memcpy(H+(Nneur_sigmoid+Nneur_RBF)*nsamples,respMat,Nneur_chisq*nsamples*sizeof(float));

      delete [] oldH;
      delete [] respMat;
      
    }



    // calculate the output weights as beta = pinv(H) * tgtM;

    Nneur += Nneur_chisq;

    trainedparam.clear();

    trainedparam.push_back(trainedELM());


    if(useOPELMpruning){

      vector<int> bestNeurons;

      findBestNeurons(H, tgtM,nsamples,Nneur,bestNeurons);
      
      // go through the list of best neurons 
      // and re-build the weightM, RBFcenters, RBFwidths,
      // chisqcenters and chisqwidths matrices

      int sigmoid_count=0,rbf_count=0,chisq_count=0;
      
      for(auto it=bestNeurons.begin();it!=bestNeurons.end();it++){
	if(*it<Nneur_sigmoid)
	  sigmoid_count++;
	else if(*it<Nneur_sigmoid+Nneur_RBF)
	  rbf_count++;
	else
	  chisq_count++;
      }

      cout << "optimal pruning resulted in " << sigmoid_count << " sigmoidal, "
	   << rbf_count << " RBF and " << chisq_count << " chisq neurons" << 
	endl;

      float *weightM_opt=(sigmoid_count>0)?new float[(dim+1)*sigmoid_count]:NULL;

      float *RBFcenters_opt=(rbf_count>0)?new float[dim*rbf_count]:NULL;
      float *RBFwidths_opt=(rbf_count>0)?new float[dim*rbf_count]:NULL;

      float *chisqcenters_opt=(chisq_count>0)?new float[dim*chisq_count]:NULL;
      float *chisqwidths_opt=(chisq_count>0)?new float[dim*chisq_count]:NULL;
      
      int idx_sigmoid=0,idx_rbf=0,idx_chisq=0;

      // the columns of H should also be copied/reordered

      // this variable appears to be unused
      // -Matti
      // int Nneur_old=Nneur; // the number of neurons before pruning
      Nneur=bestNeurons.size();

      float *H_opt=new float[nsamples*Nneur];

      //print_matrix("nonlinear mapping", nsamples,Nneur_old,H,nsamples );

      int idx_h=0;

      for(auto it=bestNeurons.begin();it!=bestNeurons.end();it++){

	//	cout << "neuron #"<< *it << " -> idx " << idx_h << endl;

	// copy the column of H corresponding to it

	for(int i=0;i<nsamples;i++){
	  //cout << "i="<<i<<endl;
	  //cout << (i+(*it)*Nneur_old) << " -> " << (i+idx_h*Nneur) << endl;   
	  H_opt[i+idx_h*nsamples]=H[i+(*it)*nsamples];
	}
	//cout << "copied column of H for neuron " << idx_h << endl;

	idx_h++;

	if(*it<Nneur_sigmoid){
	  int d;
	  for(d=0;d<dim;d++)
	    weightM_opt[idx_sigmoid+d*sigmoid_count]=weightM[*it+d*Nneur_sigmoid];
	    weightM_opt[idx_sigmoid+d*sigmoid_count]=1;
	    idx_sigmoid++;
	} else if(*it<Nneur_sigmoid+Nneur_RBF){
	  for(int d=0;d<dim;d++)
	    RBFcenters_opt[idx_rbf+d*rbf_count]=RBFcenters[*it-Nneur_sigmoid+d*Nneur_RBF];
	  
	  RBFwidths_opt[idx_rbf]=RBFwidths[*it-Nneur_sigmoid];
	  idx_rbf++;
	}else{
	  for(int d=0;d<dim;d++)
	    chisqcenters_opt[idx_chisq+d*chisq_count]=
	      chisqcenters[*it-Nneur_sigmoid-Nneur_RBF+d*Nneur_chisq];
	  
	  chisqwidths_opt[idx_chisq]=chisqwidths[*it-Nneur_sigmoid-Nneur_RBF];
	  idx_chisq++;
	}
      }

      delete[] weightM;
      delete[] RBFcenters;
      delete[] RBFwidths;
      delete[] chisqcenters;
      delete[] chisqwidths;
      delete[] H;
      
      H=H_opt;

      // store the optimally selected matrices

      trainedparam[0].weightM.rows=sigmoid_count;
      trainedparam[0].weightM.cols=dim+1;
      trainedparam[0].weightM.dptr=weightM_opt;
      
      trainedparam[0].RBFcenters.rows=rbf_count;
      trainedparam[0].RBFcenters.cols=dim;
      trainedparam[0].RBFcenters.dptr=RBFcenters_opt;

      trainedparam[0].RBFwidths.rows=rbf_count;
      trainedparam[0].RBFwidths.cols=1;
      trainedparam[0].RBFwidths.dptr=RBFwidths_opt;

      trainedparam[0].chisqcenters.rows=chisq_count;
      trainedparam[0].chisqcenters.cols=dim;
      trainedparam[0].chisqcenters.dptr=chisqcenters_opt;

      trainedparam[0].chisqwidths.rows=chisq_count;
      trainedparam[0].chisqwidths.cols=1;
      trainedparam[0].chisqwidths.dptr=chisqwidths_opt;

    } else{
      trainedparam[0].weightM.rows=Nneur_sigmoid;
      trainedparam[0].weightM.cols=dim+1;
      trainedparam[0].weightM.dptr=weightM;
      
      trainedparam[0].RBFcenters.rows=Nneur_RBF;
      trainedparam[0].RBFcenters.cols=dim;
      trainedparam[0].RBFcenters.dptr=RBFcenters;

      trainedparam[0].RBFwidths.rows=Nneur_RBF;
      trainedparam[0].RBFwidths.cols=1;
      trainedparam[0].RBFwidths.dptr=RBFwidths;

      trainedparam[0].chisqcenters.rows=Nneur_chisq;
      trainedparam[0].chisqcenters.cols=dim;
      trainedparam[0].chisqcenters.dptr=chisqcenters;

      trainedparam[0].chisqwidths.rows=Nneur_chisq;
      trainedparam[0].chisqwidths.cols=1;
      trainedparam[0].chisqwidths.dptr=chisqwidths;
    }

    if(true){
      // augment H with a constant column

      float *H_tmp=new float[(Nneur+1)*nsamples];
      memcpy(H_tmp,H,Nneur*nsamples*sizeof(float));
      
      int idx=Nneur*nsamples;
      for(int i=0;i<nsamples;i++)
	H_tmp[idx++]=1;
      delete[] H;
      H=H_tmp;
      Nneur++;
    }

      float *pi;
      int pim,pin,ldpi;
      
    int one=1;

    pseudoinverse(nsamples, Nneur, H, nsamples, pim, pin, &pi, ldpi);
    //    print_matrix("pseudoinverse", pim,pin,pi,ldpi );

    multiplymat(pim, pin, pi,ldpi,
		nsamples,one,tgtM,nsamples,
		false, false,
		m, n, &beta,ld);

    //    print_matrix("estimated beta", m,n,beta,ld );

    // now insert the parameter values in the trainedparam vector


    trainedparam[0].beta.rows=Nneur;
    trainedparam[0].beta.cols=1;
    trainedparam[0].beta.dptr=beta;

    delete [] tgtM;

    predictioncache.clear();

 }

  /**
   * Evaluates the classifier for one sample after applying the optional
   * dimension mask.
   *
   * @param sample  input vector
   * @param dimMask optional mask; when non-NULL only the components whose
   *                mask entry is true are passed on to predict_lean()
   * @return the value returned by predict_lean() for the masked vector
   */
  float ELMClassifier::predict(const std::vector<float>  &sample,
			       const vector<bool> *dimMask){

    // collect the active components; the vector is grown with push_back
    // instead of being written through operator[] while still empty
    vector<float> samplevec;
    samplevec.reserve(sample.size());

    for(size_t d = 0; d < sample.size(); ++d)
      if(dimMask==NULL || (*dimMask)[d]) samplevec.push_back(sample[d]);

    return predict_lean(samplevec);
  }

  float ELMClassifier::predict_lean(const std::vector<float>  &sample){

    int cacheidx;

    if(maxcachedim>=0){
     
      cacheidx=0;
      for(int d=0;d<=maxcachedim;d++)
	cacheidx=(cacheidx<<8) + sample[d];

      if(predictioncache.count(cacheidx))
	return predictioncache[cacheidx];

   }

    int dim=sample.size();

    // collect normalised augmented sample vector

    float *samplevec=new float[dim+1];
    
    int d;

    if(useznorm)
      for(d=0;d<dim;d++)
	samplevec[d]=(sample[d]-mean[d])*invsqrtvariance[d];
    else
      for(d=0;d<dim;d++)
	samplevec[d]=sample[d];

    samplevec[d]=1; // augment with term corresponding to bias

    if(trainedparam.size()!=1)
      throw std::string("ensembling ELMs not yet supported");

    float *beta=trainedparam[0].beta.dptr;
    float *weightM=trainedparam[0].weightM.dptr;

    int Nneur=trainedparam[0].beta.rows;

    int Nneur_sigmoid=trainedparam[0].weightM.rows;
    int Nneur_RBF=trainedparam[0].RBFcenters.rows;
    int Nneur_chisq=trainedparam[0].chisqcenters.rows;


   // perform first linear mapping H = samplevec * transpose(weightM)
    int dim1=dim+1;
    int one=1;
    float *H=NULL;
    int m,n,ld; // dummy output variables

    if(Nneur_sigmoid>0){

      multiplymat(one,dim1, samplevec, one,
		  Nneur_sigmoid, dim1, weightM,Nneur_sigmoid,
		  false,true,
		  m, n, &H, ld);
    
      // now apply the (nonlinear) transfer function to each entry of H
      
      for(int i=0;i<Nneur_sigmoid;i++)
	H[i]=(*transferfunction)(H[i]);
    }

    if(Nneur_RBF>0){
      // augment H with projections to RBF kernels

      // use the normalised data vector

      float *oldH=H;
      H=new float[Nneur_sigmoid+Nneur_RBF];
      if(Nneur_sigmoid>0)
	memcpy(H,oldH,Nneur_sigmoid*sizeof(float));
      delete[] oldH;

      for(int k=0;k<Nneur_RBF;k++){
	float d=evalEuclDist(trainedparam[0].RBFcenters.dptr,k,Nneur_RBF,
			     &samplevec[0],0,1,dim);
	d /= trainedparam[0].RBFwidths.dptr[k];
	H[Nneur_sigmoid+k]=exp(-d*d);
      }

    }

    if(Nneur_chisq>0){
      // augment H with projections to chisq kernels

      // use the unnormalised sample vector regardless whether the normalisation is specified

      // check the elements of the sample vector for non-negativity

      for(int d=0;d<dim;d++)
	if(sample[d]<0) throw string("chisq neurons can't cope with negative vector elements");

      float *oldH=H;
      H=new float[Nneur];
      if(Nneur_sigmoid+Nneur_RBF>0)
	memcpy(H,oldH,(Nneur_sigmoid+Nneur_RBF)*sizeof(float));
      delete[] oldH;

      for(int k=0;k<Nneur_chisq;k++){
	float d=evalChisqDist(trainedparam[0].chisqcenters.dptr,k,Nneur_chisq,
			     &sample[0],0,1,dim);
	d /= trainedparam[0].chisqwidths.dptr[k];
	H[Nneur_sigmoid+Nneur_RBF+k]=exp(-d*d);
      }

    }


    float ret=0;

    if(true){

      // interpret last element of beta as the 
      // multiplier of a constant term 

      for(int i=0;i<Nneur-1;i++)
	ret += beta[i]*H[i];
      
      ret += beta[Nneur-1];

    } else{

      for(int i=0;i<Nneur;i++)
	ret += beta[i]*H[i];
      
    }

    // try to bring the value to [0,1]

    ret *= 0.5;
    ret += 0.5;

    if(hardlimitresponses){
      if(ret<0) ret=0;
      if(ret>1) ret=1;
    }

    if(maxcachedim>=0)
      predictioncache[cacheidx]=ret;

    return ret;

  }

  bool ELMClassifier::readModelFromFile(const std::string &fn){

    FILE *fptr=fopen(fn.c_str(),"r");

    if(!fptr)
      throw string("Could not open model file ") + fn;

    
    if(!readText(fptr,"DIM")){
      fclose(fptr);
      return false;
    }

    int dim;
    if(fscanf(fptr,"%d",&dim)!=1){
      fclose(fptr);
      return false;
    }

    if(!readText(fptr,"USEZNORM")){
      fclose(fptr);
      return false;
    }

    int zn;
    if(fscanf(fptr,"%d",&zn)!=1){
      fclose(fptr);
      return false;
    }

    useznorm=(zn>0)?true:false;

    if(!readText(fptr,"MEAN")){
      fclose(fptr);
      return false;
    }

    if(!readFloatVectorFromFile(fptr,mean)){
      fclose(fptr);
      return false;
    }

    if(!readText(fptr,"INVSQRTVARIANCE")){
      fclose(fptr);
      return false;
    }

    if(!readFloatVectorFromFile(fptr,invsqrtvariance)){
      fclose(fptr);
      return false;
    }

    if(!readText(fptr,"ENSEMBLE_SIZE")){
      fclose(fptr);
      return false;
    }

    int esize;
    if(fscanf(fptr,"%d",&esize)!=1){
      fclose(fptr);
      return false;
    }

    ensemblesize=esize;

    if(!readText(fptr,"FUSION_OPERATION")){
      fclose(fptr);
      return false;
    }

    char str[200];
    if(fscanf(fptr,"%s",str)!=1){
      fclose(fptr);
      return false;
    }

    fusionoperator=str;

    trainedparam=vector<trainedELM>(esize);

    for(int em=0;em<esize;em++){
      if(!readText(fptr,"ENSEMBLE_MEMBER")){
	fclose(fptr);
	return false;
      }

      if(!readText(fptr,"WEIGHTM")){
	fclose(fptr);
	return false;
      }

      if(!trainedparam[em].weightM.readFromFile(fptr)){
	fclose(fptr);
	return false;
      }

      if(!readText(fptr,"RBFCENTERS")){
	fclose(fptr);
	return false;
      }

      if(!trainedparam[em].RBFcenters.readFromFile(fptr)){
	fclose(fptr);
	return false;
      }

      if(!readText(fptr,"RBFWIDTHS")){
	fclose(fptr);
	return false;
      }

      if(!trainedparam[em].RBFwidths.readFromFile(fptr)){
	fclose(fptr);
	return false;
      }

      if(!readText(fptr,"CHISQCENTERS")){
	fclose(fptr);
	return false;
      }

      if(!trainedparam[em].chisqcenters.readFromFile(fptr)){
	fclose(fptr);
	return false;
      }

      if(!readText(fptr,"CHISQWIDTHS")){
	fclose(fptr);
	return false;
      }

      if(!trainedparam[em].chisqwidths.readFromFile(fptr)){
	fclose(fptr);
	return false;
      }

      if(!readText(fptr,"BETA")){
	fclose(fptr);
	return false;
      }

      if(!trainedparam[em].beta.readFromFile(fptr)){
	fclose(fptr);
	return false;
      }

    }

    fclose(fptr);

    return true;
    

  }

  /**
   * Skips whitespace-separated tokens of f until one equals t.
   *
   * On success the stream is positioned right after the matching token.
   *
   * @param f stream to read from
   * @param t token to look for
   * @return true if the token was found, false if no further token could
   *         be read before a match
   */
  bool readText(FILE *f,const char *t){

    char str[200];

    // The field width keeps an overlong token from overflowing str, and
    // testing the fscanf result (rather than feof) both stops on read
    // errors and accepts a match that is the very last token of the file.
    while(fscanf(f,"%199s",str)==1)
      if(strcmp(str,t)==0) return true;

    return false;
  }

  void ELMClassifier::writeModelToFile(const std::string &fn){

    if(trainedparam.size()<1) return;

    FILE *fptr=fopen(fn.c_str(),"w");
    
    if(!fptr)
      throw string("Could not open model file ") + fn + " for writing";
    
    fprintf(fptr,"#ELM ENSEMBLE\n");

    int dim=trainedparam[0].weightM.cols-1;
    fprintf(fptr,"DIM %d\n",dim);

    fprintf(fptr,"USEZNORM %d\n",useznorm?1:0);

    fprintf(fptr,"MEAN ");
    writeFloatVectorToFile(fptr,mean);
    
    fprintf(fptr,"INVSQRTVARIANCE ");
    writeFloatVectorToFile(fptr,mean);
    
    fprintf(fptr,"ENSEMBLE_SIZE %d\n",(int)trainedparam.size());
    fprintf(fptr,"FUSION_OPERATION %s\n",fusionoperator.c_str());

    for(int em=0;em<(int)trainedparam.size();em++){
      fprintf(fptr,"ENSEMBLE_MEMBER\n");

      fprintf(fptr,"WEIGHTM ");
      trainedparam[em].weightM.writeToFile(fptr);

      fprintf(fptr,"RBFCENTERS ");
      trainedparam[em].RBFcenters.writeToFile(fptr);

      fprintf(fptr,"RBFWIDTHS ");
      trainedparam[em].RBFwidths.writeToFile(fptr);

      fprintf(fptr,"CHISQCENTERS ");
      trainedparam[em].chisqcenters.writeToFile(fptr);

      fprintf(fptr,"CHISQWIDTHS ");
      trainedparam[em].chisqwidths.writeToFile(fptr);

      fprintf(fptr,"BETA ");
      trainedparam[em].beta.writeToFile(fptr);
    }

    fclose(fptr);

  }

  /**
   * Prints the normalisation parameters and, for every ensemble member,
   * the sigmoid weight matrix and the output weights to standard output.
   */
  void ELMClassifier::showParam(){

    cout << "means: " << endl;
    for(size_t i=0;i<mean.size();++i)
      cout << " " << mean[i];
    cout << endl;

    cout << "invsqrtvariance: " << endl;
    for(size_t i=0;i<invsqrtvariance.size();++i)
      cout << " " << invsqrtvariance[i];
    cout << endl;

    const int members=(int)(trainedparam.size());

    printf("ENSEMBLE_SIZE %d\n",members);
    printf("FUSION_OPERATION %s\n",fusionoperator.c_str());

    for(int em=0;em<members;++em){
      const int wrows=trainedparam[em].weightM.rows;
      const int wcols=trainedparam[em].weightM.cols;
      const float *weights=trainedparam[em].weightM.dptr;

      printf("ENSEMBLE_MEMBER\n");
      printf("NNEUR %d\n",wrows);

      // the weight matrix is stored column major: one output line per neuron
      printf("WEIGHTM");
      for(int row=0;row<wrows;++row){
	for(int col=0;col<wcols;++col)
	  printf(" %f",weights[row+col*wrows]);
	printf("\n");
      }

      const int brows=trainedparam[em].beta.rows;
      const float *outw=trainedparam[em].beta.dptr;

      printf("BETA");
      for(int row=0;row<brows;++row)
	printf(" %f",outw[row]);
      printf("\n");
    }

  }

  /**
   * Draws an integer from [0,m] using rand().
   *
   * rand() is scaled to [0,m+1] in single precision and truncated; a draw
   * that truncates to a value above m is rejected and repeated.
   */
  int uniform(int m){

    const float scale=static_cast<float>(m+1)/RAND_MAX;

    for(;;){
      const float scaled=scale*rand();
      const int candidate=static_cast<int>(scaled);
      if(candidate<=m)
        return candidate;
    }
  }

  /**
   * Returns rand() divided by RAND_MAX, evaluated in single precision.
   */
  float unifloat(){
    return static_cast<float>(rand())/RAND_MAX;
  }

  /**
   * Euclidean distance between row r1 of m1 and row r2 of m2.
   *
   * Both matrices are stored in column-major order with rows1 and rows2
   * rows respectively; only the first dim columns are compared.
   */
  float evalEuclDist(const float *m1, int r1,int rows1,const float *m2, int r2,int rows2,int dim){

    float sumsq=0;

    int col=0;
    while(col<dim){
      const float delta=m1[r1+col*rows1]-m2[r2+col*rows2];
      sumsq += delta*delta;
      ++col;
    }

    return sqrt(sumsq);

  }

  /**
   * Square root of the chi-square distance between row r1 of m1 and row r2
   * of m2.
   *
   * Both matrices are stored in column-major order with rows1 and rows2
   * rows respectively; only the first dim columns are compared.  A column
   * contributes (a-b)^2/(a+b) and is skipped unless a+b is positive.
   */
  float evalChisqDist(const float *m1, int r1,int rows1,const float *m2, int r2,int rows2,int dim){

    float acc=0;

    for(int col=0;col<dim;++col){
      const float a=m1[r1+col*rows1];
      const float b=m2[r2+col*rows2];

      const float delta=a-b;
      const float total=a+b;

      if(!(total>0))
        continue;

      acc += delta*delta/total;
    }

    return sqrt(acc);

  }
	

  /**
   * Reads an element count followed by that many floats from f into v.
   *
   * v is emptied first.  If a value cannot be parsed the function returns
   * false and v keeps the elements read up to that point.
   *
   * @return true if the count and all announced values were read
   */
  bool readFloatVectorFromFile(FILE *f,vector<float> &v){

    v.clear();

    int count;
    if(fscanf(f,"%d",&count)!=1)
      return false;

    for(int left=count;left>0;--left){
      float element;
      if(fscanf(f,"%f",&element)!=1)
        return false;
      v.push_back(element);
    }

    return true;
  }

  /**
   * Writes v to f on one line: the element count followed by the elements,
   * each preceded by a space, in the layout readFloatVectorFromFile()
   * parses.
   */
  void writeFloatVectorToFile(FILE *f, const vector<float> &v){

    const int count=(int)v.size();
    fprintf(f,"%d",count);

    for(size_t i=0;i<v.size();++i)
      fprintf(f," %f",v[i]);

    fprintf(f,"\n");
  }

  /**
   * Forward stagewise regression.
   *
   * Works on normalised copies of the inputs.  Starting from all-zero
   * coefficients, every iteration moves the coefficient of the regressor
   * that correlates most strongly with the current residual by steplen in
   * the direction of the correlation, and appends the resulting coefficient
   * vector to p.  Iteration stops after the first step that does not reduce
   * the squared error (that step is still recorded), or as soon as no
   * regressor can be selected.
   *
   * @param regressors matrix whose columns are the regressors
   * @param tgt        target column vector
   * @param steplen    size of one coefficient update
   * @param p          receives one coefficient vector per iteration
   */
  void stagewise(const floatMatrix &regressors, const floatMatrix &tgt, 
		 float steplen, regressionPath &p){

    p.clear();

    // copy and normalise the matrices

    floatMatrix r=regressors.clone();
    floatMatrix t=tgt.clone();

    normaliseColumns(r,true);
    normaliseColumns(t);

    floatMatrix muhat; // column vector of current target estimate
    muhat.rows=t.rows;
    muhat.cols=1;
    muhat.allocate();

    for(int i=0;i<t.rows;i++) // initialise muhat to zero
      muhat.dptr[i]=0;

    floatMatrix beta; // column vector of current regression coefficients
    beta.rows=r.cols;
    beta.cols=1;
    beta.allocate();

    for(int i=0;i<r.cols;i++)
      beta.dptr[i]=0; // start from zero coefficients

    floatMatrix err=t.clone(); // residual, initially the whole target

    int iteration=0;

    float sqrerr=0,olderr;

    // initially the error is the total length of the target vector

    for(int i=0;i<t.rows;i++)
      sqrerr += t.dptr[i]*t.dptr[i];

    do{

      olderr=sqrerr;

      cout << "stagewise iteration #" << iteration++ << endl;

      print_matrix("current estimate of the coefficients",
		   beta.rows,beta.cols,beta.dptr,beta.rows);

      int ld; // output variable of multiplymat

      // evaluate current correlations c= transpose(r) err

      floatMatrix c; // column vector of current correlations

      multiplymat(r.rows,r.cols, r.dptr, r.rows,
		  err.rows, err.cols, err.dptr,err.rows,
		  true, false,
		  c.rows, c.cols, &(c.dptr),ld);

      // find the regressor with the largest absolute correlation

      float maxabs=-1;
      int maxind=-1;

      for(int i=0;i<c.rows;i++){
	if(fabs(c.dptr[i])>maxabs){
	  maxabs=fabs(c.dptr[i]);
	  maxind=i;
	}
      }

      // nothing selectable (no regressors, or no comparable correlation):
      // stop instead of indexing with an unset position
      if(maxind<0) break;

      beta.dptr[maxind] += steplen * ((c.dptr[maxind]>0)?1:-1);

      // update the target estimate
      // calculate the current estimate muhat=r beta

      if(muhat.dptr) free(muhat.dptr);

      multiplymat(r.rows,r.cols, r.dptr, r.rows,
		  beta.rows, beta.cols, beta.dptr,beta.rows,
		  false, false,
		  muhat.rows, muhat.cols, &(muhat.dptr),ld);

      // evaluate the error

      for(int i=0;i<err.rows;i++)
	err.dptr[i] = t.dptr[i]-muhat.dptr[i];

      // calculate the error magnitude

      sqrerr=0;

      for(int i=0;i<err.rows;i++)
	sqrerr += err.dptr[i]*err.dptr[i];

      // record the coefficients reached in this iteration

      vector<float> v(beta.rows);

      for(int i=0;i<beta.rows;i++)
	v[i]=beta.dptr[i];

      p.push_back(v);

    } while(sqrerr<olderr);

  }

  void lars(const floatMatrix &regressors, const floatMatrix &tgt, regressionPath &p){

    // perform the algorithm with rather low numerical precision
    // in order to avoid numerical problems

    float epsilon=0.001;

    p.clear();

    // copy and normalise the matrices to zero mean 

    floatMatrix r=regressors.clone();
    floatMatrix t=tgt.clone();

    normaliseColumns(r,true);
    normaliseColumns(t);

//     print_matrix("normalised regressors",
// 		 r.rows,r.cols,r.dptr,r.rows);

//     print_matrix("normalised target",
// 		 t.rows,t.cols,t.dptr,t.rows);

    floatMatrix muhat; // column vector of current target estimate
    muhat.rows=t.rows;
    muhat.cols=1;
    muhat.allocate();

     for(int i=0;i<t.rows;i++) // initialise muhat to zero
       muhat.dptr[i]=0;

    floatMatrix beta; // column vector of current regression coefficients
    beta.rows=r.cols;
    beta.cols=1;
    beta.allocate();

    floatMatrix err=t.clone();

    for(int i=0;i<r.cols;i++)
      beta.dptr[i]=0; // start from zero coefficients

    // These variables seem to go unused
    // - Matti
    // int iteration=0;

    // float sqrerr=0,olderr;
    float sqrerr=0;

    // steps replicated rather straightforwardly fro the Matlab
    // implementation by Xiaohui Chen (xiaohuic@ece.ubc.ca)

    // initially the error is the total length of the target vector

    for(int i=0;i<t.rows;i++)
      sqrerr += t.dptr[i]*t.dptr[i];

    int n=r.rows;
    int vars=r.cols;

    int m=fmin(vars,n-1); // maximum number of vars in the final set of active variables

    int nVars=0;

    while(nVars<m){

      // this does not seem to be used (-Matti)
      // olderr=sqrerr;

//        cout << "lars iteration #" << iteration++ << endl;

//        print_matrix("current estimate of the coefficients",
//  		   beta.rows,beta.cols,beta.dptr,beta.rows);


      int ld; // output variables of multiplymat
      
//       print_matrix("current estimate of the target",
// 		   muhat.rows,muhat.cols,muhat.dptr,muhat.rows);

      // evaluate current correlations c= transpose(r) err

      floatMatrix c; // column vector of current correlations

      multiplymat(r.rows,r.cols, r.dptr, r.rows,
		  err.rows, err.cols, err.dptr,err.rows,
		  true, false,
		  c.rows, c.cols, &(c.dptr),ld);
      
//        print_matrix("current correlations",
//  		   c.rows,c.cols,c.dptr,c.rows);
      
      float maxabs=-1;
      // this variable seems to go unused
      // -Matti
      // int maxind;

      for(int i=0;i<c.rows;i++){
	if(fabs(c.dptr[i])>maxabs){
	  maxabs=fabs(c.dptr[i]);
	  // maxind=i;
	}
      }

      if(maxabs<epsilon) break; // early stopping criterion

      // identify the active set of regressors and collect them as a matrix Xa

      vector<int> active,inactive;

      for(int i=0;i<c.rows;i++)
	if(maxabs-fabs(c.dptr[i])<epsilon)
	  active.push_back(i);
	else
	  inactive.push_back(i);

//       cout << "current active set: " << endl;
//       for(auto it=active.begin();it!=active.end();it++)
// 	cout << "#"<<*it<<" ";
//       cout << endl;

      floatMatrix Xa;

      Xa.rows=r.rows;
      Xa.cols=active.size();
      Xa.allocate();

      for(int c=0;c<Xa.cols;c++)
	for(int j=0;j<Xa.rows;j++)
	  Xa.dptr[j+c*Xa.rows]=r.dptr[j+active[c]*r.rows];
      
      // active part of r now projected to Xa

      // the signs of correlations of the active vars collected as a column vector

      floatMatrix s_A;
      s_A.rows=active.size();
      s_A.cols=1;
      s_A.allocate();

      for(int i=0;i<(int)active.size();i++)
	s_A.dptr[i]=(c.dptr[active[i]]>0)?1:-1;

      // the Gram matrix Ga = Xa'*Xa

      floatMatrix Ga;

      multiplymat(Xa.rows,Xa.cols, Xa.dptr, Xa.rows,
		  Xa.rows, Xa.cols, Xa.dptr,Xa.rows,
		  true, false,
		  Ga.rows, Ga.cols, &(Ga.dptr),ld);

      // and its inverse

      floatMatrix Ginv=Ga.clone();

      invertmatrixinplace(Ginv.rows,Ginv.cols,Ginv.dptr,Ginv.rows);

      // find the coefficients wa of the equiangular vector
      floatMatrix wa;

      multiplymat(Ginv.rows,Ginv.cols, Ginv.dptr, Ginv.rows,
		  s_A.rows,s_A.cols, s_A.dptr,s_A.rows,
		  false, false,
		  wa.rows, wa.cols, &(wa.dptr),ld);

      // calculate "Length" of Ga

      float La=0;
      for(int i=0;i<wa.rows;i++)
	La += s_A.dptr[i]*wa.dptr[i];

      La=1.0/sqrt(La);

      // normalise wa so that we get unit length equiangular vector

      for(int i=0;i<wa.rows;i++)
	wa.dptr[i] *= La;

      // find the equiangular vector ua=Xa * wa

      floatMatrix ua;

      multiplymat(Xa.rows,Xa.cols, Xa.dptr, Xa.rows,
		  wa.rows,wa.cols, wa.dptr,wa.rows,
		  false, false,
		  ua.rows, ua.cols, &(ua.dptr),ld);

      // angles between ua and columns of r a=r' * ua

      floatMatrix a;

      multiplymat(r.rows,r.cols, r.dptr, r.rows,
		  ua.rows,ua.cols, ua.dptr,ua.rows,
		  true, false,
		  a.rows, a.cols, &(a.dptr),ld);


      // determine the step length gamma


      float gamma;

      if((int)active.size()==m){
	// maximum number of vars active, move directly to the OLS solution

	gamma=maxabs/La;

      } else{

	float minstep=888888;
        // this variable appears to be unused (although set, see below)
        // so I commented it out to suppress compiler warnings
        // -Matti
	// int minind=-1;

	for(int j=0;j<(int)inactive.size();j++){
	  int jj=inactive[j];
	  float val1=(maxabs-c.dptr[jj])/(La-a.dptr[jj]);
	  float val2=(maxabs+c.dptr[jj])/(La+a.dptr[jj]);

	  if(val1>0 && val1< minstep){
	    minstep=val1;
	    // minind=jj;
	  }

	  if(val2>0 && val2< minstep){
	    minstep=val2;
	    // minind=jj;
	  }
	}

	gamma=minstep;
      }

      // update the coefficient estimate
      // and the target estimate

      for(int i=0;i<(int)active.size();i++){
	beta.dptr[active[i]] += gamma*wa.dptr[i];
      }

      for(int i=0;i<n;i++)
	muhat.dptr[i] += gamma * ua.dptr[i];
	


//       print_matrix("new estimate of the coefficients",
// 		   beta.rows,beta.cols,beta.dptr,beta.rows);

      // update the target estimate
      // calculate the current estimate muhat=r beta

      for(int i=0;i<err.rows;i++)
	err.dptr[i] = t.dptr[i]-muhat.dptr[i];

//       print_matrix("new error vector",
// 		   err.rows,err.cols,err.dptr,err.rows);

      // calculate the error magnitude

      sqrerr=0;

      for(int i=0;i<err.rows;i++)
	sqrerr += err.dptr[i]*err.dptr[i];

      //      cout << "squared error " << olderr << " -> " << sqrerr << endl; 

      vector<float> v(beta.rows);
      nVars=0;
      
      for(int i=0;i<beta.rows;i++){
	v[i]=beta.dptr[i];
	if(v[i]!=0) nVars++;
      }

      p.push_back(v);
      

    } 

    


  }


  /**
   * Centres every column of m to zero mean in place and, on request,
   * rescales each centred column to unit Euclidean length.
   *
   * Elements are addressed column-major: entry (r,c) is m.dptr[r+c*m.rows].
   *
   * A column whose centred entries are all zero (e.g. a constant column)
   * has no direction to normalise. It is left as zeros instead of being
   * multiplied by 1/0, which would fill the column with NaNs.
   *
   * @param m       matrix to normalise; nothing is done when it has no
   *                rows, no columns or no data buffer
   * @param unitlen if true, divide each centred column by its Euclidean norm
   */
  void normaliseColumns(floatMatrix &m, bool unitlen){

    if(m.cols==0 || m.rows==0 || m.dptr==NULL) return;

    for(int c=0;c<m.cols;c++){

      // column mean
      float sum=0;
      for(int r=0;r<m.rows;r++)
        sum += m.dptr[r+c*m.rows];

      sum /= m.rows;

      // centre the column
      for(int r=0;r<m.rows;r++)
        m.dptr[r+c*m.rows] -= sum;

      if(!unitlen) continue;

      // squared Euclidean norm of the centred column
      float sqrsum=0;
      for(int r=0;r<m.rows;r++)
        sqrsum += m.dptr[r+c*m.rows]*m.dptr[r+c*m.rows];

      // a zero-norm column cannot be scaled to unit length; dividing by
      // zero here would turn every (zero) entry into NaN
      if(sqrsum>0){
        float idev=1.0/sqrt(sqrsum);

        for(int r=0;r<m.rows;r++)
          m.dptr[r+c*m.rows] *= idev;
      }
    }

  }

  void findBestNeurons(float *KM, float *tgt,int nsamples, int nneur, 
		       vector<int> &bestNeurons){

    cout << "starting pruning" << endl;


    // find ranking of neurons by the LARS algorithm

    regressionPath p;

    floatMatrix r,t;

    r.rows=nsamples;
    r.cols=nneur;
    r.dptr=KM;

    t.rows=nsamples;
    t.cols=1;
    t.dptr=tgt;

    lars(r,t,p);

    bestNeurons.clear();

    std::set<int> activevars;

     for(int i=0;i<(int)p.size();i++){
       for(int d=0;d<(int)p[i].size();d++)
	 if(p[i][d]!=0 && activevars.count(d)==0){
	   bestNeurons.push_back(d);
	   activevars.insert(d);
	 }
     }

     cout << "ranked neurons with lars: " << endl;

     for(auto it=bestNeurons.begin();it!=bestNeurons.end();it++){
       cout << "#"<<*it<<" ";
     }
     cout << endl;

     r.dptr=NULL; // prevent freeing of the memory areas
     t.dptr=NULL;

    // re-order the columns of KM according to the ranking

     int relvars=bestNeurons.size();
     // exclude neurons that are not selected by lars

     int relsamples=nsamples;
     // future option: for large number of samples, use only 
     // randomly selected K for determining the best number of neurons

     float *KM_reordered=new float[relsamples*relvars];

     for(int s=0;s<relsamples;s++)
       for(int n=0;n<relvars;n++){
	 KM_reordered[s+n*relsamples]=KM[s+bestNeurons[n]*nsamples];
       }

     // select the number of neurons for which the 
     // model selection criterion is to be evaluated

     vector<int> testsizes;

     int i;

     for(i=0;i<fmin(10,relvars);i++)
       testsizes.push_back(i+1);
     for(;i<relvars;i+=4)
       testsizes.push_back(i+1);

     if(i-4+1<relvars) // always include the full set of variables
       testsizes.push_back(relvars);

     // for future reference, evaluate var(tgt)

     float sum=0,sqrsum=0;

     for(int i=0;i<nsamples;i++){
       sum += tgt[i];
       sqrsum += tgt[i]*tgt[i];
     }

     sum /= nsamples;
     sqrsum /= nsamples;

     float vartgt=sqrsum-sum*sum; // slightly biased estimate

     cout << "var(tgt)="<<vartgt<<endl;

     float largeErr=2*relsamples*vartgt;

    // evaluate the PRESS statistic to the vector press

     vector<float> press(testsizes.size(),largeErr);

     float minpress;
       
    // row ~ number of neurons
    // column ~each training sample

     int stopcount=0;

     for(int nn_idx=0;stopcount<5&&nn_idx<(int)testsizes.size();nn_idx++){

       int nneur=testsizes[nn_idx];

       cout << "evaluating PRESS for " << nneur << " neurons" << endl;

       // cut the portion of the reordered kernel matrix and augment 
       // it with a column of ones

       float *KM2=new float[(nneur+1)*relsamples];
       
       // copy the first nneur columns from KM_reordered
       memcpy(KM2,KM_reordered,nneur*relsamples*sizeof(float));

       // the constant column
       for(int i=0;i<relsamples;i++)
	 KM2[i+nneur*relsamples]=1;

       // try to form the smoother matrix by first calculating
       // P=inv(KM2'*KM2)

       float *P;
       int m,n,ld; // storage for the output dimension

       int nn1=nneur+1;



       multiplymat(relsamples, nn1, KM2,relsamples,
		   relsamples,nn1,KM2,relsamples,
		   true, false,
		   m, n, &P,ld);

       //       cout << "matrix C formed:" << endl;

       //print_matrix("C=",m,n,P,m);

       // P should be of size nn1 x nn1

       invertmatrixinplace(nn1,nn1,P,nn1);

       //cout << "inversion done" << endl;

       // reject nneur with too badly conditioned P

       float eps=1e-17;

       if(rcond(m,n,P,m)<eps)
	 continue;

       //cout << "condition test passed" << endl;

       // calculate the smoother matrix as S=KM2 * P * KM2'

       float *T;
       float *S;

       multiplymat(nn1, nn1, P,nn1,
		   relsamples,nn1,KM2,relsamples,
		   false, true,
		   m, n, &T,ld);

       //print_matrix("Intermediate result P * KM2'=",m,n,T,m);

       multiplymat(relsamples, nn1,KM2,relsamples,
		   nn1,relsamples,T,nn1,
		   false, false,
		   m, n, &S,ld);

       //print_matrix("Calculated smoother matrix S=",m,n,S,m);

       
       // S should have now size relsamples*relsamples

       

       // calculate the output weights as beta = pinv(KM2) * tgt(1:relsamples);       
       
       float *pi,*beta;
       int pim,pin,ldpi;

       int one=1;

       pseudoinverse(relsamples, nn1, KM2, relsamples, pim, pin, &pi, ldpi);
       //    print_matrix("pseudoinverse", pim,pin,pi,ldpi );

       multiplymat(pim, pin, pi,ldpi,
		   relsamples,one,tgt,relsamples,
		   false, false,
		   m, n, &beta,ld);


       // the estimates of the model yhat=KM2*beta

       float *yhat;

       multiplymat(relsamples, nn1, KM2,relsamples,
		   nn1,one,beta,nn1,
		   false, false,
		   m, n, &yhat,ld);

       // size of yhat now relsamples x 1

       // sum over the training samples and accumulate PRESS

       press[nn_idx]=0;

       for(int i=0;i<relsamples;i++){
	 float e=yhat[i]-tgt[i];
	 float d=1-S[i+i*relsamples];
	 if(d>eps)
	   press[nn_idx] += (e*e)/(d*d);
	 else{
	   press[nn_idx]=largeErr;
	   break;
	 }
       }

       // free the accumulated matrices before moving to next model size

       delete[] KM2;
       free(P);
       free(T);
       free(S);
       free(beta);
       free(pi);
       free(yhat);

       if(nn_idx==0 || press[nn_idx]<minpress)
	 minpress=press[nn_idx];

       if(nn_idx>0 && (press[nn_idx]>vartgt*relsamples*1.5 || press[nn_idx]>1.5*minpress))
	 stopcount++;
     }

     // for debugging, dump the press statistic to a file

     string fn="pressdump";
     FILE *f=fopen(fn.c_str(),"w");
     if(f){
       for(int i=0;i<(int)press.size();i++)
	 fprintf(f,"%d %f\n",testsizes[i],press[i]);
       fclose(f);
     }


    // determine the best number of neurons

     int optsize=-1;

     for(int i=0;i<(int)press.size();i++)
       if(press[i]==minpress){
	 optsize=testsizes[i];
	 break;
       }

     assert(optsize>0);

     bestNeurons.resize(optsize);

  }


  /**
   * Trains the classifier on three-component samples.
   *
   * The samples are z-scored column-wise (the statistics are stored in
   * means and stddevs), input weights W and biases b are drawn uniformly
   * from [-3, 3], and the output weights are obtained from the normal
   * equations betas = (H'H)^-1 H' * desiredOutputs, where H holds the
   * hidden-layer activations (one row per sample, one column per neuron).
   *
   * @param samples            CV_32FC1 matrix, one sample per row, 3 columns
   * @param desiredOutputs     CV_32FC1 matrix with one row per sample and
   *                           1 or 2 columns
   * @param N                  number of hidden neurons
   * @param activationFunction activation applied to each hidden neuron;
   *                           stored in g
   */
  void CvElmClassifier::train(const cv::Mat& samples, const cv::Mat& desiredOutputs, size_t N, 
                              float(*activationFunction)(float)) {
    // input contract
    assert(samples.type() == CV_32FC1);
    assert(desiredOutputs.type() == CV_32FC1);
    assert(samples.rows == desiredOutputs.rows);
    assert(samples.cols == 3);
    assert(desiredOutputs.cols == 1 || desiredOutputs.cols == 2);

    // standardise the inputs; means and stddevs are kept as members
    cv::Mat normalisedSamples;
    math::zScoreByColumn(samples, normalisedSamples, means, stddevs);

    // random input weights (N x 3) and biases (N x 1)
    W = slmotion::random::unifrnd<float>(-3., 3., N, 3);
    b = slmotion::random::unifrnd<float>(-3., 3., N, 1);
    g = activationFunction;

    // hidden layer output matrix, Nsamples x Nneurons:
    // H(sample, neuron) = g(w_neuron . x_sample + b_neuron)
    cv::Mat H(samples.rows, N, CV_32FC1, cv::Scalar::all(0));
    for (int sampleIdx = 0; sampleIdx < H.rows; ++sampleIdx) {
      for (int neuron = 0; neuron < H.cols; ++neuron) {
        float preActivation = b.at<float>(neuron,0);
        for (int component = 0; component < 3; ++component)
          preActivation += W.at<float>(neuron,component) *
            normalisedSamples.at<float>(sampleIdx,component);
        H.at<float>(sampleIdx,neuron) = g(preActivation);
      }
    }

    assert(H.rows == samples.rows);
    assert(H.cols == W.rows);
    assert(H.cols == (int)N);

    // least-squares output weights through the normal equations
    betas = (H.t()*H).inv()*H.t()*desiredOutputs;
  }



  /**
   * Serialises the model as text in the layout read back by the
   * CvElmClassifier(const std::string&) constructor:
   *
   *   CvElmClassifier
   *   name of the activation function ("tanhf", "sigmoid" or "UNKNOWN")
   *   N (the number of neurons)
   *   N lines with 3 floats each (rows of W)
   *   N lines with 1 float each (biases b)
   *   N lines with one float per output column (rows of betas)
   *   the rows of means
   *   the rows of stddevs
   *
   * Values on one line are separated by single spaces. Floats are written
   * with 9 significant digits so that every single-precision value is
   * recovered exactly when the text is parsed again; the default stream
   * precision of 6 digits silently altered the model on a save/load cycle.
   *
   * @return the textual representation of the model
   */
  std::string CvElmClassifier::toString() const {
    assert(W.rows == b.rows);
    assert(W.rows == betas.rows);
    assert(W.cols == 3);
    assert(b.cols == 1);
    assert(W.type() == CV_32FC1);
    assert(b.type() == CV_32FC1);
    assert(betas.type() == CV_32FC1);

    std::ostringstream oss;
    // 9 == max_digits10 of an IEEE-754 single: enough for a lossless
    // float -> text -> float round trip
    oss.precision(9);

    oss << "CvElmClassifier\n";
    if (g == &tanhf)
      oss << "tanhf";
    else if (g == &slmotion::sigmoid)
      oss << "sigmoid";
    else
      oss << "UNKNOWN";

    oss << "\n" << W.rows << "\n";

    // one text line per matrix row, entries separated by single spaces
    auto writeMatrix = [&oss](const cv::Mat& m) {
      for (int i = 0; i < m.rows; ++i) {
        for (int j = 0; j < m.cols; ++j) {
          if (j != 0)
            oss << " ";
          oss << m.at<float>(i,j);
        }
        oss << "\n";
      }
    };
    writeMatrix(W);
    writeMatrix(b);
    writeMatrix(betas);
    writeMatrix(means);
    writeMatrix(stddevs);

    return oss.str();
  }



  /**
   * Loads a model from a text file in the layout produced by toString():
   *
   *   line 0            "CvElmClassifier"
   *   line 1            activation function, "tanhf" or "sigmoid"
   *   line 2            N, the number of neurons
   *   next N lines      3 input weights each (rows of W)
   *   next N lines      1 bias each (b)
   *   next N lines      1 or 2 output weights each (rows of betas)
   *   next line         column means used for z-scoring
   *   next line         column standard deviations used for z-scoring
   *   last line         empty
   *
   * @param fn path of the model file
   * @throws ConfigurationFileException if the file does not follow the
   *         layout above (wrong identifier, unsupported activation
   *         function, wrong line count, or a line with too few values)
   */
  CvElmClassifier::CvElmClassifier(const std::string& fn) {
    std::vector<std::string> lines = readFileLines(fn);
    
    if (lines.size() < 4)
      throw ConfigurationFileException(fn + " is a malformed ELM model: too few lines.");
    
    if (lines[0] != "CvElmClassifier")
      throw ConfigurationFileException(fn + " is a malformed ELM model: unknown identifier line: \"" + 
                                       lines[0] + "\".");

    if (lines[1] != "tanhf" && lines[1] != "sigmoid")
      throw ConfigurationFileException(fn + " is a malformed ELM model: \"" + 
                                       lines[1] + "\" is not a supported activation function.");
    
    if (lines[1] == "tanhf")
      g = &tanhf;
    else if (lines[1] == "sigmoid")
      g = &slmotion::sigmoid;

    size_t nNeurons = static_cast<size_t>(AdvancedString(lines[2]));

    // 3 header lines, 3 blocks of nNeurons lines, the means and stddevs
    // lines, and the empty line that ends the file
    if (lines.size() != 3 + nNeurons*3 + 1 + 2)
      throw ConfigurationFileException(fn + " is a malformed ELM model: line count mismatch. " +
                                       AdvancedString(6 + nNeurons*3).c_str() + " lines expected");

    // input weights: every row must supply the 3 values that are read
    W = cv::Mat(nNeurons, 3, CV_32FC1);
    for (size_t i = 0; i < nNeurons; ++i) {
      auto vec = AdvancedString(lines[i+3]).split(" ");
      if (vec.size() < 3)
        throw ConfigurationFileException(fn + " is a malformed ELM model: 3 weights expected on line \"" +
                                         lines[i+3] + "\".");
      for (int j = 0; j < 3; ++j) 
        W.at<float>(i,j) = static_cast<float>(vec[j]);
    }

    // biases
    b = cv::Mat(nNeurons, 1, CV_32FC1);
    for (size_t i = 0; i < nNeurons; ++i) 
      b.at<float>(i) = static_cast<float>(AdvancedString(lines[i+3+nNeurons]));
    
    // output dimensionality is taken from the first row of output weights
    int m = AdvancedString(lines[3+2*nNeurons]).split(" ").size();
    if (m != 2 && m != 1)
      throw ConfigurationFileException((AdvancedString("1- or 2-dimensional "
                                                       "outputs expected for a "
                                                       "binary classifier! Got "
                                                       ) +  AdvancedString(m))
                                       .c_str());
 
    // output weights: a row of different length is a file error, so it
    // is reported in release builds too instead of only being asserted
    betas = cv::Mat(nNeurons, m, CV_32FC1);
    for (size_t i = 0; i < nNeurons; ++i) {
      auto vec = AdvancedString(lines[i+3+2*nNeurons]).split(" ");
      if (vec.size() != static_cast<size_t>(m))
        throw ConfigurationFileException(fn + " is a malformed ELM model: inconsistent number of "
                                         "output weights on line \"" + lines[i+3+2*nNeurons] + "\".");
      for (int j = 0; j < m; ++j)
        betas.at<float>(i,j) = static_cast<float>(vec[j]);
    }

    // normalisation statistics; predict() reads three components of each
    auto meanFields = AdvancedString(lines[3+3*nNeurons]).split(" ");
    auto stddevFields = AdvancedString(lines[4+3*nNeurons]).split(" ");
    if (meanFields.size() < 3 || stddevFields.size() < 3)
      throw ConfigurationFileException(fn + " is a malformed ELM model: 3 means and 3 standard "
                                       "deviations expected.");

    means = cv::Mat(1, meanFields.size(), CV_32FC1);
    for (int j = 0; j < means.cols; ++j)
      means.at<float>(0,j) = static_cast<float>(meanFields[j]);

    // bounded by stddevs.cols (was means.cols, which overran the fields
    // and the matrix whenever the stddev line was the shorter one)
    stddevs = cv::Mat(1, stddevFields.size(), CV_32FC1);
    for (int j = 0; j < stddevs.cols; ++j)
      stddevs.at<float>(0,j) = static_cast<float>(stddevFields[j]);
  }



  /**
   * Copy assignment via copy-and-swap: a copy of the source is made first
   * and its members are then exchanged with the members of this object.
   * Self-assignment is a no-op.
   *
   * @param that object to copy from
   * @return reference to this object
   */
  CvElmClassifier& CvElmClassifier::operator=(const CvElmClassifier& that) {
    if (this == &that)
      return *this;

    CvElmClassifier copy(that);

    // model parameters
    std::swap(W, copy.W);
    std::swap(b, copy.b);
    std::swap(betas, copy.betas);
    std::swap(g, copy.g);

    // input normalisation statistics
    std::swap(means, copy.means);
    std::swap(stddevs, copy.stddevs);

    return *this;
  }



  /**
   * Move assignment: takes over the matrices of the source object and
   * copies its activation function pointer. Self-assignment is a no-op.
   *
   * @param that object to move from
   * @return reference to this object
   */
  CvElmClassifier& CvElmClassifier::operator=(CvElmClassifier&& that) {
    if (this == &that)
      return *this;

    // model parameters
    W = std::move(that.W);
    b = std::move(that.b);
    betas = std::move(that.betas);
    g = that.g; // plain function pointer, nothing to move

    // input normalisation statistics
    means = std::move(that.means);
    stddevs = std::move(that.stddevs);

    return *this;
  }



  /**
   * Evaluates the network for a single three-component sample.
   *
   * The sample is z-scored with the stored means and stddevs, passed
   * through the hidden layer g(W*x + b) and combined with the output
   * weights betas. The result is asserted to be a single CV_32FC1 value.
   *
   * @param sample three 8-bit components
   * @return the scalar network output
   */
  float CvElmClassifier::predict(const cv::Vec3b& sample) const {
    // normalised input as a 3x1 column vector
    cv::Mat input(3,1,CV_32FC1);
    for (int c = 0; c < 3; ++c) {
      const float mean = means.at<float>(0,c);
      const float stddev = stddevs.at<float>(0,c);
      input.at<float>(c,0) = (sample[c]-mean)/stddev;
    }

    // hidden layer: affine map followed by the element-wise activation
    cv::Mat hidden = W*input + b;
    for (int row = 0; row < hidden.rows; ++row) {
      float& h = hidden.at<float>(row);
      h = g(h);
    }

    // output layer
    cv::Mat output = betas.t() * hidden;
    assert(output.type() == CV_32FC1);
    assert(output.cols == 1);
    assert(output.rows == 1);

    return output.at<float>(0,0);
  }
}

