#
# Multi-Tensor Factorization
# Created: 20.7.2013
# Last Updated: 1.08.2014
# Suleiman Ali Khan, suleiman.khan@aalto.fi, khan.suleiman@gmail.com
#
#
# Y[[m]] \approx X(U)W[[m]]^T, m=1,..,M (M views in total)
# X, latent variables
# U, projection of tensorial dimension (only for tensor views)
# W, the projections (list)
# Z, M by K binary matrix indicating group activity
# alpha, list of same size of W
#

# Copyright 2013 Suleiman Ali Khan. All rights reserved.
# Citation: Suleiman A. Khan, Eemeli Leppäaho, Samuel Kaski, Multi-Tensor Factorization http://arxiv.org/abs/1412.4679.


hadamard.product <- function(x,y)
{
 h <- array(0,dim=c(nrow(x),nrow(y),ncol(x)))
 for(k in 1:ncol(x))
   h[,,k] <- x[,k] %*% t(y[,k])
 return(h)
}
library(compiler)
#install.packages(tensor)
library(tensor)

#Y is a list of Tensors(represented as arrays in R) as input
MTF <- function(Y,IsMat,K,opts){
  #
  # the hyperparameter settings may not work for all (real) data sets
  # if match data inverse variance? can be sampled (justified solution)
  #
  
  M <- length(Y)
  N <- dim(Y[[1]])[1]
  if(sum(IsMat==0) == 0)
	  L <- 1
  else  
	  L <- dim(Y[[which(!IsMat)[1]]])[3] #DONE
  D <- unlist(lapply(Y,function(x){dim(x)[2]}))
  Ls <-unlist(lapply(Y,function(x){dim(x)[3]})); Ls[is.na(Ls)] = 1

  const <- - N*sum(Ls*D)*log(2*pi)/2 #DONE 
  id <- rep(1,K)
  X <- matrix(rnorm(N*K),N,K)# latent variable
  covX <- diag(K)
  XX <- crossprod(X)
  XXtmp <- XX
  XXdiag <- rep(1,K)

  if(sum(IsMat==0) == 0)
  	U <- matrix(1,L,K)
  else
  	U <- matrix(rnorm(L*K),L,K) #DONE
  covU <- diag(K)
  UU <- crossprod(U)
  UUdiag <- rep(1,K)

  alphaU <- matrix(K,L,K)
  alphaX <- matrix(K,N,K)
  alpha <- vector("list")
  for(m in 1:M)
    alpha[[m]] <- matrix(K,D[m],K)
  # note that alpha_0 typically not important as added to D/2 ND/2 or K
  alpha_0 <- opts$prior.alpha_0 # ARD hyperparameters
  beta_0 <- opts$prior.beta_0 # rep(opts$prior.beta_0,2)
  Z <- matrix(1,M,K) # binary mask
  alpha_0t <- opts$prior.alpha_0t
  beta_0t <- opts$prior.beta_0t
  prior.betaW1 <- opts$prior.betaW1
  prior.betaW2 <- opts$prior.betaW2  
  ELEMENT_ALPHA <- opts$elementwiseAlpha	
  TAU_SETTING <- opts$tau_setting #"S"
  DEBUGMODE <- opts$debug_mode #TRUE/FALSE
  VERBOSE <- opts$verbose
  tau <- vector("list")
  b_tau <- vector("list")
  for(m in 1:M)
  {
    	tau[[m]] <- matrix(rep(opts$init.tau,Ls[m]*D[m]),Ls[m],D[m],byrow=TRUE)
    	b_tau[[m]] <- matrix(rep(1,Ls[m]*D[m]),Ls[m],D[m],byrow=TRUE) # initialization
    	if(VERBOSE==2) print("speed up possible - tau LD being created and stored")
  }
  a_tau <- N/2#(N*L)/2 #N/2#

  W <- vector("list",length=M)
  for(m in 1:M)
    W[[m]] <- matrix(0,D[m],K)

  # r is prior prabiliry for Z_{m,k}=1.
  # Preference to choose spike in case of tie
  # this is also sampled
  r <- rep(0.5,K)

  iter.burnin <- opts$iter.burnin
  iter.sampling <- opts$iter.sampling
  iter.thinning <- opts$iter.thinning  
  maxiter <- iter.burnin + iter.sampling*iter.thinning
  
  posterior <- list()
  if(iter.sampling>0)
  {
	  posterior$tau <- list(); length(posterior$tau) <- ceiling(iter.sampling/iter.thinning)
	  posterior$U <- list(); length(posterior$U) <- ceiling(iter.sampling/iter.thinning)
	  posterior$alphaU <- list(); length(posterior$alphaU) <- ceiling(iter.sampling/iter.thinning)
	  posterior$W <- list(); length(posterior$W) <- ceiling(iter.sampling/iter.thinning)
	  posterior$alpha <- list(); length(posterior$alpha) <- ceiling(iter.sampling/iter.thinning)
	  posterior$X <- list(); length(posterior$X) <- ceiling(iter.sampling/iter.thinning)
	  posterior$Z <- list(); length(posterior$Z) <- ceiling(iter.sampling/iter.thinning)
	  posterior$cost <- rep(NA,ceiling(iter.sampling/iter.thinning))
  }
    
   
  # the component numbers and loglikelihood
  spes1 <- spes2 <- shared <- cost <- Zactive <- rep(0,maxiter)  
  
  debug <- list()
  if(DEBUGMODE)
  {
 	  debug$taus <- rep(0,maxiter) 
	  debug$btau <- rep(0,maxiter)
	  debug$iter = 0  
	  debug$Umax <- rep(0,maxiter)
	  debug$Xmax <- rep(0,maxiter)  
	  debug$zone <- matrix(0,M,K)
	  debug$U <- list()
	  debug$alphaU <- list()
	  debug$W <- list()
	  debug$alpha <- list()
	  debug$X <- list()
  }
  
  ##Missing Values
  missingValues = FALSE
  na.inds = list()
  for(m in 1:M)
  {
  	inds = which(is.na(Y[[m]]))
  	if(length(inds)>0)
  	{
  		na.inds[[m]] = inds
  		missingValues = TRUE
  		print("Missing Values Detected, Prediction using EM type approximation")
  	}
  }
  na.inds[M+1] = 0

  #missingValues.InitIter 
  #0 - start prediction from mean of all values, and use updates of the model allways
  #between 2 and maxiter - use updates of the model in a single imputation setting for  iterations past the set value.
  #maxiter+1 - ignore missing values. never update with the model's single imputation. 
  missingValues.InitIter = round(iter.burnin/2,0) #5000
  if(missingValues)
  	print(paste("missingValues.InitIter",missingValues.InitIter))
  ## Missing Values end

  for(iter in 1:maxiter){
  if(VERBOSE==2)
    print(paste("iter: ",iter,sep=""))
    #
    # sample Z,U and W
    #
	
    if(iter > 1){
	XXUU <- XX*UU#DONE
	XXUUdiag <- diag(XXUU)
	diag(XXUU) <- 0 #for j!=k matrix summations
	XXtmp <- XX
	XXdiag <- diag(XXtmp)
	diag(XXtmp) <- 0 #for j!=k matrix summations
		
	for(m in 1:M){ #DONE
	for(k in 1:K){

	if(IsMat[m]) lambda <- tau[[m]][1,]*XXdiag[k] + alpha[[m]][,k] #DONE
	else lambda <- tau[[m]][1,]*XXUUdiag[k] + alpha[[m]][,k]
	
	if(missingValues && (missingValues.InitIter >= iter) )
	{
		mu_sub <- 0
		if(IsMat[m])
		{
			UX = X; UXK = UX[,k];  UX[,k] = 0
			ss <- tcrossprod(UX,W[[m]])
			tmp = Y[[m]]-ss; tmp[is.na(tmp)] = 0;
			mu_sub <- mu_sub + crossprod(tmp,UXK)		
		}
		else{
		for(l in 1:L)
		{
			UX = X*matrix(rep(U[l,],N),N,K,byrow=T); UXK = UX[,k];  UX[,k] = 0
			ss <- tcrossprod(UX,W[[m]])
			tmp = Y[[m]][,,l]-ss; tmp[is.na(tmp)] = 0;
			mu_sub <- mu_sub + crossprod(tmp,UXK)
		}}
		mu = mu_sub*tau[[m]][1,1]/lambda #TODO: Tau assumed to be "S" ONLY on this LINE
	}
	else
	{
		if(IsMat[m]) mu <- (tau[[m]][1,]/lambda)*(crossprod(Y[[m]],X[,k]) - W[[m]]%*%(XXtmp[k,]))
		else mu <- (tau[[m]][1,]/lambda)*(crossprod(tensor(Y[[m]],U[,k],3,1),X[,k]) - W[[m]]%*%(XXUU[k,]))
	}  
	#logpr of activating the component
      	logpr <- 0.5*( sum(log(alpha[[m]][,k]) - log(lambda)) + crossprod(mu*sqrt(lambda)) ) + log(r[k]) - log(1-r[k]) # m - k
	zone <- 1/(1+exp(-logpr))
	if(DEBUGMODE)
		debug$zone[m,k] = logpr
	  
	if(iter > 500){
		a = Z[m,k]
	  	Z[m,k] <- as.double((runif(1) < zone))
	}
	  
	if(Z[m,k]==1){
		W[[m]][,k] <- mu + 1/sqrt(lambda)*rnorm(D[m])
	}else{
		W[[m]][,k] <- 0
	}
	}
	}
	
	r <- rep( (prior.betaW1+sum(Z))/(M*K+prior.betaW1+prior.betaW2), K)
    }else{
	# alternative is to use method similar to VB
	# note now this is more connected to initialization
	for(m in 1:M){
	if(missingValues && (missingValues.InitIter < iter))
	{
	      ##Could give bad init when high missing values. therefore skip from here.
	      if(length(na.inds[[m]])>0)
	      Y[[m]][na.inds[[m]]] = mean(Y[[m]][-na.inds[[m]]])
	}
	if(IsMat[m]) covW <- diag(1,K) + opts$init.tau*XX #DONE
	else covW <- diag(1,K) + opts$init.tau*XX*UU
        eSW <- eigen(covW)
        covW <- tcrossprod(eSW$vectors*outer(id,1/eSW$values),eSW$vectors)		
	estimW = matrix(0,D[m],K) #equivalent of crossprod(Y[[m]],X)
	for(k in 1:K){
		if(missingValues && (missingValues.InitIter >= iter))
		{
			tmp = Y[[m]]; tmp[is.na(tmp)] = 0;
			if(IsMat[m]) estimW[,k] = crossprod(tmp,X[,k]) #DONE
			else estimW[,k] = crossprod(tensor(tmp,U[,k],3,1),X[,k]) 
		}
		else{
			if(IsMat[m]) estimW[,k] = crossprod(Y[[m]],X[,k]) #DONE
			else estimW[,k] = crossprod(tensor(Y[[m]],U[,k],3,1),X[,k])
		}
	}
	W[[m]] <- estimW%*%covW*opts$init.tau + matrix(rnorm(D[m]*K),D[m],K)%*%t( eSW$vectors*outer(id,1/sqrt(eSW$values)) )
	}

	#Initialize alphaU here
	if(TRUE == ELEMENT_ALPHA)
	{
	##Elementwise Alpha
	alphaU <- apply((U^2)/2,c(1,2),function(x){rgamma(1,shape=alpha_0+1/2,rate=beta_0+x)})
	#alphaX <- apply((X^2)/2,c(1,2),function(x){rgamma(1,shape=alpha_0+1/2,rate=beta_0+x)})
	}
	else
	{
	##Viewwise Alpha
	tmp <- apply(U^2,2,function(x){rgamma(1,shape=alpha_0+1/2,rate=beta_0+sum(x)/2)})
	alphaU <- matrix(rep(tmp,L),L,K,byrow=TRUE)
	#tmp <- apply(X^2,2,function(x){rgamma(1,shape=alpha_0+1/2,rate=beta_0+sum(x)/2)})
	#alphaX <- matrix(rep(tmp,N),N,K,byrow=TRUE)    	
	}
    }# end initialization else
	
	if(DEBUGMODE && sum(is.na(unlist(W))))
	{
	print("NA - Detected: W STOP")
	browser()
	}
################################################################################	
	#
	# sample X (latent variables)
	#
################################################################################

	#Sample X_n Matrix Version
	covX <- diag(K)
	for(m in 1:M){
		if(IsMat[m]) covX <- covX + crossprod(W[[m]]*sqrt(tau[[m]][1,])) #DONE
		else covX <- covX + crossprod(W[[m]]*sqrt(tau[[m]][1,]))*UU
	}
	eSX <- eigen(covX)
	covX <- tcrossprod( eSX$vectors*outer(id,1/sqrt(eSX$values)) )
	X <- 0
	if(missingValues && (missingValues.InitIter >= iter) )
	{
		for(m in 1:M){
		tmp = Y[[m]]; tmp[is.na(tmp)] = 0;
		if(IsMat[m]) X <- X + tmp %*% (W[[m]]*tau[[m]][1,1]) #DONE
		else{
			for(l in 1:L) X <- X + ((tmp[,,l]*matrix(rep(tau[[m]][1,],N),N,D[m],byrow=TRUE)) %*% (W[[m]]*matrix(rep(U[l,],D[m]),D[m],ncol(U),byrow=TRUE)))			
		}
		}
	}
	else
	{
		for(m in 1:M){
		if(IsMat[m]) X <- X + Y[[m]] %*% (W[[m]]*tau[[m]][1,1]) #DONE
		else{
			for(l in 1:L) X <- X + ((Y[[m]][,,l]*matrix(rep(tau[[m]][1,],N),N,D[m],byrow=TRUE)) %*% (W[[m]]*matrix(rep(U[l,],D[m]),D[m],ncol(U),byrow=TRUE)))
		}
		}
	}
	X <-  X%*%covX + matrix(rnorm(N*K),N,K)%*%t( eSX$vectors*outer(id,1/sqrt(eSX$values)) )

	if(DEBUGMODE && (sum(is.na(covX)) | sum(is.na(X))) )
	{
	print("NA - Detected: X STOP")
	browser()
	}
	
	# do scaling 
	Rx <- apply(X,2,sd)
	X <- X*outer(rep(1,N),1/Rx)
	for(m in 1:M)
		W[[m]] <- W[[m]]*outer(rep(1,D[m]),Rx)
	XX <- crossprod(X)
	
################################################################################
	#
	# sample U 
	#
################################################################################

  if(sum(IsMat==0) != 0) #Update if there is atleast one tensor, else ignore U
  {
	#Sample U_L, Matrix Version is correct for ARD and slow for standard normal. Verified computationally.	
	covU.bk <- 0
	for(m in 1:M){ #DONE not only all M
		if(!IsMat[m]) covU.bk <- covU.bk + crossprod(W[[m]]*sqrt(tau[[m]][1,]))*XX
	}
	U <- 0
	if(missingValues && (missingValues.InitIter >= iter) )
	{
		for(m in 1:M){
		if(!IsMat[m]){ #DONE
			tmp = Y[[m]]; tmp[is.na(tmp)] = 0;
			for(n in 1:N){			
		      		U <- U + ((t(tmp[n,,])*matrix(rep(tau[[m]][1,],L),L,D[m],byrow=TRUE)) %*% (W[[m]]*matrix(rep(X[n,],D[m]),D[m],ncol(X),byrow=TRUE)))
			}
		}}
	}
	else
	{
		for(m in 1:M){
			if(!IsMat[m]) {
			for(n in 1:N) U <- U + ((t(Y[[m]][n,,])*matrix(rep(tau[[m]][1,],L),L,D[m],byrow=TRUE)) %*% (W[[m]]*matrix(rep(X[n,],D[m]),D[m],ncol(X),byrow=TRUE))) }
		}
	}
	
	if(opts$U_prior == "Gaussian")
	{
		covU <- covU.bk + diag(K) #Gaussian Prior with variance I on U
		eS <- eigen(covU) 
		covU <- tcrossprod( eS$vectors*outer(id,1/sqrt(eS$values)) )
		U <- U%*%covU
		U <-  U + matrix(rnorm(L*K),L,K)%*%t( eS$vectors*outer(id,1/sqrt(eS$values)) )
	}
	if(opts$U_prior == "ARD")
	{
		for(l in 1:L)
		{	
		covU <- covU.bk + diag(alphaU[l,]) #ARD Prior
		eS <- eigen(covU) 
		covU <- tcrossprod( eS$vectors*outer(id,1/sqrt(eS$values)) )
		U[l,] <- U[l,]%*%covU
		U[l,] <-  U[l,] + matrix(rnorm(K),1,K)%*%t( eS$vectors*outer(id,1/sqrt(eS$values)) ) 		
		}
	}
	if(DEBUGMODE && (sum(is.na(covU)) | sum(is.na(U))) )
	{
	print("NA - Detected: U STOP")
	browser()
	}

	# do scaling
	if(L == 1)
	{
		Ru <- as.vector(U)
		U <- U*outer(rep(1,L),1/Ru)
		U[is.na(U)] = 0	
		for(m in 1:M) #DONE, not in all M
			if(!IsMat[m]) W[[m]] <- W[[m]]*outer(rep(1,D[m]),Ru)		
	}
	else
	{
		Ru <- apply(U,2,sd) 
		U <- U*outer(rep(1,L),1/Ru)
		U[is.na(U)] = 0	
		for(m in 1:M) #DONE, not in all M
			if(!IsMat[m]) W[[m]] <- W[[m]]*outer(rep(1,D[m]),Ru)		
	}
	UU <- crossprod(U)
   }
################################################################################	
	#
	# Sample Hyperparameters
	#	
################################################################################

	# sample alpha
	if(TRUE == ELEMENT_ALPHA)
	{
	#Elementwise alpha
 	alphaU <- U^2/2
  	keep <- which(alphaU!=0)
  	for(n in keep) alphaU[n] <- rgamma(1,shape=alpha_0+0.5,rate=beta_0+alphaU[n])+ 1e-7 #+ 1e-7 to prevent bad values of gamma
  	alphaU[-keep] <- rgamma( length(alphaU[-keep]),shape=alpha_0,rate=beta_0)+ 1e-7
	}
	else
	{
	#View-Wise alpha
	tmp <- apply(U^2,2,function(x){rgamma(1,shape=alpha_0+L/2,rate=beta_0+sum(x)/2)})
        alphaU <- matrix(rep(tmp,L),L,K,byrow=TRUE)+ 1e-7
        }
	
	for(m in 1:M)
	{
	  if(TRUE == ELEMENT_ALPHA)
	  {
          #Elementwise alpha
	  alpha[[m]] <- W[[m]]^2/2
	  keep <- which(alpha[[m]]!=0)
	  for(n in keep) alpha[[m]][n] <- rgamma(1,shape=alpha_0+0.5,rate=beta_0+alpha[[m]][n])+ 1e-7
	  alpha[[m]][-keep] <- rgamma( length(alpha[[m]][-keep]),shape=alpha_0,rate=beta_0)+ 1e-7
	  }
	  else
	  {
  	  #View-Wise alpha
	  tmp <- apply(W[[m]]^2,2,function(x){rgamma(1,shape=alpha_0+(as.double(sum(x)!=0)*D[m])/2,rate=beta_0+sum(x)/2)})
          alpha[[m]] <- matrix(rep(tmp,D[m]),D[m],K,byrow=TRUE)+ 1e-7
	  }
	}
	if(DEBUGMODE && (sum(is.na(unlist(alpha))) | sum(is.na(alphaU))) )
	{
	print("NA - Detected: Alpha STOP")
	browser()
	}

	#
	# sample noise
	#
	XU <- hadamard.product(X,U)
	for(m in 1:M){
	if(IsMat[m]){ #DONE
		XW <- tcrossprod(X,W[[m]])
		if(missingValues && (missingValues.InitIter > iter) )
		{
			XW <- Y[[m]] - XW
			XW[is.na(XW)] = 0;
			b_tau[[m]][1,] <- 0.5*crossprod((XW)^2,rep(1,dim(Y[[m]])[1]))
		}
		else{
			if(missingValues)
			{
				if(length(na.inds[[m]])>0)
					Y[[m]][na.inds[[m]]] = XW[na.inds[[m]]]
			}
			b_tau[[m]][1,] <- 0.5*crossprod((Y[[m]] - XW)^2,rep(1,dim(Y[[m]])[1]))
		}

		#Vector b_tau
		if("S" == TAU_SETTING)
		{
			if(VERBOSE==2) print("TAU-S")
			a_tau <- (N*Ls[m]*D[m])/2 #DONE
			b_tau2 <- sum(b_tau[[m]][1,])
			b_tau[[m]] <- matrix(b_tau2,Ls[m],D[m])
			tau[[m]] <- matrix(rgamma(1,shape= alpha_0t[m]+ a_tau,rate= beta_0t[m] + b_tau2),Ls[m],D[m])
		}
	}
	else {
		#single valued tau
		#b_tau[m] <- ( tau_tmp[m] - 2*sum( W[[m]]*crossprod(Y[[m]],X) ) + 
		#                sum((W[[m]]%*%XX)*W[[m]]) )/2
		  
		#Matrix b_tau generation of D values (not LD) is correct, verified computationally
		XWU <- aperm(tensor(XU,W[[m]],3,2),c(1,3,2))
		if(missingValues && (missingValues.InitIter > iter) ) #in the last iter of ignoring missing values, update Y[[m]] with the model updates
		{
			XWU <- Y[[m]] - XWU
			XWU[is.na(XWU)] = 0;
			b_tau[[m]][1,] <- 0.5*tensor( tensor((XWU)^2,rep(1,dim(Y[[m]])[3]),3,1) ,rep(1,dim(Y[[m]])[1]),1,1) 
		}
		else{
			if(missingValues)
			{
				if(length(na.inds[[m]])>0)
					Y[[m]][na.inds[[m]]] = XWU[na.inds[[m]]]
			}
			b_tau[[m]][1,] <- 0.5*tensor( tensor((Y[[m]] - XWU)^2,rep(1,dim(Y[[m]])[3]),3,1) ,rep(1,dim(Y[[m]])[1]),1,1) 
		}

		#Vector b_tau
		if("S" == TAU_SETTING)
		{
			if(VERBOSE==2) print("TAU-S")
			a_tau <- (N*Ls[m]*D[m])/2
			b_tau2 <- sum(b_tau[[m]][1,])
			b_tau[[m]] <- matrix(b_tau2,Ls[m],D[m])
			tau[[m]] <- matrix(rgamma(1,shape= alpha_0t[m]+ a_tau,rate= beta_0t[m] + b_tau2),Ls[m],D[m])
		}
    	}
    	}

    # calculate log likelihood
    cost[iter] <- const
    for(m in 1:M)
    {
    if("S" == TAU_SETTING)
    	cost[iter] <- cost[iter] + N*L*D[m]*0.5*log(tau[[m]][1,1]) - b_tau[[m]][1,1]*tau[[m]][1,1] # Will change when tau reduced to D vector
    if("D" == TAU_SETTING)
    	cost[iter] <- cost[iter] + N*L*0.5*sum(log(tau[[m]][1,])) - sum(b_tau[[m]][1,]*tau[[m]][1,])
    if("LD" == TAU_SETTING)
    	cost[iter] <- cost[iter] + N*0.5*sum(log(tau[[m]])) - sum(b_tau[[m]]*tau[[m]])
    }
	if(DEBUGMODE)	
	{
		debug$taus[iter] <- mean(unlist(tau))
		debug$btau[iter] <- mean(unlist(b_tau))	
		debug$Xmax[iter] <- max(abs(X))
		debug$Umax[iter] <- max(abs(U))    
	}

    # print component number and collect numbers
    if(VERBOSE==2)
    {
	    print(rowSums(Z))
	    print(paste("Empty",sum(colSums(Z) == 0)))	    
	    print(cost[iter])
    }
    if((VERBOSE==1) && ((iter %% 500) == 0))
    	print(iter)
    #print(range(X))
    Zactive[iter] <- sum(rowSums(Z))
    if(M == 1)
    {
	shared[iter] <- sum(Z[1,])
    	spes1[iter] <- spes2[iter] <- 0
    }
    else
    {
    		if(M == 2)
    		{
    		shared[iter] <- sum(Z[1,]*Z[2,])
		spes1[iter] <- sum(Z[1,]) - shared[iter]
		spes2[iter] <- sum(Z[2,]) - shared[iter]
    		}
    		else
    		{
		shared[iter] <- sum(colSums(Z) == nrow(Z))
		spes1[iter] <- sum(colSums(Z) == 1) #specific only
		spes2[iter] <- sum( colSums(Z) > 1 &  colSums(Z) < nrow(Z) ) #partly shared
		}
    }

	if(DEBUGMODE && (sum(is.na(U)) | sum(is.na(X)) | sum(is.na(W[[1]])) | sum(is.na(unlist(tau)))) ) 
	{
	print("NA - Detected: STOP")
	for(s in 1:5)
		browser()
	}

  if(iter > iter.burnin && (((iter - iter.burnin) %% iter.thinning) == 0))
  {
	ind <- (iter - iter.burnin)/iter.thinning
	posterior$tau[[ind]] <- tau
	posterior$U[[ind]] <- U
	posterior$alphaU[[ind]] <- alphaU
	posterior$W[[ind]] <- W
	posterior$alpha[[ind]] <- alpha
	posterior$X[[ind]] <- X
	posterior$Z[[ind]] <- Z
	posterior$cost[ind] <- cost[iter]
  }

  }
  return(list(shared=shared,spes1=spes1,spes2=spes2,cost=cost,Zactive=Zactive,alpha=alpha,X=X,W=W,Z=Z,U=U,alpha=alpha,alphaU=alphaU,tau=tau,posterior=posterior,debug=debug,opts=opts))
}
MTF <- cmpfun(MTF)



getDefaultOpts <- function(Y){
  #
  # A function for generating a default set of parameters.
  #
  # To run the algorithm with other values:
  #   opts <- getDefaultOpts()
  #   opts$init.tau <- 10^6
  #   model <- MTF(Y,IsMat,K,opts)
  
 
  #
  # Initial value for the noise precisions. Should be large enough
  # so that the real structure is modeled with components
  # instead of the noise parameters (see Luttinen&Ilin, 2010)
  #  Values: Positive numbers, but generally should use values well
  #          above 1
  #
  init.tau <- 10^3
  
  #
  # Parameters for controlling when the algorithm stops.
  # It stops when the relative difference in the lower bound
  # falls below iter.crit or iter.max iterations have been performed.
  #
  iter.crit <- 10^-6
  iter.burnin <- 10e3
  
  #
  # Additive noise level for latent variables. The latent variables
  # of inactive components (those with very large alpha) occasionally
  # show some structure in the mean values, even though the distribution
  # matches very accurately the prior N(0,I). This structure disappears
  # is a tiny amount of random noise is added on top of the
  # mean estimates. Setting the value to 0 will make the predictions
  # deterministic
  #
  addednoise <- 1e-6
  
  #
  # Hyperparameters
  # - alpha_0, beta_0 for the ARD precisions
  # - alpha_0t, beta_0t for the residual noise predicions
  #
  prior.alpha_0 <- prior.beta_0 <- 1e-3
  prior.alpha_0t <- prior.beta_0t <- rep(1,length(Y))
  
  #
  # Verbosity level
  #  0: Nothing
  #  1: Final cost function value for each run of gsCCAexperiment()
  #  2: Cost function values for each iteration
  #
  verbose <- 1
  
  #
  # Element wise Alpha
  #  TRUE: The Alpha Parameter be Element wise
  #  FALSE: Alpha Parameter should be component wise (Default)
  #  
  elementwiseAlpha <- FALSE

  #
  # Prior for Binary variables.
  #  W1 = W2 = 1: uninformative
  #  W1 < W2: sparsity inducing
  #    
  prior.betaW1 <- 1
  prior.betaW2 <- 1
  
  
  iter.sampling <- 1000
  iter.thinning <- 1
  tau_setting = "S"
  debug_mode = FALSE
  U_prior = "Gaussian"

  return(list(init.tau=init.tau,
              iter.burnin=iter.burnin,
              addednoise=addednoise,
              prior.alpha_0=prior.alpha_0,prior.beta_0=prior.beta_0,
              prior.alpha_0t=prior.alpha_0t,prior.beta_0t=prior.beta_0t,
              verbose=verbose,elementwiseAlpha=elementwiseAlpha,
              prior.betaW1=prior.betaW1,prior.betaW2=prior.betaW2,
	      iter.sampling=iter.sampling,iter.thinning=iter.thinning,
		tau_setting = tau_setting,
  		debug_mode = debug_mode,
	        U_prior = U_prior))
}

informative.Prior.Tensor <- function(Y,prop.to.be.explained.by.noise,conf,tau_setting="S")
{
	# This function sets view wise paramteres for an informative prior such that the mean of   
	# noise prior is equal to the proportion of variance we want to be explained
	# by noise. The confidence paramter sets width of the prior distribution.
	
	## INPUT
	## Y: list of datasets. data normalization does not matters.
	## prop.to.be.explained.by.noise: proportion of total variance of each view to be explained by noise. Valid Values: 0.01 -> 0.99
	## conf: width of the distribution. Valid values: 1e-7 -> 100
	
	#### OUTPUT
	#### a list containing 2 vectors: prior.alpha_0t and prior.beta_0t
	#### prior.alpha_0t is alpha_0t paramter of HyperPrior for each view
	#### prior.beta_0t is beta_0t paramter of HyperPrior for each view

	## Derivation
	## alpha = conf*alpha
	## beta = conf*amount_of_var 
	## For gamma mu = a/b
	## In model a and b have opposite role so, amount_of_var = b/a, i.e. b = a*amount_of_var, b = conf*total_amount_of_var
	
	D <- unlist(lapply(Y,function(x){dim(x)[2]}))
	N <- unlist(lapply(Y,function(x){dim(x)[1]}))
	L <- unlist(lapply(Y,function(x){dim(x)[3]})); L[is.na(L)] = 1
	M <- length(Y)

	if(tau_setting == "S")
		prior.alpha_0t <- conf*N*L*D/2
	else
		prior.alpha_0t <- rep(conf*N*L/2, M)

	view.var.modes <- rep(NA, M)
	for (i in 1:M) {
	    total.variance.in.view <- sum(Y[[i]][!is.na(Y[[i]])]^2)/2 #(N*L*D[i]*var(Y[[i]][!is.na(Y[[i]])]))/2
	    if(tau_setting != "S")
	    	total.variance.in.view <- total.variance.in.view/D[i]
	    	
	    sigma.hat <- prop.to.be.explained.by.noise * total.variance.in.view
	    view.var.modes[i] <- sigma.hat
	}
	
	prior.beta_0t <- conf * view.var.modes
	return(list(prior.alpha_0t=prior.alpha_0t,prior.beta_0t=prior.beta_0t))
}
