#
# Tensor CCA/GFA
# Created: 20.7.2013
# Last Updated: 23.6.2014
# Suleiman Ali Khan, suleiman.khan@aalto.fi
#
#
# Y[[m]] \approx X(U)W[[m]]^T, m=1,..,M (M views in total)
# X, latent variables
# U, projection of tensorial dimension
# W, the projections (list)
# Z, M by K binary matrix indicating group activity
# alpha, list of same size of W
#
# Citation: Suleiman A. Khan, Samuel Kaski, Bayesian Multi-View Tensor Factorization, ECML 2014.

hadamard.product <- function(x,y)
{
 h <- array(0,dim=c(nrow(x),nrow(y),ncol(x)))
 for(k in 1:ncol(x))
   h[,,k] <- x[,k] %*% t(y[,k])
 return(h)
}
library(compiler)
#install.packages(tensor)
library(tensor)
SampleDimension1 = 1
SampleDimension2 = 3
GFAdimension = 2

#Y is a list of Tensors(represented as arrays in R) as input
bmtf <- function(Y,K,opts){
  #
  # the hyperparameter settings may not work for all (real) data sets
  # if match data inverse variance? can be sampled (justified solution)
  #
  
  M <- length(Y)
  N <- dim(Y[[1]])[SampleDimension1]
  L <- dim(Y[[1]])[SampleDimension2]
  D <- unlist(lapply(Y,function(x){dim(x)[GFAdimension]}))

  const <- - N*L*sum(D)*log(2*pi)/2 #likelihood
  id <- rep(1,K)
  X <- matrix(rnorm(N*K),N,K)# latent variable
  covX <- diag(K)
  XX <- crossprod(X) # t(X) %*% X
  XXdiag <- rep(1,K)

  U <- matrix(rnorm(L*K),L,K)# Latent 3rd Dimension matrix(1,L,K)#
  #U <- matrix(1,L,K)# Latent 3rd Dimension matrix(1,L,K)#
  #print("INITIALIZE U BY POSITIVE VALUES ONLY")  
  #covU <- diag(K)
  UU <- crossprod(U) # t(U) %*% U
  UUdiag <- rep(1,K)

  alphaU <- matrix(K,L,K)
  alphaX <- matrix(K,N,K)
  alpha <- vector("list")
  for(m in 1:M)
    alpha[[m]] <- matrix(K,D[m],K)
  # note that alpha_0 typically not important as added to D/2 ND/2 or K
  alpha_0 <- opts$prior.alpha_0 # ARD hyperparameters
  beta_0 <- opts$prior.beta_0 # rep(opts$prior.beta_0,2)
  Z <- matrix(1,M,K) # binary mask
  alpha_0t <- opts$prior.alpha_0t
  beta_0t <- opts$prior.beta_0t
  prior.betaW1 <- opts$prior.betaW1
  prior.betaW2 <- opts$prior.betaW2  
  ELEMENT_ALPHA <- opts$elementwiseAlpha	
  TAU_SETTING <- opts$tau_setting 
  DEBUGMODE <- opts$debug_mode #TRUE/FALSE
  VERBOSE <- opts$verbose
  tau <- vector("list")
  b_tau <- vector("list")
  for(m in 1:M)
  {
	    	tau[[m]] <- matrix(rep(opts$init.tau,L*D[m]),L,D[m],byrow=TRUE)
	    	b_tau[[m]] <- matrix(rep(1,L*D[m]),L,D[m],byrow=TRUE) # initialization
	    	if(VERBOSE==2) print("speed up possible - tau LD being created and stored")
  }
  a_tau <- N/2

  W <- vector("list",length=M)
  for(m in 1:M)
    W[[m]] <- matrix(0,D[m],K)

  # r is prior prabiliry for Z_{m,k}=1.
  # Preference to choose spike in case of tie
  # this is also sampled
  r <- rep(0.5,K)

  iter.burnin <- opts$iter.burnin
  iter.sampling <- opts$iter.sampling
  iter.thinning <- opts$iter.thinning  
  maxiter <- iter.burnin + iter.sampling*iter.thinning
  
  posterior <- list()
  if(iter.sampling>0)
  {
	  posterior$tau <- list(); length(posterior$tau) <- ceiling(iter.sampling/iter.thinning)
	  posterior$U <- list(); length(posterior$U) <- ceiling(iter.sampling/iter.thinning)
	  posterior$alphaU <- list(); length(posterior$alphaU) <- ceiling(iter.sampling/iter.thinning)
	  posterior$W <- list(); length(posterior$W) <- ceiling(iter.sampling/iter.thinning)
	  posterior$alpha <- list(); length(posterior$alpha) <- ceiling(iter.sampling/iter.thinning)
	  posterior$X <- list(); length(posterior$X) <- ceiling(iter.sampling/iter.thinning)
	  posterior$Z <- list(); length(posterior$Z) <- ceiling(iter.sampling/iter.thinning)
	  posterior$cost <- rep(NA,ceiling(iter.sampling/iter.thinning))
  }
    
   
   Y_U = list()
   for(m in 1:M){
   Y_U[[m]] = 0
	for(ii in 1:dim(Y[[m]])[3])
		Y_U[[m]] <- Y_U[[m]] + Y[[m]][,,ii]
   }
   Y_Z = list()
   for(m in 1:M){
   Y_Z[[m]] = 0
	for(ii in 1:dim(Y[[m]])[1])
		Y_Z[[m]] <- Y_Z[[m]] + Y[[m]][ii,,]
   }

  # the component numbers and loglikelihood
  spes1 <- spes2 <- shared <- cost <- Zactive <- rep(0,maxiter)  
  
  debug <- list()
  if(DEBUGMODE)
  {
 	  debug$taus <- rep(0,maxiter) #matrix(0,maxiter,sum(D)*L) # collect residual precisions
	  debug$btau <- rep(0,maxiter) #matrix(0,maxiter,sum(D)*L)
	  debug$iter = 0  
	  debug$Umax <- rep(0,maxiter)
	  debug$Xmax <- rep(0,maxiter)  
	  debug$zone <- matrix(0,M,K)
	  debug$U <- list()
	  debug$alphaU <- list()
	  debug$W <- list()
	  debug$alpha <- list()
	  debug$X <- list()
  }
  
  ##Missing Values
  missingValues = FALSE
  na.inds = list()
  for(m in 1:M)
  {
  	inds = which(is.na(Y[[m]]))
  	if(length(inds)>0)
  	{
  		na.inds[[m]] = inds
  		missingValues = TRUE
  		print("Missing Values Detected, Prediction using EM type approximation")
  	}
  }

  missingValues.InitIter = round(iter.burnin/2,0) 
  if(missingValues)
  	print("Missing Values Detected")
  ## Missing Values end
  
  for(iter in 1:maxiter){
  if(VERBOSE==2)
    print(paste("iter: ",iter,sep=""))
    #
    # sample Z,U and W
    #
	
    if(iter > 1){
	XXUU <- XX*UU
	XXUUdiag <- diag(XXUU)
	diag(XXUU) <- 0 #for j!=k matrix summations	
	
	for(m in 1:M){
	for(k in 1:K){
	##matrix computation of lambda is same as scalar below, but does not works in tau_LD case
        lambda <- tau[[m]][1,]*XXUUdiag[k] + alpha[[m]][,k]     
         


         ##matrix computation of mu is same as iterative below
         if(missingValues && (missingValues.InitIter >= iter) )
         {
	      	  #Faster and computationally verified correct version
         	  mu_sub <- 0
		  for(l in 1:L)
		  {
			  UX = X*matrix(rep(U[l,],N),N,K,byrow=T); UXK = UX[,k];  UX[,k] = 0
			  ss <- tcrossprod(UX,W[[m]])
			  tmp = Y[[m]][,,l]-ss; tmp[is.na(tmp)] = 0;
			  mu_sub <- mu_sub + crossprod(tmp,UXK)
	      	  }
	      	  mu = mu_sub*tau[[m]][1,1]/lambda #TODO: Tau assumed to be "S" ONLY on this LINE
         }
         else
	  mu <- (tau[[m]][1,]/lambda)*(crossprod(tensor(Y[[m]],U[,k],3,1),X[,k]) - W[[m]]%*%(XXUU[k,])) #MissingValue: might be done by setting na to zero. computational check required
	  
	  #logpr of activating the component
      logpr <- 0.5*( sum(log(alpha[[m]][,k]) - log(lambda)) + crossprod(mu*sqrt(lambda)) ) + log(r[k]) - log(1-r[k]) # m - k
      zone <- 1/(1+exp(-logpr))
      if(DEBUGMODE)
	  debug$zone[m,k] = logpr
	  
	  if(iter > 500){
	  	a = Z[m,k]
	  	Z[m,k] <- as.double((runif(1) < zone))
	  }
	  
	  # if(iter > 2000 && Z[m,k] > a)
	  # browser()
	  
          if(Z[m,k]==1){
            W[[m]][,k] <- mu + 1/sqrt(lambda)*rnorm(D[m])
          }else{
            W[[m]][,k] <- 0
          }
        }
	}
	
	r <- rep( (prior.betaW1+sum(Z))/(M*K+prior.betaW1+prior.betaW2), K)
    }else{
      # alternative is to use method similar to VB
      # note now this is more connected to initialization
      for(m in 1:M){
	      if(missingValues && (missingValues.InitIter < iter))
	      {
	      	      ##Could give bad init when high missing values. therefore skip from here.
		      if(length(na.inds[[m]])>0)
		      Y[[m]][na.inds[[m]]] = mean(Y[[m]][-na.inds[[m]]])
	      }
        covW <- diag(1,K) + opts$init.tau*XX*UU #covW <- diag(alpha[m,]) + tau[m]*XX*UU #covX 
        
        eSW <- eigen(covW)
        covW <- tcrossprod(eSW$vectors*outer(id,1/eSW$values),eSW$vectors)
		
		estimW = matrix(0,D[m],K) #equivalent of crossprod(Y[[m]],X)
		for(k in 1:K){
			if(missingValues && (missingValues.InitIter >= iter))
			{
				tmp = Y[[m]]; tmp[is.na(tmp)] = 0;
				estimW[,k] = crossprod(tensor(tmp,U[,k],3,1),X[,k])
			}
			else
			estimW[,k] = crossprod(tensor(Y[[m]],U[,k],3,1),X[,k]) #MissingValue:
			}
		W[[m]] <- estimW%*%covW*opts$init.tau + matrix(rnorm(D[m]*K),D[m],K)%*%t( eSW$vectors*outer(id,1/sqrt(eSW$values)) )
		
      }

	if(TRUE == ELEMENT_ALPHA)
	{
	##Elementwise Alpha
    	alphaU <- apply((U^2)/2,c(1,2),function(x){rgamma(1,shape=alpha_0+1/2,rate=beta_0+x)})
    	#alphaX <- apply((X^2)/2,c(1,2),function(x){rgamma(1,shape=alpha_0+1/2,rate=beta_0+x)})
	}
	else
	{
	##Viewwise Alpha
	tmp <- apply(U^2,2,function(x){rgamma(1,shape=alpha_0+1/2,rate=beta_0+sum(x)/2)})
	alphaU <- matrix(rep(tmp,L),L,K,byrow=TRUE)
	#tmp <- apply(X^2,2,function(x){rgamma(1,shape=alpha_0+1/2,rate=beta_0+sum(x)/2)})
	#alphaX <- matrix(rep(tmp,N),N,K,byrow=TRUE)    	
	}
    }# end initialization else
	
	if(DEBUGMODE && sum(is.na(unlist(W))))
	{
	print("NA - Detected: W STOP")
	browser()
	}
################################################################################	
	#
	# sample X (latent variables)
	#
################################################################################
	
	#Sample X_n Matrix Version
	covX <- diag(K)
	for(m in 1:M){
		covX <- covX + crossprod(W[[m]]*sqrt(tau[[m]][1,]))*UU
	}
	eSX <- eigen(covX)
	covX <- tcrossprod( eSX$vectors*outer(id,1/sqrt(eSX$values)) )
	X <- 0
	if(missingValues && (missingValues.InitIter >= iter) )
	{
		for(m in 1:M){
		tmp = Y[[m]]; tmp[is.na(tmp)] = 0;
		for(l in 1:L){
			X <- X + ((tmp[,,l]*matrix(rep(tau[[m]][1,],N),N,D[m],byrow=TRUE)) %*% (W[[m]]*matrix(rep(U[l,],D[m]),D[m],ncol(U),byrow=TRUE)))
		}
		}
	}
	else
	{
		for(m in 1:M){
		for(l in 1:L){
			X <- X + ((Y[[m]][,,l]*matrix(rep(tau[[m]][1,],N),N,D[m],byrow=TRUE)) %*% (W[[m]]*matrix(rep(U[l,],D[m]),D[m],ncol(U),byrow=TRUE)))
		}
		}
	}
	X <- X%*%covX
	X <-  X + matrix(rnorm(N*K),N,K)%*%t( eSX$vectors*outer(id,1/sqrt(eSX$values)) )

	if(DEBUGMODE && (sum(is.na(covX)) | sum(is.na(X))) )
	{
	print("NA - Detected: X STOP")
	browser()
	}
	
	# do scaling 
	# moving scale from X to W. Move both mean and variance scales?
	Rx <- apply(X,2,sd) #colMeans(X^2) #sd not var??
	X <- X*outer(rep(1,N),1/Rx)
	for(m in 1:M)
		W[[m]] <- W[[m]]*outer(rep(1,D[m]),Rx)
	XX <- crossprod(X)
	
################################################################################
	#
	# sample U 
	#
################################################################################
	
	# Sample U_K - scalar version
	#Sample U_L, Matrix Version is correct for ARD and slow for standard normal. Verified computationally.	
	covU.bk <- 0
	for(m in 1:M){
		covU.bk <- covU.bk + crossprod(W[[m]]*sqrt(tau[[m]][1,]))*XX
	}
	U <- 0
	if(missingValues && (missingValues.InitIter >= iter) )
	{
		for(m in 1:M){
		tmp = Y[[m]]; tmp[is.na(tmp)] = 0;
		for(n in 1:N){			
	      		U <- U + ((t(tmp[n,,])*matrix(rep(tau[[m]][1,],L),L,D[m],byrow=TRUE)) %*% (W[[m]]*matrix(rep(X[n,],D[m]),D[m],ncol(X),byrow=TRUE)))
		}
		}
	}
	else
	{
		for(n in 1:N){
		for(m in 1:M){
	      		U <- U + ((t(Y[[m]][n,,])*matrix(rep(tau[[m]][1,],L),L,D[m],byrow=TRUE)) %*% (W[[m]]*matrix(rep(X[n,],D[m]),D[m],ncol(X),byrow=TRUE)))
		}
		}
	}
	for(l in 1:L)
	{
		if(opts$U_prior == "Gaussian")
			covU <- covU.bk + diag(K) #Gaussian Prior with variance I on U
		if(opts$U_prior == "ARD")
			covU <- covU.bk + diag(alphaU[l,]) #ARD Prior
			
		eS <- eigen(covU) 
		covU <- tcrossprod( eS$vectors*outer(id,1/sqrt(eS$values)) )
		U[l,] <- U[l,]%*%covU
		U[l,] <-  U[l,] + matrix(rnorm(K),1,K)%*%t( eS$vectors*outer(id,1/sqrt(eS$values)) ) 		
	}

	if(DEBUGMODE && (sum(is.na(covU)) | sum(is.na(U))) )
	{
	print("NA - Detected: U STOP")
	browser()
	}
	# do scaling
	# moving scale to U from W. Components do not get switched off! though structure is found
	if(L == 1)
	{
		Ru <- as.vector(U)
		U <- U*outer(rep(1,L),1/Ru)
		U[is.na(U)] = 0	
		for(m in 1:M)
			W[[m]] <- W[[m]]*outer(rep(1,D[m]),Ru)		
	}
	else
	{
		Ru <- apply(U,2,sd) 
		U <- U*outer(rep(1,L),1/Ru)
		U[is.na(U)] = 0	
		for(m in 1:M)
			W[[m]] <- W[[m]]*outer(rep(1,D[m]),Ru)		
	}
	UU <- crossprod(U)
	
################################################################################	
	#
	# Sample Hyperparameters
	#	
################################################################################

	# sample alpha
	if(TRUE == ELEMENT_ALPHA)
	{
 	alphaU <- U^2/2
  	keep <- which(alphaU!=0)
  	for(n in keep) alphaU[n] <- rgamma(1,shape=alpha_0+0.5,rate=beta_0+alphaU[n])+ 1e-7 #+ 1e-7 to prevent bad values of gamma
  	alphaU[-keep] <- rgamma( length(alphaU[-keep]),shape=alpha_0,rate=beta_0)+ 1e-7
	}
	else
	{
	#View-Wise alpha
	tmp <- apply(U^2,2,function(x){rgamma(1,shape=alpha_0+L/2,rate=beta_0+sum(x)/2)})
        alphaU <- matrix(rep(tmp,L),L,K,byrow=TRUE)
        }
	
	for(m in 1:M)
	{
	  if(TRUE == ELEMENT_ALPHA)
	  {
	  alpha[[m]] <- W[[m]]^2/2
	  keep <- which(alpha[[m]]!=0)
	  for(n in keep) alpha[[m]][n] <- rgamma(1,shape=alpha_0+0.5,rate=beta_0+alpha[[m]][n])+ 1e-7
	  alpha[[m]][-keep] <- rgamma( length(alpha[[m]][-keep]),shape=alpha_0,rate=beta_0)+ 1e-7
	  }
	  else
	  {
  	  #View-Wise alpha
	  tmp <- apply(W[[m]]^2,2,function(x){rgamma(1,shape=alpha_0+(as.double(sum(x)!=0)*D[m])/2,rate=beta_0+sum(x)/2)})
          alpha[[m]] <- matrix(rep(tmp,D[m]),D[m],K,byrow=TRUE)
      }
	}
	if(DEBUGMODE && (sum(is.na(unlist(alpha))) | sum(is.na(alphaU))) )
	{
	print("NA - Detected: Alpha STOP")
	browser()
	}

	#
	# sample noise
	#
	XU <- hadamard.product(X,U)
	for(m in 1:M){
	#Matrix b_tau generation of D values (not LD) is correct, verified computationally
	XWU <- aperm(tensor(XU,W[[m]],3,2),c(1,3,2))
	if(missingValues && (missingValues.InitIter > iter) ) #in the last iter of ignoring missing values, update Y[[m]] with the model updates
	{
		XWU <- Y[[m]] - XWU
		XWU[is.na(XWU)] = 0;
		b_tau[[m]][1,] <- 0.5*tensor( tensor((XWU)^2,rep(1,dim(Y[[m]])[3]),3,1) ,rep(1,dim(Y[[m]])[1]),1,1) #MissingValue: after substraction na = zero
	}
	else{
		if(missingValues)
		{
			if(length(na.inds[[m]])>0)
				Y[[m]][na.inds[[m]]] = XWU[na.inds[[m]]]
		}
		b_tau[[m]][1,] <- 0.5*tensor( tensor((Y[[m]] - XWU)^2,rep(1,dim(Y[[m]])[3]),3,1) ,rep(1,dim(Y[[m]])[1]),1,1) #MissingValue: after substraction na = zero
	}

		#Vector b_tau
		if("D" == TAU_SETTING)
		{
			a_tau <- (N*L)/2
			for(d in 1:D[m])
				tau[[m]][1,d] <- rgamma(1,shape= alpha_0t[m]+ a_tau,rate= beta_0t[m] + b_tau[[m]][1,d])
			if(L>1)
			for(l in 2:L)
			{
				b_tau[[m]][l,] <- b_tau[[m]][1,]
				tau[[m]][l,] <- tau[[m]][1,]
			}
		}

		#Vector b_tau
		if("S" == TAU_SETTING)
		{
			a_tau <- (N*L*D[m])/2
			b_tau2 <- sum(b_tau[[m]][1,])
			b_tau[[m]] <- matrix(b_tau2,L,D[m])
			tau[[m]] <- matrix(rgamma(1,shape= alpha_0t[m]+ a_tau,rate= beta_0t[m] + b_tau2),L,D[m])
		}
    }

    # calculate log likelihood
    cost[iter] <- const
    for(m in 1:M)
    {
    if("S" == TAU_SETTING)
    	cost[iter] <- cost[iter] + N*L*D[m]*0.5*log(tau[[m]][1,1]) - b_tau[[m]][1,1]*tau[[m]][1,1] # Will change when tau reduced to D vector
    if("D" == TAU_SETTING)
    	cost[iter] <- cost[iter] + N*L*0.5*sum(log(tau[[m]][1,])) - sum(b_tau[[m]][1,]*tau[[m]][1,])
    if("LD" == TAU_SETTING)
    	cost[iter] <- cost[iter] + N*0.5*sum(log(tau[[m]])) - sum(b_tau[[m]]*tau[[m]])
    }	  
	if(DEBUGMODE)	
	{
	    debug$taus[iter] <- mean(unlist(tau))
	    debug$btau[iter] <- mean(unlist(b_tau))	
	    debug$Xmax[iter] <- max(abs(X))
	    debug$Umax[iter] <- max(abs(U))    
	}

    # print component number and collect numbers
    if(VERBOSE==2)
    {
	    print(rowSums(Z))
	    print(paste("Empty",sum(colSums(Z) == 0)))	    
	    print(cost[iter])
    }
    if((VERBOSE==1) && ((iter %% 500) == 0))
    	print(iter)
    #print(range(X))
    Zactive[iter] <- sum(rowSums(Z))
    if(M == 1)
    {
	shared[iter] <- sum(Z[1,])
    	spes1[iter] <- spes2[iter] <- 0
    }
    else
    {
    		if(M == 2)
    		{
    		shared[iter] <- sum(Z[1,]*Z[2,])
		spes1[iter] <- sum(Z[1,]) - shared[iter]
		spes2[iter] <- sum(Z[2,]) - shared[iter]
    		}
    		else
    		{
		shared[iter] <- sum(colSums(Z) == nrow(Z))
		spes1[iter] <- sum(colSums(Z) == 1) #specific only
		spes2[iter] <- sum( colSums(Z) > 1 &  colSums(Z) < nrow(Z) ) #partly shared
		}
    }

	if(DEBUGMODE && (sum(is.na(U)) | sum(is.na(X)) | sum(is.na(W[[1]])) | sum(is.na(unlist(tau)))) ) 
	{
	print("NA - Detected: STOP")
	for(s in 1:50)
		browser()
	}

  if(iter > iter.burnin && (((iter - iter.burnin) %% iter.thinning) == 0))
  {
	ind <- (iter - iter.burnin)/iter.thinning
	posterior$tau[[ind]] <- tau
	posterior$U[[ind]] <- U
	posterior$alphaU[[ind]] <- alphaU
	posterior$W[[ind]] <- W
	posterior$alpha[[ind]] <- alpha
	posterior$X[[ind]] <- X
	posterior$Z[[ind]] <- Z
	posterior$cost[ind] <- cost[iter]
  }

  }
  return(list(shared=shared,spes1=spes1,spes2=spes2,cost=cost,Zactive=Zactive,alpha=alpha,X=X,W=W,Z=Z,U=U,alpha=alpha,alphaU=alphaU,tau=tau,posterior=posterior,debug=debug,opts=opts))
  #return(list(X=X,W=W,Z=Z,U=U,alpha=alpha,alphaU=alphaU,tau=tau))
}
bmtf <- cmpfun(bmtf)



getDefaultOpts <- function(Y){
  #
  # A function for generating a default set of parameters.
  #
  # To run the algorithm with other values:
  #   opts <- getDefaultOpts()
  #   opts$opt.method <- "BFGS"
  #   model <- gsCCA(Y,K,opts)
  
  #
  # Whether to use the rotation explained in the ICML'11 paper.
  # Using the rotation is strongly recommended, only turn this
  # off if it causes problems.
  #  - TRUE|FALSE
  #
  rotate <- TRUE
  
  #
  # Parameters for controlling how the rotation is solved
  #  - opt.method chooses the optimization method and
  #    takes values "BFGS" or "L-BFGS". The former
  #    is typically faster but takes more memory, so the latter
  #    is the default choice. For small K may use BFGS instead.
  #  - opt.iter is the maximum number of iterations
  #  - lbfgs.factr is convergence criterion for L-BFGS; smaller
  #    values increase the accuracy (10^7 or 10^10 could be tried
  #    to speed things up)
  #  - bfgs.crit is convergence criterion for BFGS; smaller
  #    values increase the accuracy (10^-7 or 10^-3 could also be used)
  #
  opt.method <- "L-BFGS"
  opt.iter <- 10^5
  lbfgs.factr <- 10^3
  bfgs.crit <- 10^-5
  
  #
  # Initial value for the noise precisions. Should be large enough
  # so that the real structure is modeled with components
  # instead of the noise parameters (see Luttinen&Ilin, 2010)
  #  Values: Positive numbers, but generally should use values well
  #          above 1
  #
  init.tau <- 10^3
  
  #
  # Parameters for controlling when the algorithm stops.
  # It stops when the relative difference in the lower bound
  # falls below iter.crit or iter.max iterations have been performed.
  #
  iter.crit <- 10^-6
  
  #
  # Additive noise level for latent variables. The latent variables
  # of inactive components (those with very large alpha) occasionally
  # show some structure in the mean values, even though the distribution
  # matches very accurately the prior N(0,I). This structure disappears
  # is a tiny amount of random noise is added on top of the
  # mean estimates. Setting the value to 0 will make the predictions
  # deterministic
  #
  addednoise <- 1e-5
  
  #
  # Hyperparameters
  # - alpha_0, beta_0 for the ARD precisions
  # - alpha_0t, beta_0t for the residual noise predicions
  #
  prior.alpha_0 <- prior.beta_0 <- 1 #1e-3
  prior.alpha_0t <- prior.beta_0t <- rep(1,length(Y))
  
  #
  # Verbosity level
  #  0: Nothing
  #  1: Final cost function value for each run of gsCCAexperiment()
  #  2: Cost function values for each iteration
  #
  verbose <- 2
  
  #
  # Element wise Alpha
  #  TRUE: The Alpha Parameter be Element wise
  #  FALSE: Alpha Parameter should be component wise (Default)
  #  
  elementwiseAlpha <- TRUE

  #
  # Prior for Binary variables.
  #  W1 = W2 = 1: uninformative
  #  W1 < W2: sparsity inducing
  #    
  prior.betaW1 <- 1
  prior.betaW2 <- 1
  
iter.burnin <- 10000
iter.sampling <- 1000
iter.thinning <- 1
tau_setting <- "S"
U_prior <- "ARD" 
debug_mode <- FALSE
  
  return(list(rotate=rotate, init.tau=init.tau, iter.crit=iter.crit,debug_mode=debug_mode,
              iter.burnin=iter.burnin, iter.sampling=iter.sampling,
	      iter.thinning=iter.thinning, opt.method=opt.method,
              lbfgs.factr=lbfgs.factr, bfgs.crit=bfgs.crit, opt.iter=opt.iter,
              addednoise=1e-6,tau_setting=tau_setting,U_prior=U_prior,
              prior.alpha_0=prior.alpha_0,prior.beta_0=prior.beta_0,
              prior.alpha_0t=prior.alpha_0t,prior.beta_0t=prior.beta_0t,
              verbose=verbose,elementwiseAlpha=elementwiseAlpha,
              prior.betaW1=prior.betaW1,prior.betaW2=prior.betaW2))
}

informative.Prior.Tensor <- function(Y,prop.to.be.explained.by.noise,conf,tau_setting="S")
{
	# This function sets view wise paramteres for an informative prior such that the mean of   
	# noise prior is equal to the proportion of variance we want to be explained
	# by noise. The confidence paramter sets width of the prior distribution.
	
	## INPUT
	## Y: list of datasets. data normalization does not matters.
	## prop.to.be.explained.by.noise: proportion of total variance of each view to be explained by noise. Valid Values: 0.01 -> 0.99
	## conf: width of the distribution. Valid values: 1e-7 -> 100
	
	#### OUTPUT
	#### a list containing 2 vectors: prior.alpha_0t and prior.beta_0t
	#### prior.alpha_0t is alpha_0t paramter of HyperPrior for each view
	#### prior.beta_0t is beta_0t paramter of HyperPrior for each view

	## Derivation
	## alpha = conf*alpha_GFA (alpha_GFA in GFA derivations)
	## beta = conf*amount_of_var 
	## For gamma mu = a/b
	## In model a and b have opposite role so, amount_of_var = b/a, i.e. b = a*amount_of_var, b = conf*total_amount_of_var
	
	D <- unlist(lapply(Y,ncol))
	N <- dim(Y[[1]])[1]
	L <- dim(Y[[1]])[3]
	M <- length(Y)
	
	if(tau_setting == "S")
		prior.alpha_0t <- conf*N*L*D/2
	else
		prior.alpha_0t <- rep(conf*N*L/2, M)

	view.var.modes <- rep(NA, M)
	for (i in 1:M) {
	    total.variance.in.view <- sum(Y[[i]][!is.na(Y[[i]])]^2)/2 #(N*L*D[i]*var(Y[[i]][!is.na(Y[[i]])]))/2
	    if(tau_setting != "S")
	    	total.variance.in.view <- total.variance.in.view/D[i]
	    	
	    sigma.hat <- prop.to.be.explained.by.noise * total.variance.in.view
	    view.var.modes[i] <- sigma.hat
	}
	
	prior.beta_0t <- conf * view.var.modes
	return(list(prior.alpha_0t=prior.alpha_0t,prior.beta_0t=prior.beta_0t))
}
