##############################################################################################
#
# R code accompanying the following publication:
#
# E. Georgii, J. Salojärvi, M. Brosché, J. Kangasjärvi, and S. Kaski.
# Targeted Retrieval of Gene Expression Measurements Using Regulatory Models.
# Bioinformatics, 2012 (Virtual issue on "Computational Systems Biology" in conjunction with MLSB12).
#
# Uses the R package glmnet.
#
# License: GNU GPLv3 (GNU General Public License Version 3), see glp.txt or <http://www.gnu.org/licenses/>.
#
# Contact: elisabeth.georgii@aalto.fi
#
##############################################################################################

# Learn the regulatory model of gene expression by regularized linear regression for each target gene
learning <- function(targets, alpha=1, myLibraryPath="~/Library/R/") {
	library("glmnet", lib.loc=myLibraryPath)
	
	submodels <- list()
	
	nr <- 0
	for (i in targets) { # for each target gene
		sdi <- (ncol(DATABASE)/(ncol(DATABASE)-1))*sd(DATABASE[i,]) # maximum likelihood estimate
		enetResult <- glmnet(t(DATABASE[-i,]),DATABASE[i,],alpha=alpha, standardize=FALSE) # alpha=1 corresponds to Lasso regularization
		pen <- enetResult$lambda # lambda sequence sorted in decreasing order; for efficiency reasons whole path is fitted (see glmnet documentation; use nlambda and lambda.min.ratio arguments in glmnet if needed)
		
		thr <- (sdi/sqrt(ncol(DATABASE)))*qnorm(1-(0.05/(2*(nrow(DATABASE)^2))))  # analytical threshold in case of Lasso regularization (i.e., alpha=1)
		
		w <- which(pen<thr)
		if (length(w)==0) {
			w <- length(pen)
		}
		idx <- min(w) # closest to thr from below
		
		betaCurr <- rep(0, nrow(DATABASE))
		coeffWithoutI <- enetResult$beta[, idx]
		betaCurr[-i] <- coeffWithoutI
		wCurr <- which(betaCurr!=0)
		
		nr <- nr+1
		
		submodels[[nr]] <- list(nz=wCurr, co=betaCurr[wCurr], ic=enetResult$a0[idx], int=i)
	}
	
	return(submodels)
}

# Compute submodel-based similarity; each target gene has own submodel
computeSubmodelSimilarity <- function(submodel, query) {
	geneOfInt <- submodel$int
	neighbors <- submodel$nz
	coeff <- submodel$co
	
	ncolData <- ncol(DATABASE)
	sim <- rep(0, ncolData)
	if (length(neighbors)>0) {
		targetValues <- DATABASE[geneOfInt,]
		neighborValues <- DATABASE[neighbors, , drop = FALSE]
		
		deviations <- as.vector(targetValues - coeff %*% neighborValues)
		sdGeneOfInt <- sd(deviations)
		factGeneOfInt <- (1/(sdGeneOfInt^2))
		
		fisherScoresData <- factGeneOfInt * t(deviations * t(neighborValues))
		fisherScoreQuery <- factGeneOfInt * as.vector(query[geneOfInt] - coeff %*% query[neighbors]) * query[neighbors]
		
		
		sim <- fisherScoreQuery %*% fisherScoresData
	}
	
	return(sim)
	
}

# Compute model-based similarity of a query to the DATABASE
computeSimilarity <- function(query, model) { 
	
	simMatrix <- sapply(model, function(l, q) computeSubmodelSimilarity(l, q), q=query, simplify=T)
	sim <- apply(simMatrix, 1, sum)
		
}

# Get ranked lists from similarity scores
getRankedList <- function(outputs, names, outfile, kPrint=10, kFile=length(outputs)) {
	
	sortedOutput <- sort(outputs, decreasing=TRUE, index.return = TRUE)
	rankedList <- names[sortedOutput$ix]
	topk <- rankedList[1:min(kPrint,length(outputs))]
	
	print(topk)
	
	if (file.exists(outfile)) {
  		file.remove(outfile)
	}
	
	# write to file
	for (i in 1:min(kFile,length(rankedList))) {
		cat(sprintf("%s\n", rankedList[i]), file=outfile, append=TRUE)
	}
	
	return(rankedList)
	
}

# Install glmnet package
installGlmnet <- function(myLibraryPath="~/Library/R/") {
	install.packages("glmnet",lib=myLibraryPath, dependencies = TRUE)
}

# DATABASE is set globally for efficiency reasons
setDatabase <- function(dataFile, indices) {
	load(dataFile) # data (e.g., genes times measurements)
	DATABASE <<- data[, indices] # global
	
	# normalize
	sc <- scale(t(DATABASE))
	mu <- attr(sc, "scaled:center")
	scalvec <- attr(sc, "scaled:scale")
	DATABASE <<- t(sc)
	return(list(m=mu, s=scalvec))
}

# Example application: query with some measurements against the remaining measurements; model learned without query data
## dataFile: name of R data file with genes times measurements matrix
## queryIndices: indices of measurements to be taken as queries
## targetIndices: indices of target genes (genes of interest)
## resPrefix: path for storing result files (for each query a file containing ranked list of measurements)
## mlp: my library path for R packages
testrun <- function(dataFile, queryIndices, targetIndices, resPrefix, mlp) {
	load(dataFile) 
	trainIndices <- setdiff(1:ncol(data), queryIndices)
	queries <- data[, queryIndices, drop=FALSE]
	rm(data)
	
	sinfo <- setDatabase(dataFile, trainIndices)
	
	# normalize
	for (j in 1:length(queryIndices)) {
		queries[, j] <- (queries[, j]-(sinfo$m))/(sinfo$s) 
	}
	
	mod <- learning(targetIndices, myLibraryPath=mlp)
	for (j in 1:length(queryIndices)) {
		sim <- computeSimilarity(queries[, j], mod)
		print(sprintf("%d. Query", j))
		getRankedList(sim, colnames(DATABASE), sprintf("%s_%d", resPrefix, queryIndices[j]))
	}
	
	
}

# Set library path and install glmnet if necessary
myLibraryPath <- "~/Library/R/" # change to your library path
if (!require("glmnet", lib.loc=myLibraryPath)) {
	installGlmnet(myLibraryPath)
}

# Example execution with two queries and three target genes; results stored into files results/rankedList_<query>
testrun("data/testdata.Rdata", c(59,60), c(2,4,9), "results/rankedList", myLibraryPath)

