transform2simcca.input <- function(featureType, data, mart, is_mirna) {
  ## Map features to Ensembl gene ids and reshape the data for simCCA.
  ##
  ## Arguments:
  ##   featureType  Name of the annotation column holding the feature ids that
  ##                appear in rownames(data); also passed to getGene() as `type`.
  ##                NOTE(review): in miRNA mode this presumably has to be
  ##                "mirbase_id" (the only id column requested) - confirm.
  ##   data         Feature-by-sample table: row names are feature ids, column
  ##                names are TCGA sample barcodes.
  ##   mart         Mart object handed to getGene() / getBM().
  ##   is_mirna     TRUE: annotate through the "with_mirbase" getBM() query;
  ##                FALSE: annotate rownames(data) through getGene().
  ##
  ## Value: a list with
  ##   $data  matrix of the sample columns, one row per Ensembl gene id,
  ##          column names truncated to the 12-character patient barcode;
  ##   $info  data.frame(chr, arm, loc) with the same row names, where `arm`
  ##          is the first character of the cytoband and `loc` the midpoint of
  ##          start_position and end_position.
  ##
  ## Alternative way to map CGH probes (not used here): chromosomal positions
  ## of the aCGH Agilent 244A probes via CanGEM, file GRCh37.txt from
  ## http://www.cangem.org/index.php?platform={E74F2BCF-A930-47FE-8AF5-C91D4F6748C7}
  if (!is_mirna) {
    gene_id <- getGene(rownames(data), type = featureType, mart = mart)
  } else {
    gene_id <- getBM(c("mirbase_id", "ensembl_gene_id", "start_position", "end_position", "chromosome_name", "band"), filters = "with_mirbase", values = TRUE, mart = mart)
  }

  # One annotation row per feature id.
  gene_id <- gene_id[!duplicated(gene_id[[featureType]]), , drop = FALSE]
  row.names(gene_id) <- gene_id[[featureType]]

  # Drop unknown contigs ("GL" + 8 characters, or names containing "H") and
  # the sex chromosomes. A logical mask is used on purpose: the previous
  # `gene_id[-c(which(...)), ]` form selected ZERO rows whenever no X/Y gene
  # was present, silently emptying the result.
  chrom <- as.character(gene_id$chromosome_name)
  unplaced <- grepl("GL........", chrom) | grepl("H", chrom)
  sex_linked <- chrom %in% c("X", "Y")
  gene_id <- gene_id[!(unplaced | sex_linked), , drop = FALSE]

  if (is_mirna) {
    row.names(gene_id) <- gsub("-mir", "", rownames(gene_id))
  }

  # merge(by = "row.names") puts a "Row.names" column first, then the
  # annotation columns, then the sample columns. Deriving the first sample
  # column from the annotation width reproduces the former constants
  # (11 for the 9-column getGene() result, 8 for the 6-column miRNA query)
  # without depending on them.
  st_index <- ncol(gene_id) + 2L
  mapped <- merge(gene_id, data, by = "row.names")
  # Collapse multiple probes matching the same gene (first one wins).
  mapped <- mapped[!duplicated(mapped$ensembl_gene_id), , drop = FALSE]

  tranformed_data <- list()
  # drop = FALSE keeps a one-sample input as a one-column matrix.
  tranformed_data$data <- as.matrix(mapped[, st_index:ncol(mapped), drop = FALSE])
  rownames(tranformed_data$data) <- mapped$ensembl_gene_id

  # Only the chromosome arm is needed by simCCA, not the full cytoband.
  mapped$band <- substr(mapped$band, 1, 1)

  tranformed_data$info <- data.frame(
    # as.character() first so a factor column yields labels, not level codes.
    chr = as.integer(as.character(mapped$chromosome_name)),
    arm = mapped$band,
    loc = (mapped$start_position + mapped$end_position) / 2
  )

  # The first 12 characters, TCGA(4).collection_centre(2).patient_id(4) plus
  # the two dots, identify a patient uniquely (see p. 57 of the TCGA Data
  # Guide: http://tcga-data.nci.nih.gov/docs/TCGA_Data_Primer.pdf).
  # All of these are tumor samples that originate from DNA.
  colnames(tranformed_data$data) <- substr(colnames(tranformed_data$data), 1, 12)

  rownames(tranformed_data$info) <- rownames(tranformed_data$data)

  return(tranformed_data)
}

my.read.delim <- function(path, rowPartition) {
  ## Reads a huge tab-delimited raw data file in up to four chunks and
  ## returns them stacked into one data.frame.
  ##
  ## Arguments:
  ##   path          File to read with read.delim().
  ##   rowPartition  Number of data rows per chunk; the fourth chunk takes
  ##                 whatever is left.
  ##
  ## The first chunk is read with the real header. Every later chunk skips
  ## `k * rowPartition` lines, so the last data row of the previous chunk is
  ## consumed as a throw-away header; the chunk's first column is then moved
  ## into the row names and the real column names are restored.
  ## Assumes the header has one field fewer than the data rows, so that
  ## read.delim() already uses the first column as row names for chunk one.
  ##
  ## Unlike the earlier fixed four-read version, reading stops as soon as a
  ## chunk comes back short, so files with fewer than 3 * rowPartition rows no
  ## longer fail with "no lines available in input".
  first_chunk <- read.delim(path, nrows = rowPartition)
  column_names <- colnames(first_chunk)
  chunks <- list(first_chunk)
  rows_in_last <- nrow(first_chunk)

  for (part in 1:3) {
    # A short chunk means the end of the file was reached.
    if (rows_in_last < rowPartition) break
    rows_wanted <- if (part < 3) rowPartition else -1
    chunk <- read.delim(path, skip = part * rowPartition, nrows = rows_wanted)
    rows_in_last <- nrow(chunk)
    if (rows_in_last == 0) break
    rownames(chunk) <- chunk[, 1]
    # drop = FALSE keeps a data.frame when there is a single data column.
    chunk <- chunk[, -1, drop = FALSE]
    colnames(chunk) <- column_names
    chunks[[length(chunks) + 1]] <- chunk
  }

  read_data <- do.call(rbind, chunks)
  return(read_data)
}

feat.samp.map <- function(data) {
  ## Restricts every view to the features and samples shared by all views,
  ## puts them in the same order, and centres each feature (row) at zero.
  ##
  ## Arguments:
  ##   data  List of views; each view is a list with a matrix `$data`
  ##         (features in rows, samples in columns) and a per-feature table
  ##         `$info` whose rows are in the same order as the rows of `$data`.
  ##
  ## Value: the same list with `$data` and `$info` subset/reordered and
  ##        `$data` row-centred (row means computed with na.rm = TRUE).
  views <- length(data)
  sSample_ids <- colnames(data[[1]]$data)
  sFeature_ids <- rownames(data[[1]]$data)

  # Common features and samples. seq_len(views)[-1] is empty for a single
  # view, whereas `2:views` would iterate over c(2, 1) and fail.
  for (i in seq_len(views)[-1]) {
    sSample_ids <- intersect(sSample_ids, colnames(data[[i]]$data))
    sFeature_ids <- intersect(sFeature_ids, rownames(data[[i]]$data))
  }

  for (i in seq_len(views)) {
    feature_rows <- match(sFeature_ids, rownames(data[[i]]$data))
    sample_cols <- match(sSample_ids, colnames(data[[i]]$data))

    # drop = FALSE keeps the matrix / data.frame shape when only one feature
    # or one sample is shared; rowMeans() below needs two dimensions.
    data[[i]]$info <- data[[i]]$info[feature_rows, , drop = FALSE]
    data[[i]]$data <- data[[i]]$data[feature_rows, sample_cols, drop = FALSE]

    # The row-mean vector recycles down the columns, i.e. one mean per row.
    data[[i]]$data <- data[[i]]$data - rowMeans(data[[i]]$data, na.rm = TRUE)
  }
  return(data)
}

sig.top.models <- function() {
  ## Extracts significant, highly dependent chromosomal regions.
  ##
  ## Reads these objects from the calling environment / globals: PlotPath,
  ## res_dir, nchrom, models, genelist, data and model. `perm_mat` is
  ## expected to be created by load()-ing permuted_scores.RData.
  ##
  ## For every chromosome arm with a fitted model, the TopGen best models are
  ## taken; a gene is kept when the fraction of permuted dependency scores
  ## exceeding its score is below 0.05. For each kept gene the getZ() values
  ## are stored as one column and plotted as a histogram in
  ## <PlotPath>/Z_histograms.png. Finally, genes whose Bonferroni-adjusted
  ## empirical p-value is above 0.05 are removed.
  ##
  ## Value: matrix with one column per retained gene (column names = gene
  ##        names), or NULL (with a warning) when no gene passes the
  ##        permutation filter.
  ii <- 1
  TopGen <- 10 # Highest scoring genes to consider within each chromosome arm
  pval <- NULL

  Zs <- NULL
  geneName <- NULL
  himodels <- list()

  png(file.path(PlotPath, "Z_histograms.png"), height = 1024, width = 2048)
  # Close exactly this device on every exit path, including errors.
  plot_device <- dev.cur()
  on.exit(dev.off(plot_device), add = TRUE)
  par(mfrow = c(8, 5))
  load(file.path(res_dir, "permuted_scores.RData"))
  op <- options(warn = (-1))
  # Make sure warnings are switched back on even if the loop fails.
  on.exit(options(op), add = TRUE)
  for (i in 1:nchrom) { # For all chromosomes
    for (j in 1:2) { # p arm, then q arm
      chr_arm <- if (j == 2) "q" else "p"

      # NOTE(review): missing arms are presumably stored as NA in `models`;
      # confirm this test is what is intended for non-NA model objects.
      if (!is.na(models[[ii]])) {
        himodels <- topModels(models[[ii]], TopGen)
        for (k in 1:TopGen) {
          # Empirical p-value: share of permuted scores above the observed one.
          tem <- sum(perm_mat$dependencyScore > genelist[[ii]]$dependencyScore[k]) / length(perm_mat$dependencyScore)
          # The NA guard skips positions beyond the number of available genes
          # instead of failing in `if`.
          if (!is.na(tem) && tem < 0.05) {
            z <- getZ(himodels[[k]], data[[1]], data[[2]])
            pval <- rbind(pval, tem)
            Zs <- cbind(Zs, as.vector(z))
            geneName <- rbind(geneName, genelist[[ii]]$geneName[k])
            hist(z, main = paste(i, chr_arm, sep = ""), xlab = "") # Distribution of Zs
          }
        }
      }
      ii <- ii + 1
    }
  }
  options(op)

  if (is.null(Zs)) {
    warning("sig.top.models: no gene passed the permutation filter", call. = FALSE)
    return(NULL)
  }

  # NOTE(review): `model` is not defined in this function and differs from
  # `models` used above - confirm it is an intended global and not a typo.
  rownames(Zs) <- colnames(getZ(model[[1]], data[[1]], data[[2]]))
  colnames(Zs) <- t(geneName)

  gen2remov <- which(p.adjust(pval, method = "bonferroni") > 0.05)
  # drop = FALSE keeps a matrix when a single column survives.
  if (length(gen2remov) > 0) {
    Zs <- Zs[, -gen2remov, drop = FALSE]
  }
  return(Zs)
}

summerize.km <- function() {
  ## Reads the output of the KM analysis, annotates the significant regions
  ## through Ensembl BioMart and stores the annotated tables.
  ##
  ## Reads the globals `res_dir` (run directory) and `Zs` (its column names
  ## are taken as the Ensembl gene ids of the ROIs, in the same order as the
  ## rows of statistics.csv).
  ##
  ## Writes, inside res_dir:
  ##   sigroi_annot_pvals.csv  ROIs with raw p-value < 0.05
  ##   sigroi_annot_qvals.csv  ROIs with FDR-adjusted p-value < 0.06
  ## Each table carries an extra `Index` column holding the ROI's position
  ## in statistics.csv.
  ##
  ## Value: NULL.
  current_rundir <- res_dir

  stat_table <- read.delim(file.path(current_rundir, "km_anduril_v1", "final_camda", "km", "statistics.csv"))

  # If Biomart is not down, the following can be used instead:
  #   mart = useMart("ensembl", dataset = "hsapiens_gene_ensembl")
  # together with getGene(ids, type = "ensembl_gene_id", mart = mart).
  ensembl <- useMart("ENSEMBL_MART_ENSEMBL", host = "www.ensembl.org")
  dataset <- "hsapiens_gene_ensembl"
  ensembl <- useDataset(dataset, mart = ensembl)

  # NOTE(review): "ensembl_gene_id" is requested twice; kept as it was so the
  # written tables do not change shape.
  annotation_attributes <- c("ensembl_gene_id", "hgnc_symbol", "description", "chromosome_name", "band", "strand", "start_position", "end_position", "ensembl_gene_id")

  # Annotates the ROIs at positions `roi_index` and writes them to `out_name`.
  annotate_and_write <- function(roi_index, out_name) {
    names(roi_index) <- colnames(Zs)[roi_index]
    annotation <- getBM(annotation_attributes, "ensembl_gene_id", names(roi_index), ensembl)
    # BioMart can return several rows for one gene; keep the first so the ids
    # are valid (unique) row names.
    annotation <- annotation[!duplicated(annotation$ensembl_gene_id), , drop = FALSE]
    rownames(annotation) <- annotation$ensembl_gene_id

    # Keep only ROIs that BioMart knows, in ROI order.
    hit <- match(names(roi_index), rownames(annotation))
    found <- !is.na(hit)
    annotation <- annotation[hit[found], ]
    annotation$Index <- roi_index[found]

    write.csv(annotation, file = file.path(current_rundir, out_name))
  }

  pvals <- stat_table$pValue # P-values for each ROI based on KM analysis.
  annotate_and_write(which(pvals < 0.05), "sigroi_annot_pvals.csv")

  qvals <- p.adjust(pvals, method = "fdr")
  annotate_and_write(which(qvals < 0.06), "sigroi_annot_qvals.csv")

  return(NULL)
}

clin_enrichment <- function() {
  ## Tests whether clinical groups (race, gender, age bands) are enriched in
  ## the patient categories of each significant ROI.
  ##
  ## Reads the globals `res_dir`, `current_rundir` and `dat_dir`; `clinpat`
  ## and `drugpat` are expected to come from load()-ing clinical_info.RData.
  ##
  ## Inputs read from disk:
  ##   <res_dir>/anduril_input_file.txt        patient table; column `ID` plus
  ##                                           one category column per ROI
  ##                                           starting at column 4
  ##   <current_rundir>/sigroi_annot_qvals.csv its `Index` column selects ROIs
  ##
  ## Output: <current_rundir>/enrichment_analysis.csv, rewritten from scratch;
  ## for every clinical group its name followed by a 3 x nROI matrix of
  ## one-sided Fisher exact test p-values.
  ##
  ## Value: NULL (invisibly).

  ########## Form groupings ##########
  inp <- read.delim(file.path(res_dir, "anduril_input_file.txt"))
  sig_roi_index <- read.delim(file = file.path(current_rundir, "sigroi_annot_qvals.csv"), sep = ",")
  sig_roi_index <- sig_roi_index$Index
  load(file.path(dat_dir, "preprocessed", "clinical_info.RData"))

  fac_names <- c("WHITE", "FEMALE", "MALE", "Age<30", "Age<40", "Age>50", "Age>60")
  factors <- c("race", "gender", "gender", "age_at_initial_pathologic_diagnosis", "age_at_initial_pathologic_diagnosis", "age_at_initial_pathologic_diagnosis", "age_at_initial_pathologic_diagnosis")
  # 0 = take the factor from the clinical table, 1 = from the drug table.
  # drugfac = c(1,1,1,1,1,0,0)
  drugfac <- c(0, 0, 0, 0, 0, 0, 0)

  # gp[p, g] is 1 when patient p belongs to group g, 0 when not, NA when unknown.
  gp <- matrix(0, dim(inp)[1], length(fac_names))
  colnames(gp) <- fac_names
  # Thresholds for the age groups, in the order the "Age..." names appear.
  # Thresholds below 45 select ages <= threshold, the others ages >= threshold.
  age_thres <- c(30, 40, 50, 60)
  annot_info <- list()
  annot_info$clinpat <- clinpat
  annot_info$drugpat <- drugpat
  annot_info$clinpat$bcr_patient_barcode <- rownames(clinpat)

  a <- 0
  for (i in seq_along(fac_names)) {
    grouping <- fac_names[i]
    annot <- annot_info[[drugfac[i] + 1]]
    values <- annot[[factors[i]]]

    if (grepl("Age*", grouping)) {
      a <- a + 1
      # Non-numeric entries (e.g. "null") become NA; only that coercion
      # warning is silenced.
      ages <- suppressWarnings(as.numeric(as.character(values)))
      if (age_thres[a] < 45) {
        sel_pat <- which(ages <= as.numeric(age_thres[a]))
      } else {
        sel_pat <- which(ages >= as.numeric(age_thres[a]))
      }
      tem_gp1 <- unique(annot$bcr_patient_barcode[sel_pat])
    } else {
      tem_gp1 <- unique(annot$bcr_patient_barcode[which(values == grouping)])
    }
    gp[which(inp$ID %in% tem_gp1), i] <- 1

    # "null" entries
    tem_gpna <- unique(annot$bcr_patient_barcode[which(values == "null")])
    gp[which(inp$ID %in% tem_gpna), i] <- NA

    # Patients having no information
    gp[which(!(inp$ID %in% unique(annot$bcr_patient_barcode))), i] <- NA
  }

  ########## Enrichment tests ##########
  enriched <- list()
  # Consecutive pairs are compared: (Pos, Neg), (Neg, No), (No, Pos).
  # NOTE(review): the third pair tests No_Affect against Pos_Affect although
  # its row is labelled "PosVsNoaffect"; with a one-sided test the direction
  # matters - confirm which one is intended.
  caty <- c("Pos_Affect", "Neg_Affect", "No_Affect", "Pos_Affect")

  # Results are appended below, so start from a clean file. The stale-file
  # check must target the file that is actually written (it used to look
  # under res_dir while writing under current_rundir).
  out_file <- file.path(current_rundir, "enrichment_analysis.csv")
  if (file.exists(out_file)) {
    file.remove(out_file)
  }

  for (k in seq_len(ncol(gp))) {
    enriched[[colnames(gp)[k]]] <- matrix(0, 3, length(sig_roi_index))
    rownames(enriched[[k]]) <- c("PosVsNeg", "NegVsNoaffect", "PosVsNoaffect")

    # seq_along() makes this a no-op when no ROI is significant.
    for (i in seq_along(sig_roi_index)) {
      pat_gp <- inp[, 3 + sig_roi_index[i]]
      for (j in 1:3) {
        row1 <- which(pat_gp == caty[j])
        row2 <- which(pat_gp == caty[j + 1])

        # 2 x 2 table: rows = patient category, columns = in group / not.
        c11 <- sum(gp[row1, k] == 1, na.rm = TRUE)
        c12 <- sum(gp[row1, k] == 0, na.rm = TRUE)
        c21 <- sum(gp[row2, k] == 1, na.rm = TRUE)
        c22 <- sum(gp[row2, k] == 0, na.rm = TRUE)
        tem <- fisher.test(matrix(c(c11, c21, c12, c22), nrow = 2), alternative = "greater")
        enriched[[k]][j, i] <- tem$p.value
      }
    }
    write.table(names(enriched)[k], file = out_file, append = TRUE, col.names = FALSE, row.names = FALSE, sep = ",")
    write.table(enriched[[k]], file = out_file, append = TRUE, col.names = FALSE, sep = ",")
  }
  invisible(NULL)
}

diffexprs_enrichment <- function() {
  ## For patients treated with temozolomide, tests whether up- or
  ## down-regulation of each significant ROI gene is associated with the
  ## patient category (Pos_Affect / Neg_Affect / No_Affect) of that ROI, and
  ## writes the one-sided Fisher exact test p-values to a CSV file in
  ## current_rundir.
  ##
  ## Reads the globals `current_rundir` and `dat_dir`. `drugpat` is expected
  ## to come from clinical_info.RData.
  ## NOTE(review): `pexprs` is presumably created by load()-ing
  ## data_ensembl.RData; only its `$dat` element (overwritten below) is used.

  # ROI indices stored in the q-value annotation file.
  # NOTE(review): summerize.km writes this file with qvals < 0.06, not 0.05.
  sig_roi_index = read.delim(file=file.path(current_rundir,"sigroi_annot_qvals.csv"), sep =",") #ROIs having qvals < 0.05
  sig_roi_index = sig_roi_index$Index
  
  #pvalcgh = read.delim(file=file.path(path,"results_cghexp","sigroi_annot_pvals.csv"), sep =",") 
  #pvalmthyn = read.delim(file=file.path(path,"results_mthynexp","sigroi_annot_pvals.csv"), sep =",") 
  
  #sig_roi_index = c(sig_roi_index, pvalcgh$Index[which(pvalcgh$ensembl_gene_id %in% pvalmthyn$ensembl_gene_id)]) #Common significant (pvals < 0.05) ROIs from CGH-GE and METHY-GE datasets

  load(file.path(dat_dir,"preprocessed","clinical_info.RData"))
  # Barcodes of patients with a temozolomide record (both spellings occur).
  b = unique(drugpat$bcr_patient_barcode[which(drugpat$drug_name == "Temozolamide"|drugpat$drug_name == "Temozolomide")])

  # Patient table: ID plus one category column per ROI from column 4 on;
  # restricted to the treated patients.
  inp = read.delim(file.path(current_rundir,"anduril_input_file.txt"))
  inp = inp[which(inp$ID %in% b),]

  load(file.path(dat_dir,"preprocessed","data_ensembl.RData"))

  # Expression log-ratios: genes in rows, samples in columns. Sample names are
  # cut to their first 12 characters.
  # Assumes GeneID is the first column (it is dropped by position) - TODO confirm.
  lograt = read.delim(file.path(dat_dir,"gExpression","logratio.csv"), sep="\t")
  colnames(lograt) = unlist(lapply(colnames(lograt),function(x) substr(x,1,12)))
  rownames(lograt) = lograt$GeneID
  lograt = lograt[,-1]
  pexprs$dat = lograt
  # Character matrices (gene x sample) holding "yes"/"no" calls.
  up = down = matrix("no",dim(pexprs$dat)[1],dim(pexprs$dat)[2])

  # Per gene: "up" when the log-ratio is >= 1, "down" when it is <= -1.
  # The numeric values are first stored as text in the character matrices and
  # converted back with as.numeric() for each comparison.
  for(i in 1:dim(pexprs$dat)[1]) {
    down[i,] = up[i,] = as.numeric(pexprs$dat[i,])
    up[i, as.numeric(up[i,]) >= 1] = "yes"
    up[i, as.numeric(up[i,]) < 1] = "no"
    down[i, as.numeric(down[i,]) <= -1] = "yes"
    down[i, as.numeric(down[i,]) > -1] = "no"
  }

  # From here on: samples in rows, genes in columns.
  up = t(up)
  down = t(down)

  colnames(up) = rownames(pexprs$dat)
  colnames(down) = rownames(pexprs$dat)
  rownames(down) = colnames(pexprs$dat)
  rownames(up) = colnames(pexprs$dat)


  # Keep only the genes that are significant ROIs (ROI columns of `inp` start
  # at column 4, hence the +3), and the matching ROI columns of the patient table.
  newup = up[,which(colnames(up) %in% colnames(inp)[sig_roi_index+3])]
  newdown = down[,which(colnames(down) %in% colnames(inp)[sig_roi_index+3])]
  newinp = inp[,c(1,2,3,sig_roi_index+3)]
  colnames(newinp) = colnames(inp)[c(1,2,3,sig_roi_index+3)]
  rownames(newinp) = inp$ID
  newinp = newinp[,-c(1:3)]
  aind = NULL
  bind = NULL
  # Replace "_" by "." in the sample names before comparing them with the
  # patient IDs.
  rownames(newup) = unlist(lapply(rownames(newup),function(x) gsub('_','.',x)))
  rownames(newdown) = unlist(lapply(rownames(newdown),function(x) gsub('_','.',x)))
  # Collect, in patient order, the expression rows that belong to each patient.
  for(i in 1:dim(newinp)[1]) {
    aind = rbind(aind, which(rownames(newup) %in% rownames(newinp)[i]))
    bind = rbind(bind, which(rownames(newdown) %in% rownames(newinp)[i]))
  }
  newup = newup[aind,]
  newdown = newdown[bind,]
  # Put the ROI columns of the patient table in the same gene order.
  newinp = newinp[,match(colnames(newdown),colnames(newinp))]
  
  # Recode "yes"/"no" as 1/0 (stored as text, the matrices are character).
  for(i in 1:dim(newup)[1]) {
	newup[i,] = replace(newup[i,], which(newup[i,] == "yes"),1)
	newup[i,] = replace(newup[i,], which(newup[i,] == "no"),0)
	newdown[i,] = replace(newdown[i,], which(newdown[i,] == "yes"),1)
        newdown[i,] = replace(newdown[i,], which(newdown[i,] == "no"),0)
  }
  # Align the patient rows with the expression rows.
  newinp = newinp[match(rownames(newup),rownames(newinp)),]

  # NOTE(review): fac_names is not used afterwards; it also indexes without
  # the +3 column offset used above.
  fac_names = colnames(inp)[sig_roi_index]
 
  enriched = list()
  ii = c("up","down")
  filenam = "Tamazolomide_enrichment_final_biomarkers_temp.csv"
 # One pass for the "up" calls (column 1 of each result matrix), one for "down" (column 2).
 for(i in 1:length(ii)) {
 
  gp = newup
  if(ii[i] == "down") gp = newdown
  
  # Consecutive pairs are compared: (Pos, Neg), (Neg, No), (No, Pos).
  # NOTE(review): the third pair tests No_Affect against Pos_Affect although
  # its row is labelled "PosVsNoaffect" - confirm the intended direction.
  caty = c("Pos_Affect","Neg_Affect","No_Affect", "Pos_Affect") 
  for(k in 1:dim(gp)[2]) {
    # The 3 x 2 result matrix of a gene is created during the "up" pass.
    if(i == 1) enriched[[colnames(gp)[k]]] = matrix(0,3,length(ii))
    # Column names are only set for the first gene; the files below are
    # written with col.names = F.
    if(k == 1) colnames(enriched[[colnames(gp)[k]]]) = ii
    
    rownames(enriched[[k]]) = c("PosVsNeg","NegVsNoaffect","PosVsNoaffect")
    pat_gp = newinp[,k]
    for(j in 1:3) {
                row1 = which(pat_gp == caty[j])
                row2 = which(pat_gp == caty[j+1])
                # 2 x 2 table: rows = patient category, columns = call is 1 / 0.
                c11 = sum(gp[row1,k] == 1, na.rm=T)
                c12 = sum(gp[row1,k] == 0, na.rm=T)
                c21 = sum(gp[row2,k] == 1, na.rm=T)
                c22 = sum(gp[row2,k] == 0, na.rm=T)
                tem = fisher.test(matrix(c(c11,c21,c12,c22), nrow = 2),alternative = "greater")
                enriched[[k]][j,i] = tem$p.value
     }
  }
 }
 # Output: a first row with "up","down", then for every gene its name
 # followed by its 3 x 2 p-value matrix. `gp` still holds the matrix of the
 # last pass and is only used for the number of genes.
 for(k in 1:dim(gp)[2]) {
  if(k == 1) write.table(t(ii), file = file.path(current_rundir,filenam), append = FALSE, col.names = F, sep=",")
  
  write.table(names(enriched)[k], file = file.path(current_rundir,filenam), append = TRUE, col.names = F, row.names = F, sep=",")
  write.table(enriched[[k]], file = file.path(current_rundir,filenam), append = TRUE, col.names = F, sep=",")
 }
}

compute_new_gp <- function(dat, flag) {
  ## Combines several per-ROI patient categories into one
  ## "active" / "inactive" label per patient.
  ##
  ## Arguments:
  ##   dat   List or data.frame of category vectors (one element per ROI, one
  ##         entry per patient) with values such as "Pos_Affect",
  ##         "Neg_Affect" and "No_Affect".
  ##   flag  "pos": the LAST element of `dat` counts when it equals
  ##         "Pos_Affect"; any other value: it counts when it equals
  ##         "Neg_Affect". All other elements always count on "Neg_Affect".
  ##
  ## Value: character vector; "active" where at least one element counts for
  ##        the patient, "inactive" otherwise (NA where that cannot be
  ##        decided). character(0) for an empty `dat`.
  n_cols <- length(dat)
  # `1:length(dat)` would iterate over c(1, 0) here and fail.
  if (n_cols == 0) {
    return(character(0))
  }

  # Category that makes each element count.
  wanted <- rep("Neg_Affect", n_cols)
  if (flag == "pos") {
    wanted[n_cols] <- "Pos_Affect"
  }

  hit <- FALSE
  for (idx in seq_len(n_cols)) {
    hit <- hit | (dat[[idx]] == wanted[idx])
  }

  # Relabel from the logical mask itself rather than from a vector that has
  # already been partly turned into text.
  new_gp <- hit
  new_gp[which(hit)] <- "active"
  new_gp[which(!hit)] <- "inactive"
  return(new_gp)
}
