## Script for loading and re-arranging the FARMS-preprocessed TG-GATEs data files
# Author: Tommi Suvitaival, tommi.suvitaival@aalto.fi

## License

# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
# 
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
# 
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.

## Required data files:

# TGP drug info and pathological findings (CSV, EXCEL format)
# Available at: http://www.bioinf.jku.at/research/camda2013/tgp_info.zip

# Study – rat in vivo single (CSV format)
# Collapsed replicates (19MB) 2088 samples, 12088 genes
# Available at: http://www.bioinf.jku.at/research/camda2013/rat_invivo_single_collapsed_farms.zip

# Study – rat in vitro single (CSV format)
# Collapsed replicates (13MB) 1570 samples, 18988 genes
# Available at: http://www.bioinf.jku.at/research/camda2013/rat_in_vitro_collapsed_farms.zip

# Study – human in vitro (CSV format) 
# Collapsed replicates (8MB) 714 samples, 18988 genes
# Available at: http://www.bioinf.jku.at/research/camda2013/human_in_vitro_collapsed_farms.zip

## Set working directory.

# NOTE: the call below is a placeholder. Insert the path of the directory that
# contains the downloaded and unpacked data folders as its argument before
# running the script; all file paths below are relative to that directory.
setwd() # Set the path to the downloaded data here.

## Load the array meta-data.

# One row per array; later sections address its columns by position.
array.metadata <- read.csv(
  file = file.path("tgp_info", "Array Metadata.fixedchars.unix.csv")
)

## Load the rat in vitro data set.

# Sample names (first column holds the sample IDs).
sampleNames.rat.invitro.collapsed <- read.table(
  file = file.path("rat_in_vitro_collapsed_farms", "sampleNames_rat_invitro_collapsed.csv"),
  quote = "\""
)

# Probe names and descriptions (second column holds the gene names).
geneNames.rat.invitro.collapsed <- read.csv(
  file = file.path("rat_in_vitro_collapsed_farms", "geneNames_rat_invitro_collapsed.csv"),
  header = FALSE
)

# Expression data; the file has no header line.
exprs.rat.invitro.collapsed <- read.csv(
  file = file.path("rat_in_vitro_collapsed_farms", "exprs_rat_invitro_collapsed.csv"),
  header = FALSE
)

# Label the expression table: upper-cased gene names on the columns and
# sample IDs on the rows.
colnames(exprs.rat.invitro.collapsed) <- toupper(geneNames.rat.invitro.collapsed[, 2])
rownames(exprs.rat.invitro.collapsed) <- sampleNames.rat.invitro.collapsed[, 1]

## Load the rat in vivo data set.

# Expression data from the single-dose experiment

# Sample names (first column holds the sample IDs).
sampleNames.rat.invivo.single.collapsed <- read.table(
  file = file.path("rat_invivo_single_collapsed_farms", "sampleNames_rat_invivo_single_collapsed.csv"),
  quote = "\""
)

# Probe names and descriptions (second column holds the gene names).
geneNames.rat.invivo.single.collapsed <- read.csv(
  file = file.path("rat_invivo_single_collapsed_farms", "geneNames_rat_invivo_single_collapsed.csv"),
  header = FALSE
)

# Expression data; the file has no header line.
exprs.rat.invivo.single.collapsed <- read.csv(
  file = file.path("rat_invivo_single_collapsed_farms", "exprs_rat_invivo_single_collapsed.csv"),
  header = FALSE
)

# Label the expression table: upper-cased gene names on the columns and
# sample IDs on the rows.
colnames(exprs.rat.invivo.single.collapsed) <- toupper(geneNames.rat.invivo.single.collapsed[, 2])
rownames(exprs.rat.invivo.single.collapsed) <- sampleNames.rat.invivo.single.collapsed[, 1]

# Pathology data

# The table is split by column position: the first twelve columns are stored
# as the matrix 'char', all remaining columns as the matrix 'num'.
pathology.data.loaded <- read.csv(file = file.path("tgp_info", "Pathology data.csv"))
pathology.data <- list(
  char = as.matrix(x = pathology.data.loaded[, 1:12]),
  num = as.matrix(x = pathology.data.loaded[, -(1:12)])
)
rm(pathology.data.loaded) # The raw table is not needed any more.

## Load the human in vitro data set.

# Sample names (first column holds the sample IDs).
sampleNames.human.invitro.collapsed <- read.table(
  file = file.path("human_in_vitro_collapsed_farms", "sampleNames_human_invitro_collapsed.csv"),
  quote = "\""
)

# Probe names and descriptions (second column holds the gene names).
geneNames.human.invitro.collapsed <- read.csv(
  file = file.path("human_in_vitro_collapsed_farms", "geneNames_human_invitro_collapsed.csv"),
  header = FALSE
)

# Expression data; the file has no header line.
exprs.human.invitro.collapsed <- read.csv(
  file = file.path("human_in_vitro_collapsed_farms", "exprs_human_invitro_collapsed.csv"),
  header = FALSE
)

# Label the expression table: upper-cased gene names on the columns and
# sample IDs on the rows.
colnames(exprs.human.invitro.collapsed) <- toupper(geneNames.human.invitro.collapsed[, 2])
rownames(exprs.human.invitro.collapsed) <- sampleNames.human.invitro.collapsed[, 1]


## Match the samples of the expression data sets to the meta-data.

# Map the collapsed expression samples to the rows of the array meta-data.
#
# The ID of a collapsed sample consists of one part per replicate, separated
# by "_". The first two characters of every part are dropped and the remainder
# is looked up among the meta-data identifiers.
#
# Arguments:
#   sample.ids    Character vector with the IDs of the collapsed samples.
#   metadata.ids  Identifiers of the meta-data rows.
#   n.replicates  Expected number of replicates (parts) per collapsed sample.
#   label         Name of the data set, used in the warning message only.
# Value:
#   A list with the elements
#   'mapping'    Integer matrix with one row per sample and one column per
#                replicate, holding the matched meta-data row; NA where a
#                part has no match or more than one match. The matrix is
#                widened if an ID has more than 'n.replicates' parts.
#   'ambiguous'  Indices of the samples with a part that matched several
#                meta-data rows (one entry per such part), or NULL.
map.samples.to.metadata <- function(sample.ids, metadata.ids, n.replicates, label = "") {
  ids.split <- strsplit(x = sample.ids, split = "_")
  # Never index beyond the matrix, even if an ID has unexpectedly many parts.
  n.columns <- max(n.replicates, lengths(ids.split))
  mapping <- matrix(NA_integer_, nrow = length(ids.split), ncol = n.columns)
  ambiguous <- NULL
  for (ni in seq_along(ids.split)) {
    for (ci in seq_along(ids.split[[ni]])) {
      part <- ids.split[[ni]][ci]
      id.ni.ci <- substr(x = part, start = 3, stop = nchar(part))
      match.ni.ci <- which(metadata.ids == id.ni.ci)
      if (length(match.ni.ci) == 1) {
        mapping[ni, ci] <- match.ni.ci
      } else if (length(match.ni.ci) > 1) {
        ambiguous <- c(ambiguous, ni)
      }
    }
  }
  if (length(ambiguous) > 0) {
    # Ambiguous matches are left as NA in the mapping; make that visible.
    warning(
      sprintf(
        "%s: %d sample ID part(s) matched more than one meta-data row.",
        label, length(ambiguous)
      ),
      call. = FALSE
    )
  }
  list(mapping = mapping, ambiguous = ambiguous)
}

# Rat in vitro data set

matched <- map.samples.to.metadata(
  sample.ids = rownames(exprs.rat.invitro.collapsed),
  metadata.ids = array.metadata[, 1],
  n.replicates = 2, # Each category has two replicates.
  label = "Rat in vitro"
)
mapping.samples.rat.invitro.exprs.to.metadata <- matched$mapping
error.match <- matched$ambiguous
# Number of samples whose two replicates are not in consecutive meta-data rows.
print(length(which(mapping.samples.rat.invitro.exprs.to.metadata[, 1] != (mapping.samples.rat.invitro.exprs.to.metadata[, 2] - 1)))) # Collapsed samples are always following one another.

# Rat in vivo single-dose data set

matched <- map.samples.to.metadata(
  sample.ids = rownames(exprs.rat.invivo.single.collapsed),
  metadata.ids = array.metadata[, 1],
  n.replicates = 3, # Each category has three replicates.
  label = "Rat in vivo single-dose"
)
mapping.samples.rat.invivo.single.exprs.to.metadata <- matched$mapping
error.match <- matched$ambiguous

# Human in vitro single-dose data set

matched <- map.samples.to.metadata(
  sample.ids = rownames(exprs.human.invitro.collapsed),
  metadata.ids = array.metadata[, 1],
  n.replicates = 2,
  label = "Human in vitro"
)
mapping.samples.human.invitro.exprs.to.metadata <- matched$mapping
# As before, 'error.match' ends up holding the result of the last data set.
error.match <- matched$ambiguous
rm(matched)


## Match the probes from the human data set to the probes of the rat data sets.

# For every column (probe) of the human data: the index of the first column of
# the rat in vitro data with the same name, or NA if there is none.
mapping.probes.human.to.rat <- match(
  colnames(exprs.human.invitro.collapsed),
  colnames(exprs.rat.invitro.collapsed)
)

## Findings

# Distinct non-missing values of the fourth pathology column (finding type).
findings.unique <- unique(pathology.data$char[!is.na(pathology.data$char[, 4]), 4])

# Numeric grade of every pathology row, derived from the sixth column:
# "minimal" = 1, "slight" = 2, "moderate" = 3, "severe" = 4, "P" = 5.
# Any other value, including a missing one, is graded 0.
findings.grade.numeric <- as.numeric(
  match(
    pathology.data$char[, 6],
    c("minimal", "slight", "moderate", "severe", "P"),
    nomatch = 0L
  )
)

## Select a single dosage level and time point.

# Dose levels and time points included in the analysis. The in vitro and
# in vivo time points are paired by position. The complete sets of values
# are given in the trailing comments.
selected.group <- list(
  dosage = c("Middle", "High"), # c("Low", "Middle", "High")
  time.invitro = c("8 hr", "24 hr"), # c("2 hr", "8 hr", "24 hr")
  time.invivo = c("9 hr", "24 hr") # c("3 hr", "9 hr", "24 hr")
)

## Find the intersection of compounds between the human and rat data sets.

# Select the meta-data rows of one compound that belong to one experimental
# group.
#
# Arguments:
#   samples    Row indices of the compound in 'array.metadata'.
#   species    Value required in meta-data column 11 ("Human" or "Rat").
#   time       Value required in meta-data column 18 (time point).
#   dose       Value required in meta-data column 21 (dose level or "Control").
#   test.type  Value required in meta-data column 12 ("in vitro" or
#              "in vivo"); NULL skips this condition.
# Value:
#   The elements of 'samples' that fulfil all conditions.
select.samples.of.group <- function(samples, species, time, dose, test.type = NULL) {
  keep <- array.metadata[samples, 11] == species &
    array.metadata[samples, 18] == time &
    array.metadata[samples, 21] == dose
  if (!is.null(test.type)) {
    keep <- keep & array.metadata[samples, 12] == test.type
  }
  samples[which(keep)]
}

# Find the row of an expression data set that contains a meta-data sample.
#
# Arguments:
#   mapping  Matrix of meta-data row indices (rows: expression samples,
#            columns: replicates).
#   samples  Meta-data row indices of a group; only the first one is looked up.
# Value:
#   The row of 'mapping' with the first occurrence of 'samples[1]', or NA if
#   'samples' is empty or its first element does not occur in 'mapping'.
find.exprs.row <- function(mapping, samples) {
  if (length(samples) == 0) {
    return(NA_integer_)
  }
  hits <- which(mapping == samples[1], arr.ind = TRUE)
  if (nrow(hits) == 0) {
    return(NA_integer_)
  }
  hits[1, 1]
}

# The three expression data sets ("views") with the meta-data values that
# identify their samples. The human view is selected by species alone.
views <- list(
  human = list(
    species = "Human", test.type = NULL,
    times = selected.group$time.invitro,
    mapping = mapping.samples.human.invitro.exprs.to.metadata
  ),
  rat.invitro = list(
    species = "Rat", test.type = "in vitro",
    times = selected.group$time.invitro,
    mapping = mapping.samples.rat.invitro.exprs.to.metadata
  ),
  rat.invivo = list(
    species = "Rat", test.type = "in vivo",
    times = selected.group$time.invivo,
    mapping = mapping.samples.rat.invivo.single.exprs.to.metadata
  )
)

n.dosages <- length(selected.group$dosage)
# The in vivo time points are addressed with the same index as the in vitro ones.
n.times <- length(selected.group$time.invitro)

data.camda.collapsed <- list()
data.camda.collapsed$category.samples <- list()
samples.intersecting.compounds.selected.group <- list() # Unused; kept for compatibility.
samples.exprs.intersecting.compounds <- list(
  human = list(),
  rat.invitro = list(),
  rat.invivo = list()
)
compounds.unique <- sort(unique(as.character(array.metadata[, 8])))
# Number of compounds with data per group:
# dosages x time points x views x (control, case).
count.samples.exist <- array(data = 0L, dim = c(n.dosages, n.times, length(views), 2))
findings.rows <- list() # One vector of summed finding grades per selected group.

for (ci in seq_along(compounds.unique)) { # Go through all compounds in the meta-data file.
  samples.ci <- which(array.metadata[, 8] == compounds.unique[ci]) # All samples of compound 'ci'.

  ## Look up the expression rows of all sample groups in all views.
  exprs.rows.ci <- array(data = NA_integer_, dim = dim(count.samples.exist))
  for (di in seq_len(n.dosages)) {
    for (ti in seq_len(n.times)) {
      for (vi in seq_along(views)) {
        view <- views[[vi]]
        samples.control <- select.samples.of.group(
          samples = samples.ci, species = view$species, time = view$times[ti],
          dose = "Control", test.type = view$test.type
        )
        samples.case <- select.samples.of.group(
          samples = samples.ci, species = view$species, time = view$times[ti],
          dose = selected.group$dosage[di], test.type = view$test.type
        )
        exprs.rows.ci[di, ti, vi, 1] <- find.exprs.row(view$mapping, samples.control)
        exprs.rows.ci[di, ti, vi, 2] <- find.exprs.row(view$mapping, samples.case)
      }
    }
  }
  samples.exist.ci <- !is.na(exprs.rows.ci)
  count.samples.exist <- count.samples.exist + samples.exist.ci

  ## Keep the compound only if every group is observed in every view.
  if (!all(samples.exist.ci)) {
    next
  }
  print(compounds.unique[ci])
  for (di in seq_len(n.dosages)) {
    for (ti in seq_len(n.times)) {
      ## Covariates
      data.camda.collapsed$category.samples$compounds <- c(data.camda.collapsed$category.samples$compounds, compounds.unique[ci])
      data.camda.collapsed$category.samples$time <- c(data.camda.collapsed$category.samples$time, selected.group$time.invitro[ti])
      data.camda.collapsed$category.samples$dose <- c(data.camda.collapsed$category.samples$dose, selected.group$dosage[di])
      ## Gene expression - rows of the case and control samples in each view
      for (vi in seq_along(views)) {
        view.name <- names(views)[vi]
        samples.exprs.intersecting.compounds[[view.name]]$case <- c(samples.exprs.intersecting.compounds[[view.name]]$case, exprs.rows.ci[di, ti, vi, 2])
        samples.exprs.intersecting.compounds[[view.name]]$control <- c(samples.exprs.intersecting.compounds[[view.name]]$control, exprs.rows.ci[di, ti, vi, 1])
      }
      ## Pathological findings - rat in vivo
      # Pathology rows of the group with a finding type and a grade - the
      # order of the columns is different than in the other data sets!
      findings.rows.ci.di.ti <- which(
        pathology.data$char[, 1] == compounds.unique[ci] &
          pathology.data$char[, 7] == selected.group$dosage[di] &
          pathology.data$char[, 8] == selected.group$time.invivo[ti] &
          !is.na(pathology.data$char[, 4]) &
          !is.na(pathology.data$char[, 6])
      )
      # Sum the numeric grades per type of finding.
      findings.ci.di.ti <- rep(x = 0, times = length(findings.unique))
      for (fi in findings.rows.ci.di.ti) {
        finding.type <- which(findings.unique == pathology.data$char[fi, 4]) # index of the type of the finding
        findings.ci.di.ti[finding.type] <- findings.ci.di.ti[finding.type] + findings.grade.numeric[fi]
      }
      findings.rows[[length(findings.rows) + 1]] <- findings.ci.di.ti
    }
  }
}

if (length(findings.rows) == 0) {
  stop("No compound has expression samples for all selected groups in all three data sets.", call. = FALSE)
}
data.camda.collapsed$findings <- do.call(rbind, findings.rows)
colnames(data.camda.collapsed$findings) <- findings.unique
rownames(data.camda.collapsed$findings) <- paste(data.camda.collapsed$category.samples$compounds, "/", data.camda.collapsed$category.samples$dose, "/", data.camda.collapsed$category.samples$time)
# Keep the findings observed at least once; 'drop = FALSE' keeps the matrix
# (and its row names) even if a single finding remains.
data.camda.collapsed$findings <- data.camda.collapsed$findings[, which(colSums(data.camda.collapsed$findings) > 0), drop = FALSE]

## Pick the matched data sets.

# Columns of the human data whose probe has a counterpart in the rat data, and
# the corresponding columns of the rat data (same order).
probes.human.matched <- which(!is.na(mapping.probes.human.to.rat))
probes.rat.matched <- mapping.probes.human.to.rat[probes.human.matched]
# NOTE(review): the probe mapping was computed against the column names of the
# rat in vitro data but is applied to the rat in vivo data as well - this
# presumably relies on both rat data sets sharing the same probe order; confirm.

# Change in expression: case minus control, per selected sample group.
# 'drop = FALSE' keeps the tables two-dimensional even if only a single probe
# is matched.
data.camda.collapsed$change.exprs.human.invitro <-
  exprs.human.invitro.collapsed[samples.exprs.intersecting.compounds$human$case, probes.human.matched, drop = FALSE] -
  exprs.human.invitro.collapsed[samples.exprs.intersecting.compounds$human$control, probes.human.matched, drop = FALSE]
data.camda.collapsed$change.exprs.rat.invitro <-
  exprs.rat.invitro.collapsed[samples.exprs.intersecting.compounds$rat.invitro$case, probes.rat.matched, drop = FALSE] -
  exprs.rat.invitro.collapsed[samples.exprs.intersecting.compounds$rat.invitro$control, probes.rat.matched, drop = FALSE]
data.camda.collapsed$change.exprs.rat.invivo <-
  exprs.rat.invivo.single.collapsed[samples.exprs.intersecting.compounds$rat.invivo$case, probes.rat.matched, drop = FALSE] -
  exprs.rat.invivo.single.collapsed[samples.exprs.intersecting.compounds$rat.invivo$control, probes.rat.matched, drop = FALSE]

# Label the rows of all views like the rows of the findings.
rownames(data.camda.collapsed$change.exprs.human.invitro) <- rownames(data.camda.collapsed$findings) # intersection.compounds
rownames(data.camda.collapsed$change.exprs.rat.invitro) <- rownames(data.camda.collapsed$findings) # intersection.compounds
rownames(data.camda.collapsed$change.exprs.rat.invivo) <- rownames(data.camda.collapsed$findings) # intersection.compounds

## Remove variables (genes) without variance in the subset.

# Flag the columns of a table in which every value equals the value in the
# first row.
#
# Arguments:
#   x  Matrix or data frame with at least one row.
# Value:
#   Logical vector with one element per column of 'x'.
is.constant.column <- function(x) {
  x <- as.matrix(x)
  first.row <- matrix(x[1, ], nrow = nrow(x), ncol = ncol(x), byrow = TRUE)
  colSums(x == first.row) == nrow(x)
}

# A variable is removed from all views if it is constant in at least one view,
# which keeps the columns of the views aligned.
variables.without.variance <- which(
  is.constant.column(data.camda.collapsed$change.exprs.human.invitro) |
    is.constant.column(data.camda.collapsed$change.exprs.rat.invitro) |
    is.constant.column(data.camda.collapsed$change.exprs.rat.invivo)
)

if (length(variables.without.variance) > 0) {
  # 'drop = FALSE' keeps the tables two-dimensional even if one column remains.
  data.camda.collapsed$change.exprs.human.invitro <- data.camda.collapsed$change.exprs.human.invitro[, -variables.without.variance, drop = FALSE]
  data.camda.collapsed$change.exprs.rat.invitro <- data.camda.collapsed$change.exprs.rat.invitro[, -variables.without.variance, drop = FALSE]
  data.camda.collapsed$change.exprs.rat.invivo <- data.camda.collapsed$change.exprs.rat.invivo[, -variables.without.variance, drop = FALSE]
}

# Print the structure of the resulting data set.
str(object = data.camda.collapsed)

# Save the resulting data set into an xz-compressed file in the working directory.
save(
  list = "data.camda.collapsed",
  file = "data-GFAtoxgen-demo.RData",
  compress = "xz"
)