#Loads and preprocesses the TCGA GBM datasets (CGH, EXPRESSION, MIRNA AND METHYLATION)

##LOAD DATA
# Status goes through message() so it is shown even when this script is run
# via source(); a bare sprintf() only builds a string and is not printed there.
message("Loading data...")

# `path` is expected to be defined by the caller before this script runs.
# NOTE(review): read.delim() defaults to tab-separated input although the
# files carry a .csv extension -- confirm the delimiter of these files.
exprs <- read.delim(file.path(path, "data", "gExpression", "gExprMatrix.csv")) # 422 samples
mthyn <- read.delim(file.path(path, "data", "methylation", "processed.csv")) # 303
mirna <- read.delim(file.path(path, "data", "mirna", "miRNAExprMatrix.csv"))
# CGH loading is currently disabled. my.read.delim reads the big CGH data in
# chunks and then combines it.
#cgh = my.read.delim(file.path(path,"data","arrayCGH","copynumberMatrix.csv"), 50000)

## MAP FEATURES TO ENSEMBL IDS
## (To do: we also need to try Entrez id mapping. The CGH Entrez mapping is
## available at http://tcga-data.nci.nih.gov/tcga/tcgaPlatformDesign.jsp under
## the TCGA ADF link.)

# Ensembl mart shared by all mapping steps below.
mart <- useMart("ensembl", dataset = "hsapiens_gene_ensembl")

######################## Mapping Agilent probes (CGH data) to Ensembl ids ########################
# Disabled together with the CGH loading above.
#pcgh = transform2simcca.input("agilent_cgh_44b",cgh, mart, FALSE) #FOR CGH WE GET 7879 MATCHED ENTRIES FROM ENSEMBL.

# NOTE(review): the meaning of the last (logical) argument of
# transform2simcca.input is not visible from this file; it is TRUE only for
# the miRNA data -- confirm against the helper's definition.

######################## Chromosomal info for expression data ########################
pexprs <- transform2simcca.input("ensembl_gene_id", exprs, mart, FALSE)

######################## Mapping HGNC names (Methylation data) to Ensembl ids ########################
pmthyn <- transform2simcca.input("hgnc_symbol", mthyn, mart, FALSE)

######################## Mapping microRNAs to Ensembl ids ########################
pmirna <- transform2simcca.input("mirbase_id", mirna, mart, TRUE)

######################## Save preprocessed data ########################
preprocessed_dir <- file.path(path, "data", "preprocessed")
if (!dir.exists(preprocessed_dir)) {
  dir.create(preprocessed_dir, recursive = TRUE)
}

# pcgh only exists when the CGH steps above are enabled (or when it was left
# over from an earlier run). Passing a missing object to save() is an error,
# so it is included only when it is actually available.
objects_to_save <- c("pexprs", "pmthyn", "pmirna")
if (exists("pcgh")) {
  objects_to_save <- c("pcgh", objects_to_save)
} else {
  message("pcgh not found (CGH preprocessing is disabled); saving without it.")
}

save(list = objects_to_save,
     file = file.path(preprocessed_dir, "data_ensembl.RData"))

