# Script for running probabilistic drug connectivity mapping analysis

# Copyright (C) 2013-2014 Juuso Parkkinen.
# All rights reserved.
# 
# This program is open source software; you can redistribute it and/or modify it under the terms of the FreeBSD License (keep this notice): http://en.wikipedia.org/wiki/BSD_licenses
# 
# This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.

##############################################################


message("Set working directory to the folder where you unzipped the files!")


## Run Group Factor Analysis (GFA) ########################

# This section fits the GFA model on the CMap expression data and stores it
# on disk. Fitting takes hours, and a precomputed model file is distributed
# with the code, so the fit is only run when that file is not present.
# Otherwise the existing model is loaded instead of being overwritten.

message("Note! Running GFA takes a few hours for the CMap data. The precomputed model output is provided in file 'GFA_model_K350_seed1.RData', so this step can be skipped!")

# Load preprocessed CMap data
load("CMap_expression_list.RData")

# Install and load the CCAGFA package
# install.packages("CCAGFA")
library(CCAGFA)

# Default GFA options
opts <- getDefaultOpts()

# Set number of components
K <- 350

# Random seed used for the model fit (also stored in the model object)
seed <- 1

# File holding the fitted model; the same file name is loaded by the
# later sections of this script
GFA.model.file <- "GFA_model_K350_seed1.RData"

if (file.exists(GFA.model.file)) {
  # A fitted model is already available: reuse it rather than refitting
  # for hours and overwriting the provided file.
  message("Found '", GFA.model.file, "', skipping the GFA run. ",
          "Delete or rename the file to refit the model.")
  load(GFA.model.file)
} else {
  # Run model
  set.seed(seed)
  GFA.model <- GFA(CMap.expression.list, K, opts)
  GFA.model$seed <- seed

  # Remove empty components
  # NOTE(review): not implemented yet; CCAGFA appears to provide GFAtrim()
  # for this purpose -- confirm before using, since trimming would change
  # the model relative to the precomputed file.
  message("TODO")

  # Save model
  save(GFA.model, file = GFA.model.file)
}


## Estimate GFA factors for a new sample ###############

# Preprocessed CMap expression data (one matrix per cell line)
load("CMap_expression_list.RData")

# Fitted GFA model
load("GFA_model_K350_seed1.RData")

# GFA package
# install.packages("CCAGFA")
library(CCAGFA)

# A few drugs from the CMap data act as the example "new" samples
example.drugs <- c("tretinoin", "metformin", "monorden")

## Example 1: all three CMap cell lines are observed
# Pick the rows of the example drugs from every cell line, keeping matrices
new.sample1 <- lapply(
  CMap.expression.list,
  function(view) view[example.drugs, , drop = FALSE]
)

# 'GFApred()' estimates the GFA factors; here every data view is marked
# as observed in 'pred'
new.sample1.Z <- GFApred(
  pred = rep(1, 3),
  Y = new.sample1,
  model = GFA.model
)$Z

# The new estimate should be almost the same as the original one
plot(new.sample1.Z[example.drugs[1], ], GFA.model$Z[example.drugs[1], ])

## Example 2: only one CMap cell line (HL60) is observed; the other two
## cell lines are filled with NA's
new.sample2 <- new.sample1
new.sample2[2:3] <- lapply(new.sample2[2:3], function(view) {
  view[] <- NA
  view
})

# Estimate the factors again with 'GFApred()'; the 'pred' vector now says
# that only the first 'data view' (cell line) is observed
new.sample2.Z <- GFApred(
  pred = c(1, 0, 0),
  Y = new.sample2,
  model = GFA.model
)$Z

# This time the estimate differs more from the original one,
# i.e. factors not active in the used cell line have zero values in the new sample
plot(new.sample2.Z[example.drugs[1], ], GFA.model$Z[example.drugs[1], ])


## Run probabilistic connectivity mapping  ######################

# This section computes drug-to-drug distances with
# 'compute_probcmap_single()' and prints, for the first example drug, the
# closest drugs together with their ATC codes. It uses 'GFA.model',
# 'example.drugs', 'new.sample1.Z' and 'new.sample2.Z' from the section above.

message("Need to run the above section first!")

# Source probabilistic connectivity mapping functions
source("ProbCMap_functions.R")

# Load preprocessed ATC codes for CMap drugs
load("ATC.RData")

# Build a results table from a named vector of distances.
#
# top.res: named numeric vector; names are drug names, values are distances
# atc:     table with columns 'DrugName' and 'Code' used to look up ATC codes
#
# Returns a data frame with columns 'Drug', 'Dist' and 'ATC'. Drugs that are
# not found in 'atc$DrugName' get NA as their ATC code.
top_results_table <- function(top.res, atc) {
  data.frame(
    Drug = names(top.res),
    Dist = top.res,
    ATC = atc$Code[match(names(top.res), atc$DrugName)],
    row.names = NULL
  )
}


# Run single drug retrieval within the Connectivity Map data
distmat.cmap <- compute_probcmap_single(model.Z=GFA.model$Z, query.Z=NULL)

# Show top results (smallest distances) with ATC codes for one example drug
top.res <- head(sort(distmat.cmap[example.drugs[1], ]))
top.res.df <- top_results_table(top.res, ATC)
top.res.df
#            Drug      Dist     ATC
# 1     tretinoin 0.0000000 D10AD01
# 2  isotretinoin 0.4465782 D10AD04
# 3 rosiglitazone 0.8705181 A10BG02
# 4      ricinine 0.8711137    <NA>
# 5   todralazine 0.8930645    <NA>
# 6   rilmenidine 0.8988807 C02AC06


# Run single drug retrieval for the first example set
distmat.new1 <- compute_probcmap_single(model.Z=GFA.model$Z, query.Z=new.sample1.Z)

# Show top results (smallest distances) with ATC codes for one example drug
top.res <- head(sort(distmat.new1[example.drugs[1], ]))
top.res.df <- top_results_table(top.res, ATC)
top.res.df
#            Drug         Dist     ATC
# 1     tretinoin 1.260103e-13 D10AD01
# 2  isotretinoin 4.465782e-01 D10AD04
# 3 rosiglitazone 8.705181e-01 A10BG02
# 4      ricinine 8.711137e-01    <NA>
# 5   todralazine 8.930645e-01    <NA>
# 6   rilmenidine 8.988807e-01 C02AC06

# Run single drug retrieval for the second example set
distmat.new2 <- compute_probcmap_single(model.Z=GFA.model$Z, query.Z=new.sample2.Z)

# Show top results (smallest distances) with ATC codes for one example drug
top.res <- head(sort(distmat.new2[example.drugs[1], ]))
top.res.df <- top_results_table(top.res, ATC)
top.res.df
#                      Drug      Dist     ATC
# 1               tretinoin 0.1686455 D10AD01
# 2            isotretinoin 0.5427970 D10AD04
# 3             rilmenidine 0.9103963 C02AC06
# 4                ricinine 0.9156709    <NA>
# 5               vincamine 0.9249275 C04AX07
# 6 methanthelinium bromide 0.9262611    <NA>



## Run combinatorial retrieval #######################

message("Need to run the above sections first!")

# Probabilistic connectivity mapping functions
source("ProbCMap_functions.R")

# Combinatorial retrieval for the second example set
# Warning, this is quite slow!
distmat.comb <- compute_probcmap_combinatorial(
  model.Z = GFA.model$Z,
  query.Z = new.sample2.Z
)


## Process results

library(reshape2)

# Take the slice of the first example drug, melt it into a long data frame
# (entries that are NA are dropped) and name the columns
res.df <- setNames(
  reshape2::melt(distmat.comb[, , example.drugs[1]], na.rm = TRUE),
  c("Drug1", "Drug2", "Dist")
)
# Sort by increasing distance
res.df <- res.df[order(res.df$Dist), ]
# Drop every pair that contains the query drug itself; which() skips NA
# comparisons, as subset() would
res.df <- droplevels(
  res.df[
    which(res.df$Drug1 != example.drugs[1] & res.df$Drug2 != example.drugs[1]),
    ,
    drop = FALSE
  ]
)
# Add ATC codes
res.df$Drug1.ATC <- ATC$Code[match(res.df$Drug1, ATC$DrugName)]
res.df$Drug2.ATC <- ATC$Code[match(res.df$Drug2, ATC$DrugName)]

# Print top results
head(res.df)
#                       Drug1        Drug2      Dist Drug1.ATC Drug2.ATC
# 294004         isotretinoin  metrifonate 0.3693980   D10AD04   P02BB01
# 245139         griseofulvin isotretinoin 0.3713313   D01AA08   D10AD04
# 338520         isotretinoin orphenadrine 0.3715663   D10AD04      <NA>
# 347854         isotretinoin  paromomycin 0.3748125   D10AD04   A07AA06
# 244858 acetylsalicylic acid isotretinoin 0.3751508   A01AD05   D10AD04
# 245159   hydroflumethiazide isotretinoin 0.3761867   C03AA02   D10AD04




