# This demonstration allows reproducing the experiments in  "Modelling G x E with historical weather information improves genomic prediction in new environments" by Jussi Gillberg, Pekka Marttinen, Hiroshi Mamitsuka and Samuel Kaski. The kernels can be obtained from Boreal Plant Breeding for non-commericial purposes. In that case, replace the files in data folder with the files obtained from Boreal.


rm(list=ls())

experiment <- list()
experiment$paths <- list()

# set path here
my.path <- '/u/85/lgillber/unix/Desktop/bioinformatics_codes/'

experiment$paths$base.codes.path <- paste0(my.path, 'general')
experiment$paths$kbmf <- paste0(my.path, 'kbmf')
experiment$paths$GBLUP_MOD <- paste0(my.path, 'GBLUP_exps/')
experiment$paths$learn_weights <- paste0(my.path, 'BMKL/')
experiment$paths$A.var.path <- paste0(my.path, 'to_load/variances_var_coef.RData')
experiment$paths$data.path <- paste0(my.path, 'data/')


experiment$current.run <- list()
experiment$current.run$filenames.list <- list()


# load seeds
load(paste0(my.path, 'seeds/seeds.RData'))


# source all functions
source(paste0(experiment$paths$base.codes.path, '/source_demo_code.R'))
source(paste0(experiment$paths$base.codes.path, '/set_dummy_parameters.R'))
source.code(experiment)

# data structure for storing model and other parameters
learnt.params <- list()

test.fold <- 1
validation.fold <- 1

# load data
old.data.path <- paste0("/m/cs/scratch/fidipro_hm/EXPERIMENTS/e_pb17/indexes/model_inputs_setup_1_test.fold_", test.fold, "_g.ds.f_1_e.ds.f_1_gauss.par_1_pheno_sato_e.d.m_1_validation.fold_", validation.fold, ".RData")

new.data.path <- paste0(experiment$paths$data.path, "model_inputs_tf_", test.fold, "_vf_", validation.fold, ".RData")

#load(old.data.path)
load(new.data.path)




for (method in c('GBLUP_REML_FE', 'KBMF_weights', 'KBMF_1way_g_2way_fe_1k','KBMF_1way_g_2way_fe_hst_1k')) {
  
  if (length(grep(x=method, pattern='GBLUP'))>0)  {
    
    #method <- 'GBLUP_REML_FE'
    
    # learn GBLUP
    gblup.preds <- learn.GBLUP.pars(model.inputs, g.ker.sel, method)
    
    # the path for saving GBLUP results
    path.tmp <- paste0(my.path, 'runs/tf_', test.fold, '/', 'vf_', validation.fold, '/GBLUP')
    
    # if necessary, create path
    if (!file.exists(path.tmp)) dir.create(path.tmp, recursive = TRUE)
    
    # save predictions
    save(gblup.preds, file=paste0(path.tmp, '/GBLUP_preds.RData'))
  }
  
  if (method == 'KBMF_weights') {
   
    
    # the path for saving kernel weights results
    path.tmp <- paste0(my.path, 'runs/tf_', test.fold, '/', 'vf_', validation.fold, '/BEMKL_weights')
    
    # random seed
    seed <- seeds$KBMF_weights$ncv[test.fold, validation.fold]
    set.seed(seed=seed)	
    
    # normalize environment kernels wrt summed variance
    learnt.params$kernel.normalizers.env <- unlist(lapply(model.inputs$env.kernels$train, function(x){sqrt(1/sum(apply(x,2,var)))}))
    
    # same for genotypes
    learnt.params$kernel.normalizers.geno <- unlist(lapply(model.inputs$geno.kernels$train, function(x){sqrt(1/sum(apply(x,2,var)))}))
    
    
    # Genotype: process the kernels into the KBMF format
    pars <- list()
    pars$ker.sel <- g.ker.sel
    pars$weight.bias <- NULL
    Kx <- process.kernels(kernels.to.normalize=model.inputs$geno.kernels$train, kernel.normalizers=learnt.params$kernel.normalizers.geno, pars=pars)
    
    # Environment: process the kernels into the KBMF format
    pars <- list()
    
    if (exists('e.ker.sel')) {
      pars$ker.sel <- e.ker.sel
      pars$weight.bias <- NULL
    } else pars$weight.bias <- 0
    
    Kz <- process.kernels(kernels.to.normalize=model.inputs$env.kernels$train, kernel.normalizers=learnt.params$kernel.normalizers.env, pars=pars)
    
    # set input parameters for BEMKL
    parameters <- set.parameters.lw(method, seed)
    
    # regress the experiment means
    #should be an Ntra x 1 matrix containing target outputs
    ytrain <- colMeans(model.inputs$pheno.data$train$pheno.matrix, na.rm = TRUE)
    
    time.start <- proc.time()[[3]]
    
    #perform training
    state <- bemkl_supervised_regression_variational_train(Km=Kz, y=ytrain, parameters=parameters)

    time.finished <- proc.time()[[3]]
    run.time <- time.finished - time.start

    learnt.params$state <- state
    learnt.params$run.time <- run.time

    # save kernel weights
    BEMKL.weights <- list()
    ez.tmp <- learnt.params$state$be$mu[-1]
    ez.tmp <- ez.tmp / sum(ez.tmp^2)
    names(ez.tmp) <- names(learnt.params$kernel.normalizers.env)
    names(ez.tmp)[length(ez.tmp)] <- ''
    BEMKL.weights$ez <- ez.tmp

    
    # if necessary, create path
    if (!file.exists(path.tmp)) dir.create(path.tmp, recursive = TRUE)
    
    # save predictions
    save(BEMKL.weights, file=paste0(path.tmp, '/BEMKL_weights.RData'))
  }
  
  if (method == 'KBMF_1way_g_2way_fe_1k') {
    
    #3,4,5
    for (K in c(3)) {
      # 0.4,0.8,1.2
      for (var.coef in c(0.4)) {
        # 0.5, 1
        for (PTVE.2.way in c(0.5)) {
          
          # the path for saving results
          path.tmp <- paste0(my.path, 'runs/tf_', test.fold, '/', 'vf_', validation.fold, '/', method)
          
          filename.tmp <- paste0(path.tmp, '/', method,'_K', K, '_var_coef_', var.coef, '_PTVE_2_way_', PTVE.2.way, '.RData')
	        experiment$current.run$filenames.list$model.filename <- filename.tmp
          
          # normalize environment kernels wrt summed variance
          learnt.params$kernel.normalizers.env <- unlist(lapply(model.inputs$env.kernels$train, function(x){sqrt(1/sum(apply(x,2,var)))}))
          
          # same for genotypes
          learnt.params$kernel.normalizers.geno <- unlist(lapply(model.inputs$geno.kernels$train, function(x){sqrt(1/sum(apply(x,2,var)))}))
          
          
          # Genotype: process the kernels into the KBMF format
          pars <- list()
          pars$ker.sel <- g.ker.sel
          pars$weight.bias <- NULL
          Kx <- process.kernels(kernels.to.normalize=model.inputs$geno.kernels$train, kernel.normalizers=learnt.params$kernel.normalizers.geno, pars=pars)
          
          
          
          # Environment: process the kernels into the KBMF format
          pars <- list()
          if (exists('e.ker.sel')) {
            pars$ker.sel <- e.ker.sel
            pars$weight.bias <- NULL
          } else pars$weight.bias <- 0
          
          Kz <- process.kernels(kernels.to.normalize=model.inputs$env.kernels$train, kernel.normalizers=learnt.params$kernel.normalizers.env, pars=pars)
          
          
          # Re-normalize in case '_1k'
          if (length(grep(x=method, pattern='_1k'))>0) {
            learnt.params$kernel.normalizer.1k <- sqrt(1/sum(apply(drop(Kz),2,var)))     
            Kz <- Kz * learnt.params$kernel.normalizer.1k
          }
          
          
          learnt.params$pheno.variables <- unique(model.inputs$pheno.data$train$column.key['fenotyyppi', ])
          
          # set input parameters for KBMF
          varargin <- set.parameters(method, seed, pheno.matr=model.inputs$pheno.data$train$pheno.matrix, learnt.params, filenames.list=experiment$current.run$filenames.list, column.key=model.inputs$pheno.data$train$column.key)
          
          
          time.start <- proc.time()[[3]]
          #source("kbmf1k1mkl/kbmf1k1mkl_semisupervised_regression_variational_train.R")
          
          state <- kbmf_regression_train(Kx=Kx, Kz=Kz, Y=model.inputs$pheno.data$train$pheno.matrix, R=K, varargin=varargin)
          time.finished <- proc.time()[[3]]
          run.time <- time.finished - time.start
          
          learnt.params$state <- state
          
          if (n.iter==500) {
            if (!file.exists(path.tmp)) dir.create(path.tmp, recursive = TRUE)
            
            save(learnt.params, file=experiment$current.run$filenames.list$model.filename)
          }
          

          validation.set.scores <- get.test.set.scores(model.inputs, learnt.params, set='valid')
                    
        }
      }
    }
    
    ########################################################
    # 1) select parameters that give best performance in
    # training set
    #
    # 2) pool validation data with training data,
    # can be found by loading the kernels at 
    #    new.data.path <- paste0(experiment$paths$data.path, "model_inputs_tf_", test.fold, ".RData")
    #
    #
    # 3) learn model with selected parameters using
    # the functions within the nested CV loop
    #
    #
    # 4) compute test set scores
    # For demonstration purposes, can be computed with the 
    # model learnt before pooling the validation data to
    # the test set
    test.set.scores <- get.test.set.scores(model.inputs, learnt.params, set='test')
  }
  
  if (method == 'KBMF_1way_g_2way_fe_hst_1k') {
    
    # 3,4,5
    for (K in c(3)) {
      # 0.4,0.8,1.2
      for (var.coef in c(0.4)) {
        # 0.5, 1
        for (PTVE.2.way in c(0.5)) {
          
          
          # the path for saving results
          path.tmp <- paste0(my.path, 'runs/tf_', test.fold, '/', 'vf_', validation.fold, '/', method, '/', method,'_K', K, '_var_coef_', var.coef, '_PTVE_2_way_', PTVE.2.way, '.RData')
          experiment$current.run$filenames.list$model.filename <- path.tmp
          
          model.load.path <- gsub(path.tmp, patter='hst_', replacement = '')
          
          if (file.exists(model.load.path)) load(model.load.path)
          
          validation.set.scores <- get.test.set.scores(model.inputs, learnt.params, set='valid')
          
        }
      }
    }
    
    ########################################################
    # 1) select parameters that give best performance in
    # training set
    #
    # 2) pool validation data with training data,
    # can be found by loading the kernels at 
    #    new.data.path <- paste0(experiment$paths$data.path, "model_inputs_tf_", test.fold, ".RData")
    #
    #
    # 3) learn model with selected parameters using
    # the functions within the nested CV loop
    #
    #
    # 4) compute test set scores
    # For demonstration purposes, can be computed with the 
    # model learnt before pooling the validation data to
    # the test set
    test.set.scores <- get.test.set.scores(model.inputs, learnt.params, set='test')
  }
}




