% Main script example for running the SOM + EOF Pruning algorithm

% Read the example dataset; the MAT-file is expected to provide the
% variable "data" used throughout the script
load('Data/ExampleDataset.mat');
[rows, columns] = size(data);

% User-adjustable settings
nMonteCarlo = 10;        % number of Monte Carlo rounds used in validation
percentMonteCarlo = 10;  % percentage of data used in each validation round
minGridSize = 5;         % minimum SOM grid size
maxGridSize = 30;        % maximum SOM grid size
maxeof = 50;             % maximum number of EOF to use
eofrounds = 20;          % maximum number of EOF Pruning rounds

% Set aside the test set. SelectSet receives the same percentage that is
% later used for the validation rounds.
[data2, testData, testIndex] = SelectSet(data, percentMonteCarlo);

% Normalize what is left after the test selection. Missing values are not
% affected by the normalization.
% Remember to DeNormalize before the test error calculation!
[dataNorm, normMeans, normStds] = Normalize(data2, 'meancols', 'stdcols');

% Result containers. The error arrays start at Inf so that entries that
% are never computed (e.g. grid sizes below minGridSize) cannot win a
% later min() search.
valErrorsSOM = inf(nMonteCarlo, maxGridSize);
valErrorsEOF = inf(maxGridSize, nMonteCarlo, eofrounds, maxeof - 1);
selectedEOF = zeros(maxGridSize, eofrounds, maxeof);

%% Learning part
% For every candidate SOM grid size: build nMonteCarlo validation splits of
% the normalized data, fill the missing entries of each split from the SOM
% output, store the SOM validation error, and then run EOF Pruning.
%
% The per-round variables dataMC<k>, valData<k>, valIndex<k> and
% allMissing<k> (k = Monte Carlo round number) are created through eval and
% declared global. NOTE(review): presumably this is how EOFPruning reads
% them, since it is only passed nMonteCarlo, eofrounds and maxeof - confirm
% against EOFPruning.
%
% SelectSet is called again inside the gridSize loop, so the validation
% splits are re-created for every grid size.
for gridSize = minGridSize:maxGridSize
  for mc = 1:nMonteCarlo
    % Validationset selections for all MonteCarlo rounds
    % First declare this round's variables global, then split dataNorm into
    % training data, validation values and validation indexes
    eval(['global dataMC' num2str(mc) ' valData' num2str(mc) ' valIndex' ...
      num2str(mc) ' allMissing' num2str(mc)]);
    eval(['[dataMC' num2str(mc) ', valData' num2str(mc) ', valIndex' ...
      num2str(mc) '] = SelectSet(dataNorm, percentMonteCarlo);']);
    
    % Initialization for all datasets
    % Linear indexes of every NaN in this round's data
    eval(['allMissing' num2str(mc) ' = find(isnan(dataMC' num2str(mc) '));']);
    % Run the SOM for the current grid size ...
    eval(['SOMinit = SOM(dataMC' num2str(mc) ', allMissing' num2str(mc) ...
      ', gridSize);']);
    % ... and write row gridSize of its output into the NaN positions
    eval(['dataMC' num2str(mc) '(allMissing' num2str(mc) ...
      ') = SOMinit(gridSize,:);']);
    
    % Calculating SOM validation error
    % Mean squared difference between the SOM-filled entries at the
    % validation indexes and the held-out validation values
    eval(['valErrorsSOM(mc,gridSize) = mean((dataMC' num2str(mc) ...
      '(valIndex' num2str(mc) ') - valData' num2str(mc) ') .^2);']);
  end
  
  % EOF estimations for the validation set values
  % Datasets, validation data and indexes should be available on the workspace
  % The two outputs are stored in the slices belonging to this grid size
  [valErrorsEOF(gridSize,:,:,:), selectedEOF(gridSize,:,:)] = ...
    EOFPruning(nMonteCarlo, eofrounds, maxeof);
end % For loop gridSize


%% Testing part
% Selecting the pruned EOFs with smallest validation error ...
% Average the EOF validation errors over the Monte Carlo rounds (dim 2).
% The averaged dimension is moved to the end with permute instead of being
% removed with squeeze: squeeze would also drop the rounds dimension when
% eofrounds == 1, and the min() calls below would then act on the wrong
% axes. The result is always gridSize x round x pruning.
meanValErrorsEOF = permute(mean(valErrorsEOF, 2), [1 3 4 2]);

% Minimise step by step: over the pruning index, then over the rounds,
% then over the SOM grid sizes
[aa, bestPrunes] = min(meanValErrorsEOF, [], 3);
[cc, bestRounds] = min(aa, [], 2);
[bestValError, bestSOM] = min(cc)   % no semicolon: shown in command window
bestRound = bestRounds(bestSOM)     % no semicolon: shown in command window

% Getting the best set of EOFs
% One row per round up to bestRound; each row holds the sorted selected
% EOFs of that round, and unused trailing columns stay zero
prunesOfBest = bestPrunes(bestSOM, :);
bestEOFs = zeros(bestRound, max(prunesOfBest));
for i = 1:bestRound
  keep = 1:prunesOfBest(i);
  bestEOFs(i, keep) = sort(selectedEOF(bestSOM, i, keep));
end

% ... and plotting the validation errors for all pruned EOF rounds
% The mean is taken explicitly along dim 1 (Monte Carlo rounds); without
% the dimension argument a single-round (1 x maxGridSize) matrix would be
% averaged into one scalar.
figure;
plot(mean(valErrorsSOM, 1));
hold on;
plot(cc, 'r');
drawnow;

% Now the test itself
% Initialization: remember where the values are missing, then fill those
% positions from row bestSOM of the SOM output
maskTest = isnan(dataNorm);
allMissing = find(maskTest);
somInit = SOM(dataNorm, allMissing, bestSOM);
dataNorm(allMissing) = somInit(bestSOM, :);

% Filling with the selected EOFs
dataFilled = EOFCore(dataNorm, maskTest, bestEOFs, 1, bestRound);

% Denormalization, in the reverse order of the Normalization
% (std first, then mean)
dataFilled = DeNormalize(dataFilled, 'std', normStds, 'mean', normMeans);

% Test error calculation: mean squared difference on the held-out test
% values (no semicolon, so the result is shown in the command window)
testResiduals = dataFilled(testIndex) - testData;
testError = mean(testResiduals .^ 2)


%% Final filling of the data set
% Fills the missing values of the original data set "data" (not the
% test-reduced data2) using the grid size, EOFs and round selected above.
% dataFilledFinal starts as a copy of data so that it is defined even when
% there is nothing to fill.
dataFilledFinal = data;
missingInData = isnan(data);

% In case there's missing values
% any() on a matrix works column by column, and "if" on the resulting
% vector is only true when every element is true. The flattened mask is
% tested instead, so a missing value anywhere in the data set triggers the
% filling.
if any(missingInData(:))
  % Dataset normalization. Remember to DeNormalize before error calculation!
  % Normalization does not do anything to the missing values
  [dataNorm, normMeans, normStds] = Normalize(data, 'meancols', 'stdcols');

  % Initialization: fill the missing positions from row bestSOM of the
  % SOM output
  maskFinal = isnan(dataNorm);
  allMissing = find(maskFinal);
  somInitFinal = SOM(dataNorm, allMissing, bestSOM);
  dataNorm(allMissing) = somInitFinal(bestSOM,:);

  % Filling
  dataFilled = EOFCore(dataNorm, maskFinal, bestEOFs, 1, bestRound);

  % DeNormalizing the filled dataset
  dataFilled = ...
    DeNormalize(dataFilled, 'std', normStds, 'mean', normMeans);

  % Only the originally missing entries are taken from the filled data;
  % known values are kept exactly as they were
  dataFilledFinal(missingInData) = dataFilled(missingInData);
end

% End

