function [s, x] = nanpca(x, dim)
% NANPCA  Simple linear PCA that tolerates NaN (missing) entries
%
% Usage:
%    s = NANPCA(data, dim)
%    [s, x] = NANPCA(data, dim)
%
% Inputs:
%    data  matrix with one variable per row and one observation per
%          column; entries may be NaN
%    dim   number of principal components to return; must not exceed
%          the number of rows of data
%
% Outputs:
%    s     dim-by-(number of observations) matrix holding the first
%          dim principal components, each divided by the square root
%          of its eigenvalue (i.e. scaled to unit variance)
%    x     the mean-removed data in which every column that contained
%          a NaN has been replaced by its estimate. The row means are
%          NOT added back.
%
% Note: the means and the transformation matrices are computed
% internally but are not returned by this function.
%
% Depends on the external helpers SUMNAN and COVNAN, which are assumed
% to compute a NaN-ignoring sum (with element counts) and a
% NaN-ignoring covariance, respectively -- confirm against their
% implementations.

% reunanen 2000-07-05, 2000-08-03

nsamples = size(x, 2);

% Fail early, with a clear message, instead of hitting an indexing
% error in the very last statement.
if dim > size(x, 1)
  error(sprintf('nanpca: dim (%d) exceeds the number of variables (%d).', ...
      dim, size(x, 1)));
end

% Remove the row means (computed from the non-NaN entries) from the
% data and calculate the covariance matrix
[rowsum, n] = sumnan(x', 1);
datamean = repmat((rowsum')./(n'), 1, nsamples);
xn = x - datamean;
% Rescale by (N-1)/N; presumably covnan normalizes by N-1 and this
% converts to normalization by N, matching the second pass below.
covm = covnan(x') .* ((nsamples-1)/nsamples);

% Eigen-decomposition of the covariance. V maps whitened coordinates
% to data coordinates, iV maps back.
[V0, D0] = eig(covm);
V = V0*sqrt(D0);
iV = inv(V);
epsilon = norm(V)/1e12; % this has nothing profound in it

% Find the columns (observations) containing at least one NaN
inds = find(sum(isnan(xn),1));

% Go through all vectors containing NaN's, replacing
% them one by one. The best guess is where an origin-
% centered PCA-ellipsoid just touches the NaN-subspace.
E = eye(size(xn, 1));
for i=inds
  nans = isnan(xn(:,i));
  fnans = find(nans);

  % Known part of the observation, with zeros in the missing slots
  zxn = xn(:,i);
  zxn(fnans) = 0;

  % Unit vectors spanning the missing coordinates, and their images
  % in whitened coordinates
  p = E(:,fnans);
  pt = iV*p;

  % Orthonormal basis of the complement of span(pt)
  t = null(pt');

  % Least-squares decomposition of the whitened known part in the
  % basis [t, -pt]. Kept in normal-equation form so that results are
  % numerically identical to earlier versions of this function.
  B = [t -pt];
  b = inv(B'*B)*B' * (iV*zxn);
  b1 = b(1:size(t,2));
  b2 = b(size(t,2)+1:end);
  a1 = V*(t*b1);
  a2 = zxn+V*(pt*b2);
  if 0,% ~approx(a1,a2,epsilon)  % remove '0,%' to make sure it works :)
    errstr = sprintf('Error! The two vectors should be the same');
    errstr = sprintf('%s (with accuracy %e),\n', errstr, epsilon);
    errstr = sprintf('%sbut they are not!\n', errstr);
    for k = 1:length(zxn)
      errstr = sprintf('%s[%8.4f] ~ [%8.4f]  (diff: [%13e])\n', ...
	  errstr, a1(k), a2(k), abs(a1(k)-a2(k)));
    end;
    error(errstr);
  end;
  % The whole column is overwritten; a1 is expected to agree with the
  % known entries (see the disabled check above).
  xn(:,i) = a1; % could be a2 as well
end;
x = xn;
% From here on, this is the same as basicpca, but for the data just
% computed, which has no NaN's (they have been replaced with their
% best guesses)

% Remove the means from the filled-in data and calculate the
% covariance matrix. The explicit dimension argument keeps the mean
% per-row even when there is only a single observation.
rowmean = mean(x', 1)';
xn = x - repmat(rowmean, 1, nsamples);
covm = xn * xn' ./ nsamples;

% Calculate eigenvalues and sort them in descending order
[V0, D0] = eig(covm);
[S, I] = sort(-diag(D0));
S = -S;

% Return the sources normalized to unit variance
s = diag(sqrt(1./S(1:dim))) * V0(:,I(1:dim))' * xn;
