%================================================== % % LLSimpute - Local Least Squares Imputation % % (Missing Value Estimation Package) % % Author: Hyunsoo Kim % Date: Fall/2003 - Spring/2004 % E-mail: hskim@cs.umn.edu % Personal homepage: % http://www.cs.umn.edu/~hskim % Reference: Missing value estimation for DNA microarray gene % expression data: Local Least Squares Imputation, H. Kim, % G. H. Golub, and H. Park, Bioinformatics, to appear, 2004. % This software may be free downloaded from site: % http://www.cs.umn.edu/~hskim/tools.html % License: % It is free for academic or nonprofit insistutions. % All right is reserved regarding commecial usage. % Please consult if you try to use this package for % commercial purpose. % Comments: % Please let me know if you have done any improvement. % % Sample Usage: % % please use miss0.mat distributed in the same package % load miss0.mat % % read miss0.mat and impute % E=impute_llsq_l2_blind(0,1,210); % idx=find(miss_matrix==1e99); % answer=matrix(idx); % guess=E(idx); % nrmse=sqrt(mean((guess-answer).^2))/std(answer) % you will see ---> nrmse=0.5145 % % Description: % % function E=impute_llsq_l2_blind(set,fig,mink) % impute the missing values without k-value estimator % % Input parameter: % set - the number of set (if set=0, it reads miss0.mat) % fig - draw helpful figure and echo some comments % mink - the number of nearest neighbor genes % Output parameter: % E - the estimated matrix (if set=0, it writes e0.csv and e0.mat) % Data structure: % miss0.mat should contain miss_matrix variable. % missing values of miss_matrix should be 1e99. % Needed other products: % impute_rowavg.m % %==================================================== function [E] = impute_llsq_l2_blind(set,fig,mink) global m n miss_matrix big E gene_include m_gene_include; big=1e99; % load miss_matrix fn = sprintf('miss%d.mat', set); load(fn); % struct array that contains miss_matrix [m,n]=size(miss_matrix);total=m*n; % the number of minimal experiments for estimating missing values by llsq minexp=2; %minexp=n*0.3; %fn = sprintf('%d_%d.k', set,percentage); %fid = fopen(fn, 'w'); fid = 1; % initial guess gene_include=[]; E=[]; f_rowaverage=0; % turn-off default rowaverage %f_rowaverage=1; % default rowaverage if (f_rowaverage==1) | (m-length(find(miss_matrix==big))) < 400 fprintf('consider all genes after imputing missing values by row-average.\n'); [E]=impute_rowavg(set,miss_matrix,minexp); %[E]=impute_knn(miss_matrix); gene_include=1:m; f_rowaverage=1; else fprintf('exclude genes that have missing values for accurate imputation.\n'); E=miss_matrix; [miss_gene,miss_exp]=find(miss_matrix == big); gene_include = setdiff(1:m, miss_gene); % set minexp to 0 since we do not perform rowaverage minexp=0; f_rowaverage=0; end m_gene_include=length(gene_include); k_max=m_gene_include; % NO~~ parameter estimation for mink %[mink] = kestimate_l2(set,minexp,fig); fprintf(fid,'\n\n'); fprintf(fid,'----------------------------------------------\n'); fprintf(fid, 'LLSimpute/L2/ITER\n'); fprintf(fid,'----------------------------------------------\n'); fprintf(fid,'\n\n'); fprintf(fid, 'miss_matrix(%d,%d) total: %d minexp: %.2f f_rowaverage: %d\n\n', ... m, n, total, minexp, f_rowaverage); % we don't know the exact answer for the missing values guess=[]; fprintf(fid, 'Estimating missing values...\n'); for i=1:m missidxj=find(miss_matrix(i,:) == big); len_miss=length(missidxj); nomissidxj=find(miss_matrix(i,:) < big); len_nomiss=length(nomissidxj); if ((len_nomiss < minexp) | (len_nomiss < 2)) & (f_rowaverage==1) fprintf('%dth gene: skip due to nomiss_exp(%d)<%.2f or < 2\n', ... i,len_nomiss,minexp); elseif (len_miss > 0) %if fig==1 %fprintf('%dth: gene apply llsq --- %d missing\n', i, len_miss); %end [A,B,w] = similargene(i,missidxj,nomissidxj); %answer=[answer; matrix(i,missidxj)']; % for mink Apart=A(1:mink,:); Apart=Apart'; Bpart=B(1:mink,:); %linear combination of experiments X = pinv(Apart)*w'; guess = [guess; Bpart'*X]; end%if end%i %store estimated values s=1; for i=1:m missidxj=find(miss_matrix(i,:) == big); len_miss=length(missidxj); nomissidxj=find(miss_matrix(i,:) < big); len_nomiss=length(nomissidxj); if ((len_nomiss < minexp) | (len_nomiss < 2)) & (f_rowaverage==1) % skip elseif (len_miss > 0) E(i,missidxj) = guess(s:s+len_miss-1)'; s=s+len_miss; end end %fclose(fid); % save result matrix fnout=sprintf('e%d.csv', set); csvwrite(fnout, E); fnout=sprintf('e%d.mat', set); save(fnout,'E'); return; function [A,B,w] = similargene(i,missidxj,nomissidxj) global m n miss_matrix E gene_include m_gene_include; % L2-norm distance calculation mm1=1; mm2=m_gene_include; AA1=E(i,nomissidxj); BB1=E(gene_include,nomissidxj); AA2=sum(AA1.^2,2); BB2=sum(BB1.^2,2); distance=repmat(AA2,1,mm2)+repmat(BB2',mm1,1)-2*AA1*BB1'; [sorted,sortidx]=sort(distance); % gene number gene=gene_include(sortidx); % remove the same gene gene(find(gene==i))=[]; A=E(gene,nomissidxj); B=E(gene,missidxj); w=miss_matrix(i,nomissidxj); return;