%================================================== % % LLSimpute - Local Least Squares Imputation % % (Missing Value Estimation Package) % % Author: Hyunsoo Kim % Date: Fall/2003 - Spring/2004 % E-mail: hskim@cs.umn.edu % Personal homepage: % http://www.cs.umn.edu/~hskim % Reference: Missing value estimation for DNA microarray gene % expression data: Local Least Squares Imputation, H. Kim, % G. H. Golub, and H. Park, Bioinformatics, to appear, 2004. % This software may be free downloaded from site: % http://www.cs.umn.edu/~hskim/tools.html % License: % It is free for academic or nonprofit insistutions. % All right is reserved regarding commecial usage. % Please consult if you try to use this package for % commercial purpose. % Comments: % Please let me know if you have done any improvement. % % Sample Usage: % mink=kestimate(0,10,1); % % Description: % % function mink=kestimate(set,minexp,fig) % estimate the model parameter k by k-value estimator % % Input parameter: % set - the number of set, % If the miss_matrix came from miss0.mat, then assign set=0 % minexp - if the number of non-missing values in a gene (row) is less than % minexp, then report the gene (row). % fig - draw helpful figure and echo some comments % Output parameter: % mink - the estimated k-value % %==================================================== function [mink] = kestimate_l2(set,minexp,fig) %percentage=0.05; global m n miss_matrix big E gene_include m_gene_include; k_min=1;k_jump=1.5;k_max=m_gene_include; %fn = sprintf('%d_%d.k', set,percentage); %fid = fopen(fn, 'w'); fid = 1; % determine pretending missing positions p. % estimate the number of missing positions per line. rho=length(find(miss_matrix==big))/m; % rho=5 --> e=1, rho=10 --> e=2, ... e=floor(rho/5); % at least 1 position should be pretended as a missing value. e=max([1 e]); % the number of pretended missing values should be less than % 20% of non-missing positions..... e=min([e n*0.2*(1-0.2)]); % pretended missing positions among the non-missing positions. p=[1:2:2*e]; fprintf(fid, 'pretended missing positions: '); fprintf(fid,'%d ', p); fprintf(fid,'\n\n'); fprintf(fid,'----------------------------------------------\n'); fprintf(fid, 'Kestimate/L2\n'); fprintf(fid,'----------------------------------------------\n'); fprintf(fid,'\n\n'); fprintf(fid, 'miss_matrix(%d,%d) with minexp: %.2f\n', m, n, minexp); % check entire row answer=[]; k = k_min; kcount=0; while(k < k_max) kcount=kcount+1; guess{kcount}=[]; k = ceil(k*k_jump); end fprintf(fid, 'Estimating missing values to determine k-value....\n'); for i=1:m missidxj=find(miss_matrix(i,:) == big); len_miss=length(missidxj); nomissidxj=find(miss_matrix(i,:) < big); len_nomiss=length(nomissidxj); if (len_nomiss < minexp | len_nomiss < 2) fprintf('%dth gene: skip due to nomiss_exp(%d)<%.2f or < 2\n', ... i,len_nomiss,minexp); elseif (len_miss > 0) % determine the number of artifical missing values among p missnum=min([len_miss length(p)]); % too small nomissing entries to make artificial missing entries if (len_nomiss < 10) missnum=1; end p1=p(1:missnum); if fig==1 fprintf('%dth gene: apply llsq_l2 --- %d --> %d missings\n',i, len_miss, length(p1)); end % generate the artificial missing values missidxj=nomissidxj(p1); nomissidxj(p1)=[]; [A,B,w] = similargene(i,missidxj,nomissidxj); answer=[answer; E(i,missidxj)']; % for k survey k = k_min; kcount=0; while(k < k_max) kcount=kcount+1; Apart=A(1:k,:); Apart=Apart'; Bpart=B(1:k,:); %linear combination of experiments X = pinv(Apart)*w'; guess{kcount} = [guess{kcount}; Bpart'*X]; k = ceil(k*k_jump); end%while k end%if end%i k = k_min; kcount=0; xk=[]; mincount=0; while(k < k_max) kcount=kcount+1; nrmse(kcount) = sqrt( mean( (guess{kcount}-answer).^2 ) ) / std( answer ); fprintf(fid, 'nrmsw(k=%d): %f\n', k, nrmse(kcount)); if(k > n) & (mincount==0) mincount=kcount; end xk=[xk k]; k = ceil(k*k_jump); end % try to find global minimum [minnrmse,minkidx]=min(nrmse); mink=xk(minkidx); % try to find minumum with k > n %[minnrmse,minkidx]=min(nrmse(mincount:end)); mink=xk(minkidx+mincount-1); fprintf(fid, 'minnrmsw: %f minkidx:%d mink: %d\n', minnrmse, minkidx, mink); %fclose(fid); % plot graph if fig==1 plot(xk, nrmse,'k^-'); axis([0 k_max 0 2.0]); label=sprintf('k (mink=%d,minnrms: %f)', mink, minnrmse); xlabel(label); ylabel('NRMSE'); title('Preliminary Test for Determine k-value'); drawnow; end function [A,B,w] = similargene(i,missidxj,nomissidxj) global m n E gene_include m_gene_include; % L2-norm distance calculation mm1=1; mm2=m_gene_include; AA1=E(i,nomissidxj); BB1=E(gene_include,nomissidxj); AA2=sum(AA1.^2,2); BB2=sum(BB1.^2,2); distance=repmat(AA2,1,mm2)+repmat(BB2',mm1,1)-2*AA1*BB1'; [sorted,sortidx]=sort(distance); % gene number gene=gene_include(sortidx); % remove the same gene gene(find(gene==i))=[]; A=E(gene,nomissidxj); B=E(gene,missidxj); w=E(i,nomissidxj); return;