Saturday, October 22, 2011

matlab normalization

Using mean and stddev


 function [featureIn,meanFeatIn, stdDevFeatIn] = mynorm_train(featureIn)
meanFeatIn = mean(featureIn,1);
stdDevFeatIn = std(featureIn,1,1);
noSample = size(featureIn,1);
for i=1:noSample
    featureIn(i,:) = (featureIn(i,:) - meanFeatIn) ./ stdDevFeatIn ;
end
end

 function [testFeatureIn] = mynorm_test(testFeatureIn,meanFeatIn,stdDevFeatIn)
    noSample = size(testFeatureIn,1);
    noInputFeat = size(testFeatureIn,2);
    for i=1:noSample
            testFeatureIn(i,1:noInputFeat) = (testFeatureIn(i,1:noInputFeat) - meanFeatIn ) ./ stdDevFeatIn;          
    end  
end





Using range

 function [ N_feature,feature_range,feature_bases ] = normalize( features )
%NORMALIZE Summary of this function goes here
%   Detailed explanation goes here
% samples are in rows

for NoF = 1:size(features,2)
    F_min(NoF) = min(features(:,NoF));
    F_max(NoF) = max(features(:,NoF));
  
    feature_range(NoF) = (F_max(NoF)-F_min(NoF))/2;
    feature_bases(NoF) = (F_max(NoF)+F_min(NoF))/2;
  
    for NoS = 1:size(features,1)
        if (feature_range(NoF) ~=0)
            N_feature(NoS,NoF) = (features(NoS,NoF)-feature_bases(NoF))/feature_range(NoF);
        else
            N_feature(NoS,NoF)=features(NoS,NoF)-feature_bases(NoF);
        end
    end
end

end
   
function [ feature ] = normalize_t( t_features,range,bases )
%NORMALIZE_T Apply a range normalisation to new samples.
%   t_features : samples in rows, features in columns.
%   range      : 1-by-nFeatures half-range per feature.
%   bases      : 1-by-nFeatures mid-point per feature.
% Returns (t_features - bases) ./ range, computed column-wise.

% A feature that was constant has range 0. Dividing by it would yield
% NaN/Inf, so such columns are only centred (divided by 1), which is the
% same treatment normalize gives them.
range(range == 0) = 1;

range = repmat(range,size(t_features,1),1);
bases = repmat(bases,size(t_features,1),1);
feature = (t_features - bases)./range;
end

matlab feature ranking

used function rankfeatures (consider sample as column)
====================================================

train = [trainFeature trainLabel];
% rankfeatures takes samples as columns, hence the transposes.
[IDX ,Z] = rankfeatures(trainFeature' ,trainLabel' ,'Criterion', 'ttest');
%ttest / entropy/ etc...

% Keep as many top-ranked features as half the number of training samples.
% floor() keeps the count an integer (an odd sample count would otherwise
% give a fractional index) and min() keeps it within the ranked list.
topRankedFeature = min( floor( size(trainLabel,1) / 2 ), numel(IDX) );
selected = IDX(1:topRankedFeature);

% The classifier type must be on the same statement as the call: text
% placed after a '...' continuation marker is treated as a comment.
classify( testFeature( :,selected ),   ...
          trainFeature( :,selected ), trainLabel, ...
          'diagquadratic' ) % linear/quadratic/diagquadratic etc


% transpose as it takes sample as column vector
%ttest / entropy/ etc...  
%IDX is the list of indices to the rows in X with the most significant features.  
%Z is the absolute value of the criterion used (see below) 


 

MATLAB discriminant analysis

function used classify
===================

% The discriminant type is passed as a string; unquoted it would be looked up as a variable or function.
predictedClass = classify(testSample,trainSample,trainGroup,'diagquadratic' ) % linear/quadratic/ etc





Wednesday, October 19, 2011

MATLAB check unique string in file

function identifyDuplicate(fileName)
% IDENTIFYDUPLICATE  Split the records of a ranked-motif file into first
% occurrences and repeats, based on the consensus sequence.
%   fileName : optional path of the input file
%              (default '1400M_from_287PS_287NS.ranked').
% Each record is read as: one separator line, one line whose second
% whitespace-delimited token is the consensus sequence, then seven further
% lines that are skipped. Sequences are compared case-insensitively
% (both sides upper-cased).
% Writes two files in the current folder:
%   'unique' : 1-based record indices whose sequence was not seen before
%   'dup'    : 1-based record indices whose sequence was already seen
if nargin < 1
    fileName = '1400M_from_287PS_287NS.ranked';
end
clc;

uniqueSeq = {};   % sequences seen so far, stored as plain strings
uniq = [];
dup = [];
index = 1;

fid = fopen(fileName,'r');
if fid == -1
    error('identifyDuplicate:fileOpen', 'Cannot open %s', fileName);
end
closeFile = onCleanup(@() fclose(fid)); % closes the file even on error

tline = fgetl(fid); % ******
while ischar(tline)

    consensusSeq = fgetl(fid); % Consensus: AAACC
    if ~ischar(consensusSeq)
        break; % file ended in the middle of a record
    end
    curSeq = upper(sscanf(consensusSeq,'%*s %s', [1, inf]));

    % Threshold, Coverage, p-value, r1, r2, r3, r4
    for skip = 1:7
        fgetl(fid);
    end

    % strcmp against the whole cell array replaces the manual search loop
    if any(strcmp(curSeq, uniqueSeq)) % already exists
        dup = [dup; index];
    else % not found
        uniqueSeq{end+1} = curSeq;
        uniq = [uniq; index];
    end

    tline = fgetl(fid); % next ******
    index = index + 1;
end

dlmwrite('unique',uniq,'\t'); % index of unique entry
dlmwrite('dup'   ,dup ,'\t'); % index of duplicate entry
end


Tuesday, October 11, 2011

MATLAB cross validation

% use the built-in cvpartition function ('//' is not a MATLAB comment marker)
samplesize = size( matrix , 1);
c = cvpartition(samplesize,  'kfold' , k); % return the indexes on each fold

///// output in matlab console
K-fold cross validation partition
             N: 10
   NumTestSets: 4
     TrainSize: 8  7  7  8
      TestSize: 2  3  3  2
//////////////////////

for i=1 : k
   % training(c,i) / test(c,i): 1 means the sample is in that set for fold i
   trainIdxs = find(training(c,i) );
   testIdxs  = find(test(c,i)     );

   % Select rows by their index. Indexing with matrix(trainIdxs) would use
   % data values from the matrix as row numbers.
   trainMatrix = matrix ( trainIdxs , : );
   testMatrix  = matrix ( testIdxs  , : );
end

//// now calculate performance


%%  calculate performance of a partition
    % Reads kfold, P, N and the per-fold counts cvtp, cvfn, cvfp, cvtn,
    % prints the summary figures and appends everything to the file 'cv'.
    selectedKfoldSen=[];selectedKfoldSpe=[];selectedKfoldAcc=[];
    indexSen=1;indexSpe=1;indexAcc=1;
    if ( kfold ~= (P+N) )
        % regular k-fold: one value per fold, then the mean over folds
        sensitivity=[]; specificity=[];accuracy=[];
        for i=1: kfold
            nPos = cvtp(i) + cvfn(i); % positives evaluated in this fold
            nNeg = cvfp(i) + cvtn(i); % negatives evaluated in this fold

            if nPos ~= 0 % skip folds with no positive sample
                sensitivity(indexSen) = cvtp(i) / nPos ;
                indexSen = indexSen + 1;
                selectedKfoldSen = [selectedKfoldSen i];
            end

            if nNeg ~= 0 % skip folds with no negative sample
                specificity(indexSpe) = cvtn(i) / nNeg ;
                indexSpe = indexSpe + 1;
                selectedKfoldSpe = [selectedKfoldSpe i];
            end

            accuracy(i) = (cvtp(i)+ cvtn(i)) / ( cvtp(i) + cvfn(i) + cvfp(i) + cvtn(i) );
        end

        sen = mean(sensitivity)
        spe = mean(specificity)
        acc = mean(accuracy)
    else
        % leave one out: pool the counts of all folds
        sensitivity = sum(cvtp) /( sum(cvtp) + sum(cvfn) )
        specificity = sum(cvtn) /( sum(cvfp) + sum(cvtn) )
        accuracy = (sum(cvtp)+sum(cvtn)) / ( sum(cvtp) + sum(cvfn) + sum(cvfp) + sum(cvtn) )
    end

    % Append one tab-delimited line per item, in this order: counts,
    % contributing folds, then the metrics.
    cvRows = { cvtp', cvfn', cvfp', cvtn', ...
               selectedKfoldSen, selectedKfoldSpe, ...
               sensitivity, specificity, accuracy };
    for rowNo = 1:numel(cvRows)
        dlmwrite('cv', cvRows{rowNo}, 'delimiter','\t','-append');
    end

Sunday, October 9, 2011

MATLAB distance based learning

kNN
=========
% 'points' rather than 'all': a variable named all hides the built-in all() function
points   = [ 1 2 ; 3 4 ; 5 6 ; 7 8; 9 10]; % one candidate point per row
newpoint = [ 1 7];
[indexes,distances] = knnsearch(points , newpoint,'k', 3) % 3 nearest neighbour




Thursday, October 6, 2011

MATLAB confusion matrix


%  test_class  & predicted_class must be same dimension
% 'order' - describes the order of label. Here labels are 'g' as positive and 'h' as negative

[C,order] = confusionmat( test_class(1: noSampleTest), predicted_class, 'order', ['g' ;'h'] )

% first row of C -> tp, fn ; second row of C -> fp, tn
tp = C(1,1);   fn = C(1,2);
fp = C(2,1);   tn = C(2,2);

% metrics derived from the four counts (shown in the console as computed)
sensitivity = tp / (tp + fn)
specificity = tn / (fp + tn)
accuracy = (tp + tn) / (tp + fn + fp + tn)
tpr = sensitivity
fpr = 1 - specificity
precision = tp / (tp + fp)
fVal = (2 * tpr * precision) / (tpr + precision)

MATLAB string manipulation



string cell array
================
% cell array with two string entries separated by ';' (one per row)
ar = {'aa';'bbbbb'}



mat2str   (convert number to string)
======================
% Build an option string by appending the loop counter, converted to text
% with mat2str, to a fixed prefix; each result is shown in the console.
parameter = '-s 0 -t 0 -c ';
for c=1:2
    nextp = [parameter mat2str(  c) ]
end

string comparison
================
 sa='ab';
 sb='aba';
 t = strcmp(sa , sb )
if( t == 1)
  display('Yest match');
else
   display('No match')
end


strmatch  find exact string in cell array
=======================
list = {'max', 'minimax', 'maximum', 'max'}
% Indices of the entries equal to 'max'. strcmp replaces the older
% strmatch('max',list,'exact'); list(:) keeps the result a column vector.
x = find(strcmp(list(:), 'max'))

find  string starts with in cell array
=======================
list = {'max', 'minimax', 'maximum', 'max'}
% Indices of the entries that start with 'max'. strncmp compares only the
% first 3 characters and replaces the older strmatch('max',list).
x = find(strncmp(list(:), 'max', 3))

String Comparison
================
 strmatch('ab' , 'abc') ; % returns 1: 'abc' starts with 'ab'
strcmp('ab' , 'abc') ; % returns 0: the two strings are not identical

Find index of  char/ string strfind  find the pattern
====================
% strfind lists the start index of every occurrence of 'in' inside S
S = 'Find the starting indices of the pattern string';
strfind(S, 'in')
ans =
     2    15    19    45
 
Sub String
===================
 
% keep only characters 1 through n of s
s = s(1 : n) 

String trim
==========
% out is ins passed through strtrim (the trimmed copy); ins itself is not modified
out = strtrim(ins)

Reading from file  % parse each line
===========================
function createLogo
% CREATELOGO  Walk through a motif profile file and parse each motif.
% A motif starts at a line whose first character is '*'. The following
% line is tokenised on whitespace (every token is displayed) and its 2nd
% token is taken as the motif length. One more line is skipped, and the
% line after that is parsed as numbers into row 1 of a 4-by-motifLen
% profile matrix.
motifCount=0;
fNamePPM = '../gene.features.v2/v2_C_287_NC_48_profile.model'; % columns as Letter
fid = fopen(fNamePPM);
if fid == -1
    error('createLogo:fileOpen', 'Cannot open %s', fNamePPM);
end
closeFile = onCleanup(@() fclose(fid)); % closes the file even on error

tline = fgets(fid);
while ischar(tline)
    if ~isempty(tline) && tline(1) == '*' % start of a motif
        motifCount = motifCount + 1;

        tline = fgets(fid); % read ID
        if ~ischar(tline)
            break; % file ended right after a motif marker
        end

        %% tokenize the line
        remain  = tline;
        countToken = 0;
        motifLen = []; % reset so a stale length from a previous motif is never reused
        while true
            [word, remain] = strtok(remain);
            if isempty( word  )
                break;
            end
            countToken = countToken + 1;
            if countToken == 2
                motifLen = str2num(word);
            end
            disp( word  );
        end
        if ~isscalar(motifLen)
            error('createLogo:badHeader', ...
                  'Motif %d: 2nd token of the ID line is not a number', motifCount);
        end
        profileMatrix = zeros(4,motifLen);

        tline = fgets(fid); % score
        tline = fgets(fid); % A
        if ~ischar(tline)
            error('createLogo:truncated', ...
                  'Motif %d: file ended before the profile row', motifCount);
        end
        profileMatrix(1,1:motifLen) = str2num(tline);
    end
    tline = fgets(fid); % to read for next iteration
end
end

String tokenizer ( strtok )
==============
% Display the whitespace-delimited tokens of wholeString one per line,
% stopping as soon as strtok yields an empty token.
remain = wholeString;
[word, remain] = strtok(remain);
while ~isempty(word)
    disp(word);
    [word, remain] = strtok(remain);
end

% convert tab delimited line into vector of  number
=====================================
% NOTE(review): str2num is understood to evaluate its input as an expression; confirm myLine comes from a trusted file
myvector = str2num( myLine );

Read file with FIXED COLUMN like matrix but content may be string

fid = fopen('scan1.dat');
% format: one %d field followed by four %f fields, tab separated;
% C is indexed per column below (C{1} is the first column)
C = textscan(fid, '%d\t%f\t%f\t%f\t%f'); % each row has 5 columns
C{1} (1) ; % first column , 1st row
C{1} (2) ; % first column , 2nd row
fclose(fid);

% one output variable per column of the file named by fnameStat
[ covTarget covBG score source consensus ] = textread(fnameStat,'%f\t%f\t%f\t%s\t%s');
% the file has 5 columns: the first 3 are numeric (%f) and the last 2 are strings (%s)


Write a cell array to a file (dlmwrite does not work for cell arrays)


fid = fopen('LLGencodeCommonID.txt', 'w');
if fid == -1
    error('Cannot open LLGencodeCommonID.txt for writing');
end
column = commonID{2}; % 2nd column
if iscell(column)
    % fprintf does not accept a cell argument: expand the entries so that
    % the format is applied once per string (one string per line).
    if ~isempty(column)
        fprintf(fid, '%s\n', column{:});
    end
else
    fprintf(fid, '%s\n', column);
end
fclose(fid);



sscanf:  Parsing word inside of a string
=====================
     % read one line, e.g. "Consensus: AAACC"
     consensusSeq = fgetl(fid) % Consensus: AAACC
     % '%*s' reads and discards the first word, '%s' keeps the next one
     curSeq = sscanf(consensusSeq,'%*s %s', [1, inf]) % curSeq = AAACC


Tuesday, October 4, 2011

MATLAB normalize train and test



% read the training data from the file 'matrix.train'
matrixTrain = load('matrix.train');
 function [matrixTrain , meanFeatIn, stdDevFeatIn] = mynorm_train(matrixTrain)


featureIn = matrixTrain(:,1:end-1);
featureOut = matrixTrain(:,end);

meanFeatIn = mean(featureIn,1);
stdDevFeatIn = std(featureIn,1,1);
meanFeatOut = mean(featureOut,1);
stdDevFeatOut = std(featureOut,1,1) ; 
dlmwrite('normInfo',[meanFeatOut stdDevFeatOut],'delimiter','\t');
noSample = size(featureIn,1);
 for i=1:noSample
            featureIn(i,:) = (featureIn(i,:) - meanFeatIn) ./ stdDevFeatIn ;
            featureOut(i,:) = (featureOut(i,:) - meanFeatOut) ./ stdDevFeatOut ;
 end
matrixTrain = [ featureIn featureOut];

end

% read the test data from the file 'matrix.test'
matrixTest = load('matrix.test');
function [matrixTest] = mynorm_train(matrixTest,meanFeatIn, stdDevFeatIn,meanFeatOut ,stdDevFeatOut )
% Normalise a test matrix with statistics computed on the training data.
%   matrixTest    : samples in rows; columns 1..end-1 inputs, column end output.
%   meanFeatIn    : 1-by-nInputs value subtracted from the input columns.
%   stdDevFeatIn  : 1-by-nInputs value the centred input columns are divided by.
%   meanFeatOut   : value subtracted from the output column.
%   stdDevFeatOut : value the centred output column is divided by.
% Returns the matrix with every column transformed as (x - mean) ./ std.
% NOTE(review): this function shares its name with the training normaliser
% although it operates on the test matrix; it was presumably meant to be
% called mynorm_test - confirm before both are placed on the same path.

noInputFeat = size(matrixTest,2) - 1;

% A zero divisor (constant training column) would produce NaN/Inf;
% such columns are only centred.
stdDevFeatIn(stdDevFeatIn == 0) = 1;
stdDevFeatOut(stdDevFeatOut == 0) = 1;

matrixTest(:,1:noInputFeat) = bsxfun(@rdivide, ...
    bsxfun(@minus, matrixTest(:,1:noInputFeat), meanFeatIn), stdDevFeatIn);
matrixTest(:,noInputFeat+1) = (matrixTest(:,noInputFeat+1) - meanFeatOut) ./ stdDevFeatOut;
end






Sunday, October 2, 2011

matlab matrix to weka .arff format conversion


Input format (the matrix is tab separated and the last column holds the label. This is for a two-class problem;
for multiclass, a few lines of the code need to be changed)
======================================================================
5.0  6.5  7.9 +1
6.6  8.9  6.1 -1
code
=======
function matlabToarff(fNameData, fNameARFF)
% MATLABTOARFF  Convert a numeric matrix file into ARFF (Attribute-Relation
% File Format).
%   fNameData : optional input file name (default 'seqLabel'); it is read
%               with load, and its last column is taken as the class label.
%   fNameARFF : optional output file name (default 'seqLabel.arff').
% Output layout: a @RELATION line, one NUMERIC @ATTRIBUTE per feature
% column (named by its column number), a class attribute with the values
% +1 / -1, then one comma-separated @DATA row per sample. A label equal to
% 1 is written as '+1', every other label as '-1'.
if nargin < 1
    fNameData = 'seqLabel';
end
if nargin < 2
    fNameARFF = 'seqLabel.arff';
end
clc;

% Load the input first: if it fails, the output file is neither created
% nor truncated.
matrix = load(fNameData);
feature = matrix ( : , 1:end-1);
label = matrix (: , end) ;
noFeature = size(feature,2);
noSample = size(feature,1);

fidARFF = fopen( fNameARFF ,'w');
if fidARFF == -1
    error('matlabToarff:fileOpen', 'Cannot open %s for writing', fNameARFF);
end
closeFile = onCleanup(@() fclose(fidARFF)); % closes the file even on error

%%%%%%%%%% header

fprintf(fidARFF,'%s\n\n','@RELATION LNCRNAsequence');
for i=1:noFeature % noFeature
    fprintf(fidARFF,'%s\t%d\t%s\n' ,'@ATTRIBUTE' , i, 'NUMERIC' );
end
fprintf(fidARFF,'%s\n\n','@ATTRIBUTE class {+1,-1 }');

%%%%%%%%%%  data
fprintf(fidARFF,'%s\n','@DATA');
for r=1:noSample
    for c=1:noFeature
        fprintf(fidARFF,'%f,',matrix(r,c) );
    end
    if label(r)==1
        fprintf(fidARFF,'%s\n', '+1');
    else
        fprintf(fidARFF,'%s\n', '-1');
    end
end

end