Skip to main content

R tutorial


Install R in linux
============
In CRAN home page, the latest version is not available. So, in fedora,
Open the terminal
  • yum list R  --> To check the latest available version of r
  • yum install R --> install R version
  • yum update R --> update current version to latest one

0 find help
============
?exact topic name (  i.e.   ?mean )


0.0 INSTALL 3rd party package
 ====================
install.packages('mvtnorm' , dependencies = TRUE , lib='/home/alamt/myRlibrary/')   #  install new package

BED file parsing (Always use read.delim it is the best)


library(MASS)
#library(ggplot2)

dirRoot="D:/research/F5shortRNA/TestRIKEN/Rscripts/"
dirData="D:/research/F5shortRNA/TestRIKEN/"
setwd(dirRoot)
getwd()

myBed="test.bed"
fnmBed=paste(dirData, myBed, sep="") # ccdsHh19.bed   tmp.bed




## Read bed use read.delim - it is the  best

mybed=read.delim(fnmBed, header = FALSE, sep = "\t", quote = "\"", dec = ".", fill = TRUE, comment.char = "");

## PARSE BED column

noGene=dim(mybed)[1]
allChr=as.matrix(mybed[1])
allStart=as.matrix(mybed[2])
allEnd=as.matrix(mybed[3])
allWidth=allEnd - allStart

## Iterate over rows of bed

for(i in myStart: myEnd) {
 
  curChr = allChr[i]
  curStart = allStart[i]
  curWidth = allWidth[i]
 
  if(i%%1000 ==0){
    print("Now calculating for geneno: ");
    print (i)
  }
 
}

## Find the index of row matching value a particular column

matchIdx = which( allChr == 'chr19')

mydata=read.delim(fnmCSV, header = FALSE, sep = ",", quote = "\"", dec = ".", fill = TRUE, comment.char = "");
  mydataRiyadh = mydata[ mydata$V6=="Riyadh Province" , ]
  mydataMakkah = mydata[ mydata$V6=="Makkah Province" , ]
 

0.1 Set path in .libPath() 3rd party package
==============================
.libPaths() -- show the path R will search for library

to append a new path:

.libPaths( c( .libPaths(), "/NewPath") )

for example

.libPaths( c( .libPaths(), "/home/alamt/R/x86_64-unknown-linux-gnu-library/3.0/") )

CBRC cluster R .libPath() should be
========================

 [1] "/home/alamt/R/x86_64-pc-linux-gnu-library/2.15"  
[2] "/home/apps/R/3.0.0/lib64/R/library"              
[3] "/home/alamt/R/x86_64-unknown-linux-gnu-library/3.0"




0.1.1. Call R code from terminal , shell script
====================

in shell type:
Rscript myCode.R tanviralam 99

myCode.R
------------------
args  =commandArgs(TRUE)
myWorkDir = as.character(args[1])
myval = as.double(args[2])
print(myval)
setwd(myWorkDir)




0.2. Use the 3rd party package
====================

library(mvtnorm)   #  use external package

0.3. Complete example of reading a tab delimitted file and build feature and label factor, In R always use factor instead of matrix
===============================================

data = read.delim( "feature_2_111.feature" , header= FALSE, sep="\t");
mymat =  as.matrix(data);

totRow =-nrow (data)
totFeat = ncol (data)
print(totRow);
print(totFeat);

myFeatureFrame = data[ c(1:totRow), c(1:totFeat) ] ;
myLabelFrame = data[ c(1:totRow), c(totFeat+1) ] ;


0.1. Create DataFrame Manually

// Create a dataframe with two variable in two columns: ExprValue and ClusterNo    
val=as.matrix(valforplot)
cno = as.matrix(clusternoforplot)
dfForPlot = data.frame( clusterno=cno ,  Exprvalue=val)
colnames(dfForPlot)
# check the value of first variable
dfForPlot$ExprValue

1. Create Matrix Manually


B = matrix(   c(2, 4, 3, 1, 5, 7),   nrow=3,    ncol=2) ;

http://www.r-tutor.com/r-introduction/matrix/matrix-construction


1. Read Tab delimited file ( Read bed file )

mybed=read.delim(fnmBed, header = FALSE, sep = "\t", quote = "\"", dec = ".", fill = TRUE, comment.char = "");

noGene=dim(mybed)[1]
allChr=as.matrix(mybed[1])
allStart=as.matrix(mybed[2])
allEnd=as.matrix(mybed[3])
allWidth=allEnd - allStart



resultFinal=NULL;
print("Calculation of PhastCons score started ...")
for(i in 1:noGene) {
 
  curChr = allChr[i]
  curStart = allStart[i]
  curWidth = allWidth[i]

  }
 
}



1. Read as frame and convert to matrix


data = read.delim( "test.dat" , header= FALSE, sep="\t")
myMat =  as.matrix(data)



// read as tab delimited file
mydata = read.delim( fnmData ,header=TRUE,  sep="\t")

 2. Write matrix into file

=====================

library(MASS)
write.matrix(mymat, file = "data.in", sep = "\t")


2.a Write data frame into file

==========================

results = data.frame();
uniqTF=unique(data$V9);

for (tf in uniqTF)
{
    tmp = subset( data, data$V9==tf);
    results = rbind( results , data.frame(tmp$V1,tmp$V2,tmp$V3,tmp$V4,tmp$V5,tmp$V6,tmp$V7,tmp$V8,tmp$V9 ) );

}


write.table(results, file = "track.tf.gff",col.names=F, row.names=F, quote=FALSE, sep = "\t");


How to remove double quote in writing ( use quote=FALSE)

write.table(results, file = "track.tf.gff",col.names=F, row.names=F, quote=FALSE, sep = "\t");



2.b. Write output image into file
==========================

par(mfrow=c(4,3))  // divide canvas into 4 by 3 cells

png(filename="./netsimQ05.png",   bg="white")  ;  # create device
plot(apres1a) ; # make your plot 
dev.off() # stop device


figname='GM12878_All.eps'
postscript(file=figname, onefile=FALSE, horizontal=FALSE , bg="white")dev.off()

3. Convert to matrix

=================

mymat = as.matrix(myframe)

4. dimension of matrix

===============
dim(mymat)
norow = dim (mymat)[1]  OR   nrow(mymat)
nocol  = dim(mymat) [2]   OR  ncol(mymat)



4. Common Matrix Operation
====================================
# Removing rows with all zeros
mat = sample(0:1, 49, TRUE)
dim(mat) = c(7, 7)
mat[c(1,4), ] = 0
ind = rowSums(mat) != 0
newMat = mat[ind,]

# Fill matrix by same value
a= replicate(times , value)
a = replicate(10, 9999) creates a structure with ten values, where each value equals to 9999.

# Find index of all positions matching a value

matchIdx = which( mymatrix == value)  # it returns the index of matrix entry whose value match your value

mydata=read.delim(fnmCSV, header = FALSE, sep = ",", quote = "\"", dec = ".", fill = TRUE, comment.char = "");
  mydataRiyadh = mydata[ mydata$V6=="Riyadh Province" , ]
  mydataMakkah = mydata[ mydata$V6=="Makkah Province" , ]
 


# Transpose of a matrix
mymatTrans = t(mymat)

# Subselect ROW based on Value in column

mat1common = mat2[ mat1$V1 %in%  ma2$V1, ]


# Sort Matrix based on a column

myMatrixSorted = myMatrix [order( myMatrix$V1),]


5. common statistics
===============

mean of columns :  
colMeans(mymat)

mean of allvalue : 
mean(mymat)

variance : 

var(data)

range : 
range(data[ , 1])  # range of first column

Speakman/Pearson correlation coefficient of rows of matrix:

library(Hmisc)
mymat= as.matrix(mtcars) # here mtcars is the data frame having 32 row, 11 column

mymatTrans=t(mymat) # As these calculate correlation between columns


## cor:  measure each pair column cor-relation between 2 matrices
resultCorr = cor( mymatTrans1 , mymatTrans2 , method="spearman")
resultCorr[is.na(resultCorr)] =  0.0
finalResult = diag(resultCorr) # select each entry of diagonal(i,i)  as result


## rcorr: measure all pair column cor-relation
resultCorr = rcorr( mymatTrans , type="spearman")

resultCorr$r # show the correlation
resultCorr$P  # shows the p-value of cor-relation



dlmwrite ("mycoeff.txt" , resTrans$r )

P-value correction ; P-value adjustment for multiple adjustment
 http://www.r-bloggers.com/example-9-30-addressing-multiple-comparisons/
 https://stat.ethz.ch/R-manual/R-devel/library/stats/html/p.adjust.html

p.adjust(p, method = p.adjust.methods, n = length(p))
p.adjust.methods
# c("holm", "hochberg", "hommel", "bonferroni", "BH", "BY",   "fdr", "none")
Example
pvals = c(.001, .001, .001, .02, .22, .59, .87)
BONF = p.adjust(pvals, "bonferroni")
BH = p.adjust(pvals, "BH")
res = cbind(pvals, BH=round(BH, 3), BONF=round(BONF, 3))

6 sorting   and ordering to get sorted index:
================================
sortedVal  = sort ( data )
sortedVal  = rev  ( sort (data )  )

sortedId = order ( data , decreasing=TRUE )
sortedId  = order ( data ,  decreasing= FALSE )




6. Create vector of object provided into list : use c to create that vector
====================================
mynames = c ( 'ta' , 'al')
myvect =  c ( 5 , 3 ,89)
myvect = c ( myvect , 90)   # concate new element

insert element into vector

 myVect=c();
 
  for(i in 1: N)
  {
       
    cur =AllValue[i]
    if(i==1){
        myVect= c(cur)
    }else
    {
        myVect= append(myVect, cur)
    }
   
  }


7 Query
===================

find most frequent word and its index in array
---------------------------
mostFreq =  myarr="myarr" table="table" which.max="which.max"
mostFreqIndex =
myarr="=mostFreq,arr.ind=TRUE)"

8. Function calling
=========================

callee.r
--------------
myfnc1 <- function ( fname, totA ) {
    print(  paste("inside callee:" , fname) );
    print(  paste("inside callee:" , totA));
    return ( totA) # You must use (  ) to return value
}

caller.r
------------------
setwd('.')
source('callee.r');
y  <-myfnc1("tanvir", 5 );

call from command prompt :
-------------------------------------------
/usr/bin/Rscript caller.r  myval1  myval2


Include functions from another file
======================
source('filename.r');


pass command line argument to rscript call
===============================
args  < commandArgs (TRUE)
print(args[1])
print(args[2])



9. String Operation
==============



String Array
=========
cellName=c("HM1","HM2" );
print(cellName[i] );

String concat
===========

a="hell"
b="fire"
c=paste( a, b,sep = "")  

String Length
=============
word=”apple”
nchar(word)
Substring
=======
substr(word,start=2,stop=5)

10. loop
===========

cellName=c("HM1","HM2" );
 for(i in 1:length(cellName)) {
    print(cellName[i] );
    filePath=paste(myDir , '/data/',cellName[i] , sep="");
    print(filePath)
}
11. If - else if - else (DON'T PUT SPACE BEFORE else)
==========================================
val=6
if(val>10)
{
    print("it is above 10");
}else if (val >5)
{
    print("it is above 5");
}else
{
    print("it is default");
}

Using Hash / Hashmap / Key Value pair in R [Values are not inserted serially]




use library
=======
 library(hash)

# create my hash
=============
myHashCagt <- br="" hash="">


# insert new values in to hash
====================
  .set( myHashCagt, keys=curKey, values=curCluster )

# Find the expression values  of those  key
================================
valExpr = values(myHashCagt, keys=curKey)
valAsMat = as.matrix(valExpr) # N row, 1 column

12. Fisher 's exact test : package : stats
======================
myContin = matrix(c(1,4,7,4), nrow = 2);
fisher.test(myContin ,alternative = "two.sided" ) ; // two sided

fisher.test(myContin ,alternative = "greater" ) ; // right-hand sided
fisher.test(myContin ,alternative = "less" ) ; // left-hand sided

13. GRAPH PLOT
==============
http://rfunction.com/code/1203/cex-guide.pdf

Best tutorial: with details example 

http://www.programmingr.com/content/controlling-margins-and-axes-oma-and-mgp/

http://www.stat.auckland.ac.nz/~paul/Talks/Rgraphics.pdf

4 BASIC PARAMETER
par(cex.axis=1, cex.lab=2, cex.main=1.2, cex.sub=1)

 cex.axis= fix font size of axis tick
 cex.lab= fix font size size of axis label
 cex.main= fix size of main header font size
 cex.sub= fix size of sub header font size

http://www.r-bloggers.com/setting-graph-margins-in-r-using-the-par-function-and-lots-of-cow-milk/

par(mar=c(5.1,4.1,4.1,2.1)  , mgp(3,1,0) )

# mar ( bottomMargin, leftMargin, topMargin, rightMargin)
# mgp ( distanceBorderTo_AxisLabel, distanceBordeTo_tickMark, distanceBordeTo_tick)

http://rfunction.com/archives/1302

#Blank space line before drawing the box/axisLine of figure. It will contain label, tick etc.
mar=c(5.1,4.1,4.1,2.1)

mgp : Distance from the axis line to( the axis title, axis labels and axis line). default value of mgp is c(3,1,0), which means that the
axis title: def 3: is drawn in the fourth line of the margin starting from the plot region,  
tick labels: def 1: are drawn in the second line
ticks: def 0: itself is the first line.

Comments

Popular posts from this blog

MATLAB cross validation

// use built-in function samplesize = size( matrix , 1); c = cvpartition(samplesize,  'kfold' , k); % return the indexes on each fold ///// output in matlab console K-fold cross validation partition              N: 10    NumTestSets: 4      TrainSize: 8  7  7  8       TestSize: 2  3  3  2 ////////////////////// for i=1 : k    trainIdxs = find(training(c,i) ); %training(c,i);  // 1 means in train , 0 means in test    testInxs  = find(test(c,i)       ); % test(c,i);       // 1 means in test , 0 means in train    trainMatrix = matrix (  matrix(trainIdxs ), : );    testMatrix  = matrix (  matrix(testIdxs  ), : ); end //// now calculate performance %%  calculate performance of a partition     selectedKfoldSen=[];selectedKfoldSpe=[];selectedKfoldAcc=[];     indexSen=1;indexSpe=1;indexAcc=1;     if ( kfold == (P+N) )% leave one out         sensitivity = sum(cvtp) /( sum(cvtp) + sum(cvfn) )         specificity = sum(cvtn) /( sum(cvfp) + sum(cvtn) )         acc