R tutorial

Install R in linux
============
In CRAN home page, the latest version is not available. So, in fedora,
Open the terminal

yum list R --> To check the latest available version of r
yum install R --> install R version
yum update R --> update current version to latest one

0 find help
============
?exact topic name ( i.e. ?mean )

0.0 INSTALL 3rd party package
====================
install.packages('mvtnorm' , dependencies = TRUE , lib='/home/alamt/myRlibrary/') # install new package

BED file parsing (Always use read.delim it is the best)

library(MASS)
#library(ggplot2)

dirRoot="D:/research/F5shortRNA/TestRIKEN/Rscripts/"
dirData="D:/research/F5shortRNA/TestRIKEN/"
setwd(dirRoot)
getwd()

myBed="test.bed"
fnmBed=paste(dirData, myBed, sep="") # ccdsHh19.bed tmp.bed

## Read bed use read.delim - it is the best

mybed=read.delim(fnmBed, header = FALSE, sep = "\t", quote = "\"", dec = ".", fill = TRUE, comment.char = "");

## PARSE BED column

noGene=dim(mybed)[1]
allChr=as.matrix(mybed[1])
allStart=as.matrix(mybed[2])
allEnd=as.matrix(mybed[3])
allWidth=allEnd - allStart

## Iterate over rows of bed

for(i in myStart: myEnd) {

curChr = allChr[i]
curStart = allStart[i]
curWidth = allWidth[i]

if(i%%1000 ==0){
print("Now calculating for geneno: ");
print (i)
}

}

## Find the index of row matching value a particular column

matchIdx = which( allChr == 'chr19')

mydata=read.delim(fnmCSV, header = FALSE, sep = ",", quote = "\"", dec = ".", fill = TRUE, comment.char = "");
mydataRiyadh = mydata[ mydata$V6=="Riyadh Province" , ]
mydataMakkah = mydata[ mydata$V6=="Makkah Province" , ]

0.1 Set path in .libPath() 3rd party package
==============================
.libPaths() -- show the path R will search for library

to append a new path:

.libPaths( c( .libPaths(), "/NewPath") )

for example

.libPaths( c( .libPaths(), "/home/alamt/R/x86_64-unknown-linux-gnu-library/3.0/") )

CBRC cluster R .libPath() should be
========================

[1] "/home/alamt/R/x86_64-pc-linux-gnu-library/2.15"
[2] "/home/apps/R/3.0.0/lib64/R/library"
[3] "/home/alamt/R/x86_64-unknown-linux-gnu-library/3.0"

0.1.1. Call R code from terminal , shell script
====================

in shell type:
Rscript myCode.R tanviralam 99

myCode.R
------------------
args =commandArgs(TRUE)
myWorkDir = as.character(args[1])
myval = as.double(args[2])
print(myval)
setwd(myWorkDir)

0.2. Use the 3rd party package
====================

library(mvtnorm)   # use external package

0.3. Complete example of reading a tab delimitted file and build feature and label factor, In R always use factor instead of matrix
===============================================

data = read.delim( "feature_2_111.feature" , header= FALSE, sep="\t");
mymat = as.matrix(data);

totRow =-nrow (data)
totFeat = ncol (data)
print(totRow);
print(totFeat);

myFeatureFrame = data[ c(1:totRow), c(1:totFeat) ] ;
myLabelFrame = data[ c(1:totRow), c(totFeat+1) ] ;

0.1. Create DataFrame Manually

// Create a dataframe with two variable in two columns: ExprValue and ClusterNo

val=as.matrix(valforplot)

cno = as.matrix(clusternoforplot)

dfForPlot = data.frame( clusterno=cno , Exprvalue=val)

colnames(dfForPlot)

# check the value of first variable

dfForPlot$ExprValue

1. Create Matrix Manually

B = matrix( c(2, 4, 3, 1, 5, 7), nrow=3, ncol=2) ;

http://www.r-tutor.com/r-introduction/matrix/matrix-construction

1. Read Tab delimited file ( Read bed file )

mybed=read.delim(fnmBed, header = FALSE, sep = "\t", quote = "\"", dec = ".", fill = TRUE, comment.char = "");

noGene=dim(mybed)[1]
allChr=as.matrix(mybed[1])
allStart=as.matrix(mybed[2])
allEnd=as.matrix(mybed[3])
allWidth=allEnd - allStart

resultFinal=NULL;
print("Calculation of PhastCons score started ...")
for(i in 1:noGene) {

curChr = allChr[i]
curStart = allStart[i]
curWidth = allWidth[i]

}

}

1. Read as frame and convert to matrix

data = read.delim( "test.dat" , header= FALSE, sep="\t")
myMat = as.matrix(data)

// read as tab delimited file
mydata = read.delim( fnmData ,header=TRUE, sep="\t")

2. Write matrix into file

=====================

library(MASS)
write.matrix(mymat, file = "data.in", sep = "\t")

2.a Write data frame into file

==========================

results = data.frame();
uniqTF=unique(data$V9);

for (tf in uniqTF)
{
    tmp = subset( data, data$V9==tf);
   results = rbind( results , data.frame(tmp$V1,tmp$V2,tmp$V3,tmp$V4,tmp$V5,tmp$V6,tmp$V7,tmp$V8,tmp$V9 ) );

}

write.table(results, file = "track.tf.gff",col.names=F, row.names=F, quote=FALSE, sep = "\t");

How to remove double quote in writing ( use quote=FALSE)

write.table(results, file = "track.tf.gff",col.names=F, row.names=F, quote=FALSE, sep = "\t");

2.b. Write output image into file
==========================

par(mfrow=c(4,3)) // divide canvas into 4 by 3 cells

png(filename="./netsimQ05.png",   bg="white") ; # create device
plot(apres1a) ; # make your plot
dev.off() # stop device

figname='GM12878_All.eps'
postscript(file=figname, onefile=FALSE, horizontal=FALSE , bg="white")dev.off()

3. Convert to matrix

=================

mymat = as.matrix(myframe)

4. dimension of matrix

===============
dim(mymat)
norow = dim (mymat)[1] OR nrow(mymat)
nocol = dim(mymat) [2] OR ncol(mymat)

4. Common Matrix Operation
====================================
# Removing rows with all zeros

mat = sample(0:1, 49, TRUE)
dim(mat) = c(7, 7)
mat[c(1,4), ] = 0
ind = rowSums(mat) != 0

newMat = mat[ind,]

# Fill matrix by same value
a= replicate(times , value)
a = replicate(10, 9999) creates a structure with ten values, where each value equals to 9999.

# Find index of all positions matching a value

matchIdx = which( mymatrix == value) # it returns the index of matrix entry whose value match your value

mydata=read.delim(fnmCSV, header = FALSE, sep = ",", quote = "\"", dec = ".", fill = TRUE, comment.char = "");
mydataRiyadh = mydata[ mydata$V6=="Riyadh Province" , ]
mydataMakkah = mydata[ mydata$V6=="Makkah Province" , ]

# Transpose of a matrix
mymatTrans = t(mymat)

# Subselect ROW based on Value in column

mat1common = mat2[ mat1$V1 %in% ma2$V1, ]

# Sort Matrix based on a column

myMatrixSorted = myMatrix [order( myMatrix$V1),]

5. common statistics
===============

mean of columns :
colMeans(mymat)

mean of allvalue :
mean(mymat)

variance :

var(data)

range :
range(data[ , 1]) # range of first column

Speakman/Pearson correlation coefficient of rows of matrix:

library(Hmisc)
mymat= as.matrix(mtcars) # here mtcars is the data frame having 32 row, 11 column

mymatTrans=t(mymat) # As these calculate correlation between columns

## cor: measure each pair column cor-relation between 2 matrices
resultCorr = cor( mymatTrans1 , mymatTrans2 , method="spearman")
resultCorr[is.na(resultCorr)] = 0.0
finalResult = diag(resultCorr) # select each entry of diagonal(i,i) as result

## rcorr: measure all pair column cor-relation
resultCorr = rcorr( mymatTrans , type="spearman")

resultCorr$r # show the correlation
resultCorr$P # shows the p-value of cor-relation

dlmwrite ("mycoeff.txt" , resTrans$r )

P-value correction ; P-value adjustment for multiple adjustment
http://www.r-bloggers.com/example-9-30-addressing-multiple-comparisons/
https://stat.ethz.ch/R-manual/R-devel/library/stats/html/p.adjust.html

p.adjust(p, method = p.adjust.methods, n = length(p))
p.adjust.methods
# c("holm", "hochberg", "hommel", "bonferroni", "BH", "BY", "fdr", "none")
Example

pvals = c(.001, .001, .001, .02, .22, .59, .87)
BONF = p.adjust(pvals, "bonferroni")
BH = p.adjust(pvals, "BH")
res = cbind(pvals, BH=round(BH, 3), BONF=round(BONF, 3))

6 sorting   and ordering to get sorted index:
================================
sortedVal = sort ( data )
sortedVal = rev ( sort (data ) )

sortedId = order ( data , decreasing=TRUE )
sortedId = order ( data , decreasing= FALSE )

6. Create vector of object provided into list : use c to create that vector
====================================
mynames = c ( 'ta' , 'al')
myvect = c ( 5 , 3 ,89)
myvect = c ( myvect , 90)   # concate new element

insert element into vector

myVect=c();

for(i in 1: N)
{

    cur =AllValue[i]
    if(i==1){
        myVect= c(cur)
    }else
    {
        myVect= append(myVect, cur)
    }

}

7 Query
===================

find most frequent word and its index in array
---------------------------
mostFreq = myarr="myarr" table="table" which.max="which.max"
mostFreqIndex =
myarr="=mostFreq,arr.ind=TRUE)"

8. Function calling
=========================

callee.r
--------------
myfnc1 <- function ( fname, totA ) {
   print( paste("inside callee:" , fname) );
   print( paste("inside callee:" , totA));
return ( totA) # You must use ( ) to return value
}

caller.r
------------------
setwd('.')
source('callee.r');
y <-myfnc1("tanvir", 5 );

call from command prompt :
-------------------------------------------
/usr/bin/Rscript caller.r myval1 myval2

Include functions from another file
======================
source('filename.r');

pass command line argument to rscript call
===============================
args < commandArgs (TRUE)
print(args[1])
print(args[2])

9. String Operation
==============

String Array
=========
cellName=c("HM1","HM2" );
print(cellName[i] );

String concat
===========
a="hell"
b="fire"
c=paste( a, b,sep = "")

String Length

=============

word=”apple”

nchar(word)

Substring

=======

substr(word,start=2,stop=5)

10. loop
===========
cellName=c("HM1","HM2" );
for(i in 1:length(cellName)) {
    print(cellName[i] );
    filePath=paste(myDir , '/data/',cellName[i] , sep="");
    print(filePath)
}
11. If - else if - else (DON'T PUT SPACE BEFORE else)
==========================================
val=6
if(val>10)
{
    print("it is above 10");
}else if (val >5)
{
    print("it is above 5");
}else
{
    print("it is default");
}

Using Hash / Hashmap / Key Value pair in R [Values are not inserted serially]

use library
=======
library(hash)

# create my hash
=============
myHashCagt <- br="" hash="">

# insert new values in to hash
====================
.set( myHashCagt, keys=curKey, values=curCluster )

# Find the expression values of those key
================================
valExpr = values(myHashCagt, keys=curKey)
valAsMat = as.matrix(valExpr) # N row, 1 column

12. Fisher 's exact test : package : stats
======================
myContin = matrix(c(1,4,7,4), nrow = 2);
fisher.test(myContin ,alternative = "two.sided" ) ; // two sided

fisher.test(myContin ,alternative = "greater" ) ; // right-hand sided
fisher.test(myContin ,alternative = "less" ) ; // left-hand sided

13. GRAPH PLOT
==============
http://rfunction.com/code/1203/cex-guide.pdf

Best tutorial: with details example

http://www.programmingr.com/content/controlling-margins-and-axes-oma-and-mgp/

http://www.stat.auckland.ac.nz/~paul/Talks/Rgraphics.pdf

4 BASIC PARAMETER
par(cex.axis=1, cex.lab=2, cex.main=1.2, cex.sub=1)

cex.axis= fix font size of axis tick
cex.lab= fix font size size of axis label
cex.main= fix size of main header font size
cex.sub= fix size of sub header font size

http://www.r-bloggers.com/setting-graph-margins-in-r-using-the-par-function-and-lots-of-cow-milk/

par(mar=c(5.1,4.1,4.1,2.1) , mgp(3,1,0) )

# mar ( bottomMargin, leftMargin, topMargin, rightMargin)
# mgp ( distanceBorderTo_AxisLabel, distanceBordeTo_tickMark, distanceBordeTo_tick)

http://rfunction.com/archives/1302

#Blank space line before drawing the box/axisLine of figure. It will contain label, tick etc.
mar=c(5.1,4.1,4.1,2.1)

mgp : Distance from the axis line to( the axis title, axis labels and axis line). default value of mgp is c(3,1,0), which means that the
axis title: def 3: is drawn in the fourth line of the margin starting from the plot region,
tick labels: def 1: are drawn in the second line
ticks: def 0: itself is the first line.

My Blog

Search This Blog