Install R in linux
============
In CRAN home page, the latest version is not available. So, in fedora,
Open the terminal
- yum list R --> To check the latest available version of r
- yum install R --> install R version
- yum update R --> update current version to latest one
0 find help
============
?exact topic name ( i.e. ?mean )
0.0 INSTALL 3rd party package
====================
install.packages('mvtnorm' , dependencies = TRUE , lib='/home/alamt/myRlibrary/') # install new package
BED file parsing (Always use read.delim it is the best)
# Load a BED file and split its first three columns into one-column matrices.
library(MASS)
#library(ggplot2)
dirRoot = "D:/research/F5shortRNA/TestRIKEN/Rscripts/"
dirData = "D:/research/F5shortRNA/TestRIKEN/"
setwd(dirRoot)
getwd()
myBed = "test.bed"
# dirData already ends with "/", so plain concatenation builds the full path.
fnmBed = paste0(dirData, myBed) # ccdsHh19.bed tmp.bed
## Read bed use read.delim - it is the best
mybed = read.delim(fnmBed, header = FALSE, sep = "\t", quote = "\"", dec = ".", fill = TRUE, comment.char = "")
## PARSE BED column
noGene = nrow(mybed)            # one row per BED record
allChr = as.matrix(mybed[1])    # column 1
allStart = as.matrix(mybed[2])  # column 2
allEnd = as.matrix(mybed[3])    # column 3
allWidth = allEnd - allStart    # element-wise end minus start
## Iterate over rows of bed
# Visit BED rows myStart..myEnd, pulling out the fields of the current row
# and printing a progress marker on every row index divisible by 1000.
for (i in myStart:myEnd) {
  curChr = allChr[i]
  curStart = allStart[i]
  curWidth = allWidth[i]
  if (i %% 1000 == 0) {
    print("Now calculating for geneno: ")
    print(i)
  }
}
## Find the index of row matching value a particular column
matchIdx = which( allChr == 'chr19')
mydata=read.delim(fnmCSV, header = FALSE, sep = ",", quote = "\"", dec = ".", fill = TRUE, comment.char = "");
mydataRiyadh = mydata[ mydata$V6=="Riyadh Province" , ]
mydataMakkah = mydata[ mydata$V6=="Makkah Province" , ]
0.1 Set path in .libPath() 3rd party package
==============================
.libPaths() -- show the path R will search for library
to append a new path:
.libPaths( c( .libPaths(), "/NewPath") )
for example
.libPaths( c( .libPaths(), "/home/alamt/R/x86_64-unknown-linux-gnu-library/3.0/") )
CBRC cluster R .libPath() should be
========================
[1] "/home/alamt/R/x86_64-pc-linux-gnu-library/2.15"
[2] "/home/apps/R/3.0.0/lib64/R/library"
[3] "/home/alamt/R/x86_64-unknown-linux-gnu-library/3.0"
0.1.1. Call R code from terminal , shell script
====================
in shell type:
Rscript myCode.R tanviralam 99
myCode.R
------------------
# TRUE = trailingOnly: keep only the arguments supplied after the script name
args =commandArgs(TRUE)
# First argument is used as the working directory
myWorkDir = as.character(args[1])
# Second argument is parsed as a number (NA when missing or not numeric)
myval = as.double(args[2])
print(myval)
# Switch into the directory given on the command line
setwd(myWorkDir)
0.2. Use the 3rd party package
====================
library(mvtnorm) # use external package
0.3. Complete example of reading a tab-delimited file and building the feature and label frames. In R, always use a factor instead of a matrix
===============================================
# Read a headerless tab-delimited table and split it into a feature frame and
# a label column.
# NOTE(review): assumes the label is the last column of the file - confirm
# against the layout of the .feature file.
data = read.delim("feature_2_111.feature", header = FALSE, sep = "\t")
mymat = as.matrix(data)
totRow = nrow(data)        # number of rows (samples)
totFeat = ncol(data) - 1   # every column except the trailing label column
print(totRow)
print(totFeat)
# drop = FALSE keeps a data frame even when there is only one feature column
myFeatureFrame = data[seq_len(totRow), seq_len(totFeat), drop = FALSE]
myLabelFrame = data[seq_len(totRow), totFeat + 1]
0.1. Create DataFrame Manually
# Create a data frame with two variables in two columns: ExprValue and ClusterNo
val=as.matrix(valforplot)
cno = as.matrix(clusternoforplot)
10. loop
===========
# Print every cell name together with the data path built from it.
cellName = c("HM1", "HM2")
# seq_along() yields no iterations for an empty vector; 1:length(x) would give 1, 0.
for (i in seq_along(cellName)) {
  print(cellName[i])
  filePath = paste0(myDir, '/data/', cellName[i])
  print(filePath)
}
11. If - else if - else (keep each else on the same line as the closing brace before it)
==========================================
# Three-way branch on val; every else continues on the line of the brace
# that closes the previous branch.
val = 6
if (val > 10) {
  print("it is above 10")
}else if (val > 5) {
  print("it is above 5")
}else {
  print("it is default")
}
use library
=======
library(hash)
# create my hash
=============
myHashCagt <- hash()
# insert new values in to hash
====================
.set( myHashCagt, keys=curKey, values=curCluster )
# Find the expression values of those key
================================
valExpr = values(myHashCagt, keys=curKey)
valAsMat = as.matrix(valExpr) # N row, 1 column
12. Fisher 's exact test : package : stats
======================
# 2 x 2 contingency table; matrix() fills column by column, so the rows are (1, 7) and (4, 4).
myContin = matrix(c(1, 4, 7, 4), nrow = 2)
fisher.test(myContin, alternative = "two.sided")  # two sided
fisher.test(myContin, alternative = "greater")    # right-hand sided
fisher.test(myContin, alternative = "less")       # left-hand sided
13. GRAPH PLOT
==============
http://rfunction.com/code/1203/cex-guide.pdf
Best tutorial: with details example
http://www.programmingr.com/content/controlling-margins-and-axes-oma-and-mgp/
http://www.stat.auckland.ac.nz/~paul/Talks/Rgraphics.pdf
4 BASIC PARAMETER
par(cex.axis=1, cex.lab=2, cex.main=1.2, cex.sub=1)
cex.axis= fix font size of axis tick
cex.lab= fix font size of axis label
cex.main= fix size of main header font size
cex.sub= fix size of sub header font size
http://www.r-bloggers.com/setting-graph-margins-in-r-using-the-par-function-and-lots-of-cow-milk/
# mgp is a named graphical parameter holding a numeric vector, like mar.
par(mar = c(5.1, 4.1, 4.1, 2.1), mgp = c(3, 1, 0))
# mar = c(bottomMargin, leftMargin, topMargin, rightMargin), in lines of text
# mgp = c(margin line of the axis title, margin line of the tick labels, margin line of the axis line)
http://rfunction.com/archives/1302
#Blank space line before drawing the box/axisLine of figure. It will contain label, tick etc.
mar=c(5.1,4.1,4.1,2.1)
mgp : Distance from the axis line to( the axis title, axis labels and axis line). default value of mgp is c(3,1,0), which means that the
axis title: def 3: is drawn in the fourth line of the margin starting from the plot region,
tick labels: def 1: are drawn in the second line
ticks: def 0: itself is the first line.
->
dfForPlot = data.frame( clusterno=cno , Exprvalue=val)
B = matrix( c(2, 4, 3, 1, 5, 7), nrow=3, ncol=2) ;
http://www.r-tutor.com/r-introduction/matrix/matrix-construction
# Split the BED table into one-column matrices, then visit every row.
noGene = nrow(mybed)
allChr = as.matrix(mybed[1])
allStart = as.matrix(mybed[2])
allEnd = as.matrix(mybed[3])
allWidth = allEnd - allStart
resultFinal = NULL
print("Calculation of PhastCons score started ...")
# seq_len() gives an empty loop for an empty table, where 1:noGene would visit 1 and 0.
for (i in seq_len(noGene)) {
  curChr = allChr[i]
  curStart = allStart[i]
  curWidth = allWidth[i]
}
data = read.delim( "test.dat" , header= FALSE, sep="\t")
myMat = as.matrix(data)
# read as tab delimited file
mydata = read.delim( fnmData ,header=TRUE, sep="\t")
library(MASS)
write.matrix(mymat, file = "data.in", sep = "\t")
# Regroup the rows of `data` so rows sharing a V9 value sit together (groups in
# order of first appearance), then write them as a headerless, unquoted,
# tab-separated file.
uniqTF = unique(data$V9)
# Collect one piece per V9 value and bind once at the end, instead of growing
# `results` with rbind() on every iteration.
pieces = vector("list", length(uniqTF))
k = 0
for (tf in uniqTF) {
  k = k + 1
  tmp = subset(data, data$V9 == tf)
  pieces[[k]] = data.frame(tmp$V1, tmp$V2, tmp$V3, tmp$V4, tmp$V5, tmp$V6, tmp$V7, tmp$V8, tmp$V9)
}
# Starting from an empty data frame keeps `results` a data frame even when there are no groups.
results = do.call(rbind, c(list(data.frame()), pieces))
write.table(results, file = "track.tf.gff", col.names = FALSE, row.names = FALSE, quote = FALSE, sep = "\t")
How to remove double quote in writing ( use quote=FALSE)
write.table(results, file = "track.tf.gff",col.names=F, row.names=F, quote=FALSE, sep = "\t");
2.b. Write output image into file
==========================
# PNG output: open the device first, because par() acts on the current device.
png(filename = "./netsimQ05.png", bg = "white")  # create device
par(mfrow = c(4, 3))                             # divide canvas into 4 by 3 cells
plot(apres1a)                                    # make your plot
dev.off()                                        # stop device
# EPS output: open the postscript device, draw, then close it.
figname = 'GM12878_All.eps'
postscript(file = figname, onefile = FALSE, horizontal = FALSE, bg = "white")
# ... plotting calls for this figure go here ...
dev.off()
dim(mymat)
norow = dim(mymat)[1]   # or: nrow(mymat)
nocol = dim(mymat)[2]   # or: ncol(mymat)
4. Common Matrix Operation
====================================
# Removing rows with all zeros
# Fill matrix by same value
a= replicate(times , value)
a = replicate(10, 9999)   # creates a vector of ten values, each equal to 9999
# Find index of all positions matching a value
matchIdx = which( mymatrix == value) # it returns the index of matrix entry whose value match your value
mydata=read.delim(fnmCSV, header = FALSE, sep = ",", quote = "\"", dec = ".", fill = TRUE, comment.char = "");
mydataRiyadh = mydata[ mydata$V6=="Riyadh Province" , ]
mydataMakkah = mydata[ mydata$V6=="Makkah Province" , ]
# Transpose of a matrix
mymatTrans = t(mymat)
# Subselect ROW based on Value in column: keep the rows of mat1 whose V1 also occurs in mat2$V1
mat1common = mat1[ mat1$V1 %in% mat2$V1, ]
# Sort Matrix based on a column
myMatrixSorted = myMatrix [order( myMatrix$V1),]
5. common statistics
===============
mean of columns :
colMeans(mymat)
mean of allvalue :
mean(mymat)
variance :
var(data)
range :
range(data[ , 1]) # range of first column
Spearman/Pearson correlation coefficient of rows of matrix:
library(Hmisc)
mymat= as.matrix(mtcars) # here mtcars is the data frame having 32 row, 11 column
mymatTrans=t(mymat) # As these calculate correlation between columns
## cor: measure each pair column cor-relation between 2 matrices
resultCorr = cor( mymatTrans1 , mymatTrans2 , method="spearman")
resultCorr[is.na(resultCorr)] = 0.0
finalResult = diag(resultCorr) # select each entry of diagonal(i,i) as result
## rcorr: measure all pair column cor-relation
resultCorr = rcorr(mymatTrans, type = "spearman")
resultCorr$r # show the correlation
resultCorr$P # shows the p-value of cor-relation
# Save the coefficient matrix as comma-separated values without row or column labels
write.table(resultCorr$r, file = "mycoeff.txt", sep = ",", row.names = FALSE, col.names = FALSE)
P-value correction ; P-value adjustment for multiple adjustment
http://www.r-bloggers.com/example-9-30-addressing-multiple-comparisons/
https://stat.ethz.ch/R-manual/R-devel/library/stats/html/p.adjust.html
p.adjust(p, method = p.adjust.methods, n = length(p))
p.adjust.methods
# c("holm", "hochberg", "hommel", "bonferroni", "BH", "BY", "fdr", "none")
Example
6 sorting and ordering to get sorted index:
================================
sortedVal = sort ( data )
sortedVal = rev ( sort (data ) )
sortedId = order ( data , decreasing=TRUE )
sortedId = order ( data , decreasing= FALSE )
6. Create vector of object provided into list : use c to create that vector
====================================
mynames = c ( 'ta' , 'al')
myvect = c ( 5 , 3 ,89)
myvect = c ( myvect , 90) # concate new element
insert element into vector
# Copy the first N entries of AllValue into myVect one element at a time.
myVect = c()
# seq_len(N) runs zero times when N is 0; 1:N would iterate over 1 and 0.
for (i in seq_len(N)) {
  cur = AllValue[i]
  # append() onto the empty vector works too, so the first element needs no special case
  myVect = append(myVect, cur)
}
7 Query
===================
find most frequent word and its index in array
---------------------------
mostFreq = names(which.max(table(myarr)))
mostFreqIndex = which(myarr == mostFreq, arr.ind=TRUE)
8. Function calling
=========================
callee.r
--------------
# Print both arguments with an "inside callee:" prefix and hand totA back to
# the caller.
myfnc1 <- function(fname, totA) {
  prefix <- "inside callee:"
  print(paste(prefix, fname))
  print(paste(prefix, totA))
  # the value of the last expression is the function's return value
  totA
}
caller.r
------------------
setwd('.')
source('callee.r');
y <-myfnc1("tanvir", 5 );
call from command prompt :
-------------------------------------------
/usr/bin/Rscript caller.r myval1 myval2
Include functions from another file
======================
source('filename.r');
pass command line argument to rscript call
===============================
# Assign (<-) the trailing command-line arguments; a bare "<" would be a comparison.
args <- commandArgs(TRUE)
print(args[1])  # first argument after the script name (NA when absent)
print(args[2])  # second argument after the script name (NA when absent)
9. String Operation
==============
String Array
=========
cellName=c("HM1","HM2" );
print(cellName[i] );
String concat
===========
a="hell"
b="fire"
c=paste( a, b,sep = "")
colnames(dfForPlot)
# check the value of first variable
dfForPlot$ExprValue
1. Create Matrix Manually
B = matrix( c(2, 4, 3, 1, 5, 7), nrow=3, ncol=2) ;
http://www.r-tutor.com/r-introduction/matrix/matrix-construction
1. Read Tab delimited file ( Read bed file )
# Read the BED file, split it into one-column matrices, then visit every row.
mybed = read.delim(fnmBed, header = FALSE, sep = "\t", quote = "\"", dec = ".", fill = TRUE, comment.char = "")
noGene = nrow(mybed)
allChr = as.matrix(mybed[1])
allStart = as.matrix(mybed[2])
allEnd = as.matrix(mybed[3])
allWidth = allEnd - allStart
resultFinal = NULL
print("Calculation of PhastCons score started ...")
# seq_len() gives an empty loop for an empty table, where 1:noGene would visit 1 and 0.
for (i in seq_len(noGene)) {
  curChr = allChr[i]
  curStart = allStart[i]
  curWidth = allWidth[i]
}
1. Read as frame and convert to matrix
data = read.delim( "test.dat" , header= FALSE, sep="\t")
myMat = as.matrix(data)
# read as tab delimited file
mydata = read.delim( fnmData ,header=TRUE, sep="\t")
2. Write matrix into file
=====================
library(MASS)
write.matrix(mymat, file = "data.in", sep = "\t")
2.a Write data frame into file
==========================
# Regroup the rows of `data` so rows sharing a V9 value sit together (groups in
# order of first appearance), then write them as a headerless, unquoted,
# tab-separated file.
uniqTF = unique(data$V9)
# Collect one piece per V9 value and bind once at the end, instead of growing
# `results` with rbind() on every iteration.
pieces = vector("list", length(uniqTF))
k = 0
for (tf in uniqTF) {
  k = k + 1
  tmp = subset(data, data$V9 == tf)
  pieces[[k]] = data.frame(tmp$V1, tmp$V2, tmp$V3, tmp$V4, tmp$V5, tmp$V6, tmp$V7, tmp$V8, tmp$V9)
}
# Starting from an empty data frame keeps `results` a data frame even when there are no groups.
results = do.call(rbind, c(list(data.frame()), pieces))
write.table(results, file = "track.tf.gff", col.names = FALSE, row.names = FALSE, quote = FALSE, sep = "\t")
How to remove double quote in writing ( use quote=FALSE)
write.table(results, file = "track.tf.gff",col.names=F, row.names=F, quote=FALSE, sep = "\t");
2.b. Write output image into file
==========================
# PNG output: open the device first, because par() acts on the current device.
png(filename = "./netsimQ05.png", bg = "white")  # create device
par(mfrow = c(4, 3))                             # divide canvas into 4 by 3 cells
plot(apres1a)                                    # make your plot
dev.off()                                        # stop device
# EPS output: open the postscript device, draw, then close it.
figname = 'GM12878_All.eps'
postscript(file = figname, onefile = FALSE, horizontal = FALSE, bg = "white")
# ... plotting calls for this figure go here ...
dev.off()
3. Convert to matrix
=================
mymat = as.matrix(myframe)
4. dimension of matrix
===============
dim(mymat)
norow = dim(mymat)[1]   # or: nrow(mymat)
nocol = dim(mymat)[2]   # or: ncol(mymat)
4. Common Matrix Operation
====================================
# Removing rows with all zeros
mat = sample(0:1, 49, TRUE)      # 49 random 0/1 values
dim(mat) = c(7, 7)               # reshape into a 7 x 7 matrix
mat[c(1, 4), ] = 0               # force rows 1 and 4 to be all zero
ind = rowSums(mat) != 0          # TRUE for rows holding at least one non-zero entry
newMat = mat[ind, , drop = FALSE]  # drop = FALSE keeps a matrix even if a single row survives
# Fill matrix by same value
a= replicate(times , value)
a = replicate(10, 9999)   # creates a vector of ten values, each equal to 9999
# Find index of all positions matching a value
matchIdx = which( mymatrix == value) # it returns the index of matrix entry whose value match your value
mydata=read.delim(fnmCSV, header = FALSE, sep = ",", quote = "\"", dec = ".", fill = TRUE, comment.char = "");
mydataRiyadh = mydata[ mydata$V6=="Riyadh Province" , ]
mydataMakkah = mydata[ mydata$V6=="Makkah Province" , ]
# Transpose of a matrix
mymatTrans = t(mymat)
# Subselect ROW based on Value in column: keep the rows of mat1 whose V1 also occurs in mat2$V1
mat1common = mat1[ mat1$V1 %in% mat2$V1, ]
# Sort Matrix based on a column
myMatrixSorted = myMatrix [order( myMatrix$V1),]
5. common statistics
===============
mean of columns :
colMeans(mymat)
mean of allvalue :
mean(mymat)
variance :
var(data)
range :
range(data[ , 1]) # range of first column
Spearman/Pearson correlation coefficient of rows of matrix:
library(Hmisc)
mymat= as.matrix(mtcars) # here mtcars is the data frame having 32 row, 11 column
mymatTrans=t(mymat) # As these calculate correlation between columns
## cor: measure each pair column cor-relation between 2 matrices
resultCorr = cor( mymatTrans1 , mymatTrans2 , method="spearman")
resultCorr[is.na(resultCorr)] = 0.0
finalResult = diag(resultCorr) # select each entry of diagonal(i,i) as result
## rcorr: measure all pair column cor-relation
resultCorr = rcorr(mymatTrans, type = "spearman")
resultCorr$r # show the correlation
resultCorr$P # shows the p-value of cor-relation
# Save the coefficient matrix as comma-separated values without row or column labels
write.table(resultCorr$r, file = "mycoeff.txt", sep = ",", row.names = FALSE, col.names = FALSE)
P-value correction ; P-value adjustment for multiple adjustment
http://www.r-bloggers.com/example-9-30-addressing-multiple-comparisons/
https://stat.ethz.ch/R-manual/R-devel/library/stats/html/p.adjust.html
p.adjust(p, method = p.adjust.methods, n = length(p))
p.adjust.methods
# c("holm", "hochberg", "hommel", "bonferroni", "BH", "BY", "fdr", "none")
Example
# Adjust a vector of raw p-values with two methods and tabulate them side by side.
pvals = c(.001, .001, .001, .02, .22, .59, .87)
BONF = p.adjust(pvals, "bonferroni")  # Bonferroni correction
BH = p.adjust(pvals, "BH")            # Benjamini-Hochberg correction
# One row per p-value; adjusted values rounded to 3 decimals
res = cbind(pvals, BH = round(BH, 3), BONF = round(BONF, 3))
6 sorting and ordering to get sorted index:
================================
sortedVal = sort ( data )
sortedVal = rev ( sort (data ) )
sortedId = order ( data , decreasing=TRUE )
sortedId = order ( data , decreasing= FALSE )
6. Create vector of object provided into list : use c to create that vector
====================================
mynames = c ( 'ta' , 'al')
myvect = c ( 5 , 3 ,89)
myvect = c ( myvect , 90) # concate new element
insert element into vector
# Copy the first N entries of AllValue into myVect one element at a time.
myVect = c()
# seq_len(N) runs zero times when N is 0; 1:N would iterate over 1 and 0.
for (i in seq_len(N)) {
  cur = AllValue[i]
  # append() onto the empty vector works too, so the first element needs no special case
  myVect = append(myVect, cur)
}
7 Query
===================
find most frequent word and its index in array
---------------------------
mostFreq = names(which.max(table(myarr)))
mostFreqIndex = which(myarr == mostFreq, arr.ind=TRUE)
8. Function calling
=========================
callee.r
--------------
# Print both arguments with an "inside callee:" prefix and hand totA back to
# the caller.
myfnc1 <- function(fname, totA) {
  prefix <- "inside callee:"
  print(paste(prefix, fname))
  print(paste(prefix, totA))
  # the value of the last expression is the function's return value
  totA
}
caller.r
------------------
setwd('.')
source('callee.r');
y <-myfnc1("tanvir", 5 );
call from command prompt :
-------------------------------------------
/usr/bin/Rscript caller.r myval1 myval2
Include functions from another file
======================
source('filename.r');
pass command line argument to rscript call
===============================
# Assign (<-) the trailing command-line arguments; a bare "<" would be a comparison.
args <- commandArgs(TRUE)
print(args[1])  # first argument after the script name (NA when absent)
print(args[2])  # second argument after the script name (NA when absent)
9. String Operation
==============
String Array
=========
cellName=c("HM1","HM2" );
print(cellName[i] );
String concat
===========
a="hell"
b="fire"
c=paste( a, b,sep = "")
String Length
=============
# String literals need plain ASCII quotes; typographic quotes do not parse.
word = "apple"
nchar(word)  # number of characters in the string
Substring
=======
substr(word,start=2,stop=5)
10. loop
===========
# Print every cell name together with the data path built from it.
cellName = c("HM1", "HM2")
# seq_along() yields no iterations for an empty vector; 1:length(x) would give 1, 0.
for (i in seq_along(cellName)) {
  print(cellName[i])
  filePath = paste0(myDir, '/data/', cellName[i])
  print(filePath)
}
11. If - else if - else (keep each else on the same line as the closing brace before it)
==========================================
# Three-way branch on val; every else continues on the line of the brace
# that closes the previous branch.
val = 6
if (val > 10) {
  print("it is above 10")
}else if (val > 5) {
  print("it is above 5")
}else {
  print("it is default")
}
Using Hash / Hashmap / Key Value pair in R [Values are not inserted serially]
use library
=======
library(hash)
# create my hash
=============
myHashCagt <- hash()
# insert new values in to hash
====================
.set( myHashCagt, keys=curKey, values=curCluster )
# Find the expression values of those key
================================
valExpr = values(myHashCagt, keys=curKey)
valAsMat = as.matrix(valExpr) # N row, 1 column
12. Fisher 's exact test : package : stats
======================
# 2 x 2 contingency table; matrix() fills column by column, so the rows are (1, 7) and (4, 4).
myContin = matrix(c(1, 4, 7, 4), nrow = 2)
fisher.test(myContin, alternative = "two.sided")  # two sided
fisher.test(myContin, alternative = "greater")    # right-hand sided
fisher.test(myContin, alternative = "less")       # left-hand sided
13. GRAPH PLOT
==============
http://rfunction.com/code/1203/cex-guide.pdf
Best tutorial: with details example
http://www.programmingr.com/content/controlling-margins-and-axes-oma-and-mgp/
http://www.stat.auckland.ac.nz/~paul/Talks/Rgraphics.pdf
4 BASIC PARAMETER
par(cex.axis=1, cex.lab=2, cex.main=1.2, cex.sub=1)
cex.axis= fix font size of axis tick
cex.lab= fix font size of axis label
cex.main= fix size of main header font size
cex.sub= fix size of sub header font size
http://www.r-bloggers.com/setting-graph-margins-in-r-using-the-par-function-and-lots-of-cow-milk/
# mgp is a named graphical parameter holding a numeric vector, like mar.
par(mar = c(5.1, 4.1, 4.1, 2.1), mgp = c(3, 1, 0))
# mar = c(bottomMargin, leftMargin, topMargin, rightMargin), in lines of text
# mgp = c(margin line of the axis title, margin line of the tick labels, margin line of the axis line)
http://rfunction.com/archives/1302
#Blank space line before drawing the box/axisLine of figure. It will contain label, tick etc.
mar=c(5.1,4.1,4.1,2.1)
mgp : Distance from the axis line to( the axis title, axis labels and axis line). default value of mgp is c(3,1,0), which means that the
axis title: def 3: is drawn in the fourth line of the margin starting from the plot region,
tick labels: def 1: are drawn in the second line
ticks: def 0: itself is the first line.
->
Comments
Post a Comment