Skip to main content

ggplot ggplot2 tutorial




Data format processing for ggplot2


You have to put your data into a data frame to work with ggplot2.

http://mundosubmundo.kaiux.com/?p=346

START WITH THIS SITE


http://www.cookbook-r.com/Graphs/Bar_and_line_graphs_%28ggplot2%29/



 Official web site

http://docs.ggplot2.org/current/

Practical Tips


http://www.ceb-institute.org/bbs/wp-content/uploads/2011/09/handout_ggplot2.pdf

3 steps to make any ggplot graph (Tanvir's 3-step approach; it works for about 90 percent of graphs)

 

1. Make a data frame

2. Make a ggplot object with aes ( group =.. , color=.. , x = .. , y= ..)

Not every plot requires both x and y. For example, a density plot only requires x, so provide only x in aes(). A box plot requires a y value, plus an x value to separate the groups, so provide both x and y in aes().

3. Use a different geom function for each type of plot

##### Step -1
# ggplot2 has to be attached before ggplot() is called below.
library(ggplot2)

# Group sizes and simulated expression values for the two gene groups.
NoA <- 10
u <- rnorm(NoA)
NoB <- 20
v <- rnorm(NoB)

# One row per observation: the group label and its expression value.
myfm <- data.frame(
  genegroup = factor(rep(c("ccds", "miRNA"), c(NoA, NoB))),
  exprVal = c(u, v)
)

##### Step -2  and Step-3 [Densityplot : Y-value not required]
m <- ggplot(myfm, aes(x = exprVal, colour = genegroup, group = genegroup))
# fill = NA leaves the area under each curve unfilled, so only outlines are drawn.
m + geom_density(fill = NA)
   





 

##### Step -2  and Step-3 [ BoxPlot: : X and Y-value required]

# The grouping variable goes on x and the measured value on y; `myfm` is the
# data frame built in Step -1.
m <- ggplot(
  myfm,
  aes(x = genegroup, y = exprVal, group = genegroup, colour = genegroup)
)
m + geom_boxplot()





Bar chart



# Only the group labels are needed here: geom_bar() counts the rows per group.
NoA <- 10
NoB <- 20
myfm <- data.frame(
  genegroup = factor(rep(c("lncRNA", "miRNA"), c(NoA, NoB)))
)
m <- ggplot(myfm, aes(genegroup))
# Small text and vertical x tick labels keep long category names readable.
m +
  geom_bar(aes(fill = genegroup)) +
  theme(
    text = element_text(size = 8),
    axis.text.x = element_text(angle = 90, vjust = 1)
  ) +
  ylab(" type ") +
  xlab("Population")





Now some more complex data to make graphs

BoxPlot

 # Create a dataframe with two varialble Yvalues, Xvalues
val=as.matrix(valforplot)
cno=as.matrix(clusternoforplot)
dfForPlot=data.frame(Yvalues=val,Xvalues=cno)
colnames(dfForPlot)
# check the value of first variable
dfForPlot$Yvalues

# make the gg plot grouping agins Xvalues
ggplot(data=dfForPlot, aes(x=factor( as.matrix(dfForPlot$Xvalues)), y=as.matrix(dfForPlot$Yvalues) ) ) + geom_boxplot(  notch = FALSE,outlier.colour = "green", outlier.size = 3  ) +
xlab("Cluster no") +  ylab("Log Expr") 

Histogram

# Trailing command-line arguments: (1) input file, (2) plot title.
args <- commandArgs(TRUE)
if (length(args) < 2) {
  stop("Expected two arguments: <input file> <plot title>", call. = FALSE)
}
fName <- as.character(args[1])
myTitle <- as.character(args[2])


## Step -1: Read file
# The first column of the file becomes the row names, so `[, 2]` below is the
# second of the remaining columns.
baseMatH <- read.table(fName, header = FALSE, row.names = 1)
noRowH <- nrow(baseMatH)
noColH <- ncol(baseMatH)
goTermCount <- baseMatH[, 2]
maxCount <- max(goTermCount)


## Step -2: Make Data Frame
# A single constant group label, one row per count read from the file.
myfm <- data.frame(
  method = factor(rep("GOcount", noRowH)),
  goCount = goTermCount
)

## Step -3 : Histogram from ggplot2
m <- ggplot(myfm, aes(x = goCount, colour = method, group = method)) +
  xlim(0, maxCount)
# Labels are passed as named arguments, the documented form of labs();
# wrapping them in list() is not handled by ggplot2 3.x.
m +
  geom_histogram(binwidth = 1) +
  labs(title = myTitle, x = "Number of GO association", y = "Total Gene") +
  theme(text = element_text(size = 14))

 

Normal Bar chart


fname <- "../dataV1/F9_RA_Sox17.bed.100.100.summary"
# The data file has two tab-separated columns and looks like this:
#   TF1  500
#   TF2  700
#   TF3  900

# A . Read the table
myTable <- read.table(fname, header = FALSE, sep = "\t")

# B . Form the data frame. You must use factor() with explicit levels;
#     otherwise the first column is sorted instead of keeping the file order.
#     unique() keeps that order and tolerates a label that occurs twice
#     (factor() rejects duplicated levels).
mydf <- data.frame(
  Commodity = factor(myTable$V1, levels = unique(myTable$V1)),
  Production = myTable$V2
)

# C . Call ggplot. Columns are named directly in aes() rather than through
#     `mydf$...`, so ggplot2 looks them up in `data`.
ggplot(data = mydf, aes(x = Commodity, y = Production)) +
  geom_bar(stat = "identity") +
  theme(
    text = element_text(size = 8),
    axis.text.x = element_text(angle = 90, vjust = 1)
  ) +
  ylab("Number of ENCODE peaks overlap with [-100,+100] region of peak summit") +
  xlab("TF")




Normal Density


# Small offset added before log() so that a row variance of zero stays finite.
smallNumber <- 1e-6
# Time-point label of each input file, in the same order as `allOutWithout`.
timepointLabels <- c("24", "28", "36", "48", "72")

finalX <- NULL
samplesPerFile <- integer(0)
if (TRUE) {
  # The labels are hard-coded, so any other number of files would mislabel rows.
  stopifnot(length(allOutWithout) == length(timepointLabels))

  # One vector of log row-variances per file. The vectors are collected in a
  # list and flattened once instead of growing `finalX` inside a loop.
  # `myData` is the path prefix and `allOutWithout` the file names; both must
  # already be defined.
  logVarByFile <- lapply(allOutWithout, function(curFile) {
    myfname <- paste0(myData, curFile)
    print(myfname)
    baseTable <- read.table(myfname, header = TRUE, row.names = 1)
    # rowVars() is not base R -- presumably matrixStats; confirm it is loaded.
    as.vector(log(rowVars(as.matrix(baseTable)) + smallNumber))
  })
  samplesPerFile <- lengths(logVarByFile)
  finalX <- unlist(logVarByFile, use.names = FALSE)
}

if (TRUE) {
  # Each label is repeated once per row of its own file, so files with
  # different row counts are still labelled correctly.
  dat <- data.frame(
    timepoint = factor(rep(timepointLabels, times = samplesPerFile)),
    deviationCAGE = finalX
  )

  densityPlot <- ggplot(dat, aes(x = deviationCAGE, fill = timepoint)) +
    geom_density(alpha = 0.2) +
    ylab("deviation density") +
    xlab("log(CAGE expression deviation)")
  # The plot is passed explicitly instead of relying on the last plot created.
  ggsave(
    filename = "deviationCAGEdensity.png", plot = densityPlot,
    width = 140, height = 140, units = "mm", dpi = 400, scale = 2
  )

  ## box plot
  boxPlot <- ggplot(data = dat, aes(x = timepoint, y = deviationCAGE)) +
    geom_boxplot(notch = FALSE, outlier.colour = "green", outlier.size = 3) +
    xlab("Time Points") +
    ylab("Log (CAGE expression deviation)")
  ggsave(
    filename = "deviationCAGEboxplot.png", plot = boxPlot,
    width = 140, height = 140, units = "mm", dpi = 400, scale = 2
  )
}



Multiple Plots in a single plot

 

 

 

library(ggplot2)
library(grid)
library(gridExtra) # You need this library: it provides grid.arrange()

# Three panels built from `myfm`, which must provide the columns
# `featureCount` and `location`. Each panel hides its legend and takes its
# title and (empty) axis labels as named arguments to labs().
m1 <- ggplot(myfm, aes(y = featureCount, x = location, colour = location, group = location))
m1 <- m1 +
  geom_boxplot() +
  labs(title = "Transcripts", x = "", y = "") +
  theme(legend.position = "none")

m2 <- ggplot(myfm, aes(y = featureCount, x = location, colour = location, group = location))
m2 <- m2 +
  geom_boxplot() +
  labs(title = "Transcripts", x = "", y = "") +
  theme(legend.position = "none")

m3 <- ggplot(myfm, aes(y = featureCount, x = location, colour = location, group = location))
m3 <- m3 +
  geom_boxplot() +
  labs(title = "Transcripts", x = "", y = "") +
  theme(legend.position = "none")

# Draw the panels on a 2-column by 3-row grid inside one PNG file.
# (The original calls were missing the commas between arguments.)
png(paste0("myimagename", ".png"))
grid.arrange(m1, m2, m3, ncol = 2, nrow = 3)
dev.off()


Heatmap

library(MASS)
library(ggplot2)
library(gplots) # provides heatmap.2() and redblue()
library(extrafont)
library(stringr)
library(plyr)

typeRes <- "Mtb Only"
# `myData` must already be defined; it is used as the path prefix of the input.
fnmInput <- paste0(myData, "NONSTIM.table.txt.report.specific")
figureTitle <- paste0(typeRes, " specific")
myTable <- read.table(fnmInput, header = TRUE, row.names = 1, sep = "\t")
# Columns 2 to 5 hold the values drawn in the heat map.
myValues <- myTable[, 2:5]

png(filename = paste0(fnmInput, "_heatmap", ".png"), bg = "white") # create device

heatmap.2(
  as.matrix(myValues),
  col = redblue(50),
  scale = "row",
  dendrogram = "row",
  # FALSE is the documented way to suppress column reordering; the original
  # passed the string "NA", which only worked by falling through.
  Colv = FALSE,
  key = TRUE,
  keysize = 1.0,
  margins = c(8, 16), # to fit all labels
  density.info = "none",
  # NOTE(review): row.names does not look like a heatmap.2() argument and is
  # probably forwarded through `...` -- confirm, then drop it if unused.
  row.names = TRUE,
  trace = "none",
  labRow = rownames(myTable),
  labCol = colnames(myTable)[2:5],
  main = figureTitle,
  ylab = "change after Mtb infection",
  xlab = "DE clusters"
)

dev.off()
# margins = c(8, 16) # to fit all labels





How to use a small font and rotated text when there are a lot of x/y tick labels

your ggplot +
theme(text = element_text(size=8),  axis.text.x = element_text(angle=90, vjust=1))

 

How to use x- and y-axis  label

your ggplot +
ylab("Number of ENCODE peaks overlap with [-100,+100] region of peak summit") +  xlab("TF")

How to save image created by ggplot


It allows extensions eps/ps, tex (pictex), pdf, jpeg, tiff, png, bmp, svg and wmf (windows only).

Suppose you have created a box plot using ggplot

# Box plot of `expr` per cluster. The columns are named directly inside aes()
# so that ggplot2 looks them up in `data`; no `dfForPlot$...` is needed.
ggplot(data = dfForPlot, aes(x = factor(clusterno), y = expr)) +
  geom_boxplot(notch = FALSE, outlier.colour = "green", outlier.size = 3) +
  xlab("Cluster no") +
  ylab("Log Expr")



Now save it using the following command

    ggsave(file=paste(outputPlot,'.png',sep="") , width = 140, height = 140, units = "mm", dpi = 400, scale = 2 )


Line graph (mostly used for drawing average lines)




 x =  seq(0.01, .99, length=100)
df  = data.frame(x = rep(x, 2), y = c(qlogis(x), 2 * qlogis(x)), genetype = rep(c("e_lncRNA","p_lncRNA"), each=100))
p  =  ggplot(df, aes(x=x, y=y, group=genetype)) # Should work p + geom_line(linetype = 2) 

p + geom_line(aes(colour = genetype), linetype = 1)

How to use confidence interval in line graph


           xval    yval   groupname
ab       1        10    group1
ab       2        15    group1
c         1        10    group2
cd       2        15    group2


# Demo data in the xval / yval / groupname layout shown in the table above.
# (The original plotted `mtcars`, which has none of these columns.) Ten x
# positions per group give the smoother enough points to fit.
ciData <- data.frame(
  xval = rep(1:10, times = 2),
  groupname = rep(c("group1", "group2"), each = 10)
)
ciData$yval <- 5 + 5 * ciData$xval + rnorm(nrow(ciData))

p <- ggplot(ciData, aes(x = xval, y = yval, group = groupname, colour = factor(groupname)))
# stat_smooth() draws a smoothed line per group with its confidence band.
# The theme() call is chained here; on a line of its own, `+ theme(...)` is
# not attached to any plot.
p + stat_smooth() + theme(legend.position = "none")
 
 

Customizing label, axis, legend

How to remove legends 

# `p` is an existing ggplot object; legend.position = "none" hides its legends.
p + theme(legend.position = "none")
 
How to change x/y axis label, title etc

# labs() takes the title and axis labels as named arguments; wrapping them in
# list() is not handled by ggplot2 3.x.
p + labs(title = "Gm12878", x = "Distance to TSS", y = "Avg. Signal")
 

How to change axis-tick label 

http://www.cookbook-r.com/Graphs/Axes_%28ggplot2%29/#setting-tick-mark-labels

# discrete case
# NOTE(review): the original definition of `x` was garbled by HTML extraction
# (`x <- br="" c="">`). The values below are a reconstruction -- replace them
# with the levels of your own x variable.
x <- c("cl", "t1", "t2")
p + scale_x_discrete(breaks = x, labels = x)

# groupwise
p + scale_x_discrete(
  breaks = c("cl", "t1", "t2"),
  labels = c("Control", "Treat1", "Treat2")
)

# continuous case : Specify tick marks directly
p +
  coord_cartesian(ylim = c(5, 7.5)) +
  scale_y_continuous(breaks = seq(0, 10, 0.25)) # Ticks from 0-10, every .25

Comments

Popular posts from this blog

Running openmp in eclipse

As we know, to use OpenMP in a GCC C++ project we have to compile it with the g++ -fopenmp option. To configure this in Eclipse, you just need to add -fopenmp under the GCC C++ linker command options.

MATLAB cross validation

// use built-in function samplesize = size( matrix , 1); c = cvpartition(samplesize,  'kfold' , k); % return the indexes on each fold ///// output in matlab console K-fold cross validation partition              N: 10    NumTestSets: 4      TrainSize: 8  7  7  8       TestSize: 2  3  3  2 ////////////////////// for i=1 : k    trainIdxs = find(training(c,i) ); %training(c,i);  // 1 means in train , 0 means in test    testInxs  = find(test(c,i)       ); % test(c,i);       // 1 means in test , 0 means in train    trainMatrix = matrix (  matrix(trainIdxs ), : );    testMatrix  = matrix (  matrix(testIdxs  ), : ); end //// now calculate performance %%  calculate performance of a partition     selectedKfoldSen=[];selectedKfoldSpe=[];selectedKfoldAcc=[];     indexSen=1;indexSpe=1;indexAcc=1;     if ( kfold == (P+N) )% leave one out         sensitivity = sum(cvtp) /( sum(cvtp) + sum(cvfn) )         specificity = sum(cvtn) /( sum(cvfp) + sum(cvtn) )         acc

R tutorial

Install R in linux ============ In CRAN home page, the latest version is not available. So, in fedora, Open the terminal yum list R  --> To check the latest available version of r yum install R --> install R version yum update R --> update current version to latest one 0 find help ============ ?exact topic name (  i.e.   ?mean ) 0.0 INSTALL 3rd party package  ==================== install.packages('mvtnorm' , dependencies = TRUE , lib='/home/alamt/myRlibrary/')   #  install new package BED file parsing (Always use read.delim it is the best) library(MASS) #library(ggplot2) dirRoot="D:/research/F5shortRNA/TestRIKEN/Rscripts/" dirData="D:/research/F5shortRNA/TestRIKEN/" setwd(dirRoot) getwd() myBed="test.bed" fnmBed=paste(dirData, myBed, sep="") # ccdsHh19.bed   tmp.bed ## Read bed use read.delim - it is the  best mybed=read.delim(fnmBed, header = FALSE, sep = "\t", quote = &q