Data format processing for ggplot2
You have to put your data into a data frame to work with ggplot2.
http://mundosubmundo.kaiux.com/?p=346
START WITH THIS SITE
http://www.cookbook-r.com/Graphs/Bar_and_line_graphs_%28ggplot2%29/
Official web site
http://docs.ggplot2.org/current/
Practical Tips
http://www.ceb-institute.org/bbs/wp-content/uploads/2011/09/handout_ggplot2.pdf
3 steps to make any ggplot graph (Tanvir's 3-step approach; it works for about 90 percent of graphs)
1. Make a data frame
2. Make a ggplot object with aes ( group =.. , color=.. , x = .. , y= ..)
Not every plot requires both x and y. For example, a density plot only requires x, so provide only x in aes(). A boxplot requires a y value plus an x value that distinguishes the groups, so provide both x and y in aes().
3. Use a different geom function for each type of plot
##### Step -1
# Simulate expression values for two gene groups of different sizes.
NoA <- 10
u <- rnorm(NoA)
NoB <- 20
v <- rnorm(NoB)
# One row per observation: each group label is repeated NoA / NoB times so that
# it lines up with the concatenated values in exprVal.
myfm <- data.frame(
  genegroup = factor(rep(c("ccds", "miRNA"), c(NoA, NoB))),
  exprVal = c(u, v)
)
##### Steps 2 and 3: density plot (only the x aesthetic is mapped)
m <- ggplot(
  myfm,
  aes(x = exprVal, colour = genegroup, group = genegroup)
)
m + geom_density(fill = NA)

##### Steps 2 and 3: box plot (x selects the group, y supplies the values)
m <- ggplot(
  myfm,
  aes(x = genegroup, y = exprVal, colour = genegroup, group = genegroup)
)
m + geom_boxplot()
Bar chart
# Bar chart of group sizes: the data frame holds one row per observation and
# only the group label is mapped, so each bar shows how many rows a group has.
NoA <- 10
NoB <- 20
myfm <- data.frame(
  genegroup = factor(rep(c("lncRNA", "miRNA"), c(NoA, NoB)))
)
m <- ggplot(myfm, aes(genegroup))
m +
  geom_bar(aes(fill = genegroup)) +
  theme(
    text = element_text(size = 8),
    axis.text.x = element_text(angle = 90, vjust = 1)
  ) +
  ylab(" type ") +
  xlab("Population")
Now some more complex data to make graphs
BoxPlot
# Box plot of Yvalues grouped by Xvalues.
# Create a data frame with two variables: Yvalues and Xvalues.
# NOTE(review): valforplot and clusternoforplot are not defined in this
# snippet -- presumably they are columns taken from myTable; confirm.
val=as.matrix(valforplot)
cno=as.matrix(clusternoforplot)
# A . Read the table
# NOTE(review): fname is only assigned further down in this snippet, so it
# must already exist when this line runs.
myTable = read.table( fname , header= FALSE, sep="\t")
# C . Call the ggplot
dfForPlot=data.frame(Yvalues=val,Xvalues=cno)
# Make the ggplot, grouping against Xvalues (converted to a factor for the x axis)
ggplot(data=dfForPlot, aes(x=factor( as.matrix(dfForPlot$Xvalues)), y=as.matrix(dfForPlot$Yvalues) ) ) + geom_boxplot( notch = FALSE,outlier.colour = "green", outlier.size = 3 ) +
xlab("Cluster no") + ylab("Log Expr")
# Command-line arguments are captured here but not used in this snippet.
args =commandArgs(TRUE)
fname="../dataV1/F9_RA_Sox17.bed.100.100.summary"
# Check the column names of the data frame
colnames(dfForPlot)
# Check the values of the first variable
dfForPlot$Yvalues
# Make the ggplot, grouping against Xvalues (same plot as above)
ggplot(data=dfForPlot, aes(x=factor( as.matrix(dfForPlot$Xvalues)), y=as.matrix(dfForPlot$Yvalues) ) ) + geom_boxplot( notch = FALSE,outlier.colour = "green", outlier.size = 3 ) +
xlab("Cluster no") + ylab("Log Expr")
Histogram
# Histogram of GO-term counts.
# Expects two command-line arguments: the input file and the plot title.
args <- commandArgs(TRUE)
fName <- as.character(args[1])
myTitle <- as.character(args[2])
## Step -1: Read file
# The first file column becomes the row names, so baseMatH[, 2] is the second
# of the remaining columns.
baseMatH <- read.table(fName, header = FALSE, row.names = 1)
noRowH <- dim(baseMatH)[1]
noColH <- dim(baseMatH)[2]
goTermCount <- baseMatH[, 2]
maxCount <- max(goTermCount)
## Step -2: Make Data Frame
# A single-level factor keeps the group/colour mapping used below.
myfm <- data.frame(
  method = factor(rep("GOcount", noRowH)),
  goCount = c(goTermCount)
)
## Step -3 : Histogram from ggplot2
m <- ggplot(myfm, aes(x = goCount, colour = method, group = method)) +
  xlim(0, maxCount)
# labs() takes the labels as named arguments; wrapping them in list() is not
# honoured by current ggplot2 releases.
m +
  geom_histogram(binwidth = 1) +
  labs(title = myTitle, x = "Number of GO association", y = "Total Gene") +
  theme(text = element_text(size = 14))
Normal Bar chart
fname="../dataV1/F9_RA_Sox17.bed.100.100.summary"
The data looks like the following, with two columns:
TF1 500
TF2 700
TF3 900
# A . Read the table (tab-separated, no header, so the columns are V1 and V2)
myTable <- read.table(fname, header = FALSE, sep = "\t")
# B . Form the data frame. Pass explicit levels to factor(); otherwise the
# labels are sorted alphabetically instead of keeping the file order.
# unique() keeps factor() from failing when a label occurs more than once.
mydf <- data.frame(
  Commodity = factor(myTable$V1, levels = unique(myTable$V1)),
  Production = c(myTable$V2)
)
# C . Call the ggplot. Refer to columns by bare name inside aes(); the data
# frame is already supplied through `data`.
ggplot(data = mydf, aes(x = Commodity, y = Production)) +
  geom_bar(stat = "identity") +
  theme(
    text = element_text(size = 8),
    axis.text.x = element_text(angle = 90, vjust = 1)
  ) +
  ylab("Number of ENCODE peaks overlap with [-100,+100] region of peak summit") +
  xlab("TF")
# Density and box plots of the log row-variance of several expression tables.
# NOTE(review): rowVars() is not base R -- presumably matrixStats::rowVars;
# allOutWithout and myData must be defined before this snippet runs.
smallNumber <- 1e-6 # added before log(); presumably to avoid log(0)
finalX <- NULL
if (TRUE) {
  for (curFile in allOutWithout) {
    myfname <- paste0(myData, curFile)
    print(myfname)
    baseTable <- read.table(myfname, header = TRUE, row.names = 1)
    myvarMat <- as.matrix(baseTable)
    myvarRow <- rowVars(myvarMat)
    myvarRow <- as.matrix(log(myvarRow + smallNumber))
    # Number of rows in the current file; the value from the last file is
    # reused below to label the time points.
    totSample <- dim(myvarRow)[1]
    finalX <- c(finalX, myvarRow)
  }
}
if (TRUE) {
  # Five time-point labels, each repeated totSample times: this only lines up
  # when there are five input files with the same number of rows.
  dat <- data.frame(
    timepoint = factor(rep(c("24", "28", "36", "48", "72"), each = totSample)),
    deviationCAGE = finalX
  )
  ggplot(dat, aes(x = deviationCAGE, fill = timepoint)) +
    geom_density(alpha = 0.2) +
    ylab("deviation density") +
    xlab("log(CAGE expression deviation)")
  # With no plot argument, ggsave() writes the most recently created plot.
  ggsave(filename = paste0("deviationCAGEdensity", ".png"), width = 140, height = 140, units = "mm", dpi = 400, scale = 2)
  ## box plot
  ggplot(data = dat, aes(x = factor(timepoint), y = deviationCAGE)) +
    geom_boxplot(notch = FALSE, outlier.colour = "green", outlier.size = 3) +
    xlab("Time Points") +
    ylab("Log (CAGE expression deviation)")
  ggsave(filename = paste0("deviationCAGEboxplot", ".png"), width = 140, height = 140, units = "mm", dpi = 400, scale = 2)
}
# Packages used by the heatmap below (heatmap.2 and redblue come from gplots).
library(MASS)
library(ggplot2)
library(gplots)
library(extrafont)
library(stringr)
library(plyr)

# Row-scaled heatmap of columns 2 to 5 of the report table, written to a PNG
# next to the input file. myData must hold the input directory prefix.
typeRes <- "Mtb Only"
fnmInput <- paste0(myData, "NONSTIM.table.txt.report.specific")
figureTitle <- paste0(typeRes, " specific")
myTable <- read.table(fnmInput, header = TRUE, row.names = 1, sep = "\t")
myValues <- myTable[, 2:5]
png(file = paste0(fnmInput, "_heatmap", ".png"), bg = "white") # create device
heatmap.2(
  as.matrix(myValues),
  col = redblue(50), scale = "row", dendrogram = "row", Colv = "NA",
  key = TRUE, keysize = 1.0, margins = c(8, 16), density.info = "none",
  row.names = TRUE, trace = "none",
  labRow = rownames(myTable), labCol = colnames(myTable)[2:5],
  main = figureTitle, ylab = "change after Mtb infection", xlab = "DE clusters"
)
dev.off()
# margins = c(8, 16) # to fit all labels
theme(text = element_text(size=8), axis.text.x = element_text(angle=90, vjust=1))
ylab("Number of ENCODE peaks overlap with [-100,+100] region of peak summit") + xlab("TF")
It allows extensions eps/ps, tex (pictex), pdf, jpeg, tiff, png, bmp, svg and wmf (windows only).
Suppose you have created a box plot using ggplot
ggplot(data=dfForPlot, aes(x=factor( as.matrix(dfForPlot$clusterno)), y=as.matrix(dfForPlot$expr) ) ) +
geom_boxplot(notch = FALSE,outlier.colour = "green", outlier.size = 3 ) +
xlab("Cluster no") + ylab("Log Expr")
Now save using following command
ggsave(file=paste(outputPlot,'.png',sep="") , width = 140, height = 140, units = "mm", dpi = 400, scale = 2 )
p + geom_line(aes(colour = genetype), linetype = 1)
xval yval groupname
ab 1 10 group1
ab 2 15 group1
c 1 10 group2
cd 2 15 group2
p = ggplot(mtcars, aes(x=xval, y= yval, group=groupname , colour=factor(groupname) ))
p + stat_smooth()
# continuous case: specify tick marks directly
p + coord_cartesian(ylim=c(5, 7.5)) + scale_y_continuous(breaks=seq(0, 10, 0.25)) # Ticks from 0-10, every .25
Normal Density
# Density and box plots of the log row-variance of several expression tables.
# NOTE(review): rowVars() is not base R -- presumably matrixStats::rowVars;
# allOutWithout and myData must be defined before this snippet runs.
smallNumber <- 1e-6 # added before log(); presumably to avoid log(0)
finalX <- NULL
if (TRUE) {
  for (curFile in allOutWithout) {
    myfname <- paste0(myData, curFile)
    print(myfname)
    baseTable <- read.table(myfname, header = TRUE, row.names = 1)
    myvarMat <- as.matrix(baseTable)
    myvarRow <- rowVars(myvarMat)
    myvarRow <- as.matrix(log(myvarRow + smallNumber))
    # Number of rows in the current file; the value from the last file is
    # reused below to label the time points.
    totSample <- dim(myvarRow)[1]
    finalX <- c(finalX, myvarRow)
  }
}
if (TRUE) {
  # Five time-point labels, each repeated totSample times: this only lines up
  # when there are five input files with the same number of rows.
  dat <- data.frame(
    timepoint = factor(rep(c("24", "28", "36", "48", "72"), each = totSample)),
    deviationCAGE = finalX
  )
  ggplot(dat, aes(x = deviationCAGE, fill = timepoint)) +
    geom_density(alpha = 0.2) +
    ylab("deviation density") +
    xlab("log(CAGE expression deviation)")
  # With no plot argument, ggsave() writes the most recently created plot.
  ggsave(filename = paste0("deviationCAGEdensity", ".png"), width = 140, height = 140, units = "mm", dpi = 400, scale = 2)
  ## box plot
  ggplot(data = dat, aes(x = factor(timepoint), y = deviationCAGE)) +
    geom_boxplot(notch = FALSE, outlier.colour = "green", outlier.size = 3) +
    xlab("Time Points") +
    ylab("Log (CAGE expression deviation)")
  ggsave(filename = paste0("deviationCAGEboxplot", ".png"), width = 140, height = 140, units = "mm", dpi = 400, scale = 2)
}
Multiple Plots in a single plot
# Arrange several ggplot objects on one page and write the page to a PNG.
if (TRUE) {
  library(ggplot2)
  library(grid)
  library(gridExtra) # provides grid.arrange()
}
# Three panels built from the same data frame; swap in different data or
# aesthetics per panel as needed.
m1 <- ggplot(myfm, aes(y = featureCount, x = location, colour = location, group = location))
m1 <- m1 + geom_boxplot() + labs(title = "Transcripts", x = "", y = "") + theme(legend.position = "none")
m2 <- ggplot(myfm, aes(y = featureCount, x = location, colour = location, group = location))
m2 <- m2 + geom_boxplot() + labs(title = "Transcripts", x = "", y = "") + theme(legend.position = "none")
m3 <- ggplot(myfm, aes(y = featureCount, x = location, colour = location, group = location))
m3 <- m3 + geom_boxplot() + labs(title = "Transcripts", x = "", y = "") + theme(legend.position = "none")
# Open the PNG device, draw the grid once, then close the device exactly once.
png(paste0("myimagename", ".png"))
grid.arrange(m1, m2, m3, ncol = 2, nrow = 3)
dev.off()
Heatmap
# Packages used by the heatmap below (heatmap.2 and redblue come from gplots).
if (TRUE) {
  library(MASS)
  library(ggplot2)
  library(gplots)
  library(extrafont)
  library(stringr)
  library(plyr)
}

# Row-scaled heatmap of columns 2 to 5 of the report table, written to a PNG
# next to the input file. myData must hold the input directory prefix.
typeRes <- "Mtb Only"
fnmInput <- paste0(myData, "NONSTIM.table.txt.report.specific")
figureTitle <- paste0(typeRes, " specific")
myTable <- read.table(fnmInput, header = TRUE, row.names = 1, sep = "\t")
myValues <- myTable[, 2:5]
png(file = paste0(fnmInput, "_heatmap", ".png"), bg = "white") # create device
heatmap.2(
  as.matrix(myValues),
  col = redblue(50), scale = "row", dendrogram = "row", Colv = "NA",
  key = TRUE, keysize = 1.0, margins = c(8, 16), density.info = "none",
  row.names = TRUE, trace = "none",
  labRow = rownames(myTable), labCol = colnames(myTable)[2:5],
  main = figureTitle, ylab = "change after Mtb infection", xlab = "DE clusters"
)
dev.off()
# margins = c(8, 16) # to fit all labels
How to use a small font and angled text when there are a lot of x/y tick labels
your ggplot +theme(text = element_text(size=8), axis.text.x = element_text(angle=90, vjust=1))
How to use x- and y-axis label
your ggplot +ylab("Number of ENCODE peaks overlap with [-100,+100] region of peak summit") + xlab("TF")
How to save image created by ggplot
It allows extensions eps/ps, tex (pictex), pdf, jpeg, tiff, png, bmp, svg and wmf (windows only).
Suppose you have created a box plot using ggplot
ggplot(data=dfForPlot, aes(x=factor( as.matrix(dfForPlot$clusterno)), y=as.matrix(dfForPlot$expr) ) ) +
geom_boxplot(notch = FALSE,outlier.colour = "green", outlier.size = 3 ) +
xlab("Cluster no") + ylab("Log Expr")
Now save using following command
ggsave(file=paste(outputPlot,'.png',sep="") , width = 140, height = 140, units = "mm", dpi = 400, scale = 2 )
Line graph (Mostly use for drawing average lines)
# Two logistic-quantile curves, one per gene type, drawn as lines.
x <- seq(0.01, .99, length.out = 100)
df <- data.frame(
  x = rep(x, 2),
  y = c(qlogis(x), 2 * qlogis(x)),
  genetype = rep(c("e_lncRNA", "p_lncRNA"), each = 100)
)
# group = genetype makes ggplot draw one line per gene type.
p <- ggplot(df, aes(x = x, y = y, group = genetype))
# Should work: dashed lines, both in the same colour
p + geom_line(linetype = 2)
# Solid lines, coloured by gene type
p + geom_line(aes(colour = genetype), linetype = 1)
How to use confidence interval in line graph
xval yval groupname
ab 1 10 group1
ab 2 15 group1
c 1 10 group2
cd 2 15 group2
# NOTE(review): mtcars has no xval/yval/groupname columns -- presumably it
# stands in for a data frame shaped like the table above; substitute your own.
p <- ggplot(mtcars, aes(x = xval, y = yval, group = groupname, colour = factor(groupname)))
# The `+` must end the line: a line that starts with `+` is parsed as a new
# statement and is not added to the plot.
p + stat_smooth() +
  theme(legend.position = "none")
Customizing label, axis, legend
How to remove legends
p + theme(legend.position = "none")
How to change x/y axis label, title etc.
p+ labs(list(title = "Gm12878", x = "Distance to TSS", y = "Avg. Signal"))
How to change axis-tick label
http://www.cookbook-r.com/Graphs/Axes_%28ggplot2%29/#setting-tick-mark-labels
# discrete case
x <- c("cl", "t1", "t2")  # example values; the original vector was lost when the page was extracted
p+ scale_x_discrete(breaks=x,labels=x)
# groupwise
p + scale_x_discrete(breaks=c("cl", "t1", "t2"),labels=c("Control", "Treat1", "Treat2"))
# continuous case: specify tick marks directly
p + coord_cartesian(ylim=c(5, 7.5)) + scale_y_continuous(breaks=seq(0, 10, 0.25)) # Ticks from 0-10, every .25
Comments
Post a Comment