Intro
The Gencode V3 GTF format differs from the UCSC GFF/GTF format: Gencode V3 GTF requires 9 mandatory key=value pairs, which are not mandatory in UCSC GFF/GTF. The format is described here:
http://www.sanger.ac.uk/resources/databases/encode/gencodeformat.html
Here is code to convert a BED file to the Gencode V3 GTF format:
import java.util.Vector;
import java.util.regex.Pattern;
import com.cbrc.bean.TrxExonInfo;
import com.cbrc.common.CommonFunction;
/**
 * Converts BED records into Gencode V3 GTF text: for each BED line it emits
 * one line for the transcript followed by one line per exon, each ending with
 * the mandatory Gencode key/value attributes.
 *
 * NOTE(review): this listing is truncated in several places (method parameter
 * lists, the Vector declaration, both loop headers, parts of the appended
 * columns, the CommonFunction call and the calls in main are cut off
 * mid-line), so it does not compile as shown. Restore the missing text from
 * the original source before using it.
 */
public class BedTools_BedToGencodeGTFv3 {
/**
 * Builds the GTF text for every BED line held in vectBedStr and accumulates
 * it in a single buffer.
 *
 * NOTE(review): the parameter list is missing from this listing; the commented
 * examples in main suggest file paths are passed in - confirm against the
 * original source.
 */
void convert_Bed_GencodeGtfv3(
{
String tmp[];
// A run of one or more tabs is treated as a single field separator.
Pattern p = Pattern.compile("[\\t]+");
Vector
// Collects every generated GTF line.
StringBuffer buf = new StringBuffer();
// Walk over the BED lines. NOTE(review): the loop condition is truncated here.
for(int i=0;i
// Split into at most 12 fields; with this limit the 12th element keeps any
// remaining text of the line.
tmp = p.split(vectBedStr.get(i), 12);
// NOTE(review): tmp[0]..tmp[11] are read unconditionally, so a line with
// fewer than 12 tab-separated fields would fail with an index error.
TrxExonInfo trx = new TrxExonInfo(tmp[0], tmp[1], tmp[2], tmp[3],
tmp[4], tmp[5], tmp[6], tmp[7], tmp[8], tmp[9], tmp[10], tmp[11]) ;
// create 1 entry for transcript
// The start is written as start+1 while the end is written unchanged -
// presumably converting a 0-based BED start to a 1-based GTF start; confirm.
buf.append(trx.getChrom()+"\t"
(trx.getStart()+1)+"\t"
+trx.getEnd()+"\t"+trx.
"."+"\t"+ getAdditionalInfoMandatory(
// create multiple entry for exons
// Each exon gets its own line using the same start+1 / end-as-is convention.
// NOTE(review): the loop condition is truncated here.
for(int e=0; e
buf.append(trx.getChrom()+"\t"
(trx.getExonStarts().get(e)+1)
+trx.getExonEnds().get(e)+"\t"
+
"."+"\t"+ getAdditionalInfoMandatory(
}
}
// NOTE(review): this call is truncated; presumably it writes buf to the
// output file - confirm against CommonFunction.
CommonFunction.
}
/**
 * Returns the attribute column: nine "key value;" pairs, each preceded by a
 * space. gene_id, transcript_id, gene_name and transcript_name all take the
 * value of trxID; the types are fixed to RNA, the statuses to KNOWN and the
 * level to 1.
 *
 * NOTE(review): the parameter list is truncated; the body only uses trxID.
 * NOTE(review): values are emitted without surrounding double quotes - verify
 * that this matches what the Gencode GTF consumers expect.
 */
String getAdditionalInfoMandatory(
{
return
" gene_id " + trxID + ";" +
" transcript_id " + trxID + ";" +
" gene_type " + "RNA" + ";" +
" gene_status " + "KNOWN" + ";" +
" gene_name " + trxID + ";" +
" transcript_type " + "RNA" + ";" +
" transcript_status " + "KNOWN" + ";" +
" transcript_name " + trxID + ";" +
" level " + "1" + ";" ;
}
/**
 * Command-line entry point: creates a converter instance and runs the
 * conversion.
 *
 * NOTE(review): the call arguments below are truncated; the remaining
 * fragments look like example BED file paths - confirm the intended input and
 * output paths.
 */
public static void main(String[] args) {
BedTools_BedToGencodeGTFv3 obj = new BedTools_BedToGencodeGTFv3();
obj.convert_Bed_GencodeGtfv3(
// example
// obj.convert_Bed_GencodeGtfv3("
"./coding.withrpt.bed.
"./coding.withrpt.bed.
// obj.convert_Bed_GencodeGtfv3("
"./noncoding.withrpt.bed.
}
}
Comments
Post a Comment