Skip to main content

BED to GENCODE v3 GTF converter



Intro

The GENCODE v3 GTF format differs from the UCSC GFF/GTF format: GENCODE v3 GTF requires 9 mandatory key=value attribute pairs, which are not mandatory in UCSC GFF/GTF. The format is described here:

http://www.sanger.ac.uk/resources/databases/encode/gencodeformat.html

Here is the code to convert a BED file to the GENCODE v3 GTF format:

import java.util.Vector;
import java.util.regex.Pattern;

import com.cbrc.bean.TrxExonInfo;
import com.cbrc.common.CommonFunction;

public class BedTools_BedToGencodeGTFv3 {


        void convert_Bed_GencodeGtfv3(String fnmBed, String fnmGtf)
        {

                String tmp[];
                Pattern p = Pattern.compile("[\\t]+");
                Vector vectBedStr = CommonFunction.readlinesOfAfile(fnmBed);
                StringBuffer buf = new StringBuffer();
                for(int i=0;i                 {
                        tmp = p.split(vectBedStr.get(i), 12);
                        TrxExonInfo trx = new TrxExonInfo(tmp[0], tmp[1], tmp[2], tmp[3],
tmp[4], tmp[5], tmp[6], tmp[7], tmp[8], tmp[9], tmp[10], tmp[11]) ;


                        // create 1 entry for transcript

                        buf.append(trx.getChrom()+"\t"+"LLlab"+"\t"+"transcript"+ "\t"+
                                        (trx.getStart()+1)+"\t"
+trx.getEnd()+"\t"+trx.getScore()+"\t"+trx.getStrand()+"\t" +
                                        "."+"\t"+ getAdditionalInfoMandatory(trx.getName() ) +"\n");

                        // create multiple entry for exons

                        for(int  e=0; e                         {
                                buf.append(trx.getChrom()+"\t"+"LLlab"+"\t"+"exon"+ "\t"+
                                                (trx.getExonStarts().get(e)+1)+"\t"
+trx.getExonEnds().get(e)+"\t"+trx.getScore()+"\t"+trx.getStrand()+"\t"
+
                                                "."+"\t"+ getAdditionalInfoMandatory(trx.getName() ) +"\n");

                        }
                }



                CommonFunction.writeContentToFile(fnmGtf, buf+"");

        }

        String getAdditionalInfoMandatory(String trxID)
        {
                return
                        " gene_id "    + trxID + ";" +
                        " transcript_id " + trxID + ";" +
                        " gene_type "    + "RNA" + ";" +
                        " gene_status "    + "KNOWN" + ";" +
                        " gene_name "    + trxID + ";" +
                        " transcript_type "    + "RNA" + ";" +
                        " transcript_status "    + "KNOWN" + ";" +
                        " transcript_name "    + trxID + ";" +
                        " level "    + "1" + ";" ;

        }


        public static void main(String[] args) {
                BedTools_BedToGencodeGTFv3 obj = new BedTools_BedToGencodeGTFv3();
                obj.convert_Bed_GencodeGtfv3(args[0],   args[1] );
                // example
//              obj.convert_Bed_GencodeGtfv3("./test.bed",
"./coding.withrpt.bed.wholebody.bed.gtf3" ); //
"./coding.withrpt.bed.wholebody.bed"
//              obj.convert_Bed_GencodeGtfv3("./noncoding.withrpt.bed.wholebody.bed.bed",
"./noncoding.withrpt.bed.wholebody.bed.gtf3" );

        }

}

Comments

Popular posts from this blog

MATLAB cross validation

// use built-in function samplesize = size( matrix , 1); c = cvpartition(samplesize,  'kfold' , k); % return the indexes on each fold ///// output in matlab console K-fold cross validation partition              N: 10    NumTestSets: 4      TrainSize: 8  7  7  8       TestSize: 2  3  3  2 ////////////////////// for i=1 : k    trainIdxs = find(training(c,i) ); %training(c,i);  // 1 means in train , 0 means in test    testInxs  = find(test(c,i)       ); % test(c,i);       // 1 means in test , 0 means in train    trainMatrix = matrix (  matrix(trainIdxs ), : );    testMatrix  = matrix (  matrix(testIdxs  ), : ); end //// now calculate performance %%  calculate performance of a partition     selectedKfoldSen=[];selectedKfoldSpe=[];selectedKfoldAcc=[];     indexSen=1;indexSpe=1;indexAcc=1;     if ( kfold == (P+N) )% leave one out         sensitivity = sum(cvtp) /( sum(cvtp) + sum(cvfn) )         specificity = sum(cvtn) /( sum(cvfp) + sum(cvtn) )         acc

R tutorial

Install R in linux ============ In CRAN home page, the latest version is not available. So, in fedora, Open the terminal yum list R  --> To check the latest available version of r yum install R --> install R version yum update R --> update current version to latest one 0 find help ============ ?exact topic name (  i.e.   ?mean ) 0.0 INSTALL 3rd party package  ==================== install.packages('mvtnorm' , dependencies = TRUE , lib='/home/alamt/myRlibrary/')   #  install new package BED file parsing (Always use read.delim it is the best) library(MASS) #library(ggplot2) dirRoot="D:/research/F5shortRNA/TestRIKEN/Rscripts/" dirData="D:/research/F5shortRNA/TestRIKEN/" setwd(dirRoot) getwd() myBed="test.bed" fnmBed=paste(dirData, myBed, sep="") # ccdsHh19.bed   tmp.bed ## Read bed use read.delim - it is the  best mybed=read.delim(fnmBed, header = FALSE, sep = "\t", quote = &q