
Text Mining in R



install.packages( c("tm", "VCorpus", "hunspell"),  dependencies = TRUE)

install.packages( c("Rweka", "RMySQL", "textmining"),  dependencies = TRUE)

install.packages( c("iemisctext", "ngram"), dependencies = TRUE)

if(TRUE)
{
  WORKDIR="D:/DeltaPartners2017/FieldOperations/TextMining/Mining/";
  setwd(WORKDIR);
  Sys.setlocale("LC_ALL","Arabic")
  memory.limit(size = 16072)
 
 
  library(tm);
  library("iemisctext")
  library(ngram)
  library(RWeka)
 
  data(anarchy)
  a <- DocumentTermMatrix(VCorpus(VectorSource(anarchy)))   # wrap the raw text in a corpus before building the DTM
  findFreqTerms(a, 5)
 
  data("crude")
 
  # Corpus --> data frame
  dataframeCrude <- data.frame(text = unlist(sapply(crude, `[`, "content")), stringsAsFactors = FALSE)
  dataframeCrude[1, ]
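  # Optional check (not in the original script): inspect the first crude
  # document directly from the corpus and look at its metadata
  inspect(crude[[1]])
  meta(crude[[1]])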
 
}
 
 
  # Raw data to Corpus

  serv_log <- read.csv(paste(WORKDIR, "test.csv", sep = ""))
  mycorpus <- Corpus(VectorSource(serv_log))

  mydfCorpus <- data.frame(text = unlist(mycorpus), stringsAsFactors = FALSE)
  mydfCorpus[1]
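  # Quick check (not in the original script): number of documents loaded into the corpus
  length(mycorpus)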
 
  # Map the corpus by applying different filters

  mycorpusAfterMap <- tm_map(mycorpus, content_transformer(tolower))   # content_transformer keeps documents as PlainTextDocument
  mycorpusAfterMap <- tm_map(mycorpusAfterMap, removePunctuation)
  mycorpusAfterMap <- tm_map(mycorpusAfterMap, removeNumbers)
  mycorpusAfterMap <- tm_map(mycorpusAfterMap, removeWords, stopwords(kind = "en"))
  mycorpusAfterMap <- tm_map(mycorpusAfterMap, stripWhitespace)
  mycorpusAfterMap <- tm_map(mycorpusAfterMap, stemDocument)

  # Applying this gives an error:
  # mycorpusAfterMap <- tm_map(mycorpusAfterMap, PlainTextDocument)
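  # Sanity check (not in the original script): assuming mycorpusAfterMap built above,
  # peek at the cleaned documents before generating the matrix
  inspect(mycorpusAfterMap[1:2])        # summary of the first two cleaned documents
  content(mycorpusAfterMap[[1]])        # cleaned text of the first document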
  ########################  Generate DT matrix    #################################
  
  myDTmatrix <- DocumentTermMatrix(mycorpusAfterMap)
  findFreqTerms(myDTmatrix, 5)
  keyword.freq <- colSums(as.matrix(myDTmatrix))   # frequency of each appearing word (columns of the DTM are terms)

  myDTmatrix_NonSparse <- removeSparseTerms(myDTmatrix, 0.9)
  keyword_NonSparse.freq <- colSums(as.matrix(myDTmatrix_NonSparse))   # frequency of appearing words after dropping sparse terms
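  # Follow-up (not in the original script): sort the frequency vectors to list the top terms
  head(sort(keyword.freq, decreasing = TRUE), 10)
  head(sort(keyword_NonSparse.freq, decreasing = TRUE), 10)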
 
  # Find N-gram from corpus using NGRAM package
  
  strCorpus <- concatenate(lapply(mycorpusAfterMap, "[", 1))
  bigram <- ngram(strCorpus, n = 2)
  # print(ngram(strCorpus, sep = " ", n = 2), output = "full")
  get.phrasetable(bigram)
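  # get.phrasetable() returns a data frame (ngrams, freq, prop) ordered by frequency,
  # so the head gives the most common bigrams (illustrative addition, not in the original)
  head(get.phrasetable(bigram), 10)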
 
  # Find N-gram from corpus using RWeka package
  # mydfCorpus <- data.frame(text = unlist(mycorpusAfterMap), stringsAsFactors = FALSE)
  # bigram <- NGramTokenizer(mydfCorpus, Weka_control(min = 2, max = 2))
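  # A working variant (assumption, kept commented like the original since RWeka needs a Java setup):
  # NGramTokenizer expects a character vector, so tokenize the text column rather than the whole data frame
  # bigramTokens <- NGramTokenizer(mydfCorpus$text, Weka_control(min = 2, max = 2))
  # sort(table(bigramTokens), decreasing = TRUE)[1:10]   # ten most frequent bigrams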
