# NOTE(review): clearing the workspace and hard-coding the working directory
# make the script non-portable; both are kept because code outside this
# section may rely on them. Consider an RStudio project or here::here().
rm(list = ls(all = TRUE))
setwd("C:/Users/luigi/Dropbox/TOPIC MODEL")
getwd()

library(quanteda)
library(readtext)
library(naivebayes)

# The data we will be using are some English social media disaster tweets
# discussed in this article: https://arxiv.org/pdf/1705.02009.pdf
# It consists of a number of tweets regarding accidents mixed in with a
# selection of control tweets (not about accidents).

##########################
# ALTERNATIVE APPROACH
# Create a single dataset out of the training set and the test set, build one
# document-feature matrix from it, and estimate the ML algorithm on the
# training portion of that matrix.
##########################

# ---- Load the data ----

x <- read.csv("train_disaster.csv", stringsAsFactors = FALSE)
str(x)

# Class-label variable: choose_one
# (0 = tweets not relevant for accidents; 1 = tweets relevant for accidents)
table(x$choose_one)
prop.table(table(x$choose_one))
nrow(x)

x10 <- read.csv("test_disaster.csv", stringsAsFactors = FALSE)
str(x10)
nrow(x10)

# Stack training and test rows so both go through identical preprocessing.
x_tot <- rbind(x, x10)
str(x_tot)

# ---- Corpus, tokens, and document-feature matrix ----

myCorpusTwitterTrain <- corpus(x_tot)

# The URL switch of tokens() is spelled `remove_url` (argument names are
# case-sensitive); a differently-cased name is not recognised and URLs would
# be left in the tokens.
tok2 <- tokens(
  myCorpusTwitterTrain,
  remove_punct = TRUE,
  remove_numbers = TRUE,
  remove_symbols = TRUE,
  split_hyphens = TRUE,
  remove_separators = TRUE,
  remove_url = TRUE
)
tok2 <- tokens_remove(tok2, stopwords("en"))
# Also drop every token matching the glob pattern "0*"; this is meant to get
# rid of leftover unicode symbols.
tok2 <- tokens_remove(tok2, c("0*"))
tok2 <- tokens_wordstem(tok2)

Dfm_tot <- dfm(tok2)
# Keep only features that occur in at least two documents ...
Dfm_tot <- dfm_trim(Dfm_tot, min_docfreq = 2, verbose = TRUE)
# ... and apply a minimum feature length of two characters.
Dfm_tot <- dfm_remove(Dfm_tot, min_nchar = 2)
head(docvars(Dfm_tot))

# ---- Split back into training and test sets ----

# Separate the texts of the original training set from those of the test set:
# rows with a non-missing choose_one label go to the training matrix, rows
# with a missing label go to the test matrix. The label is read through the
# docvars() accessor rather than the internal @docvars slot.
trainDfm <- dfm_subset(Dfm_tot, !is.na(docvars(Dfm_tot, "choose_one")))
testDfm <- dfm_subset(Dfm_tot, is.na(docvars(Dfm_tot, "choose_one")))
ndoc(trainDfm)
ndoc(testDfm)
head(docvars(trainDfm))
head(docvars(testDfm))

# Check that trainDfm and testDfm share the same features.
setequal(featnames(trainDfm), featnames(testDfm))

# ---- Fit the multinomial Naive Bayes model ----

train <- as.matrix(trainDfm)
test <- as.matrix(testDfm)

# Fit with Laplace smoothing of 1 and report how long the estimation takes.
system.time(
  NB <- multinomial_naive_bayes(
    x = train,
    y = as.factor(docvars(trainDfm, "choose_one")),
    laplace = 1
  )
)
# Predict the class of every document in the test matrix with the fitted
# Naive Bayes model.
predicted_nb <- predict(NB, newdata = test)

# Distribution of the predicted classes: absolute counts, then proportions.
table(predicted_nb)
prop.table(table(predicted_nb))