# Session setup ----
# The global environment is intentionally NOT wiped with rm(list = ls()):
# doing so destroys whatever the caller had in the workspace and still does
# not give a clean session (loaded packages and options survive). Run the
# script in a fresh R session instead.
#
# The project folder defaults to the original author's path and can be
# overridden, without editing the script, through the TOPIC_MODEL_DIR
# environment variable.
project_dir <- Sys.getenv(
  "TOPIC_MODEL_DIR",
  unset = "C:/Users/luigi/Dropbox/TOPIC MODEL"
)
if (!dir.exists(project_dir)) {
  stop(
    "Project directory not found: ", project_dir,
    " (set the TOPIC_MODEL_DIR environment variable to the folder holding",
    " the CSV files)",
    call. = FALSE
  )
}
# The CSV files below are read with relative paths, so they are resolved
# against this directory.
setwd(project_dir)
getwd()
library(quanteda)
library(readtext)
library(naivebayes)

# The data we will be using are some English social media disaster tweets discussed in 
# this article: https://arxiv.org/pdf/1705.02009.pdf 
# It consists of a number of tweets regarding accidents mixed in with a selection of control tweets (not about accidents)

##########################
# ALTERNATIVE APPROACH
# let's combine the training-set and the test-set into a single dataset, pre-process them together (so that they share the same features), and then estimate our ML algorithm
##########################

# Read the training tweets; strings stay character vectors, not factors.
x <- read.csv(file = "train_disaster.csv", stringsAsFactors = FALSE)
str(x)

# Class-label variable: choose_one
#   0 = tweet not relevant for accidents
#   1 = tweet relevant for accidents
# Show the absolute and the relative class frequencies, then the number of
# training rows.
label_counts <- table(x$choose_one)
label_counts
prop.table(label_counts)
nrow(x)

# Read the test tweets the same way as the training ones.
x10 <- read.csv(file = "test_disaster.csv", stringsAsFactors = FALSE)
str(x10)
nrow(x10)

# Stack training rows on top of test rows.
# NOTE(review): rbind() on data frames needs both inputs to have the same
# column names -- presumably the test file carries a choose_one column filled
# with NA; confirm against the CSV.
x_tot <- rbind(x, x10)
str(x_tot)

# Pre-processing ----
# Build one corpus from the combined (training + test) data frame; the other
# columns of x_tot travel along as document variables.
# NOTE(review): corpus() on a data frame takes the text from a column named
# "text" by default -- confirm that the CSV files use that column name.
myCorpusTwitterTrain <- corpus(x_tot)

# Tokenise. The URL switch must be spelled `remove_url` (lower case): R
# argument names are case-sensitive, so `remove_URL` would fall into `...`
# and URLs would stay in the tokens.
tok2 <- tokens(
  myCorpusTwitterTrain,
  remove_punct = TRUE,
  remove_numbers = TRUE,
  remove_symbols = TRUE,
  split_hyphens = TRUE,
  remove_separators = TRUE,
  remove_url = TRUE
)
tok2 <- tokens_remove(tok2, stopwords("en"))
# let's also remove the unicode symbols: "0*" is a glob pattern, i.e. every
# token that starts with the character "0" is dropped
tok2 <- tokens_remove(tok2, c("0*"))
tok2 <- tokens_wordstem(tok2)

# Document-feature matrix on the combined data, so training and test documents
# end up with exactly the same columns.
Dfm_tot <- dfm(tok2)
# keep only features that occur in at least 2 documents ...
Dfm_tot <- dfm_trim(Dfm_tot, min_docfreq = 2, verbose = TRUE)
# ... and drop features shorter than 2 characters
Dfm_tot <- dfm_remove(Dfm_tot, min_nchar = 2)
head(docvars(Dfm_tot))

# let's separate the texts included in our original training-set from the test-set
# Documents with a non-missing choose_one go to the training DFM, documents
# whose choose_one is NA go to the test DFM. The label is read through the
# public docvars() accessor rather than the internal @docvars slot, so a
# missing choose_one field raises a clear error instead of yielding NULL.

trainDfm <- dfm_subset(Dfm_tot, !is.na(docvars(Dfm_tot, "choose_one")))
testDfm <- dfm_subset(Dfm_tot, is.na(docvars(Dfm_tot, "choose_one")))

ndoc(trainDfm)
ndoc(testDfm)

head(docvars(trainDfm))
head(docvars(testDfm))

# same features between trainDfm & testDfm
# (both are row subsets of Dfm_tot, so this is expected to print TRUE)
setequal(featnames(trainDfm), featnames(testDfm))

# Model ----
# multinomial_naive_bayes() is given plain matrices here, so both DFMs are
# converted with as.matrix().
train <- as.matrix(trainDfm)
test <- as.matrix(testDfm)

# Fit a multinomial Naive Bayes classifier with Laplace smoothing = 1 and time
# the fit. The class labels come from the choose_one document variable, read
# through the public docvars() accessor instead of the internal @docvars slot.
system.time(
  NB <- multinomial_naive_bayes(
    x = train,
    y = as.factor(docvars(trainDfm, "choose_one")),
    laplace = 1
  )
)

# Predict the class of every test document, then show the absolute and the
# relative frequency of the predicted classes.
predicted_nb <- predict(NB, test)
table(predicted_nb)
prop.table(table(predicted_nb))