rm(list=ls(all=TRUE))
setwd("C:/Users/luigi/Dropbox/TOPIC MODEL")
getwd()
library(quanteda)
library(readtext)
library(caTools)
library(e1071)
library(randomForest)
library(caret)
library(naivebayes)
library(car)
library(ggplot2)
library(dplyr)
library(reshape2)
library(tm) # needed later on by iSA to build the DfM
library(iSAX)

##############################################
##############################################
#### Let's call the training and test-sets
##############################################
##############################################

# Let's focus on a database related to detecting spam in emails
x <- read.csv("spam.train.csv", stringsAsFactors=FALSE)
str(x)
# a pretty imbalanced training-set: this should hurt the performance of a
# ML algorithm much more than the performance of a proportional algorithm
prop.table(table(x$type))
myCorpusTwitterTrain <- corpus(x)
Dfm_train <- dfm(myCorpusTwitterTrain, remove = c(stopwords("english")),
                 remove_punct = TRUE, remove_numbers = TRUE, tolower = TRUE,
                 remove_symbols = TRUE, remove_separators = TRUE,
                 remove_url = TRUE, split_hyphens = TRUE)
Dfm_train <- dfm_trim(Dfm_train, min_docfreq = 2, verbose = TRUE)
Dfm_train <- dfm_remove(Dfm_train, min_nchar = 2)
topfeatures(Dfm_train, 20) # top 20 words

# in this case we know the "true" value of ham/spam for the texts in our
# test-set. But this is of course just an example!
x10 <- read.csv("spam.test.csv", stringsAsFactors=FALSE)
str(x10)
myCorpusTwitterTest <- corpus(x10)
Dfm_test <- dfm(myCorpusTwitterTest, remove = c(stopwords("english")),
                remove_punct = TRUE, remove_numbers = TRUE, tolower = TRUE,
                remove_symbols = TRUE, remove_separators = TRUE,
                remove_url = TRUE, split_hyphens = TRUE)
# note: we deliberately apply only a light trim here; aggressively trimming
# the test-set would zero-out, after dfm_match() below, features that we do
# observe in the test texts
Dfm_test <- dfm_trim(Dfm_test, min_docfreq = 2, verbose = TRUE)
Dfm_test <- dfm_remove(Dfm_test, min_nchar = 2)
topfeatures(Dfm_test, 20) # top 20 words

# let's align the features of the test-set to those of the training-set
test_dfm <- dfm_match(Dfm_test, features = featnames(Dfm_train))
train <- as.matrix(Dfm_train)
test <- as.matrix(test_dfm)
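
# [Optional aside, not part of the original pipeline] A quick sanity check:
# dfm_match() should leave the test matrix with exactly the same features,
# in the same order, as the training matrix; otherwise predict() below would
# misalign the columns
stopifnot(identical(featnames(Dfm_train), colnames(test)))
dim(train) # documents x features of the training matrix
dim(test)  # the test matrix has the same number of columns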

######################################################
######################################################
# Let's estimate a NB as our benchmark
######################################################
######################################################

set.seed(123) # (define a set.seed for being able to replicate the results!)
system.time(NB22 <- multinomial_naive_bayes(x = train,
                                            y = as.factor(docvars(Dfm_train, "type")),
                                            laplace = 1))

# Let's do some feature analysis
NB_prob <- as.data.frame(NB22$params)
NB_prob$Feature <- row.names(NB_prob)
str(NB_prob)
# the features with the largest difference between the spam and ham
# conditional probabilities
NB_prob$diff <- NB_prob$spam - NB_prob$ham
str(NB_prob)
print(head(NB_prob[order(NB_prob$diff, decreasing=TRUE),], 15))  # spam words
print(head(NB_prob[order(NB_prob$diff, decreasing=FALSE),], 15)) # ham words
NB_prob$sign <- ifelse(NB_prob$diff > 0, "spam", "ham")
str(NB_prob)
# let's extract the 20 most positive and the 20 most negative contributing features
NB_prob2 <- top_n(NB_prob, 20, diff)
NB_prob2
NB_prob3 <- top_n(NB_prob, -20, diff)
NB_prob3
NB_prob_new <- rbind(NB_prob2, NB_prob3)
# reorder the features
NB_prob_new <- mutate(NB_prob_new, Feature = reorder(Feature, diff))
ggplot(NB_prob_new, aes(Feature, diff, fill = sign)) +
  geom_col(show.legend = FALSE) +
  coord_flip() +
  ylab("Difference in the conditional probabilities") +
  scale_fill_manual(values = c("orange", "blue")) +
  labs(title = "Spam e-mails", subtitle = "Ham (-) versus Spam (+) words - NB")

######### let's predict the test-set
predicted_nb <- predict(NB22, test)
table(predicted_nb)
prop.table(table(predicted_nb))
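
# [Optional aside, not part of the original pipeline] Since in this example we
# know the true labels of the test-set, we can also check the individual-level
# performance of NB via caret (assuming the factor levels of predicted_nb
# match those of x10$type, i.e., "ham"/"spam")
confusionMatrix(predicted_nb, as.factor(x10$type))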

######################################################
######################################################
# Let's learn how to estimate a proportional model via iSA
######################################################
######################################################

### STEP 1: TRAINING-SET
x <- read.csv("spam.train.csv", stringsAsFactors=FALSE)
str(x)
x$typeFC <- as.factor(x$type) # let's transform the variable "type" into a factor variable
str(x)
prop.table(table(x$typeFC))

### STEP 2: TEST-SET
x10 <- read.csv("spam.test.csv", stringsAsFactors=FALSE)
str(x10)

### STEP 3: let's create a unique dataset including both TEST and TRAINING-SET
x10$typeFC <- NA # for doing that, let's add to the test-set a new column called "typeFC", given that such a column is present in the training-set
str(x10)
documents <- rbind(x10, x) # let's combine the test-set and the training-set
str(documents) # we have 5572 texts (1000 in the training-set and 4572 in the test-set)
prop.table(table(documents$typeFC))

### STEP 4: iSA needs the tm package (not quanteda!) to build the DfM
corpus <- VCorpus(VectorSource(documents$text)) # let's build the corpus
corpus[[1]]$content
corpus <- tm_map(corpus, removeWords, stopwords("english")) # let's remove stopwords
corpus[[1]]$content
ocome <- prep.data(corpus, verbose=TRUE, th=0.995) # let's prepare the data for the iSA algorithm
# This is a pre-processing step which performs stemming and other cleaning
# steps (removing punctuation, numbers, etc.), as well as producing the DfM.
# th=0.995 means that we drop the sparsest features, i.e., those that appear
# in less than 0.5% of the texts (1-0.995)
str(ocome)
# if you want the hexadecimal notation (not usually suggested unless you have a very large training-set):
# ocome2 <- prep.data(corpus, verbose=TRUE, th=0.995, shannon=TRUE)

### STEP 5: let's separate the resulting object "ocome" according to the presence
### or absence of info about the type factor variable (i.e., training vs. test-set)
train <- !is.na(documents$typeFC) # an index that is TRUE for the training-set documents, i.e., those texts with a typeFC different from NA (note: this overwrites the "train" matrix created above, which we no longer need)
train
summary(train)
D <- documents$typeFC[train] # the vector of the values of type in the training-set
str(D)
# Same results indeed!
prop.table(table(D))
prop.table(table(x$typeFC))
Strain <- ocome$S[which(train)] # the vector of stems belonging to the training-set
Stest <- ocome$S[-which(train)] # the vector of stems belonging to the test-set
length(Strain) # 1000!
length(Stest)  # 4572!

### STEP 6: let's run the proportional algorithm
set.seed(123)
system.time(outSent <- iSA(Strain, Stest, D)) # D is the vector of codings belonging to the training-set

### STEP 7: let's predict
round(outSent$btab, 5) # we also get bootstrapped s.e.

###############################################################################
# let's estimate the MAE - mean absolute error (just to have a rough idea of
# the algorithms' performance)
###############################################################################
mae <- as.data.frame(rbind(outSent$btab[, "Estimate"],
                           prop.table(table(predicted_nb)),
                           prop.table(table(x10$type))))
str(mae)
mae$algorithm <- c("iSA", "NB", "TRUE")
mae
# MAE = average absolute difference between the estimated and the true
# category proportions
mae_iSA <- (abs(mae[1,1] - mae[3,1]) + abs(mae[1,2] - mae[3,2]))/2
mae_NB <- (abs(mae[2,1] - mae[3,1]) + abs(mae[2,2] - mae[3,2]))/2
mae_iSA
mae_NB

###############################################################################
# Of course, you can also run a CV with iSA! Try to do that by yourself
# (a possible starting sketch follows below)!
###############################################################################
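
# [A minimal sketch of one possible K-fold CV scheme for iSA; this is an
# illustration I am adding, not part of the original script, and the names
# S_lab, folds and cv_mae are arbitrary.] We re-use the labeled training texts
# only: at each round one fold plays the "test-set" role, and we compare the
# proportions estimated by iSA with the true proportions of that held-out fold
# (assuming, as in the output above, that the rows of btab are named after the
# categories)
set.seed(123)
K <- 5
S_lab <- ocome$S[which(train)] # stems of the labeled (training) texts, as above
folds <- sample(rep(1:K, length.out = length(S_lab))) # random fold assignment
cv_mae <- numeric(K)
for (k in 1:K) {
  held_out <- folds == k
  fit <- iSA(S_lab[!held_out], S_lab[held_out], D[!held_out])
  est <- fit$btab[, "Estimate"]
  true_prop <- prop.table(table(D[held_out]))
  cv_mae[k] <- mean(abs(est[names(true_prop)] - true_prop)) # MAE over the two categories
}
cv_mae
mean(cv_mae) # average MAE across the K folds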