rm(list=ls(all=TRUE))
getwd()
# setwd("C:/Users/mw/Dropbox (VOICES)/TOPIC MODEL")
setwd("C:/Users/luigi/Dropbox/TOPIC MODEL")
getwd()

library(quanteda)
library(readtext)
library(caTools)
library(e1071)
library(randomForest)
library(caret)
library(naivebayes)
library(car)
library(ggplot2)
library(dplyr)
library(reshape2)

#####################################################
# FIRST STEP: let's create the DfM for the training-set
#####################################################

# let's focus on MOVIE reviews (either positive or negative)
x <- read.csv("train_review2.csv", stringsAsFactors=FALSE)
str(x)
nrow(x)
table(x$Sentiment)
prop.table(table(x$Sentiment))

myCorpusTwitterTrain <- corpus(x)
Dfm_train <- dfm(myCorpusTwitterTrain, remove = c(stopwords("english")),
                 remove_punct = TRUE, remove_numbers = TRUE, tolower = TRUE,
                 remove_symbols = TRUE, remove_separators = TRUE,
                 remove_url = TRUE, split_hyphens = TRUE)

# let's trim the dfm in order to keep only tokens that appear in at least 5% of the reviews
Dfm_train <- dfm_trim(Dfm_train, min_docfreq = 0.05, verbose = TRUE, docfreq_type = "prop")
topfeatures(Dfm_train, 20)  # 20 top words

#####################################################
# SECOND STEP: let's create the DfM for the test-set
#####################################################

x10 <- read.csv("test_review2.csv", stringsAsFactors=FALSE)
# NOTE that in this case we also have the "true" value for sentiment in the test-set.
# Of course this usually won't happen: it is a test-set after all!
str(x10)
nrow(x10)

myCorpusTwitterTest <- corpus(x10)
Dfm_test <- dfm(myCorpusTwitterTest, remove = c(stopwords("english")),
                remove_punct = TRUE, remove_numbers = TRUE, tolower = TRUE,
                remove_symbols = TRUE, remove_separators = TRUE,
                remove_url = TRUE, split_hyphens = TRUE)
Dfm_test <- dfm_trim(Dfm_test, min_docfreq = 0.05, verbose = TRUE, docfreq_type = "prop")
topfeatures(Dfm_test, 20)  # 20 top words

#####################################################
# THIRD STEP: let's make the features identical between train and test-set by passing Dfm_train
# to dfm_match() as a pattern.
# After this step, we can "predict" by employing only the features included in the training-set!
# This moreover makes the DfM for the test-set smaller when you have a giant test-set compared
# to the training-set (which, of course, is not the case here!)
#####################################################

setequal(featnames(Dfm_train), featnames(Dfm_test))
length(Dfm_test@Dimnames$features)
length(Dfm_train@Dimnames$features)
test_dfm <- dfm_match(Dfm_test, features = featnames(Dfm_train))
length(test_dfm@Dimnames$features)
setequal(featnames(Dfm_train), featnames(test_dfm))

#####################################################
# FOURTH STEP: let's convert the two DfMs into matrices for the ML algorithms to work
#####################################################

train <- as.matrix(Dfm_train)  # dense matrix
object.size(train)
# a compressed sparse matrix!
trainSP <- as(Dfm_train, "dgCMatrix")  # compressed matrix
object.size(trainSP)
object.size(train)/object.size(trainSP)
# this is going to make a HUGE difference when you have large train and test matrices!
# We will mainly use normal (i.e., not compressed) matrices in our class, given that we will always use small datasets.
# But keep this in mind!
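# A side note (a minimal sketch, not needed for what follows): the same compressed conversion shown above for
# the training-set also works for the matched test dfm; "testSP" is just an illustrative object name.
testSP <- as(test_dfm, "dgCMatrix")
object.size(testSP)  # again much smaller than the dense as.matrix() version created in the next step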
test <- as.matrix(test_dfm)

#####################################################
# FIFTH STEP: let's estimate a ML Model
#####################################################

#####################################################
# Let's start with a Naive Bayes Model
#####################################################

# we will use the naivebayes package. Another possible package you can consider: the fastNaiveBayes package
# given our training-set, we have to use a Multinomial rather than a Bernoulli distribution, given that our
# features can take values other than just 0/1 (i.e., they are counts, not a one-hot encoding). And in fact:
table(Dfm_train@x)
# to run a Bernoulli model with naivebayes, just replace "multinomial_naive_bayes" with "bernoulli_naive_bayes"
# in the command below (a minimal sketch follows the NB prediction at the end of this section)

# our DV
str(Dfm_train@docvars$Sentiment)
# that's a character variable! Not good! So we will use as.factor to transform it into a factor variable
set.seed(123)  # define a set.seed to be able to replicate the results!
system.time(NB22 <- multinomial_naive_bayes(x=train, y=as.factor(Dfm_train@docvars$Sentiment), laplace = 1))
summary(NB22)
prop.table(table(Dfm_train@docvars$Sentiment))  # prior probabilities

# let's see the association between words and probabilities (i.e., the matrix with class-conditional parameter estimates).
# Take a look at "family" and "comedy": the former increases the probability of a positive review compared to the latter
head(NB22$params)

# let's investigate this a bit more
NB_prob <- as.data.frame(NB22$params)
NB_prob$Feature <- row.names(NB_prob)
str(NB_prob)
# the features with the largest difference between the positive and negative conditional probabilities
NB_prob$diff <- NB_prob$pos-NB_prob$neg
str(NB_prob)
print(head(NB_prob[order(NB_prob$diff, decreasing=TRUE),], 15))   # positive words
print(head(NB_prob[order(NB_prob$diff, decreasing=FALSE),], 15))  # negative words
NB_prob$sign <- ifelse(NB_prob$diff>0, "positive", "negative")
str(NB_prob)

# let's extract the 20 most positive and the 20 most negative contributing features
NB_prob2 <- top_n(NB_prob, 20, diff)
NB_prob2
NB_prob3 <- top_n(NB_prob, -20, diff)
NB_prob3
NB_prob_new <- rbind(NB_prob2, NB_prob3)
# reorder the features
NB_prob_new <- mutate(NB_prob_new, Feature = reorder(Feature, diff))

ggplot(NB_prob_new, aes(Feature, diff, fill = sign)) +
  geom_col(show.legend = FALSE) +
  coord_flip() +
  ylab("Difference in the conditional probabilities") +
  scale_fill_manual(values = c("orange", "blue")) +
  labs(title = "Movie Reviews", subtitle = "Negative (-) versus Positive (+) words - NB")

# let's FINALLY predict the test-set
predicted_nb <- predict(NB22, test)
table(predicted_nb)
prop.table(table(predicted_nb))
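# As noted above, a Bernoulli NB treats each feature as merely present/absent rather than as a count.
# A minimal sketch (not run in class; the object names "Dfm_train_bin", "train_bin" and "NB_bern" are just
# illustrative choices): we binarize the dfm with dfm_weight(scheme = "boolean") before fitting.
Dfm_train_bin <- dfm_weight(Dfm_train, scheme = "boolean")  # every non-zero count becomes 1
train_bin <- as.matrix(Dfm_train_bin)
set.seed(123)
NB_bern <- bernoulli_naive_bayes(x = train_bin, y = as.factor(Dfm_train@docvars$Sentiment), laplace = 1)
summary(NB_bern)
# (to predict the test-set, the test matrix would need the same boolean weighting)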
#####################################################
# let's run a Random Forest
#####################################################

# we will use the randomForest package. If you are employing compressed matrices, please use the ranger package
# (ranger is also usually faster than randomForest; a minimal sketch follows the RF test-set prediction below).
# Note that, as a hyperparameter (or tuning parameter), I select a specific value for the number of trees.
# A RF also has other hyperparameters. More on this later on
set.seed(123)  # define a set.seed to be able to replicate the results!
system.time(RF <- randomForest(y = as.factor(Dfm_train@docvars$Sentiment), x = train,
                               importance = TRUE, do.trace = TRUE, ntree = 500))
RF
# why 32 for the no. of variables randomly tried at each split? That's another hyperparameter of a RF.
# The default is mtry = sqrt(p), where p is the number of variables in x.
# In our case p = length(Dfm_train@Dimnames$features) = 1058,
# and sqrt(length(Dfm_train@Dimnames$features)) = 32.5, which is rounded down to 32

# A natural benefit of the bootstrap resampling process is that random forests have an out-of-bag (OOB) sample
# that provides an efficient and reasonable approximation of the test error. This gives you a built-in validation set
# without any extra work on your part, and you do not need to sacrifice any of your training data for validation.
# This makes identifying the number of trees required to stabilize the error rate during tuning more efficient;
# however, REMEMBER: this is a weaker form of validation than tuning several hyperparameters via k-fold
# cross-validation, as we will see later on!
RF
# why OOB = 20.6%? This estimate is calculated by counting how many points in the training set were misclassified
# out-of-bag (67 negative and 36 positive = 103) and dividing this number by the total number of observations
# (103/500 = 20.6%). A small sanity check follows right after the RF2 fit below.
# We will discuss the Confusion matrix at great length next week!
plot(RF)  # shows the errors for the two classes (red = negative; green = positive) and, in black, the average OOB error
str(RF$err.rate)
error <- as.data.frame(RF$err.rate)
str(error)
# number of trees with the lowest OOB error
which.min(error$OOB)  # 432
set.seed(123)
system.time(RF2 <- randomForest(y = as.factor(Dfm_train@docvars$Sentiment), x = train,
                                importance = TRUE, ntree = 432, do.trace = TRUE))
RF2
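# A quick sanity check on the OOB figure discussed above (a sketch, not part of the original pipeline):
# randomForest stores the OOB-based predictions in RF$predicted and the OOB confusion matrix in RF$confusion,
# so the printed OOB estimate should match the share of misclassified OOB predictions.
RF$confusion  # rows = true classes, columns = OOB-predicted classes (plus the per-class error rate)
mean(RF$predicted != as.factor(Dfm_train@docvars$Sentiment))  # should reproduce the ~20.6% OOB error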
# What about the importance of each feature for our trained model?
# Variable importance is a standard metric in machine learning, which indicates the amount of information a variable
# provides to the model for predicting the outcome
head(RF$importance[,3:4])  # let's grab the result
varImpPlot(RF)
# Each feature's importance is assessed based on two criteria:
# - MeanDecreaseAccuracy: gives a rough estimate of the loss in prediction performance when that particular variable
#   is omitted from the training set. Caveat: if two variables are somewhat redundant, then omitting one of them may
#   not lead to a massive loss in prediction performance, but would make the second variable more important.
# - MeanDecreaseGini: GINI is a measure of node impurity. Think of it like this: if you use this feature to split the
#   data, how pure will the nodes be? Highest purity means that each node contains only elements of a single class.
#   Assessing the decrease in GINI when that feature is omitted leads to an understanding of how important that
#   feature is for splitting the data correctly.
# Please note that these measures are used to rank variables in terms of importance; their absolute values can be disregarded.
# The problem is that we only get, say, an overall GINI statistic, without differentiating the words that matter most for each class.
# Which are, then, the most important words for the positive label? And for the negative one?

# let's extract the matrix for GINI and Accuracy
importance_RF <- as.data.frame(RF$importance[,3:4])
str(importance_RF)
importance_RF$Feature <- row.names(importance_RF)
str(importance_RF)
# same words we get with varImpPlot(RF)
print(head(importance_RF[order(importance_RF$MeanDecreaseGini, decreasing=TRUE),]))

# let's predict the training-set and store the predictions as a new variable in the docvars of the training-set dfm
predicted_rf <- predict(RF, train, type="class")
table(predicted_rf)
Dfm_train@docvars$predRF <- ifelse(predicted_rf=="neg", 0, 1)
# NOTE: perfect prediction of the training-set!
table(Dfm_train@docvars$Sentiment, Dfm_train@docvars$predRF)

# adding the sign [0/1 according to the predicted content of the review - neg or pos]
sums <- list()
for (v in 0:1){
  sums[[v+1]] <- colSums(train[Dfm_train@docvars[,"predRF"]==v,])
}
# let's assign a word to pos/neg according to its frequency (if, for example, the word "bad" appears
# 10 times among predicted negative reviews and 5 times among predicted positive reviews,
# we classify it as pertaining to the "negative reviews" world; and so on)
sums <- do.call(cbind, sums)
sign <- apply(sums, 1, which.max)
# get the feature names
names <- dimnames(train)[[2]]
str(names)
df <- data.frame(Feature = names, sign = sign - 1, stringsAsFactors = FALSE)
str(df)
importance <- merge(importance_RF, df, by="Feature")
str(importance)

## best predictors
for (v in 0:1){
  cat("\n\n")
  cat("value==", v)
  importance <- importance[order(importance$MeanDecreaseGini, decreasing=TRUE),]
  print(head(importance[importance$sign==v,], n=10))
  cat("\n")
  cat(paste(unique(head(importance$Feature[importance$sign==v], n=10)), collapse=", "))
}

# let's draw a graph with our results!
# first of all, let's recode the MeanDecreaseGini of the negative features as negative values
importance$Gini <- ifelse(importance$sign>0, importance$MeanDecreaseGini, importance$MeanDecreaseGini*-1)
# the top 20 positive words
importance2 <- top_n(importance, 20, Gini)
importance2
# the top 20 negative words
importance3 <- top_n(importance, -20, Gini)
importance3
importance_new <- rbind(importance2, importance3)
str(importance_new)
# reorder the features
importance_new <- mutate(importance_new, Feature = reorder(Feature, Gini))
importance_new$sign2 <- ifelse(importance_new$sign>0, "positive", "negative")
importance_new <- mutate(importance_new, Feature = reorder(Feature, Gini))

ggplot(importance_new, aes(Feature, Gini, fill = sign2)) +
  geom_col(show.legend = FALSE) +
  coord_flip() +
  ylab("Mean Decrease Gini (values for the Negative features recoded as negative)") +
  scale_fill_manual(values = c("orange", "blue")) +
  labs(title = "Movie Reviews", subtitle = "Negative (-) versus Positive (+) features - RF")

#### and with 3 categories? Try to find that out by yourself!

# let's FINALLY predict the test-set
system.time(predicted_rf <- predict(RF, test, type="class"))
table(predicted_rf)
prop.table(table(predicted_rf))
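# As mentioned at the start of the RF section, ranger is a (usually faster) alternative implementation that also
# accepts the compressed dgCMatrix created in the fourth step. A minimal sketch, assuming a reasonably recent
# ranger version with the x/y interface ("RF_ranger" is an illustrative name; the settings are not tuned):
library(ranger)
set.seed(123)
RF_ranger <- ranger(x = trainSP, y = as.factor(Dfm_train@docvars$Sentiment),
                    num.trees = 500, importance = "impurity")
RF_ranger$prediction.error                                     # OOB error, comparable to randomForest's estimate
head(sort(RF_ranger$variable.importance, decreasing = TRUE))   # impurity-based (Gini) variable importance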
#####################################################
# let's run a SVM
#####################################################

# note that here I select a linear kernel (in my experience a linear kernel does fine with text data)
# and, as a hyperparameter, a specific value for the cost C (=1). A SVM also has other hyperparameters.
# More on this later on
set.seed(123)  # define a set.seed to be able to replicate the results!
system.time(SV <- svm(y = as.factor(Dfm_train@docvars$Sentiment), x = train, kernel = 'linear', cost = 1))

# how many support vectors?
length(SV$index)
nrow(train)  # 352 out of 500 texts in the train data frame
head(SV$coefs)
# The coefficients you are pulling out are the weights for the support vectors.
# Looking at the estimated coefficients alone is not very informative, because they only tell us which support vectors
# were estimated in the model. But we can get a sense of which observations are more "important", i.e. "separate" the
# data better, by extracting the support vectors from the data matrix together with their corresponding coefficients
# (times the training labels):

# let's predict the training-set and store the predictions as a new variable in the docvars of the training-set dfm
predicted_sv <- predict(SV, train, type="class")
table(predicted_sv)
Dfm_train@docvars$predSV <- ifelse(predicted_sv=="neg", 0, 1)
# Once again, perfect prediction of the training-set!
table(Dfm_train@docvars$Sentiment, Dfm_train@docvars$predSV)

# let's identify these support vectors (and their corresponding texts!)
str(x)
df <- data.frame(
  vector = x$text[SV$index],
  coef = SV$coefs,
  sentiment = predicted_sv[SV$index],
  stringsAsFactors = FALSE
)
str(df)
str(SV)
# take a look at "decision.values": you can read "neg/pos".
# A positive coefficient means in this case a "negative" review (the numerator) and vice versa
# (a small sketch extracting the decision values follows below)

# negative reviews (positive values)
df <- df[order(df$coef, decreasing=TRUE),]
head(df[,c("coef", "sentiment", "vector")], n=10)
# positive reviews (negative values)
df <- df[order(df$coef),]
head(df[,c("coef", "sentiment", "vector")], n=10)
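# The "neg/pos" decision values mentioned above can also be inspected directly (a minimal sketch;
# "pred_dv" is just an illustrative object name). For a two-class problem, e1071 returns one decision value per
# text; its sign should track the class order shown in the "neg/pos" column name.
pred_dv <- predict(SV, train, decision.values = TRUE)
head(attr(pred_dv, "decision.values"))                        # positive values should lean towards "neg", negative towards "pos"
table(sign(attr(pred_dv, "decision.values")), predicted_sv)   # the sign lines up with the predicted labels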
# compute the feature importance matrix
str(SV$coefs)  # coefficients for the support vectors
str(SV$SV)     # the scaled coordinates of the support vectors (for each feature)
dtb <- as.data.frame(SV$SV)
str(dtb)
W <- t(SV$coefs) %*% SV$SV  # let's multiply the two sets of coefficients
str(W)
W <- t(W)
str(W)
# The weights estimated this way indicate which features best predict the source (in our case the type of movie
# review, either positive or negative), i.e. the features that are most "telling" of the type of review.
W <- as.data.frame(W)
W$Feature <- row.names(W)
names(W)[1] <- "weights"
str(W)
# let's recode the weights so that features associated with the Positive class take positive values and vice versa for the Negative class
W$weights <- W$weights*-1
W$sign <- ifelse(W$weights>0, "positive", "negative")
print(head(W[order(W$weights, decreasing=TRUE),], 20))   # positive words
print(head(W[order(W$weights, decreasing=FALSE),], 20))  # negative words

# let's draw a graph!
W2 <- top_n(W, 20, weights)
str(W2)
W3 <- top_n(W, -20, weights)
str(W3)
W3
W_new <- rbind(W2, W3)
str(W_new)
# reorder the features
W_new <- mutate(W_new, Feature = reorder(Feature, weights))

ggplot(W_new, aes(Feature, weights, fill = sign)) +
  geom_col(show.legend = FALSE) +
  coord_flip() +
  ylab("weights") +
  scale_fill_manual(values = c("orange", "blue")) +
  labs(title = "Movie Reviews", subtitle = "Negative (-) versus Positive (+) words - SVM")

# let's FINALLY predict the test-set
system.time(predicted_svm <- predict(SV, test))
table(predicted_svm)
prop.table(table(predicted_svm))

######################################################
######################################################
# Let's compare the out-of-sample results we got via Naive Bayes, SVM & RF
######################################################
######################################################

prop.table(table(predicted_nb))
prop.table(table(predicted_svm))
prop.table(table(predicted_rf))

results <- as.data.frame(rbind(prop.table(table(predicted_nb)),
                               prop.table(table(predicted_rf)),
                               prop.table(table(predicted_svm))))
str(results)
results$algorithm <- c("NB", "RF", "SVM")
str(results)

# Let's plot the results!
df.long <- melt(results, id.vars=c("algorithm"))
str(df.long)

ggplot(df.long, aes(algorithm, value, fill = variable)) +
  geom_bar(position="dodge", stat="identity") +
  theme(axis.text.x = element_text(color="#993333", size=10, angle=90)) +
  coord_flip() +
  ylab(label="Sentiment class in the test-set") +
  xlab("algorithm")

# And so, which result should we trust more?
# In our case we have the "true" sentiment values for the test-set, so we can actually tell!
# This is an exception, of course: the test-set is BY DEFINITION unlabeled!
# How can we compare the true values with the predicted ones?
# Let's estimate the MAE - mean absolute error (just to get a rough idea of the algorithms' performance).
# HOWEVER, remember: this is NOT the most appropriate way to measure (and compare) the performance of ML algorithms
# that estimate the individual classification of texts. A much better alternative (as we will discuss next week) is
# computing a Confusion matrix and the accuracy/F1 statistics (more on that next week!)
prop.table(table(x10$Sentiment))
mae <- as.data.frame(rbind(prop.table(table(predicted_nb)),
                           prop.table(table(predicted_rf)),
                           prop.table(table(predicted_svm)),
                           prop.table(table(x10$Sentiment))))
str(mae)
mae$algorithm <- c("NB", "RF", "SVM", "TRUE")
mae
mae_NB <- (abs(mae[1,1]-mae[4,1]) + abs(mae[1,2]-mae[4,2]))/2
mae_RF <- (abs(mae[2,1]-mae[4,1]) + abs(mae[2,2]-mae[4,2]))/2
mae_SV <- (abs(mae[3,1]-mae[4,1]) + abs(mae[3,2]-mae[4,2]))/2
# NB appears to do better, followed by SVM and RF
mae_NB
mae_RF
mae_SV
# We will come back to this point in the next class, when we discuss Cross-Validation and ML!
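# A minimal preview of next week's topic (a sketch, assuming the test labels in x10$Sentiment use the same
# "neg"/"pos" coding as the training-set): the share of test reviews classified correctly by each algorithm.
mean(as.character(predicted_nb)  == x10$Sentiment)   # NB accuracy
mean(as.character(predicted_rf)  == x10$Sentiment)   # RF accuracy
mean(as.character(predicted_svm) == x10$Sentiment)   # SVM accuracy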