rm(list=ls(all=TRUE))
getwd()
# setwd("C:/Users/mw/Dropbox (VOICES)/TOPIC MODEL")
setwd("C:/Users/luigi/Dropbox/TOPIC MODEL")
getwd()
library(quanteda)
library(readtext)
library(caTools)
library(caret)
library(car)
library(ggplot2)
library(dplyr)
library(reshape2)
library(glmnet)
#####################################################
# FIRST STEP: let's create the DfM for the training-set
#####################################################
# let's focus on MOVIE reviews (either positive or negative)
x <- read.csv("train_review2.csv", stringsAsFactors=FALSE)
str(x)
nrow(x)
table(x$Sentiment)
prop.table(table(x$Sentiment))
myCorpusTwitterTrain <- corpus(x)
Dfm_train <- dfm(myCorpusTwitterTrain, remove = stopwords("english"), remove_punct = TRUE, remove_numbers = TRUE,
tolower = TRUE, remove_symbols = TRUE, remove_separators = TRUE, remove_url = TRUE, split_hyphens = TRUE)
# Let's trim the dfm in order to keep only tokens that appear in at least 5% of the reviews
Dfm_train <- dfm_trim(Dfm_train , min_docfreq = 0.05, verbose=TRUE, docfreq_type = "prop")
topfeatures(Dfm_train, 20) # 20 top words
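# A quick sanity check on the trim (a minimal sketch using quanteda's docfreq() and ndoc()):
# after dfm_trim() every remaining feature should appear in at least 5% of the training documents
min(docfreq(Dfm_train) / ndoc(Dfm_train)) # should be >= 0.05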
#####################################################
# SECOND STEP: let's create the DfM for the test-set
#####################################################
x10 <- read.csv("test_review2.csv", stringsAsFactors=FALSE)
str(x10)
nrow(x10)
myCorpusTwitterTest <- corpus(x10)
Dfm_test <- dfm(myCorpusTwitterTest, remove = stopwords("english"), remove_punct = TRUE, remove_numbers = TRUE,
tolower = TRUE, remove_symbols = TRUE, remove_separators = TRUE, remove_url = TRUE, split_hyphens = TRUE)
Dfm_test <- dfm_trim(Dfm_test, min_docfreq = 0.05, verbose = TRUE, docfreq_type = "prop")
topfeatures(Dfm_test, 20) # 20 top words
#####################################################
# THIRD STEP: let's make the features identical between training- and test-set by passing Dfm_train
# to dfm_match() as a pattern. After this step, we can "predict" by employing only the features
# included in the training-set! This also makes the DfM for the test-set smaller when you have a giant
# test-set compared to the training-set (which, of course, is not the case here!)
#####################################################
setequal(featnames(Dfm_train), featnames(Dfm_test))
nfeat(Dfm_test) # number of features in the test-set DfM
nfeat(Dfm_train) # number of features in the training-set DfM
test_dfm <- dfm_match(Dfm_test, features = featnames(Dfm_train))
nfeat(test_dfm) # now the same number of features as the training-set DfM
setequal(featnames(Dfm_train), featnames(test_dfm))
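# A tiny, self-contained toy example of what dfm_match() is doing (hypothetical mini-texts,
# purely for illustration): features absent from the "training" dfm are dropped, and features
# present only there are added as zero-count columns
toy_train <- dfm(tokens(c(d1 = "good movie", d2 = "bad movie")))
toy_test <- dfm(tokens(c(d3 = "good but awful plot")))
featnames(toy_test) # features that the toy training dfm does not have
featnames(dfm_match(toy_test, features = featnames(toy_train))) # now exactly the training features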
#####################################################
# FOURTH STEP: Let's convert the DfM into matrices for the ML algorithms to work
#####################################################
train <- as.matrix(Dfm_train)
test <- as.matrix(test_dfm)
#####################################################
# FIFTH STEP: let's estimate a ML Model
#####################################################
#####################################################
# Let's start with a Regularized regression - RIDGE model
#####################################################
# Ridge regression requires alpha=0
# As the family we specify "binomial" because our outcome variable has 2 categories (negative, positive);
# with 3 or more categories, you HAVE to use the family "multinomial"
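# A minimal, self-contained sketch of the multinomial case (SIMULATED data, not our reviews):
# the call is identical except for family="multinomial"
set.seed(1)
toy_x <- matrix(rnorm(300), nrow = 30) # 30 fake "documents", 10 fake "features"
toy_y <- as.factor(sample(c("neg", "neu", "pos"), 30, replace = TRUE)) # a 3-class outcome
toy_multi <- glmnet(y = toy_y, x = toy_x, family = "multinomial", alpha = 0)
predict(toy_multi, toy_x[1:3, ], s = min(toy_multi$lambda), type = "class") # 3-class predictions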
system.time(ROLS <- glmnet(y= as.factor(Dfm_train@docvars$Sentiment), x=train, family="binomial", alpha=0, trace.it=1))
# as you can see, as lambda increases the penalty grows and forces the coefficients closer
# (but NOT exactly equal) to zero
plot(ROLS, xvar = "lambda")
# let's predict the test-set
system.time(predicted_rr <- predict(ROLS, test,type="class"))
# why do you get 100,000 predictions? Because there are 1,000 texts in the test-set * 100 lambda values estimated by default!
table(predicted_rr)
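# We can verify this directly (a quick sketch): predict() returns one column of class
# predictions per lambda value along the default path
dim(predicted_rr) # (number of test texts) x (number of lambdas)
length(ROLS$lambda) # the lambda path estimated by default (up to 100 values)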
# So it is better to always use cv.glmnet and extract the optimal lambda (e.g., lambda.min)
set.seed(123)
system.time(ridge <- cv.glmnet(y= as.factor(Dfm_train@docvars$Sentiment), x=train,
family="binomial", alpha=0, nfolds=10, type.measure="class", trace.it=1 ))
plot(ridge)
# The plot illustrates the 10-fold CV misclassification error across the lambda values.
# Once the penalty constrains our coefficients strongly (roughly log(lambda) > 2 here), the misclassification error rises considerably.
# The number at the top of the plot (1058) just refers to the number of variables (features, in our case) in the model:
# ridge regression does not force any variable to exactly zero, so all features remain in the model.
# The first and second vertical dashed lines represent the lambda value with the minimum misclassification error and the largest
# lambda value within one standard error of that minimum, respectively
min(ridge$cvm) # minimum misclassification error
ridge$lambda.min # lambda at this minimum
log(ridge$lambda.min)
ridge$cvm[ridge$lambda == ridge$lambda.1se] # misclassification error within 1 std. error of the minimum
ridge$lambda.1se # lambda for this error
log(ridge$lambda.1se)
lse <- as.numeric(ridge$lambda.1se) # let's save the lambda in this latter case
lse
# Once you have identified your preferred lambda, you can simply use the command "predict" to apply the same model to a new data set.
# The only caveat is that you need to supply to predict an s parameter with the preferred model's lambda value
# Let's go back to our previous ROLS object
plot(ROLS, xvar = "lambda")
abline(v = log(ridge$lambda.min), col = "red", lty = "dashed")
abline(v = log(ridge$lambda.1se), col = "blue", lty = "dashed")
# these are the results when you use lambda.min
system.time(predicted_glment <- predict(ROLS, test, s = ridge$lambda.min, type="class"))
table(predicted_glment)
prop.table(table(predicted_glment))
# you get the same results if you predict directly from the cv.glmnet object estimated above
system.time(predicted_rr2 <- predict(ridge, test, s = ridge$lambda.min, type="class"))
table(predicted_rr2)
prop.table(table(predicted_rr2))
# these are the results when you use lambda.1se
system.time(predicted_glment2 <- predict(ROLS, test, s = lse, type="class"))
table(predicted_glment2)
prop.table(table(predicted_glment2))
# Why would you ever want to use lambda.1se rather than lambda.min?
# The advantage of identifying the lambda with a misclassification error within one standard error of the minimum
# becomes more obvious with the lasso and elastic-net models (see below!)
# CHECK THE COEFFICIENTS of the model with lambda.min. Why care about them?
# ML models are often considered black boxes due to their complex inner workings. However, because of their complexity,
# they are typically more accurate for predicting nonlinear, faint, or rare phenomena. Unfortunately, more accuracy often comes at the
# expense of interpretability, and interpretability is crucial. It is not enough to identify a machine learning model that optimizes
# predictive performance; understanding and trusting model results is a hallmark of good science!
# Luckily, several advancements have been made over the years to aid in interpreting ML models.
# Interpreting ML models is an emerging field that has become known as interpretable machine learning (IML).
# Approaches to model interpretability can be broadly categorized as providing global or local explanations.
# Global interpretations help us understand the inputs and their entire modeled relationship with the prediction target.
# Local interpretations help us understand model predictions for a single row of data or a group of similar rows.
# Global interpretability is about understanding how the model makes predictions, based on a holistic view of its features and how they
# influence the underlying model structure. It answers questions regarding which features are relatively influential, as well as how these
# features influence the response variable. In the latter case, we assess the magnitude of a variable's relationship with the response
# as compared to other variables used in the model.
# In text analytics (given that we are dealing with huge DfMs wherein the features per-se are not the main focus of our interest)
# we are mainly interested in global interpretations; however, if you use ML for other aims, local interpretations become VERY
# important! (a local-interpretation sketch follows the model refit below)
# So let's do some Global Interpretation for our RR!
newROLS <- glmnet(y= as.factor(Dfm_train@docvars$Sentiment), x=train,
family="binomial", alpha=0, trace.it=1, lambda=ridge$lambda.min)
str(newROLS)
# let's create a dataframe with the features and their coefficients
coeff_df <- as.data.frame(featnames(Dfm_train))
str(coeff_df)
names(coeff_df)[1] <- "Feature"
# as.vector() keeps zero coefficients too (safer than newROLS$beta@x, which drops exact zeros)
coeff_df$coeff <- as.vector(newROLS$beta)
str(coeff_df)
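# A minimal LOCAL-interpretation sketch (as promised above; it assumes the newROLS and train
# objects): since the RR score is linear in the features, the contribution of each feature to a
# single document is simply coefficient * term count, so one review's prediction can be decomposed
# (glmnet codes the second factor level, "pos", as 1, so positive values push towards "pos")
doc1_contrib <- train[1, ] * as.vector(newROLS$beta)
head(sort(doc1_contrib, decreasing = TRUE), 10) # features pushing document 1 towards the positive class
head(sort(doc1_contrib), 10) # features pushing document 1 towards the negative class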
# From here, let's replicate what we did for RF earlier
predicted_rr <- predict(newROLS, train, type="class")
table(predicted_rr)
Dfm_train@docvars$predRR <- ifelse(predicted_rr=="neg",0,1)
table(Dfm_train@docvars$predRR)
# NOTE: no perfect prediction of the training-set
table(Dfm_train@docvars$Sentiment, Dfm_train@docvars$predRR)
# adding the sign [0/1 according to the predicted content of the review - negative or positive]
sums <- list()
for (v in 0:1){
# total counts of each feature across the reviews predicted as class v
sums[[v+1]] <- colSums(train[Dfm_train@docvars[,"predRR"]==v,])
}
sums <- do.call(cbind, sums)
# each feature gets the sign of the predicted class in which it appears most often
sign <- apply(sums, 1, which.max)
names <- dimnames(train)[[2]]
str(names)
df <- data.frame(
Feature = names,
sign = sign-1,
stringsAsFactors=F)
str(df)
str(coeff_df)
importanceRR <- merge(coeff_df, df, by="Feature")
str(importanceRR)
# let's identify the absolute value for each coefficient
summary(importanceRR$coeff)
importanceRR$coeff <- abs(importanceRR$coeff)
summary(importanceRR$coeff)
## best predictors
for (v in 0:1){
cat("\n\n")
cat("value==", v)
importanceRR<- importanceRR[order(importanceRR$coeff, decreasing=TRUE),]
print(head(importanceRR[importanceRR$sign==v,], n=10))
cat("\n")
cat(paste(unique(head(importanceRR$Feature[importanceRR$sign==v], n=10)), collapse=", "))
}
# let's draw a graph with our results focusing on such coefficients
importanceRR$sign2 <- ifelse(importanceRR$sign>0,importanceRR$coeff ,importanceRR$coeff *-1)
# the top 20 features associated with positive reviews
importance2RR <- top_n(importanceRR, 20, sign2)
importance2RR
# the top 20 features associated with negative reviews
importance3RR <- top_n(importanceRR, -20, sign2)
importance3RR
importance_newRR <- rbind(importance2RR, importance3RR)
str(importance_newRR)
# reorder the features
importance_newRR <- mutate(importance_newRR, Feature = reorder(Feature, sign2))
importance_newRR$sign3 <- ifelse(importance_newRR$sign>0, "Positive", "Negative")
ggplot(importance_newRR, aes(Feature, sign2, fill = sign3)) +
geom_col(show.legend = F) +
coord_flip() +
ylab("RR Coefficients (we recoded as negative the values for the negative reviews)") +
scale_fill_manual(values = c("orange", "blue")) +
labs(title = "Movie Reviews",
subtitle = "Negative (-) versus Positive (+) features - RR")
######################################################
######################################################
# Let's now move to a lasso regression model
######################################################
######################################################
# lasso regression requires that alpha=1
set.seed(123)
system.time(lasso <- cv.glmnet(y= as.factor(Dfm_train@docvars$Sentiment), x=train,
family="binomial", alpha=1, nfolds=10, type.measure="class", trace.it=1 ))
plot(lasso)
min(lasso$cvm) # minimum misclassification error of the lasso regression
min(ridge$cvm) # minimum misclassification error of the ridge regression
lasso$lambda.min # lambda at this value
log(lasso$lambda.min)
lasso$lambda.1se
log(lasso$lambda.1se)
# Now the advantage of identifying the lambda with a misclassification error within one standard error becomes more obvious.
# If we use the lambda that yields the minimum misclassification error, we can reduce our feature set to fewer than 109 features.
# However, there is some variability around this misclassification error, and we can reasonably assume that we would achieve
# a similar misclassification error with a slightly more constrained model that uses only 44 features.
# If describing and interpreting the predictors is an important outcome of your analysis, this may significantly aid your endeavor.
# This is not necessarily the case when doing text analytics with lots of features... still, that's your call.
# Note that whereas the ridge penalty pushes variables towards (but never exactly to) zero, the lasso penalty will actually
# push coefficients to exactly zero (a quick check of this claim follows)
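# A quick check of the sparsity claim (a sketch using the cv objects estimated above): count
# the non-zero coefficients retained at lambda.min under the two penalties
sum(coef(lasso, s = "lambda.min") != 0) # lasso: many coefficients are exactly zero
sum(coef(ridge, s = "lambda.min") != 0) # ridge: (almost) all features are retained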
system.time(ROLS_lasso <- glmnet(y= as.factor(Dfm_train@docvars$Sentiment), x=train, family="binomial", alpha=1, trace.it=1))
# as you can see, as lambda increases the penalty grows and eventually forces coefficients exactly to zero
plot(ROLS_lasso, xvar = "lambda") # with lasso
plot(ROLS, xvar = "lambda") # with ridge
######################################################
######################################################
# Let's now move to elastic nets regression model
######################################################
######################################################
# Any alpha value between 0 and 1 performs an elastic net.
# With alpha = 0.5 we apply an equal combination of the two penalties, whereas an alpha closer to 0
# puts more weight on the ridge penalty and an alpha closer to 1 puts more weight on the lasso penalty.
# Ideally you would look for the optimal values of both lambda AND alpha (see the tuning sketch after
# the elastic-net results below). In the example below, however, we fix alpha = 0.5:
# cv.glmnet will not cross-validate across values of alpha
set.seed(123)
system.time(elastic<- cv.glmnet(y= as.factor(Dfm_train@docvars$Sentiment), x=train,
family="binomial", alpha=0.5, nfolds=10, trace.it=1,
type.measure="class"))
plot(elastic)
min(elastic$cvm) # minimum misclassification error of the elastic-net regression model
elastic$lambda.min # lambda for this value
log(elastic$lambda.min)
elastic$lambda.1se
log(elastic$lambda.1se)
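# A minimal sketch of tuning alpha as well (as promised above; a hypothetical grid of 5 values,
# and it can be slow): keep the SAME folds across alphas via foldid so the CV errors are comparable
set.seed(123)
foldid <- sample(rep(1:10, length.out = nrow(train)))
for (a in c(0, 0.25, 0.5, 0.75, 1)) {
cv_a <- cv.glmnet(y = as.factor(Dfm_train@docvars$Sentiment), x = train,
family = "binomial", alpha = a, foldid = foldid, type.measure = "class")
cat("alpha =", a, "min CV misclassification error =", min(cv_a$cvm), "\n")
}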
# it seems that the ridge model is the one performing best in this case
min(ridge$cvm) # minimum misclassification error of the ridge regression
min(lasso$cvm) # minimum misclassification error of the lasso regression
min(elastic$cvm) # minimum misclassification error of the elastic-net regression model
#### let's check the MAE for the test-set (given that in this case we DO know the true class labels for each text in the test-set)
# RIDGE
system.time(ROLS <- glmnet(y= as.factor(Dfm_train@docvars$Sentiment), x=train, family="binomial", alpha=0, trace.it=1))
system.time(predicted_glmentR_min <- predict(ROLS, test, s = ridge$lambda.min, type="class"))
# LASSO
system.time(ROLS <- glmnet(y= as.factor(Dfm_train@docvars$Sentiment), x=train, family="binomial", alpha=1, trace.it=1))
system.time(predicted_glmentL_min <- predict(ROLS, test, s = lasso$lambda.min, type="class"))
# ELASTIC NETS
system.time(ROLS <- glmnet(y= as.factor(Dfm_train@docvars$Sentiment), x=train, family="binomial", alpha=0.5, trace.it=1))
system.time(predicted_glmentE_min <- predict(ROLS, test, s = elastic$lambda.min, type="class"))
mae <- as.data.frame(rbind(prop.table(table(predicted_glmentR_min)), prop.table(table(predicted_glmentL_min)),
prop.table(table(predicted_glmentE_min)), prop.table(table(x10$Sentiment))))
str(mae)
mae$algorithm <- c("Ridge", "Lasso", "Elastic", "TRUE")
# MAE here = average absolute difference between the predicted and the true class proportions
mae_Ridge <- (abs(mae[1,1]-mae[4,1]) + abs(mae[1,2]-mae[4,2]))/2
mae_Lasso <- (abs(mae[2,1]-mae[4,1]) + abs(mae[2,2]-mae[4,2]))/2
mae_Elastic <- (abs(mae[3,1]-mae[4,1]) + abs(mae[3,2]-mae[4,2]))/2
# Ridge performs best
mae_Ridge
mae_Lasso
mae_Elastic
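# Since the true test labels are known here, we can also compute plain accuracy on the test-set
# (a sketch; it assumes x10$Sentiment uses the same "neg"/"pos" labels as the predictions):
mean(predicted_glmentR_min == x10$Sentiment) # ridge accuracy
mean(predicted_glmentL_min == x10$Sentiment) # lasso accuracy
mean(predicted_glmentE_min == x10$Sentiment) # elastic-net accuracy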
######################################################
######################################################
# Let's run a 5-fold CV with a Ridge Regression (to compare it with other ML algorithms, for example)
######################################################
######################################################
# STEP 1: create the 5 folds
library(cvTools)
ttrain <- train # rename the original training matrix, since we will reuse the name "train" inside the loop below
# let's split our training-set into 5 folds
set.seed(123) # set the seed for replicability
k <- 5 # the number of folds; whatever number you choose here, the procedure below will work!
folds <- cvFolds(NROW(ttrain), K=k)
str(folds)
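# STEP 2: run the loop; in each iteration we re-tune lambda via cv.glmnet on the K-1 training folds
# and validate on the held-out fold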
system.time(for(i in 1:k){
train <- ttrain[folds$subsets[folds$which != i], ] # set the training folds
validation <- ttrain[folds$subsets[folds$which == i], ] # set the validation fold
set.seed(123)
# fit the model on the training folds only (note that we ADD the name of the outcome, "Sentiment" in this case)
newrf <- cv.glmnet(y= as.factor(Dfm_train[folds$subsets[folds$which != i], ]@docvars$Sentiment), x=train,
family="binomial", alpha=0, nfolds=10, type.measure="class", trace.it=1)
# get the predictions for the validation fold (from the model just fit on the training folds)
newpred <- predict(newrf, validation, s = newrf$lambda.min, type="class")
class_table <- table("Predictions"= newpred, "Actual"=Dfm_train[folds$subsets[folds$which == i], ]@docvars$Sentiment)
print(class_table)
df <- confusionMatrix(class_table, mode = "everything")
df.name <- paste0("conf.mat.rr", i) # the name of the object that stores the confusion matrix of each fold (5 in total)
assign(df.name, df)
})
# STEP 3: the metrics
ls()
# we have created 5 objects storing the 5 confusion matrices. We can now estimate the performance metrics from these results
# for example:
conf.mat.rr1
RRPredict <- data.frame(col1=vector(), col2=vector(), col3=vector()) # a blank data frame with three columns to fill with the results;
# why 3 columns? 1 for accuracy, and 2 for the F1 values of the classes of Sentiment (given that we have just two classes!)
for(i in mget(ls(pattern = "conf.mat.rr"))) {
Accuracy <- (i)$overall[1] # save the accuracy value
F1_negative <- (2*(i)$byClass[1]*(i)$byClass[3])/((i)$byClass[1]+(i)$byClass[3]) # F1 for the negative class: 2*recall*precision/(recall+precision)
F1_positive <- (2*(i)$byClass[2]*(i)$byClass[4])/((i)$byClass[2]+(i)$byClass[4]) # F1 for the positive class (its recall is byClass[2], its precision byClass[4])
RRPredict <- rbind(RRPredict, cbind(Accuracy, F1_negative, F1_positive))
}
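# Side note (a quick cross-check sketch): caret's confusionMatrix already reports the F1 of the
# reference class in byClass, so the manual formula above can be verified directly, e.g.:
conf.mat.rr1$byClass["F1"] # F1 for the first class ("neg"), same as our F1_negative for fold 1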
str(RRPredict)
RRPredict[is.na(RRPredict)] <- 0 # F1 can be NaN when a class is never predicted; treat those cases as 0
# Let's compare the average value for accuracy and f1
acc_rr_avg <- mean(RRPredict[, 1])
f1_rr_avg <- mean(colMeans(RRPredict[-1])) # RRPredict[-1] drops the Accuracy column, leaving the two F1 columns
acc_rr_avg
f1_rr_avg