rm(list=ls(all=TRUE))
setwd("C:/Users/luigi/Dropbox/TOPIC MODEL")
getwd()

library(quanteda)
library(readtext)
library(caTools)
library(e1071)
library(randomForest)
library(caret)
library(naivebayes)
library(car)
library(cvTools)
library(reshape2)
library(dplyr)
# also needed below (plots, SVM permutation importance, parallel back-end)
library(ggplot2)
library(iml)
library(future)
library(future.callr)

#####################################################
# let's prepare the training-set with 3 categories
# (this script works fine for any number of categories > 2)
#####################################################

uk_train <- read.csv("uk_train2022.csv")
str(uk_train)

myCorpusTwitterTrain <- corpus(uk_train)
tok2 <- tokens(myCorpusTwitterTrain, remove_punct = TRUE, remove_numbers = TRUE,
               remove_symbols = TRUE, split_hyphens = TRUE,
               remove_separators = TRUE, remove_url = TRUE)
tok2 <- tokens_remove(tok2, stopwords("en"))
tok2 <- tokens_wordstem(tok2)
Dfm_train <- dfm(tok2)

# Let's trim the dfm in order to keep only tokens that appear in 2 or more tweets
# (tweets are very short texts...)
Dfm_train <- dfm_trim(Dfm_train, min_docfreq = 2, verbose = TRUE)
topfeatures(Dfm_train, 20)  # 20 top words

# Always check whether, after trimming, some texts are left with only 0s!
# 7 texts in this case
Dfm_train[ntoken(Dfm_train) == 0, ]
Dfm_train <- Dfm_train[ntoken(Dfm_train) != 0, ]
Dfm_train[ntoken(Dfm_train) == 0, ]

# our classes
table(Dfm_train@docvars$sentiment)
# our benchmark: accuracy .57 (i.e., always predicting the most frequent class)
prop.table(table(Dfm_train@docvars$sentiment))

train <- as.matrix(Dfm_train)

######################################################
######################################################
# Which are the main changes compared to the previous scripts?
######################################################
######################################################
# First, during the CV stage:
# consider the case of an SVM - but the same applies of course to all the other ML algorithms
######################################################
######################################################

# STEP 1: create the 5 folds
ttrain <- train
nrow(ttrain)
set.seed(1234)  # set the seed for replicability
k <- 5
folds <- cvFolds(NROW(ttrain), K = k)
str(folds)

# STEP 2: the LOOP
system.time(for (i in 1:k) {
  train <- ttrain[folds$subsets[folds$which != i], ]       # set the training set
  validation <- ttrain[folds$subsets[folds$which == i], ]  # set the validation set
  # fit the model on the training folds only, passing the outcome (here: "sentiment")
  newrf <- svm(y = as.factor(Dfm_train[folds$subsets[folds$which != i], ]@docvars$sentiment),
               x = train, kernel = 'linear', cost = 1)
  # get the predictions for the validation set (from the model just fit on the training folds)
  newpred <- predict(newrf, newdata = validation)
  class_table <- table("Predictions" = newpred,
                       "Actual" = Dfm_train[folds$subsets[folds$which == i], ]@docvars$sentiment)
  print(class_table)
  df <- confusionMatrix(class_table, mode = "everything")
  df.name <- paste0("conf.mat.sv", i)  # name of the object that stores the confusion matrix of each fold (=5)
  assign(df.name, df)
})

# STEP 3: the metrics
SVMPredict <- data.frame(col1 = vector(), col2 = vector(), col3 = vector(), col4 = vector())

##### FIRST CHANGE
# Why 4 columns NOW? 1 for accuracy, and 3 for the F1 value of each class of the
# sentiment variable: negative, neutral, positive.
# The number of columns to fill changes according to the number of classes in your output variable!
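# [Aside, not in the original script] A sketch of a class-count-agnostic alternative:
# instead of hard-coding one F1 column per class, collect the accuracy plus the whole
# per-class F1 vector of each fold and bind the rows. It only assumes the
# "conf.mat.sv" objects created by the loop above.
fold_metrics <- function(cm) {
  c(Accuracy = unname(cm$overall["Accuracy"]),
    F1 = as.data.frame(cm$byClass)$F1)  # one F1 entry per class, whatever their number
}
do.call(rbind, lapply(mget(ls(pattern = "conf.mat.sv")), fold_metrics))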
for (i in mget(ls(pattern = "conf.mat.sv"))) {
  Accuracy <- i$overall[1]  # save the accuracy value of the fold
  ##### SECOND CHANGE: the following 4 lines
  p <- as.data.frame(i$byClass)
  F1_negative <- p$F1[1]  # save the F1 value for the negative class
  F1_neutral  <- p$F1[2]  # save the F1 value for the neutral class
  F1_positive <- p$F1[3]  # save the F1 value for the positive class
  SVMPredict <- rbind(SVMPredict, cbind(Accuracy, F1_negative, F1_neutral, F1_positive))
}
str(SVMPredict)
# you can see that we are not doing that well with the class "negative"

# let's compare the average value for accuracy and F1
acc_sv_avg <- mean(SVMPredict[, 1])
f1_sv_avg <- mean(colMeans(SVMPredict[-1]))
acc_sv_avg
f1_sv_avg
# you see here that we do not improve that much compared to our benchmark model.
# Moreover, there is a wide gap between accuracy and the avg. value of F1: why?
# Because we are doing relatively badly with the "negative" class
mean(colMeans(SVMPredict[2]))
mean(colMeans(SVMPredict[3]))
mean(colMeans(SVMPredict[4]))

######################################################
######################################################
# Second: the global interpretation of your results [you can go through this part on your own]
######################################################
######################################################

#####################################################
# Let's start with a Naive Bayes model
#####################################################
train <- as.matrix(Dfm_train)
set.seed(123)
system.time(NB22 <- multinomial_naive_bayes(x = train,
                                            y = as.factor(Dfm_train@docvars$sentiment),
                                            laplace = 1))
summary(NB22)

NB_prob <- as.data.frame(NB22$params)
NB_prob$Feature <- row.names(NB_prob)
# let's identify, for each feature, the max value among the negative, neutral and positive conditional probabilities
NB_prob$winner <- apply(NB_prob[c(1:3)], 1, FUN = max)
str(NB_prob)
NB_prob$sentiment <- ifelse(NB_prob$winner == NB_prob$negative, "negative",
                            ifelse(NB_prob$winner == NB_prob$positive, "positive", "neutral"))
str(NB_prob)

negatives <- NB_prob[which(NB_prob$sentiment == "negative"), ]
positives <- NB_prob[which(NB_prob$sentiment == "positive"), ]
neutrals  <- NB_prob[which(NB_prob$sentiment == "neutral"), ]
# the features with the largest conditional probability within each class
print(head(negatives[order(negatives$winner, decreasing = TRUE), ], 15))  # negative words
print(head(positives[order(positives$winner, decreasing = TRUE), ], 15))  # positive words
print(head(neutrals[order(neutrals$winner, decreasing = TRUE), ], 15))    # neutral words

# let's extract the 15 most negative, 15 most positive and 15 most neutral contributing features
df1 <- top_n(negatives, 15, winner)
df2 <- top_n(positives, 15, winner)
df3 <- top_n(neutrals, 15, winner)
NB_prob_new <- rbind(df1, df2, df3)

# reorder the features
NB_prob_new <- mutate(NB_prob_new, Feature = reorder(Feature, winner))
ggplot(NB_prob_new, aes(Feature, winner, fill = sentiment)) +
  geom_bar(stat = "identity", fill = "white") +
  theme_classic() +
  geom_col(show.legend = TRUE) +
  coord_flip() +
  theme(legend.position = "bottom") +
  ylab("Conditional probabilities") +
  scale_fill_manual(values = c("#000033", "#006699", "#99CCFF")) +
  labs(title = "Tweets about uk 2014 EP elections",
       subtitle = "positive versus neutral versus negative words - Naive Bayes Model")
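# [Aside, not in the original script] As a quick sanity check, one can also look at the
# in-sample predictions of the Naive Bayes model (the same kind of check is done below
# for the Random Forest and the SVM); predict() for multinomial_naive_bayes accepts the
# same matrix used for training:
predicted_nb <- predict(NB22, train)
table("Predictions" = predicted_nb, "Actual" = Dfm_train@docvars$sentiment)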
# let's reorder the features, but differentiating according to the category (positive, negative and neutral)
NB_prob_new$Feature <- with(NB_prob_new,
                            factor(Feature, levels = Feature[order(ave(winner, sentiment, FUN = max), winner)]))
ggplot(NB_prob_new, aes(Feature, winner, fill = sentiment)) +
  geom_bar(stat = "identity", fill = "white") +
  theme_classic() +
  geom_col(show.legend = TRUE) +
  coord_flip() +
  theme(legend.position = "bottom") +
  ylab("Conditional probabilities") +
  scale_fill_manual(values = c("#000033", "#006699", "#99CCFF")) +
  labs(title = "Tweets about uk 2014 EP elections",
       subtitle = "positive versus neutral versus negative words - Naive Bayes Model")

NB_graph <- ggplot(NB_prob_new, aes(Feature, winner, fill = sentiment)) +
  geom_bar(stat = "identity", fill = "white") +
  theme_classic() +
  geom_col(show.legend = TRUE) +
  coord_flip() +
  theme(legend.position = "bottom") +
  ylab("Conditional probabilities") +
  scale_fill_manual(values = c("#000033", "#006699", "#99CCFF")) +
  labs(title = "Tweets about uk 2014 EP elections",
       subtitle = "positive versus neutral versus negative words - Naive Bayes Model")

#####################################################
# let's run a Random Forest
#####################################################
set.seed(123)  # define a seed to be able to replicate the results!
system.time(RF <- randomForest(y = as.factor(Dfm_train@docvars$sentiment), x = train,
                               importance = TRUE, do.trace = TRUE, ntree = 500))
RF
str(RF$err.rate)
plot(RF, type = "l", col = c("red", "black", "springgreen4", "blue"),
     main = "Class errors in Random Forest Model / OOB")
legend("topright", horiz = FALSE, cex = 0.7,
       fill = c("red", "black", "springgreen4", "blue"),
       c("OOB", "negative", "neutral", "positive"))

head(RF$importance[, 4:5])
varImpPlot(RF)
# let's extract the matrix for Gini and Accuracy
importance_RF <- as.data.frame(RF$importance[, 4:5])
str(importance_RF)
importance_RF$Feature <- row.names(importance_RF)
str(importance_RF)
# same words we get with varImpPlot(RF)
print(head(importance_RF[order(importance_RF$MeanDecreaseGini, decreasing = TRUE), ]))

predicted_rf <- predict(RF, train, type = "class")
table(predicted_rf)
Dfm_train@docvars$predRF <- ifelse(predicted_rf == "negative", 0,
                                   ifelse(predicted_rf == "neutral", 1, 2))
table(Dfm_train@docvars$predRF)

# adding the sign [0/1/2 according to the predicted class of the tweet: negative (0), neutral (1), positive (2)]
sums <- list()
for (v in 0:2) {
  sums[[v + 1]] <- colSums(train[Dfm_train@docvars[, "predRF"] == v, ], na.rm = TRUE)
}
sums <- do.call(cbind, sums)
# each feature is assigned to the predicted class in whose documents it occurs most often
sign <- apply(sums, 1, which.max)

# get the feature names
names <- dimnames(train)[[2]]
str(names)
df <- data.frame(Feature = names, sign = sign - 1, stringsAsFactors = FALSE)
str(df)

importance <- merge(importance_RF, df, by = "Feature")
str(importance)
table(importance$sign)

## best predictors per class
for (v in 0:2) {
  cat("\n\n")
  cat("value==", v)
  importance <- importance[order(importance$MeanDecreaseGini, decreasing = TRUE), ]
  print(head(importance[importance$sign == v, ], n = 10))
  cat("\n")
  cat(paste(unique(head(importance$Feature[importance$sign == v], n = 10)), collapse = ", "))
}
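# [Aside, not in the original script] the OOB confusion matrix behind the error-rate
# plot drawn above is stored in the randomForest object and can be printed directly:
RF$confusion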
# let's draw a graph with our results!
str(importance)
negatives <- importance[which(importance$sign == 0), ]
neutrals  <- importance[which(importance$sign == 1), ]
positives <- importance[which(importance$sign == 2), ]

df1 <- top_n(negatives, 10, MeanDecreaseGini)
df2 <- top_n(positives, 10, MeanDecreaseGini)
df3 <- top_n(neutrals, 10, MeanDecreaseGini)
RF_prob_new <- rbind(df1, df2, df3)
str(RF_prob_new)
RF_prob_new$sign2 <- factor(RF_prob_new$sign, labels = c("Negative", "Neutral", "Positive"))
str(RF_prob_new)
table(RF_prob_new$sign)
table(RF_prob_new$sign2)

# reorder the features
RF_prob_new <- mutate(RF_prob_new, Feature = reorder(Feature, MeanDecreaseGini))
ggplot(RF_prob_new, aes(Feature, MeanDecreaseGini, fill = sign2)) +
  geom_bar(stat = "identity", fill = "white") +
  theme_classic() +
  geom_col(show.legend = TRUE) +
  theme(legend.position = "bottom") +
  coord_flip() +
  ylab("Mean Decrease Gini") +
  labs(title = "Tweets about uk 2014 EP elections",
       subtitle = "negative versus neutral versus positive features - Random Forest Model",
       fill = "sentiment") +
  scale_fill_manual(values = c("#003333", "#009999", "steelblue"))

# let's reorder the features, but differentiating according to the category (positive, negative and neutral)
RF_prob_new$Feature <- with(RF_prob_new,
                            factor(Feature, levels = Feature[order(ave(MeanDecreaseGini, sign2, FUN = max), MeanDecreaseGini)]))
ggplot(RF_prob_new, aes(Feature, MeanDecreaseGini, fill = sign2)) +
  geom_bar(stat = "identity", fill = "white") +
  theme_classic() +
  geom_col(show.legend = TRUE) +
  theme(legend.position = "bottom") +
  coord_flip() +
  ylab("Mean Decrease Gini") +
  labs(title = "Tweets about uk 2014 EP elections",
       subtitle = "negative versus neutral versus positive features - Random Forest Model",
       fill = "sentiment") +
  scale_fill_manual(values = c("#003333", "#009999", "steelblue"))

RF_graph <- ggplot(RF_prob_new, aes(Feature, MeanDecreaseGini, fill = sign2)) +
  geom_bar(stat = "identity", fill = "white") +
  theme_classic() +
  geom_col(show.legend = TRUE) +
  theme(legend.position = "bottom") +
  coord_flip() +
  ylab("Mean Decrease Gini") +
  labs(title = "Tweets about uk 2014 EP elections",
       subtitle = "negative versus neutral versus positive features - Random Forest Model",
       fill = "sentiment") +
  scale_fill_manual(values = c("#003333", "#009999", "steelblue"))
#####################################################
# Let's estimate a SVM
#####################################################
set.seed(123)  # define a seed to be able to replicate the results!
system.time(SV <- svm(y = as.factor(Dfm_train@docvars$sentiment), x = train,
                      kernel = 'linear', cost = 1))
length(SV$index)

# permutation importance via the iml package
# (Predictor$new needs a data frame with syntactic column names)
train2 <- convert(Dfm_train, to = "data.frame")
colnames(train2) <- make.names(colnames(train2), unique = TRUE)
# rerun the SVM with the data frame (dropping the doc_id column)
set.seed(123)
tree2 <- svm(as.factor(Dfm_train@docvars$sentiment) ~ ., data = train2[-1], kernel = 'linear', cost = 1)
mod <- Predictor$new(tree2, data = train2[-1], y = as.factor(Dfm_train@docvars$sentiment), type = "prob")
system.time({
  plan("callr", workers = 6)
  set.seed(123)
  imp2 <- FeatureImp$new(mod, loss = "ce", n.repetitions = 1)
})
imp2
res <- imp2$results

colnames(train) <- make.names(colnames(train), unique = TRUE)
predicted_sv <- predict(tree2, train, type = "class")
table(predicted_sv)
Dfm_train@docvars$predSV <- ifelse(predicted_sv == "negative", 0,
                                   ifelse(predicted_sv == "neutral", 1, 2))
table(Dfm_train@docvars$predSV)

# adding the sign [0/1/2 according to the predicted class of the tweet: negative (0), neutral (1), positive (2)]
sums <- list()
for (v in 0:2) {
  sums[[v + 1]] <- colSums(train[Dfm_train@docvars[, "predSV"] == v, ], na.rm = TRUE)
}
sums <- do.call(cbind, sums)
sign <- apply(sums, 1, which.max)

# get the feature names
names <- dimnames(train)[[2]]
str(names)
df <- data.frame(feature = names, sign = sign - 1, stringsAsFactors = FALSE)
str(df)
str(res)

importanceSVM <- merge(res, df, by = "feature")
str(importanceSVM)
names(importanceSVM)[3] <- "importanceSVM"
str(importanceSVM)

## best predictors
for (v in 0:2) {
  cat("\n\n")
  cat("value==", v)
  importanceSVM <- importanceSVM[order(importanceSVM$importanceSVM, decreasing = TRUE), ]
  print(head(importanceSVM[importanceSVM$sign == v, ], n = 10))
  cat("\n")
  cat(paste(unique(head(importanceSVM$feature[importanceSVM$sign == v], n = 10)), collapse = ", "))
}

negatives <- importanceSVM[which(importanceSVM$sign == 0), ]
neutrals  <- importanceSVM[which(importanceSVM$sign == 1), ]
positives <- importanceSVM[which(importanceSVM$sign == 2), ]
df1 <- top_n(negatives, 10, importanceSVM)
df2 <- top_n(positives, 10, importanceSVM)
df3 <- top_n(neutrals, 10, importanceSVM)
SV_prob_new <- rbind(df1, df2, df3)
str(SV_prob_new)
SV_prob_new$sign2 <- factor(SV_prob_new$sign, labels = c("Negative", "Neutral", "Positive"))
str(SV_prob_new)
table(SV_prob_new$sign)
table(SV_prob_new$sign2)

# reorder the features
SV_prob_new <- mutate(SV_prob_new, feature = reorder(feature, importanceSVM))
ggplot(SV_prob_new, aes(feature, importanceSVM, fill = sign2)) +
  geom_bar(stat = "identity", fill = "white") +
  theme_classic() +
  geom_col(show.legend = TRUE) +
  theme(legend.position = "bottom") +
  coord_flip() +
  ylab("Permutation Results") +
  labs(title = "Tweets about uk 2014 EP elections",
       subtitle = "negative versus neutral versus positive features - SVM Model",
       fill = "sentiment") +
  scale_fill_manual(values = c("#003333", "#009999", "steelblue"))

# let's reorder the features, but differentiating according to the category (positive, negative and neutral)
SV_prob_new$feature <- with(SV_prob_new,
                            factor(feature, levels = feature[order(ave(importanceSVM, sign2, FUN = max), importanceSVM)]))
ggplot(SV_prob_new, aes(feature, importanceSVM, fill = sign2)) +
  geom_bar(stat = "identity", fill = "white") +
  theme_classic() +
  geom_col(show.legend = TRUE) +
  theme(legend.position = "bottom") +
  coord_flip() +
  ylab("Permutation Results") +
  labs(title = "Tweets about uk 2014 EP elections",
       subtitle = "negative versus neutral versus positive features - SVM Model",
       fill = "sentiment") +
  scale_fill_manual(values = c("#003333", "#009999", "steelblue"))

SVM_graph <- ggplot(SV_prob_new, aes(feature, importanceSVM, fill = sign2)) +
  geom_bar(stat = "identity", fill = "white") +
  theme_classic() +
  geom_col(show.legend = TRUE) +
  theme(legend.position = "bottom") +
  coord_flip() +
  ylab("Permutation Results") +
  labs(title = "Tweets about uk 2014 EP elections",
       subtitle = "negative versus neutral versus positive features - SVM Model",
       fill = "sentiment") +
  scale_fill_manual(values = c("#003333", "#009999", "steelblue"))
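# [Aside, not in the original script] a caveat on the permutation step above:
# n.repetitions = 1 keeps FeatureImp fast but noisy; if time allows, repeating the
# permutations gives more stable importance estimates (this re-runs the heavy step):
set.seed(123)
imp_stable <- FeatureImp$new(mod, loss = "ce", n.repetitions = 5)
head(imp_stable$results, 10)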
#################################################
# let's compare feature importance across models
#################################################
library(gridExtra)
grid.arrange(NB_graph, RF_graph, SVM_graph, nrow = 2)  # plot everything together

# let's check some correlations
str(NB_prob)
str(importance)
str(importanceSVM)
names(importanceSVM)[1] <- "Feature"
# restore the "@" prefix that make.names() turned into "X." so that feature names match again
importanceSVM$Feature <- gsub("^X\\.", "@", importanceSVM$Feature)
str(importanceSVM)

# let's merge the data frames
words <- merge(NB_prob, importance, by = "Feature")
words <- merge(words, importanceSVM, by = "Feature")
str(words)

library(PerformanceAnalytics)
# NB winner, RF MeanDecreaseAccuracy/MeanDecreaseGini, SVM permutation importance
chart.Correlation(words[c(5, 7:8, 11)])
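# [Aside, not in the original script] since the three importance measures live on very
# different scales, a rank-based correlation can be informative as well:
chart.Correlation(words[c(5, 7:8, 11)], method = "spearman")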