rm(list=ls(all=TRUE))
getwd()
# setwd("C:/Users/mw/Dropbox (VOICES)/TOPIC MODEL")
setwd("C:/Users/luigi/Dropbox/TOPIC MODEL")
getwd()

library(quanteda)
library(readtext)
library(caTools)
library(e1071)
library(randomForest)
library(caret)
library(naivebayes)
library(car)
library(reshape2)
library(gridExtra)
library(cvTools)
library(ggplot2) # used for the plots at the end of the script

#####################################################
# FIRST STEP: let's prepare the training-set
#####################################################

# let's focus on MOVIE reviews (either positive or negative)
x <- read.csv("train_review2.csv", stringsAsFactors=FALSE)
str(x)
myCorpusTwitterTrain <- corpus(x)
Dfm_train <- dfm(myCorpusTwitterTrain, remove = c(stopwords("english")), remove_punct = TRUE, remove_numbers = TRUE,
                 tolower = TRUE, remove_symbols = TRUE, remove_separators = TRUE, remove_url = TRUE, split_hyphens = TRUE)
# Let's trim the dfm in order to keep only tokens that appear in at least 5% of the reviews
Dfm_train <- dfm_trim(Dfm_train, min_docfreq = 0.05, verbose = TRUE, docfreq_type = "prop")
topfeatures(Dfm_train, 20) # 20 top words
train <- as.matrix(Dfm_train)
# our classes
table(Dfm_train@docvars$Sentiment)
# our benchmark: accuracy .524 (the share of the largest class)
prop.table(table(Dfm_train@docvars$Sentiment))

######################################################
######################################################
# Let's run a CROSS-VALIDATION with k-fold=5 via a loop!
# HERE you HAVE JUST 2 classes for the outcome (negative/positive)
######################################################
######################################################

# Some R packages, such as caret, allow you to run a k-fold cross-validation with a few lines of code.
# But I want you to understand what is going on, so it is better to work with this loop! Then, if you wish, move to caret.

# STEP 1: create the 5 folds
ttrain <- train # let's rename the original training matrix, given that we will reuse the name "train" inside the loop below
# let's split our training-set into 5 folds
set.seed(123) # set the seed for replicability
k <- 5 # the number of folds; whatever number of folds you choose here, the procedure below will still work!
folds <- cvFolds(NROW(ttrain), K=k)
str(folds)

######################################################
######################################################
# Let's start with SVM
######################################################
######################################################

# STEP 2: the LOOP
system.time(for(i in 1:k){
  train <- ttrain[folds$subsets[folds$which != i], ]      # Set the training set
  validation <- ttrain[folds$subsets[folds$which == i], ] # Set the validation set
  set.seed(123)
  # Fit the model on the training folds only, passing the outcome (here "Sentiment") as y
  newrf <- svm(y = as.factor(Dfm_train[folds$subsets[folds$which != i], ]@docvars$Sentiment), x = train, kernel = 'linear', cost = 1)
  newpred <- predict(newrf, newdata = validation) # Get the predictions for the validation set (from the model just fit on the training folds)
  class_table <- table("Predictions" = newpred, "Actual" = Dfm_train[folds$subsets[folds$which == i], ]@docvars$Sentiment)
  print(class_table)
  df <- confusionMatrix(class_table, mode = "everything")
  df.name <- paste0("conf.mat.sv", i) # create the name of the object that stores the confusion matrix for each fold (=5)
  assign(df.name, df)
})

# STEP 3: the metrics
ls() # we have created 5 objects, one per fold, each storing a confusion matrix.
# I can now estimate the performance metrics from these results
# for example:
conf.mat.sv1
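# A short aside on where those numbers come from (a sketch, not needed for the loop itself):
# for a 2-class problem, caret's confusionMatrix() stores the per-class statistics in the named
# vector $byClass, computed with the FIRST factor level (here it should be "negative") treated as
# the "positive" class. So byClass[1] (Sensitivity) is the recall of "negative" and byClass[3]
# (Pos Pred Value) is its precision, while byClass[2] (Specificity) and byClass[4] (Neg Pred Value)
# are the recall and precision of "positive". That is why the F1 scores below are built from those
# four entries; with mode = "everything" the F1 of the first level is also reported directly.
conf.mat.sv1$byClass[c("Sensitivity", "Specificity", "Pos Pred Value", "Neg Pred Value", "F1")]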
SVMPredict <- data.frame(col1=vector(), col2=vector(), col3=vector()) # makes a blank data frame with three columns to fill with the per-fold metrics;
# why 3 columns? 1 for accuracy, and 2 for the F1 values of the classes in Sentiment (given that you have just two classes!)
for(i in mget(ls(pattern = "conf.mat.sv"))) {
  Accuracy <- (i)$overall[1] # save the accuracy value
  F1_negative <- (2*(i)$byClass[1]*(i)$byClass[3])/((i)$byClass[1]+(i)$byClass[3]) # save the F1 value for negative
  F1_positive <- (2*(i)$byClass[2]*(i)$byClass[4])/((i)$byClass[2]+(i)$byClass[4]) # save the F1 value for positive
  SVMPredict <- rbind(SVMPredict, cbind(Accuracy, F1_negative, F1_positive))
}
str(SVMPredict)
SVMPredict[is.na(SVMPredict)] <- 0 # if F1 is NA for some category (this happens when BOTH precision and recall for that category are 0),
# replace the NA with 0; this typically happens when the training-set is not that big, so that during the k-fold CV
# some fold contains only a few observations of a given class
str(SVMPredict)
# Let's compute the average value for accuracy and F1
acc_sv_avg <- mean(SVMPredict[, 1])
f1_sv_avg <- mean(colMeans(SVMPredict[-1]))
acc_sv_avg
f1_sv_avg

######################################################
######################################################
# Random Forest LOOP
######################################################
######################################################

# If you have not run STEP 1 above yet, do it now; otherwise skip it

# STEP 2: the LOOP
# here we fix ntree=100 to make things faster!
system.time(for(i in 1:k){
  train <- ttrain[folds$subsets[folds$which != i], ]
  validation <- ttrain[folds$subsets[folds$which == i], ]
  set.seed(123)
  newrf <- randomForest(y = as.factor(Dfm_train[folds$subsets[folds$which != i], ]@docvars$Sentiment), x = train, do.trace = TRUE, ntree = 100)
  newpred <- predict(newrf, newdata = validation, type = "class")
  class_table <- table("Predictions" = newpred, "Actual" = Dfm_train[folds$subsets[folds$which == i], ]@docvars$Sentiment)
  print(class_table)
  df <- confusionMatrix(class_table, mode = "everything")
  df.name <- paste0("conf.mat.rf", i)
  assign(df.name, df)
})

# STEP 3: the metrics
RFPredict <- data.frame(col1=vector(), col2=vector(), col3=vector())
for(i in mget(ls(pattern = "conf.mat.rf"))) {
  Accuracy <- (i)$overall[1] # save the accuracy value
  F1_negative <- (2*(i)$byClass[1]*(i)$byClass[3])/((i)$byClass[1]+(i)$byClass[3]) # save the F1 value for negative
  F1_positive <- (2*(i)$byClass[2]*(i)$byClass[4])/((i)$byClass[2]+(i)$byClass[4]) # save the F1 value for positive
  RFPredict <- rbind(RFPredict, cbind(Accuracy, F1_negative, F1_positive))
}
RFPredict[is.na(RFPredict)] <- 0
str(RFPredict)
# Let's compute the average value for accuracy and F1
acc_rf_avg <- mean(RFPredict[, 1])
f1_rf_avg <- mean(colMeans(RFPredict[-1]))
acc_rf_avg
f1_rf_avg
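# A quick optional check (a sketch, not part of the original loop): since the loop overwrites
# newrf at every iteration, after the RF loop newrf holds the forest fitted on the last fold's
# training data. Plotting a randomForest object shows how the OOB and per-class error rates
# evolve as trees are added: if the curves flatten well before 100 trees, ntree=100 is
# probably enough for this comparison.
plot(newrf, main = "OOB error vs. number of trees (model from the last fold)")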
######################################################
######################################################
# Naive Bayes LOOP
######################################################
######################################################

system.time(for(i in 1:k){
  train <- ttrain[folds$subsets[folds$which != i], ]
  validation <- ttrain[folds$subsets[folds$which == i], ]
  set.seed(123)
  newrf <- multinomial_naive_bayes(y = as.factor(Dfm_train[folds$subsets[folds$which != i], ]@docvars$Sentiment), x = train, laplace = 1)
  newpred <- predict(newrf, newdata = validation, type = "class")
  class_table <- table("Predictions" = newpred, "Actual" = Dfm_train[folds$subsets[folds$which == i], ]@docvars$Sentiment)
  print(class_table)
  df <- confusionMatrix(class_table, mode = "everything")
  df.name <- paste0("conf.mat.nb", i)
  assign(df.name, df)
})

# STEP 3: the metrics
NBPredict <- data.frame(col1=vector(), col2=vector(), col3=vector())
for(i in mget(ls(pattern = "conf.mat.nb"))) {
  Accuracy <- (i)$overall[1] # save the accuracy value
  F1_negative <- (2*(i)$byClass[1]*(i)$byClass[3])/((i)$byClass[1]+(i)$byClass[3]) # save the F1 value for negative
  F1_positive <- (2*(i)$byClass[2]*(i)$byClass[4])/((i)$byClass[2]+(i)$byClass[4]) # save the F1 value for positive
  NBPredict <- rbind(NBPredict, cbind(Accuracy, F1_negative, F1_positive))
}
NBPredict[is.na(NBPredict)] <- 0
str(NBPredict)
# Let's compute the average value for accuracy and F1
acc_nb_avg <- mean(NBPredict[, 1])
f1_nb_avg <- mean(colMeans(NBPredict[-1]))
acc_nb_avg
f1_nb_avg

######################################################
######################################################
# Let's compare the results we got via Naive Bayes, SVM and RF
######################################################
######################################################

acc_nb_avg
acc_sv_avg
acc_rf_avg
f1_nb_avg
f1_sv_avg
f1_rf_avg

# Let's plot the results!
gb1 <- as.data.frame(acc_nb_avg)
colnames(gb1)[1] <- "Accuracy NB"
gb2 <- as.data.frame(acc_sv_avg)
colnames(gb2)[1] <- "Accuracy SV"
gb3 <- as.data.frame(acc_rf_avg)
colnames(gb3)[1] <- "Accuracy RF"
ac_tot <- cbind(gb1, gb2, gb3)
ac_tot
str(ac_tot)
df.long_ac_tot <- melt(ac_tot)
str(df.long_ac_tot)
p <- ggplot(df.long_ac_tot, aes(x=variable, y=value)) + geom_boxplot() + xlab("Algorithm") + ylab(label="Value") +
  ggtitle("Naive Bayes vs. SVM vs. RF K-fold cross-validation (K=5): Accuracy") + coord_flip()

gb1 <- as.data.frame(f1_nb_avg)
colnames(gb1)[1] <- "F1 NB"
gb2 <- as.data.frame(f1_sv_avg)
colnames(gb2)[1] <- "F1 SV"
gb3 <- as.data.frame(f1_rf_avg)
colnames(gb3)[1] <- "F1 RF"
f1_tot <- cbind(gb1, gb2, gb3)
f1_tot
str(f1_tot)
df.long_f1_tot <- melt(f1_tot)
str(df.long_f1_tot)
p2 <- ggplot(df.long_f1_tot, aes(x=variable, y=value)) + geom_boxplot() + xlab("Algorithm") + ylab(label="Value") +
  ggtitle("Naive Bayes vs. SVM vs. RF K-fold cross-validation (K=5): F1") + coord_flip()

grid.arrange(p, p2, nrow=2) # Plot everything together

# So in this particular case, NB appears to do better than the other two algorithms - yes! But remember: you are employing
# a training-set of just 500 documents, and NB usually does well in this scenario. Moreover...
# Wait a minute! In this example we kept the hyperparameters fixed!
# But is there any other hyperparameter configuration for RF, SVM, and NB that would improve the model?

######################################################
######################################################
# Exercise: try to replicate the previous analysis with k-fold = 10
######################################################
######################################################
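# A hint for the exercise (a sketch using the same objects as above): only STEP 1 changes, because
# every loop above already runs over 1:k. Re-create the folds with K = 10, then re-run the three
# loops and the comparison exactly as before.
set.seed(123) # same seed for replicability
k <- 10
folds <- cvFolds(NROW(ttrain), K = k)
str(folds)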