rm(list=ls(all=TRUE))
setwd("C:/Users/luigi/Dropbox/TOPIC MODEL")
getwd()

library(quanteda)
library(readtext)
library(caTools)
library(e1071)
library(randomForest)
library(caret)
library(naivebayes)
library(car)
library(reshape2)
library(gridExtra)
library(cvTools)
library(ggplot2)  # used for the plots at the end (caret already attaches it, but being explicit does no harm)

#####################################################
# FIRST STEP: let's prepare the training set
#####################################################

# Let's focus on MOVIE reviews (our DV: either a positive or a negative movie review)
x <- read.csv("train_review2.csv", stringsAsFactors=FALSE)
str(x)

myCorpusTwitterTrain <- corpus(x)
tok2 <- tokens(myCorpusTwitterTrain, remove_punct = TRUE, remove_numbers = TRUE,
               remove_symbols = TRUE, split_hyphens = TRUE, remove_separators = TRUE)
tok2 <- tokens_remove(tok2, stopwords("en"))
tok2 <- tokens_wordstem(tok2)
Dfm_train <- dfm(tok2)

# Let's trim the dfm in order to keep only tokens that appear in at least 5% of the reviews
Dfm_train <- dfm_trim(Dfm_train, min_docfreq = 0.05, verbose = TRUE, docfreq_type = "prop")
topfeatures(Dfm_train, 20)  # 20 top words

# our classes
table(Dfm_train@docvars$Sentiment)
# our benchmark, i.e., the share of the larger class: accuracy .524
prop.table(table(Dfm_train@docvars$Sentiment))

train <- as.matrix(Dfm_train)

######################################################
######################################################
# Let's run a CROSS-VALIDATION with k-fold = 5 using a loop!
# HERE you HAVE JUST 2 classes for the outcome (negative/positive)
######################################################
######################################################

# Some R packages, such as caret, allow you to run a k-fold cross-validation with a few lines of code.
# But I want you to understand what is going on, so it is better to work with this loop first!
# Then, if you wish, move to caret (a minimal caret sketch is shown right after the SVM loop below).

# STEP 1: create the 5 folds
ttrain <- train  # let's rename the original train matrix, given that we will reuse the name "train" inside the loop below

# let's split our training set into 5 folds
set.seed(123)  # set the seed for replicability
k <- 5  # the number of folds; whatever number of folds you choose here, the procedure below will always work!
folds <- cvFolds(NROW(ttrain), K = k)
str(folds)

######################################################
######################################################
# Let's start with SVM
######################################################
######################################################

# STEP 2: the LOOP
system.time(for(i in 1:k){
  train <- ttrain[folds$subsets[folds$which != i], ]       # set the training set for this fold
  validation <- ttrain[folds$subsets[folds$which == i], ]  # set the validation set for this fold
  # Fit your new model (on the training data only) and ADD the name of the outcome (in this case "Sentiment")
  newrf <- svm(y = as.factor(Dfm_train[folds$subsets[folds$which != i], ]@docvars$Sentiment),
               x = train, kernel = 'linear', cost = 1)
  # Get the predictions for the validation set (from the model just fit on the training data)
  newpred <- predict(newrf, newdata = validation)
  class_table <- table("Predictions" = newpred,
                       "Actual" = Dfm_train[folds$subsets[folds$which == i], ]@docvars$Sentiment)
  print(class_table)
  df <- confusionMatrix(class_table, mode = "everything")
  df.name <- paste0("conf.mat.sv", i)  # create the name of the object that stores the confusion matrix for each fold (=5)
  assign(df.name, df)
})

# STEP 3: the metrics
ls()  # we have created 5 objects storing the 5 confusion matrices, one per fold.
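# An aside, as anticipated in STEP 1: caret can run a similar 5-fold cross-validation
# in a few lines. This is just a minimal sketch, not part of the original workflow:
# it assumes the kernlab package is installed (caret's "svmLinear" method calls
# kernlab::ksvm rather than e1071::svm), and caret builds its own folds, so the
# numbers will not exactly match the hand-made loop above.
ctrl <- caret::trainControl(method = "cv", number = 5)
set.seed(123)
svm_caret <- caret::train(x = ttrain,
                          y = as.factor(Dfm_train@docvars$Sentiment),
                          method = "svmLinear",
                          trControl = ctrl,
                          tuneGrid = data.frame(C = 1))
svm_caret$results  # Accuracy and Kappa averaged over the 5 folds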
# We can now compute the performance metrics from these results
# for example: conf.mat.sv1
str(conf.mat.sv1)
conf.mat.sv1$overall[1]  # overall accuracy: (36+47)/(36+47+8+9)
# Note that the F1 value you see there is not the one for the overall model, but just for the first class (i.e., "negative").
# Therefore we have to estimate the average F1 value for each k-fold by hand! See below
conf.mat.sv1$byClass[1]  # Recall for negative: (36)/(36+9) - think vertically!
conf.mat.sv1$byClass[3]  # Precision for negative: (36)/(36+8) - think horizontally!
conf.mat.sv1$byClass[2]  # Recall for positive: (47)/(47+8) - think vertically!
conf.mat.sv1$byClass[4]  # Precision for positive: (47)/(47+9) - think horizontally!
# F1 for the negative class:
conf.mat.sv1$byClass[7]
# that is:
(2*conf.mat.sv1$byClass[3]*conf.mat.sv1$byClass[1])/(conf.mat.sv1$byClass[3]+conf.mat.sv1$byClass[1])  # F1 for negative
# and for positive?
(2*conf.mat.sv1$byClass[2]*conf.mat.sv1$byClass[4])/(conf.mat.sv1$byClass[2]+conf.mat.sv1$byClass[4])  # F1 for positive

# Make a blank data frame with three columns to fill with the results.
# Why 3 columns? 1 for accuracy, and 2 for the F1 values of the classes in Sentiment (given that you have just two classes!)
SVMPredict <- data.frame(col1=vector(), col2=vector(), col3=vector())
for(i in mget(ls(pattern = "conf.mat.sv"))) {
  Accuracy <- (i)$overall[1]  # save the accuracy value
  F1_negative <- (2*(i)$byClass[1]*(i)$byClass[3])/((i)$byClass[1]+(i)$byClass[3])  # save the F1 value for negative
  F1_positive <- (2*(i)$byClass[2]*(i)$byClass[4])/((i)$byClass[2]+(i)$byClass[4])  # save the F1 value for positive
  SVMPredict <- rbind(SVMPredict, cbind(Accuracy, F1_negative, F1_positive))
}
str(SVMPredict)

# Let's compute the average value for accuracy and F1
acc_sv_avg <- mean(SVMPredict[, 1])
f1_sv_avg <- mean(colMeans(SVMPredict[-1]))
# Comparing the avg. accuracy with the avg. F1 value is always good practice; you can use it as a diagnostic tool.
# If the two values are very different, it implies that you are doing well on some class labels and poorly on others.
# In this case there is no huge difference (the two values are very close to each other).
acc_sv_avg
f1_sv_avg
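# An optional helper, not in the original script: it wraps the metric extraction
# used above so that the same three numbers can be pulled from any caret
# confusionMatrix object (assuming exactly two classes, as here).
get_metrics <- function(cm) {
  acc    <- cm$overall[1]
  f1_neg <- (2 * cm$byClass[1] * cm$byClass[3]) / (cm$byClass[1] + cm$byClass[3])
  f1_pos <- (2 * cm$byClass[2] * cm$byClass[4]) / (cm$byClass[2] + cm$byClass[4])
  c(Accuracy = unname(acc), F1_negative = unname(f1_neg), F1_positive = unname(f1_pos))
}
# For example, applied to the five SVM confusion matrices:
t(sapply(mget(ls(pattern = "conf.mat.sv")), get_metrics))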
######################################################
######################################################
# Random Forest LOOP
######################################################
######################################################

# If you have not run STEP 1 above yet, do it now; otherwise skip that step

# STEP 2: the LOOP
# here we fix ntree=100 to make things faster!
system.time(for(i in 1:k){
  train <- ttrain[folds$subsets[folds$which != i], ]
  validation <- ttrain[folds$subsets[folds$which == i], ]
  set.seed(123)
  newrf <- randomForest(y = as.factor(Dfm_train[folds$subsets[folds$which != i], ]@docvars$Sentiment),
                        x = train, do.trace = TRUE, ntree = 100)
  newpred <- predict(newrf, newdata = validation, type = "class")
  class_table <- table("Predictions" = newpred,
                       "Actual" = Dfm_train[folds$subsets[folds$which == i], ]@docvars$Sentiment)
  print(class_table)
  df <- confusionMatrix(class_table, mode = "everything")
  df.name <- paste0("conf.mat.rf", i)
  assign(df.name, df)
})

# STEP 3: the metrics
RFPredict <- data.frame(col1=vector(), col2=vector(), col3=vector())
for(i in mget(ls(pattern = "conf.mat.rf"))) {
  Accuracy <- (i)$overall[1]  # save the accuracy value
  F1_negative <- (2*(i)$byClass[1]*(i)$byClass[3])/((i)$byClass[1]+(i)$byClass[3])  # save the F1 value for negative
  F1_positive <- (2*(i)$byClass[2]*(i)$byClass[4])/((i)$byClass[2]+(i)$byClass[4])  # save the F1 value for positive
  RFPredict <- rbind(RFPredict, cbind(Accuracy, F1_negative, F1_positive))
}
str(RFPredict)

# Let's compute the average value for accuracy and F1
acc_rf_avg <- mean(RFPredict[, 1])
f1_rf_avg <- mean(colMeans(RFPredict[-1]))
acc_rf_avg
f1_rf_avg

######################################################
######################################################
# Naive Bayes LOOP
######################################################
######################################################

system.time(for(i in 1:k){
  train <- ttrain[folds$subsets[folds$which != i], ]
  validation <- ttrain[folds$subsets[folds$which == i], ]
  newrf <- multinomial_naive_bayes(y = as.factor(Dfm_train[folds$subsets[folds$which != i], ]@docvars$Sentiment),
                                   x = train, laplace = 1)
  newpred <- predict(newrf, newdata = validation, type = "class")
  class_table <- table("Predictions" = newpred,
                       "Actual" = Dfm_train[folds$subsets[folds$which == i], ]@docvars$Sentiment)
  print(class_table)
  df <- confusionMatrix(class_table, mode = "everything")
  df.name <- paste0("conf.mat.nb", i)
  assign(df.name, df)
})

# STEP 3: the metrics
NBPredict <- data.frame(col1=vector(), col2=vector(), col3=vector())
for(i in mget(ls(pattern = "conf.mat.nb"))) {
  Accuracy <- (i)$overall[1]  # save the accuracy value
  F1_negative <- (2*(i)$byClass[1]*(i)$byClass[3])/((i)$byClass[1]+(i)$byClass[3])  # save the F1 value for negative
  F1_positive <- (2*(i)$byClass[2]*(i)$byClass[4])/((i)$byClass[2]+(i)$byClass[4])  # save the F1 value for positive
  NBPredict <- rbind(NBPredict, cbind(Accuracy, F1_negative, F1_positive))
}
str(NBPredict)

# Let's compute the average value for accuracy and F1
acc_nb_avg <- mean(NBPredict[, 1])
f1_nb_avg <- mean(colMeans(NBPredict[-1]))
acc_nb_avg
f1_nb_avg

######################################################
######################################################
# Let's compare the results we got via Naive Bayes, SVM and RF
######################################################
######################################################

acc_nb_avg
acc_sv_avg
acc_rf_avg
f1_nb_avg
f1_sv_avg
f1_rf_avg
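# A compact side-by-side view of the six averages computed above (just a
# convenience table, not part of the original script):
cv_summary <- data.frame(
  Algorithm = c("Naive Bayes", "SVM", "Random Forest"),
  Accuracy  = c(acc_nb_avg, acc_sv_avg, acc_rf_avg),
  F1        = c(f1_nb_avg, f1_sv_avg, f1_rf_avg)
)
cv_summary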
# Let's plot the results!
gb1 <- as.data.frame(acc_nb_avg)
colnames(gb1)[1] <- "Accuracy NB"
gb2 <- as.data.frame(acc_sv_avg)
colnames(gb2)[1] <- "Accuracy SV"
gb3 <- as.data.frame(acc_rf_avg)
colnames(gb3)[1] <- "Accuracy RF"
ac_tot <- cbind(gb1, gb2, gb3)
ac_tot
str(ac_tot)
df.long_ac_tot <- melt(ac_tot)
str(df.long_ac_tot)
p <- ggplot(df.long_ac_tot, aes(x=variable, y=value, color = variable)) +
  geom_boxplot() +
  xlab("Algorithm") + ylab(label="Values of Accuracy") +
  labs(title = "Cross-validation with k = 5: values of Accuracy") +
  coord_flip() +
  theme_bw() +
  theme(legend.position = "none")

gb1 <- as.data.frame(f1_nb_avg)
colnames(gb1)[1] <- "F1 NB"
gb2 <- as.data.frame(f1_sv_avg)
colnames(gb2)[1] <- "F1 SV"
gb3 <- as.data.frame(f1_rf_avg)
colnames(gb3)[1] <- "F1 RF"
f1_tot <- cbind(gb1, gb2, gb3)
f1_tot
str(f1_tot)
df.long_f1_tot <- melt(f1_tot)
str(df.long_f1_tot)
p2 <- ggplot(df.long_f1_tot, aes(x=variable, y=value, color = variable)) +
  geom_boxplot() +
  xlab("Algorithm") + ylab(label="Values of F1") +
  labs(title = "Cross-validation with k = 5: values of F1") +
  coord_flip() +
  theme_bw() +
  theme(legend.position = "none")

grid.arrange(p, p2, nrow=2)  # plot everything together

# So in this particular case, NB appears to do better than the other 2 algorithms - yes!
# But remember: you are employing a training set consisting of just 500 documents,
# and NB usually does well in this scenario. Moreover...
# Wait a minute! In this example we kept the hyperparameters fixed!
# But is there any other hyperparameter configuration for RF, SVM, and NB that would improve the model?
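# A minimal sketch of what hyperparameter tuning could look like here, reusing the
# same folds (this is an illustration, not part of the original script; the grid of
# cost values below is arbitrary): loop over a few values of the SVM cost parameter
# and keep the average cross-validated accuracy for each.
costs <- c(0.5, 1, 5, 10)  # illustrative grid of cost values
cv_acc <- sapply(costs, function(cc) {
  fold_acc <- sapply(1:k, function(i) {
    tr    <- ttrain[folds$subsets[folds$which != i], ]
    val   <- ttrain[folds$subsets[folds$which == i], ]
    y_tr  <- as.factor(Dfm_train[folds$subsets[folds$which != i], ]@docvars$Sentiment)
    y_val <- Dfm_train[folds$subsets[folds$which == i], ]@docvars$Sentiment
    fit   <- svm(y = y_tr, x = tr, kernel = "linear", cost = cc)
    mean(predict(fit, newdata = val) == y_val)  # accuracy on the validation fold
  })
  mean(fold_acc)
})
data.frame(cost = costs, avg_accuracy = cv_acc)  # pick the cost with the best average accuracy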