rm(list=ls(all=TRUE)) getwd() setwd("C:/Users/mw/Dropbox (VOICES)/TOPIC MODEL") getwd() library(quanteda) library(readtext) library(caTools) library(e1071) library(randomForest) library(caret) library(stringr) library(cvTools) library(magicfor) library(car) library(reshape2) library(gridExtra) library(cvTools) #### let's repeat all the steps discussed the last time to build the training and the test for NB, RF and SVM ################################################################ # FIRST STEP: create the DFM for the training-set ################################################################ x11 <- read.csv("trainTrump.csv", stringsAsFactors=FALSE) x11$text <- str_replace_all(x11$text, "[^[:alnum:]]", " ") myCorpusTwitterTrain <- corpus(x11) Dfm_train<- dfm(myCorpusTwitterTrain , remove = c(stopwords("english"), ("amp"), ("rt") ,("tco"), ("co"), ("u"), ("t"), ("s"), ("ed"), ("https"), ("â"), ("com"), ("ly")), remove_punct = TRUE, remove_numbers=TRUE, tolower = TRUE, remove_symbols=TRUE, remove_twitter = TRUE, remove_separators=TRUE, remove_url = TRUE) Dfm_train <- dfm_trim(Dfm_train , min_docfreq = 2, verbose=TRUE) ################################################################ # SECOND STEP: create the DFM for the test-set ################################################################ x10 <- read.csv("testTrump.csv", stringsAsFactors=FALSE) x10$text <- str_replace_all(x10$text, "[^[:alnum:]]", " ") myCorpusTwitterTest <- corpus(x10) Dfm_test<- dfm(myCorpusTwitterTest, remove = c(stopwords("english"), ("amp"), ("rt") ,("tco"), ("co"), ("u"), ("t"), ("s"), ("ed"), ("https"), ("â"), ("com"), ("ly")), remove_punct = TRUE, remove_numbers=TRUE, tolower = TRUE, remove_symbols=TRUE, remove_twitter = TRUE, remove_separators=TRUE, remove_url = TRUE) Dfm_test<- dfm_trim(Dfm_test, min_docfreq = 2, verbose=TRUE) ################################################################ # THIRD STEP: Let's make the features identical between train and test-set by passing Dfm_train to dfm_match() as a pattern. ################################################################ test_dfm <- dfm_match(Dfm_test, features = featnames(Dfm_train)) ################################################################ # FOURTH STEP/B # transform both dfm (train and test) in a data frame ################################################################ train <- as.data.frame(as.matrix(Dfm_train)) test <- as.data.frame(as.matrix(test_dfm)) colnames(train ) <- make.names(colnames(train )) colnames(test ) <- make.names(colnames(test )) ###################################################### ###################################################### # Let's make CROSS-VALIDATION with k-fold=5 with a loop! ###################################################### ###################################################### # Some R packages, such as Caret, allows you to run a k-fold cross-validation with few lines of command. # But I want that you understand what's going on, so better work with this loop! And then if you desire, move to Caret ###################################################### ###################################################### # Let's start with SV ###################################################### ###################################################### # STEP 1: create the folds ttrain <- train # let's change the name of the original train data.frame, given that we are already going to use such name below in the loop # let's split our training-set in 5 folds (this time we will use a slight different line of command - make things faster here) set.seed(123) # set the see for replicability k <- 5 # the number of folds; it does not matter the number of folds you decide here; the below procedure always will work! folds <- cvFolds(NROW(ttrain ), K=k) str(folds) # STEP 2: the LOOP system.time(for(i in 1:k){ train <- ttrain [folds$subsets[folds$which != i], ] # Set the training set validation <- ttrain [folds$subsets[folds$which == i], ] # Set the validation set set.seed(123) newrf <- svm(y= as.factor(Dfm_train[folds$subsets[folds$which != i], ]@docvars$Sentiment) ,x=train, kernel='linear', cost = 1) # Get your new model # (just fit on the train data) and ADD the name of the output (in this case "Sentiment") newpred <- predict(newrf,newdata=validation) # Get the predicitons for the validation set (from the model just fit on the train data) class_table <- table("Predictions"= newpred, "Actual"=Dfm_train[folds$subsets[folds$which == i], ]@docvars$Sentiment) print(class_table) df<-confusionMatrix( class_table, mode = "everything") df.name<-paste0("conf.mat.sv",i) # create the name for the object that will save the confusion matrix for each loop (=5) assign(df.name,df) }) # STEP 3: the metrics ls() # we have just created 5 objects that have saved the 5 confusion matrices we have created. I can estimate now the performance metrics on such results SVMPredict <- data.frame(col1=vector(), col2=vector(), col3=vector(), col4=vector()) # makes a blank data frame with four columns to fill with the predictions; # why 4 columns? 1 for accuracy; and 3 for the K1 value of the classes in the Sentiment: negative, neutral, positive. According to the number of classes # in your output variable, changes the number of columns to fill!!! for(i in mget(ls(pattern = "conf.mat.sv")) ) { col1 <-(i)$overall[1] # save in the matrix the accuracy value p <- as.data.frame((i)$byClass) col2 <- p$F1[1] # save in the matrix the F1 value for negative col3 <- p$F1[2] # save in the matrix the F1 value for neutral col4 <- p$F1[3] # save in the matrix the F1 value for positive SVMPredict <- rbind(SVMPredict , cbind(col1, col2, col3, col4)) } colnames(SVMPredict )[1] <- "Accuracy" colnames(SVMPredict )[2] <- "F1 Negative" colnames(SVMPredict )[3] <- "F1 Neutral" colnames(SVMPredict )[4] <- "F1 Positive" SVMPredict [is.na(SVMPredict )] <- 0 # if I get some NA for some categories with respect to F1 (this happens when BOTH precision and recall score for that category is 0), replace NA with 0 SVMPredict str(SVMPredict ) # Let's compare the average value for accuracy and f1 acc_sv_avg <- mean(SVMPredict[, 1] ) f1_sv_avg <- mean(colMeans(SVMPredict[-1] )) acc_sv_avg f1_sv_avg ###################################################### ###################################################### # Random Forest LOOP ###################################################### ###################################################### # If you have not still done STEP 1 above, replicate it; otherwise skip this step # STEP 2: the LOOP system.time(for(i in 1:k){ train <- ttrain [folds$subsets[folds$which != i], ] validation <- ttrain [folds$subsets[folds$which == i], ] set.seed(123) newrf <- randomForest(y= as.factor(Dfm_train[folds$subsets[folds$which != i], ]@docvars$Sentiment) ,x=train, do.trace=TRUE, ntree=100) newpred <- predict(newrf,newdata=validation, type="class") class_table <- table("Predictions"= newpred, "Actual"=Dfm_train[folds$subsets[folds$which == i], ]@docvars$Sentiment) print(class_table) df<-confusionMatrix( class_table, mode = "everything") df.name<-paste0("conf.mat.rf",i) assign(df.name,df) }) # STEP 3: the metrics RFPredict <- data.frame(col1=vector(), col2=vector(), col3=vector(), col4=vector()) for(i in mget(ls(pattern = "conf.mat.rf")) ) { col1 <-(i)$overall[1] # save in the matrix the accuracy value p <- as.data.frame((i)$byClass) col2 <- p$F1[1] # save in the matrix the F1 value for negative col3 <- p$F1[2] # save in the matrix the F1 value for neutral col4 <- p$F1[3] # save in the matrix the F1 value for positive RFPredict <- rbind(RFPredict , cbind(col1, col2, col3, col4)) } colnames(RFPredict )[1] <- "Accuracy" colnames(RFPredict )[2] <- "F1 Negative" colnames(RFPredict )[3] <- "F1 Neutral" colnames(RFPredict )[4] <- "F1 Positive" RFPredict [is.na(RFPredict )] <- 0 RFPredict str(RFPredict ) # Let's compare the average value for accuracy and f1 acc_rf_avg <- mean(RFPredict [, 1] ) f1_rf_avg <- mean(colMeans(RFPredict [-1] )) acc_rf_avg f1_rf_avg ###################################################### ###################################################### # Naive Bayes LOOP ###################################################### ###################################################### # STEP 1: create the folds (here let's follow our usual procedure that we saw in the "CV first part" script) summary( myCorpusTwitterTrain ) N <- ndoc(myCorpusTwitterTrain ) k <- 5 # Number of desired folds set.seed(123) holdout <- split(sample(1:N), 1:k) docvars(myCorpusTwitterTrain , "id_numeric") <- 1:ndoc(myCorpusTwitterTrain ) # STEP 2: the LOOP system.time(for(i in 1:k){ train <- dfm(corpus_subset(myCorpusTwitterTrain , id_numeric %in% holdout[[i]]), remove = c(stopwords("english"), ("amp"), ("rt") ,("tco"), ("co"), ("u"), ("t"), ("s"), ("ed"), ("https"), ("â"), ("com"), ("ly")),remove_punct = TRUE, remove_numbers=TRUE, tolower = TRUE, remove_symbols=TRUE, remove_twitter = TRUE, remove_separators=TRUE, remove_url = TRUE) # Set the training set train <- dfm_trim(train,min_docfreq= 2, verbose=TRUE) # Trim the dfm validation <- dfm(corpus_subset(myCorpusTwitterTrain , !id_numeric %in% holdout[[i]]), remove = c(stopwords("english"), ("amp"), ("rt") ,("tco"), ("co"), ("u"), ("t"), ("s"), ("ed"), ("https"), ("â"), ("com"), ("ly")),remove_punct = TRUE, remove_numbers=TRUE, tolower = TRUE, remove_symbols=TRUE, remove_twitter = TRUE, remove_separators=TRUE, remove_url = TRUE) # Set the validation set validation <- dfm_trim(validation , min_docfreq= 2, verbose=TRUE) # Trim the dfm set.seed(123) newrf <- textmodel_nb(train, docvars(train, "Sentiment"), distribution = c("multinomial")) # Get your new model (just fit on the train data) and ADD the name of the output (in this case "Sentiment") validation <- dfm_select(validation, train) # Let's make the features identical by passing training_dfm to dfm_select() as a pattern newpred <- predict(newrf,validation) # Get the predicitons for the validation set (from the model just fit on the train data) actual_class <- docvars(validation, "Sentiment") # Add name output (in this case "Sentiment") predicted_class <- predict(newrf, newdata = validation) tab_class <- table(predicted_class, actual_class) df<-confusionMatrix( tab_class, mode = "everything") # Add name output (in this case "Sentiment") df.name<-paste0("conf.mat.nb",i) assign(df.name,df) }) # STEP 3: the metrics NBPredict <- data.frame(col1=vector(), col2=vector(), col3=vector(), col4=vector()) for(i in mget(ls(pattern = "conf.mat.nb")) ) { col1 <-(i)$overall[1] # save in the matrix the accuracy value p <- as.data.frame((i)$byClass) col2 <- p$F1[1] # save in the matrix the F1 value for negative col3 <- p$F1[2] # save in the matrix the F1 value for neutral col4 <- p$F1[3] # save in the matrix the F1 value for positive NBPredict <- rbind(NBPredict , cbind(col1, col2, col3, col4)) } colnames(NBPredict )[1] <- "Accuracy" colnames(NBPredict )[2] <- "F1 Negative" colnames(NBPredict )[3] <- "F1 Neutral" colnames(NBPredict )[4] <- "F1 Positive" NBPredict [is.na(NBPredict )] <- 0 NBPredict str(NBPredict ) # Let's compare the average value for accuracy and f1 acc_nb_avg <- mean(NBPredict [, 1] ) f1_nb_avg <- mean(colMeans(NBPredict [-1] )) acc_nb_avg f1_nb_avg ###################################################### ###################################################### # Let's compare the results we got via Naive Bayes, SVM and RF ###################################################### ###################################################### acc_nb_avg acc_sv_avg acc_rf_avg f1_nb_avg f1_sv_avg f1_rf_avg # Let's plot the results! gb1 <- as.data.frame(acc_nb_avg ) colnames(gb1)[1] <- "Accuracy NB" gb2 <- as.data.frame(acc_sv_avg ) colnames(gb2)[1] <- "Accuracy SV" gb3 <- as.data.frame(acc_rf_avg ) colnames(gb3)[1] <- "Accuracy RF" ac_tot <- cbind(gb1, gb2, gb3) ac_tot str(ac_tot) df.long_ac_tot<-melt(ac_tot) str(df.long_ac_tot) p <- ggplot(df.long_ac_tot, aes(x=variable, y=value)) + geom_boxplot() + xlab("Algorithm") + ylab(label="Value") + ggtitle("Naive Bayes vs. SVM vs. RF K-fold cross-validation (K=5): Accuracy") + coord_flip() gb1 <- as.data.frame(f1_nb_avg) colnames(gb1)[1] <- "F1 NB" gb2 <- as.data.frame(f1_sv_avg ) colnames(gb2)[1] <- "F1 SV" gb3 <- as.data.frame(f1_rf_avg ) colnames(gb3)[1] <- "F1 RF" f1_tot <- cbind(gb1, gb2, gb3) f1_tot str(f1_tot) df.long_f1_tot<-melt(f1_tot) str(df.long_f1_tot) p2 <- ggplot(df.long_f1_tot, aes(x=variable, y=value)) + geom_boxplot() + xlab("Algorithm") + ylab(label="Value") + ggtitle("Naive Bayes vs. SVM vs. RF K-fold cross-validation (K=5): F1") + coord_flip() grid.arrange(p, p2, nrow=2) # Plot everything together # So in this particular case, SVM better than the other 2 algorithms? Wait a minute! In this example we kept the hyperparameters fixed! # But is there any other hyperameters configuration for RF and SVM that will improve the model [for NB in Quanteda, no hyperparameters need to be changed]?