rm(list=ls(all=TRUE)) getwd() setwd("C:/Users/mw/Dropbox (VOICES)/TOPIC MODEL") getwd() library(quanteda) library(readtext) library(caTools) library(e1071) library(randomForest) library(caret) library(stringr) library(car) #### let's repeat all the steps discussed the last time to build the training and the test for NB, RF and SVM ################################################################ # FIRST STEP: create the DFM for the training-set ################################################################ x11 <- read.csv("trainTrump.csv", stringsAsFactors=FALSE) x11$text <- str_replace_all(x11$text, "[^[:alnum:]]", " ") myCorpusTwitterTrain <- corpus(x11) Dfm_train<- dfm(myCorpusTwitterTrain , remove = c(stopwords("english"), ("amp"), ("rt") ,("tco"), ("co"), ("u"), ("t"), ("s"), ("ed"), ("https"), ("â"), ("com"), ("ly")), remove_punct = TRUE, remove_numbers=TRUE, tolower = TRUE, remove_symbols=TRUE, remove_twitter = TRUE, remove_separators=TRUE, remove_url = TRUE) Dfm_train <- dfm_trim(Dfm_train , min_docfreq = 2, verbose=TRUE) ################################################################ # SECOND STEP: create the DFM for the test-set ################################################################ x10 <- read.csv("testTrump.csv", stringsAsFactors=FALSE) x10$text <- str_replace_all(x10$text, "[^[:alnum:]]", " ") myCorpusTwitterTest <- corpus(x10) Dfm_test<- dfm(myCorpusTwitterTest, remove = c(stopwords("english"), ("amp"), ("rt") ,("tco"), ("co"), ("u"), ("t"), ("s"), ("ed"), ("https"), ("â"), ("com"), ("ly")), remove_punct = TRUE, remove_numbers=TRUE, tolower = TRUE, remove_symbols=TRUE, remove_twitter = TRUE, remove_separators=TRUE, remove_url = TRUE) Dfm_test<- dfm_trim(Dfm_test, min_docfreq = 2, verbose=TRUE) ################################################################ # THIRD STEP: Let's make the features identical between train and test-set by passing Dfm_train to dfm_match() as a pattern. ################################################################ test_dfm <- dfm_match(Dfm_test, features = featnames(Dfm_train)) ################################################################ # FOURTH STEP/B # transform both dfm (train and test) in a data frame ################################################################ train <- as.data.frame(as.matrix(Dfm_train)) test <- as.data.frame(as.matrix(test_dfm)) colnames(train ) <- make.names(colnames(train )) colnames(test ) <- make.names(colnames(test )) ###################################################### ###################################################### # Let's make CROSS-VALIDATION with k-fold=2 as an example ###################################################### ###################################################### ###################################################### # Let's start with NB ###################################################### ###################################################### # STEP 1: divide the training-set randomly according to the number of k (here k=2) ###################################################### N <- ndoc(myCorpusTwitterTrain ) # number of documents in the training-set k <- 2 # Number of desired folds # Generate indices of holdout observations set.seed(123) holdout <- split(sample(1:N), 1:k) str(holdout) holdout[[1]] holdout[[2]] # Check that each observation appears exactly once in the holdout object: holdout %>% unlist() %>% length() == N ###################################################### # STEP 2: create the dfm for BOTH training-sets ###################################################### head(summary(myCorpusTwitterTrain )) docvars(myCorpusTwitterTrain , "id_numeric") <- 1:ndoc(myCorpusTwitterTrain ) # create docvar with ID head(summary(myCorpusTwitterTrain )) # get training set1 (all the documents in holdout[[1]]) and compute the dfm out of it tr_1_dfm <- dfm(corpus_subset(myCorpusTwitterTrain , id_numeric %in% holdout[[1]]), remove = c(stopwords("english"), ("amp"), ("rt") ,("tco"), ("co"), ("u"), ("t"), ("s"), ("ed"), ("https"), ("â"), ("com"), ("ly")),remove_punct = TRUE, remove_numbers=TRUE, tolower = TRUE, remove_symbols=TRUE, remove_twitter = TRUE, remove_separators=TRUE, remove_url = TRUE) tr_1_dfm <- dfm_trim(tr_1_dfm ,min_docfreq= 2, verbose=TRUE) # get training set_other1 (all the documents NOT in holdout[[1]]) and compute the dfm out of it tr_other1_dfm <- dfm(corpus_subset(myCorpusTwitterTrain , !id_numeric %in% holdout[[1]]), remove = c(stopwords("english"), ("amp"), ("rt") ,("tco"), ("co"), ("u"), ("t"), ("s"), ("ed"), ("https"), ("â"), ("com"), ("ly")),remove_punct = TRUE, remove_numbers=TRUE, tolower = TRUE, remove_symbols=TRUE, remove_twitter = TRUE, remove_separators=TRUE, remove_url = TRUE) tr_other1_dfm <- dfm_trim(tr_other1_dfm , min_docfreq= 2, verbose=TRUE) # get training set2 (all the documents in holdout[[2]]) and compute the dfm out of it tr_2_dfm <- dfm(corpus_subset(myCorpusTwitterTrain , id_numeric %in% holdout[[2]]), remove = c(stopwords("english"), ("amp"), ("rt") ,("tco"), ("co"), ("u"), ("t"), ("s"), ("ed"), ("https"), ("â"), ("com"), ("ly")),remove_punct = TRUE, remove_numbers=TRUE, tolower = TRUE, remove_symbols=TRUE, remove_twitter = TRUE, remove_separators=TRUE, remove_url = TRUE) tr_2_dfm <- dfm_trim(tr_2_dfm ,min_docfreq= 2, verbose=TRUE) # get training set_other2 (all the documents NOT in holdout[[2]]) and compute the dfm out of it tr_other2_dfm <- dfm(corpus_subset(myCorpusTwitterTrain , !id_numeric %in% holdout[[2]]), remove = c(stopwords("english"), ("amp"), ("rt") ,("tco"), ("co"), ("u"), ("t"), ("s"), ("ed"), ("https"), ("â"), ("com"), ("ly")),remove_punct = TRUE, remove_numbers=TRUE, tolower = TRUE, remove_symbols=TRUE, remove_twitter = TRUE, remove_separators=TRUE, remove_url = TRUE) tr_other2_dfm <- dfm_trim(tr_other2_dfm , min_docfreq= 2, verbose=TRUE) ###################################################### # STEP 3: estimate the results with k=2 ###################################################### ####################################### # K=1 NB (we use as a training-set all the documents NOT in holdout(1), that is tr_other1_dfm, # to predict the documents of the test-set, i.e., the documents in holdout(1), that is tr_1_dfm) ####################################### nb <- textmodel_nb(tr_other1_dfm , docvars(tr_other1_dfm, "Sentiment"), distribution = c("multinomial")) summary(nb) # Let's make the features identical by passing training_dfm to dfm_select() as a pattern. test_dfm <- dfm_select(tr_1_dfm, tr_other1_dfm) # Let’s inspect how well the classification worked actual_class1 <- docvars(test_dfm, "Sentiment") predicted_class1 <- predict(nb, test_dfm) table(actual_class1 ) # actual values presented in the test set table(predicted_class1 ) # the predicted values class_table1 <- table(predicted_class1, actual_class1) # let's put together the predicted and actual values class_table1 confusionMatrix(class_table1, mode = "everything") ####################################### # K=2 NB (we use as a training-set all the documents NOT in holdout(2), that is tr_other2_dfm, # to predict the documents of the test-set, i.e., the documents in holdout(2), that is tr_2_dfm) ####################################### nb <- textmodel_nb(tr_other2_dfm , docvars(tr_other2_dfm, "Sentiment"), distribution = c("multinomial")) summary(nb) # Let's make the features identical by passing training_dfm to dfm_select() as a pattern. test_dfm <- dfm_select(tr_2_dfm, tr_other2_dfm) # Let’s inspect how well the classification worked actual_class2 <- docvars(test_dfm, "Sentiment") predicted_class2 <- predict(nb, test_dfm) table(predicted_class2) prop.table(table(predicted_class2 )) class_table2 <- table( predicted_class2, actual_class2) class_table2 confusionMatrix(class_table2, mode = "everything") ####################################### # Estimating CV performance metrics for NB ####################################### conf.mat_nb1 <- confusionMatrix(class_table1, mode = "everything") conf.mat_nb2 <- confusionMatrix(class_table2, mode = "everything") res1_nb <- as.data.frame(conf.mat_nb1 $byClass) res2_nb <- as.data.frame(conf.mat_nb2 $byClass) res_accuracy <- data.frame( accuracy1=conf.mat_nb1$overall[1], accuracy2=conf.mat_nb2$overall[1] ) res_precision <- data.frame( precision1=res1_nb$Precision, precision2=res2_nb$Precision ) res_recall <- data.frame( recall1=res1_nb$Recall, recall2=res2_nb$Recall ) res_f1 <- data.frame( f1_1=res1_nb$F1, f1_2=res2_nb$F1 ) rownames(res_precision ) <- rownames(res1_nb) rownames(res_recall) <- rownames(res1_nb) rownames(res_f1) <- rownames(res1_nb) res_accuracy res_precision res_recall res_f1 acc_nb <- rowMeans(res_accuracy ) # let's estimate the overall accuracy f1_nb <- mean(rowMeans(res_f1 )) # let's estimate the overall F1 acc_nb f1_nb ###################################################### ###################################################### # CROSS-VALIDATION: k-fold=2 for SVM ###################################################### ###################################################### # STEP 1B: divide the training-set randomly according to the number of k N <- nrow(train ) # we need to work on the data frame "train" rather than on the corpus this time k <- 2 # Number of desired folds # Generate indices of holdout observations set.seed(123) holdout <- split(sample(1:N), 1:k) str(holdout) # Check that each observation appears exactly once in the holdout object: holdout %>% unlist() %>% length() == N # training set with all the data but the ones included in the sampled train 1 can be called as: # head(train [-holdout$`1`,]) # training set with only the data included in the sampled train 1 can be called as: # head(train[holdout$`1`,]) ####################################### # K=1 (train 1) SVM (we use as a training-set all the documents NOT in holdout(1), # to predict the documents of the test-set, i.e., the documents in holdout(1) ) ####################################### system.time(SVM <- svm(y= as.factor(Dfm_train[-holdout$`1`,]@docvars$Sentiment) ,x=train[-holdout$`1`,], kernel='linear', cost = 1)) predictSVM1 <- predict(SVM, newdata=train[holdout$`1`,]) class_table1 <- table("Predictions"= predictSVM1, "Actual"=Dfm_train[holdout$`1`,]@docvars$Sentiment) class_table1 ####################################### # K=2 (train 2) SVM (we use as a training-set all the documents NOT in holdout(2), # to predict the documents of the test-set, i.e., the documents in holdout(2) ) ####################################### set.seed(123) system.time(SVM <- svm(y= as.factor(Dfm_train[holdout$`1`,]@docvars$Sentiment) ,x=train[holdout$`1`,], kernel='linear', cost = 1)) predictSVM2 <- predict(SVM, newdata=train[-holdout$`1`,]) class_table2 <- table("Predictions"= predictSVM2, "Actual"=Dfm_train[-holdout$`1`,]@docvars$Sentiment) class_table2 ####################################### # # Estimating CV performance metrics for SVM ####################################### conf.mat_svm1 <- confusionMatrix(class_table1, mode = "everything") conf.mat_svm2 <- confusionMatrix(class_table2, mode = "everything") res1_svm <- as.data.frame(conf.mat_svm1 $byClass) res2_svm <- as.data.frame(conf.mat_svm2 $byClass) res_accuracy <- data.frame( accuracy1=conf.mat_svm1$overall[1], accuracy2=conf.mat_svm2$overall[1] ) res_precision <- data.frame( precision1=res1_svm$Precision, precision2=res2_svm$Precision ) res_recall <- data.frame( recall1=res1_svm$Recall, recall2=res2_svm$Recall ) res_f1 <- data.frame( f1_1=res1_svm$F1, f1_2=res2_svm$F1 ) rownames(res_precision ) <- rownames(res1_svm) rownames(res_recall) <- rownames(res1_svm) rownames(res_f1) <- rownames(res1_svm) acc_svm <- rowMeans(res_accuracy ) f1_svm <- mean(rowMeans(res_f1 )) acc_svm f1_svm ###################################################### ###################################################### # CROSS-VALIDATION: k-fold=2 for RF ###################################################### ###################################################### # replicate STEP 1B if you have not still done it; otherwise, skyp STEP 1B ####################################### # K=1 (train 1) RF (we use as a training-set all the documents NOT in holdout(1), # to predict the documents of the test-set, i.e., the documents in holdout(1) ) ####################################### set.seed(123) system.time(RF<- randomForest(y= as.factor(Dfm_train[-holdout$`1`,]@docvars$Sentiment) ,x=train[-holdout$`1`,], ntree=100, do.trace=TRUE)) predictRF1 <- predict(RF, newdata=train[holdout$`1`,]) class_table1 <- table("Predictions"= predictRF1 , "Actual"=Dfm_train[holdout$`1`,]@docvars$Sentiment) class_table1 ####################################### # K=2 (train 2) RF (we use as a training-set all the documents NOT in holdout(2), # to predict the documents of the test-set, i.e., the documents in holdout(2) ) ####################################### set.seed(123) system.time(RF2<- randomForest(y= as.factor(Dfm_train[holdout$`1`,]@docvars$Sentiment) ,x=train[holdout$`1`,], ntree=100, do.trace=TRUE)) predictRF2 <- predict(RF2, newdata=train[-holdout$`1`,]) class_table2 <- table("Predictions"= predictRF2, "Actual"=Dfm_train[-holdout$`1`,]@docvars$Sentiment) class_table2 ####################################### # # Estimating CV performance metrics for RF ####################################### conf.mat_rf1 <- confusionMatrix(class_table1, mode = "everything") conf.mat_rf2 <- confusionMatrix(class_table2, mode = "everything") res1_rf <- as.data.frame(conf.mat_rf1 $byClass) res2_rf <- as.data.frame(conf.mat_rf2 $byClass) res_accuracy <- data.frame( accuracy1=conf.mat_rf1$overall[1], accuracy2=conf.mat_rf2$overall[1] ) res_precision <- data.frame( precision1=res1_rf$Precision, precision2=res2_rf$Precision ) res_recall <- data.frame( recall1=res1_rf$Recall, recall2=res2_rf$Recall ) res_f1 <- data.frame( f1_1=res1_rf$F1, f1_2=res2_rf$F1 ) rownames(res_precision ) <- rownames(res1_rf) rownames(res_recall) <- rownames(res1_rf) rownames(res_f1) <- rownames(res1_rf) acc_rf <- rowMeans(res_accuracy ) f1_rf <- mean(rowMeans(res_f1 )) acc_rf f1_rf ###################################################### # let's compare the results we got! ###################################################### acc_nb acc_svm acc_rf f1_nb f1_svm f1_rf ###################################################### ###################################################### # OF COURSE k should be always higher than 2! Usually is 5 or 10... # Replicating the analysis in this case could be very boring, isn't it? And then?!? ###################################################### ######################################################