rm(list=ls(all=TRUE))
getwd()
setwd("C:/Users/mw/Dropbox (Personale)/TOPIC MODEL")
getwd()

library(e1071)
library(caTools)
library(randomForest)
library(caret)
library(quanteda)
library(readtext)

### TRAINING-SET
# This dataset is a sample of tweets mentioning the official account of Donald Trump, "@realDonaldTrump",
# posted between 7 and 13 June 2016, written in English and coming from the US.
# The data were collected through the Twitter API, specifying the language and the origin of the tweets.
# The dataset includes around 482 tweets that were manually coded by a group of students.
# The coding stage involved detecting the sentiment towards Trump (negative, positive, neutral).
x <- read.csv("Trump-orig3.csv", stringsAsFactors=FALSE)
str(x)
table(x$Sentiment)
x$Sentiment <- as.factor(x$Sentiment)
str(x)
table(x$Sentiment)
prop.table(table(x$Sentiment))
myCorpusTrain <- corpus(x)
summary(myCorpusTrain)

### TEST-SET
# This is a sample of 1,000 tweets about Trump, written in English and published between 17 and 19 January 2018.
x10 <- read.csv("Trump_tweets2.csv", stringsAsFactors=FALSE)
str(x10)
x10$X <- NULL
myCorpusTest <- corpus(x10)
summary(myCorpusTest)

###########################################
## Prepare the two DFMs (for the training and test sets)
###########################################

# Compute the DFM for the training set
training_corpus <- myCorpusTrain
summary(training_corpus)
training_dfm <- dfm(training_corpus,
                    remove = c(stopwords("english"), "pic.twitter.com", "amp", "rt", "tco", "co",
                               "u", "t", "s", "ed", "https"),
                    remove_punct = TRUE, remove_numbers = TRUE, tolower = TRUE,
                    remove_symbols = TRUE, remove_twitter = TRUE,
                    remove_separators = TRUE, remove_url = TRUE)
topfeatures(training_dfm, 20)  # 20 top words
# Keep terms that appear in at least 5% of the tweets (tweets are very short texts...)
training_dfm <- dfm_trim(training_dfm, min_docfreq = 0.05)

# Compute the DFM for the test set
test_corpus <- myCorpusTest
summary(test_corpus)
test_dfm <- dfm(test_corpus,
                remove = c(stopwords("english"), "pic.twitter.com", "amp", "rt", "tco", "co",
                           "u", "t", "s", "ed", "https"),
                remove_punct = TRUE, remove_numbers = TRUE, tolower = TRUE,
                remove_symbols = TRUE, remove_twitter = TRUE,
                remove_separators = TRUE, remove_url = TRUE)
topfeatures(test_dfm, 20)  # 20 top words
# Keep terms that appear in at least 5% of the tweets (tweets are very short texts...)
test_dfm <- dfm_trim(test_dfm, min_docfreq = 0.05)

# Make the two feature sets identical by passing training_dfm to dfm_select() as a pattern
test_dfm <- dfm_select(test_dfm, training_dfm)

###########################################
## Try a Naive Bayes model
###########################################

# Train the Naive Bayes classifier using textmodel_nb() with a multinomial distribution
nb <- textmodel_nb(training_dfm, docvars(training_dfm, "Sentiment"), distribution = c("multinomial"))
summary(nb)
predicted_class <- predict(nb, test_dfm)
table(predicted_class)
str(predicted_class)
nb_results <- prop.table(table(predicted_class))
nb_results
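###########################################
## (Optional) Helper for the repeated preprocessing
###########################################
# The same dfm() preprocessing is applied several times in this script (training set, test set,
# and each cross-validation fold below). A small wrapper like the sketch below could centralize
# those options. build_dfm() is our own name, not a quanteda function; it simply repeats the
# quanteda calls already used above.
build_dfm <- function(corp) {
  out <- dfm(corp,
             remove = c(stopwords("english"), "pic.twitter.com", "amp", "rt", "tco", "co",
                        "u", "t", "s", "ed", "https"),
             remove_punct = TRUE, remove_numbers = TRUE, tolower = TRUE,
             remove_symbols = TRUE, remove_twitter = TRUE,
             remove_separators = TRUE, remove_url = TRUE)
  # Keep terms that appear in at least 5% of the tweets
  dfm_trim(out, min_docfreq = 0.05)
}
# Example (equivalent to the explicit calls above):
# training_dfm <- build_dfm(myCorpusTrain)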
###########################################
## Prepare the two data frames out of the two DFMs (for the training and test sets);
## this is needed for RF, SVM, etc., which work with data frames, not with DFMs!
###########################################

# Extract a data frame from the training DFM
train_df <- as.data.frame(as.matrix(training_dfm))
str(train_df)
colnames(train_df) <- make.names(colnames(train_df), unique=TRUE)
# Add the Sentiment column of the training set back to the training-set data frame
table(docvars(training_corpus, "Sentiment"))
train_df$sentiment <- docvars(training_corpus, "Sentiment")
table(train_df$sentiment)
str(train_df$sentiment)

# Extract a data frame from the test DFM
test_df <- as.data.frame(as.matrix(test_dfm))
colnames(test_df) <- make.names(colnames(test_df), unique=TRUE)

###########################################
## Try a Random Forest model
###########################################

# Train the Random Forest classifier
set.seed(123)
RF <- randomForest(sentiment ~ ., data=train_df, type="classification")
predictRF <- predict(RF, newdata=test_df)
table(predictRF)
rf_results <- prop.table(table(predictRF))
rf_results

###########################################
## Try a Support Vector Machine model
###########################################

# Train the SVM classifier
set.seed(123)
SVM <- svm(sentiment ~ ., data=train_df, type = "C-classification", kernel='linear', cost=1)
predictSVM <- predict(SVM, newdata=test_df)
table(predictSVM)
svm_results <- prop.table(table(predictSVM))
svm_results

#######################################
#### Plot the NB, RF and SVM results
#######################################

library(ggplot2)
library(reshape2)

myFrame1 <- as.data.frame(prop.table(table(predictRF)))
str(myFrame1)
colnames(myFrame1)[1] <- "Prediction"
myFrame1$class <- c("RF", "RF", "RF")
str(myFrame1)

myFrame2 <- as.data.frame(prop.table(table(predicted_class)))
str(myFrame2)
colnames(myFrame2)[1] <- "Prediction"
myFrame2$class <- c("NB", "NB", "NB")
str(myFrame2)

myFrame3 <- as.data.frame(prop.table(table(predictSVM)))
colnames(myFrame3)[1] <- "Prediction"
myFrame3$class <- c("SVM", "SVM", "SVM")
str(myFrame3)

myFrame_tot <- rbind(myFrame1, myFrame2, myFrame3)
str(myFrame_tot)
df.long <- melt(myFrame_tot, id.vars=c("class", "Freq"))
str(df.long)
ggplot(df.long, aes(class, Freq, fill=value)) +
  geom_bar(position="dodge", stat="identity") +
  theme(axis.text.x = element_text(color="#993333", size=10, angle=90)) +
  coord_flip() +
  ylab(label="Frequency") + xlab("Algorithm") +
  ggtitle("RF vs. Naive Bayes vs. SVM predictions for the test set")

# Which of the three algorithms should we choose? Do some cross-validation!
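# (Optional) The fold indices below are drawn with split(sample(1:N), 1:folds), which ignores the
# class distribution. An alternative sketch, using createFolds() from the caret package loaded
# above, draws stratified folds that roughly preserve the share of negative/neutral/positive
# tweets. holdout_strat is our own name and is not used further below.
set.seed(123)
holdout_strat <- createFolds(docvars(myCorpusTrain, "Sentiment"), k = 4)
str(holdout_strat)  # a list of 4 index vectors that could replace 'holdout' in what follows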
###########################################
# Cross-validation with NB with K=4 (each test set of 118 tweets)
###########################################
summary(myCorpusTrain)
N <- ndoc(myCorpusTrain)
# Number of desired splits
folds <- 4
# Generate indices of holdout observations
holdout <- split(sample(1:N), 1:folds)
str(holdout)
# Check that each observation appears exactly once in the holdout object:
holdout %>% unlist() %>% length() == N
holdout[[1]]
# Create a docvar with a numeric document ID
docvars(myCorpusTrain, "id_numeric") <- 1:ndoc(myCorpusTrain)
summary(myCorpusTrain)
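# (Optional) The eight per-fold DFMs are built one by one in the code below. The same objects can
# be produced with a loop; this sketch reuses the build_dfm() helper defined earlier (our own
# wrapper; note that its stop-word list also includes "pic.twitter.com"). The names tr_fold_dfms
# and tr_other_dfms are ours and are not used further below.
tr_fold_dfms  <- list()   # DFM of each held-out fold
tr_other_dfms <- list()   # DFM of everything except that fold
for (k in 1:folds) {
  ids <- holdout[[k]]
  tr_fold_dfms[[k]]  <- build_dfm(corpus_subset(myCorpusTrain,  id_numeric %in% ids))
  tr_other_dfms[[k]] <- build_dfm(corpus_subset(myCorpusTrain, !id_numeric %in% ids))
}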
# Get training set_other1 (all the documents NOT in holdout[[1]]) and compute the dfm out of it
tr_other1 <- corpus_subset(myCorpusTrain, !id_numeric %in% holdout[[1]])
tr_other1_dfm <- dfm(tr_other1,
                     remove = c(stopwords("english"), "amp", "rt", "tco", "co",
                                "u", "t", "s", "ed", "https"),
                     remove_punct = TRUE, remove_numbers = TRUE, tolower = TRUE,
                     remove_symbols = TRUE, remove_twitter = TRUE,
                     remove_separators = TRUE, remove_url = TRUE)
tr_other1_dfm <- dfm_trim(tr_other1_dfm, min_docfreq = 0.05)

# Get training set_other2 (all the documents NOT in holdout[[2]]) and compute the dfm out of it
tr_other2 <- corpus_subset(myCorpusTrain, !id_numeric %in% holdout[[2]])
tr_other2_dfm <- dfm(tr_other2,
                     remove = c(stopwords("english"), "amp", "rt", "tco", "co",
                                "u", "t", "s", "ed", "https"),
                     remove_punct = TRUE, remove_numbers = TRUE, tolower = TRUE,
                     remove_symbols = TRUE, remove_twitter = TRUE,
                     remove_separators = TRUE, remove_url = TRUE)
tr_other2_dfm <- dfm_trim(tr_other2_dfm, min_docfreq = 0.05)

# Get training set_other3 (all the documents NOT in holdout[[3]]) and compute the dfm out of it
tr_other3 <- corpus_subset(myCorpusTrain, !id_numeric %in% holdout[[3]])
tr_other3_dfm <- dfm(tr_other3,
                     remove = c(stopwords("english"), "amp", "rt", "tco", "co",
                                "u", "t", "s", "ed", "https"),
                     remove_punct = TRUE, remove_numbers = TRUE, tolower = TRUE,
                     remove_symbols = TRUE, remove_twitter = TRUE,
                     remove_separators = TRUE, remove_url = TRUE)
tr_other3_dfm <- dfm_trim(tr_other3_dfm, min_docfreq = 0.05)

# Get training set_other4 (all the documents NOT in holdout[[4]]) and compute the dfm out of it
tr_other4 <- corpus_subset(myCorpusTrain, !id_numeric %in% holdout[[4]])
tr_other4_dfm <- dfm(tr_other4,
                     remove = c(stopwords("english"), "amp", "rt", "tco", "co",
                                "u", "t", "s", "ed", "https"),
                     remove_punct = TRUE, remove_numbers = TRUE, tolower = TRUE,
                     remove_symbols = TRUE, remove_twitter = TRUE,
                     remove_separators = TRUE, remove_url = TRUE)
tr_other4_dfm <- dfm_trim(tr_other4_dfm, min_docfreq = 0.05)

# Get training set1 (all the documents in holdout[[1]]) and compute the dfm out of it
tr_1 <- corpus_subset(myCorpusTrain, id_numeric %in% holdout[[1]])
tr_1_dfm <- dfm(tr_1,
                remove = c(stopwords("english"), "amp", "rt", "tco", "co",
                           "u", "t", "s", "ed", "https"),
                remove_punct = TRUE, remove_numbers = TRUE, tolower = TRUE,
                remove_symbols = TRUE, remove_twitter = TRUE,
                remove_separators = TRUE, remove_url = TRUE)
tr_1_dfm <- dfm_trim(tr_1_dfm, min_docfreq = 0.05)

# Get training set2 (all the documents in holdout[[2]]) and compute the dfm out of it
tr_2 <- corpus_subset(myCorpusTrain, id_numeric %in% holdout[[2]])
tr_2_dfm <- dfm(tr_2,
                remove = c(stopwords("english"), "amp", "rt", "tco", "co",
                           "u", "t", "s", "ed", "https"),
                remove_punct = TRUE, remove_numbers = TRUE, tolower = TRUE,
                remove_symbols = TRUE, remove_twitter = TRUE,
                remove_separators = TRUE, remove_url = TRUE)
tr_2_dfm <- dfm_trim(tr_2_dfm, min_docfreq = 0.05)

# Get training set3 (all the documents in holdout[[3]]) and compute the dfm out of it
tr_3 <- corpus_subset(myCorpusTrain, id_numeric %in% holdout[[3]])
tr_3_dfm <- dfm(tr_3,
                remove = c(stopwords("english"), "amp", "rt", "tco", "co",
                           "u", "t", "s", "ed", "https"),
                remove_punct = TRUE, remove_numbers = TRUE, tolower = TRUE,
                remove_symbols = TRUE, remove_twitter = TRUE,
                remove_separators = TRUE, remove_url = TRUE)
tr_3_dfm <- dfm_trim(tr_3_dfm, min_docfreq = 0.05)

# Get training set4 (all the documents in holdout[[4]]) and compute the dfm out of it
tr_4 <- corpus_subset(myCorpusTrain, id_numeric %in% holdout[[4]])
tr_4_dfm <- dfm(tr_4,
                remove = c(stopwords("english"), "amp", "rt", "tco", "co",
                           "u", "t", "s", "ed", "https"),
                remove_punct = TRUE, remove_numbers = TRUE, tolower = TRUE,
                remove_symbols = TRUE, remove_twitter = TRUE,
                remove_separators = TRUE, remove_url = TRUE)
tr_4_dfm <- dfm_trim(tr_4_dfm, min_docfreq = 0.05)

#######################################
# K=1 (fold 1) NB
#######################################
nb <- textmodel_nb(tr_other1_dfm, docvars(tr_other1_dfm, "Sentiment"), distribution = c("multinomial"))
summary(nb)
# Make the features identical by passing the training-fold DFM to dfm_select() as a pattern
test_dfm <- dfm_select(tr_1_dfm, tr_other1_dfm)
# Let's inspect how well the classification worked
actual_class1 <- docvars(test_dfm, "Sentiment")
predicted_class1 <- predict(nb, test_dfm)
table(predicted_class1)
prop.table(table(predicted_class1))
class_table1 <- table(actual_class1, predicted_class1)
class_table1
confusionMatrix(class_table1, mode = "everything")
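# (Check) The overall accuracy reported by confusionMatrix() above is simply the share of
# correctly classified tweets in the held-out fold, i.e. the diagonal of the cross-table over its total:
sum(diag(class_table1)) / sum(class_table1)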
#######################################
# K=2 (fold 2) NB
#######################################
nb <- textmodel_nb(tr_other2_dfm, docvars(tr_other2_dfm, "Sentiment"), distribution = c("multinomial"))
summary(nb)
# Make the features identical by passing the training-fold DFM to dfm_select() as a pattern
test_dfm <- dfm_select(tr_2_dfm, tr_other2_dfm)
# Let's inspect how well the classification worked
actual_class2 <- docvars(test_dfm, "Sentiment")
predicted_class2 <- predict(nb, test_dfm)
table(predicted_class2)
prop.table(table(predicted_class2))
class_table2 <- table(actual_class2, predicted_class2)
class_table2
confusionMatrix(class_table2, mode = "everything")

#######################################
# K=3 (fold 3) NB
#######################################
nb <- textmodel_nb(tr_other3_dfm, docvars(tr_other3_dfm, "Sentiment"), distribution = c("multinomial"))
summary(nb)
# Make the features identical by passing the training-fold DFM to dfm_select() as a pattern
test_dfm <- dfm_select(tr_3_dfm, tr_other3_dfm)
# Let's inspect how well the classification worked
actual_class3 <- docvars(test_dfm, "Sentiment")
predicted_class3 <- predict(nb, test_dfm)
table(predicted_class3)
prop.table(table(predicted_class3))
class_table3 <- table(actual_class3, predicted_class3)
class_table3
confusionMatrix(class_table3, mode = "everything")

#######################################
# K=4 (fold 4) NB
#######################################
nb <- textmodel_nb(tr_other4_dfm, docvars(tr_other4_dfm, "Sentiment"), distribution = c("multinomial"))
summary(nb)
# Make the features identical by passing the training-fold DFM to dfm_select() as a pattern
test_dfm <- dfm_select(tr_4_dfm, tr_other4_dfm)
# Let's inspect how well the classification worked
actual_class4 <- docvars(test_dfm, "Sentiment")
predicted_class4 <- predict(nb, test_dfm)
table(predicted_class4)
prop.table(table(predicted_class4))
class_table4 <- table(actual_class4, predicted_class4)
class_table4
confusionMatrix(class_table4, mode = "everything")

#######################################
# NB cross-validation: accuracy, precision & recall for each class in Sentiment
#######################################
cm1 <- confusionMatrix(class_table1, mode = "everything")
cm2 <- confusionMatrix(class_table2, mode = "everything")
cm3 <- confusionMatrix(class_table3, mode = "everything")
cm4 <- confusionMatrix(class_table4, mode = "everything")
str(cm1)
str(cm1$overall)
(cm1$overall[1] + cm2$overall[1] + cm3$overall[1] + cm4$overall[1]) / 4
accuracy_mean_nb <- rbind(cm1$overall[1], cm2$overall[1], cm3$overall[1], cm4$overall[1])
accuracy_mean_nb
accuracy_nb_avg <- mean(accuracy_mean_nb)
accuracy_nb_sd <- sd(accuracy_mean_nb)

# Recall (= Sensitivity; True Positives / Actual Positives, i.e. TP/(TP+FN))
# Precision (= Pos Pred Value; True Positives / Predicted Positives, i.e. TP/(TP+FP))
confusionMatrix(class_table1, mode = "everything")
str(cm1$byClass)
cm1$byClass[1:3]  # Sensitivity (= recall) for the three classes
cm1$byClass[7:9]  # Pos Pred Value (= precision) for the three classes

precision_nb <- as.data.frame(cbind(cm1$byClass[7:9], cm2$byClass[7:9], cm3$byClass[7:9], cm4$byClass[7:9]))
precision_nb$sentiment <- c("Negative", "Neutral", "Positive")
str(precision_nb)
precision_nb[1:4]
precision_nb_avg <- as.data.frame(rowMeans(precision_nb[1:4], na.rm = TRUE))
precision_nb_avg$sentiment <- c("Negative", "Neutral", "Positive")
str(precision_nb_avg)
colnames(precision_nb_avg)[1] <- "Precision NB"
str(precision_nb_avg)

recall_nb <- as.data.frame(cbind(cm1$byClass[1:3], cm2$byClass[1:3], cm3$byClass[1:3], cm4$byClass[1:3]))
recall_nb$sentiment <- c("Negative", "Neutral", "Positive")
str(recall_nb)
recall_nb[1:4]
recall_nb_avg <- as.data.frame(rowMeans(recall_nb[1:4], na.rm = TRUE))
recall_nb_avg$sentiment <- c("Negative", "Neutral", "Positive")
str(recall_nb_avg)
colnames(recall_nb_avg)[1] <- "Recall NB"
str(recall_nb_avg)

accuracy_nb_avg
accuracy_nb_sd
precision_nb_avg
recall_nb_avg
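# (Check) A simple benchmark for the average NB accuracy above is the majority-class baseline
# (the "No Information Rate" that confusionMatrix() also reports): always predicting the most
# frequent sentiment in the training data would already reach
max(prop.table(table(docvars(myCorpusTrain, "Sentiment"))))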
###########################################
# Cross-validation with RF with K=4 (each test set of 118 tweets)
###########################################
N <- nrow(train_df)
N
# Number of desired splits
folds <- 4
# Generate indices of holdout observations
holdout <- split(sample(1:N), 1:folds)
str(holdout)
# Check that each observation appears exactly once in the holdout object:
holdout %>% unlist() %>% length() == N
holdout[[1]]
# Training set with all the data except those in fold 1
data <- train_df[-holdout$`1`, ]
nrow(data)   # 472 - 118 = 354
# Held-out set with only the data in fold 1
newdata <- train_df[holdout$`1`, ]
nrow(newdata)

#######################################
# K=1 (fold 1) RF
#######################################
set.seed(123)
system.time(RF <- randomForest(sentiment ~ ., data = train_df[-holdout$`1`, ], type = "classification"))
predictRF1 <- predict(RF, newdata = train_df[holdout$`1`, ])
table(predictRF1)
rf_results1 <- prop.table(table(predictRF1))
rf_results1
newdata <- train_df[holdout$`1`, ]
table("Predictions" = predictRF1, "Actual" = newdata$sentiment)
# Let's use the confusionMatrix command
conf.rf1 <- confusionMatrix(predictRF1, newdata$sentiment)
conf.rf1

#######################################
# K=2 (fold 2) RF
#######################################
set.seed(123)
system.time(RF <- randomForest(sentiment ~ ., data = train_df[-holdout$`2`, ], type = "classification"))
predictRF2 <- predict(RF, newdata = train_df[holdout$`2`, ])
table(predictRF2)
rf_results2 <- prop.table(table(predictRF2))
rf_results2
newdata <- train_df[holdout$`2`, ]
table("Predictions" = predictRF2, "Actual" = newdata$sentiment)
# Let's use the confusionMatrix command
conf.rf2 <- confusionMatrix(predictRF2, newdata$sentiment)
conf.rf2

#######################################
# K=3 (fold 3) RF
#######################################
set.seed(123)
system.time(RF <- randomForest(sentiment ~ ., data = train_df[-holdout$`3`, ], type = "classification"))
predictRF3 <- predict(RF, newdata = train_df[holdout$`3`, ])
table(predictRF3)
rf_results3 <- prop.table(table(predictRF3))
rf_results3
newdata <- train_df[holdout$`3`, ]
table("Predictions" = predictRF3, "Actual" = newdata$sentiment)
# Let's use the confusionMatrix command
conf.rf3 <- confusionMatrix(predictRF3, newdata$sentiment)
conf.rf3

#######################################
# K=4 (fold 4) RF
#######################################
set.seed(123)
system.time(RF <- randomForest(sentiment ~ ., data = train_df[-holdout$`4`, ], type = "classification"))
predictRF4 <- predict(RF, newdata = train_df[holdout$`4`, ])
table(predictRF4)
rf_results4 <- prop.table(table(predictRF4))
rf_results4
newdata <- train_df[holdout$`4`, ]
table("Predictions" = predictRF4, "Actual" = newdata$sentiment)
# Let's use the confusionMatrix command
conf.rf4 <- confusionMatrix(predictRF4, newdata$sentiment)
conf.rf4

#######################################
# RF cross-validation: accuracy, precision & recall for each class in Sentiment
#######################################
(conf.rf1$overall[1] + conf.rf2$overall[1] + conf.rf3$overall[1] + conf.rf4$overall[1]) / 4
accuracy_mean_rf <- rbind(conf.rf1$overall[1], conf.rf2$overall[1], conf.rf3$overall[1], conf.rf4$overall[1])
accuracy_mean_rf
accuracy_rf_avg <- mean(accuracy_mean_rf)
accuracy_rf_sd <- sd(accuracy_mean_rf)

recall_rf <- as.data.frame(cbind(conf.rf1$byClass[1:3], conf.rf2$byClass[1:3], conf.rf3$byClass[1:3], conf.rf4$byClass[1:3]))
recall_rf$sentiment <- c("Negative", "Neutral", "Positive")
str(recall_rf)
recall_rf[1:4]
recall_rf_avg <- as.data.frame(rowMeans(recall_rf[1:4], na.rm = TRUE))
recall_rf_avg$sentiment <- c("Negative", "Neutral", "Positive")
colnames(recall_rf_avg)[1] <- "Recall RF"
str(recall_rf_avg)

precision_rf <- as.data.frame(cbind(conf.rf1$byClass[7:9], conf.rf2$byClass[7:9], conf.rf3$byClass[7:9], conf.rf4$byClass[7:9]))
precision_rf$sentiment <- c("Negative", "Neutral", "Positive")
str(precision_rf)
precision_rf[1:4]
precision_rf_avg <- as.data.frame(rowMeans(precision_rf[1:4], na.rm = TRUE))
precision_rf_avg$sentiment <- c("Negative", "Neutral", "Positive")
colnames(precision_rf_avg)[1] <- "Precision RF"
str(precision_rf_avg)

accuracy_rf_avg
accuracy_rf_sd
recall_rf_avg
precision_rf_avg
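# (Optional) randomForest also records how much each feature contributes to the classification
# (mean decrease in Gini impurity). This inspects the model fitted on the last fold above; any of
# the fold models, or a forest refitted on the full training set, could be used instead.
imp <- importance(RF)
head(imp[order(imp[, 1], decreasing = TRUE), , drop = FALSE], 10)   # 10 most important terms
varImpPlot(RF, n.var = 20, main = "RF: 20 most important features")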
###########################################
# Cross-validation with SVM with K=4 (each test set of 118 tweets)
###########################################

#######################################
# K=1 (fold 1) SVM
#######################################
set.seed(123)
system.time(SVM <- svm(sentiment ~ ., data = train_df[-holdout$`1`, ], type = "C-classification", kernel = 'linear', cost = 1))
predictSVM1 <- predict(SVM, newdata = train_df[holdout$`1`, ])
table(predictSVM1)
svm_results1 <- prop.table(table(predictSVM1))
svm_results1
newdata <- train_df[holdout$`1`, ]
table("Predictions" = predictSVM1, "Actual" = newdata$sentiment)
# Let's use the confusionMatrix command
conf.svm1 <- confusionMatrix(predictSVM1, newdata$sentiment)
conf.svm1

#######################################
# K=2 (fold 2) SVM
#######################################
set.seed(123)
system.time(SVM <- svm(sentiment ~ ., data = train_df[-holdout$`2`, ], type = "C-classification", kernel = 'linear', cost = 1))
predictSVM2 <- predict(SVM, newdata = train_df[holdout$`2`, ])
table(predictSVM2)
svm_results2 <- prop.table(table(predictSVM2))
svm_results2
newdata <- train_df[holdout$`2`, ]
table("Predictions" = predictSVM2, "Actual" = newdata$sentiment)
# Let's use the confusionMatrix command
conf.svm2 <- confusionMatrix(predictSVM2, newdata$sentiment)
conf.svm2

#######################################
# K=3 (fold 3) SVM
#######################################
set.seed(123)
system.time(SVM <- svm(sentiment ~ ., data = train_df[-holdout$`3`, ], type = "C-classification", kernel = 'linear', cost = 1))
predictSVM3 <- predict(SVM, newdata = train_df[holdout$`3`, ])
table(predictSVM3)
svm_results3 <- prop.table(table(predictSVM3))
svm_results3
newdata <- train_df[holdout$`3`, ]
table("Predictions" = predictSVM3, "Actual" = newdata$sentiment)
# Let's use the confusionMatrix command
conf.svm3 <- confusionMatrix(predictSVM3, newdata$sentiment)
conf.svm3

#######################################
# K=4 (fold 4) SVM
#######################################
set.seed(123)
system.time(SVM <- svm(sentiment ~ ., data = train_df[-holdout$`4`, ], type = "C-classification", kernel = 'linear', cost = 1))
predictSVM4 <- predict(SVM, newdata = train_df[holdout$`4`, ])
table(predictSVM4)
svm_results4 <- prop.table(table(predictSVM4))
svm_results4
newdata <- train_df[holdout$`4`, ]
table("Predictions" = predictSVM4, "Actual" = newdata$sentiment)
# Let's use the confusionMatrix command
conf.svm4 <- confusionMatrix(predictSVM4, newdata$sentiment)
conf.svm4

#######################################
# SVM cross-validation: accuracy, precision & recall for each class in Sentiment
#######################################
(conf.svm1$overall[1] + conf.svm2$overall[1] + conf.svm3$overall[1] + conf.svm4$overall[1]) / 4
accuracy_mean_svm <- rbind(conf.svm1$overall[1], conf.svm2$overall[1], conf.svm3$overall[1], conf.svm4$overall[1])
accuracy_mean_svm
accuracy_svm_avg <- mean(accuracy_mean_svm)
accuracy_svm_sd <- sd(accuracy_mean_svm)

recall_svm <- as.data.frame(cbind(conf.svm1$byClass[1:3], conf.svm2$byClass[1:3], conf.svm3$byClass[1:3], conf.svm4$byClass[1:3]))
recall_svm$sentiment <- c("Negative", "Neutral", "Positive")
str(recall_svm)
recall_svm[1:4]
recall_svm_avg <- as.data.frame(rowMeans(recall_svm[1:4], na.rm = TRUE))
recall_svm_avg$sentiment <- c("Negative", "Neutral", "Positive")
colnames(recall_svm_avg)[1] <- "Recall SVM"
str(recall_svm_avg)

precision_svm <- as.data.frame(cbind(conf.svm1$byClass[7:9], conf.svm2$byClass[7:9], conf.svm3$byClass[7:9], conf.svm4$byClass[7:9]))
precision_svm$sentiment <- c("Negative", "Neutral", "Positive")
str(precision_svm)
precision_svm[1:4]
precision_svm_avg <- as.data.frame(rowMeans(precision_svm[1:4], na.rm = TRUE))
precision_svm_avg$sentiment <- c("Negative", "Neutral", "Positive")
colnames(precision_svm_avg)[1] <- "Precision SVM"
str(precision_svm_avg)

accuracy_svm_avg
accuracy_svm_sd
recall_svm_avg
precision_svm_avg
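# (Optional) The fold-by-fold code above repeats the same pattern for RF and SVM. A compact
# sketch of the same procedure as a loop, reusing train_df and holdout from above; cv_accuracy()
# is our own helper, not part of any package, and it only reports per-fold accuracy.
cv_accuracy <- function(fit_fun) {
  sapply(holdout, function(ids) {
    fit  <- fit_fun(train_df[-ids, ])                  # train on everything but the fold
    pred <- predict(fit, newdata = train_df[ids, ])    # predict the held-out fold
    mean(pred == train_df$sentiment[ids])              # share of correct predictions
  })
}
set.seed(123)
cv_accuracy(function(d) svm(sentiment ~ ., data = d, type = "C-classification", kernel = "linear", cost = 1))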
"Accuracy NB" gb3 <- as.data.frame(accuracy_mean_svm ) colnames(gb3)[1] <- "Accuracy SVM" gb_tot <- cbind(gb1, gb2, gb3) gb_tot str(gb_tot) df.long_gb_tot<-melt(gb_tot) str(df.long_gb_tot) ggplot(df.long_gb_tot, aes(x=variable, y=value)) + geom_boxplot() + coord_flip() + xlab("Algorithm") + ylab(label="Value") + ggtitle("RF vs. Naive Bayes vs. SVM K-fold cross-validation (K=4): Accuracy") p2 <- ggplot(df.long_gb_tot, aes(x=variable, y=value)) + geom_boxplot() + coord_flip() + xlab("Algorithm") + ylab(label="Value") + ggtitle("RF vs. Naive Bayes K-fold vs. SVM cross-validation (K=4): Accuracy") cv_rf <- merge(recall_rf_avg, precision_rf_avg, by=c("sentiment")) str(cv_rf) cv_nb <- merge(recall_nb_avg, precision_nb_avg, by=c("sentiment")) cv_svm <- merge(recall_svm_avg, precision_svm_avg, by=c("sentiment")) cv <- merge(cv_rf, cv_nb, by=c("sentiment")) cv <- merge(cv, cv_svm, by=c("sentiment")) str(cv) cv<- cv[c(1,3,5,7,2, 4, 6)] str(cv) df.long2<-melt(cv,id.vars=c("sentiment")) str(df.long2) p <- ggplot(df.long2,aes(variable,value,fill=sentiment))+ geom_bar(position="dodge",stat="identity") + theme(axis.text.x = element_text(color="#993333", size=10, angle=90)) + coord_flip() + ylab(label="Frequency") + xlab("Algorithm") + ggtitle("RF vs. Naive Bayes vs. SVM K-fold cross-validation (K=4): Precision and Recall") # Plot everything together grid.arrange(p2, p, nrow=2) library(gridExtra) grid.arrange( tableGrob(cv, theme=tt1)) grid.table(cv) tt1 <- ttheme_default() t <-tableGrob(cv, theme=tt1) # Plot chart and table into one object grid.arrange(p, t, nrow=2, as.table=TRUE, heights=c(3,1)) # Plot together chart and table ggplot(df.long2,aes(variable,value,fill=sentiment))+ geom_bar(position="dodge",stat="identity") + theme(axis.text.x = element_text(color="#993333", size=10, angle=90)) + coord_flip() + ylab(label="Frequency") + xlab("Algorithm") + ggtitle("RF vs. Naive Bayes K-fold cross-validation (K=4)") + annotation_custom(tableGrob(cv), xmin=0.5, xmax=1, ymin=0.5, ymax=1)