rm(list=ls(all=TRUE))
getwd()
setwd("C:/Users/mw/Dropbox (Personale)/TOPIC MODEL")
getwd()

library(e1071)
library(caTools)
library(randomForest)
library(caret)
library(quanteda)
library(readtext)

### TRAINING-SET
# This dataset is a sample of tweets mentioning the official account of Donald Trump, "@realDonaldTrump",
# posted between 7 and 13 June 2016, written in English and coming from the US.
# Data were collected through the Twitter API, specifying the language and the origin of the tweets.
# The dataset includes a sample of around 482 tweets that were manually coded by a group of students.
# The coding stage involved detecting the sentiment towards Trump (negative, positive, neutral).
x <- read.csv("Trump-orig3.csv", stringsAsFactors = FALSE)
str(x)
table(x$Sentiment)
x$Sentiment <- as.factor(x$Sentiment)
str(x)
table(x$Sentiment)
prop.table(table(x$Sentiment))
myCorpusTrain <- corpus(x)
summary(myCorpusTrain)

### TEST-SET
# This is a sample of 1000 tweets written in English about Trump, published from 1.17.2018 to 1.19.2018.
x10 <- read.csv("Trump_tweets2.csv", stringsAsFactors = FALSE)
str(x10)
x10$X <- NULL
myCorpusTest <- corpus(x10)
summary(myCorpusTest)
MyTotalCorpus <- corpus(myCorpusTrain + myCorpusTest)
head(summary(MyTotalCorpus))

###########################################
# Try a random forest model
###########################################
myDfm <- dfm(MyTotalCorpus,
             remove = c(stopwords("english"), "pic.twitter.com", "rt", "tco", "co",
                        "u", "t", "s", "ed", "https"),
             remove_punct = TRUE, remove_numbers = TRUE, tolower = TRUE,
             remove_symbols = TRUE, remove_twitter = TRUE,
             remove_separators = TRUE, remove_url = TRUE)
topfeatures(myDfm, 20)  # 20 top words

# Keep terms that appear in 5% or more of the tweets.
trim <- dfm_trim(myDfm, min_docfreq = 0.05)
data <- as.data.frame(as.matrix(trim))
str(data)
colnames(data)
# This is important: randomForest cannot handle column names that begin with a space,
# comma, number or certain punctuation. make.names() below fixes such names (e.g. by
# putting a letter in front of a number, if any numbers are left in the dfm). Highly suggested.
colnames(data) <- make.names(colnames(data))
colnames(data)
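# NOTE (sketch, not part of the original workflow): in more recent quanteda releases
# (>= 2.0) the cleaning options above are passed to tokens() rather than dfm(), the
# remove_twitter option is no longer available, and a proportional docfreq threshold
# needs docfreq_type = "prop". Assuming quanteda >= 2.0, a rough equivalent of the
# recipe above would be (new object names with a _v2 suffix are illustrative only):
toks_v2 <- tokens(MyTotalCorpus,
                  remove_punct = TRUE, remove_numbers = TRUE, remove_symbols = TRUE,
                  remove_url = TRUE, remove_separators = TRUE)
toks_v2 <- tokens_remove(toks_v2, c(stopwords("english"), "pic.twitter.com", "rt",
                                    "tco", "co", "u", "t", "s", "ed", "https"))
myDfm_v2 <- dfm(toks_v2, tolower = TRUE)
trim_v2  <- dfm_trim(myDfm_v2, min_docfreq = 0.05, docfreq_type = "prop")  # 5% of tweets
data_v2  <- convert(trim_v2, to = "data.frame")  # first column holds the document id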
str(MyTotalCorpus)
MyTotalCorpus$documents$Sentiment   # docvars(MyTotalCorpus, "Sentiment") is the supported accessor
table(MyTotalCorpus$documents$Sentiment)
data$sentiment <- MyTotalCorpus$documents$Sentiment
table(data$sentiment)
# train <- subset(data, sentiment!="NA's")
train <- subset(data, !is.na(sentiment))
nrow(train)
test <- subset(data, is.na(sentiment))
nrow(test)
colnames(train)
table(train$sentiment)
str(train$sentiment)

set.seed(123)
system.time(RF <- randomForest(sentiment ~ ., data = train, type = "classification"))
predictRF <- predict(RF, newdata = test)
table(predictRF)
rf_results <- prop.table(table(predictRF))
rf_results

###########################################
## Try a Naive Bayes model
###########################################
summary(MyTotalCorpus)
str(MyTotalCorpus)
str(MyTotalCorpus$documents$Sentiment)
table(MyTotalCorpus$documents$Sentiment)
table(is.na(MyTotalCorpus$documents$Sentiment))

# get the training set (the documents with a non-missing Sentiment) and compute the dfm out of it
training_corpus <- corpus_subset(MyTotalCorpus, !is.na(Sentiment))
# or you can directly start from the corpus of the training-set
training_corpus <- myCorpusTrain
summary(training_corpus)
training_dfm <- dfm(training_corpus,
                    remove = c(stopwords("english"), "amp", "rt", "tco", "co",
                               "u", "t", "s", "ed", "https"),
                    remove_punct = TRUE, remove_numbers = TRUE, tolower = TRUE,
                    remove_symbols = TRUE, remove_twitter = TRUE,
                    remove_separators = TRUE, remove_url = TRUE)
topfeatures(training_dfm, 20)  # 20 top words
# Keep terms that appear in 5% or more of the tweets (tweets are very short texts...)
training_dfm <- dfm_trim(training_dfm, min_docfreq = 0.05)

# get the test set (the documents with a missing Sentiment) and compute the dfm out of it
test_corpus <- corpus_subset(MyTotalCorpus, is.na(Sentiment))
# or you can directly start from the corpus of the test-set
test_corpus <- myCorpusTest
summary(test_corpus)
test_dfm <- dfm(test_corpus,
                remove = c(stopwords("english"), "amp", "rt", "tco", "co",
                           "u", "t", "s", "ed", "https"),
                remove_punct = TRUE, remove_numbers = TRUE, tolower = TRUE,
                remove_symbols = TRUE, remove_twitter = TRUE,
                remove_separators = TRUE, remove_url = TRUE)
topfeatures(test_dfm, 20)  # 20 top words
# Keep terms that appear in 5% or more of the tweets (tweets are very short texts...)
test_dfm <- dfm_trim(test_dfm, min_docfreq = 0.05)

# train the Naive Bayes classifier using textmodel_nb() and a multinomial distribution
nb <- textmodel_nb(training_dfm, docvars(training_dfm, "Sentiment"), distribution = "multinomial")
summary(nb)
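# NOTE (sketch, not part of the original script): essentially the same cleaning recipe
# has now been written out three times and is repeated again for every cross-validation
# fold below. A small helper keeps it in one place; make_tweet_dfm() is a hypothetical
# name and assumes the same quanteda version used above.
make_tweet_dfm <- function(corp) {
  d <- dfm(corp,
           remove = c(stopwords("english"), "amp", "rt", "tco", "co",
                      "u", "t", "s", "ed", "https"),
           remove_punct = TRUE, remove_numbers = TRUE, tolower = TRUE,
           remove_symbols = TRUE, remove_twitter = TRUE,
           remove_separators = TRUE, remove_url = TRUE)
  dfm_trim(d, min_docfreq = 0.05)   # keep terms appearing in at least 5% of the tweets
}
# e.g. training_dfm <- make_tweet_dfm(training_corpus)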
# Let's make the features identical by passing training_dfm to dfm_select() as a pattern.
test_dfm <- dfm_select(test_dfm, training_dfm)
predicted_class <- predict(nb, test_dfm)
table(predicted_class)
str(predicted_class)
nb_results <- prop.table(table(predicted_class))
nb_results

#######################################
#### plot RF and NB results
#######################################
library(ggplot2)   # already attached via caret, but loaded explicitly here
myFrame1 <- as.data.frame(prop.table(table(predictRF)))
str(myFrame1)
colnames(myFrame1)[1] <- "Prediction"
myFrame1$class <- c("RF", "RF", "RF")
str(myFrame1)
# myFrame1[,1] = NULL
myFrame2 <- as.data.frame(prop.table(table(predicted_class)))
str(myFrame2)
colnames(myFrame2)[1] <- "Prediction"
myFrame2$class <- c("NB", "NB", "NB")
str(myFrame2)
# myFrame2[,1] = NULL
myFrame_tot <- rbind(myFrame1, myFrame2)
str(myFrame_tot)
library(reshape2)
df.long <- melt(myFrame_tot, id.vars = c("class", "Freq"))
str(df.long)
ggplot(df.long, aes(class, Freq, fill = value)) +
  geom_bar(position = "dodge", stat = "identity") +
  theme(axis.text.x = element_text(color = "#993333", size = 10, angle = 90)) +
  coord_flip() +
  ylab(label = "Frequency") + xlab("Algorithm") +
  ggtitle("RF vs. Naive Bayes predictions for the test-set")

###########################################
# Cross-validation with NB with K=4 (each held-out fold is about a quarter of the training-set)
###########################################
summary(myCorpusTrain)
N <- ndoc(myCorpusTrain)
# Number of desired splits
folds <- 4
# Generate indices of holdout observations
holdout <- split(sample(1:N), 1:folds)
str(holdout)
# Check that each observation appears exactly once in the holdout object:
holdout %>% unlist() %>% length() == N
holdout[[1]]
# create a docvar with a numeric document ID
docvars(myCorpusTrain, "id_numeric") <- 1:ndoc(myCorpusTrain)
summary(myCorpusTrain)
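# NOTE (sketch, not part of the original script): the blocks below build the training
# and held-out dfm for each fold one by one. As a compact alternative, and assuming the
# hypothetical make_tweet_dfm() helper sketched earlier, all training/held-out pairs can
# be built in one loop:
fold_dfms <- lapply(holdout, function(idx) {
  list(train = make_tweet_dfm(corpus_subset(myCorpusTrain, !id_numeric %in% idx)),
       test  = make_tweet_dfm(corpus_subset(myCorpusTrain,  id_numeric %in% idx)))
})
# fold_dfms[[1]]$train then corresponds to tr_other1_dfm below, and fold_dfms[[1]]$test to tr_1_dfm.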
# get training set_other1 (all the documents NOT in holdout[[1]]) and compute the dfm out of it
tr_other1 <- corpus_subset(myCorpusTrain, !id_numeric %in% holdout[[1]])
tr_other1_dfm <- dfm(tr_other1,
                     remove = c(stopwords("english"), "amp", "rt", "tco", "co",
                                "u", "t", "s", "ed", "https"),
                     remove_punct = TRUE, remove_numbers = TRUE, tolower = TRUE,
                     remove_symbols = TRUE, remove_twitter = TRUE,
                     remove_separators = TRUE, remove_url = TRUE)
tr_other1_dfm <- dfm_trim(tr_other1_dfm, min_docfreq = 0.05)

# get training set_other2 (all the documents NOT in holdout[[2]]) and compute the dfm out of it
tr_other2 <- corpus_subset(myCorpusTrain, !id_numeric %in% holdout[[2]])
tr_other2_dfm <- dfm(tr_other2,
                     remove = c(stopwords("english"), "amp", "rt", "tco", "co",
                                "u", "t", "s", "ed", "https"),
                     remove_punct = TRUE, remove_numbers = TRUE, tolower = TRUE,
                     remove_symbols = TRUE, remove_twitter = TRUE,
                     remove_separators = TRUE, remove_url = TRUE)
tr_other2_dfm <- dfm_trim(tr_other2_dfm, min_docfreq = 0.05)

# get training set_other3 (all the documents NOT in holdout[[3]]) and compute the dfm out of it
tr_other3 <- corpus_subset(myCorpusTrain, !id_numeric %in% holdout[[3]])
tr_other3_dfm <- dfm(tr_other3,
                     remove = c(stopwords("english"), "amp", "rt", "tco", "co",
                                "u", "t", "s", "ed", "https"),
                     remove_punct = TRUE, remove_numbers = TRUE, tolower = TRUE,
                     remove_symbols = TRUE, remove_twitter = TRUE,
                     remove_separators = TRUE, remove_url = TRUE)
tr_other3_dfm <- dfm_trim(tr_other3_dfm, min_docfreq = 0.05)

# get training set_other4 (all the documents NOT in holdout[[4]]) and compute the dfm out of it
tr_other4 <- corpus_subset(myCorpusTrain, !id_numeric %in% holdout[[4]])
tr_other4_dfm <- dfm(tr_other4,
                     remove = c(stopwords("english"), "amp", "rt", "tco", "co",
                                "u", "t", "s", "ed", "https"),
                     remove_punct = TRUE, remove_numbers = TRUE, tolower = TRUE,
                     remove_symbols = TRUE, remove_twitter = TRUE,
                     remove_separators = TRUE, remove_url = TRUE)
tr_other4_dfm <- dfm_trim(tr_other4_dfm, min_docfreq = 0.05)

# get fold 1 (all the documents in holdout[[1]], used as the held-out test-set) and compute the dfm out of it
tr_1 <- corpus_subset(myCorpusTrain, id_numeric %in% holdout[[1]])
tr_1_dfm <- dfm(tr_1,
                remove = c(stopwords("english"), "amp", "rt", "tco", "co",
                           "u", "t", "s", "ed", "https"),
                remove_punct = TRUE, remove_numbers = TRUE, tolower = TRUE,
                remove_symbols = TRUE, remove_twitter = TRUE,
                remove_separators = TRUE, remove_url = TRUE)
tr_1_dfm <- dfm_trim(tr_1_dfm, min_docfreq = 0.05)

# get fold 2 (all the documents in holdout[[2]], used as the held-out test-set) and compute the dfm out of it
tr_2 <- corpus_subset(myCorpusTrain, id_numeric %in% holdout[[2]])
tr_2_dfm <- dfm(tr_2,
                remove = c(stopwords("english"), "amp", "rt", "tco", "co",
                           "u", "t", "s", "ed", "https"),
                remove_punct = TRUE, remove_numbers = TRUE, tolower = TRUE,
                remove_symbols = TRUE, remove_twitter = TRUE,
                remove_separators = TRUE, remove_url = TRUE)
tr_2_dfm <- dfm_trim(tr_2_dfm, min_docfreq = 0.05)

# get fold 3 (all the documents in holdout[[3]], used as the held-out test-set) and compute the dfm out of it
tr_3 <- corpus_subset(myCorpusTrain, id_numeric %in% holdout[[3]])
tr_3_dfm <- dfm(tr_3,
                remove = c(stopwords("english"), "amp", "rt", "tco", "co",
                           "u", "t", "s", "ed", "https"),
                remove_punct = TRUE, remove_numbers = TRUE, tolower = TRUE,
                remove_symbols = TRUE, remove_twitter = TRUE,
                remove_separators = TRUE, remove_url = TRUE)
tr_3_dfm <- dfm_trim(tr_3_dfm, min_docfreq = 0.05)

# get fold 4 (all the documents in holdout[[4]], used as the held-out test-set) and compute the dfm out of it
tr_4 <- corpus_subset(myCorpusTrain, id_numeric %in% holdout[[4]])
tr_4_dfm <- dfm(tr_4,
                remove = c(stopwords("english"), "amp", "rt", "tco", "co",
                           "u", "t", "s", "ed", "https"),
                remove_punct = TRUE, remove_numbers = TRUE, tolower = TRUE,
                remove_symbols = TRUE, remove_twitter = TRUE,
                remove_separators = TRUE, remove_url = TRUE)
tr_4_dfm <- dfm_trim(tr_4_dfm, min_docfreq = 0.05)

#######################################
# K=1 (train 1) NB
#######################################
nb <- textmodel_nb(tr_other1_dfm, docvars(tr_other1_dfm, "Sentiment"), distribution = "multinomial")
summary(nb)
# Let's make the features identical by passing the training dfm (tr_other1_dfm) to dfm_select() as a pattern.
test_dfm <- dfm_select(tr_1_dfm, tr_other1_dfm)
# Let's inspect how well the classification worked
actual_class1 <- docvars(test_dfm, "Sentiment")
predicted_class1 <- predict(nb, test_dfm)
table(predicted_class1)
prop.table(table(predicted_class1))
class_table1 <- table(actual_class1, predicted_class1)
class_table1
confusionMatrix(class_table1, mode = "everything")
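# NOTE (sketch, not part of the original script): the K=2, K=3 and K=4 blocks below
# repeat the step above for the remaining folds. Assuming the fold_dfms list from the
# earlier sketch, the same train/evaluate step can be run for every fold in one loop:
nb_cms <- lapply(fold_dfms, function(f) {
  nb_k   <- textmodel_nb(f$train, docvars(f$train, "Sentiment"), distribution = "multinomial")
  test_k <- dfm_select(f$test, f$train)            # align features with the training dfm
  cm_k   <- table(docvars(test_k, "Sentiment"), predict(nb_k, test_k))
  confusionMatrix(cm_k, mode = "everything")
})
# nb_cms[[1]] corresponds to the confusion matrix of the K=1 block above.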
#######################################
# K=2 (train 2) NB
#######################################
nb <- textmodel_nb(tr_other2_dfm, docvars(tr_other2_dfm, "Sentiment"), distribution = "multinomial")
summary(nb)
# Let's make the features identical by passing the training dfm (tr_other2_dfm) to dfm_select() as a pattern.
test_dfm <- dfm_select(tr_2_dfm, tr_other2_dfm)
# Let's inspect how well the classification worked
actual_class2 <- docvars(test_dfm, "Sentiment")
predicted_class2 <- predict(nb, test_dfm)
table(predicted_class2)
prop.table(table(predicted_class2))
class_table2 <- table(actual_class2, predicted_class2)
class_table2
confusionMatrix(class_table2, mode = "everything")

#######################################
# K=3 (train 3) NB
#######################################
nb <- textmodel_nb(tr_other3_dfm, docvars(tr_other3_dfm, "Sentiment"), distribution = "multinomial")
summary(nb)
# Let's make the features identical by passing the training dfm (tr_other3_dfm) to dfm_select() as a pattern.
test_dfm <- dfm_select(tr_3_dfm, tr_other3_dfm)
# Let's inspect how well the classification worked
actual_class3 <- docvars(test_dfm, "Sentiment")
predicted_class3 <- predict(nb, test_dfm)
table(predicted_class3)
prop.table(table(predicted_class3))
class_table3 <- table(actual_class3, predicted_class3)
class_table3
confusionMatrix(class_table3, mode = "everything")

#######################################
# K=4 (train 4) NB
#######################################
nb <- textmodel_nb(tr_other4_dfm, docvars(tr_other4_dfm, "Sentiment"), distribution = "multinomial")
summary(nb)
# Let's make the features identical by passing the training dfm (tr_other4_dfm) to dfm_select() as a pattern.
test_dfm <- dfm_select(tr_4_dfm, tr_other4_dfm)
# Let's inspect how well the classification worked
actual_class4 <- docvars(test_dfm, "Sentiment")
predicted_class4 <- predict(nb, test_dfm)
table(predicted_class4)
prop.table(table(predicted_class4))
class_table4 <- table(actual_class4, predicted_class4)
class_table4
confusionMatrix(class_table4, mode = "everything")

#######################################
# NB cross-validation accuracy, precision & recall for each class in Sentiment
#######################################
cm1 <- confusionMatrix(class_table1, mode = "everything")
cm2 <- confusionMatrix(class_table2, mode = "everything")
cm3 <- confusionMatrix(class_table3, mode = "everything")
cm4 <- confusionMatrix(class_table4, mode = "everything")
str(cm1)
str(cm1$overall)
(cm1$overall[1] + cm2$overall[1] + cm3$overall[1] + cm4$overall[1]) / 4
accuracy_mean_nb <- rbind(cm1$overall[1], cm2$overall[1], cm3$overall[1], cm4$overall[1])
accuracy_mean_nb
accuracy_nb_avg <- mean(accuracy_mean_nb)
accuracy_nb_sd <- sd(accuracy_mean_nb)

# Recall (= Sensitivity; True Positive / Actual Positive, or TP/(TP+FN))
# Precision (= Pos Pred Value; True Positive / Predicted Positive, or TP/(TP+FP))
confusionMatrix(class_table1, mode = "everything")
str(cm1$byClass)
cm1$byClass[1:3]  # recall (sensitivity) for the three classes
cm1$byClass[7:9]  # precision (Pos Pred Value) for the three classes

precision_nb <- as.data.frame(cbind(cm1$byClass[7:9], cm2$byClass[7:9], cm3$byClass[7:9], cm4$byClass[7:9]))
precision_nb$sentiment <- c("Negative", "Neutral", "Positive")
str(precision_nb)
precision_nb[1:4]
precision_nb_avg <- as.data.frame(rowMeans(precision_nb[1:4], na.rm = TRUE))
precision_nb_avg$sentiment <- c("Negative", "Neutral", "Positive")
str(precision_nb_avg)
colnames(precision_nb_avg)[1] <- "Precision NB"
str(precision_nb_avg)

recall_nb <- as.data.frame(cbind(cm1$byClass[1:3], cm2$byClass[1:3], cm3$byClass[1:3], cm4$byClass[1:3]))
recall_nb$sentiment <- c("Negative", "Neutral", "Positive")
str(recall_nb)
recall_nb[1:4]
recall_nb_avg <- as.data.frame(rowMeans(recall_nb[1:4], na.rm = TRUE))
recall_nb_avg$sentiment <- c("Negative", "Neutral", "Positive")
str(recall_nb_avg)
colnames(recall_nb_avg)[1] <- "Recall NB"
str(recall_nb_avg)

accuracy_nb_avg
accuracy_nb_sd
precision_nb_avg
recall_nb_avg
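# NOTE (sketch, not part of the original script): caret's byClass matrix also contains
# an "F1" column (assuming a caret version that reports F1), so a per-class F1 averaged
# over the four folds can be computed the same way as precision and recall above:
f1_nb <- cbind(cm1$byClass[, "F1"], cm2$byClass[, "F1"],
               cm3$byClass[, "F1"], cm4$byClass[, "F1"])
f1_nb_avg <- rowMeans(f1_nb, na.rm = TRUE)
f1_nb_avg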
###########################################
# cross-validation with RF with K=4 (each test fold of 118 tweets)
###########################################
library(randomForest)
N <- nrow(train)
# Number of desired splits
folds <- 4
# Generate indices of holdout observations
holdout <- split(sample(1:N), 1:folds)
str(holdout)
# Check that each observation appears exactly once in the holdout object:
holdout %>% unlist() %>% length() == N
holdout[[1]]
# training set: all the rows except those in the sampled fold 1
data <- train[-holdout$`1`, ]
nrow(data)
472 - 118   # sanity check: 472 labelled tweets minus the 118 held out
# test set: only the rows in the sampled fold 1
newdata <- train[holdout$`1`, ]
nrow(newdata)

#######################################
# K=1 (train 1) RF
#######################################
set.seed(123)
system.time(RF <- randomForest(sentiment ~ ., data = train[-holdout$`1`, ], type = "classification"))
predictRF1 <- predict(RF, newdata = train[holdout$`1`, ])
table(predictRF1)
rf_results1 <- prop.table(table(predictRF1))
rf_results1
newdata <- train[holdout$`1`, ]
table("Predictions" = predictRF1, "Actual" = newdata$sentiment)
# Let's use the confusionMatrix command
conf.rf1 <- confusionMatrix(predictRF1, newdata$sentiment)
conf.rf1

#######################################
# K=2 (train 2) RF
#######################################
set.seed(123)
system.time(RF <- randomForest(sentiment ~ ., data = train[-holdout$`2`, ], type = "classification"))
predictRF2 <- predict(RF, newdata = train[holdout$`2`, ])
table(predictRF2)
rf_results2 <- prop.table(table(predictRF2))
rf_results2
newdata <- train[holdout$`2`, ]
table("Predictions" = predictRF2, "Actual" = newdata$sentiment)
# Let's use the confusionMatrix command
conf.rf2 <- confusionMatrix(predictRF2, newdata$sentiment)
conf.rf2

#######################################
# K=3 (train 3) RF
#######################################
set.seed(123)
system.time(RF <- randomForest(sentiment ~ ., data = train[-holdout$`3`, ], type = "classification"))
predictRF3 <- predict(RF, newdata = train[holdout$`3`, ])
table(predictRF3)
rf_results3 <- prop.table(table(predictRF3))
rf_results3
newdata <- train[holdout$`3`, ]
table("Predictions" = predictRF3, "Actual" = newdata$sentiment)
# Let's use the confusionMatrix command
conf.rf3 <- confusionMatrix(predictRF3, newdata$sentiment)
conf.rf3

#######################################
# K=4 (train 4) RF
#######################################
set.seed(123)
system.time(RF <- randomForest(sentiment ~ ., data = train[-holdout$`4`, ], type = "classification"))
predictRF4 <- predict(RF, newdata = train[holdout$`4`, ])
table(predictRF4)
rf_results4 <- prop.table(table(predictRF4))
rf_results4
newdata <- train[holdout$`4`, ]
table("Predictions" = predictRF4, "Actual" = newdata$sentiment)
# Let's use the confusionMatrix command
conf.rf4 <- confusionMatrix(predictRF4, newdata$sentiment)
conf.rf4
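# NOTE (sketch, not part of the original script): the four RF fold blocks above can be
# collapsed into a single loop over the holdout indices (same seed and formula as above):
rf_cms <- lapply(holdout, function(idx) {
  set.seed(123)
  rf_k <- randomForest(sentiment ~ ., data = train[-idx, ], type = "classification")
  confusionMatrix(predict(rf_k, newdata = train[idx, ]), train[idx, ]$sentiment)
})
# rf_cms[[1]] corresponds to conf.rf1 above.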
#######################################
# RF cross-validation accuracy, precision & recall for each class in Sentiment
#######################################
(conf.rf1$overall[1] + conf.rf2$overall[1] + conf.rf3$overall[1] + conf.rf4$overall[1]) / 4
accuracy_mean_rf <- rbind(conf.rf1$overall[1], conf.rf2$overall[1], conf.rf3$overall[1], conf.rf4$overall[1])
accuracy_mean_rf
accuracy_rf_avg <- mean(accuracy_mean_rf)
accuracy_rf_sd <- sd(accuracy_mean_rf)

recall_rf <- as.data.frame(cbind(conf.rf1$byClass[1:3], conf.rf2$byClass[1:3], conf.rf3$byClass[1:3], conf.rf4$byClass[1:3]))
recall_rf$sentiment <- c("Negative", "Neutral", "Positive")
str(recall_rf)
recall_rf[1:4]
recall_rf_avg <- as.data.frame(rowMeans(recall_rf[1:4], na.rm = TRUE))
recall_rf_avg$sentiment <- c("Negative", "Neutral", "Positive")
colnames(recall_rf_avg)[1] <- "Recall RF"
str(recall_rf_avg)

precision_rf <- as.data.frame(cbind(conf.rf1$byClass[7:9], conf.rf2$byClass[7:9], conf.rf3$byClass[7:9], conf.rf4$byClass[7:9]))
precision_rf$sentiment <- c("Negative", "Neutral", "Positive")
str(precision_rf)
precision_rf[1:4]
precision_rf_avg <- as.data.frame(rowMeans(precision_rf[1:4], na.rm = TRUE))
precision_rf_avg$sentiment <- c("Negative", "Neutral", "Positive")
colnames(precision_rf_avg)[1] <- "Precision RF"
str(precision_rf_avg)

accuracy_rf_avg
accuracy_rf_sd
recall_rf_avg
precision_rf_avg

######################
######################
## comparing NB and RF cross-validation and plotting the results
######################
######################
gb1 <- as.data.frame(accuracy_mean_rf)
colnames(gb1)[1] <- "Accuracy RF"
gb2 <- as.data.frame(accuracy_mean_nb)
colnames(gb2)[1] <- "Accuracy NB"
gb_tot <- cbind(gb1, gb2)
gb_tot
str(gb_tot)
df.long_gb_tot <- melt(gb_tot)
str(df.long_gb_tot)
ggplot(df.long_gb_tot, aes(x = variable, y = value)) +
  geom_boxplot() +
  coord_flip() +
  xlab("Algorithm") + ylab(label = "Value") +
  ggtitle("RF vs. Naive Bayes K-fold cross-validation (K=4): Accuracy")
p2 <- ggplot(df.long_gb_tot, aes(x = variable, y = value)) +
  geom_boxplot() +
  coord_flip() +
  xlab("Algorithm") + ylab(label = "Value") +
  ggtitle("RF vs. Naive Bayes K-fold cross-validation (K=4): Accuracy")

cv_rf <- merge(recall_rf_avg, precision_rf_avg, by = c("sentiment"))
str(cv_rf)
cv_nb <- merge(recall_nb_avg, precision_nb_avg, by = c("sentiment"))
cv <- merge(cv_rf, cv_nb, by = c("sentiment"))
str(cv)
cv <- cv[c(1, 5, 3, 4, 2)]
str(cv)
library(gridExtra)
grid.table(cv)
tt1 <- ttheme_default()
t <- tableGrob(cv, theme = tt1)
grid.arrange(tableGrob(cv, theme = tt1))
str(cv)
cv <- cv[c(1, 4, 5, 2, 3)]
str(cv)
df.long2 <- melt(cv, id.vars = c("sentiment"))
str(df.long2)
p <- ggplot(df.long2, aes(variable, value, fill = sentiment)) +
  geom_bar(position = "dodge", stat = "identity") +
  theme(axis.text.x = element_text(color = "#993333", size = 10, angle = 90)) +
  coord_flip() +
  ylab(label = "Value") + xlab("Algorithm") +
  ggtitle("RF vs. Naive Bayes K-fold cross-validation (K=4): Precision and Recall")

# Plot the chart and the table together in one figure
grid.arrange(p, t, nrow = 2, as.table = TRUE, heights = c(3, 1))

# Embed the table inside the chart
p3 <- ggplot(df.long2, aes(variable, value, fill = sentiment)) +
  geom_bar(position = "dodge", stat = "identity") +
  theme(axis.text.x = element_text(color = "#993333", size = 10, angle = 90)) +
  coord_flip() +
  ylab(label = "Value") + xlab("Algorithm") +
  ggtitle("RF vs. Naive Bayes K-fold cross-validation (K=4)") +
  annotation_custom(tableGrob(cv), xmin = 0.5, xmax = 1, ymin = 0.5, ymax = 1)

# Plot everything together
grid.arrange(p2, p, nrow = 2)
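# NOTE (sketch, not part of the original script): to keep the final outputs, the
# comparison table and the combined figure can be written to disk (file names are
# illustrative; ggsave can write a grob such as the output of arrangeGrob in recent
# ggplot2 versions):
write.csv(cv, "rf_vs_nb_precision_recall.csv", row.names = FALSE)
final_plot <- arrangeGrob(p2, p, nrow = 2)
ggsave("rf_vs_nb_crossvalidation.png", final_plot, width = 8, height = 10, dpi = 300)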