rm(list=ls(all=TRUE))
setwd("C:/Users/luigi/Dropbox/TOPIC MODEL")
getwd()

library(quanteda)
library(readtext)
library(caTools)
library(randomForest)
library(caret)
library(naivebayes)
library(car)
library(ggplot2)
library(dplyr)
library(reshape2)

#####################################################
# FIRST STEP: let's create the DfM for the training set
#####################################################

# let's focus on UK tweets
x <- read.csv("uk_train.csv", stringsAsFactors = FALSE)
myCorpusTwitterTrain <- corpus(x)
tok2 <- tokens(myCorpusTwitterTrain, remove_punct = TRUE, remove_numbers = TRUE,
               remove_symbols = TRUE, split_hyphens = TRUE, remove_separators = TRUE)
tok2 <- tokens_remove(tok2, stopwords("en"))
tok2 <- tokens_remove(tok2, c("0*"))  # drop tokens starting with "0" (glob pattern)
tok2 <- tokens_wordstem(tok2)
Dfm_train <- dfm(tok2)
Dfm_train <- dfm_trim(Dfm_train, min_docfreq = 2)

#####################################################
# SECOND STEP: let's create the DfM for the test set
#####################################################

x10 <- read.csv("uk_test2.csv", stringsAsFactors = FALSE)
myCorpusTwitterTest <- corpus(x10)
tok <- tokens(myCorpusTwitterTest, remove_punct = TRUE, remove_numbers = TRUE,
              remove_symbols = TRUE, split_hyphens = TRUE, remove_separators = TRUE)
tok <- tokens_remove(tok, stopwords("en"))
tok <- tokens_remove(tok, c("0*"))
tok <- tokens_wordstem(tok)
Dfm_test <- dfm(tok)
Dfm_test <- dfm_trim(Dfm_test, min_docfreq = 2)

#####################################################
# THIRD STEP: let's make the features identical between the train and test set
#####################################################

test_dfm <- dfm_match(Dfm_test, features = featnames(Dfm_train))
setequal(featnames(Dfm_train), featnames(test_dfm))  # should return TRUE

#####################################################
# FOURTH STEP: let's convert the DfMs into matrices for the ML algorithms to work
#####################################################

train <- as.matrix(Dfm_train)
test <- as.matrix(test_dfm)

#####################################################
# FIFTH STEP: let's estimate a ML model
#####################################################

#####################################################
# Let's start with a Naive Bayes model
#####################################################

system.time(NB22 <- multinomial_naive_bayes(x = train,
                                            y = as.factor(Dfm_train@docvars$Sentiment),
                                            laplace = 1))
NB_prob <- as.data.frame(NB22$params)
NB_prob$Feature <- row.names(NB_prob)
# let's identify, for each feature, the largest of its three conditional
# probabilities (negative, neutral, positive) and label the feature accordingly
NB_prob$winner <- apply(NB_prob[c(1:3)], 1, FUN = max)
NB_prob$sentiment <- ifelse(NB_prob$winner == NB_prob$negative, "negative",
                     ifelse(NB_prob$winner == NB_prob$positive, "positive", "neutral"))
str(NB_prob)
negatives <- NB_prob[which(NB_prob$sentiment == "negative"), ]
positives <- NB_prob[which(NB_prob$sentiment == "positive"), ]
neutrals <- NB_prob[which(NB_prob$sentiment == "neutral"), ]
# the features with the largest conditional probability within each class
print(head(negatives[order(negatives$winner, decreasing = TRUE), ], 15))  # negative words
print(head(positives[order(positives$winner, decreasing = TRUE), ], 15))  # positive words
print(head(neutrals[order(neutrals$winner, decreasing = TRUE), ], 15))    # neutral words
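# OPTIONAL: a quick in-sample sanity check of the Naive Bayes model.
# This is a minimal sketch, not part of the original pipeline: it predicts
# the training set itself, so the accuracy it reports is optimistic and only
# tells you whether the model learned anything at all. confusionMatrix()
# comes from caret, loaded above; predicted_nb_train is a new helper object.
predicted_nb_train <- predict(NB22, train)
confusionMatrix(predicted_nb_train, as.factor(Dfm_train@docvars$Sentiment))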
# let's extract the 15 most negative, 15 most positive and 15 most neutral
# features in terms of their conditional probabilities
df1 <- top_n(negatives, 15, winner)
df2 <- top_n(positives, 15, winner)
df3 <- top_n(neutrals, 15, winner)
NB_prob_new <- rbind(df1, df2, df3)
# reorder the features
NB_prob_new <- mutate(NB_prob_new, Feature = reorder(Feature, winner))
ggplot(NB_prob_new, aes(Feature, winner, fill = sentiment)) +
  geom_col(show.legend = TRUE) +
  theme_classic() +
  coord_flip() +
  theme(legend.position = "bottom") +
  ylab("Conditional probabilities") +
  scale_fill_manual(values = c("#000033", "#006699", "#99CCFF")) +
  labs(title = "Tweets about UK 2014 EP elections",
       subtitle = "positive versus neutral versus negative words - Naive Bayes model")
# let's reorder the features once more, this time within each category
# (positive, negative and neutral), and store the plot for later comparison
NB_prob_new$Feature <- with(NB_prob_new,
  factor(Feature, levels = Feature[order(ave(winner, sentiment, FUN = max), winner)]))
NB_graph <- ggplot(NB_prob_new, aes(Feature, winner, fill = sentiment)) +
  geom_col(show.legend = TRUE) +
  theme_classic() +
  coord_flip() +
  theme(legend.position = "bottom") +
  ylab("Conditional probabilities") +
  scale_fill_manual(values = c("#000033", "#006699", "#99CCFF")) +
  labs(title = "Tweets about UK 2014 EP elections",
       subtitle = "positive versus neutral versus negative words - Naive Bayes model")
NB_graph

# let's FINALLY predict the test set
predicted_nb <- predict(NB22, test)
table(predicted_nb)
prop.table(table(predicted_nb))

#####################################################
# let's run a Random Forest
#####################################################

set.seed(123)  # define a seed to be able to replicate the results!
system.time(RF <- randomForest(y = as.factor(Dfm_train@docvars$Sentiment), x = train,
                               importance = TRUE, do.trace = TRUE, ntree = 500))
# let's extract the importance matrix (Mean Decrease Accuracy and Mean Decrease Gini)
importance_RF <- as.data.frame(RF$importance[, 4:5])
str(importance_RF)
importance_RF$Feature <- row.names(importance_RF)
str(importance_RF)
# same words we would get with varImpPlot(RF)
print(head(importance_RF[order(importance_RF$MeanDecreaseGini, decreasing = TRUE), ]))
predicted_rf <- predict(RF, train, type = "class")
table(predicted_rf)
Dfm_train@docvars$predRF <- ifelse(predicted_rf == "negative", 0,
                            ifelse(predicted_rf == "neutral", 1, 2))
table(Dfm_train@docvars$predRF)
# adding a sign to each feature (0 = negative, 1 = neutral, 2 = positive),
# according to the predicted class in which the feature occurs most often
sums <- list()
for (v in 0:2) {
  sums[[v + 1]] <- colSums(train[Dfm_train@docvars[, "predRF"] == v, ], na.rm = TRUE)
}
sums <- do.call(cbind, sums)
sign <- apply(sums, 1, which.max)
# get the feature names
names <- dimnames(train)[[2]]
str(names)
df <- data.frame(Feature = names, sign = sign - 1, stringsAsFactors = FALSE)
str(df)
importance <- merge(importance_RF, df, by = "Feature")
str(importance)
table(importance$sign)
for (v in 0:2) {
  cat("\n\n")
  cat("value ==", v)
  importance <- importance[order(importance$MeanDecreaseGini, decreasing = TRUE), ]
  print(head(importance[importance$sign == v, ], n = 10))
  cat("\n")
  cat(paste(unique(head(importance$Feature[importance$sign == v], n = 10)), collapse = ", "))
}
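# OPTIONAL: Random Forest gives an out-of-bag (OOB) error estimate for free,
# which is a better guide to generalization than predicting the training set
# as done above. A minimal sketch using fields that a classification
# randomForest object always carries:
RF$confusion             # OOB confusion matrix (rows = true classes)
RF$err.rate[RF$ntree, ]  # OOB and per-class error rates after the final tree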
# let's draw a graph with our results!
str(importance)
negatives <- importance[which(importance$sign == 0), ]
neutrals <- importance[which(importance$sign == 1), ]
positives <- importance[which(importance$sign == 2), ]
df1 <- top_n(negatives, 10, MeanDecreaseGini)
df2 <- top_n(positives, 10, MeanDecreaseGini)
df3 <- top_n(neutrals, 10, MeanDecreaseGini)
RF_prob_new <- rbind(df1, df2, df3)
str(RF_prob_new)
RF_prob_new$sign2 <- factor(RF_prob_new$sign, labels = c("Negative", "Neutral", "Positive"))
str(RF_prob_new)
table(RF_prob_new$sign)
table(RF_prob_new$sign2)
# reorder the features
RF_prob_new <- mutate(RF_prob_new, Feature = reorder(Feature, MeanDecreaseGini))
ggplot(RF_prob_new, aes(Feature, MeanDecreaseGini, fill = sign2)) +
  geom_col(show.legend = TRUE) +
  theme_classic() +
  theme(legend.position = "bottom") +
  coord_flip() +
  ylab("Mean Decrease Gini") +
  labs(title = "Tweets about UK 2014 EP elections",
       subtitle = "negative versus neutral versus positive features - Random Forest model",
       fill = "sentiment") +
  scale_fill_manual(values = c("#003333", "#009999", "steelblue"))
# let's reorder the features once more, this time within each category
# (positive, negative and neutral), and store the plot for later comparison
RF_prob_new$Feature <- with(RF_prob_new,
  factor(Feature, levels = Feature[order(ave(MeanDecreaseGini, sign2, FUN = max), MeanDecreaseGini)]))
RF_graph <- ggplot(RF_prob_new, aes(Feature, MeanDecreaseGini, fill = sign2)) +
  geom_col(show.legend = TRUE) +
  theme_classic() +
  theme(legend.position = "bottom") +
  coord_flip() +
  ylab("Mean Decrease Gini") +
  labs(title = "Tweets about UK 2014 EP elections",
       subtitle = "negative versus neutral versus positive features - Random Forest model",
       fill = "sentiment") +
  scale_fill_manual(values = c("#003333", "#009999", "steelblue"))
RF_graph

# let's FINALLY predict the test set
system.time(predicted_rf <- predict(RF, test, type = "class"))
table(predicted_rf)
prop.table(table(predicted_rf))

######################################################
######################################################
# Let's compare the out-of-sample results we got via Naive Bayes and Random Forest
######################################################
######################################################

prop.table(table(predicted_nb))
prop.table(table(predicted_rf))
results <- as.data.frame(rbind(prop.table(table(predicted_nb)),
                               prop.table(table(predicted_rf))))
results$algorithm <- c("NB", "RF")
str(results)
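# OPTIONAL: comparing only the predicted class distributions tells us whether
# the two algorithms disagree, not which one is right. A minimal sketch,
# ASSUMING uk_test2.csv also carries a hand-coded "Sentiment" column (the
# guard below skips the block if it does not); truth is a new helper object.
if ("Sentiment" %in% names(docvars(test_dfm))) {
  truth <- as.factor(test_dfm@docvars$Sentiment)
  print(confusionMatrix(predicted_nb, truth))  # caret, loaded above
  print(confusionMatrix(predicted_rf, truth))
}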
# Let's plot the results! (reshape2 was already loaded above)
df.long <- melt(results, id.vars = c("algorithm"))
str(df.long)
ggplot(df.long, aes(algorithm, value, fill = variable)) +
  geom_bar(position = "dodge", stat = "identity") +
  theme(axis.text.x = element_text(color = "#993333", size = 10, angle = 90)) +
  coord_flip() +
  ylab(label = "UK tweets in the test set") +
  xlab("algorithm") +
  scale_fill_discrete(name = "Prediction", labels = c("Negative", "Neutral", "Positive"))

#################################################
# let's compare feature importance across models
#################################################

library(gridExtra)
# plot everything together
grid.arrange(NB_graph, RF_graph, nrow = 2)
# or one at a time
NB_graph
RF_graph

# let's check some correlations between the two models' feature scores
# let's merge the two data frames
words <- merge(NB_prob, importance, by = "Feature")
str(words)
library(PerformanceAnalytics)
chart.Correlation(words[c("winner", "MeanDecreaseAccuracy", "MeanDecreaseGini")])
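# OPTIONAL: chart.Correlation() reports Pearson correlations by default,
# which can be distorted by the heavy right skew typical of importance
# scores. A minimal rank-based check in base R (no extra packages):
cor(words$winner, words$MeanDecreaseGini, method = "spearman")
cor(words$winner, words$MeanDecreaseAccuracy, method = "spearman")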