rm(list=ls(all=TRUE))
setwd("C:/Users/luigi/Dropbox/TOPIC MODEL")
getwd()

library(quanteda)
library(readtext)
library(caTools)
library(randomForest)
library(caret)
library(naivebayes)
library(car)
library(ggplot2)
library(dplyr)
library(reshape2)

#####################################################
# FIRST STEP: let's create the DfM for the training set
#####################################################

# let's focus on UK tweets
x <- read.csv("uk_train.csv", stringsAsFactors = FALSE)
myCorpusTwitterTrain <- corpus(x)
tok2 <- tokens(myCorpusTwitterTrain, remove_punct = TRUE, remove_numbers = TRUE,
               remove_symbols = TRUE, split_hyphens = TRUE, remove_separators = TRUE)
tok2 <- tokens_remove(tok2, stopwords("en"))
tok2 <- tokens_remove(tok2, c("0*"))  # drop tokens starting with "0" (glob pattern)
tok2 <- tokens_wordstem(tok2)
Dfm_train <- dfm(tok2)
Dfm_train <- dfm_trim(Dfm_train, min_docfreq = 2)

#####################################################
# SECOND STEP: let's create the DfM for the test set
#####################################################

x10 <- read.csv("uk_test2.csv", stringsAsFactors = FALSE)
myCorpusTwitterTest <- corpus(x10)
tok <- tokens(myCorpusTwitterTest, remove_punct = TRUE, remove_numbers = TRUE,
              remove_symbols = TRUE, split_hyphens = TRUE, remove_separators = TRUE)
tok <- tokens_remove(tok, stopwords("en"))
tok <- tokens_remove(tok, c("0*"))
tok <- tokens_wordstem(tok)
Dfm_test <- dfm(tok)
Dfm_test <- dfm_trim(Dfm_test, min_docfreq = 2)

#####################################################
# THIRD STEP: let's make the features identical between the train and test set
#####################################################

test_dfm <- dfm_match(Dfm_test, features = featnames(Dfm_train))
setequal(featnames(Dfm_train), featnames(test_dfm))  # should return TRUE

#####################################################
# FOURTH STEP: let's convert the DfMs into matrices for the ML algorithms to work
#####################################################

train <- as.matrix(Dfm_train)
test <- as.matrix(test_dfm)

#####################################################
# FIFTH STEP: let's estimate a ML model
#####################################################

#####################################################
# Let's start with a Naive Bayes model
#####################################################

system.time(NB22 <- multinomial_naive_bayes(x = train,
                                            y = as.factor(Dfm_train@docvars$Sentiment),
                                            laplace = 1))
NB_prob <- as.data.frame(NB22$params)
NB_prob$Feature <- row.names(NB_prob)
# let's identify, for each feature, the largest of its three conditional
# probabilities (negative, neutral, positive) and label the feature accordingly
NB_prob$winner <- apply(NB_prob[c(1:3)], 1, FUN = max)
NB_prob$sentiment <- ifelse(NB_prob$winner == NB_prob$negative, "negative",
                     ifelse(NB_prob$winner == NB_prob$positive, "positive", "neutral"))
str(NB_prob)
negatives <- NB_prob[which(NB_prob$sentiment == "negative"), ]
positives <- NB_prob[which(NB_prob$sentiment == "positive"), ]
neutrals <- NB_prob[which(NB_prob$sentiment == "neutral"), ]
# the features with the largest conditional probability within each class
print(head(negatives[order(negatives$winner, decreasing = TRUE), ], 15))  # negative words
print(head(positives[order(positives$winner, decreasing = TRUE), ], 15))  # positive words
print(head(neutrals[order(neutrals$winner, decreasing = TRUE), ], 15))    # neutral words
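# OPTIONAL: a quick in-sample sanity check of the Naive Bayes model.
# This is a minimal sketch, not part of the original pipeline: it predicts
# the training set itself, so the accuracy it reports is optimistic and only
# tells you whether the model learned anything at all. confusionMatrix()
# comes from caret, loaded above; predicted_nb_train is a new helper object.
predicted_nb_train <- predict(NB22, train)
confusionMatrix(predicted_nb_train, as.factor(Dfm_train@docvars$Sentiment))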
# let's extract the 15 most negative, 15 most positive and 15 most neutral
# features in terms of their conditional probabilities
df1 <- top_n(negatives, 15, winner)
df2 <- top_n(positives, 15, winner)
df3 <- top_n(neutrals, 15, winner)
NB_prob_new <- rbind(df1, df2, df3)
# reorder the features
NB_prob_new <- mutate(NB_prob_new, Feature = reorder(Feature, winner))
ggplot(NB_prob_new, aes(Feature, winner, fill = sentiment)) +
  geom_col(show.legend = TRUE) +
  theme_classic() +
  coord_flip() +
  theme(legend.position = "bottom") +
  ylab("Conditional probabilities") +
  scale_fill_manual(values = c("#000033", "#006699", "#99CCFF")) +
  labs(title = "Tweets about UK 2014 EP elections",
       subtitle = "positive versus neutral versus negative words - Naive Bayes model")
# let's reorder the features once more, this time within each category
# (positive, negative and neutral), and store the plot for later comparison
NB_prob_new$Feature <- with(NB_prob_new,
  factor(Feature, levels = Feature[order(ave(winner, sentiment, FUN = max), winner)]))
NB_graph <- ggplot(NB_prob_new, aes(Feature, winner, fill = sentiment)) +
  geom_col(show.legend = TRUE) +
  theme_classic() +
  coord_flip() +
  theme(legend.position = "bottom") +
  ylab("Conditional probabilities") +
  scale_fill_manual(values = c("#000033", "#006699", "#99CCFF")) +
  labs(title = "Tweets about UK 2014 EP elections",
       subtitle = "positive versus neutral versus negative words - Naive Bayes model")
NB_graph

# let's FINALLY predict the test set
predicted_nb <- predict(NB22, test)
table(predicted_nb)
prop.table(table(predicted_nb))

#####################################################
# let's run a Random Forest
#####################################################

set.seed(123)  # define a seed to be able to replicate the results!
system.time(RF <- randomForest(y = as.factor(Dfm_train@docvars$Sentiment), x = train,
                               importance = TRUE, do.trace = TRUE, ntree = 500))
# let's extract the importance matrix (Mean Decrease Accuracy and Mean Decrease Gini)
importance_RF <- as.data.frame(RF$importance[, 4:5])
str(importance_RF)
importance_RF$Feature <- row.names(importance_RF)
str(importance_RF)
# same words we would get with varImpPlot(RF)
print(head(importance_RF[order(importance_RF$MeanDecreaseGini, decreasing = TRUE), ]))
predicted_rf <- predict(RF, train, type = "class")
table(predicted_rf)
Dfm_train@docvars$predRF <- ifelse(predicted_rf == "negative", 0,
                            ifelse(predicted_rf == "neutral", 1, 2))
table(Dfm_train@docvars$predRF)
# adding a sign to each feature (0 = negative, 1 = neutral, 2 = positive),
# according to the predicted class in which the feature occurs most often
sums <- list()
for (v in 0:2) {
  sums[[v + 1]] <- colSums(train[Dfm_train@docvars[, "predRF"] == v, ], na.rm = TRUE)
}
sums <- do.call(cbind, sums)
sign <- apply(sums, 1, which.max)
# get the feature names
names <- dimnames(train)[[2]]
str(names)
df <- data.frame(Feature = names, sign = sign - 1, stringsAsFactors = FALSE)
str(df)
importance <- merge(importance_RF, df, by = "Feature")
str(importance)
table(importance$sign)
for (v in 0:2) {
  cat("\n\n")
  cat("value ==", v)
  importance <- importance[order(importance$MeanDecreaseGini, decreasing = TRUE), ]
  print(head(importance[importance$sign == v, ], n = 10))
  cat("\n")
  cat(paste(unique(head(importance$Feature[importance$sign == v], n = 10)), collapse = ", "))
}
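# OPTIONAL: Random Forest gives an out-of-bag (OOB) error estimate for free,
# which is a better guide to generalization than predicting the training set
# as done above. A minimal sketch using fields that a classification
# randomForest object always carries:
RF$confusion             # OOB confusion matrix (rows = true classes)
RF$err.rate[RF$ntree, ]  # OOB and per-class error rates after the final tree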
# let's draw a graph with our results!
str(importance)
negatives <- importance[which(importance$sign == 0), ]
neutrals <- importance[which(importance$sign == 1), ]
positives <- importance[which(importance$sign == 2), ]
df1 <- top_n(negatives, 10, MeanDecreaseGini)
df2 <- top_n(positives, 10, MeanDecreaseGini)
df3 <- top_n(neutrals, 10, MeanDecreaseGini)
RF_prob_new <- rbind(df1, df2, df3)
str(RF_prob_new)
RF_prob_new$sign2 <- factor(RF_prob_new$sign, labels = c("Negative", "Neutral", "Positive"))
str(RF_prob_new)
table(RF_prob_new$sign)
table(RF_prob_new$sign2)
# reorder the features
RF_prob_new <- mutate(RF_prob_new, Feature = reorder(Feature, MeanDecreaseGini))
ggplot(RF_prob_new, aes(Feature, MeanDecreaseGini, fill = sign2)) +
  geom_col(show.legend = TRUE) +
  theme_classic() +
  theme(legend.position = "bottom") +
  coord_flip() +
  ylab("Mean Decrease Gini") +
  labs(title = "Tweets about UK 2014 EP elections",
       subtitle = "negative versus neutral versus positive features - Random Forest model",
       fill = "sentiment") +
  scale_fill_manual(values = c("#003333", "#009999", "steelblue"))
# let's reorder the features once more, this time within each category
# (positive, negative and neutral), and store the plot for later comparison
RF_prob_new$Feature <- with(RF_prob_new,
  factor(Feature, levels = Feature[order(ave(MeanDecreaseGini, sign2, FUN = max), MeanDecreaseGini)]))
RF_graph <- ggplot(RF_prob_new, aes(Feature, MeanDecreaseGini, fill = sign2)) +
  geom_col(show.legend = TRUE) +
  theme_classic() +
  theme(legend.position = "bottom") +
  coord_flip() +
  ylab("Mean Decrease Gini") +
  labs(title = "Tweets about UK 2014 EP elections",
       subtitle = "negative versus neutral versus positive features - Random Forest model",
       fill = "sentiment") +
  scale_fill_manual(values = c("#003333", "#009999", "steelblue"))
RF_graph

# let's FINALLY predict the test set
system.time(predicted_rf <- predict(RF, test, type = "class"))
table(predicted_rf)
prop.table(table(predicted_rf))

######################################################
######################################################
# Let's compare the out-of-sample results we got via Naive Bayes and Random Forest
######################################################
######################################################

prop.table(table(predicted_nb))
prop.table(table(predicted_rf))
results <- as.data.frame(rbind(prop.table(table(predicted_nb)),
                               prop.table(table(predicted_rf))))
results$algorithm <- c("NB", "RF")
str(results)
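# OPTIONAL: comparing only the predicted class distributions tells us whether
# the two algorithms disagree, not which one is right. A minimal sketch,
# ASSUMING uk_test2.csv also carries a hand-coded "Sentiment" column (the
# guard below skips the block if it does not); truth is a new helper object.
if ("Sentiment" %in% names(docvars(test_dfm))) {
  truth <- as.factor(test_dfm@docvars$Sentiment)
  print(confusionMatrix(predicted_nb, truth))  # caret, loaded above
  print(confusionMatrix(predicted_rf, truth))
}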
# Let's plot the results! (reshape2 was already loaded above)
df.long <- melt(results, id.vars = c("algorithm"))
str(df.long)
ggplot(df.long, aes(algorithm, value, fill = variable)) +
  geom_bar(position = "dodge", stat = "identity") +
  theme(axis.text.x = element_text(color = "#993333", size = 10, angle = 90)) +
  coord_flip() +
  ylab(label = "UK tweets in the test set") +
  xlab("algorithm") +
  scale_fill_discrete(name = "Prediction", labels = c("Negative", "Neutral", "Positive"))

#################################################
# let's compare feature importance across models
#################################################

library(gridExtra)
# plot everything together
grid.arrange(NB_graph, RF_graph, nrow = 2)
# or one at a time
NB_graph
RF_graph

# let's check some correlations between the two models' feature scores
# let's merge the two data frames
words <- merge(NB_prob, importance, by = "Feature")
str(words)
library(PerformanceAnalytics)
chart.Correlation(words[c("winner", "MeanDecreaseAccuracy", "MeanDecreaseGini")])
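# OPTIONAL: chart.Correlation() reports Pearson correlations by default,
# which can be distorted by the heavy right skew typical of importance
# scores. A minimal rank-based check in base R (no extra packages):
cor(words$winner, words$MeanDecreaseGini, method = "spearman")
cor(words$winner, words$MeanDecreaseAccuracy, method = "spearman")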