rm(list=ls(all=TRUE))
getwd()
setwd("C:/Users/mw/Dropbox (VOICES)/TOPIC MODEL")
getwd()

library(quanteda)
library(readtext)
library(caTools)
library(e1071)
library(randomForest)
library(caret)
library(iSAX)
library(tm)

######################################################
######################################################
# Let's learn how to estimate a Naive Bayes model
######################################################
######################################################

#####################################################
# FIRST STEP: let's create the DfM for the training-set
#####################################################
# This is a sample of 500 tweets written in English about Trump and published between 1.17.2018 and 1.19.2018.
# Each tweet has been coded as expressing a positive/negative/neutral sentiment towards Donald Trump.
# We treat these tweets as our training-set.
x11 <- read.csv("trainTrump.csv", stringsAsFactors=FALSE)
str(x11)
table(x11$Sentiment)
prop.table(table(x11$Sentiment))

# Let's do some text preprocessing directly at this stage
library(stringr)
kwic(x11$text, "Trump's")
# several tweets include the word "Trump's", while we want to keep just "Trump"
x11$text <- str_replace_all(x11$text, "[^[:alnum:]]", " ")
kwic(x11$text, "Trump's")
# issue solved!

myCorpusTwitterTrain <- corpus(x11)
head(summary(myCorpusTwitterTrain))
Dfm_train <- dfm(myCorpusTwitterTrain, remove = c(stopwords("english")),
                 remove_punct = TRUE, remove_numbers = TRUE, tolower = TRUE,
                 remove_symbols = TRUE, remove_twitter = TRUE,
                 remove_separators = TRUE, remove_url = TRUE)
topfeatures(Dfm_train, 20)  # 20 top words
# some problems here. Let's clean the DfM
Dfm_train <- dfm(myCorpusTwitterTrain,
                 remove = c(stopwords("english"), "amp", "rt", "tco", "co", "u", "t",
                            "s", "ed", "https", "â", "com", "ly"),
                 remove_punct = TRUE, remove_numbers = TRUE, tolower = TRUE,
                 remove_symbols = TRUE, remove_twitter = TRUE,
                 remove_separators = TRUE, remove_url = TRUE)
topfeatures(Dfm_train, 20)  # 20 top words
# Let's trim the DfM in order to keep only tokens that appear in 2 or more tweets (tweets are very short texts...)
Dfm_train <- dfm_trim(Dfm_train, min_docfreq = 2, verbose = TRUE)

#####################################################
# SECOND STEP: let's create the DfM for the test-set
#####################################################
# This is a sample of 500 tweets written in English about Trump and published between 1.17.2018 and 1.19.2018
# that we treat as our test-set.
x10 <- read.csv("testTrump.csv", stringsAsFactors=FALSE)
str(x10)
# removing all punctuation also here
kwic(x10$text, "Trump's")
x10$text <- str_replace_all(x10$text, "[^[:alnum:]]", " ")
kwic(x10$text, "Trump's")

myCorpusTwitterTest <- corpus(x10)
head(summary(myCorpusTwitterTest))
Dfm_test <- dfm(myCorpusTwitterTest,
                remove = c(stopwords("english"), "amp", "rt", "tco", "co", "u", "t",
                           "s", "ed", "https", "â", "com", "ly"),
                remove_punct = TRUE, remove_numbers = TRUE, tolower = TRUE,
                remove_symbols = TRUE, remove_twitter = TRUE,
                remove_separators = TRUE, remove_url = TRUE)
topfeatures(Dfm_test, 20)  # 20 top words
Dfm_test <- dfm_trim(Dfm_test, min_docfreq = 2, verbose = TRUE)
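# Optional check (a minimal sketch, not in the original script): how many features do the
# training and test DfMs have in common before any matching? This motivates the
# dfm_match() step that follows.
length(intersect(featnames(Dfm_train), featnames(Dfm_test)))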
#####################################################
# THIRD STEP: Let's make the features identical between training and test-set by passing Dfm_train
# to dfm_match() as a pattern.
# This is needed because Naive Bayes in quanteda can only take into consideration features that occur
# both in the training-set and in the test-set
#####################################################
str(Dfm_train)
length(Dfm_train@Dimnames$features)  # 939 features
str(Dfm_test)
length(Dfm_test@Dimnames$features)   # 912 features
test_dfm <- dfm_match(Dfm_test, features = featnames(Dfm_train))
length(test_dfm@Dimnames$features)   # 939 features

#####################################################
# FOURTH STEP: Let's run a Naive Bayes model in quanteda [at the moment, arguably the best way
# to run a Naive Bayes model on texts]
#####################################################
# a) train the Naive Bayes classifier using textmodel_nb() and a multinomial distribution (the default)
head(docvars(Dfm_train))
system.time(nb <- textmodel_nb(Dfm_train, docvars(Dfm_train, "Sentiment"), distribution = c("multinomial")))  # very fast!
summary(nb)
# b) predict the test-set
predicted_nb <- predict(nb, test_dfm)
table(predicted_nb)
prop.table(table(predicted_nb))
# compare with the % in the training-set
prop.table(table(Dfm_train@docvars$Sentiment))

######################################################
######################################################
# Let's learn how to estimate a Random Forest model
######################################################
######################################################
# KEEP steps 1-3 from above (if you start from scratch, you should repeat them!)

#####################################################
# FOURTH STEP/B: transform both DfMs (training and test-set) into a data frame
#####################################################
train <- as.data.frame(as.matrix(Dfm_train))
test <- as.data.frame(as.matrix(test_dfm))
# this is important: randomForest cannot handle column names that begin with a space, comma, number
# or other special characters.
# the command below adds a letter in front of names that start with a number (if any numbers are left
# in the DfM). Highly recommended
colnames(train) <- make.names(colnames(train))
colnames(test) <- make.names(colnames(test))

#####################################################
# FIFTH STEP/B: let's run the Random Forest
#####################################################
# a) train the RF classifier (define a set.seed to be able to replicate the results!)
# The main default hyperparameters of a RF are the following:
# - ntree=500: number of trees to grow (here we set ntree=100 to save time)
# - mtry=sqrt(p): number of variables randomly sampled as candidates at each split, where p is the
#   number of variables in x; in our case p=length(train)=939, therefore sqrt(length(train))=30.64
# - nodesize=1: minimum size of terminal nodes (setting this number larger causes smaller trees to be
#   grown, and thus takes less time)
# Note that your output variable should be a factor or a numeric value, not a character! Since I want
# to run a classification model, I therefore transform the Sentiment variable into a factor via as.factor.
str(Dfm_train@docvars$Sentiment)  # that's a character variable! Not good!
set.seed(123)  # define a set.seed to be able to replicate the results!
system.time(RF <- randomForest(y = as.factor(Dfm_train@docvars$Sentiment), x = train,
                               importance = TRUE, ntree = 100, do.trace = TRUE))
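# Optional check (a minimal sketch, not in the original script): for a classification forest,
# print(RF) reports the out-of-bag (OOB) error estimate and the OOB confusion matrix
# computed on the training data.
print(RF)
RF$confusion  # OOB confusion matrix (rows = observed class, columns = predicted class)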
# The graph below shows what happens to the predictive power of your model if you drop some variables.
# The variables with the highest importance scores are the ones that give the best prediction and contribute most to the model.
# On the left panel: the mean decrease in accuracy if you drop (permute) that variable
# On the right panel: a related measure, the mean decrease in node impurity (Gini) from the splits on that variable
varImpPlot(RF)
# b) predict the test-set
system.time(predicted_rf <- predict(RF, test, type = "class"))
table(predicted_rf)
prop.table(table(predicted_rf))
# compare with the % in the training-set
prop.table(table(Dfm_train@docvars$Sentiment))

######################################################
######################################################
# Let's learn how to estimate a SVM model
######################################################
######################################################
# KEEP steps 1-3 from above (if you start from scratch, you should repeat them!)
# KEEP also the FOURTH STEP/B from above (if you start from scratch, you should repeat it!)

#####################################################
# FIFTH STEP/C: let's run a SVM model
#####################################################
# a) train the SVM classifier
# note that here I select a linear kernel and, as hyperparameter, a specific value for the cost C (=1)
# other main hyperparameters for a linear kernel are: epsilon (the epsilon in the insensitive-loss function; default: 0.1)
# for all other kernels (radial, polynomial) you also have gamma (default: 1/(data dimension))
# for the polynomial kernel you also have degree (default: 3) and coef0 (default: 0)
set.seed(123)  # define a set.seed to be able to replicate the results!
system.time(SV <- svm(y = as.factor(Dfm_train@docvars$Sentiment), x = train, kernel = 'linear', cost = 1))
# how many support vectors?
length(SV$index)  # 373 texts out of 500 documents
nrow(train)       # 500 texts in the train data frame
# these are the indices of the support vectors
SV$index
# these are the first 6 support vectors with the corresponding values for each feature included in them
head(SV$SV)
# why do you get such strange values? Because by default svm() rescales all values to zero mean and unit variance.
# The center and scale values are returned and used for later predictions.
# and indeed, if you add scale=FALSE to the estimation you get the original feature values back
# (but do not do that in your own analysis!)
SV2 <- svm(y = as.factor(Dfm_train@docvars$Sentiment), x = train, kernel = 'linear', cost = 1, scale = FALSE)
head(SV2$SV)
# let's read those observations (i.e., documents) that are more "important", i.e. that best "separate" the data
str(x11)
vectors <- x11[SV$index, ]
head(x11$text)
head(vectors$text)
# you can see that, for example, documents [3] to [6] in the training-set are not support vectors
# b) predict the test-set
system.time(predicted_svm <- predict(SV, test))
table(predicted_svm)
prop.table(table(predicted_svm))
# compare with the % in the training-set
prop.table(table(Dfm_train@docvars$Sentiment))

#####################################################
# Let's compare the three results we got with NB, RF and SVM:
#####################################################
prop.table(table(predicted_nb))
prop.table(table(predicted_rf))
prop.table(table(predicted_svm))
# there is some difference! Therefore, which one to "trust" more?
# the ANSWER: do a Cross-Validation!!!!
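# A minimal cross-validation sketch (not in the original script; the number of folds, the method
# and the object names are illustrative). It uses caret (loaded above) to estimate held-out
# accuracy on the training-set; here it is shown for a linear SVM via method = "svmLinear"
# (which assumes the kernlab package is installed), but the same trainControl can be reused
# with other methods to compare the classifiers.
ctrl <- trainControl(method = "cv", number = 5)  # 5-fold cross-validation
set.seed(123)
cv_svm <- caret::train(x = train, y = as.factor(Dfm_train@docvars$Sentiment),
                       method = "svmLinear", trControl = ctrl)
cv_svm$results  # average held-out accuracy and Kappa across the 5 folds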
######################################################
######################################################
# Let's learn how to estimate a proportional model via iSAX
######################################################
######################################################

### STEP 1/D: TRAINING-SET
# Let's start with our usual training-set about Trump
x <- read.csv("trainTrump.csv", stringsAsFactors=FALSE)
x$text <- str_replace_all(x$text, "[^[:alnum:]]", " ")
x$Sentiment <- as.factor(x$Sentiment)  # let's transform the variable "Sentiment" into a factor variable
prop.table(table(x$Sentiment))

### STEP 2/D: TEST-SET
### That's our usual test-set
x10 <- read.csv("testTrump.csv", stringsAsFactors=FALSE)
x10$text <- str_replace_all(x10$text, "[^[:alnum:]]", " ")

### STEP 3/D: Let's create a unique dataset including both TEST and TRAINING SET
x10$Sentiment <- NA  # to do that, let's add a column called "Sentiment" to the test-set, given that such a column is present in the training-set
documents <- rbind(x10, x)  # let's combine the test-set and the training-set
str(documents)  # we have 1000 texts (500 as test-set and 500 as training-set)
prop.table(table(documents$Sentiment))

### STEP 4/D: iSA needs the tm package (not quanteda!) to build the DfM
corpus <- VCorpus(VectorSource(documents$text))  # let's build the corpus
str(corpus[[1]])
ocome <- prep.data(corpus, verbose=TRUE, th=0.995)  # let's prepare the data for the iSA algorithm.
# This is a pre-processing step which performs stemming and other cleaning steps, as well as producing the DfM.
# th=0.995 is a sparsity threshold: we drop those features that appear in less than 0.5% of the texts
# (here, fewer than 5 tweets out of 1000)

### STEP 5/D: let's separate the resulting object "ocome" according to the presence or absence of info
### about the Sentiment (i.e., training vs. test-set)
train <- !is.na(documents$Sentiment)  # I create an index that is TRUE for the training-set documents
                                      # (i.e., those texts whose Sentiment is not NA)
train
summary(train)
D <- documents$Sentiment[train]  # I recover the vector of Sentiment values for the training-set
str(D)
# Same results indeed!
prop.table(table(D))
prop.table(table(x$Sentiment))
Strain <- ocome$S[which(train)]   # I select out of "ocome" the vector of stems belonging to the training-set
Stest <- ocome$S[-which(train)]   # I select out of "ocome" the vector of stems belonging to the test-set
length(Strain)  # 500!
length(Stest)   # 500!

### STEP 6/D: let's run the proportional algorithm
set.seed(123)
system.time(outSent <- iSA(Strain, Stest, D))  # D is the vector of codings belonging to the training-set

### STEP 7/D: let's classify the test-set
round(outSent$btab, 5)  # we also get bootstrapped standard errors!

#####################################################
# Let's compare the results we got with NB, RF, SVM and iSA:
#####################################################
prop.table(table(predicted_nb))
prop.table(table(predicted_rf))
prop.table(table(predicted_svm))
outSent$btab[1:3]
# there is some difference! Therefore, which one to trust more?
# once again the ANSWER: do a Cross-Validation!!!!
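# A minimal comparison sketch (not in the original script): put the estimated category proportions
# from the three individual classifiers and the iSA point estimates side by side. It assumes the
# predicted_* objects and outSent are still in memory, and that the categories come out in the same
# order for all four methods - check this before trusting the table!
comparison <- rbind(
  NB  = prop.table(table(predicted_nb)),
  RF  = prop.table(table(predicted_rf)),
  SVM = prop.table(table(predicted_svm)),
  iSA = outSent$btab[1:3]  # the iSA point estimates extracted above
)
round(comparison, 3)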