####################################################
# classifiers with a general example
####################################################

rm(list=ls(all=TRUE))
getwd()
setwd("YOUR WORKING DIRECTORY")
getwd()

library(e1071)
library(caTools)
library(randomForest)

data(iris)
str(iris)

iris$spl <- sample.split(iris, SplitRatio=0.5)
train <- subset(iris, iris$spl==TRUE)
test <- subset(iris, iris$spl==FALSE)

NB <- naiveBayes(Species ~ ., data=train)
predictNB <- predict(NB, newdata=test, type="class")
table(predictNB)
table(predictNB, test$Species)
# Accuracy (proportion of correctly classified documents): 0.96
(30+28+28) / nrow(test)

set.seed(123)
RF <- randomForest(Species ~ ., data=train, type="classification")
predictRF <- predict(RF, newdata=test)
table(predictRF)
table(test$Species, predictRF)
# Accuracy (proportion of correctly classified documents): 0.91
(30+28+24) / nrow(test)

set.seed(123)
SVM <- svm(Species ~ ., data=train, method="C-classification")
predictSVM <- predict(SVM, newdata=test)
table(predictSVM)
table(test$Species, predictSVM)
# Accuracy (proportion of correctly classified documents): 0.94
(30+29+26) / nrow(test)
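# The accuracies above were computed by hand from the confusion matrices.
# As a convenience (not part of the original script; the function name and
# argument names are illustrative), the small helper below computes accuracy
# and per-class precision/recall directly from the true and predicted labels,
# assuming both are factors with the same levels.
classification_metrics <- function(true_labels, predicted_labels) {
  cm <- table(true_labels, predicted_labels)   # rows = true classes, columns = predictions
  accuracy  <- sum(diag(cm)) / sum(cm)         # correctly classified / all documents
  precision <- diag(cm) / colSums(cm)          # correct in k / all classified as k
  recall    <- diag(cm) / rowSums(cm)          # correct in k / all truly in k
  list(confusion=cm, accuracy=accuracy, precision=precision, recall=recall)
}
# e.g. classification_metrics(test$Species, predictNB)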
####################################################
# classifiers with texts
####################################################

rm(list=ls(all=TRUE))
getwd()
setwd("YOUR WORKING DIRECTORY")
getwd()

library(quanteda)
library(readtext)
library(caTools)
library(randomForest)
library(e1071)

x10 <- read.csv("Trump_tweets1.csv", stringsAsFactors=FALSE)
str(x10)
myCorpusTwitter <- corpus(x10)

# let's suppose that the sentiment classification via lexdict represents the "true" sentiment
# that a set of human coders would have found
lexdict <- dictionary(file = "C:/Users/mw/Dropbox (VOICES)/TOPIC MODEL/LSDaug2015/LSD2015.lc3",
                      format = "lexicoder")
str(lexdict)

trump3 <- dfm(myCorpusTwitter,
              remove = c(stopwords("english"), "rt", "t.co"),
              remove_punct = TRUE, remove_numbers = TRUE, tolower = TRUE,
              remove_twitter = TRUE, remove_url = TRUE,
              dictionary = lexdict)
trump3
str(trump3)

df <- data.frame(trump3)
str(df)
head(df)
df$sentiment_diff <- df$X.positive.008800 - df$X.negative.AA0000
str(df)

# we assume 0=neutral; alternatively, we could have forced everything between -1 and +1 to be neutral
df$sentiment2[df$sentiment_diff==0] <- "Neutral"
df$sentiment2[df$sentiment_diff<0] <- "Negative"
df$sentiment2[df$sentiment_diff>0] <- "Positive"
str(df)
table(df$sentiment2)
df$sentiment2 <- as.factor(df$sentiment2)
str(df)

# add the sentiment to the original set of tweets
x10$sentiment <- df$sentiment2
str(x10)
table(x10$sentiment)

myCorpusTwitter <- corpus(x10)
myDfm3 <- dfm(myCorpusTwitter,
              remove = c(stopwords("english"), "rt", "t.co"),
              remove_punct = TRUE, remove_numbers = TRUE, tolower = TRUE, stem = TRUE,
              remove_twitter = TRUE, remove_url = TRUE)
topfeatures(myDfm3, 20)  # 20 top words

# Keep terms that appear in at least 1% of the tweets
trim <- dfm_trim(myDfm3, min_docfreq = 0.01)
data <- as.data.frame(as.matrix(trim))
str(data)
colnames(data)

# this is important: randomForest cannot handle column names that begin with a space, comma,
# number or certain other punctuation. The command below makes the names syntactically valid
# (e.g. it adds a letter in front of a name starting with a number, if any numbers are left
# in the dfm). Highly suggested
colnames(data) <- make.names(colnames(data))
colnames(data)

summary(x10$sentiment)
data$sentiment <- x10$sentiment
str(data$sentiment)
colnames(data)
summary(data$sentiment)

# Build a training and a testing set.
# Given that we are assuming that the sentiment classification via lexdict represents a kind of
# "human coding", we are in effect running a kind of cross-classification procedure
set.seed(123)
split <- sample.split(data$sentiment, SplitRatio=0.6)
table(split)
trainSparse <- subset(data, split==TRUE)
testSparse <- subset(data, split==FALSE)
nrow(trainSparse)
nrow(testSparse)
str(trainSparse)
str(trainSparse$sentiment)
colnames(trainSparse)

# Try a random forest model.
set.seed(123)
tweetRF <- randomForest(sentiment ~ ., data=trainSparse, type="classification")
str(tweetRF)
predictRF <- predict(tweetRF, newdata=testSparse)
table(predictRF)
table(testSparse$sentiment)
table(testSparse$sentiment, predictRF)
# Accuracy (proportion of correctly classified documents): 0.68
(162 + 58 + 53) / nrow(testSparse)
# Precision for "Negative": number of documents correctly classified into category k,
# divided by the total number of documents that the model classifies as category k: 0.75
(162/(162+38+15))
# Recall for "Negative": number of correctly classified category k documents divided by
# the number of human-coded documents in category k: 0.83
(162/(196))

# Try a SVM model
set.seed(123)
# If you ever get this warning message:
#   In svm.default(x, y, scale = scale, ..., na.action = na.action) :
#   Variable(s) 'graham' and 'lindsey' and 'confirm' constant. Cannot scale data.
# it means that in the dfm of the training set some words are always 0 across all texts.
# Either drop such words and re-run the analysis, or add "scale=FALSE" to the svm command
SV <- svm(sentiment ~ ., data=trainSparse, method = "C-classification")
predictSV <- predict(SV, newdata=testSparse)
table(predictSV)
table(testSparse$sentiment)
table(testSparse$sentiment, predictSV)
# Accuracy: 0.69
(181 + 49 + 44) / nrow(testSparse)
# Precision for "Negative": 0.68
(181/(181+55+31))
# Recall for "Negative": 0.92
(181/(196))

## Try a Naive Bayes model: problem with sparsity!
NB <- naiveBayes(sentiment ~ ., data=trainSparse)
predictNB <- predict(NB, newdata=testSparse, type="class")
table(predictNB)
table(testSparse$sentiment, predictNB)
# Accuracy: 0.25
(3 + 5 + 91) / nrow(testSparse)

#### replicate the entire analysis but now keep terms
#### in the dfm that appear in at least 5% of the tweets.
#### Which are the main changes in terms of accuracy?

#### replicate the entire analysis but now keep terms
#### in the dfm that appear in at least 1% of the tweets and
#### reduce the training split (SplitRatio) to 0.3.
#### Which are the main changes in terms of accuracy?
#### (a sketch of the lines that change in both exercises is given below)
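# A minimal sketch of the lines that change for the two exercises above
# (not a full solution: it assumes the objects created earlier -- myDfm3, x10,
# data -- are still in the workspace, and the object names trim5, data5,
# split3, trainSparse3, testSparse3 are illustrative).

# Exercise 1: keep terms that appear in at least 5% of the tweets
trim5 <- dfm_trim(myDfm3, min_docfreq = 0.05)
data5 <- as.data.frame(as.matrix(trim5))
colnames(data5) <- make.names(colnames(data5))
data5$sentiment <- x10$sentiment
# ...then rebuild trainSparse/testSparse from data5 and re-fit the three models as above

# Exercise 2: keep terms in at least 1% of the tweets, but train on only 30% of the data
set.seed(123)
split3 <- sample.split(data$sentiment, SplitRatio=0.3)
trainSparse3 <- subset(data, split3==TRUE)
testSparse3 <- subset(data, split3==FALSE)
# ...then re-fit the three models on trainSparse3 and compare their accuracies on testSparse3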