rm(list=ls(all=TRUE))
setwd("C:/Users/luigi/Dropbox/TOPIC MODEL")
getwd()

library(quanteda)
library(readtext)
library(text2vec)
library(lsa)
library(dplyr)
library(e1071)
library(caret)
library(stringr)
library(cvTools)

# Let's focus on MOVIE reviews and append our usual training-set to the test-set,
# to have a larger corpus on which to run the WE
x <- read.csv("train_review2.csv", stringsAsFactors=FALSE)
str(x)
x10 <- read.csv("test_review2.csv", stringsAsFactors=FALSE)
str(x10)
tot <- rbind(x, x10)
str(tot)
nrow(tot)

# let's replace the apostrophe with an empty space
# (sometimes Quanteda has problems removing it)
tot$text <- gsub("'", " ", tot$text)

myCorpusTwitter <- corpus(tot)

# We do minimal preprocessing. We also follow the standard practice of keeping all words
# with a minimum count above a given threshold, usually between 5 and 10 (we choose 5).
Dfm <- dfm(myCorpusTwitter, remove = c(stopwords("english")),
           remove_punct = TRUE, remove_numbers=TRUE, tolower = TRUE,
           remove_symbols=TRUE, remove_separators=TRUE, remove_url = TRUE,
           split_hyphens = TRUE)
topfeatures(Dfm)
Dfm <- dfm(myCorpusTwitter, remove = c(stopwords("english"), "s", "t", "p"),
           remove_punct = TRUE, remove_numbers=TRUE, tolower = TRUE,
           remove_symbols=TRUE, remove_separators=TRUE, remove_url = TRUE,
           split_hyphens = TRUE)
topfeatures(Dfm)
Dfm <- dfm_trim(Dfm, min_termfreq = 5, verbose=TRUE)

#############################################
#############################################
# Applying the GloVe algorithm via Quanteda
#############################################
#############################################

# Let's first extract the vocabulary from our Dfm
Dfm_vocab <- featnames(Dfm)
str(Dfm_vocab)

# Then let's select the tokens that are in the corpus
mov_tokens <- tokens(myCorpusTwitter)

# Then let's match the two vocabularies (i.e., let's keep only those features that are present
# both in the corpus and in the Dfm, after we pre-processed the texts!).
# Note the following: "padding=TRUE" leaves an empty string where the removed tokens previously existed.
# This is useful if a positional match is needed between the pre- and post-selected tokens,
# for instance if a window of adjacency needs to be computed. It prevents words that were
# NOT adjacent in the original text from becoming adjacent
mov_tokens2 <- tokens_select(mov_tokens, Dfm_vocab, padding = TRUE)
str(mov_tokens2)

# Create a term co-occurrence matrix (the default window is 5; you can change it via the "window" argument).
# Why weights? Tokens closer to the target word receive more weight than tokens far away
fcmat_news <- fcm(mov_tokens2, context = "window", count = "weighted", weights = 1/(1:5))
fcmat_news
str(fcmat_news)
dim(fcmat_news)

# alternatively, without distance weights
fcmat_news_noweights <- fcm(mov_tokens2, context = "window")
fcmat_news_noweights
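# Just to see what the distance weighting does, here is a tiny toy example (a sketch,
# not part of the analysis; the tokens "a", "b", "c" are made up): "a" and "b" are
# adjacent (weight 1), while "a" and "c" are two positions apart (weight 1/2)
toy <- tokens(c(d1 = "a b c"))
fcm(toy, context = "window", count = "weighted", weights = 1/(1:5))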
#############################################
#############################################
# Let's estimate word embeddings via GloVe
#############################################
#############################################

# Which are the main parameters to look at?
# rank = number of dimensions (100 dimensions is the default)
# x_max = maximum number of co-occurrences to use in the weighting function.
# See the GloVe paper for details: http://nlp.stanford.edu/pubs/glove.pdf. Here we select 10.
# If you want to exactly reproduce your results, you have to specify n_threads = 1 and then set the seed.
# You will need more computational time, but at least replication is assured.
# Note that the differences, if we replicate the analysis with a larger number of threads, will be minimal
set.seed(123)
system.time(glove <- GlobalVectors$new(rank=100, x_max=10))
# you can increase n_iter or decrease convergence_tol (but you would need more time...)
# These two parameters refer to the ML algorithm GloVe employs
system.time(glove_main <- glove$fit_transform(fcmat_news, n_iter = 20,
                                              convergence_tol = 0.01, n_threads = 1))
str(glove_main)

#############################################
#############################################
# Plotting words in the dimensional space
#############################################
#############################################

glove_dataframe <- as.data.frame(glove_main)
nrow(glove_dataframe)
colnames(glove_dataframe)
# let's add to glove_dataframe a specific column called "word" with the list of features
glove_dataframe$word <- row.names(glove_dataframe)
colnames(glove_dataframe)

plot_words <- function(words, glove_dataframe){
  # empty plot
  plot(0, 0, xlim=c(-0.5, 0.5), ylim=c(-0.5, 0.5), type="n",
       xlab="Second dimension", ylab="Third dimension")
  for (word in words){
    # extract second and third dimensions
    vector <- as.numeric(glove_dataframe[glove_dataframe$word==word, 2:3])
    # add to plot
    text(vector[1], vector[2], labels=word)
  }
}

plot_words(c("teen", "alien", "monster", "war", "bad", "murder", "good", "fear", "summer"),
           glove_dataframe)

# Now we can start to play. The measure of interest when comparing two vectors is cosine similarity,
# which you can think of as analogous to a standard correlation. Let's see what is similar to "girl"
girl <- glove_main["girl", , drop = FALSE]
cos_sim_girl <- sim2(x = glove_main, y = girl, method = "cosine", norm = "l2")
head(sort(cos_sim_girl[,1], decreasing = TRUE), 10)

# Once we have the vectors for each word, we can also compute the similarity between a pair of words:
similarity <- function(word1, word2){
  lsa::cosine(
    x=as.numeric(glove_dataframe[glove_dataframe$word==word1, 1:100]),
    y=as.numeric(glove_dataframe[glove_dataframe$word==word2, 1:100]))
}

similarity("summer", "monster")
similarity("tarantino", "detective")
similarity("tarantino", "jackie") # "Jackie Brown" is a movie directed by Tarantino

# Let's see an analogy.
# Which is the closest word vector to fear - male + female = ?
ex <- glove_main["fear", , drop = FALSE] - glove_main["male", , drop = FALSE] +
  glove_main["female", , drop = FALSE]
cos_sim <- textstat_simil(x = as.dfm(glove_main), y = as.dfm(ex), method = "cosine")
head(sort(cos_sim[, 1], decreasing = TRUE), 5) # some stereotypes going on here?

# alternatively
cos_sim_test <- sim2(x = glove_main, y = ex, method = "cosine", norm = "l2")
head(sort(cos_sim_test[,1], decreasing = TRUE), 5)
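# A small convenience wrapper (just a sketch built on sim2(), which we already used above)
# to avoid repeating the nearest-neighbour boilerplate; "nearest" is our own name,
# not a text2vec function
nearest <- function(vec, n = 5){
  sims <- sim2(x = glove_main, y = vec, method = "cosine", norm = "l2")
  head(sort(sims[, 1], decreasing = TRUE), n)
}
nearest(ex) # same analogy as above: fear - male + female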
#############################################
#############################################
# An important note
#############################################
#############################################

# Note that the GloVe model learns two sets of word vectors: main (the one we employed above!) and context.
# You can think of them as "target" vectors and "context" vectors. Each word has its own vector (target),
# but it also serves as context for other words. Since all computations are symmetrical, in the end
# the two sets of vectors should be identical. But since the model relies on approximate computations,
# the two sets might be slightly different.
# While either word-vector matrix can be used as the result, it may be a good idea to average
# or sum the main and context vectors. This could lead to higher-quality embeddings.
# In the present case it brings no great improvement (so we will avoid that).
# Having said that, if you want to take the sum of the two sets of vectors, you could simply write:
wv_context <- glove$components
str(t(wv_context)) # we need to transpose the matrix to directly compare it to the original glove_main matrix
str(glove_main)
dim(wv_context)
glove_main2 <- glove_main + t(wv_context)
str(t(wv_context))
str(glove_main)
str(glove_main2)

#############################################
#############################################
# A SVM with WE
#############################################
#############################################

glove_dataframe <- select(glove_dataframe, word, everything()) # move "word" to the first column
colnames(glove_dataframe)
glove_dataframe[1:5, 2:11]

# At the moment glove_dataframe has 12,264 rows (one for each feature) and 101 columns
# (1 column for the word and 100 for the 100 dimensions of the WE)
nrow(glove_dataframe)
# in the original Dfm we had 1,500 documents
ndoc(Dfm)

# What we want to do is estimate, for each document in the Dfm, the average position of its words
# in each of the 100 WE dimensions. As a result we generate a new matrix with 1,500 rows
# (one for each document) and 100 columns (the average position of each document in each WE dimension)
embed <- matrix(NA, nrow=ndoc(Dfm), ncol=100) # empty matrix
for (i in 1:ndoc(Dfm)){
  if (i %% 100 == 0) message(i, '/', ndoc(Dfm))
  # extract word counts
  vec <- as.numeric(Dfm[i,])
  # keep words with counts of 1 or more
  doc_words <- featnames(Dfm)[vec>0]
  # extract embeddings for those words
  embed_vec <- glove_dataframe[glove_dataframe$word %in% doc_words, 2:101]
  # aggregate from word- to document-level embeddings by taking the average
  embed[i,] <- colMeans(embed_vec, na.rm=TRUE)
  # if no words have embeddings, simply set the document vector to 0
  if (nrow(embed_vec)==0) embed[i,] <- 0
}
str(embed)
str(tot)

# let's separate our original training-set and test-set
training <- c(1:500)
str(training)
test <- c(501:1500)
str(test)

# our benchmark: 0.524
str(tot)
prop.table(table(tot$Sentiment[training]))

# SVM
set.seed(123)
system.time(SVM <- svm(x=embed[training,], y=as.factor(tot$Sentiment[training]),
                       kernel='linear', cost = 1))
# computing predicted values
predicted_SV <- predict(SVM, embed[test,], type="class")
table(predicted_SV)
prop.table(table(predicted_SV))

# in this case we do know the "true" value for Sentiment in the test-set, so let's build a confusion matrix
confusion <- confusionMatrix(predicted_SV, as.factor(tot$Sentiment[test]), mode = "everything")
accuracySV <- confusion$overall[1]
F1_0 <- (2*confusion$byClass[1]*confusion$byClass[3])/(confusion$byClass[1]+confusion$byClass[3]) # F1 value for category negative
F1_1 <- (2*confusion$byClass[2]*confusion$byClass[4])/(confusion$byClass[2]+confusion$byClass[4]) # F1 value for category positive
F1_SV <- (F1_0+F1_1)/2
accuracySV
F1_SV
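# Since we will compute the same macro-F1 several times below, here is a small helper
# (a sketch; "macro_f1" is our own name, not a caret function) that wraps the formula used above.
# We keep the explicit computation in the rest of the script for transparency
macro_f1 <- function(confusion){
  f1_neg <- (2*confusion$byClass[1]*confusion$byClass[3])/(confusion$byClass[1]+confusion$byClass[3])
  f1_pos <- (2*confusion$byClass[2]*confusion$byClass[4])/(confusion$byClass[2]+confusion$byClass[4])
  (f1_neg + f1_pos)/2
}
macro_f1(confusion) # should match F1_SV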
###########################################################################
###########################################################################
# SAME ANALYSIS with a pre-trained word embedding computed on a sample
# of Google News (not sharing that much with movie reviews... still...)
###########################################################################
###########################################################################

# extracting the word embeddings on 100 dimensions
pre_trained <- readr::read_delim("vector.txt", skip=1, delim=" ", quote="",
                                 col_names=c("word", paste0("V", 1:100))) # 100 dimensions + 1 column for features
colnames(pre_trained)
nrow(pre_trained) # almost 72K features included; many more than the number of features in our corpus
nrow(glove_dataframe)

# let's match the words included in the pre-trained WE object (pre_trained)
# with the words included in the Dfm of our corpus
pre_trained <- pre_trained[pre_trained$word %in% featnames(Dfm),]
nrow(pre_trained) # fewer words than in our Dfm. Why? Because some words included in the Dfm
# are not included in the pre-trained WE
pre_trained[1:20, 1:11]

embed2 <- matrix(NA, nrow=ndoc(Dfm), ncol=100)
for (i in 1:ndoc(Dfm)){
  if (i %% 100 == 0) message(i, '/', ndoc(Dfm))
  # extract word counts
  vec <- as.numeric(Dfm[i,])
  # keep words with counts of 1 or more
  doc_words <- featnames(Dfm)[vec>0]
  # extract embeddings for those words
  embed_vec2 <- pre_trained[pre_trained$word %in% doc_words, 2:101]
  # aggregate from word- to document-level embeddings by taking the average
  embed2[i,] <- colMeans(embed_vec2, na.rm=TRUE)
  # if no words have embeddings, simply set the document vector to 0
  if (nrow(embed_vec2)==0) embed2[i,] <- 0
}
str(embed2) # 1,500 documents, with 100 columns (1 for each dimension)

# SVM
library(e1071)
set.seed(123)
system.time(SVM <- svm(x=embed2[training,], y=as.factor(tot$Sentiment[training]),
                       kernel='linear', cost = 1))
# computing predicted values
predicted_SV <- predict(SVM, embed2[test,], type="class")
table(predicted_SV)
prop.table(table(predicted_SV))

confusion <- confusionMatrix(predicted_SV, as.factor(tot$Sentiment[test]), mode = "everything")
accuracySV_newWE <- confusion$overall[1]
F1_0 <- (2*confusion$byClass[1]*confusion$byClass[3])/(confusion$byClass[1]+confusion$byClass[3]) # F1 value for category negative
F1_1 <- (2*confusion$byClass[2]*confusion$byClass[4])/(confusion$byClass[2]+confusion$byClass[4]) # F1 value for category positive
F1_SV_newWE <- (F1_0+F1_1)/2

# better performance than what we got when using the local WE
accuracySV_newWE
F1_SV_newWE
accuracySV
F1_SV
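# One thing worth checking when comparing local vs. pre-trained embeddings is vocabulary
# coverage: documents whose words lack a pre-trained vector get a noisier (or zero) embedding.
# A quick diagnostic (a sketch using only objects defined above):
covered <- featnames(Dfm) %in% pre_trained$word
table(covered)             # how many Dfm features have a pre-trained vector
prop.table(table(covered)) # share of the corpus vocabulary that is covered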
#############################################
#############################################
# A SVM with the pre-trained WE + features from the BoW approach
#############################################
#############################################

# Finally, we can also combine both bag-of-words and WE features into a single matrix,
# and use a ML algorithm to let it choose for us the best set of features.
# This combination of features and classifier could help to reach the best performance
X <- as.matrix(cbind(Dfm, embed2))
str(Dfm) # 12,282 features in our Dfm
str(X)   # 12,282 features in the Dfm + 100 dimensions from the WE = 12,382 features

set.seed(123)
system.time(SVM_ALL <- svm(x=X[training,], y=as.factor(tot$Sentiment[training]),
                           kernel='linear', cost = 1))
# computing predicted values
predicted_SV_ALL <- predict(SVM_ALL, X[test,], type="class")

confusion <- confusionMatrix(predicted_SV_ALL, as.factor(tot$Sentiment[test]), mode = "everything")
accuracySV_ALL <- confusion$overall[1]
F1_0 <- (2*confusion$byClass[1]*confusion$byClass[3])/(confusion$byClass[1]+confusion$byClass[3]) # F1 value for category negative
F1_1 <- (2*confusion$byClass[2]*confusion$byClass[4])/(confusion$byClass[2]+confusion$byClass[4]) # F1 value for category positive
F1_SV_ALL <- (F1_0+F1_1)/2
accuracySV_ALL
F1_SV_ALL

# the external WE works better than the local one; the best model combines WE + BoW features
accuracySV
accuracySV_newWE
accuracySV_ALL
F1_SV
F1_SV_newWE
F1_SV_ALL

#############################################
#############################################
# A SVM with CV and WE? Same old stuff!
#############################################
#############################################

train <- as.matrix(embed[training,]) # let's focus on the local WE
ttrain <- train # let's rename the original train matrix, given that we will reuse the name "train" in the loop below

# let's split our training-set in 5 folds
set.seed(123) # set the seed for replicability
k <- 5        # the number of folds
folds <- cvFolds(NROW(ttrain), K=k)
str(folds)

system.time(for(i in 1:k){
  train <- embed[folds$subsets[folds$which != i], ]      # set the training set
  validation <- embed[folds$subsets[folds$which == i], ] # set the validation set
  set.seed(123)
  # fit the model on the train data only (note the name of the outcome: "Sentiment")
  newrf <- svm(y=as.factor(docvars(Dfm)$Sentiment[folds$subsets[folds$which != i]]),
               x=train, kernel='linear', cost = 1)
  # get the predictions for the validation set (from the model just fit on the train data)
  newpred <- predict(newrf, validation, type="class")
  class_table <- table("Predictions"=newpred,
                       "Actual"=docvars(Dfm)$Sentiment[folds$subsets[folds$which == i]])
  print(class_table)
  df <- confusionMatrix(class_table, mode = "everything")
  # create the name for the object that will save the confusion matrix for each fold (=5)
  df.name <- paste0("conf.mat.sv", i)
  assign(df.name, df)
})

SVMPredict <- data.frame(col1=vector(), col2=vector(), col3=vector())
for(i in mget(ls(pattern = "conf.mat.sv"))) {
  col1 <- (i)$overall[1] # save in the data frame the accuracy value
  col2 <- (2*(i)$byClass[1]*(i)$byClass[3])/((i)$byClass[1]+(i)$byClass[3]) # save the F1 value for negative
  col3 <- (2*(i)$byClass[2]*(i)$byClass[4])/((i)$byClass[2]+(i)$byClass[4]) # save the F1 value for positive
  SVMPredict <- rbind(SVMPredict, cbind(col1, col2, col3))
}
colnames(SVMPredict)[1] <- "Accuracy"
colnames(SVMPredict)[2] <- "F1 negative"
colnames(SVMPredict)[3] <- "F1 positive"
SVMPredict[is.na(SVMPredict)] <- 0
SVMPredict
str(SVMPredict)

# Let's compare the average values for accuracy and F1
acc_sv_avg <- mean(SVMPredict[, 1])
f1_sv_avg <- mean(colMeans(SVMPredict[-1]))
acc_sv_avg
f1_sv_avg
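# Beyond the fold averages, the spread across the 5 folds is worth a look
# (a sketch; a small spread suggests the CV estimate is stable)
apply(SVMPredict, 2, sd)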