rm(list=ls(all=TRUE))
setwd("C:/Users/luigi/Dropbox/TOPIC MODEL")
getwd()
library(quanteda)
library(readtext)
library(dplyr)
library(stringr)
library(e1071)
library(caret)
library(cvTools)
library(text2vec)
library(lsa)
library(quanteda.textstats)
library(tsne)
library(knitr)
library(plotly)
library(Rtsne)
library(ggplot2)
library(htmlwidgets)
# Let's focus on MOVIE reviews. We have a training and a test-set here.
# Let's sum the two sets to have a larger corpus on which running the WE
x <- read.csv("train_review2.csv", stringsAsFactors=FALSE)
str(x)
# Note that in this case, we do know the "true" value for sentiment with respect to the texts included in the test-set!
x10 <- read.csv("test_review2.csv", stringsAsFactors=FALSE)
str(x10)
tot <- rbind(x, x10)
str(tot)
nrow(tot)
# look at the word "krippendorf's"
str(tot)
tot$text <- gsub("'"," ",tot$text) # let's replace the apostrophe with an empty space (sometimes Quanteda has problems to remove them
# now...
str(tot)
myCorpusTwitter <- corpus(tot)
# We do minimal preprocessing. We also follow standard practice which is to include all words with a minimum count above
# a given threshold, between 5-10 (we choose 5).
tok2 <- tokens(myCorpusTwitter , remove_punct = TRUE, remove_numbers=TRUE, remove_symbols = TRUE, split_hyphens = TRUE, remove_separators = TRUE)
tok2 <- tokens_remove(tok2, stopwords("en"))
Dfm <- dfm(tok2 )
topfeatures(Dfm )
Dfm <- dfm_remove(Dfm , min_nchar=2)
# tok2 <- tokens_remove(tok2, c("s", "t"))
# Dfm <- dfm(tok2 )
topfeatures(Dfm )
Dfm <- dfm_trim(Dfm, min_termfreq = 5, verbose=TRUE)
#############################################
#############################################
# Applying the GloVe algorithm via Quanteda
#############################################
#############################################
# Let's first extract the vocabulary from our Dfm
Dfm_vocab <- featnames(Dfm )
str(Dfm_vocab)
# Then let's select the tokens that are in the corpus
mov_tokens <- tokens(myCorpusTwitter)
# Then let's match the two vocabularies (i.e., let's keep only those features that are both present in the tokenized object
# as well as in the Dfm (after that we pre-processed and trimmed the texts!)).
# Note the following: the command "padding=TRUE" leaves an empty string where the removed tokens previously existed.
# This is useful if a positional match is needed between the pre- and post-selected tokens,
# for instance if a window of adjacency needs to be computed. This prevents that non-adjacent words (in the original text)
# becomes adjacent (after pre-processing). For example "I like the ice-cream" and windows=1
# Summing up: removal of tokens changes the lengths of documents, but they remain the same if you set padding = TRUE
mov_tokens2 <- tokens_select(mov_tokens, Dfm_vocab, padding = TRUE)
# Create a term co-occurance matrix (default window is 5; you can change it by using the window command)
fcmat_news_noweights <- fcm(mov_tokens2, context = "window")
fcmat_news_noweights
# alternatively we can weight out FCM
# Why using the weights option? In this case, we want to weight more tokens that are closer compared to those more far away
# to the centre/target word
fcmat_news <- fcm(mov_tokens2, context = "window", count = "weighted", weights = 1/(1:5))
fcmat_news
# Note that fcm creates a sparse feature co-occurrence matrix. This makes the matrix more easy to treat (and less
# computational heavy). This is a good thing when dealing with text analysis!
# Compare the two following examples. We want to fcm these two texts
testText <- c("Kitty likes milk", "Cat likes milk")
testText
# a dense co-occurance matrix with window=2 would be:
dense <- matrix(
c(0,1,1,0,1,0,2,1,1,2,0,1,0,1,1,0), nrow = 4, ncol = 4, byrow = TRUE)
rownames(dense ) = c("kitty", "likes", "milk", "cat")
colnames(dense ) = c("kitty", "likes", "milk", "cat")
dense
# a sparse feature co-occurrence matrix
testCorpus <- corpus(testText)
tok3 <- tokens(testCorpus)
sparse <- fcm(tok3, context = "window", window=2)
sparse
dense
# we can also visualize a semantic network analysis starting from the fcm
library(quanteda.textplots)
set.seed(123)
textplot_network(sparse)
# where is the graph? With R-Studio you are able to plot it correctly.
# In R-Gui you have to save it for example as a .pdf file
pdf(file = "semantic_network.pdf")
set.seed(123)
textplot_network(sparse)
dev.off()
#############################################
#############################################
# Let's estimate word embeddings via Glove
#############################################
#############################################
# First let's define the loss function for GloVe
# Which are the main parameters to look at?
# 1) rank=number of dimensions (100 dimensions is the default)
# 2) x_max=maximum number of co-occurrences to use in the weighting function. Here we select 10.
# This parameter affects the weighting function in the loss function that GloVe wants to minimize as discussed
set.seed(123)
system.time(glove <- GlobalVectors$new(rank=100, x_max=10))
# Now let's train the model!
# Note: if you want to exactly reproduce your results, you have to identify a set.seed and specify n_threads = 1 (i.e., no parallelization!).
# You will need more computational time, but at least replication is assured.
# The differences if we replicate the analysis with a larger number of threads will be however minimal
# You can increase n_iter or decreasing the convergence_tol (but you would need more time...).
system.time(glove_main <- glove$fit_transform(fcmat_news, n_iter = 20, convergence_tol = 0.01, n_threads = 1)) # around 45 seconds
# take a look at the loss-function final value. You can use it to compare across different GloVe specifications as we discussed
# that's the result of our analysis
str(glove_main)
# as we discussed earlier, GloVe actually learns two matrices of word-vectors - main (the one we employed above!) and context
# You can think about that as "target" vectors and "context" vectors. Each word (target) should have its own vector but it also serve
# as context for other words. Since all computations are symmetrical in the end both vectors should be identical.
# But since they use approximate computations the two vectors might be slightly different.
# While both of word-vectors matrices can be used as result it could be a good idea to average or take a sum of main and context vector.
# This could lead to higher quality embeddings. In the present case we will avoid that.
# Having said that, if you want to take the sum of the two vectors, you can simply write:
wv_context <- glove$components
str(t(wv_context)) # we need to transpose the matrix to directly compare it to the original glove_main matrix
str(glove_main)
dim(wv_context)
glove_main2 <- glove_main + t(wv_context)
# original matrix of word-vectors
str(glove_main)
# second matrix
str(t(wv_context))
# sum of the two
str(glove_main2)
#############################################
#############################################
############# plotting words in the dimensional space
#############################################
#############################################
glove_dataframe <- as.data.frame(glove_main)
nrow(glove_dataframe)
# the same as our co-occurance matrix of course!
nrow(fcmat_news)
colnames(glove_dataframe )
# let's add to glove_dataframe a specific column called "word" with the list of features
glove_dataframe$word <- row.names(glove_dataframe )
colnames(glove_dataframe )
# let's define a plot function for the second and third dimension
plot_words <- function(words, glove_dataframe){
# empty plot
plot(0, 0, xlim=c(-0.5, 0.5), ylim=c(-0.5,0.5), type="n",
xlab="Second dimension", ylab="Third dimension")
for (word in words){
# extract second and third dimensions
vector <- as.numeric(glove_dataframe[glove_dataframe$word==word,2:3])
# add to plot
text(vector[1], vector[2], labels=word)
}
}
plot_words(c("teen", "alien", "monster", "war", "bad", "murder", "good", "fear", "summer"), glove_dataframe)
# But what if we would like to plot all the dimensions together?
# The t-SNE algorithm can be used to visualize the embeddings.
# The goal of this algorithm is to take a set of points in a high-dimensional space and find
# a faithful representation of those points in a lower-dimensional space, typically the 2D plan.
# Because of time constraints we will only use it with the first 500 words.
# The algorithm is non-linear and adapts to the underlying data, performing different transformations on different regions.
# Those differences can be a major source of confusion. Therefore...
# ...to understand more about the t-SNE method take a look at: https://distill.pub/2016/misread-tsne/
# For example, a second feature of t-SNE is a tuneable parameter, ?perplexity,? which says (loosely)
# how to balance attention between local and global aspects of your data.
# The parameter is, in a sense, a guess about the number of close neighbors each point has.
# But which value consider for perplexity? A typical value for it is between 5-50
tsne <- Rtsne(glove_main[1:500,], perplexity = 50, pca = FALSE)
str(tsne)
tsne_plot <- tsne$Y
tsne_plot <- as.data.frame(tsne_plot)
str(tsne_plot)
tsne_plot$word <- row.names(glove_main)[1:500]
str(tsne_plot)
tsne_plot <- ggplot(tsne_plot, aes(x = V1, y = V2, label = word)) +
geom_text(size = 3)
# This plot may look like a mess, but if you zoom into the small groups you end up seeing some interesting patterns,
# for example on the top-right panel (intersection between hor. 2 and vert. 1), you can see "woman" and "man" close to each other,
# but "man" closer to "director"
tsne_plot
# let's transform the ggplot into an interacting plotly plot
ggplotly(tsne_plot)
# we can also save the html
# m <- ggplotly(tsne_plot)
# if you are using R-Studio, add "selfcontained =TRUE"
#saveWidget(m, "map2.html", selfcontained =FALSE, background = "white")
# Now we can start to play. The measure of interest in comparing two vectors will be cosine similarity (do you remember about it?)
# Let?s see what is similar to "girl" and "woman"
girl <- glove_main["girl", , drop = F]
cos_sim_girl <- sim2(x = glove_main, y = girl, method = "cosine", norm = "l2")
head(sort(cos_sim_girl[,1], decreasing = T), 10)
woman <- glove_main["woman", , drop = F]
cos_sim_woman <- sim2(x = glove_main, y = woman , method = "cosine", norm = "l2")
head(sort(cos_sim_woman [,1], decreasing = T), 10)
# Once we have the vectors for each word, we can also compute the similarity between a pair of words:
similarity <- function(word1, word2){
lsa::cosine(
x=as.numeric(glove_dataframe[glove_dataframe$word==word1,1:100]),
y=as.numeric(glove_dataframe[glove_dataframe$word==word2,1:100]))
}
similarity("father", "mother")
similarity("summer", "monster")
similarity("tarantino", "fiction")
# Let's see an analogy
# Which is the spatially closest word vector to "(vec)man - (vec)fun + (vec)woman" = ?
ex <- glove_main["man", , drop = FALSE] -
glove_main["fun", , drop = FALSE] +
glove_main["woman", , drop = FALSE]
cos_sim_test <- sim2(x = glove_main, y = ex , method = "cosine", norm = "l2")
head(sort(cos_sim_test[,1], decreasing = T), 5)
# Subtracting the?"fun" vector from the?"man" vector and adding?"woman", the most similar word to this would be therefore?"wife".
# A rather not-politically correct result that points to societal stereotypes encoded in movie reviews?
#############################################
#############################################
# A SVM with WE
#############################################
#############################################
colnames(glove_dataframe )
glove_dataframe <- select(glove_dataframe, word, everything()) # move the "word" column to the top
colnames(glove_dataframe )
glove_dataframe[1:5, 2:11]
# At the moment glove_dataframe is a matrix of 12,243 rows (one for each feature) and 101 columns (1 column for word and the other 100 for the 100
# dimensions of WE)
nrow(glove_dataframe)
ncol(glove_dataframe)
# but in the original Dfm I had 1,500 documents
ndoc(Dfm)
# what I want to do is estimating for each document in the Dfm a value equals to the weighted average of its words
# in each of the 100 dimensions of WE. As a result I can generate a new matrix with 1,500 rows (one for each document)
# and 100 columns (with the average position of each document in each of the 100 dimensions of WE), wherin
# each text will be represented by a mean vector. Note that other possibilities are available.
# For example, recent applications, such as the doc2vec approach, combine word and document embeddings:
# see: Le, Q., & Mikolov, T. (2014). Distributed representations of sentences and documents. International Conference on Machine Learning
embed <- matrix(NA, nrow=ndoc(Dfm), ncol=100) # empty matrix
for (i in 1:ndoc(Dfm)){
if (i %% 100 == 0) message(i, '/', ndoc(Dfm))
# extract word counts
vec <- as.numeric(Dfm[i,])
# keep words with counts of 1 or more
doc_words <- featnames(Dfm)[vec>0]
# extract embeddings for those words
embed_vec <- glove_dataframe[glove_dataframe$word %in% doc_words, 2:101]
# aggregate from word- to document-level embeddings by taking AVG
embed[i,] <- colMeans(embed_vec, na.rm=TRUE)
# if no words in embeddings, simply set to 0
if (nrow(embed_vec)==0) embed[i,] <- 0
}
str(embed)
str(tot)
# let's separate our original training [first 500 documents] and the test-set
training <- c(1:500)
str(training)
test <- c(501:1500)
str(test)
# our benchmark given our super "naive" algorithm: 0.524
str(tot)
prop.table(table(tot$Sentiment[training]))
# let's compute our SVM
system.time(SVM <- svm(x=embed[training,], y=as.factor(tot$Sentiment[training]), kernel='linear', cost = 1))
# let's predict the values for the test-set
predicted_SV <- predict(SVM, embed[test,], type="class")
table(predicted_SV)
prop.table(table(predicted_SV))
# In this specific case we do know the "true" value for Sentiment in the test-set:
table(tot$Sentiment[test])
# So let's build a confusion-matrix contrasting the "tue" values with the predicted "values"
confusion <- confusionMatrix( predicted_SV, as.factor(tot$Sentiment[test]), mode = "everything")
confusion
accuracySV <- confusion $overall[1]
F1_0 <- (2*confusion $byClass[1]*confusion $byClass[3])/(confusion $byClass[1]+confusion $byClass[3]) # F1 value for category negative
F1_1 <- (2*confusion $byClass[2]*confusion $byClass[4])/(confusion $byClass[2]+confusion $byClass[4]) # F1 value for category positive
F1_SV <- (F1_0+F1_1)/2
accuracySV
F1_SV
# NOTE one important drawback in using SVM + WE: now the usual global interepration that we give to a model becomes almost
# impossible (given that it will be based on Dimensions not on words...)
library(iml)
library(future)
library(future.callr)
library(gridExtra)
# For running a permutation, remember you need to have a data frame, not a matrix (the iml package requires that)
embed_df <- as.data.frame(embed)
svm_model <- svm(as.factor(tot$Sentiment[training]) ~ ., data = embed_df[training,], kernel='linear', cost = 1)
mod <- Predictor$new(svm_model, data = embed_df[training,], y = as.factor(tot$Sentiment[training]), type = "prob")
system.time({
plan("callr", workers = 6)
set.seed(123)
imp2 <- FeatureImp$new(mod, loss = "ce", n.repetitions=1)
}) # around 25 secs on my laptop
imp2
###########################################################################
###########################################################################
# SAME ANALYIS with with a pre-trained word embeddings computed
# on a sample of Google news (not sharing that much with movie reviews, still...)
###########################################################################
###########################################################################
# extracting the word embeddings on 100 dimensions
pre_trained <- readr::read_delim("vector.txt",
skip=1, delim=" ", quote="",
col_names=c("word", paste0("V", 1:100)))
# 100 dimensions + 1 column for features
colnames(pre_trained)
nrow(pre_trained) # almost 72K features included; much more than the number of features included in our corpus!
nrow(glove_dataframe)
################# let's play a bit with this new pre-trained WE
# convert from a tibble to a data frame
class(pre_trained)
pre_trained2 <- as.data.frame(pre_trained)
class(pre_trained2)
row.names(pre_trained2) <- pre_trained2$word
pre_trained2 <- pre_trained2[-c(1)]
pre_trainedMatrix <- as.matrix(pre_trained2)
str(pre_trainedMatrix)
str(glove_main)
# Let's see an analogy
# Which is the spatially closest word vector to "(vec)king - (vec)male + (vec)female" = ?
ex <- pre_trainedMatrix["king", , drop = FALSE] -
pre_trainedMatrix["male", , drop = FALSE] +
pre_trainedMatrix["female", , drop = FALSE]
cos_sim_test <- sim2(x = pre_trainedMatrix, y = ex , method = "cosine", norm = "l2")
head(sort(cos_sim_test[,1], decreasing = T), 5)
# Let's see a second analogy
# Which is the spatially closest word vector to "(vec)paris - (vec)france + (vec)uk" = ?
ex2 <- pre_trainedMatrix["paris", , drop = FALSE] -
pre_trainedMatrix["france", , drop = FALSE] +
pre_trainedMatrix["uk", , drop = FALSE]
cos_sim_test <- sim2(x = pre_trainedMatrix, y = ex2 , method = "cosine", norm = "l2")
head(sort(cos_sim_test[,1], decreasing = T), 5)
#################
# let's NOW match the words included in the pre-trained WE object (pre_trained) with the words included in the Dfm of our corpus
pre_trained<- pre_trained[pre_trained$word %in% featnames(Dfm),]
nrow(pre_trained) # around 1,000 features less than in our previous GloVe data-frame (see below).
# Why? Cause some words included in the DfM were not included in pre_trained
nrow(glove_dataframe)
pre_trained[1:20, 1:11]
embed2 <- matrix(NA, nrow=ndoc(Dfm), ncol=100)
for (i in 1:ndoc(Dfm)){
if (i %% 100 == 0) message(i, '/', ndoc(Dfm))
# extract word counts
vec <- as.numeric(Dfm[i,])
# keep words with counts of 1 or more
doc_words <- featnames(Dfm)[vec>0]
# extract embeddings for those words
embed_vec2 <- pre_trained[pre_trained$word %in% doc_words, 2:101]
# aggregate from word- to document-level embeddings by taking AVG
embed2[i,] <- colMeans(embed_vec2, na.rm=TRUE)
# if no words in embeddings, simply set to 0
if (nrow(embed_vec2)==0) embed2[i,] <- 0
}
str(embed2) # 1500 documents, with 100 columns (1 for each dimension)
# SVM
system.time(SVM <- svm(x=embed2[training,], y=as.factor(tot$Sentiment[training]), kernel='linear', cost = 1))
# computing predicted values
predicted_SV <- predict(SVM, embed2[test,], type="class")
table(predicted_SV)
prop.table(table(predicted_SV))
confusion <- confusionMatrix( predicted_SV, as.factor(tot$Sentiment[test]), mode = "everything")
accuracySV_newWE <- confusion $overall[1]
F1_0 <- (2*confusion $byClass[1]*confusion $byClass[3])/(confusion $byClass[1]+confusion $byClass[3]) # F1 value for category negative
F1_1 <- (2*confusion $byClass[2]*confusion $byClass[4])/(confusion $byClass[2]+confusion $byClass[4]) # F1 value for category positive
F1_SV_newWE <- (F1_0+F1_1)/2
# better performance than what we got when using the local WE
accuracySV_newWE
F1_SV_newWE
accuracySV
F1_SV
#############################################
#############################################
# A SVM with the pre-trained WE + features from the BoW approach
#############################################
#############################################
# Finally, we can also combine both bag-of-words and WE features into a single matrix,
# and use a ML algorithm to let it choose for us the best set of features.
# This combination of features and classifier could help to reach the best performance
# let's combine our dfm with the matrix we estimate out of the last GloVe results
X <- as.matrix(cbind(Dfm, embed2))
length(Dfm@Dimnames$features) # 12261 feauteres in our DfM
str(X) # 12261 features in Dfm + 100 dimensions from WE = 12361 features
system.time(SVM_ALL <- svm(x=X[training,], y=as.factor(tot$Sentiment[training]),
kernel='linear', cost = 1))
# computing predicted values
predicted_SV_ALL <- predict(SVM_ALL, X[test,], type="class")
confusion <- confusionMatrix( predicted_SV_ALL, as.factor(tot$Sentiment[test]), mode = "everything")
accuracySV_ALL <- confusion $overall[1]
F1_0 <- (2*confusion $byClass[1]*confusion $byClass[3])/(confusion $byClass[1]+confusion $byClass[3]) # F1 value for category 0
F1_1 <- (2*confusion $byClass[2]*confusion $byClass[4])/(confusion $byClass[2]+confusion $byClass[4]) # F1 value for category 0
F1_SV_ALL <- (F1_0+F1_1)/2
accuracySV_ALL
F1_SV_ALL
# an external WE better than the local one; the best model is a WE+BoW
accuracySV
accuracySV_newWE
accuracySV_ALL
F1_SV
F1_SV_newWE
F1_SV_ALL
#############################################
#############################################
# How to run Cross-Validation with a SVM with WE? Same old stuff!
#############################################
#############################################
# let's focus here on the matrix we compute out of the first GloVE estimation (i.e., using the local WE)
train <- as.matrix(embed[training,]) # let's focus on the local WE and let's turn it into a matrix
ttrain <- train # let's change the name of the original train data.frame, given that we are already going to use such name below in the loop
# let's split our training-set in 5 folds
set.seed(123) # set the see for replicability
k <- 5 # the number of folds
folds <- cvFolds(NROW(ttrain ), K=k)
str(folds)
system.time(for(i in 1:k){
train <- embed[folds$subsets[folds$which != i], ] # Set the training set
validation <- embed[folds$subsets[folds$which == i], ] # Set the validation set
set.seed(123)
newrf <- svm(y= as.factor(Dfm[folds$subsets[folds$which != i], ]@docvars$Sentiment) ,x=train, kernel='linear', cost = 1) # Get your new model (just fit on the train data) and ADD the name of the output (in this case "Sentiment")
newpred <- predict(newrf, validation, type="class") # Get the predicitons for the validation set (from the model just fit on the train data)
class_table <- table("Predictions"= newpred, "Actual"=Dfm[folds$subsets[folds$which == i], ]@docvars$Sentiment)
print(class_table)
df<-confusionMatrix( class_table, mode = "everything") # Add name output (in this case "Sentiment")
df.name<-paste0("conf.mat.sv",i) # create the name for the object that will save the confusion matrix for each loop (=5)
assign(df.name,df)
})
SVMPredict <- data.frame(col1=vector(), col2=vector(), col3=vector())
for(i in mget(ls(pattern = "conf.mat.sv")) ) {
col1 <-(i)$overall[1] # save in the matrix the accuracy value
col2 <- (2*(i)$byClass[1]*(i)$byClass[3])/((i)$byClass[1]+(i)$byClass[3]) # save in the matrix the F1 value for negative
col3 <- (2*(i)$byClass[2]*(i)$byClass[4])/((i)$byClass[2]+(i)$byClass[4]) # save in the matrix the F1 value for positive
SVMPredict <- rbind(SVMPredict , cbind(col1, col2, col3))
}
colnames(SVMPredict )[1] <- "Accuracy"
colnames(SVMPredict )[2] <- "F1 negative"
colnames(SVMPredict )[3] <- "F1 positive"
SVMPredict [is.na(SVMPredict )] <- 0
SVMPredict
str(SVMPredict )
# Let's compare the average value for accuracy and f1
acc_sv_avg <- mean(SVMPredict[, 1] )
f1_sv_avg <- mean(colMeans(SVMPredict[-1] ))
acc_sv_avg
f1_sv_avg
# Note that the result you get via cross-validation is very close to the result you get in the test-set (when knowing the "true" value for it).
# Indeed, cross-validation is a good procedure!!!
accuracySV
F1_SV