rm(list=ls(all=TRUE))
setwd("C:/Users/luigi/Dropbox/TOPIC MODEL")
getwd()

library(quanteda)
library(readtext)
library(text2vec)
library(quanteda.textplots)
library(Rtsne)
library(ggplot2)
library(plotly)
library(randomForest)
library(caret)

# Let's focus on a dataset containing MOVIE reviews. We have a training-set and a test-set.
# Let's combine the two sets to obtain a larger corpus on which to run the WE

# training set
x <- read.csv("train_review2.csv", stringsAsFactors=FALSE)
str(x)

# test set
# Note that in this case we do know the "true" sentiment value for the texts included in the test-set!
x10 <- read.csv("test_review2.csv", stringsAsFactors=FALSE)
str(x10)

# Let's add the test-set to the training-set
tot <- rbind(x, x10)
str(tot)
nrow(tot)

# look at the word "krippendorf's"
str(tot)
tot$text <- gsub("'", " ", tot$text) # let's replace the apostrophe with an empty space (sometimes quanteda has problems removing it)
# now...
str(tot)

# Let's create our corpus
myCorpus <- corpus(tot)

# Let's extract the tokens from each of the texts included in the corpus.
# We also do some minimal preprocessing
tok2 <- tokens(myCorpus, remove_punct = TRUE, remove_numbers = TRUE,
               remove_symbols = TRUE, split_hyphens = TRUE, remove_separators = TRUE)
tok2 <- tokens_remove(tok2, stopwords("en"))
Dfm <- dfm(tok2)
topfeatures(Dfm)

# Let's remove the words with just 1 character
Dfm <- dfm_remove(Dfm, min_nchar=2)
topfeatures(Dfm)

# Moreover, we follow the standard practice of keeping only words with a minimum count above a given threshold,
# typically between 5 and 10 (we choose 5); otherwise the co-occurrence matrix would be almost empty for such words
Dfm <- dfm_trim(Dfm, min_termfreq = 5, verbose=TRUE)

#############################################
#############################################
# Applying the GloVe algorithm via Quanteda
#############################################
#############################################

# Let's first extract the vocabulary from our Dfm
Dfm_vocab <- featnames(Dfm)
str(Dfm_vocab)

# Then let's extract the tokens from our previously defined corpus
mov_tokens <- tokens(myCorpus)

# Then let's match the two vocabularies (i.e., let's keep only those features that are present both in the tokenized corpus
# and in the Dfm - that is, after we pre-processed and trimmed the texts!)
# Note the following: the option "padding = TRUE" below leaves an empty string where the removed tokens previously existed.
# This is useful if a positional match is needed between the pre- and post-selected tokens,
# for instance if a window of adjacency needs to be computed. It prevents non-adjacent words (in the original text)
# from becoming adjacent (after pre-processing). Without "padding = TRUE", the words "eat" and "sushi" in the sentence
# "I like to eat 2 sushi sets" would end up in the same window with window=1 (after removing the numbers, as we did for our corpus).
# With "padding = TRUE" that wouldn't happen.
# Summing up: removal of tokens changes the lengths of documents, but they stay the same if you set "padding = TRUE"
# (see the small toy illustration right below)
mov_tokens2 <- tokens_select(mov_tokens, Dfm_vocab, padding = TRUE)
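# A minimal toy illustration of the padding behaviour described above (this example is an addition to the script,
# not part of the original data). We drop the token "2" from the sentence used in the comment and compare the two settings:
toy <- tokens("I like to eat 2 sushi sets")
keep <- c("i", "like", "to", "eat", "sushi", "sets")   # every token except the number "2"
tokens_select(toy, keep, padding = FALSE)  # "eat" and "sushi" become adjacent
tokens_select(toy, keep, padding = TRUE)   # an empty pad keeps them one position apart, as in the original text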
# Create a term co-occurrence matrix (the default window is 5; you can change it via the window argument)
fcmat_news_noweights <- fcm(mov_tokens2, context = "window")
fcmat_news_noweights

# Alternatively, we can weight our FCM. Why use the weights option?
# In this case, we want to give more weight to tokens that are closer to the centre/target word than to those farther away.
# This is one way to account for the fact that very distant word pairs are expected to contain less relevant information
# about the words' relationship to one another
fcmat_news <- fcm(mov_tokens2, context = "window", count = "weighted", weights = 1/(1:5))
fcmat_news

# Note that fcm creates a sparse feature co-occurrence matrix. This makes the matrix easier to handle (and less
# computationally heavy). That is a good thing when dealing with text analysis!
# Compare the two following examples
testText <- c("Kitty likes milk", "Cat likes milk")
testText

# A dense co-occurrence matrix with window=2 would look like the following one:
dense <- matrix(c(0, 1, 1, 0,
                  1, 0, 2, 1,
                  1, 2, 0, 1,
                  0, 1, 1, 0),
                nrow = 4, ncol = 4, byrow = TRUE)
rownames(dense) <- c("kitty", "likes", "milk", "cat")
colnames(dense) <- c("kitty", "likes", "milk", "cat")
dense

# However, if we apply fcm to these two texts, we obtain a sparse feature co-occurrence matrix
testCorpus <- corpus(testText)
tok3 <- tokens(testCorpus)
sparse <- fcm(tok3, context = "window", window = 2)
sparse
dense

# We can also visualize a semantic network starting from the fcm
set.seed(123)
textplot_network(sparse)

#############################################
#############################################
# Let's estimate WE via GloVe
#############################################
#############################################

# First let's define the GloVe model and the parameters that enter its loss function.
# Which are the main parameters to look at?
# 1) rank = number of dimensions (100 dimensions is the default)
# 2) x_max = maximum number of co-occurrences to use in the weighting function. Here we select 10.
#    This parameter affects the weighting function in the loss function that GloVe wants to minimize (remember our discussion!)
#    (see the sketch of that weighting function right below)
glove <- GlobalVectors$new(rank = 100, x_max = 10)
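# Just to make the role of x_max concrete, here is a small sketch (an addition to the script) of the weighting
# function used in the GloVe loss, in the formulation of Pennington et al. (2014): f(x) = (x/x_max)^alpha for
# x < x_max, and 1 otherwise, with alpha = 0.75. Co-occurrence counts above x_max are all capped at weight 1,
# so very frequent pairs do not dominate the loss.
glove_weight <- function(x, x_max = 10, alpha = 0.75) {
  ifelse(x < x_max, (x / x_max)^alpha, 1)
}
glove_weight(c(1, 5, 10, 50))  # with x_max = 10, counts of 10 or more all receive weight 1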
# Now let's train the model!
# Note: if you want to reproduce your results exactly, you have to set a seed and specify n_threads = 1 (i.e., no parallelization!).
# You will need more computational time, but at least replication is assured.
# The differences when replicating the analysis with a larger number of threads will however be minimal.
# You can also increase n_iter (20 below is pretty low; a good value is between 50 and 100, but you would need more time...)
# or decrease convergence_tol.
# convergence_tol defines the early-stopping strategy. GloVe stops fitting when one of the two following conditions is satisfied:
# (a) GloVe has used all the specified iterations, or (b) cost_previous_iter / cost_current_iter - 1 < convergence_tol.
# By default GloVe performs all iterations
set.seed(123)
system.time(glove_main <- glove$fit_transform(fcmat_news, n_iter = 20, convergence_tol = 0.01, n_threads = 1)) # around 45 seconds
# You can use the final value of the loss function to compare different GloVe specifications

# That's the result of our analysis
str(glove_main)

# As we discussed earlier, GloVe actually learns two matrices of word-vectors: main (the one we employed above!) and context.
# You can think of them as "target" vectors and "context" vectors. Each word (target) has its own vector, but it also serves
# as context for other words. Since all computations are symmetrical, in the end both vectors should be identical.
# But since the computations are approximate, the two vectors might be slightly different.
# While either word-vector matrix can be used as the result, it can be a good idea to average, or sum, the main and context vectors.
# This could lead to higher-quality embeddings. In the present case we will avoid that.
# Having said that, if you want to take the sum of the two vectors, you can simply write:
wv_context <- glove$components
str(t(wv_context)) # we need to transpose the matrix to compare it directly to the original glove_main matrix
dim(wv_context)
glove_main2 <- glove_main + t(wv_context)
# original matrix of word-vectors
str(glove_main)
# second matrix
str(t(wv_context))
# sum of the two
str(glove_main2)

#############################################
#############################################
# Plotting words in the WE dimensional space
#############################################
#############################################

# Let's create a dataframe out of the GloVe results
glove_dataframe <- as.data.frame(glove_main)
nrow(glove_dataframe) # the same # of words as in our co-occurrence matrix, of course!
nrow(fcmat_news)
colnames(glove_dataframe)
# let's add to glove_dataframe a specific column called "word" with the list of features
glove_dataframe$word <- row.names(glove_dataframe)
colnames(glove_dataframe)

# let's define a plot function for the second and third dimension, for example (we can of course change the dimensions!)
plot_words <- function(words, glove_dataframe){
  # empty plot
  plot(0, 0, xlim = c(-0.5, 0.5), ylim = c(-0.5, 0.5), type = "n",
       xlab = "Second dimension", ylab = "Third dimension")
  for (word in words){
    # extract second and third dimensions
    vector <- as.numeric(glove_dataframe[glove_dataframe$word == word, 2:3])
    # add to plot
    text(vector[1], vector[2], labels = word)
  }
}

plot_words(c("teen", "alien", "monster", "war", "bad", "murder", "good", "fear", "summer"), glove_dataframe)

# But what if we would like to plot all the dimensions together?
# The t-SNE algorithm can be used to visualize the embeddings.
# The goal of this algorithm is to take a set of points in a high-dimensional space and find
# a faithful representation of those points in a lower-dimensional space, typically the 2D plane.
# The algorithm is non-linear and adapts to the underlying data, performing different transformations on different regions.
# For example, t-SNE has a tuneable parameter, perplexity, which says (loosely)
# how to balance attention between local and global aspects of your data.
# A typical value for perplexity is between 5 and 50.
# Because of time constraints we will only use it with the first 500 words of our corpus
set.seed(123)
tsne <- Rtsne(glove_main[1:500, ], perplexity = 50)
str(tsne)
tsne_plot <- tsne$Y
tsne_plot <- as.data.frame(tsne_plot)
str(tsne_plot)
tsne_plot$word <- row.names(glove_main)[1:500]
str(tsne_plot)
tsne_plot <- ggplot(tsne_plot, aes(x = V1, y = V2, label = word)) +
  geom_text(size = 3)
# This plot may look like a mess, but if you zoom into the small groups you end up seeing some interesting patterns,
# for example you can see "great" and "good" close to each other, as well as "film" and "movie"
tsne_plot
# let's transform the ggplot into an interactive plotly plot
ggplotly(tsne_plot)
# We can also save the plot as an html file:
# library(htmlwidgets)
# m <- ggplotly(tsne_plot)
# if you are using R-Studio, add "selfcontained = TRUE"
# saveWidget(m, "map2.html", selfcontained = FALSE, background = "white")

# Now we can start to play. A typical measure of interest when comparing two vectors is the cosine similarity.
# "Cosine similarity" is an intuitive measure of semantic similarity.
# More in detail, the cosine similarity between two texts with non-negative feature proportions ranges between 0 and 1,
# where 0 is reached when the two texts are completely different and 1 when they have identical feature proportions.
# Note, however, that word-embedding vectors can also contain negative values, so their cosine similarity ranges between -1 and 1.
# (See the small sketch of the formula right below.)
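# As a reminder of what is being computed (an addition to the script): the cosine similarity between two vectors x and y
# is sum(x*y) / (sqrt(sum(x^2)) * sqrt(sum(y^2))), i.e., the dot product of the two L2-normalized vectors.
cosine_manual <- function(x, y) sum(x * y) / (sqrt(sum(x^2)) * sqrt(sum(y^2)))
# e.g., this should match the corresponding entry returned by sim2() below (and by the similarity() function defined further down):
cosine_manual(glove_main["girl", ], glove_main["woman", ])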
# Let's see what is similar to "girl" and "woman"
girl <- glove_main["girl", , drop = FALSE]
cos_sim_girl <- sim2(x = glove_main, y = girl, method = "cosine", norm = "l2")
head(sort(cos_sim_girl[, 1], decreasing = TRUE), 10)

woman <- glove_main["woman", , drop = FALSE]
cos_sim_woman <- sim2(x = glove_main, y = woman, method = "cosine", norm = "l2")
head(sort(cos_sim_woman[, 1], decreasing = TRUE), 10)

# Once we have the vectors for each word, we can also compute the similarity between a pair of words:
similarity <- function(word1, word2){
  lsa::cosine(
    x = as.numeric(glove_dataframe[glove_dataframe$word == word1, 1:100]),
    y = as.numeric(glove_dataframe[glove_dataframe$word == word2, 1:100]))
}

similarity("father", "mother")
similarity("summer", "monster")
similarity("tarantino", "fiction")
similarity("woman", "wife")
similarity("man", "husband")
similarity("woman", "mother")
similarity("man", "father")

# Let's explore an analogy.
# Which is the spatially closest word vector to "(vec)man - (vec)fun + (vec)woman"?
ex <- glove_main["man", , drop = FALSE] - glove_main["fun", , drop = FALSE] + glove_main["woman", , drop = FALSE]
cos_sim_test <- sim2(x = glove_main, y = ex, method = "cosine", norm = "l2")
head(sort(cos_sim_test[, 1], decreasing = TRUE), 5)
# Subtracting the "fun" vector from the "man" vector and adding "woman", the most similar word is therefore "wife".
# A rather not-politically-correct result that points to societal stereotypes encoded in movie reviews?

#############################################
#############################################
# Doing Machine Learning classification with WE
#############################################
#############################################

colnames(glove_dataframe)
glove_dataframe <- dplyr::select(glove_dataframe, word, everything()) # let's move the "word" column to the front
# (select() comes from dplyr, which is not attached above, so we call it explicitly)
colnames(glove_dataframe)
glove_dataframe[1:5, 2:11]

# At the moment glove_dataframe is a data frame of 12,243 rows (one for each feature) and 101 columns
# (1 column for the word and the other 100 for the 100 dimensions of the WE)
nrow(glove_dataframe)
ncol(glove_dataframe)
# but in the original Dfm I had 1,500 documents
ndoc(Dfm)

# What I want to do now is estimate, for each document in the Dfm, the average of its words' vectors
# in each of the 100 dimensions of the WE. As a result I can generate a new matrix with 1,500 rows (one for each document)
# and 100 columns (with the average position of each document in each of the 100 dimensions of the WE), wherein
# each text is represented by a mean vector. Note that other possibilities are available.
# For example, recent applications, such as the doc2vec approach, combine word and document embeddings:
# see: Le, Q., & Mikolov, T. (2014). Distributed representations of sentences and documents.
# International Conference on Machine Learning.
embed <- matrix(NA, nrow = ndoc(Dfm), ncol = 100) # empty matrix
for (i in 1:ndoc(Dfm)){
  if (i %% 100 == 0) message(i, '/', ndoc(Dfm))
  # extract word counts
  vec <- as.numeric(Dfm[i, ])
  # keep words with counts of 1 or more
  doc_words <- featnames(Dfm)[vec > 0]
  # extract embeddings for those words
  embed_vec <- glove_dataframe[glove_dataframe$word %in% doc_words, 2:101]
  # aggregate from word- to document-level embeddings by taking the average
  embed[i, ] <- colMeans(embed_vec, na.rm = TRUE)
  # if no words are in the embeddings, simply set the document vector to 0
  if (nrow(embed_vec) == 0) embed[i, ] <- 0
}
str(embed)
str(tot)

# let's separate our original training-set [first 500 documents] and the test-set
training <- c(1:500)
str(training)
test <- c(501:1500)
str(test)

# Our benchmark, given by a super "naive" algorithm (always predicting the modal class): 0.524.
# We want the accuracy of our ML algorithm to outperform that value
str(tot)
prop.table(table(tot$Sentiment[training]))

# let's compute a Random Forest model
set.seed(123) # let's set a seed to be able to replicate the results!
system.time(RF <- randomForest(x = embed[training, ], y = as.factor(tot$Sentiment[training]),
                               do.trace = TRUE, importance = TRUE, ntree = 500))

# let's predict the values for the test-set
set.seed(123)
predicted_RF <- predict(RF, embed[test, ])
table(predicted_RF)
prop.table(table(predicted_RF))

# In this specific case we do know the "true" value for Sentiment in the test-set:
table(tot$Sentiment[test])

# So let's build a confusion matrix contrasting the "true" values with the predicted values
confusion <- confusionMatrix(predicted_RF, as.factor(tot$Sentiment[test]), mode = "everything")
confusion
accuracyRF <- confusion$overall[1]
F1_0 <- (2*confusion$byClass[1]*confusion$byClass[3])/(confusion$byClass[1]+confusion$byClass[3]) # F1 value for the negative category
F1_1 <- (2*confusion$byClass[2]*confusion$byClass[4])/(confusion$byClass[2]+confusion$byClass[4]) # F1 value for the positive category
F1_RF <- (F1_0+F1_1)/2
accuracyRF
F1_RF

# NOTE one important drawback of using any ML + WE: a global interpretation of our model now becomes meaningless
# (given that it will be based on WE dimensions, not on words...)
# Each feature's importance is assessed based on two criteria:
# - MeanDecreaseAccuracy: gives a rough estimate of the loss in prediction performance when that particular variable is omitted from the training set.
#   Caveat: if two variables are somewhat redundant, then omitting one of them may not lead to massive gains in prediction performance,
#   but would make the second variable more important.
# - MeanDecreaseGini: GINI is a measure of node impurity. Think of it like this: if you use this feature to split the data, how pure will the nodes be?
#   Highest purity means that each node contains only elements of a single class.
#   Assessing the decrease in GINI when that feature is omitted leads to an understanding of how important that feature is to split the data correctly.
varImpPlot(RF)
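# An optional helper (an addition to the script) that wraps the macro-F1 computation performed above, assuming a
# 2-class caret confusionMatrix object; it can be reused for the other models estimated below instead of repeating
# the F1_0 / F1_1 lines by hand.
macro_F1 <- function(cm) {
  f1_0 <- (2 * cm$byClass[1] * cm$byClass[3]) / (cm$byClass[1] + cm$byClass[3])  # F1, first class
  f1_1 <- (2 * cm$byClass[2] * cm$byClass[4]) / (cm$byClass[2] + cm$byClass[4])  # F1, second class
  unname((f1_0 + f1_1) / 2)
}
macro_F1(confusion)  # should match F1_RF computed above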
###########################################################################
###########################################################################
# Let's now employ pre-trained word embeddings computed
# on a sample of Google news (not sharing that much with movie reviews, still...)
###########################################################################
###########################################################################

# extracting the word embeddings on 100 dimensions
pre_trained <- readr::read_delim("vector.txt", skip = 1, delim = " ", quote = "",
                                 col_names = c("word", paste0("V", 1:100))) # 100 dimensions + 1 column for the features
colnames(pre_trained)
nrow(pre_trained) # almost 72K features included; many more than the number of features included in our corpus!
nrow(glove_dataframe)

# let's play a bit with this new pre-trained WE
# convert the object from a tibble to a data frame
class(pre_trained)
pre_trained2 <- as.data.frame(pre_trained)
class(pre_trained2)
row.names(pre_trained2) <- pre_trained2$word
pre_trained2 <- pre_trained2[-c(1)]
pre_trainedMatrix <- as.matrix(pre_trained2)
str(pre_trainedMatrix)
str(glove_main)

# Let's see an analogy
# Which is the spatially closest word vector to "(vec)king - (vec)male + (vec)female"?
ex <- pre_trainedMatrix["king", , drop = FALSE] - pre_trainedMatrix["male", , drop = FALSE] + pre_trainedMatrix["female", , drop = FALSE]
cos_sim_test <- sim2(x = pre_trainedMatrix, y = ex, method = "cosine", norm = "l2")
head(sort(cos_sim_test[, 1], decreasing = TRUE), 5)

# Let's see a second analogy
# Which is the spatially closest word vector to "(vec)paris - (vec)france + (vec)uk"?
ex2 <- pre_trainedMatrix["paris", , drop = FALSE] - pre_trainedMatrix["france", , drop = FALSE] + pre_trainedMatrix["uk", , drop = FALSE]
cos_sim_test <- sim2(x = pre_trainedMatrix, y = ex2, method = "cosine", norm = "l2")
head(sort(cos_sim_test[, 1], decreasing = TRUE), 5)

# and if we replace uk with japan?
ex3 <- pre_trainedMatrix["paris", , drop = FALSE] - pre_trainedMatrix["france", , drop = FALSE] + pre_trainedMatrix["japan", , drop = FALSE]
cos_sim_test <- sim2(x = pre_trainedMatrix, y = ex3, method = "cosine", norm = "l2")
head(sort(cos_sim_test[, 1], decreasing = TRUE), 5)

# Working so well on these analogies suggests that our pre-trained WE is of good quality

# Let's search for some semantic similarities
woman <- pre_trainedMatrix["woman", , drop = FALSE]
cos_sim_woman <- sim2(x = pre_trainedMatrix, y = woman, method = "cosine", norm = "l2")
head(sort(cos_sim_woman[, 1], decreasing = TRUE), 10)

pre_trained2$word <- row.names(pre_trained2)
similarity <- function(word1, word2){
  lsa::cosine(
    x = as.numeric(pre_trained2[pre_trained2$word == word1, 1:100]),
    y = as.numeric(pre_trained2[pre_trained2$word == word2, 1:100]))
}

similarity("woman", "wife")
similarity("man", "husband")
similarity("woman", "mother")
similarity("man", "father")

# let's NOW match the words included in the pre-trained WE object (pre_trained) with the words included in the Dfm of our corpus
pre_trained <- pre_trained[pre_trained$word %in% featnames(Dfm), ]
nrow(pre_trained) # around 1,000 features fewer than in our previous GloVe data frame (see below).
# Why? Because some words included in the Dfm were not included in pre_trained
nrow(glove_dataframe)
pre_trained[1:20, 1:11]
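# A quick optional check (an addition to the script): which Dfm features are missing from the pre-trained vocabulary?
# These are the words responsible for the difference in size noted above.
missing_words <- setdiff(featnames(Dfm), pre_trained$word)
length(missing_words)
head(missing_words, 20)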
# Once again, let's estimate for each document in the Dfm the average of its words' vectors
# in each of the 100 dimensions of the pre-trained WE
embed2 <- matrix(NA, nrow = ndoc(Dfm), ncol = 100)
for (i in 1:ndoc(Dfm)){
  if (i %% 100 == 0) message(i, '/', ndoc(Dfm))
  # extract word counts
  vec <- as.numeric(Dfm[i, ])
  # keep words with counts of 1 or more
  doc_words <- featnames(Dfm)[vec > 0]
  # extract embeddings for those words
  embed_vec2 <- pre_trained[pre_trained$word %in% doc_words, 2:101]
  # aggregate from word- to document-level embeddings by taking the average
  embed2[i, ] <- colMeans(embed_vec2, na.rm = TRUE)
  # if no words are in the embeddings, simply set the document vector to 0
  if (nrow(embed_vec2) == 0) embed2[i, ] <- 0
}
str(embed2) # 1500 documents, with 100 columns (1 for each dimension)

# Let's compute a new RF model
set.seed(123)
system.time(RF2 <- randomForest(x = embed2[training, ], y = as.factor(tot$Sentiment[training]),
                                do.trace = TRUE, ntree = 500))

# computing predicted values
set.seed(123)
predicted_RF2 <- predict(RF2, embed2[test, ])
table(predicted_RF2)
prop.table(table(predicted_RF2))

confusion <- confusionMatrix(predicted_RF2, as.factor(tot$Sentiment[test]), mode = "everything")
accuracyRF_newWE <- confusion$overall[1]
F1_0 <- (2*confusion$byClass[1]*confusion$byClass[3])/(confusion$byClass[1]+confusion$byClass[3]) # F1 value for the negative category
F1_1 <- (2*confusion$byClass[2]*confusion$byClass[4])/(confusion$byClass[2]+confusion$byClass[4]) # F1 value for the positive category
F1_RF_newWE <- (F1_0+F1_1)/2

# in this case, slightly better results using the local WE
accuracyRF_newWE
F1_RF_newWE
accuracyRF
F1_RF

#############################################
#############################################
# A RF with WE features + features from the BoW approach
#############################################
#############################################

# We can also combine both bag-of-words and WE features into a single matrix,
# and use a ML algorithm to choose the best set of features for us.
# This combination of features and classifier could help to reach the best performance.
# Let's combine our Dfm with the document-embedding matrix we estimated from our local GloVe results (embed)
X <- as.matrix(cbind(Dfm, embed))
length(Dfm@Dimnames$features) # 12,261 features in our Dfm
str(X) # 12,261 features in the Dfm + 100 dimensions from the WE = 12,361 features

set.seed(123)
system.time(RF_ALL <- randomForest(x = X[training, ], y = as.factor(tot$Sentiment[training]),
                                   importance = TRUE, do.trace = TRUE, ntree = 500))
set.seed(123)
predicted_RF_ALL <- predict(RF_ALL, X[test, ])
confusion <- confusionMatrix(predicted_RF_ALL, as.factor(tot$Sentiment[test]), mode = "everything")
accuracyRF_ALL <- confusion$overall[1]
F1_0 <- (2*confusion$byClass[1]*confusion$byClass[3])/(confusion$byClass[1]+confusion$byClass[3]) # F1 value for category 0
F1_1 <- (2*confusion$byClass[2]*confusion$byClass[4])/(confusion$byClass[2]+confusion$byClass[4]) # F1 value for category 1
F1_RF_ALL <- (F1_0+F1_1)/2
accuracyRF_ALL
F1_RF_ALL

# WE+BoW better than WE alone
accuracyRF
accuracyRF_ALL
F1_RF
F1_RF_ALL

# and as you can see, several of the WE dimensions are very important for model estimation
varImpPlot(RF_ALL)
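# Optional follow-up (an addition to the script): inspect the numeric scores behind varImpPlot(). importance() is
# part of the randomForest package; sorting by MeanDecreaseAccuracy shows which features (bag-of-words terms or
# WE dimensions) the forest relies on most.
imp <- importance(RF_ALL)
head(imp[order(imp[, "MeanDecreaseAccuracy"], decreasing = TRUE), ], 20)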