# ---------------------------------------------------------------------------
# Semi-supervised topic classification with seed-word dictionaries
# (newsmap + quanteda) -- toy example with six short texts.
# ---------------------------------------------------------------------------
# NOTE(review): the original script opened with rm(list = ls(all = TRUE)) and
# a hard-coded setwd("C:/Users/.../TOPIC MODEL/"). Both are discouraged in
# scripts (they wipe the user's session and break on any other machine), and
# the working directory is never used below -- nothing is read from or written
# to disk -- so they have been removed.

library(newsmap)
library(quanteda)

# Step 1: build a small corpus of six texts
text_en <- c(
  text1 = "This is an article about Italy",
  text2 = "Milan is a beautiful Italian city",
  text3 = "I miss the sea of Spain",
  text4 = "Granada is an ancient Spanish city",
  text5 = "We liked more food in Siena than in Milan",
  text6 = "In Granada you can feel history all around you"
)
corp <- corpus(text_en)
feat_dfm_en <- dfm(corp, tolower = TRUE)

# Step 2: apply a dictionary of seed words.
# Here we use newsmap's built-in geographic seed dictionary (country keywords).
data_dictionary_newsmap_en
names(data_dictionary_newsmap_en)
names(data_dictionary_newsmap_en[["EUROPE"]])
names(data_dictionary_newsmap_en[["EUROPE"]][["SOUTH"]])
print(data_dictionary_newsmap_en[["EUROPE"]][["SOUTH"]], max_nkey = 15)

label <- dfm(corp, dictionary = data_dictionary_newsmap_en)
label[1:6, 190:205]
# i.e., the keywords of the topic "IT" of our dictionary appear twice in the
# corpus, the same for the topic "ES", while 2 texts contain no seed words.

# Step 3: train the model
model_en <- textmodel_newsmap(feat_dfm_en, label)
# Now all the words of our texts -- including those NOT in the seed
# dictionary -- get a score. Look at the scores for "milan" and "granada"
# (neither was in the seed dictionary!)
model_en$model

# Predict the label. Note that text5 and text6 (i.e., those texts that did
# not include any seed word originally) are also predicted correctly.
predict(model_en)

# Step 4: apply the trained model to completely new texts
text_en2 <- c(text5 = "Make India great again", text6 = "I love the sea!")
toks_en2 <- tokens(text_en2)      # renamed from "corp2": this is a tokens object
feat_dfm_en2 <- dfm(toks_en2, tolower = TRUE)

# Can you explain why you get such results?
predict(model_en, newdata = feat_dfm_en2)
# text5 cannot be classified because none of its words appeared in the texts
# considered during the training stage.
# On the other side, text6 has been classified as "Spain" thanks to the
# word "sea".
model_en$model

########################################
### Another example with movie reviews
########################################

# Step 1: load the texts
data("data_corpus_moviereviews", package = "quanteda.textmodels")
corp <- tail(data_corpus_moviereviews, 500)  # keep the LAST 500 texts
# NOTE(review): the original comment said "first 500 texts", but tail()
# returns the last 500 documents of the corpus.
texts(corp)[2]
ndoc(corp)

# BUG FIX: the quanteda argument is remove_numbers (plural). The original
# passed remove_number = TRUE; arguments forwarded through `...` are matched
# exactly (no partial matching), so it was silently ignored and numbers were
# never removed.
dfmt <- dfm(corp, remove_numbers = TRUE, tolower = TRUE)

# min_nchar: minimum token length (in characters) for tokens to be KEPT;
# shorter tokens are removed along with the stopwords.
dfmt <- dfm_remove(dfmt, stopwords("en"), min_nchar = 2)
dfmt <- dfm_trim(dfmt,
                 min_termfreq = 0.90, termfreq_type = "quantile",
                 max_docfreq = 0.1, docfreq_type = "prop")
topfeatures(dfmt, 50)

# Step 2: apply a hand-made dictionary of seed words (my dictionary option)
dict <- dictionary(list(
  people  = c("family", "couple", "kids", "child", "parents"),
  space   = c("alien", "planet", "space"),
  monster = c("monster*", "ghost*", "zombie*", "scream"),
  war     = c("war*", "soldier*", "tanks"),
  crime   = c("crime*", "murder", "killer", "police")
))
dict

# Same remove_numbers fix applies here.
label <- dfm(corp, remove_numbers = TRUE, tolower = TRUE, dictionary = dict)
label[1:2, 1:5]
# i.e., the keywords of the topic "crime" of our dictionary appear 3 times
# in the second review, etc.

# Step 3: train the model
model_en <- textmodel_newsmap(dfmt, label)
# Now all the words of our texts, including those not included in the
# seed words, get a value!
model_en$model

# Predict the label for the training documents
predict(model_en)
prop.table(table(predict(model_en)))

# Save the distribution of predicted labels
newsmap_pred <- prop.table(table(predict(model_en)))
newsmap_pred

# Step 4: apply the trained model to completely new texts
corp2 <- tail(data_corpus_moviereviews, 250)  # keep the last 250 texts
ndoc(corp2)

# BUG FIX: remove_numbers (plural), not remove_number -- arguments forwarded
# through `...` are matched exactly, so the original misspelling was silently
# ignored and numbers were never removed.
dfmt2 <- dfm(corp2, remove_numbers = TRUE, tolower = TRUE)
dfmt2 <- dfm_remove(dfmt2, stopwords("en"), min_nchar = 2)
dfmt2 <- dfm_trim(dfmt2,
                  min_termfreq = 0.90, termfreq_type = "quantile",
                  max_docfreq = 0.1, docfreq_type = "prop")

predict(model_en, newdata = dfmt2)
prop.table(table(predict(model_en, newdata = dfmt2)))
prop.table(table(predict(model_en)))