rm(list = ls(all = TRUE))
getwd()
setwd("C:/Users/luigi/Dropbox/TOPIC MODEL/")
getwd()

library(newsmap)
library(quanteda)

# Step 1: let's begin with 6 texts
text_en <- c(text1 = "This is an article about Italy",
             text2 = "Milan is a beautiful Italian city",
             text3 = "I miss the sea of Spain",
             text4 = "Granada is an ancient Spanish city",
             text5 = "We liked food more in Siena than in Milan",
             text6 = "In Granada you can feel history all around you")

corp <- corpus(text_en)
tok <- tokens(corp)
feat_dfm_en <- dfm(tok)
feat_dfm_en

# Step 2: let's apply a dictionary of seed words
# we use the list of seed words about countries that ships with newsmap - including word stems/wildcards!
data_dictionary_newsmap_en
names(data_dictionary_newsmap_en)
names(data_dictionary_newsmap_en[["EUROPE"]])
names(data_dictionary_newsmap_en[["EUROPE"]][["SOUTH"]])
print(data_dictionary_newsmap_en[["EUROPE"]][["SOUTH"]], max_nkey = 15)

label <- dfm_lookup(feat_dfm_en, dictionary = data_dictionary_newsmap_en)
label[1:6, 190:205]
# i.e., the keywords of the topic "IT" of our dictionary appear twice in the corpus, and the same holds
# for the topic "ES", while two texts (text5 and text6) contain no seed words at all

# Step 3: let's train the model
model_en <- textmodel_newsmap(feat_dfm_en, label)
# now all the words of our texts, including those not listed among the seed words, get a score!
# take a look at the scores for "milan" and "granada" (not originally included in the seed dictionary!)
model_en$model

# let's predict the labels - note that we also predicted (correctly) both text5 and text6,
# i.e., the two texts that contained no seed words at all. How was that possible?
predict(model_en)

# Step 4: we can now apply the trained model to other, new texts
text_en2 <- c(text7 = "Make India great again",
              text8 = "I love the sea!")
tok2 <- tokens(text_en2)
feat_dfm_en2 <- dfm(tok2, tolower = TRUE)

# can you explain why you get such results?
predict(model_en, newdata = feat_dfm_en2)
# we cannot classify text7 because none of its words appeared in the texts used during the
# training stage; text8, on the other hand, is classified as "ES" (Spain) thanks to the word
# "sea", which co-occurred with Spain's seed words in the training texts
model_en$model

########################################
### another example with movie reviews
########################################

# Step 1: let's begin with the texts
data("data_corpus_moviereviews", package = "quanteda.textmodels")
corp <- tail(data_corpus_moviereviews, 500) # keeping the last 500 texts
as.character(corp)[2]
ndoc(corp)

tok <- tokens(corp, remove_numbers = TRUE)
tok <- tokens_remove(tok, stopwords("en"))
dfmt <- dfm(tok)
# min_nchar: keep only features that are at least 2 characters long (shorter ones are dropped)
dfmt <- dfm_remove(dfmt, min_nchar = 2)
dfmt <- dfm_trim(dfmt, min_termfreq = 0.90, termfreq_type = "quantile",
                 max_docfreq = 0.1, docfreq_type = "prop")
topfeatures(dfmt, 50)

# Step 2: let's apply a dictionary of seed words - remember my dictionary option
dict <- dictionary(list(people = c("family", "couple", "kid*", "child*", "parents", "husband", "wife"),
                        space = c("alien*", "planet", "space"),
                        monster = c("monster*", "ghost*", "zombie*", "fear"),
                        war = c("war", "soldier*", "tanks", "military"),
                        crime = c("crime*", "murder*", "killer*", "police")))
dict

# dfm_lookup
label <- dfm_lookup(dfmt, dictionary = dict)
label[2:3, 1:5]
# i.e., the keywords of the topic "crime" of our dictionary appear twice in the second review, etc.
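# Before training, a quick sanity check can help (a minimal sketch added here, not part of the
# original walk-through; it only uses rowSums(), which works directly on the dfm returned by
# dfm_lookup()): newsmap learns from the documents that match at least one seed word, so it is
# worth knowing how many reviews are actually "labelled" at this stage
seed_hits <- rowSums(label)        # total number of seed-word matches per review
table(seed_hits > 0)               # reviews with vs. without at least one seed word
prop.table(table(seed_hits > 0))   # the same, as shares of the 500 reviews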
# Step 3: let's train the model
model_en <- textmodel_newsmap(dfmt, label)
# now all the words of our texts, including those not listed among the seed words, get a score!
# take a look at words such as "dating" and "tarantino"
str(model_en$model)
model_en$model[c("people", "space", "monster", "war", "crime"), c("dating", "tarantino")]

# let's see the top ten features associated with each of the 5 topics we defined above
for (i in 1:5) {
  print(dict[i])
  print(sort(model_en$model[i, ], decreasing = TRUE)[1:10])
}

# let's predict the labels
predict(model_en)
prop.table(table(predict(model_en)))

# let's plot it
newsmap_pred <- as.data.frame(prop.table(table(predict(model_en))))
str(newsmap_pred)
names(newsmap_pred)[1] <- "Topics"
str(newsmap_pred)

library(ggplot2)
ggplot(newsmap_pred, aes(x = reorder(Topics, -Freq), y = Freq)) +
  geom_bar(stat = "identity") +
  xlab("Topics in the training-set (via Newsmap)")

# let's save the predicted topic distribution
newsmap_pred <- prop.table(table(predict(model_en)))
newsmap_pred

# Step 4: let's apply the trained model to another set of texts
# (note: these 250 reviews are the last half of the 500 we trained on, so they are not strictly "new")
corp2 <- tail(data_corpus_moviereviews, 250) # keeping the last 250 texts
ndoc(corp2)
tok2 <- tokens(corp2, remove_numbers = TRUE)
tok2 <- tokens_remove(tok2, stopwords("en"))
dfmt2 <- dfm(tok2)
# min_nchar: keep only features that are at least 2 characters long (shorter ones are dropped)
dfmt2 <- dfm_remove(dfmt2, min_nchar = 2)
dfmt2 <- dfm_trim(dfmt2, min_termfreq = 0.90, termfreq_type = "quantile",
                  max_docfreq = 0.1, docfreq_type = "prop")

predict(model_en, newdata = dfmt2)
# compare the topic distribution in the training set with the one predicted for the new texts
prop.table(table(predict(model_en)))
prop.table(table(predict(model_en, newdata = dfmt2)))
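# A hedged follow-up sketch (added here; it assumes only quanteda's featnames() and dfm_match(),
# both part of the package's public API): the trained model can only score features it saw during
# the training stage, so checking the vocabulary overlap between the new dfm and the training dfm
# helps explain documents that come back unclassified or only weakly classified
mean(featnames(dfmt2) %in% featnames(dfmt))   # share of the new vocabulary known to the model
# dfm_match() aligns the new dfm to the training features, making that overlap explicit
dfmt2_matched <- dfm_match(dfmt2, features = featnames(dfmt))
dim(dfmt2_matched)                            # same documents, but only the training-stage features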