# ---------------------------------------------------------------------------
# Semi-supervised topic classification with seed-word dictionaries
# (newsmap + quanteda) -- toy example with six short texts.
# ---------------------------------------------------------------------------
# NOTE(review): the original script opened with rm(list = ls(all = TRUE)) and
# a hard-coded setwd("C:/Users/.../TOPIC MODEL/"). Both are discouraged in
# scripts (they wipe the user's session and break on any other machine), and
# the working directory is never used below -- nothing is read from or written
# to disk -- so they have been removed.

library(newsmap)
library(quanteda)

# Step 1: build a small corpus of six texts
text_en <- c(
  text1 = "This is an article about Italy",
  text2 = "Milan is a beautiful Italian city",
  text3 = "I miss the sea of Spain",
  text4 = "Granada is an ancient Spanish city",
  text5 = "We liked more food in Siena than in Milan",
  text6 = "In Granada you can feel history all around you"
)
corp <- corpus(text_en)
feat_dfm_en <- dfm(corp, tolower = TRUE)

# Step 2: apply a dictionary of seed words.
# Here we use newsmap's built-in geographic seed dictionary (country keywords).
data_dictionary_newsmap_en
names(data_dictionary_newsmap_en)
names(data_dictionary_newsmap_en[["EUROPE"]])
names(data_dictionary_newsmap_en[["EUROPE"]][["SOUTH"]])
print(data_dictionary_newsmap_en[["EUROPE"]][["SOUTH"]], max_nkey = 15)

label <- dfm(corp, dictionary = data_dictionary_newsmap_en)
label[1:6, 190:205]
# i.e., the keywords of the topic "IT" of our dictionary appear twice in the
# corpus, the same for the topic "ES", while 2 texts contain no seed words.

# Step 3: train the model
model_en <- textmodel_newsmap(feat_dfm_en, label)
# Now all the words of our texts -- including those NOT in the seed
# dictionary -- get a score. Look at the scores for "milan" and "granada"
# (neither was in the seed dictionary!)
model_en$model

# Predict the label. Note that text5 and text6 (i.e., those texts that did
# not include any seed word originally) are also predicted correctly.
predict(model_en)

# Step 4: apply the trained model to completely new texts
text_en2 <- c(text5 = "Make India great again", text6 = "I love the sea!")
toks_en2 <- tokens(text_en2)      # renamed from "corp2": this is a tokens object
feat_dfm_en2 <- dfm(toks_en2, tolower = TRUE)

# Can you explain why you get such results?
predict(model_en, newdata = feat_dfm_en2)
# text5 cannot be classified because none of its words appeared in the texts
# considered during the training stage.
# On the other side, text6 has been classified as "Spain" thanks to the
# word "sea".
model_en$model

########################################
### Another example with movie reviews
########################################

# Step 1: load the texts
data("data_corpus_moviereviews", package = "quanteda.textmodels")
corp <- tail(data_corpus_moviereviews, 500)  # keep the LAST 500 texts
# NOTE(review): the original comment said "first 500 texts", but tail()
# returns the last 500 documents of the corpus.
texts(corp)[2]
ndoc(corp)

# BUG FIX: the quanteda argument is remove_numbers (plural). The original
# passed remove_number = TRUE; arguments forwarded through `...` are matched
# exactly (no partial matching), so it was silently ignored and numbers were
# never removed.
dfmt <- dfm(corp, remove_numbers = TRUE, tolower = TRUE)

# min_nchar: minimum token length (in characters) for tokens to be KEPT;
# shorter tokens are removed along with the stopwords.
dfmt <- dfm_remove(dfmt, stopwords("en"), min_nchar = 2)
dfmt <- dfm_trim(dfmt,
                 min_termfreq = 0.90, termfreq_type = "quantile",
                 max_docfreq = 0.1, docfreq_type = "prop")
topfeatures(dfmt, 50)

# Step 2: apply a hand-made dictionary of seed words (my dictionary option)
dict <- dictionary(list(
  people  = c("family", "couple", "kids", "child", "parents"),
  space   = c("alien", "planet", "space"),
  monster = c("monster*", "ghost*", "zombie*", "scream"),
  war     = c("war*", "soldier*", "tanks"),
  crime   = c("crime*", "murder", "killer", "police")
))
dict

# Same remove_numbers fix applies here.
label <- dfm(corp, remove_numbers = TRUE, tolower = TRUE, dictionary = dict)
label[1:2, 1:5]
# i.e., the keywords of the topic "crime" of our dictionary appear 3 times
# in the second review, etc.

# Step 3: train the model
model_en <- textmodel_newsmap(dfmt, label)
# Now all the words of our texts, including those not included in the
# seed words, get a value!
model_en$model

# Predict the label for the training documents
predict(model_en)
prop.table(table(predict(model_en)))

# Save the distribution of predicted labels
newsmap_pred <- prop.table(table(predict(model_en)))
newsmap_pred

# Step 4: apply the trained model to completely new texts
corp2 <- tail(data_corpus_moviereviews, 250)  # keep the last 250 texts
ndoc(corp2)

# BUG FIX: remove_numbers (plural), not remove_number -- arguments forwarded
# through `...` are matched exactly, so the original misspelling was silently
# ignored and numbers were never removed.
dfmt2 <- dfm(corp2, remove_numbers = TRUE, tolower = TRUE)
dfmt2 <- dfm_remove(dfmt2, stopwords("en"), min_nchar = 2)
dfmt2 <- dfm_trim(dfmt2,
                  min_termfreq = 0.90, termfreq_type = "quantile",
                  max_docfreq = 0.1, docfreq_type = "prop")

predict(model_en, newdata = dfmt2)
prop.table(table(predict(model_en, newdata = dfmt2)))
prop.table(table(predict(model_en)))