rm(list=ls(all=TRUE))

# set your working directory (i.e., where you have saved the datasets, etc.)
setwd("C:/Users/luigi/Dropbox/TOPIC MODEL")
getwd()

library(readtext)
library(quanteda)
library(ggplot2)
library(quanteda.textstats)
library(quanteda.textplots)
library(SnowballC)
library(corrplot)

#########################################################################
# Creating and Working with a Corpus
#########################################################################

# There are several different ways to create a corpus in Quanteda. Let's look at two of them.

#########################################################################
# FIRST: you already have a matrix-like file with one text per row (such as .csv or .xls)
#########################################################################

# This dataset is a sample of 100 tweets from the Boston area discussing food.
# Data have been collected through the Twitter API, also specifying the language and origin of the tweets.
# We will discuss how to retrieve this type of data in the next weeks
x <- read.csv("boston.csv", stringsAsFactors=FALSE)
str(x)

# You read the dataset via readtext by identifying the name of the column that contains the texts (in this case "text")
myText2 <- readtext("boston.csv", text_field = "text")
str(myText2)

# You create your corpus
myCorpus2 <- corpus(myText2)
# Jargon: types = number of unique terms; tokens = number of words
head(summary(myCorpus2))

# number of documents in the corpus
ndoc(myCorpus2)

# Show the first text
as.character(myCorpus2)[1]
strwrap(as.character(myCorpus2)[1])
# Show the first 3 texts
as.character(myCorpus2)[1:3]
strwrap(as.character(myCorpus2)[1:3])

# Let's move from the corpus to the document-feature (term) matrix!
# In quanteda, we first tokenize the texts via tokens(), then we use the dfm() function to produce such a matrix,
# where documents are in rows and "features" (aka: words) are in columns
tokens(myCorpus2)
strwrap(as.character(myCorpus2)[3])
tok2 <- tokens(myCorpus2)
str(tok2)

# when you create a dfm, by default "tolower=TRUE", i.e., all features are converted to lowercase
myDfm <- dfm(tok2)
# Let's see the first five documents and the first 10 words of our dfm
myDfm[1:5, 1:10]
str(myDfm)
# Let's see the texts of the first two documents
as.character(myCorpus2)[1:2]

# 20 top features in the dfm
topfeatures(myDfm, 20)

# let's clean the dfm!
# FIRST: let's remove punctuation, numbers, symbols, separators, etc.
tok2_clean <- tokens(myCorpus2, remove_punct = TRUE, remove_numbers = TRUE,
                     remove_symbols = TRUE, split_hyphens = TRUE, remove_separators = TRUE)

# SECOND: let's remove the stopwords
head(stopwords("english"), 20)
head(stopwords("russian"), 10)
head(stopwords("italian"), 10)

# the source "marimo" is a better option for some languages (in particular Asian languages: Japanese, Chinese, Korean;
# but also Arabic and Hebrew). See: https://github.com/koheiw/marimo
stopwords("en")
stopwords("en", source = "marimo")

# The stopword options available in Quanteda (based on the Snowball stopwords list: see http://snowball.tartarus.org/)
# work for all the main European languages (ftp://cran.r-project.org/pub/R/web/packages/stopwords/stopwords.pdf).
# Similarly, the Snowball stemmer used by tokens_wordstem() covers the main European languages:
getStemLanguages()
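# As a quick sketch of the previous point: the same Snowball stemmer can be applied to any of the
# languages listed by getStemLanguages(). The Italian and English words below are just an illustration.
char_wordstem(c("andare", "andiamo", "andremo"), language = "italian")
char_wordstem(c("going", "goes", "gone"), language = "english")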
# For other languages, things are a bit more complex. For Arabic, for example, a good option is the stemming package 'arabicStemR'
tok2_clean_stop <- tokens_remove(tok2_clean, stopwords("english"))

# THIRD: let's stem the words
tok2_clean_stop_stem <- tokens_wordstem(tok2_clean_stop, language = "english")

# Let's now re-create the dfm
myDfm2 <- dfm(tok2_clean_stop_stem)
topfeatures(myDfm, 50)
topfeatures(myDfm2, 20)
# the features "for", "with" and "the" have disappeared! And #smoked is now #smoke
# still some symbols to remove! For example the tokens starting with "00" (they are Unicode characters, e.g. "\U00BD")
tok2_clean_stop_stem2 <- tokens_remove(tok2_clean_stop_stem, c("rt", "00*", "ed", "u", "lyft"))
myDfm2 <- dfm(tok2_clean_stop_stem2)
topfeatures(myDfm2, 20) # 20 top features [better!]

# We can also create a dfm keeping only some specific features, for example only the hashtags
# in the tweets, by using dfm_select() with pattern = "#*"
dfm_hashtag <- dfm_select(myDfm2, pattern = c("#*"))
topfeatures(myDfm2, 20)      # 20 top features
topfeatures(dfm_hashtag, 20) # 20 top features

# Note that you can also remove stopwords with dfm_remove, after you have created a dfm
dfm_remove(myDfm, pattern = stopwords("en"))

# You can also decide to exclude some features. For example, let's exclude all the hashtags
dfm_NOhashtag <- dfm_remove(myDfm2, pattern = c("#*"))
topfeatures(myDfm2, 20)        # 20 top features
topfeatures(dfm_hashtag, 20)   # 20 top features
topfeatures(dfm_NOhashtag, 20) # 20 top features

# trimming the dfm
myDfm[1:10, 1:10]
# keep only words occurring >= 10 times and in >= 2 documents
dfm_trim(myDfm, min_termfreq = 10, min_docfreq = 2)
# keep only words occurring >= 20 times and in at least 40% of the documents
dfm_trim(myDfm, min_termfreq = 20, min_docfreq = 0.4)
# keep only words occurring <= 10 times and in <= 2 documents
dfm_trim(myDfm, max_termfreq = 10, max_docfreq = 2)
# keep only words occurring 5 times in 1000, and in at least 2 of 5 (40%) of the documents
dfm_trim(myDfm, min_docfreq = 0.4, min_termfreq = 0.005, termfreq_type = "prop")
# keep only words occurring in all the 100 documents of my corpus
dfm_trim(myDfm, min_termfreq = 1, min_docfreq = 100)

# weighting a dfm according to the relative term frequency, i.e.,
# normalizing the dfm by considering the proportions of the feature counts within each document
myDfm_weight <- dfm_weight(myDfm, scheme = "prop")
# compare the two matrices below (the first one: unweighted; the second one: weighted)
myDfm[1:5, 1:5]
myDfm_weight[1:5, 1:5]

# weighting a dfm by tf-idf
# remember: tf-idf adds a weight that approaches zero as the number of documents in which a term appears
# (in any frequency) approaches the number of documents in the collection. And indeed here #dinner is close to 0!
myDfm_tf <- dfm_tfidf(myDfm)
myDfm[1:5, 1:5]
myDfm_tf[1:5, 1:5]
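# A quick manual check of the tf-idf weighting (just a sketch: by default dfm_tfidf() uses raw counts for tf
# and log10(number of documents / document frequency) for idf; see ?dfm_tfidf). The feature is picked
# automatically here rather than hard-coded.
feat <- names(topfeatures(myDfm, 1))                          # the most frequent feature in the dfm
docfreq(myDfm)[feat]                                          # in how many documents it appears
myDfm[1:5, feat] * log10(ndoc(myDfm) / docfreq(myDfm)[feat])  # should match the weighted values below
myDfm_tf[1:5, feat]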
#########################################################################
# SECOND: you have saved in a directory a set of files (one for each document) in a given format (.txt, .doc, .pdf)
#########################################################################

# SOURCE: http://www.presidency.ucsb.edu/inaugurals.php
# It is always better to save txt files in the UTF-8 format. If you have saved them in some other format (ISO, etc.),
# it's a good idea to specify it via the encoding argument
# Note that readtext also reads other file formats: check "?readtext"

# our txt files are included in the folder called "Inaugural Speeches" located in our working directory
myText <- readtext("Inaugural Speeches/*.txt", encoding = "UTF-8")
str(myText)

# we can actually extract three pieces of information from each file name (Name, Surname, Year: e.g., "George_Washington_1789.txt")
# Note: to use the docvarsfrom = "filenames" option, the file names should be consistent, i.e., in the example below ALL the txt files
# should follow the same ordering (Name, Surname, Year) with the same separator (i.e., "_")
myText <- readtext("Inaugural Speeches/*.txt",
                   docvarsfrom = "filenames", dvsep = "_",
                   docvarnames = c("Name", "Surname", "Year"),
                   encoding = "UTF-8")
str(myText)

testCorpus <- corpus(myText)
summary(testCorpus)
# inspect the document-level variables (this is a very important option, as we will see later on...)
head(docvars(testCorpus))

tok4 <- tokens(testCorpus, remove_punct = TRUE, remove_numbers = TRUE,
               remove_symbols = TRUE, split_hyphens = TRUE, remove_separators = TRUE)
tok4 <- tokens_remove(tok4, stopwords("en"))
tok4 <- tokens_wordstem(tok4)
myDfm <- dfm(tok4)
topfeatures(myDfm, 20) # 20 top words

#########################################################################
# Playing with the corpus
#########################################################################

# let's load a corpus already included in Quanteda: the corpus of all the US Presidents' Inaugural Speeches
# To summarize the texts from a corpus, we can call the summary() method defined for a corpus.
summary(data_corpus_inaugural)
# inspect the document-level variables
head(docvars(data_corpus_inaugural))

as.character(data_corpus_inaugural)[1]
as.character(data_corpus_inaugural)[2]

# to see the entire Trump speech
trump <- corpus_subset(data_corpus_inaugural, President == "Trump")
summary(trump)
strwrap(as.character(trump[[1]]))

# Adding two corpora together
# First five inaugural speeches
mycorpus1 <- corpus(data_corpus_inaugural[1:5])
# Six recent inaugural speeches
mycorpus2 <- corpus(data_corpus_inaugural[53:58])
mycorpus3 <- mycorpus1 + mycorpus2
summary(mycorpus3)

# subsetting a corpus
summary(corpus_subset(data_corpus_inaugural, Year > 1990))
summary(corpus_subset(data_corpus_inaugural, President == "Adams"))
summary(corpus_subset(data_corpus_inaugural, Year > 1990 & Party == "Republican"))
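# Document-level variables can also be created or modified directly with docvars()<- and then used for
# subsetting or (later on) grouping. A small sketch, relying only on the "Year" docvar seen above;
# the variable name "Period" is just an illustrative choice.
inaug <- data_corpus_inaugural
docvars(inaug, "Period") <- ifelse(docvars(inaug, "Year") < 1900, "pre-1900", "post-1900")
head(docvars(inaug))
summary(corpus_subset(inaug, Period == "post-1900"), 5)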
#########################################################################
# Let's explore some statistical summary methods
#########################################################################

# Statistical summary methods are essentially quantitative summaries of texts that describe their characteristics
# on some indicator, and they may (or may not) use statistical methods based on sampling theory for comparison

#########################################################################
# Statistical summaries (1): Lexical dispersion plot (Positional Analysis, i.e., analysis that retains the original text sequence)
#########################################################################

# The kwic function (keywords-in-context) performs a search for a word in a corpus and allows us to view the contexts in which it occurs
# NOTE: by working on a tokens object, we retain the original text sequence!
options(width = 200)
kwic(tokens(data_corpus_inaugural), "terror")
# also the words starting with "terror", including "terrorism"
kwic(tokens(data_corpus_inaugural), "terror*")
# same result as above
kwic(tokens(data_corpus_inaugural), "terror", valuetype = "regex")
kwic(tokens(data_corpus_inaugural), "communis*")
# Note that by default kwic() is word-based. If you would like to look up a multiword combination, use phrase()
kwic(tokens(data_corpus_inaugural), phrase("by terror"))

# We can plot a kwic object via a lexical dispersion plot.
# A lexical dispersion plot allows you to detect both the relative frequency of a given word across documents
# and the "timing" (position) of that word within each text
textplot_xray(kwic(tokens(data_corpus_inaugural[40:59]), "american"))

textplot_xray(
  kwic(tokens(data_corpus_inaugural[40:59]), "american"),
  kwic(tokens(data_corpus_inaugural[40:59]), "people"),
  kwic(tokens(data_corpus_inaugural[40:59]), "communis*")
)

# If you're only plotting a single document, but with multiple keywords, then the keywords are displayed
# one below the other rather than side-by-side.
textplot_xray(
  kwic(tokens(corpus_subset(data_corpus_inaugural, Year > 2015 & Year < 2020)), "america"),
  kwic(tokens(corpus_subset(data_corpus_inaugural, Year > 2015 & Year < 2020)), "people"),
  kwic(tokens(corpus_subset(data_corpus_inaugural, Year > 2015 & Year < 2020)), "chief")
)

# You might also have noticed that the x-axis scale is the absolute token index for single texts
# and the relative token index when multiple texts are being compared.
# If you prefer, you can specify that you want an absolute scale
textplot_xray(
  kwic(tokens(data_corpus_inaugural[40:59]), "american"),
  kwic(tokens(data_corpus_inaugural[40:59]), "people"),
  kwic(tokens(data_corpus_inaugural[40:59]), "communis*"),
  scale = "absolute"
)

# The object returned is a ggplot object, which can be modified using ggplot2
plot <- textplot_xray(
  kwic(tokens(data_corpus_inaugural[40:59]), "american"),
  kwic(tokens(data_corpus_inaugural[40:59]), "people"),
  kwic(tokens(data_corpus_inaugural[40:59]), "communist"))
plot + aes(color = keyword) + scale_color_manual(values = c("red", "blue", "green"))
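# Since textplot_xray() returns a regular ggplot object, it can also be themed further and saved to disk
# like any other ggplot (a small sketch; the file name and size below are arbitrary choices)
plot2 <- plot + aes(color = keyword) +
  scale_color_manual(values = c("red", "blue", "green")) +
  theme_minimal()
ggsave("xray_plot.pdf", plot2, width = 10, height = 6)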
#########################################################################
# Statistical summaries (2): Plotting wordclouds (Non-positional Analysis, i.e., analysis that does NOT retain the original text sequence - bag of words)
#########################################################################

# One of the simplest statistical summary methods you can apply to a dfm is a tag cloud.
# A tag cloud is a visual representation of text data, in which tags are single words whose frequency is shown with different font sizes (and/or colors)
myCorpus <- corpus_subset(data_corpus_inaugural, Year > 1990)
summary(myCorpus)
tok2 <- tokens(myCorpus, remove_punct = TRUE, remove_numbers = TRUE,
               remove_symbols = TRUE, split_hyphens = TRUE, remove_separators = TRUE)
tok2 <- tokens_remove(tok2, stopwords("en"))
tok2 <- tokens_wordstem(tok2)
myDfm <- dfm(tok2)

# if you define a seed, you always get the same plot
set.seed(123)
textplot_wordcloud(myDfm, min_count = 6, rotation = .25,
                   color = RColorBrewer::brewer.pal(8, "Dark2"))
textplot_wordcloud(myDfm, min_count = 10,
                   color = c("red", "pink", "green", "purple", "orange", "blue"))

# You can also plot a "comparison cloud", but this can only be done with fewer than eight documents:
corp2 <- corpus_subset(data_corpus_inaugural,
                       President %in% c("Washington", "Jefferson", "Madison"))
summary(corp2)
tok2 <- tokens(corp2, remove_punct = TRUE, remove_numbers = TRUE,
               remove_symbols = TRUE, split_hyphens = TRUE, remove_separators = TRUE)
tok2 <- tokens_remove(tok2, stopwords("en"))
tok2 <- tokens_wordstem(tok2)
myDfm <- dfm(tok2)

# let's group the speeches made by the same President (such as the two speeches made by Washington) into one single
# document of the dfm; the command we can use is dfm_group
myDfm2 <- dfm_group(myDfm, groups = President)
str(myDfm)
ndoc(myDfm)  # 6 documents
str(myDfm2)
ndoc(myDfm2) # 3 documents!

set.seed(123)
textplot_wordcloud(myDfm2, comparison = TRUE)
# let's plot the trimmed dfm
set.seed(123)
textplot_wordcloud(dfm_trim(myDfm2, min_termfreq = 5, verbose = FALSE), comparison = TRUE)

# Exercise: let's plot a "comparison cloud" between Biden, Trump and Obama
corp2 <- corpus_subset(data_corpus_inaugural, President %in% c("Biden", "Trump", "Obama"))
tok2 <- tokens(corp2, remove_punct = TRUE, remove_numbers = TRUE,
               remove_symbols = TRUE, split_hyphens = TRUE, remove_separators = TRUE)
tok2 <- tokens_remove(tok2, stopwords("en"))
tok2 <- tokens_wordstem(tok2)
myDfm <- dfm(tok2)
myDfm2 <- dfm_group(myDfm, groups = President)
set.seed(123)
textplot_wordcloud(dfm_trim(myDfm2, min_termfreq = 5, verbose = FALSE), comparison = TRUE)
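# A tabular alternative to the comparison cloud (a sketch): the top stems per president via textstat_frequency(),
# using the document names of the grouped dfm (here, one document per president) as the grouping variable
textstat_frequency(myDfm2, n = 5, groups = docnames(myDfm2))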
#########################################################################
# Statistical summaries (3): Comparing words associated with a target group vs. a reference group (Non-positional Analysis)
#########################################################################

# More sophisticated methods compare the differential occurrences of words across texts or partitions of a corpus, using statistical association measures,
# to identify the words that belong, for example, to different sub-groups of texts, such as those predominantly associated with male- versus female-authored documents.
# In this respect we can employ, for example, a chi-squared test. The chi-squared test is used to determine whether there is a statistically significant difference between
# the expected and the observed frequencies in one or more categories of a contingency table (in our case,
# the frequencies of words in two different sets of texts). At the 95% confidence level, the chi-squared value (with 1 degree of freedom)
# should be larger than 3.84 in absolute value (i.e., 1.96 squared).
# More in detail, if you want to compare the differential associations of keywords in a target and a reference group,
# you can calculate "keyness", which is based on the textstat_keyness command.

# In this example, we compare the inaugural speech by Donald Trump with the speech by Joe Biden
pres_corpus <- corpus_subset(data_corpus_inaugural, President %in% c("Trump", "Biden"))
summary(pres_corpus)

# Create a dfm (one document per speech)
tok2 <- tokens(pres_corpus, remove_punct = TRUE, remove_numbers = TRUE,
               remove_symbols = TRUE, split_hyphens = TRUE, remove_separators = TRUE)
tok2 <- tokens_remove(tok2, stopwords("en"))
docnames(pres_corpus)
pres_dfm <- dfm(tok2)

# Calculate keyness, with Trump as the target group
result_keyness <- textstat_keyness(pres_dfm, target = "2017-Trump")
# a negative value means that Biden (the reference group) uses that word relatively more than Trump (the target group), and vice versa

# Plot estimated word keyness
textplot_keyness(result_keyness)
# Plot without the reference text (in this case Biden)
textplot_keyness(result_keyness, show_reference = FALSE)

head(result_keyness, 10)
tail(result_keyness, 10)

# Your reference group can also include more than one text. For example:
pres_corpus <- corpus_subset(data_corpus_inaugural, President %in% c("Trump", "Biden", "Obama"))
tok2 <- tokens(pres_corpus, remove_punct = TRUE, remove_numbers = TRUE,
               remove_symbols = TRUE, split_hyphens = TRUE, remove_separators = TRUE)
tok2 <- tokens_remove(tok2, stopwords("en"))
pres_dfm <- dfm(tok2)
# You calculate keyness with Trump as the target group vs. the Obama and Biden speeches as the reference group
result_keyness <- textstat_keyness(pres_dfm, target = "2017-Trump")
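# To see what lies behind the chi-squared keyness statistic, here is a manual 2x2 contingency table for a
# single feature (just a sketch: "america" is an illustrative choice, and chisq.test() may differ slightly
# from textstat_keyness(), which applies its own continuity correction by default)
feat <- "america"
target <- "2017-Trump"
is_ref <- docnames(pres_dfm) != target
tab <- matrix(c(sum(pres_dfm[target, feat]), sum(pres_dfm[target, ]) - sum(pres_dfm[target, feat]),
                sum(pres_dfm[is_ref, feat]), sum(pres_dfm[is_ref, ]) - sum(pres_dfm[is_ref, feat])),
              nrow = 2, byrow = TRUE,
              dimnames = list(c("target", "reference"), c("feature", "other features")))
tab
chisq.test(tab)
result_keyness[result_keyness$feature == feat, ]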
#########################################################################
# Statistical summaries (4): Lexical diversity (Non-positional Analysis)
#########################################################################

# Other quantitative summary measures of documents are designed to characterize specific qualities of texts.
# Comparing the rates of types and tokens forms the foundation for measures of lexical diversity (the rate of vocabulary usage), with the most common such measure
# comparing the number of types to the number of tokens (the "type-token ratio").
# For example, it has been argued that populist communication implies a simplified political discourse (lower diversity), in an attempt to reach the public more easily.
# The textstat_lexdiv() command calculates precisely this: lexical diversity in various measures based on the number of unique types of tokens
# and the length of a document. It is useful for analysing speakers' or writers' linguistic skill, or the complexity
# of the ideas expressed in documents.
inaug_tokens <- tokens(data_corpus_inaugural)
tok2 <- tokens_remove(inaug_tokens, stopwords("en"))
inaug_dfm <- dfm(tok2)
lexdiv <- textstat_lexdiv(inaug_dfm)
str(lexdiv)

ggplot(data = lexdiv, aes(x = document, y = TTR, group = 1)) +
  geom_line() +
  geom_point() +
  theme_minimal() +
  scale_x_discrete(breaks = c("1789-Washington", "1933-Roosevelt", "1961-Kennedy", "2017-Trump"))

# a temporal decrease in the level of complexity? let's see
ggplot(data = lexdiv, aes(x = document, y = TTR, group = 1)) +
  geom_line() +
  geom_point() +
  theme_minimal() +
  scale_x_discrete(breaks = c("1789-Washington", "1933-Roosevelt", "1961-Kennedy", "2017-Trump")) +
  geom_smooth(method = "lm")

# TTR is estimated as V/N, where
# V = types (total number of unique terms) and N = tokens (total number of words in the dfm)
head(lexdiv, 5)
tail(lexdiv, 5)

# when you run textstat_lexdiv it automatically removes numbers, punctuation, etc. [without the need for you
# to specify that when you create your dfm: see ?textstat_lexdiv]
# that's why the ratio between Types and Tokens that you get from the corpus summary gives you a different value:
summary(data_corpus_inaugural)
# for Washington 1789: 625/1537
head(lexdiv, 1)

#########################################################################
# Statistical summaries (5): Cosine similarity (Non-positional Analysis)
#########################################################################

# Cosine similarity is an intuitive measure of semantic distance that has been increasingly used in a number of fields of the social sciences.
# By representing texts as vectors in a Cartesian space, cosine similarity estimates the difference between two texts based on their vectors
# of word occurrences. The resulting similarity ranges between 0 and 1, where 0 is reached when two texts are completely
# different and 1 is reached when two texts have identical feature proportions.

# create a dfm from the inaugural addresses from Reagan onwards
myCorpus <- corpus_subset(data_corpus_inaugural, Year > 1980)
# alternative way to get the speeches after 1980 [note the use of the logical operator "&"]
# myCorpus <- corpus_subset(data_corpus_inaugural, Year > 1980 & Year < 2021)
summary(myCorpus)
tok2 <- tokens(myCorpus, remove_punct = TRUE, remove_numbers = TRUE,
               remove_symbols = TRUE, split_hyphens = TRUE, remove_separators = TRUE)
tok2 <- tokens_remove(tok2, stopwords("en"))
tok2 <- tokens_wordstem(tok2)
presDfm <- dfm(tok2)
presDfm

# compute some document similarities
Simil <- textstat_simil(presDfm, margin = "documents", method = "cosine")
Simil

# Let's plot it!
Simil2 <- as.matrix(Simil)
str(Simil2)
corrplot(Simil2, method = 'number')
corrplot(Simil2, method = 'color')
corrplot(Simil2, method = 'shade', type = 'lower')

# for specific comparisons: here the two speeches by Obama
obamaSimil <- textstat_simil(presDfm, presDfm[c("2009-Obama", "2013-Obama"), ],
                             margin = "documents", method = "cosine")
obamaSimil

# compute some term similarities
tstat3 <- textstat_simil(presDfm, presDfm[, c("fair", "health", "terror")],
                         method = "cosine", margin = "features")
head(as.matrix(tstat3), 10)
as.list(tstat3, n = 6)
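# A manual check of what textstat_simil() computes (a sketch): the cosine similarity between two documents
# is the dot product of their feature-count vectors divided by the product of their norms.
v1 <- as.numeric(as.matrix(presDfm["2009-Obama", ]))
v2 <- as.numeric(as.matrix(presDfm["2013-Obama", ]))
sum(v1 * v2) / (sqrt(sum(v1^2)) * sqrt(sum(v2^2)))  # should match the value reported in obamaSimil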
#########################################################################
# TOKENIZING Japanese or Chinese (i.e., languages without white spaces)
#########################################################################

# When you want to display Japanese kanji on a non-Japanese laptop, write the following at the beginning of your session
# (this locale name works on Windows; on macOS/Linux the name differs, e.g. "ja_JP.UTF-8"):
Sys.setlocale("LC_CTYPE", "japanese")

# Suppose we want to analyze the text of the speeches made on 17 Nov 2017 and 20 Nov 2017 for the new Japanese parliamentary session
myText <- readtext("Diet speeches/*.txt", encoding = "UTF-8")
str(myText)

# tokens() can segment Japanese texts without additional tools, based on the rules defined in the ICU library,
# which is available via the stringi package (installed together with Quanteda).
# ICU detects the boundaries of Japanese words using a dictionary with frequency information
# (see http://source.icu-project.org/repos/icu/icu/tags/release-58-rc/source/data/brkitr/dictionaries/
# for the list of Asian languages covered, including Chinese)
icu_toks <- tokens(myText$text)
# this expression means: consider document 2, and report the first 40 tokens appearing in it
head(icu_toks[[2]], 40)

# Japanese stopwords
stopwords("ja", source = "stopwords-iso")
# A better alternative is using the source "marimo"
stopwords("ja", source = "marimo")
# About stopwords in different languages, take a look at: https://cran.r-project.org/web/packages/stopwords/README.html
# Alternatively, you can remove them yourself using the tokens_remove option discussed above

# tokenize the corpus and apply pre-processing
toks <- tokens(myText$text, remove_punct = TRUE, remove_numbers = TRUE, remove_separators = TRUE)
head(icu_toks[[1]], 40)
head(toks[[1]], 40)
toks <- tokens_remove(toks, stopwords("ja", source = "marimo"))
head(toks[[1]], 40)

# You can also decide to select only tokens written in Hiragana:
# \\p{script=Hira} matches any character belonging to the Hiragana script (a collection of characters, not a single literal one); + means one or more of them
# $ means "match the end of the string" (the position after the last character), so the token has to end in Hiragana
toksHira <- tokens_select(toks, pattern = "\\p{script=Hira}+$", valuetype = "regex")
head(toks[[1]], 40)
head(toksHira[[1]], 40)

# You can also select only Japanese words (Hiragana, Katakana, Kanji) - in this case you do not see any differences
toks2 <- tokens_select(toks, pattern = c("\\p{script=Hira}", "\\p{script=Kana}", "\\p{script=Hani}"),
                       valuetype = "regex")
head(toks[[1]], 100)
head(toks2[[1]], 100)

jap_dfm <- dfm(toks)
topfeatures(jap_dfm, 20)
jap_dfm[, 1:5]

# I want to name the texts properly
jap_dfm@docvars
jap_dfm@docvars[, 2]
jap_dfm@docvars[, 2] <- myText$doc_id
jap_dfm[, 1:5]
# I could also directly write the names I want to give to such texts (this is a hypothetical example with fictitious names!)
jap_dfm@docvars[, 2] <- c("A", "B", "C", "D", "E", "F", "G")
jap_dfm@docvars[, 2]
jap_dfm[, 1:5]

# If you want to perform a more accurate tokenization, you need to install a morphological analysis tool and call it from R.
# For example: RcppMeCab (Chinese, Japanese, and Korean): https://github.com/junhewk/RcppMeCab

#########################################################################
# Running example: How to deal with the Chinese language
#########################################################################
corp <- quanteda.corpora::download(url = "https://www.dropbox.com/s/37ojd5knz1qeyul/data_corpus_chinesegovreport.rds?dl=1")

# tokenize the corpus and apply pre-processing
ch_toks <- tokens(corp, remove_punct = TRUE, remove_numbers = TRUE, remove_separators = TRUE)
head(ch_toks[[1]], 40)
ch_toks <- tokens_remove(ch_toks, stopwords("zh_cn", source = "marimo"))
head(ch_toks[[1]], 40)

# Alternative Chinese stopword lists
stopwords("zh", source = "misc")
stopwords("zh", source = "stopwords-iso")

# You can keep only Chinese characters
ch_toks <- tokens_select(ch_toks, pattern = "\\p{script=Hani}+$", valuetype = "regex")
head(ch_toks[[1]], 40)

ch_dfm <- dfm(ch_toks)
topfeatures(ch_dfm)
ch_dfm[, 1:5]
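# As with the English examples above, the non-positional summaries can be applied to the Chinese dfm as well.
# A quick sketch: a word cloud of the most frequent terms (the frequency threshold is an arbitrary choice,
# and a CJK-capable font may be needed for the characters to display correctly)
set.seed(123)
textplot_wordcloud(ch_dfm, min_count = 50, max_words = 100)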