# FIX(review): removed rm(list = ls(all = TRUE)) — wiping the global
# environment at the top of a script is an anti-pattern; restart the R
# session instead if you need a clean workspace.
# NOTE(review): a hard-coded setwd() makes the script machine-specific;
# prefer an RStudio project or the here package. Kept so the relative
# readtext() path below still resolves on the original machine.
setwd("C:/Users/luigi/Dropbox/TOPIC MODEL")

library(readtext)
library(quanteda)
library(ggplot2)

######################################################################
# Running example: how to deal with the Japanese language
######################################################################

# To display Japanese kanji on a non-Japanese laptop, always set the
# locale at the beginning of your session:
Sys.setlocale("LC_CTYPE", "japanese")

# Suppose we want to analyze the text of speeches made on 17 Nov 2017
# and 20 Nov 2017 for the new Japanese parliamentary session.
myText <- readtext("Diet speeches/*.txt", encoding = "UTF-8")
str(myText)

# tokens() can segment Japanese text without additional tools, based on
# the rules defined in the ICU library, available via the stringi
# package (which ships with quanteda). ICU detects Japanese word
# boundaries using a dictionary with frequency information (see:
# http://source.icu-project.org/repos/icu/icu/tags/release-58-rc/source/data/brkitr/dictionaries/
# for the list of Asian languages covered, including Chinese and Thai,
# but not Korean).
icu_toks <- tokens(myText$text)

# FIX(review): the original comment said "first 40 characters" — this
# actually shows the first 40 TOKENS of document 2.
head(icu_toks[[2]], 40)

# Even with a morphological analysis tool, tokenization of Japanese text
# is far from perfect. You can refine tokens by compounding sequences of
# the same character class using textstat_collocations() and
# tokens_compound(). Explore this by yourself if you are interested!
# Japanese stopwords
stopwords("ja", source = "stopwords-iso")
# A better alternative is the "marimo" source.
stopwords("ja", source = "marimo")
# On stopwords in different languages, take a look here:
# https://cran.r-project.org/web/packages/stopwords/README.html
# Alternatively, remove them yourself with tokens_remove(), as below.

# FIX(review): passing remove_punct/remove to dfm() relies on arguments
# deprecated in quanteda 2 and defunct in quanteda >= 3. Do the cleaning
# at the tokens stage instead, then build the dfm.
jap_toks <- tokens(icu_toks, remove_punct = TRUE)
jap_toks <- tokens_remove(jap_toks, stopwords("ja", source = "marimo"))
jap_dfm <- dfm(jap_toks, tolower = FALSE)

topfeatures(jap_dfm, 20)
jap_dfm[, 1:5]

# I want to name the texts properly.
# FIX(review): never poke the @docvars slot directly — the internal
# representation is not part of quanteda's public API and may change
# between versions. Use the docnames() accessor instead.
docnames(jap_dfm)
docnames(jap_dfm) <- myText$doc_id
jap_dfm[, 1:5]

# I could also write directly the names I want to give to these texts
# (a hypothetical example with fictitious names!).
docnames(jap_dfm) <- c("A", "B", "C", "D", "E", "F", "G")
docnames(jap_dfm)
jap_dfm[, 1:5]

# For more accurate tokenization, install a morphological analysis tool
# and call it from R. For example, RcppMeCab (Chinese, Japanese, and
# Korean): https://github.com/junhewk/RcppMeCab

######################################################################
# Running example: how to deal with the Chinese language
######################################################################
library(quanteda.corpora)
corp <- quanteda.corpora::download(
  url = "https://www.dropbox.com/s/37ojd5knz1qeyul/data_corpus_chinesegovreport.rds?dl=1"
)

# Chinese stopwords
stopwords("zh", source = "misc")
# Alternative
stopwords("zh", source = "stopwords-iso")

# Tokenize first, inspect, then remove stopwords and punctuation.
ch_toks <- tokens(corp)
head(ch_toks[[2]], 40)

# FIX(review): same dfm() deprecation as above — clean at the tokens
# stage, then build the dfm.
ch_toks <- tokens(ch_toks, remove_punct = TRUE)
ch_toks <- tokens_remove(ch_toks, stopwords("zh", source = "misc"))
ch_dfm <- dfm(ch_toks, tolower = FALSE)
topfeatures(ch_dfm)