library(readtext)
library(quanteda)
library(rtweet)
library(emo)        # emoji lookup; list of emoji: https://github.com/hadley/emo
library(textclean)  # replace_emoji() / replace_non_ascii()

#########################
## A search query with emoji
#########################

# Tokenise a corpus, drop English stopwords and stem the remaining tokens.
#
# Both corpora below go through this single helper so that the two resulting
# document-feature matrices differ only because of the emoji conversion, not
# because of diverging preprocessing options.
#
# corp: a quanteda corpus.
# Returns: a quanteda tokens object (stopwords removed, stemmed).
preprocess_tokens <- function(corp) {
  toks <- tokens(
    corp,
    remove_punct = TRUE,
    remove_numbers = TRUE,
    remove_symbols = TRUE,
    split_hyphens = TRUE,
    remove_separators = TRUE,
    remove_url = TRUE
  )
  toks <- tokens_remove(toks, stopwords("english"))
  tokens_wordstem(toks, language = "english")
}

# Use the "smile" emoji itself as the search query.
x <- emo::ji("smile")
x

# NOTE(review): search_tweets() queries the Twitter API, so this presumably
# needs network access and configured rtweet credentials -- confirm.
smile <- search_tweets(x, n = 200, lang = "en", include_rts = FALSE)

# head() instead of [1:5]: no NA padding if fewer than 5 tweets come back.
print(head(smile$text, 5))

# How to convert the emoji to text? replace_emoji() is applied first and
# replace_non_ascii() is then run on its result.
replace_non_ascii(replace_emoji(head(smile$text, 5)))

# Note the difference!

# --- Here without conversion of emoji to text --------------------------------
myCorpusTwitter <- corpus(smile)
head(as.character(myCorpusTwitter), 2)

tok <- preprocess_tokens(myCorpusTwitter)
myDfm <- dfm(tok)
topfeatures(myDfm, 20) # 20 top words
# nfeat() is the public accessor for the number of features; it avoids
# reaching into the dfm's S4 slots (myDfm@Dimnames$features).
nfeat(myDfm)

# --- Here with conversion of emoji to text -----------------------------------
smile2 <- smile
smile2$text <- replace_non_ascii(replace_emoji(smile2$text))
myCorpusTwitter2 <- corpus(smile2)
head(as.character(myCorpusTwitter2), 2)

tok <- preprocess_tokens(myCorpusTwitter2)
myDfm2 <- dfm(tok)

# Of course now we have more features in the new dfm (after conversion):
# the converted emoji are ordinary words, whereas raw emoji are expected to be
# dropped by remove_symbols = TRUE.
nfeat(myDfm2)
nfeat(myDfm)

topfeatures(myDfm, 20)  # 20 top words
topfeatures(myDfm2, 20) # 20 top words