rm(list = ls(all = TRUE))

getwd()

##### set your Working Directory here!
setwd("C:/Users/mw/Dropbox (VOICES)/TOPIC MODEL")
getwd()

library(manifestoR)
library(quanteda)
library(ggplot2)
library(dplyr)   # needed for mutate() below
library(DT)

# add your CMP API key between the quotes!
mp_setapikey(key.file = NULL, key = "###########")

# I want to use the latest version of the CMP corpus (different from the core CMP dataset
# we just saw in the first part of the class)
mp_use_corpus_version("2017-2")

##########################
# Viewing original party manifestos and the original CMP coding
##########################

Japan <- mp_availability(countryname == "Japan")
Japan
# I can access 77 party manifestos on-line; of these, only 2 are also in the CMP archive
# in digital format, and for those 2 documents I can also check their original coding
str(Japan)
# if originals=TRUE the program is available on-line (but such texts are NOT necessarily
# in the CMP archive in digital format! Indeed, only 2 of them are!)
print(Japan[c("party", "date", "originals")])
print(Japan[c("party", "date", "originals")], n = 123)

cmp <- mp_maindataset()
Japan2 <- cmp[which(cmp$countryname == "Japan"), ]
table(Japan2$date)

# let's see the pdfs of the Japanese party manifestos that are available
mp_view_originals(countryname == "Japan" & date == 196011, maxn = 4)
mp_view_originals(countryname == "Japan" & date == 200509, maxn = 6)

# the only two programs included in the CMP archive in digital format both belong to the
# 2014 election, namely those of parties 71220 and 71320
metas <- mp_metadata(countryname == "Japan")
print(metas[c("party", "date", "annotations")], n = 123)

# let's load the documents that are available in the CMP archive in digital format
Japan_corpus <- mp_corpus(Japan)
Japan_corpus
summary(Japan_corpus)
head(content(Japan_corpus[["71220_201412"]]))
head(content(Japan_corpus[["71320_201412"]]))

# but which party is 71220?
japan_cmp <- cmp[which(cmp$countryname == "Japan" & cmp$date == 201412), ]
print(japan_cmp[c("partyname", "party", "edate", "date")])
# 71220 is the Japanese Communist Party,
# while 71320 is the Social Democratic Party

# let's see how the Japanese Communist Party manifesto has been coded
doc <- Japan_corpus[["71220_201412"]]
# the categories employed to code that program
head(codes(doc), n = 13)
codes(doc)
# besides the policy categories you may also find headline codes ("H"),
# uncoded quasi-sentences ("0", "000") and missing codes (NA)
# what's the meaning of 202.1 or 305.1? These refer to the new CMP coding scheme, which also
# includes subcategories
# (see https://manifesto-project.wzb.eu/down/papers/Evolution_of_the_Manifesto_Coding_Instructions_and_the_Category_Scheme.pdf)
table(codes(doc))
# in total, 522 quasi-sentences in the text
length(doc)

# it might be a good idea to first recode the version 5 codes (with subcategories) to
# version 4 (without subcategories) using manifestoR
doc2 <- recode_v5_to_v4(doc)
# compare the differences!
table(codes(doc2))
table(codes(doc))

# to read the program
txt <- content(Japan_corpus[["71220_201412"]])
head(txt, n = 4)
head(txt, n = 50)

# number of quasi-sentences coded in the document as 402, 408 or 701
doc_subcodes <- subset(doc, codes(doc) %in% c(402, 408, 701))
length(doc_subcodes)
length(doc_subcodes) / length(doc)
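# A small illustrative sketch (not part of the original exercise): with the same base-R tools
# we can also compute the share of quasi-sentences devoted to EACH CMP category, a quick way
# to see what the manifesto is mostly about. It only assumes that doc2 (the document recoded
# to version 4 above) is still in memory.
code_shares <- prop.table(table(codes(doc2)))           # proportion of quasi-sentences per category
round(sort(code_shares, decreasing = TRUE) * 100, 1)    # categories ranked by their % share of the text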
# subsetting based on text: for example, I want to focus on all those quasi-sentences
# including the word "GDP" in the party manifesto of the Japanese Communist Party in 2014;
# in this case there are two quasi-sentences, one coded as 409 and the other as 404
doc_subtext <- subset(doc, grepl("GDP", content(doc)))
strwrap(head(content(doc_subtext)))
head(codes(doc_subtext))

##########################
# Passing CMP party manifestos to Quanteda
##########################

# let's focus on the Irish manifestos of the 2016 elections
ireland <- mp_corpus(countryname == "Ireland" & date == 201602)
ireland
summary(ireland)

# converting the 10 party manifestos recovered from the CMP dataset into a Quanteda corpus
quanteda_ireland <- corpus(ireland)
summary(quanteda_ireland)
summary(head(quanteda_ireland))
ndoc(quanteda_ireland)
# hmmm, we have many more than 10 documents: it seems we actually have 10809 documents! why?
# because each quasi-sentence of each party manifesto is kept as a separate document!
# how to deal with that? we will see below!

# the meta data from the Manifesto Corpus is stored in the docvars and is available for
# each quasi-sentence. For example, let's see the info about the coding
head(docvars(quanteda_ireland, field = "cmp_code"), 100)

# as with any corpus, we can use our usual commands, such as kwic
x <- kwic(quanteda_ireland, "social*", window = 10)
datatable(x, caption = "Keywords in context", rownames = FALSE,
          options = list(scrollX = TRUE, pageLength = 5, lengthMenu = c(5, 10, 15, 20)))

# or we can subset the corpus
quanteda_irish_sing <- corpus_subset(quanteda_ireland, party == 53110)
texts(quanteda_irish_sing)[1]
texts(quanteda_irish_sing)[1:5]
strwrap(quanteda_irish_sing[[1]])

# making the DFM
dfm_irish <- dfm(quanteda_ireland, stem = TRUE, tolower = TRUE, remove_punct = TRUE,
                 remove = stopwords("english"), remove_numbers = TRUE)
dfm_irish[1:10, 1:5]

# but of course we would like a DFM that groups together all the quasi-sentences belonging
# to the same party! so we use dfm_group(), grouping by "manifesto_id" (or by "party")
summary(head(quanteda_ireland))
dfm_irish2 <- dfm_group(dfm_irish, "manifesto_id")
dfm_irish2[1:10, 1:5]

# get the 10 most frequent words for each party manifesto
feature_frequencies_categories <- textstat_frequency(dfm_irish2, n = 10, group = "manifesto_id")
str(feature_frequencies_categories)

# now I want to plot these frequencies using the original Irish party names included in the
# CMP core dataset...
ireland <- cmp[which(cmp$countryname == "Ireland" & cmp$date == 201602), ]
print(ireland[c("partyname", "party", "edate", "date", "partyabbrev")])
ireland$partyname
feature_frequencies_categories <- mutate(feature_frequencies_categories,
                                         manifesto_id = factor(group, labels = ireland$partyname))
str(feature_frequencies_categories)

ggplot(feature_frequencies_categories,
       aes(x = reorder(feature, frequency), y = frequency, fill = manifesto_id)) +
  geom_col(show.legend = FALSE) +
  labs(x = NULL, y = "word frequency") +
  facet_wrap(~ manifesto_id, ncol = 2, scales = "free") +
  coord_flip()

#########################################################################
# Using wordfish
#########################################################################

wfm <- textmodel_wordfish(dfm_irish2, dir = c(1, 3))
summary(wfm)
str(wfm)
# replace the manifesto ids with the party names (same order as in the CMP subset above)
wfm$docs <- ireland$partyname
wfm$docs

# plot estimated word positions
textplot_scale1d(wfm, margin = "features")
# plot estimated document positions
textplot_scale1d(wfm, margin = "documents")
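# A minimal follow-up sketch (assumptions: the fitted wfm object from above is still in memory,
# and its theta / se.theta components are those returned by quanteda's textmodel_wordfish):
# collect the estimated document positions and their standard errors in a data frame, so they
# can be inspected, sorted, or re-plotted with ggplot2 instead of textplot_scale1d()
positions <- data.frame(
  party = wfm$docs,      # party names assigned above
  theta = wfm$theta,     # estimated position of each manifesto on the latent dimension
  se    = wfm$se.theta   # standard error of each position estimate
)
positions[order(positions$theta), ]   # parties ordered along the estimated dimension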