rm(list=ls(all=TRUE))
getwd()
setwd("C:/Users/mw/Dropbox (VOICES)/TOPIC MODEL/")
getwd()

# let's create a contingency table, which is to say a table in which each number represents the
# number of people/units of analysis in each pair of categories.
# In this case the table shows the relationship between level of education and how carefully
# people read the newspaper

N = matrix(c(5, 18, 19, 12, 3, 7, 46, 29, 40, 7, 2, 20, 39, 49, 16),
           nrow = 5,
           dimnames = list(
             "Level of education" = c("Some primary", "Primary completed", "Some secondary",
                                      "Secondary completed", "Some tertiary"),
             "Category of readership" = c("Glance", "Fairly carefully", "Very carefully")))
N
# For example, the cell in the top-left corner tells us that 5 people with some primary education
# glanced at the newspaper

# You can think of the 3 column values in each row of the table as coordinates in a 3-dimensional
# space, and you could compute the (Euclidean) distances between the 5 row points in that space.
# The distances between the points in such a space would summarize all the information about the
# SIMILARITIES between the rows in the table above.
# Now suppose you could find a lower-dimensional space in which to position the row points in a
# manner that retains all, or almost all, of the information about the differences between the rows.
# You could then present all the information about the similarities between the rows (types of
# education in this case) in a simple 1- or 2-dimensional graph

# FIRST STEP: compute the observed proportions (P)
n = sum(N) # summing up all the values in the table
P = N / n  # table of proportions
P

# SECOND STEP: compute the masses
# In the language of CA, the sums of the rows and columns of the table of proportions are called masses
column.masses = colSums(P)
column.masses # Glance describes the reading habits of 18.3% of the sample
row.masses = rowSums(P)
row.masses # Some primary education characterizes 4.5% of the sample

# THIRD STEP: compute the expected proportions (E)
P
# 1.6% of people glanced and had some primary education. Is this number big or small?
# We can compute the value that we would expect to see under the assumption that there is
# NO RELATIONSHIP between education and readership. The proportion that glances at a newspaper is
# 18.3% and 4.5% have only some primary education.
# Thus, if there is no relationship between education and readership, we would expect that 4.5% of
# 18.3% of people (i.e., 0.045*0.183 = 0.008 = 0.8%) both glanced and have only some primary education
E = row.masses %o% column.masses # %o% is the outer product: a table is created by multiplying
                                 # each of the row totals (row masses) by each of the column totals
E

# FOURTH STEP: compute the residuals (R)
# Residuals quantify the difference between the observed data and the expected proportions
R = P - E
R
P
E
# The biggest residual is -0.045, for Primary completed and Very carefully: i.e., there is a
# negative association between having completed primary education and reading very carefully,
# while there is a positive relationship between having completed secondary education and
# reading very carefully; etc.

# FIFTH STEP: compute the indexed residuals (I)
# All the numbers in R are close to 0. Why? Because P and E are both small! We can solve this
# problem by dividing the residuals by the expected values
I = R / E
I
# I has a straightforward interpretation: the further a value is from 0, the larger the observed
# proportion RELATIVE to the expected proportion.
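# (Aside, a minimal check using the objects created above, not part of the original workflow:
# the residuals are directly tied to the familiar Pearson chi-square test of independence,
# since chi-square = n * sum(R^2 / E). The same quantity divided by n is the "total inertia"
# that CA will decompose across dimensions.)
n * sum(R^2 / E)
chisq.test(N)$statistic # should match the value above
# (chisq.test() may warn about small expected counts; that does not affect this equality check)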
# For example, the biggest indexed residual in the table is .95, for Some primary and Glance.
# This tells us that people with some primary education are almost twice as likely to Glance at a
# newspaper as we would expect if there were no relationship between education and reading.
# In other words, the observed value is 95% higher than the expected value!!!
# Reading along this first row, we see that there is a weaker, but positive, indexed residual of
# 0.21 for Fairly carefully and Some primary. This tells us that people with some primary education
# were 21% more likely to be fairly careful readers than we would expect.
# And a score of -.65 for Very carefully tells us that people with some primary education were 65%
# less likely to be very careful readers than expected.
# Reading through all the numbers in the table, the overall pattern is that higher levels of
# education equate to a more careful readership.
# Basically, CA is a technique designed for visualizing these indexed residuals in a smart way
# (i.e., in lower dimensions!)

# SIXTH STEP: compute singular values, eigenvalues, and variance explained
# To compute the coordinates for the rows and columns of our original contingency table in a
# low-dimensional space, we first of all need to run a Singular Value Decomposition (SVD) on I
# [i.e., a decomposition of a complex matrix into smaller or simpler objects of the same kind]
Z = I * sqrt(E) # compute the standardized residuals. By doing that, we cause the SVD to be
# weighted, such that cells with a higher expected value are given a higher weight in the data.
# As the expected values are often related to the sample size, this weighting means that smaller
# cells in the table, for which the sampling error will be larger, are down-weighted. In other
# words, this weighting makes CA relatively robust to outliers caused by sampling error, when the
# table being analyzed is a contingency table
SVD = svd(Z) # estimating the SVD
rownames(SVD$u) = rownames(P)
rownames(SVD$v) = colnames(P)
SVD
# A singular value decomposition has three outputs:
# A vector, d, containing the singular values.
# A matrix, u, containing the left singular vectors, which correspond to the categories in the rows of the table.
# A matrix, v, containing the right singular vectors, which correspond to the columns.
# Some nice properties of the SVD:
# 1) the two matrices and the vector can be "multiplied" together to re-create the original input data, Z.
#    For example: in the data we started with (Z), we have a value of -0.064751 in the 5th row, 2nd column
Z
#    We can work this out from the results of the SVD by multiplying each element of d with the
#    elements of the 5th row of u and the 2nd row of v.
#    That is: -0.064751 = 0.2652708*0.468524*(-0.4887795) + 0.1135421*(-0.0597979)*0.5896041 + 0*(-0.58784451)*(-0.6430097)
0.2652708*0.468524*(-0.4887795) + 0.1135421*(-0.0597979)*0.5896041 + 0*(-0.58784451)*(-0.6430097)
# 2) The second useful property of the SVD relates to the values in d. Each of the singular values
#    in d, and the corresponding vectors (i.e., columns of u and v), corresponds to a dimension!
#    Accordingly, each value of d tells us the relative importance of each of the columns of u and v
#    in describing the original data. We can compute the variance in the original data (Z) that is
#    explained by the columns by first squaring the values in d, and then expressing these as proportions.
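# (A hedged check of these two properties, reusing the objects above: multiplying the three SVD
# pieces back together reproduces the whole matrix Z, not just a single cell, and the sum of the
# squared singular values equals the total inertia of the table, i.e. the chi-square statistic
# divided by n computed earlier. This is the quantity that the dimensions split up below.)
round(SVD$u %*% diag(SVD$d) %*% t(SVD$v) - Z, 10) # all (near) zero: Z is recovered exactly
sum(SVD$d^2)   # total inertia
sum(R^2 / E)   # same value, computed directly from the residuals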
# Squared singular values are known as eigenvalues
eigenvalues = SVD$d^2
eigenvalues
variance.explained = prop.table(eigenvalues)
variance.explained
# The results show that the first dimension explains 85% of the variance in the data
# (i.e., the relative frequency values that can be reconstructed from a single dimension reproduce
# 85% of the total variance - in our case, of the deviations from the expected values).
# So, if we are happy to ignore 15% of the information in the original data, we only need to look
# at the first column of u and the first column of v.
# Now we have to look at less than half the numbers that we started with in the original contingency table!
# Once we have decided the number of dimensions to keep in the analysis, we can use the SVD values
# together with the row (and column) masses to estimate the coordinates of the row and column
# categories in a CA plot.
# There are SEVERAL other interesting properties of CA that we are not going to cover here

#####################################################
#### CA within QUANTEDA
#####################################################

library(readtext)
library(quanteda)
library(cowplot)
library(psych)
library(PerformanceAnalytics)

# let's use our familiar UK party programs of 1992 and 1997
myText <- readtext("Lecture 2/Wordscores manifestos/UK/*.txt",
                   docvarsfrom = "filenames", dvsep = " ", docvarnames = c("Party", "Year"))
testCorpus <- corpus(myText)
# I rename the texts
docnames(testCorpus) <- gsub(".txt", "", docnames(testCorpus))
summary(testCorpus)
myDfm <- dfm(testCorpus, remove = stopwords("english"), tolower = TRUE, stem = TRUE,
             remove_punct = TRUE, remove_numbers = TRUE)

# Run correspondence analysis on a dfm
ca <- textmodel_ca(myDfm)
str(ca)
# sv: singular values (the "d" of the previous example)
# nd: number of dimensions
# p.s. the maximum number of eigenvalues (and dimensions) that can be extracted from a two-way
# table is equal to the minimum of the number of columns minus 1 (in our case: 4499-1) and the
# number of rows minus 1 (in our case: 6-1=5!)

# From singular values to the variance explained by each single dimension:
variance.exp <- prop.table(ca$sv^2)
variance.exp
# It is also possible to calculate an average eigenvalue above which an axis should be kept in the solution.
# Our data contain 6 rows (i.e., documents) and 4499 columns (words).
# If the data were random, the expected value of the eigenvalue for each axis would be
1/(length(ca$colmass)-1) # about 0.02% in terms of the 4499 columns
# Likewise, the average axis should account for
1/(length(ca$rowmass)-1) # 20% in terms of the 6 rows
# According to Bendixen (1995), any axis with a contribution larger than the maximum of these two
# percentages should be considered important and included in the solution for the interpretation of the data.
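# (A hedged way to apply this rule with the objects just created: compare each dimension's share
# of the variance with the larger of the two random-expectation thresholds computed above.)
threshold <- max(1/(length(ca$colmass)-1), 1/(length(ca$rowmass)-1))
threshold
which(variance.exp > threshold) # dimensions worth keeping under this criterion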
# In our case, that means keeping between 1 and 2 dimensions

# If you want to extract just 1 dimension
ca1 <- textmodel_ca(myDfm, nd = 1)
str(ca1)
textplot_scale1d(ca1)

# the coordinates of the columns can help us understand the words associated with the two polarities of the scale
coord_words <- as.data.frame(ca1$colcoord)
coord_words$words <- row.names(coord_words)
str(coord_words)
# let's order the words according to their score on the first dimension
step <- order(coord_words$Dim1)
coord_words <- coord_words[step, ]
head(coord_words, 10)
tail(coord_words, 10)

###############################################################################################
#### Comparing the results you get via CA with Wordfish and Wordscores for the same set of documents
###############################################################################################

# Wordscores raw scores (with our usual reference scores along the economic dimension)
ws <- textmodel_wordscores(myDfm, c(17.21, rep(NA, 1), 5.35, rep(NA, 1), 8.21, rep(NA, 1)))
pr_all <- predict(ws, interval = "confidence")

# Wordfish with LAB 92 to the left of CONS 92
wfm <- textmodel_wordfish(myDfm, dir = c(3, 1))

# Comparing Wordscores vs Wordfish vs CA
wordscores <- textplot_scale1d(pr_all)
wordfish <- textplot_scale1d(wfm)
ca_plot <- textplot_scale1d(ca) # stored under a new name so the ca model object is not overwritten
plot_grid(wordscores, wordfish, ca_plot, labels = c('Wordscores', 'Wordfish', 'CA'))

# check for the correlation
party <- wfm$docs
score_wf <- wfm$theta
score_ws <- pr_all$fit
score_ca <- ca1$rowcoord
scores_texts <- data.frame(party, score_wf, score_ws, score_ca)
str(scores_texts)
colnames(scores_texts)[3] <- "score_ws"
colnames(scores_texts)[6] <- "score_ca"
str(scores_texts)
corr.test(scores_texts[c(2:3, 6)])
chart.Correlation(scores_texts[c(2:3, 6)])

###############################################################################################
#### Comparing the results you get via CA with Wordfish and Wordscores for the features
###############################################################################################

# first let's compare the Wordfish and CA feature scores
# check for the correlation
words <- wfm$features
score_wf <- wfm$beta
score_ca <- ca1$colcoord
scores_words <- data.frame(words, score_wf, score_ca)
str(scores_words)
colnames(scores_words)[3] <- "score_ca"
scores_words$words <- as.character(scores_words$words)
str(scores_words)
corr.test(scores_words[c(2:3)])
chart.Correlation(scores_words[c(2:3)])
# note that the words' betas in Wordfish show a larger variance (at the extremes) compared to the
# coordinates of the columns in CA. This is consistent with the fact that Wordfish is more robust
# when a single document is very different from the others, which happens not infrequently in
# political documents

# now let's move to the Wordscores scores for the features.
# Of course you CANNOT directly compare the Wordscores feature scores with the ones you obtain via
# CA and Wordfish, because in the former case you only have the features included in the reference
# texts (fewer than all the features in the corpus: 3575 vs. 4499 in our case)
str(myDfm)
str(ca1)
str(wfm)
str(ws)

# let's create a data frame with just the feature scores from Wordscores
words_ws <- as.data.frame(ws$wordscores)
str(words_ws)
words_ws$words <- row.names(words_ws)
colnames(words_ws)[1] <- "score_ws"
str(words_ws)

# merge the words_ws data frame with the scores_words data frame, keeping only the words present in both
library(dplyr)
semi_join(scores_words, words_ws, by = 'words') # features present in both data frames: 3575!
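# (Optional, a hedged aside: dplyr's anti_join shows the complementary set, i.e. the corpus
# features that never appear in the reference texts and therefore receive no Wordscores score.
# Given the counts above, this should be 4499 - 3575 = 924 words.)
nrow(anti_join(scores_words, words_ws, by = 'words'))
head(anti_join(scores_words, words_ws, by = 'words'))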
x <- semi_join(scores_words, words_ws, by = 'words') # the features present in both data frames, keeping the columns of scores_words

# let's add the Wordscores feature scores to the data frame x we just created
# (merging by word, rather than relying on the two data frames having the same row order)
x <- left_join(x, words_ws, by = 'words')
str(x)
corr.test(x[c(2:4)])
chart.Correlation(x[c(2:4)])

###############################################################################################
#### If you want to extract 2 dimensions from a CA analysis
###############################################################################################

ca <- textmodel_ca(myDfm)
str(ca)
dat_ca <- data.frame(dim1 = coef(ca, doc_dim = 1)$coef_document,
                     dim2 = coef(ca, doc_dim = 2)$coef_document)
head(dat_ca)
str(dat_ca)
plot(dat_ca$dim1, dat_ca$dim2, main = "2D CA", pch = 19,
     xlim = c(-2, 2), ylim = c(-2, 2),
     xlab = 'Dimension 1', ylab = 'Dimension 2')
grid()
text(dat_ca$dim1, dat_ca$dim2, labels = rownames(dat_ca), cex = 0.8, pos = 4)
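# (A possible extension, sketched under the assumption that the fitted "ca" object above stores at
# least two dimensions, as implied by the coef() calls: the column coordinates in ca$colcoord place
# the words in the same two-dimensional space as the documents. Plotting only the most extreme
# words on Dimension 1 keeps the picture readable.)
word_coord <- as.data.frame(ca$colcoord[, 1:2])
extreme <- word_coord[order(abs(word_coord$Dim1), decreasing = TRUE)[1:20], ] # 20 most extreme words on Dim 1
plot(extreme$Dim1, extreme$Dim2, main = "CA word coordinates (20 most extreme on Dim 1)",
     pch = 19, xlab = 'Dimension 1', ylab = 'Dimension 2')
grid()
text(extreme$Dim1, extreme$Dim2, labels = rownames(extreme), cex = 0.7, pos = 4)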