rm(list=ls(all=TRUE))
getwd()
setwd("C:/Users/mw/Dropbox (VOICES)/TOPIC MODEL/")
getwd()
# let's create a contingency table, i.e., a table where each number
# represents the number of people (units of analysis) falling into each pair of categories
# In this case the table shows how carefully people read a newspaper (readership) by their level of education
N = matrix(c(5, 18, 19, 12, 3, 7, 46, 29, 40, 7, 2, 20, 39, 49, 16),
nrow = 5,
dimnames = list(
"Level of education" = c("Some primary", "Primary completed", "Some secondary", "Secondary completed", "Some tertiary"),
"Category of readership" = c("Glance", "Fairly carefully", "Very carefully")))
N
# For example, the cell in the top-left corner tells us that 5 people with some primary education glanced at the newspaper
# You can think of the 3 column values in each row of the table as coordinates in a 3-dimensional space, and you could compute
# the (Euclidean) distances between the 5 row points in the 3-dimensional space.
# The distances between the points in such space would summarize all information about the SIMILARITIES between the rows in the table above.
# Now suppose you could find a lower-dimensional space, in which to position the row points in a manner that retains all, or almost all,
# of the information about the differences between the rows.
# You could then present all information about the similarities between the rows (types of education in this case) in a simple 1 or 2-dimensional graph
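# As a quick, optional illustration of the point above (not needed for the CA steps below):
# the pairwise Euclidean distances between the 5 row points in the 3-dimensional space defined by the columns
dist(N)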
# FIRST STEP: compute the observed proportions (P)
n = sum(N) # summing up all the values in the table
P = N / n # table of proportions
P
# SECOND STEP: compute the masses
# In the language of CA, the sums of the rows and columns of the table of proportions are called masses
column.masses = colSums(P)
column.masses # Glance describes the reading habits of 18.3% of the sample
row.masses = rowSums(P)
row.masses # Some primary education characterizes 4.5% of the sample
# THIRD STEP: compute Expected proportions (E)
P
# 1.6% of people glanced and had some primary education. Is this number big or small?
# We can compute the value that we would expect to see under the assumption that there is NO RELATIONSHIP between
# education and readership. The proportion that glances at a newspaper is 18.3% and 4.5% have only Some primary education
# Thus, if there were no relationship between education and readership, we would expect that 4.5% of 18.3% of people
# (i.e., 0.045*0.183 = 0.008 = 0.8%) both glance at the newspaper and have only some primary education
E = row.masses %o% column.masses # %o% is the outer product: it builds a table by multiplying each row total (row mass) by each column total (column mass)
E
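# Quick check of the worked example above, using the objects we already created:
# the expected proportion for Some primary & Glance is the product of the two masses
row.masses["Some primary"] * column.masses["Glance"] # ~0.008, i.e., 0.8%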
# FOURTH STEP: compute the Residuals (R)
# Residuals quantify the difference between the observed data and the expected proportions
R = P - E
R
P
E
# The biggest residual (in absolute value) is -0.045 for Primary completed and Very carefully: i.e., there is a negative association between having completed primary education
# and reading very carefully; while there is a positive relationship between having completed secondary education and reading very carefully; etc.
# FIFTH STEP: compute the Indexed residuals (I)
# All the numbers in R are close to 0. Why? Because the values in P and E are both small! We can solve this problem by dividing the residuals
# by the expected values
I = R / E
I
# I has a straightforward interpretation. The further a value is from 0, the larger the observed proportion RELATIVE to the expected proportion (for positive values; the reverse holds for negative values).
# For example, the biggest value on the table is the .95 for Some primary and Glance. This tells us that people with some primary education are almost
# twice as likely to Glance at a newspaper as we would expect if there were no relationship between education and reading.
# In other words, the observed value is 95% higher than the expected value!!!
# Reading along this first row, we see that there is a weaker, but positive, indexed residual of 0.21 for Fairly carefully and Some primary.
# This tells us that people with some primary education were 21% more likely to be fairly careful readers than we would expect.
# And, a score of -.65 for Very carefully, tells us that people with Some primary education were 65% less likely to be Very careful readers than expected.
# Reading through all the numbers on the table, the overall pattern is that higher levels of education equate to a more careful readership.
# Basically CA is a technique designed for visualizing these indexed values in a smart way (i.e., in lower dimensions!)
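# Quick check of the interpretation above, using the objects we already created:
# the indexed residual for Some primary & Glance equals the observed/expected ratio minus 1
P["Some primary", "Glance"] / E["Some primary", "Glance"] - 1 # ~0.95, i.e., 95% above expected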
# SIXTH STEP: compute Singular values, eigenvalues, and variance explained
# To compute the coordinates for the rows and columns of our original contingency table in a low-dimensional space, we first need
# to run a Singular Value Decomposition (SVD) [i.e., a factorization of a matrix into simpler matrices of the same kind] on a weighted version of I
Z = I * sqrt(E) # compute the standardized residuals. By doing that, we cause the SVD to be weighted, such that cells with a higher expected value are given
# a higher weight in the data. As the expected values are often related to the sample size, this weighting means that smaller cells in the table,
# for which the sampling error will be larger, are down-weighted. In other words, this weighting makes CA relatively robust to outliers
# caused by sampling error, when the table being analyzed is a contingency table
SVD = svd(Z) # estimating SVD
rownames(SVD$u) = rownames(P)
rownames(SVD$v) = colnames(P)
SVD
# A singular value decomposition has three outputs:
# A vector, d, which contains the singular values
# A matrix, u, which contains the left singular vectors, corresponding to the categories in the rows of the table
# A matrix, v, which contains the right singular vectors, corresponding to the columns
# Some nice properties of SVD:
# 1) the two matrices and vector can be "multiplied" together to re-create the original input data, Z
# for example: in the data we started with (Z), we have a value of -0.064751 in the 5th row, 2nd column
Z
# We can work this out from the results of the SVD by multiplying each element of d with the elements
# of the 5th row of u and the 2nd row of v.
# That is: -0.064751 = 0.2652708*0.468524*(-0.4887795) + 0.1135421*(-0.0597979)*0.5896041 + 0*(-0.58784451)*(-0.6430097)
0.2652708*0.468524*(-0.4887795) + 0.1135421*(-0.0597979)*0.5896041 + 0*(-0.58784451)*(-0.6430097)
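# The same reconstruction written compactly (Z.reconstructed is introduced here just for this check):
sum(SVD$d * SVD$u[5, ] * SVD$v[2, ]) # = Z[5, 2]
Z.reconstructed = SVD$u %*% diag(SVD$d) %*% t(SVD$v) # full matrix: u * diag(d) * t(v)
all.equal(as.vector(Z), as.vector(Z.reconstructed)) # TRUE, up to numerical precision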
# 2) The second useful property of the SVD relates to the values in d. Each of the singular values d, and the corresponding vectors
# (i.e., columns of u and v), correspond to a dimension! Accordingly, each value of d tells us the relative importance of each of the columns in u and v
# in describing the original data. We can compute the variance in the original data (Z) that is explained by the columns by first squaring
# the values in d, and then expressing these as proportions.
# Squared singular values are known as eigenvalues
eigenvalues = SVD$d^2
eigenvalues
variance.explained = prop.table(eigenvalues)
variance.explained
# The results show that the first dimension explains 85% of the variance in the data
# (i.e., the relative frequencies that can be reconstructed from a single dimension reproduce 85% of the total variance -
# in our case, of the deviations from the expected values)
# So, if we are happy to ignore 15% of the information in the original data, we only need to look at the first column in u and the first column in v
# Now we only have to look at roughly half of the numbers that we started with in the original contingency table!
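# To make this concrete (Z.rank1 is introduced here just for illustration): a one-dimensional
# (rank-1) approximation of Z built from the first singular value and the first columns of u and v
Z.rank1 = SVD$d[1] * SVD$u[, 1] %*% t(SVD$v[, 1])
Z.rank1 # compare with Z: most of the structure is already captured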
# Once we have decided the number of dimensions to keep in the analysis,
# we can use the SVD values together with the row (and column) masses to estimate the coordinates of row and column categories in a CA plot
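# A minimal sketch of that last step, using the standard CA formulas (principal coordinates =
# singular vectors divided by the square roots of the masses and rescaled by the singular values);
# the object names below are introduced just for this example
row.coordinates = sweep(SVD$u, 1, sqrt(row.masses), "/") %*% diag(SVD$d)
column.coordinates = sweep(SVD$v, 1, sqrt(column.masses), "/") %*% diag(SVD$d)
rownames(row.coordinates) = rownames(P)
rownames(column.coordinates) = colnames(P)
# a simple 2-dimensional map of rows (education levels) and columns (readership categories)
plot(row.coordinates[, 1], row.coordinates[, 2], pch = 19,
     xlab = "Dimension 1", ylab = "Dimension 2", main = "CA map (by hand)")
text(row.coordinates[, 1], row.coordinates[, 2], labels = rownames(row.coordinates), pos = 4, cex = 0.8)
points(column.coordinates[, 1], column.coordinates[, 2], pch = 17, col = "red")
text(column.coordinates[, 1], column.coordinates[, 2], labels = rownames(column.coordinates), pos = 4, cex = 0.8, col = "red")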
# There are SEVERAL other interesting properties of CA that we are not going to cover here
#####################################################
#### CA within QUANTEDA
#####################################################
library(readtext)
library(quanteda)
library(cowplot)
library(psych)
library(PerformanceAnalytics)
# let's use our familiar UK party programs of 1992 and 1997
myText <- readtext("Lecture 2/Wordscores manifestos/UK/*.txt",
docvarsfrom = "filenames", dvsep = " ", docvarnames = c("Party", "Year"))
testCorpus <- corpus(myText )
# I rename the texts
docnames(testCorpus) <- gsub(".txt", "", docnames(testCorpus ))
summary(testCorpus)
myDfm <- dfm(testCorpus , remove = stopwords("english"), tolower = TRUE, stem = TRUE,
remove_punct = TRUE, remove_numbers=TRUE)
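# Note: in more recent versions of quanteda (>= 3) the shortcut arguments above are deprecated in dfm();
# the same dfm can be built with the tokens pipeline below (shown as an alternative, should be equivalent)
toks <- tokens(testCorpus, remove_punct = TRUE, remove_numbers = TRUE)
toks <- tokens_tolower(toks)
toks <- tokens_remove(toks, stopwords("english"))
toks <- tokens_wordstem(toks)
myDfm <- dfm(toks)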
# Run correspondence analysis on a dfm
ca <- textmodel_ca(myDfm )
str(ca)
# sv: singular values (the "d" of the previous example)
# nd: number of dimensions
# p.s. the maximum number of eigenvalues (and dimensions) that can be extracted from a two-way table is equal to the minimum of the number
# of columns minus 1 (in our case: 4499-1), and the number of rows minus 1 (in our case 6-1=5!)
# From singular values to variance explained by each single dimension:
variance.exp <- prop.table(ca$sv^2)
variance.exp
# It’s also possible to calculate an average eigenvalue above which the axis should be kept in the solution.
# Our data contains 6 rows (i.e., documents) and 4499 columns (words)
# If the data were random, the expected value of the eigenvalue for each axis would be
1/(length(ca$colmass)-1) # 0.02% in terms of 4499 columns
# Likewise, the average axis should account for:
1/(length(ca$rowmass)-1) # 20% in terms of the 6 rows
# According to Bendixen (1995), any axis with a contribution larger than the maximum of these two percentages should be considered
# important and included in the solution for the interpretation of the data. In our case that means keeping 1 or 2 dimensions
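# A quick check along these lines ('threshold' is introduced here just for this illustration):
# how many dimensions explain more variance than the larger of the two benchmarks above?
threshold <- max(1/(length(ca$colmass)-1), 1/(length(ca$rowmass)-1))
sum(variance.exp > threshold)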
# If you want to extract just 1 dimension
ca1 <- textmodel_ca(myDfm , nd=1)
str(ca1)
textplot_scale1d(ca1)
# the coordinates of the columns can help us understand which words are associated with the two poles of the scale
coord_words <- as.data.frame(ca1$colcoord)
coord_words$words <- row.names(coord_words)
str(coord_words)
# let's order the words according to their coordinate on the first dimension (from most negative to most positive)
step <- order(coord_words$Dim1)
coord_words <- coord_words[step, ]
head(coord_words, 10)
tail(coord_words, 10)
###############################################################################################
#### Comparing the results you get via CA with Wordfish and Wordscores for the same set of documents
###############################################################################################
# Wordscores raw scores (with our usual reference scores along the economic dimension)
ws <- textmodel_wordscores(myDfm, c(17.21, NA, 5.35, NA, 8.21, NA))
pr_all <- predict(ws, interval = "confidence")
# Wordfish with LAB 92 to the left of CONS 92
wfm <- textmodel_wordfish(myDfm, dir = c(3, 1))
# Comparing wordscores vs wordfish
wordscores <- textplot_scale1d(pr_all)
wordfish <- textplot_scale1d(wfm)
ca <- textplot_scale1d(ca)
plot_grid(wordscores, wordfish, ca, labels = c("Wordscores", "Wordfish", "CA"))
# check for the correlation
party <- wfm$docs
score_wf <-wfm$theta
score_ws <- pr_all$fit
score_ca <- ca1$rowcoord
scores_texts <-data.frame(party, score_wf, score_ws, score_ca)
str(scores_texts)
colnames(scores_texts)[3] <- "score_ws"
colnames(scores_texts)[6] <- "score_ca"
str(scores_texts)
corr.test(scores_texts[c(2:3,6)])
chart.Correlation(scores_texts[c(2:3,6)])
###############################################################################################
#### Comparing the results you get via CA with Wordfish and Wordscores for the features
###############################################################################################
# first let's compare wordfish and CA features scores
# check for the correlation
words <- wfm$features
score_wf <-wfm$beta
score_ca <- ca1$colcoord
scores_words <-data.frame(words, score_wf, score_ca)
str(scores_words )
colnames(scores_words )[3] <- "score_ca"
scores_words$words <- as.character(scores_words$words)
str(scores_words)
corr.test(scores_words[c(2:3)])
chart.Correlation(scores_words[c(2:3)]) # note that the words' betas in Wordfish show a larger variance (at the extremes)
# compared to the coordinates of the columns in CA. This is consistent with the fact that Wordfish is more robust when a single document
# is very different from the others, which happens not infrequently with political documents
# now let's move to Wordscores scores for features.
# Of course you CANNOT directly compare the Wordscores feature scores with the ones you obtain via CA and Wordfish, because in the former case
# you only have scores for the features included in the reference texts (fewer than all the features in the corpus: 3575 vs. 4499 in our case)
str(myDfm)
str(ca1)
str(wfm)
str(ws)
# let's create a data frame with just the features scores from Wordscores
words_ws <- as.data.frame(ws$wordscores)
str(words_ws)
words_ws$words <- row.names(words_ws)
colnames(words_ws )[1] <- "score_ws"
str(words_ws)
# merge the words_ws data frame with the scores_words data frame, keeping only the words that appear in both
library(dplyr)
semi_join(scores_words, words_ws, by='words') # words present in both data frames: 3575!
# join by word (rather than relying on identical row order) so that each Wordscores feature score is matched to the right row
x <- inner_join(scores_words, words_ws, by='words') # same rows as above, plus the score_ws column
str(x)
corr.test(x[c(2:4)])
chart.Correlation(x[c(2:4)])
###############################################################################################
#### If you want to extract 2 dimensions from a CA analysis
###############################################################################################
ca <- textmodel_ca(myDfm)
str(ca)
dat_ca <- data.frame(dim1 = coef(ca, doc_dim = 1)$coef_document,
dim2 = coef(ca, doc_dim = 2)$coef_document)
head(dat_ca)
str(dat_ca)
plot(dat_ca$dim1, dat_ca$dim2, main="2D CA",
pch=19, xlim = c(-2, 2), ylim = c(-2, 2),
xlab = 'Dimension 1', ylab = 'Dimension 2')
grid()
text(dat_ca$dim1, dat_ca$dim2, labels = rownames(dat_ca), cex = 0.8, pos=4)
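# A possible extension (a sketch, not part of the analysis above): the word (column) coordinates
# live in the same space and can be overlaid on the existing plot; with thousands of features expect
# heavy overplotting, and points outside the xlim/ylim above will simply not be drawn
word_coord <- as.data.frame(ca$colcoord[, 1:2])
points(word_coord$Dim1, word_coord$Dim2, pch = ".", col = "grey60")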