# Principal Component Analysis in R
rm(list=ls(all=TRUE))
library(psych)
mydata<- read.csv("C:/Users/mw/Dropbox/JAPAN JSPS2013 WIAS2012 WIAS2013/Tokyo 2016/New Lectures/Lecture 4 2016/Lab/Germany expert factor analysis.csv")
str(mydata)
levels(mydata$Party)
# exclude data with missing values
newdata <- na.omit(mydata)
attach(newdata )
# Define variables/policy dimensions
X <- cbind(Taxes, Social, Environment, Decentralization, EU_Peacekeeping, Immigration , EU_Accountability, EU_Authority)
# Descriptive statistics
summary(X)
cor(X)
# nfactors: number of factors to extract
pca2 <- principal(X, rotate="none", nfactors=2, scores=TRUE)
str(pca2)
# Eigenvalues: The eigenvalue of each extracted factor. The sum of all eigenvalues = total number of variables
pca2$values
# How many eigenvalues keeping in the analysis?
# An eigenvalue greater than 1 indicates that PCs account for more variance than accounted by one of the original input variables
# Intuitively, this rule means that any retained factor z should account for at least as much variation
# as any of the original variables x. This is commonly used as a cutoff point for which PCs are retained
# If eigenvalue = 1.91 this indicates that the power of such new dimension to distinguish between the various observations
# (in our case, parties) under investigation is 1.9 times as high as the power of separate indicators to do so
# Scree test: plotting the eigenvalues against the corresponding PC produces a scree plot that illustrates the rate of change
# in the magnitude of the eigenvalues for the PC. The rate of decline tends to be fast first then levels off
# The “elbow”, or the point at which the curve bends, is considered to indicate the maximum number of PC to extract
# One less PC than the number at the elbow might be appropriate if you are concerned about getting an overly defined solution
plot(pca2$values , type = "l", xlab="Factors", ylab="Eigenvalues" )
abline(a=1, b=0, lty=5)
# On this measure, therefore, since 2 factors have eigenvalues of greater than 1,
# we might consider the German policy space to be two-dimensional!
# Since the sum of eigenvalues = total number of variables, Proportion indicates the relative variance accounted by each factor
variance <- pca2$values / length(pca2$values)
variance
# First two components/factors explain almost 60% of the total variance!
sum( 0.37936467+0.21630983)
plot(variance , type = "l", xlab="Factors", ylab="Variance" )
abline(a=1, b=0, lty=5)
# Loadings: Factor loadings are the correlations between each variable and the factor.
# The higher the load the more relevant in defining the factor’s dimensionality.
# A negative value indicates an inverse impact on the factor.
pca2$loadings
pca2$loadings[1:8, 1:2]
# By looking at the Factor Loadings we can give a more substantial policy content to the two policy dimensions extracted
loadings <- pca2$loadings
str(loadings)
names(loadings) <- attr(loadings,"dimnames")[[1]]
names(loadings)
plot(loadings)
text(loadings,labels=names(loadings), pos=1 )
# It seems that ‘Social’ ‘Environment’ ‘Immigration’ and ‘EU accountability’ define Factor1,
# and ‘Taxes’ and ‘Decentralization’’ define Factor2
# Uniqueness (i.e. [1-communality]): the variance of the original variable not explained by the extracted PCs
# Notice that the greater ‘uniqueness’ the lower the relevance of the variable in the factor model.
pca2$uniquenesses
pca2$communality
# The coordinates of the original variables on the new bi-dimensional space (i.e., PC scores)
pca2$scores
scores.R <- as.data.frame(pca2$scores)
names(scores.R)
results2 <- cbind(newdata, scores.R)
str(results2)
# aggregate the 20th and 21st variable (i.e. PC1 and PC2)
aggregate(results2[, 20:21], list(results2[,1]), mean)
# plotting parties positions in the new space
p <- aggregate(results2[, 20:21], list(results2$Party), mean)
plot(p$PC1,p$PC2, main="German 2005 policy space",
xlab="Social", ylab="Right Left", pch=19)
text(p$PC1,p$PC2,p$Group.1, cex=1, pos=3)
#############################################
# Rotating axis
#############################################
# The goal of rotating the axis is to find clusters of variables that to a large extent define only one factor.
# The factor loadings matrix can be “rotated” or re-oriented in order to make most factor loadings on any specific factor small
# while only a few factor loadings large in absolute value.
# This strategy allows factors to be more easily interpreted as the clusters of variables that are highly correlated with a particular factor.
# On the other hand, when we rotate, say, the leading three principal components, the total variance explained
# by the three rotated components is still equal to the variance explained by the three principal components.
# The only thing that has changed is that the explanation is distributed differently among the three rotated components.
# If the rotated components have a clearer interpretation, you may actually prefer to use them in your subsequent work.
# There are several possibilities for rotating axis:
# Varimax rotation: the theoretical idea of varimax rotation is that ideally each dimension will have
# its own subset of (non-overlapping) indicators. Therefore on would expect that some indicators will have high factor loadings
# on a factor, whereas the factor loadings of the other indicators will be zero.
# Varimax rotation preserves the perpendicularity of the axes (rotated components/factors remain uncorrelated)
pca3 <- principal(X, nfactors=2, rotate="varimax", scores=TRUE)
str(pca3)
pca3$values
pca2$values
pca3$loadings
pca2$loadings
loadings2 <- pca3$loadings
str(loadings2)
names(loadings2) <- attr(loadings2,"dimnames")[[1]]
names(loadings2)
plot(loadings2)
text(loadings2,labels=names(loadings2), pos=1 )
par(mfrow=c(1,2))
plot(loadings)
text(loadings,labels=names(loadings), pos=1 )
plot(loadings2)
text(loadings2,labels=names(loadings2), pos=1 )
plot(loadings, xlab="First Dimension", ylab="Second Dimension",col = "blue", pch=19)
text(loadings,labels=names(loadings), cex=1, pos=1, col = "blue", pch=19 )
par(new=TRUE)
plot(loadings2, axes=FALSE, xlab="", ylab="", col="red", pch=19)
text(loadings2,labels=names(loadings2),cex=1, pos=1, col ="red", pch=19)
legend("topright", c("Unrotated", "Rotated"), pch=c(19), col=c("blue", "red"))
pca3$scores
scores.R2 <- as.data.frame(pca3$scores)
results3 <- cbind(results2, scores.R2)
str(results3)
# aggregate the 22th and 23st variable (i.e. RC1 and RC2)
aggregate(results3[, 22:23], list(results3[,1]), mean)
# plotting parties positions in the new space
pR <- aggregate(results3[, 22:23], list(results3$Party), mean)
plot(pR$RC1,pR$RC2, main="German 2005 policy space (rotated)",
xlab="Social", ylab="Right Left", pch=19)
text(pR$RC1,pR$RC2,pR $Group.1, cex=1, pos=3)
# combine 2 graphs (rotated and unrotated)
par(mfrow=c(1,2))
plot(p$PC1,p$PC2, main="German 2002 policy space (unrotated)",
xlab="Social", ylab="Right Left", pch=19)
text(p$PC1,p$PC2,p$Group.1, cex=1, pos=3)
plot(pR$RC1,pR$RC2, main="German 2005 policy space (rotated)",
xlab="Social", ylab="Left Right", pch=19)
text(pR$RC1,pR$RC2,pR $Group.1, cex=1, pos=3)
plot(p$PC1,p$PC2, main="German 2005 policy space (unrotated vs. rotated)",
xlab="Social", ylab="Right Left",col = "blue")
text(p$PC1,p$PC2,p$Group.1, cex=1, pos=3)
par(new=TRUE)
plot(pR$RC1,pR$RC2,axes=FALSE, xlab="", ylab="", col = "red")
legend("topright", c("Unrotated", "Rotated"), pch=c(1), col=c("blue", "red"))
plot(p$PC1,p$PC2, main="German 2005 policy space (unrotated vs. rotated)",
xlab="Social", ylab="Right Left",col = "blue", pch=19)
text(p$PC1,p$PC2,p$Group.1, cex=1, pos=3)
par(new=TRUE)
plot(pR$RC1,pR$RC2,axes=FALSE, xlab="", ylab="", col = "red", pch=19)
legend("topright", c("Unrotated", "Rotated"), pch=c(19), col=c("blue", "red"))
######################################
# Adding aggregate results to starting dataset
colnames(p)
names(p)[1] <- 'Party'
names(p)[2] <- 'Social_axis'
names(p)[3] <- 'Left_Right_axis'
colnames(p)
total <- merge(results2, p , by="Party")
str(total )
## save your data estimation for Cybersenate
aggr_pca <- aggregate(results2[, 20:21], list(results2[,1]), mean)
aggr_pca2 <- aggregate(results3[, 22:23], list(results3[,1]), mean)
cybersenate <- cbind(aggr_pca, aggr_pca2)
str(cybersenate )
# here change the destination selecting the one you like most
write.csv(cybersenate, "C:/Users/mw/Desktop/cybersenate.csv")
# alternative change the current working directory of the R process
# with getwd() you get the current working directory of the R process
getwd()
# with setwd() you change the current working directory of the R process; for example here I identify my "Desktop" directory as the working directory
setwd("C:/Users/mw/Desktop")
getwd()
write.csv(cybersenate, "cybersenate2.csv")