# create artificial data with two variables x1 and x2
x1 <- c(20, 16, 16, 15, 14, 10, 9, 3, 5, 6, 8, 16, 21, 7, 12, 13, 12, 11, 8, 10)
x2 <- c(19, 22, 16, 17, 15, 15, 13, 12, 15, 16, 19, 20, 23, 17, 22, 18, 17, 20, 15, 17)
(data <- data.frame(x1 = x1, x2 = x2))
## x1 x2
## 1 20 19
## 2 16 22
## 3 16 16
## 4 15 17
## 5 14 15
## 6 10 15
## 7 9 13
## 8 3 12
## 9 5 15
## 10 6 16
## 11 8 19
## 12 16 20
## 13 21 23
## 14 7 17
## 15 12 22
## 16 13 18
## 17 12 17
## 18 11 20
## 19 8 15
## 20 10 17
cov(data) # covariance matrix
## x1 x2
## x1 23.410526 9.273684
## x2 9.273684 8.884211
plot(data$x1, data$x2, axes=F); box(bty="l"); axis(1); axis(2) # scatter plot of the two variables
There are several functions we can use to perform principal component analysis (PCA). Two of them, prcomp() and princomp(), come with R's built-in stats package. For this demonstration, we will use the princomp() function.
pca.cov = princomp(x = data) # By default, it uses the covariance matrix to perform PCA
summary(pca.cov, loadings = T)
## Importance of components:
## Comp.1 Comp.2
## Standard deviation 5.1507715 2.0370452
## Proportion of Variance 0.8647473 0.1352527
## Cumulative Proportion 0.8647473 1.0000000
##
## Loadings:
## Comp.1 Comp.2
## x1 0.899 0.438
## x2 0.438 -0.899
# The coefficients (eigenvectors) are displayed under 'Loadings'
# Each column vector is the vector of coefficients for the corresponding principal component (PC)
# The proportion of variance explained by the second PC is substantially smaller than that of the first PC
# The coefficients for the second PC are not necessarily smaller than those of the first PC.
Let \(\mathbf{A=(a_1 , a_2)}\), where \(\mathbf{a_1}\) and \(\mathbf{a_2}\) are the vectors of coefficients for the first (\(PC_1\)) and second (\(PC_2\)) principal components, respectively. Then we can write: \[\mathbf{A}=\begin{bmatrix} 0.899 & 0.438 \\ 0.438 & -0.899 \end{bmatrix}\]
Let’s see if matrix \(\mathbf{A}\) is orthogonal (i.e., \(\mathbf{A^TA=AA^T=I}\)).
A <- pca.cov$loadings
A %*% t(A) # multiplication of A and the transpose of A => I
## x1 x2
## x1 1.000000e+00 8.225832e-18
## x2 8.225832e-18 1.000000e+00
# Since A is orthogonal, its inverse equals its transpose (and here A happens to be symmetric, so the inverse equals A itself)
solve(A) # inverse of A
## x1 x2
## Comp.1 0.8990543 0.4378371
## Comp.2 0.4378371 -0.8990543
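As an additional check (not shown in the original output), we can verify the identity in the other direction as well and confirm that the inverse of \(\mathbf{A}\) is its transpose:
zapsmall(t(A) %*% A) # A^T A should also be (numerically) the 2x2 identity matrix
zapsmall(solve(A) - t(A)) # should be all zeros: the inverse of an orthogonal matrix is its transpose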
Based on the coefficients, we can express each PC as a linear combination of the original variables: \[PC_1 = 0.899 X_1 + 0.438 X_2 \\ PC_2 = 0.438 X_1 - 0.899 X_2\]
Then, the variance of the PC scores can be calculated as: \[var(PC_1)=var(0.899 X_1 + 0.438 X_2) = 0.899^2 s_{11} + 0.438^2 s_{22} + 2(0.899)(0.438) s_{12}=\mathbf{a_1^TSa_1}\\ var(PC_2)=var(0.438 X_1 - 0.899 X_2) = 0.438^2 s_{11} + (-0.899)^2 s_{22} + 2(0.438)(-0.899) s_{12}=\mathbf{a_2^TSa_2}\]
Recall that the covariance matrix \(\mathbf{S}\) is:
(S <- cov(data)) # returns the covariance matrix
## x1 x2
## x1 23.410526 9.273684
## x2 9.273684 8.884211
By plugging this covariance matrix into the equations above, we get:
t(A[,1]) %*% S %*% A[,1] # PC1 variance
## [,1]
## [1,] 27.92679
t(A[,2]) %*% S %*% A[,2] # PC2 variance
## [,1]
## [1,] 4.367951
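Equivalently, both variances (and the zero covariance between the two PCs) come out of a single matrix product; a brief sketch using the objects already defined:
zapsmall(t(A) %*% S %*% A) # A^T S A is diagonal: the PC variances on the diagonal, zero covariances off it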
These variances equal the eigenvalues (\(\lambda\)) associated with each component (up to the divisor difference explained below).
(eigen <- pca.cov$sdev^2) # eigenvalues for the two components (these correspond to the variances of the PCs)
## Comp.1 Comp.2
## 26.530447 4.149553
# The differences between these variances and the eigenvalues are due to the use of different divisors when
# calculating the covariance matrix: cov() uses the divisor 'N-1' (giving the unbiased estimator of the
# population covariance), whereas princomp() uses the divisor 'N'. Such differences are negligible when the
# sample size is large.
# Let's see if we can reproduce the eigenvalues exactly
S2 <- S * (nrow(data)-1)/nrow(data) # use divisor 'N' instead of 'N-1'
t(A[,1]) %*% S2 %*% A[,1] # PC1 variance
## [,1]
## [1,] 26.53045
t(A[,2]) %*% S2 %*% A[,2] # PC2 variance
## [,1]
## [1,] 4.149553
# Note that the prcomp() function computes the variances with the usual divisor 'N-1'.
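We can cross-check these numbers directly; the sketch below (a verification step, not part of the original output) applies eigen() to the divisor-N matrix S2, which should reproduce princomp()'s eigenvalues, and calls prcomp(), whose divisor-(N-1) variances should match the t(A[,k]) %*% S %*% A[,k] values above.
eigen(S2)$values # eigen-decomposition of the divisor-N covariance matrix reproduces princomp()'s eigenvalues
prcomp(data)$sdev^2 # prcomp() uses the divisor 'N-1', so its component variances match the values computed from S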
The total variance of the two PCs is the sum of the eigenvalues (\(\lambda_1+\lambda_2\)), and this equals the total variance of the original variables, i.e., \(trace(\mathbf{S})\) (computed below with the same divisor \(N\)).
sum(eigen) # total variance of the two PCs
## [1] 30.68
sum(S2[1,1], S2[2,2]) # trace of the covariance matrix; that is the sum of the values on the main diagonal of the matrix
## [1] 30.68
Now we can compute the proportion of the “total variance” explained by each principal component by dividing the variance of that component by the total variance in the data. \[\text{Proportion of variance explained by } PC_i =\frac{\lambda_i}{(\lambda_1+\lambda_2)}=\frac{\lambda_i}{trace(\mathbf{S})}\]
(pr_1 <- eigen[1]/sum(eigen))
## Comp.1
## 0.8647473
(pr_2 <- eigen[2]/sum(eigen))
## Comp.2
## 0.1352527
summary(pca.cov)
## Importance of components:
## Comp.1 Comp.2
## Standard deviation 5.1507715 2.0370452
## Proportion of Variance 0.8647473 0.1352527
## Cumulative Proportion 0.8647473 1.0000000
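A scree plot shows the same information graphically; as an optional visual check (not part of the original output), the base screeplot() function can be applied to the fitted object.
screeplot(pca.cov, main = "Scree plot of pca.cov") # barplot of the component variances (eigenvalues)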
plot(x1, x2, type="n")
text(x1, x2, label=1:20)
abline(h=mean(x2), col="grey")
abline(v=mean(x1), col="grey")
# add PC lines
center <- pca.cov$center # variable means (the centroid); both PC axes pass through this point
slope1 <- A[2, 1]/A[1, 1] # slope of the PC1 axis in the (x1, x2) plane: a21/a11
a1 <- center[2]-slope1*center[1] # intercept that makes the PC1 line pass through the centroid
abline(c(a1, slope1), col="orange")
slope2 <- A[2, 2]/A[1, 2] # slope of the PC2 axis: a22/a12
a2 <- center[2]-slope2*center[1]
abline(c(a2, slope2), col="orange")
Let’s plot the points on the coordinate system derived by PCA.
plot(pca.cov$scores[,1], pca.cov$scores[,2], xlab="PC1", ylab="PC2", type="n")
text(pca.cov$scores[,1], pca.cov$scores[,2], label=1:20)
abline(h=0, col="grey")
abline(v=0, col="grey")
We can observe from the plot that the variation along \(PC_1\) is much larger than the variation along \(PC_2\).
Also, the PCs are centered at zero, which means the scores are obtained by mean-centering the original variables and then rotating them (note that the covariance matrix of the original variables and that of the mean-centered variables are identical).
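Both observations can be verified numerically; a quick sketch using the fitted object:
round(colMeans(pca.cov$scores), 10) # the mean of each PC score is (numerically) zero
apply(pca.cov$scores, 2, var) # var() uses divisor 'N-1', so these match the t(A[,k]) %*% S %*% A[,k] values above, with PC1's variance much larger than PC2's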
When calculating PC scores, we usually use mean-centered variable values: \[PC_1 = a_{11}(X_1 - \bar X_1)+a_{21}(X_2 - \bar X_2) \\ PC_2 = a_{12}(X_1 - \bar X_1)+a_{22}(X_2 - \bar X_2)\]
Let’s calculate PC scores for respondent 7.
plot(x1, x2, type="n")
text(x1, x2, label=1:20)
abline(h=mean(x2), col="grey")
abline(v=mean(x1), col="grey")
# add PC lines
center <- pca.cov$center
slope1 <- A[2, 1]/A[1, 1]
a1 <- center[2]-slope1*center[1]
abline(c(a1, slope1), col="orange")
slope2 <- A[2, 2]/A[1, 2]
a2 <- center[2]-slope2*center[1]
abline(c(a2, slope2), col="orange")
text(x1[7], x2[7]-0.8, label=paste("(", x1[7], ", ", x2[7], ")"), col="blue") # original variable values for respondent 7
text(mean(x1), 12, label=mean(x1), col="grey") # mean value of X1
text(3, mean(x2), label=mean(x2), col="grey") # mean value of X2
## Calculate PC scores for respondent 7
(z1_7 = A[1,1] * (x1[7] - mean(x1)) + A[2,1] * (x2[7] - mean(x2))) # PC1 score
## [1] -4.264024
(z2_7 = A[1,2] * (x1[7] - mean(x1)) + A[2,2] * (x2[7] - mean(x2))) # PC2 score
## [1] 2.817463
# the scores are identical to:
pca.cov$scores[7,]
## Comp.1 Comp.2
## -4.264024 2.817463
plot(pca.cov$scores[,1], pca.cov$scores[,2], xlab="PC1", ylab="PC2", type="n")
text(pca.cov$scores[,1], pca.cov$scores[,2], label=1:20)
abline(h=0, col="grey")
abline(v=0, col="grey")
text(z1_7, z2_7-0.5, label=paste("(", round(z1_7, 2), ", ", round(z2_7, 2), ")"), col="blue") # PC scores for respondent 7
We can calculate PC scores for all respondents in the same way.
# PC scores for all respondents
z1 = A[1,1]*(x1-mean(x1))+A[2,1]*(x2-mean(x2))
z2 = A[1,2]*(x1-mean(x1))+A[2,2]*(x2-mean(x2))
cbind(z1, z2)
## z1 z2
## [1,] 8.2525957 2.23934464
## [2,] 5.9698897 -2.20916670
## [3,] 3.3428671 3.18515926
## [4,] 2.8816499 1.84826784
## [5,] 1.1069214 3.20853940
## [6,] -2.4892959 1.45719104
## [7,] -4.2640244 2.81746260
## [8,] -10.0961875 1.08949438
## [9,] -6.9845676 -0.73199442
## [10,] -5.6476762 -1.19321165
## [11,] -2.5360562 -3.01470045
## [12,] 5.0942155 -0.41105805
## [13,] 10.9029984 -0.91903557
## [14,] -4.3107847 -1.65442889
## [15,] 2.3736724 -3.96051506
## [16,] 1.5213783 0.07353933
## [17,] 0.1844869 0.53475657
## [18,] 0.5989438 -2.60024350
## [19,] -4.2874046 0.58151685
## [20,] -1.6136218 -0.34091762
# or
pca.cov$scores
## Comp.1 Comp.2
## [1,] 8.2525957 2.23934464
## [2,] 5.9698897 -2.20916670
## [3,] 3.3428671 3.18515926
## [4,] 2.8816499 1.84826784
## [5,] 1.1069214 3.20853940
## [6,] -2.4892959 1.45719104
## [7,] -4.2640244 2.81746260
## [8,] -10.0961875 1.08949438
## [9,] -6.9845676 -0.73199442
## [10,] -5.6476762 -1.19321165
## [11,] -2.5360562 -3.01470045
## [12,] 5.0942155 -0.41105805
## [13,] 10.9029984 -0.91903557
## [14,] -4.3107847 -1.65442889
## [15,] 2.3736724 -3.96051506
## [16,] 1.5213783 0.07353933
## [17,] 0.1844869 0.53475657
## [18,] 0.5989438 -2.60024350
## [19,] -4.2874046 0.58151685
## [20,] -1.6136218 -0.34091762
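The same scores can also be obtained in a single matrix operation, multiplying the mean-centered data matrix by the loading matrix (a compact equivalent of the element-wise calculation above):
Z <- as.matrix(scale(data, center = TRUE, scale = FALSE)) %*% A # mean-center the data, then post-multiply by A
all.equal(unclass(Z), unclass(pca.cov$scores), check.attributes = FALSE) # should be TRUE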
One thing to highlight here is that the sign of the PC scores is arbitrary: flipping the sign of a coefficient vector simply flips the sign of the corresponding scores, so the sign of the coefficients is not uniquely determined either. It is customary (but not universal) to choose the signs so that the coefficients are positive (or so that the component is easier to interpret).
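As a small illustrative sketch (the object A_flip below is introduced only for this check), flipping the sign of the second loading vector leaves the eigenvalues, and hence the explained variances, unchanged:
A_flip <- unclass(A) # drop the 'loadings' class to get a plain matrix
A_flip[, 2] <- -A_flip[, 2] # flip the sign of the PC2 coefficients
zapsmall(t(A_flip) %*% S2 %*% A_flip) # same eigenvalues on the diagonal as before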
# original plot
plot(x1, x2, type="n")
text(x1, x2, label=1:20)
abline(h=mean(x2), col="grey")
abline(v=mean(x1), col="grey")
# add PC lines
center <- pca.cov$center
slope1 <- A[2, 1]/A[1, 1]
a1 <- center[2]-slope1*center[1]
abline(c(a1, slope1), col="orange")
slope2 <- A[2, 2]/A[1, 2]
a2 <- center[2]-slope2*center[1]
abline(c(a2, slope2), col="orange")
## opposite sign for PC2 score
plot(pca.cov$scores[,1], - pca.cov$scores[,2], xlab="PC1", ylab="PC2", type="n")
text(pca.cov$scores[,1], - pca.cov$scores[,2], label=1:20)
abline(h=0, col="grey")
abline(v=0, col="grey")