Chapter 8 Principal Component Analysis
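Principal Component Analysis (PCA) is a dimension-reduction technique that summarizes the correlated features of a dataset with a smaller set of uncorrelated components. It helps you understand how the features relate to one another and can be used in conjunction with cluster analysis.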
PCA is also a popular machine learning algorithm used for feature selection. Imagine that you have more than 100 features or factors: in that case it is useful to select the most important features for further analysis.
The basic idea when using PCA as a tool for feature selection is to select variables according to the magnitude (from largest to smallest in absolute values) of their coefficients (loadings).
# Load readr for read_csv() and read the segmentation survey data
# (221 rows, 22 numeric columns) straight from GitHub.
library(readr)
mydata <- read_csv(
  "https://raw.githubusercontent.com/utjimmyx/regression/master/Segmentation.csv"
)
## Rows: 221 Columns: 22
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## dbl (22): ID, Fashn, Price, Convnience, ShpTime, Fitness, Perceptn, ChNoise,...
##
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Fit the PCA on every column of mydata. Variables are centered (default)
# and, via `scale. = TRUE`, standardized to unit variance so that columns
# measured on larger scales do not dominate the components.
# NOTE(review): the ID column is included here, which is what the output
# below reflects; it looks like a row identifier (1..221) -- consider
# dropping it before running PCA.
pr.out <- prcomp(mydata, scale. = TRUE)
# List the elements stored in the fitted prcomp object.
names(pr.out)
## [1] "sdev" "rotation" "center" "scale" "x"
# Column means that were subtracted from each variable before the PCA.
pr.out$center
## ID Fashn Price Convnience ShpTime Fitness Perceptn
## 111.000000 3.203620 4.669683 3.764706 4.081448 5.723982 4.271493
## ChNoise RetailEx KnowdgStaf Brand4Slf Brand4Els Populr StrDisp
## 4.167421 5.271493 4.665158 3.755656 3.959276 3.656109 3.484163
## SaleStaf Fabric Cut Seam ShpOHngr ShpOBody Colrs
## 4.411765 5.393665 5.864253 5.072398 4.140271 6.298643 5.221719
## Match
## 3.986425
# Column standard deviations used to scale each variable before the PCA.
pr.out$scale
## ID Fashn Price Convnience ShpTime Fitness Perceptn
## 63.941379 1.612222 1.588148 1.768107 1.690258 1.116331 1.479727
## ChNoise RetailEx KnowdgStaf Brand4Slf Brand4Els Populr StrDisp
## 1.810080 1.303336 1.772545 1.849822 1.717071 1.860919 1.506342
## SaleStaf Fabric Cut Seam ShpOHngr ShpOBody Colrs
## 1.603305 1.342913 1.213424 1.666299 1.655091 1.070874 1.495654
## Match
## 1.927032
# Loadings matrix: one row per variable, one column per principal component.
pr.out$rotation
## PC1 PC2 PC3 PC4 PC5
## ID 0.232595096 -0.141728585 0.162142007 -0.21221088 0.33763990
## Fashn 0.215075203 0.008296644 0.236253036 -0.08331053 0.48190015
## Price -0.083110129 0.359970599 0.175602432 -0.04356591 0.07336974
## Convnience -0.129765928 0.282295545 0.254669340 -0.45500198 -0.18857851
## ShpTime -0.082123186 0.199061091 0.305618880 -0.46012182 -0.33109347
## Fitness 0.209268975 0.027707680 0.076243811 0.28686695 -0.04789778
## Perceptn 0.147760141 0.103104921 0.137993748 0.05434480 0.10732316
## ChNoise -0.003294572 -0.233316351 -0.120761865 -0.18335256 -0.30086801
## RetailEx 0.057200617 -0.419440757 0.027286117 -0.20367795 -0.05877606
## KnowdgStaf 0.035294012 -0.460685457 -0.009809578 -0.32728733 0.01675947
## Brand4Slf 0.239982873 -0.108116296 0.358603555 0.14934679 -0.27287281
## Brand4Els 0.234695661 -0.043562575 0.282722854 0.26864743 -0.30488902
## Populr 0.261932725 -0.199672179 0.304403056 0.16942853 -0.23625373
## StrDisp 0.229784086 0.104411685 0.233837720 -0.02217908 0.10337670
## SaleStaf 0.165776569 -0.238756050 -0.049230811 -0.25359804 0.01089464
## Fabric 0.308936323 0.107783147 -0.253300022 -0.03955053 -0.13576706
## Cut 0.288415407 0.138871804 -0.333556041 -0.09375959 -0.18391648
## Seam 0.279488386 0.198628322 -0.204774189 -0.03394160 -0.12093246
## ShpOHngr 0.189696058 0.258009558 0.078701383 -0.13760657 0.02856658
## ShpOBody 0.280336909 0.111337308 -0.299520162 -0.09505428 -0.14504102
## Colrs 0.280369645 0.076412285 -0.128959129 -0.13220865 0.12568307
## Match 0.295824141 0.020763617 0.018328003 -0.12239359 0.23783231
## PC6 PC7 PC8 PC9 PC10
## ID -0.09936768 0.14283046 -0.0685984991 0.253487657 -0.0224000183
## Fashn -0.01846548 0.13058533 -0.0525594170 0.316896079 0.1854043488
## Price 0.06197305 -0.37854795 -0.4726624609 -0.199677536 -0.0248230763
## Convnience 0.01923234 -0.05066530 0.0442101471 0.172426411 -0.0379037716
## ShpTime -0.15817949 0.13103289 0.0369622760 0.110483323 0.0191260527
## Fitness -0.47156341 -0.06818683 -0.0321992287 -0.148845298 0.1779744186
## Perceptn -0.66349501 -0.20454426 0.2054186561 -0.036881532 0.0607460350
## ChNoise -0.29174562 0.55876382 -0.1626992313 -0.321101581 0.0051380492
## RetailEx -0.07832437 -0.23094388 -0.5385279367 -0.001412293 0.1011347052
## KnowdgStaf 0.02651470 -0.31354256 0.0346217801 -0.113099321 0.2996221390
## Brand4Slf 0.10784271 0.07607115 0.1234309157 0.045644118 0.0517372832
## Brand4Els 0.23413148 -0.13830891 -0.1112940009 0.017464874 0.1655779689
## Populr 0.13667544 0.03673937 0.0256843848 0.121479068 -0.1841272877
## StrDisp -0.09228047 0.01314168 -0.1421089664 -0.280855783 -0.5711160996
## SaleStaf 0.04976306 -0.34203110 0.5092839842 -0.175056332 -0.3232488537
## Fabric 0.01792504 -0.01653396 -0.0963759364 0.093429467 -0.1515360618
## Cut -0.01603493 -0.04422843 0.0144540630 0.291912365 0.0009586715
## Seam -0.03147089 0.02685632 -0.0369481537 0.030189867 0.2824112316
## ShpOHngr 0.19687356 -0.01609979 0.2329164518 -0.463421702 0.4527997594
## ShpOBody -0.05454202 -0.20366974 -0.0848453570 0.214722272 -0.0726756190
## Colrs 0.13808572 0.06741453 -0.1572498313 -0.356737953 -0.1277118733
## Match 0.21797364 0.31064927 -0.0006717061 -0.066682878 0.0575566758
## PC11 PC12 PC13 PC14 PC15
## ID 0.033166682 -0.408770006 0.001943678 -0.403904619 0.34054465
## Fashn 0.066560605 -0.046790566 0.208258450 0.316642978 -0.37753143
## Price -0.061963457 -0.405869553 0.056356193 0.097200623 -0.22646745
## Convnience -0.171465336 0.131557974 -0.085157731 0.125963701 0.17651665
## ShpTime -0.098199023 0.141446370 0.121822252 -0.066631393 -0.08887428
## Fitness -0.456395748 0.100611447 0.375854428 0.117411067 0.32566676
## Perceptn 0.147829992 0.052127714 -0.480249459 0.063662314 -0.21152112
## ChNoise 0.081047361 -0.379700269 0.072471438 0.205275841 -0.16328647
## RetailEx 0.167566673 0.283827819 -0.056806705 0.036613594 0.07519052
## KnowdgStaf -0.040529232 0.050199951 -0.025188875 -0.044176912 0.02323119
## Brand4Slf -0.065761293 -0.140381996 -0.230008554 -0.328647157 -0.01976633
## Brand4Els 0.003899588 -0.070251263 0.131535920 0.161959180 0.01278092
## Populr 0.018594174 -0.005475979 -0.122842728 0.141039311 -0.21034439
## StrDisp 0.355532774 0.206560766 0.248220742 -0.126863446 0.19201862
## SaleStaf -0.089561083 -0.132730270 0.307898283 0.127950149 -0.18774518
## Fabric 0.213471404 0.147000835 -0.277750562 0.254019026 0.11223577
## Cut 0.151888028 -0.123929942 0.145813682 0.129176630 -0.01343941
## Seam 0.137358945 0.280605265 0.323045930 -0.475353392 -0.36004557
## ShpOHngr 0.360502093 -0.139893342 -0.070350868 0.077967635 0.26174726
## ShpOBody -0.202555418 -0.298738797 -0.085779424 -0.006378797 0.19159894
## Colrs -0.469305865 0.071038903 -0.296165230 -0.225445991 -0.28162046
## Match -0.259403018 0.272133779 -0.076411401 0.304727926 0.15374949
## PC16 PC17 PC18 PC19 PC20
## ID -0.334358269 0.079211449 -0.022892051 0.131664498 0.201859006
## Fashn 0.128189237 -0.249383298 -0.166399170 -0.002960108 -0.213226246
## Price 0.120156085 0.305228173 -0.121626972 0.010897363 0.107116583
## Convnience -0.290107083 -0.207061515 -0.131952487 -0.310698966 -0.337077658
## ShpTime 0.183326826 0.088203600 0.104871049 0.353350274 0.349860528
## Fitness 0.009239597 -0.073374849 -0.284228618 -0.057922386 0.115110069
## Perceptn -0.093398816 0.105759553 0.258625507 0.012641618 -0.054705646
## ChNoise -0.051527054 -0.004035871 0.005549466 -0.027056492 -0.174227358
## RetailEx -0.180115769 0.172562936 0.002434281 -0.334846847 0.024880448
## KnowdgStaf 0.385621724 -0.235183370 -0.014523542 0.291588946 -0.054911629
## Brand4Slf 0.319037221 0.280683171 -0.319571943 -0.156651395 -0.368810213
## Brand4Els -0.399419150 -0.072200257 0.372349545 0.400655587 -0.252849257
## Populr 0.036956915 -0.227755389 0.003507327 -0.276791236 0.568082137
## StrDisp 0.241681928 -0.182393025 0.136223709 0.025468922 -0.186454322
## SaleStaf -0.229219400 0.269749514 -0.089502920 -0.050642160 -0.032161008
## Fabric -0.083670603 0.079235023 -0.551575668 0.448169431 0.019941531
## Cut 0.072364853 -0.033534344 0.055378735 -0.215137661 0.078947847
## Seam -0.114057985 0.130428302 0.002132349 -0.067646872 -0.040502242
## ShpOHngr -0.012936451 -0.139691831 -0.035126926 -0.180973113 0.178636342
## ShpOBody 0.304653162 -0.134629472 0.341173909 -0.087295815 -0.129731545
## Colrs -0.186521138 -0.337620491 -0.024003416 0.035695864 0.022339337
## Match 0.142473081 0.517783039 0.296305392 -0.038728227 0.008693611
## PC21 PC22
## ID -0.09527772 -0.08769059
## Fashn 0.20842502 0.12539398
## Price -0.15290128 -0.15656574
## Convnience -0.14961407 -0.28634534
## ShpTime 0.17532725 0.30911080
## Fitness -0.02426357 0.01307766
## Perceptn -0.09959543 -0.02897106
## ChNoise -0.01238490 -0.14545390
## RetailEx 0.22009019 0.27043783
## KnowdgStaf -0.33239274 -0.25732199
## Brand4Slf -0.01296480 0.19796052
## Brand4Els -0.05211704 0.06849901
## Populr -0.01172779 -0.33566016
## StrDisp -0.08245944 -0.02010442
## SaleStaf 0.15040692 0.05563568
## Fabric 0.10893530 -0.12298692
## Cut -0.59797747 0.39992494
## Seam 0.05598012 -0.37953228
## ShpOHngr 0.17704597 0.11158359
## ShpOBody 0.49242100 -0.13243215
## Colrs -0.05991053 0.27545467
## Match -0.13768821 -0.16730109
# Dimensions of the score matrix (observations x principal components).
dim(pr.out[["x"]])
## [1] 221 22
# Biplot of the first two components; scale = 0 draws the arrows on the
# scale of the loadings.
biplot(pr.out, scale = 0)
# Principal components are only unique up to sign, so flipping the loadings
# and the scores together yields an equivalent solution (a mirrored biplot).
pr.out$rotation <- -pr.out$rotation
pr.out$x <- -pr.out$x
biplot(pr.out, scale = 0)
# Standard deviation of each principal component.
pr.out$sdev
## [1] 2.1719592 1.4895211 1.3821415 1.2386098 1.1277009 1.0828239 0.9932730
## [8] 0.9718161 0.9277673 0.9034626 0.8432543 0.8370234 0.7630835 0.7382999
## [15] 0.7202818 0.6662806 0.6394777 0.6300103 0.5969997 0.5817634 0.5432168
## [22] 0.5192567
# Variance explained by each component = squared standard deviation.
pr.var <- pr.out$sdev^2
pr.var
## [1] 4.7174066 2.2186732 1.9103151 1.5341542 1.2717093 1.1725077 0.9865913
## [8] 0.9444266 0.8607522 0.8162448 0.7110778 0.7006083 0.5822965 0.5450867
## [15] 0.5188059 0.4439298 0.4089318 0.3969129 0.3564086 0.3384487 0.2950845
## [22] 0.2696275
# Proportion of variance explained (PVE) by each component; sums to 1.
pve <- pr.var / sum(pr.var)
pve
## [1] 0.21442757 0.10084878 0.08683251 0.06973428 0.05780497 0.05329580
## [7] 0.04484506 0.04292848 0.03912510 0.03710203 0.03232172 0.03184583
## [13] 0.02646802 0.02477667 0.02358209 0.02017863 0.01858781 0.01804150
## [19] 0.01620039 0.01538403 0.01341293 0.01225580
# Scree plot: share of total variance captured by each component.
plot(
  pve,
  type = "b",
  ylim = c(0, 1),
  xlab = "Principal Component",
  ylab = "Proportion of Variance Explained"
)
# Running total of the variance share across components.
plot(
  cumsum(pve),
  type = "b",
  ylim = c(0, 1),
  xlab = "Principal Component",
  ylab = "Cumulative Proportion of Variance Explained"
)
#save your cluster solutions in the working directory
#We want to examine the cluster memberships for each observation - see last column of pca_data
8.1 References
Principal component analysis - reading (p.404-p.405) https://faculty.marshall.usc.edu/gareth-james/ISL/ISLR%20Seventh%20Printing.pdf
Principal Component Methods in R: Practical Guide http://www.sthda.com/english/articles/31-principal-component-methods-in-r-practical-guide/118-principal-component-analysis-in-r-prcomp-vs-princomp/
Interpretation of the Principal Components https://online.stat.psu.edu/stat505/lesson/11/11.4