Chapter 8 Principal Component Analysis

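Principal Component Analysis (PCA) summarizes the many, often correlated, features in a dataset with a smaller set of uncorrelated components, which helps us understand how the features relate to one another. It can be used in conjunction with cluster analysis.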

PCA is also a popular machine learning technique used for feature selection. Imagine you have more than 100 features or factors: in that situation it is useful to select the most important features for further analysis.

The basic idea when using PCA as a tool for feature selection is to select variables according to the magnitude (from largest to smallest in absolute values) of their coefficients (loadings).

library(readr)

# Read the segmentation survey data straight from GitHub into a tibble.
mydata <- read_csv(
  "https://raw.githubusercontent.com/utjimmyx/regression/master/Segmentation.csv"
)
## Rows: 221 Columns: 22
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## dbl (22): ID, Fashn, Price, Convnience, ShpTime, Fitness, Perceptn, ChNoise,...
## 
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Run PCA on every column of mydata. Variables are centered (prcomp's default)
# and, because scale. = TRUE, rescaled to unit variance before the rotation.
# The argument is written out in full as `scale.`; the shorter `scale` only
# works through partial argument matching.
# NOTE(review): ID looks like a row identifier (1..221) rather than a survey
# item, yet it is part of the PCA here; consider dropping it before prcomp().
pr.out <- prcomp(mydata, scale. = TRUE)
# List the components stored in the fitted prcomp object.
names(pr.out)
## [1] "sdev"     "rotation" "center"   "scale"    "x"
# Column means that prcomp() subtracted from each variable before the rotation.
pr.out$center
##         ID      Fashn      Price Convnience    ShpTime    Fitness   Perceptn 
## 111.000000   3.203620   4.669683   3.764706   4.081448   5.723982   4.271493 
##    ChNoise   RetailEx KnowdgStaf  Brand4Slf  Brand4Els     Populr    StrDisp 
##   4.167421   5.271493   4.665158   3.755656   3.959276   3.656109   3.484163 
##   SaleStaf     Fabric        Cut       Seam   ShpOHngr   ShpOBody      Colrs 
##   4.411765   5.393665   5.864253   5.072398   4.140271   6.298643   5.221719 
##      Match 
##   3.986425
# Column standard deviations that prcomp() divided each variable by when scaling.
pr.out$scale
##         ID      Fashn      Price Convnience    ShpTime    Fitness   Perceptn 
##  63.941379   1.612222   1.588148   1.768107   1.690258   1.116331   1.479727 
##    ChNoise   RetailEx KnowdgStaf  Brand4Slf  Brand4Els     Populr    StrDisp 
##   1.810080   1.303336   1.772545   1.849822   1.717071   1.860919   1.506342 
##   SaleStaf     Fabric        Cut       Seam   ShpOHngr   ShpOBody      Colrs 
##   1.603305   1.342913   1.213424   1.666299   1.655091   1.070874   1.495654 
##      Match 
##   1.927032
# Loadings matrix: one row per original variable, one column per principal
# component.
pr.out$rotation
##                     PC1          PC2          PC3         PC4         PC5
## ID          0.232595096 -0.141728585  0.162142007 -0.21221088  0.33763990
## Fashn       0.215075203  0.008296644  0.236253036 -0.08331053  0.48190015
## Price      -0.083110129  0.359970599  0.175602432 -0.04356591  0.07336974
## Convnience -0.129765928  0.282295545  0.254669340 -0.45500198 -0.18857851
## ShpTime    -0.082123186  0.199061091  0.305618880 -0.46012182 -0.33109347
## Fitness     0.209268975  0.027707680  0.076243811  0.28686695 -0.04789778
## Perceptn    0.147760141  0.103104921  0.137993748  0.05434480  0.10732316
## ChNoise    -0.003294572 -0.233316351 -0.120761865 -0.18335256 -0.30086801
## RetailEx    0.057200617 -0.419440757  0.027286117 -0.20367795 -0.05877606
## KnowdgStaf  0.035294012 -0.460685457 -0.009809578 -0.32728733  0.01675947
## Brand4Slf   0.239982873 -0.108116296  0.358603555  0.14934679 -0.27287281
## Brand4Els   0.234695661 -0.043562575  0.282722854  0.26864743 -0.30488902
## Populr      0.261932725 -0.199672179  0.304403056  0.16942853 -0.23625373
## StrDisp     0.229784086  0.104411685  0.233837720 -0.02217908  0.10337670
## SaleStaf    0.165776569 -0.238756050 -0.049230811 -0.25359804  0.01089464
## Fabric      0.308936323  0.107783147 -0.253300022 -0.03955053 -0.13576706
## Cut         0.288415407  0.138871804 -0.333556041 -0.09375959 -0.18391648
## Seam        0.279488386  0.198628322 -0.204774189 -0.03394160 -0.12093246
## ShpOHngr    0.189696058  0.258009558  0.078701383 -0.13760657  0.02856658
## ShpOBody    0.280336909  0.111337308 -0.299520162 -0.09505428 -0.14504102
## Colrs       0.280369645  0.076412285 -0.128959129 -0.13220865  0.12568307
## Match       0.295824141  0.020763617  0.018328003 -0.12239359  0.23783231
##                    PC6         PC7           PC8          PC9          PC10
## ID         -0.09936768  0.14283046 -0.0685984991  0.253487657 -0.0224000183
## Fashn      -0.01846548  0.13058533 -0.0525594170  0.316896079  0.1854043488
## Price       0.06197305 -0.37854795 -0.4726624609 -0.199677536 -0.0248230763
## Convnience  0.01923234 -0.05066530  0.0442101471  0.172426411 -0.0379037716
## ShpTime    -0.15817949  0.13103289  0.0369622760  0.110483323  0.0191260527
## Fitness    -0.47156341 -0.06818683 -0.0321992287 -0.148845298  0.1779744186
## Perceptn   -0.66349501 -0.20454426  0.2054186561 -0.036881532  0.0607460350
## ChNoise    -0.29174562  0.55876382 -0.1626992313 -0.321101581  0.0051380492
## RetailEx   -0.07832437 -0.23094388 -0.5385279367 -0.001412293  0.1011347052
## KnowdgStaf  0.02651470 -0.31354256  0.0346217801 -0.113099321  0.2996221390
## Brand4Slf   0.10784271  0.07607115  0.1234309157  0.045644118  0.0517372832
## Brand4Els   0.23413148 -0.13830891 -0.1112940009  0.017464874  0.1655779689
## Populr      0.13667544  0.03673937  0.0256843848  0.121479068 -0.1841272877
## StrDisp    -0.09228047  0.01314168 -0.1421089664 -0.280855783 -0.5711160996
## SaleStaf    0.04976306 -0.34203110  0.5092839842 -0.175056332 -0.3232488537
## Fabric      0.01792504 -0.01653396 -0.0963759364  0.093429467 -0.1515360618
## Cut        -0.01603493 -0.04422843  0.0144540630  0.291912365  0.0009586715
## Seam       -0.03147089  0.02685632 -0.0369481537  0.030189867  0.2824112316
## ShpOHngr    0.19687356 -0.01609979  0.2329164518 -0.463421702  0.4527997594
## ShpOBody   -0.05454202 -0.20366974 -0.0848453570  0.214722272 -0.0726756190
## Colrs       0.13808572  0.06741453 -0.1572498313 -0.356737953 -0.1277118733
## Match       0.21797364  0.31064927 -0.0006717061 -0.066682878  0.0575566758
##                    PC11         PC12         PC13         PC14        PC15
## ID          0.033166682 -0.408770006  0.001943678 -0.403904619  0.34054465
## Fashn       0.066560605 -0.046790566  0.208258450  0.316642978 -0.37753143
## Price      -0.061963457 -0.405869553  0.056356193  0.097200623 -0.22646745
## Convnience -0.171465336  0.131557974 -0.085157731  0.125963701  0.17651665
## ShpTime    -0.098199023  0.141446370  0.121822252 -0.066631393 -0.08887428
## Fitness    -0.456395748  0.100611447  0.375854428  0.117411067  0.32566676
## Perceptn    0.147829992  0.052127714 -0.480249459  0.063662314 -0.21152112
## ChNoise     0.081047361 -0.379700269  0.072471438  0.205275841 -0.16328647
## RetailEx    0.167566673  0.283827819 -0.056806705  0.036613594  0.07519052
## KnowdgStaf -0.040529232  0.050199951 -0.025188875 -0.044176912  0.02323119
## Brand4Slf  -0.065761293 -0.140381996 -0.230008554 -0.328647157 -0.01976633
## Brand4Els   0.003899588 -0.070251263  0.131535920  0.161959180  0.01278092
## Populr      0.018594174 -0.005475979 -0.122842728  0.141039311 -0.21034439
## StrDisp     0.355532774  0.206560766  0.248220742 -0.126863446  0.19201862
## SaleStaf   -0.089561083 -0.132730270  0.307898283  0.127950149 -0.18774518
## Fabric      0.213471404  0.147000835 -0.277750562  0.254019026  0.11223577
## Cut         0.151888028 -0.123929942  0.145813682  0.129176630 -0.01343941
## Seam        0.137358945  0.280605265  0.323045930 -0.475353392 -0.36004557
## ShpOHngr    0.360502093 -0.139893342 -0.070350868  0.077967635  0.26174726
## ShpOBody   -0.202555418 -0.298738797 -0.085779424 -0.006378797  0.19159894
## Colrs      -0.469305865  0.071038903 -0.296165230 -0.225445991 -0.28162046
## Match      -0.259403018  0.272133779 -0.076411401  0.304727926  0.15374949
##                    PC16         PC17         PC18         PC19         PC20
## ID         -0.334358269  0.079211449 -0.022892051  0.131664498  0.201859006
## Fashn       0.128189237 -0.249383298 -0.166399170 -0.002960108 -0.213226246
## Price       0.120156085  0.305228173 -0.121626972  0.010897363  0.107116583
## Convnience -0.290107083 -0.207061515 -0.131952487 -0.310698966 -0.337077658
## ShpTime     0.183326826  0.088203600  0.104871049  0.353350274  0.349860528
## Fitness     0.009239597 -0.073374849 -0.284228618 -0.057922386  0.115110069
## Perceptn   -0.093398816  0.105759553  0.258625507  0.012641618 -0.054705646
## ChNoise    -0.051527054 -0.004035871  0.005549466 -0.027056492 -0.174227358
## RetailEx   -0.180115769  0.172562936  0.002434281 -0.334846847  0.024880448
## KnowdgStaf  0.385621724 -0.235183370 -0.014523542  0.291588946 -0.054911629
## Brand4Slf   0.319037221  0.280683171 -0.319571943 -0.156651395 -0.368810213
## Brand4Els  -0.399419150 -0.072200257  0.372349545  0.400655587 -0.252849257
## Populr      0.036956915 -0.227755389  0.003507327 -0.276791236  0.568082137
## StrDisp     0.241681928 -0.182393025  0.136223709  0.025468922 -0.186454322
## SaleStaf   -0.229219400  0.269749514 -0.089502920 -0.050642160 -0.032161008
## Fabric     -0.083670603  0.079235023 -0.551575668  0.448169431  0.019941531
## Cut         0.072364853 -0.033534344  0.055378735 -0.215137661  0.078947847
## Seam       -0.114057985  0.130428302  0.002132349 -0.067646872 -0.040502242
## ShpOHngr   -0.012936451 -0.139691831 -0.035126926 -0.180973113  0.178636342
## ShpOBody    0.304653162 -0.134629472  0.341173909 -0.087295815 -0.129731545
## Colrs      -0.186521138 -0.337620491 -0.024003416  0.035695864  0.022339337
## Match       0.142473081  0.517783039  0.296305392 -0.038728227  0.008693611
##                   PC21        PC22
## ID         -0.09527772 -0.08769059
## Fashn       0.20842502  0.12539398
## Price      -0.15290128 -0.15656574
## Convnience -0.14961407 -0.28634534
## ShpTime     0.17532725  0.30911080
## Fitness    -0.02426357  0.01307766
## Perceptn   -0.09959543 -0.02897106
## ChNoise    -0.01238490 -0.14545390
## RetailEx    0.22009019  0.27043783
## KnowdgStaf -0.33239274 -0.25732199
## Brand4Slf  -0.01296480  0.19796052
## Brand4Els  -0.05211704  0.06849901
## Populr     -0.01172779 -0.33566016
## StrDisp    -0.08245944 -0.02010442
## SaleStaf    0.15040692  0.05563568
## Fabric      0.10893530 -0.12298692
## Cut        -0.59797747  0.39992494
## Seam        0.05598012 -0.37953228
## ShpOHngr    0.17704597  0.11158359
## ShpOBody    0.49242100 -0.13243215
## Colrs      -0.05991053  0.27545467
## Match      -0.13768821 -0.16730109
# Dimensions of the score matrix x (observations by principal components).
dim(pr.out$x)
## [1] 221  22
# Biplot of the first two components; scale = 0 leaves the variable arrows
# unscaled so they represent the loadings.
biplot(pr.out, scale = 0)

# Principal components are only determined up to sign, so negating both the
# loadings and the scores gives an equivalent solution with a mirrored plot.
pr.out$rotation <- -pr.out$rotation
pr.out$x <- -pr.out$x
biplot(pr.out, scale = 0)

# Standard deviation of each principal component, in decreasing order.
pr.out$sdev
##  [1] 2.1719592 1.4895211 1.3821415 1.2386098 1.1277009 1.0828239 0.9932730
##  [8] 0.9718161 0.9277673 0.9034626 0.8432543 0.8370234 0.7630835 0.7382999
## [15] 0.7202818 0.6662806 0.6394777 0.6300103 0.5969997 0.5817634 0.5432168
## [22] 0.5192567
# Variance of each principal component: the square of its standard deviation.
pr.var <- pr.out$sdev^2
pr.var
##  [1] 4.7174066 2.2186732 1.9103151 1.5341542 1.2717093 1.1725077 0.9865913
##  [8] 0.9444266 0.8607522 0.8162448 0.7110778 0.7006083 0.5822965 0.5450867
## [15] 0.5188059 0.4439298 0.4089318 0.3969129 0.3564086 0.3384487 0.2950845
## [22] 0.2696275
# Proportion of variance explained (PVE): each component's variance as a share
# of the total variance across all components.
pve <- pr.var / sum(pr.var)
pve
##  [1] 0.21442757 0.10084878 0.08683251 0.06973428 0.05780497 0.05329580
##  [7] 0.04484506 0.04292848 0.03912510 0.03710203 0.03232172 0.03184583
## [13] 0.02646802 0.02477667 0.02358209 0.02017863 0.01858781 0.01804150
## [19] 0.01620039 0.01538403 0.01341293 0.01225580
# Scree plot: proportion of variance explained by each principal component.
plot(
  pve,
  xlab = "Principal Component",
  ylab = "Proportion of Variance Explained",
  ylim = c(0, 1),
  type = "b"
)

# Running total of the proportion of variance explained.
plot(
  cumsum(pve),
  xlab = "Principal Component",
  ylab = "Cumulative Proportion of Variance Explained",
  ylim = c(0, 1),
  type = "b"
)

#save your cluster solutions in the working directory
#We want to examine the cluster memberships for each observation - see last column of pca_data