Chapter 4 多元数据分析 - 聚类,降维
对变量的聚类可以讲含有相同信息的变量聚为同一个族类
当我们有大量的变量的时候,这种方法可以很好的用于进行降维。同样可以用于降维的方法还有主成分分析和因子分析。
Model_data1$app1 <- as.factor(Model_data1$app1)
Model_data1$label <- as.factor(Model_data1$label)
Model_data1$地址种类 <- as.factor(Model_data1$地址种类)
Model_data1$发货方式 <- as.factor(Model_data1$发货方式)
Model_data1$用户性别 <- as.factor(Model_data1$用户性别)
Model_data1$州 <- as.factor(Model_data1$州)
factors <- sapply(Model_data1, is.factor)
#subset Qualitative variables
vars_quali <- Model_data1 %>% select(names(Model_data1)[factors])
#vars_quali$good_bad_21<-vars_quali$good_bad_21[drop=TRUE] # remove empty factors
str(vars_quali)
## Classes 'data.table' and 'data.frame': 322715 obs. of 6 variables:
## $ 地址种类: Factor w/ 6 levels "Inappropriate",..: 6 6 4 6 4 6 6 6 4 6 ...
## $ app1 : Factor w/ 71 levels "android_2.33",..: 67 42 45 40 42 67 42 32 42 29 ...
## $ 发货方式: Factor w/ 3 levels "Delhivery","Ecom",..: 1 1 2 2 1 1 1 2 2 1 ...
## $ 用户性别: Factor w/ 3 levels "men","not set",..: 3 3 1 3 1 3 3 3 3 1 ...
## $ 州 : Factor w/ 70 levels "Andaman and Nicobar Islands",..: 59 59 38 38 29 29 29 14 29 29 ...
## $ label : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## - attr(*, ".internal.selfref")=<externalptr>
#subset Quantitative variables
vars_quanti <- Model_data1 %>% select(names(Model_data1)[!factors])
str(vars_quanti)
## Classes 'data.table' and 'data.frame': 322715 obs. of 6 variables:
## $ 下单与付款时间间隔: num 19.5 16.9 17.4 16.9 19.6 ...
## $ cod运费 : num 1.55 1.55 1.55 1.55 1.55 1.55 1.55 1.55 1.55 1.55 ...
## $ 修改后金额 : num 5.6 6.92 10.32 4.67 10.26 ...
## $ 原始来单金额 : num 5.6 6.92 10.32 4.67 10.26 ...
## $ 金额差异 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ 付款到派送 : num 2.71 -0.477 -0.151 -0.127 -0.17 ...
## - attr(*, ".internal.selfref")=<externalptr>
4.1 6 变量的层次聚类
tree <- hclustvar(X.quanti=vars_quanti,X.quali=vars_quali)
par(family='STKaiti')
plot(tree, main="variable clustering")
rect.hclust(tree, k=8, border = 1:8)
summary(tree)
## Length Class Mode
## call 3 -none- call
## rec 16 -none- list
## init 12 -none- numeric
## merge 22 -none- numeric
## height 11 -none- numeric
## order 12 -none- numeric
## labels 12 -none- character
## clusmat 144 -none- numeric
## X.quanti 6 data.table list
## X.quali 6 data.table list
# Phylogenetic trees
# require library("ape")
par(family='STKaiti')
plot(as.phylo(tree), type = "fan",
tip.color = hsv(runif(15, 0.65, 0.95), 1, 1, 0.7),
edge.color = hsv(runif(10, 0.65, 0.75), 1, 1, 0.7),
edge.width = runif(20, 0.5, 3), use.edge.length = TRUE, col = "gray80")
summary.phylo(as.phylo(tree))
##
## Phylogenetic tree: as.phylo(tree)
##
## Number of tips: 12
## Number of nodes: 11
## Branch lengths:
## mean: 0.2498154
## variance: 0.02762882
## distribution summary:
## Min. 1st Qu. Median 3rd Qu. Max.
## 0.01203149 0.11483605 0.24931255 0.40189405 0.49995107
## No root edge.
## First ten tip labels: 下单与付款时间间隔
## cod运费
## 修改后金额
## 原始来单金额
## 金额差异
## 付款到派送
## 地址种类
## app1
## 发货方式
## 用户性别
## No node labels.
part<-cutreevar(tree,8)
print(part)
##
## Call:
## cutreevar(obj = tree, k = 8)
##
##
##
## name
## "$var"
## "$sim"
## "$cluster"
## "$wss"
## "$E"
## "$size"
## "$scores"
## "$coef"
## description
## "list of variables in each cluster"
## "similarity matrix in each cluster"
## "cluster memberships"
## "within-cluster sum of squares"
## "gain in cohesion (in %)"
## "size of each cluster"
## "synthetic score of each cluster"
## "coef of the linear combinations defining the synthetic scores of each cluster"
summary(part)
##
## Call:
## cutreevar(obj = tree, k = 8)
##
##
##
## Data:
## number of observations: 322715
## number of variables: 12
## number of numerical variables: 6
## number of categorical variables: 6
## number of clusters: 8
##
## Cluster 1 :
## squared loading correlation
## 1 1
##
##
## Cluster 2 :
## squared loading correlation
## 修改后金额 0.93 -0.96
## 原始来单金额 0.92 -0.96
## cod运费 0.65 -0.81
##
##
## Cluster 3 :
## squared loading correlation
## 1 1
##
##
## Cluster 4 :
## squared loading correlation
## 州 0.68 NA
## 付款到派送 0.56 -0.75
## 发货方式 0.44 NA
##
##
## Cluster 5 :
## squared loading correlation
## 1 NA
##
##
## Cluster 6 :
## squared loading correlation
## 1 NA
##
##
## Cluster 7 :
## squared loading correlation
## 1 NA
##
##
## Cluster 8 :
## squared loading correlation
## 1 NA
##
##
## Gain in cohesion (in %): 80.38
4.2 7 通过聚类选取部分变量
# cod运费
# 付款到派送
# keep<- c(1,2,3,4,7,8,10,12)
cdata_reduced_2 <- Model_data1 # %>% select(keep)
str(cdata_reduced_2)
## Classes 'data.table' and 'data.frame': 322715 obs. of 12 variables:
## $ 地址种类 : Factor w/ 6 levels "Inappropriate",..: 6 6 4 6 4 6 6 6 4 6 ...
## $ app1 : Factor w/ 71 levels "android_2.33",..: 67 42 45 40 42 67 42 32 42 29 ...
## $ 下单与付款时间间隔: num 19.5 16.9 17.4 16.9 19.6 ...
## $ cod运费 : num 1.55 1.55 1.55 1.55 1.55 1.55 1.55 1.55 1.55 1.55 ...
## $ 修改后金额 : num 5.6 6.92 10.32 4.67 10.26 ...
## $ 原始来单金额 : num 5.6 6.92 10.32 4.67 10.26 ...
## $ 金额差异 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ 付款到派送 : num 2.71 -0.477 -0.151 -0.127 -0.17 ...
## $ 发货方式 : Factor w/ 3 levels "Delhivery","Ecom",..: 1 1 2 2 1 1 1 2 2 1 ...
## $ 用户性别 : Factor w/ 3 levels "men","not set",..: 3 3 1 3 1 3 3 3 3 1 ...
## $ 州 : Factor w/ 70 levels "Andaman and Nicobar Islands",..: 59 59 38 38 29 29 29 14 29 29 ...
## $ label : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## - attr(*, ".internal.selfref")=<externalptr>