第 3 章 农村人力资本投资
劳动力流动、家庭收入与农村人力资本投资
library(tidyverse)
library(here)
library(fs)
library(haven)
library(broom)
3.1 数据导入
我们选取了北京大学开放数据平台中的中国家庭追踪调查CFPS2的2014年数据。
cfps2014adult <- read_dta("../data/2014AllData/Cfps2014famecon_170630.dta",
encoding = "GB2312"
)
3.2 数据探索
# colnames(cfps2014adult)
3.3 选取变量
研究劳动力流动、家庭收入与农村人力资本投资的关系3,因此我们选取了…
vars_select <- c(
"fid14", "provcd14", "fa1",
"fa7_s_1", "fa7_s_2", "fa7_s_3", "fk1l",
"fn1_s_1", "fn1_s_2", "fn1_s_3", "fn1_s_4",
"fn101", "fn2", "fn3", "fo1", "fp510",
"fincome1_per"
)
cfps2014adult %>% select(vars_select) %>% glimpse()
## Observations: 13,946
## Variables: 17
## $ fid14 <dbl+lbl> 100051, 100125, 100160, 1...
## $ provcd14 <dbl+lbl> 11, 11, 12, 13, 13, 13, 4...
## $ fa1 <dbl+lbl> 1, 1, 1, 1, 5, 5, 1, 5, 5...
## $ fa7_s_1 <dbl+lbl> 78, 78, 78, 78, 78, NA, 7...
## $ fa7_s_2 <dbl+lbl> -8, -8, -8, -8, -8, NA, -...
## $ fa7_s_3 <dbl+lbl> -8, -8, -8, -8, -8, NA, -...
## $ fk1l <dbl+lbl> 0, 0, 0, 1, 1, 0, 0, 0, 0...
## $ fn1_s_1 <dbl+lbl> 78, 78, 78, 78, 78, 3, 78...
## $ fn1_s_2 <dbl+lbl> -8, -8, -8, -8, -8, -8, -...
## $ fn1_s_3 <dbl+lbl> -8, -8, -8, -8, -8, -8, -...
## $ fn1_s_4 <dbl+lbl> -8, -8, -8, -8, -8, -8, -...
## $ fn101 <dbl+lbl> -8, -8, -8, -8, -8, -1, -...
## $ fn2 <dbl+lbl> 0, 0, 0, 0, 0, 0, 0, 0, 1...
## $ fn3 <dbl+lbl> 5, 5, 5, 5, 5, NA, 1, 5, ...
## $ fo1 <dbl+lbl> 0, 0, 0, 1, 1, NA, 0, 0, ...
## $ fp510 <dbl+lbl> 5000, 3000, 0, 0, 0, 1000...
## $ fincome1_per <dbl+lbl> 23333, 400000, 30000, 440...
3.4 获取标签
library(purrr)
get_var_label <- function(dta) {
labels <- map(dta, function(x) attr(x, "label"))
data_frame(
name = names(labels),
label = as.character(labels)
)
}
cfps2014adult %>% select(vars_select) %>% get_var_label()
## # A tibble: 17 x 2
## name label
## <chr> <chr>
## 1 fid14 2014年家户号
## 2 provcd14 省国标码
## 3 fa1 社区性质
## 4 fa7_s_1 住房困难
## 5 fa7_s_2 住房困难
## 6 fa7_s_3 住房困难
## 7 fk1l 是否从事农业工作
## 8 fn1_s_1 您家收到哪些政府补助-选择1
## 9 fn1_s_2 您家收到哪些政府补助-选择2
## 10 fn1_s_3 您家收到哪些政府补助-选择3
## 11 fn1_s_4 您家收到哪些政府补助-选择4
## 12 fn101 政府补助总额(元)
## 13 fn2 是否收到社会捐助
## 14 fn3 领取离退休或养老金
## 15 fo1 作农活或外出打工
## 16 fp510 过去12个月教育培训支出(元)
## 17 fincome1_per 人均家庭纯收入
这里多选题 fa7_s_2
, fa7_s_3
, fn1_s_2
, fn1_s_3
, fn1_s_4
有没有用到呢? 还是看看原始的调查问卷比较好。
3.5 数据统计
cfps2014adult %>%
summarise_all(n_distinct) %>%
tidyr::gather(vars_name, num_distinct_answers) %>%
arrange(desc(num_distinct_answers))
## # A tibble: 459 x 2
## vars_name num_distinct_answers
## <chr> <int>
## 1 fid14 13946
## 2 fresp1pid 13939
## 3 fz1pid 13932
## 4 pid_a_1 13893
## 5 fid12 12711
## 6 pid_a_2 12697
## 7 fid10 12292
## 8 total_asset 11538
## 9 fswt_natcs14 11198
## 10 fswt_natpn1014 11116
## # ... with 449 more rows
cfps2014adult %>%
select(vars_select) %>%
map(~ count(data.frame(x = .x), x))
## $fid14
## # A tibble: 13,946 x 2
## x n
## <dbl+lbl> <int>
## 1 100051 1
## 2 100125 1
## 3 100160 1
## 4 100286 1
## 5 100376 1
## 6 100435 1
## 7 100453 1
## 8 100551 1
## 9 100569 1
## 10 100724 1
## # ... with 13,936 more rows
##
## $provcd14
## # A tibble: 29 x 2
## x n
## <dbl+lbl> <int>
## 1 11 142
## 2 12 98
## 3 13 775
## 4 14 596
## 5 15 6
## 6 21 1381
## 7 22 271
## 8 23 474
## 9 31 1011
## 10 32 296
## # ... with 19 more rows
##
## $fa1
## # A tibble: 3 x 2
## x n
## <dbl+lbl> <int>
## 1 -1 2
## 2 1 4180
## 3 5 9764
##
## $fa7_s_1
## # A tibble: 9 x 2
## x n
## <dbl+lbl> <int>
## 1 -1 1
## 2 1 714
## 3 2 482
## 4 3 97
## 5 4 60
## 6 5 255
## 7 77 614
## 8 78 11182
## 9 NA 541
##
## $fa7_s_2
## # A tibble: 8 x 2
## x n
## <dbl+lbl> <int>
## 1 -8 13016
## 2 1 24
## 3 2 121
## 4 3 103
## 5 4 23
## 6 5 88
## 7 77 30
## 8 NA 541
##
## $fa7_s_3
## # A tibble: 8 x 2
## x n
## <dbl+lbl> <int>
## 1 -8 13269
## 2 1 3
## 3 2 7
## 4 3 38
## 5 4 12
## 6 5 48
## 7 77 28
## 8 NA 541
##
## $fk1l
## # A tibble: 2 x 2
## x n
## <dbl+lbl> <int>
## 1 0 7131
## 2 1 6815
##
## $fn1_s_1
## # A tibble: 10 x 2
## x n
## <dbl+lbl> <int>
## 1 -1 5
## 2 1 1458
## 3 2 775
## 4 3 4265
## 5 4 41
## 6 5 57
## 7 6 9
## 8 7 38
## 9 77 326
## 10 78 6972
##
## $fn1_s_2
## # A tibble: 9 x 2
## x n
## <dbl+lbl> <int>
## 1 -8 12233
## 2 1 54
## 3 2 308
## 4 3 1136
## 5 4 36
## 6 5 30
## 7 6 5
## 8 7 37
## 9 77 107
##
## $fn1_s_3
## # A tibble: 9 x 2
## x n
## <dbl+lbl> <int>
## 1 -8 13616
## 2 1 10
## 3 2 10
## 4 3 204
## 5 4 9
## 6 5 26
## 7 6 7
## 8 7 20
## 9 77 44
##
## $fn1_s_4
## # A tibble: 7 x 2
## x n
## <dbl+lbl> <int>
## 1 -8 13902
## 2 2 2
## 3 4 5
## 4 5 5
## 5 6 1
## 6 7 10
## 7 77 21
##
## $fn101
## # A tibble: 551 x 2
## x n
## <dbl+lbl> <int>
## 1 -8 6977
## 2 -2 1
## 3 -1 178
## 4 0 3
## 5 1 6
## 6 4 2
## 7 5 9
## 8 6 1
## 9 8 1
## 10 9 1
## # ... with 541 more rows
##
## $fn2
## # A tibble: 3 x 2
## x n
## <dbl+lbl> <int>
## 1 -1 2
## 2 0 13765
## 3 1 179
##
## $fn3
## # A tibble: 4 x 2
## x n
## <dbl+lbl> <int>
## 1 -1 1
## 2 1 4814
## 3 5 8590
## 4 NA 541
##
## $fo1
## # A tibble: 3 x 2
## x n
## <dbl+lbl> <int>
## 1 0 8032
## 2 1 5373
## 3 NA 541
##
## $fp510
## # A tibble: 295 x 2
## x n
## <dbl+lbl> <int>
## 1 -2 1
## 2 -1 85
## 3 0 7273
## 4 9 1
## 5 10 2
## 6 20 3
## 7 30 2
## 8 40 1
## 9 50 10
## 10 55 1
## # ... with 285 more rows
##
## $fincome1_per
## # A tibble: 5,789 x 2
## x n
## <dbl+lbl> <int>
## 1 0.2500 1
## 2 0.5000 1
## 3 0.8333 1
## 4 1.6667 1
## 5 2.6667 1
## 6 3.3333 2
## 7 5.0000 2
## 8 6.2500 1
## 9 8.0000 2
## 10 9.0000 1
## # ... with 5,779 more rows
# map_if(is.character, ~count(data.frame(x = .x), x) )
3.6 缺失值
library(naniar)
cfps2014adult %>%
select(vars_select) %>%
miss_var_summary()
## # A tibble: 17 x 3
## variable n_miss pct_miss
## <chr> <int> <dbl>
## 1 fincome1_per 1245 8.93
## 2 fa7_s_1 541 3.88
## 3 fa7_s_2 541 3.88
## 4 fa7_s_3 541 3.88
## 5 fn3 541 3.88
## 6 fo1 541 3.88
## 7 fid14 0 0
## 8 provcd14 0 0
## 9 fa1 0 0
## 10 fk1l 0 0
## 11 fn1_s_1 0 0
## 12 fn1_s_2 0 0
## 13 fn1_s_3 0 0
## 14 fn1_s_4 0 0
## 15 fn101 0 0
## 16 fn2 0 0
## 17 fp510 0 0
library(visdat)
cfps2014adult %>%
select(vars_select) %>%
vis_dat()
3.7 数据规整
我们是这样做数据规整和清洗的:
- 问卷题目
- 变量统计
- 信息提取
- 代码实现
tb <- cfps2014adult %>%
select(vars_select)
3.7.1 fid14
tb %>% count(fid14)
## # A tibble: 13,946 x 2
## fid14 n
## <dbl+lbl> <int>
## 1 100051 1
## 2 100125 1
## 3 100160 1
## 4 100286 1
## 5 100376 1
## 6 100435 1
## 7 100453 1
## 8 100551 1
## 9 100569 1
## 10 100724 1
## # ... with 13,936 more rows
数据清洗: 检查是否有重复的
#tb %>% distinct(fid14, .keep_all = TRUE)
tb %>%
group_by(fid14) %>%
filter(n() > 1)
## # A tibble: 0 x 17
## # Groups: fid14 [0]
## # ... with 17 variables: fid14 <dbl+lbl>,
## # provcd14 <dbl+lbl>, fa1 <dbl+lbl>,
## # fa7_s_1 <dbl+lbl>, fa7_s_2 <dbl+lbl>,
## # fa7_s_3 <dbl+lbl>, fk1l <dbl+lbl>,
## # fn1_s_1 <dbl+lbl>, fn1_s_2 <dbl+lbl>,
## # fn1_s_3 <dbl+lbl>, fn1_s_4 <dbl+lbl>,
## # fn101 <dbl+lbl>, fn2 <dbl+lbl>, fn3 <dbl+lbl>,
## # fo1 <dbl+lbl>, fp510 <dbl+lbl>,
## # fincome1_per <dbl+lbl>
3.7.2 provcd14
tb %>% count(provcd14)
## # A tibble: 29 x 2
## provcd14 n
## <dbl+lbl> <int>
## 1 11 142
## 2 12 98
## 3 13 775
## 4 14 596
## 5 15 6
## 6 21 1381
## 7 22 271
## 8 23 474
## 9 31 1011
## 10 32 296
## # ... with 19 more rows
数据清洗: 创建虚拟变量
# 虚拟变量
fastDummies::dummy_cols("provcd14")
3.7.3 fa1
tb %>% count(fa1)
## # A tibble: 3 x 2
## fa1 n
## <dbl+lbl> <int>
## 1 -1 2
## 2 1 4180
## 3 5 9764
数据清洗: 过滤符合情形的fa1 == 5
filter(fa1 == 5)
3.7.4 fa7
变量 fa7_s_2
, fa7_s_3
没有用到? 还是看看原始的调查问卷比较好。问卷调查是这样设计的: 这道题是多选题,最多选3项。因此就会出现(1,2,3),(2,4,6), (1,4,5) …这答案,分别用 fa7_s_1
, fa7_s_2
, fa7_s_3
存放。如果少于三项(缺失的、只选1项或者2项的)就左对齐依次存放, 即从fa7_s_1
开始存放。
fa7_s_1 | fa7_s_2 | fa7_s_3 |
---|---|---|
78 | ||
2 | ||
2 | 3 | |
1 | 2 | 3 |
4 | 5 | 6 |
1 | 3 | 6 |
也就是说,我们可以是否有住房困难,只需要通过fa7_s_1
即可。
tb %>% count(fa7_s_1)
## # A tibble: 9 x 2
## fa7_s_1 n
## <dbl+lbl> <int>
## 1 -1 1
## 2 1 714
## 3 2 482
## 4 3 97
## 5 4 60
## 6 5 255
## 7 77 614
## 8 78 11182
## 9 NA 541
数据清洗:
删除
fa7_s_1
缺失值行filter(!is.na(fa7_s_1))
删除
fa7_s_1
包含c(-10, -8, -2, -1)的行并根据
fa7_s_1
住房困难,新建housing变量
# 删除缺失值
filter(!is.na(fa7_s_1)) %>%
# 住房困难fa7_s
filter(!fa7_s_1 %in% c(-10, -8, -2, -1)) %>%
# 新建housing变量
mutate(housing = if_else(fa7_s_1 %in% c(78) | fa7_s_2 %in% c(78) | fa7_s_3 %in% c(78), 0, 1))
3.7.5 fk1l
tb %>% count(fk1l)
## # A tibble: 2 x 2
## fk1l n
## <dbl+lbl> <int>
## 1 0 7131
## 2 1 6815
数据清洗: 无
3.7.6 fn1
tb %>% count(fn1_s_1)
## # A tibble: 10 x 2
## fn1_s_1 n
## <dbl+lbl> <int>
## 1 -1 5
## 2 1 1458
## 3 2 775
## 4 3 4265
## 5 4 41
## 6 5 57
## 7 6 9
## 8 7 38
## 9 77 326
## 10 78 6972
fn1
与fa7
情况类似。fn1_s_1
, fn1_s_2
, fn1_s_3
, fn1_s_4
主要用到fn1_s_1
。 换句话说只有fn1_s_1
包含了1到7,就说明收到政府补助。
数据清洗:
- 删除
fn1_s_1
缺失值和 c(-10, -9, -8, -2, -1) 行
# 过滤
filter(!fn1_s_1 %in% c(-10, -9, -8, -2, -1)) %>%
3.7.7 fn101
tb %>% count(fn101)
## # A tibble: 551 x 2
## fn101 n
## <dbl+lbl> <int>
## 1 -8 6977
## 2 -2 1
## 3 -1 178
## 4 0 3
## 5 1 6
## 6 4 2
## 7 5 9
## 8 6 1
## 9 8 1
## 10 9 1
## # ... with 541 more rows
数据清洗:
将-8替换为0,并删除缺失值和c(-1, -2, -9, -10)行,
求政府补助总额+1的自然对数
# 替换
mutate_at(vars(fn101), funs(replace(., . == -8, 0))) %>%
# 家庭收到政府补助
filter(!fn101 %in% c(-1, -2, -9, -10)) %>%
3.7.8 fn2
tb %>% count(fn2)
## # A tibble: 3 x 2
## fn2 n
## <dbl+lbl> <int>
## 1 -1 2
## 2 0 13765
## 3 1 179
数据清洗: 删除-1
# 否收到社会捐助
filter(!fn2 %in% c(-1)) %>%
3.7.9 fn3
tb %>% count(fn3)
## # A tibble: 4 x 2
## fn3 n
## <dbl+lbl> <int>
## 1 -1 1
## 2 1 4814
## 3 5 8590
## 4 NA 541
数据清洗: 删除-1和NA 所在的行
# 是否有人领取离退休金和养老金
filter(!fn3 %in% c(-1, NA))
3.7.10 fo1
tb %>% count(fo1)
## # A tibble: 3 x 2
## fo1 n
## <dbl+lbl> <int>
## 1 0 8032
## 2 1 5373
## 3 NA 541
数据清洗: 删除缺失值
# 是否打工
filter(!is.na(fo1)) %>%
3.7.11 fp510
tb %>% count(fp510)
## # A tibble: 295 x 2
## fp510 n
## <dbl+lbl> <int>
## 1 -2 1
## 2 -1 85
## 3 0 7273
## 4 9 1
## 5 10 2
## 6 20 3
## 7 30 2
## 8 40 1
## 9 50 10
## 10 55 1
## # ... with 285 more rows
数据清洗:
删除c(-10, -9, -8, -2, NA) 这种答案,
并根据是否为0,新建变量Y(=是否人力资本投资),
并计算教育支出+1的自然对数
# 过去12月的教育支出
filter(!fp510 %in% c(-10, -9, -8, -2, NA)) %>%
# 是否进行人力资本投入
mutate(Y = if_else(fp510 %in% c(0), 0, 1)) %>%
# 教育支出计算其自然对数
mutate(ln_fp510 = log(fp510 + 1)) %>%
3.7.12 fincome1_per
tb %>% count(fincome1_per)
## # A tibble: 5,789 x 2
## fincome1_per n
## <dbl+lbl> <int>
## 1 0.2500 1
## 2 0.5000 1
## 3 0.8333 1
## 4 1.6667 1
## 5 2.6667 1
## 6 3.3333 2
## 7 5.0000 2
## 8 6.2500 1
## 9 8.0000 2
## 10 9.0000 1
## # ... with 5,779 more rows
数据清洗: 删除其中缺失值,并求自然对数
# 删除缺失值
filter(!is.na(fincome1_per)) %>%
# 家庭人均收入fincome1_per的自然对数
mutate(ln_fincome1_per = log(fincome1_per))
3.8 代码实现
变量 | 数据清洗 |
---|---|
fa1 | 过滤符合情形的fa1 == 5 |
fa7_s_1 | 删除缺失值行filter(!is.na(fa7_s_1)) |
fa7_s_1 | 删除包含c(-10, -8, -2, -1)的行 + 并根据住房困难多选题,新建housing变量 |
fn1_s_1 | 删除缺失值和 c(-10, -9, -8, -2, -1) 行 |
fn101 | 将-8替换为0,并删除缺失值和c(-1, -2, -9, -10)行, 并求政府补助总额+1的自然对数 |
fn2 | 删除-1 |
fn3 | 删除-1和NA 所在的行 |
fo1 | 删除缺失值 |
fp510 | 删除c(-10, -9, -8, -2, NA) 这种答案,并根据是否为0,新建变量Y(=是否人力资本投资), 并计算教育支出+1的自然对数 |
fincome1_per | 删除其中缺失值,并求自然对数 |
provcd14 | 创建省份的虚拟变量 |
针对数据探索过程中发现的问题,我们一一做相应的数据规整处理。
a <- cfps2014adult %>%
select(vars_select) %>%
# 过滤符合情形的
filter(fa1 == 5) %>%
# 删除缺失值
filter(!is.na(fa7_s_1)) %>%
# 住房困难fa7_s
filter(!fa7_s_1 %in% c(-10, -8, -2, -1)) %>%
# 新建housing变量
# mutate(housing = if_else(fa7_s_1 == 78 | fa7_s_2 == 78 | fa7_s_3 == 78, 0, 1)) %>%
mutate(housing = if_else(fa7_s_1 %in% c(78) | fa7_s_2 %in% c(78) | fa7_s_3 %in% c(78), 0, 1)) %>% #
# 过滤
filter(!fn1_s_1 %in% c(-10, -9, -8, -2, -1)) %>%
# 替换
mutate_at(vars(fn101), funs(replace(., . == -8, 0))) %>%
# 家庭收到政府补助
filter(!fn101 %in% c(-1, -2, -9, -10)) %>%
# 否收到社会捐助
filter(!fn2 %in% c(-1)) %>%
# 是否有人领取离退休金和养老金
filter(!fn3 %in% c(-1, NA)) %>%
# 是否打工
filter(!is.na(fo1)) %>%
# 过去12月的教育支出
filter(!fp510 %in% c(-10, -9, -8, -2, NA)) %>%
# 是否进行人力资本投入
# mutate(Y = if_else(fp510 == 0, 0, 1)) %>%
mutate(Y = if_else(fp510 %in% c(0), 0, 1)) %>%
# 删除缺失值
filter(!is.na(fincome1_per)) %>%
# 虚拟变量
fastDummies::dummy_cols("provcd14") %>%
# 家庭人均收入fincome1_per的自然对数
mutate(ln_fincome1_per = log(fincome1_per)) %>%
# 政府补助总额+1的自然对数
mutate(ln_fn101 = log(fn101 + 1)) %>%
# 教育支出计算其自然对数
mutate(ln_fp510 = log(fp510 + 1)) %>%
# 是否外出打工与收入对数的交乘项
mutate(fo1xln_fincome1_per = ln_fincome1_per * fo1) %>%
# 收入的自然对数的二次项
mutate(ln_fincome1_perxln_fincome1_per = ln_fincome1_per * ln_fincome1_per)
交乘项的意义?
a %>% map_df(~ sum(is.na(.)))
## # A tibble: 1 x 51
## fid14 provcd14 fa1 fa7_s_1 fa7_s_2 fa7_s_3 fk1l
## <int> <int> <int> <int> <int> <int> <int>
## 1 0 0 0 0 0 0 0
## # ... with 44 more variables: fn1_s_1 <int>,
## # fn1_s_2 <int>, fn1_s_3 <int>, fn1_s_4 <int>,
## # fn101 <int>, fn2 <int>, fn3 <int>, fo1 <int>,
## # fp510 <int>, fincome1_per <int>, housing <int>,
## # Y <int>, provcd14_53 <int>, provcd14_13 <int>,
## # provcd14_14 <int>, provcd14_11 <int>,
## # provcd14_21 <int>, provcd14_32 <int>,
## # provcd14_33 <int>, provcd14_34 <int>,
## # provcd14_36 <int>, provcd14_44 <int>,
## # provcd14_37 <int>, provcd14_41 <int>,
## # provcd14_43 <int>, provcd14_51 <int>,
## # provcd14_45 <int>, provcd14_31 <int>,
## # provcd14_61 <int>, provcd14_12 <int>,
## # provcd14_23 <int>, provcd14_35 <int>,
## # provcd14_62 <int>, provcd14_64 <int>,
## # provcd14_22 <int>, provcd14_50 <int>,
## # provcd14_52 <int>, provcd14_42 <int>,
## # provcd14_15 <int>, ln_fincome1_per <int>,
## # ln_fn101 <int>, ln_fp510 <int>,
## # fo1xln_fincome1_per <int>,
## # ln_fincome1_perxln_fincome1_per <int>
# or
# a %>% summarise_at(1:5, ~sum(is.na(.)))
a %>% summarise_all(~ sum(is.na(.)))
## # A tibble: 1 x 51
## fid14 provcd14 fa1 fa7_s_1 fa7_s_2 fa7_s_3 fk1l
## <int> <int> <int> <int> <int> <int> <int>
## 1 0 0 0 0 0 0 0
## # ... with 44 more variables: fn1_s_1 <int>,
## # fn1_s_2 <int>, fn1_s_3 <int>, fn1_s_4 <int>,
## # fn101 <int>, fn2 <int>, fn3 <int>, fo1 <int>,
## # fp510 <int>, fincome1_per <int>, housing <int>,
## # Y <int>, provcd14_53 <int>, provcd14_13 <int>,
## # provcd14_14 <int>, provcd14_11 <int>,
## # provcd14_21 <int>, provcd14_32 <int>,
## # provcd14_33 <int>, provcd14_34 <int>,
## # provcd14_36 <int>, provcd14_44 <int>,
## # provcd14_37 <int>, provcd14_41 <int>,
## # provcd14_43 <int>, provcd14_51 <int>,
## # provcd14_45 <int>, provcd14_31 <int>,
## # provcd14_61 <int>, provcd14_12 <int>,
## # provcd14_23 <int>, provcd14_35 <int>,
## # provcd14_62 <int>, provcd14_64 <int>,
## # provcd14_22 <int>, provcd14_50 <int>,
## # provcd14_52 <int>, provcd14_42 <int>,
## # provcd14_15 <int>, ln_fincome1_per <int>,
## # ln_fn101 <int>, ln_fp510 <int>,
## # fo1xln_fincome1_per <int>,
## # ln_fincome1_perxln_fincome1_per <int>
# library(visdat)
# vis_miss(a)
library(naniar)
a %>%
miss_var_summary() %>%
filter(n_miss > 0)
## # A tibble: 0 x 3
## # ... with 3 variables: variable <chr>, n_miss <int>,
## # pct_miss <dbl>
df <- a %>% select(
Y, fo1, fo1xln_fincome1_per, ln_fincome1_per,
ln_fincome1_perxln_fincome1_per, ln_fn101,
fn2, fk1l, housing, fn3,
starts_with("provcd14_")
)
df %>%
map(~ count(data.frame(x = .x), x))
## $Y
## # A tibble: 2 x 2
## x n
## <dbl> <int>
## 1 0 4577
## 2 1 4363
##
## $fo1
## # A tibble: 2 x 2
## x n
## <dbl+lbl> <int>
## 1 0 4426
## 2 1 4514
##
## $fo1xln_fincome1_per
## # A tibble: 3,223 x 2
## x n
## <dbl+lbl> <int>
## 1 -0.6931 1
## 2 0.0000 4426
## 3 3.4812 1
## 4 3.9120 1
## 5 4.4248 1
## 6 4.6333 1
## 7 4.8283 1
## 8 4.9618 1
## 9 5.0106 2
## 10 5.1160 1
## # ... with 3,213 more rows
##
## $ln_fincome1_per
## # A tibble: 5,105 x 2
## x n
## <dbl+lbl> <int>
## 1 -1.3863 1
## 2 -0.6931 1
## 3 -0.1823 1
## 4 0.5108 1
## 5 0.9808 1
## 6 1.2040 1
## 7 1.6094 2
## 8 2.0794 1
## 9 2.1972 1
## 10 2.3354 1
## # ... with 5,095 more rows
##
## $ln_fincome1_perxln_fincome1_per
## # A tibble: 5,105 x 2
## x n
## <dbl+lbl> <int>
## 1 0.03324 1
## 2 0.26094 1
## 3 0.48045 1
## 4 0.96203 1
## 5 1.44955 1
## 6 1.92181 1
## 7 2.59029 2
## 8 4.32408 1
## 9 4.82780 1
## 10 5.45397 1
## # ... with 5,095 more rows
##
## $ln_fn101
## # A tibble: 483 x 2
## x n
## <dbl+lbl> <int>
## 1 0.0000 3020
## 2 0.6931 5
## 3 1.6094 2
## 4 1.7918 8
## 5 1.9459 1
## 6 2.1972 1
## 7 2.3026 1
## 8 2.3979 4
## 9 2.4849 1
## 10 2.7081 1
## # ... with 473 more rows
##
## $fn2
## # A tibble: 2 x 2
## x n
## <dbl+lbl> <int>
## 1 0 8816
## 2 1 124
##
## $fk1l
## # A tibble: 2 x 2
## x n
## <dbl+lbl> <int>
## 1 0 2794
## 2 1 6146
##
## $housing
## # A tibble: 2 x 2
## x n
## <dbl> <int>
## 1 0 7500
## 2 1 1440
##
## $fn3
## # A tibble: 2 x 2
## x n
## <dbl+lbl> <int>
## 1 1 2937
## 2 5 6003
##
## $provcd14_53
## # A tibble: 2 x 2
## x n
## <int> <int>
## 1 0 8607
## 2 1 333
##
## $provcd14_13
## # A tibble: 2 x 2
## x n
## <int> <int>
## 1 0 8367
## 2 1 573
##
## $provcd14_14
## # A tibble: 2 x 2
## x n
## <int> <int>
## 1 0 8491
## 2 1 449
##
## $provcd14_11
## # A tibble: 2 x 2
## x n
## <int> <int>
## 1 0 8925
## 2 1 15
##
## $provcd14_21
## # A tibble: 2 x 2
## x n
## <int> <int>
## 1 0 8129
## 2 1 811
##
## $provcd14_32
## # A tibble: 2 x 2
## x n
## <int> <int>
## 1 0 8744
## 2 1 196
##
## $provcd14_33
## # A tibble: 2 x 2
## x n
## <int> <int>
## 1 0 8701
## 2 1 239
##
## $provcd14_34
## # A tibble: 2 x 2
## x n
## <int> <int>
## 1 0 8744
## 2 1 196
##
## $provcd14_36
## # A tibble: 2 x 2
## x n
## <int> <int>
## 1 0 8753
## 2 1 187
##
## $provcd14_44
## # A tibble: 2 x 2
## x n
## <int> <int>
## 1 0 8228
## 2 1 712
##
## $provcd14_37
## # A tibble: 2 x 2
## x n
## <int> <int>
## 1 0 8386
## 2 1 554
##
## $provcd14_41
## # A tibble: 2 x 2
## x n
## <int> <int>
## 1 0 7886
## 2 1 1054
##
## $provcd14_43
## # A tibble: 2 x 2
## x n
## <int> <int>
## 1 0 8729
## 2 1 211
##
## $provcd14_51
## # A tibble: 2 x 2
## x n
## <int> <int>
## 1 0 8437
## 2 1 503
##
## $provcd14_45
## # A tibble: 2 x 2
## x n
## <int> <int>
## 1 0 8705
## 2 1 235
##
## $provcd14_31
## # A tibble: 2 x 2
## x n
## <int> <int>
## 1 0 8546
## 2 1 394
##
## $provcd14_61
## # A tibble: 2 x 2
## x n
## <int> <int>
## 1 0 8746
## 2 1 194
##
## $provcd14_12
## # A tibble: 2 x 2
## x n
## <int> <int>
## 1 0 8915
## 2 1 25
##
## $provcd14_23
## # A tibble: 2 x 2
## x n
## <int> <int>
## 1 0 8797
## 2 1 143
##
## $provcd14_35
## # A tibble: 2 x 2
## x n
## <int> <int>
## 1 0 8809
## 2 1 131
##
## $provcd14_62
## # A tibble: 2 x 2
## x n
## <int> <int>
## 1 0 7760
## 2 1 1180
##
## $provcd14_64
## # A tibble: 2 x 2
## x n
## <int> <int>
## 1 0 8938
## 2 1 2
##
## $provcd14_22
## # A tibble: 2 x 2
## x n
## <int> <int>
## 1 0 8812
## 2 1 128
##
## $provcd14_50
## # A tibble: 2 x 2
## x n
## <int> <int>
## 1 0 8857
## 2 1 83
##
## $provcd14_52
## # A tibble: 2 x 2
## x n
## <int> <int>
## 1 0 8651
## 2 1 289
##
## $provcd14_42
## # A tibble: 2 x 2
## x n
## <int> <int>
## 1 0 8840
## 2 1 100
##
## $provcd14_15
## # A tibble: 2 x 2
## x n
## <int> <int>
## 1 0 8937
## 2 1 3
3.9 Probit 模型
Probit 模型
\[ \Pr(Y=1 \mid X) = \Phi(X^T\beta), \]
这里 \(\Phi\) 为累积分布函数.
加入模型之前,有两点需要考虑
是否需要转换因子类型?
是否需要做标准化处理?
data <- df %>%
mutate_at(vars(fo1, fn2, fk1l, housing, fn3), as.factor) %>%
mutate_at(vars(
fo1xln_fincome1_per, ln_fincome1_per,
ln_fincome1_perxln_fincome1_per, ln_fn101
), funs((. - mean(.)) / sd(.)))
# glimpse(data)
probit_t <- glm(
formula = Y ~ .,
family = binomial(link = "probit"), # canonical link function
data = data
)
tidy(probit_t)
## # A tibble: 37 x 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) -1.90e+11 1.58e+12 -0.120 9.04e- 1
## 2 fo11 1.51e+ 0 2.87e- 1 5.25 1.55e- 7
## 3 fo1xln_finco~ -6.43e- 1 1.46e- 1 -4.41 1.03e- 5
## 4 ln_fincome1_~ 7.16e- 2 9.43e- 2 0.760 4.47e- 1
## 5 ln_fincome1_~ -4.22e- 2 9.74e- 2 -0.433 6.65e- 1
## 6 ln_fn101 -9.56e- 3 1.56e- 2 -0.611 5.41e- 1
## 7 fn21 -1.14e- 1 1.17e- 1 -0.974 3.30e- 1
## 8 fk1l1 1.94e- 1 3.30e- 2 5.88 4.15e- 9
## 9 housing1 2.69e- 1 3.78e- 2 7.13 9.79e-13
## 10 fn35 1.96e- 1 2.97e- 2 6.60 4.02e-11
## # ... with 27 more rows
3.10 边际影响
library(mfx)
probitmfx(formula = Y ~ ., data = data)
## Call:
## probitmfx(formula = Y ~ ., data = data)
##
## Marginal Effects:
## dF/dx Std. Err.
## fo11 0.54796 NA
## fo1xln_fincome1_per -0.25647 NA
## ln_fincome1_per 0.02857 2.90282
## ln_fincome1_perxln_fincome1_per -0.01683 NA
## ln_fn101 -0.00381 NA
## fn21 -0.04510 NA
## fk1l1 0.07718 61.02366
## housing1 0.10698 NA
## fn35 0.07798 98.87063
## provcd14_53 1.00000 0.00000
## provcd14_13 1.00000 0.00000
## provcd14_14 1.00000 0.00000
## provcd14_11 1.00000 0.00000
## provcd14_21 1.00000 0.00000
## provcd14_32 1.00000 0.00000
## provcd14_33 1.00000 0.00000
## provcd14_34 1.00000 0.00000
## provcd14_36 1.00000 0.00000
## provcd14_44 1.00000 0.00000
## provcd14_37 1.00000 0.00000
## provcd14_41 1.00000 0.00000
## provcd14_43 1.00000 0.00000
## provcd14_51 1.00000 0.00000
## provcd14_45 1.00000 0.00000
## provcd14_31 1.00000 0.00000
## provcd14_61 1.00000 0.00000
## provcd14_12 1.00000 0.00000
## provcd14_23 1.00000 0.00000
## provcd14_35 1.00000 0.00000
## provcd14_62 1.00000 0.00000
## provcd14_64 1.00000 0.00000
## provcd14_22 1.00000 0.00000
## provcd14_50 1.00000 0.00000
## provcd14_52 1.00000 0.00000
## provcd14_42 1.00000 0.00000
## provcd14_15 1.00000 0.00000
## z P>|z|
## fo11 NA NA
## fo1xln_fincome1_per NA NA
## ln_fincome1_per 0.01 0.99
## ln_fincome1_perxln_fincome1_per NA NA
## ln_fn101 NA NA
## fn21 NA NA
## fk1l1 0.00 1.00
## housing1 NA NA
## fn35 0.00 1.00
## provcd14_53 Inf <2e-16 ***
## provcd14_13 Inf <2e-16 ***
## provcd14_14 Inf <2e-16 ***
## provcd14_11 Inf <2e-16 ***
## provcd14_21 Inf <2e-16 ***
## provcd14_32 Inf <2e-16 ***
## provcd14_33 Inf <2e-16 ***
## provcd14_34 Inf <2e-16 ***
## provcd14_36 Inf <2e-16 ***
## provcd14_44 Inf <2e-16 ***
## provcd14_37 Inf <2e-16 ***
## provcd14_41 Inf <2e-16 ***
## provcd14_43 Inf <2e-16 ***
## provcd14_51 Inf <2e-16 ***
## provcd14_45 Inf <2e-16 ***
## provcd14_31 Inf <2e-16 ***
## provcd14_61 Inf <2e-16 ***
## provcd14_12 Inf <2e-16 ***
## provcd14_23 Inf <2e-16 ***
## provcd14_35 Inf <2e-16 ***
## provcd14_62 Inf <2e-16 ***
## provcd14_64 Inf <2e-16 ***
## provcd14_22 Inf <2e-16 ***
## provcd14_50 Inf <2e-16 ***
## provcd14_52 Inf <2e-16 ***
## provcd14_42 Inf <2e-16 ***
## provcd14_15 Inf <2e-16 ***
## ---
## Signif. codes:
## 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## dF/dx is for discrete change for the following variables:
##
## [1] "fo11" "fn21" "fk1l1"
## [4] "housing1" "fn35" "provcd14_53"
## [7] "provcd14_13" "provcd14_14" "provcd14_11"
## [10] "provcd14_21" "provcd14_32" "provcd14_33"
## [13] "provcd14_34" "provcd14_36" "provcd14_44"
## [16] "provcd14_37" "provcd14_41" "provcd14_43"
## [19] "provcd14_51" "provcd14_45" "provcd14_31"
## [22] "provcd14_61" "provcd14_12" "provcd14_23"
## [25] "provcd14_35" "provcd14_62" "provcd14_64"
## [28] "provcd14_22" "provcd14_50" "provcd14_52"
## [31] "provcd14_42" "provcd14_15"
或者
# library(margins)
# m <- margins(probit_t)
# plot(m)
重复的是这一篇文章↩