第 4 章 社会资本与精准扶贫

社会资本与精准扶贫

library(tidyverse)
library(here)
library(fs)
library(haven)
library(broom)

4.1 数据导入

我们选取了北京大学开放数据平台中的中国家庭追踪调查CFPS4的2014年数据。

cfps2014famecon <- read_dta("../data/2014AllData/Cfps2014famecon_170630.dta",
  encoding = "GB2312"
)

cfps2014adult <- read_dta("../data/2014AllData/cfps2014adult_170630.dta",
  encoding = "GB2312"
)

4.2 选取变量

筛选家庭数据库中相关的变量

  • use: Cfps2014famecon_170630.dta
  • keep: fid14, fa3, fa4, fa5, fa8, fa9, fs6_s_1-fs6_s_15

筛选成人数据库中相关的变量

  • use: cfps2014adult_170630.dta
  • keep: pid, fid14, cfps2012_latest_edu, qp201, qn12012, qn12013, qn12014, qn1011-qn1017
## 获取标签

library(purrr)
get_var_label <- function(dta) {
  labels <- map(dta, function(x) attr(x, "label"))
  data_frame(
    name = names(labels),
    label = as.character(labels)
  )
}
df_famecon <- cfps2014famecon %>%
  dplyr::select(
    fid14, fa3, fa4, fa5, fa8,
    fa9, fs6_s_1:fs6_s_15
  )

df_famecon %>% get_var_label()
## # A tibble: 21 x 2
##    name    label                   
##    <chr>   <chr>                   
##  1 fid14   2014年家户号            
##  2 fa3     做饭用水                
##  3 fa4     做饭燃料                
##  4 fa5     通电                    
##  5 fa8     卫生间类型              
##  6 fa9     垃圾倾倒场所            
##  7 fs6_s_1 您家有下列哪些物品-选择1
##  8 fs6_s_2 您家有下列哪些物品-选择2
##  9 fs6_s_3 您家有下列哪些物品-选择3
## 10 fs6_s_4 您家有下列哪些物品-选择4
## # ... with 11 more rows
df_adult <- cfps2014adult %>%
  dplyr::select(
    pid, fid14, cfps2012_latest_edu, qp201,
    qn12012, qn12013, qn12014, qn1011:qn1017
  )

df_adult %>% get_var_label()
## # A tibble: 14 x 2
##    name                label                 
##    <chr>               <chr>                 
##  1 pid                 个人ID                
##  2 fid14               2014年家庭样本编码    
##  3 cfps2012_latest_edu 最近一次调查最高学历  
##  4 qp201               健康状况              
##  5 qn12012             对自己生活满意度      
##  6 qn12013             对自家生活满意度      
##  7 qn12014             对自己未来信心程度    
##  8 qn1011              因贫富差距而受到不公  
##  9 qn1012              因户籍而受到不公      
## 10 qn1013              因性别而受到不公      
## 11 qn1014              受到政府干部不公      
## 12 qn1015              与政府干部发生冲突    
## 13 qn1016              到政府办事受到拖延推诿
## 14 qn1017              是否遭政府不合理收费

4.3 合并

df_set <- df_famecon %>%
  left_join(df_adult, by = "fid14") # %>%
## Warning: Column `fid14` has different attributes on LHS
## and RHS of join
# drop_na()

4.4 变量解读

依次对变量解读和规整

colnames(df_set)
##  [1] "fid14"               "fa3"                
##  [3] "fa4"                 "fa5"                
##  [5] "fa8"                 "fa9"                
##  [7] "fs6_s_1"             "fs6_s_2"            
##  [9] "fs6_s_3"             "fs6_s_4"            
## [11] "fs6_s_5"             "fs6_s_6"            
## [13] "fs6_s_7"             "fs6_s_8"            
## [15] "fs6_s_9"             "fs6_s_10"           
## [17] "fs6_s_11"            "fs6_s_12"           
## [19] "fs6_s_13"            "fs6_s_14"           
## [21] "fs6_s_15"            "pid"                
## [23] "cfps2012_latest_edu" "qp201"              
## [25] "qn12012"             "qn12013"            
## [27] "qn12014"             "qn1011"             
## [29] "qn1012"              "qn1013"             
## [31] "qn1014"              "qn1015"             
## [33] "qn1016"              "qn1017"
df_set %>% get_var_label()
## # A tibble: 34 x 2
##    name    label                   
##    <chr>   <chr>                   
##  1 fid14   2014年家户号            
##  2 fa3     做饭用水                
##  3 fa4     做饭燃料                
##  4 fa5     通电                    
##  5 fa8     卫生间类型              
##  6 fa9     垃圾倾倒场所            
##  7 fs6_s_1 您家有下列哪些物品-选择1
##  8 fs6_s_2 您家有下列哪些物品-选择2
##  9 fs6_s_3 您家有下列哪些物品-选择3
## 10 fs6_s_4 您家有下列哪些物品-选择4
## # ... with 24 more rows

4.4.1 fid14

df_set %>% count(fid14)
## # A tibble: 13,946 x 2
##    fid14         n
##    <dbl+lbl> <int>
##  1 100051        3
##  2 100125        1
##  3 100160        1
##  4 100286        1
##  5 100376        1
##  6 100435        1
##  7 100453        4
##  8 100551        1
##  9 100569        2
## 10 100724        1
## # ... with 13,936 more rows

4.4.2 fa3

df_set %>% count(fa3)
## # A tibble: 11 x 2
##    fa3           n
##    <dbl+lbl> <int>
##  1 -2            1
##  2 -1            4
##  3  1          174
##  4  2         9612
##  5  3        23882
##  6  4          271
##  7  5          111
##  8  6          765
##  9  7         1278
## 10 77           80
## 11 NA          856

4.4.3 fa4

df_set %>% count(fa4)
## # A tibble: 10 x 2
##    fa4           n
##    <dbl+lbl> <int>
##  1 -2            1
##  2 -1            3
##  3  1        12021
##  4  2         2286
##  5  3         9779
##  6  4         4335
##  7  5          343
##  8  6         7310
##  9 77          100
## 10 NA          856

4.4.4 fa5

df_set %>% count(fa5)
## # A tibble: 6 x 2
##   fa5           n
##   <dbl+lbl> <int>
## 1 -1            1
## 2  1           70
## 3  2          963
## 4  3        13075
## 5  4        22069
## 6 NA          856

数据清洗: 无

4.4.5 fa8

df_set %>% count(fa8)
## # A tibble: 8 x 2
##   fa8           n
##   <dbl+lbl> <int>
## 1  1        14717
## 2  2         1707
## 3  3          356
## 4  4         1670
## 5  5        15939
## 6  6          955
## 7 77          834
## 8 NA          856

4.4.6 fa9

df_set %>% count(fa9)
## # A tibble: 9 x 2
##   fa9           n
##   <dbl+lbl> <int>
## 1  1        17354
## 2  2         7832
## 3  3         3180
## 4  4         2352
## 5  5         1131
## 6  6          241
## 7  7         3205
## 8 77          883
## 9 NA          856

4.4.7 fs6_s_1:fs6_s_15

df_set %>% count(fs6_s_1)
## # A tibble: 16 x 2
##    fs6_s_1       n
##    <dbl+lbl> <int>
##  1  1         6184
##  2  2        10081
##  3  3         8931
##  4  4         6865
##  5  5         1290
##  6  6         1986
##  7  7          193
##  8  8            6
##  9 10            8
## 10 11           28
## 11 12          420
## 12 13            5
## 13 14            2
## 14 77            2
## 15 78          177
## 16 NA          856

4.4.8 pid

df_set %>% count(pid)
## # A tibble: 36,866 x 2
##    pid           n
##    <dbl+lbl> <int>
##  1 100051501     1
##  2 100051502     1
##  3 100453431     1
##  4 101129501     1
##  5 103671501     1
##  6 103788501     1
##  7 103924503     1
##  8 105179433     1
##  9 107624501     1
## 10 108211501     1
## # ... with 36,856 more rows

4.4.9 cfps2012_latest_edu

df_set %>% count(cfps2012_latest_edu)
## # A tibble: 11 x 2
##    cfps2012_latest_edu     n
##    <dbl+lbl>           <int>
##  1 -8                   3707
##  2  0                     46
##  3  1                   9409
##  4  2                   7773
##  5  3                   9381
##  6  4                   4359
##  7  5                   1382
##  8  6                    763
##  9  7                     44
## 10  8                      1
## 11 NA                    169

4.4.10 qp201

df_set %>% count(qp201)
## # A tibble: 9 x 2
##   qp201         n
##   <dbl+lbl> <int>
## 1 -8            5
## 2 -2            1
## 3 -1           12
## 4  1         5441
## 5  2         7805
## 6  3        12532
## 7  4         5328
## 8  5         5741
## 9 NA          169

4.4.11 qn12012:qn12014

df_set %>% count(qn12012)
## # A tibble: 8 x 2
##   qn12012       n
##   <dbl+lbl> <int>
## 1 -2            4
## 2 -1           29
## 3  1          838
## 4  2         1828
## 5  3         9185
## 6  4        10411
## 7  5         9163
## 8 NA         5576

4.4.12 qn1011:qn1017

df_set %>% count(qn1011)
## # A tibble: 8 x 2
##   qn1011        n
##   <dbl+lbl> <int>
## 1 -8            4
## 2 -2            5
## 3 -1           15
## 4  1         3691
## 5  3         5103
## 6  5        21914
## 7 79          726
## 8 NA         5576

4.5 多维度贫困测量

4.6 剥夺矩阵

假定有这样一个数据框

df <- tribble(
  ~id, ~x, ~y, ~z, ~g,
  #--|--|--|--|--
  "a", 13.1, 14, 4, 1,
  "b", 15.2, 7, 5, 0,
  "c", 12.5, 10, 1, 0,
  "d", 20, 11, 3, 1
)
df
## # A tibble: 4 x 5
##   id        x     y     z     g
##   <chr> <dbl> <dbl> <dbl> <dbl>
## 1 a      13.1    14     4     1
## 2 b      15.2     7     5     0
## 3 c      12.5    10     1     0
## 4 d      20      11     3     1

设定每个变量的临界值为

cutoffs <- list(
  x = 13,
  y = 12,
  z = 3,
  g = 1
)

剥夺矩阵

get_deprivation_df <- function(df, ..., cutoffs) {
  vars <- rlang::enexprs(...)
  quos <- purrr::map(vars, function(var) {
    rlang::quo(dplyr::if_else(!!var < cutoffs[[rlang::as_name(var)]], 1, 0))
  }) %>%
    purrr::set_names(nm = purrr::map_chr(vars, rlang::as_name))

  df %>%
    dplyr::mutate(!!!quos)
}

g0 <- df %>%
  get_deprivation_df(x, y, z, g, cutoffs = cutoffs)
g0
## # A tibble: 4 x 5
##   id        x     y     z     g
##   <chr> <dbl> <dbl> <dbl> <dbl>
## 1 a         0     0     0     0
## 2 b         0     1     0     1
## 3 c         1     1     1     1
## 4 d         0     1     0     0

统计每个人被剥夺了多少, 即每一行中有多个变量是1。

add_deprivations_counting <- function(df) {
  fun <- function(...) sum(c(...) == 1)
  df %>% mutate(n = pmap_dbl(select_if(., is.numeric), fun))
}

g0_k <- g0 %>% add_deprivations_counting()
g0_k
## # A tibble: 4 x 6
##   id        x     y     z     g     n
##   <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 a         0     0     0     0     0
## 2 b         0     1     0     1     2
## 3 c         1     1     1     1     4
## 4 d         0     1     0     0     1

得到了剥夺矩阵,下面进行的是贫困人口识别。

4.6.1 识别方法一:统计剥夺变量个数

再给定一个k值,如果每个人剥夺次数小于k值,说明不贫穷,那么这个人所在行全部归零。 没被清零的,说明是贫穷的

censor_aggregation <- function(df, k) {
  fun <- function(...) sum(c(...) == 1)
  df %>%
    nest(-id, -n) %>%
    mutate(data = if_else(
      n == k, # Censor data of nonpoor
      map(data, function(x) mutate_all(x, funs(replace(., TRUE, 0)))),
      data
    )) %>%
    tidyr::unnest() %>%
    dplyr::select(-n) %>%
    mutate(n = pmap_dbl(select_if(., is.numeric), fun))
}

g0_k1 <- g0_k %>% censor_aggregation(k = 1)
g0_k1
## # A tibble: 4 x 6
##   id        x     y     z     g     n
##   <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 a         0     0     0     0     0
## 2 b         0     1     0     1     2
## 3 c         1     1     1     1     4
## 4 d         0     0     0     0     0

根据上面识别出来的贫穷人口,计算一些比率

Headcount_Ratio <- function(df) {
  d <- nrow(df)
  q <- df %>% summarise(num_nopoor = sum(n != 0)) %>% pull()
  H <- q / d

  return(H)
}

average_deprivation_share <- function(df) {
  d <- nrow(df)
  q <- df %>% summarise(num_nopoor = sum(n != 0)) %>% pull()
  A <- df %>%
    mutate(ck_d = n / d) %>%
    summarise(sum_sk_d = sum(ck_d) / q) %>%
    pull()

  return(A)
}
g0_k1 %>% Headcount_Ratio()
## [1] 0.5
g0_k1 %>% average_deprivation_share()
## [1] 0.75

4.6.2 识别方法二:赋予变量权重再统计

这里不再是数每行有多少个1,而是对每个变量赋予权重,求得贫困系数(或者叫贫困概率)。这个过程 也可以用tidyverse的办法

weights <- c(
  x = 0.25,
  y = 0.25,
  z = 0.25,
  g = 0.25
)

weighted_sum <- function(.data, weights) {
  ab <- .data %>%
    gather(var, val, x:g) %>%
    mutate(x_weight = val * weights[var]) %>%
    group_by(id) %>%
    summarise(sum_weight = sum(x_weight))

  .data %>% left_join(ab)
}


g0 %>% weighted_sum(weights)
## Joining, by = "id"
## # A tibble: 4 x 6
##   id        x     y     z     g sum_weight
##   <chr> <dbl> <dbl> <dbl> <dbl>      <dbl>
## 1 a         0     0     0     0       0   
## 2 b         0     1     0     1       0.5 
## 3 c         1     1     1     1       1   
## 4 d         0     1     0     0       0.25

或者

fun <- function(...) {weighted.mean(..., weights)}
g0 %>% mutate(wt_sum = pmap_dbl(select_if(., is.numeric), lift_vd(fun)))
## # A tibble: 4 x 6
##   id        x     y     z     g wt_sum
##   <chr> <dbl> <dbl> <dbl> <dbl>  <dbl>
## 1 a         0     0     0     0   0   
## 2 b         0     1     0     1   0.5 
## 3 c         1     1     1     1   1   
## 4 d         0     1     0     0   0.25

4.7 变量赋权重再统计(一站式代码)

给出每个变量的临界值,得到剥夺矩阵,然后再对每个变量赋予权重,求得贫困系数(或者叫贫困概率),根据贫困系数,我们识别贫困人口。事实上,这两步可以合并成一步来完成。

cutoffs <- list(
  x = 13,
  y = 12,
  z = 3,
  g = 1
)

weights <- c(
  x = 0.25,
  y = 0.25,
  z = 0.25,
  g = 0.25
)

df_wt <- df %>%
  gather(var, val, x:g) %>%
  mutate(below_cutoff = as.integer(val < cutoffs[var])) %>%
  mutate(weighted_point = below_cutoff * weights[var]) %>%
  group_by(id) %>%
  summarise(weighted_sum = sum(weighted_point))

df %>% left_join(df_wt)
## Joining, by = "id"
## # A tibble: 4 x 6
##   id        x     y     z     g weighted_sum
##   <chr> <dbl> <dbl> <dbl> <dbl>        <dbl>
## 1 a      13.1    14     4     1         0   
## 2 b      15.2     7     5     0         0.5 
## 3 c      12.5    10     1     0         1   
## 4 d      20      11     3     1         0.25

或者写出子函数的形式,一劳永逸

cutoffs <- c(
  x = 13,
  y = 12,
  z = 3,
  g = 1
)

weights <- c(
  x = 0.25,
  y = 0.25,
  z = 0.25,
  g = 0.25
)

get_weightedSum_df <- function(.data, cutoffs, weights) {
  ab <- .data %>%
    gather(var, val, x:g) %>%
    mutate(below_cutoff = as.integer(val < cutoffs[var])) %>%
    mutate(weighted_point = below_cutoff * weights[var]) %>%
    group_by(id) %>%
    summarise(weightSum = sum(weighted_point))

  .data %>% left_join(ab)
}


df %>% get_weightedSum_df(cutoffs, weights)
## # A tibble: 4 x 6
##   id        x     y     z     g weightSum
##   <chr> <dbl> <dbl> <dbl> <dbl>     <dbl>
## 1 a      13.1    14     4     1      0   
## 2 b      15.2     7     5     0      0.5 
## 3 c      12.5    10     1     0      1   
## 4 d      20      11     3     1      0.25