第 2 章 农村土地流转研究

library(tidyverse)
library(purrr)
library(haven)
library(visdat)

2.1 数据导入

cfps2010family <- read_dta("../data/2010AllData/cfps2010family_report_nat092014.dta",
  encoding = "GB2312"
)

2.2 数据探索

# colnames(cfps2010family)

2.3 选取变量

cfps2010family %>%
  select(urban, starts_with("fk201_a")) %>%
  glimpse()
## Observations: 14,798
## Variables: 7
## $ urban     <dbl+lbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1...
## $ fk201_a_1 <dbl+lbl> -8, -8, -8, -8, -8, -8, -8, ...
## $ fk201_a_2 <dbl+lbl> -8.0, -8.0, -8.0, -8.0, 2.5,...
## $ fk201_a_3 <dbl+lbl> -8, -8, -8, -8, -8, -8, -8, ...
## $ fk201_a_4 <dbl+lbl> -8, -8, -8, -8, -8, -8, -8, ...
## $ fk201_a_5 <dbl+lbl> -8, -8, -8, -8, -8, -8, -8, ...
## $ fk201_a_6 <dbl+lbl> -8, -8, -8, -8, -8, -8, -8, ...

2.4 获取标签

library(purrr)
get_var_label <- function(dta) {
  labels <- map(dta, function(x) attr(x, "label"))
  data_frame(
    name = names(labels),
    label = as.character(labels)
  )
}

cfps2010family %>%
  select(urban, starts_with("fk201_a")) %>%
  get_var_label()
## # A tibble: 7 x 2
##   name      label                           
##   <chr>     <chr>                           
## 1 urban     基于国家统计局资料的城乡分类变量
## 2 fk201_a_1 您家拥有多少亩水田              
## 3 fk201_a_2 您家拥有多少亩旱地              
## 4 fk201_a_3 您家拥有多少亩林地              
## 5 fk201_a_4 您家拥有多少亩果园              
## 6 fk201_a_5 您家拥有多少亩草场              
## 7 fk201_a_6 您家拥有多少亩池塘

2.5 数据统计

cfps2010family %>%
  select(urban, starts_with("fk201_a")) %>%
  map(~ count(data.frame(x = .x), x))
## $urban
## # A tibble: 2 x 2
##   x             n
##   <dbl+lbl> <int>
## 1 0          7694
## 2 1          7104
## 
## $fk201_a_1
## # A tibble: 114 x 2
##    x             n
##    <dbl+lbl> <int>
##  1 -8.0      11657
##  2 -1.0          3
##  3  0.0         20
##  4  0.1          6
##  5  0.2         14
##  6  0.3         36
##  7  0.4         36
##  8  0.5         74
##  9  0.6         54
## 10  0.7         49
## # ... with 104 more rows
## 
## $fk201_a_2
## # A tibble: 198 x 2
##    x             n
##    <dbl+lbl> <int>
##  1 -8.0       8553
##  2 -1.0         12
##  3  0.0         42
##  4  0.1         31
##  5  0.2         67
##  6  0.3         74
##  7  0.4         39
##  8  0.5        178
##  9  0.6         64
## 10  0.7         50
## # ... with 188 more rows
## 
## $fk201_a_3
## # A tibble: 115 x 2
##    x             n
##    <dbl+lbl> <int>
##  1 -8.0      13700
##  2 -1.0          5
##  3  0.0         13
##  4  0.1         10
##  5  0.2         10
##  6  0.3         17
##  7  0.4          6
##  8  0.5         30
##  9  0.6          6
## 10  0.7          6
## # ... with 105 more rows
## 
## $fk201_a_4
## # A tibble: 59 x 2
##    x             n
##    <dbl+lbl> <int>
##  1 -8.0      14152
##  2 -1.0          6
##  3  0.0         19
##  4  0.1          6
##  5  0.2          9
##  6  0.3         10
##  7  0.4          4
##  8  0.5         33
##  9  0.6          2
## 10  0.7          4
## # ... with 49 more rows
## 
## $fk201_a_5
## # A tibble: 5 x 2
##   x             n
##   <dbl+lbl> <int>
## 1 -8.0      14790
## 2  1.0          4
## 3  2.0          2
## 4  3.0          1
## 5  3.3          1
## 
## $fk201_a_6
## # A tibble: 36 x 2
##    x             n
##    <dbl+lbl> <int>
##  1 -8.0      14707
##  2  0.0         10
##  3  0.1          1
##  4  0.2          3
##  5  0.3          3
##  6  0.4          1
##  7  0.5          7
##  8  0.6          1
##  9  0.7          1
## 10  0.8          2
## # ... with 26 more rows
# map_if(is.character, ~count(data.frame(x = .x), x) )

2.6 缺失值

library(naniar)
cfps2010family %>%
  select(urban, starts_with("fk201_a")) %>%
  miss_var_summary()
## # A tibble: 7 x 3
##   variable  n_miss pct_miss
##   <chr>      <int>    <dbl>
## 1 urban          0        0
## 2 fk201_a_1      0        0
## 3 fk201_a_2      0        0
## 4 fk201_a_3      0        0
## 5 fk201_a_4      0        0
## 6 fk201_a_5      0        0
## 7 fk201_a_6      0        0
library(visdat)
cfps2010family %>%
  select(urban, starts_with("fk201_a")) %>%
  vis_dat()

2.7 数据规整

cfps2010family %>%
  select(urban, starts_with("fk2_s")) %>%
  filter(urban == "0")
## # A tibble: 7,694 x 6
##    urban   fk2_s_1  fk2_s_2  fk2_s_3  fk2_s_4  fk2_s_5 
##    <dbl+l> <dbl+lb> <dbl+lb> <dbl+lb> <dbl+lb> <dbl+lb>
##  1 0        4       -8       -8       -8       -8      
##  2 0        2       -8       -8       -8       -8      
##  3 0        4       -8       -8       -8       -8      
##  4 0        4       -8       -8       -8       -8      
##  5 0       -8       -8       -8       -8       -8      
##  6 0       -8       -8       -8       -8       -8      
##  7 0       -8       -8       -8       -8       -8      
##  8 0        6       -8       -8       -8       -8      
##  9 0        4       -8       -8       -8       -8      
## 10 0       -8       -8       -8       -8       -8      
## # ... with 7,684 more rows
# filter_if,filter_at,filter_all
# para 1:tbl, para 2:column, para 3:row

a <- cfps2010family %>%
  select(urban, starts_with("fk201_a")) %>%
  filter_at(vars(starts_with("fk201_a")), any_vars(. > 0))

a
## # A tibble: 7,688 x 7
##    urban fk201_a_1 fk201_a_2 fk201_a_3 fk201_a_4
##    <dbl> <dbl+lbl> <dbl+lbl> <dbl+lbl> <dbl+lbl>
##  1 1     -8         2.5      -8        -8.0     
##  2 1     -8        -8.0      -8         0.9     
##  3 0     -8        -8.0      -8         5.1     
##  4 0     -8         1.8      -8        -8.0     
##  5 0     -8        -8.0      -8         6.0     
##  6 0     -8        -8.0      -8        -8.0     
##  7 0     -8        -8.0      -8         4.0     
##  8 0     -8        -8.0      -8         1.5     
##  9 0     -8        -8.0      -8         3.0     
## 10 0     -8         1.5      -8        -8.0     
## # ... with 7,678 more rows, and 2 more variables:
## #   fk201_a_5 <dbl+lbl>, fk201_a_6 <dbl+lbl>
a %>% mutate_at(vars(starts_with("fk201_a")), funs(replace(., . < 0, 0)))
## Warning: funs() is soft deprecated as of dplyr 0.8.0
## please use list() instead
## 
## # Before:
## funs(name = f(.)
## 
## # After: 
## list(name = ~f(.))
## This warning is displayed once per session.
## # A tibble: 7,688 x 7
##    urban fk201_a_1 fk201_a_2 fk201_a_3 fk201_a_4
##    <dbl> <dbl+lbl> <dbl+lbl> <dbl+lbl> <dbl+lbl>
##  1 1     0         2.5       0         0.0      
##  2 1     0         0.0       0         0.9      
##  3 0     0         0.0       0         5.1      
##  4 0     0         1.8       0         0.0      
##  5 0     0         0.0       0         6.0      
##  6 0     0         0.0       0         0.0      
##  7 0     0         0.0       0         4.0      
##  8 0     0         0.0       0         1.5      
##  9 0     0         0.0       0         3.0      
## 10 0     0         1.5       0         0.0      
## # ... with 7,678 more rows, and 2 more variables:
## #   fk201_a_5 <dbl+lbl>, fk201_a_6 <dbl+lbl>

2.8 模型建立

# probit_t <- glm(
#   formula = Y ~ .,
#   family = binomial(link = "probit"), # canonical link function
#   data = data
# )
#
# summary(probit_t)

2.9 结论分析

# tidy(probit_t)
# confint(myprobit)