第 1 章 个人信用借贷

这次使用 tidyverse 和数据库打造数据分析流,分析 220 多万条个人信用借贷记录,数据来源于 Kaggle 比赛官网 https://www.kaggle.com/wendykan/lending-club-loan-data,它存储在 SQLite 数据库中,分析期间需要编写一定的 SQL 语句1

信用评分卡开发

数据思维和统计思维

数据预处理

1.1 查看数据

以 Kaggle 上的借贷数据集为例,介绍 tidyverse 在数据分析和建模过程中的使用。首先从竞赛网站下载数据集 https://www.kaggle.com/wendykan/lending-club-loan-data 存放到本地,网站提供了 csv 和 sqlite 两种格式的数据集,一般地,对于大数据级(GB 以上),我们推荐借助数据库来探索分析数据,所以这里选择 lend_loan.sqlite 文件,它保存在本地 F:/lend-loan/lending-club-loan-data/lend_loan.sqlite

library(DBI)
con <- dbConnect(
  RSQLite::SQLite(),
  "F:/lend-loan/lending-club-loan-data/lend_loan.sqlite"
)

查看数据库中有哪些表

dbListTables(con)
## [1] "loan"

列出表 loan 中的各个字段,一共 145 个字段

dbListFields(con, "loan")
##   [1] "id"                                        
##   [2] "member_id"                                 
##   [3] "loan_amnt"                                 
##   [4] "funded_amnt"                               
##   [5] "funded_amnt_inv"                           
##   [6] "term"                                      
##   [7] "int_rate"                                  
##   [8] "installment"                               
##   [9] "grade"                                     
##  [10] "sub_grade"                                 
##  [11] "emp_title"                                 
##  [12] "emp_length"                                
##  [13] "home_ownership"                            
##  [14] "annual_inc"                                
##  [15] "verification_status"                       
##  [16] "issue_d"                                   
##  [17] "loan_status"                               
##  [18] "pymnt_plan"                                
##  [19] "url"                                       
##  [20] "desc"                                      
##  [21] "purpose"                                   
##  [22] "title"                                     
##  [23] "zip_code"                                  
##  [24] "addr_state"                                
##  [25] "dti"                                       
##  [26] "delinq_2yrs"                               
##  [27] "earliest_cr_line"                          
##  [28] "inq_last_6mths"                            
##  [29] "mths_since_last_delinq"                    
##  [30] "mths_since_last_record"                    
##  [31] "open_acc"                                  
##  [32] "pub_rec"                                   
##  [33] "revol_bal"                                 
##  [34] "revol_util"                                
##  [35] "total_acc"                                 
##  [36] "initial_list_status"                       
##  [37] "out_prncp"                                 
##  [38] "out_prncp_inv"                             
##  [39] "total_pymnt"                               
##  [40] "total_pymnt_inv"                           
##  [41] "total_rec_prncp"                           
##  [42] "total_rec_int"                             
##  [43] "total_rec_late_fee"                        
##  [44] "recoveries"                                
##  [45] "collection_recovery_fee"                   
##  [46] "last_pymnt_d"                              
##  [47] "last_pymnt_amnt"                           
##  [48] "next_pymnt_d"                              
##  [49] "last_credit_pull_d"                        
##  [50] "collections_12_mths_ex_med"                
##  [51] "mths_since_last_major_derog"               
##  [52] "policy_code"                               
##  [53] "application_type"                          
##  [54] "annual_inc_joint"                          
##  [55] "dti_joint"                                 
##  [56] "verification_status_joint"                 
##  [57] "acc_now_delinq"                            
##  [58] "tot_coll_amt"                              
##  [59] "tot_cur_bal"                               
##  [60] "open_acc_6m"                               
##  [61] "open_act_il"                               
##  [62] "open_il_12m"                               
##  [63] "open_il_24m"                               
##  [64] "mths_since_rcnt_il"                        
##  [65] "total_bal_il"                              
##  [66] "il_util"                                   
##  [67] "open_rv_12m"                               
##  [68] "open_rv_24m"                               
##  [69] "max_bal_bc"                                
##  [70] "all_util"                                  
##  [71] "total_rev_hi_lim"                          
##  [72] "inq_fi"                                    
##  [73] "total_cu_tl"                               
##  [74] "inq_last_12m"                              
##  [75] "acc_open_past_24mths"                      
##  [76] "avg_cur_bal"                               
##  [77] "bc_open_to_buy"                            
##  [78] "bc_util"                                   
##  [79] "chargeoff_within_12_mths"                  
##  [80] "delinq_amnt"                               
##  [81] "mo_sin_old_il_acct"                        
##  [82] "mo_sin_old_rev_tl_op"                      
##  [83] "mo_sin_rcnt_rev_tl_op"                     
##  [84] "mo_sin_rcnt_tl"                            
##  [85] "mort_acc"                                  
##  [86] "mths_since_recent_bc"                      
##  [87] "mths_since_recent_bc_dlq"                  
##  [88] "mths_since_recent_inq"                     
##  [89] "mths_since_recent_revol_delinq"            
##  [90] "num_accts_ever_120_pd"                     
##  [91] "num_actv_bc_tl"                            
##  [92] "num_actv_rev_tl"                           
##  [93] "num_bc_sats"                               
##  [94] "num_bc_tl"                                 
##  [95] "num_il_tl"                                 
##  [96] "num_op_rev_tl"                             
##  [97] "num_rev_accts"                             
##  [98] "num_rev_tl_bal_gt_0"                       
##  [99] "num_sats"                                  
## [100] "num_tl_120dpd_2m"                          
## [101] "num_tl_30dpd"                              
## [102] "num_tl_90g_dpd_24m"                        
## [103] "num_tl_op_past_12m"                        
## [104] "pct_tl_nvr_dlq"                            
## [105] "percent_bc_gt_75"                          
## [106] "pub_rec_bankruptcies"                      
## [107] "tax_liens"                                 
## [108] "tot_hi_cred_lim"                           
## [109] "total_bal_ex_mort"                         
## [110] "total_bc_limit"                            
## [111] "total_il_high_credit_limit"                
## [112] "revol_bal_joint"                           
## [113] "sec_app_earliest_cr_line"                  
## [114] "sec_app_inq_last_6mths"                    
## [115] "sec_app_mort_acc"                          
## [116] "sec_app_open_acc"                          
## [117] "sec_app_revol_util"                        
## [118] "sec_app_open_act_il"                       
## [119] "sec_app_num_rev_accts"                     
## [120] "sec_app_chargeoff_within_12_mths"          
## [121] "sec_app_collections_12_mths_ex_med"        
## [122] "sec_app_mths_since_last_major_derog"       
## [123] "hardship_flag"                             
## [124] "hardship_type"                             
## [125] "hardship_reason"                           
## [126] "hardship_status"                           
## [127] "deferral_term"                             
## [128] "hardship_amount"                           
## [129] "hardship_start_date"                       
## [130] "hardship_end_date"                         
## [131] "payment_plan_start_date"                   
## [132] "hardship_length"                           
## [133] "hardship_dpd"                              
## [134] "hardship_loan_status"                      
## [135] "orig_projected_additional_accrued_interest"
## [136] "hardship_payoff_balance_amount"            
## [137] "hardship_last_payment_amount"              
## [138] "disbursement_method"                       
## [139] "debt_settlement_flag"                      
## [140] "debt_settlement_flag_date"                 
## [141] "settlement_status"                         
## [142] "settlement_date"                           
## [143] "settlement_amount"                         
## [144] "settlement_percentage"                     
## [145] "settlement_term"

查看 loan 中的记录总数,count 命令有点费时间,220多万,确切地说一共 2260668 条记录,

dbGetQuery(con, "SELECT COUNT(*) FROM loan")
##   COUNT(*)
## 1  2260668

查看俩个字段的前 10 个记录,是不是很奇怪怎么都是空的,因为匿名处理了,真实的 ID 都被处理了,实际上数据集 loan 只有 143 个字段

dbGetQuery(con, "SELECT id,member_id FROM loan LIMIT 10")
##    id member_id
## 1              
## 2              
## 3              
## 4              
## 5              
## 6              
## 7              
## 8              
## 9              
## 10

接下来还要看看其它字段的缺失情况,不妨先取 10 行记录简单看一下。

dbGetQuery(con, "
SELECT *
FROM loan 
LIMIT 10
")
##    id member_id loan_amnt funded_amnt funded_amnt_inv       term int_rate
## 1                    2500        2500            2500  36 months    13.56
## 2                   30000       30000           30000  60 months    18.94
## 3                    5000        5000            5000  36 months    17.97
## 4                    4000        4000            4000  36 months    18.94
## 5                   30000       30000           30000  60 months    16.14
## 6                    5550        5550            5550  36 months    15.02
## 7                    2000        2000            2000  36 months    17.97
## 8                    6000        6000            6000  36 months    13.56
## 9                    5000        5000            5000  36 months    17.97
## 10                   6000        6000            6000  36 months    14.47
##    installment grade sub_grade           emp_title emp_length
## 1        84.92     C        C1                Chef  10+ years
## 2       777.23     D        D2         Postmaster   10+ years
## 3       180.69     D        D1      Administrative    6 years
## 4       146.51     D        D2       IT Supervisor  10+ years
## 5       731.78     C        C4            Mechanic  10+ years
## 6       192.45     C        C3        Director COE  10+ years
## 7        72.28     D        D1     Account Manager    4 years
## 8       203.79     C        C1  Assistant Director  10+ years
## 9       180.69     D        D1 Legal Assistant III  10+ years
## 10      206.44     C        C2                       < 1 year
##    home_ownership annual_inc verification_status  issue_d loan_status
## 1            RENT      55000        Not Verified Dec-2018     Current
## 2        MORTGAGE      90000     Source Verified Dec-2018     Current
## 3        MORTGAGE      59280     Source Verified Dec-2018     Current
## 4        MORTGAGE      92000     Source Verified Dec-2018     Current
## 5        MORTGAGE      57250        Not Verified Dec-2018     Current
## 6        MORTGAGE     152500        Not Verified Dec-2018     Current
## 7            RENT      51000     Source Verified Dec-2018     Current
## 8            RENT      65000     Source Verified Dec-2018     Current
## 9        MORTGAGE      53580     Source Verified Dec-2018     Current
## 10            OWN     300000        Not Verified Dec-2018     Current
##    pymnt_plan url desc            purpose                   title zip_code
## 1           n          debt_consolidation      Debt consolidation    109xx
## 2           n          debt_consolidation      Debt consolidation    713xx
## 3           n          debt_consolidation      Debt consolidation    490xx
## 4           n          debt_consolidation      Debt consolidation    985xx
## 5           n          debt_consolidation      Debt consolidation    212xx
## 6           n                 credit_card Credit card refinancing    461xx
## 7           n          debt_consolidation      Debt consolidation    606xx
## 8           n                 credit_card Credit card refinancing    460xx
## 9           n          debt_consolidation      Debt consolidation    327xx
## 10          n          debt_consolidation      Debt consolidation    068xx
##    addr_state   dti delinq_2yrs earliest_cr_line inq_last_6mths
## 1          NY 18.24           0         Apr-2001              1
## 2          LA 26.52           0         Jun-1987              0
## 3          MI 10.51           0         Apr-2011              0
## 4          WA 16.74           0         Feb-2006              0
## 5          MD 26.35           0         Dec-2000              0
## 6          IN 37.94           0         Sep-2002              3
## 7          IL   2.4           0         Nov-2004              1
## 8          IN  30.1           0         Nov-1997              0
## 9          FL 21.16           0         Aug-1998              1
## 10         CT 17.43           1         Apr-2002              1
##    mths_since_last_delinq mths_since_last_record open_acc pub_rec
## 1                                             45        9       1
## 2                      71                     75       13       1
## 3                                                       8       0
## 4                                                      10       0
## 5                                                      12       0
## 6                                                      18       0
## 7                                                       1       0
## 8                                                      19       0
## 9                      32                               8       0
## 10                     17                              38       0
##    revol_bal revol_util total_acc initial_list_status out_prncp
## 1       4341       10.3        34                   w   2386.02
## 2      12315       24.2        44                   w  29387.75
## 3       4599       19.1        13                   w   4787.21
## 4       5468       78.1        13                   w   3831.93
## 5        829        3.6        26                   w  29339.02
## 6      53854       48.1        44                   w   5302.50
## 7          0                    9                   w   1914.71
## 8      38476       69.3        37                   w   5864.01
## 9       8018       35.2        38                   w   4786.79
## 10     65950       49.8        58                   w   5730.20
##    out_prncp_inv total_pymnt total_pymnt_inv total_rec_prncp total_rec_int
## 1        2386.02      167.02          167.02          113.98         53.04
## 2       29387.75     1507.11         1507.11          612.25        894.86
## 3        4787.21      353.89          353.89          212.79        141.10
## 4        3831.93      286.71          286.71          168.07        118.64
## 5       29339.02     1423.21         1423.21          660.98        762.23
## 6        5302.50      377.95          377.95          247.50        130.45
## 7        1914.71      141.56          141.56           85.29         56.27
## 8        5864.01      201.53          201.53          135.99         65.54
## 9        4786.79      353.89          353.89          213.21        140.68
## 10       5730.20      405.64          405.64          269.80        135.84
##    total_rec_late_fee recoveries collection_recovery_fee last_pymnt_d
## 1                 0.0        0.0                     0.0     Feb-2019
## 2                 0.0        0.0                     0.0     Feb-2019
## 3                 0.0        0.0                     0.0     Feb-2019
## 4                 0.0        0.0                     0.0     Feb-2019
## 5                 0.0        0.0                     0.0     Feb-2019
## 6                 0.0        0.0                     0.0     Feb-2019
## 7                 0.0        0.0                     0.0     Feb-2019
## 8                 0.0        0.0                     0.0     Feb-2019
## 9                 0.0        0.0                     0.0     Feb-2019
## 10                0.0        0.0                     0.0     Feb-2019
##    last_pymnt_amnt next_pymnt_d last_credit_pull_d
## 1            84.92     Mar-2019           Feb-2019
## 2           777.23     Mar-2019           Feb-2019
## 3           180.69     Mar-2019           Feb-2019
## 4           146.51     Mar-2019           Feb-2019
## 5           731.78     Mar-2019           Feb-2019
## 6           192.45     Mar-2019           Feb-2019
## 7            72.28     Mar-2019           Feb-2019
## 8           208.31     Mar-2019           Feb-2019
## 9           180.69     Mar-2019           Feb-2019
## 10          206.44     Mar-2019           Feb-2019
##    collections_12_mths_ex_med mths_since_last_major_derog policy_code
## 1                           0                                       1
## 2                           0                                       1
## 3                           0                                       1
## 4                           0                                       1
## 5                           0                                       1
## 6                           0                                       1
## 7                           0                                       1
## 8                           0                                       1
## 9                           0                          45           1
## 10                          0                                       1
##    application_type annual_inc_joint dti_joint verification_status_joint
## 1        Individual                                                     
## 2        Individual                                                     
## 3        Individual                                                     
## 4        Individual                                                     
## 5        Individual                                                     
## 6        Individual                                                     
## 7        Individual                                                     
## 8        Individual                                                     
## 9        Individual                                                     
## 10       Individual                                                     
##    acc_now_delinq tot_coll_amt tot_cur_bal open_acc_6m open_act_il
## 1               0            0       16901           2           2
## 2               0         1208      321915           4           4
## 3               0            0      110299           0           1
## 4               0          686      305049           1           5
## 5               0            0      116007           3           5
## 6               0            0      685749           1           7
## 7               0            0         854           0           0
## 8               0            0       91535           0           5
## 9               0            0       41882           5           2
## 10              0            0      349502           1           4
##    open_il_12m open_il_24m mths_since_rcnt_il total_bal_il il_util
## 1            1           2                  2        12560      69
## 2            2           3                  3        87153      88
## 3            0           2                 14         7150      72
## 4            3           5                  5        30683      68
## 5            3           5                  4        28845      89
## 6            2           3                  4       131524      72
## 7            2           3                  7            0        
## 8            0           1                 23        53059      87
## 9            5           5                  3        33864      98
## 10           1           3                  7        39961      45
##    open_rv_12m open_rv_24m max_bal_bc all_util total_rev_hi_lim inq_fi
## 1            2           7       2137       28            42000      1
## 2            4           5        998       57            50800      2
## 3            0           2          0       35            24100      1
## 4            0           0       3761       70             7000      2
## 5            2           4        516       54            23100      1
## 6            1           4      17584       58           111900      2
## 7            0           1          0      100                0      0
## 8            0           2       9413       74            55500      1
## 9            1           6       3132       73            22800      2
## 10           1          12      15926       48           132500      2
##    total_cu_tl inq_last_12m acc_open_past_24mths avg_cur_bal
## 1           11            2                    9        1878
## 2           15            2                   10       24763
## 3            5            0                    4       18383
## 4            4            3                    5       30505
## 5            0            0                    9        9667
## 6            4            6                    8       40338
## 7            0            1                    4         854
## 8            2            0                    3        5085
## 9            1            4                   12        5235
## 10           2            2                   15        9197
##    bc_open_to_buy bc_util chargeoff_within_12_mths delinq_amnt
## 1           34360     5.9                        0           0
## 2           13761     8.3                        0           0
## 3           13800       0                        0           0
## 4            1239    75.2                        0           0
## 5            8471     8.9                        0           0
## 6           23746      64                        0           0
## 7                                                0           0
## 8            3034    90.8                        0           0
## 9           13786    35.9                        0           0
## 10          38683    60.6                        0           0
##    mo_sin_old_il_acct mo_sin_old_rev_tl_op mo_sin_rcnt_rev_tl_op
## 1                 140                  212                     1
## 2                 163                  378                     4
## 3                  87                   92                    15
## 4                  62                  154                    64
## 5                  53                  216                     2
## 6                 195                  176                    10
## 7                 169                   40                    23
## 8                 169                  253                    13
## 9                 145                  244                     6
## 10                166                  200                     4
##    mo_sin_rcnt_tl mort_acc mths_since_recent_bc mths_since_recent_bc_dlq
## 1               1        0                    1                         
## 2               3        3                    4                         
## 3              14        2                   77                         
## 4               5        3                   64                         
## 5               2        2                    2                         
## 6               4        6                   20                         
## 7               7        0                                              
## 8              13        1                   14                         
## 9               3        3                    6                       33
## 10              4        1                    4                         
##    mths_since_recent_inq mths_since_recent_revol_delinq
## 1                      2                               
## 2                      4                               
## 3                     14                               
## 4                      5                               
## 5                     13                               
## 6                      3                               
## 7                      1                               
## 8                     13                               
## 9                      2                             32
## 10                     4                             17
##    num_accts_ever_120_pd num_actv_bc_tl num_actv_rev_tl num_bc_sats
## 1                      0              2               5           3
## 2                      0              2               4           4
## 3                      0              0               3           3
## 4                      0              1               2           1
## 5                      0              2               2           3
## 6                      0              4               6           6
## 7                      0              0               0           0
## 8                      0              7              12           8
## 9                      2              4               5           5
## 10                     0             16              20          19
##    num_bc_tl num_il_tl num_op_rev_tl num_rev_accts num_rev_tl_bal_gt_0
## 1          3        16             7            18                   5
## 2          9        27             8            14                   4
## 3          3         4             6             7                   3
## 4          2         7             2             3                   2
## 5          8         9             6            15                   2
## 6         10        23             9            15                   7
## 7          3         5             0             3                   0
## 8         10        15            14            20                  12
## 9         10        20             6            15                   5
## 10        26         9            33            48                  20
##    num_sats num_tl_120dpd_2m num_tl_30dpd num_tl_90g_dpd_24m
## 1         9                0            0                  0
## 2        13                0            0                  0
## 3         8                0            0                  0
## 4        10                0            0                  0
## 5        12                0            0                  0
## 6        18                0            0                  0
## 7         1                0            0                  0
## 8        19                0            0                  0
## 9         8                0            0                  0
## 10       38                0            0                  0
##    num_tl_op_past_12m pct_tl_nvr_dlq percent_bc_gt_75 pub_rec_bankruptcies
## 1                   3            100                0                    1
## 2                   6             95                0                    1
## 3                   0            100                0                    0
## 4                   3            100              100                    0
## 5                   5           92.3                0                    0
## 6                   4            100               60                    0
## 7                   2            100                                     0
## 8                   0            100             85.7                    0
## 9                   6           78.9               60                    0
## 10                  2            100             26.3                    0
##    tax_liens tot_hi_cred_lim total_bal_ex_mort total_bc_limit
## 1          0           60124             16901          36500
## 2          0          372872             99468          15000
## 3          0          136927             11749          13800
## 4          0          385183             36151           5000
## 5          0          157548             29674           9300
## 6          0          831687            185378          65900
## 7          0             854               854              0
## 8          0          117242             91535          33100
## 9          0           57426             41882          21500
## 10         0          477390            105911          98300
##    total_il_high_credit_limit revol_bal_joint sec_app_earliest_cr_line
## 1                       18124                                         
## 2                       94072                                         
## 3                       10000                                         
## 4                       44984                                         
## 5                       32332                                         
## 6                      203159                                         
## 7                           0                                         
## 8                       61742                                         
## 9                       34626                                         
## 10                      89600                                         
##    sec_app_inq_last_6mths sec_app_mort_acc sec_app_open_acc
## 1                                                          
## 2                                                          
## 3                                                          
## 4                                                          
## 5                                                          
## 6                                                          
## 7                                                          
## 8                                                          
## 9                                                          
## 10                                                         
##    sec_app_revol_util sec_app_open_act_il sec_app_num_rev_accts
## 1                                                              
## 2                                                              
## 3                                                              
## 4                                                              
## 5                                                              
## 6                                                              
## 7                                                              
## 8                                                              
## 9                                                              
## 10                                                             
##    sec_app_chargeoff_within_12_mths sec_app_collections_12_mths_ex_med
## 1                                                                     
## 2                                                                     
## 3                                                                     
## 4                                                                     
## 5                                                                     
## 6                                                                     
## 7                                                                     
## 8                                                                     
## 9                                                                     
## 10                                                                    
##    sec_app_mths_since_last_major_derog hardship_flag hardship_type
## 1                                                  N              
## 2                                                  N              
## 3                                                  N              
## 4                                                  N              
## 5                                                  N              
## 6                                                  N              
## 7                                                  N              
## 8                                                  N              
## 9                                                  N              
## 10                                                 N              
##    hardship_reason hardship_status deferral_term hardship_amount
## 1                                                               
## 2                                                               
## 3                                                               
## 4                                                               
## 5                                                               
## 6                                                               
## 7                                                               
## 8                                                               
## 9                                                               
## 10                                                              
##    hardship_start_date hardship_end_date payment_plan_start_date
## 1                                                               
## 2                                                               
## 3                                                               
## 4                                                               
## 5                                                               
## 6                                                               
## 7                                                               
## 8                                                               
## 9                                                               
## 10                                                              
##    hardship_length hardship_dpd hardship_loan_status
## 1                                                   
## 2                                                   
## 3                                                   
## 4                                                   
## 5                                                   
## 6                                                   
## 7                                                   
## 8                                                   
## 9                                                   
## 10                                                  
##    orig_projected_additional_accrued_interest
## 1                                            
## 2                                            
## 3                                            
## 4                                            
## 5                                            
## 6                                            
## 7                                            
## 8                                            
## 9                                            
## 10                                           
##    hardship_payoff_balance_amount hardship_last_payment_amount
## 1                                                             
## 2                                                             
## 3                                                             
## 4                                                             
## 5                                                             
## 6                                                             
## 7                                                             
## 8                                                             
## 9                                                             
## 10                                                            
##    disbursement_method debt_settlement_flag debt_settlement_flag_date
## 1                 Cash                    N                          
## 2                 Cash                    N                          
## 3                 Cash                    N                          
## 4                 Cash                    N                          
## 5                 Cash                    N                          
## 6                 Cash                    N                          
## 7                 Cash                    N                          
## 8            DirectPay                    N                          
## 9                 Cash                    N                          
## 10                Cash                    N                          
##    settlement_status settlement_date settlement_amount
## 1                                                     
## 2                                                     
## 3                                                     
## 4                                                     
## 5                                                     
## 6                                                     
## 7                                                     
## 8                                                     
## 9                                                     
## 10                                                    
##    settlement_percentage settlement_term
## 1                                       
## 2                                       
## 3                                       
## 4                                       
## 5                                       
## 6                                       
## 7                                       
## 8                                       
## 9                                       
## 10

或者更方便地,调用 dplyr 包中的 tbl 函数,查看 loan 表中的各个字段的部分记录,但是为了后续的分析,我们这里加载 tidyverse 全家桶

library(tidyverse)

tidyverse 集合了以 dplyr 和 ggplot2 包为核心的数据操作和可视化等功能

tbl(con, "loan")
## # Source:   table<loan> [?? x 145]
## # Database: sqlite 3.29.0
## #   [F:\lend-loan\lending-club-loan-data\lend_loan.sqlite]
##   id    member_id loan_amnt funded_amnt funded_amnt_inv term  int_rate
##   <chr> <chr>     <chr>     <chr>       <chr>           <chr> <chr>   
## 1 ""    ""        2500      2500        2500            " 36~ 13.56   
## 2 ""    ""        30000     30000       30000           " 60~ 18.94   
## 3 ""    ""        5000      5000        5000            " 36~ 17.97   
## 4 ""    ""        4000      4000        4000            " 36~ 18.94   
## 5 ""    ""        30000     30000       30000           " 60~ 16.14   
## 6 ""    ""        5550      5550        5550            " 36~ 15.02   
## # ... with more rows, and 138 more variables: installment <chr>,
## #   grade <chr>, sub_grade <chr>, emp_title <chr>, emp_length <chr>,
## #   home_ownership <chr>, annual_inc <chr>, verification_status <chr>,
## #   issue_d <chr>, loan_status <chr>, pymnt_plan <chr>, url <chr>,
## #   desc <chr>, purpose <chr>, title <chr>, zip_code <chr>,
## #   addr_state <chr>, dti <chr>, delinq_2yrs <chr>,
## #   earliest_cr_line <chr>, inq_last_6mths <chr>,
## #   mths_since_last_delinq <chr>, mths_since_last_record <chr>,
## #   open_acc <chr>, pub_rec <chr>, revol_bal <chr>, revol_util <chr>,
## #   total_acc <chr>, initial_list_status <chr>, out_prncp <chr>,
## #   out_prncp_inv <chr>, total_pymnt <chr>, total_pymnt_inv <chr>,
## #   total_rec_prncp <chr>, total_rec_int <chr>, total_rec_late_fee <chr>,
## #   recoveries <chr>, collection_recovery_fee <chr>, last_pymnt_d <chr>,
## #   last_pymnt_amnt <chr>, next_pymnt_d <chr>, last_credit_pull_d <chr>,
## #   collections_12_mths_ex_med <chr>, mths_since_last_major_derog <chr>,
## #   policy_code <chr>, application_type <chr>, annual_inc_joint <chr>,
## #   dti_joint <chr>, verification_status_joint <chr>,
## #   acc_now_delinq <chr>, tot_coll_amt <chr>, tot_cur_bal <chr>,
## #   open_acc_6m <chr>, open_act_il <chr>, open_il_12m <chr>,
## #   open_il_24m <chr>, mths_since_rcnt_il <chr>, total_bal_il <chr>,
## #   il_util <chr>, open_rv_12m <chr>, open_rv_24m <chr>, max_bal_bc <chr>,
## #   all_util <chr>, total_rev_hi_lim <chr>, inq_fi <chr>,
## #   total_cu_tl <chr>, inq_last_12m <chr>, acc_open_past_24mths <chr>,
## #   avg_cur_bal <chr>, bc_open_to_buy <chr>, bc_util <chr>,
## #   chargeoff_within_12_mths <chr>, delinq_amnt <chr>,
## #   mo_sin_old_il_acct <chr>, mo_sin_old_rev_tl_op <chr>,
## #   mo_sin_rcnt_rev_tl_op <chr>, mo_sin_rcnt_tl <chr>, mort_acc <chr>,
## #   mths_since_recent_bc <chr>, mths_since_recent_bc_dlq <chr>,
## #   mths_since_recent_inq <chr>, mths_since_recent_revol_delinq <chr>,
## #   num_accts_ever_120_pd <chr>, num_actv_bc_tl <chr>,
## #   num_actv_rev_tl <chr>, num_bc_sats <chr>, num_bc_tl <chr>,
## #   num_il_tl <chr>, num_op_rev_tl <chr>, num_rev_accts <chr>,
## #   num_rev_tl_bal_gt_0 <chr>, num_sats <chr>, num_tl_120dpd_2m <chr>,
## #   num_tl_30dpd <chr>, num_tl_90g_dpd_24m <chr>,
## #   num_tl_op_past_12m <chr>, pct_tl_nvr_dlq <chr>,
## #   percent_bc_gt_75 <chr>, pub_rec_bankruptcies <chr>, tax_liens <chr>,
## #   ...

或者调用 glimpse 函数,对数据集一瞥

tbl(con, "loan") %>% 
  glimpse()
## Observations: ??
## Variables: 145
## Database: sqlite 3.29.0 [F:\lend-loan\lending-club-loan-data\lend_loan.sqlite]
## $ id                                         <chr> "", "", "", "", "",...
## $ member_id                                  <chr> "", "", "", "", "",...
## $ loan_amnt                                  <chr> "2500", "30000", "5...
## $ funded_amnt                                <chr> "2500", "30000", "5...
## $ funded_amnt_inv                            <chr> "2500", "30000", "5...
## $ term                                       <chr> " 36 months", " 60 ...
## $ int_rate                                   <chr> "13.56", "18.94", "...
## $ installment                                <chr> "84.92", "777.23", ...
## $ grade                                      <chr> "C", "D", "D", "D",...
## $ sub_grade                                  <chr> "C1", "D2", "D1", "...
## $ emp_title                                  <chr> "Chef", "Postmaster...
## $ emp_length                                 <chr> "10+ years", "10+ y...
## $ home_ownership                             <chr> "RENT", "MORTGAGE",...
## $ annual_inc                                 <chr> "55000", "90000", "...
## $ verification_status                        <chr> "Not Verified", "So...
## $ issue_d                                    <chr> "Dec-2018", "Dec-20...
## $ loan_status                                <chr> "Current", "Current...
## $ pymnt_plan                                 <chr> "n", "n", "n", "n",...
## $ url                                        <chr> "", "", "", "", "",...
## $ desc                                       <chr> "", "", "", "", "",...
## $ purpose                                    <chr> "debt_consolidation...
## $ title                                      <chr> "Debt consolidation...
## $ zip_code                                   <chr> "109xx", "713xx", "...
## $ addr_state                                 <chr> "NY", "LA", "MI", "...
## $ dti                                        <chr> "18.24", "26.52", "...
## $ delinq_2yrs                                <chr> "0", "0", "0", "0",...
## $ earliest_cr_line                           <chr> "Apr-2001", "Jun-19...
## $ inq_last_6mths                             <chr> "1", "0", "0", "0",...
## $ mths_since_last_delinq                     <chr> "", "71", "", "", "...
## $ mths_since_last_record                     <chr> "45", "75", "", "",...
## $ open_acc                                   <chr> "9", "13", "8", "10...
## $ pub_rec                                    <chr> "1", "1", "0", "0",...
## $ revol_bal                                  <chr> "4341", "12315", "4...
## $ revol_util                                 <chr> "10.3", "24.2", "19...
## $ total_acc                                  <chr> "34", "44", "13", "...
## $ initial_list_status                        <chr> "w", "w", "w", "w",...
## $ out_prncp                                  <chr> "2386.02", "29387.7...
## $ out_prncp_inv                              <chr> "2386.02", "29387.7...
## $ total_pymnt                                <chr> "167.02", "1507.11"...
## $ total_pymnt_inv                            <chr> "167.02", "1507.11"...
## $ total_rec_prncp                            <chr> "113.98", "612.25",...
## $ total_rec_int                              <chr> "53.04", "894.86", ...
## $ total_rec_late_fee                         <chr> "0.0", "0.0", "0.0"...
## $ recoveries                                 <chr> "0.0", "0.0", "0.0"...
## $ collection_recovery_fee                    <chr> "0.0", "0.0", "0.0"...
## $ last_pymnt_d                               <chr> "Feb-2019", "Feb-20...
## $ last_pymnt_amnt                            <chr> "84.92", "777.23", ...
## $ next_pymnt_d                               <chr> "Mar-2019", "Mar-20...
## $ last_credit_pull_d                         <chr> "Feb-2019", "Feb-20...
## $ collections_12_mths_ex_med                 <chr> "0", "0", "0", "0",...
## $ mths_since_last_major_derog                <chr> "", "", "", "", "",...
## $ policy_code                                <chr> "1", "1", "1", "1",...
## $ application_type                           <chr> "Individual", "Indi...
## $ annual_inc_joint                           <chr> "", "", "", "", "",...
## $ dti_joint                                  <chr> "", "", "", "", "",...
## $ verification_status_joint                  <chr> "", "", "", "", "",...
## $ acc_now_delinq                             <chr> "0", "0", "0", "0",...
## $ tot_coll_amt                               <chr> "0", "1208", "0", "...
## $ tot_cur_bal                                <chr> "16901", "321915", ...
## $ open_acc_6m                                <chr> "2", "4", "0", "1",...
## $ open_act_il                                <chr> "2", "4", "1", "5",...
## $ open_il_12m                                <chr> "1", "2", "0", "3",...
## $ open_il_24m                                <chr> "2", "3", "2", "5",...
## $ mths_since_rcnt_il                         <chr> "2", "3", "14", "5"...
## $ total_bal_il                               <chr> "12560", "87153", "...
## $ il_util                                    <chr> "69", "88", "72", "...
## $ open_rv_12m                                <chr> "2", "4", "0", "0",...
## $ open_rv_24m                                <chr> "7", "5", "2", "0",...
## $ max_bal_bc                                 <chr> "2137", "998", "0",...
## $ all_util                                   <chr> "28", "57", "35", "...
## $ total_rev_hi_lim                           <chr> "42000", "50800", "...
## $ inq_fi                                     <chr> "1", "2", "1", "2",...
## $ total_cu_tl                                <chr> "11", "15", "5", "4...
## $ inq_last_12m                               <chr> "2", "2", "0", "3",...
## $ acc_open_past_24mths                       <chr> "9", "10", "4", "5"...
## $ avg_cur_bal                                <chr> "1878", "24763", "1...
## $ bc_open_to_buy                             <chr> "34360", "13761", "...
## $ bc_util                                    <chr> "5.9", "8.3", "0", ...
## $ chargeoff_within_12_mths                   <chr> "0", "0", "0", "0",...
## $ delinq_amnt                                <chr> "0", "0", "0", "0",...
## $ mo_sin_old_il_acct                         <chr> "140", "163", "87",...
## $ mo_sin_old_rev_tl_op                       <chr> "212", "378", "92",...
## $ mo_sin_rcnt_rev_tl_op                      <chr> "1", "4", "15", "64...
## $ mo_sin_rcnt_tl                             <chr> "1", "3", "14", "5"...
## $ mort_acc                                   <chr> "0", "3", "2", "3",...
## $ mths_since_recent_bc                       <chr> "1", "4", "77", "64...
## $ mths_since_recent_bc_dlq                   <chr> "", "", "", "", "",...
## $ mths_since_recent_inq                      <chr> "2", "4", "14", "5"...
## $ mths_since_recent_revol_delinq             <chr> "", "", "", "", "",...
## $ num_accts_ever_120_pd                      <chr> "0", "0", "0", "0",...
## $ num_actv_bc_tl                             <chr> "2", "2", "0", "1",...
## $ num_actv_rev_tl                            <chr> "5", "4", "3", "2",...
## $ num_bc_sats                                <chr> "3", "4", "3", "1",...
## $ num_bc_tl                                  <chr> "3", "9", "3", "2",...
## $ num_il_tl                                  <chr> "16", "27", "4", "7...
## $ num_op_rev_tl                              <chr> "7", "8", "6", "2",...
## $ num_rev_accts                              <chr> "18", "14", "7", "3...
## $ num_rev_tl_bal_gt_0                        <chr> "5", "4", "3", "2",...
## $ num_sats                                   <chr> "9", "13", "8", "10...
## $ num_tl_120dpd_2m                           <chr> "0", "0", "0", "0",...
## $ num_tl_30dpd                               <chr> "0", "0", "0", "0",...
## $ num_tl_90g_dpd_24m                         <chr> "0", "0", "0", "0",...
## $ num_tl_op_past_12m                         <chr> "3", "6", "0", "3",...
## $ pct_tl_nvr_dlq                             <chr> "100", "95", "100",...
## $ percent_bc_gt_75                           <chr> "0", "0", "0", "100...
## $ pub_rec_bankruptcies                       <chr> "1", "1", "0", "0",...
## $ tax_liens                                  <chr> "0", "0", "0", "0",...
## $ tot_hi_cred_lim                            <chr> "60124", "372872", ...
## $ total_bal_ex_mort                          <chr> "16901", "99468", "...
## $ total_bc_limit                             <chr> "36500", "15000", "...
## $ total_il_high_credit_limit                 <chr> "18124", "94072", "...
## $ revol_bal_joint                            <chr> "", "", "", "", "",...
## $ sec_app_earliest_cr_line                   <chr> "", "", "", "", "",...
## $ sec_app_inq_last_6mths                     <chr> "", "", "", "", "",...
## $ sec_app_mort_acc                           <chr> "", "", "", "", "",...
## $ sec_app_open_acc                           <chr> "", "", "", "", "",...
## $ sec_app_revol_util                         <chr> "", "", "", "", "",...
## $ sec_app_open_act_il                        <chr> "", "", "", "", "",...
## $ sec_app_num_rev_accts                      <chr> "", "", "", "", "",...
## $ sec_app_chargeoff_within_12_mths           <chr> "", "", "", "", "",...
## $ sec_app_collections_12_mths_ex_med         <chr> "", "", "", "", "",...
## $ sec_app_mths_since_last_major_derog        <chr> "", "", "", "", "",...
## $ hardship_flag                              <chr> "N", "N", "N", "N",...
## $ hardship_type                              <chr> "", "", "", "", "",...
## $ hardship_reason                            <chr> "", "", "", "", "",...
## $ hardship_status                            <chr> "", "", "", "", "",...
## $ deferral_term                              <chr> "", "", "", "", "",...
## $ hardship_amount                            <chr> "", "", "", "", "",...
## $ hardship_start_date                        <chr> "", "", "", "", "",...
## $ hardship_end_date                          <chr> "", "", "", "", "",...
## $ payment_plan_start_date                    <chr> "", "", "", "", "",...
## $ hardship_length                            <chr> "", "", "", "", "",...
## $ hardship_dpd                               <chr> "", "", "", "", "",...
## $ hardship_loan_status                       <chr> "", "", "", "", "",...
## $ orig_projected_additional_accrued_interest <chr> "", "", "", "", "",...
## $ hardship_payoff_balance_amount             <chr> "", "", "", "", "",...
## $ hardship_last_payment_amount               <chr> "", "", "", "", "",...
## $ disbursement_method                        <chr> "Cash", "Cash", "Ca...
## $ debt_settlement_flag                       <chr> "N", "N", "N", "N",...
## $ debt_settlement_flag_date                  <chr> "", "", "", "", "",...
## $ settlement_status                          <chr> "", "", "", "", "",...
## $ settlement_date                            <chr> "", "", "", "", "",...
## $ settlement_amount                          <chr> "", "", "", "", "",...
## $ settlement_percentage                      <chr> "", "", "", "", "",...
## $ settlement_term                            <chr> "", "", "", "", "",...

借贷金额 loan_amnt 按申请类型 application_type 分组求和

dbGetQuery(con, "
SELECT application_type, SUM(loan_amnt) as sum_loan_amnt
FROM loan 
GROUP BY application_type
LIMIT 10
")
##   application_type sum_loan_amnt
## 1       Individual   31646288600
## 2        Joint App    2369827325

在上述代码中,SQL 语句其实被当作字符串,我们其实可以用 paste 一段一段拼接,

dbGetQuery(con, paste(
  "SELECT application_type, SUM(loan_amnt) as sum_loan_amnt",
  "FROM loan",
  "GROUP BY application_type",
  "LIMIT 10"
))

平台上借贷数额最大的一笔,按申请类型分组

SELECT id, application_type, MAX(loan_amnt) as max_loan_amnt
FROM loan 
GROUP BY application_type
ORDER BY max_loan_amnt DESC
LIMIT 10

输出结果

data_output1
##   id application_type max_loan_amnt
## 1          Individual          9975
## 2           Joint App          9975
# 
dbGetQuery(con, paste(
  "SELECT id, application_type, loan_amnt",
  "FROM loan",
  "GROUP BY application_type",
  "ORDER BY loan_amnt DESC",
  "LIMIT 10"
))
##   id application_type loan_amnt
## 1           Joint App      3500
## 2          Individual      2500

或者也可以直接使用 SQL 代码块,支持语法高亮,这是 R Markdown 的独特,将查询的结果返回到 R 语言环境,保存为变量 data_output

SELECT application_type, SUM(loan_amnt) as sum_loan_amnt
FROM loan 
GROUP BY application_type
LIMIT 10

查看结果

data_output2
##   application_type sum_loan_amnt
## 1       Individual   31646288600
## 2        Joint App    2369827325

或者使用 tidyverse (此处主要是使用 dplyr 的数据操作)提供的方式

tbl(con, "loan") %>% 
  group_by(application_type) %>% 
  summarise(total_amnt = sum(loan_amnt, na.rm = TRUE))
## # Source:   lazy query [?? x 2]
## # Database: sqlite 3.29.0
## #   [F:\lend-loan\lending-club-loan-data\lend_loan.sqlite]
##   application_type total_amnt 
##   <chr>            <int64>    
## 1 Individual       31646288600
## 2 Joint App         2369827325

在 SQL 中,缺失值默认不进入计算,在上述语句中,sum(loan_amnt, na.rm = TRUE),如果不显式地指定 na.rm = TRUE 程序运行结束会有伴随一个警告提醒,并且在每次连接中出现一次。所以,我们推荐还是应该指定缺失值得处理方式

按照 application_type 分组计算 total_amnt 的平均值,分组统计 application_type 的数量

tbl(con, "loan") %>% 
  group_by(application_type) %>% 
  summarise(mean_amnt = mean(loan_amnt, na.rm = TRUE), n_count = n())
## # Source:   lazy query [?? x 3]
## # Database: sqlite 3.29.0
## #   [F:\lend-loan\lending-club-loan-data\lend_loan.sqlite]
##   application_type mean_amnt n_count
##   <chr>                <dbl>   <int>
## 1 Individual          14788. 2139958
## 2 Joint App           19632.  120710

我还希望知道各个字段的缺失情况,比如我想查看字段 emp_title 的情况,即借贷人的职务信息

tbl(con, "loan") %>% 
  group_by(emp_title) %>% # 按照职务分组
  summarise(n_count = n()) %>% # 统计每类人的数量
  arrange(desc(n_count)) # 降序排列
## # Source:     lazy query [?? x 2]
## # Database:   sqlite 3.29.0
## #   [F:\lend-loan\lending-club-loan-data\lend_loan.sqlite]
## # Ordered by: desc(n_count)
##   emp_title        n_count
##   <chr>              <int>
## 1 ""                166931
## 2 Teacher            38824
## 3 Manager            34298
## 4 Owner              21977
## 5 Registered Nurse   15867
## 6 Driver             14753
## # ... with more rows

我们注意到竟有 166931 人的职务信息缺失,占比 0.0738414。篇幅所限,这里看到的只是部分,而且已经按照人数由高到低排列,实际上还有一些职务是一样的但是书写不规范,所以这里应该有类别合并。

下面用 tidyverse 方式统计所有字段的缺失比例,缺失值在该数据库中以 "" (空字符)表示,而不是常见的 NA。举个小例子,统计数据集 airquality 每个字段的缺失比例

apply(airquality, 2, function(x) {
  mean(is.na(x))
})
##      Ozone    Solar.R       Wind       Temp      Month        Day 
## 0.24183007 0.04575163 0.00000000 0.00000000 0.00000000 0.00000000
colMeans(is.na(airquality))
##      Ozone    Solar.R       Wind       Temp      Month        Day 
## 0.24183007 0.04575163 0.00000000 0.00000000 0.00000000 0.00000000
purrr::map(airquality, function(x) mean(is.na(x))) # 返回列表
## $Ozone
## [1] 0.2418301
## 
## $Solar.R
## [1] 0.04575163
## 
## $Wind
## [1] 0
## 
## $Temp
## [1] 0
## 
## $Month
## [1] 0
## 
## $Day
## [1] 0
purrr::map_dbl(airquality, function(x) mean(is.na(x))) # 返回向量
##      Ozone    Solar.R       Wind       Temp      Month        Day 
## 0.24183007 0.04575163 0.00000000 0.00000000 0.00000000 0.00000000
dplyr::summarise_all(airquality, ~ mean(is.na(.)))
##       Ozone    Solar.R Wind Temp Month Day
## 1 0.2418301 0.04575163    0    0     0   0
purrr::map_dbl(airquality, ~ mean(is.na(.)))
##      Ozone    Solar.R       Wind       Temp      Month        Day 
## 0.24183007 0.04575163 0.00000000 0.00000000 0.00000000 0.00000000

下面正式统计借贷数据集中各个字段的缺失比例,下面变形的一步使用了 tidyr 包的 gather 函数,更多数据变形操作见 https://d.cosx.org/d/420920/,特别是 tidyr 包的 pivot_longerpivot_wider 函数,因为在 tidyr 1.0.0 版本后,函数 gatherspread 已经停止开发了。

missing_ratio_db <- tbl(con, "loan") %>%
  summarise_all(~ mean(. == "", na.rm = TRUE)) %>% # 统计每个字段的缺失比例
  collect() %>% 
  gather(key = "colnames", value = "missing_ratio") %>% # 数据变形 reshape
  filter(missing_ratio > 0.95) %>% # 缺失比例大于 95% 的行
  arrange(desc(missing_ratio)) # 缺失比例降序排序
missing_ratio_db %>% DT::datatable() # DT 展示结果

各个字段的缺失比例以条形图的方式展示出来

missing_ratio_db %>% 
  ggplot(aes(x = reorder(colnames, missing_ratio), y = missing_ratio)) +
  geom_col() +
  coord_flip()

缺失比例非常高的,比如 0.95 以上就移除该字段,实际中可能还需要根据业务问题(指标含义)来决定。

搞清楚所需要的数据集后,可以将数据从数据库拉到本地,加载到内存做后续的建模,比如我想要借贷数额 loan_amnt 和借期 term,篇幅所限,这里仅显示 10 条记录

tbl(con, "loan") %>% 
  group_by(application_type) %>% # 按申请类型分组
  filter(loan_amnt > 10000) %>% # 借贷金额大于1万
  select(loan_amnt, term) %>% # 借贷金额和借期
  collect() %>% # 执行 SQL 查询返回结果
  sample_n(5) # 每组抽取 5 个样本
## # A tibble: 10 x 3
## # Groups:   application_type [2]
##   application_type loan_amnt term        
##   <chr>            <chr>     <chr>       
## 1 Individual       6000      " 36 months"
## 2 Individual       7200      " 36 months"
## 3 Individual       15000     " 36 months"
## 4 Individual       22000     " 60 months"
## 5 Individual       30000     " 60 months"
## 6 Joint App        6500      " 36 months"
## # ... with 4 more rows

sample_n 函数只对 data.frame 类型的数据随机抽样,所以它必须在 collect 函数之后调用。

1.2 探索性分析

回答一些有意义的问题,进一步熟悉数据集,一方面直接回答业务问题,一方面为预测建模做准备。

  1. 数据查询
  2. 数据探索性分析
  3. 数据建模
  4. 模型评估和选择
  5. 模型部署和维护
  6. 模型优化
library(DataExplorer)

计算所有字段的缺失情况的操作很费时间和内存,随便选取前几个字段统计一下

emp_title_db <- tbl(con, "loan") %>% 
  select(emp_title) %>% 
  group_by(emp_title) %>% # 按照职务分组
  summarise(n_count = n()) %>% # 统计每类人的数量
  arrange(desc(n_count)) %>% # 降序排列
  collect()

以条形图的方式展示数据

emp_title_db %>% 
  head(20) %>% 
  ggplot(aes(x = emp_title, y = n_count)) +
  geom_col() +
  coord_flip()

从这个图,不难看出有些类别其实就是一类,如 Owner 和 owner, Manager 和 manager 等,还有的可能是一类,我们需要思考是否需要合并?再从整体上看一下,职位中包含 Manager 的有多少?

emp_title_db %>% 
  filter(grepl("(M|m)anager", emp_title)) %>% 
  count()
## # A tibble: 1 x 1
##       n
##   <int>
## 1 43631

43631 个职位!我们知道不同 Manager 之间是有区别的,所以除了大小写导致的外,其它都不该合并。所以接下来,先统一将职位名称转为小写,然后分组计数

emp_title_db <- tbl(con, "loan") %>% 
  select(emp_title) %>% 
  mutate(emp_title = tolower(emp_title)) %>% # 将职务名称都转为小写
  group_by(emp_title) %>% 
  summarise(n_count = n()) %>% 
  arrange(desc(n_count)) %>% 
  collect()

然后重画条形图,职务按人数重新排序

emp_title_db %>% 
  head(20) %>% 
  ggplot(aes(x = reorder(emp_title, n_count), y = n_count)) +
  geom_col() +
  coord_flip()

将职务缺失的部分去掉

emp_title_db %>% 
  head(20) %>% 
  filter(emp_title != "") %>% 
  ggplot(aes(x = reorder(emp_title, n_count), y = n_count)) +
  geom_col() +
  coord_flip()

下面重点关注需要预测的借贷状态,loan_status

# HAVING 与 WHERE 不同,它在 GROUP BY 之后,它可以使用聚合操作比如 COUNT(*)
# dbGetQuery(con, paste(
#   "SELECT loan_status, COUNT(*) ",
#   "FROM loan",
#   "GROUP BY loan_status",
#   "HAVING COUNT(*) > 1",
#   "LIMIT 10"
# ))
# 是否违约,贷款状态
loan_status_db <- dbGetQuery(con, paste(
  "SELECT loan_status, COUNT(*) AS num, SUM(loan_amnt) AS sum_loan_amnt, AVG(loan_amnt) AS avg_loan_amnt",
  "FROM loan",
  "GROUP BY loan_status",
  "ORDER BY num DESC"
))

共有 9 个贷款状态,分别是 Fully Paid (全额付款)、Current、Charged Off(核销:不良贷款)、Late (16-30 days) 逾期 15 天至 1 个月、Late (31-120 days) 逾期 1-4 个月、In Grace Period、Does not meet the credit policy. Status:Fully Paid(不满足贷款政策)、Does not meet the credit policy. Status:Charged Off(不满足贷款政策)、Default(违约),num 每种状态的笔数。 sum_loan_amnt 每种状态总共贷出去的总额,avg_loan_amnt 每笔贷款平均贷出的钱。

loan_status_db[1:6,]
##          loan_status     num sum_loan_amnt avg_loan_amnt
## 1         Fully Paid 1041952   14725380075      14132.49
## 2            Current  919695   14609030925      15884.65
## 3        Charged Off  261655    4068468200      15548.98
## 4 Late (31-120 days)   21897     366014950      16715.30
## 5    In Grace Period    8952     158415150      17696.06
## 6  Late (16-30 days)    3737      63466375      16983.24

画个图感受一下,每类贷款状态对应的贷款笔数

ggplot(loan_status_db, aes(x = reorder(loan_status, num), y = num)) +
  geom_col() +
  coord_flip()

用比例图可能更好些,每类贷款状态对应的百分比

ggplot(loan_status_db, aes(x = reorder(loan_status, num), y = num/(sum(num)))) +
  geom_col() +
  coord_flip()

# 风玫瑰图是饼图的改进版
# ggplot(loan_status_db, aes(x = reorder(loan_status, num), y = num/(sum(num)))) +
#   geom_col() +
#   coord_polar()

每类贷款状态对应的贷款金额的分布,将贷款状态 loan_status 和贷款数额 loan_amnt 提取出来

loan_status_df <- dbGetQuery(con, paste(
  "SELECT loan_status, loan_amnt",
  "FROM loan"
))

loan_amnt 转化为数值型变量

loan_status_df$loan_amnt <- as.numeric(loan_status_df$loan_amnt)
loan_status_df %>% 
  head(10)
##    loan_status loan_amnt
## 1      Current      2500
## 2      Current     30000
## 3      Current      5000
## 4      Current      4000
## 5      Current     30000
## 6      Current      5550
## 7      Current      2000
## 8      Current      6000
## 9      Current      5000
## 10     Current      6000

堆积密度图

loan_status_df %>% 
  ggplot(aes(x = loan_amnt, fill = loan_status)) +
  geom_density(position = "stack")

分面密度图

loan_status_df %>% 
  ggplot(aes(x = loan_amnt, fill = loan_status)) +
  geom_density(show.legend = FALSE) +
  facet_wrap(~loan_status, nrow = 3, ncol = 3)

下面重点关注违约的情况

loan_status_df %>% 
  filter(loan_status == "Default") %>% 
  ggplot(aes(x = loan_amnt, fill = loan_status)) +
  geom_density(show.legend = FALSE)

而不良贷款其实还有6类 Charged Off,Default,Does not meet the credit policy. Status:Charged Off,In Grace Period,Late (16-30 days),Late (31-120 days)。

# loan_status_df %>% 
#   filter(!loan_status %in% c("Current", "Fully Paid", "Does not meet the credit policy. Status:Fully Paid")) %>% 
#   distinct(loan_status)

loan_status_df %>% 
  mutate(Loan_Condition = if_else(loan_status %in% c("Current", "Fully Paid", "Does not meet the credit policy. Status:Fully Paid"), "Good_Loan", "Bad_Loan")) %>% 
  ggplot(aes(x = loan_amnt, fill = Loan_Condition)) +
  geom_density(alpha = 0.5)

从上图不难发现,不同贷款状态下贷款数额的分布走向基本一致。有没有一些变量/特征使得贷款状态的分布不一致呢?事实上肯定有的,那么就要对这些变量分比例抽样来解决数据集不平衡的问题,防止过拟合还需要重抽样 boostrap 和交叉验证的方式选模型。

loan_db <- tbl(con, "loan") %>%
  select(
    loan_status, loan_amnt, int_rate, grade,
    emp_length, home_ownership,
    annual_inc, term
  )
head(loan_db)
## # Source:   lazy query [?? x 8]
## # Database: sqlite 3.29.0
## #   [F:\lend-loan\lending-club-loan-data\lend_loan.sqlite]
##   loan_status loan_amnt int_rate grade emp_length home_ownership annual_inc
##   <chr>       <chr>     <chr>    <chr> <chr>      <chr>          <chr>     
## 1 Current     2500      13.56    C     10+ years  RENT           55000     
## 2 Current     30000     18.94    D     10+ years  MORTGAGE       90000     
## 3 Current     5000      17.97    D     6 years    MORTGAGE       59280     
## 4 Current     4000      18.94    D     10+ years  MORTGAGE       92000     
## 5 Current     30000     16.14    C     10+ years  MORTGAGE       57250     
## 6 Current     5550      15.02    C     10+ years  MORTGAGE       152500    
## # ... with 1 more variable: term <chr>

使用完就不要占用与数据库连接的进程,最后记得断开与数据库的连接

dbDisconnect(con)