第 84 章 探索性数据分析-ames房屋价格

84.1 数据故事

这是数据故事的地图

图 84.1: 这是数据故事的地图

这是一份Ames房屋数据,您可以把它想象为房屋中介推出的成都市武侯区、锦江区以及高新区等各区县的房屋信息

library(tidyverse)
ames <- read_csv("./demo_data/ames_houseprice.csv") %>% 
        janitor::clean_names()

glimpse(ames)
## Rows: 1,460
## Columns: 81
## $ id              <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10…
## $ ms_sub_class    <dbl> 60, 20, 60, 70, 60, 50, 20, 6…
## $ ms_zoning       <chr> "RL", "RL", "RL", "RL", "RL",…
## $ lot_frontage    <dbl> 65, 80, 68, 60, 84, 85, 75, N…
## $ lot_area        <dbl> 8450, 9600, 11250, 9550, 1426…
## $ street          <chr> "Pave", "Pave", "Pave", "Pave…
## $ alley           <chr> NA, NA, NA, NA, NA, NA, NA, N…
## $ lot_shape       <chr> "Reg", "Reg", "IR1", "IR1", "…
## $ land_contour    <chr> "Lvl", "Lvl", "Lvl", "Lvl", "…
## $ utilities       <chr> "AllPub", "AllPub", "AllPub",…
## $ lot_config      <chr> "Inside", "FR2", "Inside", "C…
## $ land_slope      <chr> "Gtl", "Gtl", "Gtl", "Gtl", "…
## $ neighborhood    <chr> "CollgCr", "Veenker", "CollgC…
## $ condition1      <chr> "Norm", "Feedr", "Norm", "Nor…
## $ condition2      <chr> "Norm", "Norm", "Norm", "Norm…
## $ bldg_type       <chr> "1Fam", "1Fam", "1Fam", "1Fam…
## $ house_style     <chr> "2Story", "1Story", "2Story",…
## $ overall_qual    <dbl> 7, 6, 7, 7, 8, 5, 8, 7, 7, 5,…
## $ overall_cond    <dbl> 5, 8, 5, 5, 5, 5, 5, 6, 5, 6,…
## $ year_built      <dbl> 2003, 1976, 2001, 1915, 2000,…
## $ year_remod_add  <dbl> 2003, 1976, 2002, 1970, 2000,…
## $ roof_style      <chr> "Gable", "Gable", "Gable", "G…
## $ roof_matl       <chr> "CompShg", "CompShg", "CompSh…
## $ exterior1st     <chr> "VinylSd", "MetalSd", "VinylS…
## $ exterior2nd     <chr> "VinylSd", "MetalSd", "VinylS…
## $ mas_vnr_type    <chr> "BrkFace", "None", "BrkFace",…
## $ mas_vnr_area    <dbl> 196, 0, 162, 0, 350, 0, 186, …
## $ exter_qual      <chr> "Gd", "TA", "Gd", "TA", "Gd",…
## $ exter_cond      <chr> "TA", "TA", "TA", "TA", "TA",…
## $ foundation      <chr> "PConc", "CBlock", "PConc", "…
## $ bsmt_qual       <chr> "Gd", "Gd", "Gd", "TA", "Gd",…
## $ bsmt_cond       <chr> "TA", "TA", "TA", "Gd", "TA",…
## $ bsmt_exposure   <chr> "No", "Gd", "Mn", "No", "Av",…
## $ bsmt_fin_type1  <chr> "GLQ", "ALQ", "GLQ", "ALQ", "…
## $ bsmt_fin_sf1    <dbl> 706, 978, 486, 216, 655, 732,…
## $ bsmt_fin_type2  <chr> "Unf", "Unf", "Unf", "Unf", "…
## $ bsmt_fin_sf2    <dbl> 0, 0, 0, 0, 0, 0, 0, 32, 0, 0…
## $ bsmt_unf_sf     <dbl> 150, 284, 434, 540, 490, 64, …
## $ total_bsmt_sf   <dbl> 856, 1262, 920, 756, 1145, 79…
## $ heating         <chr> "GasA", "GasA", "GasA", "GasA…
## $ heating_qc      <chr> "Ex", "Ex", "Ex", "Gd", "Ex",…
## $ central_air     <chr> "Y", "Y", "Y", "Y", "Y", "Y",…
## $ electrical      <chr> "SBrkr", "SBrkr", "SBrkr", "S…
## $ x1st_flr_sf     <dbl> 856, 1262, 920, 961, 1145, 79…
## $ x2nd_flr_sf     <dbl> 854, 0, 866, 756, 1053, 566, …
## $ low_qual_fin_sf <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ gr_liv_area     <dbl> 1710, 1262, 1786, 1717, 2198,…
## $ bsmt_full_bath  <dbl> 1, 0, 1, 1, 1, 1, 1, 1, 0, 1,…
## $ bsmt_half_bath  <dbl> 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ full_bath       <dbl> 2, 2, 2, 1, 2, 1, 2, 2, 2, 1,…
## $ half_bath       <dbl> 1, 0, 1, 0, 1, 1, 0, 1, 0, 0,…
## $ bedroom_abv_gr  <dbl> 3, 3, 3, 3, 4, 1, 3, 3, 2, 2,…
## $ kitchen_abv_gr  <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 2, 2,…
## $ kitchen_qual    <chr> "Gd", "TA", "Gd", "Gd", "Gd",…
## $ tot_rms_abv_grd <dbl> 8, 6, 6, 7, 9, 5, 7, 7, 8, 5,…
## $ functional      <chr> "Typ", "Typ", "Typ", "Typ", "…
## $ fireplaces      <dbl> 0, 1, 1, 1, 1, 0, 1, 2, 2, 2,…
## $ fireplace_qu    <chr> NA, "TA", "TA", "Gd", "TA", N…
## $ garage_type     <chr> "Attchd", "Attchd", "Attchd",…
## $ garage_yr_blt   <dbl> 2003, 1976, 2001, 1998, 2000,…
## $ garage_finish   <chr> "RFn", "RFn", "RFn", "Unf", "…
## $ garage_cars     <dbl> 2, 2, 2, 3, 3, 2, 2, 2, 2, 1,…
## $ garage_area     <dbl> 548, 460, 608, 642, 836, 480,…
## $ garage_qual     <chr> "TA", "TA", "TA", "TA", "TA",…
## $ garage_cond     <chr> "TA", "TA", "TA", "TA", "TA",…
## $ paved_drive     <chr> "Y", "Y", "Y", "Y", "Y", "Y",…
## $ wood_deck_sf    <dbl> 0, 298, 0, 0, 192, 40, 255, 2…
## $ open_porch_sf   <dbl> 61, 0, 42, 35, 84, 30, 57, 20…
## $ enclosed_porch  <dbl> 0, 0, 0, 272, 0, 0, 0, 228, 2…
## $ x3ssn_porch     <dbl> 0, 0, 0, 0, 0, 320, 0, 0, 0, …
## $ screen_porch    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ pool_area       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ pool_qc         <chr> NA, NA, NA, NA, NA, NA, NA, N…
## $ fence           <chr> NA, NA, NA, NA, NA, "MnPrv", …
## $ misc_feature    <chr> NA, NA, NA, NA, NA, "Shed", N…
## $ misc_val        <dbl> 0, 0, 0, 0, 0, 700, 0, 350, 0…
## $ mo_sold         <dbl> 2, 5, 9, 2, 12, 10, 8, 11, 4,…
## $ yr_sold         <dbl> 2008, 2007, 2008, 2006, 2008,…
## $ sale_type       <chr> "WD", "WD", "WD", "WD", "WD",…
## $ sale_condition  <chr> "Normal", "Normal", "Normal",…
## $ sale_price      <dbl> 208500, 181500, 223500, 14000…

感谢曾倬同学提供的解释说明文档

explanation <- readxl::read_excel("./demo_data/ames_houseprice_explanation.xlsx")
explanation %>% 
  knitr::kable()
列名 description 解释
MSSubClass Identifies the type of dwelling involved in the sale. 住宅概况
MSZoning Identifies the general zoning classification of the sale. 建筑性质(农业、商业、高/低密度住宅)
LotFrontage Linear feet of street connected to property 建筑离街道的距离
LotArea Lot size in square feet 占地面积
Street Type of road access to property 建筑附近的路面材质
Alley Type of alley access to property 建筑附近小巷的修建材质
LotShape General shape of property 建筑物的形状
LandContour Flatness of the property 地面平坦程度
Utilities Type of utilities available 可用公用设施类型
LotConfig Lot configuration 房屋哪里配置多
LandSlope Slope of property 建筑的斜率
Neighborhood Physical locations within Ames city limits 建筑在Ames城市的位置
Condition1 Proximity to various conditions 建筑附近的交通网络
Condition2 Proximity to various conditions (if more than one is present) 建筑附近的交通网络
BldgType Type of dwelling 住宅类别(联排别墅、独栋别墅…)
HouseStyle Style of dwelling 建筑风格
OverallQual Rates the overall material and finish of the house 房屋装饰材质水平
OverallCond Rates the overall condition of the house 房屋整体状况评估
YearBuilt Original construction date 房屋修建日期
YearRemodAdd Remodel date (same as construction date if no remodeling or additions) 房屋改建日期
RoofStyle Type of roof 屋顶类型
RoofMatl Roof material 屋顶材质
Exterior1st Exterior covering on house 建筑外立面材质
Exterior2nd Exterior covering on house (if more than one material) 建筑外立面材质
MasVnrType Masonry veneer type 建筑表层砌体类型
MasVnrArea Masonry veneer area in square feet 每平方英尺的砌体面积
ExterQual Evaluates the quality of the material on the exterior 建筑表层砌体材料质量评估
ExterCond Evaluates the present condition of the material on the exterior 建筑表层砌体材料现状评估
Foundation Type of foundation 建筑基础的类型
BsmtQual Evaluates the height of the basement 地下室高度评估
BsmtCond Evaluates the general condition of the basement 地下室总体状况评估
BsmtExposure Refers to walkout or garden level walls 走廊/花园外墙的评估
BsmtFinType1 Rating of basement finished area 地下室完工区域的等级评价
BsmtFinSF1 Type 1 finished square feet 地下室完工区域的面积
BsmtFinType2 Rating of basement finished area (if multiple types) 其他地下室完工区域的等级评价
BsmtFinSF2 Type 2 finished square feet 其他地下室完工区域的面积
BsmtUnfSF Unfinished square feet of basement area 地下室未完工部分的面积
TotalBsmtSF Total square feet of basement area 地下室总面积
Heating Type of heating 房屋暖气类型(地暖、墙暖….)
HeatingQC Heating quality and condition 暖气设施的质量和条件
CentralAir Central air conditioning 是否有中央空调
Electrical Electrical system 电器系统配置标准
1stFlrSF First Floor square feet 一楼面积
2ndFlrSF Second floor square feet 二楼面积
LowQualFinSF Low quality finished square feet (all floors) 所有楼层中低质量施工面积
GrLivArea Above grade (ground) living area square feet 地上居住面积
BsmtFullBath Basement full bathrooms 地下室标准卫生间个数
BsmtHalfBath Basement half bathrooms 地下室简易卫生间个数
FullBath Full bathrooms above grade 地上楼层标准卫生间个数
HalfBath Half baths above grade 地上楼层简易卫生间个数
BedroomAbvGr Bedrooms above grade (does NOT include basement bedrooms) 地上楼层卧室个数
KitchenAbvGr Kitchens above grade 地上楼层厨房个数
KitchenQual Kitchen quality 厨房质量评估
TopRmsAbvGrd Total rooms above grade (does not include bathrooms) 地上楼层房间总数(除去卧室)
Functional Home functionality (Assume typical unless deductions are warranted) 房屋功能情况
Fireplaces Number of fireplaces 壁炉个数
FireplaceQu Fireplace quality 壁炉质量
GarageType Garage location 车库位置
GarageYrBlt Year garage was built 车库建成年份
GarageFinish Interior finish of the garage 车库内部装饰情况
GarageCars Size of garage in car capacity 车库容量
GarageArea Size of garage in square feet 车库占地面积
GarageQual Garage quality 车库质量
GarageCond Garage condition 车库条件
PavedDrive Paved driveway 车道施工方式
WoodDeckSF Wood deck area in square feet 木甲板面积
OpenPorchSF Open porch area in square feet 开放式门廊面积
EnclosedPorch Enclosed porch area in square feet 封闭式门廊面积
3SsnPorch Three season porch area in square feet 三季门廊面积
ScreenPorch Screen porch area in square feet 纱窗门廊面积
PoolArea Pool area in square feet 游泳池面积
PoolQC Pool quality 游泳池质量
Fence Fence quality 栅栏质量
MiscFeature Miscellaneous feature not covered in other categories 其他配套设施(网球场、电梯…)
MiscVal $Value of miscellaneous feature 其他配套设施的费用
MoSold Month Sold (MM) 销售月份
YrSold Year Sold (YYYY) 销售年份
SaleType Type of sale 支付方式
SaleCondition Condition of sale 房屋出售的情况

84.2 探索设想

  • 读懂数据描述,比如
    • 房屋设施 (bedrooms, garage, fireplace, pool, porch, etc.),
    • 地理位置 (neighborhood),
    • 土地信息 (zoning, shape, size, etc.),
    • 品相等级
    • 出售价格
  • 探索影响房屋价格的因素
    • 必要的预处理(缺失值处理、标准化、对数化等等)
    • 必要的可视化(比如价格分布图等)
    • 必要的统计(比如各地区房屋价格的均值)
    • 合理选取若干预测变量,建立多元线性模型,并对模型结果给出解释
    • 房屋价格与预测变量(房屋大小、在城市的位置、房屋类型、与街道的距离)

84.3 变量选取

我们选取下列变量:

  • lot_frontage, 建筑离街道的距离
  • lot_area, 占地面积
  • neighborhood, 建筑在城市的位置
  • gr_liv_area, 地上居住面积
  • bldg_type, 住宅类别(联排别墅、独栋别墅…)
  • year_built 房屋修建日期
d <- ames %>% 
  select(sale_price, 
         lot_frontage,   
         lot_area,      
         neighborhood,   
         gr_liv_area,   
         bldg_type,      
         year_built      
         )
d
## # A tibble: 1,460 × 7
##   sale_price lot_frontage lot_area neighborhood
##        <dbl>        <dbl>    <dbl> <chr>       
## 1     208500           65     8450 CollgCr     
## 2     181500           80     9600 Veenker     
## 3     223500           68    11250 CollgCr     
## 4     140000           60     9550 Crawfor     
## 5     250000           84    14260 NoRidge     
## 6     143000           85    14115 Mitchel     
## # … with 1,454 more rows, and 3 more variables:
## #   gr_liv_area <dbl>, bldg_type <chr>,
## #   year_built <dbl>

84.4 缺失值处理

d %>% 
  summarise(
    across(everything(), function(x) sum(is.na(x)) )
  )
## # A tibble: 1 × 7
##   sale_price lot_frontage lot_area neighborhood
##        <int>        <int>    <int>        <int>
## 1          0          259        0            0
## # … with 3 more variables: gr_liv_area <int>,
## #   bldg_type <int>, year_built <int>

找出来看看

## # A tibble: 259 × 7
##   sale_price lot_frontage lot_area neighborhood
##        <dbl>        <dbl>    <dbl> <chr>       
## 1     200000           NA    10382 NWAmes      
## 2     144000           NA    12968 Sawyer      
## 3     157000           NA    10920 NAmes       
## 4     149000           NA    11241 NAmes       
## 5     154000           NA     8246 Sawyer      
## 6     149350           NA     8544 Sawyer      
## # … with 253 more rows, and 3 more variables:
## #   gr_liv_area <dbl>, bldg_type <chr>,
## #   year_built <dbl>

如果不选择lot_frontage 就不会有缺失值,如何选择,自己抉择

d %>% 
  select(-lot_frontage) %>% 
  visdat::vis_dat()

我个人觉得这个变量很重要,所以还是保留,牺牲一点样本量吧

d <- d %>% 
  drop_na()
d %>% visdat::vis_dat()

84.5 预处理

  • 标准化
standard <- function(x) {
  (x - mean(x)) / sd(x)
}

d %>% 
  mutate(
    across(where(is.numeric), standard),
    across(where(is.character), as.factor)
  )
## # A tibble: 1,201 × 7
##   sale_price lot_frontage lot_area neighborhood
##        <dbl>        <dbl>    <dbl> <fct>       
## 1    0.333        -0.208   -0.190  CollgCr     
## 2    0.00875       0.410   -0.0444 Veenker     
## 3    0.512        -0.0844   0.164  CollgCr     
## 4   -0.489        -0.414   -0.0507 Crawfor     
## 5    0.830         0.574    0.544  NoRidge     
## 6   -0.453         0.616    0.525  Mitchel     
## # … with 1,195 more rows, and 3 more variables:
## #   gr_liv_area <dbl>, bldg_type <fct>,
## #   year_built <dbl>
  • 对数化
d %>% 
  mutate(
    log_sale_price = log(sale_price)
  )
## # A tibble: 1,201 × 8
##   sale_price lot_frontage lot_area neighborhood
##        <dbl>        <dbl>    <dbl> <chr>       
## 1     208500           65     8450 CollgCr     
## 2     181500           80     9600 Veenker     
## 3     223500           68    11250 CollgCr     
## 4     140000           60     9550 Crawfor     
## 5     250000           84    14260 NoRidge     
## 6     143000           85    14115 Mitchel     
## # … with 1,195 more rows, and 4 more variables:
## #   gr_liv_area <dbl>, bldg_type <chr>,
## #   year_built <dbl>, log_sale_price <dbl>
d %>% 
  mutate(
    across(where(is.numeric), log),
    across(where(is.character), as.factor)
  )
## # A tibble: 1,201 × 7
##   sale_price lot_frontage lot_area neighborhood
##        <dbl>        <dbl>    <dbl> <fct>       
## 1       12.2         4.17     9.04 CollgCr     
## 2       12.1         4.38     9.17 Veenker     
## 3       12.3         4.22     9.33 CollgCr     
## 4       11.8         4.09     9.16 Crawfor     
## 5       12.4         4.43     9.57 NoRidge     
## 6       11.9         4.44     9.55 Mitchel     
## # … with 1,195 more rows, and 3 more variables:
## #   gr_liv_area <dbl>, bldg_type <fct>,
## #   year_built <dbl>
  • 标准化 vs 对数化

选择哪一种,我们看图说话

d %>% 
  ggplot(aes(x = sale_price)) +
  geom_density()
d %>% 
  ggplot(aes(x = log(sale_price))) +
  geom_density()

我们选择对数化,并保存结果

d <- d %>% 
  mutate(
    across(where(is.numeric), 
           .fns = list(log = log), 
           .names = "{.fn}_{.col}"
           ),
    across(where(is.character), as.factor)
  )

84.6 有趣的探索

84.6.1 各区域的房屋价格均值

d %>% count(neighborhood)
## # A tibble: 25 × 2
##   neighborhood     n
##   <fct>        <int>
## 1 Blmngtn         14
## 2 Blueste          2
## 3 BrDale          16
## 4 BrkSide         51
## 5 ClearCr         13
## 6 CollgCr        126
## # … with 19 more rows
d %>% 
  group_by(neighborhood) %>% 
  summarise(
    mean_sale = mean(sale_price)
  ) %>% 
  
  ggplot(
    aes(x = mean_sale, y = fct_reorder(neighborhood, mean_sale))
  ) +
  geom_col(aes(fill = mean_sale < 150000), show.legend = FALSE) +
  geom_text(aes(label = round(mean_sale, 0)), hjust = 1) +
  # scale_x_continuous(
  #   expand = c(0, 0),
  #   breaks = c(0, 100000, 200000, 300000),
  #   labels = c(0, "1w", "2w", "3w")    
  #   ) +
  scale_x_continuous(
    expand = c(0, 0),
    labels = scales::dollar
  ) +
  scale_fill_viridis_d(option = "D") + 
  theme_classic() +
  labs(x = NULL, y = NULL)

84.6.2 房屋价格与占地面积

d %>%
  ggplot(aes(x = log_lot_area, y = log_sale_price)) +
  geom_point(colour = "blue") +
  geom_smooth(method = lm, se = FALSE, formula = "y ~ x")
d %>%
  ggplot(aes(x = log_lot_area, y = log_sale_price)) +
  geom_point(aes(colour = neighborhood)) +
  geom_smooth(method = lm, se = FALSE, formula = "y ~ x")
d %>%
  ggplot(aes(x = log_lot_area, y = log_sale_price)) +
  geom_point(colour = "blue") +
  geom_smooth(method = lm, se = FALSE, formula = "y ~ x", fullrange = TRUE) +
  facet_wrap(~neighborhood) +
  theme(strip.background = element_blank())

84.6.3 房屋价格与房屋居住面积

d %>%
  ggplot(aes(x = log_gr_liv_area, y = log_sale_price)) +
  geom_point(aes(colour = neighborhood)) +
  geom_smooth(method = lm, se = FALSE, formula = "y ~ x")
d %>%
  ggplot(aes(x = log_gr_liv_area, y = log_sale_price)) +
  geom_point() +
  geom_smooth(method = lm, se = FALSE, formula = "y ~ x", fullrange = TRUE) +
  facet_wrap(~neighborhood) +
  theme(strip.background = element_blank())

84.6.4 车库与房屋价格

车库大小是否对销售价格有帮助?

ames %>% 
  #select(garage_cars, garage_area, sale_price) %>% 
  ggplot(aes(x = garage_area, y = sale_price)) +
  geom_point(
    data = select(ames, -garage_cars),
    color = "gray50"
  ) +
  geom_point(aes(color = as_factor(garage_cars))) +
  facet_wrap(vars(garage_cars)) +
  theme(legend.position = "none") +
  ggtitle("This is the influence of garage for sale price")

84.7 建模

lm(log_sale_price ~ 1 + log_gr_liv_area + neighborhood, data = d) %>% 
  broom::tidy()
## # A tibble: 26 × 5
##   term           estimate std.error statistic   p.value
##   <chr>             <dbl>     <dbl>     <dbl>     <dbl>
## 1 (Intercept)       7.53     0.154      48.7  2.21e-284
## 2 log_gr_liv_ar…    0.638    0.0200     31.9  3.76e-161
## 3 neighborhoodB…   -0.314    0.149      -2.10 3.55e-  2
## 4 neighborhoodB…   -0.466    0.0724     -6.43 1.80e- 10
## 5 neighborhoodB…   -0.336    0.0597     -5.62 2.44e-  8
## 6 neighborhoodC…   -0.103    0.0762     -1.35 1.76e-  1
## # … with 20 more rows
library(lme4)
lmer(log_sale_price ~ 1 + log_gr_liv_area + (log_gr_liv_area | neighborhood), 
    data = d) %>% 
   broom.mixed::tidy()
## # A tibble: 6 × 6
##   effect   group     term  estimate std.error statistic
##   <chr>    <chr>     <chr>    <dbl>     <dbl>     <dbl>
## 1 fixed    <NA>      (Int…    6.88     0.334       20.6
## 2 fixed    <NA>      log_…    0.705    0.0493      14.3
## 3 ran_pars neighbor… sd__…    1.34    NA           NA  
## 4 ran_pars neighbor… cor_…   -0.993   NA           NA  
## 5 ran_pars neighbor… sd__…    0.205   NA           NA  
## 6 ran_pars Residual  sd__…    0.191   NA           NA