# 第 84 章 探索性数据分析-ames房屋价格

## 84.1 数据故事

library(tidyverse)
janitor::clean_names()

glimpse(ames)
## Rows: 1,460
## Columns: 81
## $id <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,… ##$ ms_sub_class    <dbl> 60, 20, 60, 70, 60, 50, 20, 60, 50, 190, 20, 60, 20, 2…
## $ms_zoning <chr> "RL", "RL", "RL", "RL", "RL", "RL", "RL", "RL", "RM", … ##$ lot_frontage    <dbl> 65, 80, 68, 60, 84, 85, 75, NA, 51, 50, 70, 85, NA, 91…
## $lot_area <dbl> 8450, 9600, 11250, 9550, 14260, 14115, 10084, 10382, 6… ##$ street          <chr> "Pave", "Pave", "Pave", "Pave", "Pave", "Pave", "Pave"…
## $alley <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA… ##$ lot_shape       <chr> "Reg", "Reg", "IR1", "IR1", "IR1", "IR1", "Reg", "IR1"…
## $land_contour <chr> "Lvl", "Lvl", "Lvl", "Lvl", "Lvl", "Lvl", "Lvl", "Lvl"… ##$ utilities       <chr> "AllPub", "AllPub", "AllPub", "AllPub", "AllPub", "All…
## $lot_config <chr> "Inside", "FR2", "Inside", "Corner", "FR2", "Inside", … ##$ land_slope      <chr> "Gtl", "Gtl", "Gtl", "Gtl", "Gtl", "Gtl", "Gtl", "Gtl"…
## $neighborhood <chr> "CollgCr", "Veenker", "CollgCr", "Crawfor", "NoRidge",… ##$ condition1      <chr> "Norm", "Feedr", "Norm", "Norm", "Norm", "Norm", "Norm…
## $condition2 <chr> "Norm", "Norm", "Norm", "Norm", "Norm", "Norm", "Norm"… ##$ bldg_type       <chr> "1Fam", "1Fam", "1Fam", "1Fam", "1Fam", "1Fam", "1Fam"…
## $house_style <chr> "2Story", "1Story", "2Story", "2Story", "2Story", "1.5… ##$ overall_qual    <dbl> 7, 6, 7, 7, 8, 5, 8, 7, 7, 5, 5, 9, 5, 7, 6, 7, 6, 4, …
## $overall_cond <dbl> 5, 8, 5, 5, 5, 5, 5, 6, 5, 6, 5, 5, 6, 5, 5, 8, 7, 5, … ##$ year_built      <dbl> 2003, 1976, 2001, 1915, 2000, 1993, 2004, 1973, 1931, …
## $year_remod_add <dbl> 2003, 1976, 2002, 1970, 2000, 1995, 2005, 1973, 1950, … ##$ roof_style      <chr> "Gable", "Gable", "Gable", "Gable", "Gable", "Gable", …
## $roof_matl <chr> "CompShg", "CompShg", "CompShg", "CompShg", "CompShg",… ##$ exterior1st     <chr> "VinylSd", "MetalSd", "VinylSd", "Wd Sdng", "VinylSd",…
## $exterior2nd <chr> "VinylSd", "MetalSd", "VinylSd", "Wd Shng", "VinylSd",… ##$ mas_vnr_type    <chr> "BrkFace", "None", "BrkFace", "None", "BrkFace", "None…
## $mas_vnr_area <dbl> 196, 0, 162, 0, 350, 0, 186, 240, 0, 0, 0, 286, 0, 306… ##$ exter_qual      <chr> "Gd", "TA", "Gd", "TA", "Gd", "TA", "Gd", "TA", "TA", …
## $exter_cond <chr> "TA", "TA", "TA", "TA", "TA", "TA", "TA", "TA", "TA", … ##$ foundation      <chr> "PConc", "CBlock", "PConc", "BrkTil", "PConc", "Wood",…
## $bsmt_qual <chr> "Gd", "Gd", "Gd", "TA", "Gd", "Gd", "Ex", "Gd", "TA", … ##$ bsmt_cond       <chr> "TA", "TA", "TA", "Gd", "TA", "TA", "TA", "TA", "TA", …
## $bsmt_exposure <chr> "No", "Gd", "Mn", "No", "Av", "No", "Av", "Mn", "No", … ##$ bsmt_fin_type1  <chr> "GLQ", "ALQ", "GLQ", "ALQ", "GLQ", "GLQ", "GLQ", "ALQ"…
## $bsmt_fin_sf1 <dbl> 706, 978, 486, 216, 655, 732, 1369, 859, 0, 851, 906, … ##$ bsmt_fin_type2  <chr> "Unf", "Unf", "Unf", "Unf", "Unf", "Unf", "Unf", "BLQ"…
## $bsmt_fin_sf2 <dbl> 0, 0, 0, 0, 0, 0, 0, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,… ##$ bsmt_unf_sf     <dbl> 150, 284, 434, 540, 490, 64, 317, 216, 952, 140, 134, …
## $total_bsmt_sf <dbl> 856, 1262, 920, 756, 1145, 796, 1686, 1107, 952, 991, … ##$ heating         <chr> "GasA", "GasA", "GasA", "GasA", "GasA", "GasA", "GasA"…
## $heating_qc <chr> "Ex", "Ex", "Ex", "Gd", "Ex", "Ex", "Ex", "Ex", "Gd", … ##$ central_air     <chr> "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y",…
## $electrical <chr> "SBrkr", "SBrkr", "SBrkr", "SBrkr", "SBrkr", "SBrkr", … ##$ x1st_flr_sf     <dbl> 856, 1262, 920, 961, 1145, 796, 1694, 1107, 1022, 1077…
## $x2nd_flr_sf <dbl> 854, 0, 866, 756, 1053, 566, 0, 983, 752, 0, 0, 1142, … ##$ low_qual_fin_sf <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $gr_liv_area <dbl> 1710, 1262, 1786, 1717, 2198, 1362, 1694, 2090, 1774, … ##$ bsmt_full_bath  <dbl> 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, …
## $bsmt_half_bath <dbl> 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, … ##$ full_bath       <dbl> 2, 2, 2, 1, 2, 1, 2, 2, 2, 1, 1, 3, 1, 2, 1, 1, 1, 2, …
## $half_bath <dbl> 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, … ##$ bedroom_abv_gr  <dbl> 3, 3, 3, 3, 4, 1, 3, 3, 2, 2, 3, 4, 2, 3, 2, 2, 2, 2, …
## $kitchen_abv_gr <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 1, 1, 1, 1, 1, 2, … ##$ kitchen_qual    <chr> "Gd", "TA", "Gd", "Gd", "Gd", "TA", "Gd", "TA", "TA", …
## $tot_rms_abv_grd <dbl> 8, 6, 6, 7, 9, 5, 7, 7, 8, 5, 5, 11, 4, 7, 5, 5, 5, 6,… ##$ functional      <chr> "Typ", "Typ", "Typ", "Typ", "Typ", "Typ", "Typ", "Typ"…
## $fireplaces <dbl> 0, 1, 1, 1, 1, 0, 1, 2, 2, 2, 0, 2, 0, 1, 1, 0, 1, 0, … ##$ fireplace_qu    <chr> NA, "TA", "TA", "Gd", "TA", NA, "Gd", "TA", "TA", "TA"…
## $garage_type <chr> "Attchd", "Attchd", "Attchd", "Detchd", "Attchd", "Att… ##$ garage_yr_blt   <dbl> 2003, 1976, 2001, 1998, 2000, 1993, 2004, 1973, 1931, …
## $garage_finish <chr> "RFn", "RFn", "RFn", "Unf", "RFn", "Unf", "RFn", "RFn"… ##$ garage_cars     <dbl> 2, 2, 2, 3, 3, 2, 2, 2, 2, 1, 1, 3, 1, 3, 1, 2, 2, 2, …
## $garage_area <dbl> 548, 460, 608, 642, 836, 480, 636, 484, 468, 205, 384,… ##$ garage_qual     <chr> "TA", "TA", "TA", "TA", "TA", "TA", "TA", "TA", "Fa", …
## $garage_cond <chr> "TA", "TA", "TA", "TA", "TA", "TA", "TA", "TA", "TA", … ##$ paved_drive     <chr> "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y",…
## $wood_deck_sf <dbl> 0, 298, 0, 0, 192, 40, 255, 235, 90, 0, 0, 147, 140, 1… ##$ open_porch_sf   <dbl> 61, 0, 42, 35, 84, 30, 57, 204, 0, 4, 0, 21, 0, 33, 21…
## $enclosed_porch <dbl> 0, 0, 0, 272, 0, 0, 0, 228, 205, 0, 0, 0, 0, 0, 176, 0… ##$ x3ssn_porch     <dbl> 0, 0, 0, 0, 0, 320, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $screen_porch <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 176, 0, 0, 0, 0, 0… ##$ pool_area       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $pool_qc <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA… ##$ fence           <chr> NA, NA, NA, NA, NA, "MnPrv", NA, NA, NA, NA, NA, NA, N…
## $misc_feature <chr> NA, NA, NA, NA, NA, "Shed", NA, "Shed", NA, NA, NA, NA… ##$ misc_val        <dbl> 0, 0, 0, 0, 0, 700, 0, 350, 0, 0, 0, 0, 0, 0, 0, 0, 70…
## $mo_sold <dbl> 2, 5, 9, 2, 12, 10, 8, 11, 4, 1, 2, 7, 9, 8, 5, 7, 3, … ##$ yr_sold         <dbl> 2008, 2007, 2008, 2006, 2008, 2009, 2007, 2009, 2008, …
## $sale_type <chr> "WD", "WD", "WD", "WD", "WD", "WD", "WD", "WD", "WD", … ##$ sale_condition  <chr> "Normal", "Normal", "Normal", "Abnorml", "Normal", "No…
## $sale_price <dbl> 208500, 181500, 223500, 140000, 250000, 143000, 307000… 感谢曾倬同学提供的解释说明文档 explanation <- readxl::read_excel("./demo_data/ames_houseprice_explanation.xlsx") explanation %>% knitr::kable() 列名 description 解释 MSSubClass Identifies the type of dwelling involved in the sale. 住宅概况 MSZoning Identifies the general zoning classification of the sale. 建筑性质（农业、商业、高/低密度住宅） LotFrontage Linear feet of street connected to property 建筑离街道的距离 LotArea Lot size in square feet 占地面积 Street Type of road access to property 建筑附近的路面材质 Alley Type of alley access to property 建筑附近小巷的修建材质 LotShape General shape of property 建筑物的形状 LandContour Flatness of the property 地面平坦程度 Utilities Type of utilities available 可用公用设施类型 LotConfig Lot configuration 房屋哪里配置多 LandSlope Slope of property 建筑的斜率 Neighborhood Physical locations within Ames city limits 建筑在Ames城市的位置 Condition1 Proximity to various conditions 建筑附近的交通网络 Condition2 Proximity to various conditions (if more than one is present) 建筑附近的交通网络 BldgType Type of dwelling 住宅类别（联排别墅、独栋别墅…） HouseStyle Style of dwelling 建筑风格 OverallQual Rates the overall material and finish of the house 房屋装饰材质水平 OverallCond Rates the overall condition of the house 房屋整体状况评估 YearBuilt Original construction date 房屋修建日期 YearRemodAdd Remodel date (same as construction date if no remodeling or additions) 房屋改建日期 RoofStyle Type of roof 屋顶类型 RoofMatl Roof material 屋顶材质 Exterior1st Exterior covering on house 建筑外立面材质 Exterior2nd Exterior covering on house (if more than one material) 建筑外立面材质 MasVnrType Masonry veneer type 建筑表层砌体类型 MasVnrArea Masonry veneer area in square feet 每平方英尺的砌体面积 ExterQual Evaluates the quality of the material on the exterior 建筑表层砌体材料质量评估 ExterCond Evaluates the present condition of the material on the exterior 建筑表层砌体材料现状评估 Foundation Type of foundation 建筑基础的类型 BsmtQual Evaluates the height of the basement 地下室高度评估 BsmtCond Evaluates the general condition of the basement 地下室总体状况评估 BsmtExposure Refers to walkout or garden level walls 走廊/花园外墙的评估 BsmtFinType1 Rating of basement finished area 地下室完工区域的等级评价 BsmtFinSF1 Type 1 finished square feet 地下室完工区域的面积 BsmtFinType2 Rating of basement finished area (if multiple types) 其他地下室完工区域的等级评价 BsmtFinSF2 Type 2 finished square feet 其他地下室完工区域的面积 BsmtUnfSF Unfinished square feet of basement area 地下室未完工部分的面积 TotalBsmtSF Total square feet of basement area 地下室总面积 Heating Type of heating 房屋暖气类型（地暖、墙暖….） HeatingQC Heating quality and condition 暖气设施的质量和条件 CentralAir Central air conditioning 是否有中央空调 Electrical Electrical system 电器系统配置标准 1stFlrSF First Floor square feet 一楼面积 2ndFlrSF Second floor square feet 二楼面积 LowQualFinSF Low quality finished square feet (all floors) 所有楼层中低质量施工面积 GrLivArea Above grade (ground) living area square feet 地上居住面积 BsmtFullBath Basement full bathrooms 地下室标准卫生间个数 BsmtHalfBath Basement half bathrooms 地下室简易卫生间个数 FullBath Full bathrooms above grade 地上楼层标准卫生间个数 HalfBath Half baths above grade 地上楼层简易卫生间个数 BedroomAbvGr Bedrooms above grade (does NOT include basement bedrooms) 地上楼层卧室个数 KitchenAbvGr Kitchens above grade 地上楼层厨房个数 KitchenQual Kitchen quality 厨房质量评估 TopRmsAbvGrd Total rooms above grade (does not include bathrooms) 地上楼层房间总数（除去卧室） Functional Home functionality (Assume typical unless deductions are warranted) 房屋功能情况 Fireplaces Number of fireplaces 壁炉个数 FireplaceQu Fireplace quality 壁炉质量 GarageType Garage location 车库位置 GarageYrBlt Year garage was built 车库建成年份 GarageFinish Interior finish of the garage 车库内部装饰情况 GarageCars Size of garage in car capacity 车库容量 GarageArea Size of garage in square feet 车库占地面积 GarageQual Garage quality 车库质量 GarageCond Garage condition 车库条件 PavedDrive Paved driveway 车道施工方式 WoodDeckSF Wood deck area in square feet 木甲板面积 OpenPorchSF Open porch area in square feet 开放式门廊面积 EnclosedPorch Enclosed porch area in square feet 封闭式门廊面积 3SsnPorch Three season porch area in square feet 三季门廊面积 ScreenPorch Screen porch area in square feet 纱窗门廊面积 PoolArea Pool area in square feet 游泳池面积 PoolQC Pool quality 游泳池质量 Fence Fence quality 栅栏质量 MiscFeature Miscellaneous feature not covered in other categories 其他配套设施（网球场、电梯…） MiscVal$Value of miscellaneous feature 其他配套设施的费用
MoSold Month Sold (MM) 销售月份
YrSold Year Sold (YYYY) 销售年份
SaleType Type of sale 支付方式
SaleCondition Condition of sale 房屋出售的情况

## 84.2 探索设想

• 读懂数据描述，比如
• 房屋设施 (bedrooms, garage, fireplace, pool, porch, etc.),
• 地理位置 (neighborhood),
• 土地信息 (zoning, shape, size, etc.),
• 品相等级
• 出售价格
• 探索影响房屋价格的因素
• 必要的预处理（缺失值处理、标准化、对数化等等）
• 必要的可视化（比如价格分布图等）
• 必要的统计（比如各地区房屋价格的均值）
• 合理选取若干预测变量，建立多元线性模型，并对模型结果给出解释
• 房屋价格与预测变量（房屋大小、在城市的位置、房屋类型、与街道的距离）

## 84.3 变量选取

• lot_frontage, 建筑离街道的距离
• lot_area, 占地面积
• neighborhood, 建筑在城市的位置
• gr_liv_area, 地上居住面积
• bldg_type, 住宅类别(联排别墅、独栋别墅…)
• year_built 房屋修建日期
d <- ames %>%
select(sale_price,
lot_frontage,
lot_area,
neighborhood,
gr_liv_area,
bldg_type,
year_built
)
d
## # A tibble: 1,460 × 7
##    sale_price lot_frontage lot_area neighborhood gr_liv_area bldg_type
##         <dbl>        <dbl>    <dbl> <chr>              <dbl> <chr>
##  1     208500           65     8450 CollgCr             1710 1Fam
##  2     181500           80     9600 Veenker             1262 1Fam
##  3     223500           68    11250 CollgCr             1786 1Fam
##  4     140000           60     9550 Crawfor             1717 1Fam
##  5     250000           84    14260 NoRidge             2198 1Fam
##  6     143000           85    14115 Mitchel             1362 1Fam
##  7     307000           75    10084 Somerst             1694 1Fam
##  8     200000           NA    10382 NWAmes              2090 1Fam
##  9     129900           51     6120 OldTown             1774 1Fam
## 10     118000           50     7420 BrkSide             1077 2fmCon
## # ℹ 1,450 more rows
## # ℹ 1 more variable: year_built <dbl>

## 84.4 缺失值处理

d %>%
summarise(
across(everything(), function(x) sum(is.na(x)) )
)
## # A tibble: 1 × 7
##   sale_price lot_frontage lot_area neighborhood gr_liv_area bldg_type year_built
##        <int>        <int>    <int>        <int>       <int>     <int>      <int>
## 1          0          259        0            0           0         0          0

d %>%
filter_all(
any_vars(is.na(.))
)
## # A tibble: 259 × 7
##    sale_price lot_frontage lot_area neighborhood gr_liv_area bldg_type
##         <dbl>        <dbl>    <dbl> <chr>              <dbl> <chr>
##  1     200000           NA    10382 NWAmes              2090 1Fam
##  2     144000           NA    12968 Sawyer               912 1Fam
##  3     157000           NA    10920 NAmes               1253 1Fam
##  4     149000           NA    11241 NAmes               1004 1Fam
##  5     154000           NA     8246 Sawyer              1060 1Fam
##  6     149350           NA     8544 Sawyer              1228 1Fam
##  7     144000           NA     9180 SawyerW              884 1Fam
##  8     130250           NA     9200 CollgCr              938 1Fam
##  9     177000           NA    13869 Gilbert             1470 1Fam
## 10     219500           NA     9375 CollgCr             2034 1Fam
## # ℹ 249 more rows
## # ℹ 1 more variable: year_built <dbl>
library(visdat)

d %>% vis_dat()

d %>%
select(-lot_frontage) %>%
visdat::vis_dat()

d <- d %>%
drop_na()
d %>% visdat::vis_dat()

## 84.5 预处理

• 标准化
standard <- function(x) {
(x - mean(x)) / sd(x)
}

d %>%
mutate(
across(where(is.numeric), standard),
across(where(is.character), as.factor)
)
## # A tibble: 1,201 × 7
##    sale_price lot_frontage lot_area neighborhood gr_liv_area bldg_type
##         <dbl>        <dbl>    <dbl> <fct>              <dbl> <fct>
##  1    0.333       -0.208    -0.190  CollgCr            0.375 1Fam
##  2    0.00875      0.410    -0.0444 Veenker           -0.470 1Fam
##  3    0.512       -0.0844    0.164  CollgCr            0.519 1Fam
##  4   -0.489       -0.414    -0.0507 Crawfor            0.388 1Fam
##  5    0.830        0.574     0.544  NoRidge            1.30  1Fam
##  6   -0.453        0.616     0.525  Mitchel           -0.281 1Fam
##  7    1.51         0.204     0.0167 Somerst            0.345 1Fam
##  8   -0.610       -0.784    -0.484  OldTown            0.496 1Fam
##  9   -0.753       -0.826    -0.319  BrkSide           -0.819 2fmCon
## 10   -0.615       -0.00206   0.158  Sawyer            -0.889 1Fam
## # ℹ 1,191 more rows
## # ℹ 1 more variable: year_built <dbl>
• 对数化
d %>%
mutate(
log_sale_price = log(sale_price)
)
## # A tibble: 1,201 × 8
##    sale_price lot_frontage lot_area neighborhood gr_liv_area bldg_type
##         <dbl>        <dbl>    <dbl> <chr>              <dbl> <chr>
##  1     208500           65     8450 CollgCr             1710 1Fam
##  2     181500           80     9600 Veenker             1262 1Fam
##  3     223500           68    11250 CollgCr             1786 1Fam
##  4     140000           60     9550 Crawfor             1717 1Fam
##  5     250000           84    14260 NoRidge             2198 1Fam
##  6     143000           85    14115 Mitchel             1362 1Fam
##  7     307000           75    10084 Somerst             1694 1Fam
##  8     129900           51     6120 OldTown             1774 1Fam
##  9     118000           50     7420 BrkSide             1077 2fmCon
## 10     129500           70    11200 Sawyer              1040 1Fam
## # ℹ 1,191 more rows
## # ℹ 2 more variables: year_built <dbl>, log_sale_price <dbl>
d %>%
mutate(
across(where(is.numeric), log),
across(where(is.character), as.factor)
)
## # A tibble: 1,201 × 7
##    sale_price lot_frontage lot_area neighborhood gr_liv_area bldg_type
##         <dbl>        <dbl>    <dbl> <fct>              <dbl> <fct>
##  1       12.2         4.17     9.04 CollgCr             7.44 1Fam
##  2       12.1         4.38     9.17 Veenker             7.14 1Fam
##  3       12.3         4.22     9.33 CollgCr             7.49 1Fam
##  4       11.8         4.09     9.16 Crawfor             7.45 1Fam
##  5       12.4         4.43     9.57 NoRidge             7.70 1Fam
##  6       11.9         4.44     9.55 Mitchel             7.22 1Fam
##  7       12.6         4.32     9.22 Somerst             7.43 1Fam
##  8       11.8         3.93     8.72 OldTown             7.48 1Fam
##  9       11.7         3.91     8.91 BrkSide             6.98 2fmCon
## 10       11.8         4.25     9.32 Sawyer              6.95 1Fam
## # ℹ 1,191 more rows
## # ℹ 1 more variable: year_built <dbl>
• 标准化 vs 对数化

d %>%
ggplot(aes(x = sale_price)) +
geom_density()
d %>%
ggplot(aes(x = log(sale_price))) +
geom_density()

d <- d %>%
mutate(
across(where(is.numeric),
.fns = list(log = log),
.names = "{.fn}_{.col}"
),
across(where(is.character), as.factor)
)

## 84.6 有趣的探索

### 84.6.1 各区域的房屋价格均值

d %>% count(neighborhood)
## # A tibble: 25 × 2
##    neighborhood     n
##    <fct>        <int>
##  1 Blmngtn         14
##  2 Blueste          2
##  3 BrDale          16
##  4 BrkSide         51
##  5 ClearCr         13
##  6 CollgCr        126
##  7 Crawfor         41
##  8 Edwards         92
##  9 Gilbert         49
## 10 IDOTRR          34
## # ℹ 15 more rows
d %>%
group_by(neighborhood) %>%
summarise(
mean_sale = mean(sale_price)
) %>%

ggplot(
aes(x = mean_sale, y = fct_reorder(neighborhood, mean_sale))
) +
geom_col(aes(fill = mean_sale < 150000), show.legend = FALSE) +
geom_text(aes(label = round(mean_sale, 0)), hjust = 1) +
# scale_x_continuous(
#   expand = c(0, 0),
#   breaks = c(0, 100000, 200000, 300000),
#   labels = c(0, "1w", "2w", "3w")
#   ) +
scale_x_continuous(
expand = c(0, 0),
labels = scales::dollar
) +
scale_fill_viridis_d(option = "D") +
theme_classic() +
labs(x = NULL, y = NULL)

### 84.6.2 房屋价格与占地面积

d %>%
ggplot(aes(x = log_lot_area, y = log_sale_price)) +
geom_point(colour = "blue") +
geom_smooth(method = lm, se = FALSE, formula = "y ~ x")
d %>%
ggplot(aes(x = log_lot_area, y = log_sale_price)) +
geom_point(aes(colour = neighborhood)) +
geom_smooth(method = lm, se = FALSE, formula = "y ~ x")
d %>%
ggplot(aes(x = log_lot_area, y = log_sale_price)) +
geom_point(colour = "blue") +
geom_smooth(method = lm, se = FALSE, formula = "y ~ x", fullrange = TRUE) +
facet_wrap(~neighborhood) +
theme(strip.background = element_blank())

### 84.6.3 房屋价格与房屋居住面积

d %>%
ggplot(aes(x = log_gr_liv_area, y = log_sale_price)) +
geom_point(aes(colour = neighborhood)) +
geom_smooth(method = lm, se = FALSE, formula = "y ~ x")
d %>%
ggplot(aes(x = log_gr_liv_area, y = log_sale_price)) +
geom_point() +
geom_smooth(method = lm, se = FALSE, formula = "y ~ x", fullrange = TRUE) +
facet_wrap(~neighborhood) +
theme(strip.background = element_blank())

### 84.6.4 车库与房屋价格

ames %>%
#select(garage_cars, garage_area, sale_price) %>%
ggplot(aes(x = garage_area, y = sale_price)) +
geom_point(
data = select(ames, -garage_cars),
color = "gray50"
) +
geom_point(aes(color = as_factor(garage_cars))) +
facet_wrap(vars(garage_cars)) +
theme(legend.position = "none") +
ggtitle("This is the influence of garage for sale price")

## 84.7 建模

lm(log_sale_price ~ 1 + log_gr_liv_area + neighborhood, data = d) %>%
broom::tidy()
## # A tibble: 26 × 5
##    term                estimate std.error statistic   p.value
##    <chr>                  <dbl>     <dbl>     <dbl>     <dbl>
##  1 (Intercept)          7.53       0.154    48.7    2.21e-284
##  2 log_gr_liv_area      0.638      0.0200   31.9    3.76e-161
##  3 neighborhoodBlueste -0.314      0.149    -2.10   3.55e-  2
##  4 neighborhoodBrDale  -0.466      0.0724   -6.43   1.80e- 10
##  5 neighborhoodBrkSide -0.336      0.0597   -5.62   2.44e-  8
##  6 neighborhoodClearCr -0.103      0.0762   -1.35   1.76e-  1
##  7 neighborhoodCollgCr  0.00332    0.0556    0.0597 9.52e-  1
##  8 neighborhoodCrawfor -0.0870     0.0612   -1.42   1.55e-  1
##  9 neighborhoodEdwards -0.365      0.0567   -6.44   1.79e- 10
## 10 neighborhoodGilbert -0.0621     0.0599   -1.04   3.00e-  1
## # ℹ 16 more rows
library(lme4)
## Loading required package: Matrix
## Warning: package 'Matrix' was built under R version 4.2.3
##
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
##
##     expand, pack, unpack
lmer(log_sale_price ~ 1 + log_gr_liv_area + (log_gr_liv_area | neighborhood),
data = d) %>%
broom.mixed::tidy()
## # A tibble: 6 × 6
##   effect   group        term                        estimate std.error statistic
##   <chr>    <chr>        <chr>                          <dbl>     <dbl>     <dbl>
## 1 fixed    <NA>         (Intercept)                    6.88     0.334       20.6
## 2 fixed    <NA>         log_gr_liv_area                0.705    0.0493      14.3
## 3 ran_pars neighborhood sd__(Intercept)                1.34    NA           NA
## 4 ran_pars neighborhood cor__(Intercept).log_gr_li…   -0.993   NA           NA
## 5 ran_pars neighborhood sd__log_gr_liv_area            0.205   NA           NA
## 6 ran_pars Residual     sd__Observation                0.191   NA           NA