41.3 Missing Data

Missingness is information. First detect, then decide: delete, impute, or model it explicitly.

# Counts by variable: one output row per column of tx.
# Per the printed result, pct_missing is a proportion on a 0-1 scale
# (e.g. 568 missing -> 0.0660), not a percentage.
profile_missing(tx)
#> # A tibble: 13 × 3
#>    feature    num_missing pct_missing
#>    <fct>            <int>       <dbl>
#>  1 city                 0      0     
#>  2 year                 0      0     
#>  3 month                0      0     
#>  4 sales              568      0.0660
#>  5 volume             568      0.0660
#>  6 median             616      0.0716
#>  7 listings          1424      0.166 
#>  8 inventory         1467      0.171 
#>  9 date                 0      0     
#> 10 avg_price          568      0.0660
#> 11 absorption        1427      0.166 
#> 12 quarter              0      0     
#> 13 ym                   0      0

# Plot of missingness for the same data frame (figure not reproduced in this text).
plot_missing(tx)

# Simple tidy summary (counts and proportions), most-missing variables first.
# Sort on the numeric NA count *before* formatting: arranging the formatted
# "n (pct%)" strings compares text, which ranks "616 (7.2%)" above
# "1467 (17.1%)". Ties keep the original column order.
# head(12) keeps the top 12 of the 13 variables.
tx %>%
  summarise(across(everything(), ~ sum(is.na(.x)))) %>%
  pivot_longer(everything(), names_to = "variable", values_to = "n_na") %>%
  arrange(desc(n_na)) %>%
  mutate(n_pct_na = sprintf("%d (%.1f%%)", n_na, 100 * n_na / nrow(tx))) %>%
  select(variable, n_pct_na) %>%
  head(12)
#> # A tibble: 12 × 2
#>    variable   n_pct_na    
#>    <chr>      <chr>       
#>  1 inventory  1467 (17.1%)
#>  2 absorption 1427 (16.6%)
#>  3 listings   1424 (16.6%)
#>  4 median     616 (7.2%)  
#>  5 sales      568 (6.6%)  
#>  6 volume     568 (6.6%)  
#>  7 avg_price  568 (6.6%)  
#>  8 city       0 (0.0%)    
#>  9 year       0 (0.0%)    
#> 10 month      0 (0.0%)    
#> 11 date       0 (0.0%)    
#> 12 quarter    0 (0.0%)

# Demonstration only: fill each NA in a few numeric columns with that
# column's own observed median.
# (In modeling, prefer imputations *within* resampling using recipes/caret/tidymodels.)
tx_imputed <- tx %>%
  mutate(
    across(
      c(sales, listings, inventory, median),
      # stats:: makes explicit that the function is meant, not the `median` column
      ~ replace(.x, is.na(.x), stats::median(.x, na.rm = TRUE))
    )
  )
# Summarise the imputed columns; complete_rate should now be 1 for each.
skimr::skim(tx_imputed %>% select(sales, listings, inventory, median))

Table 41.1: Data summary
Name	tx_imputed %>% select(sal…
Number of rows	8602
Number of columns	4
_______________________
Column type frequency:
numeric	4
________________________
Group variables	None

Variable type: numeric

skim_variable	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
sales	1	524.44	1077.59	6	90.0	169.0	432.00	8945.0	▇▁▁▁▁
listings	1	2896.76	5499.11	0	756.0	1283.0	2527.75	43107.0	▇▁▁▁▁
inventory	1	7.01	4.22	0	5.2	6.2	7.57	55.9	▇▁▁▁▁
median	1	127821.26	36014.21	50000	101725.0	123800.0	147900.00	304200.0	▃▇▃▁▁

# dlookr helpers (diagnose and visualize NA concentration)
# diagnose() prints one row per column of tx. Per the output below,
# missing_percent is on a 0-100 scale (568 missing -> 6.60), while
# unique_rate is a 0-1 fraction (46 unique cities -> 0.00535).
diagnose(tx)
#> # A tibble: 13 × 6
#>    variables  types   missing_count missing_percent unique_count unique_rate
#>    <chr>      <chr>           <int>           <dbl>        <int>       <dbl>
#>  1 city       factor              0            0              46    0.00535 
#>  2 year       integer             0            0              16    0.00186 
#>  3 month      integer             0            0              12    0.00140 
#>  4 sales      numeric           568            6.60         1712    0.199   
#>  5 volume     numeric           568            6.60         7495    0.871   
#>  6 median     numeric           616            7.16         1538    0.179   
#>  7 listings   numeric          1424           16.6          3703    0.430   
#>  8 inventory  numeric          1467           17.1           296    0.0344  
#>  9 date       Date                0            0             187    0.0217  
#> 10 avg_price  numeric           568            6.60         7917    0.920   
#> 11 absorption numeric          1427           16.6          6880    0.800   
#> 12 quarter    factor              0            0               4    0.000465
#> 13 ym         factor              0            0             187    0.0217
# Presumably a Pareto-style chart of NA counts per variable (figure not
# reproduced in this text).
plot_na_pareto(tx)

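Best practice: Impute inside a modeling pipeline (e.g., recipes::step_impute_median()) so that the imputation values are estimated from the training data only and do not leak information from held-out data. Also consider adding “was missing” indicator flags to retain signal from missingness patterns.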