40.3 Exploratory Analysis
We begin with exploratory data analysis (EDA) to understand variable distributions, relationships, and potential data issues (outliers, missingness, skew).³ The jtools::movies dataset offers a realistic setting with continuous and discrete variables relevant to business/creative outcomes.
Key steps:
Inspect distributions (histograms/densities)
Examine pairwise relationships (scatterplots, correlation)
Flag outliers and influential observations
# Load the example data set shipped with the jtools package
data(movies, package = "jtools")

# Minimal wrangling for illustration:
# 1. keep only the five variables explored in this section
movies_small <- select(movies, metascore, budget, us_gross, year, runtime)
# 2. retain only rows with no missing value in any of those variables
movies_small <- filter(movies_small, complete.cases(movies_small))

# Per-column numeric summaries (min, quartiles, mean, max)
summary(movies_small)
#> metascore budget us_gross year
#> Min. : 16.00 Min. : 11622 Min. :4.261e+04 Min. :1971
#> 1st Qu.: 52.00 1st Qu.: 19543169 1st Qu.:3.168e+07 1st Qu.:1998
#> Median : 64.00 Median : 40452872 Median :7.318e+07 Median :2004
#> Mean : 63.01 Mean : 60831325 Mean :1.215e+08 Mean :2002
#> 3rd Qu.: 75.00 3rd Qu.: 89567622 3rd Qu.:1.530e+08 3rd Qu.:2009
#> Max. :100.00 Max. :461435929 Max. :1.772e+09 Max. :2013
#> runtime
#> Min. :1.333
#> 1st Qu.:1.667
#> Median :1.850
#> Mean :1.923
#> 3rd Qu.:2.100
#> Max. :3.367
# Distribution plots (free axes per facet; short-scale axis labels for the large dollar values)
library(tidyr)

# Stack the four measures into (variable, value) pairs so that each one can
# be drawn in its own facet; `year` is left as a regular column.
movies_long <- pivot_longer(
  movies_small,
  cols = c(metascore, budget, us_gross, runtime),
  names_to = "variable",
  values_to = "value"
)

# One histogram per variable. The variables live on very different scales,
# so both axes are freed per facet, and large values are printed with
# short-scale labels instead of long digit strings.
ggplot(data = movies_long, mapping = aes(x = value)) +
  geom_histogram(bins = 30, fill = "#3c8dbc", color = "white") +
  facet_wrap(~ variable, scales = "free") +
  scale_x_continuous(
    labels = scales::label_number(scale_cut = scales::cut_short_scale())
  ) +
  labs(
    title = "Distributions of Key Variables",
    x = NULL,
    y = "Count"
  ) +
  theme_bw(base_size = 12)
# Pairwise relationships: simple scatter matrix.
# GGally is treated as an optional dependency, so the plot is only drawn
# when the package is installed.
if (requireNamespace("GGally", quietly = TRUE)) {
  GGally::ggpairs(
    # dollar-valued columns are put on the log1p scale before plotting
    data = mutate(movies_small, across(c(budget, us_gross), log1p)),
    columns = c("metascore", "budget", "us_gross", "runtime", "year"),
    # binned bars on the diagonal, correlations above it, points below it
    diag = list(continuous = GGally::wrap("barDiag", bins = 20)),
    upper = list(continuous = GGally::wrap("cor", size = 3)),
    lower = list(continuous = GGally::wrap("points", alpha = .5, size = .7))
  ) +
    theme_bw(base_size = 10)
}
# Quick correlation table. The two dollar-valued variables are heavily
# skewed, so they are log1p-transformed before correlations are computed.
cor_mat <- movies_small %>%
  select(metascore, budget, us_gross, runtime, year) %>%
  mutate(across(c(budget, us_gross), log1p)) %>%
  cor(use = "pairwise.complete.obs")

# Print to three decimals for readability
round(cor_mat, digits = 3)
#> metascore budget us_gross runtime year
#> metascore 1.000 -0.168 0.105 0.197 -0.126
#> budget -0.168 1.000 0.596 0.378 0.017
#> us_gross 0.105 0.596 1.000 0.245 -0.309
#> runtime 0.197 0.378 0.245 1.000 -0.057
#> year -0.126 0.017 -0.309 -0.057 1.000
# Outlier & influence screening (pre-model, via a baseline linear fit)
# Baseline linear model used only to screen for unusual observations;
# the dollar-valued predictors enter on the log1p scale.
base_fit <- lm(
  formula = metascore ~ log1p(budget) + log1p(us_gross) + runtime + year,
  data = movies_small
)

# Deletion diagnostics for every observation (DFBETAS, DFFITS, covariance
# ratio, Cook's distance, hat values); summary() lists only the cases that
# are flagged as potentially influential.
infl <- influence.measures(model = base_fit)
summary(object = infl)
#> Potentially influential observations of
#> lm(formula = metascore ~ log1p(budget) + log1p(us_gross) + runtime + year, data = movies_small) :
#>
#> dfb.1_ dfb.l1() dfb.l1(_ dfb.rntm dfb.year dffit cov.r cook.d hat
#> 20 -0.05 -0.06 0.06 -0.02 0.05 -0.12 0.96_* 0.00 0.00
#> 40 0.08 0.06 -0.04 0.05 -0.09 -0.16 0.96_* 0.01 0.00
#> 44 0.05 -0.08 0.06 0.01 -0.05 0.12 1.03_* 0.00 0.03_*
#> 62 0.09 0.13 -0.05 0.07 -0.10 -0.22 0.97_* 0.01 0.01
#> 95 0.02 0.03 0.01 0.10 -0.02 -0.16 0.97_* 0.01 0.00
#> 102 0.00 0.00 0.00 0.00 0.00 -0.01 1.02_* 0.00 0.02_*
#> 106 -0.01 -0.01 0.00 0.01 0.01 0.02 1.02_* 0.00 0.02
#> 110 -0.01 0.00 0.01 -0.03 0.01 -0.04 1.02_* 0.00 0.01
#> 112 0.00 0.04 0.00 -0.02 0.00 -0.05 1.02_* 0.00 0.02_*
#> 129 0.00 0.00 0.00 0.00 0.00 0.00 1.02_* 0.00 0.01
#> 133 0.03 0.03 -0.05 -0.02 -0.03 0.06 1.02_* 0.00 0.01
#> 138 0.00 0.18 -0.07 -0.06 -0.01 -0.19 1.03_* 0.01 0.03_*
#> 143 -0.02 0.31 -0.10 -0.06 0.00 -0.34_* 1.04_* 0.02 0.05_*
#> 172 0.25 0.05 -0.14 -0.06 -0.24 0.28_* 1.00 0.02 0.02
#> 205 -0.02 0.40 -0.13 -0.06 0.00 -0.44_* 1.06_* 0.04 0.07_*
#> 230 0.01 0.01 -0.01 0.00 -0.01 0.01 1.02_* 0.00 0.01
#> 237 -0.21 -0.06 0.01 0.12 0.21 -0.26_* 0.98_* 0.01 0.01
#> 239 -0.18 0.14 -0.07 0.05 0.17 -0.30_* 0.97_* 0.02 0.01
#> 271 -0.14 0.04 0.11 -0.14 0.14 0.25_* 0.97_* 0.01 0.01
#> 296 -0.01 -0.02 0.06 0.00 0.01 -0.07 1.02_* 0.00 0.02_*
#> 298 -0.20 -0.11 0.18 0.00 0.20 -0.24_* 0.99 0.01 0.01
#> 329 0.11 0.24 -0.16 -0.03 -0.12 -0.26_* 1.00 0.01 0.02_*
#> 330 0.16 0.22 -0.18 -0.02 -0.16 -0.26_* 0.99 0.01 0.01
#> 350 -0.01 0.00 0.00 0.06 0.01 0.07 1.02_* 0.00 0.02
#> 383 0.00 0.00 0.00 0.01 0.00 0.01 1.02_* 0.00 0.02_*
#> 385 -0.02 0.01 0.01 -0.09 0.02 -0.10 1.03_* 0.00 0.02_*
#> 387 0.04 0.06 -0.16 0.06 -0.03 0.17 1.06_* 0.01 0.05_*
#> 391 0.00 -0.02 0.00 0.00 0.01 0.02 1.02_* 0.00 0.01
#> 408 -0.01 -0.02 0.02 0.01 0.01 -0.02 1.02_* 0.00 0.01
#> 413 0.00 0.00 -0.02 0.01 0.00 0.02 1.02_* 0.00 0.01
#> 454 0.00 0.15 -0.04 -0.01 -0.01 -0.17 1.04_* 0.01 0.03_*
#> 484 -0.06 0.08 0.04 -0.06 0.06 0.17 0.97_* 0.01 0.00
#> 503 -0.07 0.12 -0.09 -0.03 0.06 -0.17 1.02_* 0.01 0.02_*
#> 510 0.00 0.00 0.00 0.00 0.00 0.00 1.02_* 0.00 0.02
#> 515 0.09 0.30 -0.20 -0.04 -0.10 -0.32_* 0.99 0.02 0.02
#> 516 0.01 -0.03 0.01 0.07 -0.01 0.08 1.03_* 0.00 0.02_*
#> 535 -0.13 -0.10 0.14 -0.04 0.13 -0.20 0.96_* 0.01 0.00
#> 551 -0.22 -0.02 0.10 0.04 0.22 -0.24_* 0.99 0.01 0.01
#> 554 -0.03 -0.04 0.11 -0.04 0.03 -0.12 1.04_* 0.00 0.03_*
#> 586 -0.01 0.00 0.04 0.00 0.00 -0.06 1.02_* 0.00 0.02_*
#> 615 0.01 0.04 -0.03 0.00 -0.01 -0.04 1.04_* 0.00 0.03_*
#> 617 0.01 0.01 -0.02 0.00 -0.01 0.02 1.06_* 0.00 0.05_*
#> 618 0.05 0.09 0.04 -0.07 -0.06 -0.19 0.96_* 0.01 0.00
#> 625 0.04 0.09 -0.07 -0.02 -0.05 -0.13 0.97_* 0.00 0.00
#> 639 0.03 -0.01 0.00 0.04 -0.03 0.05 1.03_* 0.00 0.02_*
#> 646 0.05 -0.01 0.02 0.00 -0.06 0.08 1.02_* 0.00 0.02
#> 655 0.02 0.01 0.00 0.04 -0.02 -0.10 0.98_* 0.00 0.00
#> 661 0.07 -0.04 0.03 0.09 -0.08 0.14 1.03_* 0.00 0.03_*
#> 662 0.02 -0.01 0.00 0.04 -0.02 0.05 1.04_* 0.00 0.03_*
#> 673 0.02 0.03 -0.03 0.00 -0.02 0.04 1.02_* 0.00 0.02
#> 683 0.07 0.14 -0.10 -0.14 -0.08 0.20 0.98_* 0.01 0.01
#> 692 0.02 -0.01 -0.01 0.02 -0.01 0.03 1.02_* 0.00 0.02
#> 698 -0.02 -0.03 0.03 0.10 0.02 0.11 1.03_* 0.00 0.02_*
#> 703 -0.04 -0.07 0.10 -0.13 0.04 -0.20 0.98_* 0.01 0.01
#> 721 -0.10 -0.09 0.17 -0.24 0.10 -0.32_* 0.99 0.02 0.02
#> 730 -0.10 0.14 -0.13 0.02 0.10 -0.23 1.01 0.01 0.02_*
#> 731 0.09 0.05 -0.04 0.08 -0.09 -0.16 0.97_* 0.01 0.00
#> 744 0.01 0.00 0.01 0.00 -0.01 0.02 1.02_* 0.00 0.02
#> 759 0.00 0.00 0.00 0.01 0.00 0.01 1.02_* 0.00 0.02
#> 771 0.00 0.00 -0.01 -0.03 0.00 -0.03 1.03_* 0.00 0.02_*
#> 778 0.03 0.01 0.09 -0.15 -0.03 0.20 0.98_* 0.01 0.01
#> 779 -0.10 0.06 0.09 -0.10 0.09 0.21 0.98_* 0.01 0.01
#> 788 0.04 0.05 -0.15 -0.02 -0.03 0.17 1.02_* 0.01 0.02_*
#> 829 0.03 -0.01 0.01 0.00 -0.03 0.04 1.02_* 0.00 0.02
# Collect per-observation diagnostics from the baseline fit:
# Cook's distance, leverage (hat value), standardized residual, and the
# row position of the observation.
diag_df <- tibble(
  .cooksd = cooks.distance(base_fit),
  .hat = hatvalues(base_fit),
  .resid = rstandard(base_fit),
  id = seq_along(.cooksd)
)

# Show the ten observations with the largest Cook's distance; leverage and
# residuals are printed alongside for inspection.
diag_df %>%
  arrange(desc(.cooksd)) %>%
  head(10)
#> # A tibble: 10 × 4
#> .cooksd .hat .resid id
#> <dbl> <dbl> <dbl> <int>
#> 1 0.0393 0.0668 -1.66 205
#> 2 0.0233 0.0476 -1.53 143
#> 3 0.0199 0.0178 -2.34 515
#> 4 0.0199 0.0161 -2.46 721
#> 5 0.0176 0.0113 -2.77 239
#> 6 0.0152 0.0165 2.13 172
#> 7 0.0139 0.0107 -2.53 237
#> 8 0.0134 0.0126 -2.29 330
#> 9 0.0134 0.0187 -1.87 329
#> 10 0.0122 0.00869 2.64 271

For further details on exploratory analysis, see the next chapter.↩︎