40.3 Exploratory Analysis

We begin with exploratory data analysis (EDA) to understand variable distributions, relationships between variables, and potential data issues (outliers, missingness, skew)³. The jtools::movies dataset offers a realistic setting, with continuous and discrete variables relevant to business and creative outcomes.

Key steps:

- Inspect distributions (histograms/densities)
- Examine pairwise relationships (scatterplots, correlation)
- Flag outliers and influential observations

# Load the movies data set that ships with the jtools package
data("movies", package = "jtools")

# Minimal wrangling for illustration: keep the five variables used in this
# section, then retain only the rows with no missing values among them
movies_small <- select(movies, metascore, budget, us_gross, year, runtime)
movies_small <- filter(movies_small, complete.cases(movies_small))

# Per-column summary statistics
summary(movies_small)
#>    metascore          budget             us_gross              year     
#>  Min.   : 16.00   Min.   :    11622   Min.   :4.261e+04   Min.   :1971  
#>  1st Qu.: 52.00   1st Qu.: 19543169   1st Qu.:3.168e+07   1st Qu.:1998  
#>  Median : 64.00   Median : 40452872   Median :7.318e+07   Median :2004  
#>  Mean   : 63.01   Mean   : 60831325   Mean   :1.215e+08   Mean   :2002  
#>  3rd Qu.: 75.00   3rd Qu.: 89567622   3rd Qu.:1.530e+08   3rd Qu.:2009  
#>  Max.   :100.00   Max.   :461435929   Max.   :1.772e+09   Max.   :2013  
#>     runtime     
#>  Min.   :1.333  
#>  1st Qu.:1.667  
#>  Median :1.850  
#>  Mean   :1.923  
#>  3rd Qu.:2.100  
#>  Max.   :3.367

# Distribution plots. budget and us_gross are heavily right-skewed (compare
# their means and medians in the summary above), so they are drawn on the
# log1p scale -- the same transform used for the scatter matrix, correlation
# table and baseline model later in this section. metascore and runtime stay
# on their original scale.
library(tidyr)

# One row per (movie, variable) pair; values are kept on the raw scale here
movies_long <- movies_small %>%
  pivot_longer(cols = c(metascore, budget, us_gross, runtime),
               names_to = "variable", values_to = "value")

movies_long %>%
  mutate(
    is_dollar = variable %in% c("budget", "us_gross"),
    # Transform the values first, while `variable` still holds the raw names
    value = if_else(is_dollar, log1p(value), value),
    # Relabel the facets so the transformed scale is visible in the strip text
    variable = if_else(is_dollar, paste0("log1p(", variable, ")"), variable)
  ) %>%
  ggplot(aes(value)) +
  # Free scales: every variable lives on a different range
  facet_wrap(~ variable, scales = "free") +
  geom_histogram(bins = 30, fill = "#3c8dbc", color = "white") +
  labs(title = "Distributions of Key Variables",
       x = NULL, y = "Count") +
  theme_bw(base_size = 12)

# Pairwise relationships: scatter-plot matrix of the five variables, with
# budget and us_gross on the log1p scale. Skipped when GGally is not installed.
if (requireNamespace("GGally", quietly = TRUE)) {
  movies_small %>%
    mutate(
      budget   = log1p(budget),
      us_gross = log1p(us_gross)
    ) %>%
    GGally::ggpairs(
      columns = c("metascore", "budget", "us_gross", "runtime", "year"),
      # Upper triangle: correlation coefficients
      upper = list(continuous = GGally::wrap("cor", size = 3)),
      # Lower triangle: semi-transparent scatter points
      lower = list(continuous = GGally::wrap("points", alpha = 0.5, size = 0.7)),
      # Diagonal: 20-bin histograms
      diag = list(continuous = GGally::wrap("barDiag", bins = 20))
    ) +
    theme_bw(base_size = 10)
}

# Quick correlation table: the two dollar-valued variables enter on the
# log1p scale, the remaining variables as-is
cor_mat <- movies_small %>%
  transmute(
    metascore,
    budget   = log1p(budget),
    us_gross = log1p(us_gross),
    runtime,
    year
  ) %>%
  cor(use = "pairwise.complete.obs")

# Print rounded to three decimals
round(cor_mat, 3)
#>           metascore budget us_gross runtime   year
#> metascore     1.000 -0.168    0.105   0.197 -0.126
#> budget       -0.168  1.000    0.596   0.378  0.017
#> us_gross      0.105  0.596    1.000   0.245 -0.309
#> runtime       0.197  0.378    0.245   1.000 -0.057
#> year         -0.126  0.017   -0.309  -0.057  1.000

# Outlier & influence screening (pre-model)
# Baseline linear model of metascore on the log1p-transformed financial
# variables plus runtime and year; it supplies the per-observation
# diagnostics computed below.
base_fit <- lm(metascore ~ log1p(budget) + log1p(us_gross) + runtime + year, data = movies_small)

# Per-observation influence statistics: DFBETAS for each coefficient, DFFITS,
# covariance ratio, Cook's distance and hat (leverage) values.
infl <- influence.measures(base_fit)
# Prints the observations reported as potentially influential; flagged
# entries carry a `_*` suffix in the printed table.
summary(infl)
#> Potentially influential observations of
#>   lm(formula = metascore ~ log1p(budget) + log1p(us_gross) + runtime +      year, data = movies_small) :
#> 
#>     dfb.1_ dfb.l1() dfb.l1(_ dfb.rntm dfb.year dffit   cov.r   cook.d hat    
#> 20  -0.05  -0.06     0.06    -0.02     0.05    -0.12    0.96_*  0.00   0.00  
#> 40   0.08   0.06    -0.04     0.05    -0.09    -0.16    0.96_*  0.01   0.00  
#> 44   0.05  -0.08     0.06     0.01    -0.05     0.12    1.03_*  0.00   0.03_*
#> 62   0.09   0.13    -0.05     0.07    -0.10    -0.22    0.97_*  0.01   0.01  
#> 95   0.02   0.03     0.01     0.10    -0.02    -0.16    0.97_*  0.01   0.00  
#> 102  0.00   0.00     0.00     0.00     0.00    -0.01    1.02_*  0.00   0.02_*
#> 106 -0.01  -0.01     0.00     0.01     0.01     0.02    1.02_*  0.00   0.02  
#> 110 -0.01   0.00     0.01    -0.03     0.01    -0.04    1.02_*  0.00   0.01  
#> 112  0.00   0.04     0.00    -0.02     0.00    -0.05    1.02_*  0.00   0.02_*
#> 129  0.00   0.00     0.00     0.00     0.00     0.00    1.02_*  0.00   0.01  
#> 133  0.03   0.03    -0.05    -0.02    -0.03     0.06    1.02_*  0.00   0.01  
#> 138  0.00   0.18    -0.07    -0.06    -0.01    -0.19    1.03_*  0.01   0.03_*
#> 143 -0.02   0.31    -0.10    -0.06     0.00    -0.34_*  1.04_*  0.02   0.05_*
#> 172  0.25   0.05    -0.14    -0.06    -0.24     0.28_*  1.00    0.02   0.02  
#> 205 -0.02   0.40    -0.13    -0.06     0.00    -0.44_*  1.06_*  0.04   0.07_*
#> 230  0.01   0.01    -0.01     0.00    -0.01     0.01    1.02_*  0.00   0.01  
#> 237 -0.21  -0.06     0.01     0.12     0.21    -0.26_*  0.98_*  0.01   0.01  
#> 239 -0.18   0.14    -0.07     0.05     0.17    -0.30_*  0.97_*  0.02   0.01  
#> 271 -0.14   0.04     0.11    -0.14     0.14     0.25_*  0.97_*  0.01   0.01  
#> 296 -0.01  -0.02     0.06     0.00     0.01    -0.07    1.02_*  0.00   0.02_*
#> 298 -0.20  -0.11     0.18     0.00     0.20    -0.24_*  0.99    0.01   0.01  
#> 329  0.11   0.24    -0.16    -0.03    -0.12    -0.26_*  1.00    0.01   0.02_*
#> 330  0.16   0.22    -0.18    -0.02    -0.16    -0.26_*  0.99    0.01   0.01  
#> 350 -0.01   0.00     0.00     0.06     0.01     0.07    1.02_*  0.00   0.02  
#> 383  0.00   0.00     0.00     0.01     0.00     0.01    1.02_*  0.00   0.02_*
#> 385 -0.02   0.01     0.01    -0.09     0.02    -0.10    1.03_*  0.00   0.02_*
#> 387  0.04   0.06    -0.16     0.06    -0.03     0.17    1.06_*  0.01   0.05_*
#> 391  0.00  -0.02     0.00     0.00     0.01     0.02    1.02_*  0.00   0.01  
#> 408 -0.01  -0.02     0.02     0.01     0.01    -0.02    1.02_*  0.00   0.01  
#> 413  0.00   0.00    -0.02     0.01     0.00     0.02    1.02_*  0.00   0.01  
#> 454  0.00   0.15    -0.04    -0.01    -0.01    -0.17    1.04_*  0.01   0.03_*
#> 484 -0.06   0.08     0.04    -0.06     0.06     0.17    0.97_*  0.01   0.00  
#> 503 -0.07   0.12    -0.09    -0.03     0.06    -0.17    1.02_*  0.01   0.02_*
#> 510  0.00   0.00     0.00     0.00     0.00     0.00    1.02_*  0.00   0.02  
#> 515  0.09   0.30    -0.20    -0.04    -0.10    -0.32_*  0.99    0.02   0.02  
#> 516  0.01  -0.03     0.01     0.07    -0.01     0.08    1.03_*  0.00   0.02_*
#> 535 -0.13  -0.10     0.14    -0.04     0.13    -0.20    0.96_*  0.01   0.00  
#> 551 -0.22  -0.02     0.10     0.04     0.22    -0.24_*  0.99    0.01   0.01  
#> 554 -0.03  -0.04     0.11    -0.04     0.03    -0.12    1.04_*  0.00   0.03_*
#> 586 -0.01   0.00     0.04     0.00     0.00    -0.06    1.02_*  0.00   0.02_*
#> 615  0.01   0.04    -0.03     0.00    -0.01    -0.04    1.04_*  0.00   0.03_*
#> 617  0.01   0.01    -0.02     0.00    -0.01     0.02    1.06_*  0.00   0.05_*
#> 618  0.05   0.09     0.04    -0.07    -0.06    -0.19    0.96_*  0.01   0.00  
#> 625  0.04   0.09    -0.07    -0.02    -0.05    -0.13    0.97_*  0.00   0.00  
#> 639  0.03  -0.01     0.00     0.04    -0.03     0.05    1.03_*  0.00   0.02_*
#> 646  0.05  -0.01     0.02     0.00    -0.06     0.08    1.02_*  0.00   0.02  
#> 655  0.02   0.01     0.00     0.04    -0.02    -0.10    0.98_*  0.00   0.00  
#> 661  0.07  -0.04     0.03     0.09    -0.08     0.14    1.03_*  0.00   0.03_*
#> 662  0.02  -0.01     0.00     0.04    -0.02     0.05    1.04_*  0.00   0.03_*
#> 673  0.02   0.03    -0.03     0.00    -0.02     0.04    1.02_*  0.00   0.02  
#> 683  0.07   0.14    -0.10    -0.14    -0.08     0.20    0.98_*  0.01   0.01  
#> 692  0.02  -0.01    -0.01     0.02    -0.01     0.03    1.02_*  0.00   0.02  
#> 698 -0.02  -0.03     0.03     0.10     0.02     0.11    1.03_*  0.00   0.02_*
#> 703 -0.04  -0.07     0.10    -0.13     0.04    -0.20    0.98_*  0.01   0.01  
#> 721 -0.10  -0.09     0.17    -0.24     0.10    -0.32_*  0.99    0.02   0.02  
#> 730 -0.10   0.14    -0.13     0.02     0.10    -0.23    1.01    0.01   0.02_*
#> 731  0.09   0.05    -0.04     0.08    -0.09    -0.16    0.97_*  0.01   0.00  
#> 744  0.01   0.00     0.01     0.00    -0.01     0.02    1.02_*  0.00   0.02  
#> 759  0.00   0.00     0.00     0.01     0.00     0.01    1.02_*  0.00   0.02  
#> 771  0.00   0.00    -0.01    -0.03     0.00    -0.03    1.03_*  0.00   0.02_*
#> 778  0.03   0.01     0.09    -0.15    -0.03     0.20    0.98_*  0.01   0.01  
#> 779 -0.10   0.06     0.09    -0.10     0.09     0.21    0.98_*  0.01   0.01  
#> 788  0.04   0.05    -0.15    -0.02    -0.03     0.17    1.02_*  0.01   0.02_*
#> 829  0.03  -0.01     0.01     0.00    -0.03     0.04    1.02_*  0.00   0.02

# Collect Cook's distance, leverage (hat value) and standardized residual for
# every observation in the baseline fit; `id` is the row position
diag_df <- tibble(
  .cooksd = cooks.distance(base_fit),
  .hat    = hatvalues(base_fit),
  .resid  = rstandard(base_fit),
  id      = seq_along(.cooksd)
)

# Show the ten observations with the largest Cook's distance
diag_df %>%
  arrange(desc(.cooksd)) %>%
  head(10)
#> # A tibble: 10 × 4
#>    .cooksd    .hat .resid    id
#>      <dbl>   <dbl>  <dbl> <int>
#>  1  0.0393 0.0668   -1.66   205
#>  2  0.0233 0.0476   -1.53   143
#>  3  0.0199 0.0178   -2.34   515
#>  4  0.0199 0.0161   -2.46   721
#>  5  0.0176 0.0113   -2.77   239
#>  6  0.0152 0.0165    2.13   172
#>  7  0.0139 0.0107   -2.53   237
#>  8  0.0134 0.0126   -2.29   330
#>  9  0.0134 0.0187   -1.87   329
#> 10  0.0122 0.00869   2.64   271

³ For further details on exploratory analysis, see the next chapter.