40.10 Inference / Prediction
Beyond classical t-tests, this section covers:
- Bootstrap confidence intervals (resample rows with replacement)
- Permutation tests (randomize the treatment/feature under \(H_0\))
- Cross-validation for predictive accuracy (RMSE/MAE)
- Prediction intervals for new observations
# Bootstrap coefficient CIs (percentile) using boot.
# Rows of movies_small are resampled and the linear model is refit on each
# resample; the chunk is skipped when the optional boot package is missing.
if (requireNamespace("boot", quietly = TRUE)) {
  library(boot)

  # Statistic for boot(): refit the model on the rows selected by `idx` and
  # return the named coefficient vector.
  boot_fun <- function(data, idx) {
    m <- lm(
      metascore ~ log1p(budget) + log1p(us_gross) + runtime + year,
      data = data[idx, , drop = FALSE]
    )
    coef(m)
  }

  bt <- boot(movies_small, statistic = boot_fun, R = 1000)

  # Look the coefficient up by name in the bootstrap object itself (bt$t0 is
  # the statistic evaluated on the original data) instead of in a separately
  # fitted model, so the index always refers to the model that was actually
  # bootstrapped. Fail loudly if the term is not there.
  budget_idx <- match("log1p(budget)", names(bt$t0))
  stopifnot(!is.na(budget_idx))

  # Percentile CI for log1p(budget)
  boot.ci(bt, type = "perc", index = budget_idx)
}
#> BOOTSTRAP CONFIDENCE INTERVAL CALCULATIONS
#> Based on 1000 bootstrap replicates
#>
#> CALL :
#> boot.ci(boot.out = bt, type = "perc", index = budget_idx)
#>
#> Intervals :
#> Level Percentile
#> 95% (-8.158, -5.443 )
#> Calculations and Intervals on Original Scale

# Simple K-fold CV (caret) for RMSE
# Estimate out-of-sample error (RMSE, R-squared, MAE) of the linear model with
# 5-fold cross-validation; skipped when the optional caret package is missing.
if (requireNamespace("caret", quietly = TRUE)) {
  library(caret)

  # Resampling plan: plain 5-fold cross-validation.
  ctrl <- trainControl(number = 5, method = "cv")

  cv_fit <- train(
    metascore ~ log1p(budget) + log1p(us_gross) + runtime + year,
    method = "lm",
    trControl = ctrl,
    data = movies_small
  )

  # Show the resampling summary.
  print(cv_fit)
}
#> Linear Regression
#>
#> 831 samples
#> 4 predictor
#>
#> No pre-processing
#> Resampling: Cross-Validated (5 fold)
#> Summary of sample sizes: 665, 664, 665, 666, 664
#> Resampling results:
#>
#> RMSE Rsquared MAE
#> 15.46842 0.1624431 12.53269
#>
#> Tuning parameter 'intercept' was held constant at a value of TRUE

# Prediction intervals for new data
# Build a one-row data set with every predictor set to its sample median
# (missing values ignored), then ask the fitted model for a 95% prediction
# interval at that point.
median_cols <- c("budget", "us_gross", "runtime", "year")
newdat <- do.call(
  tibble,
  lapply(movies_small[median_cols], median, na.rm = TRUE)
)
predict(fit, newdata = newdat, level = 0.95, interval = "prediction")
#> fit lwr upr
#> 1 62.00795 31.66255 92.35336