4 Modeling
4.1 Packages used in this chapter
library(tidyverse)
library(ggplot2)
library(broom)
4.2 Introduction
4.2.1 A simple linear regression model
simple_line <- tribble(
~x.line, ~y.line,
1., 2.,
3., 3.,
4., 4.,
6., 5.
)
simple_line
## # A tibble: 4 x 2
## x.line y.line
## <dbl> <dbl>
## 1 1 2
## 2 3 3
## 3 4 4
## 4 6 5
4.2.1.1 Plot of the data
ggplot(simple_line, aes(x.line, y.line)) +
geom_point()
4.2.1.2 Linear regression
Below, the data are fit to the line
\[ y = mx + b \]
The intercept is assumed unless explicitly removed using either y ~ x - 1 or y ~ 0 + x.
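For comparison, a through-the-origin fit (an aside, not used in the rest of this section) would drop the intercept term; a minimal sketch:
# Hypothetical fit through the origin; the ~ 0 + x.line formula removes the intercept
m_sl_origin <- lm(y.line ~ 0 + x.line, data = simple_line)
coef(m_sl_origin)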
m_sl <- lm(y.line ~ x.line,
data = simple_line)
m_sl
##
## Call:
## lm(formula = y.line ~ x.line, data = simple_line)
##
## Coefficients:
## (Intercept) x.line
## 1.3462 0.6154
summary(m_sl)
##
## Call:
## lm(formula = y.line ~ x.line, data = simple_line)
##
## Residuals:
## 1 2 3 4
## 0.03846 -0.19231 0.19231 -0.03846
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.34615 0.21414 6.286 0.02438 *
## x.line 0.61538 0.05439 11.314 0.00772 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1961 on 2 degrees of freedom
## Multiple R-squared: 0.9846, Adjusted R-squared: 0.9769
## F-statistic: 128 on 1 and 2 DF, p-value: 0.007722
tidy(m_sl)
## # A tibble: 2 x 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 1.35 0.214 6.29 0.0244
## 2 x.line 0.615 0.0544 11.3 0.00772
sl_slope <- tidy(m_sl) %>%
filter(term == "x.line") %>%
dplyr::select(estimate)
sl_intercept <- tidy(m_sl) %>%
filter(term == "(Intercept)") %>%
dplyr::select(estimate)
sl_slope
## # A tibble: 1 x 1
## estimate
## <dbl>
## 1 0.615
sl_intercept
## # A tibble: 1 x 1
## estimate
## <dbl>
## 1 1.35
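If we only need the numbers rather than tibbles, a base-R shortcut (an aside, not used below) is coef(), which returns both estimates as a named vector:
# Named vector containing (Intercept) and x.line
coef(m_sl)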
4.2.1.3 Plot of the data and linear regression
ggplot(simple_line, aes(x.line, y.line)) +
geom_point() +
geom_abline(slope = sl_slope$estimate, intercept = sl_intercept$estimate) +
ylim(0,6) +
xlim(0,7)
If we don’t need the coefficients, we can plot the data and the linear regression directly with ggplot2:
ggplot(simple_line, aes(x.line, y.line)) +
geom_point() +
stat_smooth(method = lm, linetype = "dashed") +
ylim(0,6) +
xlim(0,7)
4.2.1.4 Finally, let’s add prediction intervals to the graph
temp_var <- m_sl %>%
predict(interval="predict") %>%
as_tibble()
## Warning in predict.lm(., interval = "predict"): predictions on current data refer to _future_ responses
temp_var
## # A tibble: 4 x 3
## fit lwr upr
## <dbl> <dbl> <dbl>
## 1 1.96 0.851 3.07
## 2 3.19 2.24 4.14
## 3 3.81 2.86 4.76
## 4 5.04 3.93 6.15
simple_line_predict <- bind_cols(simple_line, temp_var)
simple_line_predict
## # A tibble: 4 x 5
## x.line y.line fit lwr upr
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 2 1.96 0.851 3.07
## 2 3 3 3.19 2.24 4.14
## 3 4 4 3.81 2.86 4.76
## 4 6 5 5.04 3.93 6.15
ggplot(simple_line_predict, aes(x.line, y.line)) +
geom_point() +
stat_smooth(method = lm, linetype = "dashed", size = 0.5) +
geom_line(aes(y=lwr), color = "red", linetype = "dashed") +
geom_line(aes(y=upr), color = "red", linetype = "dashed") +
ylim(0,7) +
xlim(0,7)
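The warning above reminds us that prediction intervals describe future observations. To predict at new x values rather than the original data, we can pass a newdata data frame to predict(); a sketch with two hypothetical x values:
# Prediction intervals at new, hypothetical x values
new_points <- tibble(x.line = c(2, 5))
predict(m_sl, newdata = new_points, interval = "prediction")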
4.2.2 Beyond the linear regression
4.2.2.1 A simple data set for non-linear regression modeling—exponential decay
This example is from Brown and LeMay.
kinetics1 <- tribble(
~time, ~conc,
0., 0.100,
50., 0.0905,
100., 0.0820,
150., 0.0741,
200., 0.0671,
300., 0.0549,
400., 0.0448,
500., 0.0368,
800., 0.0200
)
kinetics1
## # A tibble: 9 x 2
## time conc
## <dbl> <dbl>
## 1 0 0.1
## 2 50 0.0905
## 3 100 0.082
## 4 150 0.0741
## 5 200 0.0671
## 6 300 0.0549
## 7 400 0.0448
## 8 500 0.0368
## 9 800 0.02
4.2.2.2 Simple plots of the data
We can plot the original data set, conc vs. time, to view the trend. As a simple test to confirm the data follow a first-order decay, we can also plot log(conc) vs. time.
ggplot(kinetics1, aes(time, conc)) +
geom_point()
ggplot(kinetics1, aes(time, log(conc))) +
geom_point()
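If the log(conc) plot looks linear, the first-order rate constant can be estimated with an ordinary linear fit; a minimal sketch (the slope of the fitted line is -k):
# First-order kinetics: log(conc) = log(conc0) - k*time, so the slope estimates -k
k_est <- lm(log(conc) ~ time, data = kinetics1)
coef(k_est)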
4.2.2.3 Using the nls function
k1 <- nls(conc ~ 0.1*exp(-a1*time),
data = kinetics1, start = list(a1 = 0.002), trace = T)
## 7.545743e-08 : 0.002
## 7.088224e-08 : 0.0020017
## 7.088224e-08 : 0.002001699
summary(k1)
##
## Formula: conc ~ 0.1 * exp(-a1 * time)
##
## Parameters:
## Estimate Std. Error t value Pr(>|t|)
## a1 2.002e-03 2.367e-06 845.8 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 9.413e-05 on 8 degrees of freedom
##
## Number of iterations to convergence: 2
## Achieved convergence tolerance: 2.138e-07
4.2.2.4 Plotting the model results
Using the augment() function from the broom package, we can plot both the data and the predicted values from the nls() model.
augment(k1)
## # A tibble: 9 x 4
## time conc .fitted .resid
## <dbl> <dbl> <dbl> <dbl>
## 1 0 0.1 0.1 0
## 2 50 0.0905 0.0905 0.0000239
## 3 100 0.082 0.0819 0.000141
## 4 150 0.0741 0.0741 0.0000371
## 5 200 0.0671 0.0670 0.0000908
## 6 300 0.0549 0.0549 0.0000468
## 7 400 0.0448 0.0449 -0.000102
## 8 500 0.0368 0.0368 0.0000433
## 9 800 0.02 0.0202 -0.000162
ggplot()+
geom_point(aes(time, conc), kinetics1) +
geom_line(aes(time, .fitted), augment(k1))
We can also use the output of augment() to plot the residuals.
ggplot()+
geom_point(aes(time, .resid), augment(k1)) +
geom_hline(yintercept = 0)
4.2.2.4.1 Create a function for the fit
If we want to draw a smooth curve of the fit, we need to create a function that uses the calculated coefficients from the nls() model. We can then use the stat_function() geom to superimpose the function on the base plot.
conc.fit <- function(t) {
0.1*exp(-t*summary(k1)$coefficients[1])
}
ggplot(kinetics1, mapping = aes(time, conc)) +
geom_point() +
stat_function(fun = conc.fit, linetype = "dashed", colour = "green") +
ggtitle("A kinetics example from first-year chemistry", subtitle = "dashed green line: first-order, exponential decay") +
theme_bw()
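An equivalent way to build the same function is to pull the fitted rate constant with coef() instead of indexing into summary(); a sketch:
# coef() returns the named coefficient vector from the nls fit
conc.fit2 <- function(t) {
  0.1*exp(-coef(k1)[["a1"]]*t)
}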
4.3 Case Studies
4.3.1 Load cell output
The data collected in the calibration experiment consisted of a known load, applied to the load cell, and the corresponding deflection of the cell from its nominal position. Forty measurements were made over a range of loads from 150,000 to 3,000,000 units. The data were collected in two sets in order of increasing load. The systematic run order makes it difficult to determine whether or not there was any drift in the load cell or measuring equipment over time. Assuming there is no drift, however, the experiment should provide a good description of the relationship between the load applied to the cell and its response.
library(tidyverse)
load_cell <- read_table2(
"NIST data/PONTIUS.dat", skip = 25, col_names = FALSE, col_types = "dd") %>%
rename(Deflection = X1, Load = X2)
load_cell
## # A tibble: 40 x 2
## Deflection Load
## <dbl> <dbl>
## 1 0.110 150000
## 2 0.220 300000
## 3 0.329 450000
## 4 0.439 600000
## 5 0.548 750000
## 6 0.657 900000
## 7 0.766 1050000
## 8 0.875 1200000
## 9 0.983 1350000
## 10 1.09 1500000
## # … with 30 more rows
4.3.1.1 Selection of Initial Model
First, let’s view the data.
ggplot(load_cell) +
geom_point(aes(Load, Deflection))
The data look linear, so we can start with a simple linear model:
\[ y = mx + b \]
load_cell_model <- lm(Deflection ~ Load, load_cell)
summary(load_cell_model)
##
## Call:
## lm(formula = Deflection ~ Load, data = load_cell)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.0042751 -0.0016308 0.0005818 0.0018932 0.0024211
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.150e-03 7.132e-04 8.623 1.77e-10 ***
## Load 7.221e-07 3.969e-10 1819.289 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.002171 on 38 degrees of freedom
## Multiple R-squared: 1, Adjusted R-squared: 1
## F-statistic: 3.31e+06 on 1 and 38 DF, p-value: < 2.2e-16
Wow! An R-squared value of 1! It must be perfect.
4.3.1.1.1 A new package to work with summary information: broom
The broom package is part of the tidyverse and includes glance(), tidy(), and augment(). These functions create tidy data frames based on the model.
load_cell_glance <- glance(load_cell_model)
load_cell_glance
## # A tibble: 1 x 11
## r.squared adj.r.squared sigma statistic p.value df logLik AIC
## <dbl> <dbl> <dbl> <dbl> <dbl> <int> <dbl> <dbl>
## 1 1.000 1.000 0.00217 3309811. 1.77e-95 2 190. -373.
## # … with 3 more variables: BIC <dbl>, deviance <dbl>, df.residual <int>
load_cell_tidy <- tidy(load_cell_model)
load_cell_tidy
## # A tibble: 2 x 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 0.00615 7.13e- 4 8.62 1.77e-10
## 2 Load 0.000000722 3.97e-10 1819. 1.77e-95
augment(load_cell_model)
## # A tibble: 40 x 9
## Deflection Load .fitted .se.fit .resid .hat .sigma .cooksd
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.110 1.50e5 0.114 6.62e-4 -4.28e-3 0.0929 0.00207 2.19e-1
## 2 0.220 3.00e5 0.223 6.12e-4 -3.22e-3 0.0793 0.00213 1.03e-1
## 3 0.329 4.50e5 0.331 5.63e-4 -1.61e-3 0.0673 0.00218 2.12e-2
## 4 0.439 6.00e5 0.439 5.17e-4 -4.21e-4 0.0568 0.00220 1.20e-3
## 5 0.548 7.50e5 0.548 4.74e-4 3.03e-4 0.0477 0.00220 5.14e-4
## 6 0.657 9.00e5 0.656 4.35e-4 8.98e-4 0.0402 0.00220 3.73e-3
## 7 0.766 1.05e6 0.764 4.02e-4 1.26e-3 0.0342 0.00219 6.20e-3
## 8 0.875 1.20e6 0.873 3.74e-4 2.20e-3 0.0297 0.00217 1.62e-2
## 9 0.983 1.35e6 0.981 3.55e-4 1.93e-3 0.0267 0.00218 1.12e-2
## 10 1.09 1.50e6 1.09 3.45e-4 2.16e-3 0.0252 0.00217 1.31e-2
## # … with 30 more rows, and 1 more variable: .std.resid <dbl>
ggplot(load_cell, aes(Load, Deflection)) +
geom_point() +
stat_smooth(method = lm, linetype = "dashed", colour = "blue", size = 0.5) +
ggtitle("NIST Load Cell Calibration Data", subtitle = "+/- 95% Confidence Intervals are not visible")
4.3.1.2 But wait! What about the residuals?
load_cell_resid <- resid(load_cell_model)
load_cell_resid
## 1 2 3 4 5
## -0.0042750714 -0.0032204586 -0.0016058459 -0.0004212331 0.0003033797
## 6 7 8 9 10
## 0.0008979925 0.0012626053 0.0021972180 0.0019318308 0.0021564436
## 11 12 13 14 15
## 0.0023910564 0.0022856692 0.0017402820 0.0014248947 0.0010595075
## 16 17 18 19 20
## 0.0002741203 -0.0010512669 -0.0019066541 -0.0028620414 -0.0040174286
## 21 22 23 24 25
## -0.0039450714 -0.0026004586 -0.0017058459 -0.0005512331 0.0002533797
## 26 27 28 29 30
## 0.0013479925 0.0016026053 0.0020672180 0.0020118308 0.0021964436
## 31 32 33 34 35
## 0.0024210564 0.0022456692 0.0018802820 0.0015148947 0.0007095075
## 36 37 38 39 40
## 0.0004541203 -0.0005512669 -0.0013766541 -0.0023720414 -0.0041674286
# ggplot() +
#   geom_point(aes(load_cell$Load, load_cell_resid)) +
#   geom_hline(aes(yintercept = 0)) +
#   geom_hline(aes(yintercept = +2*(summary(load_cell_model)$sigma)), linetype = "dashed") +
#   geom_hline(aes(yintercept = -2*(summary(load_cell_model)$sigma)), linetype = "dashed") +
#   ggtitle("Deflection Load Residuals", subtitle = "+/- 2(Residual Standard Deviation)") +
#   theme(plot.title = element_text(hjust = 0.5), plot.subtitle = element_text(hjust = 0.5))
Using the augment() function, we can plot the residuals very easily.
load_cell_fit <- augment(load_cell_model)
load_cell_fit
## # A tibble: 40 x 9
## Deflection Load .fitted .se.fit .resid .hat .sigma .cooksd
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.110 1.50e5 0.114 6.62e-4 -4.28e-3 0.0929 0.00207 2.19e-1
## 2 0.220 3.00e5 0.223 6.12e-4 -3.22e-3 0.0793 0.00213 1.03e-1
## 3 0.329 4.50e5 0.331 5.63e-4 -1.61e-3 0.0673 0.00218 2.12e-2
## 4 0.439 6.00e5 0.439 5.17e-4 -4.21e-4 0.0568 0.00220 1.20e-3
## 5 0.548 7.50e5 0.548 4.74e-4 3.03e-4 0.0477 0.00220 5.14e-4
## 6 0.657 9.00e5 0.656 4.35e-4 8.98e-4 0.0402 0.00220 3.73e-3
## 7 0.766 1.05e6 0.764 4.02e-4 1.26e-3 0.0342 0.00219 6.20e-3
## 8 0.875 1.20e6 0.873 3.74e-4 2.20e-3 0.0297 0.00217 1.62e-2
## 9 0.983 1.35e6 0.981 3.55e-4 1.93e-3 0.0267 0.00218 1.12e-2
## 10 1.09 1.50e6 1.09 3.45e-4 2.16e-3 0.0252 0.00217 1.31e-2
## # … with 30 more rows, and 1 more variable: .std.resid <dbl>
ggplot(load_cell_fit) +
geom_point(aes(Load, .resid)) +
ggtitle("Residuals from linear model of the load cell")
The residuals from a good model should be random. Although not strictly necessary, we can plot a histogram or a Q-Q plot to show that these residuals do not follow a normal distribution.
ggplot(load_cell_fit) +
geom_histogram(aes(.resid))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot(load_cell_fit) +
geom_qq(aes(sample = .resid))
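For a numerical check to go with the plots (an aside, not part of the original workflow), a Shapiro-Wilk test could be applied to the residuals:
# A small p-value suggests the residuals are not normally distributed
shapiro.test(load_cell_fit$.resid)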
4.3.1.3 Model Refinement
\[ D = \beta_0 + \beta_1 L + \beta_2 L^2 + \varepsilon \]
We can use the linear model function, lm(), by creating a new variable \(L^2\).
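Alternatively, the squared term can be written directly in the formula with I(), avoiding the extra column; a sketch, not used below:
# I() protects ^2 so it is treated as arithmetic rather than formula syntax
lm(Deflection ~ Load + I(Load^2), data = load_cell)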
load_cell_2 <- mutate(load_cell, Load_squared = Load^2)
load_cell_2
## # A tibble: 40 x 3
## Deflection Load Load_squared
## <dbl> <dbl> <dbl>
## 1 0.110 150000 22500000000
## 2 0.220 300000 90000000000
## 3 0.329 450000 202500000000
## 4 0.439 600000 360000000000
## 5 0.548 750000 562500000000
## 6 0.657 900000 810000000000
## 7 0.766 1050000 1102500000000
## 8 0.875 1200000 1440000000000
## 9 0.983 1350000 1822500000000
## 10 1.09 1500000 2250000000000
## # … with 30 more rows
load_cell_model_2 <- lm(Deflection ~ Load + Load_squared, load_cell_2)
summary(load_cell_model_2)
##
## Call:
## lm(formula = Deflection ~ Load + Load_squared, data = load_cell_2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.468e-04 -1.578e-04 3.817e-05 1.088e-04 4.235e-04
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.736e-04 1.079e-04 6.24 2.97e-07 ***
## Load 7.321e-07 1.578e-10 4638.65 < 2e-16 ***
## Load_squared -3.161e-15 4.867e-17 -64.95 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.0002052 on 37 degrees of freedom
## Multiple R-squared: 1, Adjusted R-squared: 1
## F-statistic: 1.853e+08 on 2 and 37 DF, p-value: < 2.2e-16
load_cell_fit_2 <- augment(load_cell_model_2)
load_cell_fit_2
## # A tibble: 40 x 10
## Deflection Load Load_squared .fitted .se.fit .resid .hat .sigma
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.110 1.50e5 2.25e10 0.110 8.83e-5 -2.21e-4 0.185 2.04e-4
## 2 0.220 3.00e5 9.00e10 0.220 7.19e-5 -4.47e-4 0.123 1.92e-4
## 3 0.329 4.50e5 2.02e11 0.329 5.89e-5 2.99e-5 0.0824 2.08e-4
## 4 0.439 6.00e5 3.60e11 0.439 4.99e-5 2.19e-4 0.0591 2.05e-4
## 5 0.548 7.50e5 5.62e11 0.548 4.50e-5 9.00e-5 0.0480 2.07e-4
## 6 0.657 9.00e5 8.10e11 0.657 4.35e-5 -2.65e-5 0.0450 2.08e-4
## 7 0.766 1.05e6 1.10e12 0.766 4.44e-5 -2.31e-4 0.0468 2.04e-4
## 8 0.875 1.20e6 1.44e12 0.875 4.61e-5 2.77e-4 0.0505 2.03e-4
## 9 0.983 1.35e6 1.82e12 0.983 4.77e-5 -2.73e-4 0.0541 2.03e-4
## 10 1.09 1.50e6 2.25e12 1.09 4.86e-5 -1.90e-4 0.0562 2.05e-4
## # … with 30 more rows, and 2 more variables: .cooksd <dbl>,
## # .std.resid <dbl>
ggplot(load_cell_fit_2) +
geom_point(aes(Load, .resid)) +
ggtitle("Residuals from refined model of the load cell")
4.3.1.3.1 Could we have used a non-linear least squares fit model?
load_cell_model_3 <- nls(Deflection ~ b0 + b1*Load + b2*Load^2, load_cell_2, start = c(b0 = 0, b1 = 0, b2 = 0))
summary(load_cell_model_3)
##
## Formula: Deflection ~ b0 + b1 * Load + b2 * Load^2
##
## Parameters:
## Estimate Std. Error t value Pr(>|t|)
## b0 6.736e-04 1.079e-04 6.24 2.97e-07 ***
## b1 7.321e-07 1.578e-10 4638.65 < 2e-16 ***
## b2 -3.161e-15 4.867e-17 -64.95 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.0002052 on 37 degrees of freedom
##
## Number of iterations to convergence: 1
## Achieved convergence tolerance: 1.328e-06
The results are identical.
ggplot(load_cell_fit_2) +
geom_histogram(aes(.resid))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot(load_cell_fit_2) +
geom_qq(aes(sample = .resid))
4.3.2 Thermal expansion of copper
From section 4.6.4 of the NIST handbook, Thermal Expansion of Copper Case Study.
This case study illustrates the use of a class of nonlinear models called rational function models. The data set used is the thermal expansion of copper related to temperature.
CTECu <- read_table2(
"NIST data/HAHN1.dat", skip = 25, col_names = FALSE)
## Parsed with column specification:
## cols(
## X1 = col_double(),
## X2 = col_double()
## )
CTECu <- CTECu %>%
rename(temp_K = X1, Cu_CTE = X2)
View(CTECu)
ggplot(CTECu, aes(temp_K, Cu_CTE)) +
geom_point()
4.3.3 Quadratic/Quadratic (Q/Q) model
The NIST handbook gives a procedure for calculating starting estimates for the model; below, I just used guessed starting values for the equation \[ y = \frac{A_0 + A_1 x + A_2 x^2}{1 + B_1 x + B_2 x^2} \]
model_Cu <- nls(Cu_CTE ~ ((a0 + a1*temp_K + a2*temp_K^2)/(1 + b1*temp_K + b2*temp_K^2)),
CTECu, start = list(a0 = 0, a1 = -1, a2 = -1, b1 = 0, b2 = 0), trace = T)
## 1.344556e+13 : 0 -1 -1 0 0
## 1697329 : -3.351400e+00 1.797879e-01 -5.745645e-04 7.915158e-07 -3.877688e-10
## 177589.3 : -3.319661e+00 1.750319e-01 -3.592189e-04 1.144561e-03 -6.445222e-07
## 32411.82 : -4.730736e+00 2.146213e-01 -3.235814e-04 3.851759e-03 -2.232626e-06
## 5374.946 : -6.413603e+00 2.683958e-01 -2.844465e-04 7.799104e-03 -4.266107e-06
## 690.5713 : -6.901674e+00 2.867489e-01 -1.804611e-04 1.009256e-02 -3.131612e-06
## 217.1164 : -5.970973e+00 2.481232e-01 1.250216e-04 8.918691e-03 1.150181e-05
## 114.0929 : -3.145220e+00 1.061122e-01 1.610442e-03 4.783451e-03 8.707152e-05
## 64.59622 : 3.1477349163 -0.2965983768 0.0078946767 0.0013004690 0.0003974542
## 42.78158 : 9.3230025114 -0.8210512798 0.0188009501 0.0103976361 0.0009147904
## 34.34313 : 11.848742450 -1.098105431 0.025888747 0.024139632 0.001231568
## 33.5614 : 11.838847499 -1.129134608 0.027249383 0.029538247 0.001286009
## 33.55298 : 12.220405792 -1.169396827 0.028259353 0.031296606 0.001332281
## 33.55278 : 11.988150281 -1.149285630 0.027831681 0.030880324 0.001312172
## 33.55273 : 12.165006923 -1.165285413 0.028186412 0.031296535 0.001328754
## 33.55271 : 12.034452089 -1.153533233 0.027926998 0.030997296 0.001316621
## 33.55269 : 12.131480095 -1.162274660 0.028120094 0.031220657 0.001325652
## 33.55268 : 12.059618930 -1.155801947 0.027977138 0.031055397 0.001318966
## 33.55268 : 12.112946981 -1.160605895 0.028083247 0.031178098 0.001323928
## 33.55268 : 12.073406215 -1.157044238 0.028004583 0.031087151 0.001320249
## 33.55267 : 12.102793316 -1.159691514 0.028063056 0.031154770 0.001322984
## 33.55267 : 12.081046435 -1.157732650 0.028019791 0.031104752 0.001320961
## 33.55267 : 12.097167203 -1.159184809 0.028051866 0.031141839 0.001322461
## 33.55267 : 12.085177311 -1.158104731 0.028028009 0.031114250 0.001321345
## 33.55267 : 12.094068505 -1.158905649 0.028045699 0.031134704 0.001322172
## 33.55267 : 12.087436601 -1.158308224 0.028032503 0.031119442 0.001321555
## 33.55267 : 12.092386536 -1.158754113 0.028042351 0.031130832 0.001322016
## 33.55267 : 12.088732963 -1.158425030 0.028035083 0.031122431 0.001321676
## 33.55267 : 12.091445289 -1.158669369 0.028040481 0.031128672 0.001321928
## 33.55267 : 12.08942043 -1.15848695 0.02803645 0.03112401 0.00132174
## 33.55267 : 12.090900623 -1.158620261 0.028039395 0.031127413 0.001321877
## 33.55267 : 12.089833503 -1.158524186 0.028037274 0.031124964 0.001321778
## 33.55267 : 12.090620445 -1.158595059 0.028038839 0.031126773 0.001321851
summary(model_Cu)
##
## Formula: Cu_CTE ~ ((a0 + a1 * temp_K + a2 * temp_K^2)/(1 + b1 * temp_K +
## b2 * temp_K^2))
##
## Parameters:
## Estimate Std. Error t value Pr(>|t|)
## a0 12.0906204 4.9457268 2.445 0.01525 *
## a1 -1.1585951 0.4459281 -2.598 0.00997 **
## a2 0.0280388 0.0099339 2.823 0.00518 **
## b1 0.0311268 0.0123461 2.521 0.01237 *
## b2 0.0013219 0.0004639 2.849 0.00478 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3811 on 231 degrees of freedom
##
## Number of iterations to convergence: 32
## Achieved convergence tolerance: 8.018e-06
glance(model_Cu)
## # A tibble: 1 x 8
## sigma isConv finTol logLik AIC BIC deviance df.residual
## <dbl> <lgl> <dbl> <dbl> <dbl> <dbl> <dbl> <int>
## 1 0.381 TRUE 0.00000802 -105. 221. 242. 33.6 231
augment(model_Cu)
## # A tibble: 236 x 4
## temp_K Cu_CTE .fitted .resid
## <dbl> <dbl> <dbl> <dbl>
## 1 24.4 0.591 0.203 0.388
## 2 34.8 1.55 1.56 -0.0110
## 3 44.1 2.90 3.14 -0.237
## 4 45.1 2.89 3.31 -0.413
## 5 55.0 4.70 4.94 -0.239
## 6 65.5 6.31 6.49 -0.181
## 7 70.5 7.03 7.15 -0.119
## 8 75.7 7.90 7.78 0.116
## 9 89.6 9.47 9.26 0.211
## 10 91.1 9.48 9.41 0.0757
## # … with 226 more rows
tidy(model_Cu)
## # A tibble: 5 x 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 a0 12.1 4.95 2.44 0.0152
## 2 a1 -1.16 0.446 -2.60 0.00997
## 3 a2 0.0280 0.00993 2.82 0.00518
## 4 b1 0.0311 0.0123 2.52 0.0124
## 5 b2 0.00132 0.000464 2.85 0.00478
4.3.3.1 Create a function using the fit parameters
Cu_fit <- function(x) {
  cf <- summary(model_Cu)$coefficients
  (cf[1] + cf[2]*x + cf[3]*x^2) / (1 + cf[4]*x + cf[5]*x^2)
}
4.3.3.2 Add the fitted curve to the graph
ggplot(CTECu, aes(temp_K, Cu_CTE)) +
geom_point() +
stat_function(fun = Cu_fit, colour = "green", linetype = "dashed") +
ggtitle("Thermal Expansion of Copper", subtitle = "nls function using Q/Q model")
4.3.3.3 Plot the residuals
ggplot(augment(model_Cu)) +
geom_point(aes(temp_K, .resid)) +
geom_hline(aes(yintercept=0)) +
geom_hline(aes(yintercept=+2*(0.38)), linetype = "dashed") +
geom_hline(aes(yintercept=-2*(0.38)), linetype = "dashed") +
ggtitle("Thermal Expansion of Copper Residuals", subtitle = "+/- 2(Residual Statndard Deviation)")
The fit is not very good: the residuals show a clear structure, indicating that the Q/Q model is insufficient.
4.3.4 Cubic/Cubic Rational Function
\[ y = \frac{A_0 + A_1 x + A_2 x^2 + A_3 x^3}{1 + B_1 x + B_2 x^2 + B_3 x^3} \]
mcc_Cu <- nls(Cu_CTE ~ ((a0 + a1*temp_K + a2*temp_K^2 + a3*temp_K^3)/(1 + b1*temp_K + b2*temp_K^2 + b3*temp_K^3)),
CTECu, start = list(a0 = 0, a1 = -1, a2 = -1, a3 = 0, b1 = 0, b2 = 0, b3 = 0),
trace = T)
## 1.344556e+13 : 0 -1 -1 0 0 0 0
## 1.339595e+13 : -4.089856e-03 -9.988271e-01 -9.983564e-01 6.676610e-04 -6.676602e-04 -1.055201e-11 1.803518e-15
## 1.334366e+13 : -4.807459e-03 -9.966232e-01 -9.964072e-01 6.664584e-04 -6.677607e-04 -1.094787e-11 1.584238e-15
## 1.32396e+13 : -6.863478e-03 -9.922082e-01 -9.925166e-01 6.639777e-04 -6.678813e-04 -1.223912e-11 1.510447e-15
## 1.30334e+13 : -1.594000e-02 -9.833127e-01 -9.847661e-01 6.593020e-04 -6.683904e-04 -1.602231e-11 1.289942e-15
## 1.262831e+13 : -6.540076e-02 -9.650849e-01 -9.693885e-01 6.507887e-04 -6.701861e-04 -3.033631e-11 1.802077e-15
## 1.184743e+13 : -1.788916e-01 -9.289031e-01 -9.391153e-01 6.341455e-04 -6.739522e-04 -6.184747e-11 2.875150e-15
## 1.040295e+13 : -4.142861e-01 -8.583930e-01 -8.804668e-01 5.994049e-04 -6.790616e-04 -1.428677e-10 1.605646e-14
## 7.989692e+12 : -9.341122e-01 -7.234575e-01 -7.705432e-01 5.175015e-04 -6.708211e-04 -4.851866e-10 1.813912e-13
## 6.371193e+12 : -1.922326e+00 -4.855256e-01 -5.789681e-01 -4.405781e-04 4.048477e-04 -1.132038e-09 4.421753e-13
## 1.590227e+12 : -3.385858e+00 -1.294959e-01 -2.899297e-01 -2.187966e-04 4.037949e-04 -2.100189e-09 8.979057e-13
## 19889209 : -4.850624e+00 2.266128e-01 -8.927502e-04 3.128700e-06 3.991144e-04 -5.877643e-09 2.695245e-12
## 825292.5 : -4.854099e+00 2.244501e-01 -7.872443e-04 1.327192e-06 8.773391e-04 4.504172e-08 -1.183450e-10
## 72834.96 : -4.698904e+00 2.158071e-01 -6.255161e-04 8.206990e-07 1.206312e-03 1.711327e-06 -1.281084e-09
## 11897.69 : -4.260919e+00 1.962288e-01 -4.518142e-04 6.174559e-07 5.394675e-04 9.144746e-06 -6.252643e-09
## 3068.102 : -3.363680e+00 1.545785e-01 -1.051538e-04 4.250646e-07 -1.433242e-03 2.984461e-05 -2.070801e-08
## 909.5887 : -1.947093e+00 8.097392e-02 6.980495e-04 -2.856405e-08 -4.239912e-03 7.556907e-05 -5.390319e-08
## 220.8618 : -4.057550e-01 -1.376339e-02 2.096694e-03 -9.279485e-07 -6.080614e-03 1.470438e-04 -1.056115e-07
## 31.88262 : 7.090768e-01 -9.381732e-02 3.536101e-03 -1.838264e-06 -6.162496e-03 2.145881e-04 -1.480838e-07
## 3.200182 : 1.102048e+00 -1.249213e-01 4.149347e-03 -1.951524e-06 -5.756966e-03 2.422411e-04 -1.492413e-07
## 1.55733 : 1.100520e+00 -1.246480e-01 4.131519e-03 -1.571143e-06 -5.729023e-03 2.422069e-04 -1.300201e-07
## 1.532465 : 1.080053e+00 -1.228904e-01 4.090638e-03 -1.435504e-06 -5.758189e-03 2.407076e-04 -1.235758e-07
## 1.532438 : 1.077745e+00 -1.227015e-01 4.086549e-03 -1.426532e-06 -5.760914e-03 2.405448e-04 -1.231569e-07
## 1.532438 : 1.077639e+00 -1.226932e-01 4.086381e-03 -1.426274e-06 -5.760992e-03 2.405376e-04 -1.231449e-07
summary(mcc_Cu)
##
## Formula: Cu_CTE ~ ((a0 + a1 * temp_K + a2 * temp_K^2 + a3 * temp_K^3)/(1 +
## b1 * temp_K + b2 * temp_K^2 + b3 * temp_K^3))
##
## Parameters:
## Estimate Std. Error t value Pr(>|t|)
## a0 1.078e+00 1.707e-01 6.313 1.40e-09 ***
## a1 -1.227e-01 1.200e-02 -10.224 < 2e-16 ***
## a2 4.086e-03 2.251e-04 18.155 < 2e-16 ***
## a3 -1.426e-06 2.758e-07 -5.172 5.06e-07 ***
## b1 -5.761e-03 2.471e-04 -23.312 < 2e-16 ***
## b2 2.405e-04 1.045e-05 23.019 < 2e-16 ***
## b3 -1.231e-07 1.303e-08 -9.453 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.0818 on 229 degrees of freedom
##
## Number of iterations to convergence: 23
## Achieved convergence tolerance: 1.664e-06
glance(mcc_Cu)
## # A tibble: 1 x 8
## sigma isConv finTol logLik AIC BIC deviance df.residual
## <dbl> <lgl> <dbl> <dbl> <dbl> <dbl> <dbl> <int>
## 1 0.0818 TRUE 0.00000166 259. -503. -475. 1.53 229
augment(mcc_Cu)
## # A tibble: 236 x 4
## temp_K Cu_CTE .fitted .resid
## <dbl> <dbl> <dbl> <dbl>
## 1 24.4 0.591 0.496 0.0946
## 2 34.8 1.55 1.57 -0.0183
## 3 44.1 2.90 2.90 0.00143
## 4 45.1 2.89 3.05 -0.159
## 5 55.0 4.70 4.64 0.0643
## 6 65.5 6.31 6.28 0.0265
## 7 70.5 7.03 7.01 0.0173
## 8 75.7 7.90 7.72 0.175
## 9 89.6 9.47 9.40 0.0745
## 10 91.1 9.48 9.56 -0.0797
## # … with 226 more rows
tidy(mcc_Cu)
## # A tibble: 7 x 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 a0 1.08 0.171 6.31 1.40e- 9
## 2 a1 -0.123 0.0120 -10.2 1.87e-20
## 3 a2 0.00409 0.000225 18.2 3.11e-46
## 4 a3 -0.00000143 0.000000276 -5.17 5.06e- 7
## 5 b1 -0.00576 0.000247 -23.3 2.18e-62
## 6 b2 0.000241 0.0000104 23.0 1.66e-61
## 7 b3 -0.000000123 0.0000000130 -9.45 4.08e-18
4.3.4.1 Create a function using the fit parameters
cc.Cu.fit <- function(x) {
  cf <- summary(mcc_Cu)$coefficients
  (cf[1] + cf[2]*x + cf[3]*x^2 + cf[4]*x^3) /
    (1 + cf[5]*x + cf[6]*x^2 + cf[7]*x^3)
}
4.3.4.2 Add the fitted curve to the graph
ggplot(CTECu, aes(temp_K, Cu_CTE)) +
geom_point() +
stat_function(fun = cc.Cu.fit, colour = "green", linetype = "dashed") +
ggtitle("Thermal Expansion of Copper", subtitle = "nls function using C/C model")
4.3.4.3 Plot the residuals from the C/C model
ggplot(augment(mcc_Cu)) +
geom_point(aes(temp_K, .resid)) +
geom_hline(aes(yintercept=0)) +
geom_hline(aes(yintercept=+2*0.082), linetype = "dashed") +
geom_hline(aes(yintercept=-2*0.082), linetype = "dashed") +
ggtitle("Thermal Expansion of Copper Residuals (C/C)", subtitle = "+/- 2(Residual Statndard Deviation)")
4.3.4.4 Finally, let’s fit the data with the LOESS method and compare it to the C/C model
ggplot(CTECu, aes(temp_K, Cu_CTE)) +
geom_point() +
stat_smooth(method = "loess", span = 0.2, linetype = "dashed", size = 0.5) +
ggtitle("Thermal Expansion of Copper", subtitle = "analysis with LOESS model")
We can look at the quality of the fit using the LOESS model directly.
mloess_Cu <- loess(Cu_CTE ~ temp_K, CTECu, span = 0.2)
summary(mloess_Cu)
## Call:
## loess(formula = Cu_CTE ~ temp_K, data = CTECu, span = 0.2)
##
## Number of Observations: 236
## Equivalent Number of Parameters: 15.02
## Residual Standard Error: 0.09039
## Trace of smoother matrix: 16.62 (exact)
##
## Control settings:
## span : 0.2
## degree : 2
## family : gaussian
## surface : interpolate cell = 0.2
## normalize: TRUE
## parametric: FALSE
## drop.square: FALSE
augment(mloess_Cu)
## # A tibble: 236 x 5
## Cu_CTE temp_K .fitted .se.fit .resid
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.591 24.4 0.589 0.0234 0.00161
## 2 1.55 34.8 1.70 0.0179 -0.157
## 3 2.90 44.1 2.91 0.0196 -0.00555
## 4 2.89 45.1 3.04 0.0197 -0.149
## 5 4.70 55.0 4.60 0.0218 0.102
## 6 6.31 65.5 6.27 0.0222 0.0343
## 7 7.03 70.5 7.03 0.0242 0.000877
## 8 7.90 75.7 7.72 0.0232 0.176
## 9 9.47 89.6 9.37 0.0231 0.102
## 10 9.48 91.1 9.53 0.0229 -0.0486
## # … with 226 more rows
ggplot(augment(mloess_Cu)) +
geom_point(aes(temp_K, .resid)) +
geom_hline(aes(yintercept=0)) +
geom_hline(aes(yintercept=+2*(0.09)), linetype = "dashed") +
geom_hline(aes(yintercept=-2*(0.09)), linetype = "dashed") +
ggtitle("Thermal Expansion of Copper Residuals (LOESS)", subtitle = "+/- 2(Residual Statndard Error)")
The C/C model fit is slightly better, but it required significantly more work.
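To put a number on “slightly better,” we can compare the two residual standard errors directly; a sketch (a loess object stores its value in the s component):
# Residual standard errors: C/C rational model vs. LOESS
c(cc = summary(mcc_Cu)$sigma, loess = mloess_Cu$s)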
4.3.4.5 Programming with case_when()
Looking at the original data set, we can see that there are two sets of data. We might want to label these as “run1” and “run2.” We could do this in Excel using an IF() function; in the tidyverse, we can use the case_when() function.
load_cell_fit_2
## # A tibble: 40 x 10
## Deflection Load Load_squared .fitted .se.fit .resid .hat .sigma
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.110 1.50e5 2.25e10 0.110 8.83e-5 -2.21e-4 0.185 2.04e-4
## 2 0.220 3.00e5 9.00e10 0.220 7.19e-5 -4.47e-4 0.123 1.92e-4
## 3 0.329 4.50e5 2.02e11 0.329 5.89e-5 2.99e-5 0.0824 2.08e-4
## 4 0.439 6.00e5 3.60e11 0.439 4.99e-5 2.19e-4 0.0591 2.05e-4
## 5 0.548 7.50e5 5.62e11 0.548 4.50e-5 9.00e-5 0.0480 2.07e-4
## 6 0.657 9.00e5 8.10e11 0.657 4.35e-5 -2.65e-5 0.0450 2.08e-4
## 7 0.766 1.05e6 1.10e12 0.766 4.44e-5 -2.31e-4 0.0468 2.04e-4
## 8 0.875 1.20e6 1.44e12 0.875 4.61e-5 2.77e-4 0.0505 2.03e-4
## 9 0.983 1.35e6 1.82e12 0.983 4.77e-5 -2.73e-4 0.0541 2.03e-4
## 10 1.09 1.50e6 2.25e12 1.09 4.86e-5 -1.90e-4 0.0562 2.05e-4
## # … with 30 more rows, and 2 more variables: .cooksd <dbl>,
## # .std.resid <dbl>
load_cell_fit_2_runs <- load_cell_fit_2 %>%
mutate(
run = case_when(seq_along(Load) <= 20 ~ "run1",
seq_along(Load) > 20 ~ "run2"))
load_cell_fit_2_runs
## # A tibble: 40 x 11
## Deflection Load Load_squared .fitted .se.fit .resid .hat .sigma
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.110 1.50e5 2.25e10 0.110 8.83e-5 -2.21e-4 0.185 2.04e-4
## 2 0.220 3.00e5 9.00e10 0.220 7.19e-5 -4.47e-4 0.123 1.92e-4
## 3 0.329 4.50e5 2.02e11 0.329 5.89e-5 2.99e-5 0.0824 2.08e-4
## 4 0.439 6.00e5 3.60e11 0.439 4.99e-5 2.19e-4 0.0591 2.05e-4
## 5 0.548 7.50e5 5.62e11 0.548 4.50e-5 9.00e-5 0.0480 2.07e-4
## 6 0.657 9.00e5 8.10e11 0.657 4.35e-5 -2.65e-5 0.0450 2.08e-4
## 7 0.766 1.05e6 1.10e12 0.766 4.44e-5 -2.31e-4 0.0468 2.04e-4
## 8 0.875 1.20e6 1.44e12 0.875 4.61e-5 2.77e-4 0.0505 2.03e-4
## 9 0.983 1.35e6 1.82e12 0.983 4.77e-5 -2.73e-4 0.0541 2.03e-4
## 10 1.09 1.50e6 2.25e12 1.09 4.86e-5 -1.90e-4 0.0562 2.05e-4
## # … with 30 more rows, and 3 more variables: .cooksd <dbl>,
## # .std.resid <dbl>, run <chr>
4.3.4.6 EDA of load cell data by run
ggplot(load_cell_fit_2_runs) +
geom_point(aes(Load, .resid, colour = run)) +
ggtitle("Residuals from refined model of the load cell") +
theme_bw()
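Another quick look at the same question (an aside) is a boxplot of the residuals by run:
ggplot(load_cell_fit_2_runs) +
  geom_boxplot(aes(run, .resid)) +
  ggtitle("Load cell residuals by run")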
4.4 Applying models to multiple datasets
4.4.1 Revisiting the Anscombe dataset
x_anscombe <- anscombe %>% # results will be stored in a new object x_anscombe; we start with the original data frame "anscombe"
dplyr::select(x1, x2, x3, x4) %>% # select the columns we want to work with
rename(group1 = x1, group2 = x2, group3 = x3, group4 = x4) %>% # rename the values using a generic header
gather(key = group, value = x_values, group1, group2, group3, group4) # gather the columns into rows
x_anscombe
## group x_values
## 1 group1 10
## 2 group1 8
## 3 group1 13
## 4 group1 9
## 5 group1 11
## 6 group1 14
## 7 group1 6
## 8 group1 4
## 9 group1 12
## 10 group1 7
## 11 group1 5
## 12 group2 10
## 13 group2 8
## 14 group2 13
## 15 group2 9
## 16 group2 11
## 17 group2 14
## 18 group2 6
## 19 group2 4
## 20 group2 12
## 21 group2 7
## 22 group2 5
## 23 group3 10
## 24 group3 8
## 25 group3 13
## 26 group3 9
## 27 group3 11
## 28 group3 14
## 29 group3 6
## 30 group3 4
## 31 group3 12
## 32 group3 7
## 33 group3 5
## 34 group4 8
## 35 group4 8
## 36 group4 8
## 37 group4 8
## 38 group4 8
## 39 group4 8
## 40 group4 8
## 41 group4 19
## 42 group4 8
## 43 group4 8
## 44 group4 8
y_anscombe <- anscombe %>%
dplyr::select(y1, y2, y3, y4) %>%
gather(key = group, value = y_values, y1, y2, y3, y4) %>% # I don't need to rename the columns as I will discard them (I only need one column to indicate the group number)
dplyr::select(y_values)
y_anscombe
## y_values
## 1 8.04
## 2 6.95
## 3 7.58
## 4 8.81
## 5 8.33
## 6 9.96
## 7 7.24
## 8 4.26
## 9 10.84
## 10 4.82
## 11 5.68
## 12 9.14
## 13 8.14
## 14 8.74
## 15 8.77
## 16 9.26
## 17 8.10
## 18 6.13
## 19 3.10
## 20 9.13
## 21 7.26
## 22 4.74
## 23 7.46
## 24 6.77
## 25 12.74
## 26 7.11
## 27 7.81
## 28 8.84
## 29 6.08
## 30 5.39
## 31 8.15
## 32 6.42
## 33 5.73
## 34 6.58
## 35 5.76
## 36 7.71
## 37 8.84
## 38 8.47
## 39 7.04
## 40 5.25
## 41 12.50
## 42 5.56
## 43 7.91
## 44 6.89
anscombe_tidy <- bind_cols(x_anscombe, y_anscombe)
anscombe_tidy
## group x_values y_values
## 1 group1 10 8.04
## 2 group1 8 6.95
## 3 group1 13 7.58
## 4 group1 9 8.81
## 5 group1 11 8.33
## 6 group1 14 9.96
## 7 group1 6 7.24
## 8 group1 4 4.26
## 9 group1 12 10.84
## 10 group1 7 4.82
## 11 group1 5 5.68
## 12 group2 10 9.14
## 13 group2 8 8.14
## 14 group2 13 8.74
## 15 group2 9 8.77
## 16 group2 11 9.26
## 17 group2 14 8.10
## 18 group2 6 6.13
## 19 group2 4 3.10
## 20 group2 12 9.13
## 21 group2 7 7.26
## 22 group2 5 4.74
## 23 group3 10 7.46
## 24 group3 8 6.77
## 25 group3 13 12.74
## 26 group3 9 7.11
## 27 group3 11 7.81
## 28 group3 14 8.84
## 29 group3 6 6.08
## 30 group3 4 5.39
## 31 group3 12 8.15
## 32 group3 7 6.42
## 33 group3 5 5.73
## 34 group4 8 6.58
## 35 group4 8 5.76
## 36 group4 8 7.71
## 37 group4 8 8.84
## 38 group4 8 8.47
## 39 group4 8 7.04
## 40 group4 8 5.25
## 41 group4 19 12.50
## 42 group4 8 5.56
## 43 group4 8 7.91
## 44 group4 8 6.89
ggplot(anscombe_tidy, aes(x_values, y_values)) +
geom_line(aes(group = group))
In chapter 1, we constructed models for each group; however, for large datasets, we’d like to be able to streamline this process.
Following Hadley Wickham’s example (chapter 11):
anscombe_models <- anscombe_tidy %>%
group_by(group) %>%
do(mod = lm(y_values ~ x_values, data = ., na.action = na.exclude
)) %>%
ungroup()
anscombe_models
## # A tibble: 4 x 2
## group mod
## * <chr> <list>
## 1 group1 <S3: lm>
## 2 group2 <S3: lm>
## 3 group3 <S3: lm>
## 4 group4 <S3: lm>
4.4.2 Model-level summaries
model_summary <- anscombe_models %>%
rowwise() %>%
glance(mod)
model_summary
## # A tibble: 4 x 12
## # Groups: group [4]
## group r.squared adj.r.squared sigma statistic p.value df logLik AIC
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <int> <dbl> <dbl>
## 1 grou… 0.667 0.629 1.24 18.0 0.00217 2 -16.8 39.7
## 2 grou… 0.666 0.629 1.24 18.0 0.00218 2 -16.8 39.7
## 3 grou… 0.666 0.629 1.24 18.0 0.00218 2 -16.8 39.7
## 4 grou… 0.667 0.630 1.24 18.0 0.00216 2 -16.8 39.7
## # … with 3 more variables: BIC <dbl>, deviance <dbl>, df.residual <int>
4.4.3 Coefficient-level summaries
coefficient_summary <- anscombe_models %>%
rowwise() %>%
tidy(mod)
coefficient_summary
## # A tibble: 8 x 6
## # Groups: group [4]
## group term estimate std.error statistic p.value
## <chr> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 group1 (Intercept) 3.00 1.12 2.67 0.0257
## 2 group1 x_values 0.500 0.118 4.24 0.00217
## 3 group2 (Intercept) 3.00 1.13 2.67 0.0258
## 4 group2 x_values 0.5 0.118 4.24 0.00218
## 5 group3 (Intercept) 3.00 1.12 2.67 0.0256
## 6 group3 x_values 0.500 0.118 4.24 0.00218
## 7 group4 (Intercept) 3.00 1.12 2.67 0.0256
## 8 group4 x_values 0.500 0.118 4.24 0.00216
4.4.4 Observation Data
observation_summary <- anscombe_models %>%
rowwise() %>%
augment(mod)
observation_summary
## # A tibble: 44 x 10
## # Groups: group [4]
## group y_values x_values .fitted .se.fit .resid .hat .sigma .cooksd
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 grou… 8.04 10 8.00 0.391 0.0390 0.1 1.31 6.14e-5
## 2 grou… 6.95 8 7.00 0.391 -0.0508 0.1 1.31 1.04e-4
## 3 grou… 7.58 13 9.50 0.601 -1.92 0.236 1.06 4.89e-1
## 4 grou… 8.81 9 7.50 0.373 1.31 0.0909 1.22 6.16e-2
## 5 grou… 8.33 11 8.50 0.441 -0.171 0.127 1.31 1.60e-3
## 6 grou… 9.96 14 10.0 0.698 -0.0414 0.318 1.31 3.83e-4
## 7 grou… 7.24 6 6.00 0.514 1.24 0.173 1.22 1.27e-1
## 8 grou… 4.26 4 5.00 0.698 -0.740 0.318 1.27 1.23e-1
## 9 grou… 10.8 12 9.00 0.514 1.84 0.173 1.10 2.79e-1
## 10 grou… 4.82 7 6.50 0.441 -1.68 0.127 1.15 1.54e-1
## # … with 34 more rows, and 1 more variable: .std.resid <dbl>
ggplot(observation_summary, aes(.se.fit, fill = group)) +
geom_histogram(bins = 50) +
facet_wrap(~ group)
Having model information for each dataset can make the overall analysis much more efficient.
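As an aside, newer tidyverse releases favor nest() and purrr::map() over do() for this many-models workflow. A minimal sketch, assuming the anscombe_tidy data frame from above (nest() and map() are already available because library(tidyverse) loads tidyr and purrr):
anscombe_nested <- anscombe_tidy %>%
  group_by(group) %>%
  nest() %>%                    # one row per group; the data goes into a list-column
  mutate(mod = map(data, ~ lm(y_values ~ x_values, data = .x)),
         glanced = map(mod, glance))
anscombe_nested %>%
  unnest(glanced)               # model-level summaries, one row per group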