38 Report

Structure

  • Exploratory analysis

    • plots
    • preliminary results
    • interesting structure/features in the data
    • outliers
  • Model

    • Assumptions
    • Why this model/ How is this model the best one?
    • Consideration: interactions, collinearity, dependence
  • Model Fit

    • How well does it fit?

    • Are the model assumptions met?

      • Residual analysis
  • Inference/ Prediction

    • Are there different way to support your inference?
  • Conclusion

    • Recommendation

    • Limitation of the analysis

    • How to correct those in the future

This chapter is based on the jtools package. More information can be found here.

38.1 One summary table

Packages for reporting:

Summary Statistics Table:

Regression Table

library(jtools)
data(movies)
fit <- lm(metascore ~ budget + us_gross + year, data = movies)
summ(fit)
Observations 831 (10 missing obs. deleted)
Dependent variable metascore
Type OLS linear regression
F(3,827) 26.23
0.09
Adj. R² 0.08
Est. S.E. t val. p
(Intercept) 52.06 139.67 0.37 0.71
budget -0.00 0.00 -5.89 0.00
us_gross 0.00 0.00 7.61 0.00
year 0.01 0.07 0.08 0.94
Standard errors: OLS
summ(
    fit,
    scale = TRUE,
    vifs = TRUE,
    part.corr = TRUE,
    confint = TRUE,
    pvals = FALSE
) # notice that scale here is TRUE
Observations 831 (10 missing obs. deleted)
Dependent variable metascore
Type OLS linear regression
F(3,827) 26.23
0.09
Adj. R² 0.08
Est. 2.5% 97.5% t val. VIF partial.r part.r
(Intercept) 63.01 61.91 64.11 112.23 NA NA NA
budget -3.78 -5.05 -2.52 -5.89 1.31 -0.20 -0.20
us_gross 5.28 3.92 6.64 7.61 1.52 0.26 0.25
year 0.05 -1.18 1.28 0.08 1.24 0.00 0.00
Standard errors: OLS; Continuous predictors are mean-centered and scaled by 1 s.d. The outcome variable remains in its original units.

#obtain clsuter-robust SE
data("PetersenCL", package = "sandwich")
fit2 <- lm(y ~ x, data = PetersenCL)
summ(fit2, robust = "HC3", cluster = "firm") 
Observations 5000
Dependent variable y
Type OLS linear regression
F(1,4998) 1310.74
0.21
Adj. R² 0.21
Est. S.E. t val. p
(Intercept) 0.03 0.07 0.44 0.66
x 1.03 0.05 20.36 0.00
Standard errors: Cluster-robust, type = HC3

Model to Equation

# install.packages("equatiomatic") # not available for R 4.2
fit <- lm(metascore ~ budget + us_gross + year, data = movies)
# show the theoretical model
equatiomatic::extract_eq(fit)
# display the actual coefficients
equatiomatic::extract_eq(fit, use_coefs = TRUE)

38.2 Model Comparison

fit <- lm(metascore ~ log(budget), data = movies)
fit_b <- lm(metascore ~ log(budget) + log(us_gross), data = movies)
fit_c <- lm(metascore ~ log(budget) + log(us_gross) + runtime, data = movies)
coef_names <- c("Budget" = "log(budget)", "US Gross" = "log(us_gross)",
                "Runtime (Hours)" = "runtime", "Constant" = "(Intercept)")
export_summs(fit, fit_b, fit_c, robust = "HC3", coefs = coef_names)
Table 35.1:
Model 1 Model 2 Model 3
Budget -2.43 *** -5.16 *** -6.70 ***
(0.44)    (0.62)    (0.67)   
US Gross         3.96 *** 3.85 ***
        (0.51)    (0.48)   
Runtime (Hours)                 14.29 ***
                (1.63)   
Constant 105.29 *** 81.84 *** 83.35 ***
(7.65)    (8.66)    (8.82)   
N 831        831        831       
R2 0.03     0.09     0.17    
Standard errors are heteroskedasticity robust. *** p < 0.001; ** p < 0.01; * p < 0.05.

Another package is modelsummary

library(modelsummary)
lm_mod <- lm(mpg ~ wt + hp + cyl, mtcars)
msummary(lm_mod, vcov = c("iid","robust","HC4"))
 (1)   (2)   (3)
(Intercept) 38.752 38.752 38.752
(1.787) (2.286) (2.177)
wt −3.167 −3.167 −3.167
(0.741) (0.833) (0.819)
hp −0.018 −0.018 −0.018
(0.012) (0.010) (0.013)
cyl −0.942 −0.942 −0.942
(0.551) (0.573) (0.572)
Num.Obs. 32 32 32
R2 0.843 0.843 0.843
R2 Adj. 0.826 0.826 0.826
AIC 155.5 155.5 155.5
BIC 162.8 162.8 162.8
Log.Lik. −72.738 −72.738 −72.738
F 50.171 31.065 32.623
RMSE 2.35 2.35 2.35
Std.Errors IID HC3 HC4
modelplot(lm_mod, vcov = c("iid","robust","HC4"))

Another package is stargazer

library("stargazer")
stargazer(attitude)
#> 
#> % Table created by stargazer v.5.2.3 by Marek Hlavac, Social Policy Institute. E-mail: marek.hlavac at gmail.com
#> % Date and time: Thu, Aug 29, 2024 - 4:10:22 PM
#> \begin{table}[!htbp] \centering 
#>   \caption{} 
#>   \label{} 
#> \begin{tabular}{@{\extracolsep{5pt}}lccccc} 
#> \\[-1.8ex]\hline 
#> \hline \\[-1.8ex] 
#> Statistic & \multicolumn{1}{c}{N} & \multicolumn{1}{c}{Mean} & \multicolumn{1}{c}{St. Dev.} & \multicolumn{1}{c}{Min} & \multicolumn{1}{c}{Max} \\ 
#> \hline \\[-1.8ex] 
#> rating & 30 & 64.633 & 12.173 & 40 & 85 \\ 
#> complaints & 30 & 66.600 & 13.315 & 37 & 90 \\ 
#> privileges & 30 & 53.133 & 12.235 & 30 & 83 \\ 
#> learning & 30 & 56.367 & 11.737 & 34 & 75 \\ 
#> raises & 30 & 64.633 & 10.397 & 43 & 88 \\ 
#> critical & 30 & 74.767 & 9.895 & 49 & 92 \\ 
#> advance & 30 & 42.933 & 10.289 & 25 & 72 \\ 
#> \hline \\[-1.8ex] 
#> \end{tabular} 
#> \end{table}
## 2 OLS models
linear.1 <-
    lm(rating ~ complaints + privileges + learning + raises + critical,
       data = attitude)
linear.2 <-
    lm(rating ~ complaints + privileges + learning, data = attitude)
## create an indicator dependent variable, and run a probit model
attitude$high.rating <- (attitude$rating > 70)

probit.model <-
    glm(
        high.rating ~ learning + critical + advance,
        data = attitude,
        family = binomial(link = "probit")
    )
stargazer(linear.1,
          linear.2,
          probit.model,
          title = "Results",
          align = TRUE)
#> 
#> % Table created by stargazer v.5.2.3 by Marek Hlavac, Social Policy Institute. E-mail: marek.hlavac at gmail.com
#> % Date and time: Thu, Aug 29, 2024 - 4:10:22 PM
#> % Requires LaTeX packages: dcolumn 
#> \begin{table}[!htbp] \centering 
#>   \caption{Results} 
#>   \label{} 
#> \begin{tabular}{@{\extracolsep{5pt}}lD{.}{.}{-3} D{.}{.}{-3} D{.}{.}{-3} } 
#> \\[-1.8ex]\hline 
#> \hline \\[-1.8ex] 
#>  & \multicolumn{3}{c}{\textit{Dependent variable:}} \\ 
#> \cline{2-4} 
#> \\[-1.8ex] & \multicolumn{2}{c}{rating} & \multicolumn{1}{c}{high.rating} \\ 
#> \\[-1.8ex] & \multicolumn{2}{c}{\textit{OLS}} & \multicolumn{1}{c}{\textit{probit}} \\ 
#> \\[-1.8ex] & \multicolumn{1}{c}{(1)} & \multicolumn{1}{c}{(2)} & \multicolumn{1}{c}{(3)}\\ 
#> \hline \\[-1.8ex] 
#>  complaints & 0.692^{***} & 0.682^{***} &  \\ 
#>   & (0.149) & (0.129) &  \\ 
#>   & & & \\ 
#>  privileges & -0.104 & -0.103 &  \\ 
#>   & (0.135) & (0.129) &  \\ 
#>   & & & \\ 
#>  learning & 0.249 & 0.238^{*} & 0.164^{***} \\ 
#>   & (0.160) & (0.139) & (0.053) \\ 
#>   & & & \\ 
#>  raises & -0.033 &  &  \\ 
#>   & (0.202) &  &  \\ 
#>   & & & \\ 
#>  critical & 0.015 &  & -0.001 \\ 
#>   & (0.147) &  & (0.044) \\ 
#>   & & & \\ 
#>  advance &  &  & -0.062 \\ 
#>   &  &  & (0.042) \\ 
#>   & & & \\ 
#>  Constant & 11.011 & 11.258 & -7.476^{**} \\ 
#>   & (11.704) & (7.318) & (3.570) \\ 
#>   & & & \\ 
#> \hline \\[-1.8ex] 
#> Observations & \multicolumn{1}{c}{30} & \multicolumn{1}{c}{30} & \multicolumn{1}{c}{30} \\ 
#> R$^{2}$ & \multicolumn{1}{c}{0.715} & \multicolumn{1}{c}{0.715} &  \\ 
#> Adjusted R$^{2}$ & \multicolumn{1}{c}{0.656} & \multicolumn{1}{c}{0.682} &  \\ 
#> Log Likelihood &  &  & \multicolumn{1}{c}{-9.087} \\ 
#> Akaike Inf. Crit. &  &  & \multicolumn{1}{c}{26.175} \\ 
#> Residual Std. Error & \multicolumn{1}{c}{7.139 (df = 24)} & \multicolumn{1}{c}{6.863 (df = 26)} &  \\ 
#> F Statistic & \multicolumn{1}{c}{12.063$^{***}$ (df = 5; 24)} & \multicolumn{1}{c}{21.743$^{***}$ (df = 3; 26)} &  \\ 
#> \hline 
#> \hline \\[-1.8ex] 
#> \textit{Note:}  & \multicolumn{3}{r}{$^{*}$p$<$0.1; $^{**}$p$<$0.05; $^{***}$p$<$0.01} \\ 
#> \end{tabular} 
#> \end{table}
# Latex
stargazer(
    linear.1,
    linear.2,
    probit.model,
    title = "Regression Results",
    align = TRUE,
    dep.var.labels = c("Overall Rating", "High Rating"),
    covariate.labels = c(
        "Handling of Complaints",
        "No Special Privileges",
        "Opportunity to Learn",
        "Performance-Based Raises",
        "Too Critical",
        "Advancement"
    ),
    omit.stat = c("LL", "ser", "f"),
    no.space = TRUE
)
# ASCII text output
stargazer(
    linear.1,
    linear.2,
    type = "text",
    title = "Regression Results",
    dep.var.labels = c("Overall Rating", "High Rating"),
    covariate.labels = c(
        "Handling of Complaints",
        "No Special Privileges",
        "Opportunity to Learn",
        "Performance-Based Raises",
        "Too Critical",
        "Advancement"
    ),
    omit.stat = c("LL", "ser", "f"),
    ci = TRUE,
    ci.level = 0.90,
    single.row = TRUE
)
#> 
#> Regression Results
#> ========================================================================
#>                                        Dependent variable:              
#>                          -----------------------------------------------
#>                                          Overall Rating                 
#>                                    (1)                     (2)          
#> ------------------------------------------------------------------------
#> Handling of Complaints   0.692*** (0.447, 0.937) 0.682*** (0.470, 0.894)
#> No Special Privileges    -0.104 (-0.325, 0.118)  -0.103 (-0.316, 0.109) 
#> Opportunity to Learn      0.249 (-0.013, 0.512)   0.238* (0.009, 0.467) 
#> Performance-Based Raises -0.033 (-0.366, 0.299)                         
#> Too Critical              0.015 (-0.227, 0.258)                         
#> Advancement              11.011 (-8.240, 30.262) 11.258 (-0.779, 23.296)
#> ------------------------------------------------------------------------
#> Observations                       30                      30           
#> R2                                0.715                   0.715         
#> Adjusted R2                       0.656                   0.682         
#> ========================================================================
#> Note:                                        *p<0.1; **p<0.05; ***p<0.01
stargazer(
    linear.1,
    linear.2,
    probit.model,
    title = "Regression Results",
    align = TRUE,
    dep.var.labels = c("Overall Rating", "High Rating"),
    covariate.labels = c(
        "Handling of Complaints",
        "No Special Privileges",
        "Opportunity to Learn",
        "Performance-Based Raises",
        "Too Critical",
        "Advancement"
    ),
    omit.stat = c("LL", "ser", "f"),
    no.space = TRUE
)

Correlation Table

correlation.matrix <-
    cor(attitude[, c("rating", "complaints", "privileges")])
stargazer(correlation.matrix, title = "Correlation Matrix")

38.3 Changes in an estimate

coef_names <- coef_names[1:3] # Dropping intercept for plots
plot_summs(fit, fit_b, fit_c, robust = "HC3", coefs = coef_names)
plot_summs(
    fit_c,
    robust = "HC3",
    coefs = coef_names,
    plot.distributions = TRUE
)

38.4 Standard Errors

sandwich vignette

Type Applicable Usage Reference
const Assume constant variances
HC HC0 vcovCL

Heterogeneity

White’s estimator

All other heterogeneity SE methods are derivatives of this.

No small sample bias adjustment

(White 1980)
HC1 vcovCL

Uses a degrees of freedom-based correction

When the number of clusters is small, HC2 and HC3 are better (Cameron, Gelbach, and Miller 2008)

(J. G. MacKinnon and White 1985)
HC2 vcovCL

Better with the linear model, but still applicable for Generalized Linear Models

Needs a hat (weighted) matrix

HC3 vcovCL

Better with the linear model, but still applicable for Generalized Linear Models

Needs a hat (weighted) matrix

HC4 vcovHC (Cribari-Neto 2004)
HC4m vcovHC (Cribari-Neto, Souza, and Vasconcellos 2007)
HC5 vcovHC (Cribari-Neto and Silva 2011)
data(cars)
model <- lm(speed ~ dist, data = cars)
summary(model)
#> 
#> Call:
#> lm(formula = speed ~ dist, data = cars)
#> 
#> Residuals:
#>     Min      1Q  Median      3Q     Max 
#> -7.5293 -2.1550  0.3615  2.4377  6.4179 
#> 
#> Coefficients:
#>             Estimate Std. Error t value Pr(>|t|)    
#> (Intercept)  8.28391    0.87438   9.474 1.44e-12 ***
#> dist         0.16557    0.01749   9.464 1.49e-12 ***
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#> 
#> Residual standard error: 3.156 on 48 degrees of freedom
#> Multiple R-squared:  0.6511, Adjusted R-squared:  0.6438 
#> F-statistic: 89.57 on 1 and 48 DF,  p-value: 1.49e-12
lmtest::coeftest(model, vcov. = sandwich::vcovHC(model, type = "HC1"))
#> 
#> t test of coefficients:
#> 
#>             Estimate Std. Error t value  Pr(>|t|)    
#> (Intercept) 8.283906   0.891860  9.2883 2.682e-12 ***
#> dist        0.165568   0.019402  8.5335 3.482e-11 ***
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

38.5 Coefficient Uncertainty and Distribution

The ggdist allows us to visualize uncertainty under both frequentist and Bayesian frameworks

38.6 Descriptive Tables

Export APA theme

data("mtcars")

library(flextable)
theme_apa(flextable(mtcars[1:5,1:5]))

Export to Latex

print(xtable::xtable(mtcars, type = "latex"),
      file = file.path(getwd(), "output", "mtcars_xtable.tex"))

# American Economic Review style
stargazer::stargazer(
    mtcars,
    title = "Testing",
    style = "aer",
    out = file.path(getwd(), "output", "mtcars_stargazer.tex")
)

# other styles include
# Administrative Science Quarterly
# Quarterly Journal of Economics

However, the above codes do not play well with notes. Hence, I create my own custom code that follows the AMA guidelines

ama_tbl <- function(data, caption, label, note, output_path) {
  library(tidyverse)
  library(xtable)
  # Function to determine column alignment
  get_column_alignment <- function(data) {
    # Start with the alignment for the header row
    alignment <- c("l", "l")
    
    # Check each column
    for (col in seq_len(ncol(data))[-1]) {
      if (is.numeric(data[[col]])) {
        alignment <- c(alignment, "r")  # Right alignment for numbers
      } else {
        alignment <- c(alignment, "c")  # Center alignment for other data
      }
    }
    
    return(alignment)
  }
  
  data %>%
    # bold + left align first column 
    rename_with(~paste("\\multicolumn{1}{l}{\\textbf{", ., "}}"), 1) %>% 
    # bold + center align all other columns
    `colnames<-`(ifelse(colnames(.) != colnames(.)[1],
                        paste("\\multicolumn{1}{c}{\\textbf{", colnames(.), "}}"),
                        colnames(.))) %>% 
    
    xtable(caption = caption,
           label = label,
           align = get_column_alignment(data),
           auto = TRUE) %>%
    print(
      include.rownames = FALSE,
      caption.placement = "top",
      
      hline.after=c(-1, 0),
      
       # p{0.9\linewidth} sets the width of the column to 90% of the line width, and the @{} removes any extra padding around the cell.
      
      add.to.row = list(pos = list(nrow(data)), # Add at the bottom of the table
                        command = c(paste0("\\hline \n \\multicolumn{",ncol(data), "}{l} {", "\n \\begin{tabular}{@{}p{0.9\\linewidth}@{}} \n","Note: ", note, "\n \\end{tabular}  } \n"))), # Add your note here
      
      # make sure your heading is untouched (because you manually change it above)
      sanitize.colnames.function = identity,
      
      # place a the top of the page
      table.placement = "h",
      
      file = output_path
    )
}
ama_tbl(
    mtcars,
    caption     = "This is caption",
    label       = "tab:this_is_label",
    note        = "this is note",
    output_path = file.path(getwd(), "output", "mtcars_custom_ama.tex")
)

38.7 Visualizations and Plots

You can customize your plots based on your preferred journals. Here, I am creating a custom setting for the American Marketing Association.

American-Marketing-Association-ready theme for plots

library(ggplot2)

# check available fonts
# windowsFonts()

# for Times New Roman
# names(windowsFonts()[windowsFonts()=="TT Times New Roman"])
# Making a theme
amatheme = theme_bw(base_size = 14, base_family = "serif") + # This is Time New Roman
    
    theme(
        # remove major gridlines
        panel.grid.major   = element_blank(),

        # remove minor gridlines
        panel.grid.minor   = element_blank(),

        # remove panel border
        panel.border       = element_blank(),

        line               = element_line(),

        # change font
        text               = element_text(),

        # if you want to remove legend title
        # legend.title     = element_blank(),

        legend.title       = element_text(size = rel(0.6), face = "bold"),

        # change font size of legend
        legend.text        = element_text(size = rel(0.6)),
        
        legend.background  = element_rect(color = "black"),
        
        # legend.margin    = margin(t = 5, l = 5, r = 5, b = 5),
        # legend.key       = element_rect(color = NA, fill = NA),

        # change font size of main title
        plot.title         = element_text(
            size           = rel(1.2),
            face           = "bold",
            hjust          = 0.5,
            margin         = margin(b = 15)
        ),
        
        plot.margin        = unit(c(1, 1, 1, 1), "cm"),

        # add black line along axes
        axis.line          = element_line(colour = "black", linewidth = .8),
        
        axis.ticks         = element_line(),
        

        # axis title
        axis.title.x       = element_text(size = rel(1.2), face = "bold"),
        axis.title.y       = element_text(size = rel(1.2), face = "bold"),

        # axis text size
        axis.text.y        = element_text(size = rel(1)),
        axis.text.x        = element_text(size = rel(1))
    )

Example

library(tidyverse)
library(ggsci)
data("mtcars")
yourplot <- mtcars %>%
    select(mpg, cyl, gear) %>%
    ggplot(., aes(x = mpg, y = cyl, fill = gear)) + 
    geom_point() +
    labs(title="Some Plot") 

yourplot + 
    amatheme + 
    # choose different color theme
    scale_color_npg() 

yourplot + 
    amatheme + 
    scale_color_continuous()

Other pre-specified themes

library(ggthemes)


# Stata theme
yourplot +
    theme_stata()

# The economist theme
yourplot + 
    theme_economist()

yourplot + 
    theme_economist_white()

# Wall street journal theme
yourplot + 
    theme_wsj()

# APA theme
yourplot +
    jtools::theme_apa(
        legend.font.size = 24,
        x.font.size = 20,
        y.font.size = 20
    )