5 模型可视化

代码提供: 胡慧怡 何舒欣 李清仪 于芊

5.1 PartI

library(modelsummary)
url <- 'https://vincentarelbundock.github.io/Rdatasets/csv/HistData/Guerry.csv'
dat <- read.csv(url) 
dat$Small <- dat$Pop1831 > median(dat$Pop1831)
#数据的快速概述
datasummary_skim(dat)
Unique (#) Missing (%) Mean SD Min Median Max
X 86 0 43.5 25.0 1.0 43.5 86.0
dept 86 0 46.9 30.4 1.0 45.5 200.0
Crime_pers 85 0 19754.4 7504.7 2199.0 18748.5 37014.0
Crime_prop 86 0 7843.1 3051.4 1368.0 7595.0 20235.0
Literacy 50 0 39.3 17.4 12.0 38.0 74.0
Donations 85 0 7075.5 5834.6 1246.0 5020.0 37015.0
Infants 86 0 19049.9 8820.2 2660.0 17141.5 62486.0
Suicides 86 0 36522.6 31312.5 3460.0 26743.5 163241.0
Wealth 86 0 43.5 25.0 1.0 43.5 86.0
Commerce 84 0 42.8 25.0 1.0 42.5 86.0
Clergy 85 0 43.4 25.0 1.0 43.5 86.0
Crime_parents 86 0 43.5 25.0 1.0 43.5 86.0
Infanticide 81 0 43.5 24.9 1.0 43.5 86.0
Donation_clergy 86 0 43.5 25.0 1.0 43.5 86.0
Lottery 86 0 43.5 25.0 1.0 43.5 86.0
Desertion 86 0 43.5 25.0 1.0 43.5 86.0
Instruction 82 0 43.1 24.8 1.0 41.5 86.0
Prostitutes 63 0 141.9 521.0 0.0 33.0 4744.0
Distance 86 0 208.0 109.3 0.0 200.6 539.2
Area 84 0 6147.0 1398.2 762.0 6070.5 10000.0
Pop1831 86 0 378.6 148.8 129.1 346.2 989.9
#平衡表(又名“表1”),各子组的平均值不同:
datasummary_balance(~Small, dat)
## Warning in sanitize_datasummary_balance_data(formula, data): These variables
## were omitted because they include more than 50 levels: Department.
## Warning in datasummary_balance(~Small, dat): Please install the `estimatr`
## package or set `dinm=FALSE` to suppress this warning.
FALSE (N=43)
TRUE (N=43)
Mean Std. Dev. Mean Std. Dev.
X 41.4 27.4 45.6 22.3
dept 46.0 36.6 47.7 23.0
Crime_pers 18040.6 7638.4 21468.2 7044.3
Crime_prop 8422.5 3406.7 7263.7 2559.3
Literacy 37.9 19.1 40.6 15.6
Donations 7258.5 6194.1 6892.6 5519.0
Infants 20790.2 9363.5 17309.6 7973.0
Suicides 42565.4 37074.1 30479.8 23130.9
Wealth 51.0 23.9 36.0 23.9
Commerce 42.7 24.6 43.0 25.7
Clergy 39.1 26.7 47.7 22.7
Crime_parents 54.2 25.2 32.8 19.9
Infanticide 37.9 25.7 49.1 23.1
Donation_clergy 52.3 24.0 34.7 22.9
Lottery 54.8 23.0 32.2 21.7
Desertion 41.7 25.9 45.3 24.2
Instruction 46.7 26.7 39.6 22.5
Prostitutes 52.8 93.1 230.9 724.1
Distance 228.7 116.7 187.2 98.4
Area 5989.0 1142.8 6305.0 1612.4
Pop1831 272.4 53.4 484.8 137.3
N Pct. N Pct.
Region C 13 30.2 4 9.3
E 9 20.9 8 18.6
N 4 9.3 13 30.2
S 12 27.9 5 11.6
W 4 9.3 13 30.2
NA 1 2.3 0 0.0
MainCity 1:Sm 10 23.3 0 0.0
2:Med 33 76.7 33 76.7
3:Lg 0 0.0 10 23.3
#相关表
datasummary_correlation(dat)
X dept Crime_pers Crime_prop Literacy Donations Infants Suicides Wealth Commerce Clergy Crime_parents Infanticide Donation_clergy Lottery Desertion Instruction Prostitutes Distance Area Pop1831
X 1 . . . . . . . . . . . . . . . . . . . .
dept .92 1 . . . . . . . . . . . . . . . . . . .
Crime_pers −.12 −.20 1 . . . . . . . . . . . . . . . . . .
Crime_prop −.28 −.29 .27 1 . . . . . . . . . . . . . . . . .
Literacy .09 .10 −.04 −.37 1 . . . . . . . . . . . . . . . .
Donations .06 .27 −.04 −.13 −.13 1 . . . . . . . . . . . . . . .
Infants −.07 −.03 −.04 .27 −.41 .17 1 . . . . . . . . . . . . . .
Suicides −.18 −.15 −.13 .52 −.37 −.03 .29 1 . . . . . . . . . . . . .
Wealth −.08 −.08 −.12 .46 −.28 .08 .34 .42 1 . . . . . . . . . . . .
Commerce −.03 .05 .05 .41 −.58 .30 .39 .48 .48 1 . . . . . . . . . . .
Clergy .15 .06 .26 −.07 −.17 .09 −.06 −.32 −.11 −.12 1 . . . . . . . . . .
Crime_parents −.16 −.07 −.20 .36 −.20 −.02 .06 .35 .22 .18 −.18 1 . . . . . . . . .
Infanticide .01 −.07 .27 −.13 .32 −.15 −.24 −.08 −.22 −.28 −.01 −.09 1 . . . . . . . .
Donation_clergy −.08 .00 −.18 .30 −.38 .25 .10 .19 .34 .18 .30 .29 −.23 1 . . . . . . .
Lottery −.21 −.11 .00 .43 −.36 .15 .42 .49 .48 .45 −.28 .28 −.35 .36 1 . . . . . .
Desertion .10 .03 .33 −.26 .40 −.04 .00 −.47 −.23 −.36 .25 −.39 .11 −.41 −.30 1 . . . . .
Instruction −.10 −.11 .05 .39 −.98 .14 .43 .36 .31 .59 .21 .21 −.32 .40 .37 −.37 1 . . . .
Prostitutes .17 .14 −.05 −.33 .30 −.07 −.28 −.21 −.32 −.27 .20 −.03 .16 −.04 −.28 .04 −.26 1 . . .
Distance −.10 .04 −.51 .25 −.28 .08 .23 .41 .40 .38 −.31 .26 −.16 .28 .28 −.44 .24 −.37 1 . .
Area −.18 −.08 .22 .09 −.23 .18 .16 .00 .06 .18 .08 −.20 −.23 .02 .23 .04 .20 −.41 .06 1 .
Pop1831 .17 .09 .27 −.26 .09 .00 −.23 −.17 −.31 −.05 .29 −.40 .34 −.22 −.47 .11 −.11 .48 −.37 −.01 1
#两个变量和两个统计数据,嵌套在子组中
datasummary(Literacy + Commerce ~ Small * (mean + sd), dat)
FALSE
TRUE
mean sd mean sd
Literacy 37.88 19.08 40.63 15.57
Commerce 42.65 24.59 42.95 25.75
#估计一个线性模型并显示结果
mod <- lm(Donations ~ Crime_prop, data = dat)
modelsummary(mod)
Model 1
(Intercept) 9065.287
(1738.926)
Crime_prop −0.254
(0.207)
Num.Obs. 86
R2 0.018
R2 Adj. 0.006
AIC 1739.0
BIC 1746.4
Log.Lik. −866.516
F 1.505
#估计5个回归模型,并排显示结果,并将它们保存到Microsoft Word文档中
models <- list(
  "OLS 1"     = lm(Donations ~ Literacy + Clergy, data = dat),
  "Poisson 1" = glm(Donations ~ Literacy + Commerce, family = poisson, data = dat),
  "OLS 2"     = lm(Crime_pers ~ Literacy + Clergy, data = dat),
  "Poisson 2" = glm(Crime_pers ~ Literacy + Commerce, family = poisson, data = dat),
  "OLS 3"     = lm(Crime_prop ~ Literacy + Clergy, data = dat)
)
modelsummary(models, output = "table.docx")

5.2 PartII

url <- 'https://vincentarelbundock.github.io/Rdatasets/csv/palmerpenguins/penguins.csv'
dat <- read.csv(url)

# rescale mm -> cm
dat$bill_length_cm <- dat$bill_length_mm / 10
dat$flipper_length_cm <- dat$flipper_length_mm / 10

mod <- lm(bill_length_cm ~ flipper_length_cm + species, data = dat)
library(modelsummary)

modelplot(mod)

modelplot(mod, coef_omit = 'Interc')

cm <- c('speciesChinstrap' = 'Chinstrap',
        'speciesGentoo' = 'Gentoo', 
        'flipper_length_cm' = 'Flipper length (cm)')

modelplot(mod, coef_map = cm)

models <- list(
  "Small model" = lm(bill_length_cm ~ flipper_length_cm, data = dat),
  "Medium model" = lm(bill_length_cm ~ flipper_length_cm + body_mass_g, data = dat),
  "Large model" = lm(bill_length_cm ~ flipper_length_cm + body_mass_g + species, data = dat))

modelsummary(models, statistic = 'conf.int')
Small model Medium model Large model
(Intercept) −0.726 −0.344 0.984
[−1.356, −0.097] [−1.245, 0.557] [0.215, 1.752]
flipper_length_cm 0.255 0.222 0.095
[0.224, 0.286] [0.158, 0.285] [0.048, 0.142]
body_mass_g 0.000 0.000
[0.000, 0.000] [0.000, 0.000]
speciesChinstrap 0.939
[0.867, 1.011]
speciesGentoo 0.207
[0.088, 0.326]
Num.Obs. 342 342 342
R2 0.431 0.433 0.817
R2 Adj. 0.429 0.430 0.815
AIC 369.0 369.6 −12.6
BIC 380.5 385.0 10.4
Log.Lik. −181.499 −180.813 12.313
F 257.092 129.365 375.333
modelplot(models, coef_omit = 'Interc')

modelplot(models, facet = TRUE)

library(wesanderson)
library(ggplot2)

modelplot(models) +
  labs(x = 'Coefficients', 
       y = 'Term names',
       title = 'Linear regression models of "Bill Length (cm)"',
       caption = "Data source: Gorman, Williams & Fraser (2014), packaged for R by @apreshill and @allison_horst") +
  scale_color_manual(values = wes_palette('Darjeeling1'))

modelplot(mod, size = 1, fatten = .7, color = 'darkgreen', linetype = 'dotted') + 
  theme_classic()

library(ggplot2)
library(modelsummary)

models <- list(
  lm(vs ~ carb + mpg + cyl, data = mtcars),
  lm(disp ~ carb + mpg + cyl, data = mtcars),
  lm(hp ~ carb + mpg + cyl, data = mtcars))

models <- dvnames(models)
modelplot(models, draw = FALSE)
##           term model     estimate    std.error      conf.low    conf.high
## 1  (Intercept)    vs   2.41742511   0.67622094    1.03224931   3.80260091
## 5  (Intercept)  disp 112.57276339 114.86315481 -122.71374324 347.85927003
## 9  (Intercept)    hp -10.56116383  68.75946117 -151.40853516 130.28620751
## 2         carb    vs  -0.06945116   0.03943402   -0.15022810   0.01132577
## 6         carb  disp -12.30144724   6.69827859  -26.02224894   1.41935446
## 10        carb    hp  17.75593287   4.00972816    9.54237706  25.96948867
## 3          mpg    vs  -0.01513960   0.01716410   -0.05029868   0.02001947
## 7          mpg  disp  -7.14964651   2.91550156  -13.12178072  -1.17751230
## 11         mpg    hp  -1.00486469   1.74527956   -4.57990780   2.57017842
## 4          cyl    vs  -0.23926135   0.05687969   -0.35577411  -0.12274859
## 8          cyl  disp  47.90105842   9.66160634   28.11015499  67.69196184
## 12         cyl    hp  20.60581208   5.78363747    8.75856779  32.45305638
modelplot(models, color = "black") + facet_grid(~model)

modelplot(mod, conf_level = .99)

modelplot(mod, conf_level = NULL)

library(ggplot2)

b <- list(geom_vline(xintercept = 0, color = 'orange'),
          annotate("rect", alpha = .1,
                   xmin = -.5, xmax = .5, 
                   ymin = -Inf, ymax = Inf),
          geom_point(aes(y = term, x = estimate), alpha = .3, 
                     size = 10, color = 'red', shape = 'square'))

modelplot(mod, background = b)

modelplot(models, draw = FALSE)
##           term model     estimate    std.error      conf.low    conf.high
## 1  (Intercept)    vs   2.41742511   0.67622094    1.03224931   3.80260091
## 5  (Intercept)  disp 112.57276339 114.86315481 -122.71374324 347.85927003
## 9  (Intercept)    hp -10.56116383  68.75946117 -151.40853516 130.28620751
## 2         carb    vs  -0.06945116   0.03943402   -0.15022810   0.01132577
## 6         carb  disp -12.30144724   6.69827859  -26.02224894   1.41935446
## 10        carb    hp  17.75593287   4.00972816    9.54237706  25.96948867
## 3          mpg    vs  -0.01513960   0.01716410   -0.05029868   0.02001947
## 7          mpg  disp  -7.14964651   2.91550156  -13.12178072  -1.17751230
## 11         mpg    hp  -1.00486469   1.74527956   -4.57990780   2.57017842
## 4          cyl    vs  -0.23926135   0.05687969   -0.35577411  -0.12274859
## 8          cyl  disp  47.90105842   9.66160634   28.11015499  67.69196184
## 12         cyl    hp  20.60581208   5.78363747    8.75856779  32.45305638