31.1 Applications
31.1.1 Example 1
simulate data for 10 states and 30 years. State A receives the treatment T = 20
after year 15.
set.seed(1)
year <- rep(1:30, 10)
state <- rep(LETTERS[1:10], each = 30)
X1 <- round(rnorm(300, mean = 2, sd = 1), 2)
X2 <- round(rbinom(300, 1, 0.5) + rnorm(300), 2)
Y <- round(1 + 2 * X1 + rnorm(300), 2)
df <- as.data.frame(cbind(Y, X1, X2, state, year))
df$Y <- as.numeric(as.character(df$Y))
df$X1 <- as.numeric(as.character(df$X1))
df$X2 <- as.numeric(as.character(df$X2))
df$year <- as.numeric(as.character(df$year))
df$state.num <- rep(1:10, each = 30)
df$state <- as.character(df$state)
df$`T` <- ifelse(df$state == "A" & df$year >= 15, 1, 0)
df$Y <- ifelse(df$state == "A" & df$year >= 15,
df$Y + 20, df$Y)
str(df)
#> 'data.frame': 300 obs. of 7 variables:
#> $ Y : num 2.29 4.51 2.07 8.87 4.37 1.32 8 7.49 6.98 3.72 ...
#> $ X1 : num 1.37 2.18 1.16 3.6 2.33 1.18 2.49 2.74 2.58 1.69 ...
#> $ X2 : num 1.96 0.4 -0.75 -0.56 -0.45 1.06 0.51 -2.1 0 0.54 ...
#> $ state : chr "A" "A" "A" "A" ...
#> $ year : num 1 2 3 4 5 6 7 8 9 10 ...
#> $ state.num: int 1 1 1 1 1 1 1 1 1 1 ...
#> $ T : num 0 0 0 0 0 0 0 0 0 0 ...
dataprep.out <-
dataprep(
df,
predictors = c("X1", "X2"),
dependent = "Y",
unit.variable = "state.num",
time.variable = "year",
unit.names.variable = "state",
treatment.identifier = 1,
controls.identifier = c(2:10),
time.predictors.prior = c(1:14),
time.optimize.ssr = c(1:14),
time.plot = c(1:30)
)
synth.out <- synth(dataprep.out)
#>
#> X1, X0, Z1, Z0 all come directly from dataprep object.
#>
#>
#> ****************
#> searching for synthetic control unit
#>
#>
#> ****************
#> ****************
#> ****************
#>
#> MSPE (LOSS V): 9.831789
#>
#> solution.v:
#> 0.3888387 0.6111613
#>
#> solution.w:
#> 0.1115941 0.1832781 0.1027237 0.312091 0.06096758 0.03509706 0.05893735 0.05746256 0.07784853
print(synth.tables <- synth.tab(
dataprep.res = dataprep.out,
synth.res = synth.out)
)
#> $tab.pred
#> Treated Synthetic Sample Mean
#> X1 2.028 2.028 2.017
#> X2 0.513 0.513 0.394
#>
#> $tab.v
#> v.weights
#> X1 0.389
#> X2 0.611
#>
#> $tab.w
#> w.weights unit.names unit.numbers
#> 2 0.112 B 2
#> 3 0.183 C 3
#> 4 0.103 D 4
#> 5 0.312 E 5
#> 6 0.061 F 6
#> 7 0.035 G 7
#> 8 0.059 H 8
#> 9 0.057 I 9
#> 10 0.078 J 10
#>
#> $tab.loss
#> Loss W Loss V
#> [1,] 9.761708e-12 9.831789
path.plot(synth.res = synth.out,
dataprep.res = dataprep.out,
Ylab = c("Y"),
Xlab = c("Year"),
Legend = c("State A","Synthetic State A"),
Legend.position = c("topleft")
)
abline(v = 15,
lty = 2)
Gaps plot:
gaps.plot(synth.res = synth.out,
dataprep.res = dataprep.out,
Ylab = c("Gap"),
Xlab = c("Year"),
Ylim = c(-30, 30),
Main = ""
)
abline(v = 15,
lty = 2)
Alternatively, gsynth
provides options to estimate iterative fixed effects, and handle multiple treated units at tat time.
Here, we use two=way fixed effects and bootstrapped standard errors
gsynth.out <- gsynth(
Y ~ `T` + X1 + X2,
data = df,
index = c("state", "year"),
force = "two-way",
CV = TRUE,
r = c(0, 5),
se = TRUE,
inference = "parametric",
nboots = 1000,
parallel = F # TRUE
)
#> Cross-validating ...
#> r = 0; sigma2 = 1.13533; IC = 0.95632; PC = 0.96713; MSPE = 1.65502
#> r = 1; sigma2 = 0.96885; IC = 1.54420; PC = 4.30644; MSPE = 1.33375
#> r = 2; sigma2 = 0.81855; IC = 2.08062; PC = 6.58556; MSPE = 1.27341*
#> r = 3; sigma2 = 0.71670; IC = 2.61125; PC = 8.35187; MSPE = 1.79319
#> r = 4; sigma2 = 0.62823; IC = 3.10156; PC = 9.59221; MSPE = 2.02301
#> r = 5; sigma2 = 0.55497; IC = 3.55814; PC = 10.48406; MSPE = 2.79596
#>
#> r* = 2
#>
#>
Simulating errors .............
Bootstrapping ...
#> ..........
31.1.2 Example 2
by Leihua Ye
library(Synth)
data("basque")
dim(basque) #774*17
#> [1] 774 17
head(basque)
#> regionno regionname year gdpcap sec.agriculture sec.energy sec.industry
#> 1 1 Spain (Espana) 1955 2.354542 NA NA NA
#> 2 1 Spain (Espana) 1956 2.480149 NA NA NA
#> 3 1 Spain (Espana) 1957 2.603613 NA NA NA
#> 4 1 Spain (Espana) 1958 2.637104 NA NA NA
#> 5 1 Spain (Espana) 1959 2.669880 NA NA NA
#> 6 1 Spain (Espana) 1960 2.869966 NA NA NA
#> sec.construction sec.services.venta sec.services.nonventa school.illit
#> 1 NA NA NA NA
#> 2 NA NA NA NA
#> 3 NA NA NA NA
#> 4 NA NA NA NA
#> 5 NA NA NA NA
#> 6 NA NA NA NA
#> school.prim school.med school.high school.post.high popdens invest
#> 1 NA NA NA NA NA NA
#> 2 NA NA NA NA NA NA
#> 3 NA NA NA NA NA NA
#> 4 NA NA NA NA NA NA
#> 5 NA NA NA NA NA NA
#> 6 NA NA NA NA NA NA
transform data to be used in synth()
dataprep.out <- dataprep(
foo = basque,
predictors = c(
"school.illit",
"school.prim",
"school.med",
"school.high",
"school.post.high",
"invest"
),
predictors.op = "mean",
# the operator
time.predictors.prior = 1964:1969,
#the entire time frame from the #beginning to the end
special.predictors = list(
list("gdpcap", 1960:1969, "mean"),
list("sec.agriculture", seq(1961, 1969, 2), "mean"),
list("sec.energy", seq(1961, 1969, 2), "mean"),
list("sec.industry", seq(1961, 1969, 2), "mean"),
list("sec.construction", seq(1961, 1969, 2), "mean"),
list("sec.services.venta", seq(1961, 1969, 2), "mean"),
list("sec.services.nonventa", seq(1961, 1969, 2), "mean"),
list("popdens", 1969, "mean")
),
dependent = "gdpcap",
# dv
unit.variable = "regionno",
#identifying unit numbers
unit.names.variable = "regionname",
#identifying unit names
time.variable = "year",
#time-periods
treatment.identifier = 17,
#the treated case
controls.identifier = c(2:16, 18),
#the control cases; all others #except number 17
time.optimize.ssr = 1960:1969,
#the time-period over which to optimize
time.plot = 1955:1997
) #the entire time period before/after the treatment
where
X1 = the control case before the treatment
X0 = the control cases after the treatment
Z1: the treatment case before the treatment
Z0: the treatment case after the treatment
synth.out = synth(data.prep.obj = dataprep.out, method = "BFGS")
#>
#> X1, X0, Z1, Z0 all come directly from dataprep object.
#>
#>
#> ****************
#> searching for synthetic control unit
#>
#>
#> ****************
#> ****************
#> ****************
#>
#> MSPE (LOSS V): 0.008864606
#>
#> solution.v:
#> 0.02773094 1.194e-07 1.60609e-05 0.0007163836 1.486e-07 0.002423908 0.0587055 0.2651997 0.02851006 0.291276 0.007994382 0.004053188 0.009398579 0.303975
#>
#> solution.w:
#> 2.53e-08 4.63e-08 6.44e-08 2.81e-08 3.37e-08 4.844e-07 4.2e-08 4.69e-08 0.8508145 9.75e-08 3.2e-08 5.54e-08 0.1491843 4.86e-08 9.89e-08 1.162e-07
Calculate the difference between the real basque region and the synthetic control
gaps = dataprep.out$Y1plot - (dataprep.out$Y0plot
%*% synth.out$solution.w)
gaps[1:3,1]
#> 1955 1956 1957
#> 0.15023473 0.09168035 0.03716475
synth.tables = synth.tab(dataprep.res = dataprep.out,
synth.res = synth.out)
names(synth.tables)
#> [1] "tab.pred" "tab.v" "tab.w" "tab.loss"
synth.tables$tab.pred[1:13,]
#> Treated Synthetic Sample Mean
#> school.illit 39.888 256.337 170.786
#> school.prim 1031.742 2730.104 1127.186
#> school.med 90.359 223.340 76.260
#> school.high 25.728 63.437 24.235
#> school.post.high 13.480 36.153 13.478
#> invest 24.647 21.583 21.424
#> special.gdpcap.1960.1969 5.285 5.271 3.581
#> special.sec.agriculture.1961.1969 6.844 6.179 21.353
#> special.sec.energy.1961.1969 4.106 2.760 5.310
#> special.sec.industry.1961.1969 45.082 37.636 22.425
#> special.sec.construction.1961.1969 6.150 6.952 7.276
#> special.sec.services.venta.1961.1969 33.754 41.104 36.528
#> special.sec.services.nonventa.1961.1969 4.072 5.371 7.111
Relative importance of each unit
synth.tables$tab.w[8:14, ]
#> w.weights unit.names unit.numbers
#> 9 0.000 Castilla-La Mancha 9
#> 10 0.851 Cataluna 10
#> 11 0.000 Comunidad Valenciana 11
#> 12 0.000 Extremadura 12
#> 13 0.000 Galicia 13
#> 14 0.149 Madrid (Comunidad De) 14
#> 15 0.000 Murcia (Region de) 15
# plot the changes before and after the treatment
path.plot(
synth.res = synth.out,
dataprep.res = dataprep.out,
Ylab = "real per-capita gdp (1986 USD, thousand)",
Xlab = "year",
Ylim = c(0, 12),
Legend = c("Basque country",
"synthetic Basque country"),
Legend.position = "bottomright"
)
gaps.plot(
synth.res = synth.out,
dataprep.res = dataprep.out,
Ylab = "gap in real per - capita GDP (1986 USD, thousand)",
Xlab = "year",
Ylim = c(-1.5, 1.5),
Main = NA
)
Doubly Robust Difference-in-Differences
Example from DRDID
package
library(DRDID)
data(nsw_long)
# Form the Lalonde sample with CPS comparison group
eval_lalonde_cps <- subset(nsw_long, nsw_long$treated == 0 |
nsw_long$sample == 2)
Estimate Average Treatment Effect on Treated using Improved Locally Efficient Doubly Robust DID estimator
out <-
drdid(
yname = "re",
tname = "year",
idname = "id",
dname = "experimental",
xformla = ~ age + educ + black + married + nodegree + hisp + re74,
data = eval_lalonde_cps,
panel = TRUE
)
summary(out)
#> Call:
#> drdid(yname = "re", tname = "year", idname = "id", dname = "experimental",
#> xformla = ~age + educ + black + married + nodegree + hisp +
#> re74, data = eval_lalonde_cps, panel = TRUE)
#> ------------------------------------------------------------------
#> Further improved locally efficient DR DID estimator for the ATT:
#>
#> ATT Std. Error t value Pr(>|t|) [95% Conf. Interval]
#> -901.2703 393.6247 -2.2897 0.022 -1672.7747 -129.766
#> ------------------------------------------------------------------
#> Estimator based on panel data.
#> Outcome regression est. method: weighted least squares.
#> Propensity score est. method: inverse prob. tilting.
#> Analytical standard error.
#> ------------------------------------------------------------------
#> See Sant'Anna and Zhao (2020) for details.
31.1.3 Example 3
by Synth
package’s authors
synth()
requires
X1 vector of treatment predictors
X0 matrix of same variables for control group
Z1 vector of outcome variable for treatment group
Z0 matrix of outcome variable for control group
use dataprep()
to prepare data in the format that can be used throughout the Synth
package
dataprep.out <- dataprep(
foo = basque,
predictors = c(
"school.illit",
"school.prim",
"school.med",
"school.high",
"school.post.high",
"invest"
),
predictors.op = "mean",
time.predictors.prior = 1964:1969,
special.predictors = list(
list("gdpcap", 1960:1969 , "mean"),
list("sec.agriculture", seq(1961, 1969, 2), "mean"),
list("sec.energy", seq(1961, 1969, 2), "mean"),
list("sec.industry", seq(1961, 1969, 2), "mean"),
list("sec.construction", seq(1961, 1969, 2), "mean"),
list("sec.services.venta", seq(1961, 1969, 2), "mean"),
list("sec.services.nonventa", seq(1961, 1969, 2), "mean"),
list("popdens", 1969, "mean")
),
dependent = "gdpcap",
unit.variable = "regionno",
unit.names.variable = "regionname",
time.variable = "year",
treatment.identifier = 17,
controls.identifier = c(2:16, 18),
time.optimize.ssr = 1960:1969,
time.plot = 1955:1997
)
find optimal weights that identifies the synthetic control for the treatment group
synth.out <- synth(data.prep.obj = dataprep.out, method = "BFGS")
#>
#> X1, X0, Z1, Z0 all come directly from dataprep object.
#>
#>
#> ****************
#> searching for synthetic control unit
#>
#>
#> ****************
#> ****************
#> ****************
#>
#> MSPE (LOSS V): 0.008864606
#>
#> solution.v:
#> 0.02773094 1.194e-07 1.60609e-05 0.0007163836 1.486e-07 0.002423908 0.0587055 0.2651997 0.02851006 0.291276 0.007994382 0.004053188 0.009398579 0.303975
#>
#> solution.w:
#> 2.53e-08 4.63e-08 6.44e-08 2.81e-08 3.37e-08 4.844e-07 4.2e-08 4.69e-08 0.8508145 9.75e-08 3.2e-08 5.54e-08 0.1491843 4.86e-08 9.89e-08 1.162e-07
gaps <- dataprep.out$Y1plot - (dataprep.out$Y0plot %*% synth.out$solution.w)
gaps[1:3, 1]
#> 1955 1956 1957
#> 0.15023473 0.09168035 0.03716475
synth.tables <-
synth.tab(dataprep.res = dataprep.out, synth.res = synth.out)
names(synth.tables) # you can pick tables to see
#> [1] "tab.pred" "tab.v" "tab.w" "tab.loss"
path.plot(
synth.res = synth.out,
dataprep.res = dataprep.out,
Ylab = "real per-capita GDP (1986 USD, thousand)",
Xlab = "year",
Ylim = c(0, 12),
Legend = c("Basque country",
"synthetic Basque country"),
Legend.position = "bottomright"
)
gaps.plot(
synth.res = synth.out,
dataprep.res = dataprep.out,
Ylab = "gap in real per-capita GDP (1986 USD, thousand)",
Xlab = "year",
Ylim = c(-1.5, 1.5),
Main = NA
)
You could also run placebo tests
31.1.4 Example 4
by Michael Robbins and Steven Davenport who are authors of MicroSynth
with the following improvements:
Standardization
use.survey = TRUE
and permutation (perm = 250
andjack = TRUE
) for placebo testsOmnibus statistic (set to
omnibus.var
) for multiple outcome variablesincorporate multiple follow-up periods
end.post
Notes:
Both predictors and outcome will be used to match units before intervention
Outcome variable has to be time-variant
Predictors are time-invariant
# right now the package is not availabe for R version 4.2
library(microsynth)
data("seattledmi")
cov.var <-
c(
"TotalPop",
"BLACK",
"HISPANIC",
"Males_1521",
"HOUSEHOLDS",
"FAMILYHOUS",
"FEMALE_HOU",
"RENTER_HOU",
"VACANT_HOU"
)
match.out <- c("i_felony", "i_misdemea", "i_drugs", "any_crime")
sea1 <- microsynth(
seattledmi,
idvar = "ID",
timevar = "time",
intvar = "Intervention",
start.pre = 1,
end.pre = 12,
end.post = 16,
match.out = match.out, # outcome variable will be matched on exactly
match.covar = cov.var, # specify covariates will be matched on exactly
result.var = match.out, # used to report results
omnibus.var = match.out, # feature in the omnibus p-value
test = "lower",
n.cores = min(parallel::detectCores(), 2)
)
sea1
summary(sea1)
sea2 <- microsynth(
seattledmi,
idvar = "ID",
timevar = "time",
intvar = "Intervention",
start.pre = 1,
end.pre = 12,
end.post = c(14, 16),
match.out = match.out,
match.covar = cov.var,
result.var = match.out,
omnibus.var = match.out,
test = "lower",
perm = 250,
jack = TRUE,
n.cores = min(parallel::detectCores(), 2)
)