diff --git a/DESCRIPTION b/DESCRIPTION index e839669..94cbfef 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -13,12 +13,13 @@ Description: Estimate average treatment effects in panel data. Depends: R (>= 3.4) Imports: + abind, mvtnorm License: GPL (>= 2) | BSD_3_clause + file LICENSE Encoding: UTF-8 LazyData: true Roxygen: list(markdown = TRUE) -RoxygenNote: 7.1.1 +RoxygenNote: 7.1.2 Suggests: testthat, ggplot2, diff --git a/NAMESPACE b/NAMESPACE index c253455..d6ecb1c 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -11,6 +11,7 @@ export(lindsey_density_estimate) export(panel.matrices) export(sc_estimate) export(simulate_dgp) +export(synthdid) export(synthdid_controls) export(synthdid_effect_curve) export(synthdid_estimate) diff --git a/R/synthdid-package.R b/R/synthdid-package.R index 7c84ecf..d26c8e3 100644 --- a/R/synthdid-package.R +++ b/R/synthdid-package.R @@ -29,4 +29,5 @@ #'} #' #' @keywords internal +#' @aliases synthdid-package "_PACKAGE" diff --git a/R/synthdid.R b/R/synthdid.R index 725c3af..26b20cb 100644 --- a/R/synthdid.R +++ b/R/synthdid.R @@ -3,6 +3,7 @@ #' @param v a vector sparsify_function = function(v) { v[v <= max(v)/4] = 0; v/sum(v) } + #' Computes the synthetic diff-in-diff estimate for an average treatment effect on a treated block. #' #' See 'Synthetic Difference in Differences' by Arkhangelsky et al. This implements Algorithm 1. @@ -43,11 +44,11 @@ synthdid_estimate <- function(Y, N0, T0, X = array(dim = c(dim(Y), 0)), weights = list(omega = NULL, lambda = NULL), update.omega = is.null(weights$omega), update.lambda = is.null(weights$lambda), min.decrease = 1e-5 * noise.level, max.iter = 1e4, - sparsify = sparsify_function, - max.iter.pre.sparsify = 100) { + sparsify = sparsify_function, + max.iter.pre.sparsify = 100) { stopifnot(nrow(Y) > N0, ncol(Y) > T0, length(dim(X)) %in% c(2, 3), dim(X)[1:2] == dim(Y), is.list(weights), - is.null(weights$lambda) || length(weights$lambda) == T0, is.null(weights$omega) || length(weights$omega) == N0, - !is.null(weights$lambda) || update.lambda, !is.null(weights$omega) || update.omega) + is.null(weights$lambda) || length(weights$lambda) == T0, is.null(weights$omega) || length(weights$omega) == N0, + !is.null(weights$lambda) || update.lambda, !is.null(weights$omega) || update.omega) if (length(dim(X)) == 2) { dim(X) = c(dim(X), 1) } if (is.null(sparsify)) { max.iter.pre.sparsify = max.iter } N1 = nrow(Y) - N0 @@ -60,10 +61,10 @@ synthdid_estimate <- function(Y, N0, T0, X = array(dim = c(dim(Y), 0)), if (update.lambda) { Yc = collapsed.form(Y, N0, T0) lambda.opt = sc.weight.fw(Yc[1:N0, ], zeta = zeta.lambda, intercept = lambda.intercept, lambda=weights$lambda, - min.decrease = min.decrease, max.iter = max.iter.pre.sparsify) + min.decrease = min.decrease, max.iter = max.iter.pre.sparsify) if(!is.null(sparsify)) { - lambda.opt = sc.weight.fw(Yc[1:N0, ], zeta = zeta.lambda, intercept = lambda.intercept, lambda=sparsify(lambda.opt$lambda), - min.decrease = min.decrease, max.iter = max.iter) + lambda.opt = sc.weight.fw(Yc[1:N0, ], zeta = zeta.lambda, intercept = lambda.intercept, lambda=sparsify(lambda.opt$lambda), + min.decrease = min.decrease, max.iter = max.iter) } weights$lambda = lambda.opt$lambda weights$lambda.vals = lambda.opt$vals @@ -72,10 +73,10 @@ synthdid_estimate <- function(Y, N0, T0, X = array(dim = c(dim(Y), 0)), if (update.omega) { Yc = collapsed.form(Y, N0, T0) omega.opt = sc.weight.fw(t(Yc[, 1:T0]), zeta = zeta.omega, intercept = omega.intercept, lambda=weights$omega, - min.decrease = min.decrease, max.iter = max.iter.pre.sparsify) + min.decrease = min.decrease, max.iter = max.iter.pre.sparsify) if(!is.null(sparsify)) { - omega.opt = sc.weight.fw(t(Yc[, 1:T0]), zeta = zeta.omega, intercept = omega.intercept, lambda=sparsify(omega.opt$lambda), - min.decrease = min.decrease, max.iter = max.iter) + omega.opt = sc.weight.fw(t(Yc[, 1:T0]), zeta = zeta.omega, intercept = omega.intercept, lambda=sparsify(omega.opt$lambda), + min.decrease = min.decrease, max.iter = max.iter) } weights$omega = omega.opt$lambda weights$omega.vals = omega.opt$vals @@ -87,9 +88,9 @@ synthdid_estimate <- function(Y, N0, T0, X = array(dim = c(dim(Y), 0)), Xc = apply(X, 3, function(Xi) { collapsed.form(Xi, N0, T0) }) dim(Xc) = c(dim(Yc), dim(X)[3]) weights = sc.weight.fw.covariates(Yc, Xc, zeta.lambda = zeta.lambda, zeta.omega = zeta.omega, - lambda.intercept = lambda.intercept, omega.intercept = omega.intercept, - min.decrease = min.decrease, max.iter = max.iter, - lambda = weights$lambda, omega = weights$omega, update.lambda = update.lambda, update.omega = update.omega) + lambda.intercept = lambda.intercept, omega.intercept = omega.intercept, + min.decrease = min.decrease, max.iter = max.iter, + lambda = weights$lambda, omega = weights$omega, update.lambda = update.lambda, update.omega = update.omega) } X.beta = contract3(X, weights$beta) @@ -106,6 +107,102 @@ synthdid_estimate <- function(Y, N0, T0, X = array(dim = c(dim(Y), 0)), return(estimate) } +#' A wrapper function for estimating one or more synthetic diff-in-diffs given a model +#' formula and a data frame. +#' @param formula a formula representing the relationship to be estimated. For example, +#' \code{formula = y ~ d | unit + time} where y is the dependent variable, +#' d is the time-and-unit-varying treatment variable, and unit and time are +#' unit and time identifiers, respectively. The unit identifier must precede +#' the time identifier. Optional control variables \code{x1} and \code{x2} may +#' be added by appending \code{| ~ x1 + x2} to the end of the formula. +#' Additionally, you can estimate models of multiple outcomes by expressing the +#' left hand side as a vector of multiple variables \code{c(y1, y2, y3)}. +#' @param data a data.frame object including each variable in formula. +#' @param method the estimator to use. One of `did_estimate`, `sc_estimate`, or `synthdid_estimate`. +#' Default is `synthdid_estimate`. +#' @param ... optional parameters to pass on to synthdid_estimate. +#' @return A list of average treatment effect estimates with 'weights' and 'setup' attached as attributes. +#' 'weights' contains the estimated weights lambda and omega and corresponding intercepts, +#' as well as regression coefficients beta if X is passed. +#' 'setup' is a list describing the problem passed in: Y, N0, T0, X. +#' @examples +#' data(CPS) +#' # No controls +#' formula_1 <- c(urate, hours) ~ min_wage | state + year +#' # Controlling for log wage +#' formula_2 <- c(urate, hours) ~ min_wage | state + year | ~ log_wage +#' # Estimate models +#' estimates_nocontrol <- synthdid(formula_1, CPS) +#' estimates_lwcontrol <- synthdid(formula_2, CPS) +#' +#' @export synthdid +synthdid <- function(formula, + data, + method = synthdid_estimate, + ...){ + + # Flag whether controls are included + has_controls <- length(formula[[3]][[2]]) == 3 + + # Define panel dimensions + if(has_controls){ + panel_dims <- formula[[3]][[2]][[3]] + } else { + panel_dims <- formula[[3]][[3]] + } + unit_var <- as.character(panel_dims[[2]]) + time_var <- as.character(panel_dims[[3]]) + + # Define all dependent variables + dep_vars <- as.character(formula[[2]]) + if(length(dep_vars) > 1){ + n_dep_vars <- length(dep_vars) - 1 + dep_vars <- tail(dep_vars, n_dep_vars) + } + + # Define treatment variable + if(has_controls){ + treat_var <- as.character(formula[[3]][[2]][[2]]) + } else { + treat_var <- as.character(formula[[3]][[2]]) + } + + # Extract controls + if(has_controls){ + control_formula <- update(as.formula(formula[[3]][[3]]), ~ . -1) + control_matrix <- model.matrix(control_formula, data) + + # Create N x t matrix for each variable + control_matlist <- apply(control_matrix, 2, function(x){ + d <- data.frame(unit = data[[unit_var]], + time = data[[time_var]], + x) + reshape(d, direction = "wide", idvar = "unit", timevar = "time")[, -1] + }) + + # Bind matrices into array + X <- abind::abind(control_matlist, along = 3) + } + + # Data can't be a tibble + data <- data.frame(data) + + # Estimate multiple models + estimates <- lapply(dep_vars, function(dep_var) { + setup <- panel.matrices(data, unit_var, time_var, dep_var, treat_var) + + if(!has_controls) X <- array(dim = c(dim(setup$Y), 0)) + estimate <- method(setup$Y, setup$N0, setup$T0, X = X, ...) + estimate + }) + + # Rename list elements with model names + names(estimates) <- dep_vars + return(estimates) +} + + + #' synthdid_estimate for synthetic control estimates. #' Takes all the same parameters, but by default, passes options to use the synthetic control estimator #' By default, this uses only 'infinitesimal' ridge regularization when estimating the weights. @@ -118,7 +215,7 @@ synthdid_estimate <- function(Y, N0, T0, X = array(dim = c(dim(Y), 0)), #' @export sc_estimate sc_estimate = function(Y, N0, T0, eta.omega = 1e-6, ...) { estimate = synthdid_estimate(Y, N0, T0, eta.omega = eta.omega, - weights = list(lambda = rep(0, T0)), omega.intercept = FALSE, ...) + weights = list(lambda = rep(0, T0)), omega.intercept = FALSE, ...) attr(estimate, 'estimator') = "sc_estimate" estimate } diff --git a/man/synthdid-package.Rd b/man/synthdid-package.Rd index b7be13d..1c64c33 100644 --- a/man/synthdid-package.Rd +++ b/man/synthdid-package.Rd @@ -2,8 +2,8 @@ % Please edit documentation in R/synthdid-package.R \docType{package} \name{synthdid-package} -\alias{synthdid} \alias{synthdid-package} +\alias{_PACKAGE} \title{synthdid: Synthetic Difference-in-Difference Estimation} \description{ This package implements the synthetic difference in difference estimator (SDID) for the average treatment effect in panel data, diff --git a/man/synthdid.Rd b/man/synthdid.Rd new file mode 100644 index 0000000..0b29e0b --- /dev/null +++ b/man/synthdid.Rd @@ -0,0 +1,47 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/synthdid.R +\name{synthdid} +\alias{synthdid} +\title{A wrapper function for estimating one or more synthetic diff-in-diffs given a model +formula and a data frame.} +\usage{ +synthdid(formula, data, method = synthdid_estimate, ...) +} +\arguments{ +\item{formula}{a formula representing the relationship to be estimated. For example, +\code{formula = y ~ d | unit + time} where y is the dependent variable, +d is the time-and-unit-varying treatment variable, and unit and time are +unit and time identifiers, respectively. The unit identifier must precede +the time identifier. Optional control variables \code{x1} and \code{x2} may +be added by appending \code{| ~ x1 + x2} to the end of the formula. +Additionally, you can estimate models of multiple outcomes by expressing the +left hand side as a vector of multiple variables \code{c(y1, y2, y3)}.} + +\item{data}{a data.frame object including each variable in formula.} + +\item{method}{the estimator to use. One of \code{did_estimate}, \code{sc_estimate}, or \code{synthdid_estimate}. +Default is \code{synthdid_estimate}.} + +\item{...}{optional parameters to pass on to synthdid_estimate.} +} +\value{ +A list of average treatment effect estimates with 'weights' and 'setup' attached as attributes. +'weights' contains the estimated weights lambda and omega and corresponding intercepts, +as well as regression coefficients beta if X is passed. +'setup' is a list describing the problem passed in: Y, N0, T0, X. +} +\description{ +A wrapper function for estimating one or more synthetic diff-in-diffs given a model +formula and a data frame. +} +\examples{ +data(CPS) +# No controls +formula_1 <- c(urate, hours) ~ min_wage | state + year +# Controlling for log wage +formula_2 <- c(urate, hours) ~ min_wage | state + year | ~ log_wage +# Estimate models +estimates_nocontrol <- synthdid(formula_1, CPS) +estimates_lwcontrol <- synthdid(formula_2, CPS) + +} diff --git a/tests/testthat/test_synthdid.R b/tests/testthat/test_synthdid.R index 25fcf53..ca07da1 100644 --- a/tests/testthat/test_synthdid.R +++ b/tests/testthat/test_synthdid.R @@ -1,3 +1,5 @@ + + test_that("a simple workflow doesn't error", { setup = random.low.rank() tau.hat = synthdid_estimate(setup$Y,setup$N0,setup$T0) @@ -12,6 +14,60 @@ test_that("a simple workflow doesn't error", { expect_equal(1, 1) }) +test_that("a simple workflow doesn't error when estimating multiple outcomes", { + data(CPS) + # No controls + formula_1 <- c(urate, hours) ~ min_wage | state + year + # Controlling for log wage + formula_2 <- c(urate, hours) ~ min_wage | state + year | ~ log_wage + # Controlling for multiple variables + formula_3 <- c(urate, hours) ~ min_wage | state + year | ~ log_wage + open_carry + abort_ban + # Estimate models + estimates_nocontrol <- synthdid(formula_1, CPS) + estimates_lwcontrol <- synthdid(formula_2, CPS) + estimates_3control <- synthdid(formula_3, CPS) + + expect_equal(1, 1) +}) + +test_that("synthdid wrapper gives same results as synthdid_estimate", { + data(CPS) + # Controlling for hours + formula <- urate ~ min_wage | state + year | ~ hours + # Estimate using wrapper function + estimates_wrap <- synthdid(formula, CPS) + + # Estimate using synthdid_estimate directly + # Create panel matrix + setup <- panel.matrices(CPS, + unit = "state", + time = "year", + outcome = "urate", + treatment = "min_wage") + + # Create control array + covariate_df <- reshape(CPS[, c("state", "year", "hours")], + direction = "wide", idvar = "state", timevar = "year") + + covariate_array <- array(as.matrix(covariate_df[, -1]), c(50, 40, 1)) + estimates_direct <- synthdid_estimate(setup$Y, setup$N0, setup$T0, + X = covariate_array) + + expect_equal(as.numeric(estimates_wrap$urate), as.numeric(estimates_direct)) +}) + + +test_that("estimating SC and DID works with synthdid wrapper", { + data(CPS) + formula <- c(urate, hours) ~ min_wage | state + year | ~ log_wage + estimates_sc <- synthdid(formula, CPS, method = sc_estimate) + estimates_did <- synthdid(formula, CPS, method = did_estimate) + estimates_synthdid <- synthdid(formula, CPS, method = synthdid_estimate) + + expect_equal(1, 1) +}) + + test_that("plotting doesn't error with (i) dates as colnames (ii) spaghetti units", { data(california_prop99) @@ -168,3 +224,6 @@ test_that("treated effect shifts correctly with scalar shifts to the 4 blocks", } } }) + + +