Comparing Estimators Under Different Data-Generating Processes
mixedsubjects
2026-04-02
compare_estimators.Rmd
# Install from github
devtools::install_github('klintkanopka/mixedsubjects')

Overview
Each of the seven estimators in the mixedsubjects
package has a different variance formula, so each dominates under a
different data-generating process. This document constructs targeted DGP
scenarios, runs Monte Carlo simulations under each, and confirms which
estimator achieves the smallest variance.
The key quantities that drive the comparison:
| Quantity | Favors |
|---|---|
| $\rho$ — prediction quality | All prediction-based estimators over DiM |
| $\rho_1 \neq \rho_0$ — heterogeneous quality across arms | D-T and D-T DiP over single-$\lambda$ estimators |
| $\mathrm{Cov}(S^1, S^0)$ — common-mode prediction error | DiP family over PPI/D-T family |
| $m/n$ — ratio of unlabeled to labeled | Prediction-based estimators (larger helps more) |
Simulation Engine
#' Run a Monte Carlo comparison of all seven estimators
#'
#' @param dgp_fn A function(seed) that returns a list with components
#' obs_df, unobs_df (data.frames), and true_tau (scalar).
#' @param n_sims Number of Monte Carlo replications.
#' @param n_folds Number of cross-fitting folds.
#' @return A data.frame with columns: estimator, mean_est, bias, variance, mse.
run_comparison <- function(dgp_fn, n_sims = 2000, n_folds = 2) {
  estimator_names <- c("dim", "greg", "ppi", "dt", "dip", "dip_pp", "dt_dip")
  results <- matrix(NA_real_, nrow = n_sims, ncol = length(estimator_names))
  colnames(results) <- estimator_names
  # On failure, record NA_real_ (not logical NA) so the matrix stays numeric.
  safe_est <- function(expr) tryCatch(expr, error = function(e) NA_real_)
  true_tau <- NA_real_
  for (i in seq_len(n_sims)) {
    d <- dgp_fn(seed = i)
    # true_tau is constant across seeds by construction; capture it here
    # instead of regenerating a whole dataset after the loop.
    true_tau <- d$true_tau
    msd <- msd_data(observed = d$obs_df, unobserved = d$unobs_df)
    results[i, "dim"] <- safe_est(msd_dim(msd)$estimate)
    results[i, "greg"] <- safe_est(msd_greg(msd)$estimate)
    results[i, "ppi"] <- safe_est(msd_ppi(msd, n_folds = n_folds)$estimate)
    results[i, "dt"] <- safe_est(msd_dt(msd, n_folds = n_folds)$estimate)
    results[i, "dip"] <- safe_est(msd_dip(msd)$estimate)
    results[i, "dip_pp"] <- safe_est(msd_dip_pp(msd, n_folds = n_folds)$estimate)
    results[i, "dt_dip"] <- safe_est(msd_dt_dip(msd, n_folds = n_folds)$estimate)
  }
  # Compute the column means once and reuse them for both mean_est and bias.
  mean_est <- colMeans(results, na.rm = TRUE)
  data.frame(
    estimator = estimator_names,
    mean_est = mean_est,
    bias = mean_est - true_tau,
    variance = apply(results, 2, var, na.rm = TRUE),
    mse = apply(results, 2, function(x) mean((x - true_tau)^2, na.rm = TRUE)),
    stringsAsFactors = FALSE
  )
}
#' Pretty-print a comparison table, highlighting the minimum-variance estimator
#'
#' @param comp A data.frame as returned by run_comparison().
#' @param title Optional section title printed above the table.
print_comparison <- function(comp, title = "") {
  if (nchar(title) > 0) cat("###", title, "\n\n")
  # Pick the winner from the UNROUNDED variances: rounding to 6 digits first
  # can create artificial ties, in which case which.min would report the
  # wrong minimum-variance estimator.
  best <- comp$estimator[which.min(comp$variance)]
  comp$variance <- round(comp$variance, 6)
  comp$bias <- round(comp$bias, 4)
  comp$mse <- round(comp$mse, 6)
  comp$best <- ifelse(comp$estimator == best, " <-- min", "")
  print(comp[, c("estimator", "bias", "variance", "mse", "best")], row.names = FALSE)
  cat("\nLowest variance:", best, "\n\n")
}

Scenario 1: Poor Predictions — DiM Wins
When predictions are essentially noise ($\rho \approx 0$), incorporating them only adds variance. DiM ignores predictions entirely, so it should dominate.
#' DGP in which predictions are pure noise, carrying no information about Y.
#' Since predictions can only add variance here, DiM should dominate.
#' Returns a list with obs_df, unobs_df, and the true effect true_tau.
dgp_poor_predictions <- function(seed) {
  set.seed(seed)
  tau <- 0.5
  n_obs <- 200
  n_unobs <- 400
  treat <- rep(c(1, 0), each = n_obs / 2)
  outcome <- rnorm(n_obs) + tau * treat
  # Predictions are independent noise (S1 offset by 0.1, S0 centered at 0).
  s1_o <- rnorm(n_obs, 0.1, 1)
  s0_o <- rnorm(n_obs, 0, 1)
  treat_u <- rep(c(1, 0), each = n_unobs / 2)
  s1_u <- rnorm(n_unobs, 0.1, 1)
  s0_u <- rnorm(n_unobs, 0, 1)
  list(
    obs_df = data.frame(Y = outcome, D = treat, S0 = s0_o, S1 = s1_o),
    unobs_df = data.frame(D = treat_u, S0 = s0_u, S1 = s1_u),
    true_tau = tau
  )
}
# Scenario 1: noise-only predictions; DiM is expected to win.
comp1 <- run_comparison(dgp_poor_predictions)
print_comparison(comp1, "Scenario 1: Poor Predictions")
#> ### Scenario 1: Poor Predictions
#>
#> estimator bias variance mse best
#> dim -0.0013 0.019485 0.019477 <-- min
#> greg 0.0038 0.049866 0.049855
#> ppi -0.0015 0.019623 0.019616
#> dt -0.0020 0.019715 0.019709
#> dip 0.0031 0.044152 0.044139
#> dip_pp -0.0018 0.019632 0.019626
#> dt_dip -0.0023 0.019895 0.019890
#>
#> Lowest variance: dim

Why: All prediction-based estimators add a prediction-variance term to the variance without any offsetting covariance reduction. The optimal $\lambda$ shrinks toward 0 for PPI++/D-T/DiP++/D-T DiP, making them roughly equal to DiM, while GREG and DiP (fixed $\lambda = 1$) are strictly worse.
Scenario 2: Negatively Correlated Predictions
When a covariate $X$ has opposite effects on $Y(1)$ and $Y(0)$ (e.g., $+X$ and $-X$), the potential outcomes are negatively correlated, and so are the predictions $S^1$ and $S^0$. This inflates $\mathrm{Var}(S^1 - S^0)$ relative to the positive-correlation case, which reduces DiP's advantage over PPI — but does not eliminate it, because DiP still benefits structurally from using all $m$ unobserved units rather than splitting by arm ($m/2$ each).
#' DGP with negatively correlated potential outcomes: a covariate X enters
#' Y(1) and Y(0) with opposite signs, so Cov(Y1, Y0) < 0, and the prediction
#' vectors S1, S0 inherit that negative correlation. Predictions track each
#' potential outcome closely (slope 0.85, sd 0.2 noise).
dgp_neg_corr_predictions <- function(seed) {
  set.seed(seed)
  tau <- 0.5
  n <- 200
  m <- 500
  # Draw one sample of size k; the sequence of rnorm() calls is fixed so a
  # given seed reproduces the same data.
  draw <- function(k) {
    x <- rnorm(k)
    y0 <- rnorm(k, 0, 0.3) + 1.0 * x
    y1 <- rnorm(k, 0, 0.3) - 1.0 * x + tau
    s1 <- 0.85 * y1 + rnorm(k, 0, 0.2)
    s0 <- 0.85 * y0 + rnorm(k, 0, 0.2)
    list(y0 = y0, y1 = y1, s0 = s0, s1 = s1)
  }
  d_obs <- rep(c(1, 0), each = n / 2)
  obs <- draw(n)
  y_obs <- d_obs * obs$y1 + (1 - d_obs) * obs$y0
  d_unobs <- rep(c(1, 0), each = m / 2)
  unobs <- draw(m)
  list(
    obs_df = data.frame(Y = y_obs, D = d_obs, S0 = obs$s0, S1 = obs$s1),
    unobs_df = data.frame(D = d_unobs, S0 = unobs$s0, S1 = unobs$s1),
    true_tau = tau
  )
}
# Scenario 2: negatively correlated potential outcomes and predictions.
comp2 <- run_comparison(dgp_neg_corr_predictions)
print_comparison(comp2, "Scenario 2: Negatively Correlated Predictions")
#> ### Scenario 2: Negatively Correlated Predictions
#>
#> estimator bias variance mse best
#> dim 0.0029 0.022178 0.022175
#> greg 0.0018 0.008204 0.008204
#> ppi 0.0019 0.007675 0.007674
#> dt 0.0021 0.007698 0.007698
#> dip 0.0013 0.007707 0.007705
#> dip_pp 0.0014 0.007297 0.007296 <-- min
#> dt_dip 0.0011 0.009078 0.009075
#>
#> Lowest variance: dip_pp

Why: With $\mathrm{Cov}(S^1, S^0) < 0$, differencing inflates the unobserved variance: $\mathrm{Var}(S^1 - S^0) = \mathrm{Var}(S^1) + \mathrm{Var}(S^0) - 2\,\mathrm{Cov}(S^1, S^0)$. However, DiP still outperforms PPI because its unobserved term divides by $m$ while PPI divides by $m/2$ per arm. In general, $\mathrm{Var}(S^1 - S^0)/m \le \mathrm{Var}(S^1)/(m/2) + \mathrm{Var}(S^0)/(m/2)$ always holds (by Cauchy–Schwarz), so the DiP family structurally dominates the PPI family in the unobserved variance component regardless of the sign of $\mathrm{Cov}(S^1, S^0)$. Negative correlation merely shrinks DiP's advantage relative to the positive-correlation case.
Scenario 3: Heterogeneous Prediction Quality — Double-Tuned Wins
When the LLM predicts much better in one arm than the other ($\rho_1 \gg \rho_0$), double-tuned estimators (D-T, D-T DiP) that set arm-specific $\lambda_1, \lambda_0$ outperform their single-$\lambda$ counterparts (PPI++, DiP++). D-T DiP achieves the lowest variance by combining the double-tuning advantage with DiP's structural advantage of using all unobserved units.
#' DGP where prediction quality differs sharply by arm: the treated arm gets
#' strong predictions (slope 0.9, sd 0.3) while the control arm gets weak
#' ones (slope 0.2, sd 0.9). Double-tuned estimators should win here.
dgp_heterogeneous_quality <- function(seed) {
  set.seed(seed)
  tau <- 0.5
  n <- 200
  m <- 500
  # One sample of size k with a constant treatment effect (y1 = y0 + tau);
  # rnorm() calls occur in a fixed order so the seed fully determines output.
  draw <- function(k) {
    y0 <- rnorm(k)
    y1 <- y0 + tau
    s1 <- 0.9 * y1 + rnorm(k, 0, 0.3)
    s0 <- 0.2 * y0 + rnorm(k, 0, 0.9)
    list(y0 = y0, y1 = y1, s0 = s0, s1 = s1)
  }
  d_obs <- rep(c(1, 0), each = n / 2)
  obs <- draw(n)
  y_obs <- d_obs * obs$y1 + (1 - d_obs) * obs$y0
  d_unobs <- rep(c(1, 0), each = m / 2)
  unobs <- draw(m)
  list(
    obs_df = data.frame(Y = y_obs, D = d_obs, S0 = obs$s0, S1 = obs$s1),
    unobs_df = data.frame(D = d_unobs, S0 = unobs$s0, S1 = unobs$s1),
    true_tau = tau
  )
}
# Scenario 3: strong treated-arm predictions, weak control-arm predictions.
comp3 <- run_comparison(dgp_heterogeneous_quality)
print_comparison(comp3, "Scenario 3: Heterogeneous Quality Across Arms")
#> ### Scenario 3: Heterogeneous Quality Across Arms
#>
#> estimator bias variance mse best
#> dim -0.0013 0.019485 0.019477
#> greg 0.0053 0.022326 0.022343
#> ppi 0.0022 0.015073 0.015070
#> dt 0.0022 0.013479 0.013477
#> dip 0.0026 0.018447 0.018445
#> dip_pp 0.0008 0.014143 0.014137
#> dt_dip 0.0008 0.012518 0.012512 <-- min
#>
#> Lowest variance: dt_dip

Why: A single $\lambda$ must compromise between the two arms: PPI++ and DiP++ cannot fully exploit the good treatment-arm predictions without also amplifying noise in the control arm. Double-tuned estimators (D-T, D-T DiP) set $\lambda_1$ high and $\lambda_0$ low, avoiding this trade-off. Within each tuning family, DiP-style estimators beat their PPI-style counterparts (D-T DiP < D-T, DiP++ < PPI++) because they use all unobserved units rather than splitting by arm.
Scenario 4: High Common-Mode Error
When predictions for $Y(1)$ and $Y(0)$ share a large common error component (e.g., the LLM has a stable unit-level bias that cancels in $S^1 - S^0$), the DiP family exploits $\mathrm{Cov}(S^1, S^0) > 0$ to reduce the unobserved variance component.
#' DGP with a large common-mode prediction error: each unit carries a shared
#' per-unit error (sd 0.8) that enters both S1 and S0 but not Y, so the error
#' cancels in the difference S1 - S0.
dgp_high_common_mode <- function(seed) {
  set.seed(seed)
  tau <- 0.5
  n <- 200
  m <- 500
  # One sample of size k; rnorm() calls occur in a fixed order per seed.
  draw <- function(k) {
    y0 <- rnorm(k)
    y1 <- y0 + tau
    shared_err <- rnorm(k, 0, 0.8)  # per-unit bias hitting both predictions
    s1 <- 0.9 * y1 + shared_err + rnorm(k, 0, 0.1)
    s0 <- 0.9 * y0 + shared_err + rnorm(k, 0, 0.1)
    list(y0 = y0, y1 = y1, s0 = s0, s1 = s1)
  }
  d_obs <- rep(c(1, 0), each = n / 2)
  obs <- draw(n)
  y_obs <- d_obs * obs$y1 + (1 - d_obs) * obs$y0
  d_unobs <- rep(c(1, 0), each = m / 2)
  unobs <- draw(m)
  list(
    obs_df = data.frame(Y = y_obs, D = d_obs, S0 = obs$s0, S1 = obs$s1),
    unobs_df = data.frame(D = d_unobs, S0 = unobs$s0, S1 = unobs$s1),
    true_tau = tau
  )
}
# Scenario 4: large shared prediction error that cancels in S1 - S0.
comp4 <- run_comparison(dgp_high_common_mode)
print_comparison(comp4, "Scenario 4: High Common-Mode Prediction Error")
#> ### Scenario 4: High Common-Mode Prediction Error
#>
#> estimator bias variance mse best
#> dim -0.0013 0.019485 0.019477
#> greg 0.0027 0.025099 0.025094
#> ppi 0.0010 0.012169 0.012164
#> dt 0.0011 0.012251 0.012246
#> dip 0.0013 0.013026 0.013021
#> dip_pp 0.0007 0.008676 0.008672 <-- min
#> dt_dip 0.0005 0.008817 0.008813
#>
#> Lowest variance: dip_pp

Why: $\mathrm{Var}(S^1)$ and $\mathrm{Var}(S^0)$ are inflated by the common error, which hurts PPI++/D-T through the per-arm $\mathrm{Var}(S^d)/(m/2)$ term. But $S^1 - S^0$ cancels the common error, so the DiP unobserved term is much smaller.
Scenario 5: Common-Mode Error + Heterogeneous Quality
Combine the advantages of scenarios 3 and 4: predictions share common-mode error (favoring DiP) but quality differs across arms (favoring D-T). D-T DiP should beat both D-T and DiP++.
#' DGP combining Scenarios 3 and 4: a shared per-unit prediction error
#' (sd 1.2) plus arm-heterogeneous signal quality (slope 0.9 in treatment,
#' 0.2 in control). Built so D-T DiP can exploit both structures at once.
dgp_common_mode_heterogeneous <- function(seed) {
  set.seed(seed)
  tau <- 0.5
  n <- 200
  m <- 500
  # One sample of size k; rnorm() calls occur in a fixed order per seed.
  draw <- function(k) {
    y0 <- rnorm(k)
    y1 <- y0 + tau
    shared_err <- rnorm(k, 0, 1.2)
    s1 <- 0.9 * y1 + shared_err + rnorm(k, 0, 0.3)  # strong treated signal
    s0 <- 0.2 * y0 + shared_err + rnorm(k, 0, 0.5)  # weak control signal
    list(y0 = y0, y1 = y1, s0 = s0, s1 = s1)
  }
  d_obs <- rep(c(1, 0), each = n / 2)
  obs <- draw(n)
  y_obs <- d_obs * obs$y1 + (1 - d_obs) * obs$y0
  d_unobs <- rep(c(1, 0), each = m / 2)
  unobs <- draw(m)
  list(
    obs_df = data.frame(Y = y_obs, D = d_obs, S0 = obs$s0, S1 = obs$s1),
    unobs_df = data.frame(D = d_unobs, S0 = unobs$s0, S1 = unobs$s1),
    true_tau = tau
  )
}
# Scenario 5: common-mode error plus heterogeneous quality across arms.
comp5 <- run_comparison(dgp_common_mode_heterogeneous)
print_comparison(comp5, "Scenario 5: Common-Mode Error + Heterogeneous Quality")
#> ### Scenario 5: Common-Mode Error + Heterogeneous Quality
#>
#> estimator bias variance mse best
#> dim -0.0013 0.019485 0.019477
#> greg 0.0030 0.054145 0.054127
#> ppi 0.0000 0.017590 0.017581
#> dt 0.0001 0.017206 0.017197
#> dip 0.0007 0.039094 0.039075
#> dip_pp -0.0004 0.016770 0.016762
#> dt_dip 0.0001 0.016635 0.016627 <-- min
#>
#> Lowest variance: dt_dip

Why: D-T DiP can set $\lambda_1$ large (good treatment predictions) and $\lambda_0$ small (noisy control predictions), while still exploiting the common-mode cancellation in $S^1 - S^0$.
Scenario 6: Near-Perfect Predictions
When predictions are nearly unbiased and have very high correlation with $Y$ ($\rho \approx 0.995$), the optimal $\lambda$ is close to 1. With small $n$ (few observations per cross-fitting fold), estimating $\lambda$ adds finite-sample noise. DiP with fixed $\lambda = 1$ dominates: it avoids the lambda-estimation cost while retaining the structural advantage of using all unobserved units.
#' DGP with near-perfect predictions: S1 and S0 equal the potential outcomes
#' plus small noise (sd 0.1), and n is kept small (100) so lambda estimation
#' noise is non-negligible. Fixed-lambda DiP should win.
dgp_near_perfect <- function(seed) {
  set.seed(seed)
  tau <- 0.5
  n <- 100
  m <- 500
  # One sample of size k; rnorm() calls occur in a fixed order per seed.
  draw <- function(k) {
    y0 <- rnorm(k)
    y1 <- y0 + tau
    s1 <- y1 + rnorm(k, 0, 0.1)  # correlation 1/sqrt(1.01), roughly 0.995
    s0 <- y0 + rnorm(k, 0, 0.1)
    list(y0 = y0, y1 = y1, s0 = s0, s1 = s1)
  }
  d_obs <- rep(c(1, 0), each = n / 2)
  obs <- draw(n)
  y_obs <- d_obs * obs$y1 + (1 - d_obs) * obs$y0
  d_unobs <- rep(c(1, 0), each = m / 2)
  unobs <- draw(m)
  list(
    obs_df = data.frame(Y = y_obs, D = d_obs, S0 = obs$s0, S1 = obs$s1),
    unobs_df = data.frame(D = d_unobs, S0 = unobs$s0, S1 = unobs$s1),
    true_tau = tau
  )
}
# Scenario 6: near-perfect predictions with a small labeled sample (n = 100).
comp6 <- run_comparison(dgp_near_perfect)
print_comparison(comp6, "Scenario 6: Near-Perfect Predictions")
#> ### Scenario 6: Near-Perfect Predictions
#>
#> estimator bias variance mse best
#> dim 4e-03 0.041698 0.041693
#> greg 0e+00 0.008294 0.008290
#> ppi 4e-04 0.007093 0.007090
#> dt 4e-04 0.007131 0.007127
#> dip -6e-04 0.000446 0.000446 <-- min
#> dip_pp -4e-04 0.000454 0.000454
#> dt_dip -6e-04 0.000464 0.000464
#>
#> Lowest variance: dip

Why: When $\lambda^{\ast} \approx 1$, estimating $\lambda$ via cross-fitting (with only $n/2$ observations per fold) adds finite-sample variance without meaningful bias reduction. DiP with fixed $\lambda = 1$ beats DiP++ and D-T DiP by avoiding this estimation cost. Meanwhile, DiP dominates all PPI-style estimators (GREG, PPI++, D-T) by a large margin thanks to the structural advantage of using all unobserved units rather than splitting by arm.
Summary Table
# Collect the six per-scenario comparison tables, then build one row per
# scenario recording the minimum-MSE estimator and its variance reduction
# relative to the difference-in-means (DiM) baseline.
scenarios <- list(
  "1: Poor predictions" = comp1,
  "2: Neg corr predictions" = comp2,
  "3: Heterogeneous quality" = comp3,
  "4: High common-mode error" = comp4,
  "5: Common-mode + hetero" = comp5,
  "6: Near-perfect" = comp6
)
# Summarize a single scenario (looked up by name) as a one-row data.frame.
summarize_scenario <- function(nm) {
  comp <- scenarios[[nm]]
  idx <- which.min(comp$mse)
  dim_var <- comp$variance[comp$estimator == "dim"]
  data.frame(
    scenario = nm,
    winner = comp$estimator[idx],
    winner_mse = comp$mse[idx],
    winner_var = comp$variance[idx],
    dim_var = dim_var,
    reduction_pct = round((1 - comp$variance[idx] / dim_var) * 100, 1),
    stringsAsFactors = FALSE
  )
}
summary_df <- do.call(rbind, lapply(names(scenarios), summarize_scenario))
# Render the cross-scenario summary; col.names relabel summary_df's columns.
knitr::kable(summary_df,
col.names = c("Scenario", "Best Estimator", "Best MSE",
"Best Var", "DiM Var", "Var Reduction (%)"),
digits = 5,
caption = "Which estimator achieves the lowest Monte Carlo MSE under each DGP?")

| Scenario | Best Estimator | Best MSE | Best Var | DiM Var | Var Reduction (%) |
|---|---|---|---|---|---|
| 1: Poor predictions | dim | 0.01948 | 0.01949 | 0.01949 | 0.0 |
| 2: Neg corr predictions | dip_pp | 0.00730 | 0.00730 | 0.02218 | 67.1 |
| 3: Heterogeneous quality | dt_dip | 0.01251 | 0.01252 | 0.01949 | 35.8 |
| 4: High common-mode error | dip_pp | 0.00867 | 0.00868 | 0.01949 | 55.5 |
| 5: Common-mode + hetero | dt_dip | 0.01663 | 0.01664 | 0.01949 | 14.6 |
| 6: Near-perfect | dip | 0.00045 | 0.00045 | 0.04170 | 98.9 |