Skip to contents
# Install from github
devtools::install_github('klintkanopka/mixedsubjects')

Overview

Each of the seven estimators in the mixedsubjects package has a different variance formula, so each dominates under a different data-generating process. This document constructs targeted DGP scenarios, runs Monte Carlo simulations under each, and confirms which estimator achieves the smallest variance.

The key quantities that drive the comparison:

Quantity Favors
$\rho(Y, S)$ — prediction quality: favors all prediction-based estimators over DiM
$\rho_1 \neq \rho_0$ — heterogeneous quality across arms: favors D-T and D-T DiP over single-$\lambda$ estimators
$\operatorname{Cov}(S^{(1)}, S^{(0)})$ — common-mode prediction error: favors the DiP family over the PPI/D-T family
$m / n$ — ratio of unlabeled to labeled: favors prediction-based estimators (larger $m$ helps more)

Simulation Engine

#' Run a Monte Carlo comparison of all seven estimators
#'
#' Repeatedly draws a dataset from `dgp_fn`, applies every estimator in the
#' mixedsubjects package, and summarises the bias, variance, and MSE of each
#' estimator across replications.
#'
#' @param dgp_fn A function(seed) that returns a list with components
#'   obs_df, unobs_df (data.frames), and true_tau (scalar). `true_tau` is
#'   assumed constant across seeds.
#' @param n_sims Number of Monte Carlo replications.
#' @param n_folds Number of cross-fitting folds.
#' @return A data.frame with columns: estimator, mean_est, bias, variance, mse.
run_comparison <- function(dgp_fn, n_sims = 2000, n_folds = 2) {
  estimator_names <- c("dim", "greg", "ppi", "dt", "dip", "dip_pp", "dt_dip")
  results <- matrix(NA_real_, nrow = n_sims, ncol = length(estimator_names))
  colnames(results) <- estimator_names

  # Evaluate one estimator call, returning NA_real_ (numeric, matching the
  # results matrix) if it fails, so a single numerical failure does not abort
  # the whole simulation. Lazy evaluation forces `expr` inside tryCatch, so
  # errors raised by the estimator are caught here.
  safe_est <- function(expr) tryCatch(expr, error = function(e) NA_real_)

  true_tau <- NA_real_

  for (i in seq_len(n_sims)) {
    d <- dgp_fn(seed = i)
    # Capture true_tau from the first draw instead of calling dgp_fn an extra
    # time after the loop: it is constant across seeds by assumption, and the
    # extra call wastefully regenerated an entire dataset.
    if (i == 1L) true_tau <- d$true_tau
    msd <- msd_data(observed = d$obs_df, unobserved = d$unobs_df)

    results[i, "dim"]    <- safe_est(msd_dim(msd)$estimate)
    results[i, "greg"]   <- safe_est(msd_greg(msd)$estimate)
    results[i, "ppi"]    <- safe_est(msd_ppi(msd, n_folds = n_folds)$estimate)
    results[i, "dt"]     <- safe_est(msd_dt(msd, n_folds = n_folds)$estimate)
    results[i, "dip"]    <- safe_est(msd_dip(msd)$estimate)
    results[i, "dip_pp"] <- safe_est(msd_dip_pp(msd, n_folds = n_folds)$estimate)
    results[i, "dt_dip"] <- safe_est(msd_dt_dip(msd, n_folds = n_folds)$estimate)
  }

  # Compute the column means once and reuse for both mean_est and bias.
  mean_est <- colMeans(results, na.rm = TRUE)

  data.frame(
    estimator = estimator_names,
    mean_est  = mean_est,
    bias      = mean_est - true_tau,
    variance  = apply(results, 2, var, na.rm = TRUE),
    mse       = apply(results, 2, function(x) mean((x - true_tau)^2, na.rm = TRUE)),
    stringsAsFactors = FALSE
  )
}

#' Pretty-print a comparison table, highlighting the minimum-variance estimator
#'
#' @param comp A data.frame as returned by run_comparison(), with columns
#'   estimator, bias, variance, and mse.
#' @param title Optional section title printed above the table ("" = none).
#' @return The formatted table, invisibly (called for its printed output).
print_comparison <- function(comp, title = "") {
  if (nzchar(title)) cat("###", title, "\n\n")
  # Pick the winner from the *unrounded* variances, so that display rounding
  # cannot create ties or flip which estimator gets highlighted.
  best <- comp$estimator[which.min(comp$variance)]
  comp$variance <- round(comp$variance, 6)
  comp$bias <- round(comp$bias, 4)
  comp$mse <- round(comp$mse, 6)
  comp$best <- ifelse(comp$estimator == best, " <-- min", "")
  print(comp[, c("estimator", "bias", "variance", "mse", "best")], row.names = FALSE)
  cat("\nLowest variance:", best, "\n\n")
  invisible(comp)
}

Scenario 1: Poor Predictions — DiM Wins

When predictions are essentially noise ($\rho(Y, S) \approx 0$), incorporating them only adds variance. DiM ignores predictions entirely, so it should dominate.

# DGP for Scenario 1: predictions carry no signal about Y (pure noise), so
# using them cannot reduce variance.
dgp_poor_predictions <- function(seed) {
  set.seed(seed)
  tau <- 0.5
  n_lab <- 200
  n_unlab <- 400

  # Labeled sample: first half treated, second half control.
  d_lab <- rep(c(1, 0), each = n_lab / 2)
  y_lab <- rnorm(n_lab) + tau * d_lab

  # Noise "predictions", uncorrelated with the outcome.
  s1_lab <- rnorm(n_lab, 0.1, 1)
  s0_lab <- rnorm(n_lab, 0, 1)

  # Unlabeled sample: predictions only, same noise structure.
  d_unlab <- rep(c(1, 0), each = n_unlab / 2)
  s1_unlab <- rnorm(n_unlab, 0.1, 1)
  s0_unlab <- rnorm(n_unlab, 0, 1)

  list(
    obs_df = data.frame(Y = y_lab, D = d_lab, S0 = s0_lab, S1 = s1_lab),
    unobs_df = data.frame(D = d_unlab, S0 = s0_unlab, S1 = s1_unlab),
    true_tau = tau
  )
}
comp1 <- run_comparison(dgp_poor_predictions)
print_comparison(comp1, "Scenario 1: Poor Predictions")
#> ### Scenario 1: Poor Predictions 
#> 
#>  estimator    bias variance      mse     best
#>        dim -0.0013 0.019485 0.019477  <-- min
#>       greg  0.0038 0.049866 0.049855         
#>        ppi -0.0015 0.019623 0.019616         
#>         dt -0.0020 0.019715 0.019709         
#>        dip  0.0031 0.044152 0.044139         
#>     dip_pp -0.0018 0.019632 0.019626         
#>     dt_dip -0.0023 0.019895 0.019890         
#> 
#> Lowest variance: dim

Why: All prediction-based estimators add $\lambda^2 \operatorname{Var}(S)/m$ to the variance without any offsetting covariance reduction. The optimal $\lambda$ shrinks toward 0 for PPI++/D-T/DiP++/D-T DiP, making them roughly equal to DiM, while GREG and DiP (fixed $\lambda = 1$) are strictly worse.

Scenario 2: Negatively Correlated Predictions

When a covariate $X$ has opposite effects on $Y(1)$ and $Y(0)$ (e.g., $Y(0) = \varepsilon + X$, $Y(1) = \varepsilon' - X + \tau$), the potential outcomes are negatively correlated, and so are the predictions $S^{(1)}$ and $S^{(0)}$. This inflates $\operatorname{Var}(S^{(1)} - S^{(0)})$ relative to the positive-correlation case, which reduces DiP’s advantage over PPI — but does not eliminate it, because DiP still benefits structurally from using all $m$ unobserved units rather than splitting by arm ($m/2$ each).

# DGP for Scenario 2: a covariate with opposite effects on the two potential
# outcomes induces negative Cov(Y1, Y0), and hence negatively correlated
# predictions S1 and S0.
dgp_neg_corr_predictions <- function(seed) {
  set.seed(seed)
  tau <- 0.5
  n_lab <- 200
  n_unlab <- 500

  # Draw one sample of k units: covariate, potential outcomes, predictions.
  # (rnorm call order matches the original: X, Y0 noise, Y1 noise, S1, S0.)
  draw_units <- function(k) {
    x <- rnorm(k)
    y0 <- rnorm(k, 0, 0.3) + x
    y1 <- rnorm(k, 0, 0.3) - x + tau
    s1 <- 0.85 * y1 + rnorm(k, 0, 0.2)
    s0 <- 0.85 * y0 + rnorm(k, 0, 0.2)
    list(y0 = y0, y1 = y1, s0 = s0, s1 = s1)
  }

  d_lab <- rep(c(1, 0), each = n_lab / 2)
  lab <- draw_units(n_lab)
  y_lab <- d_lab * lab$y1 + (1 - d_lab) * lab$y0

  d_unlab <- rep(c(1, 0), each = n_unlab / 2)
  unlab <- draw_units(n_unlab)

  list(
    obs_df = data.frame(Y = y_lab, D = d_lab, S0 = lab$s0, S1 = lab$s1),
    unobs_df = data.frame(D = d_unlab, S0 = unlab$s0, S1 = unlab$s1),
    true_tau = tau
  )
}
comp2 <- run_comparison(dgp_neg_corr_predictions)
print_comparison(comp2, "Scenario 2: Negatively Correlated Predictions")
#> ### Scenario 2: Negatively Correlated Predictions 
#> 
#>  estimator   bias variance      mse     best
#>        dim 0.0029 0.022178 0.022175         
#>       greg 0.0018 0.008204 0.008204         
#>        ppi 0.0019 0.007675 0.007674         
#>         dt 0.0021 0.007698 0.007698         
#>        dip 0.0013 0.007707 0.007705         
#>     dip_pp 0.0014 0.007297 0.007296  <-- min
#>     dt_dip 0.0011 0.009078 0.009075         
#> 
#> Lowest variance: dip_pp

Why: With $\operatorname{Cov}(S^{(1)}, S^{(0)}) < 0$, differencing inflates the unobserved variance: $\operatorname{Var}(S^{(1)} - S^{(0)}) = \operatorname{Var}(S^{(1)}) + \operatorname{Var}(S^{(0)}) - 2\operatorname{Cov}(S^{(1)}, S^{(0)}) > \operatorname{Var}(S^{(1)}) + \operatorname{Var}(S^{(0)})$. However, DiP still outperforms PPI because its unobserved term divides by $m$ while PPI divides by $m/2$ per arm. In general, $\operatorname{Var}(S^{(1)} - S^{(0)})/m \leq \operatorname{Var}(S^{(1)})/m_1 + \operatorname{Var}(S^{(0)})/m_0$ always holds with $m_1 = m_0 = m/2$, because $-2\operatorname{Cov}(S^{(1)}, S^{(0)}) \leq \operatorname{Var}(S^{(1)}) + \operatorname{Var}(S^{(0)})$ by Cauchy–Schwarz. So the DiP family structurally dominates the PPI family in the unobserved variance component regardless of the sign of $\operatorname{Cov}(S^{(1)}, S^{(0)})$; negative correlation merely shrinks DiP’s advantage relative to the positive-correlation case.

Scenario 3: Heterogeneous Prediction Quality — Double-Tuned Wins

When the LLM predicts much better in one arm than the other ($\rho_1 \gg \rho_0$), double-tuned estimators (D-T, D-T DiP) that set arm-specific $\lambda_1, \lambda_0$ outperform their single-$\lambda$ counterparts (PPI++, DiP++). D-T DiP achieves the lowest variance by combining the double-tuning advantage with DiP’s structural advantage of using all $m$ unobserved units.

# DGP for Scenario 3: prediction quality differs sharply across arms —
# strong in the treatment arm, weak in the control arm.
dgp_heterogeneous_quality <- function(seed) {
  set.seed(seed)
  tau <- 0.5
  n_lab <- 200
  n_unlab <- 500

  # One sample of k units. Constant treatment effect: Y1 = Y0 + tau.
  # Treatment-arm predictions: 0.9 * Y1 + N(0, 0.3) (corr ~ 0.95).
  # Control-arm predictions: 0.2 * Y0 + N(0, 0.9) (corr ~ 0.22).
  draw_units <- function(k) {
    y0 <- rnorm(k)
    y1 <- y0 + tau
    s1 <- 0.9 * y1 + rnorm(k, 0, 0.3)
    s0 <- 0.2 * y0 + rnorm(k, 0, 0.9)
    list(y0 = y0, y1 = y1, s0 = s0, s1 = s1)
  }

  d_lab <- rep(c(1, 0), each = n_lab / 2)
  lab <- draw_units(n_lab)
  y_lab <- d_lab * lab$y1 + (1 - d_lab) * lab$y0

  d_unlab <- rep(c(1, 0), each = n_unlab / 2)
  unlab <- draw_units(n_unlab)

  list(
    obs_df = data.frame(Y = y_lab, D = d_lab, S0 = lab$s0, S1 = lab$s1),
    unobs_df = data.frame(D = d_unlab, S0 = unlab$s0, S1 = unlab$s1),
    true_tau = tau
  )
}
comp3 <- run_comparison(dgp_heterogeneous_quality)
print_comparison(comp3, "Scenario 3: Heterogeneous Quality Across Arms")
#> ### Scenario 3: Heterogeneous Quality Across Arms 
#> 
#>  estimator    bias variance      mse     best
#>        dim -0.0013 0.019485 0.019477         
#>       greg  0.0053 0.022326 0.022343         
#>        ppi  0.0022 0.015073 0.015070         
#>         dt  0.0022 0.013479 0.013477         
#>        dip  0.0026 0.018447 0.018445         
#>     dip_pp  0.0008 0.014143 0.014137         
#>     dt_dip  0.0008 0.012518 0.012512  <-- min
#> 
#> Lowest variance: dt_dip

Why: A single $\lambda$ must compromise between the two arms: PPI++ and DiP++ cannot fully exploit the good treatment-arm predictions without also amplifying noise in the control arm. Double-tuned estimators (D-T, D-T DiP) set $\lambda_1$ high and $\lambda_0$ low, avoiding this trade-off. Within each tuning family, DiP-style estimators attain lower variance than their PPI-style counterparts (D-T DiP beats D-T; DiP++ beats PPI++) because they use all $m$ unobserved units rather than splitting by arm.

Scenario 4: High Common-Mode Error

When predictions for $S^{(1)}$ and $S^{(0)}$ share a large common error component (e.g., the LLM has a stable unit-level bias that cancels in $S^{(1)} - S^{(0)}$), the DiP family exploits $\operatorname{Cov}(S^{(1)}, S^{(0)}) > 0$ to reduce the unobserved variance component.

# DGP for Scenario 4: both predictions share a per-unit error component that
# is absent from Y, so it cancels in S1 - S0 but inflates each arm's Var(S).
dgp_high_common_mode <- function(seed) {
  set.seed(seed)
  tau <- 0.5
  n_lab <- 200
  n_unlab <- 500

  # One sample of k units. `shared` is the common-mode LLM error: it enters
  # S1 and S0 identically, not the outcome.
  draw_units <- function(k) {
    y0 <- rnorm(k)
    y1 <- y0 + tau
    shared <- rnorm(k, 0, 0.8)
    s1 <- 0.9 * y1 + shared + rnorm(k, 0, 0.1)
    s0 <- 0.9 * y0 + shared + rnorm(k, 0, 0.1)
    list(y0 = y0, y1 = y1, s0 = s0, s1 = s1)
  }

  d_lab <- rep(c(1, 0), each = n_lab / 2)
  lab <- draw_units(n_lab)
  y_lab <- d_lab * lab$y1 + (1 - d_lab) * lab$y0

  d_unlab <- rep(c(1, 0), each = n_unlab / 2)
  unlab <- draw_units(n_unlab)

  list(
    obs_df = data.frame(Y = y_lab, D = d_lab, S0 = lab$s0, S1 = lab$s1),
    unobs_df = data.frame(D = d_unlab, S0 = unlab$s0, S1 = unlab$s1),
    true_tau = tau
  )
}
comp4 <- run_comparison(dgp_high_common_mode)
print_comparison(comp4, "Scenario 4: High Common-Mode Prediction Error")
#> ### Scenario 4: High Common-Mode Prediction Error 
#> 
#>  estimator    bias variance      mse     best
#>        dim -0.0013 0.019485 0.019477         
#>       greg  0.0027 0.025099 0.025094         
#>        ppi  0.0010 0.012169 0.012164         
#>         dt  0.0011 0.012251 0.012246         
#>        dip  0.0013 0.013026 0.013021         
#>     dip_pp  0.0007 0.008676 0.008672  <-- min
#>     dt_dip  0.0005 0.008817 0.008813         
#> 
#> Lowest variance: dip_pp

Why: $\operatorname{Var}(S^{(1)})$ and $\operatorname{Var}(S^{(0)})$ are inflated by the common error, which hurts PPI++/D-T through the $\lambda^2 \operatorname{Var}(S)/m$ term. But $\operatorname{Var}(S^{(1)} - S^{(0)})$ cancels the common error, so the DiP unobserved term $\lambda^2 \operatorname{Var}(S^{(1)} - S^{(0)})/m$ is much smaller.

Scenario 5: Common-Mode Error + Heterogeneous Quality

Combine the advantages of scenarios 3 and 4: predictions share common-mode error (favoring DiP) but quality differs across arms (favoring D-T). D-T DiP should beat both D-T and DiP++.

# DGP for Scenario 5: combines common-mode prediction error (Scenario 4) with
# heterogeneous prediction quality across arms (Scenario 3).
dgp_common_mode_heterogeneous <- function(seed) {
  set.seed(seed)
  tau <- 0.5
  n_lab <- 200
  n_unlab <- 500

  # One sample of k units: shared LLM error in both predictions, with a
  # strong signal in the treatment arm and a weak signal in the control arm.
  draw_units <- function(k) {
    y0 <- rnorm(k)
    y1 <- y0 + tau
    shared <- rnorm(k, 0, 1.2)
    s1 <- 0.9 * y1 + shared + rnorm(k, 0, 0.3)
    s0 <- 0.2 * y0 + shared + rnorm(k, 0, 0.5)
    list(y0 = y0, y1 = y1, s0 = s0, s1 = s1)
  }

  d_lab <- rep(c(1, 0), each = n_lab / 2)
  lab <- draw_units(n_lab)
  y_lab <- d_lab * lab$y1 + (1 - d_lab) * lab$y0

  d_unlab <- rep(c(1, 0), each = n_unlab / 2)
  unlab <- draw_units(n_unlab)

  list(
    obs_df = data.frame(Y = y_lab, D = d_lab, S0 = lab$s0, S1 = lab$s1),
    unobs_df = data.frame(D = d_unlab, S0 = unlab$s0, S1 = unlab$s1),
    true_tau = tau
  )
}
comp5 <- run_comparison(dgp_common_mode_heterogeneous)
print_comparison(comp5, "Scenario 5: Common-Mode Error + Heterogeneous Quality")
#> ### Scenario 5: Common-Mode Error + Heterogeneous Quality 
#> 
#>  estimator    bias variance      mse     best
#>        dim -0.0013 0.019485 0.019477         
#>       greg  0.0030 0.054145 0.054127         
#>        ppi  0.0000 0.017590 0.017581         
#>         dt  0.0001 0.017206 0.017197         
#>        dip  0.0007 0.039094 0.039075         
#>     dip_pp -0.0004 0.016770 0.016762         
#>     dt_dip  0.0001 0.016635 0.016627  <-- min
#> 
#> Lowest variance: dt_dip

Why: D-T DiP can set $\lambda_1$ large (good treatment predictions) and $\lambda_0$ small (noisy control predictions), while still exploiting the common-mode cancellation in $\lambda_1 S^{(1)} - \lambda_0 S^{(0)}$.

Scenario 6: Near-Perfect Predictions

When predictions are nearly unbiased and have very high correlation with $Y$ ($\rho \approx 0.995$), the optimal $\lambda^*$ is close to 1. With small $n$ (few observations per cross-fitting fold), estimating $\lambda$ adds finite-sample noise. DiP with fixed $\lambda = 1$ dominates: it avoids the lambda-estimation cost while retaining the structural advantage of using all $m$ unobserved units.

# DGP for Scenario 6: near-perfect predictions with a small labeled sample,
# so the optimal lambda is ~1 and estimating it only adds noise.
dgp_near_perfect <- function(seed) {
  set.seed(seed)
  tau <- 0.5
  n_lab <- 100
  n_unlab <- 500

  # One sample of k units. Predictions equal the potential outcome plus a
  # small N(0, 0.1) error: corr(Y, S) = 1 / sqrt(1.01) ~ 0.995.
  draw_units <- function(k) {
    y0 <- rnorm(k)
    y1 <- y0 + tau
    s1 <- y1 + rnorm(k, 0, 0.1)
    s0 <- y0 + rnorm(k, 0, 0.1)
    list(y0 = y0, y1 = y1, s0 = s0, s1 = s1)
  }

  d_lab <- rep(c(1, 0), each = n_lab / 2)
  lab <- draw_units(n_lab)
  y_lab <- d_lab * lab$y1 + (1 - d_lab) * lab$y0

  d_unlab <- rep(c(1, 0), each = n_unlab / 2)
  unlab <- draw_units(n_unlab)

  list(
    obs_df = data.frame(Y = y_lab, D = d_lab, S0 = lab$s0, S1 = lab$s1),
    unobs_df = data.frame(D = d_unlab, S0 = unlab$s0, S1 = unlab$s1),
    true_tau = tau
  )
}
comp6 <- run_comparison(dgp_near_perfect)
print_comparison(comp6, "Scenario 6: Near-Perfect Predictions")
#> ### Scenario 6: Near-Perfect Predictions 
#> 
#>  estimator   bias variance      mse     best
#>        dim  4e-03 0.041698 0.041693         
#>       greg  0e+00 0.008294 0.008290         
#>        ppi  4e-04 0.007093 0.007090         
#>         dt  4e-04 0.007131 0.007127         
#>        dip -6e-04 0.000446 0.000446  <-- min
#>     dip_pp -4e-04 0.000454 0.000454         
#>     dt_dip -6e-04 0.000464 0.000464         
#> 
#> Lowest variance: dip

Why: When $\lambda^* \approx 1$, estimating $\lambda$ via cross-fitting (with only $n/2$ observations per fold) adds finite-sample variance without meaningful bias reduction. DiP with fixed $\lambda = 1$ beats DiP++ and D-T DiP by avoiding this estimation cost. Meanwhile, DiP dominates all PPI-style estimators (GREG, PPI++, D-T) by a large margin thanks to the structural advantage of using all $m$ unobserved units rather than splitting by arm.

Summary Table

# Collect the six scenario comparisons and report, for each DGP, which
# estimator attains the lowest Monte Carlo MSE and how much variance it
# saves relative to the difference-in-means baseline.
scenarios <- list(
  "1: Poor predictions"        = comp1,
  "2: Neg corr predictions"    = comp2,
  "3: Heterogeneous quality"   = comp3,
  "4: High common-mode error"  = comp4,
  "5: Common-mode + hetero"    = comp5,
  "6: Near-perfect"            = comp6
)

# One summary row per scenario; unname() keeps default 1..6 row names.
summary_df <- do.call(rbind, unname(Map(
  function(label, tbl) {
    i_best <- which.min(tbl$mse)
    dim_variance <- tbl$variance[tbl$estimator == "dim"]
    data.frame(
      scenario = label,
      winner = tbl$estimator[i_best],
      winner_mse = tbl$mse[i_best],
      winner_var = tbl$variance[i_best],
      dim_var = dim_variance,
      reduction_pct = round((1 - tbl$variance[i_best] / dim_variance) * 100, 1),
      stringsAsFactors = FALSE
    )
  },
  names(scenarios),
  scenarios
)))

knitr::kable(summary_df,
             col.names = c("Scenario", "Best Estimator", "Best MSE",
                           "Best Var", "DiM Var", "Var Reduction (%)"),
             digits = 5,
             caption = "Which estimator achieves the lowest Monte Carlo MSE under each DGP?")
Which estimator achieves the lowest Monte Carlo MSE under each DGP?
Scenario Best Estimator Best MSE Best Var DiM Var Var Reduction (%)
1: Poor predictions dim 0.01948 0.01949 0.01949 0.0
2: Neg corr predictions dip_pp 0.00730 0.00730 0.02218 67.1
3: Heterogeneous quality dt_dip 0.01251 0.01252 0.01949 35.8
4: High common-mode error dip_pp 0.00867 0.00868 0.01949 55.5
5: Common-mode + hetero dt_dip 0.01663 0.01664 0.01949 14.6
6: Near-perfect dip 0.00045 0.00045 0.04170 98.9