Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Release/2.2.5 🎉 #141

Merged
merged 8 commits into from
Nov 7, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
# Changelog

# 2.2.5 (11/7/2024)
- fix: hot fixes for the extrapolation step + using the presidential margins to infer a ticket splitting estimate in each house / senate race [#140](https://github.com/washingtonpost/elex-live-model/pull/140)

# 2.2.4 (11/5/2024)
- fix: truncation can fail catastrophically when % reporting is too low [#138](https://github.com/washingtonpost/elex-live-model/pull/138)

Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@
LONG_DESCRIPTION = f.read()

# The full version, including alpha/beta/rc tags
RELEASE = "2.2.4"
RELEASE = "2.2.5"
# The short X.Y version
VERSION = ".".join(RELEASE.split(".")[:2])

Expand Down
30 changes: 29 additions & 1 deletion src/elexmodel/client.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from collections import defaultdict
from io import StringIO

import numpy as np
import pandas as pd
Expand Down Expand Up @@ -339,6 +340,31 @@ def get_estimates(
versioned_data_handler = None
else:
versioned_data_handler = None

if model_parameters.get("correct_from_presidential", False):
s3_client = s3.S3CsvUtil(TARGET_BUCKET)
baseline_path = f"{S3_FILE_PATH}/{self.election_id}/data/P/data_county.csv"
results_path = f"{S3_FILE_PATH}/{self.election_id}/results/P/county/current.csv"
predictions_path = f"{S3_FILE_PATH}/{self.election_id}/predictions/P/county/unit_data/current.csv"
pres_baseline = pd.read_csv(StringIO(s3_client.get(baseline_path)), dtype={"geographic_unit_fips": str})
pres_baseline["baseline_normalized_margin"] = (pres_baseline.baseline_dem - pres_baseline.baseline_gop) / (
pres_baseline.baseline_dem + pres_baseline.baseline_gop
)
pres_results = pd.read_csv(StringIO(s3_client.get(results_path)), dtype={"geographic_unit_fips": str})
pres_predictions = pd.read_csv(
StringIO(s3_client.get(predictions_path)), dtype={"geographic_unit_fips": str}
)
pres_predictions = pres_predictions.merge(
pres_results[["geographic_unit_fips", "results_weights"]], on="geographic_unit_fips", how="left"
)
pres_predictions = pres_predictions.merge(
pres_baseline[["geographic_unit_fips", "baseline_normalized_margin"]],
on="geographic_unit_fips",
how="left",
)
else:
pres_predictions = None

LOG.info("Running model for %s", self.election_id)
LOG.info(
"Model parameters: \n prediction intervals: %s, percent reporting threshold: %s, \
Expand All @@ -359,7 +385,9 @@ def get_estimates(
self.model = GaussianElectionModel(model_settings=model_settings)
elif pi_method == "bootstrap":
self.model = BootstrapElectionModel(
model_settings=model_settings, versioned_data_handler=versioned_data_handler
model_settings=model_settings,
versioned_data_handler=versioned_data_handler,
pres_predictions=pres_predictions,
)

minimum_reporting_units_max = 0
Expand Down
8 changes: 6 additions & 2 deletions src/elexmodel/handlers/data/VersionedData.py
Original file line number Diff line number Diff line change
Expand Up @@ -117,7 +117,11 @@ def compute_estimated_margin(df):
# because the AP adjusted its model after the fact. We correct for this here.
# we recompute the percent_expected_vote using the last reported value as the max
perc_expected_vote_corr = np.divide(
results_turnout, results_turnout[-1], out=np.zeros_like(results_turnout), where=results_turnout[-1] != 0
results_turnout,
results_turnout[-1],
out=np.zeros_like(results_turnout),
where=results_turnout[-1] != 0,
casting="unsafe",
)

# check if perc_expected_vote_corr is monotone increasing (if not, give up and don't try to estimate a margin)
Expand Down Expand Up @@ -190,7 +194,7 @@ def compute_estimated_margin(df):

est_margins = observed_norm_margin * observed_vote + observed_batch_margin * (percs - observed_vote)
est_margins = np.divide(
est_margins, percs, where=percs != 0, out=np.zeros_like(est_margins)
est_margins, percs, where=percs != 0, out=np.zeros_like(est_margins), casting="unsafe"
) # Handle div-by-zero

# Return a DataFrame with the multi-index (geographic_unit_fips, perc)
Expand Down
50 changes: 49 additions & 1 deletion src/elexmodel/models/BootstrapElectionModel.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ class BootstrapElectionModel(BaseElectionModel):
and the epsilons are contest (state/district) level random effects.
"""

def __init__(self, model_settings={}, versioned_data_handler=None):
def __init__(self, model_settings={}, versioned_data_handler=None, pres_predictions=None):
super().__init__(model_settings)
self.B = model_settings.get("B", 500) # number of bootstrap samples
self.strata = model_settings.get("strata", ["county_classification"]) # columns to stratify the data by
Expand All @@ -61,6 +61,7 @@ def __init__(self, model_settings={}, versioned_data_handler=None):
"agg_model_hard_threshold", True
) # use sigmoid or hard threshold when calculating agg model
self.district_election = model_settings.get("district_election", False)

self.lambda_ = model_settings.get("lambda_", None) # regularization parameter for OLS

# save versioned data for later use
Expand All @@ -70,6 +71,10 @@ def __init__(self, model_settings={}, versioned_data_handler=None):
self.extrapolate_std_method = model_settings.get("extrapolate_std_method", "std")
self.max_dist_to_observed = model_settings.get("max_dist_to_observed", 5)

# save presidential predictions for later use
self.pres_predictions = pres_predictions
self.correct_from_presidential = model_settings.get("correct_from_presidential", False)

# upper and lower bounds for the quantile regression which define the strata distributions
# these make sure that we can control the worst cases for the distributions in case we
# haven't seen enough data yet
Expand Down Expand Up @@ -1283,6 +1288,49 @@ def compute_bootstrap_errors(
extrap_filter
]

if self.correct_from_presidential:
nonreporting_units["geographic_unit_fips_p"] = nonreporting_units.geographic_unit_fips.apply(
lambda x: x.split("_")[1]
)
nonreporting_units = nonreporting_units.merge(
self.pres_predictions,
left_on="geographic_unit_fips_p",
right_on="geographic_unit_fips",
how="left",
suffixes=("", "_pres"),
)

# adjust results_normalized_margin_pres to account for split counties

nonreporting_units["margin_adj"] = (
nonreporting_units.baseline_normalized_margin - nonreporting_units.baseline_normalized_margin_pres
)

nonreporting_units["results_normalized_margin_pres"] = (
nonreporting_units.results_margin_pres / nonreporting_units.results_weights_pres
+ nonreporting_units.margin_adj
)
nonreporting_units["pred_normalized_margin_pres"] = (
nonreporting_units.pred_margin / nonreporting_units.pred_turnout + nonreporting_units.margin_adj
)

nonreporting_units["pred_normalized_margin"] = np.mean(
y_test_pred_B.clip(min=y_partial_reporting_lower, max=y_partial_reporting_upper), axis=1
)

nonreporting_units["margin_gap"] = (
nonreporting_units.results_normalized_margin - nonreporting_units.results_normalized_margin_pres
)

nonreporting_units["pred_normalized_margin_new"] = (
nonreporting_units.pred_normalized_margin_pres + nonreporting_units.margin_gap
)
adjustment = (
nonreporting_units["pred_normalized_margin_new"].values
- nonreporting_units["pred_normalized_margin"].values
)
y_test_pred_B[~np.isnan(adjustment)] += adjustment[~np.isnan(adjustment)].reshape(-1, 1)

y_test_pred_B = y_test_pred_B.clip(min=y_partial_reporting_lower, max=y_partial_reporting_upper)

# \tilde{y_i}^{b} * \tilde{z_i}^{b}
Expand Down
Loading