Skip to content

Commit

Permalink
improving ability to test sample sizes (#97)
Browse files Browse the repository at this point in the history
* Oneaudit (#92)

* WIP: improving Dominion support, progress towards ONEAudit integration

* WIP: Dominion tools

* Dominion "multiple card" fix, Testcase Regression Fixes (#91)

* This commit fixes two issues: the record_id was not always set, and the use_current flag was never used

* Fix earlier issue where only the first 'Card' in the Dominion output was parsed; add associated unit tests

---------

Co-authored-by: Philip B. Stark <[email protected]>

* WIP: ONEAudit integration. Dominion mvr retrieval improvements

* WIP: Dominion card numbering, construct pool_means only for pool==True CVRs

* WIP: working but sample size estimation needs to be improved

* ENH: ONEAudit seems to be working, including sample size estimation.

* ENH: improve documentation in ONEAudit notebook; minor bug fixes

---------

Co-authored-by: bsheehan-SF-RLA <[email protected]>

* Added missing parameter in supermajority assertion and changed np.infty (no longer supported) to np.inf (#93)

* Dominion (#94)

* BUG: change logic for parsing Dominion ranks, fix minor bugs in supermajority

* BUG: revise unit test for Dominion to match new mark processing

* ENH: logic for SF treatment of multiple ranks assigned to same candidate in IRV

* ENH: logic for SF treatment of multiple ranks assigned to same candidate in IRV

* ENH: logic for SF treatment of multiple ranks assigned to same candidate in IRV

* ENH: logic for SF treatment of multiple ranks assigned to same candidate in IRV

* Optimizations for Dominion.sample_from_cvrs and CVR.consistent_sampling (#95)

* Speed optimizations for Dominion.sample_from_cvrs() and CVR.consistent_sampling()

* Unit tests for optimizations in last commit; they should pass on both the original and the optimized code

---------

Co-authored-by: bsheehan-SF-RLA <[email protected]>
Co-authored-by: Alexander Ek <[email protected]>
  • Loading branch information
3 people authored Nov 28, 2024
1 parent e9ca527 commit 7962bce
Show file tree
Hide file tree
Showing 5 changed files with 103 additions and 96 deletions.
5 changes: 4 additions & 1 deletion examples/ONEAudit-demo.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,7 @@
"+ Read manual interpretations of the cards (MVRs)\n",
"+ Calculate attained risk for each assorter\n",
" - Use ~2EZ to deal with phantom CVRs or cards; the treatment depends on whether `use_style == True`\n",
" - If a sampled card cannot be found/retrieved, use the phantom-to-zombie transformation for it\n",
" - Use the pooled assorter means for cards in pooled batches\n",
"+ Report\n",
"+ Estimate incremental sample size if any assorter nulls have not been rejected\n",
Expand Down Expand Up @@ -1006,7 +1007,9 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"## Read the audited sample data"
"# Read the audited sample data.\n",
"\n",
"## Any ballot that cannot be retrieved should be marked as a \"zombie\" (treated in the least favorable way for every contest it might contain)."
]
},
{
Expand Down
21 changes: 9 additions & 12 deletions shangrla/core/Audit.py
Original file line number Diff line number Diff line change
Expand Up @@ -883,9 +883,8 @@ def consistent_sampling(
].sample_num
current_sizes[c] += 1
inx += 1
for i in range(len(cvr_list)):
if i in sampled_cvr_indices:
cvr_list[i].sampled = True
for i in sampled_cvr_indices:
cvr_list[i].sampled = True
return sampled_cvr_indices

@classmethod
Expand Down Expand Up @@ -1931,7 +1930,8 @@ def make_plurality_assertions(
@classmethod
def make_supermajority_assertion(
cls,
contest,
contest: object=None,
share_to_win: float = 1/2,
winner: str = None,
loser: list = None,
test: callable = None,
Expand Down Expand Up @@ -1962,12 +1962,12 @@ def make_supermajority_assertion(
-----------
contest:
contest object instance to which the assertion applies
share_to_win: float
fraction of the valid votes the winner must get to win
winner:
identifier of winning candidate
loser: list
list of identifiers of losing candidate(s)
share_to_win: float
fraction of the valid votes the winner must get to win
test: instance of NonnegMean
risk function for the contest
estim: an estimation method of NonnegMean
Expand Down Expand Up @@ -2218,7 +2218,7 @@ def set_all_margins_from_cvrs(
`assertion.contest.audit_type==Audit.AUDIT_TYPE.POLLING`
or `assertion.contest.audit_type in [Audit.AUDIT_TYPE.CARD_COMPARISON, Audit.AUDIT_TYPE.ONEAUDIT]`
"""
min_margin = np.infty
min_margin = np.inf
for c, con in contests.items():
con.margins = {}
for a, asn in con.assertions.items():
Expand Down Expand Up @@ -2526,7 +2526,7 @@ def overstatement(self, mvr, cvr, use_style=True):
)
# assort the CVR
cvr_assort = (
self.tally_pool_means[cvr.tally_pool]
self.tally_pool_means[cvr.tally_pool]
if
cvr.pool and self.tally_pool_means is not None
else
Expand Down Expand Up @@ -2737,8 +2737,7 @@ def check_cards(cls, contests: Collection["Contest"], cvrs: Collection["CVR"], f
if not force:
raise ValueError(f'{found} cards contain contest {c} but upper bound is {con.cards}')
else:
warnings.warn(f'{found} cards contain contest {c} but upper bound is {con.cards}')

warnings.warn(f'{found} cards contain contest {c} but upper bound is {con.cards}')
con.cards = max(con.cards, found) if force else con.cards


Expand All @@ -2748,8 +2747,6 @@ def tally(cls, con_dict: dict = None, cvr_list: "Collection[CVR]" = None, enforc
Tally the votes in the contests in con_dict from a collection of CVRs.
Only tallies plurality, multi-winner plurality, supermajority, and approval contests.
If
Parameters
----------
con_dict: dict
Expand Down
40 changes: 32 additions & 8 deletions shangrla/formats/Dominion.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,12 @@ def read_cvrs(
"Marks" as the container for votes
"Rank" as the rank
**WARNING**
Uses San Francisco's rules for validity of an IRV ballot; in particular, if a CVR gives
a candidate more than one rank, the vote is considered valid and the numerically smallest rank is used.
**WARNING This may break some scoring rules!**
Parameters:
-----------
cvr_file: string
Expand Down Expand Up @@ -154,9 +160,19 @@ def read_cvrs(
for con in _selector:
contest_votes = {}
for mark in con["Marks"]:
contest_votes[str(mark["CandidateId"])] = (
mark["Rank"] if (mark["IsVote"] or not enforce_rules) else 0
)
if mark["IsVote"] or not enforce_rules:
if str(mark["CandidateId"]) in contest_votes.keys():
# replace existing vote/rank if the new rank is lower but still a vote, not 0 or False
# This may break some scoring rules other than IRV, and might not be what local rules
# require. This logic branch matches San Francisco's rules.
if bool(mark["Rank"]):
contest_votes[str(mark["CandidateId"])] = (
min(int(contest_votes[str(mark["CandidateId"])]), int(mark["Rank"]))
if bool(contest_votes[str(mark["CandidateId"])])
else int(mark["Rank"])
)
else:
contest_votes[str(mark["CandidateId"])] = mark["Rank"]
votes[str(con["Id"])] = contest_votes
# If RecordId is obfuscated, extract it from the ImageMask
record_id = c["RecordId"]
Expand Down Expand Up @@ -486,18 +502,26 @@ def sample_from_cvrs(cls, cvr_list: list, manifest: list, sample: np.array):
sample_order = {}
cvr_sample = []
mvr_phantoms = []

# Convert the pandas DataFrame into a list of tuples to create a lookup table
_manifest = list(manifest.itertuples(index=False, name=None))
_columns = list(manifest.columns)
index = dict([(_, _columns.index(_)) for _ in _columns])
lookuptable = {}
for _m in _manifest:
_key = f'{_m[index["Tabulator Number"]]}-{_m[index["Batch Number"]]}'
_val = [_m[index["VBMCart.Cart number"]], _m[index["Tray #"]]]
lookuptable[_key] = _val

for i, s in enumerate(sample):
cvr_sample.append(cvr_list[s])
cvr_id = cvr_list[s].id
card_in_batch = cvr_list[s].card_in_batch
tab, batch, card_num = cvr_id.split("-")
card_id = f"{tab}-{batch}-{card_num}"
search_key = f"{tab}-{batch}"
if not cvr_list[s].phantom:
manifest_row = manifest[
(manifest["Tabulator Number"] == str(tab))
& (manifest["Batch Number"] == str(batch))
].iloc[0]
card = [manifest_row["VBMCart.Cart number"], manifest_row["Tray #"]] + [
card = lookuptable[search_key] + [
tab,
batch,
card_in_batch,
Expand Down
2 changes: 2 additions & 0 deletions tests/core/test_CVR.py
Original file line number Diff line number Diff line change
Expand Up @@ -268,6 +268,8 @@ def test_consistent_sampling(self):
con_tests = Contest.from_dict_of_dicts(contests)
sample_cvr_indices = CVR.consistent_sampling(cvrs, con_tests)
assert sample_cvr_indices == [0, 1, 2, 5]
sampled_cvr_by_ivar = [(cvr.id, cvr.sampled) for cvr in cvrs]
assert sampled_cvr_by_ivar == [("1", True), ("2", True), ("3", True), ("4", False), ("5", False), ("6", True)]
np.testing.assert_approx_equal(con_tests['city_council'].sample_threshold, 2)
np.testing.assert_approx_equal(con_tests['measure_1'].sample_threshold, 5)

Expand Down
131 changes: 56 additions & 75 deletions tests/formats/test_Dominion.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from pathlib import Path

from shangrla.formats.Dominion import Dominion
from shangrla.core.Audit import CVR

##########################################################################################

Expand Down Expand Up @@ -56,13 +57,13 @@ def test_read_cvrs_old_format_rules_enforced(self):
assert not cvr_1.pool, f"{cvr_1.pool=}"
assert list(cvr_1.votes.keys()) == ["111"]
# Reading Dominion CVRs now takes "IsVote" and "Modified" values into account,
# so {"6": 1, "1": 2} now becomes {"6": 1, "1": 0} (vote for candidate 1 in this
# so {"6": 1, "1": 2} now becomes {"6": 1} (vote for candidate 1 in this
# testcase is marked "IsVote": false). For the same reason, the call to
# get_vote_for("111", "1") is now 0.
assert cvr_1.votes["111"] == {"6": 1, "1": 0}
# get_vote_for("111", "1") is now False.
assert cvr_1.votes["111"] == {"6": 1}
assert cvr_1.get_vote_for("111", "6")
assert cvr_1.get_vote_for("111", "1") == 0
assert cvr_1.get_vote_for("111", "999") is False
assert not cvr_1.get_vote_for("111", "1")
assert not cvr_1.get_vote_for("111", "999")
assert cvr_2.id == "60009_3_21"
assert cvr_2.tally_pool == "60009_3"
assert not cvr_2.pool, f"{cvr_2.pool=}"
Expand Down Expand Up @@ -220,78 +221,58 @@ def test_sample_from_manifest(self):
assert cards[6] == [3, 3, 19, 3, 1, "19-3-1", 201]
assert len(mvr_phantoms) == 0

# def test_make_contest_dict(self):
# cvr_dir = Path("data/SF_CVR_Export_20240311150227")
# contest_manifest = cvr_dir / "ContestManifest.json"
# candidate_manifest = cvr_dir / "CandidateManifest.json"

# cvr_list = Dominion.read_cvrs_directory(
# cvr_dir, use_current=True, include_groups=(2,)
# )
def test_sample_from_cvrs(self):
"""
Test sampling from a list of CVRs using a manifest
"""

# c = make_contest_dict(
# cvr_list,
# contest_manifest,
# candidate_manifest,
# {},
# )
# Construct a list of CVRs and a corresponding manifest
cvr_list = []
mnf_list = []
tray = 1
batch = 100
cards_per_batch = 20
for tab in range(10, 13):
for batch in range(100, 105):
mnf_list.append(
{
"Tray #": tray,
"Tabulator Number": tab,
"Batch Number": batch,
"Total Ballots": cards_per_batch,
"VBMCart.Cart number": 1,
},
)
for card in range(1, cards_per_batch + 1):
id = f"{tab}-{batch}-{card}"
cvr_list.append(CVR(id=id, card_in_batch=card))
tray += 1

# assert c["8"]["name"] == "DEM CCC DISTRICT 17"
# assert c["8"]["risk_limit"] == pytest.approx(0.05)
# assert c["8"]["cards"] == 82019
# assert c["8"]["choice_function"] == Contest.SOCIAL_CHOICE_FUNCTION.PLURALITY
# assert c["8"]["n_winners"] == 14
# assert c["8"]["candidates"] == [
# "24",
# "25",
# "26",
# "27",
# "28",
# "29",
# "30",
# "31",
# "32",
# "33",
# "34",
# "35",
# "36",
# "37",
# "38",
# "39",
# "40",
# "41",
# "42",
# "43",
# "44",
# "45",
# "46",
# "47",
# "48",
# "49",
# "50",
# "51",
# "52",
# "53",
# "241",
# ]
# assert c["8"]["winner"] == [
# "49",
# "30",
# "27",
# "52",
# "36",
# "26",
# "32",
# "45",
# "28",
# "51",
# "39",
# "29",
# "44",
# "35",
# ]
# assert c["8"]["assertion_file"] is None
# assert c["8"]["audit_type"] == "CARD_COMPARISON"
manifest = pd.DataFrame.from_dict(mnf_list)
manifest["cum_cards"] = manifest["Total Ballots"].cumsum()
manifest, _, _ = Dominion.prep_manifest(manifest, 300, len(cvr_list))

# Draw the sample
sample = [2, 22, 25, 78, 151, 191, 196, 203, 233, 254]
cards, sample_order, cvr_sample, mvr_phantoms = Dominion.sample_from_cvrs(cvr_list, manifest, sample)
_cards = sorted([card[5] for card in cards])
_cvrs = sorted([cvr.id for cvr in cvr_sample])
_order = sorted(sample_order.keys())
assert len(mvr_phantoms) == 0
assert sorted(_cards) == sorted(_cvrs)
assert sorted(_order) == sorted(_cvrs)
assert sorted(set(_cvrs)) == [
'10-100-3',
'10-101-3',
'10-101-6',
'10-103-19',
'11-102-12',
'11-104-12',
'11-104-17',
'12-100-4',
'12-101-14',
'12-102-15',
]


##########################################################################################
Expand Down

0 comments on commit 7962bce

Please sign in to comment.