improving ability to test sample sizes (#97)

* Oneaudit (#92) * WIP: improving Dominion support, progress towards ONEAudit integration * WIP: Dominion tools * Dominion "multiple card" fix, Testcase Regression Fixes (#91) * This commit fixes issues with the record_id not always set, and the use_current flag never being used * Fix earlier issue where only the first 'Card' in the Dominion output was parsed, added associated unit tests --------- Co-authored-by: Philip B. Stark <[email protected]> * WIP: ONEAudit integration. Dominion mvr retrieval improvements * WIP: Dominion card numbering, construct pool_means only for pool==True CVRs * WIP: working but sample size estimation needs to be improved * ENH: ONEAudit seems to be working, including sample size estimation. * ENH: improve documentation in ONEAudit notebook; minor bug fixes --------- Co-authored-by: bsheehan-SF-RLA <[email protected]> * Added missing parameter in supermajority assertion and changed np.infty (no longer supported) to np.inf (#93) * Dominion (#94) * BUG: change logic for parsing Dominion ranks, fix minor bugs in supermajority * BUG: revise unit test for Dominion to match new mark processing * ENH: logic for SF treatment of multiple ranks assigned to same candidate in IRV * ENH: logic for SF treatment of multiple ranks assigned to same candidate in IRV * ENH: logic for SF treatment of multiple ranks assigned to same candidate in IRV * ENH: logic for SF treatment of multiple ranks assigned to same candidate in IRV * Optimizations for Dominion.sample_from_cvrs and CVR.consistent_sampling (#95) * Speed optimizations for Dominion.sample_from_cvrs() and CVR.consistent_sampling() * Unit tests for optimizations in last commit, should pass on original and optimized code --------- Co-authored-by: bsheehan-SF-RLA <[email protected]> Co-authored-by: Alexander Ek <[email protected]>
pbstark · Nov 28, 2024 · 7962bce · 7962bce
1 parent e9ca527
commit 7962bce
Show file tree

Hide file tree

Showing 5 changed files with 103 additions and 96 deletions.
diff --git a/examples/ONEAudit-demo.ipynb b/examples/ONEAudit-demo.ipynb
@@ -90,6 +90,7 @@
     "+ Read manual interpretations of the cards (MVRs)\n",
     "+ Calculate attained risk for each assorter\n",
     "    - Use ~2EZ to deal with phantom CVRs or cards; the treatment depends on whether `use_style == True`\n",
+    "    - If a sampled card cannot be found/retrieved, use the phantom-to-zombie transformation for it\n",
     "    - Use the pooled assorter means for cards in pooled batches\n",
     "+ Report\n",
     "+ Estimate incremental sample size if any assorter nulls have not been rejected\n",
@@ -1006,7 +1007,9 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "## Read the audited sample data"
+    "# Read the audited sample data.\n",
+    "\n",
+    "## Any ballot that cannot be retrieved should be marked as a \"zombie\" (treated in the least favorable way for every contest it might contain)."
    ]
   },
   {

diff --git a/shangrla/core/Audit.py b/shangrla/core/Audit.py
@@ -883,9 +883,8 @@ def consistent_sampling(
                         ].sample_num
                         current_sizes[c] += 1
             inx += 1
-        for i in range(len(cvr_list)):
-            if i in sampled_cvr_indices:
-                cvr_list[i].sampled = True
+        for i in sampled_cvr_indices:
+            cvr_list[i].sampled = True
         return sampled_cvr_indices
 
     @classmethod
@@ -1931,7 +1930,8 @@ def make_plurality_assertions(
     @classmethod
     def make_supermajority_assertion(
         cls,
-        contest,
+        contest: object=None,
+        share_to_win: float = 1/2,
         winner: str = None,
         loser: list = None,
         test: callable = None,
@@ -1962,12 +1962,12 @@ def make_supermajority_assertion(
         -----------
         contest:
             contest object instance to which the assertion applies
+        share_to_win: float
+            fraction of the valid votes the winner must get to win
         winner:
             identifier of winning candidate
         loser: list
             list of identifiers of losing candidate(s)
-        share_to_win: float
-            fraction of the valid votes the winner must get to win
         test: instance of NonnegMean
             risk function for the contest
         estim: an estimation method of NonnegMean
@@ -2218,7 +2218,7 @@ def set_all_margins_from_cvrs(
            `assertion.contest.audit_type==Audit.AUDIT_TYPE.POLLING`
            or `assertion.contest.audit_type in [Audit.AUDIT_TYPE.CARD_COMPARISON, Audit.AUDIT_TYPE.ONEAUDIT]`
         """
-        min_margin = np.infty
+        min_margin = np.inf
         for c, con in contests.items():
             con.margins = {}
             for a, asn in con.assertions.items():
@@ -2526,7 +2526,7 @@ def overstatement(self, mvr, cvr, use_style=True):
         )
         # assort the CVR
         cvr_assort = (
-            self.tally_pool_means[cvr.tally_pool] 
+            self.tally_pool_means[cvr.tally_pool]
             if 
                 cvr.pool and self.tally_pool_means is not None
             else 
@@ -2737,8 +2737,7 @@ def check_cards(cls, contests: Collection["Contest"], cvrs: Collection["CVR"], f
                 if not force:
                     raise ValueError(f'{found} cards contain contest {c} but upper bound is {con.cards}')
                 else:
-                    warnings.warn(f'{found} cards contain contest {c} but upper bound is {con.cards}')
-
+                    warnings.warn(f'{found} cards contain contest {c} but upper bound is {con.cards}')                                    
             con.cards = max(con.cards, found) if force else con.cards
 
 
@@ -2748,8 +2747,6 @@ def tally(cls, con_dict: dict = None, cvr_list: "Collection[CVR]" = None, enforc
         Tally the votes in the contests in con_dict from a collection of CVRs.
         Only tallies plurality, multi-winner plurality, supermajority, and approval contests.
 
-        If 
-
         Parameters
         ----------
         con_dict: dict

diff --git a/shangrla/formats/Dominion.py b/shangrla/formats/Dominion.py
@@ -97,6 +97,12 @@ def read_cvrs(
            "Marks" as the container for votes
            "Rank" as the rank
 
+        **WARNING**
+        Uses San Francisco's rules for validity of an IRV ballot; in particular, if a CVR gives
+        a candidate more than one rank, the vote is considered valid and the lowest rank is used.
+        
+        **WARNING This may break some scoring rules!**
+
         Parameters:
         -----------
         cvr_file: string
@@ -154,9 +160,19 @@ def read_cvrs(
                 for con in _selector:
                     contest_votes = {}
                     for mark in con["Marks"]:
-                        contest_votes[str(mark["CandidateId"])] = (
-                            mark["Rank"] if (mark["IsVote"] or not enforce_rules) else 0
-                        )
+                        if mark["IsVote"] or not enforce_rules:
+                            if str(mark["CandidateId"]) in contest_votes.keys():
+                            # replace existing vote/rank if the new rank is lower but still a vote, not 0 or False
+                            # This may break some scoring rules other than IRV, and might not be what local rules
+                            # require. This logic branch matches San Francisco's rules.
+                                if bool(mark["Rank"]):
+                                    contest_votes[str(mark["CandidateId"])] = (
+                                        min(int(contest_votes[str(mark["CandidateId"])]), int(mark["Rank"]))
+                                        if bool(contest_votes[str(mark["CandidateId"])])
+                                        else int(mark["Rank"])
+                                    )
+                            else:
+                                contest_votes[str(mark["CandidateId"])] = mark["Rank"]
                     votes[str(con["Id"])] = contest_votes
             # If RecordId is obfuscated, extract it from the ImageMask
             record_id = c["RecordId"]
@@ -486,18 +502,26 @@ def sample_from_cvrs(cls, cvr_list: list, manifest: list, sample: np.array):
         sample_order = {}
         cvr_sample = []
         mvr_phantoms = []
+
+        # Convert the pandas DataFrame into a list of tuples to create a lookup table
+        _manifest = list(manifest.itertuples(index=False, name=None))
+        _columns = list(manifest.columns)
+        index = dict([(_, _columns.index(_)) for _ in _columns])
+        lookuptable = {}
+        for _m in _manifest:
+            _key = f'{_m[index["Tabulator Number"]]}-{_m[index["Batch Number"]]}'
+            _val = [_m[index["VBMCart.Cart number"]], _m[index["Tray #"]]]
+            lookuptable[_key] = _val
+
         for i, s in enumerate(sample):
             cvr_sample.append(cvr_list[s])
             cvr_id = cvr_list[s].id
             card_in_batch = cvr_list[s].card_in_batch
             tab, batch, card_num = cvr_id.split("-")
             card_id = f"{tab}-{batch}-{card_num}"
+            search_key = f"{tab}-{batch}"
             if not cvr_list[s].phantom:
-                manifest_row = manifest[
-                    (manifest["Tabulator Number"] == str(tab))
-                    & (manifest["Batch Number"] == str(batch))
-                ].iloc[0]
-                card = [manifest_row["VBMCart.Cart number"], manifest_row["Tray #"]] + [
+                card = lookuptable[search_key] + [
                     tab,
                     batch,
                     card_in_batch,

diff --git a/tests/core/test_CVR.py b/tests/core/test_CVR.py
@@ -268,6 +268,8 @@ def test_consistent_sampling(self):
         con_tests = Contest.from_dict_of_dicts(contests)
         sample_cvr_indices = CVR.consistent_sampling(cvrs, con_tests)
         assert sample_cvr_indices == [0, 1, 2, 5]
+        sampled_cvr_by_ivar = [(cvr.id, cvr.sampled) for cvr in cvrs]
+        assert sampled_cvr_by_ivar == [("1", True), ("2", True), ("3", True), ("4", False), ("5", False), ("6", True)]
         np.testing.assert_approx_equal(con_tests['city_council'].sample_threshold, 2)
         np.testing.assert_approx_equal(con_tests['measure_1'].sample_threshold, 5)
 

diff --git a/tests/formats/test_Dominion.py b/tests/formats/test_Dominion.py
@@ -4,6 +4,7 @@
 from pathlib import Path
 
 from shangrla.formats.Dominion import Dominion
+from shangrla.core.Audit import CVR
 
 ##########################################################################################
 
@@ -56,13 +57,13 @@ def test_read_cvrs_old_format_rules_enforced(self):
         assert not cvr_1.pool, f"{cvr_1.pool=}"
         assert list(cvr_1.votes.keys()) == ["111"]
         # Reading Dominion CVRs now takes "IsVote" and "Modified" values into account,
-        # so {"6": 1, "1": 2} now becomes {"6": 1, "1": 0} (vote for candidate 1 in this
+        # so {"6": 1, "1": 2} now becomes {"6": 1} (vote for candidate 1 in this
         # testcase is marked "IsVote": false).  For the same reason, the call to
-        # get_vote_for("111", "1") is now 0.
-        assert cvr_1.votes["111"] == {"6": 1, "1": 0}
+        # get_vote_for("111", "1") is now False.
+        assert cvr_1.votes["111"] == {"6": 1}
         assert cvr_1.get_vote_for("111", "6")
-        assert cvr_1.get_vote_for("111", "1") == 0
-        assert cvr_1.get_vote_for("111", "999") is False
+        assert not cvr_1.get_vote_for("111", "1")
+        assert not cvr_1.get_vote_for("111", "999")
         assert cvr_2.id == "60009_3_21"
         assert cvr_2.tally_pool == "60009_3"
         assert not cvr_2.pool, f"{cvr_2.pool=}"
@@ -220,78 +221,58 @@ def test_sample_from_manifest(self):
         assert cards[6] == [3, 3, 19, 3, 1, "19-3-1", 201]
         assert len(mvr_phantoms) == 0
 
-    # def test_make_contest_dict(self):
-    #     cvr_dir = Path("data/SF_CVR_Export_20240311150227")
-    #     contest_manifest = cvr_dir / "ContestManifest.json"
-    #     candidate_manifest = cvr_dir / "CandidateManifest.json"
-
-    #     cvr_list = Dominion.read_cvrs_directory(
-    #         cvr_dir, use_current=True, include_groups=(2,)
-    #     )
+    def test_sample_from_cvrs(self):
+        """
+        Test sampling from a list of CVRs using a manifest
+        """
 
-    #     c = make_contest_dict(
-    #         cvr_list,
-    #         contest_manifest,
-    #         candidate_manifest,
-    #         {},
-    #     )
+        # Construct a list of CVRs and a corresponding manifest
+        cvr_list = []
+        mnf_list = []
+        tray = 1
+        batch = 100
+        cards_per_batch = 20
+        for tab in range(10, 13):
+            for batch in range(100, 105):
+                mnf_list.append(
+                    {
+                        "Tray #": tray,
+                        "Tabulator Number": tab,
+                        "Batch Number": batch,
+                        "Total Ballots": cards_per_batch,
+                        "VBMCart.Cart number": 1,
+                    },                    
+                )
+                for card in range(1, cards_per_batch + 1):
+                    id = f"{tab}-{batch}-{card}"
+                    cvr_list.append(CVR(id=id, card_in_batch=card))
+            tray += 1
 
-    #     assert c["8"]["name"] == "DEM CCC DISTRICT 17"
-    #     assert c["8"]["risk_limit"] == pytest.approx(0.05)
-    #     assert c["8"]["cards"] == 82019
-    #     assert c["8"]["choice_function"] == Contest.SOCIAL_CHOICE_FUNCTION.PLURALITY
-    #     assert c["8"]["n_winners"] == 14
-    #     assert c["8"]["candidates"] == [
-    #         "24",
-    #         "25",
-    #         "26",
-    #         "27",
-    #         "28",
-    #         "29",
-    #         "30",
-    #         "31",
-    #         "32",
-    #         "33",
-    #         "34",
-    #         "35",
-    #         "36",
-    #         "37",
-    #         "38",
-    #         "39",
-    #         "40",
-    #         "41",
-    #         "42",
-    #         "43",
-    #         "44",
-    #         "45",
-    #         "46",
-    #         "47",
-    #         "48",
-    #         "49",
-    #         "50",
-    #         "51",
-    #         "52",
-    #         "53",
-    #         "241",
-    #     ]
-    #     assert c["8"]["winner"] == [
-    #         "49",
-    #         "30",
-    #         "27",
-    #         "52",
-    #         "36",
-    #         "26",
-    #         "32",
-    #         "45",
-    #         "28",
-    #         "51",
-    #         "39",
-    #         "29",
-    #         "44",
-    #         "35",
-    #     ]
-    #     assert c["8"]["assertion_file"] is None
-    #     assert c["8"]["audit_type"] == "CARD_COMPARISON"
+        manifest = pd.DataFrame.from_dict(mnf_list)
+        manifest["cum_cards"] = manifest["Total Ballots"].cumsum()
+        manifest, _, _ = Dominion.prep_manifest(manifest, 300, len(cvr_list))
+
+        # Draw the sample
+        sample = [2, 22, 25, 78, 151, 191, 196, 203, 233, 254]
+        cards, sample_order, cvr_sample, mvr_phantoms = Dominion.sample_from_cvrs(cvr_list, manifest, sample)
+        _cards = sorted([card[5] for card in cards])
+        _cvrs = sorted([cvr.id for cvr in cvr_sample])
+        _order = sorted(sample_order.keys())
+        assert len(mvr_phantoms) == 0
+        assert sorted(_cards) == sorted(_cvrs)
+        assert sorted(_order) == sorted(_cvrs)
+        assert sorted(set(_cvrs)) == [
+            '10-100-3',
+            '10-101-3',
+            '10-101-6',
+            '10-103-19',
+            '11-102-12',
+            '11-104-12',
+            '11-104-17',
+            '12-100-4',
+            '12-101-14',
+            '12-102-15',
+        ]
 
 
 ##########################################################################################