Modify parse-xml-cvrs to support ask from Clark County (#2079)

* update parse hart cvrs script * script updates * extract function to be reusable * update script to call hart fn not ess
votingworks · Dec 2, 2024 · 6434e12 · 6434e12
1 parent d9fe3f8
commit 6434e12
Show file tree

Hide file tree

Showing 4 changed files with 221 additions and 142 deletions.
diff --git a/scripts/parse-xml-cvrs.py b/scripts/parse-xml-cvrs.py
@@ -5,6 +5,13 @@
 from xml.etree import ElementTree
 from collections import defaultdict
 
+from server.api.cvrs import parse_scanned_ballot_information_file
+
+# This script parses Hart CVRs and outputs a CSV file similar to the Dominion format.
+# Run with:
+# FLASK_ENV=development poetry run python -m scripts.parse-xml-cvrs <path/to/hart/cvrs> <output-file.csv> [--include-votes-cast-per-contest] [--cvrs-include-scanned-ballot-info] [--cvrs-exported-by-tabulator]
+##
+
 # Annoyingly, ElementTree requires that you specify the namespace in all tag
 # searches, so we make some wrapper functions
 ns = "http://tempuri.org/CVRDesign.xsd"
@@ -18,13 +25,32 @@ def findall(xml, tag):
     return xml.findall(tag, namespaces={"": ns})
 
 
+NUM_CAST = "# Number of Votes Cast in Contest"
+
+
 def get_directory_name(file_path):
     """Return the name of the directory that directly contains file_path."""
     return os.path.basename(os.path.dirname(file_path))
 
 
-def parse_cvr_file(file_path, use_directory_name_as_tabulator=False):
+def parse_scanned_ballot_file(file_path, cvr_workstation_mapping):
+    """Record the workstation and unique identifier of each scanned ballot.
+
+    Opens file_path in binary mode, parses it with
+    parse_scanned_ballot_information_file, and stores
+    [Workstation, UniqueIdentifier] under each row's CvrId in
+    cvr_workstation_mapping. The mapping is updated in place and is also
+    returned.
+    """
+    with open(file_path, "rb") as scanned_ballots:
+        for ballot in parse_scanned_ballot_information_file(scanned_ballots):
+            cvr_id = ballot["CvrId"]
+            station = ballot["Workstation"]
+            identifier = ballot["UniqueIdentifier"]
+            cvr_workstation_mapping[cvr_id] = [station, identifier]
+        return cvr_workstation_mapping
+
+
+def parse_cvr_file(
+    file_path,
+    use_directory_name_as_tabulator=False,
+    include_votes_cast_per_contest=False,
+):
     xml = ElementTree.parse(file_path).getroot()
     assert xml.tag == f"{{{ns}}}Cvr"
 
@@ -33,14 +59,18 @@ def parse_cvr_file(file_path, use_directory_name_as_tabulator=False):
         "BatchNumber": find(xml, "BatchNumber").text,
         "BatchSequence": find(xml, "BatchSequence").text,
         "SheetNumber": find(xml, "SheetNumber").text,
-        "PrecinctSplit": find(find(xml, "PrecinctSplit"), "Name").text,
+        "PrecinctSplitName": find(find(xml, "PrecinctSplit"), "Name").text,
+        "PrecinctSplitId": find(find(xml, "PrecinctSplit"), "Id").text,
         # { contest: { choice: vote }}
         "Contests": defaultdict(dict),
     }
 
     for contest in findall(find(xml, "Contests"), "Contest"):
         contest_name = find(contest, "Name").text
         choices = findall(find(contest, "Options"), "Option")
+        num_votes_made_in_contest = len(choices)
+        if include_votes_cast_per_contest:
+            cvr["Contests"][contest_name][NUM_CAST] = num_votes_made_in_contest
         for choice in choices:
             if find(choice, "WriteInData"):
                 choice_name = "WRITE-IN"
@@ -56,34 +86,55 @@ def parse_cvr_file(file_path, use_directory_name_as_tabulator=False):
 
 
 if __name__ == "__main__":
-    if not (
-        len(sys.argv) == 3
-        or (len(sys.argv) == 4 and sys.argv[1] == "--cvrs-exported-by-tabulator")
-    ):
+    if len(sys.argv) < 3 or len(sys.argv) > 6:
         print(
-            "Usage: python -m scripts.parse-xml-cvrs [--cvrs-exported-by-tabulator] <cvr_directory_path> <output_csv_path>",
+            "Usage: python -m scripts.parse-xml-cvrs <cvr_directory_path> <output_csv_path> [--cvrs-exported-by-tabulator] [--include-votes-cast-per-contest] [--cvrs-include-scanned-ballot-info]",
             file=sys.stderr,
         )
         sys.exit(1)
 
-    cvr_directory_path = sys.argv[len(sys.argv) - 2]
-    output_csv_path = sys.argv[len(sys.argv) - 1]
-    cvrs_exported_by_tabulator = len(sys.argv) == 4
+    cvr_directory_path = sys.argv[1]
+    output_csv_path = sys.argv[2]
+    cvrs_exported_by_tabulator = False
+    include_votes_cast_per_contest = False
+    cvrs_include_scanned_ballot_info = False
+    # Parse the optional flags that follow the two positional paths. All
+    # recognized flags must be branches of a single if/elif chain: with a
+    # separate leading `if`, --cvrs-exported-by-tabulator would set its flag
+    # and then fall through to the "Unknown argument" branch and exit.
+    for arg in sys.argv[3:]:
+        if arg == "--cvrs-exported-by-tabulator":
+            cvrs_exported_by_tabulator = True
+        elif arg == "--cvrs-include-scanned-ballot-info":
+            cvrs_include_scanned_ballot_info = True
+        elif arg == "--include-votes-cast-per-contest":
+            include_votes_cast_per_contest = True
+        else:
+            # Unrecognized flag: report it, show the full usage, and abort.
+            print(f"Unknown argument: {arg}", file=sys.stderr)
+            print(
+                "Usage: python -m scripts.parse-xml-cvrs <cvr_directory_path> <output_csv_path> [--cvrs-exported-by-tabulator] [--include-votes-cast-per-contest] [--cvrs-include-scanned-ballot-info]",
+                file=sys.stderr,
+            )
+            sys.exit(1)
 
     print("Finding CVR files...")
 
     cvr_file_paths = []
-    if cvrs_exported_by_tabulator:
+    scanned_ballot_file_paths = []
+    if cvrs_exported_by_tabulator or cvrs_include_scanned_ballot_info:
         for entry in os.scandir(cvr_directory_path):
             if entry.is_dir():
                 for sub_entry in os.scandir(entry.path):
                     if sub_entry.is_file() and sub_entry.name.endswith(".xml"):
                         cvr_file_paths.append(sub_entry.path)
+            if (
+                cvrs_include_scanned_ballot_info
+                and entry.is_file()
+                and entry.name.endswith(".csv")
+            ):
+                scanned_ballot_file_paths.append(entry.path)
     else:
         for entry in os.scandir(cvr_directory_path):
             if entry.is_file() and entry.name.endswith(".xml"):
                 cvr_file_paths.append(entry.path)
 
+    print(f"Found {len(scanned_ballot_file_paths)} scanned ballot information files")
     print(f"Found {len(cvr_file_paths)} CVR files")
 
     cvrs = []
@@ -93,6 +144,7 @@ def parse_cvr_file(file_path, use_directory_name_as_tabulator=False):
             cvr = parse_cvr_file(
                 cvr_file_path,
                 use_directory_name_as_tabulator=cvrs_exported_by_tabulator,
+                include_votes_cast_per_contest=include_votes_cast_per_contest,
             )
         except Exception as exc:
             print(f"Error parsing file: {cvr_file_path}")
@@ -108,46 +160,71 @@ def parse_cvr_file(file_path, use_directory_name_as_tabulator=False):
         if len(cvrs) % 1000 == 0:
             print(f"Parsed {len(cvrs)} files")
 
+    print("Parsing ballot information files...")
+    cvr_workstation_mapping: dict = {}
+    for scanned_ballot_path in scanned_ballot_file_paths:
+        try:
+            cvr_workstation_mapping = parse_scanned_ballot_file(
+                scanned_ballot_path,
+                cvr_workstation_mapping,
+            )
+        except Exception as exc:
+            print(f"Error parsing file: {scanned_ballot_path}")
+            raise exc
+
     print("Writing CSV...")
 
-    contest_choice_pairs = [
-        (contest_name, choice_name)
-        for contest_name, choices in contest_choices.items()
-        for choice_name in choices
-    ]
+    # Build the ordered list of (contest, choice) pairs; the header rows
+    # written below emit one column per pair.
+    contest_choice_pairs = []
+    for contest_name, choices in contest_choices.items():
+        # NOTE(review): contest_name_cleaned is assigned but never read in the
+        # visible code; presumably it was meant for the contest header text.
+        # Confirm the intent, then either use it there or remove it.
+        contest_name_cleaned = contest_name.replace("\n", " ")
+        for choice_name in choices:
+            # NUM_CAST is skipped here and appended after the loop so that it
+            # follows the contest's other choices.
+            if choice_name != NUM_CAST:
+                contest_choice_pairs.append((contest_name, choice_name))
+        if include_votes_cast_per_contest:
+            contest_choice_pairs.append((contest_name, NUM_CAST))
 
     with open(output_csv_path, "w", encoding="utf8") as output_file:
 
         writer = csv.writer(output_file)
         writer.writerow(["Election Name", "0.00.0.00"])
 
-        contest_headers = ["", "", "", "", ""] + [
-            f"{contest_name} (Vote For=1)" for contest_name, _ in contest_choice_pairs
+        contest_headers = ["", "", "", "", "", "", "", ""] + [
+            f"{contest_name}" for contest_name, _ in contest_choice_pairs
         ]
         writer.writerow(contest_headers)
 
-        choice_headers = ["", "", "", "", ""] + [
+        choice_headers = ["", "", "", "", "", "", "", ""] + [
             choice_name for _, choice_name in contest_choice_pairs
         ]
         writer.writerow(choice_headers)
 
         headers = [
             "CvrNumber",
-            "TabulatorNum",
-            "BatchId",
-            "RecordId",
-            "ImprintedId",
+            "BatchNumber",
+            "BatchSequence",
+            "CvrId",
+            "PrecinctSplit Name",
+            "PrecinctSplit Id",
+            "Workstation",
+            "UniqueIdentifier",
         ] + ["NP" for _ in contest_choice_pairs]
         writer.writerow(headers)
 
         for i, cvr in enumerate(cvrs):
             row = [
                 i,
-                cvr["Tabulator"] if cvrs_exported_by_tabulator else 1,
                 cvr["BatchNumber"],
                 cvr["BatchSequence"],
                 cvr["CvrGuid"],
+                cvr["PrecinctSplitName"],
+                cvr["PrecinctSplitId"],
             ]
+            if cvr["CvrGuid"] in cvr_workstation_mapping:
+                row.append(cvr_workstation_mapping[cvr["CvrGuid"]][0])
+                row.append(cvr_workstation_mapping[cvr["CvrGuid"]][1])
+            else:
+                row.append("")
+                row.append("")
 
             # Fill in missing contest choices with 0s
             for contest_name, choice_name in contest_choice_pairs:

diff --git a/server/api/batch_inventory.py b/server/api/batch_inventory.py
@@ -25,14 +25,14 @@
     get_support_user,
 )
 from .cvrs import (
-    column_value,
     csv_reader_for_cvr,
-    get_header_indices,
     read_ess_ballots_file,
     separate_ess_cvr_and_ballots_files,
 )
 from ..models import *  # pylint: disable=wildcard-import
 from ..util.csv_parse import (
+    column_value,
+    get_header_indices,
     validate_comma_delimited,
     is_filetype_csv_mimetype,
 )