Skip to content

Commit

Permalink
Modify parse-xml-cvrs to support ask from Clark County (#2079)
Browse files Browse the repository at this point in the history
* update parse hart cvrs script

* script updates

* extract function to be reusable

* update script to call hart fn not ess
  • Loading branch information
carolinemodic authored Dec 2, 2024
1 parent d9fe3f8 commit 6434e12
Show file tree
Hide file tree
Showing 4 changed files with 221 additions and 142 deletions.
125 changes: 101 additions & 24 deletions scripts/parse-xml-cvrs.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,13 @@
from xml.etree import ElementTree
from collections import defaultdict

from server.api.cvrs import parse_scanned_ballot_information_file

# This script parses Hart CVRs and outputs a CSV file similar to the Dominion format.
# Run with:
# FLASK_ENV=development poetry run python -m scripts.parse-xml-cvrs <path/to/hart/cvrs> <output-file.csv> [--include-votes-cast-per-contest] [--cvrs-include-scanned-ballot-info] [--cvrs-exported-by-tabulator]
##

# Annoyingly, ElementTree requires that you specify the namespace in all tag
# searches, so we make some wrapper functions
ns = "http://tempuri.org/CVRDesign.xsd"
Expand All @@ -18,13 +25,32 @@ def findall(xml, tag):
return xml.findall(tag, namespaces={"": ns})


NUM_CAST = "# Number of Votes Cast in Contest"


def get_directory_name(file_path):
    """Return the name of the directory that directly contains ``file_path``.

    Only the last component of the containing directory's path is returned,
    not the full path to that directory.
    """
    # The head of the first split is the containing directory's path; the
    # tail of the second split is that directory's own name.
    containing_dir, _ = os.path.split(file_path)
    _, dir_name = os.path.split(containing_dir)
    return dir_name


def parse_cvr_file(file_path, use_directory_name_as_tabulator=False):
def parse_scanned_ballot_file(file_path, cvr_workstation_mapping):
    """Merge one scanned ballot information file into ``cvr_workstation_mapping``.

    The file at ``file_path`` is opened in binary mode and handed to
    ``parse_scanned_ballot_information_file``. Every row it yields adds (or
    overwrites) an entry keyed by the row's "CvrId" whose value is the list
    ``[Workstation, UniqueIdentifier]``.

    The mapping is updated in place and is also returned.
    """
    with open(file_path, "rb") as scanned_ballot_info:
        ballot_rows = parse_scanned_ballot_information_file(scanned_ballot_info)
        # Consume the rows while the file is still open, in case the parser
        # reads lazily from the handle.
        cvr_workstation_mapping.update(
            (
                ballot_row["CvrId"],
                [ballot_row["Workstation"], ballot_row["UniqueIdentifier"]],
            )
            for ballot_row in ballot_rows
        )
    return cvr_workstation_mapping


def parse_cvr_file(
file_path,
use_directory_name_as_tabulator=False,
include_votes_cast_per_contest=False,
):
xml = ElementTree.parse(file_path).getroot()
assert xml.tag == f"{{{ns}}}Cvr"

Expand All @@ -33,14 +59,18 @@ def parse_cvr_file(file_path, use_directory_name_as_tabulator=False):
"BatchNumber": find(xml, "BatchNumber").text,
"BatchSequence": find(xml, "BatchSequence").text,
"SheetNumber": find(xml, "SheetNumber").text,
"PrecinctSplit": find(find(xml, "PrecinctSplit"), "Name").text,
"PrecinctSplitName": find(find(xml, "PrecinctSplit"), "Name").text,
"PrecinctSplitId": find(find(xml, "PrecinctSplit"), "Id").text,
# { contest: { choice: vote }}
"Contests": defaultdict(dict),
}

for contest in findall(find(xml, "Contests"), "Contest"):
contest_name = find(contest, "Name").text
choices = findall(find(contest, "Options"), "Option")
num_votes_made_in_contest = len(choices)
if include_votes_cast_per_contest:
cvr["Contests"][contest_name][NUM_CAST] = num_votes_made_in_contest
for choice in choices:
if find(choice, "WriteInData"):
choice_name = "WRITE-IN"
Expand All @@ -56,34 +86,55 @@ def parse_cvr_file(file_path, use_directory_name_as_tabulator=False):


if __name__ == "__main__":
if not (
len(sys.argv) == 3
or (len(sys.argv) == 4 and sys.argv[1] == "--cvrs-exported-by-tabulator")
):
if len(sys.argv) < 3 or len(sys.argv) > 6:
print(
"Usage: python -m scripts.parse-xml-cvrs [--cvrs-exported-by-tabulator] <cvr_directory_path> <output_csv_path>",
"Usage: python -m scripts.parse-xml-cvrs <cvr_directory_path> <output_csv_path> [--cvrs-exported-by-tabulator] [--include-votes-cast-per-contest] [--cvrs-include-scanned-ballot-info]",
file=sys.stderr,
)
sys.exit(1)

cvr_directory_path = sys.argv[len(sys.argv) - 2]
output_csv_path = sys.argv[len(sys.argv) - 1]
cvrs_exported_by_tabulator = len(sys.argv) == 4
cvr_directory_path = sys.argv[1]
output_csv_path = sys.argv[2]
cvrs_exported_by_tabulator = False
include_votes_cast_per_contest = False
cvrs_include_scanned_ballot_info = False
for arg in sys.argv[3:]:
if arg == "--cvrs-exported-by-tabulator":
cvrs_exported_by_tabulator = True
if arg == "--cvrs-include-scanned-ballot-info":
cvrs_include_scanned_ballot_info = True
elif arg == "--include-votes-cast-per-contest":
include_votes_cast_per_contest = True
else:
print(f"Unknown argument: {arg}", file=sys.stderr)
print(
"Usage: python -m scripts.parse-xml-cvrs <cvr_directory_path> <output_csv_path> [--cvrs-exported-by-tabulator] [--include-votes-cast-per-contest]",
file=sys.stderr,
)
sys.exit(1)

print("Finding CVR files...")

cvr_file_paths = []
if cvrs_exported_by_tabulator:
scanned_ballot_file_paths = []
if cvrs_exported_by_tabulator or cvrs_include_scanned_ballot_info:
for entry in os.scandir(cvr_directory_path):
if entry.is_dir():
for sub_entry in os.scandir(entry.path):
if sub_entry.is_file() and sub_entry.name.endswith(".xml"):
cvr_file_paths.append(sub_entry.path)
if (
cvrs_include_scanned_ballot_info
and entry.is_file()
and entry.name.endswith(".csv")
):
scanned_ballot_file_paths.append(entry.path)
else:
for entry in os.scandir(cvr_directory_path):
if entry.is_file() and entry.name.endswith(".xml"):
cvr_file_paths.append(entry.path)

print(f"Found {len(scanned_ballot_file_paths)} scanned ballot information files")
print(f"Found {len(cvr_file_paths)} CVR files")

cvrs = []
Expand All @@ -93,6 +144,7 @@ def parse_cvr_file(file_path, use_directory_name_as_tabulator=False):
cvr = parse_cvr_file(
cvr_file_path,
use_directory_name_as_tabulator=cvrs_exported_by_tabulator,
include_votes_cast_per_contest=include_votes_cast_per_contest,
)
except Exception as exc:
print(f"Error parsing file: {cvr_file_path}")
Expand All @@ -108,46 +160,71 @@ def parse_cvr_file(file_path, use_directory_name_as_tabulator=False):
if len(cvrs) % 1000 == 0:
print(f"Parsed {len(cvrs)} files")

print("Parsing ballot information files...")
cvr_workstation_mapping: dict = {}
for scanned_ballot_path in scanned_ballot_file_paths:
try:
cvr_workstation_mapping = parse_scanned_ballot_file(
scanned_ballot_path,
cvr_workstation_mapping,
)
except Exception as exc:
print(f"Error parsing file: {scanned_ballot_path}")
raise exc

print("Writing CSV...")

contest_choice_pairs = [
(contest_name, choice_name)
for contest_name, choices in contest_choices.items()
for choice_name in choices
]
contest_choice_pairs = []
for contest_name, choices in contest_choices.items():
contest_name_cleaned = contest_name.replace("\n", " ")
for choice_name in choices:
if choice_name != NUM_CAST:
contest_choice_pairs.append((contest_name, choice_name))
if include_votes_cast_per_contest:
contest_choice_pairs.append((contest_name, NUM_CAST))

with open(output_csv_path, "w", encoding="utf8") as output_file:

writer = csv.writer(output_file)
writer.writerow(["Election Name", "0.00.0.00"])

contest_headers = ["", "", "", "", ""] + [
f"{contest_name} (Vote For=1)" for contest_name, _ in contest_choice_pairs
contest_headers = ["", "", "", "", "", "", "", ""] + [
f"{contest_name}" for contest_name, _ in contest_choice_pairs
]
writer.writerow(contest_headers)

choice_headers = ["", "", "", "", ""] + [
choice_headers = ["", "", "", "", "", "", "", ""] + [
choice_name for _, choice_name in contest_choice_pairs
]
writer.writerow(choice_headers)

headers = [
"CvrNumber",
"TabulatorNum",
"BatchId",
"RecordId",
"ImprintedId",
"BatchNumber",
"BatchSequence",
"CvrId",
"PrecinctSplit Name",
"PrecinctSplit Id",
"Workstation",
"UniqueIdentifier",
] + ["NP" for _ in contest_choice_pairs]
writer.writerow(headers)

for i, cvr in enumerate(cvrs):
row = [
i,
cvr["Tabulator"] if cvrs_exported_by_tabulator else 1,
cvr["BatchNumber"],
cvr["BatchSequence"],
cvr["CvrGuid"],
cvr["PrecinctSplitName"],
cvr["PrecinctSplitId"],
]
if cvr["CvrGuid"] in cvr_workstation_mapping:
row.append(cvr_workstation_mapping[cvr["CvrGuid"]][0])
row.append(cvr_workstation_mapping[cvr["CvrGuid"]][1])
else:
row.append("")
row.append("")

# Fill in missing contest choices with 0s
for contest_name, choice_name in contest_choice_pairs:
Expand Down
4 changes: 2 additions & 2 deletions server/api/batch_inventory.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,14 +25,14 @@
get_support_user,
)
from .cvrs import (
column_value,
csv_reader_for_cvr,
get_header_indices,
read_ess_ballots_file,
separate_ess_cvr_and_ballots_files,
)
from ..models import * # pylint: disable=wildcard-import
from ..util.csv_parse import (
column_value,
get_header_indices,
validate_comma_delimited,
is_filetype_csv_mimetype,
)
Expand Down
Loading

0 comments on commit 6434e12

Please sign in to comment.