Skip to content

Commit

Permalink
Merge pull request #372 from rcpch:370-add-the-data-quality-report-to…
Browse files Browse the repository at this point in the history
…-the-csv-as-a-separate-sheet

370 add the data quality report to the csv as a separate sheet
  • Loading branch information
eatyourpeas authored Nov 15, 2024
2 parents d07bd4e + f9f6e49 commit bbe7dbf
Show file tree
Hide file tree
Showing 17 changed files with 343 additions and 171 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ __pycache__
/.cache
.vscode
*.code-workspace
.vscode

node_modules

Expand Down
2 changes: 0 additions & 2 deletions project/npda/general_functions/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,3 @@
from .csv_header import *
from .csv_download import *
from .email import *
from .group_for_group import *
from .index_multiple_deprivation import *
Expand Down
5 changes: 5 additions & 0 deletions project/npda/general_functions/csv/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
from .csv_download import *
from .csv_header import *
from .csv_parse import *
from .csv_summarize import *
from .csv_upload import *
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,11 @@
from django.http import HttpResponse
from django.shortcuts import get_object_or_404

def download_file(file_path, file_name):
    """
    Stream a stored file back to the client as a download attachment.

    Args:
        file_path: absolute filesystem path of the file to serve.
        file_name: name offered to the browser in Content-Disposition.

    Returns:
        HttpResponse with the file bytes and an attachment disposition.

    Raises:
        FileNotFoundError: if file_path does not exist.
    """
    import mimetypes

    # Guess the MIME type from the download name: this helper is reused by
    # download_xlsx, so hard-coding "text/csv" would mislabel .xlsx files.
    # Fall back to text/csv to preserve the original behaviour for CSVs
    # and unknown extensions.
    content_type, _ = mimetypes.guess_type(file_name)

    with open(file_path, "rb") as f:
        response = HttpResponse(f.read(), content_type=content_type or "text/csv")
    response["Content-Disposition"] = f'attachment; filename="{file_name}"'
    return response

def download_csv(request, submission_id):
"""
Expand All @@ -12,7 +17,16 @@ def download_csv(request, submission_id):
file_path = submission.csv_file.path
file_name = submission.csv_file.name.split("/")[-1]

with open(file_path, "rb") as f:
response = HttpResponse(f.read(), content_type="text/csv")
response["Content-Disposition"] = f'attachment; filename="{file_name}"'
return response
return download_file(file_path, file_name)

def download_xlsx(request, submission_id):
    """
    Download the XLSX report stored alongside a submission's CSV file.

    Derives the .xlsx path and download name from the submission's stored
    CSV path by swapping the file extension, then streams the file via
    download_file.

    Args:
        request: the incoming HttpRequest (unused beyond view signature).
        submission_id: primary key of the Submission to serve.

    Returns:
        HttpResponse streaming the .xlsx file as an attachment.

    Raises:
        Http404: if no Submission with that id exists.
    """
    Submission = apps.get_model(app_label="npda", model_name="Submission")
    submission = get_object_or_404(Submission, id=submission_id)

    def _swap_csv_suffix(value):
        # Replace only a trailing ".csv"; str.replace would also rewrite
        # a ".csv" occurring earlier in the path (e.g. a directory name).
        if value.endswith(".csv"):
            return value[: -len(".csv")] + ".xlsx"
        return value

    file_path = _swap_csv_suffix(submission.csv_file.path)
    file_name = _swap_csv_suffix(submission.csv_file.name.split("/")[-1])

    return download_file(file_path, file_name)
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import csv
import io
from ...constants.csv_headings import HEADINGS_LIST
from project.constants.csv_headings import HEADINGS_LIST


def csv_header():
Expand Down
141 changes: 141 additions & 0 deletions project/npda/general_functions/csv/csv_parse.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,141 @@
# python imports
from dataclasses import dataclass
import logging
import re

# Third-party imports
import pandas as pd
import numpy as np

# RCPCH imports
from project.constants import (
ALL_DATES,
CSV_DATA_TYPES_MINUS_DATES,
CSV_HEADINGS,
HEADINGS_LIST,
)

# Logging setup
logger = logging.getLogger(__name__)

@dataclass
class ParsedCSVFile:
    """
    Result of parsing an uploaded CSV: the dataframe plus per-column
    problems detected while reading and coercing it.
    """

    df: pd.DataFrame  # the parsed, dtype-coerced data
    missing_columns: list[str]  # expected headings absent from the file
    additional_columns: list[str]  # headings present but not in the official list
    duplicate_columns: list[str]  # base names of headings appearing more than once
    parse_type_error_columns: list[str]  # columns whose dtype coercion failed
def csv_parse(csv_file):
    """
    Read an uploaded NPDA CSV file into a pandas dataframe.

    Handles files both with and without a header row, normalises column
    names case-insensitively against the official HEADINGS_LIST, coerces
    date columns (dd/mm/yyyy) and typed columns to their expected dtypes,
    and records any per-column problems rather than failing.

    Args:
        csv_file: a seekable file-like object containing the CSV data.

    Returns:
        ParsedCSVFile: the dataframe plus lists of missing, additional,
        duplicate and type-error column names.

    Raises:
        ValueError: if the first data row appears to contain more values
        than there are columns (pandas turns the extras into row labels).
    """
    # It is possible the csv file has no header row. In this case, we will use the predefined column names
    # The predefined column names are in the HEADINGS_LIST constant and if cast to lowercase, in lowercase_headings_list
    # We will check if the first row of the csv file matches the predefined column names
    # If it does not, we will use the predefined column names
    # If it does, we will use the column names in the csv file

    # Convert the predefined column names to lowercase
    lowercase_headings_list = [heading.lower() for heading in HEADINGS_LIST]

    # Read the first row of the csv file
    df = pd.read_csv(csv_file)

    if any(col.lower() in lowercase_headings_list for col in df.columns):
        # The first row of the csv file matches at least some of the predefined column names
        # We will use the column names in the csv file
        pass
    else:
        # The first row of the csv file does not match the predefined column names
        # We will use the predefined column names
        # (seek back first - the stream was consumed by the read above)
        csv_file.seek(0)
        df = pd.read_csv(csv_file, header=None, names=HEADINGS_LIST)

    # Remove leading and trailing whitespace on column names
    # The template published on the RCPCH website has trailing spaces on 'Observation Date: Thyroid Function '
    df.columns = df.columns.str.strip()

    if df.columns[0].lower() not in lowercase_headings_list:
        # No header in the source - pass them from our definitions
        logger.warning(
            f"CSV file uploaded without column names, using predefined column names"
        )

        # Have to reset back otherwise we get an empty dataframe
        # NOTE(review): nothing re-reads csv_file after this seek in the
        # visible code, and the headerless case is already handled by the
        # earlier branch - looks like a leftover; confirm before removing.
        csv_file.seek(0)

    # Pandas has strange behaviour for the first line in a CSV - additional cells become row labels
    # https://github.com/pandas-dev/pandas/issues/47490
    #
    # As a heuristic for this, check the row label for the first row is the number 0
    # If it isn't - you've got too many values in the first row
    if not df.iloc[0].name == 0:
        raise ValueError(
            "Suspected too many values in the first row, please check there are no extra values"
        )

    # Accept columns case insensitively but replace them with their official version to make life easier later
    for column in df.columns:
        if not column in HEADINGS_LIST and column.lower() in lowercase_headings_list:
            normalised_column = next(
                c for c in HEADINGS_LIST if c.lower() == column.lower()
            )
            df = df.rename(columns={column: normalised_column})

    # Expected headings that never appeared in the file
    missing_columns = [column for column in HEADINGS_LIST if not column in df.columns]

    # Headings in the file that are not part of the official list
    additional_columns = [
        column for column in df.columns if not column in HEADINGS_LIST
    ]

    # Duplicate columns appear in the dataframe as XYZ.1, XYZ.2 etc
    duplicate_columns = []

    parse_type_error_columns = []

    for column in df.columns:
        # Strip pandas' ".<n>" de-duplication suffix to recover the base name
        result = re.match(r"([\w ]+)\.\d+$", column)

        if result and result.group(1) not in duplicate_columns:
            duplicate_columns.append(result.group(1))

    # Dates arrive as dd/mm/yyyy; unparseable values become NaT (errors="coerce")
    for column in ALL_DATES:
        if column in df.columns:
            df[column] = pd.to_datetime(df[column], format="%d/%m/%Y", errors="coerce")

    # Apply the dtype to non-date columns
    for column, dtype in CSV_DATA_TYPES_MINUS_DATES.items():
        try:
            if column in df.columns:
                df[column] = df[column].astype(dtype)
        except ValueError as e:
            # Record the failure and skip the NaN/rounding steps for this column
            parse_type_error_columns.append(column)
            continue
        # Convert NaN to None for nullable fields
        if column in df.columns:
            df[column] = df[column].where(pd.notnull(df[column]), None)
        # round height and weight if provided to 1 decimal place
        if (
            column
            in [
                "Patient Height (cm)",
                "Patient Weight (kg)",
                "Total Cholesterol Level (mmol/l)",
            ]
            and column in df.columns
        ):
            if df[column].dtype == np.float64:
                df[column] = df[column].round(1)
            else:
                # Measurement column did not end up as float64 - flag it
                parse_type_error_columns.append(column)

    return ParsedCSVFile(
        df,
        missing_columns,
        additional_columns,
        duplicate_columns,
        parse_type_error_columns,
    )
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
import pandas as pd

# RCPCH imports
from ...constants.csv_headings import (
from project.constants.csv_headings import (
ALL_DATES,
)

Expand Down
Original file line number Diff line number Diff line change
@@ -1,11 +1,8 @@
# python imports
from datetime import date
from dataclasses import dataclass
import logging
import asyncio
import collections
import re
from pprint import pprint

# django imports
from django.apps import apps
Expand All @@ -18,143 +15,15 @@
import httpx

# RCPCH imports
from ...constants import (
ALL_DATES,
CSV_DATA_TYPES_MINUS_DATES,
CSV_HEADINGS,
HEADINGS_LIST,
)
from project.npda.general_functions.write_errors_to_xlsx import write_errors_to_xlsx
from project.constants import CSV_HEADINGS

# Logging setup
logger = logging.getLogger(__name__)
from ..forms.patient_form import PatientForm
from ..forms.visit_form import VisitForm
from ..forms.external_patient_validators import validate_patient_async


@dataclass
class ParsedCSVFile:
    """
    Result of parsing an uploaded CSV: the dataframe plus per-column
    problems detected while reading and coercing it.
    """

    df: pd.DataFrame  # the parsed, dtype-coerced data
    missing_columns: list[str]  # expected headings absent from the file
    additional_columns: list[str]  # headings present but not in the official list
    duplicate_columns: list[str]  # base names of headings appearing more than once
    parse_type_error_columns: list[str]  # columns whose dtype coercion failed

def read_csv(csv_file):
    """
    Read an uploaded NPDA CSV file into a pandas dataframe.

    Handles files both with and without a header row, normalises column
    names case-insensitively against the official HEADINGS_LIST, coerces
    date columns (dd/mm/yyyy) and typed columns to their expected dtypes,
    and records any per-column problems rather than failing.

    Args:
        csv_file: a seekable file-like object containing the CSV data.

    Returns:
        ParsedCSVFile: the dataframe plus lists of missing, additional,
        duplicate and type-error column names.

    Raises:
        ValueError: if the first data row appears to contain more values
        than there are columns (pandas turns the extras into row labels).
    """
    # It is possible the csv file has no header row. In this case, we will use the predefined column names
    # The predefined column names are in the HEADINGS_LIST constant and if cast to lowercase, in lowercase_headings_list
    # We will check if the first row of the csv file matches the predefined column names
    # If it does not, we will use the predefined column names
    # If it does, we will use the column names in the csv file

    # Convert the predefined column names to lowercase
    lowercase_headings_list = [heading.lower() for heading in HEADINGS_LIST]

    # Read the first row of the csv file
    df = pd.read_csv(csv_file)

    if any(col.lower() in lowercase_headings_list for col in df.columns):
        # The first row of the csv file matches at least some of the predefined column names
        # We will use the column names in the csv file
        pass
    else:
        # The first row of the csv file does not match the predefined column names
        # We will use the predefined column names
        # (seek back first - the stream was consumed by the read above)
        csv_file.seek(0)
        df = pd.read_csv(csv_file, header=None, names=HEADINGS_LIST)

    # Remove leading and trailing whitespace on column names
    # The template published on the RCPCH website has trailing spaces on 'Observation Date: Thyroid Function '
    df.columns = df.columns.str.strip()

    if df.columns[0].lower() not in lowercase_headings_list:
        # No header in the source - pass them from our definitions
        logger.warning(
            f"CSV file uploaded without column names, using predefined column names"
        )

        # Have to reset back otherwise we get an empty dataframe
        # NOTE(review): nothing re-reads csv_file after this seek in the
        # visible code, and the headerless case is already handled by the
        # earlier branch - looks like a leftover; confirm before removing.
        csv_file.seek(0)

    # Pandas has strange behaviour for the first line in a CSV - additional cells become row labels
    # https://github.com/pandas-dev/pandas/issues/47490
    #
    # As a heuristic for this, check the row label for the first row is the number 0
    # If it isn't - you've got too many values in the first row
    if not df.iloc[0].name == 0:
        raise ValueError(
            "Suspected too many values in the first row, please check there are no extra values"
        )

    # Accept columns case insensitively but replace them with their official version to make life easier later
    for column in df.columns:
        if not column in HEADINGS_LIST and column.lower() in lowercase_headings_list:
            normalised_column = next(
                c for c in HEADINGS_LIST if c.lower() == column.lower()
            )
            df = df.rename(columns={column: normalised_column})

    # Expected headings that never appeared in the file
    missing_columns = [column for column in HEADINGS_LIST if not column in df.columns]

    # Headings in the file that are not part of the official list
    additional_columns = [
        column for column in df.columns if not column in HEADINGS_LIST
    ]

    # Duplicate columns appear in the dataframe as XYZ.1, XYZ.2 etc
    duplicate_columns = []

    parse_type_error_columns = []

    for column in df.columns:
        # Strip pandas' ".<n>" de-duplication suffix to recover the base name
        result = re.match(r"([\w ]+)\.\d+$", column)

        if result and result.group(1) not in duplicate_columns:
            duplicate_columns.append(result.group(1))

    # Dates arrive as dd/mm/yyyy; unparseable values become NaT (errors="coerce")
    for column in ALL_DATES:
        if column in df.columns:
            df[column] = pd.to_datetime(df[column], format="%d/%m/%Y", errors="coerce")

    # Apply the dtype to non-date columns
    for column, dtype in CSV_DATA_TYPES_MINUS_DATES.items():
        try:
            if column in df.columns:
                df[column] = df[column].astype(dtype)
        except ValueError as e:
            # Record the failure and skip the NaN/rounding steps for this column
            parse_type_error_columns.append(column)
            continue
        # Convert NaN to None for nullable fields
        if column in df.columns:
            df[column] = df[column].where(pd.notnull(df[column]), None)
        # round height and weight if provided to 1 decimal place
        if (
            column
            in [
                "Patient Height (cm)",
                "Patient Weight (kg)",
                "Total Cholesterol Level (mmol/l)",
            ]
            and column in df.columns
        ):
            if df[column].dtype == np.float64:
                df[column] = df[column].round(1)
            else:
                # Measurement column did not end up as float64 - flag it
                parse_type_error_columns.append(column)

    return ParsedCSVFile(
        df,
        missing_columns,
        additional_columns,
        duplicate_columns,
        parse_type_error_columns,
    )

from project.npda.forms.patient_form import PatientForm
from project.npda.forms.visit_form import VisitForm
from project.npda.forms.external_patient_validators import validate_patient_async

async def csv_upload(user, dataframe, csv_file, pdu_pz_code):
"""
Expand Down Expand Up @@ -424,4 +293,8 @@ async def validate_rows_in_parallel(rows_by_patient, async_client):
except Exception as error:
errors_to_return[visit_row_index]["__all__"].append(error)

return errors_to_return
# Only create xlsx file if the csv file was created.
if new_submission.csv_file:
_ = write_errors_to_xlsx(errors_to_return, new_submission)

return errors_to_return
Original file line number Diff line number Diff line change
Expand Up @@ -15,4 +15,4 @@ def serialize_error(error):
return {k: serialize_error(v) for k, v in error.items()}
return str(error)

return json.dumps({k: serialize_error(v) for k, v in errors.items()})
return json.dumps({int(k): serialize_error(v) for k, v in errors.items()})
Loading

0 comments on commit bbe7dbf

Please sign in to comment.