Skip to content

Commit

Permalink
Merge pull request #372 from rcpch:370-add-the-data-quality-report-to…
Browse files Browse the repository at this point in the history
…-the-csv-as-a-separate-sheet

370 add the data quality report to the csv as a separate sheet
  • Loading branch information
eatyourpeas authored Nov 15, 2024
2 parents d07bd4e + f9f6e49 commit bbe7dbf
Show file tree
Hide file tree
Showing 17 changed files with 343 additions and 171 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ __pycache__
/.cache
.vscode
*.code-workspace
.vscode

node_modules

Expand Down
2 changes: 0 additions & 2 deletions project/npda/general_functions/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,3 @@
from .csv_header import *
from .csv_download import *
from .email import *
from .group_for_group import *
from .index_multiple_deprivation import *
Expand Down
5 changes: 5 additions & 0 deletions project/npda/general_functions/csv/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
from .csv_download import *
from .csv_header import *
from .csv_parse import *
from .csv_summarize import *
from .csv_upload import *
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,11 @@
from django.http import HttpResponse
from django.shortcuts import get_object_or_404

def download_file(file_path, file_name):
    """
    Stream a stored file back to the client as a download attachment.

    Args:
        file_path: absolute filesystem path of the file to serve.
        file_name: name offered to the browser in Content-Disposition.

    Returns:
        HttpResponse with the file bytes and an attachment disposition.

    Raises:
        FileNotFoundError: if file_path does not exist.
    """
    import mimetypes

    # Guess the MIME type from the download name: this helper is reused by
    # download_xlsx, so hard-coding "text/csv" would mislabel .xlsx files.
    # Fall back to text/csv to preserve the original behaviour for CSVs
    # and unknown extensions.
    content_type, _ = mimetypes.guess_type(file_name)

    with open(file_path, "rb") as f:
        response = HttpResponse(f.read(), content_type=content_type or "text/csv")
    response["Content-Disposition"] = f'attachment; filename="{file_name}"'
    return response

def download_csv(request, submission_id):
"""
Expand All @@ -12,7 +17,16 @@ def download_csv(request, submission_id):
file_path = submission.csv_file.path
file_name = submission.csv_file.name.split("/")[-1]

with open(file_path, "rb") as f:
response = HttpResponse(f.read(), content_type="text/csv")
response["Content-Disposition"] = f'attachment; filename="{file_name}"'
return response
return download_file(file_path, file_name)

def download_xlsx(request, submission_id):
    """
    Download the XLSX report stored alongside a submission's CSV file.

    Derives the .xlsx path and download name from the submission's stored
    CSV path by swapping the file extension, then streams the file via
    download_file.

    Args:
        request: the incoming HttpRequest (unused beyond view signature).
        submission_id: primary key of the Submission to serve.

    Returns:
        HttpResponse streaming the .xlsx file as an attachment.

    Raises:
        Http404: if no Submission with that id exists.
    """
    Submission = apps.get_model(app_label="npda", model_name="Submission")
    submission = get_object_or_404(Submission, id=submission_id)

    def _swap_csv_suffix(value):
        # Replace only a trailing ".csv"; str.replace would also rewrite
        # a ".csv" occurring earlier in the path (e.g. a directory name).
        if value.endswith(".csv"):
            return value[: -len(".csv")] + ".xlsx"
        return value

    file_path = _swap_csv_suffix(submission.csv_file.path)
    file_name = _swap_csv_suffix(submission.csv_file.name.split("/")[-1])

    return download_file(file_path, file_name)
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import csv
import io
from ...constants.csv_headings import HEADINGS_LIST
from project.constants.csv_headings import HEADINGS_LIST


def csv_header():
Expand Down
141 changes: 141 additions & 0 deletions project/npda/general_functions/csv/csv_parse.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,141 @@
# python imports
from dataclasses import dataclass
import logging
import re

# Third-party imports
import pandas as pd
import numpy as np

# RCPCH imports
from project.constants import (
ALL_DATES,
CSV_DATA_TYPES_MINUS_DATES,
CSV_HEADINGS,
HEADINGS_LIST,
)

# Logging setup
logger = logging.getLogger(__name__)

@dataclass
class ParsedCSVFile:
    """
    Result of parsing an uploaded CSV: the dataframe plus per-column
    problems detected while reading and coercing it.
    """

    df: pd.DataFrame  # the parsed, dtype-coerced data
    missing_columns: list[str]  # expected headings absent from the file
    additional_columns: list[str]  # headings present but not in the official list
    duplicate_columns: list[str]  # base names of headings appearing more than once
    parse_type_error_columns: list[str]  # columns whose dtype coercion failed
def csv_parse(csv_file):
    """
    Read an uploaded NPDA CSV file into a pandas dataframe.

    Handles files both with and without a header row, normalises column
    names case-insensitively against the official HEADINGS_LIST, coerces
    date columns (dd/mm/yyyy) and typed columns to their expected dtypes,
    and records any per-column problems rather than failing.

    Args:
        csv_file: a seekable file-like object containing the CSV data.

    Returns:
        ParsedCSVFile: the dataframe plus lists of missing, additional,
        duplicate and type-error column names.

    Raises:
        ValueError: if the first data row appears to contain more values
        than there are columns (pandas turns the extras into row labels).
    """
    # It is possible the csv file has no header row. In this case, we will use the predefined column names
    # The predefined column names are in the HEADINGS_LIST constant and if cast to lowercase, in lowercase_headings_list
    # We will check if the first row of the csv file matches the predefined column names
    # If it does not, we will use the predefined column names
    # If it does, we will use the column names in the csv file

    # Convert the predefined column names to lowercase
    lowercase_headings_list = [heading.lower() for heading in HEADINGS_LIST]

    # Read the first row of the csv file
    df = pd.read_csv(csv_file)

    if any(col.lower() in lowercase_headings_list for col in df.columns):
        # The first row of the csv file matches at least some of the predefined column names
        # We will use the column names in the csv file
        pass
    else:
        # The first row of the csv file does not match the predefined column names
        # We will use the predefined column names
        # (seek back first - the stream was consumed by the read above)
        csv_file.seek(0)
        df = pd.read_csv(csv_file, header=None, names=HEADINGS_LIST)

    # Remove leading and trailing whitespace on column names
    # The template published on the RCPCH website has trailing spaces on 'Observation Date: Thyroid Function '
    df.columns = df.columns.str.strip()

    if df.columns[0].lower() not in lowercase_headings_list:
        # No header in the source - pass them from our definitions
        logger.warning(
            f"CSV file uploaded without column names, using predefined column names"
        )

        # Have to reset back otherwise we get an empty dataframe
        # NOTE(review): nothing re-reads csv_file after this seek in the
        # visible code, and the headerless case is already handled by the
        # earlier branch - looks like a leftover; confirm before removing.
        csv_file.seek(0)

    # Pandas has strange behaviour for the first line in a CSV - additional cells become row labels
    # https://github.com/pandas-dev/pandas/issues/47490
    #
    # As a heuristic for this, check the row label for the first row is the number 0
    # If it isn't - you've got too many values in the first row
    if not df.iloc[0].name == 0:
        raise ValueError(
            "Suspected too many values in the first row, please check there are no extra values"
        )

    # Accept columns case insensitively but replace them with their official version to make life easier later
    for column in df.columns:
        if not column in HEADINGS_LIST and column.lower() in lowercase_headings_list:
            normalised_column = next(
                c for c in HEADINGS_LIST if c.lower() == column.lower()
            )
            df = df.rename(columns={column: normalised_column})

    # Expected headings that never appeared in the file
    missing_columns = [column for column in HEADINGS_LIST if not column in df.columns]

    # Headings in the file that are not part of the official list
    additional_columns = [
        column for column in df.columns if not column in HEADINGS_LIST
    ]

    # Duplicate columns appear in the dataframe as XYZ.1, XYZ.2 etc
    duplicate_columns = []

    parse_type_error_columns = []

    for column in df.columns:
        # Strip pandas' ".<n>" de-duplication suffix to recover the base name
        result = re.match(r"([\w ]+)\.\d+$", column)

        if result and result.group(1) not in duplicate_columns:
            duplicate_columns.append(result.group(1))

    # Dates arrive as dd/mm/yyyy; unparseable values become NaT (errors="coerce")
    for column in ALL_DATES:
        if column in df.columns:
            df[column] = pd.to_datetime(df[column], format="%d/%m/%Y", errors="coerce")

    # Apply the dtype to non-date columns
    for column, dtype in CSV_DATA_TYPES_MINUS_DATES.items():
        try:
            if column in df.columns:
                df[column] = df[column].astype(dtype)
        except ValueError as e:
            # Record the failure and skip the NaN/rounding steps for this column
            parse_type_error_columns.append(column)
            continue
        # Convert NaN to None for nullable fields
        if column in df.columns:
            df[column] = df[column].where(pd.notnull(df[column]), None)
        # round height and weight if provided to 1 decimal place
        if (
            column
            in [
                "Patient Height (cm)",
                "Patient Weight (kg)",
                "Total Cholesterol Level (mmol/l)",
            ]
            and column in df.columns
        ):
            if df[column].dtype == np.float64:
                df[column] = df[column].round(1)
            else:
                # Measurement column did not end up as float64 - flag it
                parse_type_error_columns.append(column)

    return ParsedCSVFile(
        df,
        missing_columns,
        additional_columns,
        duplicate_columns,
        parse_type_error_columns,
    )
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
import pandas as pd

# RCPCH imports
from ...constants.csv_headings import (
from project.constants.csv_headings import (
ALL_DATES,
)

Expand Down
Original file line number Diff line number Diff line change
@@ -1,11 +1,8 @@
# python imports
from datetime import date
from dataclasses import dataclass
import logging
import asyncio
import collections
import re
from pprint import pprint

# django imports
from django.apps import apps
Expand All @@ -18,143 +15,15 @@
import httpx

# RCPCH imports
from ...constants import (
ALL_DATES,
CSV_DATA_TYPES_MINUS_DATES,
CSV_HEADINGS,
HEADINGS_LIST,
)
from project.npda.general_functions.write_errors_to_xlsx import write_errors_to_xlsx
from project.constants import CSV_HEADINGS

# Logging setup
logger = logging.getLogger(__name__)
from ..forms.patient_form import PatientForm
from ..forms.visit_form import VisitForm
from ..forms.external_patient_validators import validate_patient_async


@dataclass
class ParsedCSVFile:
    """
    Result of parsing an uploaded CSV: the dataframe plus per-column
    problems detected while reading and coercing it.
    """

    df: pd.DataFrame  # the parsed, dtype-coerced data
    missing_columns: list[str]  # expected headings absent from the file
    additional_columns: list[str]  # headings present but not in the official list
    duplicate_columns: list[str]  # base names of headings appearing more than once
    parse_type_error_columns: list[str]  # columns whose dtype coercion failed

def read_csv(csv_file):
    """
    Read an uploaded NPDA CSV file into a pandas dataframe.

    Handles files both with and without a header row, normalises column
    names case-insensitively against the official HEADINGS_LIST, coerces
    date columns (dd/mm/yyyy) and typed columns to their expected dtypes,
    and records any per-column problems rather than failing.

    Args:
        csv_file: a seekable file-like object containing the CSV data.

    Returns:
        ParsedCSVFile: the dataframe plus lists of missing, additional,
        duplicate and type-error column names.

    Raises:
        ValueError: if the first data row appears to contain more values
        than there are columns (pandas turns the extras into row labels).
    """
    # It is possible the csv file has no header row. In this case, we will use the predefined column names
    # The predefined column names are in the HEADINGS_LIST constant and if cast to lowercase, in lowercase_headings_list
    # We will check if the first row of the csv file matches the predefined column names
    # If it does not, we will use the predefined column names
    # If it does, we will use the column names in the csv file

    # Convert the predefined column names to lowercase
    lowercase_headings_list = [heading.lower() for heading in HEADINGS_LIST]

    # Read the first row of the csv file
    df = pd.read_csv(csv_file)

    if any(col.lower() in lowercase_headings_list for col in df.columns):
        # The first row of the csv file matches at least some of the predefined column names
        # We will use the column names in the csv file
        pass
    else:
        # The first row of the csv file does not match the predefined column names
        # We will use the predefined column names
        # (seek back first - the stream was consumed by the read above)
        csv_file.seek(0)
        df = pd.read_csv(csv_file, header=None, names=HEADINGS_LIST)

    # Remove leading and trailing whitespace on column names
    # The template published on the RCPCH website has trailing spaces on 'Observation Date: Thyroid Function '
    df.columns = df.columns.str.strip()

    if df.columns[0].lower() not in lowercase_headings_list:
        # No header in the source - pass them from our definitions
        logger.warning(
            f"CSV file uploaded without column names, using predefined column names"
        )

        # Have to reset back otherwise we get an empty dataframe
        # NOTE(review): nothing re-reads csv_file after this seek in the
        # visible code, and the headerless case is already handled by the
        # earlier branch - looks like a leftover; confirm before removing.
        csv_file.seek(0)

    # Pandas has strange behaviour for the first line in a CSV - additional cells become row labels
    # https://github.com/pandas-dev/pandas/issues/47490
    #
    # As a heuristic for this, check the row label for the first row is the number 0
    # If it isn't - you've got too many values in the first row
    if not df.iloc[0].name == 0:
        raise ValueError(
            "Suspected too many values in the first row, please check there are no extra values"
        )

    # Accept columns case insensitively but replace them with their official version to make life easier later
    for column in df.columns:
        if not column in HEADINGS_LIST and column.lower() in lowercase_headings_list:
            normalised_column = next(
                c for c in HEADINGS_LIST if c.lower() == column.lower()
            )
            df = df.rename(columns={column: normalised_column})

    # Expected headings that never appeared in the file
    missing_columns = [column for column in HEADINGS_LIST if not column in df.columns]

    # Headings in the file that are not part of the official list
    additional_columns = [
        column for column in df.columns if not column in HEADINGS_LIST
    ]

    # Duplicate columns appear in the dataframe as XYZ.1, XYZ.2 etc
    duplicate_columns = []

    parse_type_error_columns = []

    for column in df.columns:
        # Strip pandas' ".<n>" de-duplication suffix to recover the base name
        result = re.match(r"([\w ]+)\.\d+$", column)

        if result and result.group(1) not in duplicate_columns:
            duplicate_columns.append(result.group(1))

    # Dates arrive as dd/mm/yyyy; unparseable values become NaT (errors="coerce")
    for column in ALL_DATES:
        if column in df.columns:
            df[column] = pd.to_datetime(df[column], format="%d/%m/%Y", errors="coerce")

    # Apply the dtype to non-date columns
    for column, dtype in CSV_DATA_TYPES_MINUS_DATES.items():
        try:
            if column in df.columns:
                df[column] = df[column].astype(dtype)
        except ValueError as e:
            # Record the failure and skip the NaN/rounding steps for this column
            parse_type_error_columns.append(column)
            continue
        # Convert NaN to None for nullable fields
        if column in df.columns:
            df[column] = df[column].where(pd.notnull(df[column]), None)
        # round height and weight if provided to 1 decimal place
        if (
            column
            in [
                "Patient Height (cm)",
                "Patient Weight (kg)",
                "Total Cholesterol Level (mmol/l)",
            ]
            and column in df.columns
        ):
            if df[column].dtype == np.float64:
                df[column] = df[column].round(1)
            else:
                # Measurement column did not end up as float64 - flag it
                parse_type_error_columns.append(column)

    return ParsedCSVFile(
        df,
        missing_columns,
        additional_columns,
        duplicate_columns,
        parse_type_error_columns,
    )

from project.npda.forms.patient_form import PatientForm
from project.npda.forms.visit_form import VisitForm
from project.npda.forms.external_patient_validators import validate_patient_async

async def csv_upload(user, dataframe, csv_file, pdu_pz_code):
"""
Expand Down Expand Up @@ -424,4 +293,8 @@ async def validate_rows_in_parallel(rows_by_patient, async_client):
except Exception as error:
errors_to_return[visit_row_index]["__all__"].append(error)

return errors_to_return
# Only create xlsx file if the csv file was created.
if new_submission.csv_file:
_ = write_errors_to_xlsx(errors_to_return, new_submission)

return errors_to_return
Original file line number Diff line number Diff line change
Expand Up @@ -15,4 +15,4 @@ def serialize_error(error):
return {k: serialize_error(v) for k, v in error.items()}
return str(error)

return json.dumps({k: serialize_error(v) for k, v in errors.items()})
return json.dumps({int(k): serialize_error(v) for k, v in errors.items()})
Loading

0 comments on commit bbe7dbf

Please sign in to comment.