From b5fa8ed2e52188a0d7900631cd5c7f33baa7bb9e Mon Sep 17 00:00:00 2001 From: Michael Barton Date: Fri, 1 Nov 2024 16:27:28 +0000 Subject: [PATCH] Handle leading and trailing whitespace in CSV column headers --- project/npda/general_functions/csv_upload.py | 14 +++-- project/npda/tests/test_csv_upload.py | 55 +++++++++++++++++++- 2 files changed, 65 insertions(+), 4 deletions(-) diff --git a/project/npda/general_functions/csv_upload.py b/project/npda/general_functions/csv_upload.py index 06bce280..a72d2b33 100644 --- a/project/npda/general_functions/csv_upload.py +++ b/project/npda/general_functions/csv_upload.py @@ -27,9 +27,17 @@ def read_csv(csv_file): - return pd.read_csv( - csv_file, parse_dates=ALL_DATES, dayfirst=True, date_format="%d/%m/%Y" - ) + df = pd.read_csv(csv_file) + + # Remove leading and trailing whitespace on column names + # The template published on the RCPCH website has trailing spaces on 'Observation Date: Thyroid Function ' + df.columns = df.columns.str.strip() + + for column in ALL_DATES: + df[column] = pd.to_datetime(df[column], format="%d/%m/%Y") + + return df + async def csv_upload(user, dataframe, csv_file, pdu_pz_code): """ diff --git a/project/npda/tests/test_csv_upload.py b/project/npda/tests/test_csv_upload.py index 2f225697..0b4fb6a2 100644 --- a/project/npda/tests/test_csv_upload.py +++ b/project/npda/tests/test_csv_upload.py @@ -1,5 +1,6 @@ -from functools import partial import dataclasses +import tempfile +from functools import partial from unittest.mock import AsyncMock, patch from asgiref.sync import sync_to_async, async_to_sync @@ -44,6 +45,12 @@ def mock_remote_calls(): def dummy_sheets_folder(request): return request.config.rootdir / 'project' / 'npda' / 'dummy_sheets' +@pytest.fixture +def dummy_sheet_csv(dummy_sheets_folder): + file = dummy_sheets_folder / 'dummy_sheet.csv' + with open(file, 'r') as f: + return f.read() + @pytest.fixture def valid_df(dummy_sheets_folder): return read_csv(dummy_sheets_folder / 'dummy_sheet.csv') @@ -99,6 +106,13 @@ def async_get_all(query_set_fn): async def csv_upload_sync(user, dataframe, csv_file, pdu_pz_code): return await csv_upload(user, dataframe, csv_file, pdu_pz_code) +def read_csv_from_str(contents): + with tempfile.NamedTemporaryFile() as f: + f.write(contents.encode()) + f.seek(0) + + return read_csv(f) + @pytest.mark.django_db def test_create_patient(test_user, single_row_valid_df): @@ -505,3 +519,42 @@ def test_error_looking_up_index_of_multiple_deprivation(test_user, single_row_va patient = Patient.objects.first() assert(patient.index_of_multiple_deprivation_quintile is None) + + +@pytest.mark.django_db +def test_strip_first_spaces_in_column_name(test_user, dummy_sheet_csv): + csv = dummy_sheet_csv.replace("NHS Number", " NHS Number") + df = read_csv_from_str(csv) + + assert(df.columns[0] == "NHS Number") + + csv_upload_sync(test_user, df, None, ALDER_HEY_PZ_CODE) + patient = Patient.objects.first() + + assert(patient.nhs_number == nhs_number.standardise_format(df["NHS Number"][0])) + + +@pytest.mark.django_db +def test_strip_last_spaces_in_column_name(test_user, dummy_sheet_csv): + csv = dummy_sheet_csv.replace("NHS Number", "NHS Number ") + df = read_csv_from_str(csv) + + assert(df.columns[0] == "NHS Number") + + csv_upload_sync(test_user, df, None, ALDER_HEY_PZ_CODE) + patient = Patient.objects.first() + + assert(patient.nhs_number == nhs_number.standardise_format(df["NHS Number"][0])) + + +# Originally found in https://github.com/rcpch/national-paediatric-diabetes-audit/actions/runs/11627684066/job/32381466250 +# so we have a separate unit test for it +@pytest.mark.django_db +def test_spaces_in_date_column_name(test_user, dummy_sheet_csv): + csv = dummy_sheet_csv.replace("Date of Birth", " Date of Birth") + df = read_csv_from_str(csv) + + csv_upload_sync(test_user, df, None, ALDER_HEY_PZ_CODE) + patient = Patient.objects.first() + + assert(patient.date_of_birth == df["Date of Birth"][0].date())