From 03b9d89d21f5447fea1919101d155abce8bbebbd Mon Sep 17 00:00:00 2001 From: Josh Langford Date: Mon, 20 Jan 2025 13:30:32 +0000 Subject: [PATCH] feat: if ofsted related columns are missing from gias submission for data processing use default values. - new 2024 gias schema without `OfstedRating (name)` and `OfstedLastInsp` columns. - when ofsted related columns are missing they are created and set to default values for downstream processing and computations. - when ofsted related columns are present (for historic submissions) the current behaviour is preserved. --- .../src/pipeline/input_schemas/__init__.py | 39 +-------- .../src/pipeline/input_schemas/gias.py | 73 +++++++++++++++++ data-pipeline/src/pipeline/main.py | 2 +- .../pre_processing/ancillary/schools.py | 47 +++++++++-- .../tests/unit/pre_processing/conftest.py | 2 +- .../tests/unit/pre_processing/test_gias.py | 79 +++++++++++++++++++ data-pipeline/tests/unit/rag/test_rag.py | 34 ++++++-- 7 files changed, 225 insertions(+), 51 deletions(-) create mode 100644 data-pipeline/src/pipeline/input_schemas/gias.py diff --git a/data-pipeline/src/pipeline/input_schemas/__init__.py b/data-pipeline/src/pipeline/input_schemas/__init__.py index 495d7296e..8b4ee3b2f 100644 --- a/data-pipeline/src/pipeline/input_schemas/__init__.py +++ b/data-pipeline/src/pipeline/input_schemas/__init__.py @@ -10,44 +10,7 @@ workforce_census_header_row, workforce_census_index_col, ) - -gias_index_col = "URN" -gias = { - "URN": "Int64", - "UKPRN": "Int64", - "LA (code)": "Int64", - "LA (name)": "string", - "EstablishmentNumber": "Int64", - "EstablishmentName": "string", - "TypeOfEstablishment (code)": "Int64", - "TypeOfEstablishment (name)": "string", - "OpenDate": "string", - "CloseDate": "string", - "PhaseOfEducation (code)": "Int64", - "PhaseOfEducation (name)": "string", - "Boarders (code)": "Int64", - "Boarders (name)": "string", - "NurseryProvision (name)": "string", - "OfficialSixthForm (code)": "Int64", - "OfficialSixthForm 
(name)": "string", - "AdmissionsPolicy (code)": "Int64", - "AdmissionsPolicy (name)": "string", - "OfstedLastInsp": "string", - "Postcode": "string", - "SchoolWebsite": "string", - "TelephoneNum": "string", - "GOR (name)": "string", - "OfstedRating (name)": "string", - "MSOA (code)": "string", - "LSOA (code)": "string", - "StatutoryLowAge": "Int64", - "StatutoryHighAge": "Int64", - "Street": "string", - "Locality": "string", - "Address3": "string", - "Town": "string", - "County (name)": "string", -} +from .gias import gias, gias_index_col gias_links_index_col = "URN" gias_links = { diff --git a/data-pipeline/src/pipeline/input_schemas/gias.py b/data-pipeline/src/pipeline/input_schemas/gias.py new file mode 100644 index 000000000..60822c438 --- /dev/null +++ b/data-pipeline/src/pipeline/input_schemas/gias.py @@ -0,0 +1,73 @@ +gias_index_col = "URN" +gias = { + "default": { + "URN": "Int64", + "UKPRN": "Int64", + "LA (code)": "Int64", + "LA (name)": "string", + "EstablishmentNumber": "Int64", + "EstablishmentName": "string", + "TypeOfEstablishment (code)": "Int64", + "TypeOfEstablishment (name)": "string", + "OpenDate": "string", + "CloseDate": "string", + "PhaseOfEducation (code)": "Int64", + "PhaseOfEducation (name)": "string", + "Boarders (code)": "Int64", + "Boarders (name)": "string", + "NurseryProvision (name)": "string", + "OfficialSixthForm (code)": "Int64", + "OfficialSixthForm (name)": "string", + "AdmissionsPolicy (code)": "Int64", + "AdmissionsPolicy (name)": "string", + "OfstedLastInsp": "string", + "Postcode": "string", + "SchoolWebsite": "string", + "TelephoneNum": "string", + "GOR (name)": "string", + "OfstedRating (name)": "string", + "MSOA (code)": "string", + "LSOA (code)": "string", + "StatutoryLowAge": "Int64", + "StatutoryHighAge": "Int64", + "Street": "string", + "Locality": "string", + "Address3": "string", + "Town": "string", + "County (name)": "string", + }, + 2024: { + "URN": "Int64", + "UKPRN": "Int64", + "LA (code)": "Int64", + "LA 
(name)": "string", + "EstablishmentNumber": "Int64", + "EstablishmentName": "string", + "TypeOfEstablishment (code)": "Int64", + "TypeOfEstablishment (name)": "string", + "OpenDate": "string", + "CloseDate": "string", + "PhaseOfEducation (code)": "Int64", + "PhaseOfEducation (name)": "string", + "Boarders (code)": "Int64", + "Boarders (name)": "string", + "NurseryProvision (name)": "string", + "OfficialSixthForm (code)": "Int64", + "OfficialSixthForm (name)": "string", + "AdmissionsPolicy (code)": "Int64", + "AdmissionsPolicy (name)": "string", + "Postcode": "string", + "SchoolWebsite": "string", + "TelephoneNum": "string", + "GOR (name)": "string", + "MSOA (code)": "string", + "LSOA (code)": "string", + "StatutoryLowAge": "Int64", + "StatutoryHighAge": "Int64", + "Street": "string", + "Locality": "string", + "Address3": "string", + "Town": "string", + "County (name)": "string", + }, +} diff --git a/data-pipeline/src/pipeline/main.py b/data-pipeline/src/pipeline/main.py index 82c6b5d32..27dde8765 100644 --- a/data-pipeline/src/pipeline/main.py +++ b/data-pipeline/src/pipeline/main.py @@ -181,7 +181,7 @@ def pre_process_schools(run_type: str, year: int, run_id: str) -> pd.DataFrame: raw_container, f"{run_type}/{year}/gias_links.csv", encoding="cp1252" ) - schools = prepare_schools_data(gias_data, gias_links_data) + schools = prepare_schools_data(gias_data, gias_links_data, year) write_blob( "pre-processed", diff --git a/data-pipeline/src/pipeline/pre_processing/ancillary/schools.py b/data-pipeline/src/pipeline/pre_processing/ancillary/schools.py index 63b3238d5..683d7d5bd 100644 --- a/data-pipeline/src/pipeline/pre_processing/ancillary/schools.py +++ b/data-pipeline/src/pipeline/pre_processing/ancillary/schools.py @@ -4,13 +4,20 @@ import pipeline.mappings as mappings -def prepare_schools_data(base_data_path, links_data_path): +def prepare_schools_data(base_data_path, links_data_path, year: int): + """ + Prepare school data derived from GIAS and GIAS links. 
+ + :param base_data_path: readable source for GIAS + :param links_data_path: readable source for GIAS links + :param year: financial year in question + """ gias = pd.read_csv( base_data_path, encoding="cp1252", index_col=input_schemas.gias_index_col, - usecols=input_schemas.gias.keys(), - dtype=input_schemas.gias, + usecols=input_schemas.gias.get(year, input_schemas.gias["default"]).keys(), + dtype=input_schemas.gias.get(year, input_schemas.gias["default"]), ) gias_links = pd.read_csv( @@ -39,9 +46,7 @@ def prepare_schools_data(base_data_path, links_data_path): gias["Boarders (name)"].fillna("").map(mappings.map_boarders) ) - gias["OfstedRating (name)"] = ( - gias["OfstedRating (name)"].fillna("").map(mappings.map_ofsted_rating) - ) + gias = _optional_ofsted_cols(gias) gias["TypeOfEstablishment (name)"] = ( gias["TypeOfEstablishment (name)"].fillna("").map(lambda x: x.strip()) @@ -93,3 +98,33 @@ def prepare_schools_data(base_data_path, links_data_path): return schools[(schools["Rank"] == 1) | (schools["Rank"].isna())].drop( columns=["LinkURN", "LinkName", "LinkType", "LinkEstablishedDate", "Rank"] ) + + +def _optional_ofsted_cols(df: pd.DataFrame) -> pd.DataFrame: + """ + Ensure that "OfstedRating (name)" and "OfstedLastInsp" columns are present in the DataFrame, + even if they are missing from the original submission. Missing columns are created with defaults + to ensure compatibility with downstream processing. + + These columns are required to write to the db and "OfstedRating (name)" is required for rag + calculations. + + If the columns exist, they are either preserved or mapped as necessary. + If they do not exist, new columns are created: an empty string column for + "OfstedRating (name)" and a `None` column for "OfstedLastInsp". + + :param df: The GIAS DataFrame to process. + + :return: The DataFrame with the "OfstedRating (name)" and "OfstedLastInsp" columns added or modified.
+ """ + df["OfstedRating (name)"] = ( + df.get("OfstedRating (name)", pd.Series([""] * len(df), index=df.index)).fillna( + "" + ) + ).map(mappings.map_ofsted_rating) + + df["OfstedLastInsp"] = df.get( + "OfstedLastInsp", pd.Series([None] * len(df), index=df.index) + ) + + return df diff --git a/data-pipeline/tests/unit/pre_processing/conftest.py b/data-pipeline/tests/unit/pre_processing/conftest.py index 66874cc1c..a4bc95235 100644 --- a/data-pipeline/tests/unit/pre_processing/conftest.py +++ b/data-pipeline/tests/unit/pre_processing/conftest.py @@ -532,7 +532,7 @@ def prepared_schools_data( gias_data: pd.DataFrame, gias_links: pd.DataFrame ) -> pd.DataFrame: return prepare_schools_data( - StringIO(gias_data.to_csv()), StringIO(gias_links.to_csv()) + StringIO(gias_data.to_csv()), StringIO(gias_links.to_csv()), 2023 ) diff --git a/data-pipeline/tests/unit/pre_processing/test_gias.py b/data-pipeline/tests/unit/pre_processing/test_gias.py index 019b1720d..bfe4bb22c 100644 --- a/data-pipeline/tests/unit/pre_processing/test_gias.py +++ b/data-pipeline/tests/unit/pre_processing/test_gias.py @@ -1,6 +1,10 @@ +from io import StringIO + import pandas as pd import pytest +from pipeline.pre_processing import prepare_schools_data + def test_prepare_school_data_has_correct_output_columns( prepared_schools_data: pd.DataFrame, @@ -45,6 +49,81 @@ def test_prepare_school_data_has_correct_output_columns( ] +def test_prepare_school_data_has_correct_output_columns_without_ofsted_cols( + gias_data, gias_links +): + """ + For 2024 submissions GIAS data will not include "OfstedRating (name)" or "OfstedLastInsp". + This test confirms these columns are created as they are required for downstream processing.
+ """ + gias_without_ofsted = gias_data.copy().drop( + columns=["OfstedRating (name)", "OfstedLastInsp"] + ) + + actual = prepare_schools_data( + StringIO(gias_without_ofsted.to_csv()), StringIO(gias_links.to_csv()), 2024 + ) + + assert list(actual.columns) == [ + "UKPRN", + "LA (code)", + "LA (name)", + "EstablishmentNumber", + "EstablishmentName", + "TypeOfEstablishment (code)", + "TypeOfEstablishment (name)", + "OpenDate", + "CloseDate", + "PhaseOfEducation (code)", + "PhaseOfEducation (name)", + "Boarders (code)", + "Boarders (name)", + "NurseryProvision (name)", + "OfficialSixthForm (code)", + "OfficialSixthForm (name)", + "AdmissionsPolicy (code)", + "AdmissionsPolicy (name)", + "Postcode", + "SchoolWebsite", + "TelephoneNum", + "GOR (name)", + "MSOA (code)", + "LSOA (code)", + "StatutoryLowAge", + "StatutoryHighAge", + "Street", + "Locality", + "Address3", + "Town", + "County (name)", + "LA Establishment Number", + "OfstedRating (name)", + "OfstedLastInsp", + "Has Nursery", + "Has Sixth Form", + ] + + +def test_prepare_school_data_has_correct_output_ofsted_values_without_submission( + gias_data, gias_links +): + """ + For 2024 submissions GIAS data will not include "OfstedRating (name)" or + "OfstedLastInsp". These columns should still be created as they are required for downstream processing. + This test confirms these are set with default values. 
+ """ + gias_without_ofsted = gias_data.copy().drop( + columns=["OfstedRating (name)", "OfstedLastInsp"] + ) + + actual = prepare_schools_data( + StringIO(gias_without_ofsted.to_csv()), StringIO(gias_links.to_csv()), 2024 + ) + + assert (actual["OfstedRating (name)"] == "").all() + assert actual["OfstedLastInsp"].isna().all() + + def test_la_establishment_number_computed(prepared_schools_data: pd.DataFrame): assert prepared_schools_data.loc[100150]["LA Establishment Number"] == "201-3614" diff --git a/data-pipeline/tests/unit/rag/test_rag.py b/data-pipeline/tests/unit/rag/test_rag.py index 27b6fa7a1..eb48ec3bd 100644 --- a/data-pipeline/tests/unit/rag/test_rag.py +++ b/data-pipeline/tests/unit/rag/test_rag.py @@ -114,7 +114,7 @@ def test_find_percentile(): @pytest.mark.parametrize( - "value,data,diff_median,percent_diff,percentile,decile,expected_rag", + "value,data,diff_median,percent_diff,percentile,decile,expected_rag,ofsted,key", [ ( 20, @@ -124,6 +124,8 @@ def test_find_percentile(): 60.0, 6, "amber", + "outstanding", + "outstanding", ), ( 150, @@ -133,6 +135,8 @@ def test_find_percentile(): 100.0, 10, "red", + "outstanding", + "outstanding", ), ( 15, @@ -142,11 +146,32 @@ def test_find_percentile(): 30.0, 3, "green", + "outstanding", + "outstanding", + ), + ( + 15, + [15, 5, 6, 150, 16, 19, 22, 25, 76, 20], + -4.5, + -23.076923076923077, + 30.0, + 3, + "green", + "", + "other", ), ], ) def test_category_stats( - value, data, diff_median, percent_diff, percentile, decile, expected_rag + value, + data, + diff_median, + percent_diff, + percentile, + decile, + expected_rag, + ofsted, + key, ): category = "Teaching and Teaching support staff_Sub Cat" data = pd.DataFrame( @@ -161,7 +186,7 @@ def test_category_stats( "Value": value, "Median": 19.5, "DiffMedian": diff_median, - "Key": "outstanding", + "Key": key, "PercentDiff": percent_diff, "Percentile": percentile, "Decile": decile, @@ -170,6 +195,5 @@ def test_category_stats( rag_settings = 
config.rag_category_settings["Teaching and Teaching support staff"] assert ( - rag.category_stats(100000, category, data, "outstanding", rag_settings, 10) - == expected + rag.category_stats(100000, category, data, ofsted, rag_settings, 10) == expected )