feat: if ofsted related columns are missing from gias submission for …

…data processing use default values. - new 2024 gias schema without `OfstedRating (name)` and `OfstedLastInsp` columns. - when ofsted related columns are missing they are created and set to default values for downstream processing and computations. - when ofsted related columns are present (for historic submissions) the current behaviour is preserved.
DFE-Digital · Jan 22, 2025 · 25c0ffc · 25c0ffc
1 parent cc4ab4c
commit 25c0ffc
Show file tree

Hide file tree

Showing 7 changed files with 229 additions and 51 deletions.
diff --git a/data-pipeline/src/pipeline/input_schemas/__init__.py b/data-pipeline/src/pipeline/input_schemas/__init__.py
@@ -10,44 +10,7 @@
     workforce_census_header_row,
     workforce_census_index_col,
 )
-
-gias_index_col = "URN"
-gias = {
-    "URN": "Int64",
-    "UKPRN": "Int64",
-    "LA (code)": "Int64",
-    "LA (name)": "string",
-    "EstablishmentNumber": "Int64",
-    "EstablishmentName": "string",
-    "TypeOfEstablishment (code)": "Int64",
-    "TypeOfEstablishment (name)": "string",
-    "OpenDate": "string",
-    "CloseDate": "string",
-    "PhaseOfEducation (code)": "Int64",
-    "PhaseOfEducation (name)": "string",
-    "Boarders (code)": "Int64",
-    "Boarders (name)": "string",
-    "NurseryProvision (name)": "string",
-    "OfficialSixthForm (code)": "Int64",
-    "OfficialSixthForm (name)": "string",
-    "AdmissionsPolicy (code)": "Int64",
-    "AdmissionsPolicy (name)": "string",
-    "OfstedLastInsp": "string",
-    "Postcode": "string",
-    "SchoolWebsite": "string",
-    "TelephoneNum": "string",
-    "GOR (name)": "string",
-    "OfstedRating (name)": "string",
-    "MSOA (code)": "string",
-    "LSOA (code)": "string",
-    "StatutoryLowAge": "Int64",
-    "StatutoryHighAge": "Int64",
-    "Street": "string",
-    "Locality": "string",
-    "Address3": "string",
-    "Town": "string",
-    "County (name)": "string",
-}
+from .gias import gias, gias_index_col
 
 gias_links_index_col = "URN"
 gias_links = {

diff --git a/data-pipeline/src/pipeline/input_schemas/gias.py b/data-pipeline/src/pipeline/input_schemas/gias.py
@@ -0,0 +1,73 @@
+gias_index_col = "URN"
+gias = {
+    "default": {
+        "URN": "Int64",
+        "UKPRN": "Int64",
+        "LA (code)": "Int64",
+        "LA (name)": "string",
+        "EstablishmentNumber": "Int64",
+        "EstablishmentName": "string",
+        "TypeOfEstablishment (code)": "Int64",
+        "TypeOfEstablishment (name)": "string",
+        "OpenDate": "string",
+        "CloseDate": "string",
+        "PhaseOfEducation (code)": "Int64",
+        "PhaseOfEducation (name)": "string",
+        "Boarders (code)": "Int64",
+        "Boarders (name)": "string",
+        "NurseryProvision (name)": "string",
+        "OfficialSixthForm (code)": "Int64",
+        "OfficialSixthForm (name)": "string",
+        "AdmissionsPolicy (code)": "Int64",
+        "AdmissionsPolicy (name)": "string",
+        "OfstedLastInsp": "string",
+        "Postcode": "string",
+        "SchoolWebsite": "string",
+        "TelephoneNum": "string",
+        "GOR (name)": "string",
+        "OfstedRating (name)": "string",
+        "MSOA (code)": "string",
+        "LSOA (code)": "string",
+        "StatutoryLowAge": "Int64",
+        "StatutoryHighAge": "Int64",
+        "Street": "string",
+        "Locality": "string",
+        "Address3": "string",
+        "Town": "string",
+        "County (name)": "string",
+    },
+    2024: {
+        "URN": "Int64",
+        "UKPRN": "Int64",
+        "LA (code)": "Int64",
+        "LA (name)": "string",
+        "EstablishmentNumber": "Int64",
+        "EstablishmentName": "string",
+        "TypeOfEstablishment (code)": "Int64",
+        "TypeOfEstablishment (name)": "string",
+        "OpenDate": "string",
+        "CloseDate": "string",
+        "PhaseOfEducation (code)": "Int64",
+        "PhaseOfEducation (name)": "string",
+        "Boarders (code)": "Int64",
+        "Boarders (name)": "string",
+        "NurseryProvision (name)": "string",
+        "OfficialSixthForm (code)": "Int64",
+        "OfficialSixthForm (name)": "string",
+        "AdmissionsPolicy (code)": "Int64",
+        "AdmissionsPolicy (name)": "string",
+        "Postcode": "string",
+        "SchoolWebsite": "string",
+        "TelephoneNum": "string",
+        "GOR (name)": "string",
+        "MSOA (code)": "string",
+        "LSOA (code)": "string",
+        "StatutoryLowAge": "Int64",
+        "StatutoryHighAge": "Int64",
+        "Street": "string",
+        "Locality": "string",
+        "Address3": "string",
+        "Town": "string",
+        "County (name)": "string",
+    },
+}
diff --git a/data-pipeline/src/pipeline/main.py b/data-pipeline/src/pipeline/main.py
@@ -181,7 +181,7 @@ def pre_process_schools(run_type: str, year: int, run_id: str) -> pd.DataFrame:
         raw_container, f"{run_type}/{year}/gias_links.csv", encoding="cp1252"
     )
 
-    schools = prepare_schools_data(gias_data, gias_links_data)
+    schools = prepare_schools_data(gias_data, gias_links_data, year)
 
     write_blob(
         "pre-processed",

diff --git a/data-pipeline/src/pipeline/pre_processing/ancillary/schools.py b/data-pipeline/src/pipeline/pre_processing/ancillary/schools.py
@@ -4,13 +4,20 @@
 import pipeline.mappings as mappings
 
 
-def prepare_schools_data(base_data_path, links_data_path):
+def prepare_schools_data(base_data_path, links_data_path, year: int):
+    """
+    Prepare school data derived from gias and gias links.
+
+    :param base_data_path: readable source for gias
+    :param links_data_path: readable source for gias links
+    :param year: financial year in question
+    """
     gias = pd.read_csv(
         base_data_path,
         encoding="cp1252",
         index_col=input_schemas.gias_index_col,
-        usecols=input_schemas.gias.keys(),
-        dtype=input_schemas.gias,
+        usecols=input_schemas.gias.get(year, input_schemas.gias["default"]).keys(),
+        dtype=input_schemas.gias.get(year, input_schemas.gias["default"]),
     )
 
     gias_links = pd.read_csv(
@@ -39,9 +46,7 @@ def prepare_schools_data(base_data_path, links_data_path):
         gias["Boarders (name)"].fillna("").map(mappings.map_boarders)
     )
 
-    gias["OfstedRating (name)"] = (
-        gias["OfstedRating (name)"].fillna("").map(mappings.map_ofsted_rating)
-    )
+    gias = _optional_ofsted_cols(gias)
 
     gias["TypeOfEstablishment (name)"] = (
         gias["TypeOfEstablishment (name)"].fillna("").map(lambda x: x.strip())
@@ -93,3 +98,35 @@ def prepare_schools_data(base_data_path, links_data_path):
     return schools[(schools["Rank"] == 1) | (schools["Rank"].isna())].drop(
         columns=["LinkURN", "LinkName", "LinkType", "LinkEstablishedDate", "Rank"]
     )
+
+
+def _optional_ofsted_cols(df: pd.DataFrame) -> pd.DataFrame:
+    """
+    Ensure that "OfstedRating (name)" and "OfstedLastInsp" columns are present in the DataFrame,
+    even if they are missing from the original submission. Missing columns are created with defaults
+    to ensure compatibility with downstream processing.
+
+    These columns are required to write to the db and "OfstedRating (name)" is required for rag
+    calculations.
+
+    If the columns exist, they are either preserved or mapped as necessary.
+    If they do not exist, new columns are created: an empty string column for
+    "OfstedRating (name)" and a `None` column for "OfstedLastInsp".
+
+    :param df: The GIAS DataFrame to process.
+    :type df: pd.DataFrame
+
+    :return: The DataFrame with the "OfstedRating (name)" and "OfstedLastInsp" columns added or modified.
+    :rtype: pd.DataFrame
+    """
+    df["OfstedRating (name)"] = (
+        df.get("OfstedRating (name)", pd.Series([""] * len(df), index=df.index)).fillna(
+            ""
+        )
+    ).map(mappings.map_ofsted_rating)
+
+    df["OfstedLastInsp"] = df.get(
+        "OfstedLastInsp", pd.Series([None] * len(df), index=df.index)
+    )
+
+    return df
diff --git a/data-pipeline/tests/unit/pre_processing/conftest.py b/data-pipeline/tests/unit/pre_processing/conftest.py
@@ -532,7 +532,7 @@ def prepared_schools_data(
     gias_data: pd.DataFrame, gias_links: pd.DataFrame
 ) -> pd.DataFrame:
     return prepare_schools_data(
-        StringIO(gias_data.to_csv()), StringIO(gias_links.to_csv())
+        StringIO(gias_data.to_csv()), StringIO(gias_links.to_csv()), 2023
     )
 
 

diff --git a/data-pipeline/tests/unit/pre_processing/test_gias.py b/data-pipeline/tests/unit/pre_processing/test_gias.py
@@ -1,6 +1,10 @@
+from io import StringIO
+
 import pandas as pd
 import pytest
 
+from pipeline.pre_processing import prepare_schools_data
+
 
 def test_prepare_school_data_has_correct_output_columns(
     prepared_schools_data: pd.DataFrame,
@@ -45,6 +49,83 @@ def test_prepare_school_data_has_correct_output_columns(
     ]
 
 
+def test_prepare_school_data_has_correct_output_columns_without_ofsted_cols(
+    gias_data, gias_links
+):
+    """
+    For 2024 submissions gias data will not include "OfstedRating (name)" or "OfstedLastInsp"
+    This test confirms these columns are created as they are required for downstream processing.
+    """
+    gias_without_ofsted = gias_data.copy().drop(
+        columns=["OfstedRating (name)", "OfstedLastInsp"]
+    )
+
+    actual = prepare_schools_data(
+        StringIO(gias_without_ofsted.to_csv()), StringIO(gias_links.to_csv()), 2024
+    )
+
+    print(actual.columns)
+
+    assert list(actual.columns) == [
+        "UKPRN",
+        "LA (code)",
+        "LA (name)",
+        "EstablishmentNumber",
+        "EstablishmentName",
+        "TypeOfEstablishment (code)",
+        "TypeOfEstablishment (name)",
+        "OpenDate",
+        "CloseDate",
+        "PhaseOfEducation (code)",
+        "PhaseOfEducation (name)",
+        "Boarders (code)",
+        "Boarders (name)",
+        "NurseryProvision (name)",
+        "OfficialSixthForm (code)",
+        "OfficialSixthForm (name)",
+        "AdmissionsPolicy (code)",
+        "AdmissionsPolicy (name)",
+        "Postcode",
+        "SchoolWebsite",
+        "TelephoneNum",
+        "GOR (name)",
+        "MSOA (code)",
+        "LSOA (code)",
+        "StatutoryLowAge",
+        "StatutoryHighAge",
+        "Street",
+        "Locality",
+        "Address3",
+        "Town",
+        "County (name)",
+        "LA Establishment Number",
+        "OfstedRating (name)",
+        "OfstedLastInsp",
+        "Has Nursery",
+        "Has Sixth Form",
+    ]
+
+
+def test_prepare_school_data_has_correct_output_ofsted_values_without_submission(
+    gias_data, gias_links
+):
+    """
+    For 2024 submissions gias data will not include "OfstedRating (name)" or
+    "OfstedLastInsp". These columns should still be created as they are required for downstream processing.
+    This test confirms these are set with default values.
+    """
+    gias_without_ofsted = gias_data.copy().drop(
+        columns=["OfstedRating (name)", "OfstedLastInsp"]
+    )
+
+    actual = prepare_schools_data(
+        StringIO(gias_without_ofsted.to_csv()), StringIO(gias_links.to_csv()), 2024
+    )
+
+    assert (actual["OfstedRating (name)"] == "").all()
+    assert actual["OfstedLastInsp"].isna().all()
+
+
 def test_la_establishment_number_computed(prepared_schools_data: pd.DataFrame):
     assert prepared_schools_data.loc[100150]["LA Establishment Number"] == "201-3614"
 

diff --git a/data-pipeline/tests/unit/rag/test_rag.py b/data-pipeline/tests/unit/rag/test_rag.py
@@ -114,7 +114,7 @@ def test_find_percentile():
 
 
 @pytest.mark.parametrize(
-    "value,data,diff_median,percent_diff,percentile,decile,expected_rag",
+    "value,data,diff_median,percent_diff,percentile,decile,expected_rag,ofsted,key",
     [
         (
             20,
@@ -124,6 +124,8 @@ def test_find_percentile():
             60.0,
             6,
             "amber",
+            "outstanding",
+            "outstanding",
         ),
         (
             150,
@@ -133,6 +135,8 @@ def test_find_percentile():
             100.0,
             10,
             "red",
+            "outstanding",
+            "outstanding",
         ),
         (
             15,
@@ -142,11 +146,32 @@ def test_find_percentile():
             30.0,
             3,
             "green",
+            "outstanding",
+            "outstanding",
+        ),
+        (
+            15,
+            [15, 5, 6, 150, 16, 19, 22, 25, 76, 20],
+            -4.5,
+            -23.076923076923077,
+            30.0,
+            3,
+            "green",
+            "",
+            "other",
         ),
     ],
 )
 def test_category_stats(
-    value, data, diff_median, percent_diff, percentile, decile, expected_rag
+    value,
+    data,
+    diff_median,
+    percent_diff,
+    percentile,
+    decile,
+    expected_rag,
+    ofsted,
+    key,
 ):
     category = "Teaching and Teaching support staff_Sub Cat"
     data = pd.DataFrame(
@@ -161,7 +186,7 @@ def test_category_stats(
         "Value": value,
         "Median": 19.5,
         "DiffMedian": diff_median,
-        "Key": "outstanding",
+        "Key": key,
         "PercentDiff": percent_diff,
         "Percentile": percentile,
         "Decile": decile,
@@ -170,6 +195,5 @@ def test_category_stats(
 
     rag_settings = config.rag_category_settings["Teaching and Teaching support staff"]
     assert (
-        rag.category_stats(100000, category, data, "outstanding", rag_settings, 10)
-        == expected
+        rag.category_stats(100000, category, data, ofsted, rag_settings, 10) == expected
     )