Skip to content

Commit

Permalink
feat: if ofsted related columns are missing from gias submission for …
Browse files Browse the repository at this point in the history
…data processing use default values.

    - new 2024 gias schema without `OfstedRating (name)` and `OfstedLastInsp` columns.
    - when ofsted related columns are missing they are created and set to default values for downstream processing and computations.
    - when ofsted related columns are present (for historic submissions) the current behaviour is preserved.
  • Loading branch information
J05h-L committed Jan 23, 2025
1 parent d9f195d commit 03b9d89
Show file tree
Hide file tree
Showing 7 changed files with 225 additions and 51 deletions.
39 changes: 1 addition & 38 deletions data-pipeline/src/pipeline/input_schemas/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,44 +10,7 @@
workforce_census_header_row,
workforce_census_index_col,
)

# GIAS input schema: `gias_index_col` names the column used as the DataFrame
# index, and `gias` maps each column read from the GIAS CSV to the pandas
# dtype it is parsed as. The keys of `gias` are also used as the `usecols`
# selection when the file is loaded.
gias_index_col = "URN"
gias = {
"URN": "Int64",
"UKPRN": "Int64",
"LA (code)": "Int64",
"LA (name)": "string",
"EstablishmentNumber": "Int64",
"EstablishmentName": "string",
"TypeOfEstablishment (code)": "Int64",
"TypeOfEstablishment (name)": "string",
"OpenDate": "string",
"CloseDate": "string",
"PhaseOfEducation (code)": "Int64",
"PhaseOfEducation (name)": "string",
"Boarders (code)": "Int64",
"Boarders (name)": "string",
"NurseryProvision (name)": "string",
"OfficialSixthForm (code)": "Int64",
"OfficialSixthForm (name)": "string",
"AdmissionsPolicy (code)": "Int64",
"AdmissionsPolicy (name)": "string",
"OfstedLastInsp": "string",
"Postcode": "string",
"SchoolWebsite": "string",
"TelephoneNum": "string",
"GOR (name)": "string",
"OfstedRating (name)": "string",
"MSOA (code)": "string",
"LSOA (code)": "string",
"StatutoryLowAge": "Int64",
"StatutoryHighAge": "Int64",
"Street": "string",
"Locality": "string",
"Address3": "string",
"Town": "string",
"County (name)": "string",
}
from .gias import gias, gias_index_col

gias_links_index_col = "URN"
gias_links = {
Expand Down
73 changes: 73 additions & 0 deletions data-pipeline/src/pipeline/input_schemas/gias.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
# Column used as the DataFrame index when a GIAS extract is loaded.
gias_index_col = "URN"

# Ofsted columns that are absent from the 2024 schema.
_OFSTED_COLUMNS = frozenset({"OfstedLastInsp", "OfstedRating (name)"})

# Full schema (column name -> pandas dtype), including the Ofsted columns.
_GIAS_DEFAULT = {
    "URN": "Int64",
    "UKPRN": "Int64",
    "LA (code)": "Int64",
    "LA (name)": "string",
    "EstablishmentNumber": "Int64",
    "EstablishmentName": "string",
    "TypeOfEstablishment (code)": "Int64",
    "TypeOfEstablishment (name)": "string",
    "OpenDate": "string",
    "CloseDate": "string",
    "PhaseOfEducation (code)": "Int64",
    "PhaseOfEducation (name)": "string",
    "Boarders (code)": "Int64",
    "Boarders (name)": "string",
    "NurseryProvision (name)": "string",
    "OfficialSixthForm (code)": "Int64",
    "OfficialSixthForm (name)": "string",
    "AdmissionsPolicy (code)": "Int64",
    "AdmissionsPolicy (name)": "string",
    "OfstedLastInsp": "string",
    "Postcode": "string",
    "SchoolWebsite": "string",
    "TelephoneNum": "string",
    "GOR (name)": "string",
    "OfstedRating (name)": "string",
    "MSOA (code)": "string",
    "LSOA (code)": "string",
    "StatutoryLowAge": "Int64",
    "StatutoryHighAge": "Int64",
    "Street": "string",
    "Locality": "string",
    "Address3": "string",
    "Town": "string",
    "County (name)": "string",
}

# Schemas keyed by year; "default" is the entry callers fall back to when a
# year has no dedicated schema. The 2024 schema is the default schema with the
# Ofsted columns removed (column order otherwise unchanged).
gias = {
    "default": _GIAS_DEFAULT,
    2024: {
        column: dtype
        for column, dtype in _GIAS_DEFAULT.items()
        if column not in _OFSTED_COLUMNS
    },
}
2 changes: 1 addition & 1 deletion data-pipeline/src/pipeline/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -181,7 +181,7 @@ def pre_process_schools(run_type: str, year: int, run_id: str) -> pd.DataFrame:
raw_container, f"{run_type}/{year}/gias_links.csv", encoding="cp1252"
)

schools = prepare_schools_data(gias_data, gias_links_data)
schools = prepare_schools_data(gias_data, gias_links_data, year)

write_blob(
"pre-processed",
Expand Down
47 changes: 41 additions & 6 deletions data-pipeline/src/pipeline/pre_processing/ancillary/schools.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,20 @@
import pipeline.mappings as mappings


def prepare_schools_data(base_data_path, links_data_path):
def prepare_schools_data(base_data_path, links_data_path, year: int):
"""
Prepare school data derived from GIAS and GIAS links.
:param base_data_path: readable source for GIAS
:param links_data_path: readable source for GIAS links
:param year: financial year in question
"""
gias = pd.read_csv(
base_data_path,
encoding="cp1252",
index_col=input_schemas.gias_index_col,
usecols=input_schemas.gias.keys(),
dtype=input_schemas.gias,
usecols=input_schemas.gias.get(year, input_schemas.gias["default"]).keys(),
dtype=input_schemas.gias.get(year, input_schemas.gias["default"]),
)

gias_links = pd.read_csv(
Expand Down Expand Up @@ -39,9 +46,7 @@ def prepare_schools_data(base_data_path, links_data_path):
gias["Boarders (name)"].fillna("").map(mappings.map_boarders)
)

gias["OfstedRating (name)"] = (
gias["OfstedRating (name)"].fillna("").map(mappings.map_ofsted_rating)
)
gias = _optional_ofsted_cols(gias)

gias["TypeOfEstablishment (name)"] = (
gias["TypeOfEstablishment (name)"].fillna("").map(lambda x: x.strip())
Expand Down Expand Up @@ -93,3 +98,33 @@ def prepare_schools_data(base_data_path, links_data_path):
return schools[(schools["Rank"] == 1) | (schools["Rank"].isna())].drop(
columns=["LinkURN", "LinkName", "LinkType", "LinkEstablishedDate", "Rank"]
)


def _optional_ofsted_cols(df: pd.DataFrame) -> pd.DataFrame:
    """
    Guarantee that the GIAS frame carries both Ofsted columns.

    Submissions may omit "OfstedRating (name)" and "OfstedLastInsp"; any column
    that is absent is created here so that later steps can rely on it.

    - "OfstedRating (name)": existing values have missing entries replaced by
      an empty string; an absent column starts as all empty strings. Either
      way every value is then passed through `mappings.map_ofsted_rating`.
    - "OfstedLastInsp": kept as-is when present, otherwise filled with `None`.

    The frame is modified in place and also returned.

    :param df: The GIAS DataFrame to process.
    :return: The same DataFrame with both Ofsted columns present.
    """
    rating_col = "OfstedRating (name)"
    inspection_col = "OfstedLastInsp"
    row_count = len(df)

    if rating_col in df.columns:
        ratings = df[rating_col]
    else:
        # Placeholder aligned to the frame's index so assignment lines up.
        ratings = pd.Series([""] * row_count, index=df.index)
    df[rating_col] = ratings.fillna("").map(mappings.map_ofsted_rating)

    if inspection_col in df.columns:
        last_inspected = df[inspection_col]
    else:
        last_inspected = pd.Series([None] * row_count, index=df.index)
    df[inspection_col] = last_inspected

    return df
2 changes: 1 addition & 1 deletion data-pipeline/tests/unit/pre_processing/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -532,7 +532,7 @@ def prepared_schools_data(
gias_data: pd.DataFrame, gias_links: pd.DataFrame
) -> pd.DataFrame:
return prepare_schools_data(
StringIO(gias_data.to_csv()), StringIO(gias_links.to_csv())
StringIO(gias_data.to_csv()), StringIO(gias_links.to_csv()), 2023
)


Expand Down
79 changes: 79 additions & 0 deletions data-pipeline/tests/unit/pre_processing/test_gias.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,10 @@
from io import StringIO

import pandas as pd
import pytest

from pipeline.pre_processing import prepare_schools_data


def test_prepare_school_data_has_correct_output_columns(
prepared_schools_data: pd.DataFrame,
Expand Down Expand Up @@ -45,6 +49,81 @@ def test_prepare_school_data_has_correct_output_columns(
]


def test_prepare_school_data_has_correct_output_columns_without_ofsted_cols(
    gias_data, gias_links
):
    """
    2024 GIAS submissions do not include "OfstedRating (name)" or "OfstedLastInsp".
    Confirm both columns are still created, as they are required for downstream
    processing.
    """
    expected_columns = [
        "UKPRN",
        "LA (code)",
        "LA (name)",
        "EstablishmentNumber",
        "EstablishmentName",
        "TypeOfEstablishment (code)",
        "TypeOfEstablishment (name)",
        "OpenDate",
        "CloseDate",
        "PhaseOfEducation (code)",
        "PhaseOfEducation (name)",
        "Boarders (code)",
        "Boarders (name)",
        "NurseryProvision (name)",
        "OfficialSixthForm (code)",
        "OfficialSixthForm (name)",
        "AdmissionsPolicy (code)",
        "AdmissionsPolicy (name)",
        "Postcode",
        "SchoolWebsite",
        "TelephoneNum",
        "GOR (name)",
        "MSOA (code)",
        "LSOA (code)",
        "StatutoryLowAge",
        "StatutoryHighAge",
        "Street",
        "Locality",
        "Address3",
        "Town",
        "County (name)",
        "LA Establishment Number",
        "OfstedRating (name)",
        "OfstedLastInsp",
        "Has Nursery",
        "Has Sixth Form",
    ]

    # Simulate a 2024 submission by removing the Ofsted columns from the fixture.
    ofsted_columns = ["OfstedRating (name)", "OfstedLastInsp"]
    gias_csv = StringIO(gias_data.copy().drop(columns=ofsted_columns).to_csv())
    links_csv = StringIO(gias_links.to_csv())

    prepared = prepare_schools_data(gias_csv, links_csv, 2024)

    assert list(prepared.columns) == expected_columns


def test_prepare_school_data_has_correct_output_ofsted_values_without_submission(
    gias_data, gias_links
):
    """
    2024 GIAS submissions do not include "OfstedRating (name)" or "OfstedLastInsp",
    yet both columns are required for downstream processing.
    Confirm they are created and populated with their default values.
    """
    # Simulate a 2024 submission by removing the Ofsted columns from the fixture.
    ofsted_columns = ["OfstedRating (name)", "OfstedLastInsp"]
    gias_csv = StringIO(gias_data.copy().drop(columns=ofsted_columns).to_csv())
    links_csv = StringIO(gias_links.to_csv())

    prepared = prepare_schools_data(gias_csv, links_csv, 2024)

    ratings = prepared["OfstedRating (name)"]
    assert (ratings == "").all()

    last_inspections = prepared["OfstedLastInsp"]
    assert last_inspections.isna().all()


def test_la_establishment_number_computed(prepared_schools_data: pd.DataFrame):
    """The row indexed 100150 has "LA Establishment Number" equal to "201-3614"."""
    school = prepared_schools_data.loc[100150]
    assert school["LA Establishment Number"] == "201-3614"

Expand Down
34 changes: 29 additions & 5 deletions data-pipeline/tests/unit/rag/test_rag.py
Original file line number Diff line number Diff line change
Expand Up @@ -114,7 +114,7 @@ def test_find_percentile():


@pytest.mark.parametrize(
"value,data,diff_median,percent_diff,percentile,decile,expected_rag",
"value,data,diff_median,percent_diff,percentile,decile,expected_rag,ofsted,key",
[
(
20,
Expand All @@ -124,6 +124,8 @@ def test_find_percentile():
60.0,
6,
"amber",
"outstanding",
"outstanding",
),
(
150,
Expand All @@ -133,6 +135,8 @@ def test_find_percentile():
100.0,
10,
"red",
"outstanding",
"outstanding",
),
(
15,
Expand All @@ -142,11 +146,32 @@ def test_find_percentile():
30.0,
3,
"green",
"outstanding",
"outstanding",
),
(
15,
[15, 5, 6, 150, 16, 19, 22, 25, 76, 20],
-4.5,
-23.076923076923077,
30.0,
3,
"green",
"",
"other",
),
],
)
def test_category_stats(
value, data, diff_median, percent_diff, percentile, decile, expected_rag
value,
data,
diff_median,
percent_diff,
percentile,
decile,
expected_rag,
ofsted,
key,
):
category = "Teaching and Teaching support staff_Sub Cat"
data = pd.DataFrame(
Expand All @@ -161,7 +186,7 @@ def test_category_stats(
"Value": value,
"Median": 19.5,
"DiffMedian": diff_median,
"Key": "outstanding",
"Key": key,
"PercentDiff": percent_diff,
"Percentile": percentile,
"Decile": decile,
Expand All @@ -170,6 +195,5 @@ def test_category_stats(

rag_settings = config.rag_category_settings["Teaching and Teaching support staff"]
assert (
rag.category_stats(100000, category, data, "outstanding", rag_settings, 10)
== expected
rag.category_stats(100000, category, data, ofsted, rag_settings, 10) == expected
)

0 comments on commit 03b9d89

Please sign in to comment.