Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: ofsted data points optional for gias submission #1794

Merged
merged 1 commit into from
Jan 23, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
39 changes: 1 addition & 38 deletions data-pipeline/src/pipeline/input_schemas/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,44 +10,7 @@
workforce_census_header_row,
workforce_census_index_col,
)

gias_index_col = "URN"
gias = {
"URN": "Int64",
"UKPRN": "Int64",
"LA (code)": "Int64",
"LA (name)": "string",
"EstablishmentNumber": "Int64",
"EstablishmentName": "string",
"TypeOfEstablishment (code)": "Int64",
"TypeOfEstablishment (name)": "string",
"OpenDate": "string",
"CloseDate": "string",
"PhaseOfEducation (code)": "Int64",
"PhaseOfEducation (name)": "string",
"Boarders (code)": "Int64",
"Boarders (name)": "string",
"NurseryProvision (name)": "string",
"OfficialSixthForm (code)": "Int64",
"OfficialSixthForm (name)": "string",
"AdmissionsPolicy (code)": "Int64",
"AdmissionsPolicy (name)": "string",
"OfstedLastInsp": "string",
"Postcode": "string",
"SchoolWebsite": "string",
"TelephoneNum": "string",
"GOR (name)": "string",
"OfstedRating (name)": "string",
"MSOA (code)": "string",
"LSOA (code)": "string",
"StatutoryLowAge": "Int64",
"StatutoryHighAge": "Int64",
"Street": "string",
"Locality": "string",
"Address3": "string",
"Town": "string",
"County (name)": "string",
}
from .gias import gias, gias_index_col

gias_links_index_col = "URN"
gias_links = {
Expand Down
73 changes: 73 additions & 0 deletions data-pipeline/src/pipeline/input_schemas/gias.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
# Column used as the DataFrame index when GIAS data is read.
gias_index_col = "URN"

# GIAS column -> pandas dtype, keyed by submission year. Readers look up the
# year and fall back to the "default" entry when that year has no entry.
gias = {
    "default": {
        "URN": "Int64",
        "UKPRN": "Int64",
        "LA (code)": "Int64",
        "LA (name)": "string",
        "EstablishmentNumber": "Int64",
        "EstablishmentName": "string",
        "TypeOfEstablishment (code)": "Int64",
        "TypeOfEstablishment (name)": "string",
        "OpenDate": "string",
        "CloseDate": "string",
        "PhaseOfEducation (code)": "Int64",
        "PhaseOfEducation (name)": "string",
        "Boarders (code)": "Int64",
        "Boarders (name)": "string",
        "NurseryProvision (name)": "string",
        "OfficialSixthForm (code)": "Int64",
        "OfficialSixthForm (name)": "string",
        "AdmissionsPolicy (code)": "Int64",
        "AdmissionsPolicy (name)": "string",
        "OfstedLastInsp": "string",
        "Postcode": "string",
        "SchoolWebsite": "string",
        "TelephoneNum": "string",
        "GOR (name)": "string",
        "OfstedRating (name)": "string",
        "MSOA (code)": "string",
        "LSOA (code)": "string",
        "StatutoryLowAge": "Int64",
        "StatutoryHighAge": "Int64",
        "Street": "string",
        "Locality": "string",
        "Address3": "string",
        "Town": "string",
        "County (name)": "string",
    },
}

# The 2024 schema is the default schema without the two Ofsted columns;
# deriving it keeps the remaining columns, dtypes and ordering in step.
gias[2024] = {
    column: dtype
    for column, dtype in gias["default"].items()
    if column not in ("OfstedLastInsp", "OfstedRating (name)")
}
2 changes: 1 addition & 1 deletion data-pipeline/src/pipeline/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -181,7 +181,7 @@ def pre_process_schools(run_type: str, year: int, run_id: str) -> pd.DataFrame:
raw_container, f"{run_type}/{year}/gias_links.csv", encoding="cp1252"
)

schools = prepare_schools_data(gias_data, gias_links_data)
schools = prepare_schools_data(gias_data, gias_links_data, year)

write_blob(
"pre-processed",
Expand Down
47 changes: 41 additions & 6 deletions data-pipeline/src/pipeline/pre_processing/ancillary/schools.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,20 @@
import pipeline.mappings as mappings


def prepare_schools_data(base_data_path, links_data_path):
def prepare_schools_data(base_data_path, links_data_path, year: int):
"""
Prepare school data derived from GIAS and GIAS links.

:param base_data_path: readable source for GIAS
:param links_data_path: readable source for GIAS links
:param year: financial year in question
"""
gias = pd.read_csv(
base_data_path,
encoding="cp1252",
index_col=input_schemas.gias_index_col,
usecols=input_schemas.gias.keys(),
dtype=input_schemas.gias,
usecols=input_schemas.gias.get(year, input_schemas.gias["default"]).keys(),
dtype=input_schemas.gias.get(year, input_schemas.gias["default"]),
)

gias_links = pd.read_csv(
Expand Down Expand Up @@ -39,9 +46,7 @@ def prepare_schools_data(base_data_path, links_data_path):
gias["Boarders (name)"].fillna("").map(mappings.map_boarders)
)

gias["OfstedRating (name)"] = (
gias["OfstedRating (name)"].fillna("").map(mappings.map_ofsted_rating)
)
gias = _optional_ofsted_cols(gias)

gias["TypeOfEstablishment (name)"] = (
gias["TypeOfEstablishment (name)"].fillna("").map(lambda x: x.strip())
Expand Down Expand Up @@ -93,3 +98,33 @@ def prepare_schools_data(base_data_path, links_data_path):
return schools[(schools["Rank"] == 1) | (schools["Rank"].isna())].drop(
columns=["LinkURN", "LinkName", "LinkType", "LinkEstablishedDate", "Rank"]
)


def _optional_ofsted_cols(df: pd.DataFrame) -> pd.DataFrame:
    """
    Guarantee that the "OfstedRating (name)" and "OfstedLastInsp" columns exist.

    Submissions may omit the Ofsted columns. Both are required to write to the
    db, and "OfstedRating (name)" is also required for rag calculations, so
    any that are missing are created here with defaults.

    - "OfstedRating (name)": existing values have nulls replaced by "" and are
      then passed through `mappings.map_ofsted_rating`; when the column is
      absent, a column of empty strings goes through the same mapping.
    - "OfstedLastInsp": kept as-is when present, otherwise filled with `None`.

    :param df: The GIAS DataFrame to process; the columns are assigned on this
        object directly.

    :return: The same DataFrame with both Ofsted columns present.
    """
    rating_col = "OfstedRating (name)"
    inspection_col = "OfstedLastInsp"
    row_count = len(df)

    # Fallback used only when the submission carries no rating column.
    blank_ratings = pd.Series([""] * row_count, index=df.index)
    ratings = df.get(rating_col, blank_ratings)
    df[rating_col] = ratings.fillna("").map(mappings.map_ofsted_rating)

    # Fallback used only when the submission carries no inspection date column.
    no_inspections = pd.Series([None] * row_count, index=df.index)
    df[inspection_col] = df.get(inspection_col, no_inspections)

    return df
2 changes: 1 addition & 1 deletion data-pipeline/tests/unit/pre_processing/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -532,7 +532,7 @@ def prepared_schools_data(
gias_data: pd.DataFrame, gias_links: pd.DataFrame
) -> pd.DataFrame:
return prepare_schools_data(
StringIO(gias_data.to_csv()), StringIO(gias_links.to_csv())
StringIO(gias_data.to_csv()), StringIO(gias_links.to_csv()), 2023
)


Expand Down
79 changes: 79 additions & 0 deletions data-pipeline/tests/unit/pre_processing/test_gias.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,10 @@
from io import StringIO

import pandas as pd
import pytest

from pipeline.pre_processing import prepare_schools_data


def test_prepare_school_data_has_correct_output_columns(
prepared_schools_data: pd.DataFrame,
Expand Down Expand Up @@ -45,6 +49,81 @@ def test_prepare_school_data_has_correct_output_columns(
]


def test_prepare_school_data_has_correct_output_columns_without_ofsted_cols(
    gias_data, gias_links
):
    """
    For 2024 submissions GIAS data will not include "OfstedRating (name)" or
    "OfstedLastInsp". This test confirms these columns are still created, as
    they are required for downstream processing.
    """
    expected_columns = [
        "UKPRN",
        "LA (code)",
        "LA (name)",
        "EstablishmentNumber",
        "EstablishmentName",
        "TypeOfEstablishment (code)",
        "TypeOfEstablishment (name)",
        "OpenDate",
        "CloseDate",
        "PhaseOfEducation (code)",
        "PhaseOfEducation (name)",
        "Boarders (code)",
        "Boarders (name)",
        "NurseryProvision (name)",
        "OfficialSixthForm (code)",
        "OfficialSixthForm (name)",
        "AdmissionsPolicy (code)",
        "AdmissionsPolicy (name)",
        "Postcode",
        "SchoolWebsite",
        "TelephoneNum",
        "GOR (name)",
        "MSOA (code)",
        "LSOA (code)",
        "StatutoryLowAge",
        "StatutoryHighAge",
        "Street",
        "Locality",
        "Address3",
        "Town",
        "County (name)",
        "LA Establishment Number",
        "OfstedRating (name)",
        "OfstedLastInsp",
        "Has Nursery",
        "Has Sixth Form",
    ]

    # Simulate a submission that carries no Ofsted data points.
    ofsted_columns = ["OfstedRating (name)", "OfstedLastInsp"]
    gias_csv = gias_data.drop(columns=ofsted_columns).to_csv()
    links_csv = gias_links.to_csv()

    actual = prepare_schools_data(StringIO(gias_csv), StringIO(links_csv), 2024)

    assert list(actual.columns) == expected_columns


def test_prepare_school_data_has_correct_output_ofsted_values_without_submission(
    gias_data, gias_links
):
    """
    For 2024 submissions GIAS data will not include "OfstedRating (name)" or
    "OfstedLastInsp". These columns should still be created as they are
    required for downstream processing; this test confirms they are populated
    with default values.
    """
    # Simulate a submission that carries no Ofsted data points.
    ofsted_columns = ["OfstedRating (name)", "OfstedLastInsp"]
    gias_csv = gias_data.drop(columns=ofsted_columns).to_csv()
    links_csv = gias_links.to_csv()

    actual = prepare_schools_data(StringIO(gias_csv), StringIO(links_csv), 2024)

    ratings = actual["OfstedRating (name)"]
    last_inspections = actual["OfstedLastInsp"]

    assert (ratings == "").all()
    assert last_inspections.isna().all()


def test_la_establishment_number_computed(prepared_schools_data: pd.DataFrame):
assert prepared_schools_data.loc[100150]["LA Establishment Number"] == "201-3614"

Expand Down
34 changes: 29 additions & 5 deletions data-pipeline/tests/unit/rag/test_rag.py
Original file line number Diff line number Diff line change
Expand Up @@ -114,7 +114,7 @@ def test_find_percentile():


@pytest.mark.parametrize(
"value,data,diff_median,percent_diff,percentile,decile,expected_rag",
"value,data,diff_median,percent_diff,percentile,decile,expected_rag,ofsted,key",
[
(
20,
Expand All @@ -124,6 +124,8 @@ def test_find_percentile():
60.0,
6,
"amber",
"outstanding",
"outstanding",
),
(
150,
Expand All @@ -133,6 +135,8 @@ def test_find_percentile():
100.0,
10,
"red",
"outstanding",
"outstanding",
),
(
15,
Expand All @@ -142,11 +146,32 @@ def test_find_percentile():
30.0,
3,
"green",
"outstanding",
"outstanding",
),
(
15,
[15, 5, 6, 150, 16, 19, 22, 25, 76, 20],
-4.5,
-23.076923076923077,
30.0,
3,
"green",
"",
"other",
),
],
)
def test_category_stats(
value, data, diff_median, percent_diff, percentile, decile, expected_rag
value,
data,
diff_median,
percent_diff,
percentile,
decile,
expected_rag,
ofsted,
key,
):
category = "Teaching and Teaching support staff_Sub Cat"
data = pd.DataFrame(
Expand All @@ -161,7 +186,7 @@ def test_category_stats(
"Value": value,
"Median": 19.5,
"DiffMedian": diff_median,
"Key": "outstanding",
"Key": key,
"PercentDiff": percent_diff,
"Percentile": percentile,
"Decile": decile,
Expand All @@ -170,6 +195,5 @@ def test_category_stats(

rag_settings = config.rag_category_settings["Teaching and Teaching support staff"]
assert (
rag.category_stats(100000, category, data, "outstanding", rag_settings, 10)
== expected
rag.category_stats(100000, category, data, ofsted, rag_settings, 10) == expected
)