Merge pull request #30 from sensiblecodeio/peter/schema-1.3
Update to version 1.3 of metadata schema
phynes-sensiblecode authored Aug 22, 2022
2 parents 39559c5 + de35dc7 commit b06fa6c
Showing 36 changed files with 819 additions and 734 deletions.
2 changes: 1 addition & 1 deletion CENSUS_METADATA.md
@@ -117,7 +117,7 @@ Cantabular metadata.
 | `name` | `String!` | `Dataset.Dataset_Mnemonic` | |
 | `label` | `String` | `Dataset.Dataset_Title` | `Dataset.Dataset_Title_Welsh` |
 | `description` | `String` | `Dataset.Dataset_Description` | `Dataset.Dataset_Description_Welsh` |
-| `datasetName` | `String` | `Dataset.Pre_Built_Database_Mnemonic` if set, else `Dataset.Source_Database_Mnemonic` | |
+| `datasetName` | `String` | `Dataset.Destination_Pre_Built_Database_Mnemonic` if set, else `Dataset_Variable.Database_Mnemonic` | |
 | `meta` | `TableMetadata!` | Additional data from `Dataset.csv` | |
 | `vars` | `[String!]!` | Table variable names sourced from `Dataset_Variable.csv` (see note below) | |

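The revised mapping is a simple fallback rule. A minimal sketch of it in Python, assuming the relevant `Dataset.csv` and `Dataset_Variable.csv` rows are available as dicts (the function name and arguments are illustrative, not code from the repository):

```python
def dataset_name(dataset_row, dataset_variable_row):
    """Pick the Cantabular dataset name under the 1.3 mapping.

    Use Dataset.Destination_Pre_Built_Database_Mnemonic when it is populated,
    otherwise fall back to Dataset_Variable.Database_Mnemonic.
    """
    return (dataset_row.get('Destination_Pre_Built_Database_Mnemonic')
            or dataset_variable_row['Database_Mnemonic'])
```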
17 changes: 0 additions & 17 deletions FIXUP.md

This file was deleted.

375 changes: 189 additions & 186 deletions README.md

Large diffs are not rendered by default.

7 changes: 7 additions & 0 deletions RELEASE_NOTES.md
@@ -1,6 +1,13 @@
 Release Notes
 =============
 
+1.3.alpha
+---------
+- Updated code to work with metadata schema version 1.3.
+- Removed preprocessing scripts that are no longer necessary.
+- Cantabular version 10.2.0 is now the default version. The file format for version 10.2.0 is
+  identical to all other supported versions except `9.2.0`.
+
 1.2.epsilon
 -----------
 - The code to process the geography lookup file expects lowercase file suffixes `cd`, `nm` and `nmw`.
30 changes: 0 additions & 30 deletions REMOVE_EMPTY_ROWS_AND_COLUMNS.md

This file was deleted.

87 changes: 0 additions & 87 deletions bin/fixup.py

This file was deleted.

45 changes: 40 additions & 5 deletions bin/ons_csv_to_ctb_json_ds_vars.py
@@ -2,13 +2,17 @@
 import logging
 from collections import namedtuple
 
-DatasetVariables = namedtuple('DatasetVariables', 'classifications alternate_geog_variables')
+TABULAR_DATABASE_TYPE = 'AGGDATA'
+
+DatasetVariables = namedtuple('DatasetVariables',
+                              'classifications alternate_geog_variables databases')
 
 
 class DatasetVarsBuilder():
     """Utility class to validate and build dataset variables."""
 
-    def __init__(self, dataset_mnemonic, filename, all_classifications, recoverable_error):
+    def __init__(self, dataset_mnemonic, filename, all_classifications, all_databases,
+                 recoverable_error):
         """Initialise DatasetVarsBuilder object."""
         self.lowest_geog_variable = None
         self.alternate_geog_variables = []
@@ -17,7 +21,9 @@ def __init__(self, dataset_mnemonic, filename, all_classifications, recoverable_
         self.dataset_mnemonic = dataset_mnemonic
         self.filename = filename
         self.all_classifications = all_classifications
+        self.all_databases = all_databases
         self.recoverable_error = recoverable_error
+        self.databases = set()
 
     def add_geographic_variable(self, variable, row_num):
         """Add geographic variable ensuring data integrity."""
@@ -41,9 +47,19 @@ def add_geographic_variable(self, variable, row_num):
                                        f'{self.lowest_geog_variable} for dataset '
                                        f'{self.dataset_mnemonic}')
             else:
-                self.lowest_geog_variable = variable['Variable_Mnemonic']
+                self.lowest_geog_variable = variable_mnemonic
         else:
-            self.alternate_geog_variables.append(variable['Variable_Mnemonic'])
+            self.alternate_geog_variables.append(variable_mnemonic)
+
+        database_mnemonic = variable['Database_Mnemonic']
+        database = self.all_databases[database_mnemonic]
+        if variable_mnemonic not in database.private['Classifications']:
+            self.recoverable_error(
+                f'Reading {self.filename}:{row_num} '
+                f'{self.dataset_mnemonic} has geographic variable {variable_mnemonic} '
+                f'that is not in database {database_mnemonic}')
+
+        self._add_database(database_mnemonic, row_num)
 
     def add_non_geographic_variable(self, variable, row_num):
         """Add non-geographic variable ensuring data integrity."""
@@ -81,6 +97,25 @@ def add_non_geographic_variable(self, variable, row_num):
         self.classifications.append(variable['Classification_Mnemonic'])
         self.processing_priorities.append(int(variable['Processing_Priority']))
 
+        database_mnemonic = variable['Database_Mnemonic']
+        database = self.all_databases[database_mnemonic]
+        if classification_mnemonic not in database.private['Classifications']:
+            self.recoverable_error(
+                f'Reading {self.filename}:{row_num} '
+                f'{self.dataset_mnemonic} has classification {classification_mnemonic} '
+                f'that is not in database {database_mnemonic}')
+
+        self._add_database(database_mnemonic, row_num)
+
+    def _add_database(self, database_mnemonic, row_num):
+        database = self.all_databases[database_mnemonic]
+        if database.private['Database_Type_Code'] == TABULAR_DATABASE_TYPE:
+            self.recoverable_error(
+                f'Reading {self.filename}:{row_num} {self.dataset_mnemonic} '
+                f'has Database_Mnemonic {database_mnemonic} which has invalid '
+                f'Database_Type_Code: {TABULAR_DATABASE_TYPE}')
+        self.databases.add(database_mnemonic)
+
     def dataset_variables(self):
         """Return dataset classifications and alternate geographic variables for each dataset."""
         if self.alternate_geog_variables and not self.lowest_geog_variable:
@@ -102,4 +137,4 @@ def dataset_variables():
 
         geo_vars = sorted(self.alternate_geog_variables) if self.alternate_geog_variables else None
 
-        return DatasetVariables(classifications, geo_vars)
+        return DatasetVariables(classifications, geo_vars, sorted(self.databases))
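Taken together, the new checks require that every classification or geographic variable listed on a `Dataset_Variable.csv` row belongs to that row's `Database_Mnemonic`, and that the database is not a tabular (`AGGDATA`) database. A small standalone sketch of just those two rules, with made-up database records (the `Database` namedtuple, the `MICRODATA` type code and the sample mnemonics are illustrative, not taken from the repository):

```python
from collections import namedtuple

# Illustrative stand-in for the loaded Database.csv metadata: the real loader
# wraps each database in an object exposing a `private` dict.
Database = namedtuple('Database', 'private')

TABULAR_DATABASE_TYPE = 'AGGDATA'

all_databases = {
    'UR': Database(private={'Database_Type_Code': 'MICRODATA',
                            'Classifications': ['sex', 'age_5a', 'region']}),
    'UR_AGG': Database(private={'Database_Type_Code': 'AGGDATA',
                                'Classifications': ['sex']}),
}


def check_database(dataset_mnemonic, database_mnemonic, classification_mnemonic):
    """Report the two validation failures introduced by this commit (sketch only)."""
    database = all_databases[database_mnemonic]
    if classification_mnemonic not in database.private['Classifications']:
        print(f'{dataset_mnemonic} has classification {classification_mnemonic} '
              f'that is not in database {database_mnemonic}')
    if database.private['Database_Type_Code'] == TABULAR_DATABASE_TYPE:
        print(f'{dataset_mnemonic} has Database_Mnemonic {database_mnemonic} which has '
              f'invalid Database_Type_Code: {TABULAR_DATABASE_TYPE}')


check_database('TS001', 'UR', 'age_5a')      # no output: both checks pass
check_database('TS001', 'UR_AGG', 'age_5a')  # prints both error messages
```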