
Commit 32bad86
Merge pull request #16 from sensiblecodeio/v1.1.beta
V1.1.beta
phynes-sensiblecode authored May 11, 2022
2 parents b6c7535 + 537d05b commit 32bad86
Showing 57 changed files with 1,527 additions and 480 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/ci-test.yml
@@ -32,7 +32,7 @@ jobs:
           pydocstyle bin/*.py
       - name: Run pylint
         run: |
-          pylint --max-locals=20 --max-public-methods=30 --max-branches=50 --max-statements=90 --min-similarity-lines=6 --disable=W1202 bin/*.py
+          pylint --max-locals=20 --max-public-methods=30 --max-branches=50 --max-statements=90 --min-similarity-lines=6 --max-module-lines=1200 --max-locals=22 --max-attributes=10 --disable=W1202 bin/*.py
       - name: Run tests
         run: |
           PYTHONPATH=test:bin python3 -m unittest -v
303 changes: 221 additions & 82 deletions README.md

Large diffs are not rendered by default.

10 changes: 10 additions & 0 deletions RELEASE_NOTES.md
@@ -1,6 +1,16 @@
 Release Notes
 =============
 
+1.1.beta
+--------
+- Added `--best-effort` flag to discard invalid data and make a best-effort
+  attempt to generate output files.
+  - This replaces the `fixup.py` script.
+- Formatted and customizable output filenames.
+- Support for Cantabular version 9.2.0 formatting.
+- Reworked mandatory fields.
+- Added 2011 1% sample metadata.
+
 1.1.alpha
 ---------
 - Updated code to work with metadata schema version 1.1.
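For context, the new `--best-effort` flag is passed on the converter's command line. A hypothetical invocation is sketched below; the entry-point name and the `-i`/`-o` options are assumptions, since the main script does not appear in this diff:

    python3 bin/ons_csv_to_ctb_json_main.py -i source_csvs/ -o ctb_metadata_files/ --best-effort

With the flag set, validation failures are reported as recoverable errors and the offending records are dropped or given safe defaults, rather than aborting the run.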
176 changes: 0 additions & 176 deletions bin/fixup.py

This file was deleted.

88 changes: 50 additions & 38 deletions bin/ons_csv_to_ctb_json_ds_vars.py
@@ -1,4 +1,5 @@
 """Build data structure that represents relationship between dataset and variables."""
+import logging
 from collections import namedtuple
 
 DatasetVariables = namedtuple('DatasetVariables', 'classifications alternate_geog_variables')
@@ -7,7 +8,7 @@
 class DatasetVarsBuilder():
     """Utility class to validate and build dataset variables."""
 
-    def __init__(self, dataset_mnemonic, filename, all_classifications):
+    def __init__(self, dataset_mnemonic, filename, all_classifications, recoverable_error):
         """Initialise DatasetVarsBuilder object."""
         self.lowest_geog_variable = None
         self.alternate_geog_variables = []
@@ -16,71 +17,82 @@ def __init__(self, dataset_mnemonic, filename, all_classifications):
         self.dataset_mnemonic = dataset_mnemonic
         self.filename = filename
         self.all_classifications = all_classifications
+        self.recoverable_error = recoverable_error
 
-    def add_geographic_variable(self, variable):
+    def add_geographic_variable(self, variable, row_num):
         """Add geographic variable ensuring data integrity."""
         variable_mnemonic = variable['Variable_Mnemonic']
         classification_mnemonic = variable['Classification_Mnemonic']
         if classification_mnemonic:
-            raise ValueError(f'Reading {self.filename} '
-                             'Classification_Mnemonic must not be specified for '
-                             f'geographic variable {variable_mnemonic} in dataset '
-                             f'{self.dataset_mnemonic}')
+            self.recoverable_error(f'Reading {self.filename}:{row_num} '
+                                   'Classification_Mnemonic must not be specified for '
+                                   f'geographic variable {variable_mnemonic} in dataset '
+                                   f'{self.dataset_mnemonic}')
         if variable['Processing_Priority']:
-            raise ValueError(f'Reading {self.filename} '
-                             'Processing_Priority must not be specified for geographic'
-                             f' variable {variable_mnemonic} in dataset '
-                             f'{self.dataset_mnemonic}')
+            self.recoverable_error(f'Reading {self.filename}:{row_num} '
+                                   'Processing_Priority must not be specified for geographic'
+                                   f' variable {variable_mnemonic} in dataset '
+                                   f'{self.dataset_mnemonic}')
         if variable['Lowest_Geog_Variable_Flag'] == 'Y':
             if self.lowest_geog_variable:
-                raise ValueError(f'Reading {self.filename} '
-                                 'Lowest_Geog_Variable_Flag set on variable '
-                                 f'{variable_mnemonic} and '
-                                 f'{self.lowest_geog_variable} for dataset '
-                                 f'{self.dataset_mnemonic}')
-            self.lowest_geog_variable = variable['Variable_Mnemonic']
+                self.recoverable_error(f'Reading {self.filename}:{row_num} '
+                                       'Lowest_Geog_Variable_Flag set on variable '
+                                       f'{variable_mnemonic} and '
+                                       f'{self.lowest_geog_variable} for dataset '
+                                       f'{self.dataset_mnemonic}')
+            else:
+                self.lowest_geog_variable = variable['Variable_Mnemonic']
         else:
             self.alternate_geog_variables.append(variable['Variable_Mnemonic'])
 
-    def add_non_geographic_variable(self, variable):
+    def add_non_geographic_variable(self, variable, row_num):
         """Add non-geographic variable ensuring data integrity."""
         variable_mnemonic = variable['Variable_Mnemonic']
         classification_mnemonic = variable['Classification_Mnemonic']
         if not classification_mnemonic:
-            raise ValueError(f'Reading {self.filename} '
-                             'Classification must be specified for non-geographic '
-                             f'{variable_mnemonic} in dataset {self.dataset_mnemonic}')
+            self.recoverable_error(f'Reading {self.filename}:{row_num} '
+                                   'Classification must be specified for non-geographic '
+                                   f'{variable_mnemonic} in dataset {self.dataset_mnemonic}')
+            logging.warning(f'Reading {self.filename}:{row_num} dropping record')
+            return
 
         if variable['Lowest_Geog_Variable_Flag'] == 'Y':
-            raise ValueError(f'Reading {self.filename} '
-                             'Lowest_Geog_Variable_Flag set on non-geographic variable'
-                             f' {variable_mnemonic} for dataset {self.dataset_mnemonic}')
+            self.recoverable_error(f'Reading {self.filename}:{row_num} '
+                                   'Lowest_Geog_Variable_Flag set on non-geographic variable '
+                                   f'{variable_mnemonic} for dataset {self.dataset_mnemonic}')
 
         classification = self.all_classifications[classification_mnemonic]
         if classification.private['Variable_Mnemonic'] != variable_mnemonic:
-            raise ValueError(f'Reading {self.filename} Invalid '
-                             f'classification {classification_mnemonic} '
-                             f'specified for variable {variable_mnemonic} '
-                             f'in dataset {self.dataset_mnemonic}')
+            self.recoverable_error(f'Reading {self.filename}:{row_num} Invalid '
+                                   f'classification {classification_mnemonic} '
+                                   f'specified for variable {variable_mnemonic} '
+                                   f'in dataset {self.dataset_mnemonic}')
+            logging.warning(f'Reading {self.filename}:{row_num} dropping record')
+            return
 
         if not variable['Processing_Priority']:
-            raise ValueError(f'Reading {self.filename} '
-                             'Processing_Priority not specified for classification '
-                             f'{classification_mnemonic} in dataset '
-                             f'{self.dataset_mnemonic}')
+            self.recoverable_error(f'Reading {self.filename}:{row_num} '
+                                   'Processing_Priority not specified for classification '
+                                   f'{classification_mnemonic} in dataset '
+                                   f'{self.dataset_mnemonic}')
+            logging.warning(f'Reading {self.filename}:{row_num} using 0 for Processing_Priority')
+            variable['Processing_Priority'] = 0
 
         self.classifications.append(variable['Classification_Mnemonic'])
         self.processing_priorities.append(int(variable['Processing_Priority']))
 
     def dataset_variables(self):
         """Return dataset classifications and alternate geographic variables for each dataset."""
         if self.alternate_geog_variables and not self.lowest_geog_variable:
-            raise ValueError(f'Reading {self.filename} '
-                             'Lowest_Geog_Variable_Flag not set on any geographic variables '
-                             f'for dataset {self.dataset_mnemonic}')
+            self.recoverable_error(f'Reading {self.filename} '
+                                   'Lowest_Geog_Variable_Flag not set on any geographic variables '
+                                   f'for dataset {self.dataset_mnemonic}')
 
         if set(self.processing_priorities) != set(range(1, len(self.processing_priorities) + 1)):
-            raise ValueError(f'Reading {self.filename} '
-                             'Invalid processing_priorities '
-                             f'{self.processing_priorities} for dataset '
-                             f'{self.dataset_mnemonic}')
+            self.recoverable_error(f'Reading {self.filename} '
+                                   'Invalid processing_priorities '
+                                   f'{self.processing_priorities} for dataset '
+                                   f'{self.dataset_mnemonic}')
 
         classifications = [c for _, c in sorted(zip(self.processing_priorities,
                                                     self.classifications))]
8 changes: 3 additions & 5 deletions bin/ons_csv_to_ctb_json_geo.py
@@ -40,13 +40,11 @@ def read_geo_cats(filename):
     var_to_columns = assign_columns_to_variables(filename, fieldnames)
     data = {var_name: {} for var_name in var_to_columns}
 
-    for row in reader:
+    for row_num, row in enumerate(reader, 2):
         if len(row) > len(fieldnames):
-            raise ValueError(f'Reading {filename}: too many fields on line '
-                             f'{reader.line_num}')
+            raise ValueError(f'Reading {filename}: too many fields on row {row_num}')
         if len(row) < len(fieldnames):
-            raise ValueError(f'Reading {filename}: too few fields on line '
-                             f'{reader.line_num}')
+            raise ValueError(f'Reading {filename}: too few fields on row {row_num}')
 
         for geo, columns in var_to_columns.items():
             code = row[columns.code].strip()
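Switching from `reader.line_num` to `enumerate(reader, 2)` reports logical CSV rows (the header being row 1) rather than physical file lines; the two diverge when a quoted field contains an embedded newline. A small self-contained illustration (hypothetical data, not from this repository):

    import csv
    import io

    # Row 2's second field spans two physical lines but is one logical row.
    raw = 'code,name\nE1,"first\nrecord"\nE2,second\n'
    reader = csv.reader(io.StringIO(raw))
    next(reader)  # consume the header (row 1)

    for row_num, row in enumerate(reader, 2):
        # row_num counts records; reader.line_num counts physical lines.
        print(row_num, reader.line_num, row)
    # 2 3 ['E1', 'first\nrecord']
    # 3 4 ['E2', 'second']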