From 81108b4302ddb05bc7fbd6f00fefc1ad35a7ccf0 Mon Sep 17 00:00:00 2001
From: Peter Hynes
Date: Mon, 25 Apr 2022 17:45:21 +0100
Subject: [PATCH 01/15] Formatted filenames

---
 README.md                       | 180 +++++++++++++++++++-------------
 bin/ons_csv_to_ctb_json_main.py |  54 +++++++++-
 test/test_contact.py            |   2 +-
 test/test_integration.py        |  36 +++++--
 4 files changed, 189 insertions(+), 83 deletions(-)

diff --git a/README.md b/README.md
index 70e2353..95ed808 100644
--- a/README.md
+++ b/README.md
@@ -34,45 +34,84 @@ Basic logging will be displayed by default, including the number of high-level
Cantabular objects loaded and the names of the output files.
```
> python3 bin/ons_csv_to_ctb_json_main.py -i test/testdata/ -g test/testdata/geography/geography.csv -o ctb_metadata_files/
-t=2022-04-21 14:26:43,977 lvl=INFO msg=Reading test/testdata/geography/geography.csv: found Welsh labels for unknown classification: OTHER
-t=2022-04-21 14:26:43,977 lvl=INFO msg=Dropped non public classification: CLASS_PRIV
-t=2022-04-21 14:26:43,977 lvl=INFO msg=Loaded metadata for 5 Cantabular variables
-t=2022-04-21 14:26:43,977 lvl=INFO msg=Loaded metadata for 3 Cantabular datasets
-t=2022-04-21 14:26:43,979 lvl=INFO msg=Written dataset metadata file to: ctb_metadata_files/dataset-metadata.json
-t=2022-04-21 14:26:43,979 lvl=INFO msg=Dropped non public ONS Dataset: DS_PRIV
-t=2022-04-21 14:26:43,979 lvl=INFO msg=Loaded metadata for 4 Cantabular tables
-t=2022-04-21 14:26:43,980 lvl=INFO msg=Written table metadata file to: ctb_metadata_files/table-metadata.json
-t=2022-04-21 14:26:43,980 lvl=INFO msg=Loaded service metadata
-t=2022-04-21 14:26:43,980 lvl=INFO msg=Written service metadata file to: ctb_metadata_files/service-metadata.json
+t=2022-04-28 15:21:06,357 lvl=INFO msg=Reading test/testdata/geography/geography.csv: found Welsh labels for unknown classification: OTHER
+t=2022-04-28 15:21:06,357 lvl=INFO msg=Dropped non public classification: CLASS_PRIV
+t=2022-04-28 15:21:06,357 lvl=INFO msg=Loaded metadata for 5 Cantabular variables
+t=2022-04-28 15:21:06,358 lvl=INFO msg=Loaded metadata for 3 Cantabular datasets
+t=2022-04-28 15:21:06,359 lvl=INFO msg=Written dataset metadata file to: ctb_metadata_files/cantabm_v9-3-0_unknown-metadata-version_dataset-md_20220428-1.json
+t=2022-04-28 15:21:06,360 lvl=INFO msg=Dropped non public ONS Dataset: DS_PRIV
+t=2022-04-28 15:21:06,360 lvl=INFO msg=Loaded metadata for 4 Cantabular tables
+t=2022-04-28 15:21:06,360 lvl=INFO msg=Written table metadata file to: ctb_metadata_files/cantabm_v9-3-0_unknown-metadata-version_tables-md_20220428-1.json
+t=2022-04-28 15:21:06,360 lvl=INFO msg=Loaded service metadata
+t=2022-04-28 15:21:06,360 lvl=INFO msg=Written service metadata file to: ctb_metadata_files/cantabm_v9-3-0_unknown-metadata-version_service-md_20220428-1.json
```

More detailed information can be obtained by running with a `-l DEBUG` flag e.g.:
```
> python3 bin/ons_csv_to_ctb_json_main.py -i test/testdata/ -g test/testdata/geography/geography.csv -o ctb_metadata_files/ -l DEBUG
-t=2022-04-21 14:27:07,830 lvl=DEBUG msg=Creating classification for geographic variable: GEO1
-t=2022-04-21 14:27:07,830 lvl=DEBUG msg=Creating classification for geographic variable: GEO2
-t=2022-04-21 14:27:07,830 lvl=INFO msg=Reading test/testdata/geography/geography.csv: found Welsh labels for unknown classification: OTHER
-t=2022-04-21 14:27:07,830 lvl=DEBUG msg=Loaded metadata for Cantabular variable: CLASS1
-t=2022-04-21 14:27:07,830 lvl=DEBUG msg=Loaded metadata for Cantabular variable: CLASS2
-t=2022-04-21 14:27:07,830 lvl=DEBUG msg=Loaded metadata for Cantabular variable: CLASS3 -t=2022-04-21 14:27:07,830 lvl=INFO msg=Dropped non public classification: CLASS_PRIV -t=2022-04-21 14:27:07,830 lvl=DEBUG msg=Loaded metadata for Cantabular variable: GEO1 -t=2022-04-21 14:27:07,830 lvl=DEBUG msg=Loaded metadata for Cantabular variable: GEO2 -t=2022-04-21 14:27:07,830 lvl=INFO msg=Loaded metadata for 5 Cantabular variables -t=2022-04-21 14:27:07,831 lvl=DEBUG msg=Loaded metadata for Cantabular dataset: DB1 -t=2022-04-21 14:27:07,831 lvl=DEBUG msg=Loaded metadata for Cantabular dataset: DB2 -t=2022-04-21 14:27:07,831 lvl=DEBUG msg=Loaded metadata for Cantabular dataset: DB3 -t=2022-04-21 14:27:07,831 lvl=INFO msg=Loaded metadata for 3 Cantabular datasets -t=2022-04-21 14:27:07,832 lvl=INFO msg=Written dataset metadata file to: ctb_metadata_files/dataset-metadata.json -t=2022-04-21 14:27:07,833 lvl=DEBUG msg=Loaded metadata for Cantabular table: DS1 -t=2022-04-21 14:27:07,833 lvl=DEBUG msg=Loaded metadata for Cantabular table: DS2 -t=2022-04-21 14:27:07,833 lvl=DEBUG msg=Loaded metadata for Cantabular table: DS3 -t=2022-04-21 14:27:07,833 lvl=INFO msg=Dropped non public ONS Dataset: DS_PRIV -t=2022-04-21 14:27:07,833 lvl=DEBUG msg=Loaded metadata for Cantabular table: DS4 -t=2022-04-21 14:27:07,833 lvl=INFO msg=Loaded metadata for 4 Cantabular tables -t=2022-04-21 14:27:07,833 lvl=INFO msg=Written table metadata file to: ctb_metadata_files/table-metadata.json -t=2022-04-21 14:27:07,833 lvl=INFO msg=Loaded service metadata -t=2022-04-21 14:27:07,834 lvl=INFO msg=Written service metadata file to: ctb_metadata_files/service-metadata.json +t=2022-04-28 15:25:04,409 lvl=DEBUG msg=Creating classification for geographic variable: GEO1 +t=2022-04-28 15:25:04,409 lvl=DEBUG msg=Creating classification for geographic variable: GEO2 +t=2022-04-28 15:25:04,409 lvl=INFO msg=Reading test/testdata/geography/geography.csv: found Welsh labels for unknown classification: OTHER +t=2022-04-28 15:25:04,409 lvl=DEBUG msg=Loaded metadata for Cantabular variable: CLASS1 +t=2022-04-28 15:25:04,409 lvl=DEBUG msg=Loaded metadata for Cantabular variable: CLASS2 +t=2022-04-28 15:25:04,409 lvl=DEBUG msg=Loaded metadata for Cantabular variable: CLASS3 +t=2022-04-28 15:25:04,409 lvl=INFO msg=Dropped non public classification: CLASS_PRIV +t=2022-04-28 15:25:04,409 lvl=DEBUG msg=Loaded metadata for Cantabular variable: GEO1 +t=2022-04-28 15:25:04,409 lvl=DEBUG msg=Loaded metadata for Cantabular variable: GEO2 +t=2022-04-28 15:25:04,409 lvl=INFO msg=Loaded metadata for 5 Cantabular variables +t=2022-04-28 15:25:04,410 lvl=DEBUG msg=Loaded metadata for Cantabular dataset: DB1 +t=2022-04-28 15:25:04,410 lvl=DEBUG msg=Loaded metadata for Cantabular dataset: DB2 +t=2022-04-28 15:25:04,410 lvl=DEBUG msg=Loaded metadata for Cantabular dataset: DB3 +t=2022-04-28 15:25:04,410 lvl=INFO msg=Loaded metadata for 3 Cantabular datasets +t=2022-04-28 15:25:04,411 lvl=INFO msg=Written dataset metadata file to: ctb_metadata_files/cantabm_v9-3-0_unknown-metadata-version_dataset-md_20220428-1.json +t=2022-04-28 15:25:04,412 lvl=DEBUG msg=Loaded metadata for Cantabular table: DS1 +t=2022-04-28 15:25:04,412 lvl=DEBUG msg=Loaded metadata for Cantabular table: DS2 +t=2022-04-28 15:25:04,412 lvl=DEBUG msg=Loaded metadata for Cantabular table: DS3 +t=2022-04-28 15:25:04,412 lvl=INFO msg=Dropped non public ONS Dataset: DS_PRIV +t=2022-04-28 15:25:04,412 lvl=DEBUG msg=Loaded metadata for Cantabular table: DS4 +t=2022-04-28 15:25:04,412 lvl=INFO 
msg=Loaded metadata for 4 Cantabular tables
+t=2022-04-28 15:25:04,412 lvl=INFO msg=Written table metadata file to: ctb_metadata_files/cantabm_v9-3-0_unknown-metadata-version_tables-md_20220428-1.json
+t=2022-04-28 15:25:04,412 lvl=INFO msg=Loaded service metadata
+t=2022-04-28 15:25:04,412 lvl=INFO msg=Written service metadata file to: ctb_metadata_files/cantabm_v9-3-0_unknown-metadata-version_service-md_20220428-1.json
```

Output file names
-----------------

The output file names are formatted as follows:
```
cantabm_v9-3-0_<metadata master version>_dataset-md_<date>-<build number>.json
cantabm_v9-3-0_<metadata master version>_service-md_<date>-<build number>.json
cantabm_v9-3-0_<metadata master version>_tables-md_<date>-<build number>.json
```

The `prefix`, `metadata master version` and `build number` can be specified using command line
arguments as described in the help text for `ons_csv_to_ctb_json_main.py`:
```
  -p {d,t,tu}, --file_prefix {d,t,tu}
                        Prefix to use in output filenames: d=dev, t=test,
                        tu=tuning (default: no prefix i.e. operational)
  -m METADATA_MASTER_VERSION, --metadata_master_version METADATA_MASTER_VERSION
                        Metadata master version to use in output filenames
                        (default: unknown-metadata-version)
  -b BUILD_NUMBER, --build_number BUILD_NUMBER
                        Build number to use in output filenames (default: 1)

```

For example:
```
> python3 bin/ons_csv_to_ctb_json_main.py -i test/testdata/ -g test/testdata/geography/geography.csv -o ctb_metadata_files/ -p t -m test -b 42
t=2022-04-28 15:28:02,518 lvl=INFO msg=Reading test/testdata/geography/geography.csv: found Welsh labels for unknown classification: OTHER
t=2022-04-28 15:28:02,518 lvl=INFO msg=Dropped non public classification: CLASS_PRIV
t=2022-04-28 15:28:02,518 lvl=INFO msg=Loaded metadata for 5 Cantabular variables
t=2022-04-28 15:28:02,518 lvl=INFO msg=Loaded metadata for 3 Cantabular datasets
t=2022-04-28 15:28:02,519 lvl=INFO msg=Written dataset metadata file to: ctb_metadata_files/t_cantabm_v9-3-0_test_dataset-md_20220428-42.json
t=2022-04-28 15:28:02,520 lvl=INFO msg=Dropped non public ONS Dataset: DS_PRIV
t=2022-04-28 15:28:02,520 lvl=INFO msg=Loaded metadata for 4 Cantabular tables
t=2022-04-28 15:28:02,521 lvl=INFO msg=Written table metadata file to: ctb_metadata_files/t_cantabm_v9-3-0_test_tables-md_20220428-42.json
t=2022-04-28 15:28:02,521 lvl=INFO msg=Loaded service metadata
t=2022-04-28 15:28:02,521 lvl=INFO msg=Written service metadata file to: ctb_metadata_files/t_cantabm_v9-3-0_test_service-md_20220428-42.json
```

Using externally sourced files
@@ -105,49 +144,50 @@ can be found in the `sample_2011` directory.
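As a quick cross-check of the naming scheme, here is a minimal runnable sketch built around the `output_filename` helper that this patch adds to `bin/ons_csv_to_ctb_json_main.py` (the constants are the ones defined there); it reproduces the file names seen in the example run above:
```
from datetime import date

# Constants as defined in bin/ons_csv_to_ctb_json_main.py by this patch.
SYSTEM = 'cantabm'
SYSTEM_SOFTWARE_VERSION = 'v9-3-0'


def output_filename(prefix, metadata_master_version, content_type, todays_date, build_number):
    """Generate output filename."""
    filename = (f'{SYSTEM}_{SYSTEM_SOFTWARE_VERSION}_{metadata_master_version}_{content_type}_'
                f'{todays_date}-{build_number}.json')
    if prefix:
        filename = f'{prefix}_{filename}'

    return filename


# '-p t -m test -b 42' run on 28 April 2022 gives the name from the log above.
print(output_filename('t', 'test', 'dataset-md', date(2022, 4, 28).strftime('%Y%m%d'), 42))
# t_cantabm_v9-3-0_test_dataset-md_20220428-42.json
```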
Use this command to convert the files to JSON (with debugging enabled): ``` -> python3 bin/ons_csv_to_ctb_json_main.py -i sample_2011/ -g sample_2011/geography.csv -o ctb_metadata_files/ -l DEBUG -t=2022-05-03 08:58:06,547 lvl=DEBUG msg=Creating classification for geographic variable: Region -t=2022-05-03 08:58:06,547 lvl=DEBUG msg=Creating classification for geographic variable: Country -t=2022-05-03 08:58:06,548 lvl=DEBUG msg=Loaded metadata for Cantabular variable: Residence Type -t=2022-05-03 08:58:06,548 lvl=DEBUG msg=Loaded metadata for Cantabular variable: Family Composition -t=2022-05-03 08:58:06,548 lvl=DEBUG msg=Loaded metadata for Cantabular variable: Population Base -t=2022-05-03 08:58:06,548 lvl=DEBUG msg=Loaded metadata for Cantabular variable: Sex -t=2022-05-03 08:58:06,548 lvl=DEBUG msg=Loaded metadata for Cantabular variable: Age -t=2022-05-03 08:58:06,548 lvl=DEBUG msg=Loaded metadata for Cantabular variable: Marital Status -t=2022-05-03 08:58:06,548 lvl=DEBUG msg=Loaded metadata for Cantabular variable: Student -t=2022-05-03 08:58:06,548 lvl=DEBUG msg=Loaded metadata for Cantabular variable: Country of Birth -t=2022-05-03 08:58:06,548 lvl=DEBUG msg=Loaded metadata for Cantabular variable: Health -t=2022-05-03 08:58:06,548 lvl=DEBUG msg=Loaded metadata for Cantabular variable: Ethnic Group -t=2022-05-03 08:58:06,548 lvl=DEBUG msg=Loaded metadata for Cantabular variable: Religion -t=2022-05-03 08:58:06,548 lvl=DEBUG msg=Loaded metadata for Cantabular variable: Economic Activity -t=2022-05-03 08:58:06,548 lvl=DEBUG msg=Loaded metadata for Cantabular variable: Occupation -t=2022-05-03 08:58:06,548 lvl=DEBUG msg=Loaded metadata for Cantabular variable: Industry -t=2022-05-03 08:58:06,548 lvl=DEBUG msg=Loaded metadata for Cantabular variable: Hours worked per week -t=2022-05-03 08:58:06,548 lvl=DEBUG msg=Loaded metadata for Cantabular variable: Approximated Social Grade -t=2022-05-03 08:58:06,549 lvl=DEBUG msg=Loaded metadata for Cantabular variable: Region -t=2022-05-03 08:58:06,549 lvl=DEBUG msg=Loaded metadata for Cantabular variable: Country -t=2022-05-03 08:58:06,549 lvl=INFO msg=Loaded metadata for 18 Cantabular variables -t=2022-05-03 08:58:06,549 lvl=DEBUG msg=Loaded metadata for Cantabular dataset: Teaching-Dataset -t=2022-05-03 08:58:06,549 lvl=INFO msg=Loaded metadata for 1 Cantabular datasets -t=2022-05-03 08:58:06,552 lvl=INFO msg=Written dataset metadata file to: ctb_metadata_files/dataset-metadata.json -t=2022-05-03 08:58:06,553 lvl=DEBUG msg=Loaded metadata for Cantabular table: LC2101EW -t=2022-05-03 08:58:06,553 lvl=DEBUG msg=Loaded metadata for Cantabular table: LC1117EW -t=2022-05-03 08:58:06,553 lvl=DEBUG msg=Loaded metadata for Cantabular table: LC2107EW -t=2022-05-03 08:58:06,553 lvl=DEBUG msg=Loaded metadata for Cantabular table: LC6107EW -t=2022-05-03 08:58:06,553 lvl=DEBUG msg=Loaded metadata for Cantabular table: LC6112EW -t=2022-05-03 08:58:06,553 lvl=INFO msg=Loaded metadata for 5 Cantabular tables -t=2022-05-03 08:58:06,554 lvl=INFO msg=Written table metadata file to: ctb_metadata_files/table-metadata.json -t=2022-05-03 08:58:06,554 lvl=INFO msg=Loaded service metadata -t=2022-05-03 08:58:06,554 lvl=INFO msg=Written service metadata file to: ctb_metadata_files/service-metadata.json +> python3 bin/ons_csv_to_ctb_json_main.py -i sample_2011/ -g sample_2011/geography.csv -o ctb_metadata_files/ -m 2001-sample -l DEBUG +t=2022-05-06 12:55:25,674 lvl=DEBUG msg=Creating classification for geographic variable: Region +t=2022-05-06 12:55:25,675 lvl=DEBUG 
msg=Creating classification for geographic variable: Country +t=2022-05-06 12:55:25,676 lvl=DEBUG msg=Loaded metadata for Cantabular variable: Residence Type +t=2022-05-06 12:55:25,676 lvl=DEBUG msg=Loaded metadata for Cantabular variable: Family Composition +t=2022-05-06 12:55:25,676 lvl=DEBUG msg=Loaded metadata for Cantabular variable: Population Base +t=2022-05-06 12:55:25,676 lvl=DEBUG msg=Loaded metadata for Cantabular variable: Sex +t=2022-05-06 12:55:25,676 lvl=DEBUG msg=Loaded metadata for Cantabular variable: Age +t=2022-05-06 12:55:25,676 lvl=DEBUG msg=Loaded metadata for Cantabular variable: Marital Status +t=2022-05-06 12:55:25,676 lvl=DEBUG msg=Loaded metadata for Cantabular variable: Student +t=2022-05-06 12:55:25,676 lvl=DEBUG msg=Loaded metadata for Cantabular variable: Country of Birth +t=2022-05-06 12:55:25,676 lvl=DEBUG msg=Loaded metadata for Cantabular variable: Health +t=2022-05-06 12:55:25,676 lvl=DEBUG msg=Loaded metadata for Cantabular variable: Ethnic Group +t=2022-05-06 12:55:25,676 lvl=DEBUG msg=Loaded metadata for Cantabular variable: Religion +t=2022-05-06 12:55:25,676 lvl=DEBUG msg=Loaded metadata for Cantabular variable: Economic Activity +t=2022-05-06 12:55:25,676 lvl=DEBUG msg=Loaded metadata for Cantabular variable: Occupation +t=2022-05-06 12:55:25,676 lvl=DEBUG msg=Loaded metadata for Cantabular variable: Industry +t=2022-05-06 12:55:25,676 lvl=DEBUG msg=Loaded metadata for Cantabular variable: Hours worked per week +t=2022-05-06 12:55:25,676 lvl=DEBUG msg=Loaded metadata for Cantabular variable: Approximated Social Grade +t=2022-05-06 12:55:25,676 lvl=DEBUG msg=Loaded metadata for Cantabular variable: Region +t=2022-05-06 12:55:25,676 lvl=DEBUG msg=Loaded metadata for Cantabular variable: Country +t=2022-05-06 12:55:25,676 lvl=INFO msg=Loaded metadata for 18 Cantabular variables +t=2022-05-06 12:55:25,677 lvl=DEBUG msg=Loaded metadata for Cantabular dataset: Teaching-Dataset +t=2022-05-06 12:55:25,677 lvl=INFO msg=Loaded metadata for 1 Cantabular datasets +t=2022-05-06 12:55:25,680 lvl=INFO msg=Written dataset metadata file to: ctb_metadata_files/cantabm_v9-3-0_2001-sample_dataset-md_20220506-1.json +t=2022-05-06 12:55:25,681 lvl=DEBUG msg=Loaded metadata for Cantabular table: LC2101EW +t=2022-05-06 12:55:25,681 lvl=DEBUG msg=Loaded metadata for Cantabular table: LC1117EW +t=2022-05-06 12:55:25,681 lvl=DEBUG msg=Loaded metadata for Cantabular table: LC2107EW +t=2022-05-06 12:55:25,681 lvl=DEBUG msg=Loaded metadata for Cantabular table: LC6107EW +t=2022-05-06 12:55:25,681 lvl=DEBUG msg=Loaded metadata for Cantabular table: LC6112EW +t=2022-05-06 12:55:25,681 lvl=INFO msg=Loaded metadata for 5 Cantabular tables +t=2022-05-06 12:55:25,682 lvl=INFO msg=Written table metadata file to: ctb_metadata_files/cantabm_v9-3-0_2001-sample_tables-md_20220506-1.json +t=2022-05-06 12:55:25,682 lvl=INFO msg=Loaded service metadata +t=2022-05-06 12:55:25,682 lvl=INFO msg=Written service metadata file to: ctb_metadata_files/cantabm_v9-3-0_2001-sample_service-md_20220506-1.json ``` Load the JSON files with cantabular-metadata ============================================ -To load the generated JSON files into `cantabular-metadata` (version 9.3.0) run: +To load the generated JSON files into `cantabular-metadata` (version 9.3.0) run the following +commands, substituting the file names and paths as appropriate: ``` cd ctb_metadata_files -CANTABULAR_METADATA_GRAPHQL_TYPES_FILE=metadata.graphql CANTABULAR_METADATA_SERVICE_FILE=service-metadata.json 
CANTABULAR_METADATA_DATASET_FILES=dataset-metadata.json CANTABULAR_METADATA_TABLE_FILES=table-metadata.json /cantabular-metadata +CANTABULAR_METADATA_GRAPHQL_TYPES_FILE=metadata.graphql CANTABULAR_METADATA_SERVICE_FILE=cantabm_v9-3-0_unknown-metadata-version_service-md_20220428-1.json CANTABULAR_METADATA_DATASET_FILES=cantabm_v9-3-0_unknown-metadata-version_dataset-md_20220428-1.json CANTABULAR_METADATA_TABLE_FILES=cantabm_v9-3-0_unknown-metadata-version_tables-md_20220428-1.json /cantabular-metadata ``` The metadata can be queried via a GraphQL interface. By default this is accessible at: diff --git a/bin/ons_csv_to_ctb_json_main.py b/bin/ons_csv_to_ctb_json_main.py index 2e1495f..4a0dc75 100644 --- a/bin/ons_csv_to_ctb_json_main.py +++ b/bin/ons_csv_to_ctb_json_main.py @@ -3,11 +3,18 @@ import os import logging from argparse import ArgumentParser +from datetime import date from ons_csv_to_ctb_json_load import Loader, PUBLIC_SECURITY_MNEMONIC from ons_csv_to_ctb_json_bilingual import BilingualDict, Bilingual VERSION = '1.1.alpha' +SYSTEM = 'cantabm' +SYSTEM_SOFTWARE_VERSION = 'v9-3-0' +FILE_CONTENT_TYPE_DATASET = 'dataset-md' +FILE_CONTENT_TYPE_TABLES = 'tables-md' +FILE_CONTENT_TYPE_SERVICE = 'service-md' + def main(): """ @@ -41,6 +48,24 @@ def main(): choices=['CRITICAL', 'ERROR', 'WARNING', 'INFO', 'DEBUG'], help='Log level (default: %(default)s)') + parser.add_argument('-p', '--file_prefix', + type=str, + choices=['d', 't', 'tu'], + help='Prefix to use in output filenames: d=dev, t=test, tu=tuning ' + '(default: no prefix i.e. operational)') + + parser.add_argument('-m', '--metadata_master_version', + type=str, + default='unknown-metadata-version', + help='Metadata master version to use in output filenames ' + '(default: %(default)s)') + + parser.add_argument('-b', '--build_number', + type=int, + default=1, + help='Build number to use in output filenames ' + '(default: %(default)s)') + args = parser.parse_args() logging.basicConfig(format='t=%(asctime)s lvl=%(levelname)s msg=%(message)s', @@ -50,6 +75,8 @@ def main(): if not os.path.isdir(directory): raise ValueError(f'{directory} does not exist or is not a directory') + todays_date = date.today().strftime('%Y%m%d') + # loader is used to load the metadata from CSV files and convert it to JSON. loader = Loader(args.input_dir, args.geography_file) @@ -58,26 +85,47 @@ def main(): # A Cantabular dataset is equivalent to an ONS database. ctb_variables = build_ctb_variables(loader.classifications, loader.categories) ctb_datasets = build_ctb_datasets(loader.databases, ctb_variables) - filename = os.path.join(args.output_dir, 'dataset-metadata.json') + + base_filename = output_filename(args.file_prefix, args.metadata_master_version, + FILE_CONTENT_TYPE_DATASET, todays_date, + args.build_number) + filename = os.path.join(args.output_dir, base_filename) with open(filename, 'w') as jsonfile: json.dump(ctb_datasets, jsonfile, indent=4) logging.info(f'Written dataset metadata file to: {filename}') # Build Cantabular table objects and write to JSON. 
ctb_tables = build_ctb_tables(loader.datasets) - filename = os.path.join(args.output_dir, 'table-metadata.json') + + base_filename = output_filename(args.file_prefix, args.metadata_master_version, + FILE_CONTENT_TYPE_TABLES, todays_date, args.build_number) + filename = os.path.join(args.output_dir, base_filename) with open(filename, 'w') as jsonfile: json.dump(ctb_tables, jsonfile, indent=4) logging.info(f'Written table metadata file to: {filename}') # Build Cantabular service metadata objects and write to JSON. service_metadata = build_ctb_service_metadata() - filename = os.path.join(args.output_dir, 'service-metadata.json') + + base_filename = output_filename(args.file_prefix, args.metadata_master_version, + FILE_CONTENT_TYPE_SERVICE, todays_date, + args.build_number) + filename = os.path.join(args.output_dir, base_filename) with open(filename, 'w') as jsonfile: json.dump(service_metadata, jsonfile, indent=4) logging.info(f'Written service metadata file to: {filename}') +def output_filename(prefix, metadata_master_version, content_type, todays_date, build_number): + """Generate output filename.""" + filename = (f'{SYSTEM}_{SYSTEM_SOFTWARE_VERSION}_{metadata_master_version}_{content_type}_' + f'{todays_date}-{build_number}.json') + if prefix: + filename = f'{prefix}_{filename}' + + return filename + + def build_ctb_variables(classifications, cat_labels): """ Build Cantabular variable objects. diff --git a/test/test_contact.py b/test/test_contact.py index ac51e6b..f5a2adb 100644 --- a/test/test_contact.py +++ b/test/test_contact.py @@ -36,4 +36,4 @@ def test_duplicate_contact_id(self): if __name__ == '__main__': - unittest.main() \ No newline at end of file + unittest.main() diff --git a/test/test_integration.py b/test/test_integration.py index d2d019c..f922603 100644 --- a/test/test_integration.py +++ b/test/test_integration.py @@ -3,8 +3,17 @@ import unittest import pathlib import os +from datetime import date import ons_csv_to_ctb_json_main +FILENAME_TABLES = 'cantabm_v9-3-0_unknown-metadata-version_tables-md_19700101-1.json' +FILENAME_DATASET = 'cantabm_v9-3-0_unknown-metadata-version_dataset-md_19700101-1.json' +FILENAME_SERVICE = 'cantabm_v9-3-0_unknown-metadata-version_service-md_19700101-1.json' + +FILENAME_TABLES_NO_GEO = 't_cantabm_v9-3-0_no-geo_tables-md_19700101-2.json' +FILENAME_DATASET_NO_GEO = 't_cantabm_v9-3-0_no-geo_dataset-md_19700101-2.json' +FILENAME_SERVICE_NO_GEO = 't_cantabm_v9-3-0_no-geo_service-md_19700101-2.json' + class TestIntegration(unittest.TestCase): def test_directory_validity(self): """Check that a sensible error is raised if the input/output directory is invalid.""" @@ -31,52 +40,61 @@ def test_directory_validity(self): with self.assertRaisesRegex(ValueError, expected_error): ons_csv_to_ctb_json_main.main() - def test_generated_json(self): + @unittest.mock.patch('ons_csv_to_ctb_json_main.date') + def test_generated_json(self, mock_date): """Generate JSON from source CSV and compare it with expected values.""" + mock_date.today.return_value = date(1970, 1, 1) + mock_date.side_effect = lambda *args, **kw: date(*args, **kw) + file_dir = pathlib.Path(__file__).parent.resolve() input_dir = os.path.join(file_dir, 'testdata') output_dir = os.path.join(file_dir, 'out') geo_dir = os.path.join(input_dir, 'geography/geography.csv') with unittest.mock.patch('sys.argv', ['test', '-i', input_dir, '-o', output_dir, '-g', geo_dir]): ons_csv_to_ctb_json_main.main() - with open(os.path.join(output_dir, 'service-metadata.json')) as f: + with open(os.path.join(output_dir, 
FILENAME_SERVICE)) as f: service_metadata = json.load(f) with open(os.path.join(file_dir, 'expected/service-metadata.json')) as f: expected_service_metadata = json.load(f) self.assertEqual(service_metadata, expected_service_metadata) - with open(os.path.join(output_dir, 'dataset-metadata.json')) as f: + with open(os.path.join(output_dir, FILENAME_DATASET)) as f: dataset_metadata = json.load(f) with open(os.path.join(file_dir, 'expected/dataset-metadata.json')) as f: expected_dataset_metadata = json.load(f) self.assertEqual(dataset_metadata, expected_dataset_metadata) - with open(os.path.join(output_dir, 'table-metadata.json')) as f: + with open(os.path.join(output_dir, FILENAME_TABLES)) as f: table_metadata = json.load(f) with open(os.path.join(file_dir, 'expected/table-metadata.json')) as f: expected_table_metadata = json.load(f) self.assertEqual(table_metadata, expected_table_metadata) - def test_no_geography_file(self): + @unittest.mock.patch('ons_csv_to_ctb_json_main.date') + def test_no_geography_file(self, mock_date): """Generate JSON from source CSV and compare it with expected values.""" + mock_date.today.return_value = date(1970, 1, 1) + mock_date.side_effect = lambda *args, **kw: date(*args, **kw) + file_dir = pathlib.Path(__file__).parent.resolve() input_dir = os.path.join(file_dir, 'testdata') output_dir = os.path.join(file_dir, 'out') - with unittest.mock.patch('sys.argv', ['test', '-i', input_dir, '-o', output_dir]): + with unittest.mock.patch('sys.argv', ['test', '-i', input_dir, '-o', output_dir, + '-m', 'no-geo', '-b', '2', '-p', 't']): ons_csv_to_ctb_json_main.main() - with open(os.path.join(output_dir, 'service-metadata.json')) as f: + with open(os.path.join(output_dir, FILENAME_SERVICE_NO_GEO)) as f: service_metadata = json.load(f) with open(os.path.join(file_dir, 'expected/service-metadata.json')) as f: expected_service_metadata = json.load(f) self.assertEqual(service_metadata, expected_service_metadata) - with open(os.path.join(output_dir, 'dataset-metadata.json')) as f: + with open(os.path.join(output_dir, FILENAME_DATASET_NO_GEO)) as f: dataset_metadata = json.load(f) with open(os.path.join(file_dir, 'expected/dataset-metadata-no-geo.json')) as f: expected_dataset_metadata = json.load(f) self.assertEqual(dataset_metadata, expected_dataset_metadata) - with open(os.path.join(output_dir, 'table-metadata.json')) as f: + with open(os.path.join(output_dir, FILENAME_TABLES_NO_GEO)) as f: table_metadata = json.load(f) with open(os.path.join(file_dir, 'expected/table-metadata.json')) as f: expected_table_metadata = json.load(f) From 204bb0997238b58451b32405a776d403123a9fbd Mon Sep 17 00:00:00 2001 From: Peter Hynes Date: Mon, 9 May 2022 10:14:19 +0100 Subject: [PATCH 02/15] Validate characters in metadata master version --- bin/ons_csv_to_ctb_json_main.py | 10 +++++++++- test/test_integration.py | 11 +++++++++++ 2 files changed, 20 insertions(+), 1 deletion(-) diff --git a/bin/ons_csv_to_ctb_json_main.py b/bin/ons_csv_to_ctb_json_main.py index 4a0dc75..c08a532 100644 --- a/bin/ons_csv_to_ctb_json_main.py +++ b/bin/ons_csv_to_ctb_json_main.py @@ -16,6 +16,14 @@ FILE_CONTENT_TYPE_SERVICE = 'service-md' +def filename_segment(value): + """Check that the string is valid for use as part of a filename.""" + for character in value: + if not character.isalnum() and character not in '-_. ': + raise ValueError(f"invalid value: '{value}'") + return value + + def main(): """ Load metadata in CSV format and export in JSON format. @@ -55,7 +63,7 @@ def main(): '(default: no prefix i.e. 
operational)')

     parser.add_argument('-m', '--metadata_master_version',
-                        type=str,
+                        type=filename_segment,
                         default='unknown-metadata-version',
                         help='Metadata master version to use in output filenames '
                              '(default: %(default)s)')
diff --git a/test/test_integration.py b/test/test_integration.py
index f922603..bad0a00 100644
--- a/test/test_integration.py
+++ b/test/test_integration.py
@@ -40,6 +40,17 @@ def test_directory_validity(self):
         with self.assertRaisesRegex(ValueError, expected_error):
             ons_csv_to_ctb_json_main.main()

+    def test_metadata_master_version(self):
+        """Check that a SystemExit is raised if the metadata master version is invalid."""
+        file_dir = pathlib.Path(__file__).parent.resolve()
+        input_dir = os.path.join(file_dir, 'testdata')
+        output_dir = os.path.join(file_dir, 'out')
+
+        with unittest.mock.patch('sys.argv', ['test', '-i', input_dir, '-o', output_dir,
+                                              '-m', 'a/../b']):
+            with self.assertRaises(SystemExit):
+                ons_csv_to_ctb_json_main.main()
+

From f5ece9f0cdcf5088b2d39ab3f91162ce0e3697cb Mon Sep 17 00:00:00 2001
From: Peter Hynes
Date: Mon, 9 May 2022 11:07:13 +0100
Subject: [PATCH 03/15] Ensure that build number is a positive int

---
 bin/ons_csv_to_ctb_json_main.py | 11 ++++++++++-
 test/test_integration.py        | 16 ++++++++++++++++
 2 files changed, 26 insertions(+), 1 deletion(-)

diff --git a/bin/ons_csv_to_ctb_json_main.py b/bin/ons_csv_to_ctb_json_main.py
index c08a532..06956fc 100644
--- a/bin/ons_csv_to_ctb_json_main.py
+++ b/bin/ons_csv_to_ctb_json_main.py
@@ -24,6 +24,15 @@ def filename_segment(value):
     return value
 
 
+def positive_int(value):
+    """Check that the value is an integer greater than or equal to 0."""
+    # An exception will be raised if value is not a valid integer
+    number = int(value)
+    if number < 0:
+        raise ValueError(f"invalid value: '{value}'")
+    return number
+
+
 def main():
     """
     Load metadata in CSV format and export in JSON format.
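The `SystemExit` asserted by `test_metadata_master_version` (and by `test_build_number` below) comes from standard `argparse` behaviour rather than from the validators themselves: a `ValueError` raised by a `type` callable is reported as a usage error, after which the parser exits. A minimal self-contained sketch of that mechanism (the standalone parser here is illustrative, not part of the patch):
```
from argparse import ArgumentParser


def filename_segment(value):
    """Check that the string is valid for use as part of a filename."""
    for character in value:
        if not character.isalnum() and character not in '-_. ':
            raise ValueError(f"invalid value: '{value}'")
    return value


parser = ArgumentParser()
parser.add_argument('-m', type=filename_segment)

# argparse catches the ValueError raised by filename_segment, prints a
# usage error and raises SystemExit, which is what the tests assert.
parser.parse_args(['-m', 'a/../b'])
```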
@@ -69,7 +78,7 @@ def main(): '(default: %(default)s)') parser.add_argument('-b', '--build_number', - type=int, + type=positive_int, default=1, help='Build number to use in output filenames ' '(default: %(default)s)') diff --git a/test/test_integration.py b/test/test_integration.py index bad0a00..b3e2225 100644 --- a/test/test_integration.py +++ b/test/test_integration.py @@ -51,6 +51,22 @@ def test_metadata_master_version(self): with self.assertRaises(SystemExit): ons_csv_to_ctb_json_main.main() + def test_build_number(self): + """Check that a SystemExit is raised if the build number is invalid.""" + file_dir = pathlib.Path(__file__).parent.resolve() + input_dir = os.path.join(file_dir, 'testdata') + output_dir = os.path.join(file_dir, 'out') + + with unittest.mock.patch('sys.argv', ['test', '-i', input_dir, '-o', output_dir, + '-b', 'a']): + with self.assertRaises(SystemExit): + ons_csv_to_ctb_json_main.main() + + with unittest.mock.patch('sys.argv', ['test', '-i', input_dir, '-o', output_dir, + '-b', '-1']): + with self.assertRaises(SystemExit): + ons_csv_to_ctb_json_main.main() + @unittest.mock.patch('ons_csv_to_ctb_json_main.date') def test_generated_json(self, mock_date): """Generate JSON from source CSV and compare it with expected values.""" From df4ecec80e7ee9449214aedac60254f59d15ede7 Mon Sep 17 00:00:00 2001 From: Peter Hynes Date: Thu, 5 May 2022 10:14:04 +0100 Subject: [PATCH 04/15] Added alternate geographic variables to tables --- bin/ons_csv_to_ctb_json_load.py | 7 ++++++- ctb_metadata_files/metadata.graphql | 1 + test/expected/table-metadata.json | 26 +++++++++++++++++++------- test/testdata/Dataset_Variable.csv | 1 + test/testdata/Variable.csv | 1 + 5 files changed, 28 insertions(+), 8 deletions(-) diff --git a/bin/ons_csv_to_ctb_json_load.py b/bin/ons_csv_to_ctb_json_load.py index d7b12ed..db86d6c 100644 --- a/bin/ons_csv_to_ctb_json_load.py +++ b/bin/ons_csv_to_ctb_json_load.py @@ -270,6 +270,11 @@ def datasets(self): dataset_variables = dataset_to_variables.get( dataset_mnemonic, DatasetVariables([], [])) + alternate_geog_variables = (dataset_variables.alternate_geog_variables if + dataset_variables.alternate_geog_variables else []) + dataset['Alternate_Geographic_Variables'] = alternate_geog_variables + all_classifications = dataset_variables.classifications + alternate_geog_variables + # If the dataset is public then ensure that there is at least one classification and # that all the classifications are also public. if dataset['Security_Mnemonic'] == PUBLIC_SECURITY_MNEMONIC: @@ -278,7 +283,7 @@ def datasets(self): f'Reading {self.full_filename(filename)}:{line_num} {dataset_mnemonic} ' 'has no associated classifications or geographic variable') - for classification in dataset_variables.classifications: + for classification in all_classifications: if self.classifications[classification].private['Security_Mnemonic'] != \ PUBLIC_SECURITY_MNEMONIC: raise ValueError( diff --git a/ctb_metadata_files/metadata.graphql b/ctb_metadata_files/metadata.graphql index 7a01a91..2b2767b 100644 --- a/ctb_metadata_files/metadata.graphql +++ b/ctb_metadata_files/metadata.graphql @@ -92,6 +92,7 @@ type TableMetadata { Publications: [Publication]! Census_Releases: [Census_Release]! Statistical_Unit: Statistical_Unit! + Alternate_Geographic_Variables: [String]! 
} type Publication { diff --git a/test/expected/table-metadata.json b/test/expected/table-metadata.json index cee40cf..a3bee43 100644 --- a/test/expected/table-metadata.json +++ b/test/expected/table-metadata.json @@ -64,6 +64,9 @@ "Publisher_Name": null, "Publisher_Website": null } + ], + "Alternate_Geographic_Variables": [ + "GEO2" ] } }, @@ -122,6 +125,9 @@ "Publisher_Name": null, "Publisher_Website": null } + ], + "Alternate_Geographic_Variables": [ + "GEO2" ] } } @@ -159,7 +165,8 @@ "Release_Date": "1/1/2022" } ], - "Publications": [] + "Publications": [], + "Alternate_Geographic_Variables": [] } }, { @@ -187,7 +194,8 @@ "Release_Date": "1/1/2022" } ], - "Publications": [] + "Publications": [], + "Alternate_Geographic_Variables": [] } } ] @@ -224,7 +232,8 @@ "Keywords": [], "Related_Datasets": [], "Census_Releases": [], - "Publications": [] + "Publications": [], + "Alternate_Geographic_Variables": [] } }, { @@ -252,7 +261,8 @@ "Keywords": [], "Related_Datasets": [], "Census_Releases": [], - "Publications": [] + "Publications": [], + "Alternate_Geographic_Variables": [] } } ] @@ -283,7 +293,8 @@ "Keywords": [], "Related_Datasets": [], "Census_Releases": [], - "Publications": [] + "Publications": [], + "Alternate_Geographic_Variables": [] } }, { @@ -305,9 +316,10 @@ "Keywords": [], "Related_Datasets": [], "Census_Releases": [], - "Publications": [] + "Publications": [], + "Alternate_Geographic_Variables": [] } } ] } -] +] \ No newline at end of file diff --git a/test/testdata/Dataset_Variable.csv b/test/testdata/Dataset_Variable.csv index 4eec30c..9e36d33 100644 --- a/test/testdata/Dataset_Variable.csv +++ b/test/testdata/Dataset_Variable.csv @@ -8,3 +8,4 @@ CLASS2,DS4,5,1,VAR2,N ,DS2,7,,GEO2,Y ,DS3,8,,GEO2,Y ,DS_PRIV,9,,GEO1,Y +,DS1,10,,GEO2,N diff --git a/test/testdata/Variable.csv b/test/testdata/Variable.csv index 94ef3e9..87a20b2 100644 --- a/test/testdata/Variable.csv +++ b/test/testdata/Variable.csv @@ -5,3 +5,4 @@ VAR2,3,VAR2 Title,,VAR2 Description,,DVO,,,,VAR2 Comparability Comments,,VAR2 UK VAR3,4,VAR3 Title,,VAR3 Description,,DVO,,,,,,,,PUB,,,,,,,,,1,, VAR_PRIV,5,VAR_PRIV Title,VAR_PRIV Title (Welsh),VAR_PRIV Description,VAR_PRIV Description (Welsh),DVO,People,TOPIC1,VAR_PRIV 2011,VAR_PRIV Comparability Comments,VAR_PRIV Comparability Comments (Welsh),VAR_PRIV UK Comparison Comments,VAR_PRIV Comparison Comments (Welsh),CLASS,Y,1,,,,,,,1,, GEO2,6,GEO2 Title,,GEO2 Description,,GEOG,,,,GEO2 Comparability Comments,,GEO2 UK Comparison Comments,,PUB,Y,3,G2,,GEO1 Theme,,GEO2 Coverage,,1,, +GEO_PRIV,7,GEO_PRIV Title,,GEO_PRIV Description,,GEOG,,,,GEO_PRIV Comparability Comments,,GEO_PRIV UK Comparison Comments,,CLASS,Y,3,G_PRIV,,GEO_PRIV Theme,,GEO_PRIV Coverage,,1,, From 2699b2d10303692787cec50a9b85fbd9f3f34736 Mon Sep 17 00:00:00 2001 From: Peter Hynes Date: Tue, 26 Apr 2022 11:49:44 +0100 Subject: [PATCH 05/15] Handle updated list of mandatory fields --- .github/workflows/ci-test.yml | 2 +- bin/ons_csv_to_ctb_json_load.py | 58 ++++++++++++++++------ bin/ons_csv_to_ctb_json_main.py | 1 + ctb_metadata_files/metadata.graphql | 2 +- test/expected/dataset-metadata-no-geo.json | 12 +++-- test/expected/dataset-metadata.json | 12 +++-- test/test_classification.py | 7 +-- test/test_dataset.py | 6 ++- test/test_security_classification.py | 5 +- test/test_source.py | 3 +- test/test_variable.py | 6 ++- test/testdata/Category.csv | 1 + test/testdata/Classification.csv | 4 +- test/testdata/Dataset.csv | 4 +- test/testdata/Security_Classification.csv | 2 +- test/testdata/Variable.csv | 4 +- 16 
files changed, 86 insertions(+), 43 deletions(-) diff --git a/.github/workflows/ci-test.yml b/.github/workflows/ci-test.yml index f04fbec..5905643 100644 --- a/.github/workflows/ci-test.yml +++ b/.github/workflows/ci-test.yml @@ -32,7 +32,7 @@ jobs: pydocstyle bin/*.py - name: Run pylint run: | - pylint --max-locals=20 --max-public-methods=30 --max-branches=50 --max-statements=90 --min-similarity-lines=6 --disable=W1202 bin/*.py + pylint --max-locals=20 --max-public-methods=30 --max-branches=50 --max-statements=90 --min-similarity-lines=6 --max-module-lines=1200 --disable=W1202 bin/*.py - name: Run tests run: | PYTHONPATH=test:bin python3 -m unittest -v diff --git a/bin/ons_csv_to_ctb_json_load.py b/bin/ons_csv_to_ctb_json_load.py index db86d6c..5132417 100644 --- a/bin/ons_csv_to_ctb_json_load.py +++ b/bin/ons_csv_to_ctb_json_load.py @@ -19,6 +19,11 @@ def isnumeric(string): return string.isnumeric() +def is_y_or_n(string): + """Return true if the string is either 'Y' or 'N'.""" + return string in ['Y', 'N'] + + def isoneof(valid_values): """Return a function that checks whether the value is in the specified set of values.""" valid_values_set = set(valid_values) @@ -102,6 +107,7 @@ def sources(self): required('Source_Mnemonic', unique=True), required('Source_Description'), required('Id'), + required('Version'), optional('Source_Description_Welsh'), optional('Copyright_Statement'), @@ -113,7 +119,6 @@ def sources(self): optional('SDC_Link'), optional('SDC_Statement'), optional('SDC_Statement_Welsh'), - optional('Version'), optional('Contact_Id', validate_fn=isoneof(self.contacts.keys())), ] source_rows = self.read_file('Source.csv', columns) @@ -170,9 +175,9 @@ def security_classifications(self): columns = [ required('Security_Mnemonic', unique=True), required('Id'), + required('Security_Description'), optional('Security_Description_Welsh'), - optional('Security_Description'), ] security_classification_rows = self.read_file(filename, columns) @@ -229,6 +234,7 @@ def datasets(self): required('Statistical_Unit', validate_fn=isoneof(self.statistical_units.keys())), required('Version'), required('Dataset_Description'), + required('Signed_Off_Flag', validate_fn=is_y_or_n), optional('Dataset_Title_Welsh'), optional('Dataset_Description_Welsh'), @@ -237,7 +243,6 @@ def datasets(self): optional('Dataset_Population_Welsh'), optional('Last_Updated'), optional('Unique_Url'), - optional('Signed_Off_Flag'), optional('Contact_Id', validate_fn=isoneof(self.contacts.keys())), ] dataset_rows = self.read_file(filename, columns) @@ -328,10 +333,11 @@ def databases(self): required('Id'), required('Database_Description'), required('Version'), + # This should be mandatory but is not yet populated + optional('Cantabular_DB_Flag', validate_fn=is_y_or_n), optional('Database_Title_Welsh'), optional('Database_Description_Welsh'), - optional('Cantabular_DB_Flag'), optional('IAR_Asset_Id'), ] database_rows = self.read_file('Database.csv', columns) @@ -542,9 +548,17 @@ def variables(self): required('Variable_Type_Code', validate_fn=isoneof(self.variable_types.keys())), required('Variable_Title'), required('Variable_Description'), - optional('Statistical_Unit', validate_fn=isoneof(self.statistical_units.keys())), required('Id'), required('Version'), + required('Signed_Off_Flag', validate_fn=is_y_or_n), + + # Required for non-geographic variables but not always populated in source files + optional('Statistical_Unit', validate_fn=isoneof(self.statistical_units.keys())), + + # Required for geographic variables but not yet 
populated + optional('Geographic_Abbreviation'), + optional('Geographic_Theme'), + optional('Geographic_Coverage'), optional('Variable_Title_Welsh'), optional('Variable_Description_Welsh'), @@ -553,14 +567,10 @@ def variables(self): optional('Comparability_Comments_Welsh'), optional('Uk_Comparison_Comments'), optional('Uk_Comparison_Comments_Welsh'), - optional('Geographic_Abbreviation'), optional('Geographic_Abbreviation_Welsh'), - optional('Geographic_Theme'), optional('Geographic_Theme_Welsh'), - optional('Geographic_Coverage'), optional('Geographic_Coverage_Welsh'), optional('Topic_Mnemonic', validate_fn=isoneof(self.topics.keys())), - optional('Signed_Off_Flag'), optional('Number_Of_Classifications'), optional('Quality_Statement_Text'), optional('Quality_Summary_URL'), @@ -571,19 +581,34 @@ def variables(self): variable_to_keywords = self.load_variable_to_keywords(variable_mnemonics) variable_to_source_questions = self.load_variable_to_questions(variable_mnemonics) - geo_fields = {'Geographic_Abbreviation', 'Geographic_Abbreviation_Welsh', - 'Geographic_Theme', 'Geographic_Theme_Welsh', 'Geographic_Coverage', - 'Geographic_Coverage_Welsh'} + en_geo_fields = {'Geographic_Abbreviation', 'Geographic_Theme', 'Geographic_Coverage'} + all_geo_fields = en_geo_fields | {'Geographic_Abbreviation_Welsh', + 'Geographic_Theme_Welsh', + 'Geographic_Coverage_Welsh'} variables = {} for variable, line_num in variable_rows: # Ensure that non-geographic variables do not have geographic values set. is_geographic = variable['Variable_Type_Code'] == GEOGRAPHIC_VARIABLE_TYPE if not is_geographic: - for geo_field in geo_fields: + # This value is not always populated in source files + # if not variable['Statistical_Unit']: + # raise ValueError(f'Reading {self.full_filename(filename)}:{line_num} ' + # f'no Statistical_Unit specified for non geographic variable: ' + # f'{variable["Variable_Mnemonic"]}') + for geo_field in all_geo_fields: if variable[geo_field]: raise ValueError(f'Reading {self.full_filename(filename)}:{line_num} ' f'{geo_field} specified for non geographic variable: ' f'{variable["Variable_Mnemonic"]}') + + # These values are not yet populated in source files + # else: + # for geo_field in en_geo_fields: + # if not variable[geo_field]: + # raise ValueError(f'Reading {self.full_filename(filename)}:{line_num} ' + # f'no {geo_field} specified for geographic variable: ' + # f'{variable["Variable_Mnemonic"]}') + variable_title = Bilingual( variable.pop('Variable_Title'), variable.pop('Variable_Title_Welsh')) @@ -643,18 +668,18 @@ def classifications(self): columns = [ required('Id'), required('Classification_Mnemonic', unique=True), - required('Number_Of_Category_Items', validate_fn=isnumeric), required('Variable_Mnemonic', validate_fn=isoneof(self.variables.keys())), required('Internal_Classification_Label_English'), required('Security_Mnemonic', validate_fn=isoneof(self.security_classifications)), required('Version'), + required('Signed_Off_Flag', validate_fn=is_y_or_n), + optional('Number_Of_Category_Items', validate_fn=isnumeric), optional('External_Classification_Label_English'), optional('External_Classification_Label_Welsh'), optional('Mnemonic_2011'), optional('Parent_Classification_Mnemonic'), optional('Default_Classification_Flag'), - optional('Signed_Off_Flag'), optional('Flat_Classification_Flag'), ] classification_rows = self.read_file(filename, columns) @@ -692,7 +717,8 @@ def classifications(self): del classification['Flat_Classification_Flag'] del classification['Id'] - 
num_cat_items = int(classification.pop('Number_Of_Category_Items')) + num_cat_items = classification.pop('Number_Of_Category_Items') + num_cat_items = int(num_cat_items) if num_cat_items else 0 classifications[classification_mnemonic] = BilingualDict( classification, diff --git a/bin/ons_csv_to_ctb_json_main.py b/bin/ons_csv_to_ctb_json_main.py index 06956fc..b777fb1 100644 --- a/bin/ons_csv_to_ctb_json_main.py +++ b/bin/ons_csv_to_ctb_json_main.py @@ -199,6 +199,7 @@ def build_ctb_datasets(databases, ctb_variables): 'Source': { 'Source_Mnemonic': 'Census2021', 'Source_Description': 'The 2021 England and Wales Census', + 'Version': '1', }, 'Version': '1' }, diff --git a/ctb_metadata_files/metadata.graphql b/ctb_metadata_files/metadata.graphql index 2b2767b..4e08953 100644 --- a/ctb_metadata_files/metadata.graphql +++ b/ctb_metadata_files/metadata.graphql @@ -67,7 +67,7 @@ type Source { Methodology_Statement: String SDC_Link: String SDC_Statement: String - Version: String + Version: String! Contact: Contact } diff --git a/test/expected/dataset-metadata-no-geo.json b/test/expected/dataset-metadata-no-geo.json index 5cae910..0cb7694 100644 --- a/test/expected/dataset-metadata-no-geo.json +++ b/test/expected/dataset-metadata-no-geo.json @@ -7,7 +7,8 @@ "meta": { "Source": { "Source_Mnemonic": "Census2021", - "Source_Description": "The 2021 England and Wales Census" + "Source_Description": "The 2021 England and Wales Census", + "Version": "1" }, "Version": "1" }, @@ -245,7 +246,8 @@ "meta": { "Source": { "Source_Mnemonic": "Census2021", - "Source_Description": "The 2021 England and Wales Census" + "Source_Description": "The 2021 England and Wales Census", + "Version": "1" }, "Version": "1" }, @@ -357,7 +359,9 @@ }, "Topics": [] }, - "catLabels": null + "catLabels": { + "CODE2-1": "LABEL2-1 (Welsh)" + } }, { "name": "CLASS3", @@ -678,4 +682,4 @@ }, "vars": null } -] +] \ No newline at end of file diff --git a/test/expected/dataset-metadata.json b/test/expected/dataset-metadata.json index 1bae173..5154346 100644 --- a/test/expected/dataset-metadata.json +++ b/test/expected/dataset-metadata.json @@ -7,7 +7,8 @@ "meta": { "Source": { "Source_Mnemonic": "Census2021", - "Source_Description": "The 2021 England and Wales Census" + "Source_Description": "The 2021 England and Wales Census", + "Version": "1" }, "Version": "1" }, @@ -245,7 +246,8 @@ "meta": { "Source": { "Source_Mnemonic": "Census2021", - "Source_Description": "The 2021 England and Wales Census" + "Source_Description": "The 2021 England and Wales Census", + "Version": "1" }, "Version": "1" }, @@ -357,7 +359,9 @@ }, "Topics": [] }, - "catLabels": null + "catLabels": { + "CODE2-1": "LABEL2-1 (Welsh)" + } }, { "name": "CLASS3", @@ -682,4 +686,4 @@ }, "vars": null } -] +] \ No newline at end of file diff --git a/test/test_classification.py b/test/test_classification.py index b17ca47..574aeaf 100644 --- a/test/test_classification.py +++ b/test/test_classification.py @@ -14,9 +14,9 @@ 'Security_Mnemonic': 'PUB', 'Variable_Mnemonic': 'VAR1', 'Internal_Classification_Label_English': 'label', - 'Number_Of_Category_Items': '1', 'Version': '1', - 'Id': '1'} + 'Id': '1', + 'Signed_Off_Flag': 'N'} INPUT_DIR = os.path.join(pathlib.Path(__file__).parent.resolve(), 'testdata') @@ -37,7 +37,8 @@ def test_required_fields(self): self.run_test([row], f'^Reading {FILENAME}:2 no value supplied for required field {field}$') def test_invalid_values(self): - for field in ['Security_Mnemonic', 'Variable_Mnemonic', 'Number_Of_Category_Items']: + for field in 
['Security_Mnemonic', 'Variable_Mnemonic', 'Number_Of_Category_Items', + 'Signed_Off_Flag']: with self.subTest(field=field): row = REQUIRED_FIELDS.copy() row[field] = 'X' diff --git a/test/test_dataset.py b/test/test_dataset.py index 3c085b7..9ab18dc 100644 --- a/test/test_dataset.py +++ b/test/test_dataset.py @@ -18,7 +18,8 @@ 'Dataset_Population': 'population', 'Id': '1', 'Statistical_Unit': 'People', - 'Version': '1'} + 'Version': '1', + 'Signed_Off_Flag': 'N'} REQUIRED_FIELDS = {'Dataset_Mnemonic': 'DS1', 'Database_Mnemonic': 'DB1', @@ -44,7 +45,8 @@ def test_required_fields(self): self.run_test([row], f'^Reading {FILENAME}:2 no value supplied for required field {field}$') def test_invalid_values(self): - for field in ['Security_Mnemonic', 'Database_Mnemonic', 'Contact_Id', 'Statistical_Unit']: + for field in ['Security_Mnemonic', 'Database_Mnemonic', 'Contact_Id', 'Statistical_Unit', + 'Signed_Off_Flag']: with self.subTest(field=field): row = REQUIRED_FIELDS.copy() row[field] = 'X' diff --git a/test/test_security_classification.py b/test/test_security_classification.py index ab410c3..ddd74df 100644 --- a/test/test_security_classification.py +++ b/test/test_security_classification.py @@ -8,7 +8,8 @@ HEADERS = ['Security_Mnemonic', 'Id', 'Security_Description', 'Security_Description_Welsh'] REQUIRED_FIELDS = {'Security_Mnemonic': 'PUB', - 'Id': '1'} + 'Id': '1', + 'Security_Description': 'Public'} INPUT_DIR = os.path.join(pathlib.Path(__file__).parent.resolve(), 'testdata') @@ -35,7 +36,7 @@ def test_duplicate_security_mnemonic(self): def test_missing_public_security_classification(self): self.run_test( - [{'Security_Mnemonic': 'PRIVATE', 'Id': '1'}], + [{'Security_Mnemonic': 'PRIVATE', 'Id': '1', 'Security_Description': 'Private'}], f'^PUB not found as Security_Mnemonic for any entry in {FILENAME}$') diff --git a/test/test_source.py b/test/test_source.py index 6b037aa..09a3d6e 100644 --- a/test/test_source.py +++ b/test/test_source.py @@ -12,7 +12,8 @@ REQUIRED_FIELDS = {'Source_Mnemonic': 'SRC1', 'Source_Description': 'description', - 'Id': '1'} + 'Id': '1', + 'Version': '1'} INPUT_DIR = os.path.join(pathlib.Path(__file__).parent.resolve(), 'testdata') diff --git a/test/test_variable.py b/test/test_variable.py index 6dd9ecb..51998c0 100644 --- a/test/test_variable.py +++ b/test/test_variable.py @@ -18,7 +18,8 @@ 'Variable_Title': 'title', 'Variable_Description': 'description', 'Id': '1', - 'Version': '1'} + 'Version': '1', + 'Signed_Off_Flag': 'N'} REQUIRED_FIELDS = {'Variable_Mnemonic': 'VAR1', 'Variable_Type_Code': 'DVO', @@ -43,7 +44,8 @@ def test_required_fields(self): self.run_test([row], f'^Reading {FILENAME}:2 no value supplied for required field {field}$') def test_invalid_values(self): - for field in ['Security_Mnemonic', 'Variable_Type_Code', 'Statistical_Unit', 'Topic_Mnemonic']: + for field in ['Security_Mnemonic', 'Variable_Type_Code', 'Statistical_Unit', 'Topic_Mnemonic', + 'Signed_Off_Flag']: with self.subTest(field=field): row = REQUIRED_FIELDS.copy() row[field] = 'X' diff --git a/test/testdata/Category.csv b/test/testdata/Category.csv index f6de984..81d470f 100644 --- a/test/testdata/Category.csv +++ b/test/testdata/Category.csv @@ -5,3 +5,4 @@ SOURCE,CLASS1,6,CODE6,,LABEL6 (Welsh),6,1,LABEL6 Internal SOURCE,CLASS1,2,CODE2,,LABEL2 (Welsh),2,1,LABEL2 Internal SOURCE,CLASS1,4,CODE4,,,,1,LABEL4 Internal SOURCE,CLASS1,1,CODE1,LABEL1,,,1,LABEL1 Internal +SOURCE2,CLASS2,1,CODE2-1,,LABEL2-1 (Welsh),1,1,LABEL2-1 Internal diff --git a/test/testdata/Classification.csv 
b/test/testdata/Classification.csv index 869f827..56a6734 100644 --- a/test/testdata/Classification.csv +++ b/test/testdata/Classification.csv @@ -1,5 +1,5 @@ Classification_Mnemonic,Variable_Mnemonic,Id,External_Classification_Label_English,External_Classification_Label_Welsh,Number_Of_Category_Items,Mnemonic_2011,Flat_Classification_Flag,Parent_Classification_Mnemonic,Security_Mnemonic,Signed_Off_Flag,Default_Classification_Flag,Version,Internal_Classification_Label_English CLASS1,VAR1,1,CLASS1 Label,CLASS1 Label Welsh,6,CLASS1 2011,N,CLASS1 Parent,PUB,Y,Y,1,CLASS1 Label Internal -CLASS2,VAR2,2,,,5,,,,PUB,N,,1,CLASS2 Label Internal -CLASS3,VAR3,3,,CLASS3 Label Welsh,5,,,,PUB,,N,1,CLASS3 Label Internal +CLASS2,VAR2,2,,,,,,,PUB,N,,1,CLASS2 Label Internal +CLASS3,VAR3,3,,CLASS3 Label Welsh,,,,,PUB,N,N,1,CLASS3 Label Internal CLASS_PRIV,VAR_PRIV,5,CLASS_PRIV Label,CLASS_PRIV Label Welsh,0,CLASS_PRIV 2011,Y,CLASS_PRIV,CLASS,Y,Y,1,CLASS_PRIV Label Internal diff --git a/test/testdata/Dataset.csv b/test/testdata/Dataset.csv index 94c789d..116948a 100644 --- a/test/testdata/Dataset.csv +++ b/test/testdata/Dataset.csv @@ -1,6 +1,6 @@ Dataset_Mnemonic,Id,Dataset_Title,Dataset_Title_Welsh,Dataset_Description,Dataset_Description_Welsh,Statistical_Unit,Dataset_Mnemonic_2011,Geographic_Coverage,Geographic_Coverage_Welsh,Dataset_Population,Dataset_Population_Welsh,Last_Updated,Unique_Url,Security_Mnemonic,Signed_Off_Flag,Database_Mnemonic,Contact_Id,Version DS1,1,DS1 Title,DS1 Title (Welsh),DS1 Description,DS1 Description (Welsh),People,DS1 2011,Everywhere,Everywhere (Welsh),Everyone,Everyone (Welsh),Today,DS1 Unique URL,PUB,Y,DB1,2,1 -DS2,2,DS2 Title,,DS2 Description,,Houses,,Everywhere,,Everyone,,,,PUB,,DB2,,1 +DS2,2,DS2 Title,,DS2 Description,,Houses,,Everywhere,,Everyone,,,,PUB,N,DB2,,1 DS3,3,DS3 Title,DS3 Title (Welsh),DS3 Description,DS3 Description (Welsh),People,DS3 2011,Everywhere,Everywhere (Welsh),Everyone,Everyone (Welsh),Today,DS3 Unique URL,PUB,Y,DB2,1,1 DS_PRIV,4,DS_PRIV Title,DS_PRIV Title (Welsh),DS_PRIV Description,DS_PRIV Description (Welsh),People,DS_PRIV 2011,Everywhere,Everywhere (Welsh),Everyone,Everyone (Welsh),Today,DS_PRIV Unique URL,CLASS,Y,DB1,1,1 -DS4,5,DS4 Title,,DS4 Description,,Houses,,Everywhere,,Everyone,,,,PUB,,DB1,,1 +DS4,5,DS4 Title,,DS4 Description,,Houses,,Everywhere,,Everyone,,,,PUB,N,DB1,,1 diff --git a/test/testdata/Security_Classification.csv b/test/testdata/Security_Classification.csv index 7d5c322..3fa7783 100644 --- a/test/testdata/Security_Classification.csv +++ b/test/testdata/Security_Classification.csv @@ -1,3 +1,3 @@ Security_Mnemonic,Id,Security_Description,Security_Description_Welsh PUB,1,Public,Public (Welsh) -CLASS,2,, +CLASS,2,Classified, diff --git a/test/testdata/Variable.csv b/test/testdata/Variable.csv index 87a20b2..b5d2386 100644 --- a/test/testdata/Variable.csv +++ b/test/testdata/Variable.csv @@ -1,8 +1,8 @@ Variable_Mnemonic,Id,Variable_Title,Variable_Title_Welsh,Variable_Description,Variable_Description_Welsh,Variable_Type_Code,Statistical_Unit,Topic_Mnemonic,Variable_Mnemonic_2011,Comparability_Comments,Comparability_Comments_Welsh,Uk_Comparison_Comments,Uk_Comparison_Comments_Welsh,Security_Mnemonic,Signed_Off_Flag,Number_Of_Classifications,Geographic_Abbreviation,Geographic_Abbreviation_Welsh,Geographic_Theme,Geographic_Theme_Welsh,Geographic_Coverage,Geographic_Coverage_Welsh,Version,Quality_Statement_Text,Quality_Summary_URL VAR1,1,VAR1 Title,VAR1 Title (Welsh),VAR1 Description,VAR1 Description (Welsh),DVO,People,TOPIC1,VAR1 
2011,VAR1 Comparability Comments,VAR1 Comparability Comments (Welsh),VAR1 UK Comparison Comments,VAR1 UK Comparison Comments (Welsh),PUB,Y,3,,,,,,,1,VAR1 Quality Statement Text,VAR1 Quality Statement URL GEO1,2,GEO1 Title,GEO1 Title (Welsh),GEO1 Description,GEO1 Description (Welsh),GEOG,People,TOPIC1,GEO1 2011,GEO1 Comparability Comments,GEO1 Comparability Comments (Welsh),GEO1 UK Comparison Comments,GEO1 UK Comparison Comments (Welsh),PUB,Y,3,G1,G1 (Welsh),GEO1 Theme,GEO1 Theme (Welsh),GEO1 Coverage,GEO1 Coverage (Welsh),1,, -VAR2,3,VAR2 Title,,VAR2 Description,,DVO,,,,VAR2 Comparability Comments,,VAR2 UK Comparison Comments,,PUB,,,,,,,,,1,, -VAR3,4,VAR3 Title,,VAR3 Description,,DVO,,,,,,,,PUB,,,,,,,,,1,, +VAR2,3,VAR2 Title,,VAR2 Description,,DVO,,,,VAR2 Comparability Comments,,VAR2 UK Comparison Comments,,PUB,N,,,,,,,,1,, +VAR3,4,VAR3 Title,,VAR3 Description,,DVO,,,,,,,,PUB,N,,,,,,,,1,, VAR_PRIV,5,VAR_PRIV Title,VAR_PRIV Title (Welsh),VAR_PRIV Description,VAR_PRIV Description (Welsh),DVO,People,TOPIC1,VAR_PRIV 2011,VAR_PRIV Comparability Comments,VAR_PRIV Comparability Comments (Welsh),VAR_PRIV UK Comparison Comments,VAR_PRIV Comparison Comments (Welsh),CLASS,Y,1,,,,,,,1,, GEO2,6,GEO2 Title,,GEO2 Description,,GEOG,,,,GEO2 Comparability Comments,,GEO2 UK Comparison Comments,,PUB,Y,3,G2,,GEO1 Theme,,GEO2 Coverage,,1,, GEO_PRIV,7,GEO_PRIV Title,,GEO_PRIV Description,,GEOG,,,,GEO_PRIV Comparability Comments,,GEO_PRIV UK Comparison Comments,,CLASS,Y,3,G_PRIV,,GEO_PRIV Theme,,GEO_PRIV Coverage,,1,, From b33710cb6171c18ca622966bd88e962f8870d124 Mon Sep 17 00:00:00 2001 From: Peter Hynes Date: Wed, 4 May 2022 16:14:38 +0100 Subject: [PATCH 06/15] Added Signed_Off_Flag values to Variable.csv --- sample_2011/Variable.csv | 36 ++++++++++++++++++------------------ 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/sample_2011/Variable.csv b/sample_2011/Variable.csv index 36405f7..82b6eb5 100644 --- a/sample_2011/Variable.csv +++ b/sample_2011/Variable.csv @@ -1,17 +1,17 @@ Id,Variable_Mnemonic,Variable_Title,Variable_Title_Welsh,Variable_Description,Variable_Description_Welsh,Variable_Mnemonic_2011,Comparability_Comments,Comparability_Comments_Welsh,Uk_Comparison_Comments,Uk_Comparison_Comments_Welsh,Signed_Off_Flag,Security_Mnemonic,Variable_Type_Code,Topic_Mnemonic,Number_Of_Classifications,Statistical_Unit,Geographic_Abbreviation,Geographic_Abbreviation_Welsh,Geographic_Theme,Geographic_Theme_Welsh,Geographic_Coverage,Geographic_Coverage_Welsh,Version,Quality_Statement_Text,Quality_Summary_URL -1,Region,Region,Rhanbarth,"The geographic region in which a person lives, derived from the address of their household or communal establishment.","Y rhanbarth daearyddol y mae person yn byw ynddo, yn deillio o gyfeiriad eu cartref neu sefydliad cymunedol.",Region,,,,,,PUB,GEOG,,1,People,RGN,,Administrative,Gweinyddol,England and Wales,Cymru a Lloegr,1,"The census is the most complete source of information about the population that we have. Every effort is made to include everyone. It is the only survey which provides a detailed picture of the entire population, and is unique because it covers everyone at the same time and asks the same core questions everywhere. This makes it easy to compare different parts of the country. 
+1,Region,Region,Rhanbarth,"The geographic region in which a person lives, derived from the address of their household or communal establishment.","Y rhanbarth daearyddol y mae person yn byw ynddo, yn deillio o gyfeiriad eu cartref neu sefydliad cymunedol.",Region,,,,,Y,PUB,GEOG,,1,People,RGN,,Administrative,Gweinyddol,England and Wales,Cymru a Lloegr,1,"The census is the most complete source of information about the population that we have. Every effort is made to include everyone. It is the only survey which provides a detailed picture of the entire population, and is unique because it covers everyone at the same time and asks the same core questions everywhere. This makes it easy to compare different parts of the country. However no census is perfect and some people are inevitably missed. ONS therefore uses complex statistical techniques to adjust the 2011 Census counts for those people missed by the census. The methods and quality assurance approach was researched and developed in consultation with academics, statisticians, demographers and users of census data. The result was a suite of methods to process, clean, adjust and protect the census results.",https://www.ons.gov.uk/census/2011census/2011censusdata/2011censususerguide/qualityandmethods -2,Country,Country,Ngwlad,"The country - either England or Wales - in which a person lives, derived from the region they live in.","Mae'r wlad - naill ai Cymru neu Loegr - lle mae person yn byw, yn deillio o'r rhanbarth y mae'n byw ynddo.",Country,,,,,,PUB,GEOG,,1,People,CTRY,,Administrative,Gweinyddol,England and Wales,Cymru a Lloegr,1,"The census is the most complete source of information about the population that we have. Every effort is made to include everyone. It is the only survey which provides a detailed picture of the entire population, and is unique because it covers everyone at the same time and asks the same core questions everywhere. This makes it easy to compare different parts of the country. +2,Country,Country,Ngwlad,"The country - either England or Wales - in which a person lives, derived from the region they live in.","Mae'r wlad - naill ai Cymru neu Loegr - lle mae person yn byw, yn deillio o'r rhanbarth y mae'n byw ynddo.",Country,,,,,Y,PUB,GEOG,,1,People,CTRY,,Administrative,Gweinyddol,England and Wales,Cymru a Lloegr,1,"The census is the most complete source of information about the population that we have. Every effort is made to include everyone. It is the only survey which provides a detailed picture of the entire population, and is unique because it covers everyone at the same time and asks the same core questions everywhere. This makes it easy to compare different parts of the country. However no census is perfect and some people are inevitably missed. ONS therefore uses complex statistical techniques to adjust the 2011 Census counts for those people missed by the census. The methods and quality assurance approach was researched and developed in consultation with academics, statisticians, demographers and users of census data. The result was a suite of methods to process, clean, adjust and protect the census results.",https://www.ons.gov.uk/census/2011census/2011censusdata/2011censususerguide/qualityandmethods -3,Residence Type,Residence Type,Math Preswyl,"This defines the type of residence that a person lives in. It categorises people as living in a household or living in a communal establishment. People who filled in the normal household questionnaire were recorded as living in a household. 
Those that filled in an individual questionnaire were asked what type of accommodation they lived in, i.e. whether it was a household or a communal establishment.","Mae hyn yn diffinio'r math o breswylfa y mae person yn byw ynddi. Mae'n categoreiddio pobl fel rhai sy'n byw mewn cartref neu'n byw mewn sefydliad cymunedol. Cofnodwyd bod pobl a lenwodd holiadur arferol y cartref yn byw mewn cartref. Gofynnwyd i'r rhai a lenwodd holiadur unigol pa fath o lety yr oeddent yn byw ynddo, h.y. a oedd yn aelwyd neu'n sefydliad cymunedol.",Residence Type,,,,,,PUB,SV,LHQ,1,People,,,,,,,1,"The census is the most complete source of information about the population that we have. Every effort is made to include everyone. It is the only survey which provides a detailed picture of the entire population, and is unique because it covers everyone at the same time and asks the same core questions everywhere. This makes it easy to compare different parts of the country. +3,Residence Type,Residence Type,Math Preswyl,"This defines the type of residence that a person lives in. It categorises people as living in a household or living in a communal establishment. People who filled in the normal household questionnaire were recorded as living in a household. Those that filled in an individual questionnaire were asked what type of accommodation they lived in, i.e. whether it was a household or a communal establishment.","Mae hyn yn diffinio'r math o breswylfa y mae person yn byw ynddi. Mae'n categoreiddio pobl fel rhai sy'n byw mewn cartref neu'n byw mewn sefydliad cymunedol. Cofnodwyd bod pobl a lenwodd holiadur arferol y cartref yn byw mewn cartref. Gofynnwyd i'r rhai a lenwodd holiadur unigol pa fath o lety yr oeddent yn byw ynddo, h.y. a oedd yn aelwyd neu'n sefydliad cymunedol.",Residence Type,,,,,Y,PUB,SV,LHQ,1,People,,,,,,,1,"The census is the most complete source of information about the population that we have. Every effort is made to include everyone. It is the only survey which provides a detailed picture of the entire population, and is unique because it covers everyone at the same time and asks the same core questions everywhere. This makes it easy to compare different parts of the country. However no census is perfect and some people are inevitably missed. ONS therefore uses complex statistical techniques to adjust the 2011 Census counts for those people missed by the census. The methods and quality assurance approach was researched and developed in consultation with academics, statisticians, demographers and users of census data. The result was a suite of methods to process, clean, adjust and protect the census results.",https://www.ons.gov.uk/census/2011census/2011censusdata/2011censususerguide/qualityandmethods -4,Family Composition,Family Composition,Cyfansoddiad teuluol,"Family type is the classification of families into different types distinguished by the presence, absence and type of couple relationship: whether a married couple family, a same-sex civil partnership family, a cohabiting couple family, or a lone parent family.","Math o deulu yw dosbarthiad teuluoedd i wahanol fathau sy'n cael eu gwahaniaethu gan bresenoldeb, absenoldeb a math o berthynas cwpl: boed yn deulu pâr priod, teulu partneriaeth sifil o'r un rhyw, teulu pâr sy'n cyd-fyw, neu deulu rhiant sengl.",Family Composition,,,,,,PUB,DVO,MAD,1,People,,,,,,,1,"The census is the most complete source of information about the population that we have. Every effort is made to include everyone. 
It is the only survey which provides a detailed picture of the entire population, and is unique because it covers everyone at the same time and asks the same core questions everywhere. This makes it easy to compare different parts of the country. +4,Family Composition,Family Composition,Cyfansoddiad teuluol,"Family type is the classification of families into different types distinguished by the presence, absence and type of couple relationship: whether a married couple family, a same-sex civil partnership family, a cohabiting couple family, or a lone parent family.","Math o deulu yw dosbarthiad teuluoedd i wahanol fathau sy'n cael eu gwahaniaethu gan bresenoldeb, absenoldeb a math o berthynas cwpl: boed yn deulu pâr priod, teulu partneriaeth sifil o'r un rhyw, teulu pâr sy'n cyd-fyw, neu deulu rhiant sengl.",Family Composition,,,,,Y,PUB,DVO,MAD,1,People,,,,,,,1,"The census is the most complete source of information about the population that we have. Every effort is made to include everyone. It is the only survey which provides a detailed picture of the entire population, and is unique because it covers everyone at the same time and asks the same core questions everywhere. This makes it easy to compare different parts of the country. However no census is perfect and some people are inevitably missed. ONS therefore uses complex statistical techniques to adjust the 2011 Census counts for those people missed by the census. The methods and quality assurance approach was researched and developed in consultation with academics, statisticians, demographers and users of census data. The result was a suite of methods to process, clean, adjust and protect the census results.",https://www.ons.gov.uk/census/2011census/2011censusdata/2011censususerguide/qualityandmethods -5,Population Base,Population Base,Sylfaen Poblogaeth,The main census population base into which a person falls.,Prif sylfaen poblogaeth y cyfrifiad y mae person yn syrthio iddi.,Population Base,,,,,,PUB,DVO,,1,People,,,,,,,1,"The census is the most complete source of information about the population that we have. Every effort is made to include everyone. It is the only survey which provides a detailed picture of the entire population, and is unique because it covers everyone at the same time and asks the same core questions everywhere. This makes it easy to compare different parts of the country. +5,Population Base,Population Base,Sylfaen Poblogaeth,The main census population base into which a person falls.,Prif sylfaen poblogaeth y cyfrifiad y mae person yn syrthio iddi.,Population Base,,,,,Y,PUB,DVO,,1,People,,,,,,,1,"The census is the most complete source of information about the population that we have. Every effort is made to include everyone. It is the only survey which provides a detailed picture of the entire population, and is unique because it covers everyone at the same time and asks the same core questions everywhere. This makes it easy to compare different parts of the country. However no census is perfect and some people are inevitably missed. ONS therefore uses complex statistical techniques to adjust the 2011 Census counts for those people missed by the census. The methods and quality assurance approach was researched and developed in consultation with academics, statisticians, demographers and users of census data. 
The result was a suite of methods to process, clean, adjust and protect the census results.",https://www.ons.gov.uk/census/2011census/2011censusdata/2011censususerguide/qualityandmethods 6,Sex,Sex,Rhyw,The classification of a person as either male or female.,Dosbarthiad person naill ai'n wryw neu'n fenyw.,Sex,,,"Indicator of comparability: Highly comparable @@ -32,7 +32,7 @@ Allbwn Data - Mae dadansoddiad o wrywod a benywod yn cael ei gynnwys mewn llawer o'r allbynnau anarferol ac aml-ddirprwy. A yw allbynnau'r DU ar gael? -Ydw.",,PUB,SV,MAD,1,People,,,,,,,1,"The census is the most complete source of information about the population that we have. Every effort is made to include everyone. It is the only survey which provides a detailed picture of the entire population, and is unique because it covers everyone at the same time and asks the same core questions everywhere. This makes it easy to compare different parts of the country. +Ydw.",Y,PUB,SV,MAD,1,People,,,,,,,1,"The census is the most complete source of information about the population that we have. Every effort is made to include everyone. It is the only survey which provides a detailed picture of the entire population, and is unique because it covers everyone at the same time and asks the same core questions everywhere. This makes it easy to compare different parts of the country. However no census is perfect and some people are inevitably missed. ONS therefore uses complex statistical techniques to adjust the 2011 Census counts for those people missed by the census. The methods and quality assurance approach was researched and developed in consultation with academics, statisticians, demographers and users of census data. The result was a suite of methods to process, clean, adjust and protect the census results.",https://www.ons.gov.uk/census/2011census/2011censusdata/2011censususerguide/qualityandmethods 7,Age,Age,Oedran,"Age is derived from the date of birth question and is a person's age at their last birthday, at 27 March 2011. Dates of birth that imply an age over 115 are treated as invalid and the person's age is imputed. Infants less than one year old are classified as 0 years of age.","Mae oedran yn deillio o'r cwestiwn dyddiad geni ac mae'n oedran unigolyn ar ei ben-blwydd olaf, ar 27 Mawrth 2011. Mae dyddiadau geni sy'n awgrymu oedran dros 115 yn cael eu trin fel rhai annilys ac mae oedran yr unigolyn yn cael ei osod. Mae babanod llai na blwydd oed yn cael eu dosbarthu fel 0 oed.",Age,,,"Indicator of comparability: Highly comparable @@ -57,7 +57,7 @@ Allbwn data: Allbynnau anwahanadwy ac amlochrog. A yw allbynnau'r DU ar gael? -Amherthnasol",,PUB,DVO,MAD,1,People,,,,,,,1,"The census is the most complete source of information about the population that we have. Every effort is made to include everyone. It is the only survey which provides a detailed picture of the entire population, and is unique because it covers everyone at the same time and asks the same core questions everywhere. This makes it easy to compare different parts of the country. +Amherthnasol",Y,PUB,DVO,MAD,1,People,,,,,,,1,"The census is the most complete source of information about the population that we have. Every effort is made to include everyone. It is the only survey which provides a detailed picture of the entire population, and is unique because it covers everyone at the same time and asks the same core questions everywhere. This makes it easy to compare different parts of the country. 
However no census is perfect and some people are inevitably missed. ONS therefore uses complex statistical techniques to adjust the 2011 Census counts for those people missed by the census. The methods and quality assurance approach was researched and developed in consultation with academics, statisticians, demographers and users of census data. The result was a suite of methods to process, clean, adjust and protect the census results.",https://www.ons.gov.uk/census/2011census/2011censusdata/2011censususerguide/qualityandmethods 8,Marital Status,Marital Status,Statws priodasol,"Marital and civil partnership status classifies an individual according to their legal marital or registered same-sex civil partnership status as at census day, 27 March 2011. This topic is the equivalent of the 2001 Census topic “Marital status”, but has undergone significant revision to take account of the Civil Partnership Act which came into force on 5 December 2005. @@ -88,7 +88,7 @@ Allbwn data: - Er mwyn atal datgelu gwybodaeth unigolion, ar gyfer rhai allbynnau, yn enwedig ar lefelau is o ddaearyddiaeth, darperir data fel categorïau agregedig. A yw allbynnau'r DU ar gael? -Ydw.",,PUB,SV,MAD,1,People,,,,,,,1,"The census is the most complete source of information about the population that we have. Every effort is made to include everyone. It is the only survey which provides a detailed picture of the entire population, and is unique because it covers everyone at the same time and asks the same core questions everywhere. This makes it easy to compare different parts of the country. +Ydw.",Y,PUB,SV,MAD,1,People,,,,,,,1,"The census is the most complete source of information about the population that we have. Every effort is made to include everyone. It is the only survey which provides a detailed picture of the entire population, and is unique because it covers everyone at the same time and asks the same core questions everywhere. This makes it easy to compare different parts of the country. However no census is perfect and some people are inevitably missed. ONS therefore uses complex statistical techniques to adjust the 2011 Census counts for those people missed by the census. The methods and quality assurance approach was researched and developed in consultation with academics, statisticians, demographers and users of census data. The result was a suite of methods to process, clean, adjust and protect the census results.",https://www.ons.gov.uk/census/2011census/2011censusdata/2011censususerguide/qualityandmethods 9,Student,Student,Myfyriwr,"Schoolchildren and students in full-time education studying away from their family home are treated as usually resident at their term-time address. Basic demographic information only (name, sex, age, marital status and relationship) is collected at their non term-time address (their “home” or “vacation”address). The information on families, household size and household composition for their non term-time address does not include them.","Mae plant ysgol a myfyrwyr mewn addysg amser llawn sy'n astudio i ffwrdd o'u cartref teuluol yn cael eu trin fel arfer yn byw yn eu cyfeiriad yn ystod y tymor. Cesglir gwybodaeth ddemograffig sylfaenol yn unig (enw, rhyw, oedran, statws priodasol a pherthynas) yn eu cyfeiriad nad yw'n ystod y tymor (eu cyfeiriad “cartref” neu “wyliau”). 
Nid yw'r wybodaeth am deuluoedd, maint aelwydydd a chyfansoddiad y cartref ar gyfer eu cyfeiriad nad ydynt yn ystod y tymor yn eu cynnwys.",Student,,,"Indicator of comparability: Highly comparable @@ -117,7 +117,7 @@ Mae cyfrifiadau anffafriol o blant ysgol a myfyrwyr ar gael i bob gwlad. Mae'r r Cyhoeddwyd allbynnau aml-amrywiol hefyd. A yw allbynnau'r DU ar gael? -Ydw.",,PUB,SV,LHQ,1,People,,,,,,,1,"The census is the most complete source of information about the population that we have. Every effort is made to include everyone. It is the only survey which provides a detailed picture of the entire population, and is unique because it covers everyone at the same time and asks the same core questions everywhere. This makes it easy to compare different parts of the country. +Ydw.",Y,PUB,SV,LHQ,1,People,,,,,,,1,"The census is the most complete source of information about the population that we have. Every effort is made to include everyone. It is the only survey which provides a detailed picture of the entire population, and is unique because it covers everyone at the same time and asks the same core questions everywhere. This makes it easy to compare different parts of the country. However no census is perfect and some people are inevitably missed. ONS therefore uses complex statistical techniques to adjust the 2011 Census counts for those people missed by the census. The methods and quality assurance approach was researched and developed in consultation with academics, statisticians, demographers and users of census data. The result was a suite of methods to process, clean, adjust and protect the census results.",https://www.ons.gov.uk/census/2011census/2011censusdata/2011censususerguide/qualityandmethods 10,Country of Birth,Country of Birth,Gwlad Geni,"Country of birth is the country in which a person was born. The country of birth question included six tick-box responses – one for each of the four parts of the UK, one for the Republic of Ireland, and one for “elsewhere”. Where a person ticked “elsewhere”, they were asked to write in the current name of the country in which they were born. Responses are assigned codes based on the National Statistics Country Classification. @@ -182,7 +182,7 @@ Ydw. Cyhoeddwyd tabl QS203UK gwlad enedigol. Mae grwpio gwledydd o fewn y dosbarthiad a ddefnyddiwyd yn rhanbarthol yn fras, ond yn ystyried grwpio Ewrop Gwledydd yr Undeb (UE). Mae lefel y manylder a gyflwynir yn y tabl hwn wedi cael ei bennu gan lefel y wlad genedigaethau manylion sydd ar gael yn y tri tablau penodol ar wlad geni -QS203EW (Cymru a Lloegr), QS203SC (Yr Alban) a QS208NI (Gogledd Iwerddon).",,PUB,SV,MAD,1,People,,,,,,,1,"The census is the most complete source of information about the population that we have. Every effort is made to include everyone. It is the only survey which provides a detailed picture of the entire population, and is unique because it covers everyone at the same time and asks the same core questions everywhere. This makes it easy to compare different parts of the country. +QS203EW (Cymru a Lloegr), QS203SC (Yr Alban) a QS208NI (Gogledd Iwerddon).",Y,PUB,SV,MAD,1,People,,,,,,,1,"The census is the most complete source of information about the population that we have. Every effort is made to include everyone. It is the only survey which provides a detailed picture of the entire population, and is unique because it covers everyone at the same time and asks the same core questions everywhere. This makes it easy to compare different parts of the country. 
However no census is perfect and some people are inevitably missed. ONS therefore uses complex statistical techniques to adjust the 2011 Census counts for those people missed by the census. The methods and quality assurance approach was researched and developed in consultation with academics, statisticians, demographers and users of census data. The result was a suite of methods to process, clean, adjust and protect the census results.",https://www.ons.gov.uk/census/2011census/2011censusdata/2011censususerguide/qualityandmethods 11,Health,Health,Iechyd,"General health is a self-assessment of a person’s general state of health. People were asked to assess whether their health was very good, good, fair, bad or very bad. This assessment is not based on a person's health over any specified period of time.","Mae iechyd cyffredinol yn hunanasesiad o gyflwr iechyd cyffredinol person. Gofynnwyd i bobl asesu a oedd eu hiechyd yn dda iawn, yn dda, yn deg, yn ddrwg neu'n ddrwg iawn. Nid yw'r asesiad hwn yn seiliedig ar iechyd person dros unrhyw gyfnod penodol o amser.",Health,,,"Indicator of comparability: Highly comparable @@ -207,7 +207,7 @@ Allbwn data: - Mae allbynnau anffafriol ac aml-amrywiol ar gael ar iechyd cyffredinol, ar wahanol lefelau daearyddiaeth, ar gyfer pob gwlad. A yw allbynnau'r DU ar gael? -Ydw.",,PUB,SV,HSC,1,People,,,,,,,1,"The census is the most complete source of information about the population that we have. Every effort is made to include everyone. It is the only survey which provides a detailed picture of the entire population, and is unique because it covers everyone at the same time and asks the same core questions everywhere. This makes it easy to compare different parts of the country. +Ydw.",Y,PUB,SV,HSC,1,People,,,,,,,1,"The census is the most complete source of information about the population that we have. Every effort is made to include everyone. It is the only survey which provides a detailed picture of the entire population, and is unique because it covers everyone at the same time and asks the same core questions everywhere. This makes it easy to compare different parts of the country. However no census is perfect and some people are inevitably missed. ONS therefore uses complex statistical techniques to adjust the 2011 Census counts for those people missed by the census. The methods and quality assurance approach was researched and developed in consultation with academics, statisticians, demographers and users of census data. The result was a suite of methods to process, clean, adjust and protect the census results.",https://www.ons.gov.uk/census/2011census/2011censusdata/2011censususerguide/qualityandmethods 12,Ethnic Group,Ethnic Group,Grŵp Ethnig,Ethnic group classifies people according to their own perceived ethnic group and cultural background.,Mae grŵp ethnig yn dosbarthu pobl yn ôl eu grŵp ethnig canfyddedig eu hunain a'u cefndir diwylliannol.,Ethnic Group,,,"Indicator of comparability: Broadly comparable @@ -254,7 +254,7 @@ Mae allbynnau anffafriol ac amlochrog ar grŵp ethnig ar gael, ar wahanol lefela Mae gan bob gwlad ei dosbarthiad grŵp ethnig ei hun. -Oherwydd gwahaniaethau penodol i wlad mewn data a gasglwyd, cynllun cwestiwn ac ymateb categorïau ar gyfer y cwestiwn grŵp ethnig, ynghyd â rheolau prosesu data penodol yn benodol, nid yw rhai ymatebion yn cael eu cymharu'n uniongyrchol.",,PUB,DVO,EILR,1,People,,,,,,,1,"The census is the most complete source of information about the population that we have. Every effort is made to include everyone. 
It is the only survey which provides a detailed picture of the entire population, and is unique because it covers everyone at the same time and asks the same core questions everywhere. This makes it easy to compare different parts of the country. +Oherwydd gwahaniaethau penodol i wlad mewn data a gasglwyd, cynllun cwestiwn ac ymateb categorïau ar gyfer y cwestiwn grŵp ethnig, ynghyd â rheolau prosesu data penodol yn benodol, nid yw rhai ymatebion yn cael eu cymharu'n uniongyrchol.",Y,PUB,DVO,EILR,1,People,,,,,,,1,"The census is the most complete source of information about the population that we have. Every effort is made to include everyone. It is the only survey which provides a detailed picture of the entire population, and is unique because it covers everyone at the same time and asks the same core questions everywhere. This makes it easy to compare different parts of the country. However no census is perfect and some people are inevitably missed. ONS therefore uses complex statistical techniques to adjust the 2011 Census counts for those people missed by the census. The methods and quality assurance approach was researched and developed in consultation with academics, statisticians, demographers and users of census data. The result was a suite of methods to process, clean, adjust and protect the census results.",https://www.ons.gov.uk/census/2011census/2011censusdata/2011censususerguide/qualityandmethods 13,Religion,Religion,Crefydd,"The voluntary question on religion in the 2011 Census was intended to capture people's religious affiliation and identification at the time of the Census irrespective of whether they practised or believed in that religion or how important it was in their lives. @@ -325,14 +325,14 @@ Yn wahanol i gwestiynau cyfrifiad eraill lle mae atebion coll yn cael eu cyfrifo Mae'r Alban wedi cyhoeddi Tabl KS209SCA gyda'r un categorïau allbwn crefydd i ddarparu fersiwn wedi'i chysoni i allbynnau Cymru a Lloegr. Cynhyrchodd Gogledd Iwerddon hefyd fwrdd crefydd manylder llawn QS218NI sy'n cynnwys pob crefydd gyda chyfrif o 10 neu fwy o ymatebion. Fodd bynnag, lle mae defnyddwyr yn dewis cymharu gwybodaeth am grefydd ar draws gwledydd, rhaid iddynt fod yn ymwybodol eu bod yn cymharu gwahanol gysyniadau ac yn gwneud hynny yn ofalus. O ganlyniad, mae cymaroldeb yn gyfyngedig. A yw allbynnau'r DU ar gael? -Na.",,PUB,SV,EILR,1,People,,,,,,,1,"The census is the most complete source of information about the population that we have. Every effort is made to include everyone. It is the only survey which provides a detailed picture of the entire population, and is unique because it covers everyone at the same time and asks the same core questions everywhere. This makes it easy to compare different parts of the country. +Na.",Y,PUB,SV,EILR,1,People,,,,,,,1,"The census is the most complete source of information about the population that we have. Every effort is made to include everyone. It is the only survey which provides a detailed picture of the entire population, and is unique because it covers everyone at the same time and asks the same core questions everywhere. This makes it easy to compare different parts of the country. However no census is perfect and some people are inevitably missed. ONS therefore uses complex statistical techniques to adjust the 2011 Census counts for those people missed by the census. The methods and quality assurance approach was researched and developed in consultation with academics, statisticians, demographers and users of census data. 
The result was a suite of methods to process, clean, adjust and protect the census results.",https://www.ons.gov.uk/census/2011census/2011censusdata/2011censususerguide/qualityandmethods 14,Economic Activity,Economic Activity,Gweithgaredd economaidd,"Economic activity relates to whether or not a person who was aged 16 and over was working or looking for work in the week before census. Rather than a simple indicator of whether or not someone was currently in employment, it provides a measure of whether or not a person was an active participant in the labour market. A person's economic activity is derived from their “activity last week”. This is an indicator of their status or availability for employment - whether employed, unemployed, or their status if not employed and not seeking employment. Additional information included in the economic activity classification is also derived from information about the number of hours a person works and their type of employment - whether employed or self-employed.","Mae gweithgarwch economaidd yn ymwneud ag a oedd person a oedd yn 16 oed a throsodd yn gweithio neu'n chwilio am waith yn ystod yr wythnos cyn y cyfrifiad ai peidio. Yn hytrach na dangosydd syml a oedd rhywun mewn cyflogaeth ar hyn o bryd ai peidio, mae'n mesur a oedd person yn cymryd rhan weithredol yn y farchnad lafur ai peidio. -Mae gweithgaredd economaidd unigolyn yn deillio o'u “gweithgaredd yr wythnos diwethaf”. Mae hwn yn ddangosydd o'u statws neu argaeledd cyflogaeth - boed yn gyflogedig, yn ddi-waith, neu eu statws os nad ydynt yn gyflogedig ac nad ydynt yn chwilio am waith. Mae gwybodaeth ychwanegol a gynhwysir yn y dosbarthiad gweithgarwch economaidd hefyd yn deillio o wybodaeth am nifer yr oriau y mae person yn gweithio a'u math o gyflogaeth - boed yn gyflogedig neu'n hunangyflogedig.",Economic Activity,"The census concept of economic activity is compatible with the standard for economic status defined by the International Labour Organisation (ILO). It is one of a number of definitions used internationally to produce accurate and comparable statistics on employment, unemployment and economic status.","Mae cysyniad y cyfrifiad o weithgarwch economaidd yn gydnaws â'r safon ar gyfer statws economaidd a ddiffinnir gan y Sefydliad Llafur Rhyngwladol (ILO). Mae'n un o nifer o ddiffiniadau a ddefnyddir yn rhyngwladol i gynhyrchu ystadegau cywir a chymaradwy ar gyflogaeth, diweithdra a statws economaidd.",,,,PUB,DVO,LHQ,1,People,,,,,,,1,"The census is the most complete source of information about the population that we have. Every effort is made to include everyone. It is the only survey which provides a detailed picture of the entire population, and is unique because it covers everyone at the same time and asks the same core questions everywhere. This makes it easy to compare different parts of the country. +Mae gweithgaredd economaidd unigolyn yn deillio o'u “gweithgaredd yr wythnos diwethaf”. Mae hwn yn ddangosydd o'u statws neu argaeledd cyflogaeth - boed yn gyflogedig, yn ddi-waith, neu eu statws os nad ydynt yn gyflogedig ac nad ydynt yn chwilio am waith. Mae gwybodaeth ychwanegol a gynhwysir yn y dosbarthiad gweithgarwch economaidd hefyd yn deillio o wybodaeth am nifer yr oriau y mae person yn gweithio a'u math o gyflogaeth - boed yn gyflogedig neu'n hunangyflogedig.",Economic Activity,"The census concept of economic activity is compatible with the standard for economic status defined by the International Labour Organisation (ILO). 
It is one of a number of definitions used internationally to produce accurate and comparable statistics on employment, unemployment and economic status.","Mae cysyniad y cyfrifiad o weithgarwch economaidd yn gydnaws â'r safon ar gyfer statws economaidd a ddiffinnir gan y Sefydliad Llafur Rhyngwladol (ILO). Mae'n un o nifer o ddiffiniadau a ddefnyddir yn rhyngwladol i gynhyrchu ystadegau cywir a chymaradwy ar gyflogaeth, diweithdra a statws economaidd.",,,Y,PUB,DVO,LHQ,1,People,,,,,,,1,"The census is the most complete source of information about the population that we have. Every effort is made to include everyone. It is the only survey which provides a detailed picture of the entire population, and is unique because it covers everyone at the same time and asks the same core questions everywhere. This makes it easy to compare different parts of the country. However no census is perfect and some people are inevitably missed. ONS therefore uses complex statistical techniques to adjust the 2011 Census counts for those people missed by the census. The methods and quality assurance approach was researched and developed in consultation with academics, statisticians, demographers and users of census data. The result was a suite of methods to process, clean, adjust and protect the census results.",https://www.ons.gov.uk/census/2011census/2011censusdata/2011censususerguide/qualityandmethods 15,Occupation,Occupation,Ngalwedigaeth,A person's occupation relates to their main job and is derived from either their job title or details of the activities involved in their job. This is used to assign responses to an occupation code based on the Standard Occupational Classification 2010 (SOC2010).,Mae galwedigaeth unigolyn yn ymwneud â'i brif swydd ac mae'n deillio naill ai o deitl ei swydd neu fanylion y gweithgareddau sy'n gysylltiedig â'u swydd. Defnyddir hwn i neilltuo ymatebion i god meddiannaeth yn seiliedig ar Ddosbarthiad Galwedigaethol Safonol 2010 (SOC2010).,Occupation,The census concept of cccupation uses occupation codes from the Standard Occupational Classification 2010 (SOC2010).,Mae cysyniad y Cyfrifiad o CcCupation yn defnyddio codau galwedigaeth o'r Dosbarthiad Galwedigaethol Safonol 2010 (SOC2010).,"Indicator of comparability: Highly comparable @@ -429,7 +429,7 @@ http://www.ons.gov.uk/ons/guide-method/classifications/current-standard-classifi A yw allbynnau'r DU ar gael? Ydw. Mae allbynnau'r DU ar alwedigaeth a NS-SEC ar gael. - Allbynnau'r DU ar alwedigaeth Defnyddiwch y prif ddosbarthiad grŵp -- Mae allbynnau'r DU ar NS-SEC preswylwyr arferol a pherson cyfeirio aelwydydd (HRP) ar gael.",,PUB,DVO,LHQ,1,People,,,,,,,1,"The census is the most complete source of information about the population that we have. Every effort is made to include everyone. It is the only survey which provides a detailed picture of the entire population, and is unique because it covers everyone at the same time and asks the same core questions everywhere. This makes it easy to compare different parts of the country. +- Mae allbynnau'r DU ar NS-SEC preswylwyr arferol a pherson cyfeirio aelwydydd (HRP) ar gael.",Y,PUB,DVO,LHQ,1,People,,,,,,,1,"The census is the most complete source of information about the population that we have. Every effort is made to include everyone. It is the only survey which provides a detailed picture of the entire population, and is unique because it covers everyone at the same time and asks the same core questions everywhere. This makes it easy to compare different parts of the country. 
However no census is perfect and some people are inevitably missed. ONS therefore uses complex statistical techniques to adjust the 2011 Census counts for those people missed by the census. The methods and quality assurance approach was researched and developed in consultation with academics, statisticians, demographers and users of census data. The result was a suite of methods to process, clean, adjust and protect the census results.",https://www.ons.gov.uk/census/2011census/2011censusdata/2011censususerguide/qualityandmethods 16,Industry,Industry,Ddiwydiant,"The industry in which a person aged 16 and over works relates to their main job, and is derived from information provided on the main activity of their employer or business. This is used to assign responses to an industry code based on the Standard Industrial Classification 2007.","Mae'r diwydiant lle mae person 16 oed a throsodd yn gweithio yn ymwneud â'i brif swydd, ac mae'n deillio o wybodaeth a ddarperir am brif weithgaredd eu cyflogwr neu fusnes. Defnyddir hyn i neilltuo ymatebion i god diwydiant yn seiliedig ar y Dosbarthiad Diwydiannol Safonol 2007.",Industry,The census concept of industry uses industry codes from the Standard Industrial Classification 2007.,Mae cysyniad y cyfrifiad o ddiwydiant yn defnyddio codau diwydiant o'r dosbarthiad diwydiannol safonol 2007.,"Indicator of comparability: Highly comparable @@ -494,7 +494,7 @@ Er enghraifft, gellir cyfuno'r categorïau fel a ganlyn i alluogi cymhariaeth: Noder bod y cwestiwn ar enw'r cyflogwr yn cael ei ddefnyddio i gynhyrchu amcangyfrifon cyfrifiad ond ni chaiff data ei gasglu ar gyfer cyhoeddi uniongyrchol. A yw allbynnau'r DU ar gael? -Ydw. Mae allbynnau'r DU ar ddiwydiant ar gael.",,PUB,DVO,LHQ,1,People,,,,,,,1,"The census is the most complete source of information about the population that we have. Every effort is made to include everyone. It is the only survey which provides a detailed picture of the entire population, and is unique because it covers everyone at the same time and asks the same core questions everywhere. This makes it easy to compare different parts of the country. +Ydw. Mae allbynnau'r DU ar ddiwydiant ar gael.",Y,PUB,DVO,LHQ,1,People,,,,,,,1,"The census is the most complete source of information about the population that we have. Every effort is made to include everyone. It is the only survey which provides a detailed picture of the entire population, and is unique because it covers everyone at the same time and asks the same core questions everywhere. This makes it easy to compare different parts of the country. However no census is perfect and some people are inevitably missed. ONS therefore uses complex statistical techniques to adjust the 2011 Census counts for those people missed by the census. The methods and quality assurance approach was researched and developed in consultation with academics, statisticians, demographers and users of census data. The result was a suite of methods to process, clean, adjust and protect the census results.",https://www.ons.gov.uk/census/2011census/2011censusdata/2011censususerguide/qualityandmethods 17,Hours worked per week,Hours worked per week,Oriau a weithir yr wythnos,"The number of hours that a person aged 16 to 74, in employment in the week before the census, worked in their main job. This includes paid and unpaid overtime.","Nifer yr oriau yr oedd person 16 i 74 oed, mewn cyflogaeth yn ystod yr wythnos cyn y cyfrifiad, yn gweithio yn eu prif swydd. 
Mae hyn yn cynnwys goramser taledig a di-dâl.",Hours worked per week,,,"Indicator of comparability: Broadly comparable @@ -559,9 +559,9 @@ Ydw. Mae data ar gyfer Cymru, Lloegr, yr Alban a Gogledd Iwerddon ar gael gan fo - rhan-amser: 15 awr neu lai yn gweithio - rhan-amser: 16 i 30 awr yn gweithio - Gweithiodd 31 i 48 awr amser llawn -- amser llawn 49 neu fwy o oriau yn gweithio",,PUB,SV,LHQ,1,People,,,,,,,1,"The census is the most complete source of information about the population that we have. Every effort is made to include everyone. It is the only survey which provides a detailed picture of the entire population, and is unique because it covers everyone at the same time and asks the same core questions everywhere. This makes it easy to compare different parts of the country. +- amser llawn 49 neu fwy o oriau yn gweithio",Y,PUB,SV,LHQ,1,People,,,,,,,1,"The census is the most complete source of information about the population that we have. Every effort is made to include everyone. It is the only survey which provides a detailed picture of the entire population, and is unique because it covers everyone at the same time and asks the same core questions everywhere. This makes it easy to compare different parts of the country. However no census is perfect and some people are inevitably missed. ONS therefore uses complex statistical techniques to adjust the 2011 Census counts for those people missed by the census. The methods and quality assurance approach was researched and developed in consultation with academics, statisticians, demographers and users of census data. The result was a suite of methods to process, clean, adjust and protect the census results.",https://www.ons.gov.uk/census/2011census/2011censusdata/2011censususerguide/qualityandmethods -18,Approximated Social Grade,Approximated Social Grade,Gradd gymdeithasol amcangyfrifedig,"Social Grade is the socio-economic classification used by the Market Research and Marketing Industries, most often in the analysis of spending habits and consumer attitudes. Although it is not possible to allocate Social Grade precisely from information collected by the 2011 Census, the Market Research Society has developed a method for using Census information to provide a good approximation of Social Grade.","Gradd Gymdeithasol yw'r dosbarthiad economaidd-gymdeithasol a ddefnyddir gan y Diwydiannau Ymchwil i'r Farchnad a Marchnata, gan amlaf wrth ddadansoddi arferion gwario ac agweddau defnyddwyr. Er nad yw'n bosibl dyrannu Gradd Gymdeithasol yn union o wybodaeth a gasglwyd gan Gyfrifiad 2011, mae'r Gymdeithas Ymchwil i'r Farchnad wedi datblygu dull i forddefnyddio gwybodaeth y Cyfrifiad i ddarparu brasamcan da o Radd Gymdeithasol.",Approximated Social Grade,The census concept of approximated social grade is equivalent to the socio-economic classification used by the Market Research and Marketing Industries.,Mae'r cysyniad cyfrifiad o radd gymdeithasol amcangyfrifedig yn gyfwerth â'r dosbarthiad economaidd-gymdeithasol a ddefnyddir gan y diwydiannau ymchwil a marchnata marchnata.,,,,PUB,DVO,LHQ,1,People,,,,,,,1,"The census is the most complete source of information about the population that we have. Every effort is made to include everyone. It is the only survey which provides a detailed picture of the entire population, and is unique because it covers everyone at the same time and asks the same core questions everywhere. This makes it easy to compare different parts of the country. 
+18,Approximated Social Grade,Approximated Social Grade,Gradd gymdeithasol amcangyfrifedig,"Social Grade is the socio-economic classification used by the Market Research and Marketing Industries, most often in the analysis of spending habits and consumer attitudes. Although it is not possible to allocate Social Grade precisely from information collected by the 2011 Census, the Market Research Society has developed a method for using Census information to provide a good approximation of Social Grade.","Gradd Gymdeithasol yw'r dosbarthiad economaidd-gymdeithasol a ddefnyddir gan y Diwydiannau Ymchwil i'r Farchnad a Marchnata, gan amlaf wrth ddadansoddi arferion gwario ac agweddau defnyddwyr. Er nad yw'n bosibl dyrannu Gradd Gymdeithasol yn union o wybodaeth a gasglwyd gan Gyfrifiad 2011, mae'r Gymdeithas Ymchwil i'r Farchnad wedi datblygu dull i forddefnyddio gwybodaeth y Cyfrifiad i ddarparu brasamcan da o Radd Gymdeithasol.",Approximated Social Grade,The census concept of approximated social grade is equivalent to the socio-economic classification used by the Market Research and Marketing Industries.,Mae'r cysyniad cyfrifiad o radd gymdeithasol amcangyfrifedig yn gyfwerth â'r dosbarthiad economaidd-gymdeithasol a ddefnyddir gan y diwydiannau ymchwil a marchnata marchnata.,,,Y,PUB,DVO,LHQ,1,People,,,,,,,1,"The census is the most complete source of information about the population that we have. Every effort is made to include everyone. It is the only survey which provides a detailed picture of the entire population, and is unique because it covers everyone at the same time and asks the same core questions everywhere. This makes it easy to compare different parts of the country. However no census is perfect and some people are inevitably missed. ONS therefore uses complex statistical techniques to adjust the 2011 Census counts for those people missed by the census. The methods and quality assurance approach was researched and developed in consultation with academics, statisticians, demographers and users of census data. The result was a suite of methods to process, clean, adjust and protect the census results.",https://www.ons.gov.uk/census/2011census/2011censusdata/2011censususerguide/qualityandmethods From e0b587df33152808d3f20cdd04711f14f53e8ed0 Mon Sep 17 00:00:00 2001 From: Peter Hynes Date: Mon, 9 May 2022 14:00:45 +0100 Subject: [PATCH 07/15] Removed Flat_Classification_Flag from schema --- ctb_metadata_files/metadata.graphql | 1 - 1 file changed, 1 deletion(-) diff --git a/ctb_metadata_files/metadata.graphql b/ctb_metadata_files/metadata.graphql index 4e08953..d94100e 100644 --- a/ctb_metadata_files/metadata.graphql +++ b/ctb_metadata_files/metadata.graphql @@ -11,7 +11,6 @@ type DatasetMetadata { type VariableMetadata { Mnemonic_2011: String - Flat_Classification_Flag: String Parent_Classification_Mnemonic: String Default_Classification_Flag: String Version: String! 
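
A quick way to sanity-check the `Signed_Off_Flag` values added to `sample_2011/Variable.csv` in the patch above is a short standalone script. This is an illustrative sketch only, not part of the patch series: the path and column names are taken from the CSV header shown in the diff, and the assumption that the flag is always an explicit Y or N is mine.
```
import csv

# Report any row of the sample Variable.csv whose Signed_Off_Flag is not
# an explicit Y or N (an assumption based on the values seen in the diffs).
with open('sample_2011/Variable.csv', newline='', encoding='utf-8') as f:
    for row in csv.DictReader(f):
        if row['Signed_Off_Flag'] not in ('Y', 'N'):
            print(f"row {row['Id']} ({row['Variable_Mnemonic']}): "
                  f"unexpected Signed_Off_Flag {row['Signed_Off_Flag']!r}")
```
In the test data earlier in the series, VAR2 and VAR3 carry `N` while the rest carry `Y`, so both values occur in practice and a blank flag is the case worth catching.
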
From c145df51b42b5b00cd90b837b038cf57897b1300 Mon Sep 17 00:00:00 2001 From: Peter Hynes Date: Fri, 6 May 2022 17:40:54 +0100 Subject: [PATCH 08/15] Add support for Cantabular v9.2.0 --- README.md | 216 ++++++++++++++-------- bin/ons_csv_to_ctb_json_main.py | 140 ++++++++++---- ctb_metadata_files/metadata_9_2_0.graphql | 127 +++++++++++++ 3 files changed, 372 insertions(+), 111 deletions(-) create mode 100644 ctb_metadata_files/metadata_9_2_0.graphql diff --git a/README.md b/README.md index 95ed808..9eddc75 100644 --- a/README.md +++ b/README.md @@ -34,45 +34,56 @@ Basic logging will be displayed by default, including the number of high-level C objects loaded and the name of the output files. ``` > python3 bin/ons_csv_to_ctb_json_main.py -i test/testdata/ -g test/testdata/geography/geography.csv -o ctb_metadata_files/ -t=2022-04-28 15:21:06,357 lvl=INFO msg=Reading test/testdata/geography/geography.csv: found Welsh labels for unknown classification: OTHER -t=2022-04-28 15:21:06,357 lvl=INFO msg=Dropped non public classification: CLASS_PRIV -t=2022-04-28 15:21:06,357 lvl=INFO msg=Loaded metadata for 5 Cantabular variables -t=2022-04-28 15:21:06,358 lvl=INFO msg=Loaded metadata for 3 Cantabular datasets -t=2022-04-28 15:21:06,359 lvl=INFO msg=Written dataset metadata file to: ctb_metadata_files/cantabm_v9-3-0_unknown-metadata-version_dataset-md_20220428-1.json -t=2022-04-28 15:21:06,360 lvl=INFO msg=Dropped non public ONS Dataset: DS_PRIV -t=2022-04-28 15:21:06,360 lvl=INFO msg=Loaded metadata for 4 Cantabular tables -t=2022-04-28 15:21:06,360 lvl=INFO msg=Written table metadata file to: ctb_metadata_files/cantabm_v9-3-0_unknown-metadata-version_tables-md_20220428-1.json -t=2022-04-28 15:21:06,360 lvl=INFO msg=Loaded service metadata -t=2022-04-28 15:21:06,360 lvl=INFO msg=Written service metadata file to: ctb_metadata_files/cantabm_v9-3-0_unknown-metadata-version_service-md_20220428-1.json +t=2022-05-09 21:26:50,348 lvl=INFO msg=ons_csv_to_ctb_json_main.py version 1.1.alpha +t=2022-05-09 21:26:50,348 lvl=INFO msg=CSV source directory: test/testdata/ +t=2022-05-09 21:26:50,348 lvl=INFO msg=Geography file: test/testdata/geography/geography.csv +t=2022-05-09 21:26:50,350 lvl=INFO msg=Reading test/testdata/geography/geography.csv: found Welsh labels for unknown classification: OTHER +t=2022-05-09 21:26:50,350 lvl=INFO msg=Dropped non public classification: CLASS_PRIV +t=2022-05-09 21:26:50,350 lvl=INFO msg=Dropped non public classification: GEO_PRIV +t=2022-05-09 21:26:50,350 lvl=INFO msg=Loaded metadata for 5 Cantabular variables +t=2022-05-09 21:26:50,351 lvl=INFO msg=Loaded metadata for 3 Cantabular datasets +t=2022-05-09 21:26:50,351 lvl=INFO msg=Dropped non public ONS Dataset: DS_PRIV +t=2022-05-09 21:26:50,351 lvl=INFO msg=Loaded metadata for 4 Cantabular tables +t=2022-05-09 21:26:50,351 lvl=INFO msg=Loaded service metadata +t=2022-05-09 21:26:50,351 lvl=INFO msg=Output files will be written in Cantabular 9.3.0 format +t=2022-05-09 21:26:50,352 lvl=INFO msg=Written dataset metadata file to: ctb_metadata_files/cantabm_v9-3-0_unknown-metadata-version_dataset-md_20220509-1.json +t=2022-05-09 21:26:50,353 lvl=INFO msg=Written table metadata file to: ctb_metadata_files/cantabm_v9-3-0_unknown-metadata-version_tables-md_20220509-1.json +t=2022-05-09 21:26:50,353 lvl=INFO msg=Written service metadata file to: ctb_metadata_files/cantabm_v9-3-0_unknown-metadata-version_service-md_20220509-1.json ``` More detailed information can be obtained by running with a `-l DEBUG` flag e.g.: ``` > python3 
bin/ons_csv_to_ctb_json_main.py -i test/testdata/ -g test/testdata/geography/geography.csv -o ctb_metadata_files/ -l DEBUG -t=2022-04-28 15:25:04,409 lvl=DEBUG msg=Creating classification for geographic variable: GEO1 -t=2022-04-28 15:25:04,409 lvl=DEBUG msg=Creating classification for geographic variable: GEO2 -t=2022-04-28 15:25:04,409 lvl=INFO msg=Reading test/testdata/geography/geography.csv: found Welsh labels for unknown classification: OTHER -t=2022-04-28 15:25:04,409 lvl=DEBUG msg=Loaded metadata for Cantabular variable: CLASS1 -t=2022-04-28 15:25:04,409 lvl=DEBUG msg=Loaded metadata for Cantabular variable: CLASS2 -t=2022-04-28 15:25:04,409 lvl=DEBUG msg=Loaded metadata for Cantabular variable: CLASS3 -t=2022-04-28 15:25:04,409 lvl=INFO msg=Dropped non public classification: CLASS_PRIV -t=2022-04-28 15:25:04,409 lvl=DEBUG msg=Loaded metadata for Cantabular variable: GEO1 -t=2022-04-28 15:25:04,409 lvl=DEBUG msg=Loaded metadata for Cantabular variable: GEO2 -t=2022-04-28 15:25:04,409 lvl=INFO msg=Loaded metadata for 5 Cantabular variables -t=2022-04-28 15:25:04,410 lvl=DEBUG msg=Loaded metadata for Cantabular dataset: DB1 -t=2022-04-28 15:25:04,410 lvl=DEBUG msg=Loaded metadata for Cantabular dataset: DB2 -t=2022-04-28 15:25:04,410 lvl=DEBUG msg=Loaded metadata for Cantabular dataset: DB3 -t=2022-04-28 15:25:04,410 lvl=INFO msg=Loaded metadata for 3 Cantabular datasets -t=2022-04-28 15:25:04,411 lvl=INFO msg=Written dataset metadata file to: ctb_metadata_files/cantabm_v9-3-0_unknown-metadata-version_dataset-md_20220428-1.json -t=2022-04-28 15:25:04,412 lvl=DEBUG msg=Loaded metadata for Cantabular table: DS1 -t=2022-04-28 15:25:04,412 lvl=DEBUG msg=Loaded metadata for Cantabular table: DS2 -t=2022-04-28 15:25:04,412 lvl=DEBUG msg=Loaded metadata for Cantabular table: DS3 -t=2022-04-28 15:25:04,412 lvl=INFO msg=Dropped non public ONS Dataset: DS_PRIV -t=2022-04-28 15:25:04,412 lvl=DEBUG msg=Loaded metadata for Cantabular table: DS4 -t=2022-04-28 15:25:04,412 lvl=INFO msg=Loaded metadata for 4 Cantabular tables -t=2022-04-28 15:25:04,412 lvl=INFO msg=Written table metadata file to: ctb_metadata_files/cantabm_v9-3-0_unknown-metadata-version_tables-md_20220428-1.json -t=2022-04-28 15:25:04,412 lvl=INFO msg=Loaded service metadata -t=2022-04-28 15:25:04,412 lvl=INFO msg=Written service metadata file to: ctb_metadata_files/cantabm_v9-3-0_unknown-metadata-version_service-md_20220428-1.json +t=2022-05-09 21:27:20,066 lvl=INFO msg=ons_csv_to_ctb_json_main.py version 1.1.alpha +t=2022-05-09 21:27:20,066 lvl=INFO msg=CSV source directory: test/testdata/ +t=2022-05-09 21:27:20,066 lvl=INFO msg=Geography file: test/testdata/geography/geography.csv +t=2022-05-09 21:27:20,067 lvl=DEBUG msg=Creating classification for geographic variable: GEO1 +t=2022-05-09 21:27:20,067 lvl=DEBUG msg=Creating classification for geographic variable: GEO2 +t=2022-05-09 21:27:20,067 lvl=DEBUG msg=Creating classification for geographic variable: GEO_PRIV +t=2022-05-09 21:27:20,067 lvl=INFO msg=Reading test/testdata/geography/geography.csv: found Welsh labels for unknown classification: OTHER +t=2022-05-09 21:27:20,068 lvl=DEBUG msg=Loaded metadata for Cantabular variable: CLASS1 +t=2022-05-09 21:27:20,068 lvl=DEBUG msg=Loaded metadata for Cantabular variable: CLASS2 +t=2022-05-09 21:27:20,068 lvl=DEBUG msg=Loaded metadata for Cantabular variable: CLASS3 +t=2022-05-09 21:27:20,068 lvl=INFO msg=Dropped non public classification: CLASS_PRIV +t=2022-05-09 21:27:20,068 lvl=DEBUG msg=Loaded metadata for Cantabular variable: 
GEO1 +t=2022-05-09 21:27:20,068 lvl=DEBUG msg=Loaded metadata for Cantabular variable: GEO2 +t=2022-05-09 21:27:20,068 lvl=INFO msg=Dropped non public classification: GEO_PRIV +t=2022-05-09 21:27:20,068 lvl=INFO msg=Loaded metadata for 5 Cantabular variables +t=2022-05-09 21:27:20,068 lvl=DEBUG msg=Loaded metadata for Cantabular dataset: DB1 +t=2022-05-09 21:27:20,068 lvl=DEBUG msg=Loaded metadata for Cantabular dataset: DB2 +t=2022-05-09 21:27:20,068 lvl=DEBUG msg=Loaded metadata for Cantabular dataset: DB3 +t=2022-05-09 21:27:20,068 lvl=INFO msg=Loaded metadata for 3 Cantabular datasets +t=2022-05-09 21:27:20,069 lvl=DEBUG msg=Loaded metadata for Cantabular table: DS1 +t=2022-05-09 21:27:20,069 lvl=DEBUG msg=Loaded metadata for Cantabular table: DS2 +t=2022-05-09 21:27:20,069 lvl=DEBUG msg=Loaded metadata for Cantabular table: DS3 +t=2022-05-09 21:27:20,069 lvl=INFO msg=Dropped non public ONS Dataset: DS_PRIV +t=2022-05-09 21:27:20,069 lvl=DEBUG msg=Loaded metadata for Cantabular table: DS4 +t=2022-05-09 21:27:20,069 lvl=INFO msg=Loaded metadata for 4 Cantabular tables +t=2022-05-09 21:27:20,069 lvl=INFO msg=Loaded service metadata +t=2022-05-09 21:27:20,069 lvl=INFO msg=Output files will be written in Cantabular 9.3.0 format +t=2022-05-09 21:27:20,070 lvl=INFO msg=Written dataset metadata file to: ctb_metadata_files/cantabm_v9-3-0_unknown-metadata-version_dataset-md_20220509-1.json +t=2022-05-09 21:27:20,071 lvl=INFO msg=Written table metadata file to: ctb_metadata_files/cantabm_v9-3-0_unknown-metadata-version_tables-md_20220509-1.json +t=2022-05-09 21:27:20,071 lvl=INFO msg=Written service metadata file to: ctb_metadata_files/cantabm_v9-3-0_unknown-metadata-version_service-md_20220509-1.json ``` Output file names @@ -102,16 +113,21 @@ arguments as described in the help text for `ons_csv_to_ctb_json_main.py`: For example: ``` > python3 bin/ons_csv_to_ctb_json_main.py -i test/testdata/ -g test/testdata/geography/geography.csv -o ctb_metadata_files/ -p t -m test -b 42 -t=2022-04-28 15:28:02,518 lvl=INFO msg=Reading test/testdata/geography/geography.csv: found Welsh labels for unknown classification: OTHER -t=2022-04-28 15:28:02,518 lvl=INFO msg=Dropped non public classification: CLASS_PRIV -t=2022-04-28 15:28:02,518 lvl=INFO msg=Loaded metadata for 5 Cantabular variables -t=2022-04-28 15:28:02,518 lvl=INFO msg=Loaded metadata for 3 Cantabular datasets -t=2022-04-28 15:28:02,519 lvl=INFO msg=Written dataset metadata file to: ctb_metadata_files/t_cantabm_v9-3-0_test_dataset-md_20220428-42.json -t=2022-04-28 15:28:02,520 lvl=INFO msg=Dropped non public ONS Dataset: DS_PRIV -t=2022-04-28 15:28:02,520 lvl=INFO msg=Loaded metadata for 4 Cantabular tables -t=2022-04-28 15:28:02,521 lvl=INFO msg=Written table metadata file to: ctb_metadata_files/t_cantabm_v9-3-0_test_tables-md_20220428-42.json -t=2022-04-28 15:28:02,521 lvl=INFO msg=Loaded service metadata -t=2022-04-28 15:28:02,521 lvl=INFO msg=Written service metadata file to: ctb_metadata_files/t_cantabm_v9-3-0_test_service-md_20220428-42.json +t=2022-05-09 21:27:57,633 lvl=INFO msg=ons_csv_to_ctb_json_main.py version 1.1.alpha +t=2022-05-09 21:27:57,633 lvl=INFO msg=CSV source directory: test/testdata/ +t=2022-05-09 21:27:57,633 lvl=INFO msg=Geography file: test/testdata/geography/geography.csv +t=2022-05-09 21:27:57,634 lvl=INFO msg=Reading test/testdata/geography/geography.csv: found Welsh labels for unknown classification: OTHER +t=2022-05-09 21:27:57,635 lvl=INFO msg=Dropped non public classification: CLASS_PRIV +t=2022-05-09 21:27:57,635 
lvl=INFO msg=Dropped non public classification: GEO_PRIV +t=2022-05-09 21:27:57,635 lvl=INFO msg=Loaded metadata for 5 Cantabular variables +t=2022-05-09 21:27:57,635 lvl=INFO msg=Loaded metadata for 3 Cantabular datasets +t=2022-05-09 21:27:57,636 lvl=INFO msg=Dropped non public ONS Dataset: DS_PRIV +t=2022-05-09 21:27:57,636 lvl=INFO msg=Loaded metadata for 4 Cantabular tables +t=2022-05-09 21:27:57,636 lvl=INFO msg=Loaded service metadata +t=2022-05-09 21:27:57,636 lvl=INFO msg=Output files will be written in Cantabular 9.3.0 format +t=2022-05-09 21:27:57,637 lvl=INFO msg=Written dataset metadata file to: ctb_metadata_files/t_cantabm_v9-3-0_test_dataset-md_20220509-42.json +t=2022-05-09 21:27:57,637 lvl=INFO msg=Written table metadata file to: ctb_metadata_files/t_cantabm_v9-3-0_test_tables-md_20220509-42.json +t=2022-05-09 21:27:57,638 lvl=INFO msg=Written service metadata file to: ctb_metadata_files/t_cantabm_v9-3-0_test_service-md_20220509-42.json ``` Using externally sourced files @@ -145,39 +161,43 @@ can be found in the `sample_2011` directory. Use this command to convert the files to JSON (with debugging enabled): ``` > python3 bin/ons_csv_to_ctb_json_main.py -i sample_2011/ -g sample_2011/geography.csv -o ctb_metadata_files/ -m 2001-sample -l DEBUG -t=2022-05-06 12:55:25,674 lvl=DEBUG msg=Creating classification for geographic variable: Region -t=2022-05-06 12:55:25,675 lvl=DEBUG msg=Creating classification for geographic variable: Country -t=2022-05-06 12:55:25,676 lvl=DEBUG msg=Loaded metadata for Cantabular variable: Residence Type -t=2022-05-06 12:55:25,676 lvl=DEBUG msg=Loaded metadata for Cantabular variable: Family Composition -t=2022-05-06 12:55:25,676 lvl=DEBUG msg=Loaded metadata for Cantabular variable: Population Base -t=2022-05-06 12:55:25,676 lvl=DEBUG msg=Loaded metadata for Cantabular variable: Sex -t=2022-05-06 12:55:25,676 lvl=DEBUG msg=Loaded metadata for Cantabular variable: Age -t=2022-05-06 12:55:25,676 lvl=DEBUG msg=Loaded metadata for Cantabular variable: Marital Status -t=2022-05-06 12:55:25,676 lvl=DEBUG msg=Loaded metadata for Cantabular variable: Student -t=2022-05-06 12:55:25,676 lvl=DEBUG msg=Loaded metadata for Cantabular variable: Country of Birth -t=2022-05-06 12:55:25,676 lvl=DEBUG msg=Loaded metadata for Cantabular variable: Health -t=2022-05-06 12:55:25,676 lvl=DEBUG msg=Loaded metadata for Cantabular variable: Ethnic Group -t=2022-05-06 12:55:25,676 lvl=DEBUG msg=Loaded metadata for Cantabular variable: Religion -t=2022-05-06 12:55:25,676 lvl=DEBUG msg=Loaded metadata for Cantabular variable: Economic Activity -t=2022-05-06 12:55:25,676 lvl=DEBUG msg=Loaded metadata for Cantabular variable: Occupation -t=2022-05-06 12:55:25,676 lvl=DEBUG msg=Loaded metadata for Cantabular variable: Industry -t=2022-05-06 12:55:25,676 lvl=DEBUG msg=Loaded metadata for Cantabular variable: Hours worked per week -t=2022-05-06 12:55:25,676 lvl=DEBUG msg=Loaded metadata for Cantabular variable: Approximated Social Grade -t=2022-05-06 12:55:25,676 lvl=DEBUG msg=Loaded metadata for Cantabular variable: Region -t=2022-05-06 12:55:25,676 lvl=DEBUG msg=Loaded metadata for Cantabular variable: Country -t=2022-05-06 12:55:25,676 lvl=INFO msg=Loaded metadata for 18 Cantabular variables -t=2022-05-06 12:55:25,677 lvl=DEBUG msg=Loaded metadata for Cantabular dataset: Teaching-Dataset -t=2022-05-06 12:55:25,677 lvl=INFO msg=Loaded metadata for 1 Cantabular datasets -t=2022-05-06 12:55:25,680 lvl=INFO msg=Written dataset metadata file to: 
ctb_metadata_files/cantabm_v9-3-0_2001-sample_dataset-md_20220506-1.json -t=2022-05-06 12:55:25,681 lvl=DEBUG msg=Loaded metadata for Cantabular table: LC2101EW -t=2022-05-06 12:55:25,681 lvl=DEBUG msg=Loaded metadata for Cantabular table: LC1117EW -t=2022-05-06 12:55:25,681 lvl=DEBUG msg=Loaded metadata for Cantabular table: LC2107EW -t=2022-05-06 12:55:25,681 lvl=DEBUG msg=Loaded metadata for Cantabular table: LC6107EW -t=2022-05-06 12:55:25,681 lvl=DEBUG msg=Loaded metadata for Cantabular table: LC6112EW -t=2022-05-06 12:55:25,681 lvl=INFO msg=Loaded metadata for 5 Cantabular tables -t=2022-05-06 12:55:25,682 lvl=INFO msg=Written table metadata file to: ctb_metadata_files/cantabm_v9-3-0_2001-sample_tables-md_20220506-1.json -t=2022-05-06 12:55:25,682 lvl=INFO msg=Loaded service metadata -t=2022-05-06 12:55:25,682 lvl=INFO msg=Written service metadata file to: ctb_metadata_files/cantabm_v9-3-0_2001-sample_service-md_20220506-1.json +t=2022-05-09 21:28:29,336 lvl=INFO msg=ons_csv_to_ctb_json_main.py version 1.1.alpha +t=2022-05-09 21:28:29,336 lvl=INFO msg=CSV source directory: sample_2011/ +t=2022-05-09 21:28:29,336 lvl=INFO msg=Geography file: sample_2011/geography.csv +t=2022-05-09 21:28:29,354 lvl=DEBUG msg=Creating classification for geographic variable: Region +t=2022-05-09 21:28:29,354 lvl=DEBUG msg=Creating classification for geographic variable: Country +t=2022-05-09 21:28:29,356 lvl=DEBUG msg=Loaded metadata for Cantabular variable: Residence Type +t=2022-05-09 21:28:29,356 lvl=DEBUG msg=Loaded metadata for Cantabular variable: Family Composition +t=2022-05-09 21:28:29,356 lvl=DEBUG msg=Loaded metadata for Cantabular variable: Population Base +t=2022-05-09 21:28:29,356 lvl=DEBUG msg=Loaded metadata for Cantabular variable: Sex +t=2022-05-09 21:28:29,356 lvl=DEBUG msg=Loaded metadata for Cantabular variable: Age +t=2022-05-09 21:28:29,356 lvl=DEBUG msg=Loaded metadata for Cantabular variable: Marital Status +t=2022-05-09 21:28:29,356 lvl=DEBUG msg=Loaded metadata for Cantabular variable: Student +t=2022-05-09 21:28:29,356 lvl=DEBUG msg=Loaded metadata for Cantabular variable: Country of Birth +t=2022-05-09 21:28:29,357 lvl=DEBUG msg=Loaded metadata for Cantabular variable: Health +t=2022-05-09 21:28:29,357 lvl=DEBUG msg=Loaded metadata for Cantabular variable: Ethnic Group +t=2022-05-09 21:28:29,357 lvl=DEBUG msg=Loaded metadata for Cantabular variable: Religion +t=2022-05-09 21:28:29,357 lvl=DEBUG msg=Loaded metadata for Cantabular variable: Economic Activity +t=2022-05-09 21:28:29,357 lvl=DEBUG msg=Loaded metadata for Cantabular variable: Occupation +t=2022-05-09 21:28:29,357 lvl=DEBUG msg=Loaded metadata for Cantabular variable: Industry +t=2022-05-09 21:28:29,357 lvl=DEBUG msg=Loaded metadata for Cantabular variable: Hours worked per week +t=2022-05-09 21:28:29,357 lvl=DEBUG msg=Loaded metadata for Cantabular variable: Approximated Social Grade +t=2022-05-09 21:28:29,357 lvl=DEBUG msg=Loaded metadata for Cantabular variable: Region +t=2022-05-09 21:28:29,357 lvl=DEBUG msg=Loaded metadata for Cantabular variable: Country +t=2022-05-09 21:28:29,357 lvl=INFO msg=Loaded metadata for 18 Cantabular variables +t=2022-05-09 21:28:29,358 lvl=DEBUG msg=Loaded metadata for Cantabular dataset: Teaching-Dataset +t=2022-05-09 21:28:29,358 lvl=INFO msg=Loaded metadata for 1 Cantabular datasets +t=2022-05-09 21:28:29,360 lvl=DEBUG msg=Loaded metadata for Cantabular table: LC2101EW +t=2022-05-09 21:28:29,360 lvl=DEBUG msg=Loaded metadata for Cantabular table: LC1117EW +t=2022-05-09 
21:28:29,360 lvl=DEBUG msg=Loaded metadata for Cantabular table: LC2107EW +t=2022-05-09 21:28:29,360 lvl=DEBUG msg=Loaded metadata for Cantabular table: LC6107EW +t=2022-05-09 21:28:29,360 lvl=DEBUG msg=Loaded metadata for Cantabular table: LC6112EW +t=2022-05-09 21:28:29,360 lvl=INFO msg=Loaded metadata for 5 Cantabular tables +t=2022-05-09 21:28:29,361 lvl=INFO msg=Loaded service metadata +t=2022-05-09 21:28:29,361 lvl=INFO msg=Output files will be written in Cantabular 9.3.0 format +t=2022-05-09 21:28:29,364 lvl=INFO msg=Written dataset metadata file to: ctb_metadata_files/cantabm_v9-3-0_2001-sample_dataset-md_20220509-1.json +t=2022-05-09 21:28:29,365 lvl=INFO msg=Written table metadata file to: ctb_metadata_files/cantabm_v9-3-0_2001-sample_tables-md_20220509-1.json +t=2022-05-09 21:28:29,365 lvl=INFO msg=Written service metadata file to: ctb_metadata_files/cantabm_v9-3-0_2001-sample_service-md_20220509-1.json ``` Load the JSON files with cantabular-metadata @@ -206,7 +226,7 @@ This query can be used to obtain information for a single named table: http://localhost:8493/graphql?query=%7Bservice%7Btables(names%3A%20%5B%22DS1%22%5D)%7Bname%20datasetName%20vars%20description%20label%20all%7D%7D%7D%0A Tests ------ +===== This repository has tests written using the `unittest` framework. They are run as part of Continuous Integration testing in the GitHub repository. They can be run manually by running this @@ -215,3 +235,43 @@ command from the base directory: ``` PYTHONPATH=test:bin python3 -m unittest -v ``` + +Other Cantabular versions +========================= + +The `-v` argument can be used to generate output files that are compatible with a different version of Cantabular. +At present only 9.2.0 and 9.3.0 are supported. If any other version is specified then the specified version +will be reflected in the output filenames, but `9.3.0` format will be used. + +To generate version 9.2.0 compatible files from the test data use the following command: +``` +> python3 bin/ons_csv_to_ctb_json_main.py -i test/testdata/ -g test/testdata/geography/geography.csv -o ctb_metadata_files/ -v 9.2.0 +t=2022-05-09 21:40:49,218 lvl=INFO msg=ons_csv_to_ctb_json_main.py version 1.1.alpha +t=2022-05-09 21:40:49,218 lvl=INFO msg=CSV source directory: test/testdata/ +t=2022-05-09 21:40:49,218 lvl=INFO msg=Geography file: test/testdata/geography/geography.csv +t=2022-05-09 21:40:49,220 lvl=INFO msg=Reading test/testdata/geography/geography.csv: found Welsh labels for unknown classification: OTHER +t=2022-05-09 21:40:49,220 lvl=INFO msg=Dropped non public classification: CLASS_PRIV +t=2022-05-09 21:40:49,220 lvl=INFO msg=Dropped non public classification: GEO_PRIV +t=2022-05-09 21:40:49,220 lvl=INFO msg=Loaded metadata for 5 Cantabular variables +t=2022-05-09 21:40:49,221 lvl=INFO msg=Loaded metadata for 3 Cantabular datasets +t=2022-05-09 21:40:49,222 lvl=INFO msg=Dropped non public ONS Dataset: DS_PRIV +t=2022-05-09 21:40:49,222 lvl=INFO msg=Loaded metadata for 4 Cantabular tables +t=2022-05-09 21:40:49,222 lvl=INFO msg=Loaded service metadata +t=2022-05-09 21:40:49,222 lvl=INFO msg=Output files will be written in Cantabular 9.2.0 format +t=2022-05-09 21:40:49,223 lvl=INFO msg=Written dataset metadata file to: ctb_metadata_files/cantabm_v9-2-0_unknown-metadata-version_dataset-md_20220509-1.json +t=2022-05-09 21:40:49,223 lvl=INFO msg=Written service metadata file to: ctb_metadata_files/cantabm_v9-2-0_unknown-metadata-version_service-md_20220509-1.json +``` + +No tables metadata file is produced. 
The table metadata is instead embedded in the service metadata file.
+
+To load the files into `cantabular-metadata` version 9.2.0 you need a different GraphQL types
+file, which can be found at `ctb_metadata_files/metadata_9_2_0.graphql`. The files are also
+specified on the command line instead of via environment variables.
+
+To load the generated JSON files into `cantabular-metadata` (version 9.2.0) run the following
+commands, substituting the file names and paths as appropriate:
+```
+cd ctb_metadata_files
+/cantabular-metadata metadata_9_2_0.graphql cantabm_v9-2-0_unknown-metadata-version_service-md_20220509-1.json cantabm_v9-2-0_unknown-metadata-version_dataset-md_20220509-1.json
+```
diff --git a/bin/ons_csv_to_ctb_json_main.py b/bin/ons_csv_to_ctb_json_main.py
index b777fb1..545825c 100644
--- a/bin/ons_csv_to_ctb_json_main.py
+++ b/bin/ons_csv_to_ctb_json_main.py
@@ -2,6 +2,8 @@
 import json
 import os
 import logging
+import re
+from pathlib import Path
 from argparse import ArgumentParser
 from datetime import date
 from ons_csv_to_ctb_json_load import Loader, PUBLIC_SECURITY_MNEMONIC
@@ -10,10 +12,12 @@
 VERSION = '1.1.alpha'

 SYSTEM = 'cantabm'
-SYSTEM_SOFTWARE_VERSION = 'v9-3-0'
+DEFAULT_CANTABULAR_VERSION = '9.3.0'
+CANTABULAR_V9_2_0 = '9.2.0'
 FILE_CONTENT_TYPE_DATASET = 'dataset-md'
 FILE_CONTENT_TYPE_TABLES = 'tables-md'
 FILE_CONTENT_TYPE_SERVICE = 'service-md'
+KNOWN_CANTABULAR_VERSIONS = [DEFAULT_CANTABULAR_VERSION, CANTABULAR_V9_2_0]


 def filename_segment(value):
@@ -33,6 +37,14 @@ def positive_int(value):
     return number


+def cantabular_version_string(value):
+    """Check that the version is of format x.y.z."""
+    value = value.strip()
+    if not re.match(r'^\d+\.\d+\.\d+$', value):
+        raise ValueError(f"invalid value: '{value}'")
+    return value
+
+
 def main():
     """
     Load metadata in CSV format and export in JSON format.
@@ -83,62 +95,124 @@ def main():
                         help='Build number to use in output filenames '
                              '(default: %(default)s)')

+    parser.add_argument('-v', '--cantabular-version',
+                        type=cantabular_version_string,
+                        default=DEFAULT_CANTABULAR_VERSION,
+                        help='Cantabular version for output files. The supported versions are '
+                             f'[{", ".join(KNOWN_CANTABULAR_VERSIONS)}]. If any other version is '
+                             'supplied then it will be used in the filename, but version '
+                             f'{DEFAULT_CANTABULAR_VERSION} formatting will be used. '
+                             '(default: %(default)s)')
+
     args = parser.parse_args()

     logging.basicConfig(format='t=%(asctime)s lvl=%(levelname)s msg=%(message)s',
                         level=args.log_level)

+    logging.info(f'{Path(__file__).name} version {VERSION}')
+    logging.info(f'CSV source directory: {args.input_dir}')
+    if args.geography_file:
+        logging.info(f'Geography file: {args.geography_file}')
+
     for directory in (args.input_dir, args.output_dir):
         if not os.path.isdir(directory):
             raise ValueError(f'{directory} does not exist or is not a directory')

     todays_date = date.today().strftime('%Y%m%d')
+    base_filename_template = output_filename_template(
+        args.file_prefix, args.cantabular_version, args.metadata_master_version, todays_date,
+        args.build_number)

     # loader is used to load the metadata from CSV files and convert it to JSON.
     loader = Loader(args.input_dir, args.geography_file)

-    # Build Cantabular variable and dataset objects and write them to a JSON file.
+    # Build Cantabular variable objects.
     # A Cantabular variable is equivalent to an ONS classification.
-    # A Cantabular dataset is equivalent to an ONS database.
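+    # To summarise the naming used throughout this script:
+    #   Cantabular variable <-> ONS classification
+    #   Cantabular dataset  <-> ONS database
+    #   Cantabular table    <-> ONS dataset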
ctb_variables = build_ctb_variables(loader.classifications, loader.categories) - ctb_datasets = build_ctb_datasets(loader.databases, ctb_variables) - base_filename = output_filename(args.file_prefix, args.metadata_master_version, - FILE_CONTENT_TYPE_DATASET, todays_date, - args.build_number) - filename = os.path.join(args.output_dir, base_filename) - with open(filename, 'w') as jsonfile: - json.dump(ctb_datasets, jsonfile, indent=4) - logging.info(f'Written dataset metadata file to: {filename}') + # Build Cantabular dataset objects. + # A Cantabular dataset is equivalent to an ONS database. + ctb_datasets = build_ctb_datasets(loader.databases, ctb_variables) - # Build Cantabular table objects and write to JSON. + # Build Cantabular table objects. + # A Cantabular table is equivalent to an ONS dataset. ctb_tables = build_ctb_tables(loader.datasets) - base_filename = output_filename(args.file_prefix, args.metadata_master_version, - FILE_CONTENT_TYPE_TABLES, todays_date, args.build_number) - filename = os.path.join(args.output_dir, base_filename) - with open(filename, 'w') as jsonfile: - json.dump(ctb_tables, jsonfile, indent=4) - logging.info(f'Written table metadata file to: {filename}') - - # Build Cantabular service metadata objects and write to JSON. + # Build Cantabular service metadata. service_metadata = build_ctb_service_metadata() - base_filename = output_filename(args.file_prefix, args.metadata_master_version, - FILE_CONTENT_TYPE_SERVICE, todays_date, - args.build_number) - filename = os.path.join(args.output_dir, base_filename) - with open(filename, 'w') as jsonfile: - json.dump(service_metadata, jsonfile, indent=4) - logging.info(f'Written service metadata file to: {filename}') - - -def output_filename(prefix, metadata_master_version, content_type, todays_date, build_number): - """Generate output filename.""" - filename = (f'{SYSTEM}_{SYSTEM_SOFTWARE_VERSION}_{metadata_master_version}_{content_type}_' + # There is not a separate tables file for v9.2.0. Use the output_file_types list + # to determine which file types will be written. 
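+    # As a rough illustration (a hand-written sketch, not verbatim output), the
+    # v9.2.0 conversion below embeds each table into both localized service
+    # metadata objects:
+    #
+    #   service_metadata[idx]['meta']['tables'] = [
+    #       {'name': ..., 'label': ..., 'description': ...,
+    #        'datasetName': ..., 'vars': [...], 'meta': {...}},
+    #   ]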
+ output_file_types = [FILE_CONTENT_TYPE_DATASET, FILE_CONTENT_TYPE_SERVICE, + FILE_CONTENT_TYPE_TABLES] + + if args.cantabular_version == DEFAULT_CANTABULAR_VERSION: + logging.info( + f'Output files will be written in Cantabular {args.cantabular_version} format') + + elif args.cantabular_version == CANTABULAR_V9_2_0: + output_file_types = [FILE_CONTENT_TYPE_DATASET, FILE_CONTENT_TYPE_SERVICE] + convert_json_to_ctb_v9_2_0(ctb_datasets, ctb_tables, service_metadata) + logging.info( + f'Output files will be written in Cantabular {args.cantabular_version} format') + + else: + logging.info( + f'{args.cantabular_version} is an unknown Cantabular version: files will be written ' + f'using {DEFAULT_CANTABULAR_VERSION} format') + + if FILE_CONTENT_TYPE_DATASET in output_file_types: + filename = os.path.join(args.output_dir, + base_filename_template.format(FILE_CONTENT_TYPE_DATASET)) + with open(filename, 'w') as jsonfile: + json.dump(ctb_datasets, jsonfile, indent=4) + logging.info(f'Written dataset metadata file to: {filename}') + + if FILE_CONTENT_TYPE_TABLES in output_file_types: + filename = os.path.join(args.output_dir, + base_filename_template.format(FILE_CONTENT_TYPE_TABLES)) + with open(filename, 'w') as jsonfile: + json.dump(ctb_tables, jsonfile, indent=4) + logging.info(f'Written table metadata file to: {filename}') + + if FILE_CONTENT_TYPE_SERVICE in output_file_types: + filename = os.path.join(args.output_dir, + base_filename_template.format(FILE_CONTENT_TYPE_SERVICE)) + with open(filename, 'w') as jsonfile: + json.dump(service_metadata, jsonfile, indent=4) + logging.info(f'Written service metadata file to: {filename}') + + +def convert_json_to_ctb_v9_2_0(ctb_datasets, ctb_tables, service_metadata): + """Convert JSON to Cantabular v9.2.0 format.""" + for dataset in ctb_datasets: + dataset['meta']['description'] = dataset.pop('description') + for variable in dataset['vars'] if dataset['vars'] else []: + variable['meta']['description'] = variable.pop('description') + + service_metadata[0]['meta']['tables'] = [] + service_metadata[1]['meta']['tables'] = [] + for table in ctb_tables: + for idx in [0, 1]: + localized_table = { + 'name': table['name'], + 'label': table['ref'][idx]['label'], + 'description': table['ref'][idx]['description'], + 'datasetName': table['datasetName'], + 'vars': table['vars'], + 'meta': table['ref'][idx]['meta'], + } + service_metadata[idx]['meta']['tables'].append(localized_table) + + +def output_filename_template(prefix, cantabular_version, metadata_master_version, todays_date, + build_number): + """Generate template for output filename.""" + system_software_version = 'v' + cantabular_version.replace('.', '-') + filename = (f'{SYSTEM}_{system_software_version}_{metadata_master_version}_{{}}_' f'{todays_date}-{build_number}.json') if prefix: - filename = f'{prefix}_{filename}' + filename = prefix + '_' + filename return filename diff --git a/ctb_metadata_files/metadata_9_2_0.graphql b/ctb_metadata_files/metadata_9_2_0.graphql new file mode 100644 index 0000000..94b79f3 --- /dev/null +++ b/ctb_metadata_files/metadata_9_2_0.graphql @@ -0,0 +1,127 @@ +type ServiceMetadata { + description: String! + tables: [Table]! +} + +type Table { + name: String! + label: String! + description: String + datasetName: String! + vars: [String]! + meta: TableMetadata! +} + +type DatasetMetadata { + description: String! + Cantabular_DB_Flag: String + Source: Source! + Version: String! + Lowest_Geog_Variable: String +} + +type VariableMetadata { + description: String! 
+ Mnemonic_2011: String + Flat_Classification_Flag: String + Parent_Classification_Mnemonic: String + Default_Classification_Flag: String + Version: String! + ONS_Variable: ONS_Variable! + Topics: [Topic]! +} + +type ONS_Variable { + Variable_Mnemonic: String! + Variable_Title: String! + Variable_Mnemonic_2011: String + Comparability_Comments: String + Uk_Comparison_Comments: String + Geographic_Abbreviation: String + Geographic_Theme: String + Geographic_Coverage: String + Version: String! + Statistical_Unit: Statistical_Unit + Keywords: [String]! + Topic: Topic + Questions: [Question]! + Variable_Type: Variable_Type! + Quality_Statement_Text: String + Quality_Summary_URL: String +} + +type Variable_Type { + Variable_Type_Code: String! + Variable_Type_Description: String! +} + +type Topic { + Topic_Mnemonic: String! + Topic_Description: String! + Topic_Title: String! +} + +type Question { + Question_Code: String! + Question_Label: String! + Reason_For_Asking_Question: String + Question_First_Asked_In_Year: String + Version: String! +} + +type Source { + Source_Mnemonic: String! + Source_Description: String! + Copyright_Statement: String + Licence: String + Nationals_Statistic_Certified: String + Methodology_Link: String + Methodology_Statement: String + SDC_Link: String + SDC_Statement: String + Version: String! + Contact: Contact +} + +type Contact { + Contact_Id: String! + Contact_Name: String! + Contact_Email: String! + Contact_Phone: String + Contact_Website: String +} + +type TableMetadata { + Dataset_Mnemonic_2011: String + Geographic_Coverage: String! + Dataset_Population: String! + Last_Updated: String + Unique_Url: String + Contact: Contact + Version: String! + Related_Datasets: [String]! + Keywords: [String]! + Publications: [Publication]! + Census_Releases: [Census_Release]! + Statistical_Unit: Statistical_Unit! + Alternate_Geographic_Variables: [String]! +} + +type Publication { + Publication_Mnemonic: String! + Publication_Title: String + Publisher_Name: String + Publisher_Website: String +} + +type Census_Release { + Census_Release_Number: String! + Census_Release_Description: String! + Release_Date: String! +} + +type Statistical_Unit { + Statistical_Unit: String! + Statistical_Unit_Description: String! +} + From f4f12fba25c976ae459a2efe3754bd8eaf8cf8fa Mon Sep 17 00:00:00 2001 From: Peter Hynes Date: Tue, 10 May 2022 17:34:17 +0100 Subject: [PATCH 09/15] Updated README.md --- README.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 9eddc75..a30116c 100644 --- a/README.md +++ b/README.md @@ -4,7 +4,8 @@ Introduction `bin/ons_csv_to_ctb_json_main.py` is an application that loads source metadata files in CSV format and converts them to hierarchical JSON that can be loaded into `cantabular-metadata`. -It is compatible with version `1.1` of the metadata schema and version `9.3.0` of `cantabular-metadata`. +It is compatible with version `1.1` of the metadata schema and versions `9.3.0`/`9.2.0` of +`cantabular-metadata`. `9.3.0` format is used by default. This is version `1.1.alpha` of the CSV to JSON processing software and is subject to change. From ea534606cff6610bd4c05d92a34a114ca54506f6 Mon Sep 17 00:00:00 2001 From: Peter Hynes Date: Tue, 10 May 2022 13:48:48 +0100 Subject: [PATCH 10/15] Use row number instead of line number in errors The DictReader.line_num value is not necessarily the same as the row number as a single row can cover multiple lines. 
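For example (an illustrative snippet, not taken from this repository), a quoted
field that spans two physical lines makes line_num run ahead of the row count:

```
import csv
import io

rows = io.StringIO('name,notes\nbob,"line one\nline two"\nbill,ok\n')
reader = csv.DictReader(rows)
for row_num, row in enumerate(reader, 2):
    print(row_num, reader.line_num, row['name'])
# Prints '2 3 bob' then '3 4 bill': the first data row is row 2, but
# DictReader.line_num has already advanced to 3.
```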
The row number is a more useful value in error messages, since users will
typically inspect the CSV files with a tool such as Excel.
---
 bin/ons_csv_to_ctb_json_geo.py  |  8 +++-----
 bin/ons_csv_to_ctb_json_load.py | 26 +++++++++++++-------------
 bin/ons_csv_to_ctb_json_read.py | 24 +++++++++++-------------
 test/test_csv_read.py           |  4 ++--
 test/test_geo_read.py           |  4 ++--
 5 files changed, 31 insertions(+), 35 deletions(-)

diff --git a/bin/ons_csv_to_ctb_json_geo.py b/bin/ons_csv_to_ctb_json_geo.py
index ec78865..57359c6 100644
--- a/bin/ons_csv_to_ctb_json_geo.py
+++ b/bin/ons_csv_to_ctb_json_geo.py
@@ -40,13 +40,11 @@ def read_geo_cats(filename):
     var_to_columns = assign_columns_to_variables(filename, fieldnames)
     data = {var_name: {} for var_name in var_to_columns}

-    for row in reader:
+    for row_num, row in enumerate(reader, 2):
         if len(row) > len(fieldnames):
-            raise ValueError(f'Reading {filename}: too many fields on line '
-                             f'{reader.line_num}')
+            raise ValueError(f'Reading {filename}: too many fields on row {row_num}')
         if len(row) < len(fieldnames):
-            raise ValueError(f'Reading {filename}: too few fields on line '
-                             f'{reader.line_num}')
+            raise ValueError(f'Reading {filename}: too few fields on row {row_num}')

         for geo, columns in var_to_columns.items():
             code = row[columns.code].strip()
diff --git a/bin/ons_csv_to_ctb_json_load.py b/bin/ons_csv_to_ctb_json_load.py
index 5132417..f4b3f74 100644
--- a/bin/ons_csv_to_ctb_json_load.py
+++ b/bin/ons_csv_to_ctb_json_load.py
@@ -256,7 +256,7 @@ def datasets(self):
         dataset_to_variables = self.load_dataset_to_variables(dataset_mnemonics)

         datasets = {}
-        for dataset, line_num in dataset_rows:
+        for dataset, row_num in dataset_rows:
             dataset_mnemonic = dataset.pop('Dataset_Mnemonic')
             database_mnemonic = dataset.pop('Database_Mnemonic')

@@ -285,20 +285,20 @@
             if dataset['Security_Mnemonic'] == PUBLIC_SECURITY_MNEMONIC:
                 if not dataset_variables.classifications:
                     raise ValueError(
-                        f'Reading {self.full_filename(filename)}:{line_num} {dataset_mnemonic} '
+                        f'Reading {self.full_filename(filename)}:{row_num} {dataset_mnemonic} '
                         'has no associated classifications or geographic variable')

                 for classification in all_classifications:
                     if self.classifications[classification].private['Security_Mnemonic'] != \
                             PUBLIC_SECURITY_MNEMONIC:
                         raise ValueError(
-                            f'Reading {self.full_filename(filename)}:{line_num} Public ONS '
+                            f'Reading {self.full_filename(filename)}:{row_num} Public ONS '
                             f'dataset {dataset_mnemonic} has non-public classification '
                             f'{classification}')

                     if classification not in \
                             self.databases[database_mnemonic].private['Classifications']:
                         raise ValueError(
-                            f'Reading {self.full_filename(filename)}:{line_num} '
+                            f'Reading {self.full_filename(filename)}:{row_num} '
                             f'{dataset_mnemonic} has classification {classification} '
                             f'that is not in database {database_mnemonic}')
@@ -403,10 +403,10 @@ def categories(self):
             unique_combo_fields=['Category_Code', 'Classification_Mnemonic'])

         classification_to_cats = {}
-        for cat, line_num in category_rows:
+        for cat, row_num in category_rows:
             classification_mnemonic = cat['Classification_Mnemonic']
             if self.classifications[classification_mnemonic].private['Is_Geographic']:
-                raise ValueError(f'Reading {self.full_filename(filename)}:{line_num} '
+                raise ValueError(f'Reading {self.full_filename(filename)}:{row_num} '
                                  'found category for geographic classification '
                                  f'{classification_mnemonic}: all categories for geographic '
                                  'classifications must be in a separate lookup file')
@@ -586,18
+586,18 @@ def variables(self): 'Geographic_Theme_Welsh', 'Geographic_Coverage_Welsh'} variables = {} - for variable, line_num in variable_rows: + for variable, row_num in variable_rows: # Ensure that non-geographic variables do not have geographic values set. is_geographic = variable['Variable_Type_Code'] == GEOGRAPHIC_VARIABLE_TYPE if not is_geographic: # This value is not always populated in source files # if not variable['Statistical_Unit']: - # raise ValueError(f'Reading {self.full_filename(filename)}:{line_num} ' + # raise ValueError(f'Reading {self.full_filename(filename)}:{row_num} ' # f'no Statistical_Unit specified for non geographic variable: ' # f'{variable["Variable_Mnemonic"]}') for geo_field in all_geo_fields: if variable[geo_field]: - raise ValueError(f'Reading {self.full_filename(filename)}:{line_num} ' + raise ValueError(f'Reading {self.full_filename(filename)}:{row_num} ' f'{geo_field} specified for non geographic variable: ' f'{variable["Variable_Mnemonic"]}') @@ -605,7 +605,7 @@ def variables(self): # else: # for geo_field in en_geo_fields: # if not variable[geo_field]: - # raise ValueError(f'Reading {self.full_filename(filename)}:{line_num} ' + # raise ValueError(f'Reading {self.full_filename(filename)}:{row_num} ' # f'no {geo_field} specified for geographic variable: ' # f'{variable["Variable_Mnemonic"]}') @@ -688,11 +688,11 @@ def classifications(self): classification_to_topics = self.load_classification_to_topics(classification_mnemonics) classifications = {} - for classification, line_num in classification_rows: + for classification, row_num in classification_rows: variable_mnemonic = classification.pop('Variable_Mnemonic') classification_mnemonic = classification.pop('Classification_Mnemonic') if self.variables[variable_mnemonic].private['Is_Geographic']: - raise ValueError(f'Reading {self.full_filename(filename)}:{line_num} ' + raise ValueError(f'Reading {self.full_filename(filename)}:{row_num} ' f'{classification_mnemonic} has a geographic variable ' f'{variable_mnemonic} which is not allowed') @@ -709,7 +709,7 @@ def classifications(self): if classification['Security_Mnemonic'] == PUBLIC_SECURITY_MNEMONIC: variable = classification['ONS_Variable'] if variable.private['Security_Mnemonic'] != PUBLIC_SECURITY_MNEMONIC: - raise ValueError(f'Reading {self.full_filename(filename)}:{line_num} ' + raise ValueError(f'Reading {self.full_filename(filename)}:{row_num} ' f'Public classification {classification_mnemonic} has ' f'non-public variable {variable_mnemonic}') diff --git a/bin/ons_csv_to_ctb_json_read.py b/bin/ons_csv_to_ctb_json_read.py index 8d7b5dc..d7325e9 100644 --- a/bin/ons_csv_to_ctb_json_read.py +++ b/bin/ons_csv_to_ctb_json_read.py @@ -15,7 +15,7 @@ def optional(name, unique=False, validate_fn=None): return Column(name, unique, validate_fn, required=False) -Row = namedtuple('Row', 'data line_num') +Row = namedtuple('Row', 'data row_num') class Reader: @@ -48,13 +48,11 @@ def read(self): raise ValueError(f'Reading {self.filename}: missing expected columns: ' f'{", ".join(sorted(missing_columns))}') - for row in reader: + for row_num, row in enumerate(reader, 2): if None in row: - raise ValueError(f'Reading {self.filename}: too many fields on line ' - f'{reader.line_num}') + raise ValueError(f'Reading {self.filename}: too many fields on row {row_num}') if None in row.values(): - raise ValueError(f'Reading {self.filename}: too few fields on line ' - f'{reader.line_num}') + raise ValueError(f'Reading {self.filename}: too few fields on row {row_num}') for k 
in list(row.keys()): if k not in self.expected_columns: @@ -63,40 +61,40 @@ def read(self): if not [k for k in row if row[k]]: continue - self.validate_row(row, reader.line_num) + self.validate_row(row, row_num) for k in row.keys(): if row[k] == '': row[k] = None - data.append(Row(row, reader.line_num)) + data.append(Row(row, row_num)) return data - def validate_row(self, row, line_num): + def validate_row(self, row, row_num): """Validate the fields in a row.""" for column in self.columns: row[column.name] = row[column.name].strip() if column.required and not row[column.name]: - raise ValueError(f'Reading {self.filename}:{line_num} no value supplied ' + raise ValueError(f'Reading {self.filename}:{row_num} no value supplied ' f'for required field {column.name}') if column.unique: if row[column.name] in self.unique_column_values[column.name]: - raise ValueError(f'Reading {self.filename}:{line_num} duplicate ' + raise ValueError(f'Reading {self.filename}:{row_num} duplicate ' f'value {row[column.name]} for {column.name}') self.unique_column_values[column.name].add(row[column.name]) if row[column.name] and column.validate_fn and not \ column.validate_fn(row[column.name]): - raise ValueError(f'Reading {self.filename}:{line_num} invalid value ' + raise ValueError(f'Reading {self.filename}:{row_num} invalid value ' f'{row[column.name]} for {column.name}') if self.unique_combo_fields: combo = tuple([row[f] for f in self.unique_combo_fields]) if combo in self.unique_combos: - raise ValueError(f'Reading {self.filename}:{line_num} duplicate ' + raise ValueError(f'Reading {self.filename}:{row_num} duplicate ' f'value combo {"/".join(combo)} for ' f'{"/".join(self.unique_combo_fields)}') self.unique_combos.add(combo) diff --git a/test/test_csv_read.py b/test/test_csv_read.py index ce9236d..04b46d7 100644 --- a/test/test_csv_read.py +++ b/test/test_csv_read.py @@ -69,7 +69,7 @@ def test_too_many_columns(self, m): required('name'), required('email'), ] - with self.assertRaisesRegex(ValueError, 'Reading file.csv: too many fields on line 3'): + with self.assertRaisesRegex(ValueError, 'Reading file.csv: too many fields on row 3'): Reader('file.csv', columns).read() @unittest.mock.patch('builtins.open', new_callable=mock_open, read_data="""name,email @@ -81,7 +81,7 @@ def test_too_few_columns(self, m): required('name'), required('email'), ] - with self.assertRaisesRegex(ValueError, 'Reading file.csv: too few fields on line 3'): + with self.assertRaisesRegex(ValueError, 'Reading file.csv: too few fields on row 3'): Reader('file.csv', columns).read() @unittest.mock.patch('builtins.open', new_callable=mock_open, read_data="""name diff --git a/test/test_geo_read.py b/test/test_geo_read.py index b59681b..500fd4b 100644 --- a/test/test_geo_read.py +++ b/test/test_geo_read.py @@ -64,14 +64,14 @@ def test_valid_varname_characters(self, m): OA1,LAD1,LAD1 Name,LAD1 Name (Welsh),COUNTRY1,COUNTRY1 Name,extra """) def test_too_many_columns(self, m): - with self.assertRaisesRegex(ValueError, 'Reading file.csv: too many fields on line 2'): + with self.assertRaisesRegex(ValueError, 'Reading file.csv: too many fields on row 2'): read_geo_cats('file.csv') @unittest.mock.patch('builtins.open', new_callable=mock_open, read_data="""OA11CD,LAD22CD,LAD22NM,LAD22NMW,COUNTRY22CD,COUNTRY22NM OA1,LAD1,LAD1 Name,LAD1 Name (Welsh),COUNTRY1 """) def test_too_few_columns(self, m): - with self.assertRaisesRegex(ValueError, 'Reading file.csv: too few fields on line 2'): + with self.assertRaisesRegex(ValueError, 'Reading file.csv: too few 
fields on row 2'):
             read_geo_cats('file.csv')

     @unittest.mock.patch('builtins.open', new_callable=mock_open, read_data="""LAD22CD,LAD22NM,LAD22NMW

From f57ef27141a39189c5295ab30be82a2281813639 Mon Sep 17 00:00:00 2001
From: Peter Hynes
Date: Tue, 10 May 2022 15:29:27 +0100
Subject: [PATCH 11/15] Recover from errors if --best-effort is set

---
 .github/workflows/ci-test.yml        |  2 +-
 bin/ons_csv_to_ctb_json_ds_vars.py   | 87 +++++++++++++------------
 bin/ons_csv_to_ctb_json_load.py      | 88 +++++++++++++++++++---------
 bin/ons_csv_to_ctb_json_main.py      | 11 +++-
 bin/ons_csv_to_ctb_json_read.py      | 45 ++++++++++----
 test/test_csv_read.py                | 22 +++----
 test/test_dataset_classifications.py | 14 ++---
 7 files changed, 171 insertions(+), 98 deletions(-)

diff --git a/.github/workflows/ci-test.yml b/.github/workflows/ci-test.yml
index 5905643..1b9ad9a 100644
--- a/.github/workflows/ci-test.yml
+++ b/.github/workflows/ci-test.yml
@@ -32,7 +32,7 @@ jobs:
         pydocstyle bin/*.py
     - name: Run pylint
       run: |
-        pylint --max-locals=20 --max-public-methods=30 --max-branches=50 --max-statements=90 --min-similarity-lines=6 --max-module-lines=1200 --disable=W1202 bin/*.py
+        pylint --max-locals=22 --max-public-methods=30 --max-branches=50 --max-statements=90 --min-similarity-lines=6 --max-module-lines=1200 --max-attributes=10 --disable=W1202 bin/*.py
     - name: Run tests
       run: |
         PYTHONPATH=test:bin python3 -m unittest -v
diff --git a/bin/ons_csv_to_ctb_json_ds_vars.py b/bin/ons_csv_to_ctb_json_ds_vars.py
index 3bf2180..890679d 100644
--- a/bin/ons_csv_to_ctb_json_ds_vars.py
+++ b/bin/ons_csv_to_ctb_json_ds_vars.py
@@ -1,4 +1,5 @@
 """Build data structure that represents relationship between dataset and variables."""
+import logging
 from collections import namedtuple

 DatasetVariables = namedtuple('DatasetVariables', 'classifications alternate_geog_variables')
@@ -7,7 +8,7 @@
 class DatasetVarsBuilder():
     """Utility class to validate and build dataset variables."""

-    def __init__(self, dataset_mnemonic, filename, all_classifications):
+    def __init__(self, dataset_mnemonic, filename, all_classifications, recoverable_error):
         """Initialise DatasetVarsBuilder object."""
         self.lowest_geog_variable = None
         self.alternate_geog_variables = []
@@ -16,71 +17,81 @@ def __init__(self, dataset_mnemonic, filename, all_classifications):
         self.dataset_mnemonic = dataset_mnemonic
         self.filename = filename
         self.all_classifications = all_classifications
+        self.recoverable_error = recoverable_error

-    def add_geographic_variable(self, variable):
+    def add_geographic_variable(self, variable, row_num):
         """Add geographic variable ensuring data integrity."""
         variable_mnemonic = variable['Variable_Mnemonic']
         classification_mnemonic = variable['Classification_Mnemonic']
         if classification_mnemonic:
-            raise ValueError(f'Reading {self.filename} '
-                             'Classification_Mnemonic must not be specified for '
-                             f'geographic variable {variable_mnemonic} in dataset '
-                             f'{self.dataset_mnemonic}')
+            self.recoverable_error(f'Reading {self.filename}:{row_num} '
+                                   'Classification_Mnemonic must not be specified for '
+                                   f'geographic variable {variable_mnemonic} in dataset '
+                                   f'{self.dataset_mnemonic}')
         if variable['Processing_Priority']:
-            raise ValueError(f'Reading {self.filename} '
-                             'Processing_Priority must not be specified for geographic'
-                             f' variable {variable_mnemonic} in dataset '
-                             f'{self.dataset_mnemonic}')
+            self.recoverable_error(f'Reading {self.filename}:{row_num} '
+                                   'Processing_Priority must not be specified for geographic'
+                                   f' variable
{variable_mnemonic} in dataset ' + f'{self.dataset_mnemonic}') if variable['Lowest_Geog_Variable_Flag'] == 'Y': if self.lowest_geog_variable: - raise ValueError(f'Reading {self.filename} ' - 'Lowest_Geog_Variable_Flag set on variable ' - f'{variable_mnemonic} and ' - f'{self.lowest_geog_variable} for dataset ' - f'{self.dataset_mnemonic}') - self.lowest_geog_variable = variable['Variable_Mnemonic'] + self.recoverable_error(f'Reading {self.filename}:{row_num} ' + 'Lowest_Geog_Variable_Flag set on variable ' + f'{variable_mnemonic} and ' + f'{self.lowest_geog_variable} for dataset ' + f'{self.dataset_mnemonic}') + else: + self.lowest_geog_variable = variable['Variable_Mnemonic'] else: self.alternate_geog_variables.append(variable['Variable_Mnemonic']) - def add_non_geographic_variable(self, variable): + def add_non_geographic_variable(self, variable, row_num): """Add non-geographic variable ensuring data integrity.""" variable_mnemonic = variable['Variable_Mnemonic'] classification_mnemonic = variable['Classification_Mnemonic'] if not classification_mnemonic: - raise ValueError(f'Reading {self.filename} ' - 'Classification must be specified for non-geographic ' - f'{variable_mnemonic} in dataset {self.dataset_mnemonic}') + self.recoverable_error(f'Reading {self.filename}:{row_num} ' + 'Classification must be specified for non-geographic ' + f'{variable_mnemonic} in dataset {self.dataset_mnemonic}') + logging.warning(f'Reading {self.filename}:{row_num} dropping record') + return + if variable['Lowest_Geog_Variable_Flag'] == 'Y': - raise ValueError(f'Reading {self.filename} ' - 'Lowest_Geog_Variable_Flag set on non-geographic variable' - f' {variable_mnemonic} for dataset {self.dataset_mnemonic}') + self.recoverable_error(f'Reading {self.filename}:{row_num} ' + 'Lowest_Geog_Variable_Flag set on non-geographic variable ' + f'{variable_mnemonic} for dataset {self.dataset_mnemonic}') classification = self.all_classifications[classification_mnemonic] if classification.private['Variable_Mnemonic'] != variable_mnemonic: - raise ValueError(f'Reading {self.filename} Invalid ' - f'classification {classification_mnemonic} ' - f'specified for variable {variable_mnemonic} ' - f'in dataset {self.dataset_mnemonic}') + self.recoverable_error(f'Reading {self.filename}:{row_num} Invalid ' + f'classification {classification_mnemonic} ' + f'specified for variable {variable_mnemonic} ' + f'in dataset {self.dataset_mnemonic}') + logging.warning(f'Reading {self.filename}:{row_num} dropping record') + return + if not variable['Processing_Priority']: - raise ValueError(f'Reading {self.filename} ' - 'Processing_Priority not specified for classification ' - f'{classification_mnemonic} in dataset ' - f'{self.dataset_mnemonic}') + self.recoverable_error(f'Reading {self.filename}:{row_num} ' + 'Processing_Priority not specified for classification ' + f'{classification_mnemonic} in dataset ' + f'{self.dataset_mnemonic}') + variable['Processing_Priority'] = 0 + self.classifications.append(variable['Classification_Mnemonic']) self.processing_priorities.append(int(variable['Processing_Priority'])) def dataset_variables(self): """Return dataset classifications and alternate geographic variables for each dataset.""" if self.alternate_geog_variables and not self.lowest_geog_variable: - raise ValueError(f'Reading {self.filename} ' - 'Lowest_Geog_Variable_Flag not set on any geographic variables ' - f'for dataset {self.dataset_mnemonic}') + self.recoverable_error(f'Reading {self.filename} ' + 'Lowest_Geog_Variable_Flag not set on 
any geographic variables ' + f'for dataset {self.dataset_mnemonic}') if set(self.processing_priorities) != set(range(1, len(self.processing_priorities) + 1)): - raise ValueError(f'Reading {self.filename} ' - 'Invalid processing_priorities ' - f'{self.processing_priorities} for dataset ' - f'{self.dataset_mnemonic}') + self.recoverable_error(f'Reading {self.filename} ' + 'Invalid processing_priorities ' + f'{self.processing_priorities} for dataset ' + f'{self.dataset_mnemonic}') classifications = [c for _, c in sorted(zip(self.processing_priorities, self.classifications))] diff --git a/bin/ons_csv_to_ctb_json_load.py b/bin/ons_csv_to_ctb_json_load.py index f4b3f74..037715e 100644 --- a/bin/ons_csv_to_ctb_json_load.py +++ b/bin/ons_csv_to_ctb_json_load.py @@ -60,10 +60,26 @@ class Loader: Many of the fields in this class are cached properties, with the data loaded on first access. """ - def __init__(self, input_directory, geography_file): + def __init__(self, input_directory, geography_file, best_effort=False): """Initialise MetadataLoader object.""" self.input_directory = input_directory self.geography_file = geography_file + self._error_count = 0 + + def raise_value_error(msg): + """Raise a ValueError exception.""" + raise ValueError(msg) + + def log_error(msg): + """Log the error.""" + self._error_count += 1 + logging.warning(msg) + + self.recoverable_error = log_error if best_effort else raise_value_error + + def error_count(self): + """Return number of errors.""" + return self._error_count def read_file(self, filename, columns, unique_combo_fields=None): """ @@ -73,7 +89,7 @@ def read_file(self, filename, columns, unique_combo_fields=None): and corresponding line number. """ full_filename = self.full_filename(filename) - return Reader(full_filename, columns, unique_combo_fields).read() + return Reader(full_filename, columns, self.recoverable_error, unique_combo_fields).read() def full_filename(self, filename): """Add the input_directory path to the filename.""" @@ -283,24 +299,34 @@ def datasets(self): # If the dataset is public then ensure that there is at least one classification and # that all the classifications are also public. 
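+            # Under --best-effort these recoverable_error() calls log warnings
+            # rather than raising ValueError, and drop_dataset marks the record
+            # to be skipped so that processing can continue.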
if dataset['Security_Mnemonic'] == PUBLIC_SECURITY_MNEMONIC: + drop_dataset = False if not dataset_variables.classifications: - raise ValueError( + self.recoverable_error( f'Reading {self.full_filename(filename)}:{row_num} {dataset_mnemonic} ' 'has no associated classifications or geographic variable') + drop_dataset = True for classification in all_classifications: if self.classifications[classification].private['Security_Mnemonic'] != \ PUBLIC_SECURITY_MNEMONIC: - raise ValueError( + self.recoverable_error( f'Reading {self.full_filename(filename)}:{row_num} Public ONS ' f'dataset {dataset_mnemonic} has non-public classification ' f'{classification}') + drop_dataset = True + if classification not in \ self.databases[database_mnemonic].private['Classifications']: - raise ValueError( + self.recoverable_error( f'Reading {self.full_filename(filename)}:{row_num} ' f'{dataset_mnemonic} has classification {classification} ' f'that is not in database {database_mnemonic}') + drop_dataset = True + + if drop_dataset: + logging.warning( + f'Reading {self.full_filename(filename)}:{row_num} dropping record') + continue del dataset['Id'] del dataset['Signed_Off_Flag'] @@ -418,9 +444,10 @@ def categories(self): num_cat_items = \ self.classifications[classification_mnemonic].private['Number_Of_Category_Items'] if num_cat_items and len(one_var_categories) != num_cat_items: - raise ValueError(f'Reading {self.full_filename(filename)} ' - f'Unexpected number of categories for {classification_mnemonic}: ' - f'expected {num_cat_items} but found {len(one_var_categories)}') + self.recoverable_error( + f'Reading {self.full_filename(filename)} ' + f'Unexpected number of categories for {classification_mnemonic}: ' + f'expected {num_cat_items} but found {len(one_var_categories)}') welsh_cats = {cat['Category_Code']: cat['External_Category_Label_Welsh'] for cat in one_var_categories if cat['External_Category_Label_Welsh']} @@ -440,8 +467,9 @@ def categories(self): continue if not self.classifications[class_name].private['Is_Geographic']: - raise ValueError(f'Reading {self.geography_file}: found Welsh labels for non ' - f'geographic classification: {class_name}') + self.recoverable_error(f'Reading {self.geography_file}: found Welsh labels for ' + f'non geographic classification: {class_name}') + continue welsh_names = {cd: nm.welsh_name for cd, nm in geo_cats.items() if nm.welsh_name} if geo_cats: @@ -597,9 +625,9 @@ def variables(self): # f'{variable["Variable_Mnemonic"]}') for geo_field in all_geo_fields: if variable[geo_field]: - raise ValueError(f'Reading {self.full_filename(filename)}:{row_num} ' - f'{geo_field} specified for non geographic variable: ' - f'{variable["Variable_Mnemonic"]}') + self.recoverable_error(f'Reading {self.full_filename(filename)}:{row_num} ' + f'{geo_field} specified for non geographic ' + f'variable: {variable["Variable_Mnemonic"]}') # These values are not yet populated in source files # else: @@ -803,20 +831,23 @@ def load_database_to_variables(self, database_mnemonics): if db_var['Lowest_Geog_Variable_Flag'] == 'Y': if not is_geographic: - raise ValueError(f'Reading {self.full_filename(filename)} ' - 'Lowest_Geog_Variable_Flag set on non-geographic variable' - f' {variable_mnemonic} for database {database_mnemonic}') - if lowest_geog_var: - raise ValueError(f'Reading {self.full_filename(filename)} ' - f'Lowest_Geog_Variable_Flag set on {variable_mnemonic} ' - f'and {lowest_geog_var} for database {database_mnemonic}') - lowest_geog_var = variable_mnemonic + self.recoverable_error( + 
f'Reading {self.full_filename(filename)} ' + 'Lowest_Geog_Variable_Flag set on non-geographic variable' + f' {variable_mnemonic} for database {database_mnemonic}') + elif lowest_geog_var: + self.recoverable_error( + f'Reading {self.full_filename(filename)} ' + f'Lowest_Geog_Variable_Flag set on {variable_mnemonic} ' + f'and {lowest_geog_var} for database {database_mnemonic}') + else: + lowest_geog_var = variable_mnemonic variables.append(variable_mnemonic) if not lowest_geog_var and contains_geo_vars: - raise ValueError(f'Reading {self.full_filename(filename)} ' - 'Lowest_Geog_Variable_Flag not set on any geographic variable ' - f'for database {database_mnemonic}') + self.recoverable_error(f'Reading {self.full_filename(filename)} ' + 'Lowest_Geog_Variable_Flag not set on any geographic ' + f'variable for database {database_mnemonic}') database_to_variables[database_mnemonic] = DatabaseVariables( variables=variables, lowest_geog_variable=lowest_geog_var) @@ -983,18 +1014,19 @@ def load_dataset_to_variables(self, dataset_mnemonics): unique_combo_fields=['Dataset_Mnemonic', 'Variable_Mnemonic']) ds_to_vars_builder = {} - for ds_variable, _ in dataset_variable_rows: + for ds_variable, row_num in dataset_variable_rows: dataset_mnemonic = ds_variable['Dataset_Mnemonic'] variable_mnemonic = ds_variable['Variable_Mnemonic'] if dataset_mnemonic not in ds_to_vars_builder: ds_to_vars_builder[dataset_mnemonic] = DatasetVarsBuilder( - dataset_mnemonic, self.full_filename(filename), self.classifications) + dataset_mnemonic, self.full_filename(filename), self.classifications, + self.recoverable_error) vars_builder = ds_to_vars_builder[dataset_mnemonic] if self.variables[variable_mnemonic].private['Is_Geographic']: - vars_builder.add_geographic_variable(ds_variable) + vars_builder.add_geographic_variable(ds_variable, row_num) else: - vars_builder.add_non_geographic_variable(ds_variable) + vars_builder.add_non_geographic_variable(ds_variable, row_num) ds_to_variables = {} for dataset_mnemonic, vars_builder in ds_to_vars_builder.items(): diff --git a/bin/ons_csv_to_ctb_json_main.py b/bin/ons_csv_to_ctb_json_main.py index 545825c..48cb27c 100644 --- a/bin/ons_csv_to_ctb_json_main.py +++ b/bin/ons_csv_to_ctb_json_main.py @@ -104,6 +104,11 @@ def main(): f'{DEFAULT_CANTABULAR_VERSION} formatting will be used. ' '(default: %(default)s)') + parser.add_argument('--best-effort', + action='store_true', + help='Discard invalid data instead of failing on the first error and ' + 'make a best effort attempt to produce valid output files.') + args = parser.parse_args() logging.basicConfig(format='t=%(asctime)s lvl=%(levelname)s msg=%(message)s', @@ -124,7 +129,7 @@ def main(): args.build_number) # loader is used to load the metadata from CSV files and convert it to JSON. - loader = Loader(args.input_dir, args.geography_file) + loader = Loader(args.input_dir, args.geography_file, best_effort=args.best_effort) # Build Cantabular variable objects. # A Cantabular variable is equivalent to an ONS classification. @@ -141,6 +146,10 @@ def main(): # Build Cantabular service metadata. service_metadata = build_ctb_service_metadata() + error_count = loader.error_count() + if error_count: + logging.warning(f'{error_count} errors were encountered during processing') + # There is not a separate tables file for v9.2.0. Use the output_file_types list # to determine which file types will be written. 
output_file_types = [FILE_CONTENT_TYPE_DATASET, FILE_CONTENT_TYPE_SERVICE, diff --git a/bin/ons_csv_to_ctb_json_read.py b/bin/ons_csv_to_ctb_json_read.py index d7325e9..945c41e 100644 --- a/bin/ons_csv_to_ctb_json_read.py +++ b/bin/ons_csv_to_ctb_json_read.py @@ -1,5 +1,6 @@ """Load metadata from CSV files and export in JSON format.""" import csv +import logging from collections import namedtuple Column = namedtuple('Column', 'name unique validate_fn required') @@ -21,7 +22,7 @@ def optional(name, unique=False, validate_fn=None): class Reader: """Reader is used to read a CSV file containing metadata.""" - def __init__(self, filename, columns, unique_combo_fields=None): + def __init__(self, filename, columns, recoverable_error, unique_combo_fields=None): """Initialise Reader object.""" self.filename = filename self.columns = columns @@ -30,6 +31,7 @@ def __init__(self, filename, columns, unique_combo_fields=None): self.unique_combo_fields = unique_combo_fields if unique_combo_fields: self.unique_combos = set() + self.recoverable_error = recoverable_error def read(self): """ @@ -61,7 +63,9 @@ def read(self): if not [k for k in row if row[k]]: continue - self.validate_row(row, row_num) + if not self.validate_row(row, row_num): + logging.warning(f'Reading {self.filename}:{row_num} dropping record') + continue for k in row.keys(): if row[k] == '': @@ -73,28 +77,43 @@ def read(self): def validate_row(self, row, row_num): """Validate the fields in a row.""" + keep_row = True for column in self.columns: row[column.name] = row[column.name].strip() if column.required and not row[column.name]: - raise ValueError(f'Reading {self.filename}:{row_num} no value supplied ' - f'for required field {column.name}') + self.recoverable_error(f'Reading {self.filename}:{row_num} no value supplied ' + f'for required field {column.name}') + keep_row = False + continue if column.unique: if row[column.name] in self.unique_column_values[column.name]: - raise ValueError(f'Reading {self.filename}:{row_num} duplicate ' - f'value {row[column.name]} for {column.name}') + self.recoverable_error(f'Reading {self.filename}:{row_num} duplicate ' + f'value {row[column.name]} for {column.name}') + keep_row = False + continue + self.unique_column_values[column.name].add(row[column.name]) if row[column.name] and column.validate_fn and not \ column.validate_fn(row[column.name]): - raise ValueError(f'Reading {self.filename}:{row_num} invalid value ' - f'{row[column.name]} for {column.name}') + self.recoverable_error(f'Reading {self.filename}:{row_num} invalid value ' + f'{row[column.name]} for {column.name}') + if column.required: + keep_row = False + continue + row[column.name] = "" - if self.unique_combo_fields: + if self.unique_combo_fields and keep_row: combo = tuple([row[f] for f in self.unique_combo_fields]) if combo in self.unique_combos: - raise ValueError(f'Reading {self.filename}:{row_num} duplicate ' - f'value combo {"/".join(combo)} for ' - f'{"/".join(self.unique_combo_fields)}') - self.unique_combos.add(combo) + self.recoverable_error(f'Reading {self.filename}:{row_num} duplicate ' + f'value combo {"/".join(combo)} for ' + f'{"/".join(self.unique_combo_fields)}') + + keep_row = False + else: + self.unique_combos.add(combo) + + return keep_row diff --git a/test/test_csv_read.py b/test/test_csv_read.py index 04b46d7..f68553d 100644 --- a/test/test_csv_read.py +++ b/test/test_csv_read.py @@ -16,6 +16,8 @@ def validate_fn(value): return validate_fn +def raise_error(msg): + raise ValueError(msg) class 
TestCSVRead(unittest.TestCase): @unittest.mock.patch('builtins.open', new_callable=mock_open, read_data="""id,name,email,age @@ -29,7 +31,7 @@ def test_read_file(self, m): required('age', validate_fn=isoneof(['40', '50'])), required('id'), ] - data = Reader('file.csv', columns).read() + data = Reader('file.csv', columns, raise_error).read() self.assertEqual(data, [ ({'name': 'bob', 'email': 'bob@bob.com', 'age': '40', 'id': '1'}, 2), @@ -43,7 +45,7 @@ def test_extra_fields(self, m): required('name'), required('id'), ] - data = Reader('file.csv', columns).read() + data = Reader('file.csv', columns, raise_error).read() self.assertEqual(data, [ ({'name': 'bob', 'id': '1'}, 2)]) @@ -58,7 +60,7 @@ def test_missing_fields(self, m): required('id'), ] with self.assertRaisesRegex(ValueError, 'Reading file.csv: missing expected columns: email, id'): - Reader('file.csv', columns).read() + Reader('file.csv', columns, raise_error).read() @unittest.mock.patch('builtins.open', new_callable=mock_open, read_data="""name,email bob,bob@bob.com @@ -70,7 +72,7 @@ def test_too_many_columns(self, m): required('email'), ] with self.assertRaisesRegex(ValueError, 'Reading file.csv: too many fields on row 3'): - Reader('file.csv', columns).read() + Reader('file.csv', columns, raise_error).read() @unittest.mock.patch('builtins.open', new_callable=mock_open, read_data="""name,email bob,bob@bob.com @@ -82,7 +84,7 @@ def test_too_few_columns(self, m): required('email'), ] with self.assertRaisesRegex(ValueError, 'Reading file.csv: too few fields on row 3'): - Reader('file.csv', columns).read() + Reader('file.csv', columns, raise_error).read() @unittest.mock.patch('builtins.open', new_callable=mock_open, read_data="""name bob @@ -94,7 +96,7 @@ def test_invalid_value(self, m): required('name', validate_fn=isoneof(['bob', 'bill'])), ] with self.assertRaisesRegex(ValueError, 'Reading file.csv:4 invalid value ben for name'): - Reader('file.csv', columns).read() + Reader('file.csv', columns, raise_error).read() @unittest.mock.patch('builtins.open', new_callable=mock_open, read_data="""name bob @@ -106,7 +108,7 @@ def test_non_unique_value(self, m): required('name', unique=True), ] with self.assertRaisesRegex(ValueError, 'Reading file.csv:4 duplicate value bob for name'): - Reader('file.csv', columns).read() + Reader('file.csv', columns, raise_error).read() @unittest.mock.patch('builtins.open', new_callable=mock_open, read_data="""name,email bob,bob@bob.com @@ -118,7 +120,7 @@ def test_empty_rows(self, m): required('name'), required('email'), ] - data = Reader('file.csv', columns).read() + data = Reader('file.csv', columns, raise_error).read() self.assertEqual(data, [ ({'name': 'bob', 'email': 'bob@bob.com'}, 2)]) @@ -133,7 +135,7 @@ def test_whitespace(self, m): required('name'), required('id'), ] - data = Reader('file.csv', columns).read() + data = Reader('file.csv', columns, raise_error).read() self.assertEqual(data, [ ({'name': 'bob', 'id': '1'}, 2), @@ -153,7 +155,7 @@ def test_unique_combos(self, m): required('id'), ] with self.assertRaisesRegex(ValueError, 'Reading file.csv:6 duplicate value combo bob/1 for name/id'): - Reader('file.csv', columns, unique_combo_fields=['name', 'id']).read() + Reader('file.csv', columns, raise_error, unique_combo_fields=['name', 'id']).read() if __name__ == '__main__': diff --git a/test/test_dataset_classifications.py b/test/test_dataset_classifications.py index 5fa1d39..9d563da 100644 --- a/test/test_dataset_classifications.py +++ b/test/test_dataset_classifications.py @@ -63,38 +63,38 @@ 
def test_classification_on_geo_var(self): self.run_test( [{'Dataset_Mnemonic': 'DS1', 'Variable_Mnemonic': 'GEO1', 'Id': '1', 'Classification_Mnemonic': 'GEO1'}], - f'^Reading {FILENAME} Classification_Mnemonic must not be specified for geographic variable GEO1 in dataset DS1$') + f'^Reading {FILENAME}:2 Classification_Mnemonic must not be specified for geographic variable GEO1 in dataset DS1$') def test_processing_priority_on_geo_var(self): self.run_test( [{'Dataset_Mnemonic': 'DS1', 'Variable_Mnemonic': 'GEO1', 'Id': '1', 'Processing_Priority': '1'}], - f'^Reading {FILENAME} Processing_Priority must not be specified for geographic variable GEO1 in dataset DS1$') + f'^Reading {FILENAME}:2 Processing_Priority must not be specified for geographic variable GEO1 in dataset DS1$') def test_no_classification_on_non_geo_var(self): self.run_test( [{'Dataset_Mnemonic': 'DS1', 'Variable_Mnemonic': 'VAR1', 'Id': '1', 'Processing_Priority': '1'}], - f'^Reading {FILENAME} Classification must be specified for non-geographic VAR1 in dataset DS1$') + f'^Reading {FILENAME}:2 Classification must be specified for non-geographic VAR1 in dataset DS1$') def test_no_processing_priority_on_non_geo_var(self): self.run_test( [{'Dataset_Mnemonic': 'DS1', 'Variable_Mnemonic': 'VAR1', 'Id': '1', 'Classification_Mnemonic': 'CLASS1'}], - f'^Reading {FILENAME} Processing_Priority not specified for classification CLASS1 in dataset DS1$') + f'^Reading {FILENAME}:2 Processing_Priority not specified for classification CLASS1 in dataset DS1$') def test_lowest_geog_on_non_geo_var(self): self.run_test( [{'Dataset_Mnemonic': 'DS1', 'Variable_Mnemonic': 'VAR1', 'Id': '1', 'Classification_Mnemonic': 'CLASS1', 'Processing_Priority': '1', 'Lowest_Geog_Variable_Flag': 'Y'}], - f'^Reading {FILENAME} Lowest_Geog_Variable_Flag set on non-geographic variable VAR1 for dataset DS1$') + f'^Reading {FILENAME}:2 Lowest_Geog_Variable_Flag set on non-geographic variable VAR1 for dataset DS1$') def test_invalid_classification_on_var(self): self.run_test( [{'Dataset_Mnemonic': 'DS1', 'Variable_Mnemonic': 'VAR1', 'Id': '1', 'Classification_Mnemonic': 'CLASS2', 'Processing_Priority': '1'}], - f'^Reading {FILENAME} Invalid classification CLASS2 specified for variable VAR1 in dataset DS1$') + f'^Reading {FILENAME}:2 Invalid classification CLASS2 specified for variable VAR1 in dataset DS1$') def test_no_lowest_geog_flag(self): self.run_test( @@ -108,7 +108,7 @@ def test_duplicate_lowest_geog_flag(self): 'Id': '1', 'Lowest_Geog_Variable_Flag': 'Y'}, {'Dataset_Mnemonic': 'DS1', 'Variable_Mnemonic': 'GEO2', 'Id': '1', 'Lowest_Geog_Variable_Flag': 'Y'}], - f'^Reading {FILENAME} Lowest_Geog_Variable_Flag set on variable GEO2 and GEO1 for dataset DS1$') + f'^Reading {FILENAME}:3 Lowest_Geog_Variable_Flag set on variable GEO2 and GEO1 for dataset DS1$') if __name__ == '__main__': From 8fef7fe0f1e490c5815fbf5d1f7a72aeee58b8cf Mon Sep 17 00:00:00 2001 From: Peter Hynes Date: Tue, 10 May 2022 17:22:07 +0100 Subject: [PATCH 12/15] Added best-effort test --- .../dataset-metadata-best-effort.json | 436 ++++++++++++++++++ test/expected/table-metadata-best-effort.json | 62 +++ test/test_best_effort.py | 77 ++++ test/testdata/best_effort/Category.csv | 2 + test/testdata/best_effort/Census_Release.csv | 1 + test/testdata/best_effort/Classification.csv | 5 + test/testdata/best_effort/Contact.csv | 1 + test/testdata/best_effort/Database.csv | 2 + .../best_effort/Database_Variable.csv | 4 + test/testdata/best_effort/Dataset.csv | 4 + 
test/testdata/best_effort/Dataset_Keyword.csv | 1 + .../testdata/best_effort/Dataset_Variable.csv | 9 + .../best_effort/Publication_Dataset.csv | 1 + test/testdata/best_effort/Question.csv | 1 + .../testdata/best_effort/Related_Datasets.csv | 1 + test/testdata/best_effort/Release_Dataset.csv | 1 + .../best_effort/Security_Classification.csv | 3 + test/testdata/best_effort/Source.csv | 2 + .../testdata/best_effort/Statistical_Unit.csv | 2 + test/testdata/best_effort/Topic.csv | 1 + .../best_effort/Topic_Classification.csv | 1 + test/testdata/best_effort/Variable.csv | 7 + .../testdata/best_effort/Variable_Keyword.csv | 1 + .../best_effort/Variable_Source_Question.csv | 1 + test/testdata/best_effort/Variable_Type.csv | 3 + 25 files changed, 629 insertions(+) create mode 100644 test/expected/dataset-metadata-best-effort.json create mode 100644 test/expected/table-metadata-best-effort.json create mode 100644 test/test_best_effort.py create mode 100644 test/testdata/best_effort/Category.csv create mode 100644 test/testdata/best_effort/Census_Release.csv create mode 100644 test/testdata/best_effort/Classification.csv create mode 100644 test/testdata/best_effort/Contact.csv create mode 100644 test/testdata/best_effort/Database.csv create mode 100644 test/testdata/best_effort/Database_Variable.csv create mode 100644 test/testdata/best_effort/Dataset.csv create mode 100644 test/testdata/best_effort/Dataset_Keyword.csv create mode 100644 test/testdata/best_effort/Dataset_Variable.csv create mode 100644 test/testdata/best_effort/Publication_Dataset.csv create mode 100644 test/testdata/best_effort/Question.csv create mode 100644 test/testdata/best_effort/Related_Datasets.csv create mode 100644 test/testdata/best_effort/Release_Dataset.csv create mode 100644 test/testdata/best_effort/Security_Classification.csv create mode 100644 test/testdata/best_effort/Source.csv create mode 100644 test/testdata/best_effort/Statistical_Unit.csv create mode 100644 test/testdata/best_effort/Topic.csv create mode 100644 test/testdata/best_effort/Topic_Classification.csv create mode 100644 test/testdata/best_effort/Variable.csv create mode 100644 test/testdata/best_effort/Variable_Keyword.csv create mode 100644 test/testdata/best_effort/Variable_Source_Question.csv create mode 100644 test/testdata/best_effort/Variable_Type.csv diff --git a/test/expected/dataset-metadata-best-effort.json b/test/expected/dataset-metadata-best-effort.json new file mode 100644 index 0000000..b259df0 --- /dev/null +++ b/test/expected/dataset-metadata-best-effort.json @@ -0,0 +1,436 @@ +[ + { + "name": "base", + "label": "Base dataset with metadata for all variables", + "lang": "en", + "description": "This is a base dataset containing metadata for all variables used across all other datasets. 
Other datasets include it to avoid duplicating metadata.", + "meta": { + "Source": { + "Source_Mnemonic": "Census2021", + "Source_Description": "The 2021 England and Wales Census", + "Version": "1" + }, + "Version": "1" + }, + "vars": [ + { + "name": "CLASS1", + "label": "CLASS1 Label Internal", + "description": "VAR1 Description", + "meta": { + "Mnemonic_2011": null, + "Parent_Classification_Mnemonic": null, + "Default_Classification_Flag": null, + "Version": "1", + "ONS_Variable": { + "Variable_Mnemonic": "VAR1", + "Variable_Mnemonic_2011": null, + "Version": "1", + "Quality_Statement_Text": null, + "Quality_Summary_URL": null, + "Variable_Title": "VAR1 Title", + "Comparability_Comments": "VAR1 Comparability Comments", + "Uk_Comparison_Comments": "VAR1 UK Comparison Comments", + "Geographic_Abbreviation": null, + "Geographic_Theme": null, + "Geographic_Coverage": null, + "Variable_Type": { + "Variable_Type_Code": "DVO", + "Variable_Type_Description": "Derived variable" + }, + "Statistical_Unit": null, + "Topic": null, + "Keywords": [], + "Questions": [] + }, + "Topics": [] + }, + "catLabels": null + }, + { + "name": "CLASS3", + "label": "CLASS3 Label Internal", + "description": "VAR3 Description", + "meta": { + "Mnemonic_2011": null, + "Parent_Classification_Mnemonic": null, + "Default_Classification_Flag": null, + "Version": "1", + "ONS_Variable": { + "Variable_Mnemonic": "VAR3", + "Variable_Mnemonic_2011": null, + "Version": "1", + "Quality_Statement_Text": null, + "Quality_Summary_URL": null, + "Variable_Title": "VAR3 Title", + "Comparability_Comments": "VAR3 Comparability Comments", + "Uk_Comparison_Comments": "VAR3 UK Comparison Comments", + "Geographic_Abbreviation": null, + "Geographic_Theme": null, + "Geographic_Coverage": null, + "Variable_Type": { + "Variable_Type_Code": "DVO", + "Variable_Type_Description": "Derived variable" + }, + "Statistical_Unit": null, + "Topic": null, + "Keywords": [], + "Questions": [] + }, + "Topics": [] + }, + "catLabels": null + }, + { + "name": "GEO1", + "label": "GEO1 Title", + "description": "GEO1 Description", + "meta": { + "Mnemonic_2011": null, + "Parent_Classification_Mnemonic": "GEO1", + "Default_Classification_Flag": null, + "Version": "1", + "ONS_Variable": { + "Variable_Mnemonic": "GEO1", + "Variable_Mnemonic_2011": "GEO1 2011", + "Version": "1", + "Quality_Statement_Text": null, + "Quality_Summary_URL": null, + "Variable_Title": "GEO1 Title", + "Comparability_Comments": "GEO1 Comparability Comments", + "Uk_Comparison_Comments": "GEO1 UK Comparison Comments", + "Geographic_Abbreviation": "G1", + "Geographic_Theme": "GEO1 Theme", + "Geographic_Coverage": "GEO1 Coverage", + "Variable_Type": { + "Variable_Type_Code": "GEOG", + "Variable_Type_Description": "Geographic variable" + }, + "Statistical_Unit": null, + "Topic": null, + "Keywords": [], + "Questions": [] + }, + "Topics": [] + }, + "catLabels": null + }, + { + "name": "GEO2", + "label": "GEO2 Title", + "description": "GEO2 Description", + "meta": { + "Mnemonic_2011": null, + "Parent_Classification_Mnemonic": "GEO2", + "Default_Classification_Flag": null, + "Version": "1", + "ONS_Variable": { + "Variable_Mnemonic": "GEO2", + "Variable_Mnemonic_2011": "GEO2 2011", + "Version": "1", + "Quality_Statement_Text": null, + "Quality_Summary_URL": null, + "Variable_Title": "GEO2 Title", + "Comparability_Comments": "GEO2 Comparability Comments", + "Uk_Comparison_Comments": "GEO2 UK Comparison Comments", + "Geographic_Abbreviation": "G1", + "Geographic_Theme": "GEO2 Theme", + 
"Geographic_Coverage": "GEO2 Coverage", + "Variable_Type": { + "Variable_Type_Code": "GEOG", + "Variable_Type_Description": "Geographic variable" + }, + "Statistical_Unit": null, + "Topic": null, + "Keywords": [], + "Questions": [] + }, + "Topics": [] + }, + "catLabels": null + }, + { + "name": "GEO3", + "label": "GEO3 Title", + "description": "GEO3 Description", + "meta": { + "Mnemonic_2011": null, + "Parent_Classification_Mnemonic": "GEO3", + "Default_Classification_Flag": null, + "Version": "1", + "ONS_Variable": { + "Variable_Mnemonic": "GEO3", + "Variable_Mnemonic_2011": "GEO3 2011", + "Version": "1", + "Quality_Statement_Text": null, + "Quality_Summary_URL": null, + "Variable_Title": "GEO3 Title", + "Comparability_Comments": "GEO3 Comparability Comments", + "Uk_Comparison_Comments": "GEO3 UK Comparison Comments", + "Geographic_Abbreviation": "G1", + "Geographic_Theme": "GEO3 Theme", + "Geographic_Coverage": "GEO3 Coverage", + "Variable_Type": { + "Variable_Type_Code": "GEOG", + "Variable_Type_Description": "Geographic variable" + }, + "Statistical_Unit": null, + "Topic": null, + "Keywords": [], + "Questions": [] + }, + "Topics": [] + }, + "catLabels": null + } + ] + }, + { + "name": "base", + "label": "Base dataset with metadata for all variables in Welsh", + "lang": "cy", + "description": "This is the Welsh version of the base dataset containing metadata for all variables.", + "meta": { + "Source": { + "Source_Mnemonic": "Census2021", + "Source_Description": "The 2021 England and Wales Census", + "Version": "1" + }, + "Version": "1" + }, + "vars": [ + { + "name": "CLASS1", + "label": "CLASS1 Label Internal", + "description": "VAR1 Description", + "meta": { + "Mnemonic_2011": null, + "Parent_Classification_Mnemonic": null, + "Default_Classification_Flag": null, + "Version": "1", + "ONS_Variable": { + "Variable_Mnemonic": "VAR1", + "Variable_Mnemonic_2011": null, + "Version": "1", + "Quality_Statement_Text": null, + "Quality_Summary_URL": null, + "Variable_Title": "VAR1 Title", + "Comparability_Comments": "VAR1 Comparability Comments", + "Uk_Comparison_Comments": "VAR1 UK Comparison Comments", + "Geographic_Abbreviation": null, + "Geographic_Theme": null, + "Geographic_Coverage": null, + "Variable_Type": { + "Variable_Type_Code": "DVO", + "Variable_Type_Description": "Derived variable" + }, + "Statistical_Unit": null, + "Topic": null, + "Keywords": [], + "Questions": [] + }, + "Topics": [] + }, + "catLabels": null + }, + { + "name": "CLASS3", + "label": "CLASS3 Label Internal", + "description": "VAR3 Description", + "meta": { + "Mnemonic_2011": null, + "Parent_Classification_Mnemonic": null, + "Default_Classification_Flag": null, + "Version": "1", + "ONS_Variable": { + "Variable_Mnemonic": "VAR3", + "Variable_Mnemonic_2011": null, + "Version": "1", + "Quality_Statement_Text": null, + "Quality_Summary_URL": null, + "Variable_Title": "VAR3 Title", + "Comparability_Comments": "VAR3 Comparability Comments", + "Uk_Comparison_Comments": "VAR3 UK Comparison Comments", + "Geographic_Abbreviation": null, + "Geographic_Theme": null, + "Geographic_Coverage": null, + "Variable_Type": { + "Variable_Type_Code": "DVO", + "Variable_Type_Description": "Derived variable" + }, + "Statistical_Unit": null, + "Topic": null, + "Keywords": [], + "Questions": [] + }, + "Topics": [] + }, + "catLabels": null + }, + { + "name": "GEO1", + "label": "GEO1 Title (Welsh)", + "description": "GEO1 Description (Welsh)", + "meta": { + "Mnemonic_2011": null, + "Parent_Classification_Mnemonic": "GEO1", + 
"Default_Classification_Flag": null, + "Version": "1", + "ONS_Variable": { + "Variable_Mnemonic": "GEO1", + "Variable_Mnemonic_2011": "GEO1 2011", + "Version": "1", + "Quality_Statement_Text": null, + "Quality_Summary_URL": null, + "Variable_Title": "GEO1 Title (Welsh)", + "Comparability_Comments": "GEO1 Comparability Comments (Welsh)", + "Uk_Comparison_Comments": "GEO1 UK Comparison Comments (Welsh)", + "Geographic_Abbreviation": "G1 (Welsh)", + "Geographic_Theme": "GEO1 Theme (Welsh)", + "Geographic_Coverage": "GEO1 Coverage (Welsh)", + "Variable_Type": { + "Variable_Type_Code": "GEOG", + "Variable_Type_Description": "Geographic variable (Welsh)" + }, + "Statistical_Unit": null, + "Topic": null, + "Keywords": [], + "Questions": [] + }, + "Topics": [] + }, + "catLabels": null + }, + { + "name": "GEO2", + "label": "GEO2 Title (Welsh)", + "description": "GEO2 Description (Welsh)", + "meta": { + "Mnemonic_2011": null, + "Parent_Classification_Mnemonic": "GEO2", + "Default_Classification_Flag": null, + "Version": "1", + "ONS_Variable": { + "Variable_Mnemonic": "GEO2", + "Variable_Mnemonic_2011": "GEO2 2011", + "Version": "1", + "Quality_Statement_Text": null, + "Quality_Summary_URL": null, + "Variable_Title": "GEO2 Title (Welsh)", + "Comparability_Comments": "GEO2 Comparability Comments (Welsh)", + "Uk_Comparison_Comments": "GEO2 UK Comparison Comments (Welsh)", + "Geographic_Abbreviation": "G1 (Welsh)", + "Geographic_Theme": "GEO2 Theme (Welsh)", + "Geographic_Coverage": "GEO2 Coverage (Welsh)", + "Variable_Type": { + "Variable_Type_Code": "GEOG", + "Variable_Type_Description": "Geographic variable (Welsh)" + }, + "Statistical_Unit": null, + "Topic": null, + "Keywords": [], + "Questions": [] + }, + "Topics": [] + }, + "catLabels": null + }, + { + "name": "GEO3", + "label": "GEO3 Title (Welsh)", + "description": "GEO3 Description (Welsh)", + "meta": { + "Mnemonic_2011": null, + "Parent_Classification_Mnemonic": "GEO3", + "Default_Classification_Flag": null, + "Version": "1", + "ONS_Variable": { + "Variable_Mnemonic": "GEO3", + "Variable_Mnemonic_2011": "GEO3 2011", + "Version": "1", + "Quality_Statement_Text": null, + "Quality_Summary_URL": null, + "Variable_Title": "GEO3 Title (Welsh)", + "Comparability_Comments": "GEO3 Comparability Comments (Welsh)", + "Uk_Comparison_Comments": "GEO3 UK Comparison Comments (Welsh)", + "Geographic_Abbreviation": "G1 (Welsh)", + "Geographic_Theme": "GEO3 Theme (Welsh)", + "Geographic_Coverage": "GEO3 Coverage (Welsh)", + "Variable_Type": { + "Variable_Type_Code": "GEOG", + "Variable_Type_Description": "Geographic variable (Welsh)" + }, + "Statistical_Unit": null, + "Topic": null, + "Keywords": [], + "Questions": [] + }, + "Topics": [] + }, + "catLabels": null + } + ] + }, + { + "name": "DB1", + "incl": [ + { + "name": "base", + "lang": "en" + } + ], + "label": "DB1 Title", + "description": "DB1 Description", + "lang": "en", + "meta": { + "Cantabular_DB_Flag": null, + "Version": "1", + "Source": { + "Source_Mnemonic": "SRC1", + "Copyright_Statement": null, + "Licence": null, + "Nationals_Statistic_Certified": null, + "Methodology_Link": null, + "SDC_Link": null, + "Version": "1", + "Source_Description": "SRC1 Description", + "Methodology_Statement": null, + "SDC_Statement": null, + "Contact": null + }, + "Lowest_Geog_Variable": "GEO1" + }, + "vars": null + }, + { + "name": "DB1", + "incl": [ + { + "name": "base", + "lang": "cy" + } + ], + "label": "DB1 Title", + "description": "DB1 Description", + "lang": "cy", + "meta": { + "Cantabular_DB_Flag": null, + 
"Version": "1", + "Source": { + "Source_Mnemonic": "SRC1", + "Copyright_Statement": null, + "Licence": null, + "Nationals_Statistic_Certified": null, + "Methodology_Link": null, + "SDC_Link": null, + "Version": "1", + "Source_Description": "SRC1 Description", + "Methodology_Statement": null, + "SDC_Statement": null, + "Contact": null + }, + "Lowest_Geog_Variable": "GEO1" + }, + "vars": null + } +] diff --git a/test/expected/table-metadata-best-effort.json b/test/expected/table-metadata-best-effort.json new file mode 100644 index 0000000..d304b86 --- /dev/null +++ b/test/expected/table-metadata-best-effort.json @@ -0,0 +1,62 @@ +[ + { + "name": "DS1", + "datasetName": "DB1", + "vars": [ + "GEO1", + "CLASS1" + ], + "ref": [ + { + "lang": "en", + "label": "DS1 Title", + "description": "DS1 Description", + "meta": { + "Dataset_Mnemonic_2011": null, + "Last_Updated": null, + "Unique_Url": null, + "Version": "1", + "Geographic_Coverage": "Everywhere", + "Dataset_Population": "Everyone", + "Statistical_Unit": { + "Statistical_Unit": "Houses", + "Statistical_Unit_Description": "House Description" + }, + "Contact": null, + "Keywords": [], + "Related_Datasets": [], + "Census_Releases": [], + "Publications": [], + "Alternate_Geographic_Variables": [ + "GEO3" + ] + } + }, + { + "lang": "cy", + "label": "DS1 Title", + "description": "DS1 Description", + "meta": { + "Dataset_Mnemonic_2011": null, + "Last_Updated": null, + "Unique_Url": null, + "Version": "1", + "Geographic_Coverage": "Everywhere", + "Dataset_Population": "Everyone", + "Statistical_Unit": { + "Statistical_Unit": "Houses", + "Statistical_Unit_Description": "House Description" + }, + "Contact": null, + "Keywords": [], + "Related_Datasets": [], + "Census_Releases": [], + "Publications": [], + "Alternate_Geographic_Variables": [ + "GEO3" + ] + } + } + ] + } +] diff --git a/test/test_best_effort.py b/test/test_best_effort.py new file mode 100644 index 0000000..82c280a --- /dev/null +++ b/test/test_best_effort.py @@ -0,0 +1,77 @@ +import json +import unittest.mock +import unittest +import pathlib +import os +import logging +from io import StringIO +from datetime import date +import ons_csv_to_ctb_json_main + +FILENAME_TABLES = 'cantabm_v9-3-0_best-effort_tables-md_19700101-1.json' +FILENAME_DATASET = 'cantabm_v9-3-0_best-effort_dataset-md_19700101-1.json' +FILENAME_SERVICE = 'cantabm_v9-3-0_best-effort_service-md_19700101-1.json' + +class TestBestEffort(unittest.TestCase): + @unittest.mock.patch('ons_csv_to_ctb_json_main.date') + def test_generated_json_best_effort(self, mock_date): + """Generate JSON from source CSV and compare it with expected values.""" + mock_date.today.return_value = date(1970, 1, 1) + mock_date.side_effect = lambda *args, **kw: date(*args, **kw) + + file_dir = pathlib.Path(__file__).parent.resolve() + input_dir = os.path.join(file_dir, 'testdata/best_effort') + output_dir = os.path.join(file_dir, 'out') + + with self.assertLogs(level='WARNING') as cm: + with unittest.mock.patch('sys.argv', ['test', '-i', input_dir, '-o', output_dir, '-m', 'best-effort', '--best-effort']): + ons_csv_to_ctb_json_main.main() + with open(os.path.join(output_dir, FILENAME_SERVICE)) as f: + service_metadata = json.load(f) + with open(os.path.join(file_dir, 'expected/service-metadata.json')) as f: + expected_service_metadata = json.load(f) + self.assertEqual(service_metadata, expected_service_metadata) + + with open(os.path.join(output_dir, FILENAME_DATASET)) as f: + dataset_metadata = json.load(f) + with open(os.path.join(file_dir, 
'expected/dataset-metadata-best-effort.json')) as f: + expected_dataset_metadata = json.load(f) + self.assertEqual(dataset_metadata, expected_dataset_metadata) + + with open(os.path.join(output_dir, FILENAME_TABLES)) as f: + table_metadata = json.load(f) + with open(os.path.join(file_dir, 'expected/table-metadata-best-effort.json')) as f: + expected_table_metadata = json.load(f) + self.assertEqual(table_metadata, expected_table_metadata) + + warnings = [ + r'Classification.csv:3 no value supplied for required field Variable_Mnemonic', + r'Classification.csv:3 dropping record', + r'Classification.csv:4 duplicate value CLASS1 for Classification_Mnemonic', + r'Classification.csv:4 dropping record', + r'Classification.csv:5 invalid value x for Number_Of_Category_Items', + r'Category.csv Unexpected number of categories for CLASS1: expected 4 but found 1', + r'Database_Variable.csv Lowest_Geog_Variable_Flag set on GEO3 and GEO1 for database DB1', + r'Dataset_Variable.csv:4 duplicate value combo DS1/VAR1 for Dataset_Mnemonic/Variable_Mnemonic', + r'Dataset_Variable.csv:4 dropping record', + r'Dataset_Variable.csv:2 Lowest_Geog_Variable_Flag set on non-geographic variable VAR1 for dataset DS1', + r'Dataset_Variable.csv:2 Processing_Priority not specified for classification CLASS1 in dataset DS1', + r'Dataset_Variable.csv:3 Classification_Mnemonic must not be specified for geographic variable GEO1 in dataset DS1', + r'Dataset_Variable.csv:3 Processing_Priority must not be specified for geographic variable GEO1 in dataset DS1', + r'Dataset_Variable.csv:5 Lowest_Geog_Variable_Flag set on variable GEO2 and GEO1 for dataset DS1', + r'Dataset_Variable.csv:7 Classification must be specified for non-geographic VAR2 in dataset DS1', + r'Dataset_Variable.csv:7 dropping record', + r'Dataset_Variable.csv:8 Invalid classification CLASS1 specified for variable VAR3 in dataset DS1', + r'Dataset_Variable.csv:8 dropping record', + r'Dataset_Variable.csv Invalid processing_priorities \[0\] for dataset DS1', + r'Dataset.csv:3 DS2 has classification CLASS3 that is not in database DB1', + r'Dataset.csv:3 dropping record', + r'Dataset.csv:4 DS3 has no associated classifications or geographic variable', + r'Dataset.csv:4 dropping record', + r'16 errors were encountered during processing', + ] + + self.assertEqual(len(warnings), len(cm.output)) + for i, warning in enumerate(cm.output): + self.assertRegex(warning, warnings[i]) + diff --git a/test/testdata/best_effort/Category.csv b/test/testdata/best_effort/Category.csv new file mode 100644 index 0000000..6bf67d4 --- /dev/null +++ b/test/testdata/best_effort/Category.csv @@ -0,0 +1,2 @@ +Variable_Mnemonic,Classification_Mnemonic,Id,Category_Code,External_Category_Label_English,External_Category_Label_Welsh,Sort_Order,Version,Internal_Category_Label_English +SOURCE,CLASS1,1,CODE1,LABEL1,,,1,LABEL1 Internal diff --git a/test/testdata/best_effort/Census_Release.csv b/test/testdata/best_effort/Census_Release.csv new file mode 100644 index 0000000..532c8cd --- /dev/null +++ b/test/testdata/best_effort/Census_Release.csv @@ -0,0 +1 @@ +Census_Release_Number,Id,Census_Release_Description,Release_Date diff --git a/test/testdata/best_effort/Classification.csv b/test/testdata/best_effort/Classification.csv new file mode 100644 index 0000000..8d20083 --- /dev/null +++ b/test/testdata/best_effort/Classification.csv @@ -0,0 +1,5 @@ 
+Classification_Mnemonic,Variable_Mnemonic,Id,External_Classification_Label_English,External_Classification_Label_Welsh,Number_Of_Category_Items,Mnemonic_2011,Flat_Classification_Flag,Parent_Classification_Mnemonic,Security_Mnemonic,Signed_Off_Flag,Default_Classification_Flag,Version,Internal_Classification_Label_English +CLASS1,VAR1,1,,,4,,,,PUB,N,,1,CLASS1 Label Internal +CLASS2,,2,,,,,,,PUB,N,,1,CLASS2 Label Internal +CLASS1,VAR1,3,,,,,,,PUB,N,,1,CLASS1 Label Internal (Alternative) +CLASS3,VAR3,4,,,x,,,,PUB,N,,1,CLASS3 Label Internal diff --git a/test/testdata/best_effort/Contact.csv b/test/testdata/best_effort/Contact.csv new file mode 100644 index 0000000..9c33c77 --- /dev/null +++ b/test/testdata/best_effort/Contact.csv @@ -0,0 +1 @@ +Contact_Id,Contact_Name,Contact_Email,Contact_Phone,Contact_Website diff --git a/test/testdata/best_effort/Database.csv b/test/testdata/best_effort/Database.csv new file mode 100644 index 0000000..b1b01a7 --- /dev/null +++ b/test/testdata/best_effort/Database.csv @@ -0,0 +1,2 @@ +Database_Mnemonic,Id,Database_Title,Database_Title_Welsh,Database_Description,Database_Description_Welsh,Cantabular_DB_Flag,IAR_Asset_Id,Source_Mnemonic,Version +DB1,1,DB1 Title,,DB1 Description,,,,SRC1,1 diff --git a/test/testdata/best_effort/Database_Variable.csv b/test/testdata/best_effort/Database_Variable.csv new file mode 100644 index 0000000..9b0bb5e --- /dev/null +++ b/test/testdata/best_effort/Database_Variable.csv @@ -0,0 +1,4 @@ +Id,Database_Mnemonic,Variable_Mnemonic,Version,Lowest_Geog_Variable_Flag +1,DB1,VAR1,1, +2,DB1,GEO1,1,Y +3,DB1,GEO3,1,Y diff --git a/test/testdata/best_effort/Dataset.csv b/test/testdata/best_effort/Dataset.csv new file mode 100644 index 0000000..9e9cd1c --- /dev/null +++ b/test/testdata/best_effort/Dataset.csv @@ -0,0 +1,4 @@ +Dataset_Mnemonic,Id,Dataset_Title,Dataset_Title_Welsh,Dataset_Description,Dataset_Description_Welsh,Statistical_Unit,Dataset_Mnemonic_2011,Geographic_Coverage,Geographic_Coverage_Welsh,Dataset_Population,Dataset_Population_Welsh,Last_Updated,Unique_Url,Security_Mnemonic,Signed_Off_Flag,Database_Mnemonic,Contact_Id,Version +DS1,1,DS1 Title,,DS1 Description,,Houses,,Everywhere,,Everyone,,,,PUB,N,DB1,,1 +DS2,2,DS2 Title,,DS2 Description,,Houses,,Everywhere,,Everyone,,,,PUB,N,DB1,,1 +DS3,3,DS3 Title,,DS3 Description,,Houses,,Everywhere,,Everyone,,,,PUB,N,DB1,,1 diff --git a/test/testdata/best_effort/Dataset_Keyword.csv b/test/testdata/best_effort/Dataset_Keyword.csv new file mode 100644 index 0000000..4204157 --- /dev/null +++ b/test/testdata/best_effort/Dataset_Keyword.csv @@ -0,0 +1 @@ +Dataset_Mnemonic,Id,Dataset_Keyword,Dataset_Keyword_Welsh diff --git a/test/testdata/best_effort/Dataset_Variable.csv b/test/testdata/best_effort/Dataset_Variable.csv new file mode 100644 index 0000000..3aaf62a --- /dev/null +++ b/test/testdata/best_effort/Dataset_Variable.csv @@ -0,0 +1,9 @@ +Classification_Mnemonic,Dataset_Mnemonic,Id,Processing_Priority,Variable_Mnemonic,Lowest_Geog_Variable_Flag +CLASS1,DS1,1,,VAR1,Y +GEO1,DS1,2,1,GEO1,Y +CLASS1,DS1,3,1,VAR1,N +,DS1,4,,GEO2,Y +,DS1,5,,GEO3,N +,DS1,6,1,VAR2,N +CLASS1,DS1,7,1,VAR3,N +CLASS3,DS2,8,1,VAR3,N diff --git a/test/testdata/best_effort/Publication_Dataset.csv b/test/testdata/best_effort/Publication_Dataset.csv new file mode 100644 index 0000000..57af1aa --- /dev/null +++ b/test/testdata/best_effort/Publication_Dataset.csv @@ -0,0 +1 @@ +Publication_Mnemonic,Dataset_Mnemonic,Id,Publication_Title,Publisher_Name,Publisher_Website diff --git a/test/testdata/best_effort/Question.csv 
b/test/testdata/best_effort/Question.csv new file mode 100644 index 0000000..31aa1cb --- /dev/null +++ b/test/testdata/best_effort/Question.csv @@ -0,0 +1 @@ +Question_Code,Id,Question_Label,Question_Label_Welsh,Reason_For_Asking_Question,Reason_For_Asking_Question_Welsh,Question_First_Asked_In_Year,Version diff --git a/test/testdata/best_effort/Related_Datasets.csv b/test/testdata/best_effort/Related_Datasets.csv new file mode 100644 index 0000000..959f4d8 --- /dev/null +++ b/test/testdata/best_effort/Related_Datasets.csv @@ -0,0 +1 @@ +Dataset_Mnemonic,Id,Related_Dataset_Mnemonic diff --git a/test/testdata/best_effort/Release_Dataset.csv b/test/testdata/best_effort/Release_Dataset.csv new file mode 100644 index 0000000..37adceb --- /dev/null +++ b/test/testdata/best_effort/Release_Dataset.csv @@ -0,0 +1 @@ +Census_Release_Number,Dataset_Mnemonic,Id diff --git a/test/testdata/best_effort/Security_Classification.csv b/test/testdata/best_effort/Security_Classification.csv new file mode 100644 index 0000000..3fa7783 --- /dev/null +++ b/test/testdata/best_effort/Security_Classification.csv @@ -0,0 +1,3 @@ +Security_Mnemonic,Id,Security_Description,Security_Description_Welsh +PUB,1,Public,Public (Welsh) +CLASS,2,Classified, diff --git a/test/testdata/best_effort/Source.csv b/test/testdata/best_effort/Source.csv new file mode 100644 index 0000000..7455d90 --- /dev/null +++ b/test/testdata/best_effort/Source.csv @@ -0,0 +1,2 @@ +Source_Mnemonic,Id,Source_Description,Source_Description_Welsh,Copyright_Statement,Licence,Nationals_Statistic_Certified,Methodology_Link,Methodology_Statement,Methodology_Statement_Welsh,SDC_Link,SDC_Statement,SDC_Statement_Welsh,Contact_Id,Version +SRC1,1,SRC1 Description,,,,,,,,,,,,1 diff --git a/test/testdata/best_effort/Statistical_Unit.csv b/test/testdata/best_effort/Statistical_Unit.csv new file mode 100644 index 0000000..2e3106b --- /dev/null +++ b/test/testdata/best_effort/Statistical_Unit.csv @@ -0,0 +1,2 @@ +Statistical_Unit,Id,Statistical_Unit_Description,Statistical_Unit_Description_Welsh +Houses,1,House Description, diff --git a/test/testdata/best_effort/Topic.csv b/test/testdata/best_effort/Topic.csv new file mode 100644 index 0000000..00b0e08 --- /dev/null +++ b/test/testdata/best_effort/Topic.csv @@ -0,0 +1 @@ +Topic_Mnemonic,Id,Topic_Description,Topic_Description_Welsh,Topic_Title,Topic_Title_Welsh diff --git a/test/testdata/best_effort/Topic_Classification.csv b/test/testdata/best_effort/Topic_Classification.csv new file mode 100644 index 0000000..0bb94c9 --- /dev/null +++ b/test/testdata/best_effort/Topic_Classification.csv @@ -0,0 +1 @@ +Classification_Mnemonic,Topic_Mnemonic,Id diff --git a/test/testdata/best_effort/Variable.csv b/test/testdata/best_effort/Variable.csv new file mode 100644 index 0000000..e386025 --- /dev/null +++ b/test/testdata/best_effort/Variable.csv @@ -0,0 +1,7 @@ +Variable_Mnemonic,Id,Variable_Title,Variable_Title_Welsh,Variable_Description,Variable_Description_Welsh,Variable_Type_Code,Statistical_Unit,Topic_Mnemonic,Variable_Mnemonic_2011,Comparability_Comments,Comparability_Comments_Welsh,Uk_Comparison_Comments,Uk_Comparison_Comments_Welsh,Security_Mnemonic,Signed_Off_Flag,Number_Of_Classifications,Geographic_Abbreviation,Geographic_Abbreviation_Welsh,Geographic_Theme,Geographic_Theme_Welsh,Geographic_Coverage,Geographic_Coverage_Welsh,Version,Quality_Statement_Text,Quality_Summary_URL +VAR1,1,VAR1 Title,,VAR1 Description,,DVO,,,,VAR1 Comparability Comments,,VAR1 UK Comparison Comments,,PUB,N,,,,,,,,1,, +GEO1,2,GEO1 
Title,GEO1 Title (Welsh),GEO1 Description,GEO1 Description (Welsh),GEOG,,,GEO1 2011,GEO1 Comparability Comments,GEO1 Comparability Comments (Welsh),GEO1 UK Comparison Comments,GEO1 UK Comparison Comments (Welsh),PUB,Y,3,G1,G1 (Welsh),GEO1 Theme,GEO1 Theme (Welsh),GEO1 Coverage,GEO1 Coverage (Welsh),1,, +GEO2,3,GEO2 Title,GEO2 Title (Welsh),GEO2 Description,GEO2 Description (Welsh),GEOG,,,GEO2 2011,GEO2 Comparability Comments,GEO2 Comparability Comments (Welsh),GEO2 UK Comparison Comments,GEO2 UK Comparison Comments (Welsh),PUB,Y,3,G1,G1 (Welsh),GEO2 Theme,GEO2 Theme (Welsh),GEO2 Coverage,GEO2 Coverage (Welsh),1,, +GEO3,4,GEO3 Title,GEO3 Title (Welsh),GEO3 Description,GEO3 Description (Welsh),GEOG,,,GEO3 2011,GEO3 Comparability Comments,GEO3 Comparability Comments (Welsh),GEO3 UK Comparison Comments,GEO3 UK Comparison Comments (Welsh),PUB,Y,3,G1,G1 (Welsh),GEO3 Theme,GEO3 Theme (Welsh),GEO3 Coverage,GEO3 Coverage (Welsh),1,, +VAR2,5,VAR2 Title,,VAR2 Description,,DVO,,,,VAR2 Comparability Comments,,VAR2 UK Comparison Comments,,PUB,N,,,,,,,,1,, +VAR3,6,VAR3 Title,,VAR3 Description,,DVO,,,,VAR3 Comparability Comments,,VAR3 UK Comparison Comments,,PUB,N,,,,,,,,1,, diff --git a/test/testdata/best_effort/Variable_Keyword.csv b/test/testdata/best_effort/Variable_Keyword.csv new file mode 100644 index 0000000..7e9e35f --- /dev/null +++ b/test/testdata/best_effort/Variable_Keyword.csv @@ -0,0 +1 @@ +Variable_Mnemonic,Id,Variable_Keyword,Variable_Keyword_Welsh diff --git a/test/testdata/best_effort/Variable_Source_Question.csv b/test/testdata/best_effort/Variable_Source_Question.csv new file mode 100644 index 0000000..993c687 --- /dev/null +++ b/test/testdata/best_effort/Variable_Source_Question.csv @@ -0,0 +1 @@ +Variable_Mnemonic,Source_Question_Code,Id diff --git a/test/testdata/best_effort/Variable_Type.csv b/test/testdata/best_effort/Variable_Type.csv new file mode 100644 index 0000000..9794587 --- /dev/null +++ b/test/testdata/best_effort/Variable_Type.csv @@ -0,0 +1,3 @@ +Variable_Type_Code,Id,Variable_Type_Description,Variable_Type_Description_Welsh +GEOG,1,Geographic variable,Geographic variable (Welsh) +DVO,1,Derived variable, From 9e2214873bb1d4ddb3732ef3fd4ee0ce6924737f Mon Sep 17 00:00:00 2001 From: Peter Hynes Date: Tue, 10 May 2022 21:29:26 +0100 Subject: [PATCH 13/15] Updated README.md and removed fixup.py script --- README.md | 60 ++++++++++++--- bin/fixup.py | 176 -------------------------------------------- modified/.gitignore | 1 - 3 files changed, 48 insertions(+), 189 deletions(-) delete mode 100644 bin/fixup.py delete mode 100644 modified/.gitignore diff --git a/README.md b/README.md index a30116c..3fd2b8e 100644 --- a/README.md +++ b/README.md @@ -131,23 +131,59 @@ t=2022-05-09 21:27:57,637 lvl=INFO msg=Written table metadata file to: ctb_metad t=2022-05-09 21:27:57,638 lvl=INFO msg=Written service metadata file to: ctb_metadata_files/t_cantabm_v9-3-0_test_service-md_20220509-42.json ``` -Using externally sourced files ------------------------------- +Using data with errors +---------------------- -To convert the externally sourced metadata CSV files currently being used for testing, first fixup the source files: +`ons_csv_to_ctb_json_main.py` fails on the first error. This is intentional as the data must be +correct for use in production. For debug purpose a `--best-effort` flag can be used to continue +processing when errors are found and to make a **best effort** to generate output. 
Typically this +will result in some data loss as some records will be dropped and some fields will be ignored. + +This repository contains some test data that is full of errors. It can be used to demonstrate the usage +of the `--best-effort` flag as shown below: ``` -python3 bin/fixup.py -i --g -o modified/ +> python3 bin/ons_csv_to_ctb_json_main.py -i test/testdata/best_effort -o ctb_metadata_files/ -m best-effort --best-effort +t=2022-05-10 21:23:34,762 lvl=INFO msg=ons_csv_to_ctb_json_main.py version 1.1.alpha +t=2022-05-10 21:23:34,762 lvl=INFO msg=CSV source directory: test/testdata/best_effort +t=2022-05-10 21:23:34,763 lvl=WARNING msg=Reading test/testdata/best_effort/Classification.csv:3 no value supplied for required field Variable_Mnemonic +t=2022-05-10 21:23:34,763 lvl=WARNING msg=Reading test/testdata/best_effort/Classification.csv:3 dropping record +t=2022-05-10 21:23:34,763 lvl=WARNING msg=Reading test/testdata/best_effort/Classification.csv:4 duplicate value CLASS1 for Classification_Mnemonic +t=2022-05-10 21:23:34,764 lvl=WARNING msg=Reading test/testdata/best_effort/Classification.csv:4 dropping record +t=2022-05-10 21:23:34,764 lvl=WARNING msg=Reading test/testdata/best_effort/Classification.csv:5 invalid value x for Number_Of_Category_Items +t=2022-05-10 21:23:34,764 lvl=WARNING msg=Reading test/testdata/best_effort/Category.csv Unexpected number of categories for CLASS1: expected 4 but found 1 +t=2022-05-10 21:23:34,764 lvl=INFO msg=No geography file specified +t=2022-05-10 21:23:34,764 lvl=INFO msg=Loaded metadata for 5 Cantabular variables +t=2022-05-10 21:23:34,764 lvl=WARNING msg=Reading test/testdata/best_effort/Database_Variable.csv Lowest_Geog_Variable_Flag set on GEO3 and GEO1 for database DB1 +t=2022-05-10 21:23:34,764 lvl=INFO msg=Loaded metadata for 1 Cantabular datasets +t=2022-05-10 21:23:34,765 lvl=WARNING msg=Reading test/testdata/best_effort/Dataset_Variable.csv:4 duplicate value combo DS1/VAR1 for Dataset_Mnemonic/Variable_Mnemonic +t=2022-05-10 21:23:34,765 lvl=WARNING msg=Reading test/testdata/best_effort/Dataset_Variable.csv:4 dropping record +t=2022-05-10 21:23:34,765 lvl=WARNING msg=Reading test/testdata/best_effort/Dataset_Variable.csv:2 Lowest_Geog_Variable_Flag set on non-geographic variable VAR1 for dataset DS1 +t=2022-05-10 21:23:34,765 lvl=WARNING msg=Reading test/testdata/best_effort/Dataset_Variable.csv:2 Processing_Priority not specified for classification CLASS1 in dataset DS1 +t=2022-05-10 21:23:34,765 lvl=WARNING msg=Reading test/testdata/best_effort/Dataset_Variable.csv:3 Classification_Mnemonic must not be specified for geographic variable GEO1 in dataset DS1 +t=2022-05-10 21:23:34,765 lvl=WARNING msg=Reading test/testdata/best_effort/Dataset_Variable.csv:3 Processing_Priority must not be specified for geographic variable GEO1 in dataset DS1 +t=2022-05-10 21:23:34,765 lvl=WARNING msg=Reading test/testdata/best_effort/Dataset_Variable.csv:5 Lowest_Geog_Variable_Flag set on variable GEO2 and GEO1 for dataset DS1 +t=2022-05-10 21:23:34,765 lvl=WARNING msg=Reading test/testdata/best_effort/Dataset_Variable.csv:7 Classification must be specified for non-geographic VAR2 in dataset DS1 +t=2022-05-10 21:23:34,765 lvl=WARNING msg=Reading test/testdata/best_effort/Dataset_Variable.csv:7 dropping record +t=2022-05-10 21:23:34,765 lvl=WARNING msg=Reading test/testdata/best_effort/Dataset_Variable.csv:8 Invalid classification CLASS1 specified for variable VAR3 in dataset DS1 +t=2022-05-10 21:23:34,765 lvl=WARNING msg=Reading 
test/testdata/best_effort/Dataset_Variable.csv:8 dropping record
+t=2022-05-10 21:23:34,765 lvl=WARNING msg=Reading test/testdata/best_effort/Dataset_Variable.csv Invalid processing_priorities [0] for dataset DS1
+t=2022-05-10 21:23:34,765 lvl=WARNING msg=Reading test/testdata/best_effort/Dataset.csv:3 DS2 has classification CLASS3 that is not in database DB1
+t=2022-05-10 21:23:34,765 lvl=WARNING msg=Reading test/testdata/best_effort/Dataset.csv:3 dropping record
+t=2022-05-10 21:23:34,765 lvl=WARNING msg=Reading test/testdata/best_effort/Dataset.csv:4 DS3 has no associated classifications or geographic variable
+t=2022-05-10 21:23:34,765 lvl=WARNING msg=Reading test/testdata/best_effort/Dataset.csv:4 dropping record
+t=2022-05-10 21:23:34,765 lvl=INFO msg=Loaded metadata for 1 Cantabular tables
+t=2022-05-10 21:23:34,765 lvl=INFO msg=Loaded service metadata
+t=2022-05-10 21:23:34,765 lvl=WARNING msg=16 errors were encountered during processing
+t=2022-05-10 21:23:34,765 lvl=INFO msg=Output files will be written in Cantabular 9.3.0 format
+t=2022-05-10 21:23:34,766 lvl=INFO msg=Written dataset metadata file to: ctb_metadata_files/cantabm_v9-3-0_best-effort_dataset-md_20220510-1.json
+t=2022-05-10 21:23:34,766 lvl=INFO msg=Written table metadata file to: ctb_metadata_files/cantabm_v9-3-0_best-effort_tables-md_20220510-1.json
+t=2022-05-10 21:23:34,766 lvl=INFO msg=Written service metadata file to: ctb_metadata_files/cantabm_v9-3-0_best-effort_service-md_20220510-1.json
```
-The geography file will be placed in the `modified/` directory and will have the same base name as the original file.
-
-This will load the sample files and modify them slightly so that they can be processed by `ons_csv_to_ctb_json_main.py`.
-This step will not be needed for production.
Many lines contain strings such as `test/testdata/best_effort/Dataset.csv:4`; this means that an error has been detected
on row 4 of the `Dataset.csv` file. Row 1 is the header.
-Then convert the files to JSON:
-```
-python3 bin/ons_csv_to_ctb_json_main.py -i modified/ -g modified/ -o ctb_metadata_files/
-```
+The `--best-effort` flag is for debug purposes only.
Using 2011 census teaching file metadata
----------------------------------------
diff --git a/bin/fixup.py b/bin/fixup.py
deleted file mode 100644
index d73fbef..0000000
--- a/bin/fixup.py
+++ /dev/null
@@ -1,176 +0,0 @@
-"""
-Fixup metadata source CSV files.
-
-This is a program for use in development. It modifies source files so that they can successfully
-be loaded on ons_csv_to_ctb_main.py.
- -""" - -import glob -import os -import logging -import csv -from argparse import ArgumentParser - -VERSION = '1.1.alpha' - - -def main(): - """Fixup metadata source CSV files.""" - parser = ArgumentParser(description='Program for fixing up metadata source CSV files.', - epilog=f'Version: {VERSION}') - - parser.add_argument('-i', '--input-dir', - type=str, - required=True, - help='Input directory containing CSV files') - - parser.add_argument('-o', '--output-dir', - type=str, - required=True, - help='Output directory to write fixed-up files') - - parser.add_argument('-g', '--geography-file', - type=str, - required=False, - help='Name of geography CSV file') - - args = parser.parse_args() - - logging.basicConfig(format='t=%(asctime)s lvl=%(levelname)s msg=%(message)s', level='INFO') - - for directory in (args.input_dir, args.output_dir): - if not os.path.isdir(directory): - raise ValueError(f'{directory} does not exist or is not a directory') - - for filename in glob.glob(os.path.join(args.input_dir, '*.csv')): - if args.geography_file and \ - os.path.abspath(filename) == os.path.abspath(args.geography_file): - continue - - basename = os.path.basename(filename) - out_filename = os.path.join(args.output_dir, basename) - with open(out_filename, 'w') as outfile: - # Main program expects input files in UTF-8 format. - with open(filename, newline='', encoding='iso-8859-1') as infile: - reader = csv.DictReader(infile) - fieldnames = reader.fieldnames.copy() - if basename == 'Category.csv': - fieldnames.remove('variable_mnemonic') - fieldnames.append('Variable_Mnemonic') - - writer = csv.DictWriter(outfile, fieldnames) - writer.writeheader() - for line in reader: - if basename == 'Category.csv': - line['Variable_Mnemonic'] = line.pop('variable_mnemonic') - - if basename == 'Variable.csv': - line['Security_Mnemonic'] = 'PUB' - - if not line['Variable_Type_Code']: - line['Variable_Type_Code'] = 'DVO' - - elif basename == 'Classification.csv': - if not line['Number_Of_Category_Items']: - line['Number_Of_Category_Items'] = '0' - - if line['Classification_Mnemonic'] == 'hh_away_student_9a': - line['Number_Of_Category_Items'] = '8' - - if line['Classification_Mnemonic'] == 'hh_families_count_7a': - line['Number_Of_Category_Items'] = '8' - - if line['Classification_Mnemonic'] == 'legal_partnership_status_12a': - line['Number_Of_Category_Items'] = '12' - - if line['Classification_Mnemonic'] == 'moving_group_size_10000a': - line['Number_Of_Category_Items'] = '9716' - - if '_pop' in line['Variable_Mnemonic']: - continue - - if line['Classification_Mnemonic'] == 'hh_multi_ethnic_combination_23B': - line['Classification_Mnemonic'] = 'hh_multi_ethnic_combination_23b' - - elif basename == 'Topic_Classification.csv': - if line['Classification_Mnemonic'] == 'hh_multi_ethnic_combination_23B': - line['Classification_Mnemonic'] = 'hh_multi_ethnic_combination_23b' - - if line['Classification_Mnemonic'] == 'distance_to_work': - line['Classification_Mnemonic'] = 'distance_to_work_12002a' - - if line['Classification_Mnemonic'] == 'moving_group_number': - line['Classification_Mnemonic'] = 'moving_group_number_10000a' - - if line['Classification_Mnemonic'] == 'moving_group_size': - line['Classification_Mnemonic'] = 'moving_group_size_10000a' - - if line['Classification_Mnemonic'] in [ - 'dwelling_number', 'economic_activity_status_14a', - 'economic_activity_status_13a', 'economic_activity_status_12b', - 'economic_activity_status_11a', 'economic_activity_status_11b', - 'economic_activity_status_10b', 
'economic_activity_status_9a', - 'economic_activity_status_7a', 'economic_activity_status_6a', - 'economic_activity_status_6b', 'economic_activity_status_5b', - 'economic_activity_status_4a', 'economic_activity_status_4b', - 'ethnic_group', 'travel_destination_wz']: - continue - - elif basename == 'Category.csv': - if line['Classification_Mnemonic'] == 'armed_forces_dependent_ind_5a': - continue - - if line['Classification_Mnemonic'] == 'moving_group_size': - line['Classification_Mnemonic'] = 'moving_group_size_10000a' - - if '_pop' in line['Classification_Mnemonic']: - continue - - elif basename == 'Dataset.csv': - line['Security_Mnemonic'] = 'PUB' - - elif basename == 'Release_Dataset.csv': - line['Census_Release_Number'] = '1' - - elif basename == 'Dataset_Variable.csv': - if line['Classification_Mnemonic'] == 'sex': - line['Classification_Mnemonic'] = 'sex_2a' - - elif basename == 'Database_Variable.csv': - if line['Variable_Mnemonic'] in ['dwelling_number', 'ethnic_group', - 'travel_destination_wz']: - continue - - writer.writerow(line) - - if basename == 'Topic.csv': - writer.writerow({ - 'Id': 13, - 'Topic_Mnemonic': 'HDS', - 'Topic_Description': 'HDS', - 'Topic_Description_Welsh': '', - 'Topic_Title': 'HDS', - 'Topic_Title_Welsh': ''}) - - logging.info(f'Read file from: {filename} and wrote modified file to: {out_filename}') - - if args.geography_file: - basename = os.path.basename(args.geography_file) - out_filename = os.path.join(args.output_dir, basename) - with open(out_filename, 'w') as outfile: - with open(args.geography_file, newline='') as infile: - for line in infile.read().splitlines(): - line = line.replace(',West Northamptonshireshire,', ',West Northamptonshire,') - outfile.write(line) - outfile.write('\n') - logging.info(f'Read geography file from: {args.geography_file} and wrote modified file to:' - f' {out_filename}') - - -if __name__ == '__main__': - try: - main() - except Exception as exception: - logging.error(exception) - raise exception diff --git a/modified/.gitignore b/modified/.gitignore deleted file mode 100644 index afed073..0000000 --- a/modified/.gitignore +++ /dev/null @@ -1 +0,0 @@ -*.csv From 7547b4f435f5d90ddd2fdfc406f64703d5012f5d Mon Sep 17 00:00:00 2001 From: Peter Hynes Date: Wed, 11 May 2022 10:14:59 +0100 Subject: [PATCH 14/15] Report when fields are dropped with --best-effort Also minor fix to README.md --- README.md | 74 +++++++++++++++--------------- bin/ons_csv_to_ctb_json_ds_vars.py | 1 + bin/ons_csv_to_ctb_json_read.py | 1 + test/test_best_effort.py | 2 + 4 files changed, 42 insertions(+), 36 deletions(-) diff --git a/README.md b/README.md index 3fd2b8e..f29327c 100644 --- a/README.md +++ b/README.md @@ -135,7 +135,7 @@ Using data with errors ---------------------- `ons_csv_to_ctb_json_main.py` fails on the first error. This is intentional as the data must be -correct for use in production. For debug purpose a `--best-effort` flag can be used to continue +correct for use in production. For debug purposes a `--best-effort` flag can be used to continue processing when errors are found and to make a **best effort** to generate output. Typically this will result in some data loss as some records will be dropped and some fields will be ignored. @@ -143,41 +143,43 @@ This repository contains some test data that is full of errors. 
It can be used t of the `--best-effort` flag as shown below: ``` > python3 bin/ons_csv_to_ctb_json_main.py -i test/testdata/best_effort -o ctb_metadata_files/ -m best-effort --best-effort -t=2022-05-10 21:23:34,762 lvl=INFO msg=ons_csv_to_ctb_json_main.py version 1.1.alpha -t=2022-05-10 21:23:34,762 lvl=INFO msg=CSV source directory: test/testdata/best_effort -t=2022-05-10 21:23:34,763 lvl=WARNING msg=Reading test/testdata/best_effort/Classification.csv:3 no value supplied for required field Variable_Mnemonic -t=2022-05-10 21:23:34,763 lvl=WARNING msg=Reading test/testdata/best_effort/Classification.csv:3 dropping record -t=2022-05-10 21:23:34,763 lvl=WARNING msg=Reading test/testdata/best_effort/Classification.csv:4 duplicate value CLASS1 for Classification_Mnemonic -t=2022-05-10 21:23:34,764 lvl=WARNING msg=Reading test/testdata/best_effort/Classification.csv:4 dropping record -t=2022-05-10 21:23:34,764 lvl=WARNING msg=Reading test/testdata/best_effort/Classification.csv:5 invalid value x for Number_Of_Category_Items -t=2022-05-10 21:23:34,764 lvl=WARNING msg=Reading test/testdata/best_effort/Category.csv Unexpected number of categories for CLASS1: expected 4 but found 1 -t=2022-05-10 21:23:34,764 lvl=INFO msg=No geography file specified -t=2022-05-10 21:23:34,764 lvl=INFO msg=Loaded metadata for 5 Cantabular variables -t=2022-05-10 21:23:34,764 lvl=WARNING msg=Reading test/testdata/best_effort/Database_Variable.csv Lowest_Geog_Variable_Flag set on GEO3 and GEO1 for database DB1 -t=2022-05-10 21:23:34,764 lvl=INFO msg=Loaded metadata for 1 Cantabular datasets -t=2022-05-10 21:23:34,765 lvl=WARNING msg=Reading test/testdata/best_effort/Dataset_Variable.csv:4 duplicate value combo DS1/VAR1 for Dataset_Mnemonic/Variable_Mnemonic -t=2022-05-10 21:23:34,765 lvl=WARNING msg=Reading test/testdata/best_effort/Dataset_Variable.csv:4 dropping record -t=2022-05-10 21:23:34,765 lvl=WARNING msg=Reading test/testdata/best_effort/Dataset_Variable.csv:2 Lowest_Geog_Variable_Flag set on non-geographic variable VAR1 for dataset DS1 -t=2022-05-10 21:23:34,765 lvl=WARNING msg=Reading test/testdata/best_effort/Dataset_Variable.csv:2 Processing_Priority not specified for classification CLASS1 in dataset DS1 -t=2022-05-10 21:23:34,765 lvl=WARNING msg=Reading test/testdata/best_effort/Dataset_Variable.csv:3 Classification_Mnemonic must not be specified for geographic variable GEO1 in dataset DS1 -t=2022-05-10 21:23:34,765 lvl=WARNING msg=Reading test/testdata/best_effort/Dataset_Variable.csv:3 Processing_Priority must not be specified for geographic variable GEO1 in dataset DS1 -t=2022-05-10 21:23:34,765 lvl=WARNING msg=Reading test/testdata/best_effort/Dataset_Variable.csv:5 Lowest_Geog_Variable_Flag set on variable GEO2 and GEO1 for dataset DS1 -t=2022-05-10 21:23:34,765 lvl=WARNING msg=Reading test/testdata/best_effort/Dataset_Variable.csv:7 Classification must be specified for non-geographic VAR2 in dataset DS1 -t=2022-05-10 21:23:34,765 lvl=WARNING msg=Reading test/testdata/best_effort/Dataset_Variable.csv:7 dropping record -t=2022-05-10 21:23:34,765 lvl=WARNING msg=Reading test/testdata/best_effort/Dataset_Variable.csv:8 Invalid classification CLASS1 specified for variable VAR3 in dataset DS1 -t=2022-05-10 21:23:34,765 lvl=WARNING msg=Reading test/testdata/best_effort/Dataset_Variable.csv:8 dropping record -t=2022-05-10 21:23:34,765 lvl=WARNING msg=Reading test/testdata/best_effort/Dataset_Variable.csv Invalid processing_priorities [0] for dataset DS1 -t=2022-05-10 21:23:34,765 lvl=WARNING msg=Reading 
test/testdata/best_effort/Dataset.csv:3 DS2 has classification CLASS3 that is not in database DB1 -t=2022-05-10 21:23:34,765 lvl=WARNING msg=Reading test/testdata/best_effort/Dataset.csv:3 dropping record -t=2022-05-10 21:23:34,765 lvl=WARNING msg=Reading test/testdata/best_effort/Dataset.csv:4 DS3 has no associated classifications or geographic variable -t=2022-05-10 21:23:34,765 lvl=WARNING msg=Reading test/testdata/best_effort/Dataset.csv:4 dropping record -t=2022-05-10 21:23:34,765 lvl=INFO msg=Loaded metadata for 1 Cantabular tables -t=2022-05-10 21:23:34,765 lvl=INFO msg=Loaded service metadata -t=2022-05-10 21:23:34,765 lvl=WARNING msg=16 errors were encountered during processing -t=2022-05-10 21:23:34,765 lvl=INFO msg=Output files will be written in Cantabular 9.3.0 format -t=2022-05-10 21:23:34,766 lvl=INFO msg=Written dataset metadata file to: ctb_metadata_files/cantabm_v9-3-0_best-effort_dataset-md_20220510-1.json -t=2022-05-10 21:23:34,766 lvl=INFO msg=Written table metadata file to: ctb_metadata_files/cantabm_v9-3-0_best-effort_tables-md_20220510-1.json -t=2022-05-10 21:23:34,766 lvl=INFO msg=Written service metadata file to: ctb_metadata_files/cantabm_v9-3-0_best-effort_service-md_20220510-1.json +t=2022-05-11 10:10:38,936 lvl=INFO msg=ons_csv_to_ctb_json_main.py version 1.1.alpha +t=2022-05-11 10:10:38,936 lvl=INFO msg=CSV source directory: test/testdata/best_effort +t=2022-05-11 10:10:38,937 lvl=WARNING msg=Reading test/testdata/best_effort/Classification.csv:3 no value supplied for required field Variable_Mnemonic +t=2022-05-11 10:10:38,937 lvl=WARNING msg=Reading test/testdata/best_effort/Classification.csv:3 dropping record +t=2022-05-11 10:10:38,937 lvl=WARNING msg=Reading test/testdata/best_effort/Classification.csv:4 duplicate value CLASS1 for Classification_Mnemonic +t=2022-05-11 10:10:38,937 lvl=WARNING msg=Reading test/testdata/best_effort/Classification.csv:4 dropping record +t=2022-05-11 10:10:38,937 lvl=WARNING msg=Reading test/testdata/best_effort/Classification.csv:5 invalid value x for Number_Of_Category_Items +t=2022-05-11 10:10:38,937 lvl=WARNING msg=Reading test/testdata/best_effort/Classification.csv:5 ignoring field Number_Of_Category_Items +t=2022-05-11 10:10:38,938 lvl=WARNING msg=Reading test/testdata/best_effort/Category.csv Unexpected number of categories for CLASS1: expected 4 but found 1 +t=2022-05-11 10:10:38,938 lvl=INFO msg=No geography file specified +t=2022-05-11 10:10:38,938 lvl=INFO msg=Loaded metadata for 5 Cantabular variables +t=2022-05-11 10:10:38,938 lvl=WARNING msg=Reading test/testdata/best_effort/Database_Variable.csv Lowest_Geog_Variable_Flag set on GEO3 and GEO1 for database DB1 +t=2022-05-11 10:10:38,938 lvl=INFO msg=Loaded metadata for 1 Cantabular datasets +t=2022-05-11 10:10:38,939 lvl=WARNING msg=Reading test/testdata/best_effort/Dataset_Variable.csv:4 duplicate value combo DS1/VAR1 for Dataset_Mnemonic/Variable_Mnemonic +t=2022-05-11 10:10:38,939 lvl=WARNING msg=Reading test/testdata/best_effort/Dataset_Variable.csv:4 dropping record +t=2022-05-11 10:10:38,939 lvl=WARNING msg=Reading test/testdata/best_effort/Dataset_Variable.csv:2 Lowest_Geog_Variable_Flag set on non-geographic variable VAR1 for dataset DS1 +t=2022-05-11 10:10:38,939 lvl=WARNING msg=Reading test/testdata/best_effort/Dataset_Variable.csv:2 Processing_Priority not specified for classification CLASS1 in dataset DS1 +t=2022-05-11 10:10:38,939 lvl=WARNING msg=Reading test/testdata/best_effort/Dataset_Variable.csv:2 using 0 for Processing_Priority +t=2022-05-11 
10:10:38,939 lvl=WARNING msg=Reading test/testdata/best_effort/Dataset_Variable.csv:3 Classification_Mnemonic must not be specified for geographic variable GEO1 in dataset DS1 +t=2022-05-11 10:10:38,939 lvl=WARNING msg=Reading test/testdata/best_effort/Dataset_Variable.csv:3 Processing_Priority must not be specified for geographic variable GEO1 in dataset DS1 +t=2022-05-11 10:10:38,939 lvl=WARNING msg=Reading test/testdata/best_effort/Dataset_Variable.csv:5 Lowest_Geog_Variable_Flag set on variable GEO2 and GEO1 for dataset DS1 +t=2022-05-11 10:10:38,939 lvl=WARNING msg=Reading test/testdata/best_effort/Dataset_Variable.csv:7 Classification must be specified for non-geographic VAR2 in dataset DS1 +t=2022-05-11 10:10:38,939 lvl=WARNING msg=Reading test/testdata/best_effort/Dataset_Variable.csv:7 dropping record +t=2022-05-11 10:10:38,939 lvl=WARNING msg=Reading test/testdata/best_effort/Dataset_Variable.csv:8 Invalid classification CLASS1 specified for variable VAR3 in dataset DS1 +t=2022-05-11 10:10:38,939 lvl=WARNING msg=Reading test/testdata/best_effort/Dataset_Variable.csv:8 dropping record +t=2022-05-11 10:10:38,939 lvl=WARNING msg=Reading test/testdata/best_effort/Dataset_Variable.csv Invalid processing_priorities [0] for dataset DS1 +t=2022-05-11 10:10:38,939 lvl=WARNING msg=Reading test/testdata/best_effort/Dataset.csv:3 DS2 has classification CLASS3 that is not in database DB1 +t=2022-05-11 10:10:38,939 lvl=WARNING msg=Reading test/testdata/best_effort/Dataset.csv:3 dropping record +t=2022-05-11 10:10:38,939 lvl=WARNING msg=Reading test/testdata/best_effort/Dataset.csv:4 DS3 has no associated classifications or geographic variable +t=2022-05-11 10:10:38,939 lvl=WARNING msg=Reading test/testdata/best_effort/Dataset.csv:4 dropping record +t=2022-05-11 10:10:38,939 lvl=INFO msg=Loaded metadata for 1 Cantabular tables +t=2022-05-11 10:10:38,939 lvl=INFO msg=Loaded service metadata +t=2022-05-11 10:10:38,939 lvl=WARNING msg=16 errors were encountered during processing +t=2022-05-11 10:10:38,939 lvl=INFO msg=Output files will be written in Cantabular 9.3.0 format +t=2022-05-11 10:10:38,940 lvl=INFO msg=Written dataset metadata file to: ctb_metadata_files/cantabm_v9-3-0_best-effort_dataset-md_20220511-1.json +t=2022-05-11 10:10:38,940 lvl=INFO msg=Written table metadata file to: ctb_metadata_files/cantabm_v9-3-0_best-effort_tables-md_20220511-1.json +t=2022-05-11 10:10:38,940 lvl=INFO msg=Written service metadata file to: ctb_metadata_files/cantabm_v9-3-0_best-effort_service-md_20220511-1.json ``` Many lines contain strings such as `test/testdata/best_effort/Dataset.csv:4` this means that an error has been detected diff --git a/bin/ons_csv_to_ctb_json_ds_vars.py b/bin/ons_csv_to_ctb_json_ds_vars.py index 890679d..1643ccb 100644 --- a/bin/ons_csv_to_ctb_json_ds_vars.py +++ b/bin/ons_csv_to_ctb_json_ds_vars.py @@ -75,6 +75,7 @@ def add_non_geographic_variable(self, variable, row_num): 'Processing_Priority not specified for classification ' f'{classification_mnemonic} in dataset ' f'{self.dataset_mnemonic}') + logging.warning(f'Reading {self.filename}:{row_num} using 0 for Processing_Priority') variable['Processing_Priority'] = 0 self.classifications.append(variable['Classification_Mnemonic']) diff --git a/bin/ons_csv_to_ctb_json_read.py b/bin/ons_csv_to_ctb_json_read.py index 945c41e..cb3e219 100644 --- a/bin/ons_csv_to_ctb_json_read.py +++ b/bin/ons_csv_to_ctb_json_read.py @@ -103,6 +103,7 @@ def validate_row(self, row, row_num): if column.required: keep_row = False continue + 
logging.warning(f'Reading {self.filename}:{row_num} ignoring field {column.name}') row[column.name] = "" if self.unique_combo_fields and keep_row: diff --git a/test/test_best_effort.py b/test/test_best_effort.py index 82c280a..5f9d46d 100644 --- a/test/test_best_effort.py +++ b/test/test_best_effort.py @@ -50,12 +50,14 @@ def test_generated_json_best_effort(self, mock_date): r'Classification.csv:4 duplicate value CLASS1 for Classification_Mnemonic', r'Classification.csv:4 dropping record', r'Classification.csv:5 invalid value x for Number_Of_Category_Items', + r'Classification.csv:5 ignoring field Number_Of_Category_Items', r'Category.csv Unexpected number of categories for CLASS1: expected 4 but found 1', r'Database_Variable.csv Lowest_Geog_Variable_Flag set on GEO3 and GEO1 for database DB1', r'Dataset_Variable.csv:4 duplicate value combo DS1/VAR1 for Dataset_Mnemonic/Variable_Mnemonic', r'Dataset_Variable.csv:4 dropping record', r'Dataset_Variable.csv:2 Lowest_Geog_Variable_Flag set on non-geographic variable VAR1 for dataset DS1', r'Dataset_Variable.csv:2 Processing_Priority not specified for classification CLASS1 in dataset DS1', + r'Dataset_Variable.csv:2 using 0 for Processing_Priority', r'Dataset_Variable.csv:3 Classification_Mnemonic must not be specified for geographic variable GEO1 in dataset DS1', r'Dataset_Variable.csv:3 Processing_Priority must not be specified for geographic variable GEO1 in dataset DS1', r'Dataset_Variable.csv:5 Lowest_Geog_Variable_Flag set on variable GEO2 and GEO1 for dataset DS1', From 0b473329df79063e975886af7975ad09494bfab4 Mon Sep 17 00:00:00 2001 From: Peter Hynes Date: Wed, 11 May 2022 10:37:48 +0100 Subject: [PATCH 15/15] Update to v1.1.beta --- README.md | 14 +++++++------- RELEASE_NOTES.md | 10 ++++++++++ bin/ons_csv_to_ctb_json_main.py | 2 +- 3 files changed, 18 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index f29327c..54fc4a7 100644 --- a/README.md +++ b/README.md @@ -7,7 +7,7 @@ and converts them to hierarchical JSON that can be loaded into `cantabular-metad It is compatible with version `1.1` of the metadata schema and versions `9.3.0`/`9.2.0` of `cantabular-metadata`. `9.3.0` format is used by default. -This is version `1.1.alpha` of the CSV to JSON processing software and is subject to change. +This is version `1.1.beta` of the CSV to JSON processing software and is subject to change. The applications only use packages in the Python standard library. @@ -35,7 +35,7 @@ Basic logging will be displayed by default, including the number of high-level C objects loaded and the name of the output files. 
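+Each log line has the form `t=<timestamp> lvl=<level> msg=<message>`. This appears to be produced by a
+`logging.basicConfig` format string like the one used by the now-removed `bin/fixup.py`; a minimal sketch,
+assuming `ons_csv_to_ctb_json_main.py` configures its logging the same way:
+```
+import logging
+
+# Format string copied from the deleted bin/fixup.py; emits lines like those shown below.
+logging.basicConfig(format='t=%(asctime)s lvl=%(levelname)s msg=%(message)s', level='INFO')
+logging.info('Loaded service metadata')
+```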
``` > python3 bin/ons_csv_to_ctb_json_main.py -i test/testdata/ -g test/testdata/geography/geography.csv -o ctb_metadata_files/ -t=2022-05-09 21:26:50,348 lvl=INFO msg=ons_csv_to_ctb_json_main.py version 1.1.alpha +t=2022-05-09 21:26:50,348 lvl=INFO msg=ons_csv_to_ctb_json_main.py version 1.1.beta t=2022-05-09 21:26:50,348 lvl=INFO msg=CSV source directory: test/testdata/ t=2022-05-09 21:26:50,348 lvl=INFO msg=Geography file: test/testdata/geography/geography.csv t=2022-05-09 21:26:50,350 lvl=INFO msg=Reading test/testdata/geography/geography.csv: found Welsh labels for unknown classification: OTHER @@ -55,7 +55,7 @@ t=2022-05-09 21:26:50,353 lvl=INFO msg=Written service metadata file to: ctb_met More detailed information can be obtained by running with a `-l DEBUG` flag e.g.: ``` > python3 bin/ons_csv_to_ctb_json_main.py -i test/testdata/ -g test/testdata/geography/geography.csv -o ctb_metadata_files/ -l DEBUG -t=2022-05-09 21:27:20,066 lvl=INFO msg=ons_csv_to_ctb_json_main.py version 1.1.alpha +t=2022-05-09 21:27:20,066 lvl=INFO msg=ons_csv_to_ctb_json_main.py version 1.1.beta t=2022-05-09 21:27:20,066 lvl=INFO msg=CSV source directory: test/testdata/ t=2022-05-09 21:27:20,066 lvl=INFO msg=Geography file: test/testdata/geography/geography.csv t=2022-05-09 21:27:20,067 lvl=DEBUG msg=Creating classification for geographic variable: GEO1 @@ -114,7 +114,7 @@ arguments as described in the help text for `ons_csv_to_ctb_json_main.py`: For example: ``` > python3 bin/ons_csv_to_ctb_json_main.py -i test/testdata/ -g test/testdata/geography/geography.csv -o ctb_metadata_files/ -p t -m test -b 42 -t=2022-05-09 21:27:57,633 lvl=INFO msg=ons_csv_to_ctb_json_main.py version 1.1.alpha +t=2022-05-09 21:27:57,633 lvl=INFO msg=ons_csv_to_ctb_json_main.py version 1.1.beta t=2022-05-09 21:27:57,633 lvl=INFO msg=CSV source directory: test/testdata/ t=2022-05-09 21:27:57,633 lvl=INFO msg=Geography file: test/testdata/geography/geography.csv t=2022-05-09 21:27:57,634 lvl=INFO msg=Reading test/testdata/geography/geography.csv: found Welsh labels for unknown classification: OTHER @@ -143,7 +143,7 @@ This repository contains some test data that is full of errors. It can be used t of the `--best-effort` flag as shown below: ``` > python3 bin/ons_csv_to_ctb_json_main.py -i test/testdata/best_effort -o ctb_metadata_files/ -m best-effort --best-effort -t=2022-05-11 10:10:38,936 lvl=INFO msg=ons_csv_to_ctb_json_main.py version 1.1.alpha +t=2022-05-11 10:10:38,936 lvl=INFO msg=ons_csv_to_ctb_json_main.py version 1.1.beta t=2022-05-11 10:10:38,936 lvl=INFO msg=CSV source directory: test/testdata/best_effort t=2022-05-11 10:10:38,937 lvl=WARNING msg=Reading test/testdata/best_effort/Classification.csv:3 no value supplied for required field Variable_Mnemonic t=2022-05-11 10:10:38,937 lvl=WARNING msg=Reading test/testdata/best_effort/Classification.csv:3 dropping record @@ -200,7 +200,7 @@ can be found in the `sample_2011` directory. 
Use this command to convert the files to JSON (with debugging enabled): ``` > python3 bin/ons_csv_to_ctb_json_main.py -i sample_2011/ -g sample_2011/geography.csv -o ctb_metadata_files/ -m 2001-sample -l DEBUG -t=2022-05-09 21:28:29,336 lvl=INFO msg=ons_csv_to_ctb_json_main.py version 1.1.alpha +t=2022-05-09 21:28:29,336 lvl=INFO msg=ons_csv_to_ctb_json_main.py version 1.1.beta t=2022-05-09 21:28:29,336 lvl=INFO msg=CSV source directory: sample_2011/ t=2022-05-09 21:28:29,336 lvl=INFO msg=Geography file: sample_2011/geography.csv t=2022-05-09 21:28:29,354 lvl=DEBUG msg=Creating classification for geographic variable: Region @@ -285,7 +285,7 @@ will be reflected in the output filenames, but `9.3.0` format will be used. To generate version 9.2.0 compatible files from the test data use the following command: ``` > python3 bin/ons_csv_to_ctb_json_main.py -i test/testdata/ -g test/testdata/geography/geography.csv -o ctb_metadata_files/ -v 9.2.0 -t=2022-05-09 21:40:49,218 lvl=INFO msg=ons_csv_to_ctb_json_main.py version 1.1.alpha +t=2022-05-09 21:40:49,218 lvl=INFO msg=ons_csv_to_ctb_json_main.py version 1.1.beta t=2022-05-09 21:40:49,218 lvl=INFO msg=CSV source directory: test/testdata/ t=2022-05-09 21:40:49,218 lvl=INFO msg=Geography file: test/testdata/geography/geography.csv t=2022-05-09 21:40:49,220 lvl=INFO msg=Reading test/testdata/geography/geography.csv: found Welsh labels for unknown classification: OTHER diff --git a/RELEASE_NOTES.md b/RELEASE_NOTES.md index 2c1ad00..0425d85 100644 --- a/RELEASE_NOTES.md +++ b/RELEASE_NOTES.md @@ -1,6 +1,16 @@ Release Notes ============= +1.1.beta +-------- +- Added `--best-effort` flag to discard invalid data and make a best effort + attempt to generate output files. + - This replaces the `fixup.py` script. +- Formatted and customizable output filenames. +- Support for Cantabular version 9.2.0 formatting. +- Rework on mandatory fields. +- Added 2011 1% sample metadata. + 1.1.alpha --------- - Updated code to work with metadata schema version 1.1. diff --git a/bin/ons_csv_to_ctb_json_main.py b/bin/ons_csv_to_ctb_json_main.py index 48cb27c..78333f5 100644 --- a/bin/ons_csv_to_ctb_json_main.py +++ b/bin/ons_csv_to_ctb_json_main.py @@ -9,7 +9,7 @@ from ons_csv_to_ctb_json_load import Loader, PUBLIC_SECURITY_MNEMONIC from ons_csv_to_ctb_json_bilingual import BilingualDict, Bilingual -VERSION = '1.1.alpha' +VERSION = '1.1.beta' SYSTEM = 'cantabm' DEFAULT_CANTABULAR_VERSION = '9.3.0'