diff --git a/.github/workflows/ci-test.yml b/.github/workflows/ci-test.yml index f04fbec..1b9ad9a 100644 --- a/.github/workflows/ci-test.yml +++ b/.github/workflows/ci-test.yml @@ -32,7 +32,7 @@ jobs: pydocstyle bin/*.py - name: Run pylint run: | - pylint --max-locals=20 --max-public-methods=30 --max-branches=50 --max-statements=90 --min-similarity-lines=6 --disable=W1202 bin/*.py + pylint --max-locals=22 --max-public-methods=30 --max-branches=50 --max-statements=90 --min-similarity-lines=6 --max-module-lines=1200 --max-attributes=10 --disable=W1202 bin/*.py - name: Run tests run: | PYTHONPATH=test:bin python3 -m unittest -v diff --git a/README.md b/README.md index 70e2353..54fc4a7 100644 --- a/README.md +++ b/README.md @@ -4,9 +4,10 @@ Introduction `bin/ons_csv_to_ctb_json_main.py` is an application that loads source metadata files in CSV format and converts them to hierarchical JSON that can be loaded into `cantabular-metadata`. -It is compatible with version `1.1` of the metadata schema and version `9.3.0` of `cantabular-metadata`. +It is compatible with version `1.1` of the metadata schema and versions `9.3.0`/`9.2.0` of +`cantabular-metadata`. `9.3.0` format is used by default. -This is version `1.1.alpha` of the CSV to JSON processing software and is subject to change. +This is version `1.1.beta` of the CSV to JSON processing software and is subject to change. The applications only use packages in the Python standard library. @@ -34,65 +35,158 @@ Basic logging will be displayed by default, including the number of high-level Cantabular objects loaded and the name of the output files. ``` > python3 bin/ons_csv_to_ctb_json_main.py -i test/testdata/ -g test/testdata/geography/geography.csv -o ctb_metadata_files/ -t=2022-04-21 14:26:43,977 lvl=INFO msg=Reading test/testdata/geography/geography.csv: found Welsh labels for unknown classification: OTHER -t=2022-04-21 14:26:43,977 lvl=INFO msg=Dropped non public classification: CLASS_PRIV -t=2022-04-21 14:26:43,977 lvl=INFO msg=Loaded metadata for 5 Cantabular variables -t=2022-04-21 14:26:43,977 lvl=INFO msg=Loaded metadata for 3 Cantabular datasets -t=2022-04-21 14:26:43,979 lvl=INFO msg=Written dataset metadata file to: ctb_metadata_files/dataset-metadata.json -t=2022-04-21 14:26:43,979 lvl=INFO msg=Dropped non public ONS Dataset: DS_PRIV -t=2022-04-21 14:26:43,979 lvl=INFO msg=Loaded metadata for 4 Cantabular tables -t=2022-04-21 14:26:43,980 lvl=INFO msg=Written table metadata file to: ctb_metadata_files/table-metadata.json -t=2022-04-21 14:26:43,980 lvl=INFO msg=Loaded service metadata -t=2022-04-21 14:26:43,980 lvl=INFO msg=Written service metadata file to: ctb_metadata_files/service-metadata.json +t=2022-05-09 21:26:50,348 lvl=INFO msg=ons_csv_to_ctb_json_main.py version 1.1.beta +t=2022-05-09 21:26:50,348 lvl=INFO msg=CSV source directory: test/testdata/ +t=2022-05-09 21:26:50,348 lvl=INFO msg=Geography file: test/testdata/geography/geography.csv +t=2022-05-09 21:26:50,350 lvl=INFO msg=Reading test/testdata/geography/geography.csv: found Welsh labels for unknown classification: OTHER +t=2022-05-09 21:26:50,350 lvl=INFO msg=Dropped non public classification: CLASS_PRIV +t=2022-05-09 21:26:50,350 lvl=INFO msg=Dropped non public classification: GEO_PRIV +t=2022-05-09 21:26:50,350 lvl=INFO msg=Loaded metadata for 5 Cantabular variables +t=2022-05-09 21:26:50,351 lvl=INFO msg=Loaded metadata for 3 Cantabular datasets +t=2022-05-09 21:26:50,351 lvl=INFO msg=Dropped non public ONS Dataset: DS_PRIV +t=2022-05-09 21:26:50,351 lvl=INFO
msg=Loaded metadata for 4 Cantabular tables +t=2022-05-09 21:26:50,351 lvl=INFO msg=Loaded service metadata +t=2022-05-09 21:26:50,351 lvl=INFO msg=Output files will be written in Cantabular 9.3.0 format +t=2022-05-09 21:26:50,352 lvl=INFO msg=Written dataset metadata file to: ctb_metadata_files/cantabm_v9-3-0_unknown-metadata-version_dataset-md_20220509-1.json +t=2022-05-09 21:26:50,353 lvl=INFO msg=Written table metadata file to: ctb_metadata_files/cantabm_v9-3-0_unknown-metadata-version_tables-md_20220509-1.json +t=2022-05-09 21:26:50,353 lvl=INFO msg=Written service metadata file to: ctb_metadata_files/cantabm_v9-3-0_unknown-metadata-version_service-md_20220509-1.json ``` More detailed information can be obtained by running with a `-l DEBUG` flag e.g.: ``` > python3 bin/ons_csv_to_ctb_json_main.py -i test/testdata/ -g test/testdata/geography/geography.csv -o ctb_metadata_files/ -l DEBUG -t=2022-04-21 14:27:07,830 lvl=DEBUG msg=Creating classification for geographic variable: GEO1 -t=2022-04-21 14:27:07,830 lvl=DEBUG msg=Creating classification for geographic variable: GEO2 -t=2022-04-21 14:27:07,830 lvl=INFO msg=Reading test/testdata/geography/geography.csv: found Welsh labels for unknown classification: OTHER -t=2022-04-21 14:27:07,830 lvl=DEBUG msg=Loaded metadata for Cantabular variable: CLASS1 -t=2022-04-21 14:27:07,830 lvl=DEBUG msg=Loaded metadata for Cantabular variable: CLASS2 -t=2022-04-21 14:27:07,830 lvl=DEBUG msg=Loaded metadata for Cantabular variable: CLASS3 -t=2022-04-21 14:27:07,830 lvl=INFO msg=Dropped non public classification: CLASS_PRIV -t=2022-04-21 14:27:07,830 lvl=DEBUG msg=Loaded metadata for Cantabular variable: GEO1 -t=2022-04-21 14:27:07,830 lvl=DEBUG msg=Loaded metadata for Cantabular variable: GEO2 -t=2022-04-21 14:27:07,830 lvl=INFO msg=Loaded metadata for 5 Cantabular variables -t=2022-04-21 14:27:07,831 lvl=DEBUG msg=Loaded metadata for Cantabular dataset: DB1 -t=2022-04-21 14:27:07,831 lvl=DEBUG msg=Loaded metadata for Cantabular dataset: DB2 -t=2022-04-21 14:27:07,831 lvl=DEBUG msg=Loaded metadata for Cantabular dataset: DB3 -t=2022-04-21 14:27:07,831 lvl=INFO msg=Loaded metadata for 3 Cantabular datasets -t=2022-04-21 14:27:07,832 lvl=INFO msg=Written dataset metadata file to: ctb_metadata_files/dataset-metadata.json -t=2022-04-21 14:27:07,833 lvl=DEBUG msg=Loaded metadata for Cantabular table: DS1 -t=2022-04-21 14:27:07,833 lvl=DEBUG msg=Loaded metadata for Cantabular table: DS2 -t=2022-04-21 14:27:07,833 lvl=DEBUG msg=Loaded metadata for Cantabular table: DS3 -t=2022-04-21 14:27:07,833 lvl=INFO msg=Dropped non public ONS Dataset: DS_PRIV -t=2022-04-21 14:27:07,833 lvl=DEBUG msg=Loaded metadata for Cantabular table: DS4 -t=2022-04-21 14:27:07,833 lvl=INFO msg=Loaded metadata for 4 Cantabular tables -t=2022-04-21 14:27:07,833 lvl=INFO msg=Written table metadata file to: ctb_metadata_files/table-metadata.json -t=2022-04-21 14:27:07,833 lvl=INFO msg=Loaded service metadata -t=2022-04-21 14:27:07,834 lvl=INFO msg=Written service metadata file to: ctb_metadata_files/service-metadata.json +t=2022-05-09 21:27:20,066 lvl=INFO msg=ons_csv_to_ctb_json_main.py version 1.1.beta +t=2022-05-09 21:27:20,066 lvl=INFO msg=CSV source directory: test/testdata/ +t=2022-05-09 21:27:20,066 lvl=INFO msg=Geography file: test/testdata/geography/geography.csv +t=2022-05-09 21:27:20,067 lvl=DEBUG msg=Creating classification for geographic variable: GEO1 +t=2022-05-09 21:27:20,067 lvl=DEBUG msg=Creating classification for geographic variable: GEO2 +t=2022-05-09 21:27:20,067 
lvl=DEBUG msg=Creating classification for geographic variable: GEO_PRIV +t=2022-05-09 21:27:20,067 lvl=INFO msg=Reading test/testdata/geography/geography.csv: found Welsh labels for unknown classification: OTHER +t=2022-05-09 21:27:20,068 lvl=DEBUG msg=Loaded metadata for Cantabular variable: CLASS1 +t=2022-05-09 21:27:20,068 lvl=DEBUG msg=Loaded metadata for Cantabular variable: CLASS2 +t=2022-05-09 21:27:20,068 lvl=DEBUG msg=Loaded metadata for Cantabular variable: CLASS3 +t=2022-05-09 21:27:20,068 lvl=INFO msg=Dropped non public classification: CLASS_PRIV +t=2022-05-09 21:27:20,068 lvl=DEBUG msg=Loaded metadata for Cantabular variable: GEO1 +t=2022-05-09 21:27:20,068 lvl=DEBUG msg=Loaded metadata for Cantabular variable: GEO2 +t=2022-05-09 21:27:20,068 lvl=INFO msg=Dropped non public classification: GEO_PRIV +t=2022-05-09 21:27:20,068 lvl=INFO msg=Loaded metadata for 5 Cantabular variables +t=2022-05-09 21:27:20,068 lvl=DEBUG msg=Loaded metadata for Cantabular dataset: DB1 +t=2022-05-09 21:27:20,068 lvl=DEBUG msg=Loaded metadata for Cantabular dataset: DB2 +t=2022-05-09 21:27:20,068 lvl=DEBUG msg=Loaded metadata for Cantabular dataset: DB3 +t=2022-05-09 21:27:20,068 lvl=INFO msg=Loaded metadata for 3 Cantabular datasets +t=2022-05-09 21:27:20,069 lvl=DEBUG msg=Loaded metadata for Cantabular table: DS1 +t=2022-05-09 21:27:20,069 lvl=DEBUG msg=Loaded metadata for Cantabular table: DS2 +t=2022-05-09 21:27:20,069 lvl=DEBUG msg=Loaded metadata for Cantabular table: DS3 +t=2022-05-09 21:27:20,069 lvl=INFO msg=Dropped non public ONS Dataset: DS_PRIV +t=2022-05-09 21:27:20,069 lvl=DEBUG msg=Loaded metadata for Cantabular table: DS4 +t=2022-05-09 21:27:20,069 lvl=INFO msg=Loaded metadata for 4 Cantabular tables +t=2022-05-09 21:27:20,069 lvl=INFO msg=Loaded service metadata +t=2022-05-09 21:27:20,069 lvl=INFO msg=Output files will be written in Cantabular 9.3.0 format +t=2022-05-09 21:27:20,070 lvl=INFO msg=Written dataset metadata file to: ctb_metadata_files/cantabm_v9-3-0_unknown-metadata-version_dataset-md_20220509-1.json +t=2022-05-09 21:27:20,071 lvl=INFO msg=Written table metadata file to: ctb_metadata_files/cantabm_v9-3-0_unknown-metadata-version_tables-md_20220509-1.json +t=2022-05-09 21:27:20,071 lvl=INFO msg=Written service metadata file to: ctb_metadata_files/cantabm_v9-3-0_unknown-metadata-version_service-md_20220509-1.json ``` -Using externally sourced files ------------------------------- +Output file names +----------------- -To convert the externally sourced metadata CSV files currently being used for testing, first fixup the source files: +The output file names are formatted as follows: ``` -python3 bin/fixup.py -i <input directory> --g <geography file> -o modified/ +<prefix>_cantabm_v9-3-0_<metadata master version>_dataset-md_<date>-<build number>.json +<prefix>_cantabm_v9-3-0_<metadata master version>_service-md_<date>-<build number>.json +<prefix>_cantabm_v9-3-0_<metadata master version>_tables-md_<date>-<build number>.json ``` -The geography file will be placed in the `modified/` directory and will have the same base name as the original file. +When no prefix is supplied, the leading `<prefix>_` is omitted; `<date>` is the generation date in `YYYYMMDD` format. +The `prefix`, `metadata master version` and `build number` can be specified using command-line +arguments as described in the help text for `ons_csv_to_ctb_json_main.py`: ``` + -p {d,t,tu}, --file_prefix {d,t,tu} + Prefix to use in output filenames: d=dev, t=test, + tu=tuning (default: no prefix i.e.
operational) + -m METADATA_MASTER_VERSION, --metadata_master_version METADATA_MASTER_VERSION + Metadata master version to use in output filenames + (default: unknown-metadata-version) + -b BUILD_NUMBER, --build_number BUILD_NUMBER + Build number to use in output filenames (default: 1) -This will load the sample files and modify them slightly so that they can be processed by `ons_csv_to_ctb_json_main.py`. -This step will not be needed for production. +``` -Then convert the files to JSON: +For example: ``` -python3 bin/ons_csv_to_ctb_json_main.py -i modified/ -g modified/ -o ctb_metadata_files/ +> python3 bin/ons_csv_to_ctb_json_main.py -i test/testdata/ -g test/testdata/geography/geography.csv -o ctb_metadata_files/ -p t -m test -b 42 +t=2022-05-09 21:27:57,633 lvl=INFO msg=ons_csv_to_ctb_json_main.py version 1.1.beta +t=2022-05-09 21:27:57,633 lvl=INFO msg=CSV source directory: test/testdata/ +t=2022-05-09 21:27:57,633 lvl=INFO msg=Geography file: test/testdata/geography/geography.csv +t=2022-05-09 21:27:57,634 lvl=INFO msg=Reading test/testdata/geography/geography.csv: found Welsh labels for unknown classification: OTHER +t=2022-05-09 21:27:57,635 lvl=INFO msg=Dropped non public classification: CLASS_PRIV +t=2022-05-09 21:27:57,635 lvl=INFO msg=Dropped non public classification: GEO_PRIV +t=2022-05-09 21:27:57,635 lvl=INFO msg=Loaded metadata for 5 Cantabular variables +t=2022-05-09 21:27:57,635 lvl=INFO msg=Loaded metadata for 3 Cantabular datasets +t=2022-05-09 21:27:57,636 lvl=INFO msg=Dropped non public ONS Dataset: DS_PRIV +t=2022-05-09 21:27:57,636 lvl=INFO msg=Loaded metadata for 4 Cantabular tables +t=2022-05-09 21:27:57,636 lvl=INFO msg=Loaded service metadata +t=2022-05-09 21:27:57,636 lvl=INFO msg=Output files will be written in Cantabular 9.3.0 format +t=2022-05-09 21:27:57,637 lvl=INFO msg=Written dataset metadata file to: ctb_metadata_files/t_cantabm_v9-3-0_test_dataset-md_20220509-42.json +t=2022-05-09 21:27:57,637 lvl=INFO msg=Written table metadata file to: ctb_metadata_files/t_cantabm_v9-3-0_test_tables-md_20220509-42.json +t=2022-05-09 21:27:57,638 lvl=INFO msg=Written service metadata file to: ctb_metadata_files/t_cantabm_v9-3-0_test_service-md_20220509-42.json ``` +Using data with errors +---------------------- + +`ons_csv_to_ctb_json_main.py` fails on the first error. This is intentional as the data must be +correct for use in production. For debug purposes a `--best-effort` flag can be used to continue +processing when errors are found and to make a **best effort** to generate output. Typically this +will result in some data loss as some records will be dropped and some fields will be ignored. + +This repository contains some test data that is full of errors. 
It can be used to demonstrate the usage +of the `--best-effort` flag as shown below: +``` +> python3 bin/ons_csv_to_ctb_json_main.py -i test/testdata/best_effort -o ctb_metadata_files/ -m best-effort --best-effort +t=2022-05-11 10:10:38,936 lvl=INFO msg=ons_csv_to_ctb_json_main.py version 1.1.beta +t=2022-05-11 10:10:38,936 lvl=INFO msg=CSV source directory: test/testdata/best_effort +t=2022-05-11 10:10:38,937 lvl=WARNING msg=Reading test/testdata/best_effort/Classification.csv:3 no value supplied for required field Variable_Mnemonic +t=2022-05-11 10:10:38,937 lvl=WARNING msg=Reading test/testdata/best_effort/Classification.csv:3 dropping record +t=2022-05-11 10:10:38,937 lvl=WARNING msg=Reading test/testdata/best_effort/Classification.csv:4 duplicate value CLASS1 for Classification_Mnemonic +t=2022-05-11 10:10:38,937 lvl=WARNING msg=Reading test/testdata/best_effort/Classification.csv:4 dropping record +t=2022-05-11 10:10:38,937 lvl=WARNING msg=Reading test/testdata/best_effort/Classification.csv:5 invalid value x for Number_Of_Category_Items +t=2022-05-11 10:10:38,937 lvl=WARNING msg=Reading test/testdata/best_effort/Classification.csv:5 ignoring field Number_Of_Category_Items +t=2022-05-11 10:10:38,938 lvl=WARNING msg=Reading test/testdata/best_effort/Category.csv Unexpected number of categories for CLASS1: expected 4 but found 1 +t=2022-05-11 10:10:38,938 lvl=INFO msg=No geography file specified +t=2022-05-11 10:10:38,938 lvl=INFO msg=Loaded metadata for 5 Cantabular variables +t=2022-05-11 10:10:38,938 lvl=WARNING msg=Reading test/testdata/best_effort/Database_Variable.csv Lowest_Geog_Variable_Flag set on GEO3 and GEO1 for database DB1 +t=2022-05-11 10:10:38,938 lvl=INFO msg=Loaded metadata for 1 Cantabular datasets +t=2022-05-11 10:10:38,939 lvl=WARNING msg=Reading test/testdata/best_effort/Dataset_Variable.csv:4 duplicate value combo DS1/VAR1 for Dataset_Mnemonic/Variable_Mnemonic +t=2022-05-11 10:10:38,939 lvl=WARNING msg=Reading test/testdata/best_effort/Dataset_Variable.csv:4 dropping record +t=2022-05-11 10:10:38,939 lvl=WARNING msg=Reading test/testdata/best_effort/Dataset_Variable.csv:2 Lowest_Geog_Variable_Flag set on non-geographic variable VAR1 for dataset DS1 +t=2022-05-11 10:10:38,939 lvl=WARNING msg=Reading test/testdata/best_effort/Dataset_Variable.csv:2 Processing_Priority not specified for classification CLASS1 in dataset DS1 +t=2022-05-11 10:10:38,939 lvl=WARNING msg=Reading test/testdata/best_effort/Dataset_Variable.csv:2 using 0 for Processing_Priority +t=2022-05-11 10:10:38,939 lvl=WARNING msg=Reading test/testdata/best_effort/Dataset_Variable.csv:3 Classification_Mnemonic must not be specified for geographic variable GEO1 in dataset DS1 +t=2022-05-11 10:10:38,939 lvl=WARNING msg=Reading test/testdata/best_effort/Dataset_Variable.csv:3 Processing_Priority must not be specified for geographic variable GEO1 in dataset DS1 +t=2022-05-11 10:10:38,939 lvl=WARNING msg=Reading test/testdata/best_effort/Dataset_Variable.csv:5 Lowest_Geog_Variable_Flag set on variable GEO2 and GEO1 for dataset DS1 +t=2022-05-11 10:10:38,939 lvl=WARNING msg=Reading test/testdata/best_effort/Dataset_Variable.csv:7 Classification must be specified for non-geographic VAR2 in dataset DS1 +t=2022-05-11 10:10:38,939 lvl=WARNING msg=Reading test/testdata/best_effort/Dataset_Variable.csv:7 dropping record +t=2022-05-11 10:10:38,939 lvl=WARNING msg=Reading test/testdata/best_effort/Dataset_Variable.csv:8 Invalid classification CLASS1 specified for variable VAR3 in dataset DS1 +t=2022-05-11 
10:10:38,939 lvl=WARNING msg=Reading test/testdata/best_effort/Dataset_Variable.csv:8 dropping record +t=2022-05-11 10:10:38,939 lvl=WARNING msg=Reading test/testdata/best_effort/Dataset_Variable.csv Invalid processing_priorities [0] for dataset DS1 +t=2022-05-11 10:10:38,939 lvl=WARNING msg=Reading test/testdata/best_effort/Dataset.csv:3 DS2 has classification CLASS3 that is not in database DB1 +t=2022-05-11 10:10:38,939 lvl=WARNING msg=Reading test/testdata/best_effort/Dataset.csv:3 dropping record +t=2022-05-11 10:10:38,939 lvl=WARNING msg=Reading test/testdata/best_effort/Dataset.csv:4 DS3 has no associated classifications or geographic variable +t=2022-05-11 10:10:38,939 lvl=WARNING msg=Reading test/testdata/best_effort/Dataset.csv:4 dropping record +t=2022-05-11 10:10:38,939 lvl=INFO msg=Loaded metadata for 1 Cantabular tables +t=2022-05-11 10:10:38,939 lvl=INFO msg=Loaded service metadata +t=2022-05-11 10:10:38,939 lvl=WARNING msg=16 errors were encountered during processing +t=2022-05-11 10:10:38,939 lvl=INFO msg=Output files will be written in Cantabular 9.3.0 format +t=2022-05-11 10:10:38,940 lvl=INFO msg=Written dataset metadata file to: ctb_metadata_files/cantabm_v9-3-0_best-effort_dataset-md_20220511-1.json +t=2022-05-11 10:10:38,940 lvl=INFO msg=Written table metadata file to: ctb_metadata_files/cantabm_v9-3-0_best-effort_tables-md_20220511-1.json +t=2022-05-11 10:10:38,940 lvl=INFO msg=Written service metadata file to: ctb_metadata_files/cantabm_v9-3-0_best-effort_service-md_20220511-1.json +``` + +Many lines contain strings such as `test/testdata/best_effort/Dataset.csv:4`. This means that an error has been +detected on row 4 of the `Dataset.csv` file; the header is row 1. + +The `--best-effort` flag is for debug purposes only. +
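The row numbers in these messages count the physical rows of the CSV source: the header is row 1, so the first data record is row 2 (the geography reader in this change numbers rows with `enumerate(reader, 2)` for the same reason). A minimal illustrative sketch of the convention, assuming a local `Dataset.csv` exists (not code from this repository):

```python
import csv

# The header occupies row 1, so the first data record is row 2.
# This mirrors the converter's own numbering, e.g. enumerate(reader, 2)
# in bin/ons_csv_to_ctb_json_geo.py.
with open('Dataset.csv', newline='') as infile:
    reader = csv.DictReader(infile)  # consumes the header (row 1)
    for row_num, row in enumerate(reader, 2):
        print(f'Dataset.csv:{row_num}', row)
```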
Using 2011 census teaching file metadata ---------------------------------------- @@ -105,49 +199,54 @@ can be found in the `sample_2011` directory. Use this command to convert the files to JSON (with debugging enabled): ``` -> python3 bin/ons_csv_to_ctb_json_main.py -i sample_2011/ -g sample_2011/geography.csv -o ctb_metadata_files/ -l DEBUG -t=2022-05-03 08:58:06,547 lvl=DEBUG msg=Creating classification for geographic variable: Region -t=2022-05-03 08:58:06,547 lvl=DEBUG msg=Creating classification for geographic variable: Country -t=2022-05-03 08:58:06,548 lvl=DEBUG msg=Loaded metadata for Cantabular variable: Residence Type -t=2022-05-03 08:58:06,548 lvl=DEBUG msg=Loaded metadata for Cantabular variable: Family Composition -t=2022-05-03 08:58:06,548 lvl=DEBUG msg=Loaded metadata for Cantabular variable: Population Base -t=2022-05-03 08:58:06,548 lvl=DEBUG msg=Loaded metadata for Cantabular variable: Sex -t=2022-05-03 08:58:06,548 lvl=DEBUG msg=Loaded metadata for Cantabular variable: Age -t=2022-05-03 08:58:06,548 lvl=DEBUG msg=Loaded metadata for Cantabular variable: Marital Status -t=2022-05-03 08:58:06,548 lvl=DEBUG msg=Loaded metadata for Cantabular variable: Student -t=2022-05-03 08:58:06,548 lvl=DEBUG msg=Loaded metadata for Cantabular variable: Country of Birth -t=2022-05-03 08:58:06,548 lvl=DEBUG msg=Loaded metadata for Cantabular variable: Health -t=2022-05-03 08:58:06,548 lvl=DEBUG msg=Loaded metadata for Cantabular variable: Ethnic Group -t=2022-05-03 08:58:06,548 lvl=DEBUG msg=Loaded metadata for Cantabular variable: Religion -t=2022-05-03 08:58:06,548 lvl=DEBUG msg=Loaded metadata for Cantabular variable: Economic Activity -t=2022-05-03 08:58:06,548 lvl=DEBUG msg=Loaded metadata for Cantabular variable: Occupation -t=2022-05-03 08:58:06,548 lvl=DEBUG msg=Loaded metadata for Cantabular variable: Industry -t=2022-05-03 08:58:06,548 lvl=DEBUG msg=Loaded metadata for Cantabular variable: Hours worked per week -t=2022-05-03 08:58:06,548 lvl=DEBUG msg=Loaded metadata for Cantabular variable: Approximated Social Grade -t=2022-05-03 08:58:06,549 lvl=DEBUG msg=Loaded metadata for Cantabular variable: Region -t=2022-05-03 08:58:06,549 lvl=DEBUG msg=Loaded metadata for Cantabular variable: Country -t=2022-05-03 08:58:06,549 lvl=INFO msg=Loaded metadata for 18 Cantabular variables -t=2022-05-03 08:58:06,549 lvl=DEBUG msg=Loaded metadata for Cantabular dataset: Teaching-Dataset -t=2022-05-03 08:58:06,549 lvl=INFO msg=Loaded metadata for 1 Cantabular datasets -t=2022-05-03 08:58:06,552 lvl=INFO msg=Written dataset metadata file to: ctb_metadata_files/dataset-metadata.json -t=2022-05-03 08:58:06,553 lvl=DEBUG msg=Loaded metadata for Cantabular table: LC2101EW -t=2022-05-03 08:58:06,553 lvl=DEBUG msg=Loaded metadata for Cantabular table: LC1117EW -t=2022-05-03 08:58:06,553 lvl=DEBUG msg=Loaded metadata for Cantabular table: LC2107EW -t=2022-05-03 08:58:06,553 lvl=DEBUG msg=Loaded metadata for Cantabular table: LC6107EW -t=2022-05-03 08:58:06,553 lvl=DEBUG msg=Loaded metadata for Cantabular table: LC6112EW -t=2022-05-03 08:58:06,553 lvl=INFO msg=Loaded metadata for 5 Cantabular tables -t=2022-05-03 08:58:06,554 lvl=INFO msg=Written table metadata file to: ctb_metadata_files/table-metadata.json -t=2022-05-03 08:58:06,554 lvl=INFO msg=Loaded service metadata -t=2022-05-03 08:58:06,554 lvl=INFO msg=Written service metadata file to: ctb_metadata_files/service-metadata.json +> python3 bin/ons_csv_to_ctb_json_main.py -i sample_2011/ -g sample_2011/geography.csv -o ctb_metadata_files/ -m 2001-sample -l DEBUG +t=2022-05-09 21:28:29,336 lvl=INFO msg=ons_csv_to_ctb_json_main.py version 1.1.beta +t=2022-05-09 21:28:29,336 lvl=INFO msg=CSV source
directory: sample_2011/ +t=2022-05-09 21:28:29,336 lvl=INFO msg=Geography file: sample_2011/geography.csv +t=2022-05-09 21:28:29,354 lvl=DEBUG msg=Creating classification for geographic variable: Region +t=2022-05-09 21:28:29,354 lvl=DEBUG msg=Creating classification for geographic variable: Country +t=2022-05-09 21:28:29,356 lvl=DEBUG msg=Loaded metadata for Cantabular variable: Residence Type +t=2022-05-09 21:28:29,356 lvl=DEBUG msg=Loaded metadata for Cantabular variable: Family Composition +t=2022-05-09 21:28:29,356 lvl=DEBUG msg=Loaded metadata for Cantabular variable: Population Base +t=2022-05-09 21:28:29,356 lvl=DEBUG msg=Loaded metadata for Cantabular variable: Sex +t=2022-05-09 21:28:29,356 lvl=DEBUG msg=Loaded metadata for Cantabular variable: Age +t=2022-05-09 21:28:29,356 lvl=DEBUG msg=Loaded metadata for Cantabular variable: Marital Status +t=2022-05-09 21:28:29,356 lvl=DEBUG msg=Loaded metadata for Cantabular variable: Student +t=2022-05-09 21:28:29,356 lvl=DEBUG msg=Loaded metadata for Cantabular variable: Country of Birth +t=2022-05-09 21:28:29,357 lvl=DEBUG msg=Loaded metadata for Cantabular variable: Health +t=2022-05-09 21:28:29,357 lvl=DEBUG msg=Loaded metadata for Cantabular variable: Ethnic Group +t=2022-05-09 21:28:29,357 lvl=DEBUG msg=Loaded metadata for Cantabular variable: Religion +t=2022-05-09 21:28:29,357 lvl=DEBUG msg=Loaded metadata for Cantabular variable: Economic Activity +t=2022-05-09 21:28:29,357 lvl=DEBUG msg=Loaded metadata for Cantabular variable: Occupation +t=2022-05-09 21:28:29,357 lvl=DEBUG msg=Loaded metadata for Cantabular variable: Industry +t=2022-05-09 21:28:29,357 lvl=DEBUG msg=Loaded metadata for Cantabular variable: Hours worked per week +t=2022-05-09 21:28:29,357 lvl=DEBUG msg=Loaded metadata for Cantabular variable: Approximated Social Grade +t=2022-05-09 21:28:29,357 lvl=DEBUG msg=Loaded metadata for Cantabular variable: Region +t=2022-05-09 21:28:29,357 lvl=DEBUG msg=Loaded metadata for Cantabular variable: Country +t=2022-05-09 21:28:29,357 lvl=INFO msg=Loaded metadata for 18 Cantabular variables +t=2022-05-09 21:28:29,358 lvl=DEBUG msg=Loaded metadata for Cantabular dataset: Teaching-Dataset +t=2022-05-09 21:28:29,358 lvl=INFO msg=Loaded metadata for 1 Cantabular datasets +t=2022-05-09 21:28:29,360 lvl=DEBUG msg=Loaded metadata for Cantabular table: LC2101EW +t=2022-05-09 21:28:29,360 lvl=DEBUG msg=Loaded metadata for Cantabular table: LC1117EW +t=2022-05-09 21:28:29,360 lvl=DEBUG msg=Loaded metadata for Cantabular table: LC2107EW +t=2022-05-09 21:28:29,360 lvl=DEBUG msg=Loaded metadata for Cantabular table: LC6107EW +t=2022-05-09 21:28:29,360 lvl=DEBUG msg=Loaded metadata for Cantabular table: LC6112EW +t=2022-05-09 21:28:29,360 lvl=INFO msg=Loaded metadata for 5 Cantabular tables +t=2022-05-09 21:28:29,361 lvl=INFO msg=Loaded service metadata +t=2022-05-09 21:28:29,361 lvl=INFO msg=Output files will be written in Cantabular 9.3.0 format +t=2022-05-09 21:28:29,364 lvl=INFO msg=Written dataset metadata file to: ctb_metadata_files/cantabm_v9-3-0_2001-sample_dataset-md_20220509-1.json +t=2022-05-09 21:28:29,365 lvl=INFO msg=Written table metadata file to: ctb_metadata_files/cantabm_v9-3-0_2001-sample_tables-md_20220509-1.json +t=2022-05-09 21:28:29,365 lvl=INFO msg=Written service metadata file to: ctb_metadata_files/cantabm_v9-3-0_2001-sample_service-md_20220509-1.json ``` Load the JSON files with cantabular-metadata ============================================ -To load the generated JSON files into `cantabular-metadata` (version 
9.3.0) run: +To load the generated JSON files into `cantabular-metadata` (version 9.3.0) run the following +commands, substituting the file names and paths as appropriate: ``` cd ctb_metadata_files -CANTABULAR_METADATA_GRAPHQL_TYPES_FILE=metadata.graphql CANTABULAR_METADATA_SERVICE_FILE=service-metadata.json CANTABULAR_METADATA_DATASET_FILES=dataset-metadata.json CANTABULAR_METADATA_TABLE_FILES=table-metadata.json /cantabular-metadata +CANTABULAR_METADATA_GRAPHQL_TYPES_FILE=metadata.graphql CANTABULAR_METADATA_SERVICE_FILE=cantabm_v9-3-0_unknown-metadata-version_service-md_20220428-1.json CANTABULAR_METADATA_DATASET_FILES=cantabm_v9-3-0_unknown-metadata-version_dataset-md_20220428-1.json CANTABULAR_METADATA_TABLE_FILES=cantabm_v9-3-0_unknown-metadata-version_tables-md_20220428-1.json /cantabular-metadata ``` The metadata can be queried via a GraphQL interface. By default this is accessible at: @@ -166,7 +265,7 @@ This query can be used to obtain information for a single named table: http://localhost:8493/graphql?query=%7Bservice%7Btables(names%3A%20%5B%22DS1%22%5D)%7Bname%20datasetName%20vars%20description%20label%20all%7D%7D%7D%0A Tests ------ +===== This repository has tests written using the `unittest` framework. They are run as part of Continuous Integration testing in the GitHub repository. They can be run manually by running this @@ -175,3 +274,43 @@ command from the base directory: ``` PYTHONPATH=test:bin python3 -m unittest -v ``` + +Other Cantabular versions +========================= + +The `-v` argument can be used to generate output files that are compatible with a different version of Cantabular. +At present only 9.2.0 and 9.3.0 are supported. If any other version is specified then the specified version +will be reflected in the output filenames, but `9.3.0` format will be used. + +To generate version 9.2.0 compatible files from the test data use the following command: +``` +> python3 bin/ons_csv_to_ctb_json_main.py -i test/testdata/ -g test/testdata/geography/geography.csv -o ctb_metadata_files/ -v 9.2.0 +t=2022-05-09 21:40:49,218 lvl=INFO msg=ons_csv_to_ctb_json_main.py version 1.1.beta +t=2022-05-09 21:40:49,218 lvl=INFO msg=CSV source directory: test/testdata/ +t=2022-05-09 21:40:49,218 lvl=INFO msg=Geography file: test/testdata/geography/geography.csv +t=2022-05-09 21:40:49,220 lvl=INFO msg=Reading test/testdata/geography/geography.csv: found Welsh labels for unknown classification: OTHER +t=2022-05-09 21:40:49,220 lvl=INFO msg=Dropped non public classification: CLASS_PRIV +t=2022-05-09 21:40:49,220 lvl=INFO msg=Dropped non public classification: GEO_PRIV +t=2022-05-09 21:40:49,220 lvl=INFO msg=Loaded metadata for 5 Cantabular variables +t=2022-05-09 21:40:49,221 lvl=INFO msg=Loaded metadata for 3 Cantabular datasets +t=2022-05-09 21:40:49,222 lvl=INFO msg=Dropped non public ONS Dataset: DS_PRIV +t=2022-05-09 21:40:49,222 lvl=INFO msg=Loaded metadata for 4 Cantabular tables +t=2022-05-09 21:40:49,222 lvl=INFO msg=Loaded service metadata +t=2022-05-09 21:40:49,222 lvl=INFO msg=Output files will be written in Cantabular 9.2.0 format +t=2022-05-09 21:40:49,223 lvl=INFO msg=Written dataset metadata file to: ctb_metadata_files/cantabm_v9-2-0_unknown-metadata-version_dataset-md_20220509-1.json +t=2022-05-09 21:40:49,223 lvl=INFO msg=Written service metadata file to: ctb_metadata_files/cantabm_v9-2-0_unknown-metadata-version_service-md_20220509-1.json +``` + +No tables metadata file is produced. The tables data is embedded in the service metadata file. 
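The choice of version affects only how the output files are written; the CSV loading stage is identical. A minimal sketch of the kind of dispatch involved (hypothetical function and key names, not the repository's actual code; `write_json` is assumed to be a callable that serialises a dict to a named JSON output file):

```python
def write_output_files(cantabular_version, service_md, dataset_md, tables_md, write_json):
    """Write the metadata in the format expected by the target Cantabular version."""
    if cantabular_version == '9.2.0':
        # 9.2.0 has no separate tables file: the table metadata is embedded
        # in the service metadata, so only two files are written.
        write_json('service-md', {**service_md, 'tables': tables_md})
        write_json('dataset-md', dataset_md)
    else:
        # 9.3.0 formatting, which is also the fallback for unrecognised versions.
        write_json('service-md', service_md)
        write_json('dataset-md', dataset_md)
        write_json('tables-md', tables_md)
```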
+ +To load the files into `cantabular-metadata` version 9.2.0 you need a different GraphQL types +file, which can be found at `ctb_metadata_files/metadata_9_2_0.graphql`. The files are also specified on +the command line instead of via environment variables. + + +To load the generated JSON files into `cantabular-metadata` (version 9.2.0) run the following +commands, substituting the file names and paths as appropriate: +``` +cd ctb_metadata_files +/cantabular-metadata metadata_9_2_0.graphql cantabm_v9-2-0_unknown-metadata-version_service-md_20220509-1.json cantabm_v9-2-0_unknown-metadata-version_dataset-md_20220509-1.json +``` diff --git a/RELEASE_NOTES.md b/RELEASE_NOTES.md index 2c1ad00..0425d85 100644 --- a/RELEASE_NOTES.md +++ b/RELEASE_NOTES.md @@ -1,6 +1,16 @@ Release Notes ============= +1.1.beta +-------- +- Added `--best-effort` flag to discard invalid data and make a best effort + attempt to generate output files. + - This replaces the `fixup.py` script. +- Formatted and customizable output filenames. +- Support for Cantabular version 9.2.0 formatting. +- Reworked mandatory fields. +- Added 2011 1% sample metadata. + 1.1.alpha --------- - Updated code to work with metadata schema version 1.1. diff --git a/bin/fixup.py b/bin/fixup.py deleted file mode 100644 index d73fbef..0000000 --- a/bin/fixup.py +++ /dev/null @@ -1,176 +0,0 @@ -""" -Fixup metadata source CSV files. - -This is a program for use in development. It modifies source files so that they can successfully -be loaded on ons_csv_to_ctb_main.py. - -""" - -import glob -import os -import logging -import csv -from argparse import ArgumentParser - -VERSION = '1.1.alpha' - - -def main(): - """Fixup metadata source CSV files.""" - parser = ArgumentParser(description='Program for fixing up metadata source CSV files.', - epilog=f'Version: {VERSION}') - - parser.add_argument('-i', '--input-dir', - type=str, - required=True, - help='Input directory containing CSV files') - - parser.add_argument('-o', '--output-dir', - type=str, - required=True, - help='Output directory to write fixed-up files') - - parser.add_argument('-g', '--geography-file', - type=str, - required=False, - help='Name of geography CSV file') - - args = parser.parse_args() - - logging.basicConfig(format='t=%(asctime)s lvl=%(levelname)s msg=%(message)s', level='INFO') - - for directory in (args.input_dir, args.output_dir): - if not os.path.isdir(directory): - raise ValueError(f'{directory} does not exist or is not a directory') - - for filename in glob.glob(os.path.join(args.input_dir, '*.csv')): - if args.geography_file and \ - os.path.abspath(filename) == os.path.abspath(args.geography_file): - continue - - basename = os.path.basename(filename) - out_filename = os.path.join(args.output_dir, basename) - with open(out_filename, 'w') as outfile: - # Main program expects input files in UTF-8 format.
- with open(filename, newline='', encoding='iso-8859-1') as infile: - reader = csv.DictReader(infile) - fieldnames = reader.fieldnames.copy() - if basename == 'Category.csv': - fieldnames.remove('variable_mnemonic') - fieldnames.append('Variable_Mnemonic') - - writer = csv.DictWriter(outfile, fieldnames) - writer.writeheader() - for line in reader: - if basename == 'Category.csv': - line['Variable_Mnemonic'] = line.pop('variable_mnemonic') - - if basename == 'Variable.csv': - line['Security_Mnemonic'] = 'PUB' - - if not line['Variable_Type_Code']: - line['Variable_Type_Code'] = 'DVO' - - elif basename == 'Classification.csv': - if not line['Number_Of_Category_Items']: - line['Number_Of_Category_Items'] = '0' - - if line['Classification_Mnemonic'] == 'hh_away_student_9a': - line['Number_Of_Category_Items'] = '8' - - if line['Classification_Mnemonic'] == 'hh_families_count_7a': - line['Number_Of_Category_Items'] = '8' - - if line['Classification_Mnemonic'] == 'legal_partnership_status_12a': - line['Number_Of_Category_Items'] = '12' - - if line['Classification_Mnemonic'] == 'moving_group_size_10000a': - line['Number_Of_Category_Items'] = '9716' - - if '_pop' in line['Variable_Mnemonic']: - continue - - if line['Classification_Mnemonic'] == 'hh_multi_ethnic_combination_23B': - line['Classification_Mnemonic'] = 'hh_multi_ethnic_combination_23b' - - elif basename == 'Topic_Classification.csv': - if line['Classification_Mnemonic'] == 'hh_multi_ethnic_combination_23B': - line['Classification_Mnemonic'] = 'hh_multi_ethnic_combination_23b' - - if line['Classification_Mnemonic'] == 'distance_to_work': - line['Classification_Mnemonic'] = 'distance_to_work_12002a' - - if line['Classification_Mnemonic'] == 'moving_group_number': - line['Classification_Mnemonic'] = 'moving_group_number_10000a' - - if line['Classification_Mnemonic'] == 'moving_group_size': - line['Classification_Mnemonic'] = 'moving_group_size_10000a' - - if line['Classification_Mnemonic'] in [ - 'dwelling_number', 'economic_activity_status_14a', - 'economic_activity_status_13a', 'economic_activity_status_12b', - 'economic_activity_status_11a', 'economic_activity_status_11b', - 'economic_activity_status_10b', 'economic_activity_status_9a', - 'economic_activity_status_7a', 'economic_activity_status_6a', - 'economic_activity_status_6b', 'economic_activity_status_5b', - 'economic_activity_status_4a', 'economic_activity_status_4b', - 'ethnic_group', 'travel_destination_wz']: - continue - - elif basename == 'Category.csv': - if line['Classification_Mnemonic'] == 'armed_forces_dependent_ind_5a': - continue - - if line['Classification_Mnemonic'] == 'moving_group_size': - line['Classification_Mnemonic'] = 'moving_group_size_10000a' - - if '_pop' in line['Classification_Mnemonic']: - continue - - elif basename == 'Dataset.csv': - line['Security_Mnemonic'] = 'PUB' - - elif basename == 'Release_Dataset.csv': - line['Census_Release_Number'] = '1' - - elif basename == 'Dataset_Variable.csv': - if line['Classification_Mnemonic'] == 'sex': - line['Classification_Mnemonic'] = 'sex_2a' - - elif basename == 'Database_Variable.csv': - if line['Variable_Mnemonic'] in ['dwelling_number', 'ethnic_group', - 'travel_destination_wz']: - continue - - writer.writerow(line) - - if basename == 'Topic.csv': - writer.writerow({ - 'Id': 13, - 'Topic_Mnemonic': 'HDS', - 'Topic_Description': 'HDS', - 'Topic_Description_Welsh': '', - 'Topic_Title': 'HDS', - 'Topic_Title_Welsh': ''}) - - logging.info(f'Read file from: {filename} and wrote modified file to: {out_filename}') 
- - if args.geography_file: - basename = os.path.basename(args.geography_file) - out_filename = os.path.join(args.output_dir, basename) - with open(out_filename, 'w') as outfile: - with open(args.geography_file, newline='') as infile: - for line in infile.read().splitlines(): - line = line.replace(',West Northamptonshireshire,', ',West Northamptonshire,') - outfile.write(line) - outfile.write('\n') - logging.info(f'Read geography file from: {args.geography_file} and wrote modified file to:' - f' {out_filename}') - - -if __name__ == '__main__': - try: - main() - except Exception as exception: - logging.error(exception) - raise exception diff --git a/bin/ons_csv_to_ctb_json_ds_vars.py b/bin/ons_csv_to_ctb_json_ds_vars.py index 3bf2180..1643ccb 100644 --- a/bin/ons_csv_to_ctb_json_ds_vars.py +++ b/bin/ons_csv_to_ctb_json_ds_vars.py @@ -1,4 +1,5 @@ """Build data structure that represents relationship between dataset and variables.""" +import logging from collections import namedtuple DatasetVariables = namedtuple('DatasetVariables', 'classifications alternate_geog_variables') @@ -7,7 +8,7 @@ class DatasetVarsBuilder(): """Utility class to validate and build dataset variables.""" - def __init__(self, dataset_mnemonic, filename, all_classifications): + def __init__(self, dataset_mnemonic, filename, all_classifications, recoverable_error): """Initialise DatasetVarsBuilder object.""" self.lowest_geog_variable = None self.alternate_geog_variables = [] @@ -16,71 +17,82 @@ def __init__(self, dataset_mnemonic, filename, all_classifications): self.dataset_mnemonic = dataset_mnemonic self.filename = filename self.all_classifications = all_classifications + self.recoverable_error = recoverable_error - def add_geographic_variable(self, variable): + def add_geographic_variable(self, variable, row_num): """Add geographic variable ensuring data integrity.""" variable_mnemonic = variable['Variable_Mnemonic'] classification_mnemonic = variable['Classification_Mnemonic'] if classification_mnemonic: - raise ValueError(f'Reading {self.filename} ' - 'Classification_Mnemonic must not be specified for ' - f'geographic variable {variable_mnemonic} in dataset ' - f'{self.dataset_mnemonic}') + self.recoverable_error(f'Reading {self.filename}:{row_num} ' + 'Classification_Mnemonic must not be specified for ' + f'geographic variable {variable_mnemonic} in dataset ' + f'{self.dataset_mnemonic}') if variable['Processing_Priority']: - raise ValueError(f'Reading {self.filename} ' - 'Processing_Priority must not be specified for geographic' - f' variable {variable_mnemonic} in dataset ' - f'{self.dataset_mnemonic}') + self.recoverable_error(f'Reading {self.filename}:{row_num} ' + 'Processing_Priority must not be specified for geographic' + f' variable {variable_mnemonic} in dataset ' + f'{self.dataset_mnemonic}') if variable['Lowest_Geog_Variable_Flag'] == 'Y': if self.lowest_geog_variable: - raise ValueError(f'Reading {self.filename} ' - 'Lowest_Geog_Variable_Flag set on variable ' - f'{variable_mnemonic} and ' - f'{self.lowest_geog_variable} for dataset ' - f'{self.dataset_mnemonic}') - self.lowest_geog_variable = variable['Variable_Mnemonic'] + self.recoverable_error(f'Reading {self.filename}:{row_num} ' + 'Lowest_Geog_Variable_Flag set on variable ' + f'{variable_mnemonic} and ' + f'{self.lowest_geog_variable} for dataset ' + f'{self.dataset_mnemonic}') + else: + self.lowest_geog_variable = variable['Variable_Mnemonic'] else: self.alternate_geog_variables.append(variable['Variable_Mnemonic']) - def 
add_non_geographic_variable(self, variable): + def add_non_geographic_variable(self, variable, row_num): """Add non-geographic variable ensuring data integrity.""" variable_mnemonic = variable['Variable_Mnemonic'] classification_mnemonic = variable['Classification_Mnemonic'] if not classification_mnemonic: - raise ValueError(f'Reading {self.filename} ' - 'Classification must be specified for non-geographic ' - f'{variable_mnemonic} in dataset {self.dataset_mnemonic}') + self.recoverable_error(f'Reading {self.filename}:{row_num} ' + 'Classification must be specified for non-geographic ' + f'{variable_mnemonic} in dataset {self.dataset_mnemonic}') + logging.warning(f'Reading {self.filename}:{row_num} dropping record') + return + if variable['Lowest_Geog_Variable_Flag'] == 'Y': - raise ValueError(f'Reading {self.filename} ' - 'Lowest_Geog_Variable_Flag set on non-geographic variable' - f' {variable_mnemonic} for dataset {self.dataset_mnemonic}') + self.recoverable_error(f'Reading {self.filename}:{row_num} ' + 'Lowest_Geog_Variable_Flag set on non-geographic variable ' + f'{variable_mnemonic} for dataset {self.dataset_mnemonic}') classification = self.all_classifications[classification_mnemonic] if classification.private['Variable_Mnemonic'] != variable_mnemonic: - raise ValueError(f'Reading {self.filename} Invalid ' - f'classification {classification_mnemonic} ' - f'specified for variable {variable_mnemonic} ' - f'in dataset {self.dataset_mnemonic}') + self.recoverable_error(f'Reading {self.filename}:{row_num} Invalid ' + f'classification {classification_mnemonic} ' + f'specified for variable {variable_mnemonic} ' + f'in dataset {self.dataset_mnemonic}') + logging.warning(f'Reading {self.filename}:{row_num} dropping record') + return + if not variable['Processing_Priority']: - raise ValueError(f'Reading {self.filename} ' - 'Processing_Priority not specified for classification ' - f'{classification_mnemonic} in dataset ' - f'{self.dataset_mnemonic}') + self.recoverable_error(f'Reading {self.filename}:{row_num} ' + 'Processing_Priority not specified for classification ' + f'{classification_mnemonic} in dataset ' + f'{self.dataset_mnemonic}') + logging.warning(f'Reading {self.filename}:{row_num} using 0 for Processing_Priority') + variable['Processing_Priority'] = 0 + self.classifications.append(variable['Classification_Mnemonic']) self.processing_priorities.append(int(variable['Processing_Priority'])) def dataset_variables(self): """Return dataset classifications and alternate geographic variables for each dataset.""" if self.alternate_geog_variables and not self.lowest_geog_variable: - raise ValueError(f'Reading {self.filename} ' - 'Lowest_Geog_Variable_Flag not set on any geographic variables ' - f'for dataset {self.dataset_mnemonic}') + self.recoverable_error(f'Reading {self.filename} ' + 'Lowest_Geog_Variable_Flag not set on any geographic variables ' + f'for dataset {self.dataset_mnemonic}') if set(self.processing_priorities) != set(range(1, len(self.processing_priorities) + 1)): - raise ValueError(f'Reading {self.filename} ' - 'Invalid processing_priorities ' - f'{self.processing_priorities} for dataset ' - f'{self.dataset_mnemonic}') + self.recoverable_error(f'Reading {self.filename} ' + 'Invalid processing_priorities ' + f'{self.processing_priorities} for dataset ' + f'{self.dataset_mnemonic}') classifications = [c for _, c in sorted(zip(self.processing_priorities, self.classifications))] diff --git a/bin/ons_csv_to_ctb_json_geo.py b/bin/ons_csv_to_ctb_json_geo.py index ec78865..57359c6 
100644 --- a/bin/ons_csv_to_ctb_json_geo.py +++ b/bin/ons_csv_to_ctb_json_geo.py @@ -40,13 +40,11 @@ def read_geo_cats(filename): var_to_columns = assign_columns_to_variables(filename, fieldnames) data = {var_name: {} for var_name in var_to_columns} - for row in reader: + for row_num, row in enumerate(reader, 2): if len(row) > len(fieldnames): - raise ValueError(f'Reading {filename}: too many fields on line ' - f'{reader.line_num}') + raise ValueError(f'Reading {filename}: too many fields on row {row_num}') if len(row) < len(fieldnames): - raise ValueError(f'Reading {filename}: too few fields on line ' - f'{reader.line_num}') + raise ValueError(f'Reading {filename}: too few fields on row {row_num}') for geo, columns in var_to_columns.items(): code = row[columns.code].strip() diff --git a/bin/ons_csv_to_ctb_json_load.py b/bin/ons_csv_to_ctb_json_load.py index d7b12ed..037715e 100644 --- a/bin/ons_csv_to_ctb_json_load.py +++ b/bin/ons_csv_to_ctb_json_load.py @@ -19,6 +19,11 @@ def isnumeric(string): return string.isnumeric() +def is_y_or_n(string): + """Return true if the string is either 'Y' or 'N'.""" + return string in ['Y', 'N'] + + def isoneof(valid_values): """Return a function that checks whether the value is in the specified set of values.""" valid_values_set = set(valid_values) @@ -55,10 +60,26 @@ class Loader: Many of the fields in this class are cached properties, with the data loaded on first access. """ - def __init__(self, input_directory, geography_file): + def __init__(self, input_directory, geography_file, best_effort=False): """Initialise MetadataLoader object.""" self.input_directory = input_directory self.geography_file = geography_file + self._error_count = 0 + + def raise_value_error(msg): + """Raise a ValueError exception.""" + raise ValueError(msg) + + def log_error(msg): + """Log the error.""" + self._error_count += 1 + logging.warning(msg) + + self.recoverable_error = log_error if best_effort else raise_value_error + + def error_count(self): + """Return number of errors.""" + return self._error_count def read_file(self, filename, columns, unique_combo_fields=None): """ @@ -68,7 +89,7 @@ def read_file(self, filename, columns, unique_combo_fields=None): and corresponding line number. 
""" full_filename = self.full_filename(filename) - return Reader(full_filename, columns, unique_combo_fields).read() + return Reader(full_filename, columns, self.recoverable_error, unique_combo_fields).read() def full_filename(self, filename): """Add the input_directory path to the filename.""" @@ -102,6 +123,7 @@ def sources(self): required('Source_Mnemonic', unique=True), required('Source_Description'), required('Id'), + required('Version'), optional('Source_Description_Welsh'), optional('Copyright_Statement'), @@ -113,7 +135,6 @@ def sources(self): optional('SDC_Link'), optional('SDC_Statement'), optional('SDC_Statement_Welsh'), - optional('Version'), optional('Contact_Id', validate_fn=isoneof(self.contacts.keys())), ] source_rows = self.read_file('Source.csv', columns) @@ -170,9 +191,9 @@ def security_classifications(self): columns = [ required('Security_Mnemonic', unique=True), required('Id'), + required('Security_Description'), optional('Security_Description_Welsh'), - optional('Security_Description'), ] security_classification_rows = self.read_file(filename, columns) @@ -229,6 +250,7 @@ def datasets(self): required('Statistical_Unit', validate_fn=isoneof(self.statistical_units.keys())), required('Version'), required('Dataset_Description'), + required('Signed_Off_Flag', validate_fn=is_y_or_n), optional('Dataset_Title_Welsh'), optional('Dataset_Description_Welsh'), @@ -237,7 +259,6 @@ def datasets(self): optional('Dataset_Population_Welsh'), optional('Last_Updated'), optional('Unique_Url'), - optional('Signed_Off_Flag'), optional('Contact_Id', validate_fn=isoneof(self.contacts.keys())), ] dataset_rows = self.read_file(filename, columns) @@ -251,7 +272,7 @@ def datasets(self): dataset_to_variables = self.load_dataset_to_variables(dataset_mnemonics) datasets = {} - for dataset, line_num in dataset_rows: + for dataset, row_num in dataset_rows: dataset_mnemonic = dataset.pop('Dataset_Mnemonic') database_mnemonic = dataset.pop('Database_Mnemonic') @@ -270,27 +291,42 @@ def datasets(self): dataset_variables = dataset_to_variables.get( dataset_mnemonic, DatasetVariables([], [])) + alternate_geog_variables = (dataset_variables.alternate_geog_variables if + dataset_variables.alternate_geog_variables else []) + dataset['Alternate_Geographic_Variables'] = alternate_geog_variables + all_classifications = dataset_variables.classifications + alternate_geog_variables + # If the dataset is public then ensure that there is at least one classification and # that all the classifications are also public. 
if dataset['Security_Mnemonic'] == PUBLIC_SECURITY_MNEMONIC: + drop_dataset = False if not dataset_variables.classifications: - raise ValueError( - f'Reading {self.full_filename(filename)}:{line_num} {dataset_mnemonic} ' + self.recoverable_error( + f'Reading {self.full_filename(filename)}:{row_num} {dataset_mnemonic} ' 'has no associated classifications or geographic variable') + drop_dataset = True - for classification in dataset_variables.classifications: + for classification in all_classifications: if self.classifications[classification].private['Security_Mnemonic'] != \ PUBLIC_SECURITY_MNEMONIC: - raise ValueError( - f'Reading {self.full_filename(filename)}:{line_num} Public ONS ' + self.recoverable_error( + f'Reading {self.full_filename(filename)}:{row_num} Public ONS ' f'dataset {dataset_mnemonic} has non-public classification ' f'{classification}') + drop_dataset = True + if classification not in \ self.databases[database_mnemonic].private['Classifications']: - raise ValueError( - f'Reading {self.full_filename(filename)}:{line_num} ' + self.recoverable_error( + f'Reading {self.full_filename(filename)}:{row_num} ' f'{dataset_mnemonic} has classification {classification} ' f'that is not in database {database_mnemonic}') + drop_dataset = True + + if drop_dataset: + logging.warning( + f'Reading {self.full_filename(filename)}:{row_num} dropping record') + continue del dataset['Id'] del dataset['Signed_Off_Flag'] @@ -323,10 +359,11 @@ def databases(self): required('Id'), required('Database_Description'), required('Version'), + # This should be mandatory but is not yet populated + optional('Cantabular_DB_Flag', validate_fn=is_y_or_n), optional('Database_Title_Welsh'), optional('Database_Description_Welsh'), - optional('Cantabular_DB_Flag'), optional('IAR_Asset_Id'), ] database_rows = self.read_file('Database.csv', columns) @@ -392,10 +429,10 @@ def categories(self): unique_combo_fields=['Category_Code', 'Classification_Mnemonic']) classification_to_cats = {} - for cat, line_num in category_rows: + for cat, row_num in category_rows: classification_mnemonic = cat['Classification_Mnemonic'] if self.classifications[classification_mnemonic].private['Is_Geographic']: - raise ValueError(f'Reading {self.full_filename(filename)}:{line_num} ' + raise ValueError(f'Reading {self.full_filename(filename)}:{row_num} ' 'found category for geographic classification ' f'{classification_mnemonic}: all categories for geographic ' 'classifications must be in a separate lookup file') @@ -407,9 +444,10 @@ def categories(self): num_cat_items = \ self.classifications[classification_mnemonic].private['Number_Of_Category_Items'] if num_cat_items and len(one_var_categories) != num_cat_items: - raise ValueError(f'Reading {self.full_filename(filename)} ' - f'Unexpected number of categories for {classification_mnemonic}: ' - f'expected {num_cat_items} but found {len(one_var_categories)}') + self.recoverable_error( + f'Reading {self.full_filename(filename)} ' + f'Unexpected number of categories for {classification_mnemonic}: ' + f'expected {num_cat_items} but found {len(one_var_categories)}') welsh_cats = {cat['Category_Code']: cat['External_Category_Label_Welsh'] for cat in one_var_categories if cat['External_Category_Label_Welsh']} @@ -429,8 +467,9 @@ def categories(self): continue if not self.classifications[class_name].private['Is_Geographic']: - raise ValueError(f'Reading {self.geography_file}: found Welsh labels for non ' - f'geographic classification: {class_name}') + self.recoverable_error(f'Reading 
{self.geography_file}: found Welsh labels for ' + f'non geographic classification: {class_name}') + continue welsh_names = {cd: nm.welsh_name for cd, nm in geo_cats.items() if nm.welsh_name} if geo_cats: @@ -537,9 +576,17 @@ def variables(self): required('Variable_Type_Code', validate_fn=isoneof(self.variable_types.keys())), required('Variable_Title'), required('Variable_Description'), - optional('Statistical_Unit', validate_fn=isoneof(self.statistical_units.keys())), required('Id'), required('Version'), + required('Signed_Off_Flag', validate_fn=is_y_or_n), + + # Required for non-geographic variables but not always populated in source files + optional('Statistical_Unit', validate_fn=isoneof(self.statistical_units.keys())), + + # Required for geographic variables but not yet populated + optional('Geographic_Abbreviation'), + optional('Geographic_Theme'), + optional('Geographic_Coverage'), optional('Variable_Title_Welsh'), optional('Variable_Description_Welsh'), @@ -548,14 +595,10 @@ def variables(self): optional('Comparability_Comments_Welsh'), optional('Uk_Comparison_Comments'), optional('Uk_Comparison_Comments_Welsh'), - optional('Geographic_Abbreviation'), optional('Geographic_Abbreviation_Welsh'), - optional('Geographic_Theme'), optional('Geographic_Theme_Welsh'), - optional('Geographic_Coverage'), optional('Geographic_Coverage_Welsh'), optional('Topic_Mnemonic', validate_fn=isoneof(self.topics.keys())), - optional('Signed_Off_Flag'), optional('Number_Of_Classifications'), optional('Quality_Statement_Text'), optional('Quality_Summary_URL'), @@ -566,19 +609,34 @@ def variables(self): variable_to_keywords = self.load_variable_to_keywords(variable_mnemonics) variable_to_source_questions = self.load_variable_to_questions(variable_mnemonics) - geo_fields = {'Geographic_Abbreviation', 'Geographic_Abbreviation_Welsh', - 'Geographic_Theme', 'Geographic_Theme_Welsh', 'Geographic_Coverage', - 'Geographic_Coverage_Welsh'} + en_geo_fields = {'Geographic_Abbreviation', 'Geographic_Theme', 'Geographic_Coverage'} + all_geo_fields = en_geo_fields | {'Geographic_Abbreviation_Welsh', + 'Geographic_Theme_Welsh', + 'Geographic_Coverage_Welsh'} variables = {} - for variable, line_num in variable_rows: + for variable, row_num in variable_rows: # Ensure that non-geographic variables do not have geographic values set. 
is_geographic = variable['Variable_Type_Code'] == GEOGRAPHIC_VARIABLE_TYPE if not is_geographic: - for geo_field in geo_fields: + # This value is not always populated in source files + # if not variable['Statistical_Unit']: + # raise ValueError(f'Reading {self.full_filename(filename)}:{row_num} ' + # f'no Statistical_Unit specified for non geographic variable: ' + # f'{variable["Variable_Mnemonic"]}') + for geo_field in all_geo_fields: if variable[geo_field]: - raise ValueError(f'Reading {self.full_filename(filename)}:{line_num} ' - f'{geo_field} specified for non geographic variable: ' - f'{variable["Variable_Mnemonic"]}') + self.recoverable_error(f'Reading {self.full_filename(filename)}:{row_num} ' + f'{geo_field} specified for non geographic ' + f'variable: {variable["Variable_Mnemonic"]}') + + # These values are not yet populated in source files + # else: + # for geo_field in en_geo_fields: + # if not variable[geo_field]: + # raise ValueError(f'Reading {self.full_filename(filename)}:{row_num} ' + # f'no {geo_field} specified for geographic variable: ' + # f'{variable["Variable_Mnemonic"]}') + variable_title = Bilingual( variable.pop('Variable_Title'), variable.pop('Variable_Title_Welsh')) @@ -638,18 +696,18 @@ def classifications(self): columns = [ required('Id'), required('Classification_Mnemonic', unique=True), - required('Number_Of_Category_Items', validate_fn=isnumeric), required('Variable_Mnemonic', validate_fn=isoneof(self.variables.keys())), required('Internal_Classification_Label_English'), required('Security_Mnemonic', validate_fn=isoneof(self.security_classifications)), required('Version'), + required('Signed_Off_Flag', validate_fn=is_y_or_n), + optional('Number_Of_Category_Items', validate_fn=isnumeric), optional('External_Classification_Label_English'), optional('External_Classification_Label_Welsh'), optional('Mnemonic_2011'), optional('Parent_Classification_Mnemonic'), optional('Default_Classification_Flag'), - optional('Signed_Off_Flag'), optional('Flat_Classification_Flag'), ] classification_rows = self.read_file(filename, columns) @@ -658,11 +716,11 @@ def classifications(self): classification_to_topics = self.load_classification_to_topics(classification_mnemonics) classifications = {} - for classification, line_num in classification_rows: + for classification, row_num in classification_rows: variable_mnemonic = classification.pop('Variable_Mnemonic') classification_mnemonic = classification.pop('Classification_Mnemonic') if self.variables[variable_mnemonic].private['Is_Geographic']: - raise ValueError(f'Reading {self.full_filename(filename)}:{line_num} ' + raise ValueError(f'Reading {self.full_filename(filename)}:{row_num} ' f'{classification_mnemonic} has a geographic variable ' f'{variable_mnemonic} which is not allowed') @@ -679,7 +737,7 @@ def classifications(self): if classification['Security_Mnemonic'] == PUBLIC_SECURITY_MNEMONIC: variable = classification['ONS_Variable'] if variable.private['Security_Mnemonic'] != PUBLIC_SECURITY_MNEMONIC: - raise ValueError(f'Reading {self.full_filename(filename)}:{line_num} ' + raise ValueError(f'Reading {self.full_filename(filename)}:{row_num} ' f'Public classification {classification_mnemonic} has ' f'non-public variable {variable_mnemonic}') @@ -687,7 +745,8 @@ def classifications(self): del classification['Flat_Classification_Flag'] del classification['Id'] - num_cat_items = int(classification.pop('Number_Of_Category_Items')) + num_cat_items = classification.pop('Number_Of_Category_Items') + num_cat_items = 
int(num_cat_items) if num_cat_items else 0 classifications[classification_mnemonic] = BilingualDict( classification, @@ -772,20 +831,23 @@ def load_database_to_variables(self, database_mnemonics): if db_var['Lowest_Geog_Variable_Flag'] == 'Y': if not is_geographic: - raise ValueError(f'Reading {self.full_filename(filename)} ' - 'Lowest_Geog_Variable_Flag set on non-geographic variable' - f' {variable_mnemonic} for database {database_mnemonic}') - if lowest_geog_var: - raise ValueError(f'Reading {self.full_filename(filename)} ' - f'Lowest_Geog_Variable_Flag set on {variable_mnemonic} ' - f'and {lowest_geog_var} for database {database_mnemonic}') - lowest_geog_var = variable_mnemonic + self.recoverable_error( + f'Reading {self.full_filename(filename)} ' + 'Lowest_Geog_Variable_Flag set on non-geographic variable' + f' {variable_mnemonic} for database {database_mnemonic}') + elif lowest_geog_var: + self.recoverable_error( + f'Reading {self.full_filename(filename)} ' + f'Lowest_Geog_Variable_Flag set on {variable_mnemonic} ' + f'and {lowest_geog_var} for database {database_mnemonic}') + else: + lowest_geog_var = variable_mnemonic variables.append(variable_mnemonic) if not lowest_geog_var and contains_geo_vars: - raise ValueError(f'Reading {self.full_filename(filename)} ' - 'Lowest_Geog_Variable_Flag not set on any geographic variable ' - f'for database {database_mnemonic}') + self.recoverable_error(f'Reading {self.full_filename(filename)} ' + 'Lowest_Geog_Variable_Flag not set on any geographic ' + f'variable for database {database_mnemonic}') database_to_variables[database_mnemonic] = DatabaseVariables( variables=variables, lowest_geog_variable=lowest_geog_var) @@ -952,18 +1014,19 @@ def load_dataset_to_variables(self, dataset_mnemonics): unique_combo_fields=['Dataset_Mnemonic', 'Variable_Mnemonic']) ds_to_vars_builder = {} - for ds_variable, _ in dataset_variable_rows: + for ds_variable, row_num in dataset_variable_rows: dataset_mnemonic = ds_variable['Dataset_Mnemonic'] variable_mnemonic = ds_variable['Variable_Mnemonic'] if dataset_mnemonic not in ds_to_vars_builder: ds_to_vars_builder[dataset_mnemonic] = DatasetVarsBuilder( - dataset_mnemonic, self.full_filename(filename), self.classifications) + dataset_mnemonic, self.full_filename(filename), self.classifications, + self.recoverable_error) vars_builder = ds_to_vars_builder[dataset_mnemonic] if self.variables[variable_mnemonic].private['Is_Geographic']: - vars_builder.add_geographic_variable(ds_variable) + vars_builder.add_geographic_variable(ds_variable, row_num) else: - vars_builder.add_non_geographic_variable(ds_variable) + vars_builder.add_non_geographic_variable(ds_variable, row_num) ds_to_variables = {} for dataset_mnemonic, vars_builder in ds_to_vars_builder.items(): diff --git a/bin/ons_csv_to_ctb_json_main.py b/bin/ons_csv_to_ctb_json_main.py index 2e1495f..78333f5 100644 --- a/bin/ons_csv_to_ctb_json_main.py +++ b/bin/ons_csv_to_ctb_json_main.py @@ -2,11 +2,47 @@ import json import os import logging +import re +from pathlib import Path from argparse import ArgumentParser +from datetime import date from ons_csv_to_ctb_json_load import Loader, PUBLIC_SECURITY_MNEMONIC from ons_csv_to_ctb_json_bilingual import BilingualDict, Bilingual -VERSION = '1.1.alpha' +VERSION = '1.1.beta' + +SYSTEM = 'cantabm' +DEFAULT_CANTABULAR_VERSION = '9.3.0' +CANTABULAR_V9_2_0 = '9.2.0' +FILE_CONTENT_TYPE_DATASET = 'dataset-md' +FILE_CONTENT_TYPE_TABLES = 'tables-md' +FILE_CONTENT_TYPE_SERVICE = 'service-md' +KNOWN_CANTABULAR_VERSIONS = 
[DEFAULT_CANTABULAR_VERSION, CANTABULAR_V9_2_0]
+
+
+def filename_segment(value):
+    """Check that the string is valid for use as part of a filename."""
+    for character in value:
+        if not character.isalnum() and character not in '-_. ':
+            raise ValueError(f"invalid value: '{value}'")
+    return value
+
+
+def positive_int(value):
+    """Check that the value is an integer greater than or equal to 0."""
+    # An exception will be raised if value is not an int
+    number = int(value)
+    if number < 0:
+        raise ValueError(f"invalid value: '{value}'")
+    return number
+
+
+def cantabular_version_string(value):
+    """Check that the version is of format x.y.z."""
+    value = value.strip()
+    if not re.match(r'^\d+\.\d+\.\d+$', value):
+        raise ValueError(f"invalid value: '{value}'")
+    return value
 
 
 def main():
@@ -41,41 +77,153 @@ def main():
                         choices=['CRITICAL', 'ERROR', 'WARNING', 'INFO', 'DEBUG'],
                         help='Log level (default: %(default)s)')
 
+    parser.add_argument('-p', '--file_prefix',
+                        type=str,
+                        choices=['d', 't', 'tu'],
+                        help='Prefix to use in output filenames: d=dev, t=test, tu=tuning '
+                             '(default: no prefix i.e. operational)')
+
+    parser.add_argument('-m', '--metadata_master_version',
+                        type=filename_segment,
+                        default='unknown-metadata-version',
+                        help='Metadata master version to use in output filenames '
+                             '(default: %(default)s)')
+
+    parser.add_argument('-b', '--build_number',
+                        type=positive_int,
+                        default=1,
+                        help='Build number to use in output filenames '
+                             '(default: %(default)s)')
+
+    parser.add_argument('-v', '--cantabular-version',
+                        type=cantabular_version_string,
+                        default=DEFAULT_CANTABULAR_VERSION,
+                        help='Cantabular version for output files. The supported versions are '
+                             f'[{", ".join(KNOWN_CANTABULAR_VERSIONS)}]. If any other version is '
+                             'supplied then it will be used in the filename, but version '
+                             f'{DEFAULT_CANTABULAR_VERSION} formatting will be used. '
+                             '(default: %(default)s)')
+
+    parser.add_argument('--best-effort',
+                        action='store_true',
+                        help='Discard invalid data instead of failing on the first error and '
+                             'make a best effort attempt to produce valid output files.')
+
     args = parser.parse_args()
 
     logging.basicConfig(format='t=%(asctime)s lvl=%(levelname)s msg=%(message)s',
                         level=args.log_level)
 
+    logging.info(f'{Path(__file__).name} version {VERSION}')
+    logging.info(f'CSV source directory: {args.input_dir}')
+    if args.geography_file:
+        logging.info(f'Geography file: {args.geography_file}')
+
     for directory in (args.input_dir, args.output_dir):
         if not os.path.isdir(directory):
             raise ValueError(f'{directory} does not exist or is not a directory')
 
+    todays_date = date.today().strftime('%Y%m%d')
+    base_filename_template = output_filename_template(
+        args.file_prefix, args.cantabular_version, args.metadata_master_version, todays_date,
+        args.build_number)
+    # e.g. with the default arguments on 2022-05-09 this yields
+    # 'cantabm_v9-3-0_unknown-metadata-version_{}_20220509-1.json'; the '{}'
+    # placeholder is filled with the file content type as each file is written
+
     # loader is used to load the metadata from CSV files and convert it to JSON.
-    loader = Loader(args.input_dir, args.geography_file)
+    loader = Loader(args.input_dir, args.geography_file, best_effort=args.best_effort)
 
-    # Build Cantabular variable and dataset objects and write them to a JSON file.
+    # Build Cantabular variable objects.
     # A Cantabular variable is equivalent to an ONS classification.
-    # A Cantabular dataset is equivalent to an ONS database.
     ctb_variables = build_ctb_variables(loader.classifications, loader.categories)
+
+    # Build Cantabular dataset objects.
+    # A Cantabular dataset is equivalent to an ONS database.
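+    # (The objects are not written out here: the output format and filenames
+    # are decided below once the requested Cantabular version is known.)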
ctb_datasets = build_ctb_datasets(loader.databases, ctb_variables) - filename = os.path.join(args.output_dir, 'dataset-metadata.json') - with open(filename, 'w') as jsonfile: - json.dump(ctb_datasets, jsonfile, indent=4) - logging.info(f'Written dataset metadata file to: {filename}') - # Build Cantabular table objects and write to JSON. + # Build Cantabular table objects. + # A Cantabular table is equivalent to an ONS dataset. ctb_tables = build_ctb_tables(loader.datasets) - filename = os.path.join(args.output_dir, 'table-metadata.json') - with open(filename, 'w') as jsonfile: - json.dump(ctb_tables, jsonfile, indent=4) - logging.info(f'Written table metadata file to: {filename}') - # Build Cantabular service metadata objects and write to JSON. + # Build Cantabular service metadata. service_metadata = build_ctb_service_metadata() - filename = os.path.join(args.output_dir, 'service-metadata.json') - with open(filename, 'w') as jsonfile: - json.dump(service_metadata, jsonfile, indent=4) - logging.info(f'Written service metadata file to: {filename}') + + error_count = loader.error_count() + if error_count: + logging.warning(f'{error_count} errors were encountered during processing') + + # There is not a separate tables file for v9.2.0. Use the output_file_types list + # to determine which file types will be written. + output_file_types = [FILE_CONTENT_TYPE_DATASET, FILE_CONTENT_TYPE_SERVICE, + FILE_CONTENT_TYPE_TABLES] + + if args.cantabular_version == DEFAULT_CANTABULAR_VERSION: + logging.info( + f'Output files will be written in Cantabular {args.cantabular_version} format') + + elif args.cantabular_version == CANTABULAR_V9_2_0: + output_file_types = [FILE_CONTENT_TYPE_DATASET, FILE_CONTENT_TYPE_SERVICE] + convert_json_to_ctb_v9_2_0(ctb_datasets, ctb_tables, service_metadata) + logging.info( + f'Output files will be written in Cantabular {args.cantabular_version} format') + + else: + logging.info( + f'{args.cantabular_version} is an unknown Cantabular version: files will be written ' + f'using {DEFAULT_CANTABULAR_VERSION} format') + + if FILE_CONTENT_TYPE_DATASET in output_file_types: + filename = os.path.join(args.output_dir, + base_filename_template.format(FILE_CONTENT_TYPE_DATASET)) + with open(filename, 'w') as jsonfile: + json.dump(ctb_datasets, jsonfile, indent=4) + logging.info(f'Written dataset metadata file to: {filename}') + + if FILE_CONTENT_TYPE_TABLES in output_file_types: + filename = os.path.join(args.output_dir, + base_filename_template.format(FILE_CONTENT_TYPE_TABLES)) + with open(filename, 'w') as jsonfile: + json.dump(ctb_tables, jsonfile, indent=4) + logging.info(f'Written table metadata file to: {filename}') + + if FILE_CONTENT_TYPE_SERVICE in output_file_types: + filename = os.path.join(args.output_dir, + base_filename_template.format(FILE_CONTENT_TYPE_SERVICE)) + with open(filename, 'w') as jsonfile: + json.dump(service_metadata, jsonfile, indent=4) + logging.info(f'Written service metadata file to: {filename}') + + +def convert_json_to_ctb_v9_2_0(ctb_datasets, ctb_tables, service_metadata): + """Convert JSON to Cantabular v9.2.0 format.""" + for dataset in ctb_datasets: + dataset['meta']['description'] = dataset.pop('description') + for variable in dataset['vars'] if dataset['vars'] else []: + variable['meta']['description'] = variable.pop('description') + + service_metadata[0]['meta']['tables'] = [] + service_metadata[1]['meta']['tables'] = [] + for table in ctb_tables: + for idx in [0, 1]: + localized_table = { + 'name': table['name'], + 'label': 
table['ref'][idx]['label'], + 'description': table['ref'][idx]['description'], + 'datasetName': table['datasetName'], + 'vars': table['vars'], + 'meta': table['ref'][idx]['meta'], + } + service_metadata[idx]['meta']['tables'].append(localized_table) + + +def output_filename_template(prefix, cantabular_version, metadata_master_version, todays_date, + build_number): + """Generate template for output filename.""" + system_software_version = 'v' + cantabular_version.replace('.', '-') + filename = (f'{SYSTEM}_{system_software_version}_{metadata_master_version}_{{}}_' + f'{todays_date}-{build_number}.json') + if prefix: + filename = prefix + '_' + filename + + return filename def build_ctb_variables(classifications, cat_labels): @@ -134,6 +282,7 @@ def build_ctb_datasets(databases, ctb_variables): 'Source': { 'Source_Mnemonic': 'Census2021', 'Source_Description': 'The 2021 England and Wales Census', + 'Version': '1', }, 'Version': '1' }, diff --git a/bin/ons_csv_to_ctb_json_read.py b/bin/ons_csv_to_ctb_json_read.py index 8d7b5dc..cb3e219 100644 --- a/bin/ons_csv_to_ctb_json_read.py +++ b/bin/ons_csv_to_ctb_json_read.py @@ -1,5 +1,6 @@ """Load metadata from CSV files and export in JSON format.""" import csv +import logging from collections import namedtuple Column = namedtuple('Column', 'name unique validate_fn required') @@ -15,13 +16,13 @@ def optional(name, unique=False, validate_fn=None): return Column(name, unique, validate_fn, required=False) -Row = namedtuple('Row', 'data line_num') +Row = namedtuple('Row', 'data row_num') class Reader: """Reader is used to read a CSV file containing metadata.""" - def __init__(self, filename, columns, unique_combo_fields=None): + def __init__(self, filename, columns, recoverable_error, unique_combo_fields=None): """Initialise Reader object.""" self.filename = filename self.columns = columns @@ -30,6 +31,7 @@ def __init__(self, filename, columns, unique_combo_fields=None): self.unique_combo_fields = unique_combo_fields if unique_combo_fields: self.unique_combos = set() + self.recoverable_error = recoverable_error def read(self): """ @@ -48,13 +50,11 @@ def read(self): raise ValueError(f'Reading {self.filename}: missing expected columns: ' f'{", ".join(sorted(missing_columns))}') - for row in reader: + for row_num, row in enumerate(reader, 2): if None in row: - raise ValueError(f'Reading {self.filename}: too many fields on line ' - f'{reader.line_num}') + raise ValueError(f'Reading {self.filename}: too many fields on row {row_num}') if None in row.values(): - raise ValueError(f'Reading {self.filename}: too few fields on line ' - f'{reader.line_num}') + raise ValueError(f'Reading {self.filename}: too few fields on row {row_num}') for k in list(row.keys()): if k not in self.expected_columns: @@ -63,40 +63,58 @@ def read(self): if not [k for k in row if row[k]]: continue - self.validate_row(row, reader.line_num) + if not self.validate_row(row, row_num): + logging.warning(f'Reading {self.filename}:{row_num} dropping record') + continue for k in row.keys(): if row[k] == '': row[k] = None - data.append(Row(row, reader.line_num)) + data.append(Row(row, row_num)) return data - def validate_row(self, row, line_num): + def validate_row(self, row, row_num): """Validate the fields in a row.""" + keep_row = True for column in self.columns: row[column.name] = row[column.name].strip() if column.required and not row[column.name]: - raise ValueError(f'Reading {self.filename}:{line_num} no value supplied ' - f'for required field {column.name}') + 
self.recoverable_error(f'Reading {self.filename}:{row_num} no value supplied ' + f'for required field {column.name}') + keep_row = False + continue if column.unique: if row[column.name] in self.unique_column_values[column.name]: - raise ValueError(f'Reading {self.filename}:{line_num} duplicate ' - f'value {row[column.name]} for {column.name}') + self.recoverable_error(f'Reading {self.filename}:{row_num} duplicate ' + f'value {row[column.name]} for {column.name}') + keep_row = False + continue + self.unique_column_values[column.name].add(row[column.name]) if row[column.name] and column.validate_fn and not \ column.validate_fn(row[column.name]): - raise ValueError(f'Reading {self.filename}:{line_num} invalid value ' - f'{row[column.name]} for {column.name}') + self.recoverable_error(f'Reading {self.filename}:{row_num} invalid value ' + f'{row[column.name]} for {column.name}') + if column.required: + keep_row = False + continue + logging.warning(f'Reading {self.filename}:{row_num} ignoring field {column.name}') + row[column.name] = "" - if self.unique_combo_fields: + if self.unique_combo_fields and keep_row: combo = tuple([row[f] for f in self.unique_combo_fields]) if combo in self.unique_combos: - raise ValueError(f'Reading {self.filename}:{line_num} duplicate ' - f'value combo {"/".join(combo)} for ' - f'{"/".join(self.unique_combo_fields)}') - self.unique_combos.add(combo) + self.recoverable_error(f'Reading {self.filename}:{row_num} duplicate ' + f'value combo {"/".join(combo)} for ' + f'{"/".join(self.unique_combo_fields)}') + + keep_row = False + else: + self.unique_combos.add(combo) + + return keep_row diff --git a/ctb_metadata_files/metadata.graphql b/ctb_metadata_files/metadata.graphql index 7a01a91..d94100e 100644 --- a/ctb_metadata_files/metadata.graphql +++ b/ctb_metadata_files/metadata.graphql @@ -11,7 +11,6 @@ type DatasetMetadata { type VariableMetadata { Mnemonic_2011: String - Flat_Classification_Flag: String Parent_Classification_Mnemonic: String Default_Classification_Flag: String Version: String! @@ -67,7 +66,7 @@ type Source { Methodology_Statement: String SDC_Link: String SDC_Statement: String - Version: String + Version: String! Contact: Contact } @@ -92,6 +91,7 @@ type TableMetadata { Publications: [Publication]! Census_Releases: [Census_Release]! Statistical_Unit: Statistical_Unit! + Alternate_Geographic_Variables: [String]! } type Publication { diff --git a/ctb_metadata_files/metadata_9_2_0.graphql b/ctb_metadata_files/metadata_9_2_0.graphql new file mode 100644 index 0000000..94b79f3 --- /dev/null +++ b/ctb_metadata_files/metadata_9_2_0.graphql @@ -0,0 +1,127 @@ +type ServiceMetadata { + description: String! + tables: [Table]! +} + +type Table { + name: String! + label: String! + description: String + datasetName: String! + vars: [String]! + meta: TableMetadata! +} + +type DatasetMetadata { + description: String! + Cantabular_DB_Flag: String + Source: Source! + Version: String! + Lowest_Geog_Variable: String +} + +type VariableMetadata { + description: String! + Mnemonic_2011: String + Flat_Classification_Flag: String + Parent_Classification_Mnemonic: String + Default_Classification_Flag: String + Version: String! + ONS_Variable: ONS_Variable! + Topics: [Topic]! +} + +type ONS_Variable { + Variable_Mnemonic: String! + Variable_Title: String! + Variable_Mnemonic_2011: String + Comparability_Comments: String + Uk_Comparison_Comments: String + Geographic_Abbreviation: String + Geographic_Theme: String + Geographic_Coverage: String + Version: String! 
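+    # Statistical_Unit is nullable: it is not always populated in the source
+    # files for non-geographic variables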
+ Statistical_Unit: Statistical_Unit + Keywords: [String]! + Topic: Topic + Questions: [Question]! + Variable_Type: Variable_Type! + Quality_Statement_Text: String + Quality_Summary_URL: String +} + +type Variable_Type { + Variable_Type_Code: String! + Variable_Type_Description: String! +} + +type Topic { + Topic_Mnemonic: String! + Topic_Description: String! + Topic_Title: String! +} + +type Question { + Question_Code: String! + Question_Label: String! + Reason_For_Asking_Question: String + Question_First_Asked_In_Year: String + Version: String! +} + +type Source { + Source_Mnemonic: String! + Source_Description: String! + Copyright_Statement: String + Licence: String + Nationals_Statistic_Certified: String + Methodology_Link: String + Methodology_Statement: String + SDC_Link: String + SDC_Statement: String + Version: String! + Contact: Contact +} + +type Contact { + Contact_Id: String! + Contact_Name: String! + Contact_Email: String! + Contact_Phone: String + Contact_Website: String +} + +type TableMetadata { + Dataset_Mnemonic_2011: String + Geographic_Coverage: String! + Dataset_Population: String! + Last_Updated: String + Unique_Url: String + Contact: Contact + Version: String! + Related_Datasets: [String]! + Keywords: [String]! + Publications: [Publication]! + Census_Releases: [Census_Release]! + Statistical_Unit: Statistical_Unit! + Alternate_Geographic_Variables: [String]! +} + +type Publication { + Publication_Mnemonic: String! + Publication_Title: String + Publisher_Name: String + Publisher_Website: String +} + +type Census_Release { + Census_Release_Number: String! + Census_Release_Description: String! + Release_Date: String! +} + +type Statistical_Unit { + Statistical_Unit: String! + Statistical_Unit_Description: String! +} + diff --git a/modified/.gitignore b/modified/.gitignore deleted file mode 100644 index afed073..0000000 --- a/modified/.gitignore +++ /dev/null @@ -1 +0,0 @@ -*.csv diff --git a/sample_2011/Variable.csv b/sample_2011/Variable.csv index 36405f7..82b6eb5 100644 --- a/sample_2011/Variable.csv +++ b/sample_2011/Variable.csv @@ -1,17 +1,17 @@ Id,Variable_Mnemonic,Variable_Title,Variable_Title_Welsh,Variable_Description,Variable_Description_Welsh,Variable_Mnemonic_2011,Comparability_Comments,Comparability_Comments_Welsh,Uk_Comparison_Comments,Uk_Comparison_Comments_Welsh,Signed_Off_Flag,Security_Mnemonic,Variable_Type_Code,Topic_Mnemonic,Number_Of_Classifications,Statistical_Unit,Geographic_Abbreviation,Geographic_Abbreviation_Welsh,Geographic_Theme,Geographic_Theme_Welsh,Geographic_Coverage,Geographic_Coverage_Welsh,Version,Quality_Statement_Text,Quality_Summary_URL -1,Region,Region,Rhanbarth,"The geographic region in which a person lives, derived from the address of their household or communal establishment.","Y rhanbarth daearyddol y mae person yn byw ynddo, yn deillio o gyfeiriad eu cartref neu sefydliad cymunedol.",Region,,,,,,PUB,GEOG,,1,People,RGN,,Administrative,Gweinyddol,England and Wales,Cymru a Lloegr,1,"The census is the most complete source of information about the population that we have. Every effort is made to include everyone. It is the only survey which provides a detailed picture of the entire population, and is unique because it covers everyone at the same time and asks the same core questions everywhere. This makes it easy to compare different parts of the country. 
+1,Region,Region,Rhanbarth,"The geographic region in which a person lives, derived from the address of their household or communal establishment.","Y rhanbarth daearyddol y mae person yn byw ynddo, yn deillio o gyfeiriad eu cartref neu sefydliad cymunedol.",Region,,,,,Y,PUB,GEOG,,1,People,RGN,,Administrative,Gweinyddol,England and Wales,Cymru a Lloegr,1,"The census is the most complete source of information about the population that we have. Every effort is made to include everyone. It is the only survey which provides a detailed picture of the entire population, and is unique because it covers everyone at the same time and asks the same core questions everywhere. This makes it easy to compare different parts of the country. However no census is perfect and some people are inevitably missed. ONS therefore uses complex statistical techniques to adjust the 2011 Census counts for those people missed by the census. The methods and quality assurance approach was researched and developed in consultation with academics, statisticians, demographers and users of census data. The result was a suite of methods to process, clean, adjust and protect the census results.",https://www.ons.gov.uk/census/2011census/2011censusdata/2011censususerguide/qualityandmethods -2,Country,Country,Ngwlad,"The country - either England or Wales - in which a person lives, derived from the region they live in.","Mae'r wlad - naill ai Cymru neu Loegr - lle mae person yn byw, yn deillio o'r rhanbarth y mae'n byw ynddo.",Country,,,,,,PUB,GEOG,,1,People,CTRY,,Administrative,Gweinyddol,England and Wales,Cymru a Lloegr,1,"The census is the most complete source of information about the population that we have. Every effort is made to include everyone. It is the only survey which provides a detailed picture of the entire population, and is unique because it covers everyone at the same time and asks the same core questions everywhere. This makes it easy to compare different parts of the country. +2,Country,Country,Ngwlad,"The country - either England or Wales - in which a person lives, derived from the region they live in.","Mae'r wlad - naill ai Cymru neu Loegr - lle mae person yn byw, yn deillio o'r rhanbarth y mae'n byw ynddo.",Country,,,,,Y,PUB,GEOG,,1,People,CTRY,,Administrative,Gweinyddol,England and Wales,Cymru a Lloegr,1,"The census is the most complete source of information about the population that we have. Every effort is made to include everyone. It is the only survey which provides a detailed picture of the entire population, and is unique because it covers everyone at the same time and asks the same core questions everywhere. This makes it easy to compare different parts of the country. However no census is perfect and some people are inevitably missed. ONS therefore uses complex statistical techniques to adjust the 2011 Census counts for those people missed by the census. The methods and quality assurance approach was researched and developed in consultation with academics, statisticians, demographers and users of census data. The result was a suite of methods to process, clean, adjust and protect the census results.",https://www.ons.gov.uk/census/2011census/2011censusdata/2011censususerguide/qualityandmethods -3,Residence Type,Residence Type,Math Preswyl,"This defines the type of residence that a person lives in. It categorises people as living in a household or living in a communal establishment. People who filled in the normal household questionnaire were recorded as living in a household. 
Those that filled in an individual questionnaire were asked what type of accommodation they lived in, i.e. whether it was a household or a communal establishment.","Mae hyn yn diffinio'r math o breswylfa y mae person yn byw ynddi. Mae'n categoreiddio pobl fel rhai sy'n byw mewn cartref neu'n byw mewn sefydliad cymunedol. Cofnodwyd bod pobl a lenwodd holiadur arferol y cartref yn byw mewn cartref. Gofynnwyd i'r rhai a lenwodd holiadur unigol pa fath o lety yr oeddent yn byw ynddo, h.y. a oedd yn aelwyd neu'n sefydliad cymunedol.",Residence Type,,,,,,PUB,SV,LHQ,1,People,,,,,,,1,"The census is the most complete source of information about the population that we have. Every effort is made to include everyone. It is the only survey which provides a detailed picture of the entire population, and is unique because it covers everyone at the same time and asks the same core questions everywhere. This makes it easy to compare different parts of the country. +3,Residence Type,Residence Type,Math Preswyl,"This defines the type of residence that a person lives in. It categorises people as living in a household or living in a communal establishment. People who filled in the normal household questionnaire were recorded as living in a household. Those that filled in an individual questionnaire were asked what type of accommodation they lived in, i.e. whether it was a household or a communal establishment.","Mae hyn yn diffinio'r math o breswylfa y mae person yn byw ynddi. Mae'n categoreiddio pobl fel rhai sy'n byw mewn cartref neu'n byw mewn sefydliad cymunedol. Cofnodwyd bod pobl a lenwodd holiadur arferol y cartref yn byw mewn cartref. Gofynnwyd i'r rhai a lenwodd holiadur unigol pa fath o lety yr oeddent yn byw ynddo, h.y. a oedd yn aelwyd neu'n sefydliad cymunedol.",Residence Type,,,,,Y,PUB,SV,LHQ,1,People,,,,,,,1,"The census is the most complete source of information about the population that we have. Every effort is made to include everyone. It is the only survey which provides a detailed picture of the entire population, and is unique because it covers everyone at the same time and asks the same core questions everywhere. This makes it easy to compare different parts of the country. However no census is perfect and some people are inevitably missed. ONS therefore uses complex statistical techniques to adjust the 2011 Census counts for those people missed by the census. The methods and quality assurance approach was researched and developed in consultation with academics, statisticians, demographers and users of census data. The result was a suite of methods to process, clean, adjust and protect the census results.",https://www.ons.gov.uk/census/2011census/2011censusdata/2011censususerguide/qualityandmethods -4,Family Composition,Family Composition,Cyfansoddiad teuluol,"Family type is the classification of families into different types distinguished by the presence, absence and type of couple relationship: whether a married couple family, a same-sex civil partnership family, a cohabiting couple family, or a lone parent family.","Math o deulu yw dosbarthiad teuluoedd i wahanol fathau sy'n cael eu gwahaniaethu gan bresenoldeb, absenoldeb a math o berthynas cwpl: boed yn deulu pâr priod, teulu partneriaeth sifil o'r un rhyw, teulu pâr sy'n cyd-fyw, neu deulu rhiant sengl.",Family Composition,,,,,,PUB,DVO,MAD,1,People,,,,,,,1,"The census is the most complete source of information about the population that we have. Every effort is made to include everyone. 
It is the only survey which provides a detailed picture of the entire population, and is unique because it covers everyone at the same time and asks the same core questions everywhere. This makes it easy to compare different parts of the country. +4,Family Composition,Family Composition,Cyfansoddiad teuluol,"Family type is the classification of families into different types distinguished by the presence, absence and type of couple relationship: whether a married couple family, a same-sex civil partnership family, a cohabiting couple family, or a lone parent family.","Math o deulu yw dosbarthiad teuluoedd i wahanol fathau sy'n cael eu gwahaniaethu gan bresenoldeb, absenoldeb a math o berthynas cwpl: boed yn deulu pâr priod, teulu partneriaeth sifil o'r un rhyw, teulu pâr sy'n cyd-fyw, neu deulu rhiant sengl.",Family Composition,,,,,Y,PUB,DVO,MAD,1,People,,,,,,,1,"The census is the most complete source of information about the population that we have. Every effort is made to include everyone. It is the only survey which provides a detailed picture of the entire population, and is unique because it covers everyone at the same time and asks the same core questions everywhere. This makes it easy to compare different parts of the country. However no census is perfect and some people are inevitably missed. ONS therefore uses complex statistical techniques to adjust the 2011 Census counts for those people missed by the census. The methods and quality assurance approach was researched and developed in consultation with academics, statisticians, demographers and users of census data. The result was a suite of methods to process, clean, adjust and protect the census results.",https://www.ons.gov.uk/census/2011census/2011censusdata/2011censususerguide/qualityandmethods -5,Population Base,Population Base,Sylfaen Poblogaeth,The main census population base into which a person falls.,Prif sylfaen poblogaeth y cyfrifiad y mae person yn syrthio iddi.,Population Base,,,,,,PUB,DVO,,1,People,,,,,,,1,"The census is the most complete source of information about the population that we have. Every effort is made to include everyone. It is the only survey which provides a detailed picture of the entire population, and is unique because it covers everyone at the same time and asks the same core questions everywhere. This makes it easy to compare different parts of the country. +5,Population Base,Population Base,Sylfaen Poblogaeth,The main census population base into which a person falls.,Prif sylfaen poblogaeth y cyfrifiad y mae person yn syrthio iddi.,Population Base,,,,,Y,PUB,DVO,,1,People,,,,,,,1,"The census is the most complete source of information about the population that we have. Every effort is made to include everyone. It is the only survey which provides a detailed picture of the entire population, and is unique because it covers everyone at the same time and asks the same core questions everywhere. This makes it easy to compare different parts of the country. However no census is perfect and some people are inevitably missed. ONS therefore uses complex statistical techniques to adjust the 2011 Census counts for those people missed by the census. The methods and quality assurance approach was researched and developed in consultation with academics, statisticians, demographers and users of census data. 
The result was a suite of methods to process, clean, adjust and protect the census results.",https://www.ons.gov.uk/census/2011census/2011censusdata/2011censususerguide/qualityandmethods 6,Sex,Sex,Rhyw,The classification of a person as either male or female.,Dosbarthiad person naill ai'n wryw neu'n fenyw.,Sex,,,"Indicator of comparability: Highly comparable @@ -32,7 +32,7 @@ Allbwn Data - Mae dadansoddiad o wrywod a benywod yn cael ei gynnwys mewn llawer o'r allbynnau anarferol ac aml-ddirprwy. A yw allbynnau'r DU ar gael? -Ydw.",,PUB,SV,MAD,1,People,,,,,,,1,"The census is the most complete source of information about the population that we have. Every effort is made to include everyone. It is the only survey which provides a detailed picture of the entire population, and is unique because it covers everyone at the same time and asks the same core questions everywhere. This makes it easy to compare different parts of the country. +Ydw.",Y,PUB,SV,MAD,1,People,,,,,,,1,"The census is the most complete source of information about the population that we have. Every effort is made to include everyone. It is the only survey which provides a detailed picture of the entire population, and is unique because it covers everyone at the same time and asks the same core questions everywhere. This makes it easy to compare different parts of the country. However no census is perfect and some people are inevitably missed. ONS therefore uses complex statistical techniques to adjust the 2011 Census counts for those people missed by the census. The methods and quality assurance approach was researched and developed in consultation with academics, statisticians, demographers and users of census data. The result was a suite of methods to process, clean, adjust and protect the census results.",https://www.ons.gov.uk/census/2011census/2011censusdata/2011censususerguide/qualityandmethods 7,Age,Age,Oedran,"Age is derived from the date of birth question and is a person's age at their last birthday, at 27 March 2011. Dates of birth that imply an age over 115 are treated as invalid and the person's age is imputed. Infants less than one year old are classified as 0 years of age.","Mae oedran yn deillio o'r cwestiwn dyddiad geni ac mae'n oedran unigolyn ar ei ben-blwydd olaf, ar 27 Mawrth 2011. Mae dyddiadau geni sy'n awgrymu oedran dros 115 yn cael eu trin fel rhai annilys ac mae oedran yr unigolyn yn cael ei osod. Mae babanod llai na blwydd oed yn cael eu dosbarthu fel 0 oed.",Age,,,"Indicator of comparability: Highly comparable @@ -57,7 +57,7 @@ Allbwn data: Allbynnau anwahanadwy ac amlochrog. A yw allbynnau'r DU ar gael? -Amherthnasol",,PUB,DVO,MAD,1,People,,,,,,,1,"The census is the most complete source of information about the population that we have. Every effort is made to include everyone. It is the only survey which provides a detailed picture of the entire population, and is unique because it covers everyone at the same time and asks the same core questions everywhere. This makes it easy to compare different parts of the country. +Amherthnasol",Y,PUB,DVO,MAD,1,People,,,,,,,1,"The census is the most complete source of information about the population that we have. Every effort is made to include everyone. It is the only survey which provides a detailed picture of the entire population, and is unique because it covers everyone at the same time and asks the same core questions everywhere. This makes it easy to compare different parts of the country. 
However no census is perfect and some people are inevitably missed. ONS therefore uses complex statistical techniques to adjust the 2011 Census counts for those people missed by the census. The methods and quality assurance approach was researched and developed in consultation with academics, statisticians, demographers and users of census data. The result was a suite of methods to process, clean, adjust and protect the census results.",https://www.ons.gov.uk/census/2011census/2011censusdata/2011censususerguide/qualityandmethods 8,Marital Status,Marital Status,Statws priodasol,"Marital and civil partnership status classifies an individual according to their legal marital or registered same-sex civil partnership status as at census day, 27 March 2011. This topic is the equivalent of the 2001 Census topic “Marital status”, but has undergone significant revision to take account of the Civil Partnership Act which came into force on 5 December 2005. @@ -88,7 +88,7 @@ Allbwn data: - Er mwyn atal datgelu gwybodaeth unigolion, ar gyfer rhai allbynnau, yn enwedig ar lefelau is o ddaearyddiaeth, darperir data fel categorïau agregedig. A yw allbynnau'r DU ar gael? -Ydw.",,PUB,SV,MAD,1,People,,,,,,,1,"The census is the most complete source of information about the population that we have. Every effort is made to include everyone. It is the only survey which provides a detailed picture of the entire population, and is unique because it covers everyone at the same time and asks the same core questions everywhere. This makes it easy to compare different parts of the country. +Ydw.",Y,PUB,SV,MAD,1,People,,,,,,,1,"The census is the most complete source of information about the population that we have. Every effort is made to include everyone. It is the only survey which provides a detailed picture of the entire population, and is unique because it covers everyone at the same time and asks the same core questions everywhere. This makes it easy to compare different parts of the country. However no census is perfect and some people are inevitably missed. ONS therefore uses complex statistical techniques to adjust the 2011 Census counts for those people missed by the census. The methods and quality assurance approach was researched and developed in consultation with academics, statisticians, demographers and users of census data. The result was a suite of methods to process, clean, adjust and protect the census results.",https://www.ons.gov.uk/census/2011census/2011censusdata/2011censususerguide/qualityandmethods 9,Student,Student,Myfyriwr,"Schoolchildren and students in full-time education studying away from their family home are treated as usually resident at their term-time address. Basic demographic information only (name, sex, age, marital status and relationship) is collected at their non term-time address (their “home” or “vacation”address). The information on families, household size and household composition for their non term-time address does not include them.","Mae plant ysgol a myfyrwyr mewn addysg amser llawn sy'n astudio i ffwrdd o'u cartref teuluol yn cael eu trin fel arfer yn byw yn eu cyfeiriad yn ystod y tymor. Cesglir gwybodaeth ddemograffig sylfaenol yn unig (enw, rhyw, oedran, statws priodasol a pherthynas) yn eu cyfeiriad nad yw'n ystod y tymor (eu cyfeiriad “cartref” neu “wyliau”). 
Nid yw'r wybodaeth am deuluoedd, maint aelwydydd a chyfansoddiad y cartref ar gyfer eu cyfeiriad nad ydynt yn ystod y tymor yn eu cynnwys.",Student,,,"Indicator of comparability: Highly comparable @@ -117,7 +117,7 @@ Mae cyfrifiadau anffafriol o blant ysgol a myfyrwyr ar gael i bob gwlad. Mae'r r Cyhoeddwyd allbynnau aml-amrywiol hefyd. A yw allbynnau'r DU ar gael? -Ydw.",,PUB,SV,LHQ,1,People,,,,,,,1,"The census is the most complete source of information about the population that we have. Every effort is made to include everyone. It is the only survey which provides a detailed picture of the entire population, and is unique because it covers everyone at the same time and asks the same core questions everywhere. This makes it easy to compare different parts of the country. +Ydw.",Y,PUB,SV,LHQ,1,People,,,,,,,1,"The census is the most complete source of information about the population that we have. Every effort is made to include everyone. It is the only survey which provides a detailed picture of the entire population, and is unique because it covers everyone at the same time and asks the same core questions everywhere. This makes it easy to compare different parts of the country. However no census is perfect and some people are inevitably missed. ONS therefore uses complex statistical techniques to adjust the 2011 Census counts for those people missed by the census. The methods and quality assurance approach was researched and developed in consultation with academics, statisticians, demographers and users of census data. The result was a suite of methods to process, clean, adjust and protect the census results.",https://www.ons.gov.uk/census/2011census/2011censusdata/2011censususerguide/qualityandmethods 10,Country of Birth,Country of Birth,Gwlad Geni,"Country of birth is the country in which a person was born. The country of birth question included six tick-box responses – one for each of the four parts of the UK, one for the Republic of Ireland, and one for “elsewhere”. Where a person ticked “elsewhere”, they were asked to write in the current name of the country in which they were born. Responses are assigned codes based on the National Statistics Country Classification. @@ -182,7 +182,7 @@ Ydw. Cyhoeddwyd tabl QS203UK gwlad enedigol. Mae grwpio gwledydd o fewn y dosbarthiad a ddefnyddiwyd yn rhanbarthol yn fras, ond yn ystyried grwpio Ewrop Gwledydd yr Undeb (UE). Mae lefel y manylder a gyflwynir yn y tabl hwn wedi cael ei bennu gan lefel y wlad genedigaethau manylion sydd ar gael yn y tri tablau penodol ar wlad geni -QS203EW (Cymru a Lloegr), QS203SC (Yr Alban) a QS208NI (Gogledd Iwerddon).",,PUB,SV,MAD,1,People,,,,,,,1,"The census is the most complete source of information about the population that we have. Every effort is made to include everyone. It is the only survey which provides a detailed picture of the entire population, and is unique because it covers everyone at the same time and asks the same core questions everywhere. This makes it easy to compare different parts of the country. +QS203EW (Cymru a Lloegr), QS203SC (Yr Alban) a QS208NI (Gogledd Iwerddon).",Y,PUB,SV,MAD,1,People,,,,,,,1,"The census is the most complete source of information about the population that we have. Every effort is made to include everyone. It is the only survey which provides a detailed picture of the entire population, and is unique because it covers everyone at the same time and asks the same core questions everywhere. This makes it easy to compare different parts of the country. 
However no census is perfect and some people are inevitably missed. ONS therefore uses complex statistical techniques to adjust the 2011 Census counts for those people missed by the census. The methods and quality assurance approach was researched and developed in consultation with academics, statisticians, demographers and users of census data. The result was a suite of methods to process, clean, adjust and protect the census results.",https://www.ons.gov.uk/census/2011census/2011censusdata/2011censususerguide/qualityandmethods 11,Health,Health,Iechyd,"General health is a self-assessment of a person’s general state of health. People were asked to assess whether their health was very good, good, fair, bad or very bad. This assessment is not based on a person's health over any specified period of time.","Mae iechyd cyffredinol yn hunanasesiad o gyflwr iechyd cyffredinol person. Gofynnwyd i bobl asesu a oedd eu hiechyd yn dda iawn, yn dda, yn deg, yn ddrwg neu'n ddrwg iawn. Nid yw'r asesiad hwn yn seiliedig ar iechyd person dros unrhyw gyfnod penodol o amser.",Health,,,"Indicator of comparability: Highly comparable @@ -207,7 +207,7 @@ Allbwn data: - Mae allbynnau anffafriol ac aml-amrywiol ar gael ar iechyd cyffredinol, ar wahanol lefelau daearyddiaeth, ar gyfer pob gwlad. A yw allbynnau'r DU ar gael? -Ydw.",,PUB,SV,HSC,1,People,,,,,,,1,"The census is the most complete source of information about the population that we have. Every effort is made to include everyone. It is the only survey which provides a detailed picture of the entire population, and is unique because it covers everyone at the same time and asks the same core questions everywhere. This makes it easy to compare different parts of the country. +Ydw.",Y,PUB,SV,HSC,1,People,,,,,,,1,"The census is the most complete source of information about the population that we have. Every effort is made to include everyone. It is the only survey which provides a detailed picture of the entire population, and is unique because it covers everyone at the same time and asks the same core questions everywhere. This makes it easy to compare different parts of the country. However no census is perfect and some people are inevitably missed. ONS therefore uses complex statistical techniques to adjust the 2011 Census counts for those people missed by the census. The methods and quality assurance approach was researched and developed in consultation with academics, statisticians, demographers and users of census data. The result was a suite of methods to process, clean, adjust and protect the census results.",https://www.ons.gov.uk/census/2011census/2011censusdata/2011censususerguide/qualityandmethods 12,Ethnic Group,Ethnic Group,Grŵp Ethnig,Ethnic group classifies people according to their own perceived ethnic group and cultural background.,Mae grŵp ethnig yn dosbarthu pobl yn ôl eu grŵp ethnig canfyddedig eu hunain a'u cefndir diwylliannol.,Ethnic Group,,,"Indicator of comparability: Broadly comparable @@ -254,7 +254,7 @@ Mae allbynnau anffafriol ac amlochrog ar grŵp ethnig ar gael, ar wahanol lefela Mae gan bob gwlad ei dosbarthiad grŵp ethnig ei hun. -Oherwydd gwahaniaethau penodol i wlad mewn data a gasglwyd, cynllun cwestiwn ac ymateb categorïau ar gyfer y cwestiwn grŵp ethnig, ynghyd â rheolau prosesu data penodol yn benodol, nid yw rhai ymatebion yn cael eu cymharu'n uniongyrchol.",,PUB,DVO,EILR,1,People,,,,,,,1,"The census is the most complete source of information about the population that we have. Every effort is made to include everyone. 
It is the only survey which provides a detailed picture of the entire population, and is unique because it covers everyone at the same time and asks the same core questions everywhere. This makes it easy to compare different parts of the country. +Oherwydd gwahaniaethau penodol i wlad mewn data a gasglwyd, cynllun cwestiwn ac ymateb categorïau ar gyfer y cwestiwn grŵp ethnig, ynghyd â rheolau prosesu data penodol yn benodol, nid yw rhai ymatebion yn cael eu cymharu'n uniongyrchol.",Y,PUB,DVO,EILR,1,People,,,,,,,1,"The census is the most complete source of information about the population that we have. Every effort is made to include everyone. It is the only survey which provides a detailed picture of the entire population, and is unique because it covers everyone at the same time and asks the same core questions everywhere. This makes it easy to compare different parts of the country. However no census is perfect and some people are inevitably missed. ONS therefore uses complex statistical techniques to adjust the 2011 Census counts for those people missed by the census. The methods and quality assurance approach was researched and developed in consultation with academics, statisticians, demographers and users of census data. The result was a suite of methods to process, clean, adjust and protect the census results.",https://www.ons.gov.uk/census/2011census/2011censusdata/2011censususerguide/qualityandmethods 13,Religion,Religion,Crefydd,"The voluntary question on religion in the 2011 Census was intended to capture people's religious affiliation and identification at the time of the Census irrespective of whether they practised or believed in that religion or how important it was in their lives. @@ -325,14 +325,14 @@ Yn wahanol i gwestiynau cyfrifiad eraill lle mae atebion coll yn cael eu cyfrifo Mae'r Alban wedi cyhoeddi Tabl KS209SCA gyda'r un categorïau allbwn crefydd i ddarparu fersiwn wedi'i chysoni i allbynnau Cymru a Lloegr. Cynhyrchodd Gogledd Iwerddon hefyd fwrdd crefydd manylder llawn QS218NI sy'n cynnwys pob crefydd gyda chyfrif o 10 neu fwy o ymatebion. Fodd bynnag, lle mae defnyddwyr yn dewis cymharu gwybodaeth am grefydd ar draws gwledydd, rhaid iddynt fod yn ymwybodol eu bod yn cymharu gwahanol gysyniadau ac yn gwneud hynny yn ofalus. O ganlyniad, mae cymaroldeb yn gyfyngedig. A yw allbynnau'r DU ar gael? -Na.",,PUB,SV,EILR,1,People,,,,,,,1,"The census is the most complete source of information about the population that we have. Every effort is made to include everyone. It is the only survey which provides a detailed picture of the entire population, and is unique because it covers everyone at the same time and asks the same core questions everywhere. This makes it easy to compare different parts of the country. +Na.",Y,PUB,SV,EILR,1,People,,,,,,,1,"The census is the most complete source of information about the population that we have. Every effort is made to include everyone. It is the only survey which provides a detailed picture of the entire population, and is unique because it covers everyone at the same time and asks the same core questions everywhere. This makes it easy to compare different parts of the country. However no census is perfect and some people are inevitably missed. ONS therefore uses complex statistical techniques to adjust the 2011 Census counts for those people missed by the census. The methods and quality assurance approach was researched and developed in consultation with academics, statisticians, demographers and users of census data. 
The result was a suite of methods to process, clean, adjust and protect the census results.",https://www.ons.gov.uk/census/2011census/2011censusdata/2011censususerguide/qualityandmethods 14,Economic Activity,Economic Activity,Gweithgaredd economaidd,"Economic activity relates to whether or not a person who was aged 16 and over was working or looking for work in the week before census. Rather than a simple indicator of whether or not someone was currently in employment, it provides a measure of whether or not a person was an active participant in the labour market. A person's economic activity is derived from their “activity last week”. This is an indicator of their status or availability for employment - whether employed, unemployed, or their status if not employed and not seeking employment. Additional information included in the economic activity classification is also derived from information about the number of hours a person works and their type of employment - whether employed or self-employed.","Mae gweithgarwch economaidd yn ymwneud ag a oedd person a oedd yn 16 oed a throsodd yn gweithio neu'n chwilio am waith yn ystod yr wythnos cyn y cyfrifiad ai peidio. Yn hytrach na dangosydd syml a oedd rhywun mewn cyflogaeth ar hyn o bryd ai peidio, mae'n mesur a oedd person yn cymryd rhan weithredol yn y farchnad lafur ai peidio. -Mae gweithgaredd economaidd unigolyn yn deillio o'u “gweithgaredd yr wythnos diwethaf”. Mae hwn yn ddangosydd o'u statws neu argaeledd cyflogaeth - boed yn gyflogedig, yn ddi-waith, neu eu statws os nad ydynt yn gyflogedig ac nad ydynt yn chwilio am waith. Mae gwybodaeth ychwanegol a gynhwysir yn y dosbarthiad gweithgarwch economaidd hefyd yn deillio o wybodaeth am nifer yr oriau y mae person yn gweithio a'u math o gyflogaeth - boed yn gyflogedig neu'n hunangyflogedig.",Economic Activity,"The census concept of economic activity is compatible with the standard for economic status defined by the International Labour Organisation (ILO). It is one of a number of definitions used internationally to produce accurate and comparable statistics on employment, unemployment and economic status.","Mae cysyniad y cyfrifiad o weithgarwch economaidd yn gydnaws â'r safon ar gyfer statws economaidd a ddiffinnir gan y Sefydliad Llafur Rhyngwladol (ILO). Mae'n un o nifer o ddiffiniadau a ddefnyddir yn rhyngwladol i gynhyrchu ystadegau cywir a chymaradwy ar gyflogaeth, diweithdra a statws economaidd.",,,,PUB,DVO,LHQ,1,People,,,,,,,1,"The census is the most complete source of information about the population that we have. Every effort is made to include everyone. It is the only survey which provides a detailed picture of the entire population, and is unique because it covers everyone at the same time and asks the same core questions everywhere. This makes it easy to compare different parts of the country. +Mae gweithgaredd economaidd unigolyn yn deillio o'u “gweithgaredd yr wythnos diwethaf”. Mae hwn yn ddangosydd o'u statws neu argaeledd cyflogaeth - boed yn gyflogedig, yn ddi-waith, neu eu statws os nad ydynt yn gyflogedig ac nad ydynt yn chwilio am waith. Mae gwybodaeth ychwanegol a gynhwysir yn y dosbarthiad gweithgarwch economaidd hefyd yn deillio o wybodaeth am nifer yr oriau y mae person yn gweithio a'u math o gyflogaeth - boed yn gyflogedig neu'n hunangyflogedig.",Economic Activity,"The census concept of economic activity is compatible with the standard for economic status defined by the International Labour Organisation (ILO). 
It is one of a number of definitions used internationally to produce accurate and comparable statistics on employment, unemployment and economic status.","Mae cysyniad y cyfrifiad o weithgarwch economaidd yn gydnaws â'r safon ar gyfer statws economaidd a ddiffinnir gan y Sefydliad Llafur Rhyngwladol (ILO). Mae'n un o nifer o ddiffiniadau a ddefnyddir yn rhyngwladol i gynhyrchu ystadegau cywir a chymaradwy ar gyflogaeth, diweithdra a statws economaidd.",,,Y,PUB,DVO,LHQ,1,People,,,,,,,1,"The census is the most complete source of information about the population that we have. Every effort is made to include everyone. It is the only survey which provides a detailed picture of the entire population, and is unique because it covers everyone at the same time and asks the same core questions everywhere. This makes it easy to compare different parts of the country. However no census is perfect and some people are inevitably missed. ONS therefore uses complex statistical techniques to adjust the 2011 Census counts for those people missed by the census. The methods and quality assurance approach was researched and developed in consultation with academics, statisticians, demographers and users of census data. The result was a suite of methods to process, clean, adjust and protect the census results.",https://www.ons.gov.uk/census/2011census/2011censusdata/2011censususerguide/qualityandmethods 15,Occupation,Occupation,Ngalwedigaeth,A person's occupation relates to their main job and is derived from either their job title or details of the activities involved in their job. This is used to assign responses to an occupation code based on the Standard Occupational Classification 2010 (SOC2010).,Mae galwedigaeth unigolyn yn ymwneud â'i brif swydd ac mae'n deillio naill ai o deitl ei swydd neu fanylion y gweithgareddau sy'n gysylltiedig â'u swydd. Defnyddir hwn i neilltuo ymatebion i god meddiannaeth yn seiliedig ar Ddosbarthiad Galwedigaethol Safonol 2010 (SOC2010).,Occupation,The census concept of cccupation uses occupation codes from the Standard Occupational Classification 2010 (SOC2010).,Mae cysyniad y Cyfrifiad o CcCupation yn defnyddio codau galwedigaeth o'r Dosbarthiad Galwedigaethol Safonol 2010 (SOC2010).,"Indicator of comparability: Highly comparable @@ -429,7 +429,7 @@ http://www.ons.gov.uk/ons/guide-method/classifications/current-standard-classifi A yw allbynnau'r DU ar gael? Ydw. Mae allbynnau'r DU ar alwedigaeth a NS-SEC ar gael. - Allbynnau'r DU ar alwedigaeth Defnyddiwch y prif ddosbarthiad grŵp -- Mae allbynnau'r DU ar NS-SEC preswylwyr arferol a pherson cyfeirio aelwydydd (HRP) ar gael.",,PUB,DVO,LHQ,1,People,,,,,,,1,"The census is the most complete source of information about the population that we have. Every effort is made to include everyone. It is the only survey which provides a detailed picture of the entire population, and is unique because it covers everyone at the same time and asks the same core questions everywhere. This makes it easy to compare different parts of the country. +- Mae allbynnau'r DU ar NS-SEC preswylwyr arferol a pherson cyfeirio aelwydydd (HRP) ar gael.",Y,PUB,DVO,LHQ,1,People,,,,,,,1,"The census is the most complete source of information about the population that we have. Every effort is made to include everyone. It is the only survey which provides a detailed picture of the entire population, and is unique because it covers everyone at the same time and asks the same core questions everywhere. This makes it easy to compare different parts of the country. 
However no census is perfect and some people are inevitably missed. ONS therefore uses complex statistical techniques to adjust the 2011 Census counts for those people missed by the census. The methods and quality assurance approach was researched and developed in consultation with academics, statisticians, demographers and users of census data. The result was a suite of methods to process, clean, adjust and protect the census results.",https://www.ons.gov.uk/census/2011census/2011censusdata/2011censususerguide/qualityandmethods 16,Industry,Industry,Ddiwydiant,"The industry in which a person aged 16 and over works relates to their main job, and is derived from information provided on the main activity of their employer or business. This is used to assign responses to an industry code based on the Standard Industrial Classification 2007.","Mae'r diwydiant lle mae person 16 oed a throsodd yn gweithio yn ymwneud â'i brif swydd, ac mae'n deillio o wybodaeth a ddarperir am brif weithgaredd eu cyflogwr neu fusnes. Defnyddir hyn i neilltuo ymatebion i god diwydiant yn seiliedig ar y Dosbarthiad Diwydiannol Safonol 2007.",Industry,The census concept of industry uses industry codes from the Standard Industrial Classification 2007.,Mae cysyniad y cyfrifiad o ddiwydiant yn defnyddio codau diwydiant o'r dosbarthiad diwydiannol safonol 2007.,"Indicator of comparability: Highly comparable @@ -494,7 +494,7 @@ Er enghraifft, gellir cyfuno'r categorïau fel a ganlyn i alluogi cymhariaeth: Noder bod y cwestiwn ar enw'r cyflogwr yn cael ei ddefnyddio i gynhyrchu amcangyfrifon cyfrifiad ond ni chaiff data ei gasglu ar gyfer cyhoeddi uniongyrchol. A yw allbynnau'r DU ar gael? -Ydw. Mae allbynnau'r DU ar ddiwydiant ar gael.",,PUB,DVO,LHQ,1,People,,,,,,,1,"The census is the most complete source of information about the population that we have. Every effort is made to include everyone. It is the only survey which provides a detailed picture of the entire population, and is unique because it covers everyone at the same time and asks the same core questions everywhere. This makes it easy to compare different parts of the country. +Ydw. Mae allbynnau'r DU ar ddiwydiant ar gael.",Y,PUB,DVO,LHQ,1,People,,,,,,,1,"The census is the most complete source of information about the population that we have. Every effort is made to include everyone. It is the only survey which provides a detailed picture of the entire population, and is unique because it covers everyone at the same time and asks the same core questions everywhere. This makes it easy to compare different parts of the country. However no census is perfect and some people are inevitably missed. ONS therefore uses complex statistical techniques to adjust the 2011 Census counts for those people missed by the census. The methods and quality assurance approach was researched and developed in consultation with academics, statisticians, demographers and users of census data. The result was a suite of methods to process, clean, adjust and protect the census results.",https://www.ons.gov.uk/census/2011census/2011censusdata/2011censususerguide/qualityandmethods 17,Hours worked per week,Hours worked per week,Oriau a weithir yr wythnos,"The number of hours that a person aged 16 to 74, in employment in the week before the census, worked in their main job. This includes paid and unpaid overtime.","Nifer yr oriau yr oedd person 16 i 74 oed, mewn cyflogaeth yn ystod yr wythnos cyn y cyfrifiad, yn gweithio yn eu prif swydd. 
Mae hyn yn cynnwys goramser taledig a di-dâl.",Hours worked per week,,,"Indicator of comparability: Broadly comparable @@ -559,9 +559,9 @@ Ydw. Mae data ar gyfer Cymru, Lloegr, yr Alban a Gogledd Iwerddon ar gael gan fo - rhan-amser: 15 awr neu lai yn gweithio - rhan-amser: 16 i 30 awr yn gweithio - Gweithiodd 31 i 48 awr amser llawn -- amser llawn 49 neu fwy o oriau yn gweithio",,PUB,SV,LHQ,1,People,,,,,,,1,"The census is the most complete source of information about the population that we have. Every effort is made to include everyone. It is the only survey which provides a detailed picture of the entire population, and is unique because it covers everyone at the same time and asks the same core questions everywhere. This makes it easy to compare different parts of the country. +- amser llawn 49 neu fwy o oriau yn gweithio",Y,PUB,SV,LHQ,1,People,,,,,,,1,"The census is the most complete source of information about the population that we have. Every effort is made to include everyone. It is the only survey which provides a detailed picture of the entire population, and is unique because it covers everyone at the same time and asks the same core questions everywhere. This makes it easy to compare different parts of the country. However no census is perfect and some people are inevitably missed. ONS therefore uses complex statistical techniques to adjust the 2011 Census counts for those people missed by the census. The methods and quality assurance approach was researched and developed in consultation with academics, statisticians, demographers and users of census data. The result was a suite of methods to process, clean, adjust and protect the census results.",https://www.ons.gov.uk/census/2011census/2011censusdata/2011censususerguide/qualityandmethods -18,Approximated Social Grade,Approximated Social Grade,Gradd gymdeithasol amcangyfrifedig,"Social Grade is the socio-economic classification used by the Market Research and Marketing Industries, most often in the analysis of spending habits and consumer attitudes. Although it is not possible to allocate Social Grade precisely from information collected by the 2011 Census, the Market Research Society has developed a method for using Census information to provide a good approximation of Social Grade.","Gradd Gymdeithasol yw'r dosbarthiad economaidd-gymdeithasol a ddefnyddir gan y Diwydiannau Ymchwil i'r Farchnad a Marchnata, gan amlaf wrth ddadansoddi arferion gwario ac agweddau defnyddwyr. Er nad yw'n bosibl dyrannu Gradd Gymdeithasol yn union o wybodaeth a gasglwyd gan Gyfrifiad 2011, mae'r Gymdeithas Ymchwil i'r Farchnad wedi datblygu dull i forddefnyddio gwybodaeth y Cyfrifiad i ddarparu brasamcan da o Radd Gymdeithasol.",Approximated Social Grade,The census concept of approximated social grade is equivalent to the socio-economic classification used by the Market Research and Marketing Industries.,Mae'r cysyniad cyfrifiad o radd gymdeithasol amcangyfrifedig yn gyfwerth â'r dosbarthiad economaidd-gymdeithasol a ddefnyddir gan y diwydiannau ymchwil a marchnata marchnata.,,,,PUB,DVO,LHQ,1,People,,,,,,,1,"The census is the most complete source of information about the population that we have. Every effort is made to include everyone. It is the only survey which provides a detailed picture of the entire population, and is unique because it covers everyone at the same time and asks the same core questions everywhere. This makes it easy to compare different parts of the country. 
+18,Approximated Social Grade,Approximated Social Grade,Gradd gymdeithasol amcangyfrifedig,"Social Grade is the socio-economic classification used by the Market Research and Marketing Industries, most often in the analysis of spending habits and consumer attitudes. Although it is not possible to allocate Social Grade precisely from information collected by the 2011 Census, the Market Research Society has developed a method for using Census information to provide a good approximation of Social Grade.","Gradd Gymdeithasol yw'r dosbarthiad economaidd-gymdeithasol a ddefnyddir gan y Diwydiannau Ymchwil i'r Farchnad a Marchnata, gan amlaf wrth ddadansoddi arferion gwario ac agweddau defnyddwyr. Er nad yw'n bosibl dyrannu Gradd Gymdeithasol yn union o wybodaeth a gasglwyd gan Gyfrifiad 2011, mae'r Gymdeithas Ymchwil i'r Farchnad wedi datblygu dull i forddefnyddio gwybodaeth y Cyfrifiad i ddarparu brasamcan da o Radd Gymdeithasol.",Approximated Social Grade,The census concept of approximated social grade is equivalent to the socio-economic classification used by the Market Research and Marketing Industries.,Mae'r cysyniad cyfrifiad o radd gymdeithasol amcangyfrifedig yn gyfwerth â'r dosbarthiad economaidd-gymdeithasol a ddefnyddir gan y diwydiannau ymchwil a marchnata marchnata.,,,Y,PUB,DVO,LHQ,1,People,,,,,,,1,"The census is the most complete source of information about the population that we have. Every effort is made to include everyone. It is the only survey which provides a detailed picture of the entire population, and is unique because it covers everyone at the same time and asks the same core questions everywhere. This makes it easy to compare different parts of the country. However no census is perfect and some people are inevitably missed. ONS therefore uses complex statistical techniques to adjust the 2011 Census counts for those people missed by the census. The methods and quality assurance approach was researched and developed in consultation with academics, statisticians, demographers and users of census data. The result was a suite of methods to process, clean, adjust and protect the census results.",https://www.ons.gov.uk/census/2011census/2011censusdata/2011censususerguide/qualityandmethods diff --git a/test/expected/dataset-metadata-best-effort.json b/test/expected/dataset-metadata-best-effort.json new file mode 100644 index 0000000..b259df0 --- /dev/null +++ b/test/expected/dataset-metadata-best-effort.json @@ -0,0 +1,436 @@ +[ + { + "name": "base", + "label": "Base dataset with metadata for all variables", + "lang": "en", + "description": "This is a base dataset containing metadata for all variables used across all other datasets. 
Other datasets include it to avoid duplicating metadata.", + "meta": { + "Source": { + "Source_Mnemonic": "Census2021", + "Source_Description": "The 2021 England and Wales Census", + "Version": "1" + }, + "Version": "1" + }, + "vars": [ + { + "name": "CLASS1", + "label": "CLASS1 Label Internal", + "description": "VAR1 Description", + "meta": { + "Mnemonic_2011": null, + "Parent_Classification_Mnemonic": null, + "Default_Classification_Flag": null, + "Version": "1", + "ONS_Variable": { + "Variable_Mnemonic": "VAR1", + "Variable_Mnemonic_2011": null, + "Version": "1", + "Quality_Statement_Text": null, + "Quality_Summary_URL": null, + "Variable_Title": "VAR1 Title", + "Comparability_Comments": "VAR1 Comparability Comments", + "Uk_Comparison_Comments": "VAR1 UK Comparison Comments", + "Geographic_Abbreviation": null, + "Geographic_Theme": null, + "Geographic_Coverage": null, + "Variable_Type": { + "Variable_Type_Code": "DVO", + "Variable_Type_Description": "Derived variable" + }, + "Statistical_Unit": null, + "Topic": null, + "Keywords": [], + "Questions": [] + }, + "Topics": [] + }, + "catLabels": null + }, + { + "name": "CLASS3", + "label": "CLASS3 Label Internal", + "description": "VAR3 Description", + "meta": { + "Mnemonic_2011": null, + "Parent_Classification_Mnemonic": null, + "Default_Classification_Flag": null, + "Version": "1", + "ONS_Variable": { + "Variable_Mnemonic": "VAR3", + "Variable_Mnemonic_2011": null, + "Version": "1", + "Quality_Statement_Text": null, + "Quality_Summary_URL": null, + "Variable_Title": "VAR3 Title", + "Comparability_Comments": "VAR3 Comparability Comments", + "Uk_Comparison_Comments": "VAR3 UK Comparison Comments", + "Geographic_Abbreviation": null, + "Geographic_Theme": null, + "Geographic_Coverage": null, + "Variable_Type": { + "Variable_Type_Code": "DVO", + "Variable_Type_Description": "Derived variable" + }, + "Statistical_Unit": null, + "Topic": null, + "Keywords": [], + "Questions": [] + }, + "Topics": [] + }, + "catLabels": null + }, + { + "name": "GEO1", + "label": "GEO1 Title", + "description": "GEO1 Description", + "meta": { + "Mnemonic_2011": null, + "Parent_Classification_Mnemonic": "GEO1", + "Default_Classification_Flag": null, + "Version": "1", + "ONS_Variable": { + "Variable_Mnemonic": "GEO1", + "Variable_Mnemonic_2011": "GEO1 2011", + "Version": "1", + "Quality_Statement_Text": null, + "Quality_Summary_URL": null, + "Variable_Title": "GEO1 Title", + "Comparability_Comments": "GEO1 Comparability Comments", + "Uk_Comparison_Comments": "GEO1 UK Comparison Comments", + "Geographic_Abbreviation": "G1", + "Geographic_Theme": "GEO1 Theme", + "Geographic_Coverage": "GEO1 Coverage", + "Variable_Type": { + "Variable_Type_Code": "GEOG", + "Variable_Type_Description": "Geographic variable" + }, + "Statistical_Unit": null, + "Topic": null, + "Keywords": [], + "Questions": [] + }, + "Topics": [] + }, + "catLabels": null + }, + { + "name": "GEO2", + "label": "GEO2 Title", + "description": "GEO2 Description", + "meta": { + "Mnemonic_2011": null, + "Parent_Classification_Mnemonic": "GEO2", + "Default_Classification_Flag": null, + "Version": "1", + "ONS_Variable": { + "Variable_Mnemonic": "GEO2", + "Variable_Mnemonic_2011": "GEO2 2011", + "Version": "1", + "Quality_Statement_Text": null, + "Quality_Summary_URL": null, + "Variable_Title": "GEO2 Title", + "Comparability_Comments": "GEO2 Comparability Comments", + "Uk_Comparison_Comments": "GEO2 UK Comparison Comments", + "Geographic_Abbreviation": "G1", + "Geographic_Theme": "GEO2 Theme", + 
"Geographic_Coverage": "GEO2 Coverage", + "Variable_Type": { + "Variable_Type_Code": "GEOG", + "Variable_Type_Description": "Geographic variable" + }, + "Statistical_Unit": null, + "Topic": null, + "Keywords": [], + "Questions": [] + }, + "Topics": [] + }, + "catLabels": null + }, + { + "name": "GEO3", + "label": "GEO3 Title", + "description": "GEO3 Description", + "meta": { + "Mnemonic_2011": null, + "Parent_Classification_Mnemonic": "GEO3", + "Default_Classification_Flag": null, + "Version": "1", + "ONS_Variable": { + "Variable_Mnemonic": "GEO3", + "Variable_Mnemonic_2011": "GEO3 2011", + "Version": "1", + "Quality_Statement_Text": null, + "Quality_Summary_URL": null, + "Variable_Title": "GEO3 Title", + "Comparability_Comments": "GEO3 Comparability Comments", + "Uk_Comparison_Comments": "GEO3 UK Comparison Comments", + "Geographic_Abbreviation": "G1", + "Geographic_Theme": "GEO3 Theme", + "Geographic_Coverage": "GEO3 Coverage", + "Variable_Type": { + "Variable_Type_Code": "GEOG", + "Variable_Type_Description": "Geographic variable" + }, + "Statistical_Unit": null, + "Topic": null, + "Keywords": [], + "Questions": [] + }, + "Topics": [] + }, + "catLabels": null + } + ] + }, + { + "name": "base", + "label": "Base dataset with metadata for all variables in Welsh", + "lang": "cy", + "description": "This is the Welsh version of the base dataset containing metadata for all variables.", + "meta": { + "Source": { + "Source_Mnemonic": "Census2021", + "Source_Description": "The 2021 England and Wales Census", + "Version": "1" + }, + "Version": "1" + }, + "vars": [ + { + "name": "CLASS1", + "label": "CLASS1 Label Internal", + "description": "VAR1 Description", + "meta": { + "Mnemonic_2011": null, + "Parent_Classification_Mnemonic": null, + "Default_Classification_Flag": null, + "Version": "1", + "ONS_Variable": { + "Variable_Mnemonic": "VAR1", + "Variable_Mnemonic_2011": null, + "Version": "1", + "Quality_Statement_Text": null, + "Quality_Summary_URL": null, + "Variable_Title": "VAR1 Title", + "Comparability_Comments": "VAR1 Comparability Comments", + "Uk_Comparison_Comments": "VAR1 UK Comparison Comments", + "Geographic_Abbreviation": null, + "Geographic_Theme": null, + "Geographic_Coverage": null, + "Variable_Type": { + "Variable_Type_Code": "DVO", + "Variable_Type_Description": "Derived variable" + }, + "Statistical_Unit": null, + "Topic": null, + "Keywords": [], + "Questions": [] + }, + "Topics": [] + }, + "catLabels": null + }, + { + "name": "CLASS3", + "label": "CLASS3 Label Internal", + "description": "VAR3 Description", + "meta": { + "Mnemonic_2011": null, + "Parent_Classification_Mnemonic": null, + "Default_Classification_Flag": null, + "Version": "1", + "ONS_Variable": { + "Variable_Mnemonic": "VAR3", + "Variable_Mnemonic_2011": null, + "Version": "1", + "Quality_Statement_Text": null, + "Quality_Summary_URL": null, + "Variable_Title": "VAR3 Title", + "Comparability_Comments": "VAR3 Comparability Comments", + "Uk_Comparison_Comments": "VAR3 UK Comparison Comments", + "Geographic_Abbreviation": null, + "Geographic_Theme": null, + "Geographic_Coverage": null, + "Variable_Type": { + "Variable_Type_Code": "DVO", + "Variable_Type_Description": "Derived variable" + }, + "Statistical_Unit": null, + "Topic": null, + "Keywords": [], + "Questions": [] + }, + "Topics": [] + }, + "catLabels": null + }, + { + "name": "GEO1", + "label": "GEO1 Title (Welsh)", + "description": "GEO1 Description (Welsh)", + "meta": { + "Mnemonic_2011": null, + "Parent_Classification_Mnemonic": "GEO1", + 
"Default_Classification_Flag": null, + "Version": "1", + "ONS_Variable": { + "Variable_Mnemonic": "GEO1", + "Variable_Mnemonic_2011": "GEO1 2011", + "Version": "1", + "Quality_Statement_Text": null, + "Quality_Summary_URL": null, + "Variable_Title": "GEO1 Title (Welsh)", + "Comparability_Comments": "GEO1 Comparability Comments (Welsh)", + "Uk_Comparison_Comments": "GEO1 UK Comparison Comments (Welsh)", + "Geographic_Abbreviation": "G1 (Welsh)", + "Geographic_Theme": "GEO1 Theme (Welsh)", + "Geographic_Coverage": "GEO1 Coverage (Welsh)", + "Variable_Type": { + "Variable_Type_Code": "GEOG", + "Variable_Type_Description": "Geographic variable (Welsh)" + }, + "Statistical_Unit": null, + "Topic": null, + "Keywords": [], + "Questions": [] + }, + "Topics": [] + }, + "catLabels": null + }, + { + "name": "GEO2", + "label": "GEO2 Title (Welsh)", + "description": "GEO2 Description (Welsh)", + "meta": { + "Mnemonic_2011": null, + "Parent_Classification_Mnemonic": "GEO2", + "Default_Classification_Flag": null, + "Version": "1", + "ONS_Variable": { + "Variable_Mnemonic": "GEO2", + "Variable_Mnemonic_2011": "GEO2 2011", + "Version": "1", + "Quality_Statement_Text": null, + "Quality_Summary_URL": null, + "Variable_Title": "GEO2 Title (Welsh)", + "Comparability_Comments": "GEO2 Comparability Comments (Welsh)", + "Uk_Comparison_Comments": "GEO2 UK Comparison Comments (Welsh)", + "Geographic_Abbreviation": "G1 (Welsh)", + "Geographic_Theme": "GEO2 Theme (Welsh)", + "Geographic_Coverage": "GEO2 Coverage (Welsh)", + "Variable_Type": { + "Variable_Type_Code": "GEOG", + "Variable_Type_Description": "Geographic variable (Welsh)" + }, + "Statistical_Unit": null, + "Topic": null, + "Keywords": [], + "Questions": [] + }, + "Topics": [] + }, + "catLabels": null + }, + { + "name": "GEO3", + "label": "GEO3 Title (Welsh)", + "description": "GEO3 Description (Welsh)", + "meta": { + "Mnemonic_2011": null, + "Parent_Classification_Mnemonic": "GEO3", + "Default_Classification_Flag": null, + "Version": "1", + "ONS_Variable": { + "Variable_Mnemonic": "GEO3", + "Variable_Mnemonic_2011": "GEO3 2011", + "Version": "1", + "Quality_Statement_Text": null, + "Quality_Summary_URL": null, + "Variable_Title": "GEO3 Title (Welsh)", + "Comparability_Comments": "GEO3 Comparability Comments (Welsh)", + "Uk_Comparison_Comments": "GEO3 UK Comparison Comments (Welsh)", + "Geographic_Abbreviation": "G1 (Welsh)", + "Geographic_Theme": "GEO3 Theme (Welsh)", + "Geographic_Coverage": "GEO3 Coverage (Welsh)", + "Variable_Type": { + "Variable_Type_Code": "GEOG", + "Variable_Type_Description": "Geographic variable (Welsh)" + }, + "Statistical_Unit": null, + "Topic": null, + "Keywords": [], + "Questions": [] + }, + "Topics": [] + }, + "catLabels": null + } + ] + }, + { + "name": "DB1", + "incl": [ + { + "name": "base", + "lang": "en" + } + ], + "label": "DB1 Title", + "description": "DB1 Description", + "lang": "en", + "meta": { + "Cantabular_DB_Flag": null, + "Version": "1", + "Source": { + "Source_Mnemonic": "SRC1", + "Copyright_Statement": null, + "Licence": null, + "Nationals_Statistic_Certified": null, + "Methodology_Link": null, + "SDC_Link": null, + "Version": "1", + "Source_Description": "SRC1 Description", + "Methodology_Statement": null, + "SDC_Statement": null, + "Contact": null + }, + "Lowest_Geog_Variable": "GEO1" + }, + "vars": null + }, + { + "name": "DB1", + "incl": [ + { + "name": "base", + "lang": "cy" + } + ], + "label": "DB1 Title", + "description": "DB1 Description", + "lang": "cy", + "meta": { + "Cantabular_DB_Flag": null, + 
"Version": "1", + "Source": { + "Source_Mnemonic": "SRC1", + "Copyright_Statement": null, + "Licence": null, + "Nationals_Statistic_Certified": null, + "Methodology_Link": null, + "SDC_Link": null, + "Version": "1", + "Source_Description": "SRC1 Description", + "Methodology_Statement": null, + "SDC_Statement": null, + "Contact": null + }, + "Lowest_Geog_Variable": "GEO1" + }, + "vars": null + } +] diff --git a/test/expected/dataset-metadata-no-geo.json b/test/expected/dataset-metadata-no-geo.json index 5cae910..0cb7694 100644 --- a/test/expected/dataset-metadata-no-geo.json +++ b/test/expected/dataset-metadata-no-geo.json @@ -7,7 +7,8 @@ "meta": { "Source": { "Source_Mnemonic": "Census2021", - "Source_Description": "The 2021 England and Wales Census" + "Source_Description": "The 2021 England and Wales Census", + "Version": "1" }, "Version": "1" }, @@ -245,7 +246,8 @@ "meta": { "Source": { "Source_Mnemonic": "Census2021", - "Source_Description": "The 2021 England and Wales Census" + "Source_Description": "The 2021 England and Wales Census", + "Version": "1" }, "Version": "1" }, @@ -357,7 +359,9 @@ }, "Topics": [] }, - "catLabels": null + "catLabels": { + "CODE2-1": "LABEL2-1 (Welsh)" + } }, { "name": "CLASS3", @@ -678,4 +682,4 @@ }, "vars": null } -] +] \ No newline at end of file diff --git a/test/expected/dataset-metadata.json b/test/expected/dataset-metadata.json index 1bae173..5154346 100644 --- a/test/expected/dataset-metadata.json +++ b/test/expected/dataset-metadata.json @@ -7,7 +7,8 @@ "meta": { "Source": { "Source_Mnemonic": "Census2021", - "Source_Description": "The 2021 England and Wales Census" + "Source_Description": "The 2021 England and Wales Census", + "Version": "1" }, "Version": "1" }, @@ -245,7 +246,8 @@ "meta": { "Source": { "Source_Mnemonic": "Census2021", - "Source_Description": "The 2021 England and Wales Census" + "Source_Description": "The 2021 England and Wales Census", + "Version": "1" }, "Version": "1" }, @@ -357,7 +359,9 @@ }, "Topics": [] }, - "catLabels": null + "catLabels": { + "CODE2-1": "LABEL2-1 (Welsh)" + } }, { "name": "CLASS3", @@ -682,4 +686,4 @@ }, "vars": null } -] +] \ No newline at end of file diff --git a/test/expected/table-metadata-best-effort.json b/test/expected/table-metadata-best-effort.json new file mode 100644 index 0000000..d304b86 --- /dev/null +++ b/test/expected/table-metadata-best-effort.json @@ -0,0 +1,62 @@ +[ + { + "name": "DS1", + "datasetName": "DB1", + "vars": [ + "GEO1", + "CLASS1" + ], + "ref": [ + { + "lang": "en", + "label": "DS1 Title", + "description": "DS1 Description", + "meta": { + "Dataset_Mnemonic_2011": null, + "Last_Updated": null, + "Unique_Url": null, + "Version": "1", + "Geographic_Coverage": "Everywhere", + "Dataset_Population": "Everyone", + "Statistical_Unit": { + "Statistical_Unit": "Houses", + "Statistical_Unit_Description": "House Description" + }, + "Contact": null, + "Keywords": [], + "Related_Datasets": [], + "Census_Releases": [], + "Publications": [], + "Alternate_Geographic_Variables": [ + "GEO3" + ] + } + }, + { + "lang": "cy", + "label": "DS1 Title", + "description": "DS1 Description", + "meta": { + "Dataset_Mnemonic_2011": null, + "Last_Updated": null, + "Unique_Url": null, + "Version": "1", + "Geographic_Coverage": "Everywhere", + "Dataset_Population": "Everyone", + "Statistical_Unit": { + "Statistical_Unit": "Houses", + "Statistical_Unit_Description": "House Description" + }, + "Contact": null, + "Keywords": [], + "Related_Datasets": [], + "Census_Releases": [], + "Publications": [], + 
"Alternate_Geographic_Variables": [ + "GEO3" + ] + } + } + ] + } +] diff --git a/test/expected/table-metadata.json b/test/expected/table-metadata.json index cee40cf..a3bee43 100644 --- a/test/expected/table-metadata.json +++ b/test/expected/table-metadata.json @@ -64,6 +64,9 @@ "Publisher_Name": null, "Publisher_Website": null } + ], + "Alternate_Geographic_Variables": [ + "GEO2" ] } }, @@ -122,6 +125,9 @@ "Publisher_Name": null, "Publisher_Website": null } + ], + "Alternate_Geographic_Variables": [ + "GEO2" ] } } @@ -159,7 +165,8 @@ "Release_Date": "1/1/2022" } ], - "Publications": [] + "Publications": [], + "Alternate_Geographic_Variables": [] } }, { @@ -187,7 +194,8 @@ "Release_Date": "1/1/2022" } ], - "Publications": [] + "Publications": [], + "Alternate_Geographic_Variables": [] } } ] @@ -224,7 +232,8 @@ "Keywords": [], "Related_Datasets": [], "Census_Releases": [], - "Publications": [] + "Publications": [], + "Alternate_Geographic_Variables": [] } }, { @@ -252,7 +261,8 @@ "Keywords": [], "Related_Datasets": [], "Census_Releases": [], - "Publications": [] + "Publications": [], + "Alternate_Geographic_Variables": [] } } ] @@ -283,7 +293,8 @@ "Keywords": [], "Related_Datasets": [], "Census_Releases": [], - "Publications": [] + "Publications": [], + "Alternate_Geographic_Variables": [] } }, { @@ -305,9 +316,10 @@ "Keywords": [], "Related_Datasets": [], "Census_Releases": [], - "Publications": [] + "Publications": [], + "Alternate_Geographic_Variables": [] } } ] } -] +] \ No newline at end of file diff --git a/test/test_best_effort.py b/test/test_best_effort.py new file mode 100644 index 0000000..5f9d46d --- /dev/null +++ b/test/test_best_effort.py @@ -0,0 +1,79 @@ +import json +import unittest.mock +import unittest +import pathlib +import os +import logging +from io import StringIO +from datetime import date +import ons_csv_to_ctb_json_main + +FILENAME_TABLES = 'cantabm_v9-3-0_best-effort_tables-md_19700101-1.json' +FILENAME_DATASET = 'cantabm_v9-3-0_best-effort_dataset-md_19700101-1.json' +FILENAME_SERVICE = 'cantabm_v9-3-0_best-effort_service-md_19700101-1.json' + +class TestBestEffort(unittest.TestCase): + @unittest.mock.patch('ons_csv_to_ctb_json_main.date') + def test_generated_json_best_effort(self, mock_date): + """Generate JSON from source CSV and compare it with expected values.""" + mock_date.today.return_value = date(1970, 1, 1) + mock_date.side_effect = lambda *args, **kw: date(*args, **kw) + + file_dir = pathlib.Path(__file__).parent.resolve() + input_dir = os.path.join(file_dir, 'testdata/best_effort') + output_dir = os.path.join(file_dir, 'out') + + with self.assertLogs(level='WARNING') as cm: + with unittest.mock.patch('sys.argv', ['test', '-i', input_dir, '-o', output_dir, '-m', 'best-effort', '--best-effort']): + ons_csv_to_ctb_json_main.main() + with open(os.path.join(output_dir, FILENAME_SERVICE)) as f: + service_metadata = json.load(f) + with open(os.path.join(file_dir, 'expected/service-metadata.json')) as f: + expected_service_metadata = json.load(f) + self.assertEqual(service_metadata, expected_service_metadata) + + with open(os.path.join(output_dir, FILENAME_DATASET)) as f: + dataset_metadata = json.load(f) + with open(os.path.join(file_dir, 'expected/dataset-metadata-best-effort.json')) as f: + expected_dataset_metadata = json.load(f) + self.assertEqual(dataset_metadata, expected_dataset_metadata) + + with open(os.path.join(output_dir, FILENAME_TABLES)) as f: + table_metadata = json.load(f) + with open(os.path.join(file_dir, 
'expected/table-metadata-best-effort.json')) as f: + expected_table_metadata = json.load(f) + self.assertEqual(table_metadata, expected_table_metadata) + + warnings = [ + r'Classification.csv:3 no value supplied for required field Variable_Mnemonic', + r'Classification.csv:3 dropping record', + r'Classification.csv:4 duplicate value CLASS1 for Classification_Mnemonic', + r'Classification.csv:4 dropping record', + r'Classification.csv:5 invalid value x for Number_Of_Category_Items', + r'Classification.csv:5 ignoring field Number_Of_Category_Items', + r'Category.csv Unexpected number of categories for CLASS1: expected 4 but found 1', + r'Database_Variable.csv Lowest_Geog_Variable_Flag set on GEO3 and GEO1 for database DB1', + r'Dataset_Variable.csv:4 duplicate value combo DS1/VAR1 for Dataset_Mnemonic/Variable_Mnemonic', + r'Dataset_Variable.csv:4 dropping record', + r'Dataset_Variable.csv:2 Lowest_Geog_Variable_Flag set on non-geographic variable VAR1 for dataset DS1', + r'Dataset_Variable.csv:2 Processing_Priority not specified for classification CLASS1 in dataset DS1', + r'Dataset_Variable.csv:2 using 0 for Processing_Priority', + r'Dataset_Variable.csv:3 Classification_Mnemonic must not be specified for geographic variable GEO1 in dataset DS1', + r'Dataset_Variable.csv:3 Processing_Priority must not be specified for geographic variable GEO1 in dataset DS1', + r'Dataset_Variable.csv:5 Lowest_Geog_Variable_Flag set on variable GEO2 and GEO1 for dataset DS1', + r'Dataset_Variable.csv:7 Classification must be specified for non-geographic VAR2 in dataset DS1', + r'Dataset_Variable.csv:7 dropping record', + r'Dataset_Variable.csv:8 Invalid classification CLASS1 specified for variable VAR3 in dataset DS1', + r'Dataset_Variable.csv:8 dropping record', + r'Dataset_Variable.csv Invalid processing_priorities \[0\] for dataset DS1', + r'Dataset.csv:3 DS2 has classification CLASS3 that is not in database DB1', + r'Dataset.csv:3 dropping record', + r'Dataset.csv:4 DS3 has no associated classifications or geographic variable', + r'Dataset.csv:4 dropping record', + r'16 errors were encountered during processing', + ] + + self.assertEqual(len(warnings), len(cm.output)) + for i, warning in enumerate(cm.output): + self.assertRegex(warning, warnings[i]) + diff --git a/test/test_classification.py b/test/test_classification.py index b17ca47..574aeaf 100644 --- a/test/test_classification.py +++ b/test/test_classification.py @@ -14,9 +14,9 @@ 'Security_Mnemonic': 'PUB', 'Variable_Mnemonic': 'VAR1', 'Internal_Classification_Label_English': 'label', - 'Number_Of_Category_Items': '1', 'Version': '1', - 'Id': '1'} + 'Id': '1', + 'Signed_Off_Flag': 'N'} INPUT_DIR = os.path.join(pathlib.Path(__file__).parent.resolve(), 'testdata') @@ -37,7 +37,8 @@ def test_required_fields(self): self.run_test([row], f'^Reading {FILENAME}:2 no value supplied for required field {field}$') def test_invalid_values(self): - for field in ['Security_Mnemonic', 'Variable_Mnemonic', 'Number_Of_Category_Items']: + for field in ['Security_Mnemonic', 'Variable_Mnemonic', 'Number_Of_Category_Items', + 'Signed_Off_Flag']: with self.subTest(field=field): row = REQUIRED_FIELDS.copy() row[field] = 'X' diff --git a/test/test_contact.py b/test/test_contact.py index ac51e6b..f5a2adb 100644 --- a/test/test_contact.py +++ b/test/test_contact.py @@ -36,4 +36,4 @@ def test_duplicate_contact_id(self): if __name__ == '__main__': - unittest.main() \ No newline at end of file + unittest.main() diff --git a/test/test_csv_read.py b/test/test_csv_read.py index 
ce9236d..f68553d 100644 --- a/test/test_csv_read.py +++ b/test/test_csv_read.py @@ -16,6 +16,8 @@ def validate_fn(value): return validate_fn +def raise_error(msg): + raise ValueError(msg) class TestCSVRead(unittest.TestCase): @unittest.mock.patch('builtins.open', new_callable=mock_open, read_data="""id,name,email,age @@ -29,7 +31,7 @@ def test_read_file(self, m): required('age', validate_fn=isoneof(['40', '50'])), required('id'), ] - data = Reader('file.csv', columns).read() + data = Reader('file.csv', columns, raise_error).read() self.assertEqual(data, [ ({'name': 'bob', 'email': 'bob@bob.com', 'age': '40', 'id': '1'}, 2), @@ -43,7 +45,7 @@ def test_extra_fields(self, m): required('name'), required('id'), ] - data = Reader('file.csv', columns).read() + data = Reader('file.csv', columns, raise_error).read() self.assertEqual(data, [ ({'name': 'bob', 'id': '1'}, 2)]) @@ -58,7 +60,7 @@ def test_missing_fields(self, m): required('id'), ] with self.assertRaisesRegex(ValueError, 'Reading file.csv: missing expected columns: email, id'): - Reader('file.csv', columns).read() + Reader('file.csv', columns, raise_error).read() @unittest.mock.patch('builtins.open', new_callable=mock_open, read_data="""name,email bob,bob@bob.com @@ -69,8 +71,8 @@ def test_too_many_columns(self, m): required('name'), required('email'), ] - with self.assertRaisesRegex(ValueError, 'Reading file.csv: too many fields on line 3'): - Reader('file.csv', columns).read() + with self.assertRaisesRegex(ValueError, 'Reading file.csv: too many fields on row 3'): + Reader('file.csv', columns, raise_error).read() @unittest.mock.patch('builtins.open', new_callable=mock_open, read_data="""name,email bob,bob@bob.com @@ -81,8 +83,8 @@ def test_too_few_columns(self, m): required('name'), required('email'), ] - with self.assertRaisesRegex(ValueError, 'Reading file.csv: too few fields on line 3'): - Reader('file.csv', columns).read() + with self.assertRaisesRegex(ValueError, 'Reading file.csv: too few fields on row 3'): + Reader('file.csv', columns, raise_error).read() @unittest.mock.patch('builtins.open', new_callable=mock_open, read_data="""name bob @@ -94,7 +96,7 @@ def test_invalid_value(self, m): required('name', validate_fn=isoneof(['bob', 'bill'])), ] with self.assertRaisesRegex(ValueError, 'Reading file.csv:4 invalid value ben for name'): - Reader('file.csv', columns).read() + Reader('file.csv', columns, raise_error).read() @unittest.mock.patch('builtins.open', new_callable=mock_open, read_data="""name bob @@ -106,7 +108,7 @@ def test_non_unique_value(self, m): required('name', unique=True), ] with self.assertRaisesRegex(ValueError, 'Reading file.csv:4 duplicate value bob for name'): - Reader('file.csv', columns).read() + Reader('file.csv', columns, raise_error).read() @unittest.mock.patch('builtins.open', new_callable=mock_open, read_data="""name,email bob,bob@bob.com @@ -118,7 +120,7 @@ def test_empty_rows(self, m): required('name'), required('email'), ] - data = Reader('file.csv', columns).read() + data = Reader('file.csv', columns, raise_error).read() self.assertEqual(data, [ ({'name': 'bob', 'email': 'bob@bob.com'}, 2)]) @@ -133,7 +135,7 @@ def test_whitespace(self, m): required('name'), required('id'), ] - data = Reader('file.csv', columns).read() + data = Reader('file.csv', columns, raise_error).read() self.assertEqual(data, [ ({'name': 'bob', 'id': '1'}, 2), @@ -153,7 +155,7 @@ def test_unique_combos(self, m): required('id'), ] with self.assertRaisesRegex(ValueError, 'Reading file.csv:6 duplicate value combo bob/1 for 
name/id'): - Reader('file.csv', columns, unique_combo_fields=['name', 'id']).read() + Reader('file.csv', columns, raise_error, unique_combo_fields=['name', 'id']).read() if __name__ == '__main__': diff --git a/test/test_dataset.py b/test/test_dataset.py index 3c085b7..9ab18dc 100644 --- a/test/test_dataset.py +++ b/test/test_dataset.py @@ -18,7 +18,8 @@ 'Dataset_Population': 'population', 'Id': '1', 'Statistical_Unit': 'People', - 'Version': '1'} + 'Version': '1', + 'Signed_Off_Flag': 'N'} REQUIRED_FIELDS = {'Dataset_Mnemonic': 'DS1', 'Database_Mnemonic': 'DB1', @@ -44,7 +45,8 @@ def test_required_fields(self): self.run_test([row], f'^Reading {FILENAME}:2 no value supplied for required field {field}$') def test_invalid_values(self): - for field in ['Security_Mnemonic', 'Database_Mnemonic', 'Contact_Id', 'Statistical_Unit']: + for field in ['Security_Mnemonic', 'Database_Mnemonic', 'Contact_Id', 'Statistical_Unit', + 'Signed_Off_Flag']: with self.subTest(field=field): row = REQUIRED_FIELDS.copy() row[field] = 'X' diff --git a/test/test_dataset_classifications.py b/test/test_dataset_classifications.py index 5fa1d39..9d563da 100644 --- a/test/test_dataset_classifications.py +++ b/test/test_dataset_classifications.py @@ -63,38 +63,38 @@ def test_classification_on_geo_var(self): self.run_test( [{'Dataset_Mnemonic': 'DS1', 'Variable_Mnemonic': 'GEO1', 'Id': '1', 'Classification_Mnemonic': 'GEO1'}], - f'^Reading {FILENAME} Classification_Mnemonic must not be specified for geographic variable GEO1 in dataset DS1$') + f'^Reading {FILENAME}:2 Classification_Mnemonic must not be specified for geographic variable GEO1 in dataset DS1$') def test_processing_priority_on_geo_var(self): self.run_test( [{'Dataset_Mnemonic': 'DS1', 'Variable_Mnemonic': 'GEO1', 'Id': '1', 'Processing_Priority': '1'}], - f'^Reading {FILENAME} Processing_Priority must not be specified for geographic variable GEO1 in dataset DS1$') + f'^Reading {FILENAME}:2 Processing_Priority must not be specified for geographic variable GEO1 in dataset DS1$') def test_no_classification_on_non_geo_var(self): self.run_test( [{'Dataset_Mnemonic': 'DS1', 'Variable_Mnemonic': 'VAR1', 'Id': '1', 'Processing_Priority': '1'}], - f'^Reading {FILENAME} Classification must be specified for non-geographic VAR1 in dataset DS1$') + f'^Reading {FILENAME}:2 Classification must be specified for non-geographic VAR1 in dataset DS1$') def test_no_processing_priority_on_non_geo_var(self): self.run_test( [{'Dataset_Mnemonic': 'DS1', 'Variable_Mnemonic': 'VAR1', 'Id': '1', 'Classification_Mnemonic': 'CLASS1'}], - f'^Reading {FILENAME} Processing_Priority not specified for classification CLASS1 in dataset DS1$') + f'^Reading {FILENAME}:2 Processing_Priority not specified for classification CLASS1 in dataset DS1$') def test_lowest_geog_on_non_geo_var(self): self.run_test( [{'Dataset_Mnemonic': 'DS1', 'Variable_Mnemonic': 'VAR1', 'Id': '1', 'Classification_Mnemonic': 'CLASS1', 'Processing_Priority': '1', 'Lowest_Geog_Variable_Flag': 'Y'}], - f'^Reading {FILENAME} Lowest_Geog_Variable_Flag set on non-geographic variable VAR1 for dataset DS1$') + f'^Reading {FILENAME}:2 Lowest_Geog_Variable_Flag set on non-geographic variable VAR1 for dataset DS1$') def test_invalid_classification_on_var(self): self.run_test( [{'Dataset_Mnemonic': 'DS1', 'Variable_Mnemonic': 'VAR1', 'Id': '1', 'Classification_Mnemonic': 'CLASS2', 'Processing_Priority': '1'}], - f'^Reading {FILENAME} Invalid classification CLASS2 specified for variable VAR1 in dataset DS1$') + f'^Reading {FILENAME}:2 
Invalid classification CLASS2 specified for variable VAR1 in dataset DS1$') def test_no_lowest_geog_flag(self): self.run_test( @@ -108,7 +108,7 @@ def test_duplicate_lowest_geog_flag(self): 'Id': '1', 'Lowest_Geog_Variable_Flag': 'Y'}, {'Dataset_Mnemonic': 'DS1', 'Variable_Mnemonic': 'GEO2', 'Id': '1', 'Lowest_Geog_Variable_Flag': 'Y'}], - f'^Reading {FILENAME} Lowest_Geog_Variable_Flag set on variable GEO2 and GEO1 for dataset DS1$') + f'^Reading {FILENAME}:3 Lowest_Geog_Variable_Flag set on variable GEO2 and GEO1 for dataset DS1$') if __name__ == '__main__': diff --git a/test/test_geo_read.py b/test/test_geo_read.py index b59681b..500fd4b 100644 --- a/test/test_geo_read.py +++ b/test/test_geo_read.py @@ -64,14 +64,14 @@ def test_valid_varname_characters(self, m): OA1,LAD1,LAD1 Name,LAD1 Name (Welsh),COUNTRY1,COUNTRY1 Name,extra """) def test_too_many_columns(self, m): - with self.assertRaisesRegex(ValueError, 'Reading file.csv: too many fields on line 2'): + with self.assertRaisesRegex(ValueError, 'Reading file.csv: too many fields on row 2'): read_geo_cats('file.csv') @unittest.mock.patch('builtins.open', new_callable=mock_open, read_data="""OA11CD,LAD22CD,LAD22NM,LAD22NMW,COUNTRY22CD,COUNTRY22NM OA1,LAD1,LAD1 Name,LAD1 Name (Welsh),COUNTRY1 """) def test_too_few_columns(self, m): - with self.assertRaisesRegex(ValueError, 'Reading file.csv: too few fields on line 2'): + with self.assertRaisesRegex(ValueError, 'Reading file.csv: too few fields on row 2'): read_geo_cats('file.csv') @unittest.mock.patch('builtins.open', new_callable=mock_open, read_data="""LAD22CD,LAD22NM,LAD22NMW diff --git a/test/test_integration.py b/test/test_integration.py index d2d019c..b3e2225 100644 --- a/test/test_integration.py +++ b/test/test_integration.py @@ -3,8 +3,17 @@ import unittest import pathlib import os +from datetime import date import ons_csv_to_ctb_json_main +FILENAME_TABLES = 'cantabm_v9-3-0_unknown-metadata-version_tables-md_19700101-1.json' +FILENAME_DATASET = 'cantabm_v9-3-0_unknown-metadata-version_dataset-md_19700101-1.json' +FILENAME_SERVICE = 'cantabm_v9-3-0_unknown-metadata-version_service-md_19700101-1.json' + +FILENAME_TABLES_NO_GEO = 't_cantabm_v9-3-0_no-geo_tables-md_19700101-2.json' +FILENAME_DATASET_NO_GEO = 't_cantabm_v9-3-0_no-geo_dataset-md_19700101-2.json' +FILENAME_SERVICE_NO_GEO = 't_cantabm_v9-3-0_no-geo_service-md_19700101-2.json' + class TestIntegration(unittest.TestCase): def test_directory_validity(self): """Check that a sensible error is raised if the input/output directory is invalid.""" @@ -31,52 +40,88 @@ def test_directory_validity(self): with self.assertRaisesRegex(ValueError, expected_error): ons_csv_to_ctb_json_main.main() - def test_generated_json(self): + def test_metadata_master_version(self): + """Check that a SystemExit is raised if the metadata master version is invalid.""" + file_dir = pathlib.Path(__file__).parent.resolve() + input_dir = os.path.join(file_dir, 'testdata') + output_dir = os.path.join(file_dir, 'out') + + with unittest.mock.patch('sys.argv', ['test', '-i', input_dir, '-o', output_dir, + '-m', 'a/../b']): + with self.assertRaises(SystemExit): + ons_csv_to_ctb_json_main.main() + + def test_build_number(self): + """Check that a SystemExit is raised if the build number is invalid.""" + file_dir = pathlib.Path(__file__).parent.resolve() + input_dir = os.path.join(file_dir, 'testdata') + output_dir = os.path.join(file_dir, 'out') + + with unittest.mock.patch('sys.argv', ['test', '-i', input_dir, '-o', output_dir, + '-b', 'a']): + with 
self.assertRaises(SystemExit): + ons_csv_to_ctb_json_main.main() + + with unittest.mock.patch('sys.argv', ['test', '-i', input_dir, '-o', output_dir, + '-b', '-1']): + with self.assertRaises(SystemExit): + ons_csv_to_ctb_json_main.main() + + @unittest.mock.patch('ons_csv_to_ctb_json_main.date') + def test_generated_json(self, mock_date): """Generate JSON from source CSV and compare it with expected values.""" + mock_date.today.return_value = date(1970, 1, 1) + mock_date.side_effect = lambda *args, **kw: date(*args, **kw) + file_dir = pathlib.Path(__file__).parent.resolve() input_dir = os.path.join(file_dir, 'testdata') output_dir = os.path.join(file_dir, 'out') geo_dir = os.path.join(input_dir, 'geography/geography.csv') with unittest.mock.patch('sys.argv', ['test', '-i', input_dir, '-o', output_dir, '-g', geo_dir]): ons_csv_to_ctb_json_main.main() - with open(os.path.join(output_dir, 'service-metadata.json')) as f: + with open(os.path.join(output_dir, FILENAME_SERVICE)) as f: service_metadata = json.load(f) with open(os.path.join(file_dir, 'expected/service-metadata.json')) as f: expected_service_metadata = json.load(f) self.assertEqual(service_metadata, expected_service_metadata) - with open(os.path.join(output_dir, 'dataset-metadata.json')) as f: + with open(os.path.join(output_dir, FILENAME_DATASET)) as f: dataset_metadata = json.load(f) with open(os.path.join(file_dir, 'expected/dataset-metadata.json')) as f: expected_dataset_metadata = json.load(f) self.assertEqual(dataset_metadata, expected_dataset_metadata) - with open(os.path.join(output_dir, 'table-metadata.json')) as f: + with open(os.path.join(output_dir, FILENAME_TABLES)) as f: table_metadata = json.load(f) with open(os.path.join(file_dir, 'expected/table-metadata.json')) as f: expected_table_metadata = json.load(f) self.assertEqual(table_metadata, expected_table_metadata) - def test_no_geography_file(self): + @unittest.mock.patch('ons_csv_to_ctb_json_main.date') + def test_no_geography_file(self, mock_date): """Generate JSON from source CSV and compare it with expected values.""" + mock_date.today.return_value = date(1970, 1, 1) + mock_date.side_effect = lambda *args, **kw: date(*args, **kw) + file_dir = pathlib.Path(__file__).parent.resolve() input_dir = os.path.join(file_dir, 'testdata') output_dir = os.path.join(file_dir, 'out') - with unittest.mock.patch('sys.argv', ['test', '-i', input_dir, '-o', output_dir]): + with unittest.mock.patch('sys.argv', ['test', '-i', input_dir, '-o', output_dir, + '-m', 'no-geo', '-b', '2', '-p', 't']): ons_csv_to_ctb_json_main.main() - with open(os.path.join(output_dir, 'service-metadata.json')) as f: + with open(os.path.join(output_dir, FILENAME_SERVICE_NO_GEO)) as f: service_metadata = json.load(f) with open(os.path.join(file_dir, 'expected/service-metadata.json')) as f: expected_service_metadata = json.load(f) self.assertEqual(service_metadata, expected_service_metadata) - with open(os.path.join(output_dir, 'dataset-metadata.json')) as f: + with open(os.path.join(output_dir, FILENAME_DATASET_NO_GEO)) as f: dataset_metadata = json.load(f) with open(os.path.join(file_dir, 'expected/dataset-metadata-no-geo.json')) as f: expected_dataset_metadata = json.load(f) self.assertEqual(dataset_metadata, expected_dataset_metadata) - with open(os.path.join(output_dir, 'table-metadata.json')) as f: + with open(os.path.join(output_dir, FILENAME_TABLES_NO_GEO)) as f: table_metadata = json.load(f) with open(os.path.join(file_dir, 'expected/table-metadata.json')) as f: expected_table_metadata = 
json.load(f) diff --git a/test/test_security_classification.py b/test/test_security_classification.py index ab410c3..ddd74df 100644 --- a/test/test_security_classification.py +++ b/test/test_security_classification.py @@ -8,7 +8,8 @@ HEADERS = ['Security_Mnemonic', 'Id', 'Security_Description', 'Security_Description_Welsh'] REQUIRED_FIELDS = {'Security_Mnemonic': 'PUB', - 'Id': '1'} + 'Id': '1', + 'Security_Description': 'Public'} INPUT_DIR = os.path.join(pathlib.Path(__file__).parent.resolve(), 'testdata') @@ -35,7 +36,7 @@ def test_duplicate_security_mnemonic(self): def test_missing_public_security_classification(self): self.run_test( - [{'Security_Mnemonic': 'PRIVATE', 'Id': '1'}], + [{'Security_Mnemonic': 'PRIVATE', 'Id': '1', 'Security_Description': 'Private'}], f'^PUB not found as Security_Mnemonic for any entry in {FILENAME}$') diff --git a/test/test_source.py b/test/test_source.py index 6b037aa..09a3d6e 100644 --- a/test/test_source.py +++ b/test/test_source.py @@ -12,7 +12,8 @@ REQUIRED_FIELDS = {'Source_Mnemonic': 'SRC1', 'Source_Description': 'description', - 'Id': '1'} + 'Id': '1', + 'Version': '1'} INPUT_DIR = os.path.join(pathlib.Path(__file__).parent.resolve(), 'testdata') diff --git a/test/test_variable.py b/test/test_variable.py index 6dd9ecb..51998c0 100644 --- a/test/test_variable.py +++ b/test/test_variable.py @@ -18,7 +18,8 @@ 'Variable_Title': 'title', 'Variable_Description': 'description', 'Id': '1', - 'Version': '1'} + 'Version': '1', + 'Signed_Off_Flag': 'N'} REQUIRED_FIELDS = {'Variable_Mnemonic': 'VAR1', 'Variable_Type_Code': 'DVO', @@ -43,7 +44,8 @@ def test_required_fields(self): self.run_test([row], f'^Reading {FILENAME}:2 no value supplied for required field {field}$') def test_invalid_values(self): - for field in ['Security_Mnemonic', 'Variable_Type_Code', 'Statistical_Unit', 'Topic_Mnemonic']: + for field in ['Security_Mnemonic', 'Variable_Type_Code', 'Statistical_Unit', 'Topic_Mnemonic', + 'Signed_Off_Flag']: with self.subTest(field=field): row = REQUIRED_FIELDS.copy() row[field] = 'X' diff --git a/test/testdata/Category.csv b/test/testdata/Category.csv index f6de984..81d470f 100644 --- a/test/testdata/Category.csv +++ b/test/testdata/Category.csv @@ -5,3 +5,4 @@ SOURCE,CLASS1,6,CODE6,,LABEL6 (Welsh),6,1,LABEL6 Internal SOURCE,CLASS1,2,CODE2,,LABEL2 (Welsh),2,1,LABEL2 Internal SOURCE,CLASS1,4,CODE4,,,,1,LABEL4 Internal SOURCE,CLASS1,1,CODE1,LABEL1,,,1,LABEL1 Internal +SOURCE2,CLASS2,1,CODE2-1,,LABEL2-1 (Welsh),1,1,LABEL2-1 Internal diff --git a/test/testdata/Classification.csv b/test/testdata/Classification.csv index 869f827..56a6734 100644 --- a/test/testdata/Classification.csv +++ b/test/testdata/Classification.csv @@ -1,5 +1,5 @@ Classification_Mnemonic,Variable_Mnemonic,Id,External_Classification_Label_English,External_Classification_Label_Welsh,Number_Of_Category_Items,Mnemonic_2011,Flat_Classification_Flag,Parent_Classification_Mnemonic,Security_Mnemonic,Signed_Off_Flag,Default_Classification_Flag,Version,Internal_Classification_Label_English CLASS1,VAR1,1,CLASS1 Label,CLASS1 Label Welsh,6,CLASS1 2011,N,CLASS1 Parent,PUB,Y,Y,1,CLASS1 Label Internal -CLASS2,VAR2,2,,,5,,,,PUB,N,,1,CLASS2 Label Internal -CLASS3,VAR3,3,,CLASS3 Label Welsh,5,,,,PUB,,N,1,CLASS3 Label Internal +CLASS2,VAR2,2,,,,,,,PUB,N,,1,CLASS2 Label Internal +CLASS3,VAR3,3,,CLASS3 Label Welsh,,,,,PUB,N,N,1,CLASS3 Label Internal CLASS_PRIV,VAR_PRIV,5,CLASS_PRIV Label,CLASS_PRIV Label Welsh,0,CLASS_PRIV 2011,Y,CLASS_PRIV,CLASS,Y,Y,1,CLASS_PRIV Label Internal diff --git 
a/test/testdata/Dataset.csv b/test/testdata/Dataset.csv index 94c789d..116948a 100644 --- a/test/testdata/Dataset.csv +++ b/test/testdata/Dataset.csv @@ -1,6 +1,6 @@ Dataset_Mnemonic,Id,Dataset_Title,Dataset_Title_Welsh,Dataset_Description,Dataset_Description_Welsh,Statistical_Unit,Dataset_Mnemonic_2011,Geographic_Coverage,Geographic_Coverage_Welsh,Dataset_Population,Dataset_Population_Welsh,Last_Updated,Unique_Url,Security_Mnemonic,Signed_Off_Flag,Database_Mnemonic,Contact_Id,Version DS1,1,DS1 Title,DS1 Title (Welsh),DS1 Description,DS1 Description (Welsh),People,DS1 2011,Everywhere,Everywhere (Welsh),Everyone,Everyone (Welsh),Today,DS1 Unique URL,PUB,Y,DB1,2,1 -DS2,2,DS2 Title,,DS2 Description,,Houses,,Everywhere,,Everyone,,,,PUB,,DB2,,1 +DS2,2,DS2 Title,,DS2 Description,,Houses,,Everywhere,,Everyone,,,,PUB,N,DB2,,1 DS3,3,DS3 Title,DS3 Title (Welsh),DS3 Description,DS3 Description (Welsh),People,DS3 2011,Everywhere,Everywhere (Welsh),Everyone,Everyone (Welsh),Today,DS3 Unique URL,PUB,Y,DB2,1,1 DS_PRIV,4,DS_PRIV Title,DS_PRIV Title (Welsh),DS_PRIV Description,DS_PRIV Description (Welsh),People,DS_PRIV 2011,Everywhere,Everywhere (Welsh),Everyone,Everyone (Welsh),Today,DS_PRIV Unique URL,CLASS,Y,DB1,1,1 -DS4,5,DS4 Title,,DS4 Description,,Houses,,Everywhere,,Everyone,,,,PUB,,DB1,,1 +DS4,5,DS4 Title,,DS4 Description,,Houses,,Everywhere,,Everyone,,,,PUB,N,DB1,,1 diff --git a/test/testdata/Dataset_Variable.csv b/test/testdata/Dataset_Variable.csv index 4eec30c..9e36d33 100644 --- a/test/testdata/Dataset_Variable.csv +++ b/test/testdata/Dataset_Variable.csv @@ -8,3 +8,4 @@ CLASS2,DS4,5,1,VAR2,N ,DS2,7,,GEO2,Y ,DS3,8,,GEO2,Y ,DS_PRIV,9,,GEO1,Y +,DS1,10,,GEO2,N diff --git a/test/testdata/Security_Classification.csv b/test/testdata/Security_Classification.csv index 7d5c322..3fa7783 100644 --- a/test/testdata/Security_Classification.csv +++ b/test/testdata/Security_Classification.csv @@ -1,3 +1,3 @@ Security_Mnemonic,Id,Security_Description,Security_Description_Welsh PUB,1,Public,Public (Welsh) -CLASS,2,, +CLASS,2,Classified, diff --git a/test/testdata/Variable.csv b/test/testdata/Variable.csv index 94ef3e9..b5d2386 100644 --- a/test/testdata/Variable.csv +++ b/test/testdata/Variable.csv @@ -1,7 +1,8 @@ Variable_Mnemonic,Id,Variable_Title,Variable_Title_Welsh,Variable_Description,Variable_Description_Welsh,Variable_Type_Code,Statistical_Unit,Topic_Mnemonic,Variable_Mnemonic_2011,Comparability_Comments,Comparability_Comments_Welsh,Uk_Comparison_Comments,Uk_Comparison_Comments_Welsh,Security_Mnemonic,Signed_Off_Flag,Number_Of_Classifications,Geographic_Abbreviation,Geographic_Abbreviation_Welsh,Geographic_Theme,Geographic_Theme_Welsh,Geographic_Coverage,Geographic_Coverage_Welsh,Version,Quality_Statement_Text,Quality_Summary_URL VAR1,1,VAR1 Title,VAR1 Title (Welsh),VAR1 Description,VAR1 Description (Welsh),DVO,People,TOPIC1,VAR1 2011,VAR1 Comparability Comments,VAR1 Comparability Comments (Welsh),VAR1 UK Comparison Comments,VAR1 UK Comparison Comments (Welsh),PUB,Y,3,,,,,,,1,VAR1 Quality Statement Text,VAR1 Quality Statement URL GEO1,2,GEO1 Title,GEO1 Title (Welsh),GEO1 Description,GEO1 Description (Welsh),GEOG,People,TOPIC1,GEO1 2011,GEO1 Comparability Comments,GEO1 Comparability Comments (Welsh),GEO1 UK Comparison Comments,GEO1 UK Comparison Comments (Welsh),PUB,Y,3,G1,G1 (Welsh),GEO1 Theme,GEO1 Theme (Welsh),GEO1 Coverage,GEO1 Coverage (Welsh),1,, -VAR2,3,VAR2 Title,,VAR2 Description,,DVO,,,,VAR2 Comparability Comments,,VAR2 UK Comparison Comments,,PUB,,,,,,,,,1,, -VAR3,4,VAR3 Title,,VAR3 
Description,,DVO,,,,,,,,PUB,,,,,,,,,1,, +VAR2,3,VAR2 Title,,VAR2 Description,,DVO,,,,VAR2 Comparability Comments,,VAR2 UK Comparison Comments,,PUB,N,,,,,,,,1,, +VAR3,4,VAR3 Title,,VAR3 Description,,DVO,,,,,,,,PUB,N,,,,,,,,1,, VAR_PRIV,5,VAR_PRIV Title,VAR_PRIV Title (Welsh),VAR_PRIV Description,VAR_PRIV Description (Welsh),DVO,People,TOPIC1,VAR_PRIV 2011,VAR_PRIV Comparability Comments,VAR_PRIV Comparability Comments (Welsh),VAR_PRIV UK Comparison Comments,VAR_PRIV Comparison Comments (Welsh),CLASS,Y,1,,,,,,,1,, GEO2,6,GEO2 Title,,GEO2 Description,,GEOG,,,,GEO2 Comparability Comments,,GEO2 UK Comparison Comments,,PUB,Y,3,G2,,GEO1 Theme,,GEO2 Coverage,,1,, +GEO_PRIV,7,GEO_PRIV Title,,GEO_PRIV Description,,GEOG,,,,GEO_PRIV Comparability Comments,,GEO_PRIV UK Comparison Comments,,CLASS,Y,3,G_PRIV,,GEO_PRIV Theme,,GEO_PRIV Coverage,,1,, diff --git a/test/testdata/best_effort/Category.csv b/test/testdata/best_effort/Category.csv new file mode 100644 index 0000000..6bf67d4 --- /dev/null +++ b/test/testdata/best_effort/Category.csv @@ -0,0 +1,2 @@ +Variable_Mnemonic,Classification_Mnemonic,Id,Category_Code,External_Category_Label_English,External_Category_Label_Welsh,Sort_Order,Version,Internal_Category_Label_English +SOURCE,CLASS1,1,CODE1,LABEL1,,,1,LABEL1 Internal diff --git a/test/testdata/best_effort/Census_Release.csv b/test/testdata/best_effort/Census_Release.csv new file mode 100644 index 0000000..532c8cd --- /dev/null +++ b/test/testdata/best_effort/Census_Release.csv @@ -0,0 +1 @@ +Census_Release_Number,Id,Census_Release_Description,Release_Date diff --git a/test/testdata/best_effort/Classification.csv b/test/testdata/best_effort/Classification.csv new file mode 100644 index 0000000..8d20083 --- /dev/null +++ b/test/testdata/best_effort/Classification.csv @@ -0,0 +1,5 @@ +Classification_Mnemonic,Variable_Mnemonic,Id,External_Classification_Label_English,External_Classification_Label_Welsh,Number_Of_Category_Items,Mnemonic_2011,Flat_Classification_Flag,Parent_Classification_Mnemonic,Security_Mnemonic,Signed_Off_Flag,Default_Classification_Flag,Version,Internal_Classification_Label_English +CLASS1,VAR1,1,,,4,,,,PUB,N,,1,CLASS1 Label Internal +CLASS2,,2,,,,,,,PUB,N,,1,CLASS2 Label Internal +CLASS1,VAR1,3,,,,,,,PUB,N,,1,CLASS1 Label Internal (Alternative) +CLASS3,VAR3,4,,,x,,,,PUB,N,,1,CLASS3 Label Internal diff --git a/test/testdata/best_effort/Contact.csv b/test/testdata/best_effort/Contact.csv new file mode 100644 index 0000000..9c33c77 --- /dev/null +++ b/test/testdata/best_effort/Contact.csv @@ -0,0 +1 @@ +Contact_Id,Contact_Name,Contact_Email,Contact_Phone,Contact_Website diff --git a/test/testdata/best_effort/Database.csv b/test/testdata/best_effort/Database.csv new file mode 100644 index 0000000..b1b01a7 --- /dev/null +++ b/test/testdata/best_effort/Database.csv @@ -0,0 +1,2 @@ +Database_Mnemonic,Id,Database_Title,Database_Title_Welsh,Database_Description,Database_Description_Welsh,Cantabular_DB_Flag,IAR_Asset_Id,Source_Mnemonic,Version +DB1,1,DB1 Title,,DB1 Description,,,,SRC1,1 diff --git a/test/testdata/best_effort/Database_Variable.csv b/test/testdata/best_effort/Database_Variable.csv new file mode 100644 index 0000000..9b0bb5e --- /dev/null +++ b/test/testdata/best_effort/Database_Variable.csv @@ -0,0 +1,4 @@ +Id,Database_Mnemonic,Variable_Mnemonic,Version,Lowest_Geog_Variable_Flag +1,DB1,VAR1,1, +2,DB1,GEO1,1,Y +3,DB1,GEO3,1,Y diff --git a/test/testdata/best_effort/Dataset.csv b/test/testdata/best_effort/Dataset.csv new file mode 100644 index 0000000..9e9cd1c --- /dev/null +++ 
b/test/testdata/best_effort/Dataset.csv @@ -0,0 +1,4 @@ +Dataset_Mnemonic,Id,Dataset_Title,Dataset_Title_Welsh,Dataset_Description,Dataset_Description_Welsh,Statistical_Unit,Dataset_Mnemonic_2011,Geographic_Coverage,Geographic_Coverage_Welsh,Dataset_Population,Dataset_Population_Welsh,Last_Updated,Unique_Url,Security_Mnemonic,Signed_Off_Flag,Database_Mnemonic,Contact_Id,Version +DS1,1,DS1 Title,,DS1 Description,,Houses,,Everywhere,,Everyone,,,,PUB,N,DB1,,1 +DS2,2,DS2 Title,,DS2 Description,,Houses,,Everywhere,,Everyone,,,,PUB,N,DB1,,1 +DS3,3,DS3 Title,,DS3 Description,,Houses,,Everywhere,,Everyone,,,,PUB,N,DB1,,1 diff --git a/test/testdata/best_effort/Dataset_Keyword.csv b/test/testdata/best_effort/Dataset_Keyword.csv new file mode 100644 index 0000000..4204157 --- /dev/null +++ b/test/testdata/best_effort/Dataset_Keyword.csv @@ -0,0 +1 @@ +Dataset_Mnemonic,Id,Dataset_Keyword,Dataset_Keyword_Welsh diff --git a/test/testdata/best_effort/Dataset_Variable.csv b/test/testdata/best_effort/Dataset_Variable.csv new file mode 100644 index 0000000..3aaf62a --- /dev/null +++ b/test/testdata/best_effort/Dataset_Variable.csv @@ -0,0 +1,9 @@ +Classification_Mnemonic,Dataset_Mnemonic,Id,Processing_Priority,Variable_Mnemonic,Lowest_Geog_Variable_Flag +CLASS1,DS1,1,,VAR1,Y +GEO1,DS1,2,1,GEO1,Y +CLASS1,DS1,3,1,VAR1,N +,DS1,4,,GEO2,Y +,DS1,5,,GEO3,N +,DS1,6,1,VAR2,N +CLASS1,DS1,7,1,VAR3,N +CLASS3,DS2,8,1,VAR3,N diff --git a/test/testdata/best_effort/Publication_Dataset.csv b/test/testdata/best_effort/Publication_Dataset.csv new file mode 100644 index 0000000..57af1aa --- /dev/null +++ b/test/testdata/best_effort/Publication_Dataset.csv @@ -0,0 +1 @@ +Publication_Mnemonic,Dataset_Mnemonic,Id,Publication_Title,Publisher_Name,Publisher_Website diff --git a/test/testdata/best_effort/Question.csv b/test/testdata/best_effort/Question.csv new file mode 100644 index 0000000..31aa1cb --- /dev/null +++ b/test/testdata/best_effort/Question.csv @@ -0,0 +1 @@ +Question_Code,Id,Question_Label,Question_Label_Welsh,Reason_For_Asking_Question,Reason_For_Asking_Question_Welsh,Question_First_Asked_In_Year,Version diff --git a/test/testdata/best_effort/Related_Datasets.csv b/test/testdata/best_effort/Related_Datasets.csv new file mode 100644 index 0000000..959f4d8 --- /dev/null +++ b/test/testdata/best_effort/Related_Datasets.csv @@ -0,0 +1 @@ +Dataset_Mnemonic,Id,Related_Dataset_Mnemonic diff --git a/test/testdata/best_effort/Release_Dataset.csv b/test/testdata/best_effort/Release_Dataset.csv new file mode 100644 index 0000000..37adceb --- /dev/null +++ b/test/testdata/best_effort/Release_Dataset.csv @@ -0,0 +1 @@ +Census_Release_Number,Dataset_Mnemonic,Id diff --git a/test/testdata/best_effort/Security_Classification.csv b/test/testdata/best_effort/Security_Classification.csv new file mode 100644 index 0000000..3fa7783 --- /dev/null +++ b/test/testdata/best_effort/Security_Classification.csv @@ -0,0 +1,3 @@ +Security_Mnemonic,Id,Security_Description,Security_Description_Welsh +PUB,1,Public,Public (Welsh) +CLASS,2,Classified, diff --git a/test/testdata/best_effort/Source.csv b/test/testdata/best_effort/Source.csv new file mode 100644 index 0000000..7455d90 --- /dev/null +++ b/test/testdata/best_effort/Source.csv @@ -0,0 +1,2 @@ +Source_Mnemonic,Id,Source_Description,Source_Description_Welsh,Copyright_Statement,Licence,Nationals_Statistic_Certified,Methodology_Link,Methodology_Statement,Methodology_Statement_Welsh,SDC_Link,SDC_Statement,SDC_Statement_Welsh,Contact_Id,Version +SRC1,1,SRC1 Description,,,,,,,,,,,,1 diff --git 
a/test/testdata/best_effort/Statistical_Unit.csv b/test/testdata/best_effort/Statistical_Unit.csv new file mode 100644 index 0000000..2e3106b --- /dev/null +++ b/test/testdata/best_effort/Statistical_Unit.csv @@ -0,0 +1,2 @@ +Statistical_Unit,Id,Statistical_Unit_Description,Statistical_Unit_Description_Welsh +Houses,1,House Description, diff --git a/test/testdata/best_effort/Topic.csv b/test/testdata/best_effort/Topic.csv new file mode 100644 index 0000000..00b0e08 --- /dev/null +++ b/test/testdata/best_effort/Topic.csv @@ -0,0 +1 @@ +Topic_Mnemonic,Id,Topic_Description,Topic_Description_Welsh,Topic_Title,Topic_Title_Welsh diff --git a/test/testdata/best_effort/Topic_Classification.csv b/test/testdata/best_effort/Topic_Classification.csv new file mode 100644 index 0000000..0bb94c9 --- /dev/null +++ b/test/testdata/best_effort/Topic_Classification.csv @@ -0,0 +1 @@ +Classification_Mnemonic,Topic_Mnemonic,Id diff --git a/test/testdata/best_effort/Variable.csv b/test/testdata/best_effort/Variable.csv new file mode 100644 index 0000000..e386025 --- /dev/null +++ b/test/testdata/best_effort/Variable.csv @@ -0,0 +1,7 @@ +Variable_Mnemonic,Id,Variable_Title,Variable_Title_Welsh,Variable_Description,Variable_Description_Welsh,Variable_Type_Code,Statistical_Unit,Topic_Mnemonic,Variable_Mnemonic_2011,Comparability_Comments,Comparability_Comments_Welsh,Uk_Comparison_Comments,Uk_Comparison_Comments_Welsh,Security_Mnemonic,Signed_Off_Flag,Number_Of_Classifications,Geographic_Abbreviation,Geographic_Abbreviation_Welsh,Geographic_Theme,Geographic_Theme_Welsh,Geographic_Coverage,Geographic_Coverage_Welsh,Version,Quality_Statement_Text,Quality_Summary_URL +VAR1,1,VAR1 Title,,VAR1 Description,,DVO,,,,VAR1 Comparability Comments,,VAR1 UK Comparison Comments,,PUB,N,,,,,,,,1,, +GEO1,2,GEO1 Title,GEO1 Title (Welsh),GEO1 Description,GEO1 Description (Welsh),GEOG,,,GEO1 2011,GEO1 Comparability Comments,GEO1 Comparability Comments (Welsh),GEO1 UK Comparison Comments,GEO1 UK Comparison Comments (Welsh),PUB,Y,3,G1,G1 (Welsh),GEO1 Theme,GEO1 Theme (Welsh),GEO1 Coverage,GEO1 Coverage (Welsh),1,, +GEO2,3,GEO2 Title,GEO2 Title (Welsh),GEO2 Description,GEO2 Description (Welsh),GEOG,,,GEO2 2011,GEO2 Comparability Comments,GEO2 Comparability Comments (Welsh),GEO2 UK Comparison Comments,GEO2 UK Comparison Comments (Welsh),PUB,Y,3,G1,G1 (Welsh),GEO2 Theme,GEO2 Theme (Welsh),GEO2 Coverage,GEO2 Coverage (Welsh),1,, +GEO3,4,GEO3 Title,GEO3 Title (Welsh),GEO3 Description,GEO3 Description (Welsh),GEOG,,,GEO3 2011,GEO3 Comparability Comments,GEO3 Comparability Comments (Welsh),GEO3 UK Comparison Comments,GEO3 UK Comparison Comments (Welsh),PUB,Y,3,G1,G1 (Welsh),GEO3 Theme,GEO3 Theme (Welsh),GEO3 Coverage,GEO3 Coverage (Welsh),1,, +VAR2,5,VAR2 Title,,VAR2 Description,,DVO,,,,VAR2 Comparability Comments,,VAR2 UK Comparison Comments,,PUB,N,,,,,,,,1,, +VAR3,6,VAR3 Title,,VAR3 Description,,DVO,,,,VAR3 Comparability Comments,,VAR3 UK Comparison Comments,,PUB,N,,,,,,,,1,, diff --git a/test/testdata/best_effort/Variable_Keyword.csv b/test/testdata/best_effort/Variable_Keyword.csv new file mode 100644 index 0000000..7e9e35f --- /dev/null +++ b/test/testdata/best_effort/Variable_Keyword.csv @@ -0,0 +1 @@ +Variable_Mnemonic,Id,Variable_Keyword,Variable_Keyword_Welsh diff --git a/test/testdata/best_effort/Variable_Source_Question.csv b/test/testdata/best_effort/Variable_Source_Question.csv new file mode 100644 index 0000000..993c687 --- /dev/null +++ b/test/testdata/best_effort/Variable_Source_Question.csv @@ -0,0 +1 @@ 
+Variable_Mnemonic,Source_Question_Code,Id diff --git a/test/testdata/best_effort/Variable_Type.csv b/test/testdata/best_effort/Variable_Type.csv new file mode 100644 index 0000000..9794587 --- /dev/null +++ b/test/testdata/best_effort/Variable_Type.csv @@ -0,0 +1,3 @@ +Variable_Type_Code,Id,Variable_Type_Description,Variable_Type_Description_Welsh +GEOG,1,Geographic variable,Geographic variable (Welsh) +DVO,1,Derived variable,
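
The new integration and best-effort tests in this patch freeze `date.today()` so that the datestamped output filenames (e.g. `cantabm_v9-3-0_best-effort_tables-md_19700101-1.json`) are deterministic. The sketch below is an illustrative, self-contained reduction of that pattern, not code from this repository: `build_filename` is a hypothetical stand-in for the converter's filename logic. The key detail, mirrored from `@unittest.mock.patch('ons_csv_to_ctb_json_main.date')` in the tests above, is that the patch target is the module under test's own `date` binding rather than `datetime.date` globally.

```python
import unittest
import unittest.mock
from datetime import date
from datetime import date as real_date  # unpatched alias used inside the test


def build_filename(prefix, build=1):
    """Hypothetical stand-in: embeds date.today() the way the converter does."""
    return f"{prefix}_{date.today().strftime('%Y%m%d')}-{build}.json"


class TestFrozenDate(unittest.TestCase):
    # Patch this module's `date` binding; in the real tests the target is
    # 'ons_csv_to_ctb_json_main.date'.
    @unittest.mock.patch(f'{__name__}.date')
    def test_filename_is_deterministic(self, mock_date):
        # Freeze today's date as seen by the code under test.
        mock_date.today.return_value = real_date(1970, 1, 1)
        # Keep date(...) construction working for any other callers.
        mock_date.side_effect = lambda *args, **kw: real_date(*args, **kw)
        self.assertEqual(build_filename('out'), 'out_19700101-1.json')


if __name__ == '__main__':
    unittest.main()
```

The target string matters because the module under test evidently binds `date` at module level (consistent with `from datetime import date`): patching `datetime.date` directly would not affect the name that module actually calls, whereas patching the module-level binding makes its `date.today()` return the frozen value.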