From 037698a3a297f334844b8d9158f0bd759b6c16a1 Mon Sep 17 00:00:00 2001
From: Nicola Cerutti <94574085+ncerutti@users.noreply.github.com>
Date: Thu, 9 Nov 2023 15:01:01 +0000
Subject: [PATCH] Versioning info (#166)

* Added versioning sheet

* added versioning sheets

* versioning info grouped by version

* removed the unused 'Other' category

* if no versioning info, create empty dict

* example schema for adding versioning information

* Fix CI testing

* added codes to revert to OED v2.2 and v2.0

* Removed test schema

* fallback 1256 and 1305 -> 1104 & 5503 -> 5501

* 5309, 5310 -> 5100

* new codes (agriculture)

* Fix CD release

* Disable guard

* Revert "Disable guard"

This reverts commit 0b14734e8846e8ac78067b7dcfccd8ef01d86b25.

* Replaced spec xlsx file

* Corrected field "number of storeys"

* Add files via upload

---------

Co-authored-by: Sam Gamble
---
 .github/workflows/publish.yml |   2 +-
 .github/workflows/test.yml    |   6 +-
 docker/extract_spec.py        | 314 +++++++++++++++++++++------------
 3 files changed, 196 insertions(+), 126 deletions(-)

diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml
index abdb1323..e3d7da75 100644
--- a/.github/workflows/publish.yml
+++ b/.github/workflows/publish.yml
@@ -134,7 +134,7 @@ jobs:
           echo "run_status=${{ job.status }}" >> $GITHUB_OUTPUT

   slack:
-    uses: OasisLMF/OasisLMF/.github/workflows/notify.yml@master
+    uses: OasisLMF/OasisLMF/.github/workflows/notify.yml@main
     secrets: inherit
     needs: release
     with:

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index aee8a897..0b4aaa51 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -9,7 +9,7 @@ on:
         required: false

 env:
-  ods_tools_branch: ${{ github.event_name != 'workflow_dispatch' && 'develop' || inputs.ods_tools_branch }}
+  ods_tools_branch: ${{ github.event_name != 'workflow_dispatch' && 'main' || inputs.ods_tools_branch }}

 jobs:
   build_spec:
@@ -18,11 +18,11 @@ jobs:

   build_package:
     needs: build_spec
-    uses: OasisLMF/ODS_Tools/.github/workflows/build.yml@develop
+    uses: OasisLMF/ODS_Tools/.github/workflows/build.yml@main
     secrets: inherit
     with:
       oed_spec_json: ${{ needs.build_spec.outputs.spec_filename }}
-      ods_branch: ${{ github.event_name != 'workflow_dispatch' && 'develop' || inputs.ods_tools_branch }}
+      ods_branch: ${{ github.event_name != 'workflow_dispatch' && 'main' || inputs.ods_tools_branch }}

   test:
     name: Run Pytest

diff --git a/docker/extract_spec.py b/docker/extract_spec.py
index beaf6718..c6215fa7 100755
--- a/docker/extract_spec.py
+++ b/docker/extract_spec.py
@@ -24,16 +24,16 @@
     "tinyint": "Int32",
     "uniqueidentifier": "category",
     "varbinary": "bytes",
-    "varchar": "category"
+    "varchar": "category",
 }

 dtype_to_python = {
-    'Int8': int,
-    'Int32': int,
-    'Int64': int,
-    'bytes': lambda x: bytes(x, 'utf-8'),
-    'float64': float,
-    'category': str
+    "Int8": int,
+    "Int32": int,
+    "Int64": int,
+    "bytes": lambda x: bytes(x, "utf-8"),
+    "float64": float,
+    "category": str,
 }

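These two dicts drive the extraction: `pd_converter` normalises the sheet's SQL-style `Data Type` strings to pandas dtypes, and `dtype_to_python` picks the Python caster used for valid-value ranges. A minimal sketch of how `pd_converter` is applied in `get_ods_input_fields` further down (the `"float"` entry and the sample strings are illustrative assumptions, not taken from the sheet):

```python
import pandas as pd

# Illustrative excerpt of the mapping; the "float" -> "float64" entry is assumed.
pd_converter = {"varchar": "category", "tinyint": "Int32", "float": "float64"}

# Length suffixes such as "varchar(250)" are stripped before the lookup,
# mirroring .str.split("(", n=1, expand=True)[0].map(pd_converter) below.
data_types = pd.Series(["varchar(250)", "tinyint", "float"])
print(data_types.str.split("(", n=1, expand=True)[0].map(pd_converter).tolist())
# -> ['category', 'Int32', 'float64']
```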
csv file", +) +@click.option( + "--excel-sheet-name", default="OED Input Fields", help="Sheet label to extract" +) def extract_spec_to_csv(source_excel_path, output_csv_path, excel_sheet_name): """ convert an Excel sheet to a csv file @@ -58,20 +66,23 @@ def extract_spec_to_csv(source_excel_path, output_csv_path, excel_sheet_name): df_spec = pd.read_excel( source_excel_path, sheet_name=excel_sheet_name, - dtype={'Default': str}, + dtype={"Default": str}, keep_default_na=False, - na_values=[]) - df_spec.to_csv( - path_or_buf=output_csv_path, - encoding='utf-8', - mode='w', - index=False) - print(f'Written CSV spec: "{output_csv_path}" \noutput based on file: "{source_excel_path}", sheet: "{excel_sheet_name}"') - - -@cli.command('json') -@click.option('--source-excel-path', required=True, default=None, help='Path to MS excel sheet') -@click.option('--output-json-path', default='oed.json', help='Path to write json oed file') + na_values=[], + ) + df_spec.to_csv(path_or_buf=output_csv_path, encoding="utf-8", mode="w", index=False) + print( + f'Written CSV spec: "{output_csv_path}" \noutput based on file: "{source_excel_path}", sheet: "{excel_sheet_name}"' + ) + + +@cli.command("json") +@click.option( + "--source-excel-path", required=True, default=None, help="Path to MS excel sheet" +) +@click.option( + "--output-json-path", default="oed.json", help="Path to write json oed file" +) def extract_spec_to_json(source_excel_path, output_json_path): """ read an Excel ods_schema (OpenExposureData_Spec.xlsx) and write relevant information from each sheet to a json file @@ -80,25 +91,32 @@ def extract_spec_to_json(source_excel_path, output_json_path): source_excel_path (str): path to the Excel ods_schema output_json_path (str): path to the json output """ + def _read_excel(excel_sheet_name): - return pd.read_excel(source_excel_path, - sheet_name=excel_sheet_name, - dtype={'Default': str}, - keep_default_na=False, - na_values=[]) + return pd.read_excel( + source_excel_path, + sheet_name=excel_sheet_name, + dtype={"Default": str}, + keep_default_na=False, + na_values=[], + ) ods_schema = {} - ods_schema['input_fields'] = get_ods_input_fields(_read_excel('OED Input Fields')) - ods_schema['perils'] = get_ods_perils(_read_excel('Peril Values')) - ods_schema['occupancy'] = get_occupancy(_read_excel('Occupancy Values')) - ods_schema['construction'] = get_construction(_read_excel('Construction Values')) - ods_schema['country'] = get_country(_read_excel('Country Values')) - ods_schema['area'] = get_area(_read_excel('AreaCode Values')) - ods_schema['cr_field'] = get_cr_field(_read_excel('OED CR Field Appendix')) + ods_schema["input_fields"] = get_ods_input_fields(_read_excel("OED Input Fields")) + ods_schema["perils"] = get_ods_perils(_read_excel("Peril Values")) + ods_schema["occupancy"] = get_occupancy(_read_excel("Occupancy Values")) + ods_schema["construction"] = get_construction(_read_excel("Construction Values")) + ods_schema["country"] = get_country(_read_excel("Country Values")) + ods_schema["area"] = get_area(_read_excel("AreaCode Values")) + ods_schema["cr_field"] = get_cr_field(_read_excel("OED CR Field Appendix")) + try: + ods_schema["versioning"] = get_versioning(_read_excel("Versioning")) + except ValueError: + ods_schema["versioning"] = {} pathlib.Path(output_json_path).parent.mkdir(parents=True, exist_ok=True) - with open(output_json_path, 'w') as fp: - json.dump(ods_schema, fp, indent=' ') + with open(output_json_path, "w") as fp: + json.dump(ods_schema, fp, indent=" ") def 
@@ -113,46 +131,51 @@ def get_ods_input_fields(ods_fields_df):
     """

     # convert Data Type to pandas DataType
-    ods_fields_df = (
-        ods_fields_df
-        .assign(pd_dtype=ods_fields_df['Data Type'].str.split('(', n=1, expand=True)[0].map(pd_converter))
-        .rename(columns={'File Name': 'File Names'})
-    )
-    ods_fields_df["Case Insensitive Field Name"] = ods_fields_df["Input Field Name"].str.lower()
+    ods_fields_df = ods_fields_df.assign(
+        pd_dtype=ods_fields_df["Data Type"]
+        .str.split("(", n=1, expand=True)[0]
+        .map(pd_converter)
+    ).rename(columns={"File Name": "File Names"})
+    ods_fields_df["Case Insensitive Field Name"] = ods_fields_df[
+        "Input Field Name"
+    ].str.lower()

     # check that no Data Type is missing from our converter
-    if ods_fields_df['pd_dtype'].isna().any():
-        raise ValueError(f"missing pd_dtype for:\n"
-                         f"""{ods_fields_df.loc[ods_fields_df['pd_dtype'].isna(),
-                         ['File Name', 'Input Field Name', 'Type & Description', 'Data Type']]}""")
+    if ods_fields_df["pd_dtype"].isna().any():
+        raise ValueError(
+            f"missing pd_dtype for:\n"
+            f"""{ods_fields_df.loc[ods_fields_df['pd_dtype'].isna(),
+            ['File Name', 'Input Field Name', 'Type & Description', 'Data Type']]}"""
+        )

     # split ods_fields per File Name
-    split_df = ods_fields_df['File Names'].str.split(';').apply(pd.Series, 1).stack()
+    split_df = ods_fields_df["File Names"].str.split(";").apply(pd.Series, 1).stack()
     split_df = (
-        split_df
-        .set_axis(split_df.index.droplevel(-1))
-        .rename('File Name')
-        .str.strip()
+        split_df.set_axis(split_df.index.droplevel(-1)).rename("File Name").str.strip()
     )
-    ods_fields_df = ods_fields_df.join(split_df).drop(columns='File Names')
+    ods_fields_df = ods_fields_df.join(split_df).drop(columns="File Names")

-    ods_fields_df['Valid value range'] = ods_fields_df.apply(lambda row: extract_valid_value_range(row['Valid value range'],
-                                                                                                   dtype_to_python[row['pd_dtype']]),
-                                                             axis=1)
+    ods_fields_df["Valid value range"] = ods_fields_df.apply(
+        lambda row: extract_valid_value_range(
+            row["Valid value range"], dtype_to_python[row["pd_dtype"]]
+        ),
+        axis=1,
+    )

     # create a field dict for each File Name available and None
     __ods_fields = {}
-    for file_name in ods_fields_df['File Name'].unique():
-        __ods_fields[file_name] = (ods_fields_df[ods_fields_df['File Name'] == file_name]
-                                   .set_index(['Case Insensitive Field Name'])
-                                   .to_dict(orient='index'))
+    for file_name in ods_fields_df["File Name"].unique():
+        __ods_fields[file_name] = (
+            ods_fields_df[ods_fields_df["File Name"] == file_name]
+            .set_index(["Case Insensitive Field Name"])
+            .to_dict(orient="index")
+        )
     __ods_fields[None] = (
-        ods_fields_df
-        .drop(columns=['File Name'])
-        .drop_duplicates(subset=['Input Field Name'])
-        .set_index(['Case Insensitive Field Name'])
-        .to_dict(orient='index')
+        ods_fields_df.drop(columns=["File Name"])
+        .drop_duplicates(subset=["Input Field Name"])
+        .set_index(["Case Insensitive Field Name"])
+        .to_dict(orient="index")
     )
     return __ods_fields

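The split/stack idiom in the middle of this function is the workhorse: a row whose `File Names` cell holds `;`-separated file labels becomes one row per (field, file) pair. A standalone sketch under assumed data (the field name and file labels are made up for illustration; `.apply(pd.Series)` is equivalent to the patch's `.apply(pd.Series, 1)`):

```python
import pandas as pd

fields = pd.DataFrame(
    {"Input Field Name": ["AccNumber"], "File Names": ["Acc; Loc"]}
)

# One row per file label, keeping the original row index so join() lines up.
split_df = fields["File Names"].str.split(";").apply(pd.Series).stack()
split_df = (
    split_df.set_axis(split_df.index.droplevel(-1)).rename("File Name").str.strip()
)
print(fields.join(split_df).drop(columns="File Names"))
#   Input Field Name File Name
# 0        AccNumber       Acc
# 0        AccNumber       Loc
```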
"Grouped PerilCode", + ] + ][ + : perils_df[perils_df["Input format abbreviation"] == ""].index[0] + ] # select rows until 1st empty cell + .set_index(["Input format abbreviation"]) + .to_dict(orient="index") ), - 'covered': ( # create a dict of peril => list of sub perils - perils_df - [['Peril', 'PerilsCovered']] + "covered": ( # create a dict of peril => list of sub perils + perils_df[["Peril", "PerilsCovered"]] .replace("", float("NaN")) .dropna() - .groupby('PerilsCovered')['Peril'].apply(list) + .groupby("PerilsCovered")["Peril"] + .apply(list) .to_dict() - ) + ), } @@ -196,10 +226,19 @@ def get_occupancy(occupancy_df): dict of information on occupancies """ return ( - occupancy_df - [['Category', 'OED Code', 'AIR code', 'Name', 'Description', 'Code Range', 'Broad Category']] - .set_index('OED Code') - .to_dict(orient='index') + occupancy_df[ + [ + "Category", + "OED Code", + "AIR code", + "Name", + "Description", + "Code Range", + "Broad Category", + ] + ] + .set_index("OED Code") + .to_dict(orient="index") ) @@ -213,10 +252,19 @@ def get_construction(construction_df): dict of information on construction codes """ return ( - construction_df - [['Category', 'OED Code', 'AIR code', 'Name', 'Description', 'Code Range', 'Broad Category']] - .set_index('OED Code') - .to_dict(orient='index') + construction_df[ + [ + "Category", + "OED Code", + "AIR code", + "Name", + "Description", + "Code Range", + "Broad Category", + ] + ] + .set_index("OED Code") + .to_dict(orient="index") ) @@ -229,11 +277,7 @@ def get_country(country_df): Returns: dict of information on country codes """ - return ( - country_df - .set_index('Code') - .to_dict(orient='index') - ) + return country_df.set_index("Code").to_dict(orient="index") def get_area(area_df): @@ -245,11 +289,7 @@ def get_area(area_df): Returns: dict of information on area codes """ - return ( - area_df - .groupby('CountryCode')['AreaCode'].apply(list) - .to_dict() - ) + return area_df.groupby("CountryCode")["AreaCode"].apply(list).to_dict() def get_cr_field(cr_field_df): @@ -262,40 +302,67 @@ def get_cr_field(cr_field_df): Returns: conditional requirement dict """ - cr_field_df = ( - cr_field_df - .assign(pd_dtype=cr_field_df['Data Type'].str.split('(', n=1, expand=True)[0].map(pd_converter)) - .rename(columns={'File Name': 'File Names'}) - ) + cr_field_df = cr_field_df.assign( + pd_dtype=cr_field_df["Data Type"] + .str.split("(", n=1, expand=True)[0] + .map(pd_converter) + ).rename(columns={"File Name": "File Names"}) # split ods_fields per File Name - split_df = cr_field_df['File Names'].str.split(';').apply(pd.Series, 1).stack() + split_df = cr_field_df["File Names"].str.split(";").apply(pd.Series, 1).stack() split_df = ( - split_df - .set_axis(split_df.index.droplevel(-1)) - .rename('File Name') - .str.strip() + split_df.set_axis(split_df.index.droplevel(-1)).rename("File Name").str.strip() ) - cr_field_df = cr_field_df.join(split_df).drop(columns='File Names') + cr_field_df = cr_field_df.join(split_df).drop(columns="File Names") cr_fields_by_file = {} - for file_name in cr_field_df['File Name'].unique(): - cr_to_field = cr_field_df[cr_field_df['File Name'] == file_name].groupby('Required Field')['Input Field Name'].apply(list).to_dict() + for file_name in cr_field_df["File Name"].unique(): + cr_to_field = ( + cr_field_df[cr_field_df["File Name"] == file_name] + .groupby("Required Field")["Input Field Name"] + .apply(list) + .to_dict() + ) cr_field = {} for cr, fields in cr_to_field.items(): - if '-' not in cr: + if "-" not in cr: continue for 
@@ -262,40 +302,67 @@ def get_cr_field(cr_field_df):
     Returns:
         conditional requirement dict
     """
-    cr_field_df = (
-        cr_field_df
-        .assign(pd_dtype=cr_field_df['Data Type'].str.split('(', n=1, expand=True)[0].map(pd_converter))
-        .rename(columns={'File Name': 'File Names'})
-    )
+    cr_field_df = cr_field_df.assign(
+        pd_dtype=cr_field_df["Data Type"]
+        .str.split("(", n=1, expand=True)[0]
+        .map(pd_converter)
+    ).rename(columns={"File Name": "File Names"})

     # split ods_fields per File Name
-    split_df = cr_field_df['File Names'].str.split(';').apply(pd.Series, 1).stack()
+    split_df = cr_field_df["File Names"].str.split(";").apply(pd.Series, 1).stack()
     split_df = (
-        split_df
-        .set_axis(split_df.index.droplevel(-1))
-        .rename('File Name')
-        .str.strip()
+        split_df.set_axis(split_df.index.droplevel(-1)).rename("File Name").str.strip()
     )
-    cr_field_df = cr_field_df.join(split_df).drop(columns='File Names')
+    cr_field_df = cr_field_df.join(split_df).drop(columns="File Names")

     cr_fields_by_file = {}
-    for file_name in cr_field_df['File Name'].unique():
-        cr_to_field = cr_field_df[cr_field_df['File Name'] == file_name].groupby('Required Field')['Input Field Name'].apply(list).to_dict()
+    for file_name in cr_field_df["File Name"].unique():
+        cr_to_field = (
+            cr_field_df[cr_field_df["File Name"] == file_name]
+            .groupby("Required Field")["Input Field Name"]
+            .apply(list)
+            .to_dict()
+        )
         cr_field = {}
         for cr, fields in cr_to_field.items():
-            if '-' not in cr:
+            if "-" not in cr:
                 continue
             for field in fields:
                 cur_cr_field = set(fields)
-                for i in range(cr.count('-') + 1):
-                    cur_cr_field |= set(cr_to_field.get(cr.rsplit('-', i)[0], []))
-                if len(cur_cr_field) > 1:  # we remove fields that provide no extra requirement
+                for i in range(cr.count("-") + 1):
+                    cur_cr_field |= set(cr_to_field.get(cr.rsplit("-", i)[0], []))
+                if (
+                    len(cur_cr_field) > 1
+                ):  # we remove fields that provide no extra requirement
                     cr_field[field] = list(cur_cr_field)
         cr_fields_by_file[file_name] = cr_field

     return cr_fields_by_file


+def get_versioning(version_values_df):
+    """
+    Gets information on versioning. The sheet should be structured as follows:
+    Version: the version to which the change will lead
+    Category: the category to which the change applies (e.g. "Occupancy")
+    New code: the code that should be reverted
+    Fallback: the code to which we should revert
+
+    Args:
+        version_values_df (pd.DataFrame): Versioning
+
+    Returns:
+        dict of information on versioning
+    """
+    # Group by the version
+    grouped = version_values_df.groupby("Version")
+    version_dict = {
+        version: group[["Category", "New code", "Fallback"]].to_dict(orient="records")
+        for version, group in grouped
+    }
+    return version_dict
+
+
 def extract_valid_value_range(valid_value_range, dtype):
     """
     translate a list of valid values from the string input written in the ods schema
@@ -311,11 +378,12 @@ def extract_valid_value_range(valid_value_range, dtype):
     >>> extract_valid_value_range('[-999,-999], [0,5], [10,), (,-5)', int)
     [{'min': 0, 'max': 5}, {'min': 10}, {'max': -5}, {'enum': [-999]}]
     """
+
     def get_next_bracket(_str: str, opening):
         if opening:
-            bracket = ['[', '(']
+            bracket = ["[", "("]
         else:
-            bracket = [']', ')']
+            bracket = ["]", ")"]

         ind = (_str.find(s) for s in bracket)
         found_ind = [i for i in ind if i >= 0]
@@ -333,24 +401,26 @@ def get_next_bracket(_str: str, opening):
         c_index = get_next_bracket(valid_value_range_left, opening=False)
         if o_index == -1:
             break
-        min_val, max_val = [x.strip() for x in valid_value_range_left[o_index + 1: c_index].split(',')]
+        min_val, max_val = [
+            x.strip() for x in valid_value_range_left[o_index + 1 : c_index].split(",")
+        ]
         if min_val == max_val:
-            valid_single_values.setdefault('enum', []).append(dtype(min_val))
+            valid_single_values.setdefault("enum", []).append(dtype(min_val))
         elif max_val and min_val:
-            valid_values.append({'min': dtype(min_val), 'max': dtype(max_val)})
+            valid_values.append({"min": dtype(min_val), "max": dtype(max_val)})
         elif min_val:
-            valid_values.append({'min': dtype(min_val)})
+            valid_values.append({"min": dtype(min_val)})
         else:
-            valid_values.append({'max': dtype(max_val)})
-        valid_value_range_left = valid_value_range_left[c_index + 1:]
+            valid_values.append({"max": dtype(max_val)})
+        valid_value_range_left = valid_value_range_left[c_index + 1 :]

     if valid_single_values:
         valid_values.append(valid_single_values)
     if valid_values:
         return valid_values
     else:
-        return 'n/a'
+        return "n/a"


-if __name__ == '__main__':
+if __name__ == "__main__":
     cli()
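The doctest in `extract_valid_value_range` documents the parsing contract: interval brackets become `min`/`max` entries, and degenerate `[x,x]` intervals are pooled into a single `enum` entry. It can be exercised directly (a sketch; assumes `docker/extract_spec.py` is on the import path):

```python
from extract_spec import extract_valid_value_range

# Matches the doctest shipped in the patch.
print(extract_valid_value_range("[-999,-999], [0,5], [10,), (,-5)", int))
# -> [{'min': 0, 'max': 5}, {'min': 10}, {'max': -5}, {'enum': [-999]}]
```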