From 5057829b95858beeb0edd7a9669eed398832f685 Mon Sep 17 00:00:00 2001
From: christinehc
Date: Thu, 2 Jan 2025 16:13:53 -0800
Subject: [PATCH] fix,refactor: build tables correctly. clarify code

changelog:
- `map_samples_to_chemicals.py` now correctly builds tables and will allow `build_script.py` to run to the end (tested with `--samps` flag)
- redundant functions consolidated (e.g. `combine_chemical_fit_data` and `combine_chemical_dose_data` -> `combine_chemical_data` function with `data_type` arg)
- variable names changed to be clearer to the end user (e.g. `samp_chem` -> `chem_data`; most instances of `samp` -> `sample`)
- added type hinting and docstrings to most functions
- enforced snake_case for all variable and column names
- enforced formatting on all `src/*.py` modules for building files
---
 sampleChemMapping/map_samples_to_chemicals.py | 388 +++++++++---------
 sampleChemMapping/src/format.py               |  32 +-
 sampleChemMapping/src/metadata.py             |  55 ++-
 sampleChemMapping/src/params.py               |   7 +-
 sampleChemMapping/src/tables.py               |  10 +-
 5 files changed, 234 insertions(+), 258 deletions(-)

diff --git a/sampleChemMapping/map_samples_to_chemicals.py b/sampleChemMapping/map_samples_to_chemicals.py
index 8611e51..31d50fa 100644
--- a/sampleChemMapping/map_samples_to_chemicals.py
+++ b/sampleChemMapping/map_samples_to_chemicals.py
@@ -14,13 +14,15 @@ import pandas as pd
 from numpy.typing import ArrayLike

-from src.format import process_fses, rename_duplicates, snakeify, snakeify_all_columns
+from src.format import rename_duplicates, snakeify, snakeify_all_columns
 from src.mapping import rename_chemical_class
 from src.metadata import build_chem_metadata, get_endpoint_metadata
 from src.params import (
     MASV_CC,
     MASV_SOURCE,
+    QC_FLAGS,
     REQUIRED_BMD_COLUMNS,
+    REQUIRED_SAMPLE_COLUMNS,
     SAMPLE_CHEM_COLUMNS,
     SAMPLE_COLUMNS,
 )
@@ -87,19 +89,57 @@ def get_new_chemical_class(data_dir: str) -> pd.DataFrame:
     return full_class


-#' buildSampleData - takes the curated information and selects the data we need
-#' @param data.dir
-#' @return data.frame
-# fses_files, #files from barton that contain sample info
-# chemMeta, #metadata for chemicals including identifier mapping
-# sampIds, #new ids for samples
-# sampMapping ##mapping for sample names to clean up
+def process_fses(filename: str, snake_case: bool = True) -> pd.DataFrame:
+    """Read an FSES sample file and drop null/invalid measurements."""
+    # Replace invalid values with nulls for filtering
+    df = pd.read_csv(filename)[REQUIRED_SAMPLE_COLUMNS].replace(
+        {"BLOD": "0", "NULL": "0", "nc:BDL": "0"}
+    )
+    if snake_case:
+        df = snakeify_all_columns(df)
+
+    # Remove null and invalid entries
+    df = df[
+        (df["sample_number"].notna())
+        & (df["cas_number"].notna())
+        & (~df["measurement_value"].isin(["0", np.nan]))
+        & (~df["measurement_value_molar"].isin(["0", np.nan]))
+    ]
+
+    # Format FSES location data
+    df["location_lon"] = pd.to_numeric(df["location_lon"], errors="coerce")
+
+    # Only allow negative longitudes
+    # Note: Our data is already all negative, so unnecessary?
+    # df["location_lon"] = np.where(
+    #     df["location_lon"].gt(0), -df["location_lon"], df["location_lon"]
+    # )
+    return df
+
+
 def build_sample_data(
     fses_files: list[str],
-    chem_meta: pd.DataFrame,
-    sample_ids: str,
-    samp_mapping: Optional[str] = None,
-):
+    chem_metadata: pd.DataFrame,
+    sample_id_file: str,
+    sample_mapping: Optional[str] = None,
+) -> pd.DataFrame:
+    """Select relevant data from curated tables.
+
+    Parameters
+    ----------
+    fses_files : list[str]
+        List of FSES files from the Barton lab with sample info
+    chem_metadata : pd.DataFrame
+        Chemical metadata table containing identifier mapping
+    sample_id_file : str
+        Path to the sample ID mapping CSV file
+    sample_mapping : Optional[str], optional
+        Excel file remapping sample, project, and location names, by default None
+
+    Returns
+    -------
+    pd.DataFrame
+        Combined sample data table
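+
+    Examples
+    --------
+    Hypothetical call; the file names below are illustrative only:
+
+    >>> samples = build_sample_data(
+    ...     ["fses_site_a.csv", "fses_site_b.csv"],
+    ...     chem_metadata,
+    ...     "sample_id_map.csv",
+    ... )  # doctest: +SKIP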
+    """
     data = []
+    # Read and preprocess each file in fses_files
     for f in fses_files:
@@ -109,20 +149,20 @@ def build_sample_data(

     # Concatenate samples and merge with chemical metadata
     data = pd.concat(data).merge(
-        chem_meta[["chemical_id", "cas_number", "average_mass"]],
+        chem_metadata[["chemical_id", "cas_number", "average_mass"]],
         on="cas_number",
         how="left",
     )

     # Merge with sample IDs
-    ids = sample_id_master_table(data["sample_number"], sample_ids)
+    ids = sample_id_master_table(data["sample_number"], sample_id_file)
     data = data.merge(ids, on="sample_number", how="left").drop_duplicates()

     # Rename duplicate sample names as sample:01, :02, etc.
     data["sample_name"] = rename_duplicates(data)

     # Merge with sample name remappings if provided
-    if samp_mapping is not None:
+    if sample_mapping is not None:
         sample_remap_cols = [
             "sample_id",
             "new_project_name",
@@ -130,7 +170,7 @@
             "new_location_name",
         ]
         sample_name_remap = (
-            snakeify_all_columns(pd.read_excel(samp_mapping, sheet_name=0))
+            snakeify_all_columns(pd.read_excel(sample_mapping, sheet_name=0))
             .rename(columns={"project_name": "new_project_name"})
             .loc[:, sample_remap_cols]
             .drop_duplicates()
@@ -140,11 +180,10 @@
             sample_name_remap, on="sample_id", how="left"
         ).drop_duplicates()

-    # Fill in NAs with values from remapping table
+    # Fill in NAs with new values from remapping table
     nas = data["project_name"].isna()
-    data.loc[nas, "project_name"] = data.loc[nas, "new_project_name"]
-    data.loc[nas, "location_name"] = data.loc[nas, "new_location_name"]
-    data.loc[nas, "sample_name"] = data.loc[nas, "new_sample_name"]
+    for col in ["project_name", "location_name", "sample_name"]:
+        data.loc[nas, col] = data.loc[nas, f"new_{col}"]

     # Drop unnecessary columns and rows with missing cas_number
     data = data.drop(
@@ -160,133 +199,147 @@


 def combine_v2_chemical_endpoint_data(
-    bmd_files, is_extract=False, samp_chem=None, endpoint_details=None
-):
+    bmd_files: list[str],
+    is_extract: bool = False,
+    chem_data: Optional[pd.DataFrame] = None,
+    endpoint_details: Optional[pd.DataFrame] = None,
+) -> pd.DataFrame:
+    """Combine chemical endpoint (BMD) data across input files.
+
+    Parameters
+    ----------
+    bmd_files : list[str]
+        List of BMD files
+    is_extract : bool, optional
+        True if data is for extracts, by default False
+    chem_data : Optional[pd.DataFrame], optional
+        Tabulated chemical data, by default None
+    endpoint_details : Optional[pd.DataFrame], optional
+        Tabulated endpoint data, by default None
+
+    Returns
+    -------
+    pd.DataFrame
+        Combined chemical endpoint data table
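+
+    Examples
+    --------
+    Hypothetical call; the file names below are illustrative only:
+
+    >>> bmds = combine_v2_chemical_endpoint_data(
+    ...     ["chem_bmd_1.csv", "chem_bmd_2.csv"],
+    ...     endpoint_details=endpoint_details,
+    ... )  # doctest: +SKIP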
+    """
     print(f"Combining bmd files: {', '.join(bmd_files)}")
+
+    # Read and concatenate the specified columns from all provided BMD files into one df
     cols = REQUIRED_BMD_COLUMNS["bmd"]
-    mid_bmd = pd.concat([pd.read_csv(bmd_file).loc[:, cols] for bmd_file in bmd_files])
+    df = pd.concat([pd.read_csv(bmd_file).loc[:, cols] for bmd_file in bmd_files])
+    df = snakeify_all_columns(df)  # .rename(columns={"chemical_id": "tmp_id"})

-    dupes = mid_bmd[["Chemical_ID", "End_Point"]].duplicated().values
-    if dupes.any():
-        mid_bmd = mid_bmd[~dupes]
+    # Remove duplicate entries (based on 'chemical_id' and 'end_point')
+    df = df.drop_duplicates(subset=["chemical_id", "end_point"])

+    # Process extracts
     if is_extract:
-        sd_samp = (
-            samp_chem["Sample_ID"]
-            .str.split("-", expand=True)
-            .iloc[:, 0:2]
-            .rename(columns={0: "tmp_id", 1: "sub"})
-            .merge(samp_chem[["Sample_ID"]].drop_duplicates(), on="sub", how="left")
-        )
+        chem_data = snakeify_all_columns(chem_data)

-        full_bmd = (
-            mid_bmd.assign(tmp_id=mid_bmd["Chemical_ID"].astype(str))
-            .drop("Chemical_ID", axis=1)
-            .merge(sd_samp, on="tmp_id", how="left")
+        # Split `sample_id` column on '-', take first two parts, and rename split cols
+        chem_data[["tmp_id", "sub"]] = chem_data["sample_id"].str.split(
+            "-", expand=True, n=1
         )
+        # df["chemical_id"] = df["chemical_id"].astype(str)
+        df = df.merge(
+            chem_data[["chemical_id", "sample_id"]], on="chemical_id", how="outer"
+        )

-        nas = full_bmd["Sample_ID"].isna()
-        full_bmd.loc[nas, "Sample_ID"] = full_bmd.loc[nas, "tmp_id"]
-
-        new_nas = full_bmd["SampleName"].isna()
-        if new_nas.any():
-            full_bmd.loc[new_nas, "SampleName"] = (
-                "Sample " + full_bmd.loc[new_nas, "Sample_ID"]
-            )
+        # Fill NAs and format missing sample names as "Sample <id>"
+        df["sample_id"] = df["sample_id"].fillna(df["chemical_id"])
+        if "sample_name" not in df.columns:
+            df["sample_name"] = np.nan
+        df["sample_name"] = df["sample_name"].fillna(
+            "Sample " + df["sample_id"].astype(str)
+        )

-        full_bmd = (
-            full_bmd.fillna({"End_Point": "NoData"})
-            .merge(endpoint_details, on="End_Point", how="right")
-            .drop(columns=["End_Point", "tmp_id"])
-            .dropna(subset=["Sample_ID"])
+        # Fill missing 'end_point' values with "NoData", merge with `endpoint_details` on 'end_point',
+        # drop unused columns, remove rows with missing 'sample_id', drop duplicates,
+        # and fill remaining missing 'location_name' values with "None"
+        df = (
+            df.fillna({"end_point": "NoData"})
+            .merge(endpoint_details, on="end_point", how="right")
+            .drop(columns=["end_point", "chemical_id"])
+            .dropna(subset=["sample_id"])
             .drop_duplicates()
-            .fillna({"LocationName": "None"})
+            .fillna({"location_name": "None"})
         )
+
+    # Similar operations for non-extract case:
+    # Fill missing 'end_point' values with "NoData", merge with `endpoint_details` on 'end_point',
+    # drop unused column, remove rows with missing 'cas_number', drop duplicates,
+    # and fill remaining missing 'chemical_class' values with "Unclassified"
     else:
-        full_bmd = (
-            mid_bmd.fillna({"End_Point": "NoData"})
-            .merge(endpoint_details, on="End_Point", how="right")
-            .drop(columns="End_Point")
+        df = (
+            df.fillna({"end_point": "NoData"})
+            .merge(endpoint_details, on="end_point", how="right")
+            .drop(columns="end_point")
             .dropna(subset=["cas_number"])
             .drop_duplicates()
             .fillna({"chemical_class": "Unclassified"})
         )

-    full_bmd = (
-        full_bmd.rename(columns={"DataQC_Flag": "qc_num"})
-        .assign(
-            DataQC_Flag=full_bmd["qc_num"].replace(
-                {0: "Poor", 1: "Poor", 4: "Moderate", 5: "Moderate"}, regex=True
-            )
-        )
-        .assign(Model=lambda df: df["Model"].str.replace("NULL", "None"))
-        .drop(columns="qc_num")
-    )
+    # Grade data QC on scale and replace NULL values in model col
+    df["data_qc_flag"] = df["data_qc_flag"].map(QC_FLAGS).fillna("Good")
+    df["model"] = df["model"].str.replace("NULL", "None")

-    return full_bmd
+    return df


-def combine_chemical_fit_data(
-    bmd_files, is_extract=False, samp_chem=None, endpoint_details=None
-):
+def combine_chemical_data(
+    bmd_files: list[str],
+    data_type: str = "fit",
+    is_extract: bool = False,
+    chem_data: Optional[pd.DataFrame] = None,
+    endpoint_details: Optional[pd.DataFrame] = None,
+) -> pd.DataFrame:
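+    """Combine chemical fit or dose-response data across input files.
+
+    Consolidates the former `combine_chemical_fit_data` and
+    `combine_chemical_dose_data` functions; `data_type` selects the
+    column schema ("fit" -> fitVals, "dose" -> doseRep).
+
+    Parameters
+    ----------
+    bmd_files : list[str]
+        List of BMD files
+    data_type : str, optional
+        Type of data to combine, either "fit" or "dose", by default "fit"
+    is_extract : bool, optional
+        True if data is for extracts, by default False
+    chem_data : Optional[pd.DataFrame], optional
+        Tabulated chemical data, by default None
+    endpoint_details : Optional[pd.DataFrame], optional
+        Tabulated endpoint data, by default None
+
+    Returns
+    -------
+    pd.DataFrame
+        Combined fit or dose-response data table
+    """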
-    print(f"Combining fit files: {', '.join(bmd_files)}")
-    cols = REQUIRED_BMD_COLUMNS["fitVals"]
-    files = [pd.read_csv(bmd_file).loc[:, cols] for bmd_file in bmd_files]
-
-    chem_eps = [
-        set(
-            file.assign(combined=lambda df: df["Chemical_ID"] + df["End_Point"])[
-                "combined"
-            ]
-        )
-        for file in files
-    ]
-    new_chem_eps = chem_eps.copy()
-
-    if len(chem_eps) > 1:
-        for i in range(1, len(new_chem_eps)):
-            previous_eps = set.union(*chem_eps[:i])
-            new_chem_eps[i] -= previous_eps
-
-    fixed_files = []
-    for i, (file, new_eps) in enumerate(zip(files, new_chem_eps)):
-        fixed_files.append(
-            file.assign(combined=lambda df: df["Chemical_ID"] + df["End_Point"])
-            .query("combined in @new_eps")
-            .replace({"X_vals": "NULL"}, np.nan)
-            .assign(
-                X_vals=lambda df: df["X_vals"].astype(float),
-                Y_vals=lambda df: df["Y_vals"].astype(float),
-            )
-        )
+    print(f'Combining {data_type} files: {", ".join(bmd_files)}')
+
+    # Select the column schema for the requested data type
+    if data_type == "fit":
+        col_type = "fitVals"
+    elif data_type == "dose":
+        col_type = "doseRep"
+    else:
+        raise ValueError("Invalid data_type. Must be 'fit' or 'dose'.")
+    cols = REQUIRED_BMD_COLUMNS[col_type]

-    mid_bmd = pd.concat(fixed_files)
-    full_bmd = (
-        mid_bmd.merge(endpoint_details, on="End_Point", how="right")
-        .drop_duplicates()
-        .drop(columns=["End_Point", "Description"])
+    df = pd.concat([pd.read_csv(file)[cols] for file in bmd_files])
+    df = snakeify_all_columns(df)
+
+    # Deduplicate on the (chemical_id, end_point) pair, then drop the helper col
+    df["combined"] = df[["chemical_id", "end_point"]].astype(str).agg(" ".join, axis=1)
+    df = df.drop_duplicates(subset=["combined"], keep="last").drop(columns="combined")
+
+    if data_type == "fit":
+        df = df[df["x_vals"] != "NULL"]
+        df["x_vals"] = pd.to_numeric(df["x_vals"])
+        df["y_vals"] = pd.to_numeric(df["y_vals"])
+
+    df = df.merge(
+        snakeify_all_columns(endpoint_details),
+        on="end_point",
+        how="right",
     )
+    df = df.drop(columns=["end_point", "description"]).drop_duplicates()

     if is_extract:
-        sd_samp = (
-            samp_chem["Sample_ID"]
-            .str.split("-", expand=True)
-            .iloc[:, 0:2]
-            .rename(columns={0: "tmp_id", 1: "sub"})
-            .merge(samp_chem[["Sample_ID"]].drop_duplicates(), on="sub", how="left")
+        chem_data[["tmp_id", "sub"]] = chem_data["sample_id"].str.split(
+            "-", expand=True, n=1
         )
+        chem_data = chem_data[["sample_id", "tmp_id"]].drop_duplicates()

-        full_bmd = (
-            full_bmd.assign(tmp_id=full_bmd["Chemical_ID"].astype(str))
-            .drop("Chemical_ID", axis=1)
-            .merge(sd_samp, on="tmp_id", how="left")
-        )
+        df["tmp_id"] = df["chemical_id"].astype(str)
+        df = df.drop(columns="chemical_id").merge(chem_data, on="tmp_id", how="left")

-        nas = full_bmd["Sample_ID"].isna()
-        full_bmd.loc[nas, "Sample_ID"] = full_bmd.loc[nas, "tmp_id"]
-        full_bmd = full_bmd.drop(columns=["tmp_id"])
+        df["sample_id"] = df["sample_id"].fillna(df["tmp_id"])
+        df = df.drop(columns="tmp_id")

-    return full_bmd
+    return df.drop_duplicates()


 def _flatten_class_df(
@@ -338,25 +391,25 @@


 def masv_chem_class(
-    class_file,
+    class_file: str,
     save_to: str = "MASV_classAndSource.csv",
     id_cols: ArrayLike = ["CASNumber", "ParameterName"],
-):
+) -> pd.DataFrame:
     """Reads full MASV class annotations and assigns values to chemicals.

     Parameters
     ----------
-    class_file : _type_
-        _description_
+    class_file : str
+        Filename of class annotation data.
     save_to : str, optional
-        _description_, by default "MASV_classAndSource.csv"
+        Desired output filename, by default "MASV_classAndSource.csv"
     id_cols : ArrayLike, optional
-        _description_, by default ["CASNumber", "ParameterName"]
+        List of chemical ID cols, by default ["CASNumber", "ParameterName"]

     Returns
     -------
-    _type_
-        _description_
+    pd.DataFrame
+        Combined chemical class information
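+
+    Examples
+    --------
+    Hypothetical call; the file name is illustrative only:
+
+    >>> classes = masv_chem_class("masv_annotations.xlsx")  # doctest: +SKIP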
     """
     data = pd.read_excel(class_file, sheet_name=0)
@@ -389,62 +442,6 @@
     return combined


-def combine_chemical_dose_data(
-    bmd_files, is_extract=False, samp_chem=None, endpoint_details=None
-):
-    print(f'Combining dose response files: {", ".join(bmd_files)}')
-    cols = REQUIRED_BMD_COLUMNS["doseRep"]
-    files = [pd.read_csv(bmd_file).loc[:, cols] for bmd_file in bmd_files]
-
-    chem_eps = [
-        set(
-            file.assign(combined=lambda df: df["Chemical_ID"] + df["End_Point"])[
-                "combined"
-            ]
-        )
-        for file in files
-    ]
-    new_chem_eps = chem_eps.copy()
-
-    if len(chem_eps) > 1:
-        for i in range(1, len(new_chem_eps)):
-            previous_eps = set.union(*chem_eps[:i])
-            new_chem_eps[i] -= previous_eps
-
-    fixed_files = []
-    for i, (file, new_eps) in enumerate(zip(files, new_chem_eps)):
-        fixed_files.append(
-            file.assign(combined=lambda df: df["Chemical_ID"] + df["End_Point"])
-            .query("combined in @new_eps")
-            .loc[:, cols]
-        )
-
-    mid_bmd = pd.concat(fixed_files)
-    full_bmd = (
-        mid_bmd.merge(endpoint_details, on="End_Point", how="right")
-        .drop_duplicates()
-        .drop(columns=["End_Point", "Description"])
-    )
-
-    if is_extract:
-        sd_samp = (
-            samp_chem["Sample_ID"]
-            .str.split("-", expand=True)
-            .iloc[:, 0:2]
-            .rename(columns={0: "tmp_id", 1: "sub"})
-            .merge(samp_chem[["Sample_ID"]].drop_duplicates(), on="sub", how="left")
-        )
-
-        full_bmd = (
-            full_bmd.assign(tmp_id=full_bmd["Chemical_ID"].astype(str))
-            .drop("Chemical_ID", axis=1)
-            .merge(sd_samp, on="tmp_id", how="left")
-            .assign(Sample_ID=lambda df: df["Sample_ID"].fillna(df["tmp_id"]))
-        )
-
-    return full_bmd
-
-
 # =========================================================
 # Command Line Interface (CLI)
 # =========================================================
@@ -520,7 +517,7 @@
     parser.add_argument(
         "-m",
         "--sample_map",
-        dest="samp_map",
+        dest="sample_map",
         default="",
         help="File that maps sample locations",
     )
@@ -539,13 +536,14 @@
     )

     args = parser.parse_args()

     chem_class = masv_chem_class(args.chem_class_file)
-    chem_meta = build_chem_metadata(args.metadata)
+    chem_metadata = build_chem_metadata(args.metadata)

     sample_files_list = args.sample_files.split(",")
     chem_sample = build_sample_data(
-        sample_files_list, chem_meta, args.sample_id_file, args.samp_map
+        sample_files_list, chem_metadata, args.sample_id_file, args.sample_map
     )

     endpoint_details = get_endpoint_metadata(
@@ -558,30 +556,32 @@ def main():
     dose_files = [file for file in all_files if "dose" in file]
     fit_files = [file for file in all_files if "fit" in file]

-    meta_file = chem_sample if args.is_sample else chem_meta
+    chem_data = chem_sample if args.is_sample else chem_metadata
     bmds = (
         combine_v2_chemical_endpoint_data(
             bmd_files,
             is_extract=args.is_sample,
-            samp_chem=meta_file,
+            chem_data=chem_data,
             endpoint_details=endpoint_details,
         )
-        .dropna(subset=["BMD_Analysis_Flag"])
-        .query("BMD_Analysis_Flag != 'NA'")
+        .dropna(subset=["bmd_analysis_flag"])
+        .query("bmd_analysis_flag != 'NA'")
     )

-    curves = combine_chemical_fit_data(
+    curves = combine_chemical_data(
         fit_files,
+        data_type="fit",
         is_extract=args.is_sample,
-        samp_chem=meta_file,
+        chem_data=chem_data,
         endpoint_details=endpoint_details,
     )
-    dose_reps = combine_chemical_dose_data(
+    dose_reps = combine_chemical_data(
         dose_files,
+        data_type="dose",
         is_extract=args.is_sample,
-        samp_chem=meta_file,
+        chem_data=chem_data,
         endpoint_details=endpoint_details,
-    ).dropna(subset=["Dose"])
+    ).dropna(subset=["dose"])

     if args.is_sample:
         bmds.to_csv(
@@ -624,7 +624,7 @@ def main():
     )

     else:
-        chem_meta.to_csv(
+        chem_metadata.to_csv(
             os.path.join(args.output_dir, "chemicals.csv"), index=False, quotechar='"'
         )
         chem_sample[SAMPLE_COLUMNS].drop_duplicates().to_csv(
diff --git a/sampleChemMapping/src/format.py b/sampleChemMapping/src/format.py
index edccc7e..4b91ee1 100644
--- a/sampleChemMapping/src/format.py
+++ b/sampleChemMapping/src/format.py
@@ -1,7 +1,5 @@
 import re
-from numpy import nan
-from pandas import DataFrame, read_csv, to_numeric
-from .params import REQUIRED_SAMPLE_COLUMNS
+from pandas import DataFrame

 RENAME = {"casrn": "cas_number"}
@@ -30,6 +28,7 @@ def snakeify(name: str) -> str:
     # Convert camelCase to snake_case using regular expressions
     snake_case_name = re.sub("(.)([A-Z][a-z]+)", r"\1_\2", name)
     snake_case_name = re.sub("([a-z0-9])([A-Z])", r"\1_\2", snake_case_name).lower()
+    snake_case_name = snake_case_name.replace("__", "_")
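+    # Collapse double underscores left by names that already contain one,
+    # e.g. "Sample_Name" -> "sample_name" rather than "sample__name"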
     return snake_case_name
@@ -75,33 +74,6 @@ def chunker(seq: DataFrame, size: int) -> DataFrame:
     return (seq.iloc[pos : pos + size] for pos in range(0, len(seq), size))


-def process_fses(filename: str, snake_case: bool = True) -> DataFrame:
-    # Replace invalid values with nulls for filtering
-    df = read_csv(filename)[REQUIRED_SAMPLE_COLUMNS].replace(
-        {"BLOD": "0", "NULL": "0", "nc:BDL": "0"}
-    )
-    if snake_case:
-        df = snakeify_all_columns(df)
-
-    # Remove null and invalid entries
-    df = df[
-        (df["sample_number"].notna())
-        & (df["cas_number"].notna())
-        & (~df["measurement_value"].isin(["0", nan]))
-        & (~df["measurement_value_molar"].isin(["0", nan]))
-    ]
-
-    # Format FSES location data
-    df["location_lon"] = to_numeric(df["location_lon"], errors="coerce")
-
-    # Only allow negative longitudes
-    # Note: Our data is already all negative, so unnecessary?
-    # df["location_lon"] = np.where(
-    #     df["location_lon"].gt(0), -df["location_lon"], df["location_lon"]
-    # )
-    return df
-
-
 def rename_duplicates(data: DataFrame, col: str = "sample_name") -> list[str]:
     """Rename duplicate name entries with index, i.e. sample:01, etc.
diff --git a/sampleChemMapping/src/metadata.py b/sampleChemMapping/src/metadata.py
index cb375c7..c5f8550 100644
--- a/sampleChemMapping/src/metadata.py
+++ b/sampleChemMapping/src/metadata.py
@@ -266,33 +266,30 @@ def get_endpoint_metadata(filename: str) -> pd.DataFrame:
     pd.DataFrame
         _description_
     """
-    endpoint_details = (
-        pd.read_excel(filename, sheet_name=3)
-        .rename(
-            columns={
-                "Abbreviation": "End_Point",
-                "Simple name (<20char)": "End_Point_Name",
-                "Ontology Link": "endPointLink",
-            }
-        )
-        .assign(
-            IncludeInPortal="No",
-            End_Point="NoData",
-            End_Point_Name=None,
-            Description="No data",
-            endPointLink="",
-        )
-        .pipe(
-            lambda df: df.append(
-                {
-                    "IncludeInPortal": "No",
-                    "End_Point": "NoData",
-                    "End_Point_Name": None,
-                    "Description": "No data",
-                    "endPointLink": "",
-                },
-                ignore_index=True,
-            )
-        )
+    # Read file and rename cols to expected format
+    df = pd.read_excel(filename, sheet_name=3).rename(
+        columns={
+            "Abbreviation": "End_Point",
+            "Simple name (<20char)": "End_Point_Name",
+            "Ontology Link": "endPointLink",
+        }
     )
-    return endpoint_details
+
+    # Trim whitespace from 'End_Point' column
+    df["End_Point"] = df["End_Point"].str.strip()
+
+    # Create a new row for 'No Data'
+    no_data_row = pd.DataFrame(
+        {
+            "IncludeInPortal": ["No"],
+            "End_Point": ["NoData"],
+            "End_Point_Name": [None],
+            "Description": ["No data"],
+            "endPointLink": [""],
+        }
+    )
+
+    # Append the new row to the dataframe
+    df = pd.concat([df, no_data_row], ignore_index=True)
+
+    return snakeify_all_columns(df)
diff --git a/sampleChemMapping/src/params.py b/sampleChemMapping/src/params.py
index 77af62b..e006d38 100644
--- a/sampleChemMapping/src/params.py
+++ b/sampleChemMapping/src/params.py
@@ -1,3 +1,4 @@
+from pandas import isna
 # =========================================================
 # Define Columns / Table Schemas
 # =========================================================
@@ -156,6 +157,10 @@
     "uncategorized",
 ]

+QC_FLAGS = {0: "Poor", 1: "Poor", 4: "Moderate", 5: "Moderate"}
+# BMD_FLAGS ?
+# QC_SCALE = {0: "Poor", 1: "Poor", 2: "Good", 3: "Good", 4: "Moderate", 5: "Poor"}
+

 # Map the newClass values based on conditions provided
 def map_classification(x):
@@ -179,7 +184,7 @@ def map_classification(x):
         return "Natural"
     elif x == "pestFungicide":
         return "Fungicide"
-    elif pd.isna(x) or x == "NA":
+    elif isna(x) or x == "NA":
         return "Unclassified"
     else:
         return x
diff --git a/sampleChemMapping/src/tables.py b/sampleChemMapping/src/tables.py
index 92b2bd7..6c71653 100644
--- a/sampleChemMapping/src/tables.py
+++ b/sampleChemMapping/src/tables.py
@@ -75,7 +75,7 @@ def chem_id_master_table(


 def sample_id_master_table(
-    existing_sample_numbers: ArrayLike, smap: str
+    existing_sample_numbers: ArrayLike, sample_id_file: str
 ) -> pd.DataFrame:
     """Generate master table for sample ID.

@@ -86,15 +86,17 @@
     ----------
     existing_sample_numbers : ArrayLike
         _description_
-    smap : str
-        _description_
+    sample_id_file : str
+        Path to the sample ID mapping CSV file

     Returns
     -------
     pd.DataFrame
         Sample ID master table
     """
-    map_df = pd.read_csv(smap)[["Sample_ID", "SampleNumber"]].drop_duplicates()
+    map_df = pd.read_csv(sample_id_file)[
+        ["Sample_ID", "SampleNumber"]
+    ].drop_duplicates()

     missing = set(existing_sample_numbers) - set(map_df["SampleNumber"])
     if missing: