diff --git a/src/relation_validator/utils/utils.py b/src/relation_validator/utils/utils.py index b3b57cd..ed1999a 100644 --- a/src/relation_validator/utils/utils.py +++ b/src/relation_validator/utils/utils.py @@ -22,6 +22,14 @@ # LIMIT """ +QUERY_LABEL = """ + VALUES (?subject) {{ + {terms} + }} + ?subject rdfs:label ?object . + # LIMIT +""" + DEFAULT_STYLE = "obograph-style.json" @@ -211,14 +219,25 @@ def to_set(term_pairs: Set[str]) -> Set[Tuple[str, str]]: def get_labels(data: DataFrame) -> Dict[str, str]: """ - Get labels from terms in table + Get labels from terms in table or search for missing labels in Ubergraph """ labels = {} + terms = set() for _, row in data.iterrows(): if row["s"] not in labels: - labels[row["s"]] = row["slabel"] + if row["slabel"] != "": + labels[row["s"]] = row["slabel"] + else: + terms.add(f"({row['s']})") if row["o"] not in labels: - labels[row["o"]] = row["olabel"] + if row["olabel"] != "": + labels[row["o"]] = row["olabel"] + else: + terms.add(f"({row['o']})") + + ont_labels = search_labels(terms) + for term, label in ont_labels: + labels[term] = label return labels @@ -234,4 +253,60 @@ def tsv_or_csv(filename: Path) -> tuple[str, str]: if "tsv" in extension: sep = "\t" - return temp_filename, sep + return temp_filename, sep, extension + + +def save_tsv_or_csv(data: DataFrame, filename: str): + """ + Save DataFrame with file proper TSV or CSV extension + """ + if str(filename).endswith(".csv"): + sep = ',' + elif str(filename).endswith(".tsv"): + sep = '\t' + else: + raise ValueError("Unsupported file extension. 
Please provide a .csv or .tsv file.") + data.to_csv(filename, sep=sep, index=False) + + +def parse_table(data: DataFrame) -> DataFrame: + """ + Parse generic tree table to pairs table + """ + terms = {} + # Generate dict with terms and label + for _, row in data.iterrows(): + for term_id, label in chunks(data.columns, 2): + terms[row.loc[term_id]] = row.loc[label] + + table_parsed = [] + for _, row in data.filter(regex=".*ID").iterrows(): + for current, next in zip(row, row[1:]): + r = {} + r['s'] = next + r['slabel'] = "" + r['user_slabel'] = terms[next] + r['o'] = current + r['olabel'] = "" + r['user_olabel'] = terms[current] + table_parsed.append(r) + + return DataFrame.from_records(table_parsed).drop_duplicates() + + +def search_labels(terms: set) -> set: + """ + Search label for terms in Ubergraph + """ + result = set() + for chunk in chunks(list(terms), 90): + result = result.union( + extract_results( + query_ubergraph( + QUERY_LABEL.format( + terms=" ".join(chunk) + ) + ) + ) + ) + return result diff --git a/src/relation_validator/validator.py b/src/relation_validator/validator.py index 2c3b491..7d22008 100644 --- a/src/relation_validator/validator.py +++ b/src/relation_validator/validator.py @@ -7,8 +7,9 @@ import pandas as pd from .utils.utils import (get_config, get_labels, get_obograph, - get_ontologies_version, get_pairs, save_obograph, - split_terms, to_set, tsv_or_csv, verify_relationship) + get_ontologies_version, get_pairs, parse_table, + save_obograph, save_tsv_or_csv, split_terms, to_set, + tsv_or_csv, verify_relationship) def run_validation( @@ -45,15 +46,17 @@ def validate(args): return exit filename = config["filename"] - temp_filename, sep = tsv_or_csv(filename) + temp_filename, sep, ext = tsv_or_csv(filename) data = pd.read_csv(filename, sep=sep) + if config["to_be_parsed"]: + data = parse_table(data) + save_tsv_or_csv(data, f"{temp_filename}_parsed{ext}") report, rel_terms = run_validation(data, config["relationships"]) output_filename = 
args.output - _, sep = tsv_or_csv(output_filename) - report.to_csv(output_filename, sep=sep, index=False) + save_tsv_or_csv(report, output_filename) labels = get_labels(data) graph = get_obograph(rel_terms, labels, config["relationships"]) diff --git a/tests/generic-test.tsv b/tests/generic-test.tsv new file mode 100644 index 0000000..ba8e875 --- /dev/null +++ b/tests/generic-test.tsv @@ -0,0 +1 @@ +s slabel user_slabel o olabel user_olabel diff --git a/tests/test-generic.png b/tests/test-generic.png new file mode 100644 index 0000000..857259b Binary files /dev/null and b/tests/test-generic.png differ diff --git a/tests/test-generic.tsv b/tests/test-generic.tsv new file mode 100644 index 0000000..0e542d3 --- /dev/null +++ b/tests/test-generic.tsv @@ -0,0 +1,4 @@ +Organ_Level_1 ID Organ_Level_1 Anatomical_Level_2 ID Anatomical_Level_2 Anatomical_Level_3 ID Anatomical_Level_3 Anatomical_Level_4 ID Anatomical_Level_4 +UBERON:0000955 Brain UBERON:0001890 Forebrain UBERON:0001894 Diencephalon UBERON:0001900 Prethalamus +UBERON:0000955 Brain UBERON:0001890 Forebrain UBERON:0001894 Diencephalon UBERON:0001898 Hypothalamus +UBERON:0000955 Brain UBERON:0001890 Forebrain UBERON:0001894 Diencephalon UBERON:0001900 Subthalamus \ No newline at end of file diff --git a/tests/test-generic_parsed.tsv b/tests/test-generic_parsed.tsv new file mode 100644 index 0000000..ae72beb --- /dev/null +++ b/tests/test-generic_parsed.tsv @@ -0,0 +1,5 @@ +s slabel user_slabel o olabel user_olabel +UBERON:0001890 Forebrain UBERON:0000955 Brain +UBERON:0001894 Diencephalon UBERON:0001890 Forebrain +UBERON:0001900 Subthalamus UBERON:0001894 Diencephalon +UBERON:0001898 Hypothalamus UBERON:0001894 Diencephalon diff --git a/tests/test-parser.yaml b/tests/test-parser.yaml index 0b470fb..4401172 100644 --- a/tests/test-parser.yaml +++ b/tests/test-parser.yaml @@ -4,4 +4,5 @@ relationships: has_part: BFO:0000051 # has_soma_location: RO:0002100 -filename: tests/placenta.csv \ No newline at end of file 
+filename: tests/test-generic.tsv +to_be_parsed: true \ No newline at end of file