Skip to content

Commit

Permalink
Merge pull request #11 from INCATools/anitacaron/issue9
Browse files Browse the repository at this point in the history
Enable parsing of tree tables before validation
  • Loading branch information
Anita Caron authored Apr 26, 2024
2 parents eda7019 + a1409a9 commit f172a3e
Show file tree
Hide file tree
Showing 7 changed files with 99 additions and 10 deletions.
83 changes: 79 additions & 4 deletions src/relation_validator/utils/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,14 @@
# LIMIT
"""

# SPARQL graph-pattern template used to look up rdfs:label values.
# It is filled in with str.format(terms=...), so the doubled braces are
# escaped literal braces and {terms} is the only placeholder; each term is
# expected to be already wrapped in parentheses, e.g. "(UBERON:0000955)".
# NOTE(review): the "# LIMIT" line is part of the string; presumably it is a
# marker that the query helper replaces or ignores -- confirm against its use.
QUERY_LABEL = """
VALUES (?subject) {{
{terms}
}}
?subject rdfs:label ?object .
# LIMIT
"""

# NOTE(review): presumably the file name of the default OBO graph style
# sheet -- confirm where it is consumed.
DEFAULT_STYLE = "obograph-style.json"


Expand Down Expand Up @@ -211,14 +219,25 @@ def to_set(term_pairs: Set[str]) -> Set[Tuple[str, str]]:

def get_labels(data: DataFrame) -> Dict[str, str]:
    """
    Get labels from terms in table or search for missing labels in Ubergraph

    The table must provide the columns ``s``/``slabel`` (subject term and its
    label) and ``o``/``olabel`` (object term and its label). The first
    non-missing label seen for a term wins. A label counts as missing when it
    is the empty string or NaN (what ``read_csv`` produces for empty cells).

    Terms that still have no label after the whole table was scanned are
    looked up with ``search_labels``; that call is skipped entirely when every
    term already has a label.

    Returns a dict mapping term to label. Terms for which neither the table
    nor the lookup yields a label are absent from the dict.
    """
    # Function-scope import keeps this block independent of the module header.
    from pandas import isna

    labels = {}
    missing = set()
    for _, row in data.iterrows():
        for term, label in (
            (row["s"], row["slabel"]),
            (row["o"], row["olabel"]),
        ):
            if term in labels:
                continue
            if isna(label) or label == "":
                missing.add(term)
            else:
                labels[term] = label

    # A later row may have supplied a label for a term first seen without one.
    missing.difference_update(labels)

    if missing:
        # The label query expects each term wrapped in parentheses.
        ont_labels = search_labels({f"({term})" for term in missing})
        for term, label in ont_labels:
            labels[term] = label

    return labels

Expand All @@ -234,4 +253,60 @@ def tsv_or_csv(filename: Path) -> tuple[str, str]:
if "tsv" in extension:
sep = "\t"

return temp_filename, sep
return temp_filename, sep, extension


def save_tsv_or_csv(data: DataFrame, filename: str):
    """
    Save DataFrame to a file using the separator that matches its extension

    ``filename`` may be a string or any object whose ``str()`` is the path.
    A name ending in ``.csv`` is written comma-separated and one ending in
    ``.tsv`` tab-separated; the match ignores case, so ``.TSV`` is accepted.
    The index is not written.

    Raises ValueError for any other extension, before anything is written.
    """
    separators = {".csv": ",", ".tsv": "\t"}
    lowered = str(filename).lower()
    sep = next(
        (value for ext, value in separators.items() if lowered.endswith(ext)),
        None,
    )
    if sep is None:
        raise ValueError("Unsupported file extension. Please provide a .csv or .tsv file.")
    data.to_csv(filename, sep=sep, index=False)


def parse_table(data: DataFrame) -> DataFrame:
    """
    Parse generic tree table to pairs table

    The input columns are read as consecutive (ID column, label column)
    pairs, one pair per tree level, ordered from the root level on the left
    to the deepest level on the right. The ID columns are the ones whose name
    matches the regex ``.*ID``.

    For every row, each two adjacent ID columns yield one record in which the
    deeper (right-hand) term is the subject ``s`` and the left-hand term is
    the object ``o``. ``user_slabel``/``user_olabel`` carry the labels given
    in the table; when a term appears with several labels the last one read
    (row by row, left to right) is used. ``slabel``/``olabel`` are left as
    empty strings.

    Pairs in which either ID is missing (NaN, e.g. rows that stop before the
    deepest level) are skipped. Duplicate records are dropped. The result
    always has the six columns below, even when no pair was produced.
    """
    # Function-scope import keeps this block independent of the module header.
    from pandas import isna

    columns = ["s", "slabel", "user_slabel", "o", "olabel", "user_olabel"]

    # Columns alternate ID, label, ID, label, ...; a trailing unpaired
    # column is ignored.
    column_pairs = list(zip(data.columns[::2], data.columns[1::2]))

    # Map every term to the label the user supplied for it (last one wins).
    user_labels = {}
    for _, row in data.iterrows():
        for id_column, label_column in column_pairs:
            user_labels[row.loc[id_column]] = row.loc[label_column]

    records = []
    for _, ids in data.filter(regex=".*ID").iterrows():
        # Walk adjacent levels: (level 1, level 2), (level 2, level 3), ...
        for parent, child in zip(ids, ids.iloc[1:]):
            if isna(parent) or isna(child):
                continue
            records.append(
                {
                    "s": child,
                    "slabel": "",
                    "user_slabel": user_labels[child],
                    "o": parent,
                    "olabel": "",
                    "user_olabel": user_labels[parent],
                }
            )

    return DataFrame(records, columns=columns).drop_duplicates()


def search_labels(terms: set) -> set:
    """
    Search label for terms in Ubergraph

    ``terms`` holds the terms to look up, each already formatted for the
    VALUES clause of ``QUERY_LABEL``. The terms are queried in batches of at
    most 90 (presumably to keep each query small -- confirm), and whatever
    ``extract_results`` returns for each batch is merged into one set.
    """
    found = set()
    for batch in chunks(list(terms), 90):
        values_clause = " ".join(batch)
        response = query_ubergraph(QUERY_LABEL.format(terms=values_clause))
        found.update(extract_results(response))
    return found
13 changes: 8 additions & 5 deletions src/relation_validator/validator.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,9 @@
import pandas as pd

from .utils.utils import (get_config, get_labels, get_obograph,
get_ontologies_version, get_pairs, save_obograph,
split_terms, to_set, tsv_or_csv, verify_relationship)
get_ontologies_version, get_pairs, parse_table,
save_obograph, save_tsv_or_csv, split_terms, to_set,
tsv_or_csv, verify_relationship)


def run_validation(
Expand Down Expand Up @@ -45,15 +46,17 @@ def validate(args):
return exit

filename = config["filename"]
temp_filename, sep = tsv_or_csv(filename)
temp_filename, sep, ext = tsv_or_csv(filename)

data = pd.read_csv(filename, sep=sep)
if config["to_be_parsed"]:
data = parse_table(data)
save_tsv_or_csv(data, f"{temp_filename}_parsed{ext}")

report, rel_terms = run_validation(data, config["relationships"])

output_filename = args.output
_, sep = tsv_or_csv(output_filename)
report.to_csv(output_filename, sep=sep, index=False)
save_tsv_or_csv(report, output_filename)

labels = get_labels(data)
graph = get_obograph(rel_terms, labels, config["relationships"])
Expand Down
1 change: 1 addition & 0 deletions tests/generic-test.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
s slabel user_slabel o olabel user_olabel
Binary file added tests/test-generic.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
4 changes: 4 additions & 0 deletions tests/test-generic.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
Organ_Level_1 ID Organ_Level_1 Anatomical_Level_2 ID Anatomical_Level_2 Anatomical_Level_3 ID Anatomical_Level_3 Anatomical_Level_4 ID Anatomical_Level_4
UBERON:0000955 Brain UBERON:0001890 Forebrain UBERON:0001894 Diencephalon UBERON:0001900 Prethalamus
UBERON:0000955 Brain UBERON:0001890 Forebrain UBERON:0001894 Diencephalon UBERON:0001898 Hypothalamus
UBERON:0000955 Brain UBERON:0001890 Forebrain UBERON:0001894 Diencephalon UBERON:0001900 Subthalamus
5 changes: 5 additions & 0 deletions tests/test-generic_parsed.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
s slabel user_slabel o olabel user_olabel
UBERON:0001890 Forebrain UBERON:0000955 Brain
UBERON:0001894 Diencephalon UBERON:0001890 Forebrain
UBERON:0001900 Subthalamus UBERON:0001894 Diencephalon
UBERON:0001898 Hypothalamus UBERON:0001894 Diencephalon
3 changes: 2 additions & 1 deletion tests/test-parser.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -4,4 +4,5 @@ relationships:
has_part: BFO:0000051
# has_soma_location: RO:0002100

filename: tests/placenta.csv
filename: tests/test-generic.tsv
to_be_parsed: true

0 comments on commit f172a3e

Please sign in to comment.