From 259e0d9337def4999ba14357abc46e5c8eb3be98 Mon Sep 17 00:00:00 2001 From: Joe Flack Date: Sun, 8 Sep 2024 00:50:07 -0400 Subject: [PATCH] Add alt & included symbol synonyms - Update: Now adding synonyms for alt & included symbols, w/ type of mondo#abbreviation. - Misc updates: Todo comments. Renamed variables/methods for consistency and accuracy. Refactored some things. --- omim2obo/main.py | 95 ++++++++++++++++++--------- omim2obo/parsers/omim_entry_parser.py | 36 +++++----- 2 files changed, 85 insertions(+), 46 deletions(-) diff --git a/omim2obo/main.py b/omim2obo/main.py index 69790e1..889cbea 100644 --- a/omim2obo/main.py +++ b/omim2obo/main.py @@ -52,7 +52,7 @@ from rdflib.term import Identifier from omim2obo.namespaces import * -from omim2obo.parsers.omim_entry_parser import get_alt_labels, get_pubs, \ +from omim2obo.parsers.omim_entry_parser import parse_alt_and_included_titles, get_pubs, \ get_mapped_ids, LabelCleaner from omim2obo.config import ROOT_DIR, GLOBAL_TERMS from omim2obo.parsers.omim_txt_parser import * @@ -165,20 +165,28 @@ def omim2obo(use_cache: bool = False): # - Non-deprecated # Parse titles - omim_type, pref_labels_str, alt_labels, inc_labels = omim_type_and_titles[omim_id] - other_labels = [] - cleaned_inc_labels = [] - label_endswith_included_alt = False - label_endswith_included_inc = False - pref_labels: List[str] = [x.strip() for x in pref_labels_str.split(';')] - pref_title: str = pref_labels[0] - pref_symbols: List[str] = pref_labels[1:] - if alt_labels: - cleaned_alt_labels, label_endswith_included_alt = get_alt_labels(alt_labels) - other_labels += cleaned_alt_labels - if inc_labels: - cleaned_inc_labels, label_endswith_included_inc = get_alt_labels(inc_labels) - # other_labels += cleaned_inc_labels # deactivated 7/2024 in favor of alternative for tagging 'included' + omim_type, pref_titles_str, alt_titles_str, inc_titles_str = omim_type_and_titles[omim_id] + alt_titles: List[str] = [] + alt_symbols: List[str] = [] + alt_title_endswith_included = False + included_titles: List[str] = [] + included_symbols: List[str] = [] + included_title_endswith_included = False + + pref_titles: List[str] = [x.strip() for x in pref_titles_str.split(';')] + pref_title: str = pref_titles[0] + pref_symbols: List[str] = pref_titles[1:] + # TODO: separate symbols from titles (2x) + # - do this in the func itself + # TODO: Refactor this redundant code block? + # TODO: finally: I think parse_alt_and_included_labels() might be problematic. It returns this bool if case in + # any of the titles, but doesn't say which one + if alt_titles_str: + alt_titles, alt_symbols, alt_title_endswith_included = \ + parse_alt_and_included_titles(alt_titles_str) + if inc_titles_str: + included_titles, included_symbols, included_title_endswith_included = \ + parse_alt_and_included_titles(inc_titles_str) # Special cases depending on OMIM term type is_gene = omim_type == OmimType.GENE or omim_type == OmimType.HAS_AFFECTED_FEATURE @@ -206,28 +214,53 @@ def omim2obo(use_cache: bool = False): # todo: .clean()/.cleanup_label() 2nd param `explicit_abbrev` should be List[str] instead of str. And below, # should pass all symbols/abbrevs from each of preferred, alt, included each time it is called. If no symbols # for given term, should pass empty list. See: https://github.com/monarch-initiative/omim/issues/129 - abbrev: Union[str, None] = None if not pref_symbols else pref_symbols[0] + pref_abbrev: Union[str, None] = None if not pref_symbols else pref_symbols[0] # Add synonyms - graph.add((omim_uri, oboInOwl.hasExactSynonym, Literal(label_cleaner.clean(pref_title, abbrev)))) - for alt_label in other_labels: - graph.add((omim_uri, oboInOwl.hasExactSynonym, Literal(label_cleaner.clean(alt_label, abbrev)))) - for abbreviation in pref_symbols: - graph.add((omim_uri, oboInOwl.hasExactSynonym, Literal(abbreviation))) - # Reify on abbreviations. See: https://github.com/monarch-initiative/omim/issues/2 - axiom = BNode() - graph.add((axiom, RDF.type, OWL.Axiom)) - graph.add((axiom, OWL.annotatedSource, omim_uri)) - graph.add((axiom, OWL.annotatedProperty, oboInOwl.hasExactSynonym)) - graph.add((axiom, OWL.annotatedTarget, Literal(abbreviation))) - graph.add((axiom, OBOINOWL.hasSynonymType, MONDONS.abbreviation)) + graph.add((omim_uri, oboInOwl.hasExactSynonym, Literal(label_cleaner.clean(pref_title, pref_abbrev)))) + for alt_title in alt_titles: + graph.add((omim_uri, oboInOwl.hasExactSynonym, Literal(label_cleaner.clean(alt_title, pref_abbrev)))) + # TODO: add abbrevs for all types. this good now? just check, then remove theis temp code + i = 0 + for abbrevs in [pref_symbols, alt_symbols, included_symbols]: + i += 1 + if i == 2 and alt_symbols: + print() # TODO: make sure at least one case + if i == 3 and included_symbols: + print() # TODO: make sure at least one case + for abbreviation in abbrevs: + graph.add((omim_uri, oboInOwl.hasExactSynonym, Literal(abbreviation))) + # Reify on abbreviations. See: https://github.com/monarch-initiative/omim/issues/2 + axiom = BNode() + graph.add((axiom, RDF.type, OWL.Axiom)) + graph.add((axiom, OWL.annotatedSource, omim_uri)) + graph.add((axiom, OWL.annotatedProperty, oboInOwl.hasExactSynonym)) + graph.add((axiom, OWL.annotatedTarget, Literal(abbreviation))) + graph.add((axiom, OBOINOWL.hasSynonymType, MONDONS.abbreviation)) # Add 'included' entry properties included_detected_comment = "This term has one or more labels that end with ', INCLUDED'." - if label_endswith_included_alt or label_endswith_included_inc: + if alt_title_endswith_included or included_title_endswith_included: graph.add((omim_uri, RDFS['comment'], Literal(included_detected_comment))) - for included_label in cleaned_inc_labels: - graph.add((omim_uri, URIRef(INCLUDED_URI), Literal(label_cleaner.clean(included_label, abbrev)))) + # TODO: are these correct? Do all such labels in inc_labels and alt_labels end with 'INCLUDED'? Or just 1 of + # them, given this boolean? there probably is only 1 such title, and otherwise are symbols. so need to rrefactor. + # should not be iterating here. symbols should be added elsewhere + # - If #1 and #2 never happen, then the _parse*() func shouldn't return this bool, or we shouldn't use it. + # And if we don't use it, then if we only set titles = parse*(), does the boolean get tacked on there? if + # not, then remove its assignment. + for alt_title in alt_titles: + # TODO: #1 Check: do alt titles really ever end with text 'included'? if not, remove this whole variable + if alt_title_endswith_included: + graph.add((omim_uri, URIRef(INCLUDED_URI), Literal(label_cleaner.clean(alt_title, pref_abbrev)))) + # TODO: Don't we want to add synonym otherwise? + # TODO: Ref issue here if exists, else make, and then convert to lowercase todo + else: + print() + # graph.add((omim_uri, oboInOwl.hasExactSynonym, Literal(label_cleaner.clean(alt_title, pref_abbrev)))) + for included_title in included_titles: + if not included_title_endswith_included: # #2 TODO: this shouldn't happen. check + print() + graph.add((omim_uri, URIRef(INCLUDED_URI), Literal(label_cleaner.clean(included_title, pref_abbrev)))) # Gene ID # Why is 'skos:exactMatch' appropriate for disease::gene relationships? - joeflack4 2022/06/06 diff --git a/omim2obo/parsers/omim_entry_parser.py b/omim2obo/parsers/omim_entry_parser.py index b97a618..c9e8b11 100644 --- a/omim2obo/parsers/omim_entry_parser.py +++ b/omim2obo/parsers/omim_entry_parser.py @@ -38,10 +38,10 @@ def transform_entry(entry) -> Graph: omim_uri = URIRef(OMIM[omim_num]) other_labels = [] if 'alternativeTitles' in titles: - cleaned, label_endswith_included = get_alt_labels(titles['alternativeTitles']) + cleaned, label_endswith_included = parse_alt_and_included_titles(titles['alternativeTitles']) other_labels += cleaned if 'includedTitles' in titles: - cleaned, label_endswith_included = get_alt_labels(titles['includedTitles']) + cleaned, label_endswith_included = parse_alt_and_included_titles(titles['includedTitles']) other_labels += cleaned graph.add((omim_uri, RDF.type, OWL.Class)) @@ -165,6 +165,7 @@ def _detect_abbreviations( return replacements +# todo: This step should no longer be necessary as it is now done beforehand: "remove the abbreviation suffixes" # todo: explicit_abbrev: Change to List[str]. See: https://github.com/monarch-initiative/omim/issues/129 def cleanup_label( label: str, @@ -268,27 +269,32 @@ def cleanup_label( return formatted_label -def get_alt_labels(titles: str) -> Tuple[List[str], bool]: - """ - From a string of delimited titles, make an array. - This assumes that the titles are double-semicolon (';;') delimited. - This will additionally pass each through the _cleanup_label method to - convert the screaming ALL CAPS to something more pleasant to read. - :param titles: - :return: an array of cleaned-up labels - """ +# TOOD: get symbols +def parse_alt_and_included_titles(titles: str) -> Tuple[List[str], List[str], bool]: + """Parse delimited titles/symbol pairs from string to list, and detect any 'included' cases. + + This assumes that the titles are double-semicolon (';;') delimited. This will additionally pass each through the + _cleanup_label() method to convert the screaming ALL CAPS to something more pleasant to read. + :param titles: a string of 1+ pairs of symbol/titles, 1 title and and 0-2+ symbols per pair, e.g.: + Alternative Title(s); symbol(s): + ACROCEPHALOSYNDACTYLY, TYPE V; ACS5;; ACS V;; NOACK SYNDROME + Included Title(s); symbols: + CRANIOFACIAL-SKELETAL-DERMATOLOGIC DYSPLASIA, INCLUDED + + :return: + List[str]: cleaned-up titles + List[str]: symbols + bool: whether any of the labels ended with 'included' + """ labels = [] label_endswith_included = False - # "alternativeTitles": " - # ACROCEPHALOSYNDACTYLY, TYPE V; ACS5;;\nACS V;;\nNOACK SYNDROME", - # "includedTitles": - # "CRANIOFACIAL-SKELETAL-DERMATOLOGIC DYSPLASIA, INCLUDED" for title in titles.split(';;'): # remove ', included', if present title = title.strip() label = re.sub(r',\s*INCLUDED', '', title, re.IGNORECASE) label_endswith_included = label != title + # TODO: Only use this on titles, not symbols label = cleanup_label(label) labels.append(label)