Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Abbreviation recasing: use all abbreviations #153

Merged
merged 4 commits into from
Dec 11, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
34 changes: 19 additions & 15 deletions omim2obo/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,8 @@
Assumptions
1. Mappings obtained from official OMIM files as described above are interpreted correctly (e.g. skos:exactMatch).
"""
from typing import Set

import yaml
from hashlib import md5

Expand All @@ -59,8 +61,8 @@

from omim2obo.config import REVIEW_CASES_PATH, ROOT_DIR, GLOBAL_TERMS
from omim2obo.namespaces import *
from omim2obo.parsers.omim_entry_parser import REVIEW_CASES, log_review_cases, cleanup_title, \
get_alt_and_included_titles_and_symbols, get_pubs, get_mapped_ids, capitalize_acronyms_in_title
from omim2obo.parsers.omim_entry_parser import REVIEW_CASES, cleanup_title, get_alt_and_included_titles_and_symbols, \
get_pubs, get_mapped_ids, log_review_cases, recapitalize_acronyms_in_titles
from omim2obo.parsers.omim_txt_parser import * # todo: change to specific imports


Expand Down Expand Up @@ -233,6 +235,16 @@ def omim2obo(use_cache: bool = False):
get_alt_and_included_titles_and_symbols(inc_titles_str)
included_is_included = included_titles or included_symbols # redundant. can't be included symbol w/out title

# Recapitalize acronyms in titles
all_abbrevs: Set[str] = \
set(pref_symbols + alt_symbols + former_alt_symbols + included_symbols + former_included_symbols)
# todo: consider DRYing to 1 call by passing all 5 title types to a wrapper function
pref_title = recapitalize_acronyms_in_titles(pref_title, all_abbrevs)
alt_titles = recapitalize_acronyms_in_titles(alt_titles, all_abbrevs)
former_alt_titles = recapitalize_acronyms_in_titles(former_alt_titles, all_abbrevs)
included_titles = recapitalize_acronyms_in_titles(included_titles, all_abbrevs)
former_included_titles = recapitalize_acronyms_in_titles(former_included_titles, all_abbrevs)

# Special cases depending on OMIM term type
is_gene = omim_type == OmimType.GENE or omim_type == OmimType.HAS_AFFECTED_FEATURE
if omim_type == OmimType.HERITABLE_PHENOTYPIC_MARKER: # '%' char
Expand All @@ -256,25 +268,19 @@ def omim2obo(use_cache: bool = False):
else:
graph.add((omim_uri, RDFS.label, Literal(pref_title)))

# todo: .clean()/.cleanup_label() 2nd param `explicit_abbrev` should be List[str] instead of str. And below,
# should pass all symbols/abbrevs from each of preferred, alt, included each time it is called. If no symbols
# for given term, should pass empty list. See: https://github.com/monarch-initiative/omim/issues/129
pref_abbrev: Union[str, None] = None if not pref_symbols else pref_symbols[0]

# Add synonyms
# - exact titles
graph.add((omim_uri, oboInOwl.hasExactSynonym, Literal(capitalize_acronyms_in_title(pref_title, pref_abbrev))))
graph.add((omim_uri, oboInOwl.hasExactSynonym, Literal(pref_title)))
for title in alt_titles:
graph.add((omim_uri, oboInOwl.hasExactSynonym, Literal(capitalize_acronyms_in_title(title, pref_abbrev))))
graph.add((omim_uri, oboInOwl.hasExactSynonym, Literal(title)))
# - exact abbreviations
for abbrevs in [pref_symbols, alt_symbols]:
for abbreviation in abbrevs:
add_triple_and_optional_annotations(graph, omim_uri, oboInOwl.hasExactSynonym, abbreviation,
[(oboInOwl.hasSynonymType, OMO['0003000'])])
# - related, deprecated 'former' titles
for title in former_alt_titles:
clean_title = capitalize_acronyms_in_title(title, pref_abbrev)
add_triple_and_optional_annotations(graph, omim_uri, oboInOwl.hasRelatedSynonym, clean_title,
add_triple_and_optional_annotations(graph, omim_uri, oboInOwl.hasRelatedSynonym, title,
[(OWL.deprecated, Literal(True))])
# - related, deprecated 'former' abbreviations
for abbreviation in former_alt_symbols:
Expand All @@ -288,8 +294,7 @@ def omim2obo(use_cache: bool = False):
graph.add((omim_uri, RDFS['comment'], Literal(included_comment)))
# - titles
for title in included_titles:
graph.add((
omim_uri, URIRef(MONDONS.omim_included), Literal(capitalize_acronyms_in_title(title, pref_abbrev))))
graph.add((omim_uri, URIRef(MONDONS.omim_included), Literal(title)))
# - symbols
for symbol in included_symbols:
add_triple_and_optional_annotations(graph, omim_uri, URIRef(MONDONS.omim_included), symbol, [
Expand All @@ -298,8 +303,7 @@ def omim2obo(use_cache: bool = False):
])
# - deprecated, 'former'
for title in former_included_titles:
clean_title = capitalize_acronyms_in_title(title, pref_abbrev)
add_triple_and_optional_annotations(graph, omim_uri, URIRef(MONDONS.omim_included), clean_title,
add_triple_and_optional_annotations(graph, omim_uri, URIRef(MONDONS.omim_included), title,
[(OWL.deprecated, Literal(True))])
for symbol in former_included_symbols:
add_triple_and_optional_annotations(graph, omim_uri, URIRef(MONDONS.omim_included), symbol, [
Expand Down
103 changes: 64 additions & 39 deletions omim2obo/parsers/omim_entry_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
import csv
import logging
from collections import defaultdict
from typing import List, Dict, Tuple
from typing import List, Dict, Set, Tuple, Union

import pandas as pd
from rdflib import Graph, RDF, RDFS, DC, Literal, OWL, SKOS, URIRef
Expand All @@ -26,15 +26,21 @@

def get_known_capitalizations() -> Dict[str, str]:
"""Get list of known capitalizations for proper names, acronyms, and the like.
TODO: Contains space-delimited words, e.g. "vitamin d". The way that
todo: Contains space-delimited words, e.g. "vitamin d". The way that
cleanup_label is currently implemented, each word in the label gets
replaced; i.e. it would try to replace "vitamin" and "d" separately. Hence,
this would fail.
Therefore, we should probably do this in 2 different operations: (1) use
the current 'word replacement' logic, but also, (2), at the end, do a
generic string replacement (e.g. my_str.replace(a, b). When implementing
(2), we should also split this dictionary into two separate dictionaries,
each for 1 of these 2 different purposes."""
each for 1 of these 2 different purposes.

todo: known_capitalizations.tsv can be refactored possibly. It really only needs 1 column, the case to replace. The
pattern column is not used, and the first column (lowercase) can be computed by using .lower() on the case to
replace. We could also leave as-is since this file is shared elsewhere in the project infrastructure, though I do
not know its source-of-truth location.
"""
path = DATA_DIR / 'known_capitalizations.tsv'
with open(path, "r") as file:
data_io = csv.reader(file, delimiter="\t")
Expand Down Expand Up @@ -154,13 +160,12 @@ def transform_entry(entry) -> Graph:
return graph


# todo: probably best to combine explicit abbrevs outside of this func
# noinspection RegExpSimplifiable eventually_should_address
def _detect_abbreviations(label: str, explicit_abbrev: str = None, capitalization_threshold=0.75) -> List[str]:
def detect_abbreviations(label: str, capitalization_threshold=0.75) -> List[str]:
"""Detect possible abbreviations / acronyms"""
# Compile regexp
# todo: handle several warnings: {1} redundant, {1,} simplified to +
acronyms_without_periods_compiler = re.compile('[A-Z]{1}[A-Z0-9]{1,}')
# todo: PyCharm flagged as invalid escape sequence, but this code seems to work? Should double check
# todo: PyCharm flagged next 2 lines as invalid escape sequence, but this code seems to work? Should double check
acronyms_with_periods_compiler = re.compile('[A-Z]{1}\.([A-Z0-9]\.){1,}')
title_cased_abbrev_compiler = re.compile('[A-Z]{1}[a-zA-Z]{1,}\.')

Expand All @@ -174,29 +179,21 @@ def _detect_abbreviations(label: str, explicit_abbrev: str = None, capitalizatio
is_largely_uppercase = \
fully_capitalized_count / len(words) >= capitalization_threshold

# Detect acronyms without periods
# Detect cases
if is_largely_uppercase:
acronyms_without_periods = [] # can't infer because everything was uppercase
else:
acronyms_without_periods = acronyms_without_periods_compiler.findall(label)
# Detect more
title_cased_abbrevs = title_cased_abbrev_compiler.findall(label)
acronyms_with_periods = acronyms_with_periods_compiler.findall(label)
# Combine list of things to re-format
replacements = []
candidates: List[List[str]] = [
acronyms_with_periods, acronyms_without_periods, title_cased_abbrevs, [explicit_abbrev]]
for item_list in candidates:
for item in item_list:
if item:
replacements.append(item)

return replacements
acronyms_without_periods: List[str] = acronyms_without_periods_compiler.findall(label)
title_cased_abbrevs: List[str] = title_cased_abbrev_compiler.findall(label)
acronyms_with_periods: List[str] = acronyms_with_periods_compiler.findall(label)

return acronyms_with_periods + acronyms_without_periods + title_cased_abbrevs


# todo: rename? It's doing more than cleaning; it's mutating
def cleanup_title(
title: str,
replacement_case_method: str = 'lower', # 'upper', 'title', 'lower', 'capitalize' (=sentence case)
conjunctions: List[str] = ['and', 'but', 'yet', 'for', 'nor', 'so'],
little_preps: List[str] = ['at', 'by', 'in', 'of', 'on', 'to', 'up', 'as', 'it', 'or'],
articles: List[str] = ['a', 'an', 'the'],
Expand All @@ -206,9 +203,10 @@ def cleanup_title(

:param title: A preferred, alternative, or included title.

1. Removes the abbreviation suffixes
2. Converts roman numerals to arabic
3. Makes the text Title Case, except for supplied conjunctions/prepositions/articles
1. Converts roman numerals to arabic
2. Makes the text adhere to the case of `replacement_case_method`, except for supplied
conjunctions, prepositions, and articles, which will always be lowercased. NOTE: The default for this is 'lower',
meaning that this operation by default does nothing.

Assumptions:
1. All acronyms are capitalized
Expand Down Expand Up @@ -242,9 +240,6 @@ def cleanup_title(
e.g.: Balint syndrome, Barre-Lieou syndrome, Wallerian degeneration, etc.
How to do this? Simply get/create a list of known eponyms? Is this feasible?
"""
# Simple method: Lower/title case everything but acronyms
# label_newcase = getattr(label2, replacement_case_method)()
# Advanced method: iteritavely format words
fixedwords = []
i = 0
for wrd in title.split():
Expand All @@ -263,8 +258,7 @@ def cleanup_title(
suffix = wrd.replace(toRoman(num), '', 1)
fixed = ''.join((str(num), suffix))
wrd = fixed
# todo: next few lines don't make sense. why lower 'wrd', and then conditionally lowercase it again?
wrd = wrd.lower()
wrd = getattr(wrd, replacement_case_method)()
# replace interior conjunctions, prepositions, and articles with lowercase, always
if wrd in (conjunctions + little_preps + articles) and i != 1:
wrd = wrd.lower()
Expand All @@ -276,18 +270,49 @@ def cleanup_title(
return label_newcase


# todo: explicit_abbrev: Change to List[str]. See: https://github.com/monarch-initiative/omim/issues/129
def capitalize_acronyms_in_title(title: str, explicit_abbrev=None, capitalization_threshold=0.75) -> str:
"""Re-capitalize acronyms / words based on information contained w/in original label"""
# todo: probably best to combine explicit abbrevs outside of this func
possible_abbreviations = _detect_abbreviations(
title, explicit_abbrev, capitalization_threshold=capitalization_threshold)
title2 = title
for abbrev in possible_abbreviations:
title2 = title2.replace(abbrev.upper(), abbrev)
def recapitalize_acronyms_in_title(title: str, known_abbrevs: Set[str] = None, capitalization_threshold=0.75) -> str:
"""Re-capitalize acronyms / words based on information contained w/in original label

todo: If title has been passed through cleanup_title() using a replacement_case_method other than the default 'lower',
then the .replace() operation will not work. To solve this, either (a) capture the replacement_case_method used and
pass that here, (b) duplicate the .replace() line and call it on alternative casing variations (.title() and
.capitalize() (=sentence case)), or (c) possibly just compare to word.lower() instead of word.
todo: (more important): It's probable that .split(' ') is not enough to cover all cases. Should also run the check
by splitting on other characters. E.g. consider the following potential cases: "TITLE (ACRONYM)",
"TITLE: ACRONYM1&ACRONYM2", "TITLE/ACRONYM" or "TITLE ACRONYM/ACRONYM", "TITLE {ACRONYM1,ACRONYM2}",
"TITLE[ACRONYM]", "TITLE-ACRONYM", or less likely cases such as "TITLE_ACRONYM", "TITLE.ACRONYM". There are quite
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why do you think these other potential cases exist in the data? Have you seen these in your analysis of the OMIM files?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It might be a time consuming analysis to investigate all current patterns of titles and acronyms with special characters. But we have seen several different variations.

Even if we did such an analysis, it does not necessarily future-proof the parser.

I think that many of these usages of special characters are not actually a rigorous syntax they've implemented, but a kind of lax syntax, or maybe just a bundle of ad hoc cases.

a few different combos of special char usage that could theoretically arise. It might be possible for this function to
utilize the regular expressions in detect_abbreviations(), and substitute in the acronym in the place of the [A-Z]
part. It is also possible to improve detect_abbreviations() by considering some of these other possible example
cases above.
"""
inferred_abbrevs: Set[str] = set(detect_abbreviations(title, capitalization_threshold))
abbrevs: Set[str] = known_abbrevs.union(inferred_abbrevs)
if not abbrevs:
return title
title2_words: List[str] = []
for word in title.split():
abbrev_match = False
for abbrev in abbrevs:
if abbrev.lower() == word:
title2_words.append(abbrev)
abbrev_match = True
break
if not abbrev_match:
title2_words.append(word)
title2 = ' '.join(title2_words)
return title2


def recapitalize_acronyms_in_titles(
    titles: Union[str, List[str]], known_abbrevs: Set[str] = None, capitalization_threshold=0.75
) -> Union[str, List[str]]:
    """Re-capitalize acronyms in a single title or in a list of titles.

    Thin dispatcher around recapitalize_acronyms_in_title(): a `str` input is processed directly and a `str` is
    returned; any other input is iterated and a list with one processed title per element is returned.

    :param titles: One title, or a list of titles.
    :param known_abbrevs: Abbreviations to restore in the title(s). `None` (the default) is treated as "no known
      abbreviations", i.e. an empty set.
    :param capitalization_threshold: Passed through unchanged to recapitalize_acronyms_in_title().
    :return: A `str` if `titles` was a `str`, otherwise a list of `str` in the same order as the input.
    """
    # The per-title function calls known_abbrevs.union(...), which raises AttributeError on None. Normalize the
    # default here so that calling this wrapper without abbreviations works.
    abbrevs: Set[str] = set() if known_abbrevs is None else known_abbrevs
    if isinstance(titles, str):
        return recapitalize_acronyms_in_title(titles, abbrevs, capitalization_threshold)
    return [recapitalize_acronyms_in_title(title, abbrevs, capitalization_threshold) for title in titles]


def remove_included_and_formerly_suffixes(title: str) -> str:
"""Remove ', INCLUDED' and ', FORMERLY' suffixes from a title"""
for suffix in ['FORMERLY', 'INCLUDED']:
Expand Down