Skip to content

Commit

Permalink
Merge pull request #507 from OmarAI2003/OptionalStatmentsOrdering
Browse files Browse the repository at this point in the history
Add optional QID order validation and refactor label decomposition
  • Loading branch information
andrewtavis authored Nov 14, 2024
2 parents a0e0692 + 38361b8 commit 9773596
Show file tree
Hide file tree
Showing 51 changed files with 488 additions and 418 deletions.
108 changes: 89 additions & 19 deletions src/scribe_data/check/check_query_forms.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,9 @@
if "label" in sub_value
)

# Map each form label to its QID by pairing the two sequences positionally
# (zip stops at the shorter one; a repeated label keeps its last paired QID).
qid_label_dict = dict(zip(lexeme_form_labels_order, lexeme_form_qid_order))


# MARK: Extract Forms


Expand Down Expand Up @@ -107,6 +110,44 @@ def extract_form_rep_label(form_text: str):
return label_match[1].strip()


# MARK: Decompose Label


def decompose_label_features(label):
    """
    Decompose a concatenated grammatical label into its individual features.

    The label is first cut into pieces that each start at a letter and run up
    to the next uppercase letter. Pieces are then merged back together so that
    a feature from ``lexeme_form_labels_order`` that spans several pieces is
    returned as a single component.

    Parameters
    ----------
    label : str
        The concatenated label string composed of several grammatical features.

    Returns
    -------
    list
        The grammatical features extracted from the label in their original
        order. Any trailing pieces that never matched a known label are
        returned as one final component.
    """
    components = re.findall(r"[A-Za-z][^A-Z]*", label)

    # Build the lookup sets once rather than re-lowercasing and linearly
    # scanning the whole label list on every loop iteration.
    known_labels = set(lexeme_form_labels_order)
    known_labels_lower = {known.lower() for known in lexeme_form_labels_order}

    valid_components = []
    temp_component = ""
    last_index = len(components) - 1

    for index, component in enumerate(components):
        temp_component += component.capitalize()

        # The final piece has no successor to look ahead to; whatever has been
        # accumulated is flushed after the loop.
        if index == last_index:
            continue

        # Close the current feature only when it is a known label (ignoring
        # case) and appending the next piece would not form a known label.
        if (
            temp_component.lower() in known_labels_lower
            and temp_component + components[index + 1] not in known_labels
        ):
            valid_components.append(temp_component)
            temp_component = ""

    if temp_component:
        valid_components.append(temp_component)

    return valid_components


# MARK: Extract QIDs


Expand Down Expand Up @@ -399,25 +440,7 @@ def check_forms_order(query_text):
# Split each column label into components.
split_vars = []
for col in set(select_vars) - set(labeling_service_cols):
components = re.findall(r"[A-Za-z][^A-Z]*", col)
valid_components = []
temp_component = ""

for index, component in enumerate(components):
temp_component += component.capitalize()

# Append valid components in lexeme_form_labels_order.
if index + 1 != len(components) and (
temp_component.lower() in map(str.lower, lexeme_form_labels_order)
and temp_component + components[index + 1]
not in lexeme_form_labels_order
):
valid_components.append(temp_component)
temp_component = ""

if temp_component:
valid_components.append(temp_component)

valid_components = decompose_label_features(col)
split_vars.append(valid_components)

# Create a map for fast component position lookup.
Expand Down Expand Up @@ -463,6 +486,48 @@ def compare_key(components):
return sorted_lower == select_lower


# MARK: Optional Validation


def check_optional_qid_order(query_file: str) -> str:
    """
    Check that the QIDs of each OPTIONAL form in a SPARQL query follow the
    order implied by the features in the form's label.

    Only forms containing both ``ontolex:lexicalForm`` and
    ``ontolex:representation`` are inspected. A form is skipped without a
    message when its label is missing, when a label component has no entry in
    ``qid_label_dict``, or when the number of QIDs in the query differs from
    the number of label components.

    Parameters
    ----------
    query_file : str
        The path to the SPARQL query file to be checked.

    Returns
    -------
    str
        A formatted string with details on any order mismatches in the QIDs,
        or an empty string if all QIDs are correctly ordered.
    """
    # Past participle and imperfective QIDs are kept wherever the query puts
    # them since there are duplicate QIDs for those features.
    position_exempt_qids = ("Q12717679", "Q1230649", "Q2898727", "Q54556033")

    error_messages = []
    for form_text in extract_forms_from_sparql(query_file):
        if not (
            "ontolex:lexicalForm" in form_text
            and "ontolex:representation" in form_text
        ):
            continue

        # Extract the actual QIDs and label for the current form.
        actual_qids = extract_form_qids(form_text=form_text)
        form_label = extract_form_rep_label(form_text)

        # NOTE(review): extract_form_rep_label presumably yields nothing when
        # its pattern does not match - confirm. Without a label there is no
        # expected order to compare against.
        if not form_label:
            continue

        label_components = decompose_label_features(form_label)

        # A component without a known QID cannot be ordered. Skip the form
        # rather than crash the whole run with a KeyError (presumably the
        # label checks elsewhere report unknown labels - TODO confirm).
        if any(key not in qid_label_dict for key in label_components):
            continue

        expected_qids = [qid_label_dict[key] for key in label_components]

        # Compare lengths before substituting: with unequal lengths the
        # position taken from actual_qids may not exist in expected_qids.
        if len(actual_qids) != len(expected_qids):
            continue

        for qid in position_exempt_qids:
            if qid in actual_qids and qid not in expected_qids:
                expected_qids[actual_qids.index(qid)] = qid

        # Check if the actual QIDs match the expected order.
        if actual_qids != expected_qids:
            formatted_qids = ", ".join(f"wd:{qid}" for qid in expected_qids) + " ."
            error_messages.append(
                f"\nThe QIDs in optional statement for {form_label} should be ordered:\n{formatted_qids}"
            )

    return "\n".join(error_messages)


# MARK: Main Validation


Expand Down Expand Up @@ -495,6 +560,11 @@ def check_query_forms() -> None:
error_output += f"\n{index}. {query_file_str}:\n - {forms_order_and_definition_check}\n"
index += 1

# Check that all variables in the OPTIONAL clauses have their QIDs in the correct order.
if labels_qids_order_check := check_optional_qid_order(query_file_str):
error_output += f"\n{index}. {query_file_str}:\n{labels_qids_order_check}\n"
index += 1

if extract_forms_from_sparql(query_file):
query_form_check_dict = {}
for form_text in extract_forms_from_sparql(query_file):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -46,81 +46,77 @@ WHERE {
OPTIONAL {
?lexeme ontolex:lexicalForm ?nominativeFeminineIndefiniteSingularForm .
?nominativeFeminineIndefiniteSingularForm ontolex:representation ?nominativeFeminineIndefiniteSingular ;
wikibase:grammaticalFeature wd:Q1775415, wd:Q110786, wd:Q131105, wd:Q53997857 .
wikibase:grammaticalFeature wd:Q131105, wd:Q1775415, wd:Q53997857, wd:Q110786 .
}

OPTIONAL {
?lexeme ontolex:lexicalForm ?nominativeFeminineIndefinitePluralForm .
?nominativeFeminineIndefinitePluralForm ontolex:representation ?nominativeFeminineIndefinitePlural ;
wikibase:grammaticalFeature wd:Q1775415, wd:Q146786, wd:Q131105, wd:Q53997857 .
wikibase:grammaticalFeature wd:Q131105, wd:Q1775415, wd:Q53997857, wd:Q146786 .
}

OPTIONAL {
?lexeme ontolex:lexicalForm ?nominativeFeminineIndefiniteDualForm .
?nominativeFeminineIndefiniteDualForm ontolex:representation ?nominativeFeminineIndefiniteDual ;
wikibase:grammaticalFeature wd:Q1775415, wd:Q110022, wd:Q131105, wd:Q53997857 .
wikibase:grammaticalFeature wd:Q131105, wd:Q1775415, wd:Q53997857, wd:Q110022 .
}

# Masculine

OPTIONAL {
?lexeme ontolex:lexicalForm ?nominativeMasculineIndefiniteSingularForm .
?nominativeMasculineIndefiniteSingularForm ontolex:representation ?nominativeMasculineIndefiniteSingular ;
wikibase:grammaticalFeature wd:Q499327, wd:Q110786, wd:Q131105, wd:Q53997857 .
wikibase:grammaticalFeature wd:Q131105, wd:Q499327, wd:Q53997857, wd:Q110786 .
}

OPTIONAL {
?lexeme ontolex:lexicalForm ?nominativeMasculineIndefinitePluralForm .
?nominativeMasculineIndefinitePluralForm ontolex:representation ?nominativeMasculineIndefinitePlural ;
wikibase:grammaticalFeature wd:Q499327, wd:Q146786, wd:Q131105, wd:Q53997857 .
wikibase:grammaticalFeature wd:Q131105, wd:Q499327, wd:Q53997857, wd:Q146786 .
}

OPTIONAL {
?lexeme ontolex:lexicalForm ?nominativeMasculineIndefiniteDualForm .
?nominativeMasculineIndefiniteDualForm ontolex:representation ?nominativeMasculineIndefiniteDual ;
wikibase:grammaticalFeature wd:Q499327, wd:Q110022, wd:Q131105, wd:Q53997857 .
wikibase:grammaticalFeature wd:Q131105, wd:Q499327, wd:Q53997857, wd:Q110022 .
}

# MARK: Genitive

# Feminine

OPTIONAL {
?lexeme ontolex:lexicalForm ?genitiveFeminineIndefiniteSingularForm .
?genitiveFeminineIndefiniteSingularForm ontolex:representation ?genitiveFeminineIndefiniteSingular ;
wikibase:grammaticalFeature wd:Q1775415, wd:Q110786, wd:Q146233, wd:Q53997857 .
wikibase:grammaticalFeature wd:Q146233, wd:Q1775415, wd:Q53997857, wd:Q110786 .
}

OPTIONAL {
?lexeme ontolex:lexicalForm ?genitiveFeminineIndefinitePluralForm .
?genitiveFeminineIndefinitePluralForm ontolex:representation ?genitiveFeminineIndefinitePlural ;
wikibase:grammaticalFeature wd:Q1775415, wd:Q146786, wd:Q146233, wd:Q53997857 .
wikibase:grammaticalFeature wd:Q146233, wd:Q1775415, wd:Q53997857, wd:Q146786 .
}

OPTIONAL {
?lexeme ontolex:lexicalForm ?genitiveFeminineIndefiniteDualForm .
?genitiveFeminineIndefiniteDualForm ontolex:representation ?genitiveFeminineIndefiniteDual ;
wikibase:grammaticalFeature wd:Q1775415, wd:Q110022, wd:Q146233, wd:Q53997857 .
wikibase:grammaticalFeature wd:Q146233, wd:Q1775415, wd:Q53997857, wd:Q110022 .
}

# Masculine

OPTIONAL {
?lexeme ontolex:lexicalForm ?genitiveMasculineIndefiniteSingularForm .
?genitiveMasculineIndefiniteSingularForm ontolex:representation ?genitiveMasculineIndefiniteSingular ;
wikibase:grammaticalFeature wd:Q499327, wd:Q110786, wd:Q146233, wd:Q53997857 .
wikibase:grammaticalFeature wd:Q146233, wd:Q499327, wd:Q53997857, wd:Q110786 .
}

OPTIONAL {
?lexeme ontolex:lexicalForm ?genitiveMasculineIndefinitePluralForm .
?genitiveMasculineIndefinitePluralForm ontolex:representation ?genitiveMasculineIndefinitePlural ;
wikibase:grammaticalFeature wd:Q499327, wd:Q146786, wd:Q146233, wd:Q53997857 .
wikibase:grammaticalFeature wd:Q146233, wd:Q499327, wd:Q53997857, wd:Q146786 .
}

OPTIONAL {
?lexeme ontolex:lexicalForm ?genitiveMasculineIndefiniteDualForm .
?genitiveMasculineIndefiniteDualForm ontolex:representation ?genitiveMasculineIndefiniteDual ;
wikibase:grammaticalFeature wd:Q499327, wd:Q110022, wd:Q146233, wd:Q53997857 .
wikibase:grammaticalFeature wd:Q146233, wd:Q499327, wd:Q53997857, wd:Q110022 .
}

# MARK: Accusative
Expand All @@ -130,39 +126,39 @@ WHERE {
OPTIONAL {
?lexeme ontolex:lexicalForm ?accusativeFeminineIndefiniteSingularForm .
?accusativeFeminineIndefiniteSingularForm ontolex:representation ?accusativeFeminineIndefiniteSingular ;
wikibase:grammaticalFeature wd:Q1775415, wd:Q110786, wd:Q146078, wd:Q53997857 .
wikibase:grammaticalFeature wd:Q146078, wd:Q1775415, wd:Q53997857, wd:Q110786 .
}

OPTIONAL {
?lexeme ontolex:lexicalForm ?accusativeFeminineIndefinitePluralForm .
?accusativeFeminineIndefinitePluralForm ontolex:representation ?accusativeFeminineIndefinitePlural ;
wikibase:grammaticalFeature wd:Q1775415, wd:Q146786, wd:Q146078, wd:Q53997857 .
wikibase:grammaticalFeature wd:Q146078, wd:Q1775415, wd:Q53997857, wd:Q146786 .
}

OPTIONAL {
?lexeme ontolex:lexicalForm ?accusativeFeminineIndefiniteDualForm .
?accusativeFeminineIndefiniteDualForm ontolex:representation ?accusativeFeminineIndefiniteDual ;
wikibase:grammaticalFeature wd:Q1775415, wd:Q110022, wd:Q146078, wd:Q53997857 .
wikibase:grammaticalFeature wd:Q146078, wd:Q1775415, wd:Q53997857, wd:Q110022 .
}

# Masculine

OPTIONAL {
?lexeme ontolex:lexicalForm ?accusativeMasculineIndefiniteSingularForm .
?accusativeMasculineIndefiniteSingularForm ontolex:representation ?accusativeMasculineIndefiniteSingular ;
wikibase:grammaticalFeature wd:Q499327, wd:Q110786, wd:Q146078, wd:Q53997857 .
wikibase:grammaticalFeature wd:Q146078, wd:Q499327, wd:Q53997857, wd:Q110786 .
}

OPTIONAL {
?lexeme ontolex:lexicalForm ?accusativeMasculineIndefinitePluralForm .
?accusativeMasculineIndefinitePluralForm ontolex:representation ?accusativeMasculineIndefinitePlural ;
wikibase:grammaticalFeature wd:Q499327, wd:Q146786, wd:Q146078, wd:Q53997857 .
wikibase:grammaticalFeature wd:Q146078, wd:Q499327, wd:Q53997857, wd:Q146786 .
}

OPTIONAL {
?lexeme ontolex:lexicalForm ?accusativeMasculineIndefiniteDualForm .
?accusativeMasculineIndefiniteDualForm ontolex:representation ?accusativeMasculineIndefiniteDual ;
wikibase:grammaticalFeature wd:Q499327, wd:Q110022, wd:Q146078, wd:Q53997857 .
wikibase:grammaticalFeature wd:Q146078, wd:Q499327, wd:Q53997857, wd:Q110022 .
}

# MARK: Pausal
Expand All @@ -172,38 +168,40 @@ WHERE {
OPTIONAL {
?lexeme ontolex:lexicalForm ?pausalFeminineIndefiniteSingularForm .
?pausalFeminineIndefiniteSingularForm ontolex:representation ?pausalFeminineIndefiniteSingular ;
wikibase:grammaticalFeature wd:Q1775415, wd:Q110786, wd:Q117262361, wd:Q53997857 .
wikibase:grammaticalFeature wd:Q117262361, wd:Q1775415, wd:Q53997857, wd:Q110786 .
}

OPTIONAL {
?lexeme ontolex:lexicalForm ?pausalFeminineIndefinitePluralForm .
?pausalFeminineIndefinitePluralForm ontolex:representation ?pausalFeminineIndefinitePlural ;
wikibase:grammaticalFeature wd:Q1775415, wd:Q146786, wd:Q117262361, wd:Q53997857 .
wikibase:grammaticalFeature wd:Q117262361, wd:Q1775415, wd:Q53997857, wd:Q146786 .
}


OPTIONAL {
?lexeme ontolex:lexicalForm ?pausalFeminineIndefiniteDualForm .
?pausalFeminineIndefiniteDualForm ontolex:representation ?pausalFeminineIndefiniteDual ;
wikibase:grammaticalFeature wd:Q1775415, wd:Q110022, wd:Q117262361, wd:Q53997857 .
wikibase:grammaticalFeature wd:Q117262361, wd:Q1775415, wd:Q53997857, wd:Q110022 .
}


# Masculine

OPTIONAL {
?lexeme ontolex:lexicalForm ?pausalMasculineIndefiniteSingularForm .
?pausalMasculineIndefiniteSingularForm ontolex:representation ?pausalMasculineIndefiniteSingular ;
wikibase:grammaticalFeature wd:Q499327, wd:Q110786, wd:Q117262361, wd:Q53997857 .
wikibase:grammaticalFeature wd:Q117262361, wd:Q499327, wd:Q53997857, wd:Q110786 .
}

OPTIONAL {
?lexeme ontolex:lexicalForm ?pausalMasculineIndefinitePluralForm .
?pausalMasculineIndefinitePluralForm ontolex:representation ?pausalMasculineIndefinitePlural ;
wikibase:grammaticalFeature wd:Q499327, wd:Q146786, wd:Q117262361, wd:Q53997857 .
wikibase:grammaticalFeature wd:Q117262361, wd:Q499327, wd:Q53997857, wd:Q146786 .
}

OPTIONAL {
?lexeme ontolex:lexicalForm ?pausalMasculineIndefiniteDualForm .
?pausalMasculineIndefiniteDualForm ontolex:representation ?pausalMasculineIndefiniteDual ;
wikibase:grammaticalFeature wd:Q499327, wd:Q110022, wd:Q117262361, wd:Q53997857 .
wikibase:grammaticalFeature wd:Q117262361, wd:Q499327, wd:Q53997857, wd:Q110022 .
}
}
Loading

0 comments on commit 9773596

Please sign in to comment.