Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add optional QID order validation and refactor label decomposition #507

Merged
merged 2 commits into from
Nov 14, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
108 changes: 89 additions & 19 deletions src/scribe_data/check/check_query_forms.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,9 @@
if "label" in sub_value
)

# Lookup from each form label to its QID. The two sequences are paired by
# position, so entry N of lexeme_form_labels_order maps to entry N of
# lexeme_form_qid_order (zip stops at the shorter of the two).
qid_label_dict = dict(zip(lexeme_form_labels_order, lexeme_form_qid_order))


# MARK: Extract Forms


Expand Down Expand Up @@ -107,6 +110,44 @@ def extract_form_rep_label(form_text: str):
return label_match[1].strip()


# MARK: Decompose Label


def decompose_label_features(label: str) -> list:
    """
    Decomposes a concatenated grammatical label into a list of individual features.

    The label is first cut at every ASCII uppercase letter. The resulting pieces are
    then merged left to right: a merged run is emitted as a feature once it matches
    a known label (case-insensitively) and appending the next piece would not give
    another known label. Whatever is still accumulated after the last piece is
    emitted as-is, even if it is not a known label.

    Parameters
    ----------
        label : str
            The concatenated label string composed of several grammatical features.

    Returns
    -------
        list
            A list of grammatical features extracted from the label in their original order.
    """
    # Both lookups are loop-invariant, so build them once instead of re-scanning
    # lexeme_form_labels_order on every iteration.
    known_labels = set(lexeme_form_labels_order)
    known_labels_lower = set(map(str.lower, lexeme_form_labels_order))

    components = re.findall(r"[A-Za-z][^A-Z]*", label)
    last_index = len(components) - 1
    valid_components = []
    temp_component = ""

    for index, component in enumerate(components):
        temp_component += component.capitalize()

        # Emit the accumulated run when it is a known label, unless it is the final
        # piece (handled after the loop) or it is only the prefix of a longer known
        # label that the next piece would complete.
        if (
            index != last_index
            and temp_component.lower() in known_labels_lower
            and temp_component + components[index + 1] not in known_labels
        ):
            valid_components.append(temp_component)
            temp_component = ""

    # The trailing run is always kept so that no part of the label is dropped.
    if temp_component:
        valid_components.append(temp_component)

    return valid_components


# MARK: Extract QIDs


Expand Down Expand Up @@ -399,25 +440,7 @@ def check_forms_order(query_text):
# Split each column label into components.
split_vars = []
for col in set(select_vars) - set(labeling_service_cols):
components = re.findall(r"[A-Za-z][^A-Z]*", col)
valid_components = []
temp_component = ""

for index, component in enumerate(components):
temp_component += component.capitalize()

# Append valid components in lexeme_form_labels_order.
if index + 1 != len(components) and (
temp_component.lower() in map(str.lower, lexeme_form_labels_order)
and temp_component + components[index + 1]
not in lexeme_form_labels_order
):
valid_components.append(temp_component)
temp_component = ""

if temp_component:
valid_components.append(temp_component)

valid_components = decompose_label_features(col)
split_vars.append(valid_components)

# Create a map for fast component position lookup.
Expand Down Expand Up @@ -463,6 +486,48 @@ def compare_key(components):
return sorted_lower == select_lower


# MARK: Optional Validation


def check_optional_qid_order(query_file: str) -> str:
    """
    Checks the order of QIDs in optional statements within a SPARQL query file to ensure they
    align with the expected sequence based on label features.

    Forms are only compared when every feature of their label maps to a QID and the
    number of QIDs in the query equals the number of label features; other forms are
    skipped by this check.

    Parameters
    ----------
        query_file : str
            The path to the SPARQL query file to be checked.

    Returns
    -------
        str
            A formatted string with details on any order mismatches in the QIDs, or an empty
            string if all QIDs are correctly ordered.
    """
    # PastParticiple and imperfective features have more than one QID, so when the
    # query uses one of these QIDs it is accepted at the position it occupies.
    position_exempt_qids = ("Q12717679", "Q1230649", "Q2898727", "Q54556033")

    error_messages = []
    for form_text in extract_forms_from_sparql(query_file):
        if (
            "ontolex:lexicalForm" not in form_text
            or "ontolex:representation" not in form_text
        ):
            continue

        # Extract the actual QIDs and label for the current form.
        actual_qids = extract_form_qids(form_text=form_text)
        form_label = extract_form_rep_label(form_text)
        label_components = decompose_label_features(form_label)

        # A label piece without a known QID has no expected position; skip the form
        # rather than abort the whole run with a KeyError.
        if any(component not in qid_label_dict for component in label_components):
            continue

        expected_qids = [qid_label_dict[component] for component in label_components]

        # Only equal-length lists are comparable. Checking this before the
        # substitution below also keeps actual_qids.index() within the bounds of
        # expected_qids.
        if len(actual_qids) != len(expected_qids):
            continue

        for qid in position_exempt_qids:
            if qid in actual_qids and qid not in expected_qids:
                expected_qids[actual_qids.index(qid)] = qid

        # Check if the actual QIDs match the expected order.
        if actual_qids != expected_qids:
            formatted_qids = ", ".join(f"wd:{qid}" for qid in expected_qids) + " ."
            error_messages.append(
                f"\nThe QIDs in optional statement for {form_label} should be ordered:\n{formatted_qids}"
            )

    return "\n".join(error_messages)


# MARK: Main Validation


Expand Down Expand Up @@ -495,6 +560,11 @@ def check_query_forms() -> None:
error_output += f"\n{index}. {query_file_str}:\n - {forms_order_and_definition_check}\n"
index += 1

# Check that all variables in the OPTIONAL clauses have their QIDs in the correct order.
if labels_qids_order_check := check_optional_qid_order(query_file_str):
error_output += f"\n{index}. {query_file_str}:\n{labels_qids_order_check}\n"
index += 1

if extract_forms_from_sparql(query_file):
query_form_check_dict = {}
for form_text in extract_forms_from_sparql(query_file):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -46,81 +46,77 @@ WHERE {
OPTIONAL {
?lexeme ontolex:lexicalForm ?nominativeFeminineIndefiniteSingularForm .
?nominativeFeminineIndefiniteSingularForm ontolex:representation ?nominativeFeminineIndefiniteSingular ;
wikibase:grammaticalFeature wd:Q1775415, wd:Q110786, wd:Q131105, wd:Q53997857 .
wikibase:grammaticalFeature wd:Q131105, wd:Q1775415, wd:Q53997857, wd:Q110786 .
}

OPTIONAL {
?lexeme ontolex:lexicalForm ?nominativeFeminineIndefinitePluralForm .
?nominativeFeminineIndefinitePluralForm ontolex:representation ?nominativeFeminineIndefinitePlural ;
wikibase:grammaticalFeature wd:Q1775415, wd:Q146786, wd:Q131105, wd:Q53997857 .
wikibase:grammaticalFeature wd:Q131105, wd:Q1775415, wd:Q53997857, wd:Q146786 .
}

OPTIONAL {
?lexeme ontolex:lexicalForm ?nominativeFeminineIndefiniteDualForm .
?nominativeFeminineIndefiniteDualForm ontolex:representation ?nominativeFeminineIndefiniteDual ;
wikibase:grammaticalFeature wd:Q1775415, wd:Q110022, wd:Q131105, wd:Q53997857 .
wikibase:grammaticalFeature wd:Q131105, wd:Q1775415, wd:Q53997857, wd:Q110022 .
}

# Masculine

OPTIONAL {
?lexeme ontolex:lexicalForm ?nominativeMasculineIndefiniteSingularForm .
?nominativeMasculineIndefiniteSingularForm ontolex:representation ?nominativeMasculineIndefiniteSingular ;
wikibase:grammaticalFeature wd:Q499327, wd:Q110786, wd:Q131105, wd:Q53997857 .
wikibase:grammaticalFeature wd:Q131105, wd:Q499327, wd:Q53997857, wd:Q110786 .
}

OPTIONAL {
?lexeme ontolex:lexicalForm ?nominativeMasculineIndefinitePluralForm .
?nominativeMasculineIndefinitePluralForm ontolex:representation ?nominativeMasculineIndefinitePlural ;
wikibase:grammaticalFeature wd:Q499327, wd:Q146786, wd:Q131105, wd:Q53997857 .
wikibase:grammaticalFeature wd:Q131105, wd:Q499327, wd:Q53997857, wd:Q146786 .
}

OPTIONAL {
?lexeme ontolex:lexicalForm ?nominativeMasculineIndefiniteDualForm .
?nominativeMasculineIndefiniteDualForm ontolex:representation ?nominativeMasculineIndefiniteDual ;
wikibase:grammaticalFeature wd:Q499327, wd:Q110022, wd:Q131105, wd:Q53997857 .
wikibase:grammaticalFeature wd:Q131105, wd:Q499327, wd:Q53997857, wd:Q110022 .
}

# MARK: Genitive

# Feminine

OPTIONAL {
?lexeme ontolex:lexicalForm ?genitiveFeminineIndefiniteSingularForm .
?genitiveFeminineIndefiniteSingularForm ontolex:representation ?genitiveFeminineIndefiniteSingular ;
wikibase:grammaticalFeature wd:Q1775415, wd:Q110786, wd:Q146233, wd:Q53997857 .
wikibase:grammaticalFeature wd:Q146233, wd:Q1775415, wd:Q53997857, wd:Q110786 .
}

OPTIONAL {
?lexeme ontolex:lexicalForm ?genitiveFeminineIndefinitePluralForm .
?genitiveFeminineIndefinitePluralForm ontolex:representation ?genitiveFeminineIndefinitePlural ;
wikibase:grammaticalFeature wd:Q1775415, wd:Q146786, wd:Q146233, wd:Q53997857 .
wikibase:grammaticalFeature wd:Q146233, wd:Q1775415, wd:Q53997857, wd:Q146786 .
}

OPTIONAL {
?lexeme ontolex:lexicalForm ?genitiveFeminineIndefiniteDualForm .
?genitiveFeminineIndefiniteDualForm ontolex:representation ?genitiveFeminineIndefiniteDual ;
wikibase:grammaticalFeature wd:Q1775415, wd:Q110022, wd:Q146233, wd:Q53997857 .
wikibase:grammaticalFeature wd:Q146233, wd:Q1775415, wd:Q53997857, wd:Q110022 .
}

# Masculine

OPTIONAL {
?lexeme ontolex:lexicalForm ?genitiveMasculineIndefiniteSingularForm .
?genitiveMasculineIndefiniteSingularForm ontolex:representation ?genitiveMasculineIndefiniteSingular ;
wikibase:grammaticalFeature wd:Q499327, wd:Q110786, wd:Q146233, wd:Q53997857 .
wikibase:grammaticalFeature wd:Q146233, wd:Q499327, wd:Q53997857, wd:Q110786 .
}

OPTIONAL {
?lexeme ontolex:lexicalForm ?genitiveMasculineIndefinitePluralForm .
?genitiveMasculineIndefinitePluralForm ontolex:representation ?genitiveMasculineIndefinitePlural ;
wikibase:grammaticalFeature wd:Q499327, wd:Q146786, wd:Q146233, wd:Q53997857 .
wikibase:grammaticalFeature wd:Q146233, wd:Q499327, wd:Q53997857, wd:Q146786 .
}

OPTIONAL {
?lexeme ontolex:lexicalForm ?genitiveMasculineIndefiniteDualForm .
?genitiveMasculineIndefiniteDualForm ontolex:representation ?genitiveMasculineIndefiniteDual ;
wikibase:grammaticalFeature wd:Q499327, wd:Q110022, wd:Q146233, wd:Q53997857 .
wikibase:grammaticalFeature wd:Q146233, wd:Q499327, wd:Q53997857, wd:Q110022 .
}

# MARK: Accusative
Expand All @@ -130,39 +126,39 @@ WHERE {
OPTIONAL {
?lexeme ontolex:lexicalForm ?accusativeFeminineIndefiniteSingularForm .
?accusativeFeminineIndefiniteSingularForm ontolex:representation ?accusativeFeminineIndefiniteSingular ;
wikibase:grammaticalFeature wd:Q1775415, wd:Q110786, wd:Q146078, wd:Q53997857 .
wikibase:grammaticalFeature wd:Q146078, wd:Q1775415, wd:Q53997857, wd:Q110786 .
}

OPTIONAL {
?lexeme ontolex:lexicalForm ?accusativeFeminineIndefinitePluralForm .
?accusativeFeminineIndefinitePluralForm ontolex:representation ?accusativeFeminineIndefinitePlural ;
wikibase:grammaticalFeature wd:Q1775415, wd:Q146786, wd:Q146078, wd:Q53997857 .
wikibase:grammaticalFeature wd:Q146078, wd:Q1775415, wd:Q53997857, wd:Q146786 .
}

OPTIONAL {
?lexeme ontolex:lexicalForm ?accusativeFeminineIndefiniteDualForm .
?accusativeFeminineIndefiniteDualForm ontolex:representation ?accusativeFeminineIndefiniteDual ;
wikibase:grammaticalFeature wd:Q1775415, wd:Q110022, wd:Q146078, wd:Q53997857 .
wikibase:grammaticalFeature wd:Q146078, wd:Q1775415, wd:Q53997857, wd:Q110022 .
}

# Masculine

OPTIONAL {
?lexeme ontolex:lexicalForm ?accusativeMasculineIndefiniteSingularForm .
?accusativeMasculineIndefiniteSingularForm ontolex:representation ?accusativeMasculineIndefiniteSingular ;
wikibase:grammaticalFeature wd:Q499327, wd:Q110786, wd:Q146078, wd:Q53997857 .
wikibase:grammaticalFeature wd:Q146078, wd:Q499327, wd:Q53997857, wd:Q110786 .
}

OPTIONAL {
?lexeme ontolex:lexicalForm ?accusativeMasculineIndefinitePluralForm .
?accusativeMasculineIndefinitePluralForm ontolex:representation ?accusativeMasculineIndefinitePlural ;
wikibase:grammaticalFeature wd:Q499327, wd:Q146786, wd:Q146078, wd:Q53997857 .
wikibase:grammaticalFeature wd:Q146078, wd:Q499327, wd:Q53997857, wd:Q146786 .
}

OPTIONAL {
?lexeme ontolex:lexicalForm ?accusativeMasculineIndefiniteDualForm .
?accusativeMasculineIndefiniteDualForm ontolex:representation ?accusativeMasculineIndefiniteDual ;
wikibase:grammaticalFeature wd:Q499327, wd:Q110022, wd:Q146078, wd:Q53997857 .
wikibase:grammaticalFeature wd:Q146078, wd:Q499327, wd:Q53997857, wd:Q110022 .
}

# MARK: Pausal
Expand All @@ -172,38 +168,40 @@ WHERE {
OPTIONAL {
?lexeme ontolex:lexicalForm ?pausalFeminineIndefiniteSingularForm .
?pausalFeminineIndefiniteSingularForm ontolex:representation ?pausalFeminineIndefiniteSingular ;
wikibase:grammaticalFeature wd:Q1775415, wd:Q110786, wd:Q117262361, wd:Q53997857 .
wikibase:grammaticalFeature wd:Q117262361, wd:Q1775415, wd:Q53997857, wd:Q110786 .
}

OPTIONAL {
?lexeme ontolex:lexicalForm ?pausalFeminineIndefinitePluralForm .
?pausalFeminineIndefinitePluralForm ontolex:representation ?pausalFeminineIndefinitePlural ;
wikibase:grammaticalFeature wd:Q1775415, wd:Q146786, wd:Q117262361, wd:Q53997857 .
wikibase:grammaticalFeature wd:Q117262361, wd:Q1775415, wd:Q53997857, wd:Q146786 .
}


OPTIONAL {
?lexeme ontolex:lexicalForm ?pausalFeminineIndefiniteDualForm .
?pausalFeminineIndefiniteDualForm ontolex:representation ?pausalFeminineIndefiniteDual ;
wikibase:grammaticalFeature wd:Q1775415, wd:Q110022, wd:Q117262361, wd:Q53997857 .
wikibase:grammaticalFeature wd:Q117262361, wd:Q1775415, wd:Q53997857, wd:Q110022 .
}


# Masculine

OPTIONAL {
?lexeme ontolex:lexicalForm ?pausalMasculineIndefiniteSingularForm .
?pausalMasculineIndefiniteSingularForm ontolex:representation ?pausalMasculineIndefiniteSingular ;
wikibase:grammaticalFeature wd:Q499327, wd:Q110786, wd:Q117262361, wd:Q53997857 .
wikibase:grammaticalFeature wd:Q117262361, wd:Q499327, wd:Q53997857, wd:Q110786 .
}

OPTIONAL {
?lexeme ontolex:lexicalForm ?pausalMasculineIndefinitePluralForm .
?pausalMasculineIndefinitePluralForm ontolex:representation ?pausalMasculineIndefinitePlural ;
wikibase:grammaticalFeature wd:Q499327, wd:Q146786, wd:Q117262361, wd:Q53997857 .
wikibase:grammaticalFeature wd:Q117262361, wd:Q499327, wd:Q53997857, wd:Q146786 .
}

OPTIONAL {
?lexeme ontolex:lexicalForm ?pausalMasculineIndefiniteDualForm .
?pausalMasculineIndefiniteDualForm ontolex:representation ?pausalMasculineIndefiniteDual ;
wikibase:grammaticalFeature wd:Q499327, wd:Q110022, wd:Q117262361, wd:Q53997857 .
wikibase:grammaticalFeature wd:Q117262361, wd:Q499327, wd:Q53997857, wd:Q110022 .
}
}
Loading
Loading