Skip to content

Commit

Permalink
Add preposition queries and fix sqlite conversion for all data
Browse files Browse the repository at this point in the history
  • Loading branch information
andrewtavis committed Dec 9, 2024
1 parent 1f9f418 commit 6355649
Show file tree
Hide file tree
Showing 19 changed files with 136 additions and 75 deletions.
8 changes: 8 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -17,11 +17,19 @@ Emojis for the following are chosen based on [gitmoji](https://gitmoji.dev/).
### ✨ Features

- Queries for noun genders and other properties that require the Wikidata label service now return their English label rather than the auto label, which was returning just the Wikidata QID.
- SPARQL queries for English and Portuguese prepositions were added to allow the CLI to query these types of data.
- The convert functionality once again works for lists of languages and all of their data types.

### 🐞 Bug Fixes

- SQLite conversion was fixed for all queries ([#527](https://github.com/scribe-org/Scribe-Data/issues/527)).
- The data conversion process outputs were improved, including capitalizing language names, and repeated notices to the user were removed.
- The CLI's `get` command now returns all data types if none is passed.
- The Portuguese verbs query was fixed as it wasn't formatted correctly.
- The emoji keyword functionality was fixed given the new lexeme ID based form of the data.
- CLI arguments that were breaking the functionality were fixed.
- Languages for the user were capitalized.
- `case` has been renamed `grammaticalCase` in preposition queries to ensure that SQLite reserved keywords are not used.

## Scribe-Data 4.0.0

Expand Down
57 changes: 30 additions & 27 deletions src/scribe_data/cli/convert.py
Original file line number Diff line number Diff line change
Expand Up @@ -384,7 +384,7 @@ def convert_to_sqlite(
input_file: str = None,
output_dir: str = None,
overwrite: bool = False,
identifier_case: str = "snake",
identifier_case: str = "camel",
) -> None:
"""
Converts a Scribe-Data output file to an SQLite file.
Expand Down Expand Up @@ -416,16 +416,13 @@ def convert_to_sqlite(
-------
A SQLite file saved in the given location.
"""
if not language:
raise ValueError("Language must be specified for SQLite conversion.")

if input_file:
input_file = Path(input_file)

if not input_file.exists():
if input_file is not None and not input_file.exists():
raise ValueError(f"Input file does not exist: {input_file}")

languages = [language]
languages = [language] if data_type else None
specific_tables = [data_type] if data_type else None

if output_dir is None:
Expand All @@ -450,30 +447,31 @@ def convert_to_sqlite(


def convert_wrapper(
language: str,
data_type: Union[str, List[str]],
languages: Union[str, List[str]],
data_types: Union[str, List[str]],
output_type: str,
input_file: str,
input_files: Union[str, List[str]],
output_dir: str = None,
overwrite: bool = False,
identifier_case: str = "snake",
identifier_case: str = "camel",
all: bool = False,
):
"""
Convert data to the specified output type: JSON, CSV/TSV, or SQLite.
Parameters
----------
language : str
The language of the data to convert.
language : Union[str, List[str]]
The language(s) of the data to convert.
data_type : Union[str, List[str]]
The data type(s) of the data to convert.
output_type : str
The desired output format. It can be 'json', 'csv', 'tsv', or 'sqlite'.
input_file : str
The path to the input file.
input_file : Union[str, List[str]]
The path(s) to the input file(s).
output_dir : str, optional
The output directory where converted files will be stored. Defaults to None.
Expand All @@ -484,44 +482,49 @@ def convert_wrapper(
identifier_case : str
The case format for identifiers. Default is "camel".
all : bool
Convert all languages and data types.
Returns
-------
None
None
"""
output_type = output_type.lower()
print(
f"Converting data for {language.capitalize()} {data_type.capitalize()} to {output_type}..."
)

if languages is not None and data_types is not None:
print(
f"Converting data for {languages.capitalize()} {data_types} to {output_type}..."
)

# Route the function call to the correct conversion function.
if output_type == "json":
convert_to_json(
language=language,
data_type=data_type,
language=languages,
data_type=data_types,
output_type=output_type,
input_file=input_file,
input_file=input_files,
output_dir=output_dir,
overwrite=overwrite,
identifier_case=identifier_case,
)

elif output_type in {"csv", "tsv"}:
convert_to_csv_or_tsv(
language=language,
data_type=data_type,
language=languages,
data_type=data_types,
output_type=output_type,
input_file=input_file,
input_file=input_files,
output_dir=output_dir,
overwrite=overwrite,
identifier_case=identifier_case,
)

elif output_type == "sqlite":
convert_to_sqlite(
language=language,
data_type=data_type,
language=languages,
data_type=data_types,
output_type=output_type,
input_file=input_file,
input_file=input_files,
output_dir=output_dir,
overwrite=overwrite,
identifier_case=identifier_case,
Expand Down
31 changes: 22 additions & 9 deletions src/scribe_data/cli/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,7 +100,7 @@ def main() -> None:
help="List all languages and data types.",
)

# MARK: GET
# MARK: Get

get_parser = subparsers.add_parser(
"get",
Expand Down Expand Up @@ -206,21 +206,21 @@ def main() -> None:
"-lang",
"--language",
type=str,
required=True,
required=False,
help="The language of the file to convert.",
)
convert_parser.add_argument(
"-dt",
"--data-type",
type=str,
required=True,
required=False,
help="The data type(s) of the file to convert (e.g., nouns, verbs).",
)
convert_parser.add_argument(
"-if",
"--input-file",
type=Path,
required=True,
required=False,
help="The path to the input file to convert.",
)
convert_parser.add_argument(
Expand Down Expand Up @@ -258,6 +258,12 @@ def main() -> None:
default="camel",
help="The case format for identifiers in the output data (default: camel).",
)
convert_parser.add_argument(
"-a",
"--all",
action=argparse.BooleanOptionalAction,
help="Convert all languages and data types.",
)

# MARK: Setup CLI

Expand Down Expand Up @@ -296,8 +302,12 @@ def main() -> None:

else:
get_data(
language=args.language.lower(),
data_type=args.data_type.lower(),
language=args.language.lower()
if args.language is not None
else None,
data_type=args.data_type.lower()
if args.data_type is not None
else None,
output_type=args.output_type,
output_dir=args.output_dir,
outputs_per_entry=args.outputs_per_entry,
Expand All @@ -323,13 +333,16 @@ def main() -> None:

elif args.command in ["convert", "c"]:
convert_wrapper(
language=args.language.lower(),
data_type=args.data_type,
languages=args.language.lower() if args.language is not None else None,
data_types=args.data_type.lower()
if args.data_type is not None
else None,
output_type=args.output_type,
input_file=args.input_file,
input_files=args.input_file,
output_dir=args.output_dir,
overwrite=args.overwrite,
identifier_case=args.identifier_case,
all=args.all,
)

else:
Expand Down
10 changes: 8 additions & 2 deletions src/scribe_data/load/data_to_sqlite.py
Original file line number Diff line number Diff line change
Expand Up @@ -123,7 +123,7 @@ def create_table(data_type, cols):
]

cursor.execute(
f"CREATE TABLE IF NOT EXISTS {data_type} ({' Text, '.join(cols)} Text, UNIQUE({cols[0]}))"
f"CREATE TABLE IF NOT EXISTS {data_type} ({' Text, '.join(cols)} Text, unique({cols[0]}))"
)

def table_insert(data_type, keys):
Expand Down Expand Up @@ -238,7 +238,13 @@ def table_insert(data_type, keys):

if dt in ["nouns", "verbs", "prepositions"]:
cols = ["wdLexemeId"]
cols += json_data[list(json_data.keys())[0]].keys()

all_elem_keys = [
json_data[k].keys() for k in list(json_data.keys())
]
all_keys_flat = list({k for ks in all_elem_keys for k in ks})

cols += all_keys_flat
create_table(data_type=dt, cols=cols)
cursor.execute(f"DELETE FROM {dt}") # clear existing data

Expand Down
2 changes: 1 addition & 1 deletion src/scribe_data/unicode/generate_emoji_keywords.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,7 @@ def generate_emoji(language, output_dir: str = None):
emojis_per_keyword=EMOJI_KEYWORDS_DICT,
):
export_formatted_data(
file_path=output_dir,
dir_path=output_dir,
formatted_data=emoji_keywords_dict,
query_data_in_use=True,
language=language,
Expand Down
25 changes: 19 additions & 6 deletions src/scribe_data/unicode/process_unicode.py
Original file line number Diff line number Diff line change
Expand Up @@ -117,7 +117,7 @@ def gen_emoji_lexicon(

for cldr_char in tqdm(
iterable=cldr_dict,
desc=f"Characters processed from '{cldr_file_key}' CLDR file for {language}",
desc=f"Characters processed from '{cldr_file_key}' CLDR file for {language.capitalize()}",
unit="cldr characters",
):
# Filter CLDR data for emoji characters while not including certain emojis.
Expand Down Expand Up @@ -187,11 +187,24 @@ def gen_emoji_lexicon(
) as f:
noun_data = json.load(f)

plurals_to_singulars_dict = {
noun_data[row]["plural"].lower(): row.lower()
for row in noun_data
if noun_data[row]["plural"] != "isPlural"
}
if language not in ["german", "russian"]:
plurals_to_singulars_dict = {
noun_data[row]["plural"].lower(): row.lower()
for row in noun_data
if "singular" in noun_data[row]
and "plural" in noun_data[row]
and noun_data[row]["singular"] != noun_data[row]["plural"]
}

else:
plurals_to_singulars_dict = {
noun_data[row]["nominativePlural"].lower(): row.lower()
for row in noun_data
if "nominativeSingular" in noun_data[row]
and "nominativePlural" in noun_data[row]
and noun_data[row]["nominativeSingular"]
!= noun_data[row]["nominativePlural"]
}

for plural, singular in plurals_to_singulars_dict.items():
if plural not in keyword_dict and singular in keyword_dict:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
SELECT
(REPLACE(STR(?lexeme), "http://www.wikidata.org/entity/", "") AS ?lexemeID)
?preposition
?case
?grammaticalCase

WHERE {
?lexeme dct:language wd:Q9610 ;
Expand All @@ -21,6 +21,6 @@ WHERE {

SERVICE wikibase:label {
bd:serviceParam wikibase:language "en" .
?caseForm rdfs:label ?case .
?caseForm rdfs:label ?grammaticalCase .
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
SELECT
(REPLACE(STR(?lexeme), "http://www.wikidata.org/entity/", "") AS ?lexemeID)
?preposition
?case
?grammaticalCase

WHERE {
?lexeme dct:language wd:Q9610 ;
Expand All @@ -20,6 +20,6 @@ WHERE {

SERVICE wikibase:label {
bd:serviceParam wikibase:language "en" .
?caseForm rdfs:label ?case .
?caseForm rdfs:label ?grammaticalCase .
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
SELECT
(REPLACE(STR(?lexeme), "http://www.wikidata.org/entity/", "") AS ?lexemeID)
?preposition
?case
?grammaticalCase

WHERE {
?lexeme dct:language wd:Q9056 ;
Expand All @@ -21,6 +21,6 @@ WHERE {
SERVICE wikibase:label {
bd:serviceParam wikibase:language "en" .
?lemma rdfs:label ?preposition .
?caseForm rdfs:label ?case .
?caseForm rdfs:label ?grammaticalCase .
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
SELECT
(REPLACE(STR(?lexeme), "http://www.wikidata.org/entity/", "") AS ?lexemeID)
?preposition
?case
?grammaticalCase

WHERE {
?lexeme dct:language wd:Q9035 ;
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
# tool: scribe-data
# All English (Q1860) prepositions (Q4833830) and the given forms.
# Enter this query at https://query.wikidata.org/.

SELECT
(REPLACE(STR(?lexeme), "http://www.wikidata.org/entity/", "") AS ?lexemeID)
?preposition

WHERE {
?lexeme dct:language wd:Q1860 ;
wikibase:lexicalCategory wd:Q4833830 ;
wikibase:lemma ?preposition .
}
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
SELECT
(REPLACE(STR(?lexeme), "http://www.wikidata.org/entity/", "") AS ?lexemeID)
?preposition
?case
?grammaticalCase

WHERE {
?lexeme dct:language wd:Q9072 ;
Expand All @@ -20,6 +20,6 @@ WHERE {

SERVICE wikibase:label {
bd:serviceParam wikibase:language "en" .
?caseForm rdfs:label ?case .
?caseForm rdfs:label ?grammaticalCase .
}
}
Loading

0 comments on commit 6355649

Please sign in to comment.