Skip to content

Commit

Permalink
doc: add type hints
Browse files (browse the repository at this point in the history)
  • Loading branch information
MariellaCC committed Apr 9, 2024
1 parent b3e670b commit 4326e20
Showing 1 changed file with 6 additions and 5 deletions.
11 changes: 6 additions & 5 deletions src/kiara_plugin/topic_modelling/modules/pre_process.py
Original file line number | Diff line number | Diff line change
@@ -1,8 +1,9 @@
# -*- coding: utf-8 -*-
from typing import Optional, List
from kiara.api import KiaraModule
from kiara.exceptions import KiaraProcessingException
from pyarrow import Table as PyArrowTable

#TODO add type hints

class TokenizeCorpus(KiaraModule):
"""
Expand Down Expand Up @@ -57,8 +58,8 @@ def process(self, inputs, outputs):

nltk.download("punkt")

tokenized_list = None
table_pa = None
tokenized_list: Optional[List[str]] = None
table_pa: Optional[PyArrowTable] = None

# check that both inputs table and array are not set simultaneously
if inputs.get_value_obj("corpus_table").is_set and inputs.get_value_obj("corpus_array").is_set:
Expand All @@ -73,7 +74,7 @@ def process(self, inputs, outputs):
raise KiaraProcessingException("The 'column_name' input must be set when 'corpus_table' is set.")

column_name: str = inputs.get_value_obj("column_name").data
table_cols: list = table_pa.column_names
table_cols: List[str] = table_pa.column_names

# check that the column name provided exists in the table
if column_name not in table_cols:
Expand All @@ -90,7 +91,7 @@ def process(self, inputs, outputs):

corpus_list = corpus_array_pa.to_pylist()

def tokenize(text, tokenize_by_character=False):
def tokenize(text: str, tokenize_by_character:bool = False) -> Optional[List[str]]:
if not tokenize_by_character:
try:
return nltk.word_tokenize(str(text))
Expand Down

0 comments on commit 4326e20

Please sign in to comment.