Skip to content

Commit

Permalink
Merge pull request #6 from ShoukanLabs/dev
Browse files Browse the repository at this point in the history
Bump version, better check all punctuation
  • Loading branch information
korakoe authored Oct 9, 2024
2 parents 285fac4 + 4274aa3 commit 2b1ab6d
Show file tree
Hide file tree
Showing 2 changed files with 8 additions and 6 deletions.
12 changes: 7 additions & 5 deletions VoPho/langtokenizers/multicoded.py
Original file line number Diff line number Diff line change
Expand Up @@ -146,12 +146,14 @@ def detect_writing_system(self, text):

def is_punctuation(self, char):
# Decide whether `char` (one character or a longer string) is punctuation.
# NOTE(review): this span comes from a rendered diff whose +/- markers and
# indentation were lost, so the previous and the new implementation of each
# branch appear back to back — confirm against the repository which lines are live.
if len(char) > 1:
valid_chars = set(string.punctuation + ' ')
return all(c in valid_chars for c in char)
# Multi-character input: every character must pass the single-character check.
# NOTE(review): unlike the `valid_chars` set above, which contains ' ', the
# single-character check rejects whitespace (`not char.isspace()`), so a string
# containing a space is not reported as punctuation here — confirm this is intended.
return all(self.is_punctuation(c) for c in char) # Recurse on each character individually
else:
return not char.isalnum() and not char.isspace() and not self.is_writing_system(char,
self.detect_writing_system(
char))
# Single-character check: all three conditions below must hold.
return (not char.isalnum() # Is not alphanumeric
and not char.isspace() # Is not whitespace
and not self.is_writing_system(char, self.detect_writing_system(char))) # Is not in a writing system


def split_text_by_writing_system(self, text):
segments = []
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

[project]
name = "VoPho"
version = "0.0.4"
version = "0.0.5"
description = "An easy to use Multilingual phonemization meta-library"
readme = "README.md"
authors = [
Expand Down

0 comments on commit 2b1ab6d

Please sign in to comment.