Skip to content

Commit

Permalink
add separate_script feature.
Browse files Browse the repository at this point in the history
  • Loading branch information
kargaranamir committed Feb 17, 2024
1 parent c83b258 commit 6123ac1
Show file tree
Hide file tree
Showing 3 changed files with 95 additions and 8 deletions.
72 changes: 66 additions & 6 deletions GlotScript/GlotScript.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,18 +2,18 @@
Author: Amir Hossein Kargaran
Date: August, 2023
Description: This code detects the script (writing system) of the given text.
Description: This code detects/separates the script(s) (writing system(s)) of the given text.
MIT License
Original code is from Meta and is based on the MIT license, with permission for distribution and modification.
The original code is capable of detecting less than 40 scripts: https://github.com/facebookresearch/stopes/blob/main/stopes/pipelines/monolingual/utils/predict_script.py
The base code (Meta, MIT license::permission for distribution and modification) is capable of detecting less than 40 scripts: https://github.com/facebookresearch/stopes/blob/main/stopes/pipelines/monolingual/utils/predict_script.py
The currently developed code supports all 161 Unicode 15.0 scripts.
"""

import string
import typing as tp
from collections import Counter, defaultdict

from typing import Dict, List

SCRIPT_RANGES = {
'Latn': [(65, 90), (97, 122), (170, 170), (186, 186), (192, 214), (216, 246), (248, 696), (736, 740), (7424, 7461), (7468, 7516), (7522, 7525), (7531, 7543), (7545, 7614), (7680, 7935), (8305, 8305), (8319, 8319), (8336, 8348), (8490, 8491), (8498, 8498), (8526, 8526), (8544, 8584), (11360, 11391), (42786, 42887), (42891, 42954), (42960, 42961), (42963, 42963), (42965, 42969), (42994, 43007), (43824, 43866), (43868, 43876), (43878, 43881), (64256, 64262), (65313, 65338), (65345, 65370), (67456, 67461), (67463, 67504), (67506, 67514), (122624, 122654), (122661, 122666)], # Latin
Expand Down Expand Up @@ -186,7 +186,7 @@
ScoredScript = tp.Tuple[tp.Optional[str], float]


def get_script_predictor() -> tp.Callable[[str], ScoredScript]:
def get_script_predictor(replace_punctuation=True, replace_digits=True) -> tp.Callable[[str], ScoredScript]:

hist_map: tp.Dict[int, tp.Set[str]] = {}
for key, ranges in SCRIPT_RANGES.items():
Expand All @@ -199,9 +199,15 @@ def get_script_predictor() -> tp.Callable[[str], ScoredScript]:
replace_by = "" # we just get rid of characters that are ubiquitous
replacement_map = {
ord(c): replace_by
for c in string.whitespace + string.punctuation + string.digits
for c in string.whitespace
}

if replace_punctuation:
replacement_map.update({ord(c): replace_by for c in string.punctuation})

if replace_digits:
replacement_map.update({ord(c): replace_by for c in string.digits})

def predict_script(sent: str) -> ScoredScript:
sent = sent.translate(replacement_map)

Expand Down Expand Up @@ -243,6 +249,38 @@ def predict_script(sent: str) -> ScoredScript:

return predict_script



def separate_script(sent: str) -> Dict[str, str]:
"""
Separates characters in the input string based on different scripts.
Args:
sent (str): Input string containing characters from different scripts.
Returns:
Dict[str, str]: A dictionary mapping script names to the separated characters.
"""
result: Dict[str, List[str]] = {}

for char in sent:
code_point = ord(char)

for script, ranges in SCRIPT_RANGES.items():
for start, end in ranges:
if start <= code_point <= end or code_point == ord(' '):
if script not in result:
result[script] = []
result[script].append(char)
break

# Filter out empty values and spaces, and convert the list of characters to a string
result = {key: ''.join(value) for key, value in result.items() if value and ''.join(value).strip()}

return result



def test_predict_script():
predictor_fn = get_script_predictor()

Expand All @@ -257,3 +295,25 @@ def test_predict_script():
assert predictor_fn(string.digits)[:2] == (None, 0)
assert predictor_fn(string.whitespace)[:2] == (None, 0)
assert predictor_fn("")[:2] == (None, 0)


def test_separate_script():

sent = "Hello Salut سلام 你好 こんにちは שלום مرحبا"
detected_scripts = separate_script(sent)

ground_truth = {
'Latn': 'Hello Salut ',
'Hebr': ' שלום ',
'Arab': ' سلام مرحبا',
'Hani': ' 你好 ',
'Hira': ' こんにちは '
}

for key in ground_truth.keys():
assert key in detected_scripts, f"Error: '{key}' script not found in detected scripts."

detected_tokens = [x.strip() for x in detected_scripts[key].split() if len(x.strip()) != 0]
ground_truth_tokens = [x.strip() for x in ground_truth[key].split() if len(x.strip()) != 0]

assert sorted(detected_tokens) == sorted(ground_truth_tokens), f"Error: Tokens for key '{key}' do not match."
5 changes: 4 additions & 1 deletion GlotScript/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
from .GlotScript import get_script_predictor
from .GlotScript import separate_script

sp = get_script_predictor()

sp = get_script_predictor(replace_punctuation=True, replace_digits=True)
sc = separate_script

__version__ = '1.2'
26 changes: 25 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -33,13 +33,19 @@ pip3 install GlotScript
pip3 install GlotScript@git+https://github.com/cisnlp/GlotScript
```

### Usage
### Usage: Script Detection

```python
from GlotScript import get_script_predictor
sp = get_script_predictor()
```

OR

```python
from GlotScript import sp
```

```python
sp('これは日本人です')
>> ('Hira', 0.625, {'details': {'Hira': 0.625, 'Hani': 0.375}, 'tie': False, 'interval': 0.25})
Expand All @@ -60,6 +66,24 @@ sp('𝄞𝄫 𒊕𒀸')
>> ('Xsux', 0.5, {'details': {'Xsux': 0.5, 'Zyyy': 0.5}, 'tie': True, 'interval': 0.0})
```

### Usage: Script Separation

```python
from GlotScript import separate_script
```

```python
sent = "Hello Salut سلام 你好 こんにちは שלום مرحبا"
separate_script(sent)
>> {
"Latn":"Hello Salut ",
"Hebr":" שלום ",
"Arab":" سلام مرحبا",
"Hani":" 你好 ",
"Hira":" こんにちは "
}
```

### Exploring Unicode Blocks: Related Sources
<details>
<summary>Click to Exapand</summary>
Expand Down

0 comments on commit 6123ac1

Please sign in to comment.