-
Notifications
You must be signed in to change notification settings - Fork 10
/
tokenizer.py
77 lines (69 loc) · 2 KB
/
tokenizer.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
from bn_nlp.preprocessing import ban_processing,StaticArray
# Module-level preprocessing helper, created once at import time and shared
# by every method of wordTokenizer and sentenceTokenizer below.
bp=ban_processing()
class wordTokenizer:
    """Whitespace-based word tokenizer built on the shared ``bp`` preprocessor."""

    def basic_tokenizer(self, sent):
        """Return the whitespace-separated tokens of *sent*.

        The sentence is first passed through ``bp.punctuation_remove`` and
        the result is split on runs of whitespace.

        Args:
            sent: The sentence to tokenize.

        Returns:
            A new list of token strings (empty when nothing remains).
        """
        # str.split() already returns a fresh list, so no copy loop is needed.
        return bp.punctuation_remove(sent).split()

    def normalize_tokenizer(self, sent):
        """Return the normalized, cleaned tokens of *sent*.

        The sentence is passed through ``bp.word_normalize``, split on
        whitespace, and every token is run through ``bp.dust_removal``.
        Tokens that come back empty from that step are dropped.

        Args:
            sent: The sentence to tokenize.

        Returns:
            A list of the cleaned, non-empty tokens in their original order.
        """
        cleaned_tokens = []
        for token in bp.word_normalize(sent).split():
            # Clean each token once and reuse the result for both the
            # emptiness check and the output.
            cleaned = bp.dust_removal(token)
            if cleaned:
                cleaned_tokens.append(cleaned)
        return cleaned_tokens
class sentenceTokenizer:
    """Sentence tokenizer that splits text on the Bangla full stop (U+0964)."""

    # The only character treated as a sentence delimiter (code point U+0964).
    _BANGLA_FULLSTOP = '\u0964'

    def _split_sentences(self, text):
        """Split *text* on the Bangla full stop and drop blank segments.

        Newlines are replaced by single spaces before splitting. The
        delimiter itself is not kept in the returned segments. A segment is
        discarded when ``bp.dust_removal`` returns an empty result for it;
        kept segments are returned unmodified.

        Args:
            text: The text to split.

        Returns:
            A list of raw sentence strings in their original order.
        """
        segments = text.replace('\n', ' ').split(self._BANGLA_FULLSTOP)
        # str.split always yields a final piece; drop it when nothing follows
        # the last delimiter (this also covers empty input).
        if not segments[-1]:
            segments.pop()
        return [segment for segment in segments if bp.dust_removal(segment)]

    def basic_tokenizer(self, text):
        """Return the sentences of *text*, split on the Bangla full stop.

        Args:
            text: The text to split into sentences.

        Returns:
            A list of sentence strings. Newlines inside them have been
            replaced by spaces; otherwise the sentences are unmodified.
        """
        return self._split_sentences(text)

    def normalize_tokenizer(self, text):
        """Return the sentences of *text* after punctuation removal and normalization.

        Sentences are found exactly as in ``basic_tokenizer``; each one is
        then passed through ``bp.punctuation_remove`` followed by
        ``bp.word_normalize``.

        Args:
            text: The text to split into sentences.

        Returns:
            A list of processed sentence strings in their original order.
        """
        return [
            bp.word_normalize(bp.punctuation_remove(sentence))
            for sentence in self._split_sentences(text)
        ]