# Stemmer.py: rule-based Bangla (Bengali) stemmer built on the bn_nlp toolkit.
from bn_nlp.preprocessing import ban_processing, StaticArray
from bn_nlp.tokenizer import wordTokenizer
import functools
import re

wordtokens = wordTokenizer()
word_vec = []    # flat vocabulary list loaded from word2.txt
word_dict = {}   # vocabulary lookup: a candidate stem is accepted only if it appears here
word_dict2 = {}  # words that stem() must return unchanged, loaded from word3.txt
bp = ban_processing()
# Suffixes to try stripping, matched with a regex anchored at the end of the
# word. A '.' inside an entry acts as a single-character wildcard.
rule_words = ['ই', 'ও', 'তো', 'কে', 'তে', 'রা', 'চ্ছি', 'চ্ছিল', 'চ্ছে', 'চ্ছিস', 'চ্ছিলেন', 'চ্ছ', 'য়েছে', 'েছ', 'েছে',
              'েছেন', 'রছ', 'রব', 'েল', 'েলো', 'ওয়া', 'েয়ে', 'য়', 'য়ে', 'েয়েছিল', 'েছিল', 'য়েছিল', 'েয়েছিলেন',
              'ে.েছিলেন', 'েছিলেন', 'লেন', 'দের', 'ে.ে', 'ের', 'ার', 'েন', 'বেন', 'িস', 'ছিস', 'ছিলি', 'ছি', 'ছে', 'লি',
              'বি', 'ে', 'টি', 'টির', 'েরটা', 'েরটার', 'টা', 'টার', 'গুলো', 'গুলোর', 'েরগুলো', 'েরগুলোর',
              'যোগ্য', 'কেই', 'েও', 'সহ', 'রা', 'ভাবে', 'কারি', 'কৃত', 'ই', 'কে', 'র', 'কি', 'েই', 'ভাবে', 'গুলো',
              'তে', 'েতে', 'গন', 'মুলক', 'সুচক', 'টুকু', 'টুকুই', 'গুলি', 'পদ', 'সমুহ', 'সংক্রান্ত', 'সংলগ্ন', 'সংশ্লিষ্ট',
              'সুত্রে', 'রুপে', 'ানুসারে', 'ানুযায়ি', 'তত্ত্', 'ি', 'মুখি', 'প্রতি', 'ভাবে', 'য়'
              ]
# Rewrite rules: these suffixes are replaced with the given string rather
# than removed outright.
rule_dict = {"রছ": "র", "রব": "র", "েয়ে": "া", "েয়েছিল": "া", "েয়েছিলেন": "া", "ে.েছিলেন": "া.", "ে.ে": "া."}
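# Illustrative example (added note, not from the original source): for the
# word 'করব', the suffix 'রব' matches and rule_dict maps 'রব' -> 'র', so the
# candidate stem is 'ক' + 'র' = 'কর'; it is kept only if search() finds it in
# the vocabulary. A suffix without a rule_dict entry, e.g. 'রা' in 'ছেলেরা',
# is simply stripped, giving 'ছেলে'.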
class stemmerOP:
    def __init__(self):
        # Load the vocabulary: a stem candidate is accepted only if its
        # normalized form appears in word_dict.
        with open("bn_nlp/dataset/word2.txt", "r", encoding="utf-8") as f:
            for word in f:
                word = word.replace('\n', '')
                word_vec.append(word)
                word_dict[word] = 1
        # Load the stop list of words that stem() returns unchanged.
        with open("bn_nlp/dataset/word3.txt", "r", encoding="utf-8") as f:
            for word in f:
                word = word.replace('\n', '')
                word_dict2[word] = 1
    def search(self, word):
        # A candidate stem is valid only if its normalized form is a known word.
        return bp.word_normalize(word) in word_dict
    def stem_normalize(self, word):
        # Map the final character to its uppercase Unicode code point in hex,
        # then drop the character if that code point names a known suffix mark.
        g = word[-1].encode("unicode_escape")
        g = g.upper()
        g = g[2:]
        g = g.decode('utf-8')
        if g in StaticArray.bn2enSuffix:
            word = word[:-1]
        return word
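    # Illustrative trace (added note, not in the original source): for the
    # vowel sign 'া' (U+09BE) the pipeline above runs
    #   'া'.encode("unicode_escape")  -> b'\\u09be'
    #   .upper()                      -> b'\\U09BE'
    #   [2:]                          -> b'09BE'
    #   .decode('utf-8')              -> '09BE'
    # and '09BE' is then looked up in StaticArray.bn2enSuffix.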
    def bnCompare(self, item1, item2):
        # Old-style comparator: longer strings sort first, so the longest
        # matching suffix is tried before any shorter one.
        return (len(item1) < len(item2)) - (len(item1) > len(item2))
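    # Example of the ordering (added note, not in the original source):
    #   sorted(['ে', 'েরটা', 'টা'], key=functools.cmp_to_key(self.bnCompare))
    #   -> ['েরটা', 'টা', 'ে']   # longest suffix first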
    def stem(self, word):
        # Words on the stop list are returned unchanged.
        if word in word_dict2:
            return word
        # Collect every listed suffix that matches the end of the word.
        # Note: '.' inside a suffix pattern acts as a regex wildcard here.
        suf_arr = []
        for wd in rule_words:
            if re.search('.*' + wd + '$', word):
                suf_arr.append(wd)
        # Try the longest matching suffix first.
        suf_arr = sorted(suf_arr, key=functools.cmp_to_key(self.bnCompare))
        for i in suf_arr:
            if i in rule_dict:
                # Rewrite rule: replace the suffix instead of just removing it.
                ind = len(word) - len(i)
                new_word = word[0:ind] + rule_dict[i]
                if self.search(new_word):
                    return new_word
            else:
                # Plain removal: strip the suffix.
                ind = len(word) - len(i)
                new_word = word[0:ind]
                if len(new_word) == 0:
                    return word
                if self.search(new_word):
                    return new_word
        # Fall back to dropping a trailing diacritic, if any.
        new_word = self.stem_normalize(word)
        if self.search(new_word):
            return new_word
        return word
    def stemSent(self, sent):
        # Tokenize the sentence, stem each token, and rejoin with spaces.
        tokens = wordtokens.basic_tokenizer(sent)
        temp_tokens = [self.stem(i) for i in tokens]
        return ' '.join(temp_tokens)
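# --- Usage sketch (added example, not in the original file) ---
# Assumes the bn_nlp package and the dataset files above are available; the
# sample words are illustrative only, and the outputs depend on the vocabulary.
if __name__ == "__main__":
    stemmer = stemmerOP()
    print(stemmer.stem("ছেলেরা"))                 # e.g. 'ছেলে' if it is in the vocabulary
    print(stemmer.stemSent("ছেলেরা মাঠে খেলছে"))  # stems each token, rejoins with spaces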