-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathremovestopwords.py
executable file
·51 lines (38 loc) · 1.26 KB
/
removestopwords.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
# -*- coding: utf-8 -*-
import nltk
# import tensorflow as tf
from nltk import sent_tokenize,word_tokenize
from nltk.corpus import stopwords
import re
# NOTE(review): `taglist` is not used anywhere in this view — confirm it is
# referenced elsewhere before removing.
taglist = []
# NOTE(review): this module-level list is shadowed by a local of the same name
# inside removestopwords(), so it is never populated by that function.
wordswithoutstopwords = []
def removestopwords(words):
    """Return *words* with punctuation/numeric tokens and Sinhala stopwords removed.

    First strips punctuation and digit-leading tokens via removepunc(), then
    filters out any token present in NLTK's Sinhala stopword list.

    Args:
        words: list of token strings (e.g. from nltk.word_tokenize).

    Returns:
        A new list of tokens, in original order, excluding stopwords.
    """
    words = removepunc(words)
    # Build the stopword set once: O(1) membership tests inside the filter.
    stopword_set = set(stopwords.words('sinhala'))
    return [word for word in words if word not in stopword_set]
# Compiled once at import time; matches a token whose FIRST character is a digit
# (re.match anchors at the start — "2x" is dropped, "x2" is kept, as before).
_DIGIT_START = re.compile(r"[0-9]")

_PUNCTUATION = frozenset(
    [".", ",", "?", "!", ";", ":", "-", "(", ")", "[", "]", "{", "}", "'", '"', "..."]
)


def removepunc(words):
    """Return *words* without punctuation tokens and without digit-leading tokens.

    Args:
        words: iterable of token strings.

    Returns:
        A new list of the surviving tokens, in original order.
    """
    return [
        word
        for word in words
        if word not in _PUNCTUATION and _DIGIT_START.match(word) is None
    ]
# readPath = './Data/ලංකාවේ පුරාවෘත්ත/3-මලවාණේ මහ බළකොටුව.txt'
# read_file = open(readPath,'r',encoding="utf16")
# file = read_file.read()
# words = word_tokenize(file)
# removestopwords(words)