# RemoveStopWords.py
import re

import nltk
import pandas as pd
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer

# Download the stopword corpus on first run (a no-op once it is cached).
nltk.download('stopwords')

def RemoveSymbols(Words):
    """Replace non-letter symbols in each word with spaces and re-split,
    leaving email addresses and website URLs intact."""
    # '@' is included so email addresses are not flagged as symbols
    alphanums = "abcdefghijklmnopqrstuvwxyz@ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    email_regex = r'^[a-z0-9]+[._]?[a-z0-9]+@\w+\.\w+$'
    website_regex = r'^www\.|^http://|^https://'
    result = []
    Symbols = set()
    # First pass: collect every character that is not a letter or '@'
    for word in Words:
        for letter in word:
            if letter not in alphanums:
                Symbols.add(letter)
    # Second pass: strip symbols from ordinary words, but leave emails
    # and URLs untouched so they survive as single tokens
    for word in Words:
        if not re.search(email_regex, word) and not re.search(website_regex, word):
            for sym in Symbols.intersection(set(word)):
                word = word.replace(sym, ' ')
        for token in word.split(' '):
            if token != '':
                result.append(token)
    return result

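# A quick illustration of RemoveSymbols (hypothetical inputs, kept as a
# comment so the module stays import-safe): punctuation becomes a token
# boundary, while emails and URLs pass through whole.
#
#   RemoveSymbols(['c++,', 'java;', 'jane.doe@mail.com', 'https://example.com'])
#   -> ['c', 'java', 'jane.doe@mail.com', 'https://example.com']
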
def topN_Words(Corpus, n):
    """Return the n words with the highest summed TF-IDF weight in Corpus."""
    if isinstance(Corpus, str):
        Corpus = [Corpus]
    # Treat any run of non-whitespace as a token; the corpus is already cleaned
    vectorizer = TfidfVectorizer(token_pattern=r'(\S+)')
    response = vectorizer.fit_transform(Corpus)
    # get_feature_names() was removed in scikit-learn 1.2
    TF_Matrix = pd.DataFrame(response.toarray(), columns=vectorizer.get_feature_names_out())
    TFidf_Sum = TF_Matrix.sum(axis=0)
    Result = TFidf_Sum.sort_values(ascending=False)
    # Drop single-character tokens before taking the top n
    Result = [word for word in Result.index if len(word) > 1]
    return Result[:n]

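# A toy example of topN_Words (hypothetical two-document corpus). Each
# word's TF-IDF weight is summed across documents; with scikit-learn's
# default settings this yields ['python', 'excel'] — 'python' dominates both
# documents, and 'excel' sits in the shorter document, so its normalized
# weight edges out 'sql'.
#
#   topN_Words(['python sql python', 'python excel'], 2)
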
def Process_Text(CSV_path):
    """Read documents from the first column of a CSV and return the top N keywords."""
    N = 10
    stop_Words = set(stopwords.words('english'))
    df = pd.read_csv(CSV_path)
    # Lower-case the first column of (at most) the first 50 rows
    sentences = [' '.join(word.lower() for word in df.iloc[i, 0].split())
                 for i in range(min(50, len(df)))]
    sentences = [' '.join(RemoveSymbols(sentence.split())) for sentence in sentences]
    Cleaned_Corpus = [' '.join(word for word in sentence.split() if word not in stop_Words)
                      for sentence in sentences]
    return topN_Words(Cleaned_Corpus, N)

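# Process_Text expects a CSV whose first column holds one document per row.
# A minimal sanity check, assuming a hypothetical file path:
#
#   pd.DataFrame({'text': ['python developer with sql',
#                          'data analyst, excel and sql']}).to_csv('CSVs/Sample.csv', index=False)
#   print(Process_Text('CSVs/Sample.csv'))
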
def Process_Text_For_API(Text):
    """Clean a raw text (or list of text fragments) and return its top N keywords."""
    N = 10
    Text = ''.join(Text)
    stop_Words = set(stopwords.words('english'))
    sentences = ' '.join(word.lower() for word in Text.split())
    sentences = ' '.join(RemoveSymbols(sentences.split()))
    Cleaned_Corpus = ' '.join(word for word in sentences.split() if word not in stop_Words)
    return topN_Words(Cleaned_Corpus, N)

def serverSnippet(Text):
    """API entry point: accepts either a single string or a list of strings."""
    if isinstance(Text, str):
        Text = [Text]  # wrap in a list; list(Text) would split the string into characters
    return Process_Text_For_API(Text)

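# Hypothetical server-side usage; both call forms produce the same keywords,
# since the fragments are joined back together before cleaning.
#
#   serverSnippet('Experienced Python developer, SQL and AWS.')
#   serverSnippet(['Experienced Python ', 'developer, SQL and AWS.'])
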
if __name__ == '__main__':
    DIRECTORY = 'CSVs/Resumes.csv'
    Imp_Words = Process_Text(DIRECTORY)
    print(Imp_Words)