server.py
import os
import nltk
import json
import enchant
import re
import traceback
import datetime
import pytz
from aiohttp import web
from dotenv import load_dotenv
from pathlib import Path
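
# NOTE: word_tokenize, the stopword list, and the WordNet lemmatizer below
# need their NLTK corpora downloaded once, e.g.:
#   python -m nltk.downloader punkt stopwords wordnet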
# load the port number from .env
dotenv_path = Path("./.env")
load_dotenv(dotenv_path=dotenv_path)
port_number = int(os.getenv("PY_PORT"))  # cast for aiohttp's port argument
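# a minimal .env might look like this (illustrative value, not from the repo):
#   PY_PORT=8080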
# create app to listen for internal http requests
app = web.Application()
# globals for query filtering
MAX_QUERY_LEN = 50
enchant_dict = enchant.Dict("en_US")
stop_words_ls = nltk.corpus.stopwords.words("english")
additional_sw = [".", "'", "-"]  # standalone punctuation chars that valid_word allows
for sw in additional_sw:
    stop_words_ls.append(sw)
stop_words = set(stop_words_ls)
lemmatizer = nltk.stem.WordNetLemmatizer()
# desc.: given a word, return True if it contains only valid characters, False otherwise
def valid_word(word):
    # same regex pattern used to collect the data
    valid = re.compile(r"[-a-zA-Z0-9'. ]")
    for char in word:
        if not valid.match(char):
            return False
    return True
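
# illustrative checks: valid_word("o'brien") -> True; valid_word("café") -> False,
# since "é" falls outside the allowed character class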
# desc.: given a text string, return a list of valid/filtered words to be used in the query
def word_filter(text):
    valid_words = []  # valid words to return
    # break the text into a set of unique words (duplicates are not queried or scored twice)
    unique_words = set(nltk.word_tokenize(text))
    if len(unique_words) > MAX_QUERY_LEN:
        return [], "max_len"
    for word in unique_words:
        word = word.lower()  # lowercase everywhere; do this before the check since stopwords are lowercase
        # keep only non-stopwords that contain valid characters
        if word not in stop_words and valid_word(word):
            word_lem = lemmatizer.lemmatize(word)  # lemmatize everywhere for consistency
            check = enchant_dict.check(word_lem)
            # enchant is case-sensitive on proper nouns, so retry capitalized (hacky, but it works)
            if not check:
                check = enchant_dict.check(word_lem.capitalize())
            # if True, this word is "valid"
            if check:
                valid_words.append(word_lem)
    return valid_words, "len_fine"
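
# illustrative call (output order may vary, since unique tokens live in a set):
#   word_filter("the dogs were running") -> (["dog", "running"], "len_fine")
# "the"/"were" are dropped as stopwords, "dogs" lemmatizes to "dog", and
# "running" survives because lemmatize() defaults to the noun part of speech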
# main handler: filter the incoming query and respond with the valid words
async def filter_query(request):
    try:
        data = await request.json()
        query = data["unfilteredQuery"]
        return_ls, return_msg = word_filter(query)
        return_obj = json.dumps({"filteredQueryArray": return_ls, "message": return_msg})
        return web.Response(text=return_obj)
    except Exception:
        # on any failure, respond with an empty result and log a timestamped traceback
        return_obj = json.dumps({"filteredQueryArray": [], "message": "invalid"})
        east_tz = pytz.timezone("US/Eastern")
        formatted_time = datetime.datetime.now(tz=east_tz).strftime("%m/%d/%Y, %I:%M:%S %p %Z")
        print(formatted_time)
        print(traceback.format_exc())
        print("\n")
        return web.Response(text=return_obj)
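
# illustrative exchange (output order may vary, as noted above):
#   POST / with body {"unfilteredQuery": "the dogs were running!"}
#   -> {"filteredQueryArray": ["dog", "running"], "message": "len_fine"}
# "!" is tokenized separately and rejected by valid_word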
# set up the POST route
app.router.add_post("/", filter_query)

if __name__ == "__main__":
    # run the app on the configured port
    web.run_app(app, port=port_number)
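
# quick smoke test against a local instance (assumes PY_PORT=8080 in .env):
#   curl -X POST http://localhost:8080/ \
#        -H "Content-Type: application/json" \
#        -d '{"unfilteredQuery": "the dogs were running"}'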