-
Notifications
You must be signed in to change notification settings - Fork 0
/
preprocess.py
193 lines (154 loc) · 6.25 KB
/
preprocess.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
#!/usr/bin/python
import wikipedia
import nltk
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from textblob import TextBlob, Word
import argparse, sys, os
class PreProcess():
    """Turn a tab-separated posts file into one stemmed text summary per post.

    Each input line is expected to look like ``<post id>\\t<tag>#<tag>#...``.
    Every tag is normalised to a singular English word, kept only if it is
    listed in the food tags file, and replaced by the stemmed, stop-word-free
    text of its Wikipedia summary (or its dictionary definitions as a
    fallback).  The summaries of one post are joined with spaces and appended
    to the output file as ``<post id>\\t<summary>``.

    Instantiating the class runs the whole pipeline.
    """

    def __init__(self, postsFile, preproceed_postsFile):
        """Prepare the output file and NLP helpers, then process *postsFile*.

        postsFile            -- path of the tab-separated source posts file.
        preproceed_postsFile -- path of the file receiving the summaries.

        Exits the interpreter with status 1 when the output file cannot be
        created.
        """
        print("[*] Preprocess module starting")
        if not os.path.exists('./files'):
            os.makedirs('./files')
        # Mode 'w' creates the file but also truncates an existing one, so
        # checkIfNew() can only detect duplicates seen during the current run.
        try:
            open(preproceed_postsFile, 'w').close()
        except (IOError, OSError):
            print("[-] Fail to create the file.")
            # Non-zero status: this is a failure, not a normal termination.
            sys.exit(1)
        self.foodTagsFile = 'tags/relatedToFood.txt'
        # (path, set of tags) once the tags file has been read; see isFoodTag().
        self._foodTagsCache = None
        self.stemmer = SnowballStemmer("english")
        self.stop_words = set(stopwords.words('english'))
        self.main(postsFile, preproceed_postsFile)

    def main(self, postsFile, preproceed_postsFile):
        """Pre-process the posts file.

        Returns False when *postsFile* cannot be opened, None otherwise.
        """
        try:
            source = open(postsFile, 'r')
        except (IOError, OSError):
            print("[-] Fail to open the file.")
            return False
        # Generic tags that carry no information, and the part-of-speech tags
        # accepted for single-word tags.
        self.foodWords, self.posTag = ['food', 'foodie', 'cuisine'], ['NN', 'NNP']
        with source:
            for line in source:
                infos = line.split('\t')
                if len(infos) < 2:
                    # Blank or malformed line (no tab): there is no tag column.
                    continue
                if not self.checkIfNew(preproceed_postsFile, infos[0]):
                    print("[*] Already treated this entry.")
                    continue
                summaries = []
                for rawTag in infos[1].split('#'):
                    wiki = self._summarizeTag(rawTag)
                    if wiki is not None:
                        summaries.append(wiki)
                # A post is saved only if at least one tag produced a summary.
                if summaries:
                    self.savePost(preproceed_postsFile, infos[0], " ".join(summaries))

    def _summarizeTag(self, rawTag):
        """Return the summary for one raw tag, or None when the tag is skipped."""
        tag = self.getEngTag(rawTag)
        if not tag or not self.isFoodTag(tag):
            return None
        blob = TextBlob(tag)
        try:
            words = blob.words
        except Exception:
            # Best effort: a tag that cannot be tokenised is ignored.
            return None
        if len(words) == 1:
            blob = TextBlob(words[0].singularize())
            if blob in self.foodWords:
                return None
            # Single words are only looked up when tagged as a noun.
            posTags = blob.tags
            if not posTags or posTags[0][1] not in self.posTag:
                return None
            return self.getSummary(blob)
        phrase = " ".join(words)
        if phrase in self.foodWords:
            return None
        return self.getSummary(TextBlob(phrase))

    def _toNativeStr(self, text):
        """Return *text* as the interpreter's ``str`` type (UTF-8 bytes on Python 2)."""
        if isinstance(text, str):
            return text
        if isinstance(text, bytes):
            return text.decode('utf-8')
        # Python 2 ``unicode``: encode exactly as the file format expects.
        return text.encode('utf-8')

    def getEngTag(self, tag):
        """Get the tag in English.

        Only the first word of *tag* is used; it is singularised and, when it
        has at least three characters and is not detected as English,
        translated.  Returns an empty string when *tag* contains no word.
        """
        if isinstance(tag, bytes):
            tag = tag.decode('utf-8')
        words = TextBlob(tag).words
        if not words:
            # Empty or punctuation-only tag, e.g. the piece before a leading '#'.
            return ''
        tagName = words[0].singularize()
        if len(tagName) >= 3:
            try:
                lang = tagName.detect_language()
                if lang != 'en':
                    tagName = tagName.translate(from_lang=lang, to='en')
            except Exception:
                # Best effort, like getSummary(): a failed detection or
                # translation must not abort the run; keep the tag unchanged.
                print("[-] Fail to translate the tag : " + self._toNativeStr(tagName))
        return self._toNativeStr(tagName)

    def isFoodTag(self, tag):
        """Check if the tag is related to food.

        Returns True when *tag* equals one stripped line of the food tags
        file, False otherwise or when that file cannot be opened.
        """
        cache = getattr(self, '_foodTagsCache', None)
        if cache is None or cache[0] != self.foodTagsFile:
            try:
                tagsFile = open(self.foodTagsFile, 'r')
            except (IOError, OSError):
                # Nothing is cached on failure, so the next call retries.
                print("[-] Fail to open the file.")
                return False
            with tagsFile:
                known = set(line.strip() for line in tagsFile)
            # The tags file is only read, never written, so it is loaded once
            # instead of being rescanned for every tag.
            cache = self._foodTagsCache = (self.foodTagsFile, known)
        return tag in cache[1]

    def getSummary(self, tag):
        """Get the summary related to a given tag.

        Tries the Wikipedia summary first and falls back to the dictionary
        definitions of the word.  Returns the lower-cased, stop-word-free,
        stemmed text, or None when both lookups fail.
        """
        try:
            summary = wikipedia.summary(str(tag))
        except Exception:
            # 'Exception' rather than a bare except so Ctrl-C still stops the run.
            try:
                summary = " ".join(Word(tag).definitions)
            except Exception:
                print("[-] Fail to retrieve the summary of the tag : " + str(tag))
                return None
        # NOTE(review): an empty definitions list yields an empty summary that
        # is still returned (not None) -- confirm this is intended.
        text = TextBlob(summary)
        filtered_words = [self.stemmer.stem(w) for w in text.words.lower() if w not in self.stop_words]
        return " ".join(filtered_words)

    def savePost(self, file, post_id, summary):
        """Save the pre-processed posts.

        Appends ``<post_id>\\t<summary>\\n`` to *file*.  Returns True on
        success, False when the file cannot be opened.
        """
        try:
            output = open(file, 'a')
        except (IOError, OSError):
            print("[-] Fail to open the file.")
            return False
        with output:
            output.write(str(post_id) + '\t' + self._toNativeStr(summary) + "\n")
        return True

    def checkIfNew(self, file, id_post):
        """Check if a given post has been already treated.

        Returns False when *id_post* is the first tab-separated field of a
        line of *file*, or when *file* cannot be opened; True otherwise.
        """
        try:
            saved = open(file, 'r')
        except (IOError, OSError):
            print("[-] Fail to open the file.")
            return False
        # The file grows during the run (see savePost), so it is re-read here.
        with saved:
            for line in saved:
                if line.split('\t', 1)[0] == id_post:
                    return False
        return True
if __name__ == "__main__":
    # Command-line entry point.  Both file names, including those given with
    # -pf / -ppf, are resolved relative to `directory`.
    directory, postsFile, preproceed_postsFile = "./files/", "posts.txt", "preproceed_posts.txt"
    parser = argparse.ArgumentParser(description='Food clustering project - Preprocess module', epilog="Developed by Paul Pidou.")
    parser.add_argument('-pf', action="store", dest="postsFile", help="Source posts file. By default: posts.txt", nargs=1)
    parser.add_argument('-ppf', action="store", dest="preprocessFile", help="File to save the preprocess posts. By default: preproceed_posts.txt", nargs=1)
    parser.add_argument('--version', action='version', version='%(prog)s 1.0')
    args, unknown = parser.parse_known_args()
    if unknown:
        print('[-] Unknown argument(s) : ' + str(unknown).strip('[]'))
        print('[*] Exiting ...')
        # Same status argparse itself uses for usage errors; 0 would report
        # success to the calling shell.
        sys.exit(2)
    # nargs=1 stores each value as a one-element list.
    if args.postsFile is not None:
        postsFile = args.postsFile[0]
    if args.preprocessFile is not None:
        preproceed_postsFile = args.preprocessFile[0]
    PreProcess(directory + postsFile, directory + preproceed_postsFile)