-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathindex_words.py
executable file
·43 lines (38 loc) · 1013 Bytes
/
index_words.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
#!/usr/bin/python
import features
import glob
from nltk.corpus import wordnet as nlwn
import sys
import pdb
import operator
import os
import pdb
import sqlite3
import simplejson as json
import math
from bz2 import BZ2File
def get_feature_set(source_dir):
    """Collect the first line of every ``*.pmi.bz2`` file in a directory.

    Every file matching ``<source_dir>/*.pmi.bz2`` is opened as a bz2
    stream and only its first line is read. That line, stripped of
    surrounding whitespace, is added to the result. Progress is printed
    to stdout, one line per file.

    Args:
        source_dir: Directory whose ``*.pmi.bz2`` files are scanned
            (the glob is not recursive).

    Returns:
        A set of the distinct stripped first lines. It is empty when no
        file matches; an empty file contributes the empty string.
    """
    print('Loading models...')
    feature_set = set()
    pattern = '%s/*.pmi.bz2' % source_dir
    for n, pmi_filename in enumerate(glob.glob(pattern)):
        # A single pre-formatted argument prints the same text whether
        # ``print`` is a statement or a function.
        print('Loading: %s %s' % (n, pmi_filename))
        fin = BZ2File(pmi_filename)
        try:
            word = fin.readline().strip()
        finally:
            # Release the handle even if decompression fails mid-read.
            fin.close()
        if not isinstance(word, str):
            # Runtimes where BZ2File yields bytes: normalise to text so the
            # set holds plain strings. No-op where readline() returns str.
            word = word.decode('utf-8')
        # set.add already ignores duplicates; no membership test needed.
        feature_set.add(word)
    return feature_set
def save_feature_set(features, filename):
    """Write every item of ``features`` to ``filename``, one per line.

    The file is opened in text write mode, so any existing content is
    replaced. Items are written in the iteration order of ``features``,
    each formatted with ``%s`` and followed by a newline.

    Args:
        features: Iterable of items to write.
        filename: Path of the output file.
    """
    # The ``with`` block guarantees the file is closed (and flushed) even
    # when a write raises part-way through.
    with open(filename, 'w') as fout:
        # The 1-tuple keeps ``%`` formatting correct if an item is a tuple.
        fout.writelines('%s\n' % (word,) for word in features)
def main():
    """Command-line entry point: build the feature set and save it.

    Reads ``sys.argv[1]`` as the directory to scan and ``sys.argv[2]`` as
    the output file, passes the directory to ``get_feature_set`` and hands
    the result to ``save_feature_set``. Arguments beyond the second are
    ignored.

    Raises:
        SystemExit: With a usage message when fewer than two arguments
            are supplied.
    """
    if len(sys.argv) < 3:
        # Fail with a readable message instead of a bare IndexError.
        prog = sys.argv[0] if sys.argv else 'index_words.py'
        sys.exit('usage: %s SOURCE_DIR FEATURES_FILE' % prog)
    source_dir = sys.argv[1]
    features_file = sys.argv[2]
    # No local named ``features``: that would shadow the imported module.
    save_feature_set(get_feature_set(source_dir), features_file)


if __name__ == "__main__":
    main()