generate_cv.py
#!/usr/bin/python
import pdb
import features
import sys
import os

class CVScriptEntry(object):
    def __init__(self, concept_line, wv_line, wvi_line, p_line):
        self.concept = concept_line[2:]
        self.wv = wv_line[3:].split()
        self.wvi = []
        self.p = []
        for w1_w2 in wvi_line[4:].split(';'):
            w1_w2_tup = w1_w2.split()
            if len(w1_w2_tup) == 2:
                w1, w2 = w1_w2_tup
                self.wvi.append((w1.strip(), w2.strip()))
        for parent_weight in p_line[2:].split(';'):
            parent_weight_tup = parent_weight.split()
            if len(parent_weight_tup) == 2:
                parent, weight = parent_weight_tup
                self.p.append((parent.strip(), float(weight.strip())))
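
# The layout of a cv-script entry implied by the slicing above is four tagged
# lines followed by a blank separator. The prefix lengths (2, 3, 4, 2 chars)
# are what the code strips; the literal tags shown below are only a guess:
#
#   C <concept>
#   WV <word> <word> ...
#   WVI <word1> <word2>; <word1> <word2>; ...
#   P <parent> <weight>; <parent> <weight>; ...
#   <blank line>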

#def load_cv_script(cv_script_filename='cv-script-nomono'):
def load_cv_script(cv_script_filename):
    cv_script_entries = {}
    fin = open(cv_script_filename)
    while True:
        concept_line = fin.readline().strip()
        if not concept_line:
            break
        wv_line = fin.readline().strip()
        wvi_line = fin.readline().strip()
        p_line = fin.readline().strip()
        fin.readline()  # consume the blank separator line between entries
        entry = CVScriptEntry(concept_line, wv_line, wvi_line, p_line)
        cv_script_entries[entry.concept] = entry
    fin.close()
    return cv_script_entries
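
# Usage sketch (the file name and concept names are placeholders, not files or
# entries known to ship with the repo):
#
#   entries = load_cv_script('cv-script')
#   entry = entries['animal']   # keyed by concept name
#   entry.wv    # seed words, e.g. ['dog', 'cat', ...]
#   entry.wvi   # word pairs to intersect, e.g. [('wild', 'animal'), ...]
#   entry.p     # weighted parents, e.g. [('mammal', 0.7), ...]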

cached_pmi_models = {}


def get_pmi_model_OLD1(wc_dir, noun):
    if cached_pmi_models.has_key(noun):
        return cached_pmi_models[noun]
    else:
        pmi_model = features.PMINounModel()
        filename = pmi_model.lemma_to_filename(noun, wc_dir)
        try:
            os.stat(filename)
        except OSError:
            return None
        pmi_model._load_from_pmi_file(filename)
        cached_pmi_models[noun] = pmi_model
        return pmi_model
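
# Note: the _OLD1 variant above memoizes loaded models in cached_pmi_models;
# the version below reloads from disk on every call and leaves the cache unused.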

def get_pmi_model(wc_dir, noun):
    pmi_model = features.PMINounModel()
    filename = pmi_model.lemma_to_filename(noun, wc_dir)
    try:
        os.stat(filename)
    except OSError:
        # no PMI file on disk for this noun
        return None
    pmi_model._load_from_pmi_file(filename)
    return pmi_model

def generate_cv_entries(cv_entries_dict, input_dir, output_dir):
    nm = features.PMINounModel()
    for concept in cv_entries_dict:
        filename = nm.lemma_to_filename(concept, output_dir)
        # skip concepts whose output PMI file already exists
        concept_exists_already = False
        try:
            os.stat(filename)
            concept_exists_already = True
        except OSError:
            pass
        if not concept_exists_already:
            generate_cv(cv_entries_dict[concept], cv_entries_dict, input_dir, output_dir)

def generate_cv(cv_entry, cv_entries_dict, input_dir, output_dir):
    concept = cv_entry.concept
    # check if it exists on disk already
    pmi_model = features.PMINounModel()
    filename = pmi_model.lemma_to_filename(concept, output_dir)
    try:
        os.stat(filename)
        pmi_model._load_from_pmi_file(filename)
        return pmi_model
    except OSError:
        pass
    # not on disk, so build it from the entry's seed words and word-pair
    # intersections (the recursive parent step is disabled in this version)
    wv_models = []
    for word in cv_entry.wv:
        word_pmi_model = get_pmi_model(input_dir, word)
        if word_pmi_model:
            wv_models.append(word_pmi_model)
    for word1, word2 in cv_entry.wvi:
        word1_pmi_model = get_pmi_model(input_dir, word1)
        word2_pmi_model = get_pmi_model(input_dir, word2)
        if word1_pmi_model and word2_pmi_model:
            wv_models.append(word1_pmi_model.intersection(word2_pmi_model))
    #for parent, weight in cv_entry.p:
    #    parent_cv_entry = cv_entries_dict[parent]
    #    parent_pmi_model = generate_cv(parent_cv_entry, cv_entries_dict, input_dir, output_dir)
    #    parent_pmi_model.scale(weight)
    #    wv_models.append(parent_pmi_model)
    concept_pmi_model = features.PMINounModel()
    concept_pmi_model.noun = concept
    for model in wv_models:
        concept_pmi_model.union_max(model)
    concept_pmi_model.save_to_file(output_dir)
    return concept_pmi_model
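
# Sketch of generating a single concept vector outside of main() (directory
# and concept names are placeholders; features.PMINounModel is assumed to
# behave as used above):
#
#   entries = load_cv_script('cv-script')
#   model = generate_cv(entries['animal'], entries, 'pmi-input/', 'pmi-output/')
#
# The result is the union_max over the PMI models of the entry's seed words
# and of each word-pair intersection, and it is also written to the output
# directory via save_to_file.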

# uses cv of the parents
def generate_cv_old_recursive(cv_entry, cv_entries_dict, input_dir, output_dir):
    concept = cv_entry.concept
    # check if it exists on disk already
    pmi_model = features.PMINounModel()
    filename = pmi_model.lemma_to_filename(concept, output_dir)
    try:
        os.stat(filename)
        pmi_model._load_from_pmi_file(filename)
        return pmi_model
    except OSError:
        pass
    # since it doesn't exist on disk, compute it recursively
    wv_models = []
    for word in cv_entry.wv:
        word_pmi_model = get_pmi_model(input_dir, word)
        if word_pmi_model:
            wv_models.append(word_pmi_model)
    for word1, word2 in cv_entry.wvi:
        word1_pmi_model = get_pmi_model(input_dir, word1)
        word2_pmi_model = get_pmi_model(input_dir, word2)
        if word1_pmi_model and word2_pmi_model:
            wv_models.append(word1_pmi_model.intersection(word2_pmi_model))
    for parent, weight in cv_entry.p:
        parent_cv_entry = cv_entries_dict[parent]
        parent_pmi_model = generate_cv(parent_cv_entry, cv_entries_dict, input_dir, output_dir)
        parent_pmi_model.scale(weight)
        wv_models.append(parent_pmi_model)
    concept_pmi_model = features.PMINounModel()
    concept_pmi_model.noun = concept
    for model in wv_models:
        concept_pmi_model.union_max(model)
    concept_pmi_model.save_to_file(output_dir)
    return concept_pmi_model

def main():
    if len(sys.argv) != 4:
        print 'Arguments: <cv-script> <input pmi dir> <output pmi dir>'
        return
    cv_script = sys.argv[1]
    input_dir = sys.argv[2]
    output_dir = sys.argv[3]
    cv_script_entries = load_cv_script(cv_script)
    generate_cv_entries(cv_script_entries, input_dir, output_dir)


if __name__ == "__main__":
    main()
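
# Command-line usage (paths are placeholders):
#
#   python generate_cv.py cv-script /path/to/input_pmi_dir /path/to/output_pmi_dir
#
# For every concept in the cv-script whose PMI file is not yet present in the
# output directory, generate_cv_entries builds and saves a concept vector.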