#!/usr/bin/env python3
import json
import os
import rdflib as r
from rdflib import Graph, Namespace
from rdflib.namespace import RDF
from rdflib.term import Literal
import click
WN30 = Namespace("https://w3id.org/own-pt/wn30/schema/")
WN30EN = Namespace("https://w3id.org/own-pt/wn30-en/instances/")
WN30PT = Namespace("https://w3id.org/own-pt/wn30-pt/instances/") # not used (yet)
CURRENT_LANG = Literal("en")
SYNSET_RELATIONS, WORD_RELATIONS, FRAMES_TO_ID = {}, {}, {}
LEXICOGRAPHER_FILE = WN30['lexicographerFile']
LANG = WN30['lang']
CONTAINS_WORDSENSE = WN30['containsWordSense']
SAME_AS = WN30['sameAs']
LEXICAL_ID = WN30['lexicalId']
LEXICAL_FORM = WN30['lexicalForm']
WORD = WN30['word']
@click.group()
def cli():
return None
@cli.command()
@click.argument('rdf_input',
type=click.File(mode="rb"), required=True)
@click.argument('config_dir',
type=click.Path(exists=True, file_okay=False, resolve_path=True), required=True)
@click.argument('output_dir'
, type=click.Path(file_okay=False, resolve_path=True, writable=True), required=True)
@click.option('-f', '--rdf-file-format', 'rdf_file_format'
, type=click.STRING, default='nt', show_default=True,
help="RDF input format. Must be accepted by RDFlib.")
def to_text(rdf_input, config_dir, output_dir, rdf_file_format="nt"):
"""Convert RDF_INPUT to lexicographer files placed at OUTPUT_DIR,
according to the configuration files in CONFIG_DIR."""
global SYNSET_RELATIONS, WORD_RELATIONS, FRAMES_TO_ID
(SYNSET_RELATIONS, WORD_RELATIONS, FRAMES_TO_ID) = read_config(config_dir)
graph = Graph()
graph.parse(rdf_input, format=rdf_file_format)
print_graph(graph, output_dir)
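# A hedged usage sketch (the file and directory names below are
# hypothetical, not part of the repository):
#
#   python3 wn2text.py to-text wn30.nt config/ output/
#
# With recent versions of Click the command name is "to-text" (underscores
# in the function name become dashes).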
###
## rdf -> text
def read_config(config_dir):
def read_tsv(file_path, line_function, initial_value):
def read_line(line):
return list(map(str.strip, line.split(sep='\t')))
res = initial_value
with open(file_path, 'r') as input_stream:
field_names = read_line(next(input_stream))
number_fields = len(field_names)
for line_number, line in enumerate(input_stream):
line_fields = read_line(line)
number_line_fields = len(line_fields)
                if number_line_fields == 1 and line_fields[0].lstrip().startswith('--'):
                    pass  # skip comment lines
elif number_line_fields == number_fields:
res = line_function(res, line_fields)
else:
print("Error: while reading TSV file {}, uneven number of fields; should have {} fields, but has {} fields in line {}".format(
file_path, number_fields, number_line_fields, 1 + line_number))
break
return res
def read_relations(res, fields):
(synset_relations, word_relations) = res
relation_name = fields[3]
relation_text = fields[2]
if relation_text != "_":
if fields[5] == "word":
word_relations[relation_name] = relation_text
elif fields[5] == "synset":
synset_relations[relation_name] = relation_text
else:
word_relations[relation_name] = relation_text
synset_relations[relation_name] = relation_text
return (synset_relations, word_relations)
def read_frames(res, fields):
res[fields[1]] = fields[0]
return res
synset_relations, word_relations = read_tsv(os.path.join(
config_dir, "relations.tsv"), read_relations, ({}, {}))
frames = read_tsv(os.path.join(config_dir, "frames.tsv"),
read_frames, {})
return (synset_relations, word_relations, frames)
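# A sketch of the TSV layout read_config assumes, inferred from the field
# indices above (the column names and sample rows are hypothetical):
#
# relations.tsv -- read_relations uses column 2 as the text name, column 3
# as the RDF relation name, and column 5 as the domain ("word", "synset",
# or anything else for both); a "_" text name means the relation is skipped:
#
#   c0	c1	text_name	rdf_name	c4	domain	c6
#   ..	..	hyper	hypernymOf	..	synset	..
#
# frames.tsv -- read_frames maps column 1 (the frame template text) to
# column 0 (its identifier):
#
#   id	frame
#   1	Something ----s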
def print_graph(graph, output_dir):
global CURRENT_LANG
lex_files = set(graph.objects(predicate=LEXICOGRAPHER_FILE))
langs = set(graph.objects(predicate=LANG))
if not langs:
langs = [CURRENT_LANG]
for synset in graph.subjects(CONTAINS_WORDSENSE):
graph.add((synset, LANG, CURRENT_LANG))
for lang in langs:
CURRENT_LANG = lang
for lexicographer_file in lex_files:
print_lexfile(graph, lexicographer_file, output_dir)
def sort_word_senses(graph, synset):
def word_sense_form(ws):
word = graph.value(ws, WORD)
lexical_form = graph.value(word, LEXICAL_FORM)
return lexical_form
#
wordsenses = list(graph.objects(synset, CONTAINS_WORDSENSE))
if wordsenses:
        return sorted(wordsenses, key=word_sense_form)
else:
print("missing ws for {}".format(synset))
def sort_synsets(graph, synsets):
# careful when changing this, other scripts depend on this function
synsets_word_senses = map(lambda ss: (ss, sort_word_senses(graph, ss)), synsets)
result = sorted(synsets_word_senses, key=lambda i: word_sense_id(graph, "", i[1][0]))
return result
def print_lexfile(graph, lexicographer_file, output_dir):
lexfile_synsets = graph.subjects(predicate=LEXICOGRAPHER_FILE, object=lexicographer_file)
    lang_synsets = list(filter(lambda s: graph.value(s, LANG) == CURRENT_LANG,
                               lexfile_synsets))
    if lang_synsets:
        lang_dir = os.path.join(output_dir, CURRENT_LANG)
        os.makedirs(lang_dir, exist_ok=True)  # per-language subdirectory may not exist yet
        with open(os.path.join(lang_dir, lexicographer_file), 'w+') as output_stream:
write = lambda data, *args, **kwargs: print(data, file=output_stream,
*args, **kwargs)
pos, lexname = lexicographer_file.split(".")
write("{}.{}".format(pos, lexname), end="\n\n")
for synset, sorted_word_senses in sort_synsets(graph, lang_synsets):
print_synset(graph, synset, sorted_word_senses, lexicographer_file, write)
def word_sense_id(graph, lexicographer_file, word_sense):
word = graph.value(word_sense, WORD)
word_form = graph.value(word, LEXICAL_FORM)
lexical_id = graph.value(word_sense, LEXICAL_ID)
in_synset = graph.value(
predicate=CONTAINS_WORDSENSE, object=word_sense)
in_lang = graph.value(in_synset, LANG)
in_lexfile = graph.value(
subject=in_synset, predicate=LEXICOGRAPHER_FILE)
if None not in (word, word_form, lexical_id, in_synset, in_lexfile, in_lang):
return (in_lang, in_lexfile, word_form, int(lexical_id))
else:
raise LookupError("Error: missing wordsense information for wordsense {}.\n word_form: {}, lexical_id: {}, in_synset: {}, in_lexfile: {}".format(word_sense, word_form, lexical_id, in_synset, in_lexfile))
def print_word_sense_id(wordsense_id, lexicographer_file=None):
(in_lang, in_lexfile, word_form, lexical_id) = wordsense_id
lexfile_str = "{}:".format(in_lexfile) if in_lexfile.neq(lexicographer_file) else ""
prefix = "{}".format(lexfile_str) if in_lang == CURRENT_LANG else "@{}:{}:".format(in_lang, in_lexfile)
return "{}{}{}".format(prefix,
word_form,
" {}".format(lexical_id) if lexical_id != 0 else "")
def print_synset(graph, synset, sorted_word_senses, lexicographer_file, write):
def print_relations():
def print_relation(name, wordsense_id):
return "{}: {}".format(name,
print_word_sense_id(wordsense_id, lexicographer_file))
rels = []
for predicate, obj in graph.predicate_objects(synset):
_, predicate_name = r.namespace.split_uri(predicate)
predicate_txt_name = SYNSET_RELATIONS.get(predicate_name, None)
if predicate_name in ["frame", "containsWordSense", "gloss",
"example", "lexicographerFile", "lexicalForm"]:
pass
elif predicate_txt_name:
rels.append((predicate_txt_name,
word_sense_id(graph, lexicographer_file,
# first word sense is head
sort_word_senses(graph, obj)[0])))
# # we can't raise this error because we decided to ignore lots of relations
# else:
# raise LookupError("{} relation not found".format(predicate_name))
rels.sort()
for (relation_name, target) in rels:
write(print_relation(relation_name, target))
    def print_synset_gloss_split_into_definition_and_examples(gloss):
def remove_quotes(example):
if example[-1] == "\"" and "\"" not in example[:-1]:
return example.strip("\"")
else:
return "\"" + example
def_examples = gloss.split("; \"")
definition = def_examples[0].strip()
examples = def_examples[1:]
write("{}: {}".format(SYNSET_RELATIONS["definition"], definition))
for example in examples:
write("{}: {}".format(
SYNSET_RELATIONS["example"], remove_quotes(example.strip())))
for word_sense in sorted_word_senses:
print_word_sense(graph, word_sense, lexicographer_file, write)
# definition
    print_synset_gloss_split_into_definition_and_examples(
        graph.value(synset, WN30["gloss"]))
# examples
for example in graph.objects(synset, WN30["example"]):
write("{}: {}".format(SYNSET_RELATIONS["example"], example))
# frames
frames = graph.objects(synset, WN30["frame"])
frame_ids = list(map(lambda frame: FRAMES_TO_ID[frame.n3()[1:-1]], frames))
if frame_ids:
write("{}: {}".format(SYNSET_RELATIONS['frame'], " ".join(frame_ids)))
print_relations()
write("")
def print_word_sense(graph, word_sense, lexicographer_file, write):
def print_word_relations():
def print_word_relation(name, wordsense_id):
return " {} {}".format(name,
print_word_sense_id(wordsense_id, lexicographer_file))
frames = []
relations = []
markers = []
for predicate, obj in graph.predicate_objects(word_sense):
_, predicate_name = r.namespace.split_uri(predicate)
predicate_txt_name = WORD_RELATIONS.get(predicate_name, None)
if predicate_name == "frame":
frames.append(FRAMES_TO_ID[str(obj)])
elif predicate_name == "syntacticMarker":
markers.append(obj)
elif predicate_txt_name:
relations.append((predicate_txt_name,
word_sense_id(graph, lexicographer_file, obj)))
# # we can't raise this error because we decided to ignore lots of relations
# else:
            # raise LookupError("{} not found".format(predicate_name))
if frames:
frames.sort()
write(" {} {}".format(
WORD_RELATIONS["frame"], " ".join(frames)), end="")
if markers:
[marker] = markers # check that there is only one marker
write(" {} {}".format(
WORD_RELATIONS["syntacticMarker"], marker), end="")
relations.sort()
for relation_name, target in relations:
write(print_word_relation(relation_name, target), end="")
write("")
write("{}: {}".format(SYNSET_RELATIONS['containsWordSense'],
print_word_sense_id(word_sense_id(graph,
lexicographer_file,
word_sense),
lexicographer_file)),
end="")
print_word_relations()
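# Since every write above uses end="", a sense with frames and relations
# stays on one line; e.g. (hypothetical labels from relations.tsv):
#
#   w: dog frames 2 8 ant cat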
@cli.command()
@click.argument('original_file', type=click.File(mode="rb"), required=True)
@click.argument('new_file', type=click.File(mode="rb"), required=True)
@click.option('-f', '--rdf-format', 'rdf_format'
              , type=click.STRING, default='nt', show_default=True,
              help="RDF format of both input files. Must be accepted by RDFlib.")
def check_conversion(original_file, new_file, rdf_format):
    # check that relations between synsets and wordsenses are preserved
    ## not checking lexical forms, lexical ids, and other literals,
    ## because missing them would have caused syntactic problems (plus we
    ## don't export syntacticMarker yet, for instance)
def new_uri(lexicographer_file, original_uri, wn_obj):
if wn_obj == "synset":
sorted_wordsenses = sort_word_senses(original_g, original_uri)
wordsense = sorted_wordsenses[0]
elif wn_obj == "wordsense":
wordsense = original_uri
else:
            raise ValueError("argument wn_obj must be either synset or wordsense")
word = original_g.value(wordsense, WORD)
lexical_form = original_g.value(word, LEXICAL_FORM)
lexical_id = original_g.value(wordsense, LEXICAL_ID)
return WN30EN["{}-{}-{}-{}".format(wn_obj, lexicographer_file,
lexical_form, lexical_id)]
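    # For instance (hypothetical values), a wordsense for "dog" with
    # lexical id 0 in noun.animal is minted as
    # WN30EN["wordsense-noun.animal-dog-0"].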
#
original_g = Graph()
new_g = Graph()
original_g.parse(original_file, format=rdf_format)
new_g.parse(new_file, format=rdf_format)
for (original_en_synset, subj_lexfile) in original_g.subject_objects(LEXICOGRAPHER_FILE): # for every synset
for (predicate, obj) in original_g.predicate_objects(original_en_synset): # for every relation
obj_lexfile = original_g.value(obj, LEXICOGRAPHER_FILE, any=False)
if predicate == CONTAINS_WORDSENSE:
original_en_wordsense = obj
                # note: this inner loop deliberately rebinds predicate
                # and obj from the enclosing loop
                for (predicate, obj) in original_g.predicate_objects(original_en_wordsense):
                    obj_synset = original_g.value(predicate=CONTAINS_WORDSENSE, object=obj)
                    if obj_synset:  # truthy only when obj is itself a wordsense
                        obj_lexfile = original_g.value(obj_synset, LEXICOGRAPHER_FILE)
new_en_wordsense = new_uri(subj_lexfile, original_en_wordsense, "wordsense")
new_en_obj = new_uri(obj_lexfile, obj, "wordsense")
if (new_en_wordsense, predicate, new_en_obj) not in new_g:
print("wordsense relation {} missing between {} and {}".format(predicate, new_en_wordsense, new_en_obj))
            elif obj_lexfile:  # truthy only when obj is a synset too
new_en_synset = new_uri(subj_lexfile, original_en_synset, "synset")
new_en_obj = new_uri(obj_lexfile, obj, "synset")
if (new_en_synset, predicate, new_en_obj) not in new_g:
print("synset relation {} missing between {} and {}".format(predicate, new_en_synset, new_en_obj))
new_pt_synset = new_g.value(predicate=SAME_AS, object=new_en_synset)
new_pt_obj = new_g.value(predicate=SAME_AS, object=new_en_obj)
if (new_pt_synset, predicate, new_pt_obj) not in new_g:
print("synset relation {} missing between {} and {}".format(predicate, new_pt_synset, new_pt_obj))
if __name__ == '__main__':
cli()