# vislcg3-split-space.py
# IN:
# "<Сөздерін>"
# "сөз" n pl px3sp acc @obj #1->4
# "<екеуі де>"
# "екеу" num coll subst px3sp nom @nsubj #2->4
# "да" postadv @advmod #3->2
# "<қабыл алды>"
# "қабыл ал" v tv ifi p3 sg @root #4->0
# "<.>"
# "." sent @punct #5->4
# OUT:
# "<Сөздерін>"
# "сөз" n pl px3sp acc @obj #1->5
# "<екеуі де>"
# "екеу" num coll subst px3sp nom @nsubj #2->5
# "да" postadv @advmod #3->2
# "<қабыл>"
# "қабыл" x @compound #4->5
# "<алды>"
# "ал" v tv ifi p3 sg @root #5->0
# "<.>"
# "." sent @punct #6->5
import sys;
# If a token has a space in both its lemma and its surface form, and there is
# a single analysis line, split the lemma and the surface form into separate
# tokens; head indexes and token indexes >= the current index are then
# incremented by the number of new tokens (the offset).
def break_token(t, idx, idmax): #{
    # t is a (surface, analysis) pair, e.g.:
    # ('"<Сөздерін>"\n', '\t"сөз" n pl px3sp acc @obj ')
    surf = t[0].strip()[2:-2].split(' ');
    lem = t[1].split('"')[1].split(' ');
    tags = '';
    if idx == idmax: #{
        # The last chunk keeps the original tags and syntactic function
        tags = ' '.join(t[1].split('"')[2:]);
    else: #{
        # Earlier chunks get a placeholder analysis
        tags = ' x @dep ';
    #}
    return ('"<' + surf[idx] + '>"\n', '\t"' + lem[idx] + '"' + tags);
#}
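# A worked example of break_token() on the multiword token from the sample
# above (the return values follow from the logic of the function):
#
#   break_token(('"<қабыл алды>"\n', '\t"қабыл ал" v tv ifi p3 sg @root '), 0, 1)
#     => ('"<қабыл>"\n', '\t"қабыл" x @dep ')
#   break_token(('"<қабыл алды>"\n', '\t"қабыл ал" v tv ifi p3 sg @root '), 1, 1)
#     => ('"<алды>"\n', '\t"ал" v tv ifi p3 sg @root ')
#
# Note that non-final chunks are labelled 'x @dep' here, while the OUT sample
# in the header shows '@compound'; presumably a later rule relabels the
# placeholder.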
def kasitella(heads, tokens, cur_sur, max_tok): #{
    #print('!!!', cur_sur, max_tok, '|||', heads, tokens, file=sys.stderr);
    cur_tok = 0;
    # While we are not at the end of the sentence
    while cur_tok <= max_tok: #{
        new_tokens = {};
        new_heads = {};
        # For each of the tokens in the tree
        for i in tokens.keys(): #{
            lem = '"'.join(tokens[i][1].split('"')[0:2]).strip();
            # If we have found a token we can split, i.e. the lemma contains
            # at least one space and the number of spaces in the surface form
            # matches the number of spaces in the lemma
            if tokens[i][0].strip().count(' ') == lem.count(' ') and lem.count(' ') > 0: #{
                print('¶ [', cur_tok, max_tok, '] ¶', i, '|||', tokens[i], heads[i], file=sys.stderr);
                offset = lem.count(' '); # FIXME: This was below
                # For each of the tokens in the tree
                for j in tokens.keys(): #{
                    # If the current token matches the index we are processing
                    if j == i: #{
                        local_max = lem.count(' ');
                        # Split the token into local_max+1 new tokens
                        for k in range(0, local_max+1): #{
                            new_tokens[j+k] = break_token(tokens[j], k, local_max);
                        #}
                        # All chunks but the last depend on the last chunk
                        for k in range(0, local_max): #{
                            new_heads[j+k] = j+local_max;
                        #}
                        print('§', new_tokens, file=sys.stderr);
                        print('½', heads[j], i, file=sys.stderr);
                        if heads[j] >= i: #{
                            new_heads[j+local_max] = heads[j]+local_max;
                        else: #{
                            new_heads[j+local_max] = heads[j];
                        #}
                        print('\t@|j: %d; i: %d; heads[j]: %d; offset: %d; %s|' % (j, i, heads[j], offset, lem), file=sys.stderr);
                        print('§', new_tokens, file=sys.stderr);
                        #@ offset += local_max; # FIXME: Moved from here
                    elif j > i: #{
                        # Tokens after the split point are shifted right
                        new_tokens[j+offset] = tokens[j];
                        if heads[j] >= i: #{
                            new_heads[j+offset] = heads[j]+offset;
                        else: #{
                            new_heads[j+offset] = heads[j];
                        #}
                        print('\t!|j: %d; i: %d; heads[j]: %d; offset: %d; %s|' % (j, i, heads[j], offset, lem), file=sys.stderr);
                    else: #{
                        # Tokens before the split point keep their index, but
                        # their heads may still need shifting
                        new_tokens[j] = tokens[j];
                        if heads[j] >= i: #{
                            new_heads[j] = heads[j]+offset;
                        else: #{
                            new_heads[j] = heads[j];
                        #}
                        print('\t%%|j: %d; i: %d; heads[j]: %d; offset: %d; %s|' % (j, i, heads[j], offset, lem), file=sys.stderr);
                    #}
                #}
                print('===', new_tokens, file=sys.stderr);
                print('===', new_heads, file=sys.stderr);
                cur_tok = i;
                break;
            else: #{
                print('[', cur_tok, max_tok, '] >', tokens, file=sys.stderr);
                print('® [', cur_tok, max_tok, '] § >', i, '|||', tokens[i], '|||', file=sys.stderr);
                new_tokens[i] = tokens[i];
                new_heads[i] = heads[i];
                cur_tok = i+1;
                #break;
            #}
        #}
        print(cur_tok, '////////////////////////////////////////////////////////////////////////', file=sys.stderr);
        tokens = new_tokens;
        heads = new_heads;
    #}
    for i in tokens.keys(): #{
        #print(i, tokens[i], heads[i])
        print(tokens[i][0] + tokens[i][1] + '#' + str(i) + '->' + str(heads[i]));
    #}
#}
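# How the reindexing plays out on the header sample: token 4 ("<қабыл алды>")
# splits into tokens 4 and 5 (offset 1), so every head index >= 4 is bumped
# by 1: #1->4 and #2->4 become #1->5 and #2->5, the full stop moves from
# #5->4 to #6->5, and #3->2 is left untouched.
#
# A minimal self-contained call (a hypothetical two-token sentence) would be:
#
#   kasitella({1: 0, 2: 1},
#             {1: ('"<қабыл алды>"\n', '\t"қабыл ал" v tv ifi p3 sg @root '),
#              2: ('"<.>"\n', '\t"." sent @punct ')},
#             '', 2)
#
# which prints three tokens: #1->2 (the x @dep chunk), #2->0 (the root)
# and #3->2 (the full stop).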
heads = {};
tokens = {};
lineno = 0;
cur_sur = '';
max_tok = 0;
for line in sys.stdin.readlines(): #{
    # A blank line marks the end of a sentence: process it and reset the state
    if line.strip() == '' and max_tok != 0: #{
        print('$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$', file=sys.stderr);
        kasitella(heads, tokens, cur_sur, max_tok);
        heads = {};
        tokens = {};
        cur_sur = '';
        max_tok = 0; # Reset, otherwise a second consecutive blank line would run kasitella() on an empty sentence and never terminate
        print('');
        continue;
    #}
    if line[0] == '"': #{
        # Surface form line, e.g. "<Сөздерін>"
        cur_sur = line;
    elif line[0] == '\t': #{
        # Analysis line: split off the trailing '#token->head' dependency part
        row = line.split('#');
        anal = row[0];
        (d, h) = row[1].replace('->', '\t').split('\t');
        head = int(h);
        toki = int(d);
        heads[toki] = head;
        tokens[toki] = (cur_sur, anal);
        cur_sur = '';
        max_tok = toki;
    elif line[0] == '#': #{
        # Comment line: pass through unchanged
        print(line.strip('\n'));
    else: #{
        print('[', lineno, '] Invalid:', line, file=sys.stderr);
    #}
    lineno += 1;
#}
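# Parsing of a single analysis line, step by step (values from the header
# sample; the trailing newline is harmless because int() strips whitespace):
#
#   line = '\t"сөз" n pl px3sp acc @obj #1->4\n'
#   line.split('#')             # ['\t"сөз" n pl px3sp acc @obj ', '1->4\n']
#   row[1].replace('->', '\t')  # '1\t4\n'
#   (d, h)                      # ('1', '4\n') -> toki 1, head 4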