-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy pathconllu-nospaceafter.py
113 lines (100 loc) · 1.95 KB
/
conllu-nospaceafter.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
import sys;
def e(s):
    """Return *s* with every space character replaced by a middle dot ('·').

    Only the plain space ' ' is substituted; other whitespace is left as is.
    Handy for making spaces visible in diagnostic output.
    """
    pieces = s.split(' ')
    return '·'.join(pieces)
def calc_spaceafter(a, b):
    """Align sentence text with its tokens and report the spacing after each token.

    Parameters:
        a: the sentence text.
        b: the token forms joined by single spaces. Every underscore in ``b``
           is removed before anything else happens.

    Returns:
        A dict keyed by 0-based token index (position of the token in ``b``).
        The value is ``''`` when the boundary after that token coincides with
        a space in ``a`` and ``'SpaceAfter=No'`` when ``a`` has no space
        there. A token that is not followed by a boundary in ``b`` (normally
        the last one) has no entry.

        If ``a`` and ``b`` differ once all spaces are removed, both strings
        are reported on stderr and an empty dict is returned.
    """
    b = b.replace('_', '')

    a_compact = a.replace(' ', '')
    b_compact = b.replace(' ', '')
    if a_compact != b_compact:
        print('[nospace] ERR[a]:', a_compact, '||', a, file=sys.stderr)
        print('[nospace] ERR[b]:', b_compact, '||', b, file=sys.stderr)
        return {}

    m = {}
    idx = 0  # index of the token currently being walked in b
    i = 0    # cursor into a
    j = 0    # cursor into b
    len_a = len(a)
    len_b = len(b)

    # Stop as soon as either string is exhausted: a may carry trailing spaces
    # that b does not have, so b can run out first.
    while i < len_a and j < len_b:
        ch_a = a[i]
        ch_b = b[j]
        if ch_a == ch_b:
            if ch_a == ' ':
                # Token boundary that is also a space in the text.
                m[idx] = ''
                idx += 1
            i += 1
            j += 1
        elif ch_b == ' ':
            # Token boundary in b with no space in the text at this point.
            m[idx] = 'SpaceAfter=No'
            idx += 1
            j += 1
        elif ch_a == ' ':
            # Extra space in the text (e.g. a double space) that is not a
            # token boundary: skip it and keep aligning instead of giving up.
            i += 1
        else:
            # Two different non-space characters. The equality check above
            # rules this out; bail out defensively rather than loop forever.
            break
    return m
def _sentence_lines(block):
    """Return the non-blank lines of one block, stopping at a blank line that follows content.

    Blank lines at the very start of the block are skipped. They appear when
    the input separates sentences with more than one empty line, and must not
    cause the sentence that follows them to be discarded.
    """
    lines = []
    for line in block.split('\n'):
        if line.strip() == '':
            if lines:
                break
            continue
        lines.append(line)
    return lines


# Read all of stdin, treat every '\n\n'-separated chunk as one sentence block,
# and echo it to stdout with 'SpaceAfter=No' added to the tenth column of each
# token that calc_spaceafter() reports as not followed by a space.
for blokk in sys.stdin.read().split('\n\n'):
    lines = _sentence_lines(blokk)

    # Pass 1: pick up the sentence text and the forms of the plain tokens.
    a = ''
    forms = []
    for line in lines:
        if line.count('# text = ') > 0:
            a = line.split('# text = ')[1].strip()
            continue
        if line[0] == '#':
            continue
        row = line.split('\t')
        # IDs containing '.' or '-' are not part of the aligned token
        # sequence, so they are left out of b.
        if '.' in row[0] or '-' in row[0]:
            continue
        forms.append(row[1])
    b = ' '.join(forms)

    spaces = calc_spaceafter(a, b)

    # Pass 2: print the block back, annotating tokens where needed.
    for line in lines:
        if line[0] == '#':
            print(line)
            continue
        row = line.split('\t')
        if '.' in row[0] or '-' in row[0]:
            print(line)
            continue
        idx = int(row[0]) - 1  # token IDs are 1-based, spaces is 0-based
        if spaces.get(idx, '') == '':
            print(line)
            continue
        if row[9] == '_':
            row[9] = 'SpaceAfter=No'
        elif 'SpaceAfter=No' not in row[9].split('|'):
            # Only append when missing, so re-running the script on its own
            # output does not pile up duplicate entries.
            row[9] += '|SpaceAfter=No'
        print('\t'.join(row))

    # Blank separator line after every block that had any content.
    if blokk.strip() != '':
        print('')