-
Notifications
You must be signed in to change notification settings - Fork 8
/
Copy pathbuild.py
290 lines (257 loc) · 10.4 KB
/
build.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
import csv
import dataclasses
from dataclasses import dataclass
import re
@dataclasses.dataclass
class 小韻Row:
    """One record of ``src/小韻表.tsv`` without its trailing 細分轄字 column.

    All fields hold the raw tab-separated text of the corresponding column.
    """

    # Identifier of the 小韻; the loader treats a trailing letter as a
    # subdivision marker.
    小韻號: str
    # Expected first 字頭 of the 小韻 (checked by the build).
    首字: str
    # 反切 of the 小韻; the build treats '-' as "none".
    反切: str
    # Copied verbatim to the output.
    音韻地位: str
def load_小韻表() -> tuple[
    dict[str, 小韻Row], dict[str, list[str]], dict[str, str]
]:
    """Load ``src/小韻表.tsv``.

    Returns:
        A 3-tuple of

        * every row as a ``小韻Row``, keyed by 小韻號;
        * for each 原書小韻號 that has subdivisions, the list of its
          subdivision letters in file order;
        * the raw 細分轄字 column (a single string), keyed by the
          subdivided 小韻號.

    Raises:
        AssertionError: if the header is not the expected five columns
            (this includes an empty file).
        ValueError: if a data row does not have exactly five
            tab-separated fields.
    """
    expected_header = ['小韻號', '首字', '反切', '音韻地位', '細分轄字']

    小韻_data: dict[str, 小韻Row] = {}
    細分號_by_原書小韻: dict[str, list[str]] = {}
    # Values are the unsplit 細分轄字 strings; consumers do substring tests
    # and ``set(...)`` on them, so they are typed as ``str``.
    細分轄字_by_小韻: dict[str, str] = {}

    # The file is CJK text: pin the encoding instead of relying on the
    # platform's locale default.
    with open('src/小韻表.tsv', encoding='utf-8') as fin:
        # Default of '' turns an empty file into the header assertion
        # failure below rather than a bare StopIteration.
        header = next(fin, '')
        assert header.rstrip('\n').split('\t') == expected_header, repr(header)

        for line in fin:
            小韻號, 首字, 反切, 音韻地位, 細分轄字 = line.rstrip('\n').split('\t')
            小韻_data[小韻號] = 小韻Row(小韻號, 首字, 反切, 音韻地位)
            if 小韻號[-1].isalpha():
                # A trailing letter marks a subdivision of the 小韻 whose
                # number is everything before that letter.
                細分號_by_原書小韻.setdefault(小韻號[:-1], []).append(小韻號[-1])
                細分轄字_by_小韻[小韻號] = 細分轄字

    return 小韻_data, 細分號_by_原書小韻, 細分轄字_by_小韻
@dataclasses.dataclass
class 字序Row:
    """One record of ``src/字序表.csv``.

    Field names and order must equal the CSV header exactly; the loader
    asserts this. Every field holds the raw CSV text.
    """

    # (原書小韻號, 小韻字號) is the key of the entry throughout the build.
    原書小韻號: str
    小韻字號: str
    # Expected 字頭 after patching; the build asserts the match.
    字: str
    # Position of the entry inside its 小韻 in the poem table; empty when
    # the entry has no poem row.
    poem_小韻內字序: str
    poem_字: str
    sbgy_id: str
    # A value ending in '/-]' makes the build wrap an unpatched 字頭 as
    # '[字頭/-]'.
    sbgy_字: str
    ytenx_小韻內字序: str
    ytenx_流水序: str
    ytenx_字: str
def load_字序表() -> dict[tuple[str, str], 字序Row]:
    """Load ``src/字序表.csv``.

    Returns:
        Every row as a ``字序Row``, keyed by its first two columns
        (原書小韻號, 小韻字號), in file order.

    Raises:
        AssertionError: if the header row does not equal the field names
            of ``字序Row`` (this includes an empty file).
        TypeError: if a row has a different number of columns than
            ``字序Row`` has fields.
    """
    expected_header = [x.name for x in dataclasses.fields(字序Row)]
    字序_data: dict[tuple[str, str], 字序Row] = {}

    # Explicit UTF-8 keeps the CJK content independent of the locale;
    # newline='' is what the csv module asks for so it can handle line
    # endings (including ones inside quoted fields) itself.
    with open('src/字序表.csv', encoding='utf-8', newline='') as fin:
        rows = csv.reader(fin)
        header = next(rows, None)
        assert header == expected_header, repr(header)
        for row in rows:
            字序_data[(row[0], row[1])] = 字序Row(*row)

    return 字序_data
@dataclasses.dataclass
class Patch:
    """One record of ``src/patches.csv``: a correction to a single entry.

    Field names and order must equal the CSV header exactly; the loader
    asserts this. Every field holds the raw CSV text.
    """

    # (原書小韻號, 小韻字號) selects the entry the patch applies to.
    原書小韻號: str
    小韻字號: str
    # Must equal the entry's 字頭 before patching (asserted by the build).
    原字頭: str
    # Required. '~' is replaced by the entry's current 字頭; a value that
    # starts with '[' must have the form '[x/y]'.
    校正字頭: str
    # When either of the pair is non-empty, 原釋義 must equal the current
    # 釋義, which is then replaced by 校正釋義 with its markup resolved.
    原釋義: str
    校正釋義: str
    # Same scheme as the 釋義 pair, applied to 釋義參照 (no markup step).
    原釋義參照: str
    校正釋義參照: str
    # Copied to the output column of the same name.
    字頭說明: str
    # Not read by the build code in this file.
    備注: str
def load_patches() -> dict[tuple[str, str], Patch]:
    """Load ``src/patches.csv``.

    Returns:
        Every row as a ``Patch``, keyed by its first two columns
        (原書小韻號, 小韻字號).

    Raises:
        AssertionError: if the header row does not equal the field names
            of ``Patch`` (this includes an empty file).
        TypeError: if a row has a different number of columns than
            ``Patch`` has fields.
    """
    expected_header = [x.name for x in dataclasses.fields(Patch)]
    patches: dict[tuple[str, str], Patch] = {}

    # Explicit UTF-8 keeps the CJK content independent of the locale;
    # newline='' is what the csv module asks for so it can handle line
    # endings (including ones inside quoted fields) itself.
    with open('src/patches.csv', encoding='utf-8', newline='') as fin:
        rows = csv.reader(fin)
        header = next(rows, None)
        assert header == expected_header, repr(header)
        for row in rows:
            patches[(row[0], row[1])] = Patch(*row)

    return patches
@dataclasses.dataclass
class 廣韻Row:
    """One record of the generated ``韻書/廣韻.csv``.

    The field names become the CSV header and the field order is the
    column order, so neither may change without changing the output.
    """

    # Resolved 小韻號 (original number plus subdivision letter, if any).
    小韻號: str
    小韻字號: str
    韻目原貌: str
    # 音韻地位 and 反切 are copied from the 小韻 table.
    音韻地位: str
    反切: str
    # Halves of a '[原貌/字頭]' pair; '-' on either side is stored as ''.
    字頭原貌: str
    字頭: str
    # Only ever filled from a patch.
    字頭說明: str
    釋義: str
    釋義參照: str
def main():
    """Build ``韻書/廣韻.csv`` from the tables under ``src/``.

    For every key of 字序表, in file order, this

    1. reads 字頭 / 釋義 / 韻目 from the poem table
       ``src/廣韻(20170209).csv`` when the entry has a poem row,
    2. applies the matching entry of ``src/patches.csv``, if any,
    3. checks the resulting 字頭 against 字序表,
    4. resolves the (possibly subdivided) 小韻號 and copies 音韻地位 and
       反切 from 小韻表,
    5. for the first entry of a 小韻, rewrites the 反切 quoted in its 釋義
       when it differs from the one in the poem table.

    Raises:
        AssertionError: when any of the consistency checks fails
            (patch/original mismatch, 字頭 mismatch, 首字 mismatch,
            unused patches, unseen 細分轄字, missing output rows).
        ValueError: when an entry of a subdivided 小韻 is not listed
            under any of its subdivisions.
        KeyError: when a referenced row or column is missing from a
            source table.
    """
    小韻_data, 細分號_by_原書小韻, 細分轄字_by_小韻 = load_小韻表()
    字序_data = load_字序表()
    patches = load_patches()

    小韻號_seen = set[str]()
    小韻細分_coverage = dict[str, set[str]]()
    patch_coverage = set[tuple[str, str]]()

    # Poem table, keyed by (小韻序, 小韻內字序). Encoding is pinned so the
    # CJK column names decode the same on every platform.
    poem_data = dict[tuple[str, str], dict[str, str]]()
    with open('src/廣韻(20170209).csv', encoding='utf-8', newline='') as fin:
        for row in csv.DictReader(fin):
            poem_data[(row['小韻序'], row['小韻內字序'])] = row

    # Pre-seed with None so the output keeps 字序表 order and a skipped
    # entry is detectable when writing.
    廣韻_data: dict[tuple[str, str], 廣韻Row | None] = {k: None for k in 字序_data}
    for 字序_key in 廣韻_data:
        原書小韻號, 小韻字號 = 字序_key
        字序_row = 字序_data[字序_key]
        poem_小韻內字序 = 字序_row.poem_小韻內字序

        if not poem_小韻內字序:
            # No poem row for this entry: start from blanks and take the
            # 小韻-level values from the poem row numbered '1' of the
            # same 小韻.
            小韻首_row = poem_data[(原書小韻號, '1')]
            poem_反切 = 小韻首_row['廣韻反切(覈校後)']
            # 韻目原貌 used to be left unassigned here, so the row got
            # whatever the previous iteration had set (or the build
            # crashed if this was the first entry).
            # NOTE(review): assumes all entries of a 小韻 share one
            # 韻目原貌 - confirm against the data.
            韻目原貌 = 小韻首_row['廣韻韻部原貌(調整前)']
            含原貌字頭 = ''
            釋義 = ''
            釋義參照 = ''
        else:
            poem_row = poem_data[(原書小韻號, poem_小韻內字序)]
            # Formerly used fields (field number is 1-based, same as awk & MS Excel):
            # '字頭-補', # 19
            # '廣韻反切原貌(覈校前)', # 20
            # '廣韻頁序', # 57
            # '小韻序', # 59
            # '小韻內字序', # 60
            (
                字頭覈校說明,
                poem_反切,
                字頭原貌,
                含原貌字頭,
                釋義,
                釋義補充,
                韻目原貌,
            ) = (
                poem_row[column]
                for column in (
                    '字頭-覈校說明',  # 18
                    '廣韻反切(覈校後)',  # 21
                    '廣韻字頭原貌(覈校前)',  # 24
                    '廣韻字頭(覈校後)',  # 25
                    '廣韻釋義',  # 26
                    '釋義補充',  # 27
                    '廣韻韻部原貌(調整前)',  # 40
                )
            )
            if 字頭覈校說明 == '校':
                # Corrected 字頭: keep both forms as '[原貌/校正]'.
                含原貌字頭 = f'[{字頭原貌}/{含原貌字頭}]'
            if not 釋義:
                釋義參照 = '下'
            elif 釋義補充:
                釋義參照 = '上'
            else:
                釋義參照 = ''

        # Patches
        字頭說明 = ''
        if (patch := patches.get(字序_key)) is not None:
            assert patch.原字頭 == 含原貌字頭, (
                f'patching 小韻 #{原書小韻號}/{小韻字號} 字 "{patch.原字頭}", but the actual 字 is "{含原貌字頭}"'
            )
            patch_coverage.add(字序_key)

            assert patch.校正字頭, (
                f'patching 小韻 #{原書小韻號}/{小韻字號} 字 "{patch.原字頭}", but 校正字頭 is missing'
            )
            if patch.校正字頭.startswith('['):
                assert re.fullmatch(r'\[.+/.+\]', patch.校正字頭), (
                    f'invalid 校正字頭: "{patch.校正字頭}"'
                )
            if '~' in patch.校正字頭:
                assert not 含原貌字頭.startswith('['), (
                    f'cannot use "~" in 校正字頭 when 字頭 contains correction: "{含原貌字頭}"'
                )
            # '~' stands for the current 字頭; without '~' this simply
            # takes 校正字頭 as the new value.
            含原貌字頭 = patch.校正字頭.replace('~', 含原貌字頭)

            # 字頭說明 is an added field, thus it does not have an original value
            字頭說明 = patch.字頭說明

            if patch.校正釋義 or patch.原釋義:
                assert patch.原釋義 == 釋義, (
                    f'patching 釋義 on 小韻 #{原書小韻號}/{小韻字號} 字 "{patch.原字頭}", but the actual 釋義 is "{釋義}"'
                )
                # '[x/y]' -> 'y', '[x/-]' -> '', and bare '{' / '}' are
                # dropped.
                釋義 = re.sub(r'\[.+?/(?:-|(.+?))\]|[{}]', r'\1', patch.校正釋義)

            if patch.校正釋義參照 or patch.原釋義參照:
                assert patch.原釋義參照 == 釋義參照, (
                    f'patching 釋義參照 on 小韻 #{原書小韻號}/{小韻字號} 字 "{patch.原字頭}", but the actual 釋義參照 is "{釋義參照}"'
                )
                釋義參照 = patch.校正釋義參照
        elif 字序_row.sbgy_字.endswith('/-]'):
            assert not 含原貌字頭.startswith('[')
            含原貌字頭 = f'[{含原貌字頭}/-]'

        字_check = 字序_row.字
        assert 含原貌字頭 == 字_check, (
            f'字頭 mismatch between 字序表 and (patched) 廣韻 data: "{字_check}" != "{含原貌字頭}" (小韻 {原書小韻號}/{小韻字號})'
        )

        # Split '[原貌/字頭]' into its halves; '-' means "empty".
        if 含原貌字頭.startswith('['):
            字頭原貌, 字頭 = 含原貌字頭[1:-1].split('/')
            字頭 = '' if 字頭 == '-' else 字頭
            字頭原貌 = '' if 字頭原貌 == '-' else 字頭原貌
        else:
            字頭 = 含原貌字頭
            字頭原貌 = ''

        # 小韻號
        # NOTE 字頭 & 細分轄字 in 小韻表.tsv does not contain 字頭原貌 (yet)
        字頭或原貌 = 字頭 or 字頭原貌
        if 原書小韻號 in 細分號_by_原書小韻:
            # Subdivided 小韻: the first subdivision listing the character
            # wins.
            for 細分 in 細分號_by_原書小韻[原書小韻號]:
                小韻號 = 原書小韻號 + 細分
                if 字頭或原貌 in 細分轄字_by_小韻[小韻號]:
                    小韻細分_coverage.setdefault(小韻號, set()).add(字頭或原貌)
                    break
            else:
                raise ValueError(
                    f'cannot determine 小韻細分 for {字頭或原貌} (小韻 #{原書小韻號})'
                )
        else:
            小韻號 = 原書小韻號

        if 小韻號 not in 小韻號_seen:
            # First entry met for this 小韻號 must be its 首字.
            assert 字頭或原貌 == 小韻_data[小韻號].首字, (
                f'首字 mismatch for 小韻 #{小韻號}: {字頭或原貌} != {小韻_data[小韻號].首字}'
            )
            小韻號_seen.add(小韻號)

        # 音韻地位
        音韻地位 = 小韻_data[小韻號].音韻地位

        # 反切
        反切 = 小韻_data[小韻號].反切
        if 反切 == '-':
            反切 = ''

        # 反切 quoted inside 釋義
        if 小韻字號 == '1' and 反切:
            # Drop single-character bracketed annotations from 反切.
            反切原貌 = re.sub(r'\[.\]|<.>|⦉.⦊|\(.\)|⦅.⦆', '', 反切)
            if 反切原貌 != poem_反切:
                # The message names poem_反切, the string that is
                # actually searched for (it used to print 反切).
                assert 釋義.count(poem_反切 + '切') == 1, (
                    f'釋義 not containing {poem_反切}切 exactly once: {釋義}'
                )
                釋義 = 釋義.replace(poem_反切 + '切', 反切原貌 + '切')

        廣韻_data[字序_key] = 廣韻Row(
            小韻號,
            小韻字號,
            韻目原貌,
            音韻地位,
            反切,
            字頭原貌,
            字頭,
            字頭說明,
            釋義,
            釋義參照,
        )

    # Every character listed for a subdivision that was used at least once
    # must have been seen.
    for 小韻號, cov in 小韻細分_coverage.items():
        diff = set(細分轄字_by_小韻[小韻號]) - cov
        assert not diff, (
            f'字頭 listed in 小韻細分_data but not seen: {"".join(sorted(diff))} (小韻 #{小韻號})'
        )

    # Every patch must have matched an entry.
    assert patch_coverage == set(patches), (
        f'invalid patches: {", ".join(f"#{原書小韻號}/{小韻字號}" for 原書小韻號, 小韻字號 in set(patches) - patch_coverage)}'
    )

    # csv.writer quotes fields that contain a comma, quote or newline;
    # the plain ','.join used before would have produced a corrupt row.
    # lineterminator='\n' keeps the line endings print() used to emit.
    with open('韻書/廣韻.csv', 'w', encoding='utf-8', newline='') as fout:
        writer = csv.writer(fout, lineterminator='\n')
        writer.writerow([x.name for x in dataclasses.fields(廣韻Row)])
        for 字序_key, row in 廣韻_data.items():
            assert row is not None, f'Missing: {字序_data[字序_key]}'
            writer.writerow(dataclasses.astuple(row))
# Run the build only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()