-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathaozora_data.py
executable file
·235 lines (207 loc) · 10.1 KB
/
aozora_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
#!/usr/bin/env python3
# Alphabet Soup gives language learners easily digestible chunks for practice.
# Copyright 2019-2020 Yorwba
# Alphabet Soup is free software: you can redistribute it and/or
# modify it under the terms of the GNU Affero General Public License
# as published by the Free Software Foundation, either version 3 of
# the License, or (at your option) any later version.
# Alphabet Soup is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
# You should have received a copy of the GNU Affero General Public License
# along with Alphabet Soup. If not, see <https://www.gnu.org/licenses/>.
import argparse
import csv
import re
# Load the whole Aozora catalogue CSV into memory when the module is imported.
# The encoding is passed explicitly so that the UTF-8 file is decoded the same
# way whatever the locale default is, and newline='' leaves line-ending
# handling to the csv module, as its documentation asks for.
with open('data/aozora/list_person_all_extended_utf8.csv',
          encoding='utf-8', newline='') as f:
    aozora_list = list(csv.reader(f))
# The first row names the columns; every remaining row is data.
aozora_header = aozora_list[0]
aozora_table = aozora_list[1:]
# Column positions, looked up by header name. list.index raises ValueError
# if the catalogue does not contain one of these columns.
library_card_url = aozora_header.index('図書カードURL')
author_family_name = aozora_header.index('姓')
author_given_name = aozora_header.index('名')
author_role = aozora_header.index('役割フラグ')
author_birth = aozora_header.index('生年月日')
author_death = aozora_header.index('没年月日')
input_by = aozora_header.index('入力者')
proofread_by = aozora_header.index('校正者')
text_url = aozora_header.index('テキストファイルURL')
html_file_url = aozora_header.index('XHTML/HTMLファイルURL')
def modern_works(args):
    """
    Print the text file URL of selected catalogue rows, one URL per line.

    A row is selected when its birth date field starts with '19', its death
    date field is empty and its text file URL ends in '.zip' or '.txt'.
    If args.aozora_only is truthy, the URL must additionally start with
    'https://www.aozora.gr.jp/'.
    """
    selected = []
    for entry in aozora_table:
        if not entry[author_birth].startswith('19'):
            continue
        if entry[author_death]:
            continue
        link = entry[text_url]
        extension = link.strip().split('.')[-1]
        if extension not in ('zip', 'txt'):
            continue
        if args.aozora_only and not link.startswith('https://www.aozora.gr.jp/'):
            continue
        selected.append(link)
    # Printing starts only after the whole table has been scanned.
    for link in selected:
        print(link)
def librivox_audiobooks(args):
    """
    Print '<text file URL>\\t<archive id>' for Japanese LibriVox recordings.

    args.librivox_links must be an open text file with one record per line,
    made of three tab-separated fields: language, HTML file URL, archive id.
    Records whose language is 'Japanese' and whose URL (with 'http:' turned
    into 'https:') is an HTML file URL of the catalogue are printed with the
    catalogue's text file URL in place of the HTML one. Other records and
    blank lines are skipped.

    Raises SystemExit if no links file was given and ValueError if a
    non-blank line does not have exactly three fields.
    """
    if args.librivox_links is None:
        raise SystemExit('librivox-audiobooks requires --librivox-links FILE')
    # The scheme is normalised on both sides of the lookup so that http and
    # https spellings of the same URL compare equal.
    html_to_text = {
        row[html_file_url].replace('http:', 'https:'): row[text_url]
        for row in aozora_table}
    for line_number, line in enumerate(args.librivox_links, start=1):
        line = line.strip()
        if not line:
            continue
        fields = line.split('\t')
        if len(fields) != 3:
            raise ValueError(
                f'line {line_number}: expected 3 tab-separated fields, '
                f'got {len(fields)}: {line!r}')
        language, aozora_link, archive_id = fields
        if language != 'Japanese':
            continue
        text_link = html_to_text.get(aozora_link.replace('http:', 'https:'))
        if text_link is None:
            # Not a work from the catalogue.
            continue
        print(text_link, archive_id, sep='\t')
# Full-width characters are spelled as escapes throughout this block: they
# look almost identical to their ASCII counterparts and are easily replaced
# by them without anyone noticing, which silently breaks the patterns.

# Character-class body for everything that may carry furigana: CJK radicals,
# strokes, unified and compatibility ideographs, and enclosed ideographs ...
_RUBY_BASE_CHARS = (
    '\u2e80-\u2e99\u2e9b-\u2ef3\u31c0-\u31e3\u3400-\u4db5\u4e00-\u9fd5'
    '\uf900-\ufa6d\ufa70-\ufad9'
    '\U0001f210-\U0001f212\U0001f214-\U0001f23b\U0001f240-\U0001f248'
    '\U00020000-\U0002a6d6\U0002a700-\U0002b734\U0002b740-\U0002b81d'
    '\U0002b820-\U0002cea1\U0002f800-\U0002fa1d'
    # ... plus ASCII and full-width alphanumerics and a few symbols, which
    # are not actually CJK, but can have furigana.
    '0-9A-Za-z\uff10-\uff19\uff21-\uff3a\uff41-\uff5a々〆※×'
)
# base《reading》, optionally introduced by a FULLWIDTH VERTICAL LINE (U+FF5C).
_RUBY_PATTERN = re.compile('\uff5c?([' + _RUBY_BASE_CHARS + ']+)《([^》]+)》')
# Opening and closing brackets; the two strings are index-aligned. Both the
# ASCII and the full-width (U+FF08 / U+FF09) parentheses are recognised.
_LEFT_BRACKETS = '「『(\uff08〈《“'
_RIGHT_BRACKETS = '」』)\uff09〉》”'
# Sentence terminators: ideographic full stop, and question and exclamation
# marks in both ASCII and full-width (U+FF1F / U+FF01) form.
_TERMINATORS = '。?!\uff1f\uff01'
# The capturing group makes re.split keep the delimiters, so the result
# alternates text (even indices) and one-character delimiters (odd indices).
_BOUNDARY_PATTERN = re.compile(
    '([' + _LEFT_BRACKETS + _RIGHT_BRACKETS + _TERMINATORS + '])')


def sentences_in_paragraph(paragraph, ruby):
    """
    Split one paragraph into sentences.

    This is a generator yielding (character_count, sentence) tuples, where
    sentence is stripped of surrounding whitespace and ends with a terminator,
    and character_count is the offset at which the sentence's text starts,
    measured in the paragraph after the rewriting described below. Sentences
    of three characters or fewer are dropped.

    ASCII square brackets in the input are turned into full-width ones,
    because '[base|reading]' is the furigana markup produced here: if ruby
    is true, every 'base《reading》' (with or without a leading full-width
    vertical line) is rewritten to that form.

    A bracket pair that encloses a single delimiter-free fragment is kept
    inside the current sentence. Any other bracket discards the fragment
    collected so far.
    """
    paragraph = paragraph.replace('[', '\uff3b').replace(']', '\uff3d')
    if ruby:
        paragraph = _RUBY_PATTERN.sub(r'[\1|\2]', paragraph)
    parts = _BOUNDARY_PATTERN.split(paragraph)

    # offsets[k] is the total length of parts[:k], computed once instead of
    # being summed again for every sentence start.
    offsets = []
    position = 0
    for part in parts:
        offsets.append(position)
        position += len(part)

    i = 0
    sentence = ''
    character_count = 0
    while i < len(parts):
        if not sentence:
            character_count = offsets[i]
        if parts[i].startswith(('と', 'って'))\
                and not sentence and i > 0 and parts[i-1] in _RIGHT_BRACKETS:
            # After quotations of sentences, there might be an awkward
            # "...と言います" or similar hanging around. Skip it.
            i += 2
            continue
        sentence += parts[i]
        if i+1 < len(parts):
            bracket = _LEFT_BRACKETS.find(parts[i+1])
            if bracket >= 0 and i+3 < len(parts)\
                    and parts[i+3] == _RIGHT_BRACKETS[bracket]:
                # Simple bracketed fragment: keep it and carry on with the
                # same sentence after the closing bracket.
                sentence += ''.join(parts[i+1:i+4])
                i += 4
                continue
            if parts[i+1] in _TERMINATORS:
                sentence += parts[i+1]
                sentence = sentence.strip()
                if len(sentence) > 3:  # XXX how to better handle short sentences?
                    yield (character_count, sentence)
            # Either the sentence is complete, or an unpaired or nested
            # bracket was hit; in both cases start afresh.
            sentence = ''
        i += 2
def JIS_X_0213_encode(men, ku, ten):
    """
    JIS X 0213 has two planes (men) and each plane consists of a 94x94 grid.
    This function turns a character identified by its position in the grid into
    Unicode, via Shift JIS 2004 as an intermediate encoding.

    men is the plane (1 or 2), ku the row and ten the cell within the row
    (1 to 94 each). In plane 2 only rows 1, 3, 4, 5, 8, 12 to 15 and 78 to 94
    are accepted.

    Raises ValueError if the position is outside those ranges. Decoding a
    byte pair that the codec does not map raises UnicodeDecodeError, which
    is itself a ValueError.
    """
    # https://en.wikipedia.org/wiki/Shift_JIS#Shift_JISx0213_and_Shift_JIS-2004
    if not 1 <= ten <= 94:
        raise ValueError(f'ten must be between 1 and 94, got {ten!r}')

    # Lead byte: two consecutive rows share one lead byte.
    if men == 1 and 1 <= ku <= 62:
        s1 = (ku+257)//2
    elif men == 1 and 63 <= ku <= 94:
        s1 = (ku+385)//2
    elif men == 2 and ku in (1, 3, 4, 5, 8, 12, 13, 14, 15):
        s1 = (ku+479)//2 - (ku//8) * 3
    elif men == 2 and 78 <= ku <= 94:
        s1 = (ku+411)//2
    else:
        raise ValueError(
            f'no Shift JIS 2004 lead byte for men={men!r}, ku={ku!r}')

    # Trail byte: odd rows use the lower half of the trail byte range
    # (skipping 0x7F), even rows the upper half.
    if ku & 1 == 1:
        if ten <= 63:
            s2 = ten + 63
        else:
            s2 = ten + 64
    else:
        s2 = ten + 158
    return bytes((s1, s2)).decode('sjis_2004')
# Aozora Bunko annotations are delimited by full-width characters. They are
# spelled as regex escapes because they look almost identical to their ASCII
# counterparts and are easily replaced by them without anyone noticing:
#   \uff3b / \uff3d  FULLWIDTH LEFT / RIGHT SQUARE BRACKET
#   \uff03           FULLWIDTH NUMBER SIGN
#   \uff08 / \uff09  FULLWIDTH LEFT / RIGHT PARENTHESIS
#   \uff1a           FULLWIDTH COLON

# Annotation after ※ that ends in a JIS X 0213 position 'men-ku-ten'.
_GAIJI_JIS_PATTERN = re.compile(
    r'※\uff3b\uff03[^\uff3d]*([12])-([0-9]{1,2})-([0-9]{1,2})\uff3d')
# Annotation after ※ that contains a Unicode code point written as 'U+hex'.
_GAIJI_UNICODE_PATTERN = re.compile(
    r'※\uff3b\uff03[^\uff3d]*U\+([0-9a-fA-F]+)'
    r'([^\uff3d0-9a-fA-F][^\uff3d]*)?\uff3d')
# Any annotation.
_ANNOTATION_PATTERN = re.compile(r'\uff3b\uff03[^\uff3d]*\uff3d')
# not the correct regex, but good enough:
_CC_LICENSE_PATTERN = re.compile(
    '.*(https?://creativecommons.org/licenses/[!-~]*)')
# The standard footer notice; the parentheses around the URL are accepted in
# full-width or ASCII form, or not at all.
_AOZORA_NOTICE_PATTERN = re.compile(
    r'このファイルは、インターネットの図書館、青空文庫'
    r'[(\uff08]?https?://www.aozora.gr.jp/?[)\uff09]?'
    r'で作られました。入力、校正、制作にあたったのは、ボランティアの皆さんです。')
# '<role ending in 者><colon><name>', with a full-width or ASCII colon.
_CREATOR_PATTERN = re.compile(r'(.*者)[:\uff1a](.*)$')
# technically not a license
_PUBLIC_DOMAIN_URL = 'https://ja.wikipedia.org/wiki/%E3%83%91%E3%83%96%E3%83%AA%E3%83%83%E3%82%AF%E3%83%89%E3%83%A1%E3%82%A4%E3%83%B3'


def _text_end(lines):
    """
    Return the index just past the first longest run of empty lines in
    lines, or None if there is no empty line at all.
    """
    run = 0
    longest = 0
    last_of_longest = None
    for index, line in enumerate(lines):
        if line:
            run = 0
            continue
        run += 1
        if run > longest:
            longest = run
            last_of_longest = index
    if last_of_longest is None:
        return None
    return last_of_longest + 1


def _license_and_creators(footer_lines):
    """
    Scan footer lines and return (license_url, creators): the last license
    found ('' if none) and a list of (role, name) tuples.
    """
    license_url = ''
    creators = []
    for line in footer_lines:
        match = _CC_LICENSE_PATTERN.match(line)
        if match:
            license_url = match.group(1)
        if _AOZORA_NOTICE_PATTERN.match(line):
            license_url = _PUBLIC_DOMAIN_URL
        match = _CREATOR_PATTERN.match(line)
        if match:
            creators.append(match.group(1, 2))
    return license_url, creators


def _resolve_annotations(line):
    """
    Replace annotations that identify a character by JIS X 0213 position or
    Unicode code point with that character, then delete all other annotations.
    """
    line = _GAIJI_JIS_PATTERN.sub(
        lambda m: JIS_X_0213_encode(*map(int, m.groups())),
        line)
    line = _GAIJI_UNICODE_PATTERN.sub(
        lambda m: chr(int(m.group(1), 16)),
        line)
    return _ANNOTATION_PATTERN.sub('', line)


def extract_sentences(args):
    """
    Print one tab-separated record per sentence found in the zip archives in
    'data/aozora/files'.

    Each record consists of: the literal 'aozora', the library card URL of
    the work, a location 'archive:member:line:offset', the license URL ('' if
    the footer names none), the creators as space-separated 'role:name'
    pairs, and the sentence.

    Within each '.txt' member, the text is taken to run from the line after
    the second line made up only of '-' to the end of the longest run of
    empty lines; what follows that run is treated as the footer.

    Files that cannot be processed (not in the catalogue, not decodable as
    Shift JIS, unexpected layout, corrupt archive) are reported on stderr
    and skipped. args is not used.
    """
    import os
    import sys
    import zipfile

    def warn(message):
        print('extract-sentences: ' + message, file=sys.stderr)

    filename_to_table_row = {
        row[text_url].split('/')[-1]: row
        for row in aozora_table}
    filedir = 'data/aozora/files'
    for filename in os.listdir(filedir):
        table_row = filename_to_table_row.get(filename)
        if table_row is None:
            warn(f'{filename}: not in the catalogue, skipped')
            continue
        filepath = os.path.join(filedir, filename)
        if not zipfile.is_zipfile(filepath):
            continue
        # Whether to rewrite furigana is decided from the archive's name.
        ruby = 'ruby' in filename
        url = table_row[library_card_url]
        catalogue_creators = [
            (table_row[author_role],
             table_row[author_family_name]+table_row[author_given_name]),
            ('入力者', table_row[input_by]),
            ('校正者', table_row[proofread_by])]
        try:
            with zipfile.ZipFile(filepath) as z:
                for n in z.namelist():
                    if not n.endswith('.txt'):
                        continue
                    try:
                        text = z.read(n).decode('shift-jis')
                    except UnicodeDecodeError as error:
                        warn(f'{filename}:{n}: {error}, skipped')
                        continue
                    lines = text.split('\r\n')
                    separators = [i for i, l in enumerate(lines)
                                  if set(l) == {'-'}]
                    end = _text_end(lines)
                    if end is None or len(separators) < 2:
                        warn(f'{filename}:{n}: unexpected layout, skipped')
                        continue
                    license_url, footer_creators = _license_and_creators(
                        lines[end+1:])
                    # Entries with an empty name are left out.
                    creators = ' '.join(
                        role+':'+name
                        for role, name in catalogue_creators + footer_creators
                        if name)
                    text_start = separators[1]+1
                    text_lines = lines[text_start:end]
                    for line_number, line in enumerate(text_lines):
                        line = _resolve_annotations(line)
                        for (character_count, sentence) in sentences_in_paragraph(line, ruby):
                            print('\t'.join((
                                'aozora',
                                url,
                                ':'.join((filename, n, str(text_start+line_number), str(character_count))),
                                license_url,
                                creators,
                                sentence)))
        except zipfile.BadZipFile as error:
            warn(f'{filename}: {error}, skipped')
def _parse_bool(text):
    """
    Turn a command-line word into a bool.

    argparse's type=bool would call bool() on the word, which is true for
    every non-empty string, including 'False'. The empty string stays false,
    as it was with bool().
    """
    word = text.strip().lower()
    if word in ('', '0', 'false', 'no', 'n', 'off'):
        return False
    if word in ('1', 'true', 'yes', 'y', 'on'):
        return True
    raise argparse.ArgumentTypeError(f'expected true or false, got {text!r}')


def main(argv):
    """
    Parse the command line in argv (argv[0] is the program name and is
    ignored) and run the selected command.

    The command name with '-' replaced by '_' is the name of the module-level
    function that gets called with the parsed arguments.
    """
    parser = argparse.ArgumentParser(
        description='Aozora data file parser')
    # A tuple instead of a set keeps the order in help and error messages
    # stable.
    parser.add_argument('command', nargs=1, choices=(
        'modern-works',
        'librivox-audiobooks',
        'extract-sentences'))
    parser.add_argument('--aozora-only', type=_parse_bool, default=True)
    parser.add_argument('--librivox-links', type=argparse.FileType('r'), default=None)
    args = parser.parse_args(argv[1:])
    # Only the names listed in choices can reach this lookup.
    globals()[args.command[0].replace('-', '_')](args)
# Run the command-line interface only when this file is executed as a script;
# importing it as a module does not call main().
if __name__ == '__main__':
    import sys
    main(sys.argv)