-
Notifications
You must be signed in to change notification settings - Fork 1
/
corpus2alphabet.py
executable file
·204 lines (178 loc) · 7.48 KB
/
corpus2alphabet.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
#!/usr/bin/env python3.7
"""Extracts an alphabet of characters from a corpus.

This file was developed as part of the Neural Polysynthetic Language Modelling project
at the 2019 Frederick Jelinek Memorial Summer Workshop at École de Technologie Supérieure in Montréal, Québec, Canada.
https://www.clsp.jhu.edu/workshops/19-workshop/
"""
# The docstring above must be the first statement in the file; placed after
# the imports it is just a discarded string and the module has no __doc__.
import argparse
import logging
import sys
import unicodedata
from typing import Dict, List, Callable, Tuple, Set, Mapping, Iterable, Iterator

import grapheme  # type: ignore

__author__ = "Lane Schwartz"
__copyright__ = "Copyright 2019, Lane Schwartz"
__license__ = "MPL 2.0"
__credits__ = [
    "Lane Schwartz",
    "JSALT 2019 NPLM team members",
]
# Was spelled ``__maintainer`` (no trailing underscores), unlike its siblings.
__maintainer__ = "Lane Schwartz"
# NOTE(review): this value looks like an address-obfuscation placeholder rather
# than a real e-mail address -- confirm against the upstream repository.
__email__ = "[email protected]"
__version__ = "0.0.1"
__status__ = "Prototype"

# Fail early with an explicit message when run on an interpreter older than 3.7.
if sys.version_info < (3, 7):
    raise RuntimeError(f"{__file__} requires Python 3.7 or later")
def escaped_codepoints(s: str) -> str:
    r"""Return *s* with every code point spelled as a Python Unicode escape.

    Code points up to U+FFFF are written as ``\uXXXX`` (four lowercase hex
    digits, zero-padded). Code points above U+FFFF are written as
    ``\UXXXXXXXX`` (eight digits): a ``\u`` escape takes exactly four hex
    digits, so a five-digit form such as ``\u1f600`` would be read back as
    U+1F60 followed by a literal ``0``.

    Args:
        s: Text to escape; may be empty.

    Returns:
        The concatenated escapes, one per code point of *s*, in order.
    """
    return "".join(
        f"\\u{ord(c):04x}" if ord(c) <= 0xFFFF else f"\\U{ord(c):08x}"
        for c in s
    )
def char_to_code_point(c: str) -> str:
    """Return the code point of the single character *c* as hex digits.

    The digits are lowercase, carry no ``0x`` or ``\\u`` prefix, and are
    left-padded with zeros to a minimum width of four (for example ``"0095"``
    or ``"2025"``); larger code points keep all of their digits.
    """
    return format(ord(c), "04x")
def char_to_name(c: str) -> str:
    """Return the Unicode name of the character *c*, or ``""`` if it has none."""
    # With a fallback argument, unicodedata.name() returns it for unnamed
    # characters instead of raising ValueError.
    return unicodedata.name(c, "")
def unicode_info(s: str) -> str:
    """Describe every code point of *s* as ``U+<hex> <name>``, joined by ``"; "``.

    The hex digits come from ``char_to_code_point`` and the name from
    ``char_to_name``; an empty *s* gives an empty string.
    """
    described = (f"U+{char_to_code_point(ch)} {char_to_name(ch)}" for ch in s)
    return "; ".join(described)
def output(output_file, int_value, character, unicode_name, description):
    """Write one alphabet row to *output_file*.

    The four values are written in the order given, separated by tab
    characters and terminated by a newline.
    """
    print(int_value, character, unicode_name, description, sep="\t", file=output_file)
def main(
    morpheme_delimiter: str,
    end_of_morpheme_symbol: str,
    padding_symbol: str,
    input_file,
    output_file,
    verbose: int,
    blacklist_char: str,
) -> None:
    """Collect the alphabet of a corpus and write it as a tab-separated table.

    Every line of *input_file* is stripped and split into grapheme clusters.
    Each cluster is added to the alphabet unless it is a separator or control
    character, the morpheme delimiter, or the blacklist character. The table
    written to *output_file* has one row per entry (integer, character,
    Unicode description, free-text description): rows 0, 1 and 2 are reserved
    for out-of-vocabulary, end of morpheme and padding, and the collected
    clusters follow in sorted order starting at 3.

    Args:
        morpheme_delimiter: Cluster that separates morphemes in the input; it
            is left out of the alphabet.
        end_of_morpheme_symbol: Reserved symbol for row 1; must be exactly one
            grapheme cluster.
        padding_symbol: Reserved symbol for row 2.
        input_file: Iterable of text lines with a ``name`` attribute.
        output_file: Writable text stream that receives the table.
        verbose: Currently unused by this function.
        blacklist_char: Cluster that is left out of the alphabet.

    Raises:
        RuntimeError: If *end_of_morpheme_symbol* is not a single grapheme
            cluster, or if a reserved symbol is found in the input.
    """
    if grapheme.length(end_of_morpheme_symbol) != 1:
        raise RuntimeError(
            "The end of morpheme symbol must consist of a single grapheme cluster "
            + "(see Unicode Standard Annex #29)."
        )

    alphabet_set: Set[str] = set()

    # Describing a skipped character calls unicode_info(); only pay for that
    # (once per whitespace character in the corpus) when debug output is on.
    debug_enabled = logging.getLogger().isEnabledFor(logging.DEBUG)

    logging.info(f"Reading alphabet from input file {input_file.name}...")

    for line in input_file:
        for character in grapheme.graphemes(line.strip()):
            # A grapheme cluster may span several code points (for example a
            # base letter plus combining marks), but unicodedata.category()
            # accepts exactly one, so classify the cluster by its first one.
            major_category = unicodedata.category(character[0])[0]

            # NOTE: the category tests run before the reserved-symbol tests,
            # so a reserved symbol that is itself a separator or control
            # character (such as the script's default U+0000 and U+0004) is
            # skipped silently rather than rejected below.
            if major_category == "Z":
                if debug_enabled:
                    logging.debug(
                        f"Input contains whitespace character:\t{unicode_info(character)}. "
                        "This character will not be included in the alphabet."
                    )
            elif major_category == "C":
                if debug_enabled:
                    logging.debug(
                        f"Input contains control character:\t{unicode_info(character)}. "
                        "This character will not be included in the alphabet."
                    )
            elif character == morpheme_delimiter:
                logging.debug(f"Not including morpheme delimeter {morpheme_delimiter} in the alphabet.")
            elif character == blacklist_char:
                logging.debug(f"Not including character {blacklist_char} in the alphabet.")
            elif character == padding_symbol:
                raise RuntimeError(
                    f"Input contains reserved padding character {padding_symbol}, "
                    "but this character must not occur in the corpus."
                )
            elif character == end_of_morpheme_symbol:
                raise RuntimeError(
                    f"Input contains reserved end of morpheme character {end_of_morpheme_symbol}, "
                    "but this character must not occur in the corpus."
                )
            else:
                alphabet_set.add(character)

    # Zero is reserved for OOV
    output(
        output_file=output_file,
        int_value=0,
        character="",
        unicode_name="",
        description="Integer value 0 is reserved to represent out-of-vocabulary characters in a tensor product representation",
    )

    # We reserve another character to represent the end of morpheme in a tensor product representation
    output(
        output_file=output_file,
        int_value=1,
        character=escaped_codepoints(end_of_morpheme_symbol),
        unicode_name=unicode_info(end_of_morpheme_symbol),
        description="Integer value 1 is reserved to represent the end of a morpheme in a tensor product representation",
    )

    # We reserve another character to represent the padding after the end of morpheme in a tensor product representation
    output(
        output_file=output_file,
        int_value=2,
        character=escaped_codepoints(padding_symbol),
        unicode_name=unicode_info(padding_symbol),
        description="Integer value 2 is reserved to represent padding beyond the end of a morpheme in a tensor product representation",
    )

    # Remaining actual characters
    for i, symbol in enumerate(sorted(alphabet_set), start=3):
        output(
            output_file=output_file,
            int_value=i,
            character=symbol,
            unicode_name=unicode_info(symbol),
            description="",
        )
if __name__ == "__main__":
    # Command-line entry point: parse the options, open the requested streams,
    # and run main(). argparse, logging and sys come from the file-level imports.
    import contextlib

    def _unescape(text: str) -> str:
        r"""Expand backslash escapes such as ``\u0000`` in a command-line value.

        The text is encoded as Latin-1 with ``backslashreplace`` before being
        decoded with ``unicode_escape``. Encoding as UTF-8 instead would
        corrupt non-ASCII input, because ``unicode_escape`` reads every byte
        as one Latin-1 character.
        """
        return text.encode("latin-1", "backslashreplace").decode("unicode_escape")

    arg_parser = argparse.ArgumentParser(
        description="Extract an alphabet of characters from a corpus."
    )
    arg_parser.add_argument(
        "-e",
        "--end_of_morpheme_symbol",
        metavar="character",
        type=str,
        nargs="?",
        default="\\u0000",
        help="In the output tensor representation, "
        + "this character will be appended as the final symbol in every morpheme. "
        + "This symbol must not appear in the alphabet",
    )
    arg_parser.add_argument(
        "-d",
        "--morpheme_delimiter",
        metavar="string",
        type=str,
        nargs="?",
        default=">",
        help="In the user-provided input file, "
        + "this character must appear between adjacent morphemes. "
        + "This symbol must not appear in the alphabet",
    )
    arg_parser.add_argument(
        "-p",
        "--padding_symbol",
        metavar="string",
        type=str,
        nargs="?",
        default="\\u0004",
        help="In the output tensor representation, "
        + "this character will be appended after the end-of-morpheme symbol. "
        + "This symbol must not appear in the alphabet",
    )
    arg_parser.add_argument(
        "-i",
        "--input_file",
        metavar="filename",
        type=str,
        nargs="?",
        default="-",
        help="Input file containing whitespace delimited words (- for standard input)",
    )
    arg_parser.add_argument(
        "--blacklist_char",
        metavar="character",
        type=str,
        nargs="?",
        default="*",
        help="Character that marks unanalyzed words that should be ignored",
    )
    arg_parser.add_argument(
        "-o",
        "--output_file",
        metavar="filename",
        type=str,
        nargs="?",
        default="-",
        help="Output file",
    )
    arg_parser.add_argument("-v", "--verbose", metavar="int", type=int, default=0)

    args = arg_parser.parse_args()

    # Apply --verbose, which was previously parsed but never used:
    # 0 keeps the default (warnings only), 1 adds info, 2 or more adds debug.
    if args.verbose >= 2:
        log_level = logging.DEBUG
    elif args.verbose == 1:
        log_level = logging.INFO
    else:
        log_level = logging.WARNING
    logging.basicConfig(level=log_level)

    # ExitStack closes whichever files were opened here, even if main() raises,
    # while leaving sys.stdin / sys.stdout untouched.
    with contextlib.ExitStack() as stack:
        if args.input_file == "-":
            input_file = sys.stdin
        else:
            input_file = stack.enter_context(
                open(args.input_file, mode="rt", encoding="utf8")
            )

        if args.output_file == "-":
            output_file = sys.stdout
        else:
            output_file = stack.enter_context(
                open(args.output_file, mode="wt", encoding="utf8")
            )

        main(
            end_of_morpheme_symbol=_unescape(args.end_of_morpheme_symbol),
            padding_symbol=_unescape(args.padding_symbol),
            morpheme_delimiter=_unescape(args.morpheme_delimiter),
            input_file=input_file,
            output_file=output_file,
            verbose=args.verbose,
            blacklist_char=args.blacklist_char,
        )