-
Notifications
You must be signed in to change notification settings - Fork 1
/
alphabet.py
executable file
·151 lines (134 loc) · 4.97 KB
/
alphabet.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
#!/usr/bin/env python3.7
from iiksiin import Alphabet, Dimension
import grapheme # type: ignore
import logging
import pickle
import sys
from typing import List, Mapping, Set, Iterable, Iterator
import unicodedata
"""Constructs a character alphabet for use in the
Tensor Product Representation for potentially multi-morphemic words.
This file was developed as part of the Neural Polysynthetic Language Modelling project
at the 2019 Frederick Jelinek Memorial Summer Workshop at École de Technologie Supérieure in Montréal, Québec, Canada.
https://www.clsp.jhu.edu/workshops/19-workshop/
"""
# Module metadata, using the conventional dunder attribute names.
__author__ = "Lane Schwartz"
__copyright__ = "Copyright 2019, Lane Schwartz"
__license__ = "MPL 2.0"
__credits__ = [
    "Lane Schwartz",
    "Coleman Haley",
    "Francis Tyers",
    "JSALT 2019 NPLM team members",
]
# Previously spelled ``__maintainer`` (no trailing underscores), so the
# conventional ``__maintainer__`` attribute was never actually defined.
__maintainer__ = "Lane Schwartz"
__email__ = "[email protected]"
__version__ = "0.0.1"
__status__ = "Prototype"

# Refuse to run on interpreters older than Python 3.7.
if sys.version_info < (3, 7):
    raise RuntimeError(f"{__file__} requires Python 3.7 or later")
def main(
    name: str,
    input_source: Iterable[str],
    output_filename: str,
    log_filename: str,
    morpheme_delimiter: str,
    end_of_morpheme_symbol: str,
    padding_symbol: str,
    blacklist_char: str,
) -> None:
    """Build an alphabet from ``input_source``, log its symbols, and pickle it.

    The alphabet itself is constructed by ``Alphabet.create_from_source``;
    this function only forwards its arguments, writes a human-readable
    summary to ``log_filename`` and dumps the resulting object to
    ``output_filename``.

    Args:
        name: Name (description) given to the alphabet.
        input_source: Iterable of text lines from which the alphabet is built.
        output_filename: Path where the pickled alphabet object is written.
        log_filename: Path where the symbol summary is written.
        morpheme_delimiter: Symbol that separates adjacent morphemes in the input.
        end_of_morpheme_symbol: Symbol reserved to mark the end of a morpheme.
        padding_symbol: Symbol reserved for padding.
        blacklist_char: Character marking words that should be ignored.
    """
    alphabet: Alphabet = Alphabet.create_from_source(
        name,
        input_source,
        morpheme_delimiter,
        end_of_morpheme_symbol,
        padding_symbol,
        blacklist_char,
    )

    # The log lists symbols taken from the input text, which may be arbitrary
    # Unicode. Writing with an explicit UTF-8 encoding avoids a
    # UnicodeEncodeError on platforms whose default encoding is narrower.
    with open(log_filename, "wt", encoding="utf-8") as log:
        print(f"Symbols in alphabet: {alphabet.number_of_symbols()}", file=log)
        print("-----------------------", file=log)
        print(
            "0\t\t\tThe integer value 0 is reserved to represent any symbol not in the alphabet",
            file=log,
        )
        # One line per symbol: its integer id, its Unicode description, and a
        # label when the symbol is one of the two reserved symbols.
        for symbol in sorted(alphabet):
            message = f"{alphabet[symbol]}\t{Alphabet.unicode_info(symbol)}\t"
            if symbol == alphabet.end_of_morpheme_symbol:
                message += "End-of-morpheme symbol"
            if symbol == alphabet.padding_symbol:
                message += "Padding symbol"
            print(message, file=log)

    logging.info("Writing alphabet object as pickle to %s", output_filename)
    with open(output_filename, "wb") as output:
        pickle.dump(alphabet, output)
def unescape_argument(text: str) -> str:
    """Interpret Python-style backslash escapes in a command-line value.

    Escape sequences such as ``\\u0000`` or ``\\t`` typed literally on the
    command line are turned into the characters they denote, while every
    character that is already a real (possibly non-ASCII) character is kept
    unchanged.

    The previous approach encoded the text as UTF-8 before decoding it with
    ``unicode_escape``. That codec reads bytes as Latin-1, so any non-ASCII
    character was silently corrupted (for example ``·`` became ``Â·``).
    Encoding as Latin-1 with ``backslashreplace`` round-trips code points
    up to U+00FF byte-for-byte and turns everything above into an escape that
    the decoder restores.

    Args:
        text: Raw argument value as received from the command line.

    Returns:
        The value with backslash escapes expanded.
    """
    return text.encode("latin-1", "backslashreplace").decode("unicode_escape")


if __name__ == "__main__":

    import argparse
    import contextlib

    # NOTE(review): several options below use nargs="?" without a ``const``;
    # giving such a flag with no value makes argparse store None for it.
    arg_parser = argparse.ArgumentParser(
        description="Construct alphabet for use in tensor product representations of morphemes."
    )
    arg_parser.add_argument(
        "--description",
        metavar="string",
        type=str,
        required=True,
        help="Description of the alphabet. Will serve as the name of the alphabet.",
    )
    arg_parser.add_argument(
        "-d",
        "--morpheme_delimiter",
        metavar="string",
        type=str,
        nargs="?",
        default=">",
        help="In the user-provided input file, "
        "this character must appear between adjacent morphemes. "
        "This symbol must not appear in the alphabet",
    )
    arg_parser.add_argument(
        "-e",
        "--end_of_morpheme_symbol",
        metavar="character",
        type=str,
        nargs="?",
        default="\\u0000",
        help="In this output tensor representation, "
        "this character will be appended as the final symbol in every morpheme. "
        "This symbol must not appear in the alphabet",
    )
    arg_parser.add_argument(
        "-p",
        "--padding_symbol",
        metavar="character",
        type=str,
        nargs="?",
        default="\\u0004",
        help="This character will be used when padding is needed in a tensor. "
        "This symbol must not appear in the alphabet",
    )
    arg_parser.add_argument(
        "-i",
        "--input_file",
        metavar="filename",
        type=str,
        required=True,
        help="Input file containing whitespace delimited words (- for standard input)",
    )
    arg_parser.add_argument(
        "--blacklist_char",
        # This option takes a character, not a file name.
        metavar="character",
        type=str,
        nargs="?",
        default="*",
        help="Character that marks unanalyzed words that should be ignored",
    )
    arg_parser.add_argument(
        "-o",
        "--output_file",
        metavar="filename",
        type=str,
        nargs="?",
        required=True,
        help="Output file where pickled alphabet is dumped",
    )
    arg_parser.add_argument(
        "--log",
        metavar="filename",
        type=str,
        nargs="?",
        required=True,
        help="Log file",
    )
    # NOTE(review): --verbose is accepted but not used anywhere in this script.
    arg_parser.add_argument("-v", "--verbose", metavar="int", type=int, default=0)

    args = arg_parser.parse_args()

    # Standard input is not ours to close; a named input file is closed as
    # soon as main() returns instead of being left open.
    if args.input_file == "-":
        input_context = contextlib.nullcontext(sys.stdin)
    else:
        input_context = open(args.input_file)

    with input_context as input_source:
        main(
            name=args.description,
            input_source=input_source,
            output_filename=args.output_file,
            log_filename=args.log,
            morpheme_delimiter=unescape_argument(args.morpheme_delimiter),
            end_of_morpheme_symbol=unescape_argument(args.end_of_morpheme_symbol),
            padding_symbol=unescape_argument(args.padding_symbol),
            # The blacklist character is passed through verbatim, without
            # escape processing.
            blacklist_char=args.blacklist_char,
        )