forked from digitalprk/dump_biyak
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdump.py
157 lines (125 loc) · 5.58 KB
/
dump.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
from utils import convert_euckp_to_unicode
from collections import Counter, defaultdict
import sqlite3
import os
def Byte4ToInt(arg1, arg2):
    """Decode four consecutive items of *arg1* as a little-endian integer.

    Args:
        arg1: Indexable sequence of integers (for example a ``bytes`` object).
        arg2: Index of the least-significant byte inside *arg1*.

    Returns:
        The value ``arg1[arg2] | arg1[arg2 + 1] << 8 | arg1[arg2 + 2] << 16
        | arg1[arg2 + 3] << 24``, where every item is first masked to its low
        eight bits.
    """
    value = 0
    # Walk from the most significant byte down to the least significant one.
    for position in (3, 2, 1, 0):
        value |= (arg1[arg2 + position] & 255) << (8 * position)
    return value
def Byte2ToInt(arg1, arg2):
    """Combine two byte values into one 16-bit integer.

    Args:
        arg1: Integer whose low eight bits become the low byte of the result.
        arg2: Integer whose low eight bits become the high byte of the result.

    Returns:
        ``(arg2 & 0xFF) << 8 | (arg1 & 0xFF)``.
    """
    high_byte = arg2 & 0xFF
    low_byte = arg1 & 0xFF
    return (high_byte << 8) | low_byte
class Dumper():
    """Reader for the binary dictionary file.

    The file is opened in binary mode on construction and stays open until
    :meth:`close` is called. The class can also be used as a context manager
    (``with Dumper(path) as d: ...``), which closes the file on exit even
    when an exception is raised.

    All multi-byte integers in the file are read as unsigned little-endian.
    """

    # Maps a language id to the header offset that stores the address of that
    # language's word index. The script in this file uses id 0 for the English
    # list and id 6 for the Korean list; the meaning of id 4 is not visible
    # here (presumably Chinese -- TODO confirm).
    _LANG_INDEX = {0: 0x4, 4: 0x10, 6: 0x24}

    def __init__(self, dicfile):
        """Open *dicfile* (a path) for binary reading."""
        self.dicfile = open(dicfile, 'rb')

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()
        return False

    def _read_uint(self, size):
        """Read *size* bytes at the current position as an unsigned integer.

        Raises:
            ValueError: If fewer than *size* bytes are available, i.e. the
                file is truncated or an offset points past its end.
        """
        raw = self.dicfile.read(size)
        if len(raw) != size:
            raise ValueError(
                "Unexpected end of file: wanted %d byte(s), got %d"
                % (size, len(raw)))
        return int.from_bytes(raw, "little", signed=False)

    def dump_word_list_as_bytes(self, language_id = 0):
        """Return the word list of one language as raw, undecoded bytes.

        Args:
            language_id: One of the ids known to the file header (0, 4 or 6).

        Returns:
            A dict mapping each record's integer word index to the word's
            bytes. If two records carry the same index, the later one wins.

        Raises:
            ValueError: If *language_id* is unknown (the file is closed
                before raising, as the original implementation did), or if
                the file is truncated or a record is malformed.
        """
        if language_id not in self._LANG_INDEX:
            self.dicfile.close()
            raise ValueError("Invalid Language ID")

        wordlist = {}
        self.dicfile.seek(self._LANG_INDEX[language_id])
        main_index = self._read_uint(4)

        # The first entry of the offset table, divided by four, is used as
        # the number of 4-byte entries in the table.
        self.dicfile.seek(main_index)
        max_size = self._read_uint(4) >> 2

        for i in range(max_size):
            self.dicfile.seek(i * 4 + main_index)
            cur_word_offset = self._read_uint(4)
            cur_word_address = main_index + cur_word_offset
            self.dicfile.seek(cur_word_address)

            # Record layout: 1-byte length, then "<word>\x00<4-byte index>".
            word_length = self._read_uint(1)
            record = self.dicfile.read(word_length)
            word, separator, raw_index = record.partition(b'\x00')
            if not separator or len(raw_index) < 4:
                raise ValueError(
                    "Malformed word record at offset %d" % cur_word_address)
            word_index = int.from_bytes(raw_index[:4], "little", signed=False)
            wordlist[word_index] = word
        return wordlist

    def dump_encoded_content(self):
        """Return every content record of the file, still encoded.

        Returns:
            A list of ``(szEnglish, szChinese, szKorean, szField)`` tuples of
            bytes, in the order the records appear in the offset table.

        Raises:
            ValueError: If the file is truncated or a record does not consist
                of exactly four NUL-terminated fields.
        """
        wordlist = []
        self.dicfile.seek(0)
        header_size = self._read_uint(4)

        # As for the word lists, the first table entry divided by four is
        # used as the number of records.
        self.dicfile.seek(header_size)
        index_size = self._read_uint(4)
        index_size_per_individual_language = index_size >> 2

        for i in range(index_size_per_individual_language):
            self.dicfile.seek(header_size + i * 4)
            current_word_address = self._read_uint(4)
            self.dicfile.seek(current_word_address + header_size)

            # Record layout: 2-byte length, then four NUL-terminated fields.
            buffer_length = self._read_uint(2)
            encoded_content = self.dicfile.read(buffer_length)
            # The trailing NUL yields a fifth, empty piece that is discarded.
            szEnglish, szChinese, szKorean, szField, _ = encoded_content.split(b'\x00')
            wordlist.append((szEnglish, szChinese, szKorean, szField))
        return wordlist

    def close(self):
        """Close the underlying file; calling it more than once is harmless."""
        self.dicfile.close()
def create_substitution_dictionary(wordlist, content):
    """Derive an encoded-to-plain substitution table from aligned samples.

    Each entry of *wordlist* is compared with ``content[index]``. Entries
    whose two byte strings differ in length are ignored. The remaining pairs
    are walked in parallel: a plain byte below 0x80 is paired with the single
    item at the same position, anything else is paired as a two-byte slice.

    Args:
        wordlist: Mapping of record index to the plain byte string.
        content: Sequence indexable by those record indices, holding the
            encoded byte string of each record.

    Returns:
        A dict mapping every encoded unit to the plain unit it was paired
        with most often (the earliest-seen unit wins a tie). When the inputs
        are ``bytes`` objects, single-byte units appear as ``int`` keys and
        values, two-byte units as ``bytes`` slices.
    """
    candidates = defaultdict(list)
    for index, plain in wordlist.items():
        encoded = content[index]
        if len(plain) != len(encoded):
            continue
        position = 0
        total = len(plain)
        while position < total:
            if plain[position] >= 0x80:
                # High bit set: treat this and the next byte as one unit.
                stop = position + 2
                candidates[encoded[position:stop]].append(plain[position:stop])
                position = stop
            else:
                candidates[encoded[position]].append(plain[position])
                position += 1

    substitutions = {}
    for encoded_unit, observed in candidates.items():
        tally = Counter(observed)
        substitutions[encoded_unit] = max(observed, key=tally.get)
    return substitutions
def translate_by_substitution(word, substitution_dictionary):
    """Translate an encoded byte string with a substitution table.

    The input is consumed two bytes at a time and each pair is replaced by
    its entry in *substitution_dictionary*. Two sequences are handled before
    the table is consulted: the four bytes ``d4 e9 83 d4`` and the pair
    ``"$``. A pair missing from the table is reported on standard output and
    rendered as ``??``.

    The output is built by appending, so a replacement whose length differs
    from the consumed input can no longer shift the buffer and be partly
    overwritten by the next replacement.

    Args:
        word: The encoded ``bytes`` to translate.
        substitution_dictionary: Mapping of encoded byte slices to their
            replacement bytes.

    Returns:
        A ``bytearray`` with the concatenated replacements.
    """
    res = bytearray()
    i = 0
    while i < len(word):
        # Special case for the Korean word "및", which does not appear in the
        # word list and therefore has no table entry. The replacement is
        # presumably its plain encoding padded with spaces -- unverified.
        if word[i:i + 4] == b'\xd4\xe9\x83\xd4':
            res += b' \xB7\xF1 '
            i += 4
            continue

        pair = word[i:i + 2]
        if pair == b'"$':  # another special case unaccounted for by the table
            res += b'TV'
        else:
            try:
                res += substitution_dictionary[pair]
            except KeyError:
                print("Error: Sequence %s not found in substitution dictionary." % pair)
                res += b'??'
        i += 2
    return res
def create_keek_list_of_tuples(korean_wordlist, english_wordlist, field_content):
    """Build the combined Korean-English and English-Korean entry list.

    Only indices present in both word lists produce entries. Every definition
    is followed by a blank line and the matching item of *field_content* in
    parentheses.

    Args:
        korean_wordlist: Mapping of record index to the raw Korean word.
        english_wordlist: Mapping of record index to the raw English word.
        field_content: Sequence indexable by record index, holding the text
            appended to each definition.

    Returns:
        A list of ``(word, definition)`` tuples: all Korean-headword entries
        first, followed by all English-headword entries.
    """
    def to_text(raw):
        # Convert via the project helper, then decode its UTF-16 output.
        return convert_euckp_to_unicode(raw).decode('utf-16')

    korean_first = []
    english_first = []
    for index in korean_wordlist:
        if index not in english_wordlist:
            continue
        korean_text = to_text(korean_wordlist[index])
        english_text = to_text(english_wordlist[index])
        field_note = '\n\n(%s)' % field_content[index]
        english_first.append((english_text, korean_text + field_note))
        korean_first.append((korean_text, english_text + field_note))
    return korean_first + english_first
if __name__ == "__main__":
    # Read everything needed from the dictionary file, then release the file
    # handle even if one of the dump steps fails.
    d = Dumper("eckdata.dic")
    try:
        english_wordlist = d.dump_word_list_as_bytes(language_id=0)
        korean_wordlist = d.dump_word_list_as_bytes(language_id=6)
        full_content = d.dump_encoded_content()
    finally:
        d.close()

    _, _, korean_content, field_content = zip(*full_content)

    # Learn the byte substitution from the Korean words, whose plain form is
    # known from the word list, and apply it to the field column.
    subs = create_substitution_dictionary(korean_wordlist, korean_content)
    substituted_field_content = [translate_by_substitution(field, subs)
                                 for field in field_content]
    decoded_field_content = [convert_euckp_to_unicode(field).decode('utf-16')
                             for field in substituted_field_content]
    final_data = create_keek_list_of_tuples(korean_wordlist,
                                            english_wordlist,
                                            decoded_field_content
                                            )
    final_data = sorted(final_data, key=lambda x: x[0])

    # Create database (any previous output file is replaced).
    DB_NAME = "biyak.db"
    if os.path.exists(DB_NAME):
        os.remove(DB_NAME)
    conn = sqlite3.connect(DB_NAME)
    try:
        conn.execute("CREATE TABLE name (dicname text)")
        # Bind the value as a parameter instead of formatting it into the SQL.
        conn.execute("INSERT INTO name VALUES (?)", ("Biyak Technical Dictionary",))
        conn.commit()
        conn.execute("CREATE TABLE dictionary (word text, definition text)")
        conn.executemany("INSERT INTO dictionary VALUES (?, ?)", final_data)
        conn.commit()
    finally:
        conn.close()