-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathchaizi.py
84 lines (73 loc) · 2.91 KB
/
chaizi.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# author: JMXGODLZZ
# datetime: 2022/5/3 5:07 PM
# ide: PyCharm
# -*- coding: utf-8 -*-
# some code ref: https://github.com/howl-anderson/hanzi_chaizi
# Modified by @author: Yiwen Jiang @Winning Health Group
import copy
import pickle
import logging
import pkg_resources
class HanziChaizi(object):
    """Look up and recursively expand character decompositions ("chaizi").

    The decomposition table is unpickled from the ``data/chaizi.pkl``
    resource located relative to this module.  ``form_feature`` treats each
    table value as a list of candidate decompositions, where every candidate
    is itself a list of component characters.
    """

    # Expansion stops once more than this many components were accumulated.
    _MAX_ACCUMULATED = 200
    # Maximum number of components returned by ``form_feature``.
    _MAX_FEATURE_LEN = 80

    def __init__(self):
        """Load the decomposition table and set the token to strip."""
        data_file = pkg_resources.resource_filename(__name__, "./data/chaizi.pkl")
        # NOTE(security): pickle.load can execute arbitrary code.  This is
        # acceptable only because the file is a resource shipped with the
        # package; never point this at untrusted data.
        # The ``with`` statement closes the file; no explicit close() needed.
        with open(data_file, 'rb') as fd:
            self.data = pickle.load(fd)
        # Component that is always dropped from a decomposition.
        self.special_token = '離'

    def query(self, input_char, default=None):
        """Return the table entry for ``input_char``.

        Args:
            input_char: Key to look up in the decomposition table.
            default: Value returned when ``input_char`` is not in the table.

        Returns:
            The stored entry (the very object held in ``self.data``, not a
            copy), or ``default`` when the key is missing.
        """
        return self.data.get(input_char, default)

    def _flatten(self, li):
        """Return a flat list with every nested ``list`` in ``li`` expanded.

        Only ``list`` instances are expanded; any other element (including
        strings and tuples) is kept as a single item.
        """
        flat = []
        for item in li:
            if isinstance(item, list):
                flat.extend(self._flatten(item))
            else:
                flat.append(item)
        return flat

    def _decompose(self, char):
        """Return the components of ``char`` for one expansion step.

        The first candidate decomposition is preferred.  If it contains
        ``char`` itself, the first later candidate that does not contain
        ``char`` is used instead when one exists.  Occurrences of ``char``
        and of ``self.special_token`` are then dropped.

        A new list is always returned so the shared table in ``self.data``
        is never modified.  A character with no entry (or an empty entry)
        is returned unchanged as ``[char]``.
        """
        candidates = self.query(char)
        if not candidates:
            return [char]
        chosen = candidates[0]
        if char in chosen:
            # Prefer an alternative that is not self-referential.
            chosen = next(
                (alt for alt in candidates[1:] if char not in alt),
                chosen,
            )
        return [
            part for part in chosen
            if part != char and part != self.special_token
        ]

    def form_feature(self, input_char):
        """Build the component feature list for ``input_char``.

        The character is expanded level by level: every element of the
        current level is replaced by its decomposition, and each level that
        differs from the previous one is appended to the result.  Expansion
        stops when a level no longer changes or once more than
        ``_MAX_ACCUMULATED`` components have been collected (which also
        bounds cyclic decompositions).

        Args:
            input_char: The character to decompose.

        Returns:
            A list with at most ``_MAX_FEATURE_LEN`` components; empty when
            ``input_char`` cannot be decomposed.
        """
        feature = []
        current = [input_char]
        while True:
            expanded = self._flatten([self._decompose(ch) for ch in current])
            if expanded == current:
                break
            feature += expanded
            current = expanded
            if len(feature) > self._MAX_ACCUMULATED:
                break
        return feature[:self._MAX_FEATURE_LEN]
def init_logger():
    """Configure the root logger at INFO level with a timestamped format."""
    message_layout = '%(asctime)s - %(levelname)s - %(name)s - %(message)s'
    timestamp_layout = '%m/%d/%Y %H:%M:%S'
    logging.basicConfig(
        level=logging.INFO,
        datefmt=timestamp_layout,
        format=message_layout,
    )
# er = HanziChaizi()
# rs = er.form_feature('李')
# print(rs)
# tokens = ['我', '是', '李', '白']
# chaizi = [list(er.form_feature(token)) for token in tokens]
# for idx_c, char in enumerate(chaizi):
# for idx_p, part in enumerate(chaizi[idx_c]):
# chaizi[idx_c][idx_p] = part
# print(chaizi)