-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdata.py
164 lines (142 loc) · 5.15 KB
/
data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
from utils import Params
from openpyxl import load_workbook
import jieba
import numpy as np
def xlsx_read(file_dir=Params.example_input, sheet_name="男(799)", min_col=0, max_col=14, min_row=2, max_row=75):
    """Read part of a worksheet and return its cell values column by column.

    The workbook at ``file_dir`` is opened, the sheet ``sheet_name`` is
    selected and its dimension string (e.g. ``"A1:N75"``) is printed.

    All four bounds are 0-based and end-exclusive:

    * column index 0 is the first used column of the sheet, so
      ``range(min_col, max_col)`` selects offsets into the used column span;
    * row index 0 is worksheet row 1, so the default ``min_row=2`` skips the
      first two worksheet rows.

    ``max_row`` is clipped to the last used row of the sheet. ``max_col`` is
    not clipped; asking for more columns than the sheet has raises
    ``IndexError``.

    Returns:
        A list with one inner list per selected column, each holding the
        ``.value`` of the selected cells from top to bottom.
    """
    wb = load_workbook(file_dir)
    sheet = wb[sheet_name]
    print(sheet.calculate_dimension())

    # Take the bounds from the worksheet itself instead of slicing the
    # dimension string: the slicing assumed single-letter column names and
    # broke on sheets wider than column Z (e.g. "A1:AB75").
    columns = tuple(
        sheet.iter_cols(
            min_col=sheet.min_column,
            max_col=sheet.max_column,
            min_row=1,
            max_row=sheet.max_row,
        )
    )
    max_row = min(max_row, sheet.max_row)

    return [
        [columns[col][row].value for row in range(min_row, max_row)]
        for col in range(min_col, max_col)
    ]
def xlsx_read_v2(file_dir=Params.advices_regularization, sheet_name="yan_zheng_jiedu", min_col=0, max_col=1, min_row=0, max_row=50):
    """Read a range of a worksheet and return its cell values as one flat list.

    The workbook at ``file_dir`` is opened and the sheet ``sheet_name`` is
    walked with ``iter_rows`` over the given row/column range. ``max_row`` is
    clipped to the last used row of the sheet.

    NOTE(review): the bounds are passed to ``iter_rows`` unchanged, and
    openpyxl indices are 1-based. The default ``0`` for ``min_row`` and
    ``min_col`` appears to be treated as "unset" (i.e. start at 1); confirm
    for the installed openpyxl version.

    Returns:
        A list of cell values in row-major order (row by row, left to right).
    """
    wb = load_workbook(file_dir)
    sheet = wb[sheet_name]

    # Use the worksheet's own row bound rather than slicing the dimension
    # string, which only worked when the last column had a one-letter name.
    max_row = min(max_row, sheet.max_row)

    cell_rows = sheet.iter_rows(min_row=min_row, max_row=max_row, min_col=min_col, max_col=max_col)
    return [cell.value for row in cell_rows for cell in row]
def get_input(input_excel=Params.example_input, sheet_name="女(799)"):
    """Load an input sheet and return it column-wise plus as selected row tuples.

    The sheet is read with ``xlsx_read`` using that function's default
    bounds. Columns 0, 1, 2, 6, 7, 9, 12 and 13 of the result are then zipped
    together so that each tuple holds those eight fields of one row.

    Returns:
        ``(input_cols, input_rows)``: the raw list of columns, and the list
        of 8-tuples built from the selected columns.
    """
    input_cols = xlsx_read(file_dir=input_excel, sheet_name=sheet_name)

    # Example of one tuple in input_rows (the two strings read
    # "orthopedic disease" and "osteoarthritis"):
    # (1, '骨科疾病', '骨关节炎', 0.609499828501506, 8.868, 14.5497, 2, None)
    selected_columns = (0, 1, 2, 6, 7, 9, 12, 13)
    input_rows = list(zip(*[input_cols[index] for index in selected_columns]))
    return input_cols, input_rows
def get_dz_advices(file_dir=Params.advices_regularization):
    """Load the advice sentences provided by Mr. Deng, grouped per sheet.

    Every sheet of the workbook except the last one is read through
    ``xlsx_read_v2`` (first column, at most 50 rows). For each non-empty cell
    all spaces are removed and the text between the first and second ``"."``
    is kept. NOTE(review): this presumably strips a leading "1." style
    numbering; confirm against the workbook. Anything after a second ``"."``
    is discarded, and a cell without any ``"."`` raises ``IndexError``.

    Empty cells act as separators: for each run of consecutive non-empty
    cells, the length of the run is recorded.

    Args:
        file_dir: Path of the workbook to read.

    Returns:
        ``(advices, seg)`` where ``advices`` maps sheet name to the list of
        cleaned sentences and ``seg`` maps sheet name to the list of run
        lengths.
    """
    # Open the workbook the caller asked for. Previously the sheet names were
    # always taken from Params.advices_regularization, even when file_dir
    # pointed at a different file.
    wb = load_workbook(file_dir)
    sheet_names = wb.sheetnames[0:-1]

    advices = dict()
    seg = dict()
    for name in sheet_names:
        ad = xlsx_read_v2(file_dir=file_dir, sheet_name=name, min_col=0, max_col=1, min_row=0, max_row=50)
        # Append a trailing separator so the final run is flushed below.
        if ad[-1]:
            ad.append(None)

        temp_ad = []
        temp_seg = []
        cnt = 0
        for line in ad:
            if line:
                cnt += 1
                compact = line.replace(" ", "")
                temp_ad.append(compact.split(".")[1])
            elif cnt != 0:
                temp_seg.append(cnt)
                cnt = 0

        advices[name] = temp_ad
        seg[name] = temp_seg
    return advices, seg
def cal_similarity(sentence1="", sentence2=""):
    """Return the cosine similarity of two sentences over binary token vectors.

    Both sentences are segmented with ``jieba.cut`` (accurate mode). The word
    bag is the set of all distinct tokens from both sentences, and each
    sentence is encoded as a 0/1 vector marking which word-bag tokens it
    contains.

    Returns:
        A 4-tuple ``(cos_dis, [s1, s1_v], [s2, s2_v], word_bag)``:

        * ``cos_dis`` - cosine similarity of the two vectors; ``0.0`` when
          either sentence yields no tokens (the cosine is undefined there).
        * ``s1`` / ``s2`` - the tokens of each sentence joined by one space.
        * ``s1_v`` / ``s2_v`` - the 0/1 vectors, in the iteration order of
          ``word_bag``.
        * ``word_bag`` - the set of distinct tokens.
    """
    tokens1 = list(jieba.cut(sentence1, cut_all=False))
    tokens2 = list(jieba.cut(sentence2, cut_all=False))
    s1 = ' '.join(str(token) for token in tokens1)
    s2 = ' '.join(str(token) for token in tokens2)

    # Test membership against token sets. The previous `token in s1` check ran
    # against the joined string, i.e. it was a substring test, so a short
    # token matched inside longer words and "" matched everything.
    vocab1 = set(tokens1)
    vocab2 = set(tokens2)
    word_bag = vocab1 | vocab2
    s1_v = [1 if word in vocab1 else 0 for word in word_bag]
    s2_v = [1 if word in vocab2 else 0 for word in word_bag]

    fenzi = sum(a * b for a, b in zip(s1_v, s2_v))  # dot product
    fenmu1 = sum(a * a for a in s1_v)               # squared norm of s1_v
    fenmu2 = sum(b * b for b in s2_v)               # squared norm of s2_v

    if fenmu1 == 0 or fenmu2 == 0:
        # A sentence without tokens has a zero vector; avoid dividing by 0.
        cos_dis = 0.0
    else:
        cos_dis = fenzi / (np.sqrt(fenmu1) * np.sqrt(fenmu2))
    return cos_dis, [s1, s1_v], [s2, s2_v], word_bag
if __name__ == "__main__":
    # Script entry point: load all advice sentences, greedily group the
    # similar ones, and print each group.
    advices, seg = get_dz_advices(file_dir=Params.advices_regularization)
    sen_freq = []  # currently unused

    # Flatten the per-sheet sentence lists into one list.
    advices_list = [line for key in advices.keys() for line in advices[key]]

    # Group the sentences by their similarity to each other: take the first
    # pending sentence as the query, pull every remaining sentence whose
    # similarity to it is at least 0.5 into its group, and repeat with what
    # is left.
    classified_advices = []
    sorted_advices = sorted(advices_list)
    while sorted_advices:
        query = sorted_advices[0]
        group = [query]
        leftover = []
        for candidate in sorted_advices[1:]:
            cos_dis, s1, s2, word_bag = cal_similarity(query, candidate)
            if cos_dis >= 0.5:
                group.append(candidate)
            else:
                leftover.append(candidate)
        sorted_advices = leftover
        classified_advices.append(group)

    # Print every group, each preceded by blank lines.
    for group in classified_advices:
        print("\n")
        for line in group:
            print(line)