forked from yao8839836/kg-bert
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpreprocessing.py
157 lines (136 loc) · 5.35 KB
/
preprocessing.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
# entities list for umls, YAGO3-10, FB15k-237, WN18RR, WN11, FB13
with open('data/WN18RR/train.tsv', 'r') as f, open('data/WN18RR/test.tsv', 'r') as f1, open('data/WN18RR/dev.tsv', 'r') as f2, open('data/WN18RR/entities.txt', 'w') as f3:
lines = f.readlines() + f1.readlines() + f2.readlines()
entities = set()
for line in lines:
line = line.strip()
temp = line.split('\t')
entities.add(temp[0])
entities.add(temp[2])
entities_str = '\n'.join(list(entities))
f3.write(entities_str)
# relations list for FB15k-237, umls, WN18RR and YAGO3-10, WN11, FB13
with open('data/WN18RR/train.tsv', 'r') as f, open('data/WN18RR/test.tsv', 'r') as f1, open('data/WN18RR/dev.tsv', 'r') as f2, open('data/WN18RR/relations.txt', 'w') as f3:
relations = set()
lines = f.readlines() + f1.readlines() + f2.readlines()
for line in lines:
line = line.strip()
temp = line.split('\t')
relations.add(temp[1])
relations_str = '\n'.join(relations)
f3.write(relations_str)
# entities and relation list for WN18 and FB15K
with open('data/FB15K/entity2id.txt', 'r') as f, open('data/FB15K/relation2id.txt', 'r') as f1:
lines = f.readlines()
ent_id_dict = {}
entities = []
for line in lines:
line = line.strip()
temp = line.split()
if len(temp) == 2:
ent_id_dict[temp[1]] = temp[0]
entities.append(temp[0])
lines = f1.readlines()
rel_id_dict = {}
relations = []
for line in lines:
line = line.strip()
temp = line.split()
if len(temp) == 2:
rel_id_dict[temp[1]] = temp[0]
relations.append(temp[0])
with open('data/FB15K/entities.txt', 'w') as f, open('data/FB15K/relations.txt', 'w') as f1:
f.write('\n'.join(entities))
f1.write('\n'.join(relations))
# train.tsv, dev.tsv and test.tsv for WN18 and FB15K
with open('data/FB15K/valid2id.txt', 'r') as f, open('data/FB15K/dev.tsv', 'w') as f1:
lines = f.readlines()
text_lines = []
for line in lines:
line = line.strip()
temp = line.split()
if len(temp) == 3:
head_entity = ent_id_dict[temp[0]]
tail_entity = ent_id_dict[temp[1]]
relation = rel_id_dict[temp[2]]
text_lines.append(head_entity + '\t' + relation + '\t'+ tail_entity)
text_lines_str = '\n'.join(text_lines)
f1.write(text_lines_str)
# entity to text for WN18RR
with open('data/WN18RR/wordnet-mlj12-definitions.txt', 'r') as f, open('data/WN18RR/entity2text.txt', 'w') as f1:
lines = f.readlines()
ent2texts = []
count = 0
for line in lines:
line = line.strip()
temp = line.split('\t')
if len(temp) == 3:
if temp[1].find('_NN_') != -1 or temp[1].find('_JJ_') != -1 or temp[1].find('_VB_') != -1 or temp[1].find('_RB_') != -1:
wordnet_str = temp[1][2:]
num_start = wordnet_str.rfind('_')
pos_start = wordnet_str[:num_start].rfind('_')
name = wordnet_str[:pos_start]
name = name.replace('_', ' ')
ent2texts.append(temp[0] + '\t' + name + ', ' + temp[2])
count += 1
print(count)
f1.write('\n'.join(ent2texts))
# entity to text for FB, names
with open('data/FB15k-237/FB15k_mid2name.txt', 'r') as f, open('data/FB15k-237/entity2text.txt', 'w') as f1:
lines = f.readlines()
ent2texts = []
for line in lines:
line = line.strip()
temp = line.split('\t')
temp[1] = temp[1].replace('_', ' ')
ent2texts.append(temp[0] + '\t' + temp[1])
f1.write('\n'.join(ent2texts))
# entity to text for UMLS and YAGO3-10
with open('data/YAGO3-10/entities.txt', 'r') as f, open('data/YAGO3-10/entity2text.txt', 'w') as f1:
lines = f.readlines()
ent2texts = []
for line in lines:
line = line.strip()
ent2texts.append(line + '\t' + line.replace('_', ' '))
f1.write('\n'.join(ent2texts))
# relations to texts for Freebase, umls, WordNet
with open('data/WN18/relations.txt', 'r') as f, open('data/WN18/relation2text.txt', 'w') as f1:
lines = f.readlines()
relation_texts = []
for line in lines:
line = line.strip()
text = line.replace('/' , ' ')
text = text.replace('_', ' ').strip()
#print(line, text)
relation_texts.append(line + '\t' + text)
f1.write('\n'.join(relation_texts))
# entity to text for FB, descriptions
with open('data/FB15K/FB15k_mid2description.txt', 'r') as f, open('data/FB15K/entity2textlong.txt', 'w') as f1:
lines = f.readlines()
ent2texts = []
for line in lines:
line = line.strip()
temp = line.split('\t')
temp[1] = temp[1][1:-4]
temp[1] = temp[1].replace('_', ' ')
ent2texts.append(temp[0] + '\t' + temp[1])
f1.write('\n'.join(ent2texts))
# entity text length
max_len = 0
min_len = 10000
avg_len = 0
with open('data/WN18RR/entity2text.txt', 'r') as f1:
lines = f1.readlines()
for line in lines:
temp = line.strip().split("\t")
words = temp[1].split(" ")
length = len(words)
print(length)
if length < min_len:
min_len = length
if length > max_len:
max_len = length
avg_len += length
print("max:", max_len)
print("min:", min_len)
print("avg:", avg_len * 1.0 / len(lines))