#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Extract tf-idf features in libsvm format from merged query/click-title data."""
import sys
from operator import itemgetter
import math

TOTAL_DOC = 77704  # total number of documents, used when computing idf

idf_dict = {}    # word -> idf value
index_dict = {}  # word -> 1-based feature index

# word2vec-based query expansion
query_expand_dict = {}  # query -> list of expansion terms
num_query_expand = 0    # number of queries that were actually expanded

def load_query_expand(query_expand_file):
    '''
    Load the query expansion file; each line is query(word_word)\texpansions(word_word).
    Underscore-joined terms are unpacked to space-separated form, except
    end-gram entries ending with '#', which are kept as-is.
    '''
    with open(query_expand_file) as f:
        for line in f:
            query, expand = line.rstrip().split('\t')
            expand_list = expand.split()
            if query[-1] != '#':
                query = query.replace('_', ' ')
            for i in range(len(expand_list)):
                if expand_list[i][-1] != '#':
                    expand_list[i] = expand_list[i].replace('_', ' ')
            query_expand_dict[query] = expand_list
    print 'Load query expansion complete.'
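
# A minimal usage sketch (file name and contents are hypothetical): given a
# line 'new_york\tnyc ny_city york#' in the expansion file,
#
#   load_query_expand('query_expand.txt')
#   # query_expand_dict == {'new york': ['nyc', 'ny city', 'york#']}
#
# 'new_york' and 'ny_city' have their underscores unpacked, while 'york#'
# keeps its form because of the trailing '#' end-gram marker.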

def load_df(df_file):
    '''
    Load the idf dict from a document-frequency file (word\tdf per line).
    Each word is also assigned a 1-based feature index in file order.
    '''
    index = 1
    with open(df_file) as f:
        for line in f:
            temp_list = line.rstrip().split('\t')
            # smoothed idf: log(N / (df + 1))
            idf_dict[temp_list[0]] = math.log(TOTAL_DOC / (float(temp_list[1]) + 1))
            index_dict[temp_list[0]] = index
            index += 1
    print 'Load DF complete.'
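
# Illustrative sketch (the file name and word are hypothetical): for a df
# file whose first line is 'apple\t100',
#
#   load_df('df.txt')
#   # idf_dict['apple'] == math.log(77704 / 101.0)  # ~= 6.65
#   # index_dict['apple'] == 1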

def cal_doc_tfidf(line, n):
    '''
    Compute the tf-idf vector of one document (a query plus its clicked titles).
    '''
    tf_dict = {}
    query, titles, class_str = line.rstrip().split('\t')
    # get the query tf dict
    query_tf_dict = cal_segment_tf(query, n)
    # expand the query: at most the top 5 expansions are used
    # (only queries with <= 4 words were expanded upstream)
    if query in query_expand_dict:
        global num_query_expand
        num_query_expand += 1
        expand_list = query_expand_dict[query][:5]
        for i in range(len(expand_list)):
            if expand_list[i] not in query_tf_dict:
                # decaying pseudo-tf for lower-ranked expansions: 0.9, 0.8, ...
                query_tf_dict[expand_list[i]] = 1 - float(i + 1) / 10
    for title_count in titles.split(','):
        if len(title_count) == 0:
            continue
        parts = title_count.split(':')
        title, count = parts[0], int(parts[1])
        # NOTE: the distribution of count is worth analyzing
        # try weighting each title by its click count
        weight = 1.0 + math.log(count)  # log of the click count
        # weight = count  # raw click count as the weight
        # weight = 1
        title_tf_dict = multiply_weight(cal_segment_tf(title, n), weight)
        for word, tf in title_tf_dict.items():
            if word in tf_dict:
                tf_dict[word] += tf
            else:
                tf_dict[word] = tf
        # boost the tf of query terms for every clicked title
        for word, _ in query_tf_dict.items():
            query_tf_dict[word] += count * 30
    for word, tf in query_tf_dict.items():
        if word in tf_dict:
            tf_dict[word] += tf
        else:
            tf_dict[word] = tf
    # compute tf-idf
    tfidf_dict = {}
    # find the max tf for normalization
    tf_max = 0
    for word, tf in tf_dict.items():
        if tf > tf_max:
            tf_max = tf
    for word, tf in tf_dict.items():
        if word in idf_dict:
            # augmented tf normalization: 0.2 + 0.8 * tf / tf_max
            tfidf_dict[index_dict[word]] = (0.2 + 0.8 * tf / tf_max) * idf_dict[word]
    return (query, class_str, sorted(tfidf_dict.iteritems(), key=itemgetter(0)))
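
# Sketch of the expected input/output (values are illustrative and assume
# load_df() has already populated idf_dict/index_dict):
#
#   q, cls, feats = cal_doc_tfidf('apple\tapple store:3,\t1', 2)
#   # q == 'apple', cls == '1', and feats is a list of (feature_index,
#   # tfidf) tuples sorted by index, ready for libsvm serialization.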

def multiply_weight(tf_dict, weight):
    '''Scale every tf value in tf_dict by weight, in place.'''
    for word, count in tf_dict.items():
        tf_dict[word] = weight * count
    return tf_dict

def cal_segment_tf(segment, n):
    '''
    Count 1..n-gram frequencies in a whitespace-tokenized segment, plus
    underscore-joined suffix end-grams marked with a trailing '#'.
    '''
    rs_dict = {}
    words = segment.split()
    if not words:  # guard against empty input
        return rs_dict
    for i in range(1, n + 1):
        for j in range(len(words) - i + 1):
            word = ' '.join(words[j : i + j])
            if word in rs_dict:
                rs_dict[word] += 1
            else:
                rs_dict[word] = 1
    # add suffix end-grams (up to the last 5 words), which worked best
    # From Zhouxing
    grams = []
    grams.append(words[-1] + '#')
    if len(words) > 1:
        grams.append(words[-2] + '_' + words[-1] + '#')
    if len(words) > 2:
        grams.append(words[-3] + '_' + grams[-1])
    if len(words) > 3:
        grams.append(words[-4] + '_' + grams[-1])
    if len(words) > 4:
        grams.append(words[-5] + '_' + grams[-1])
    for gram in grams:
        if gram in rs_dict:
            rs_dict[gram] += 1
        else:
            rs_dict[gram] = 1
    return rs_dict
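
# Example of the grams produced (a sketch; n=2, segment 'big red apple'):
#
#   cal_segment_tf('big red apple', 2)
#   # unigrams:  'big', 'red', 'apple'
#   # bigrams:   'big red', 'red apple'
#   # end-grams: 'apple#', 'red_apple#', 'big_red_apple#'
#   # each key maps to its count within the segment.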

def extract_feature(merged_file, df_file, train_feature, train_query, test_feature, test_query, n):
    '''
    Read the merged query/title file and write libsvm-style feature files,
    splitting into train and test by whether class_str contains '8'.
    '''
    load_df(df_file)
    f_train = open(train_feature, 'w')
    f_test = open(test_feature, 'w')
    f_test_query = open(test_query, 'w')
    f_train_query = open(train_query, 'w')
    with open(merged_file) as f:
        for line in f:
            query, class_str, tfidf_tuple_list = cal_doc_tfidf(line, int(n))
            feature_str = ' '.join(
                [str(index) + ':' + str(tfidf) for index, tfidf in tfidf_tuple_list])
            if '8' in class_str:
                f_test_query.write(query + '\n')
                f_test.write(class_str + '\t' + feature_str + '\n')
            else:
                f_train_query.write(query + '\n')
                f_train.write(class_str + '\t' + feature_str + '\n')
    f_train.close()
    f_test.close()
    f_train_query.close()
    f_test_query.close()
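
# Output format sketch: each written line is 'class_str<TAB>idx:tfidf ...',
# e.g. '3\t12:4.71 85:2.03', i.e. the standard libsvm/liblinear format with
# class_str as the label field (the indices here are illustrative). Lines
# whose class string contains '8' go to the test files; all others to train.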

def main():
    if len(sys.argv) != 8:
        print 'Usage: ./feature_extraction.py merged_file df_file train_feature train_query test_feature test_query n'
    else:
        load_query_expand('../Data/test_query_expand')
        extract_feature(sys.argv[1], sys.argv[2], sys.argv[3],
                        sys.argv[4], sys.argv[5], sys.argv[6], sys.argv[7])
        print 'Number of expanded queries: %d' % num_query_expand


if __name__ == '__main__':
    main()