FastText.py
# -*- coding: utf-8 -*-
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Embedding
from keras.layers import GlobalAveragePooling1D
from keras.callbacks import EarlyStopping
import numpy as np
import logging
import pandas as pd
# Set parameters:
# ngram_range = 2 will add bi-gram features
# ngram_range = 2
# max_features = len(words)
# maxlen = 30
# batch_size = 32
# embedding_dims = 64
# epochs = 100

def getdata_train(path, ngram_range, maxlen, max_token, embedding_dims, batch_size, epochs, logpath, modelpath, modelname):
    print("FastText n-gram, maxlen=" + str(maxlen))
    # Load the data
    print('Loading data...')
    # path = './data/nlpmaildatasample2.csv'
    d = pd.read_csv(path, header=None)
    d.columns = ['title', 'label']
    # drop=True: do not keep the old index as a column
    d = d[~pd.isnull(d["title"])].reset_index(drop=True)
    # Build the vocabulary from the whitespace-tokenized titles
    all_data = set()
    for line in d["title"]:
        ws = line.split(" ")
        for w in ws:
            all_data.add(w)
    words = list(all_data)
    word_to_id = dict(zip(words, range(len(words))))
    # Map each title to a sequence of word ids
    dx = []
    for line in d["title"]:
        ws = line.split(" ")
        dx.append([word_to_id[w] for w in ws if w in word_to_id])
    # dy = list(d['label'])
    dy = d['label']

    def create_ngram_set(input_list, ngram_value=2):
        """
        Extract a set of n-grams from a list of integers.
        >>> create_ngram_set([1, 4, 9, 4, 1, 4], ngram_value=2)
        {(4, 9), (4, 1), (1, 4), (9, 4)}
        >>> create_ngram_set([1, 4, 9, 4, 1, 4], ngram_value=3)
        {(1, 4, 9), (4, 9, 4), (9, 4, 1), (4, 1, 4)}
        """
        return set(zip(*[input_list[i:] for i in range(ngram_value)]))

    def add_ngram(sequences, token_indice, ngram_range=2):
        """
        Augment the input list of lists (sequences) by appending n-gram values.
        Example: adding bi-grams
        >>> sequences = [[1, 3, 4, 5], [1, 3, 7, 9, 2]]
        >>> token_indice = {(1, 3): 1337, (9, 2): 42, (4, 5): 2017}
        >>> add_ngram(sequences, token_indice, ngram_range=2)
        [[1, 3, 4, 5, 1337, 2017], [1, 3, 7, 9, 2, 1337, 42]]
        Example: adding tri-grams
        >>> sequences = [[1, 3, 4, 5], [1, 3, 7, 9, 2]]
        >>> token_indice = {(1, 3): 1337, (9, 2): 42, (4, 5): 2017, (7, 9, 2): 2018}
        >>> add_ngram(sequences, token_indice, ngram_range=3)
        [[1, 3, 4, 5, 1337], [1, 3, 7, 9, 2, 1337, 2018]]
        """
        new_sequences = []
        for input_list in sequences:
            new_list = input_list[:]
            for i in range(len(new_list) - ngram_range + 1):
                for ngram_value in range(2, ngram_range + 1):
                    ngram = tuple(new_list[i:i + ngram_value])
                    if ngram in token_indice:
                        new_list.append(token_indice[ngram])
            new_sequences.append(new_list)
        return new_sequences

    # Split the data 3:2 into train and test sets
    inx = int(len(dx) / 5 * 3)
    x_train, y_train, x_test, y_test = dx[0:inx], dy[0:inx], dx[inx:len(dx)], dy[inx:len(dx)]
    print(len(x_train), 'train sequences')
    print(len(x_test), 'test sequences')
    print('Average train sequence length: {}'.format(np.mean(list(map(len, x_train)), dtype=int)))
    print('Average test sequence length: {}'.format(np.mean(list(map(len, x_test)), dtype=int)))

    # Default when no n-gram features are added: word ids run up to max_token
    max_features = max_token + 1
    if ngram_range > 1:
        print('Adding {}-gram features'.format(ngram_range))
        # Create a set of unique n-grams from the training set.
        ngram_set = set()
        for input_list in x_train:
            for i in range(2, ngram_range + 1):
                set_of_ngram = create_ngram_set(input_list, ngram_value=i)
                ngram_set.update(set_of_ngram)
        # Dictionary mapping each n-gram token to a unique integer.
        # Integer values start above max_token in order to avoid
        # collision with the existing word ids.
        start_index = max_token + 1
        token_indice = {v: k + start_index for k, v in enumerate(ngram_set)}
        indice_token = {token_indice[k]: k for k in token_indice}
        # max_features is the highest integer that could be found in the dataset.
        max_features = np.max(list(indice_token.keys())) + 1
        # Augment x_train and x_test with n-gram features
        x_train = add_ngram(x_train, token_indice, ngram_range)
        x_test = add_ngram(x_test, token_indice, ngram_range)
        print('Average train sequence length: {}'.format(np.mean(list(map(len, x_train)), dtype=int)))
        print('Average test sequence length: {}'.format(np.mean(list(map(len, x_test)), dtype=int)))

    print('Pad sequences (samples x time)')
    x_train = sequence.pad_sequences(x_train, maxlen=maxlen)
    x_test = sequence.pad_sequences(x_test, maxlen=maxlen)
    print('x_train shape:', x_train.shape)
    print('x_test shape:', x_test.shape)

    print('Build model...')
    model = Sequential()
    # We start off with an efficient embedding layer which maps
    # our vocab indices into embedding_dims dimensions
    model.add(Embedding(max_features,
                        embedding_dims,
                        input_length=maxlen))
    # We add a GlobalAveragePooling1D, which averages the embeddings
    # of all words in the document
    model.add(GlobalAveragePooling1D())
    # We project onto a single-unit output layer and squash it with a sigmoid:
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])
    # patience: stop training once val_loss has not improved for this many epochs
    early_stopping = EarlyStopping(monitor='val_loss', patience=2)
    # model.fit(X, y, validation_split=0.2, callbacks=[early_stopping])
    hist = model.fit(x_train, y_train,
                     batch_size=batch_size,
                     epochs=epochs,
                     validation_data=(x_test, y_test), callbacks=[early_stopping])
    # print(hist.history)
    # Write the per-epoch loss and accuracy to the log file
    log_format = "%(asctime)s - %(message)s"
    logging.basicConfig(filename=logpath, level=logging.DEBUG, format=log_format)
    logging.warning(modelname)
    # Note: on Keras >= 2.3 the history keys are "accuracy"/"val_accuracy" instead of "acc"/"val_acc"
    for i in range(len(hist.history["acc"])):
        strlog = str(i + 1) + " Epoch " + "-loss: " + str(hist.history["loss"][i]) + " -acc: " + str(hist.history["acc"][i]) + " -val_loss: " + str(hist.history["val_loss"][i]) + " -val_acc: " + str(hist.history["val_acc"][i])
        logging.warning(strlog)
    model.save(modelpath + modelname + '.h5')
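

# A minimal usage sketch, not part of the original script. The input path is
# the sample path mentioned in the comments above; the parameter values come
# from the reference settings at the top of the file. max_token, logpath,
# modelpath and modelname are hypothetical placeholders: max_token should be
# the largest word id in the data (here assumed to be len(words) - 1).
if __name__ == '__main__':
    getdata_train(path='./data/nlpmaildatasample2.csv',  # CSV of "title,label" rows, no header
                  ngram_range=2,        # 2 adds bi-gram features
                  maxlen=30,
                  max_token=20000,      # assumption: largest word id in the corpus
                  embedding_dims=64,
                  batch_size=32,
                  epochs=100,
                  logpath='./log/fasttext.log',  # hypothetical log file
                  modelpath='./model/',          # hypothetical output directory
                  modelname='fasttext')          # saved as ./model/fasttext.h5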