forked from lybroman/Chinese-sentiment-analysis-with-Doc2Vec
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathwords_segment.py
38 lines (30 loc) · 1.39 KB
/
words_segment.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
__author__ = "[email protected]"
# Word segmentation for Chinese sentiment-analysis docs.
# English docs could simply use doc.split(' '); Chinese text has no word
# boundaries, so the third-party package jieba is used to segment it.
import jieba, os, traceback

# Characters stripped before segmentation: whitespace plus common
# Chinese/ASCII punctuation. Built once — loop-invariant, so hoisted out
# of the per-file loop (the original rebuilt both on every file).
_FILTER_CHARS = "\r\n\t,。;!,.:;:、“”‘’"
# str.translate with a {codepoint: ''} map deletes all of them in one C-level pass.
_TRANS_TABLE = dict.fromkeys((ord(ch) for ch in _FILTER_CHARS), '')

label = ['train', 'test']
for tag in label:
    # Review scores run from 1 to 5 inclusive.
    for score in range(1, 6):
        # One output file per (score, split); one segmented doc per line.
        with open(f'{score}_{tag}.txt', 'w', encoding='utf-8') as out:
            for file in os.listdir(f'./{score}_{tag}'):
                try:
                    # NOTE(review): gb18030 is a superset of gb2312 and would
                    # decode more samples — confirm the corpus encoding.
                    with open(f'./{score}_{tag}/{file}', 'r', encoding='gb2312') as src:
                        # Docs are short, so reading the whole file at once is fine.
                        text = src.read()
                except (UnicodeDecodeError, OSError):
                    # Skip samples that fail to decode or read. The original
                    # bare `except: pass` also hid bugs in segmentation and
                    # output writing; narrowed so real errors surface.
                    # print(traceback.format_exc())
                    continue
                # Strip punctuation, then write the space-joined segmentation.
                cleaned = text.translate(_TRANS_TABLE)
                words = jieba.cut(cleaned, cut_all=False)
                out.write(' '.join(words) + '\n')