-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathRTF_to_Dataframe.py
63 lines (56 loc) · 2.31 KB
/
RTF_to_Dataframe.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
import warnings
warnings.filterwarnings("ignore")

import os
import pickle
import string
import sys
from datetime import datetime
from os import listdir

import numpy as np
import pandas as pd
from striprtf.striprtf import rtf_to_text
from tqdm import tqdm
# Paragraphs that yield fewer non-empty text lines than this are ignored.
MIN_ARTICLE_LINES = 5
# Number of lines at the end of a paragraph that are left out of the body.
# NOTE(review): presumably trailing document metadata -- confirm with the data.
TRAILING_LINES = 2


def find_line(lines, markers, start=0):
    """Return the index of the first line containing any of *markers*.

    The search begins at index *start*. Returns -1 when no line matches,
    so callers can skip malformed input instead of running off the end
    of the list.
    """
    for idx in range(start, len(lines)):
        if any(marker in lines[idx] for marker in markers):
            return idx
    return -1


def end_sentence(line):
    """Return *line* with a '.' appended when it ends in an ASCII letter."""
    if line and line[-1] in string.ascii_letters:
        return line + '.'
    return line


def parse_article(lines):
    """Extract one article record from the non-empty text lines of a paragraph.

    The layout this parser looks for, relative to the first line that
    contains '字' (the total word count line):

    * the line before it is the title, unless that line contains 'By'
      (author line), in which case the title is one line further up;
    * the line after it is the date, whose space-separated fields 0, 2 and 4
      are zero-padded to two characters and concatenated;
    * the line after the date is the source;
    * the first line from the source line onward that contains '(c)' or
      'Copyright' is followed by the body, which runs up to (but not
      including) the last ``TRAILING_LINES`` lines.

    Returns a dict with keys 'title', 'date', 'source' and 'text', or
    ``None`` when the paragraph does not follow this layout.
    """
    # Total word count line.
    wc = find_line(lines, ('字',))
    if wc < 0 or wc + 2 >= len(lines):
        return None

    # Author line (optional) sits between the title and the word count.
    if wc == 0:
        # Nothing precedes the word count line; do not wrap around to the
        # last line of the paragraph.
        title = ''
    elif wc >= 2 and 'By' in lines[wc - 1]:
        title = lines[wc - 2]
    else:
        title = lines[wc - 1]

    date_fields = lines[wc + 1].split(' ')
    if len(date_fields) < 5:
        return None
    date = ''.join(date_fields[k].zfill(2) for k in (0, 2, 4))

    source = lines[wc + 2]

    copyright_idx = find_line(lines, ('(c)', 'Copyright'), start=wc + 2)
    if copyright_idx < 0 or copyright_idx + 1 >= len(lines):
        return None

    # The first body line is always kept; the remaining ones stop short of
    # the trailing lines.
    body = [lines[copyright_idx + 1]]
    body.extend(lines[copyright_idx + 2:len(lines) - TRAILING_LINES])
    text = ' '.join(end_sentence(line) for line in body)

    return {'title': title, 'date': date, 'source': source, 'text': text}


def collect_news(directories):
    """Parse every file in *directories* and return ``(articles, skipped)``.

    Each file is read line by line; lines starting with '\\par' are
    converted from RTF to text and handed to :func:`parse_article`.
    *skipped* counts paragraphs that were long enough to be considered but
    did not match the expected layout.
    """
    articles = []
    skipped = 0
    for directory in directories:
        # Sorted so that the generated ids do not depend on listing order.
        for name in tqdm(sorted(listdir(directory))):
            path = os.path.join(directory, name)
            if not os.path.isfile(path):
                continue
            with open(path, 'r') as handle:
                raw_lines = handle.readlines()
            for raw in raw_lines:
                if not raw.startswith('\\par'):
                    continue
                lines = [part for part in rtf_to_text(raw).split('\n') if part]
                if len(lines) < MIN_ARTICLE_LINES:
                    continue
                article = parse_article(lines)
                if article is None:
                    skipped += 1
                else:
                    articles.append(article)
    return articles, skipped


def main(argv):
    """Convert the RTF files in ``argv[1]`` into a pickled DataFrame at ``argv[2]``.

    The DataFrame has the columns 'id', 'title', 'date', 'source' and
    'document_body'.
    """
    if len(argv) < 3:
        sys.exit('usage: {} <input_directory> <output_pickle>'.format(argv[0]))

    # argv[1] is a single directory path; iterating over the string itself
    # would walk its characters.
    all_news, skipped = collect_news([argv[1]])
    if skipped:
        print('Skipped {} paragraph(s) with an unexpected layout'.format(skipped),
              file=sys.stderr)

    news = pd.DataFrame({
        'id': range(len(all_news)),
        'title': [x['title'] for x in all_news],
        'date': [x['date'] for x in all_news],
        'source': [x['source'] for x in all_news],
        'document_body': [x['text'] for x in all_news],
    })
    with open(argv[2], 'wb') as f:
        pickle.dump(news, f)


if __name__ == '__main__':
    main(sys.argv)