forked from oshino29/ngaArchive
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathnga.py
176 lines (149 loc) · 7.07 KB
/
nga.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
# -*- coding: UTF-8 -*-
import re
import requests
import os
import sys
import time
from contextlib import closing
import json
import nga_format
# ============= Edit these values first
# HTTP headers passed to the request issued by single().
headers = {
    'User-agent': 'Nga_Official/80023'}
# Cookies passed to the request issued by single(). main() prints a reminder
# to edit them when either value starts with '_'.
# NOTE(review): these look like real account credentials committed to the
# source file -- confirm, and consider loading them from outside the code.
cookies = {
    'ngaPassportUid': '60246374',
    'ngaPassportCid': 'X91nnit70dsvtb64gma19jgilb3s3rgcoe6fuvli',
}
# Floors collected by single(); each entry is a list:
# [0] int floor number, [1] int pid, [2] str post time, [3] str user name,
# [4] str content, [5] int score
totalfloor = []
# Thread id; main() overwrites it with the value typed by the user.
tid = 0
# Thread title; single() overwrites it with the 'subject' of the fetched thread.
title = 'title'
# Resume position; holder() overwrites both from ./<tid>/max.txt when that file exists.
localmaxpage = 1
localmaxfloor = -1
# (Used in single().) Some floors carry comments, and a comment's content hangs
# on the floor being commented on. Comments are therefore parked here first;
# when a later floor turns out to be a comment floor (it has no 'content'), its
# data is read back from this list.
# Each entry: int pid, str post time, str user name, str content, int score
commentreply = []
# Error text threaded through nga_format.format() by makefile() and written to
# info.txt by holder().
errortext = ''
def single(page):
    """Fetch one page of thread ``tid`` and collect its floors into ``totalfloor``.

    The page is requested from ``https://bbs.nga.cn/read.php`` with
    ``lite=js``; the ``__U`` (users), ``__R`` (replies) and ``__T`` (thread)
    sections are cut out of the response text and parsed as JSON.  Comments
    attached to a floor are parked in the module-level ``commentreply`` list
    and turned into floors once the matching content-less floor shows up.
    The module-level ``title`` is set to the thread subject.

    Args:
        page: Page number to request.

    Returns:
        ``False`` when the response contains the "服务器忙" (server busy)
        marker.  Otherwise ``True`` while the thread's reply count is greater
        than the floor number of the last collected floor and ``totalfloor``
        is not just the single main floor (floor 0); ``False`` when not.

    Raises:
        ValueError: If one of the expected sections is missing from the response.
        IndexError: If no floor has been collected at all.
    """
    global title
    print('trypage%d' % page)
    params = (
        ('tid', tid),
        ('page', page),
        ('lite', 'js')
    )
    # The session is used as a context manager so its connections are
    # released instead of being left open after every page.
    with requests.Session() as session:
        response = session.get('https://bbs.nga.cn/read.php', headers=headers,
                               params=params, cookies=cookies)
        response.encoding = 'GBK'
        # Filtered out to keep the JSON parsing below from failing.
        # NOTE(review): the removed character shows up as a single whitespace
        # character here -- confirm whether a tab was intended.
        content = response.text.replace(' ', '')
    if "服务器忙" in content:
        print("服务器忙")
        return False

    # User names are processed once here; nga_format.format() later takes
    # care of user-name references inside the post bodies.
    usertext = nga_format.anony(
        _extract_section(content, r',"__U":(.+?),"__R":'))
    userdict = json.loads(usertext, strict=False)  # users involved on this page
    replydict = json.loads(
        _extract_section(content, r',"__R":(.+?),"__T":'), strict=False)  # the reply floors
    tdict = json.loads(
        _extract_section(content, r',"__T":(.+?),"__F":'), strict=False)  # thread-level data
    title = tdict['subject']

    for i in range(len(replydict)):
        floor = replydict[str(i)]
        last_comment_key = ''
        if 'comment' in floor:
            # Comments hang on this floor: park them first, the floor itself
            # is handled right below.
            comments = floor['comment']
            for last_comment_key in comments:
                comment = comments[last_comment_key]
                commentreply.append([
                    int(comment['pid']),
                    comment['postdate'],
                    userdict[str(comment['authorid'])]['username'],
                    '[评论] ' + str(comment['content']),
                    int(comment['score']),
                ])
        if 'content' in floor:  # regular floor
            commentnumtxt = ''
            if last_comment_key != '':
                # The last comment key plus one is used as the comment count.
                commentnumtxt = '[评论数:' + \
                    str(int(last_comment_key) + 1) + ']\n\n'
            totalfloor.append([
                int(floor['lou']),
                int(floor['pid']),
                floor['postdate'],
                userdict[str(floor['authorid'])]['username'],
                commentnumtxt + str(floor['content']),
                int(floor['score']),
            ])
        else:  # comment floor: it has no content of its own
            for parked in commentreply:
                if parked[0] == int(floor['pid']):
                    totalfloor.append([
                        int(floor['lou']), int(floor['pid']),
                        parked[1], parked[2], parked[3], parked[4]])
                    commentreply.remove(parked)
                    # Stop right away: the list was just modified, so
                    # continuing to iterate it would skip entries.
                    break

    # Keep going while the reply count is ahead of the last collected floor
    # AND this is not the "main floor only" case.
    only_main_floor = len(totalfloor) == 1 and totalfloor[0][0] == 0
    return int(tdict['replies']) > totalfloor[-1][0] and not only_main_floor


def _extract_section(content, pattern):
    """Return group 1 of ``pattern`` in ``content``; raise ValueError when absent."""
    match = re.search(pattern, content, flags=re.S)
    if match is None:
        raise ValueError('unexpected response: %s not found' % pattern)
    return match.group(1)
def makefile():
    """Append every newly collected floor to ``./<tid>/post.md``.

    Floors in ``totalfloor`` whose floor number is not greater than
    ``localmaxfloor`` are skipped.  Each remaining floor is passed through
    ``nga_format.format``; the module-level ``errortext`` is replaced by the
    value that call returns.  For every pid the formatter reports back, the
    already formatted text of that pid is appended as a quote, or a
    placeholder when that pid was not formatted during this run.

    Returns:
        The floor number of the last floor written, or 0 when nothing was
        written.

    Raises:
        IndexError: If ``totalfloor`` is empty.
    """
    global errortext
    lastfloor = 0
    total = totalfloor[-1][0]
    # Added for https://github.com/ludoux/ngapost2md/issues/12: keeps the
    # formatted plain text of every floor as a pid -> formatted text mapping.
    formattedfloor = {}
    with open(('./%d/post.md' % tid), 'a', encoding='utf-8') as f:
        for onefloor in totalfloor:
            if localmaxfloor >= int(onefloor[0]):
                continue
            if onefloor[0] == 0:
                f.write(
                    '### %s\n\n(c) ludoux [GitHub Repo](https://github.com/ludoux/ngapost2md)\n\n' % title)
            # The backslashes are escaped explicitly: '\<' is an invalid
            # escape sequence; the text written to the file is unchanged.
            f.write('----\n##### <span id="pid%d">%d.[%d] \\<pid:%d\\> %s by %s</span>\n' %
                    (onefloor[1], onefloor[0], onefloor[5], onefloor[1], onefloor[2], onefloor[3]))
            rt = nga_format.format(
                str(onefloor[4]), tid, onefloor[0], total, errortext)  # formats one floor
            raw = rt[0]
            errortext = rt[1]
            appendpid = rt[2]
            formattedfloor[onefloor[1]] = raw
            for it in appendpid:
                if it in formattedfloor:
                    quoted = str(formattedfloor[it]).replace('\n', '\n> ')
                else:
                    # This pid is not part of what was fetched in this run
                    # (for example it was already written to the file earlier).
                    quoted = '此 pid 未在本次联网获取中拿到,请全新下载本帖子。'
                raw = raw + '\n\n\n--appendpid:' + \
                    str(it) + '--\n>' + quoted + '\n\n--end--\n'
            f.write(('%s\n\n' % raw))
            lastfloor = int(onefloor[0])
    return lastfloor
def main():
    """Interactive entry point: ask for a thread id and run ``holder()``.

    Prints a reminder when a cookie value is empty or still starts with '_',
    stores the typed thread id in the module-level ``tid``, and reports any
    exception raised while reading the id or downloading as ``Oops! <error>``.
    Waits for a key press before returning.
    """
    global tid
    # An empty value is checked first so indexing [0] cannot raise IndexError.
    unconfigured = [
        key for key in ('ngaPassportUid', 'ngaPassportCid')
        if not cookies[key] or cookies[key][0] == '_'
    ]
    if unconfigured:
        print('Please edit cookies info in the code file first...')
    try:
        # Parsed inside the try block so a non-numeric id is reported the
        # same way as any other failure instead of ending in a traceback.
        tid = int(input('tid:'))
        holder()
    except Exception as e:
        print('Oops! %s' % e)
    input('press to exit.')
def holder():
    """Download thread ``tid`` page by page and write the result files.

    Creates ``./<tid>`` when it does not exist; otherwise, when
    ``./<tid>/max.txt`` exists, loads ``localmaxpage`` and ``localmaxfloor``
    from it.  Calls ``single()`` for consecutive pages starting at
    ``localmaxpage`` until it returns ``False``, then calls ``makefile()``,
    rewrites ``max.txt`` and appends one log line to ``info.txt`` (writing a
    header first when that file is new).
    """
    global localmaxpage
    global localmaxfloor
    global errortext
    print(tid)
    thread_dir = './%d' % tid
    max_path = './%d/max.txt' % tid
    info_path = './%d/info.txt' % tid

    if not os.path.exists(thread_dir):
        os.mkdir(thread_dir)
    elif os.path.exists(max_path):
        with open(max_path, 'r', encoding='utf-8') as f:
            fields = f.read().split()
        # max.txt holds "<page> <floor>" as written at the end of a run.
        localmaxpage = int(fields[0])
        localmaxfloor = int(fields[1])
        print('localmaxpage%d\nlocalmaxfloor%d' % (localmaxpage, localmaxfloor))

    cpage = localmaxpage
    while True:
        # Explicit comparison against False is kept on purpose.
        if single(cpage) == False:
            break
        time.sleep(0.1)
        cpage += 1

    lastfloor = makefile()
    with open(max_path, 'w', encoding='utf-8') as f:
        f.write('%d %s' % (cpage, totalfloor[-1][0]))

    # Append to an existing info.txt; a new one gets the header first.
    info_exists = os.path.exists(info_path)
    with open(info_path, 'a' if info_exists else 'w', encoding='utf-8') as f:
        if not info_exists:
            f.write(
                'tid:%d\ntitle:%s\n(c) ludoux https://github.com/ludoux/ngapost2md\n==========\n' % (tid, title))
        stamp = time.asctime(time.localtime(time.time()))
        f.write('[%s]%d Err:%s\n' % (stamp, len(totalfloor), errortext))
    print('makeuntil:%d' % lastfloor)
# Start the interactive downloader only when this file is run as a script.
if __name__ == '__main__':
    main()