EstablishDb.py
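"""EstablishDb.py

Crawl pages starting from https://www.csdn.net/ with a breadth-first search,
segment each page's title and body text with jieba, and build two MySQL tables:
`doc` (document id -> URL) and `word` (term -> space-separated list of document
ids, i.e. a simple inverted index). Finally, func() from DbOptimizer is called.
"""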
from collections import deque
import urllib.request
import re
from bs4 import BeautifulSoup
import pymysql
import jieba
from DbOptimizer import func
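# MySQL connection settings; 'xxx' is a placeholder, replace it with the real password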
config = {
    'host': '127.0.0.1',
    'port': 3306,
    'user': 'root',
    'password': 'xxx',
    'database': 'searchengine',
    'charset': 'utf8mb4',
    'cursorclass': pymysql.cursors.Cursor}
# Entry point
#url = 'https://blog.csdn.net/chen_holy/article/details/90181282'
url = 'https://www.csdn.net/'
q = deque()  # deque of links waiting to be crawled, for breadth-first search
visited = set()  # set of links that have already been visited
q.append(url)
conn = pymysql.connect(**config)
c = conn.cursor()
c.execute('drop table if exists doc')
c.execute('drop table if exists word')
# Drop the old tables before create table: they were already built during earlier test runs,
# so rerunning the script has to delete the old tables and recreate them
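# doc maps each document id to its URL; word is the inverted index, mapping a
# term to a space-separated string of the ids of the documents containing it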
c.execute('create table doc (id int primary key,link text)')
c.execute('create table word (term varchar(25) primary key,list text)')
conn.commit()
conn.close()
print('*************** Start! ***************************************************')
cnt = 0
# Loop while the queue is not empty
# Crawl at most 1000 URLs
while q and cnt <= 1000:
    url = q.popleft()
    visited.add(url)
    # Fetch the page content
    try:
        req = urllib.request.Request(url=url)
        res = urllib.request.urlopen(req)
        content = res.read().decode('utf-8')
    except Exception:
        continue
    # Look for the next crawlable links; since the search stays within the site,
    # the links must match a pattern, and that pattern depends on the specific site
    m = re.findall(r'<a href="http.*?"', content, re.I)
    for x in m:
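        # each match looks like '<a href="http..."'; strip the 9-character
        # prefix '<a href="' and the trailing quote to keep just the URL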
        x = x[9:-1]
        if (x not in visited) and (x not in q):
            q.append(x)
    # Parse the page content; several cases are possible, and this too is written
    # for the specific pages of this site
    soup = BeautifulSoup(content, 'html.parser')
    title = soup.title
    article = soup.find('h1')
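    # if the page has no <h1>, fall back to the first <p> as the article body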
    if article is None:
        article = soup.find('p')
    if title is None and article is None:
        print('Page with no content')
        continue
    elif title is not None and article is None:
        print('Title only')
        title = title.get_text("", strip=True)
        article = ''
    elif title is None and article is not None:
        print('Content only')
        title = ''
        article = article.get_text("", strip=True)
    else:
        print('Both title and content')
        title = title.get_text("", strip=True)
        article = article.get_text("", strip=True)
    cnt += 1
    print('Crawling link', cnt, ':', url)
    # The extracted page content is stored in the two strings title and article;
    # run Chinese word segmentation on both
    seggen = jieba.cut_for_search(title)
    seglist = list(seggen)
    seggen = jieba.cut_for_search(article)
    seglist += list(seggen)
    # Store the data
    conn = pymysql.connect(**config)
    c = conn.cursor()
    c.execute('insert into doc values(%s,%s)', (cnt, url))
    # Build a word-table entry for every segmented term
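    # each term's posting list is kept as a plain string of space-separated doc ids,
    # appended to on every new occurrence (no deduplication within a document)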
    for word in seglist:
        # print(word)
        # Check whether this term already exists in the database
        c.execute('select list from word where term=%s', (word,))
        result = c.fetchall()
        # If it does not exist yet
        if len(result) == 0:
            docliststr = str(cnt)
            c.execute('insert into word values(%s,%s)', (word, docliststr))
        # If it already exists
        else:
            docliststr = result[0][0]  # the stored string
            docliststr += ' ' + str(cnt)
            c.execute('update word set list=%s where term=%s', (docliststr, word))
    conn.commit()
    conn.close()
print('Word table built =======================================================')
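# func() comes from DbOptimizer; presumably it post-processes the tables that were just built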
func()