-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathspider.class.php
334 lines (323 loc) · 12.9 KB
/
spider.class.php
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
<?php
class Spider
{
// Search parameters and crawl depth. They are declared explicitly so that the
// constructor does not create them as dynamic properties (deprecated since PHP 8.2).
public $area;
public $keyword;
public $page;

/**
 * Stores the search area and keyword and reads the number of result pages
 * to crawl from the PAGE constant.
 *
 * @param mixed $area    Value that getIndex() places in the "jl" query parameter.
 * @param mixed $keyword Value that getIndex() places in the "kw" query parameter.
 */
public function __construct($area,$keyword)
{
    $this->area = $area;
    $this->keyword = $keyword;
    // PAGE is not defined in this class; it must be defined before instantiation.
    $this->page = PAGE;
}
/**
 * Scrapes the first page of the kuaidaili free proxy list.
 *
 * Every <tr> of the page becomes one entry holding the text of its <td>
 * cells in document order; a row without <td> cells yields an empty array.
 * NOTE(review): presumably cell 0 is the proxy IP and cell 1 the port
 * (that is how the disabled code in request() used them) - confirm against the live page.
 *
 * @return array List of rows, each a list of cell strings; empty when the
 *               page could not be fetched.
 */
public function getProxy()
{
    $url = 'http://www.kuaidaili.com/free/intr/1/';
    $content = $this->request($url, false);
    // request() hands back false when cURL fails; do not feed that to the parser.
    if (!is_string($content) || $content === '') {
        return array();
    }
    $doc = phpQuery::newDocumentHTML($content);
    $proxyArray = array();
    foreach (pq('tr', $doc) as $trOne) {
        $proxyOne = array();
        foreach (pq('td', $trOne) as $tdOne) {
            $proxyOne[] = pq($tdOne)->text();
        }
        $proxyArray[] = $proxyOne;
    }
    return $proxyArray;
}
/**
 * Performs a single HTTP request with cURL and returns the response body.
 *
 * The request advertises gzip/deflate support, sends a fixed desktop Chrome
 * user agent and uses "http://<host of $url>" as the referer.
 *
 * @param string            $url    Address to fetch.
 * @param bool              $https  When exactly true, certificate peer and host checks are switched off.
 * @param bool              $proxy  When exactly true, the request is sent through the hard-coded proxy below.
 * @param string            $method Only the exact value 'post' switches to POST; anything else keeps cURL's default method.
 * @param array|string|null $data   POST payload, used only when $method is 'post'.
 * @return string|false Response body, or false when the handle could not be created or the transfer failed.
 */
public function request($url, $https = true, $proxy = false, $method = 'get', $data = null)
{
    // 1. Create the handle.
    $ch = curl_init($url);
    if ($ch === false) {
        return false;
    }
    // 2. Configure it.
    // Return the body instead of printing it.
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
    // Accept compressed responses.
    curl_setopt($ch, CURLOPT_ENCODING, 'gzip,deflate');
    // Derive the referer from the URL. parse_url() yields no host for
    // malformed or relative URLs, so only set the header when one exists.
    $host = parse_url($url, PHP_URL_HOST);
    if (is_string($host) && $host !== '') {
        curl_setopt($ch, CURLOPT_REFERER, 'http://' . $host);
    }
    curl_setopt($ch, CURLOPT_USERAGENT, 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3218.0 Safari/537.36');
    // Optional proxy. The address is fixed; picking a random entry from
    // getProxy() was tried earlier and is currently disabled.
    if ($proxy === true) {
        curl_setopt($ch, CURLOPT_PROXY, '61.191.41.130');
        curl_setopt($ch, CURLOPT_PROXYPORT, 80);
    }
    // HTTPS support.
    if ($https === true) {
        // NOTE(review): this bypasses certificate verification entirely, which
        // allows man-in-the-middle tampering; kept because callers rely on it.
        curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, false);
        curl_setopt($ch, CURLOPT_SSL_VERIFYHOST, false);
    }
    // POST support.
    if ($method === 'post') {
        curl_setopt($ch, CURLOPT_POST, true);
        curl_setopt($ch, CURLOPT_POSTFIELDS, $data);
    }
    // 3. Send the request.
    $content = curl_exec($ch);
    // 4. Release the handle.
    curl_close($ch);
    return $content;
}
/**
 * Inserts one record into the curl.zhilian table.
 *
 * The array keys are used as column names and the values as single-quoted
 * string literals; the id column is always sent as NULL.
 *
 * NOTE(review): the statement is assembled by string concatenation, so keys
 * and values must already be safe to embed in SQL. getOneInfo() only escapes
 * the jobInfo field; a quote in any other value breaks (or injects into) the
 * statement. Moving to a prepared statement needs the caller-side
 * addslashes() removed at the same time.
 *
 * @param array  $data   Column name => value.
 * @param mysqli $mysqli Open connection; only its multi_query() method is used.
 * @return bool Result of multi_query(); false when $data is empty.
 */
public function add($data, $mysqli)
{
    // An empty record would produce the malformed "(id,) VALUES (null,'')".
    if (empty($data)) {
        return false;
    }
    // implode() takes the separator first; the legacy (array, separator)
    // order used previously throws a TypeError on PHP 8.
    $keys = 'id,' . implode(',', array_keys($data));
    $values = 'null,\'' . implode('\',\'', array_values($data)) . '\'';
    $sql = 'INSERT INTO curl.zhilian (' . $keys . ') VALUES (' . $values . ');';
    return $mysqli->multi_query($sql);
}
/**
 * Sends a POST request to lagou's position search endpoint and echoes the
 * raw response.
 *
 * Despite its name the method does not extract links and returns nothing;
 * it only prints whatever the server answered.
 *
 * @param array|string $data POST payload. Defaults to an empty body, which is
 *                           what the method effectively sent before the
 *                           parameter existed (the variable was undefined).
 * @return void
 */
public function getLagouIndex($data = '')
{
    $url = 'https://www.lagou.com/jobs/positionAjax.json?px=default&city=北京&needAddtionalResult=false&isSchoolJob=0';
    // 1. Create the handle.
    $ch = curl_init($url);
    // 2. Configure it.
    // Return the body instead of printing it directly.
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
    // Referer is derived from the URL's host.
    $host = parse_url($url, PHP_URL_HOST);
    curl_setopt($ch, CURLOPT_REFERER, 'http://' . $host);
    curl_setopt($ch, CURLOPT_USERAGENT, 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/33.0.1750.152 Safari/537.36');
    // HTTPS: certificate verification is bypassed.
    // NOTE(review): this allows man-in-the-middle tampering; kept as-is.
    curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, false);
    curl_setopt($ch, CURLOPT_SSL_VERIFYHOST, false);
    // POST with the supplied payload.
    curl_setopt($ch, CURLOPT_POST, true);
    curl_setopt($ch, CURLOPT_POSTFIELDS, $data);
    // 3. Send the request.
    $content = curl_exec($ch);
    // 4. Release the handle.
    curl_close($ch);
    echo $content;
}
/**
 * Collects the job detail links from the zhaopin search result pages.
 *
 * Pages 1..$this->page are fetched one after another. For each page the text
 * of the ".search_yx_tj" element is written to ./data/total.txt (each page
 * overwrites the previous value) and every anchor whose href contains both
 * "http://jobs.zhaopin.com/" and ".htm" is kept.
 *
 * NOTE(review): area and keyword are concatenated into the URL unencoded -
 * presumably callers pass URL-safe values; confirm.
 *
 * @return array One entry per result page, each a list of detail page URLs.
 */
public function getIndex()
{
    // Basket holding the links of every page.
    $hrefsArray = array();
    for ($i = 1; $i <= $this->page; $i++) {
        $url = 'http://sou.zhaopin.com/jobs/searchresult.ashx?jl=' . $this->area . '&kw=' . $this->keyword . '&sm=0&p=' . $i;
        $content = $this->request($url, false);
        $doc = phpQuery::newDocumentHTML($content);
        // Store the result counter shown on the page.
        $total = trim(pq($doc)->find('.search_yx_tj')->text());
        file_put_contents('./data/total.txt', $total);
        $hrefs = array();
        foreach (pq('a', $doc) as $one) {
            $href = $one->getAttribute('href');
            // Keep only links that point at a concrete job posting.
            if (strpos($href, 'http://jobs.zhaopin.com/') !== false && strpos($href, '.htm') !== false) {
                $hrefs[] = $href;
            }
        }
        $hrefsArray[] = $hrefs;
    }
    return $hrefsArray;
}
/**
 * Extraction rules for a single zhilian job posting page.
 *
 * @param array $array Must hold 'content' (the page HTML) and 'url' (copied
 *                     into the result unchanged).
 * @return array Keys in order: title, companyName, salary, location, time,
 *               jobType, experience, education, nums, jobCategory, jobInfo
 *               (passed through addslashes()), address, url.
 */
public function getOneInfo($array)
{
    $page = pq(phpQuery::newDocumentHTML($array['content']));

    $oneInfo = array(
        'title' => $page->find('h1:eq(0)')->text(),
        'companyName' => $page->find('h2:eq(0)')->text(),
    );

    // The summary list items are read by position: salary, work location,
    // publish time, job type, experience, minimum education, headcount,
    // job category.
    $listFields = array('salary', 'location', 'time', 'jobType', 'experience', 'education', 'nums', 'jobCategory');
    foreach ($listFields as $position => $field) {
        $oneInfo[$field] = $page->find('.terminal-ul li:eq(' . $position . ')')->text();
    }

    // Job description: take the first tab, filter out b/h2 and button
    // elements, reduce to text and strip the cut markers.
    $description = $page->find('.tab-inner-cont:eq(0)')->html();
    $description = pq($description)->not('b,h2')->html();
    $description = pq($description)->not('button')->html();
    $description = pq($description)->text();
    $description = str_replace(array('SWSStringCutStart', 'SWSStringCutEnd'), '', $description);
    $oneInfo['jobInfo'] = addslashes(trim($description));

    // Work address, without the "view map" link label.
    $streetAddress = $page->find('h2:eq(1)')->text();
    $oneInfo['address'] = trim(str_replace('查看职位地图', '', $streetAddress));

    $oneInfo['url'] = $array['url'];

    return $oneInfo;
}
/**
 * Fetches every job posting found by getIndex() one by one, reports progress
 * to the browser and stores the extracted records in ./data/info.json.
 *
 * After each posting a <script> snippet updating ".progress-bar" is echoed
 * and flushed. When the JSON file was written with a non-zero length, a
 * redirect snippet to show.php is echoed and the script exits.
 *
 * @return void
 */
public function getInfo()
{
    if (USEMYSQL == 'yes') {
        // NOTE(review): the connection is opened but nothing in this method
        // uses it yet - presumably records were meant to go through add(); confirm.
        $mysqli = new mysqli(HOST, USER, PASSWORD, DATABASE);
    }
    // Gather the links of all result pages.
    $hrefsArray = $this->getIndex();

    // Real number of postings; pages are not guaranteed to hold the same
    // amount of links, so it is summed instead of assumed.
    $total = 0;
    foreach ($hrefsArray as $links) {
        $total += count($links);
    }

    // Collected across ALL pages. Resetting this per page used to drop every
    // page but the last one from info.json.
    $pageInfo = array();
    $done = 0;
    foreach ($hrefsArray as $links) {
        foreach ($links as $link) {
            // Fetch the posting and run the extraction rules on it.
            $content = $this->request($link, false);
            $pageInfo[] = $this->getOneInfo(array('url' => $link, 'content' => $content));

            // Report progress as a percentage of all postings.
            $done++;
            $progress = round(($done / $total) * 100);
            echo '<script type="text/javascript">$(\'.progress-bar\').css(\'width\',\'' . $progress . '%\');</script>';
            // ob_flush() raises a notice when no output buffer is active.
            if (ob_get_level() > 0) {
                ob_flush();
            }
            flush();
        }
    }
    $resultLength = file_put_contents('./data/info.json', json_encode($pageInfo));
    if ($resultLength > 0) {
        echo "<script type=\"text/javascript\">self.location=\"http://localhost/curlpc/show.php\"</script>";
        exit();
    }
}
/**
 * Concurrent variant of the crawl: downloads the postings in one batch via
 * requestByMulti(), extracts their data and writes ./data/info.json.
 *
 * When the file was written with a non-zero length, a redirect snippet to
 * show.php is echoed and the script exits.
 *
 * NOTE(review): only the links of the first result page are processed -
 * looks like the other pages are not handled here yet; confirm intent.
 *
 * @return void
 */
public function getInfoByMulti()
{
    $pages = $this->getIndex();
    $firstPageLinks = $pages[0];

    // Response bodies keyed by the URL they were fetched from.
    $htmlByUrl = $this->requestByMulti($firstPageLinks);

    // Run the extraction rules over every downloaded page.
    $pageInfo = array();
    foreach ($htmlByUrl as $link => $html) {
        $pageInfo[] = $this->getOneInfo(array('url' => $link, 'content' => $html));
    }

    $written = file_put_contents('./data/info.json', json_encode($pageInfo));
    if (!($written > 0)) {
        return;
    }
    echo '<script type="text/javascript">self.location="http://localhost/curlpc/show.php"</script>';
    exit();
}
/**
 * Downloads several URLs concurrently with the cURL multi interface.
 *
 * Every transfer accepts gzip/deflate, sends a fixed desktop Chrome user
 * agent and uses "http://<host of the URL>" as the referer.
 *
 * @param array $urlArray URLs to fetch.
 * @return array Response body for each URL, keyed by the URL itself; a URL
 *               that appears more than once therefore yields a single entry.
 */
public function requestByMulti($urlArray)
{
    $mh = curl_multi_init();
    $handles = array();
    // Create one easy handle per URL and attach it to the multi handle.
    foreach ($urlArray as $key => $url) {
        $handles[$key] = curl_init($url);
        curl_setopt($handles[$key], CURLOPT_RETURNTRANSFER, 1);
        curl_setopt($handles[$key], CURLOPT_HEADER, 0);
        curl_setopt($handles[$key], CURLOPT_USERAGENT, 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3218.0 Safari/537.36');
        // Accept compressed responses.
        curl_setopt($handles[$key], CURLOPT_ENCODING, 'gzip,deflate');
        // Referer is derived from the URL; skipped when the URL has no host.
        $host = parse_url($url, PHP_URL_HOST);
        if (is_string($host) && $host !== '') {
            curl_setopt($handles[$key], CURLOPT_REFERER, 'http://' . $host);
        }
        curl_multi_add_handle($mh, $handles[$key]);
    }

    // Drive all transfers. Waiting in curl_multi_select() between rounds
    // keeps the loop from spinning at full CPU while sockets are idle.
    $active = 0;
    do {
        $status = curl_multi_exec($mh, $active);
        if ($active && $status === CURLM_OK) {
            if (curl_multi_select($mh, 1.0) === -1) {
                // select can fail immediately on some systems; back off briefly.
                usleep(1000);
            }
        }
    } while ($active && ($status === CURLM_OK || $status === CURLM_CALL_MULTI_PERFORM));

    // Collect the bodies and release every handle.
    $jobsHtmlArray = array();
    foreach ($urlArray as $key => $url) {
        $jobsHtmlArray[$url] = curl_multi_getcontent($handles[$key]);
        curl_multi_remove_handle($mh, $handles[$key]);
        curl_close($handles[$key]);
    }
    curl_multi_close($mh);
    return $jobsHtmlArray;
}
}