-
Notifications
You must be signed in to change notification settings - Fork 2.1k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
* Create ip89.py www.89ip.cn 免费代理 * Update ip89.py update Class name * Create fatezero_proxylist.py 增加 http://proxylist.fatezero.org/ 代理 * Create ihuan.py i幻 代理 * update example usage2 * update requirements.txt * 优化 public crawlers * add proxy jiangxianli * tester 增加单个proxy测试方法 * reset setting Dockerfile docker-compose to default Co-authored-by: jy <[email protected]> Co-authored-by: 崔庆才丨静觅 <[email protected]>
- Loading branch information
1 parent
cf03d87
commit 4878bf5
Showing
16 changed files
with
231 additions
and
29 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,6 +1,7 @@ | ||
# Build image for the proxy pool service.
FROM python:3.6
WORKDIR /app
COPY . .
# Optional: switch to the Douban PyPI mirror when building inside China.
# RUN pip install -r requirements.txt -i https://pypi.douban.com/simple
# NOTE: the previous line ended with a bare `-i` (no index URL), which makes
# `docker build` fail; install from the default index instead.
RUN pip install -r requirements.txt
# Mount point for user-supplied private crawler modules.
VOLUME ["/app/proxypool/crawlers/private"]
CMD ["supervisord", "-c", "supervisord.conf"]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,95 @@ | ||
# -*- coding: UTF-8 -*- | ||
|
||
''' | ||
''' | ||
import requests | ||
import time | ||
import threading | ||
import urllib3 | ||
from fake_headers import Headers | ||
import uuid | ||
from geolite2 import geolite2 | ||
ips = [] | ||
|
||
# 爬数据的线程类 | ||
|
||
def getChinaIP(ip='127.0.0.1'):
    """Return True if *ip* geolocates to mainland China (ISO code 'CN').

    Looks the address up in the bundled GeoLite2 database.  Unknown or
    private addresses (for which reader.get() returns None -- including the
    default 127.0.0.1) now yield False instead of raising TypeError.
    """
    reader = geolite2.reader()
    try:
        ip_info = reader.get(ip)
    finally:
        # Always release the module-level reader, even if the lookup raises.
        geolite2.close()
    print(ip_info)
    if not ip_info:
        # Address not present in the database -- cannot be confirmed as CN.
        return False
    return ip_info.get('country', {}).get('iso_code') == 'CN'
|
||
|
||
|
||
class CrawlThread(threading.Thread):
    """Worker that fires one request at the global targetUrl through a proxy.

    The proxy is geo-validated first: if it does not resolve to China a
    ValueError is raised, which the caller (GetIpThread) catches and logs.
    """

    def __init__(self, proxyip):
        super(CrawlThread, self).__init__()
        # "host:port" string of the proxy to exercise.
        self.proxyip = proxyip

    def run(self):
        pure_ip_address = self.proxyip.split(':')[0]
        # Only proxies located in China are useful for the target site.
        if not getChinaIP(pure_ip_address):
            raise ValueError('不是有效IP')
        # Start timing the proxied request.
        start = time.time()
        # Suppress the warning that verify=False would otherwise emit.
        urllib3.disable_warnings()
        headers = Headers(headers=True).generate()
        headers['Referer'] = 'http://bb.cf08tp.cn/Home/index.php?m=Index&a=index&id=2676'
        headers['Pragma'] = 'no-cache'
        headers['Host'] = 'bb.cf08tp.cn'
        # NOTE(review): non-standard spelling; the canonical header is
        # 'X-Forwarded-For'.  Kept byte-identical in case the target site
        # reads exactly this key -- confirm before changing.
        headers['x-forward-for'] = pure_ip_address
        # Fresh pseudo session id per request so requests look independent.
        headers['Cookie'] = 'PHPSESSID={}'.format(
            ''.join(str(uuid.uuid1()).split('-')))
        print(headers)
        html = requests.get(headers=headers, url=targetUrl, proxies={
            "http": 'http://' + self.proxyip,
            "https": 'https://' + self.proxyip,
        }, verify=False, timeout=2).content.decode()
        # Stop timing.
        end = time.time()
        # time.time() differences are seconds; the original printed the value
        # labelled "毫秒" (milliseconds), which was wrong -- label fixed.
        # Thread.getName() is deprecated in favour of the .name attribute.
        print(threading.current_thread().name + "使用代理IP, 耗时 " + str(end - start) +
              "秒 " + self.proxyip + " 获取到如下HTML内容:\n" + html + "\n*************")
|
||
# 获取代理IP的线程类 | ||
|
||
|
||
class GetIpThread(threading.Thread):
    """Endlessly pulls proxy IPs from the global apiUrl and tries each one."""

    def __init__(self, fetchSecond):
        super(GetIpThread, self).__init__()
        # Pacing divisor: sleep len(ips)/fetchSecond seconds between refreshes.
        self.fetchSecond = fetchSecond

    def run(self):
        global ips
        while True:
            # Refresh the proxy list from the pool API.
            response_text = requests.get(apiUrl).content.decode()
            ips = response_text.split('\n')
            # Exercise every non-blank proxy in the list.
            for candidate in ips:
                if not candidate.strip():
                    continue
                try:
                    # run() (not start()) -- proxies are tried one at a time,
                    # synchronously, with a short pause between attempts.
                    CrawlThread(candidate).run()
                    time.sleep(1.5)
                except Exception as e:
                    print(e)
            # Back off before asking the API again.
            time.sleep(len(ips) / self.fetchSecond)
|
||
|
||
if __name__ == '__main__':
    # Pool API endpoint that hands out proxies (random one per request).
    # apiUrl = "http://127.0.0.1:5555/all"
    apiUrl = "http://127.0.0.1:5555/random"
    # Target page to request through each proxy.
    targetUrl = "http://bb.cf08tp.cn/Home/index.php?m=Index&a=vote&vid=335688&id=2676&tp="
    # targetUrl = 'http://bb.cf08tp.cn/Home/index.php?m=Index&a=vote&vid=335608&id=2676&tp='
    fetchSecond = 5
    # Kick off the fetch/test loop in its own thread.
    GetIpThread(fetchSecond).start()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,44 @@ | ||
from proxypool.schemas.proxy import Proxy | ||
from proxypool.crawlers.base import BaseCrawler | ||
import re | ||
from pyquery import PyQuery as pq | ||
import time | ||
BASE_URL = 'http://www.goubanjia.com/' | ||
|
||
|
||
class GoubanjiaCrawler(BaseCrawler):
    """
    ip Goubanjia crawler, http://www.goubanjia.com/
    """
    urls = [BASE_URL]

    def parse(self, html):
        """
        Parse the listing page and yield proxies.

        Goubanjia obfuscates each address by mixing decoy child elements
        (hidden with style="display: none") into every '.ip' cell, so the
        visible child texts are concatenated to recover "host:port".

        :param html: page HTML
        :return: generator of Proxy
        """
        doc = pq(html)('.ip').items()
        for td in doc:
            trs = td.children()
            ip_str = ''
            for tr in trs:
                attrib = tr.attrib
                # Skip decoy children hidden via display:none.
                if 'style' in attrib and 'none' in attrib['style']:
                    continue
                ip_str += tr.text or ''
            addr_split = ip_str.split(':')
            if len(addr_split) == 2:
                host, port = addr_split
                yield Proxy(host=host, port=port)
            else:
                # No ':' in the cell: the last visible child carries the port
                # and the host is everything before it.  Strip the port only
                # from the end of the string -- the original used
                # str.replace(), which also deleted matching digit runs
                # *inside* the host (e.g. port '80' inside '180.97.1.180'),
                # and crashed with TypeError when the last child had no text.
                port = trs[-1].text or ''
                if port and ip_str.endswith(port):
                    host = ip_str[:-len(port)]
                else:
                    host = ip_str
                yield Proxy(host=host, port=port)
|
||
|
||
if __name__ == '__main__':
    # Manual smoke test: crawl once and print every proxy found.
    goubanjia_crawler = GoubanjiaCrawler()
    for fetched_proxy in goubanjia_crawler.crawl():
        print(fetched_proxy)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,35 @@ | ||
from proxypool.schemas.proxy import Proxy | ||
from proxypool.crawlers.base import BaseCrawler | ||
import re | ||
import json | ||
BASE_URL = 'https://ip.jiangxianli.com/api/proxy_ips?page={page}' | ||
|
||
MAX_PAGE = 10 | ||
class JiangxianliCrawler(BaseCrawler):
    """
    jiangxianli crawler, https://ip.jiangxianli.com/
    """
    urls = [BASE_URL.format(page=page) for page in range(1, MAX_PAGE + 1)]

    def parse(self, html):
        """
        Parse one page of the JSON API and yield proxies.

        :param html: response body, a JSON document shaped like
            {"code": 0, "data": {"data": [{"ip": ..., "port": ...}, ...]}}
        :return: generator of Proxy
        """
        result = json.loads(html)
        # Non-zero code means the API rejected the request -- nothing to yield.
        if result['code'] != 0:
            return
        # NOTE(review): the original bound result['data']['last_page'] to a
        # local named MAX_PAGE here; it shadowed the module constant, was never
        # read, and could not affect pagination (urls is built at class
        # definition time), so the dead assignment was removed.
        for ip_address in result['data']['data']:
            if ip_address:
                yield Proxy(host=ip_address['ip'], port=ip_address['port'])
|
||
|
||
if __name__ == '__main__':
    # Manual smoke test: crawl once and print every proxy found.
    jiangxianli_crawler = JiangxianliCrawler()
    for fetched_proxy in jiangxianli_crawler.crawl():
        print(fetched_proxy)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,11 +1,13 @@ | ||
environs==9.3.0
Flask==1.1.2
attrs==20.3.0
retrying==1.3.3
aiohttp==3.7.4
requests==2.25.1
loguru==0.5.3
pyquery==1.4.3
supervisor==4.2.1
redis==3.5.3
lxml==4.6.2
fake_headers==1.0.2
maxminddb_geolite2==2018.703