Add several proxies and optimize some code (#108)
* Create ip89.py

www.89ip.cn free proxies

* Update ip89.py

update class name

* Create fatezero_proxylist.py

Add proxies from http://proxylist.fatezero.org/

* Create ihuan.py

i幻 (ip.ihuan.me) proxies

* update example usage2

* update requirements.txt

* Optimize public crawlers

* add jiangxianli proxy

* tester: add a method to test a single proxy

* reset setting, Dockerfile, docker-compose to default

Co-authored-by: jy <[email protected]>
Co-authored-by: 崔庆才丨静觅 <[email protected]>
3 people authored Mar 7, 2021
1 parent cf03d87 commit 4878bf5
Showing 16 changed files with 231 additions and 29 deletions.
3 changes: 2 additions & 1 deletion Dockerfile
@@ -1,6 +1,7 @@
FROM python:3.6
WORKDIR /app
COPY . .
RUN pip install -r requirements.txt
# RUN pip install -r requirements.txt -i https://pypi.douban.com/simple
RUN pip install -r requirements.txt
VOLUME ["/app/proxypool/crawlers/private"]
CMD ["supervisord", "-c", "supervisord.conf"]
2 changes: 1 addition & 1 deletion docker-compose.yml
@@ -6,7 +6,7 @@ services:
    command: redis-server
    ports:
      - "6379:6379"
    # restart: always
    # restart: always
  proxypool:
    build: .
    image: 'germey/proxypool'
95 changes: 95 additions & 0 deletions examples/usage2.py
@@ -0,0 +1,95 @@
# -*- coding: UTF-8 -*-

'''
Fetch proxy IPs from the proxy pool API and use each one to request the target URL.
'''
import requests
import time
import threading
import urllib3
from fake_headers import Headers
import uuid
from geolite2 import geolite2
ips = []


# check whether an IP is located in China
def getChinaIP(ip='127.0.0.1'):
    reader = geolite2.reader()
    ip_info = reader.get(ip)
    geolite2.close()
    print(ip_info)
    # reader.get() returns None for IPs it cannot resolve
    if not ip_info:
        return False
    return True if ip_info['country']['iso_code'] == 'CN' else False


# thread class that crawls data through a proxy
class CrawlThread(threading.Thread):
    def __init__(self, proxyip):
        super(CrawlThread, self).__init__()
        self.proxyip = proxyip

    def run(self):
        pure_ip_address = self.proxyip.split(':')[0]
        # verify where the IP is located
        if not getChinaIP(pure_ip_address):
            # pass
            raise ValueError('Not a valid (China) IP')
        # start timing
        start = time.time()
        # silence the warning about disabled certificate verification
        urllib3.disable_warnings()
        headers = Headers(headers=True).generate()
        headers['Referer'] = 'http://bb.cf08tp.cn/Home/index.php?m=Index&a=index&id=2676'
        headers['Pragma'] = 'no-cache'
        headers['Host'] = 'bb.cf08tp.cn'
        headers['x-forward-for'] = pure_ip_address
        headers['Cookie'] = 'PHPSESSID={}'.format(
            ''.join(str(uuid.uuid1()).split('-')))
        print(headers)
        html = requests.get(headers=headers, url=targetUrl, proxies={
            "http": 'http://' + self.proxyip,
            "https": 'https://' + self.proxyip}, verify=False, timeout=2).content.decode()
        # stop timing
        end = time.time()
        # print the result
        print(threading.current_thread().getName() + " used proxy IP, took " + str(end - start) +
              " seconds, " + self.proxyip + " fetched the following HTML content:\n" + html + "\n*************")


# thread class that fetches proxy IPs from the pool
class GetIpThread(threading.Thread):
    def __init__(self, fetchSecond):
        super(GetIpThread, self).__init__()
        self.fetchSecond = fetchSecond

    def run(self):
        global ips
        while True:
            # fetch the IP list
            res = requests.get(apiUrl).content.decode()
            # split the response into IPs by newline
            ips = res.split('\n')
            # use every IP
            for proxyip in ips:
                if proxyip.strip():
                    # start a thread
                    # CrawlThread(proxyip).start()
                    try:
                        CrawlThread(proxyip).run()
                        time.sleep(1.5)
                    except Exception as e:
                        print(e)
            # sleep before the next fetch
            time.sleep(len(ips) / self.fetchSecond)


if __name__ == '__main__':
    # API endpoint that returns proxy IPs
    # apiUrl = "http://127.0.0.1:5555/all"
    apiUrl = "http://127.0.0.1:5555/random"
    # target site to request
    targetUrl = "http://bb.cf08tp.cn/Home/index.php?m=Index&a=vote&vid=335688&id=2676&tp="
    # targetUrl = 'http://bb.cf08tp.cn/Home/index.php?m=Index&a=vote&vid=335608&id=2676&tp='
    fetchSecond = 5
    # start fetching IPs automatically
    GetIpThread(fetchSecond).start()
9 changes: 6 additions & 3 deletions proxypool/crawlers/base.py
@@ -2,17 +2,19 @@
import requests
from loguru import logger
from proxypool.setting import GET_TIMEOUT


from fake_headers import Headers
import time
class BaseCrawler(object):
    urls = []

    @retry(stop_max_attempt_number=3, retry_on_result=lambda x: x is None, wait_fixed=2000)
    def fetch(self, url, **kwargs):
        try:
            headers = Headers(headers=True).generate()
            kwargs.setdefault('timeout', GET_TIMEOUT)
            kwargs.setdefault('verify', False)
            response = requests.get(url, **kwargs)
            kwargs.setdefault('headers', headers)
            response = requests.get(url ,**kwargs)
            if response.status_code == 200:
                response.encoding = 'utf-8'
                return response.text
@@ -27,6 +29,7 @@ def crawl(self):
        for url in self.urls:
            logger.info(f'fetching {url}')
            html = self.fetch(url)
            time.sleep(.5)
            for proxy in self.parse(html):
                logger.info(f'fetched proxy {proxy.string()} from {url}')
                yield proxy
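For context on how the public crawlers below plug into this base class: a concrete crawler only defines urls and a parse generator that yields Proxy objects, while BaseCrawler.crawl handles fetching, retrying, throttling, and logging. A minimal sketch under that assumption (the plain host:port source URL here is a made-up placeholder, not a real site):

from proxypool.schemas.proxy import Proxy
from proxypool.crawlers.base import BaseCrawler

# hypothetical endpoint that returns one "host:port" pair per line
EXAMPLE_URL = 'http://example.com/proxies.txt'


class ExampleCrawler(BaseCrawler):
    """
    minimal example crawler built on BaseCrawler
    """
    urls = [EXAMPLE_URL]

    def parse(self, html):
        for line in html.split('\n'):
            line = line.strip()
            if line:
                host, port = line.split(':')
                yield Proxy(host=host, port=port)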
2 changes: 1 addition & 1 deletion proxypool/crawlers/public/daili66.py
@@ -4,7 +4,7 @@


BASE_URL = 'http://www.66ip.cn/{page}.html'
MAX_PAGE = 5
MAX_PAGE = 50


class Daili66Crawler(BaseCrawler):
5 changes: 2 additions & 3 deletions proxypool/crawlers/public/fatezero_proxylist.py
@@ -19,13 +19,12 @@ def parse(self, html):

        hosts_ports = html.split('\n')
        for addr in hosts_ports:
            ip_address = json.loads(addr)
            if(True):
            if(addr):
                ip_address = json.loads(addr)
                host = ip_address['host']
                port = ip_address['port']
                yield Proxy(host=host, port=port)


if __name__ == '__main__':
    crawler = FatezeroCrawler()
    for proxy in crawler.crawl():
44 changes: 44 additions & 0 deletions proxypool/crawlers/public/goubanjia.py
@@ -0,0 +1,44 @@
from proxypool.schemas.proxy import Proxy
from proxypool.crawlers.base import BaseCrawler
import re
from pyquery import PyQuery as pq
import time
BASE_URL = 'http://www.goubanjia.com/'


class GoubanjiaCrawler(BaseCrawler):
    """
    ip Goubanjia crawler, http://www.goubanjia.com/
    """
    urls = [BASE_URL]

    def parse(self, html):
        """
        parse html file to get proxies
        :return:
        """
        doc = pq(html)('.ip').items()
        # ''.join([*filter(lambda x: x != '',re.compile('\>([\d:\.]*)\<').findall(td.html()))])
        for td in doc:
            trs = td.children()
            ip_str = ''
            for tr in trs:
                attrib = tr.attrib
                # skip elements hidden with display:none
                if 'style' in attrib and 'none' in tr.attrib['style']:
                    continue
                ip_str += '' if not tr.text else tr.text
            addr_split = ip_str.split(':')
            if(len(addr_split) == 2):
                host = addr_split[0]
                port = addr_split[1]
                yield Proxy(host=host, port=port)
            else:
                port = trs[-1].text
                host = ip_str.replace(port, '')
                yield Proxy(host=host, port=port)


if __name__ == '__main__':
    crawler = GoubanjiaCrawler()
    for proxy in crawler.crawl():
        print(proxy)
5 changes: 3 additions & 2 deletions proxypool/crawlers/public/ihuan.py
@@ -10,8 +10,9 @@ class IhuanCrawler(BaseCrawler):
    """
    ip ihuan crawler, https://ip.ihuan.me
    """
    urls = [BASE_URL.format(path=time.strftime("%Y/%m/%d/%H", time.localtime()))]

    path = time.strftime("%Y/%m/%d/%H", time.localtime())
    urls = [BASE_URL.format(path=path)]
    ignore = False
    def parse(self, html):
        """
        parse html file to get proxies
6 changes: 3 additions & 3 deletions proxypool/crawlers/public/ip3366.py
@@ -3,15 +3,15 @@
import re


MAX_PAGE = 5
BASE_URL = 'http://www.ip3366.net/free/?stype=1&page={page}'
MAX_PAGE = 8
BASE_URL = 'http://www.ip3366.net/free/?stype={stype}&page={page}'


class IP3366Crawler(BaseCrawler):
    """
    ip3366 crawler, http://www.ip3366.net/
    """
    urls = [BASE_URL.format(page=i) for i in range(1, 8)]
    urls = [BASE_URL.format(stype=stype, page=i) for stype in range(1, 3) for i in range(1, 8)]

    def parse(self, html):
        """
35 changes: 35 additions & 0 deletions proxypool/crawlers/public/jiangxianli.py
@@ -0,0 +1,35 @@
from proxypool.schemas.proxy import Proxy
from proxypool.crawlers.base import BaseCrawler
import re
import json
BASE_URL = 'https://ip.jiangxianli.com/api/proxy_ips?page={page}'

MAX_PAGE = 10
class JiangxianliCrawler(BaseCrawler):
    """
    jiangxianli crawler, https://ip.jiangxianli.com/
    """
    urls = [BASE_URL.format(page=page) for page in range(1, MAX_PAGE + 1)]

    def parse(self, html):
        """
        parse html file to get proxies
        :return:
        """

        result = json.loads(html)
        if result['code'] != 0:
            return
        MAX_PAGE = int(result['data']['last_page'])
        hosts_ports = result['data']['data']
        for ip_address in hosts_ports:
            if(ip_address):
                host = ip_address['ip']
                port = ip_address['port']
                yield Proxy(host=host, port=port)


if __name__ == '__main__':
    crawler = JiangxianliCrawler()
    for proxy in crawler.crawl():
        print(proxy)
6 changes: 3 additions & 3 deletions proxypool/crawlers/public/kuaidaili.py
@@ -4,15 +4,15 @@
from pyquery import PyQuery as pq


BASE_URL = 'https://www.kuaidaili.com/free/inha/{page}/'
MAX_PAGE = 5
BASE_URL = 'https://www.kuaidaili.com/free/{type}/{page}/'
MAX_PAGE = 300


class KuaidailiCrawler(BaseCrawler):
    """
    kuaidaili crawler, https://www.kuaidaili.com/
    """
    urls = [BASE_URL.format(page=page) for page in range(1, MAX_PAGE + 1)]
    urls = [BASE_URL.format(type=type, page=page) for type in ('intr', 'inha') for page in range(1, MAX_PAGE + 1)]

    def parse(self, html):
        """
2 changes: 1 addition & 1 deletion proxypool/crawlers/public/zhandaye.py
@@ -6,7 +6,7 @@


BASE_URL = 'https://www.zdaye.com/dayProxy/{page}.html'
MAX_PAGE = 5
MAX_PAGE = 5 * 2

class ZhandayeCrawler(BaseCrawler):
    """
15 changes: 15 additions & 0 deletions proxypool/processors/server.py
@@ -37,6 +37,21 @@ def get_proxy():
    return conn.random().string()


@app.route('/all')
def get_proxy_all():
    """
    get all proxies in the pool
    :return: all proxies, one per line
    """
    conn = get_conn()
    proxies = conn.all()
    proxies_string = ''
    for proxy in proxies:
        proxies_string += str(proxy) + '\n'

    return proxies_string


@app.route('/count')
def get_count():
    """
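A quick way to exercise the new endpoint from a client: /random returns a single proxy string and /all returns the whole pool, one host:port per line. A minimal sketch, assuming the server listens on the same 127.0.0.1:5555 address used in examples/usage2.py:

import requests

BASE = 'http://127.0.0.1:5555'

# one random proxy, e.g. "8.8.8.8:8888"
print(requests.get(BASE + '/random').text)

# the whole pool, one "host:port" per line
for line in requests.get(BASE + '/all').text.split('\n'):
    if line.strip():
        print(line)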
7 changes: 7 additions & 0 deletions proxypool/processors/tester.py
@@ -84,7 +84,14 @@ def run(self):
            if not cursor:
                break

def run_tester():
    host = '96.113.165.182'
    port = '3128'
    tasks = [tester.test(Proxy(host=host, port=port))]
    tester.loop.run_until_complete(asyncio.wait(tasks))

if __name__ == '__main__':
    tester = Tester()
    tester.run()
    # run_tester()
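When run_tester() is enabled instead of tester.run(), it checks only the single hard-coded proxy above and relies on the tester instance created in the __main__ block. A slightly more general sketch, assuming only the Tester.test() coroutine and Tester.loop attribute shown in this diff:

import asyncio

from proxypool.processors.tester import Tester
from proxypool.schemas.proxy import Proxy


def test_single(host, port):
    # run the async Tester against a single proxy
    tester = Tester()
    tester.loop.run_until_complete(asyncio.wait([tester.test(Proxy(host=host, port=port))]))


if __name__ == '__main__':
    test_single('96.113.165.182', '3128')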

4 changes: 2 additions & 2 deletions proxypool/storages/redis.py
@@ -51,11 +51,11 @@ def random(self) -> Proxy:
        :return: proxy, like 8.8.8.8:8
        """
        # try to get proxy with max score
        proxies = self.db.zrangebyscore(REDIS_KEY, PROXY_SCORE_MAX, PROXY_SCORE_MAX)
        proxies = self.db.zrangebyscore(REDIS_KEY, PROXY_SCORE_MAX , PROXY_SCORE_MAX)
        if len(proxies):
            return convert_proxy_or_proxies(choice(proxies))
        # else get proxy by rank
        proxies = self.db.zrevrange(REDIS_KEY, PROXY_SCORE_MIN, PROXY_SCORE_MAX)
        proxies = self.db.zrevrange(REDIS_KEY, PROXY_SCORE_MIN , PROXY_SCORE_MAX)
        if len(proxies):
            return convert_proxy_or_proxies(choice(proxies))
        # else raise error
20 changes: 11 additions & 9 deletions requirements.txt
@@ -1,11 +1,13 @@
environs==7.2.0
Flask==1.0.3
attrs==19.1.0
environs==9.3.0
Flask==1.1.2
attrs==20.3.0
retrying==1.3.3
aiohttp==3.7.4
requests==2.22.0
loguru==0.3.2
pyquery==1.4.0
supervisor==4.1.0
redis==2.10.6
lxml==4.6.2
requests==2.25.1
loguru==0.5.3
pyquery==1.4.3
supervisor==4.2.1
redis==3.5.3
lxml==4.6.2
fake_headers==1.0.2
maxminddb_geolite2==2018.703
