Commit
catch retry error
Germey committed Dec 28, 2021
1 parent f7cf600 commit 08385f6
Showing 9 changed files with 42 additions and 55 deletions.
28 changes: 19 additions & 9 deletions proxypool/crawlers/base.py
@@ -1,4 +1,4 @@
from retrying import retry
from retrying import RetryError, retry
import requests
from loguru import logger
from proxypool.setting import GET_TIMEOUT
@@ -23,15 +23,25 @@ def fetch(self, url, **kwargs):
except requests.ConnectionError:
return

@logger.catch
def process(self, html, url):
"""
used for parse html
"""
for proxy in self.parse(html):
logger.info(f'fetched proxy {proxy.string()} from {url}')
yield proxy

def crawl(self):
"""
crawl main method
"""
for url in self.urls:
logger.info(f'fetching {url}')
html = self.fetch(url)
time.sleep(.5)
for proxy in self.parse(html):
logger.info(f'fetched proxy {proxy.string()} from {url}')
yield proxy
try:
for url in self.urls:
logger.info(f'fetching {url}')
html = self.fetch(url)
time.sleep(.5)
yield from self.process(html, url)
except RetryError:
logger.error(
f'crawler {self} crawled proxy unsuccessfully, '
'please check if target url is valid or network issue')
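
For context on the commit title: BaseCrawler.fetch is decorated with retrying's @retry, and once the retries are exhausted the library raises RetryError, which previously escaped crawl(). The sketch below is a minimal illustration, not the repo's actual fetch; the decorator arguments and the URL are assumptions, but it shows how RetryError surfaces and why catching it in crawl() lets one failing source be skipped.

from retrying import RetryError, retry
import requests

@retry(stop_max_attempt_number=3, wait_fixed=2000,
       retry_on_result=lambda result: result is None)
def fetch(url):
    # returning None marks the attempt as failed, so retrying tries again
    try:
        response = requests.get(url, timeout=10)
        if response.status_code == 200:
            return response.text
    except requests.ConnectionError:
        return None

try:
    html = fetch('https://example.com/proxy-list')  # hypothetical URL
except RetryError:
    # raised after the last attempt still returned None
    print('source unreachable, skipping it')
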
16 changes: 0 additions & 16 deletions proxypool/crawlers/public/data5u.py
@@ -11,23 +11,7 @@ class Data5UCrawler(BaseCrawler):
data5u crawler, http://www.data5u.com
"""
urls = [BASE_URL]

headers = {
'User-Agent': 'User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36'
}

@logger.catch
def crawl(self):
"""
crawl main method
"""
for url in self.urls:
logger.info(f'fetching {url}')
html = self.fetch(url, headers=self.headers)
for proxy in self.parse(html):
logger.info(f'fetched proxy {proxy.string()} from {url}')
yield proxy

def parse(self, html):
"""
parse html file to get proxies
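
With the per-crawler crawl() override removed here (and again in xicidaili.py below), a crawler only declares its urls and implements parse(); BaseCrawler.crawl() now handles fetching, logging, and the RetryError fallback for every subclass. A hypothetical subclass, not a file from this repo and with an assumed "ip:port per line" source format, might look like:

from proxypool.schemas.proxy import Proxy
from proxypool.crawlers.base import BaseCrawler


class ExampleCrawler(BaseCrawler):
    # hypothetical source URL, purely for illustration
    urls = ['https://example.com/free-proxy-list.txt']

    def parse(self, html):
        # assumes one "ip:port" pair per line
        for line in html.splitlines():
            host, _, port = line.strip().partition(':')
            if host and port.isdigit():
                yield Proxy(host=host, port=int(port))
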
File renamed without changes.
1 change: 1 addition & 0 deletions proxypool/crawlers/public/ihuan.py
@@ -13,6 +13,7 @@ class IhuanCrawler(BaseCrawler):
path = time.strftime("%Y/%m/%d/%H", time.localtime())
urls = [BASE_URL.format(path=path)]
ignore = False

def parse(self, html):
"""
parse html file to get proxies
12 changes: 8 additions & 4 deletions proxypool/crawlers/public/jiangxianli.py
@@ -1,23 +1,27 @@
from proxypool.schemas.proxy import Proxy
from proxypool.crawlers.base import BaseCrawler
import re
import json


BASE_URL = 'https://ip.jiangxianli.com/api/proxy_ips?page={page}'

MAX_PAGE = 10


class JiangxianliCrawler(BaseCrawler):
"""
jiangxianli crawler,https://ip.jiangxianli.com/
"""

urls = [BASE_URL.format(page=page) for page in range(1, MAX_PAGE + 1)]

def parse(self, html):
"""
parse html file to get proxies
:return:
"""
result =json.loads(html)

result = json.loads(html)
if result['code'] != 0:
return
MAX_PAGE = int(result['data']['last_page'])
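
parse() here treats the fetched body as JSON from the jiangxianli API. The snippet below illustrates only the fields this hunk actually touches; the proxy entries themselves and their field names are not shown in the diff and are deliberately left out.

import json

# illustrative payload, limited to the fields visible in this diff
sample = '{"code": 0, "data": {"last_page": 10}}'
result = json.loads(sample)
if result['code'] != 0:
    print('API reported an error, nothing to parse')
else:
    print(int(result['data']['last_page']))  # drives the MAX_PAGE update
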
15 changes: 10 additions & 5 deletions proxypool/crawlers/public/xiaoshudaili.py
@@ -1,7 +1,5 @@
import re

from pyquery import PyQuery as pq

from proxypool.schemas.proxy import Proxy
from proxypool.crawlers.base import BaseCrawler

@@ -16,16 +14,23 @@ class XiaoShuCrawler(BaseCrawler):
"""

def __init__(self):
html = self.fetch(url=BASE_URL)
"""
init urls
"""
try:
html = self.fetch(url=BASE_URL)
except:
self.urls = []
return
doc = pq(html)
title = doc(".title:eq(0) a").items()

latest_page = 0
for t in title:
res = re.search(r"/(\d+)\.html", t.attr("href"))
latest_page = int(res.group(1)) if res else 0
if latest_page:
self.urls = [PAGE_BASE_URL.format(page=page) for page in range(latest_page - MAX_PAGE, latest_page)]
self.urls = [PAGE_BASE_URL.format(page=page) for page in range(
latest_page - MAX_PAGE, latest_page)]
else:
self.urls = []

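
A hypothetical usage sketch of the new constructor behaviour: if fetching the index page raises inside __init__, the except clause leaves urls empty, so crawl() yields nothing instead of failing at construction time.

from proxypool.crawlers.public.xiaoshudaili import XiaoShuCrawler

crawler = XiaoShuCrawler()
print(len(crawler.urls))        # 0 when the index page could not be fetched
for proxy in crawler.crawl():   # yields nothing with an empty url list
    print(proxy.string())
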
17 changes: 0 additions & 17 deletions proxypool/crawlers/public/xicidaili.py
@@ -12,23 +12,7 @@ class XicidailiCrawler(BaseCrawler):
"""
urls = [BASE_URL]
ignore = True

headers = {
'User-Agent': 'User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36'
}

@logger.catch
def crawl(self):
"""
crawl main method
"""
for url in self.urls:
logger.info(f'fetching {url}')
html = self.fetch(url, headers=self.headers)
for proxy in self.parse(html):
logger.info(f'fetched proxy {proxy.string()} from {url}')
yield proxy

def parse(self, html):
"""
parse html file to get proxies
@@ -49,4 +33,3 @@ def parse(self, html):
crawler = XicidailiCrawler()
for proxy in crawler.crawl():
print(proxy)

2 changes: 1 addition & 1 deletion proxypool/crawlers/public/zhandaye.py
@@ -8,6 +8,7 @@
BASE_URL = 'https://www.zdaye.com/dayProxy/{page}.html'
MAX_PAGE = 5 * 2


class ZhandayeCrawler(BaseCrawler):
"""
zhandaye crawler, https://www.zdaye.com/dayProxy/
@@ -56,4 +57,3 @@ def parse(self, html):
crawler = ZhandayeCrawler()
for proxy in crawler.crawl():
print(proxy)

6 changes: 3 additions & 3 deletions proxypool/processors/getter.py
@@ -8,22 +8,22 @@ class Getter(object):
"""
getter of proxypool
"""

def __init__(self):
"""
init db and crawlers
"""
self.redis = RedisClient()
self.crawlers_cls = crawlers_cls
self.crawlers = [crawler_cls() for crawler_cls in self.crawlers_cls]

def is_full(self):
"""
if proxypool if full
return: bool
"""
return self.redis.count() >= PROXY_NUMBER_MAX

@logger.catch
def run(self):
"""
