Add several proxies and optimize some code (#108)
* Create ip89.py

www.89ip.cn free proxies

* Update ip89.py

update class name

* Create fatezero_proxylist.py

Add proxies from http://proxylist.fatezero.org/

* Create ihuan.py

i幻 (ip.ihuan.me) proxies

* update example usage2

* update requirements.txt

* Optimize public crawlers

* add jiangxianli proxy

* tester: add a method to test a single proxy

* reset setting, Dockerfile, docker-compose to default

Co-authored-by: jy <[email protected]>
Co-authored-by: 崔庆才丨静觅 <[email protected]>
3 people authored Mar 7, 2021
1 parent cf03d87 commit 4878bf5
Showing 16 changed files with 231 additions and 29 deletions.
3 changes: 2 additions & 1 deletion Dockerfile
@@ -1,6 +1,7 @@
FROM python:3.6
WORKDIR /app
COPY . .
RUN pip install -r requirements.txt
# RUN pip install -r requirements.txt -i https://pypi.douban.com/simple
RUN pip install -r requirements.txt
VOLUME ["/app/proxypool/crawlers/private"]
CMD ["supervisord", "-c", "supervisord.conf"]
2 changes: 1 addition & 1 deletion docker-compose.yml
@@ -6,7 +6,7 @@ services:
    command: redis-server
    ports:
      - "6379:6379"
    # restart: always
    # restart: always
  proxypool:
    build: .
    image: 'germey/proxypool'
95 changes: 95 additions & 0 deletions examples/usage2.py
@@ -0,0 +1,95 @@
# -*- coding: UTF-8 -*-

'''
Fetch proxy IPs from the proxy pool API and use each one to request the target URL.
'''
import requests
import time
import threading
import urllib3
from fake_headers import Headers
import uuid
from geolite2 import geolite2
ips = []


# check whether an IP is located in China
def getChinaIP(ip='127.0.0.1'):
    reader = geolite2.reader()
    ip_info = reader.get(ip)
    geolite2.close()
    print(ip_info)
    # reader.get() returns None for IPs it cannot resolve
    if not ip_info:
        return False
    return True if ip_info['country']['iso_code'] == 'CN' else False


# thread class that crawls data through a proxy
class CrawlThread(threading.Thread):
    def __init__(self, proxyip):
        super(CrawlThread, self).__init__()
        self.proxyip = proxyip

    def run(self):
        pure_ip_address = self.proxyip.split(':')[0]
        # verify where the IP is located
        if not getChinaIP(pure_ip_address):
            # pass
            raise ValueError('Not a valid (China) IP')
        # start timing
        start = time.time()
        # silence the warning about disabled certificate verification
        urllib3.disable_warnings()
        headers = Headers(headers=True).generate()
        headers['Referer'] = 'http://bb.cf08tp.cn/Home/index.php?m=Index&a=index&id=2676'
        headers['Pragma'] = 'no-cache'
        headers['Host'] = 'bb.cf08tp.cn'
        headers['x-forward-for'] = pure_ip_address
        headers['Cookie'] = 'PHPSESSID={}'.format(
            ''.join(str(uuid.uuid1()).split('-')))
        print(headers)
        html = requests.get(headers=headers, url=targetUrl, proxies={
            "http": 'http://' + self.proxyip,
            "https": 'https://' + self.proxyip}, verify=False, timeout=2).content.decode()
        # stop timing
        end = time.time()
        # print the result
        print(threading.current_thread().getName() + " used proxy IP, took " + str(end - start) +
              " seconds, " + self.proxyip + " fetched the following HTML content:\n" + html + "\n*************")


# thread class that fetches proxy IPs from the pool
class GetIpThread(threading.Thread):
    def __init__(self, fetchSecond):
        super(GetIpThread, self).__init__()
        self.fetchSecond = fetchSecond

    def run(self):
        global ips
        while True:
            # fetch the IP list
            res = requests.get(apiUrl).content.decode()
            # split the response into IPs by newline
            ips = res.split('\n')
            # use every IP
            for proxyip in ips:
                if proxyip.strip():
                    # start a thread
                    # CrawlThread(proxyip).start()
                    try:
                        CrawlThread(proxyip).run()
                        time.sleep(1.5)
                    except Exception as e:
                        print(e)
            # sleep before the next fetch
            time.sleep(len(ips) / self.fetchSecond)


if __name__ == '__main__':
    # API endpoint that returns proxy IPs
    # apiUrl = "http://127.0.0.1:5555/all"
    apiUrl = "http://127.0.0.1:5555/random"
    # target site to request
    targetUrl = "http://bb.cf08tp.cn/Home/index.php?m=Index&a=vote&vid=335688&id=2676&tp="
    # targetUrl = 'http://bb.cf08tp.cn/Home/index.php?m=Index&a=vote&vid=335608&id=2676&tp='
    fetchSecond = 5
    # start fetching IPs automatically
    GetIpThread(fetchSecond).start()
9 changes: 6 additions & 3 deletions proxypool/crawlers/base.py
@@ -2,17 +2,19 @@
import requests
from loguru import logger
from proxypool.setting import GET_TIMEOUT


from fake_headers import Headers
import time
class BaseCrawler(object):
    urls = []

    @retry(stop_max_attempt_number=3, retry_on_result=lambda x: x is None, wait_fixed=2000)
    def fetch(self, url, **kwargs):
        try:
            headers = Headers(headers=True).generate()
            kwargs.setdefault('timeout', GET_TIMEOUT)
            kwargs.setdefault('verify', False)
            response = requests.get(url, **kwargs)
            kwargs.setdefault('headers', headers)
            response = requests.get(url ,**kwargs)
            if response.status_code == 200:
                response.encoding = 'utf-8'
                return response.text
@@ -27,6 +29,7 @@ def crawl(self):
        for url in self.urls:
            logger.info(f'fetching {url}')
            html = self.fetch(url)
            time.sleep(.5)
            for proxy in self.parse(html):
                logger.info(f'fetched proxy {proxy.string()} from {url}')
                yield proxy
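For context on how the public crawlers below plug into this base class: a concrete crawler only defines urls and a parse generator that yields Proxy objects, while BaseCrawler.crawl handles fetching, retrying, throttling, and logging. A minimal sketch under that assumption (the plain host:port source URL here is a made-up placeholder, not a real site):

from proxypool.schemas.proxy import Proxy
from proxypool.crawlers.base import BaseCrawler

# hypothetical endpoint that returns one "host:port" pair per line
EXAMPLE_URL = 'http://example.com/proxies.txt'


class ExampleCrawler(BaseCrawler):
    """
    minimal example crawler built on BaseCrawler
    """
    urls = [EXAMPLE_URL]

    def parse(self, html):
        for line in html.split('\n'):
            line = line.strip()
            if line:
                host, port = line.split(':')
                yield Proxy(host=host, port=port)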
2 changes: 1 addition & 1 deletion proxypool/crawlers/public/daili66.py
@@ -4,7 +4,7 @@


BASE_URL = 'http://www.66ip.cn/{page}.html'
MAX_PAGE = 5
MAX_PAGE = 50


class Daili66Crawler(BaseCrawler):
5 changes: 2 additions & 3 deletions proxypool/crawlers/public/fatezero_proxylist.py
@@ -19,13 +19,12 @@ def parse(self, html):

        hosts_ports = html.split('\n')
        for addr in hosts_ports:
            ip_address = json.loads(addr)
            if(True):
            if(addr):
                ip_address = json.loads(addr)
                host = ip_address['host']
                port = ip_address['port']
                yield Proxy(host=host, port=port)


if __name__ == '__main__':
    crawler = FatezeroCrawler()
    for proxy in crawler.crawl():
44 changes: 44 additions & 0 deletions proxypool/crawlers/public/goubanjia.py
@@ -0,0 +1,44 @@
from proxypool.schemas.proxy import Proxy
from proxypool.crawlers.base import BaseCrawler
import re
from pyquery import PyQuery as pq
import time
BASE_URL = 'http://www.goubanjia.com/'


class GoubanjiaCrawler(BaseCrawler):
    """
    ip Goubanjia crawler, http://www.goubanjia.com/
    """
    urls = [BASE_URL]

    def parse(self, html):
        """
        parse html file to get proxies
        :return:
        """
        doc = pq(html)('.ip').items()
        # ''.join([*filter(lambda x: x != '',re.compile('\>([\d:\.]*)\<').findall(td.html()))])
        for td in doc:
            trs = td.children()
            ip_str = ''
            for tr in trs:
                attrib = tr.attrib
                # skip elements hidden with display:none
                if 'style' in attrib and 'none' in tr.attrib['style']:
                    continue
                ip_str += '' if not tr.text else tr.text
            addr_split = ip_str.split(':')
            if(len(addr_split) == 2):
                host = addr_split[0]
                port = addr_split[1]
                yield Proxy(host=host, port=port)
            else:
                port = trs[-1].text
                host = ip_str.replace(port, '')
                yield Proxy(host=host, port=port)


if __name__ == '__main__':
    crawler = GoubanjiaCrawler()
    for proxy in crawler.crawl():
        print(proxy)
5 changes: 3 additions & 2 deletions proxypool/crawlers/public/ihuan.py
@@ -10,8 +10,9 @@ class IhuanCrawler(BaseCrawler):
    """
    ip ihuan crawler, https://ip.ihuan.me
    """
    urls = [BASE_URL.format(path=time.strftime("%Y/%m/%d/%H", time.localtime()))]

    path = time.strftime("%Y/%m/%d/%H", time.localtime())
    urls = [BASE_URL.format(path=path)]
    ignore = False
    def parse(self, html):
        """
        parse html file to get proxies
6 changes: 3 additions & 3 deletions proxypool/crawlers/public/ip3366.py
@@ -3,15 +3,15 @@
import re


MAX_PAGE = 5
BASE_URL = 'http://www.ip3366.net/free/?stype=1&page={page}'
MAX_PAGE = 8
BASE_URL = 'http://www.ip3366.net/free/?stype={stype}&page={page}'


class IP3366Crawler(BaseCrawler):
    """
    ip3366 crawler, http://www.ip3366.net/
    """
    urls = [BASE_URL.format(page=i) for i in range(1, 8)]
    urls = [BASE_URL.format(stype=stype, page=i) for stype in range(1, 3) for i in range(1, 8)]

    def parse(self, html):
        """
35 changes: 35 additions & 0 deletions proxypool/crawlers/public/jiangxianli.py
@@ -0,0 +1,35 @@
from proxypool.schemas.proxy import Proxy
from proxypool.crawlers.base import BaseCrawler
import re
import json
BASE_URL = 'https://ip.jiangxianli.com/api/proxy_ips?page={page}'

MAX_PAGE = 10
class JiangxianliCrawler(BaseCrawler):
    """
    jiangxianli crawler, https://ip.jiangxianli.com/
    """
    urls = [BASE_URL.format(page=page) for page in range(1, MAX_PAGE + 1)]

    def parse(self, html):
        """
        parse html file to get proxies
        :return:
        """

        result = json.loads(html)
        if result['code'] != 0:
            return
        MAX_PAGE = int(result['data']['last_page'])
        hosts_ports = result['data']['data']
        for ip_address in hosts_ports:
            if(ip_address):
                host = ip_address['ip']
                port = ip_address['port']
                yield Proxy(host=host, port=port)


if __name__ == '__main__':
    crawler = JiangxianliCrawler()
    for proxy in crawler.crawl():
        print(proxy)
6 changes: 3 additions & 3 deletions proxypool/crawlers/public/kuaidaili.py
@@ -4,15 +4,15 @@
from pyquery import PyQuery as pq


BASE_URL = 'https://www.kuaidaili.com/free/inha/{page}/'
MAX_PAGE = 5
BASE_URL = 'https://www.kuaidaili.com/free/{type}/{page}/'
MAX_PAGE = 300


class KuaidailiCrawler(BaseCrawler):
    """
    kuaidaili crawler, https://www.kuaidaili.com/
    """
    urls = [BASE_URL.format(page=page) for page in range(1, MAX_PAGE + 1)]
    urls = [BASE_URL.format(type=type, page=page) for type in ('intr', 'inha') for page in range(1, MAX_PAGE + 1)]

    def parse(self, html):
        """
2 changes: 1 addition & 1 deletion proxypool/crawlers/public/zhandaye.py
@@ -6,7 +6,7 @@


BASE_URL = 'https://www.zdaye.com/dayProxy/{page}.html'
MAX_PAGE = 5
MAX_PAGE = 5 * 2

class ZhandayeCrawler(BaseCrawler):
    """
15 changes: 15 additions & 0 deletions proxypool/processors/server.py
@@ -37,6 +37,21 @@ def get_proxy():
    return conn.random().string()


@app.route('/all')
def get_proxy_all():
    """
    get all proxies in the pool
    :return: all proxies, one per line
    """
    conn = get_conn()
    proxies = conn.all()
    proxies_string = ''
    for proxy in proxies:
        proxies_string += str(proxy) + '\n'

    return proxies_string


@app.route('/count')
def get_count():
    """
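A quick way to exercise the new endpoint from a client: /random returns a single proxy string and /all returns the whole pool, one host:port per line. A minimal sketch, assuming the server listens on the same 127.0.0.1:5555 address used in examples/usage2.py:

import requests

BASE = 'http://127.0.0.1:5555'

# one random proxy, e.g. "8.8.8.8:8888"
print(requests.get(BASE + '/random').text)

# the whole pool, one "host:port" per line
for line in requests.get(BASE + '/all').text.split('\n'):
    if line.strip():
        print(line)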
7 changes: 7 additions & 0 deletions proxypool/processors/tester.py
@@ -84,7 +84,14 @@ def run(self):
            if not cursor:
                break

def run_tester():
    host = '96.113.165.182'
    port = '3128'
    tasks = [tester.test(Proxy(host=host, port=port))]
    tester.loop.run_until_complete(asyncio.wait(tasks))

if __name__ == '__main__':
    tester = Tester()
    tester.run()
    # run_tester()
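When run_tester() is enabled instead of tester.run(), it checks only the single hard-coded proxy above and relies on the tester instance created in the __main__ block. A slightly more general sketch, assuming only the Tester.test() coroutine and Tester.loop attribute shown in this diff:

import asyncio

from proxypool.processors.tester import Tester
from proxypool.schemas.proxy import Proxy


def test_single(host, port):
    # run the async Tester against a single proxy
    tester = Tester()
    tester.loop.run_until_complete(asyncio.wait([tester.test(Proxy(host=host, port=port))]))


if __name__ == '__main__':
    test_single('96.113.165.182', '3128')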

4 changes: 2 additions & 2 deletions proxypool/storages/redis.py
@@ -51,11 +51,11 @@ def random(self) -> Proxy:
        :return: proxy, like 8.8.8.8:8
        """
        # try to get proxy with max score
        proxies = self.db.zrangebyscore(REDIS_KEY, PROXY_SCORE_MAX, PROXY_SCORE_MAX)
        proxies = self.db.zrangebyscore(REDIS_KEY, PROXY_SCORE_MAX , PROXY_SCORE_MAX)
        if len(proxies):
            return convert_proxy_or_proxies(choice(proxies))
        # else get proxy by rank
        proxies = self.db.zrevrange(REDIS_KEY, PROXY_SCORE_MIN, PROXY_SCORE_MAX)
        proxies = self.db.zrevrange(REDIS_KEY, PROXY_SCORE_MIN , PROXY_SCORE_MAX)
        if len(proxies):
            return convert_proxy_or_proxies(choice(proxies))
        # else raise error
20 changes: 11 additions & 9 deletions requirements.txt
@@ -1,11 +1,13 @@
environs==7.2.0
Flask==1.0.3
attrs==19.1.0
environs==9.3.0
Flask==1.1.2
attrs==20.3.0
retrying==1.3.3
aiohttp==3.7.4
requests==2.22.0
loguru==0.3.2
pyquery==1.4.0
supervisor==4.1.0
redis==2.10.6
lxml==4.6.2
requests==2.25.1
loguru==0.5.3
pyquery==1.4.3
supervisor==4.2.1
redis==3.5.3
lxml==4.6.2
fake_headers==1.0.2
maxminddb_geolite2==2018.703
