Make code up to date with latest version of scrapy #36

Open
wants to merge 10 commits into base: master
24 changes: 15 additions & 9 deletions .travis.yml
@@ -1,24 +1,30 @@
 language: python
-python: 3.5
 
-sudo: false
-
-env:
-  matrix:
-  - TOXENV=py27
-  - TOXENV=py35
+matrix:
+  include:
+    - python: 3.6
+      env:
+        - TOX_ENV=py36
+    - python: 3.7
+      env:
+        - TOX_ENV=py37
+    - python: 3.8
+      env:
+        - TOX_ENV=py38
 
 addons:
   apt:
     packages:
-    - libdb-dev
+      - libdb-dev
 
 install: pip install -U tox codecov
 
-script: tox
+script: tox -e $TOX_ENV
 
 after_success:
-- codecov
+  - codecov
 
 deploy:
   provider: pypi
@@ -29,4 +35,4 @@ deploy:
   on:
     tags: true
     repo: scrapy-plugins/scrapy-deltafetch
-    condition: $TOXENV = py35
+    condition: $TOXENV = py38
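
Each matrix entry exports a TOX_ENV value that the script line forwards as tox -e $TOX_ENV. The repository's tox.ini is not part of this diff, so the following is only a hypothetical sketch of the envlist those values would select; the deps and commands lines are assumptions for illustration:

    [tox]
    envlist = py36,py37,py38

    [testenv]
    deps = -rrequirements.txt
    commands = python -m pytest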
2 changes: 1 addition & 1 deletion requirements.txt
@@ -1,2 +1,2 @@
-scrapy>=1.1.0
+scrapy>=2.3.0
 bsddb3
3 changes: 1 addition & 2 deletions scrapy_deltafetch/__init__.py
@@ -1,4 +1,3 @@
-from .middleware import DeltaFetch
-
+from .middleware import DeltaFetch  # noqa
 
 __version__ = "1.2.1"
90 changes: 45 additions & 45 deletions scrapy_deltafetch/middleware.py
@@ -1,20 +1,24 @@
-import logging
 import os
 import time
+from logging import getLogger
+from typing import Iterable
 
-from scrapy.http import Request
-from scrapy.item import BaseItem
-from scrapy.utils.request import request_fingerprint
+import bsddb3
+from scrapy import signals
+from scrapy.crawler import Crawler
+from scrapy.exceptions import NotConfigured
+from scrapy.http import Request, Response
+from scrapy.item import Item
+from scrapy.spiders import Spider
+from scrapy.statscollectors import StatsCollector
 from scrapy.utils.project import data_path
 from scrapy.utils.python import to_bytes
-from scrapy.exceptions import NotConfigured
-from scrapy import signals
+from scrapy.utils.request import request_fingerprint
 
-logger = logging.getLogger(__name__)
+logger = getLogger(__name__)
 
 
-class DeltaFetch(object):
+class DeltaFetch:
     """
     This is a spider middleware to ignore requests to pages containing items
     seen in previous crawls of the same spider, thus producing a "delta crawl"
@@ -25,70 +29,66 @@ class DeltaFetch(object):
     intensive).
     """
 
-    def __init__(self, dir, reset=False, stats=None):
-        dbmodule = None
-        try:
-            dbmodule = __import__('bsddb3').db
-        except ImportError:
-            raise NotConfigured('bsddb3 is required')
-        self.dbmodule = dbmodule
+    def __init__(self, dir: str, reset: bool = False, stats: StatsCollector = None):
         self.dir = dir
[Inline review thread on the line "self.dir = dir"]

Collaborator:
bsddb3 is not available on some platforms, so the middleware will crash for people without bsddb3, and they will not know exactly why. I created ticket #37 for moving away from bsddb3 to another library.

Author:
Sorry for the late reply. bsddb3 is now imported at the top of the module, so when it is not installed on the device the traceback states the cause clearly:

    import bsddb3
    ModuleNotFoundError: No module named 'bsddb3'
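
For reference, the guard this PR deletes converted that missing import into Scrapy's NotConfigured exception, which disables the middleware with an explicit message rather than failing at import time. A minimal sketch of the removed pattern, reconstructed from the deleted __init__ code above:

    # Reconstructed from the deleted lines of __init__ above: a missing
    # bsddb3 becomes NotConfigured, so Scrapy reports why the middleware
    # is disabled instead of raising a bare ModuleNotFoundError on import.
    try:
        dbmodule = __import__("bsddb3").db
    except ImportError:
        raise NotConfigured("bsddb3 is required")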

         self.reset = reset
         self.stats = stats
 
     @classmethod
-    def from_crawler(cls, crawler):
+    def from_crawler(cls, crawler: Crawler):
         s = crawler.settings
-        if not s.getbool('DELTAFETCH_ENABLED'):
+        if not s.getbool("DELTAFETCH_ENABLED"):
             raise NotConfigured
-        dir = data_path(s.get('DELTAFETCH_DIR', 'deltafetch'))
-        reset = s.getbool('DELTAFETCH_RESET')
+        dir = data_path(s.get("DELTAFETCH_DIR", "deltafetch"))
+        reset = s.getbool("DELTAFETCH_RESET")
         o = cls(dir, reset, crawler.stats)
         crawler.signals.connect(o.spider_opened, signal=signals.spider_opened)
         crawler.signals.connect(o.spider_closed, signal=signals.spider_closed)
         return o
 
-    def spider_opened(self, spider):
-        if not os.path.exists(self.dir):
+    def spider_opened(self, spider: Spider) -> None:
+        if not os.path.isdir(self.dir):
             os.makedirs(self.dir)
-        dbpath = os.path.join(self.dir, '%s.db' % spider.name)
-        reset = self.reset or getattr(spider, 'deltafetch_reset', False)
-        flag = self.dbmodule.DB_TRUNCATE if reset else self.dbmodule.DB_CREATE
+        dbpath = os.path.join(self.dir, f"{spider.name}.db")
+        reset = self.reset or getattr(spider, "deltafetch_reset", False)
+        flag = bsddb3.db.DB_TRUNCATE if reset else bsddb3.db.DB_CREATE
 
         try:
-            self.db = self.dbmodule.DB()
-            self.db.open(filename=dbpath,
-                         dbtype=self.dbmodule.DB_HASH,
-                         flags=flag)
-        except Exception:
-            logger.warning("Failed to open DeltaFetch database at %s, "
-                           "trying to recreate it" % dbpath)
-            if os.path.exists(dbpath):
+            self.db = bsddb3.db.DB()
+            self.db.open(filename=dbpath, dbtype=bsddb3.db.DB_HASH, flags=flag)
+        except bsddb3.db.DBError:
+            logger.warning(
+                f"Failed to open DeltaFetch database at {dbpath}, trying to recreate it"
+            )
+            if os.path.isfile(dbpath):
                 os.remove(dbpath)
-            self.db = self.dbmodule.DB()
-            self.db.open(filename=dbpath,
-                         dbtype=self.dbmodule.DB_HASH,
-                         flags=self.dbmodule.DB_CREATE)
+            self.db = bsddb3.db.DB()
+            self.db.open(
+                filename=dbpath, dbtype=bsddb3.db.DB_HASH, flags=bsddb3.db.DB_CREATE,
+            )
 
-    def spider_closed(self, spider):
+    def spider_closed(self, _spider: Spider) -> None:
         self.db.close()
 
-    def process_spider_output(self, response, result, spider):
+    def process_spider_output(
+        self, response: Response, result: Iterable, spider: Spider
+    ):
         for r in result:
             if isinstance(r, Request):
                 key = self._get_key(r)
                 if key in self.db:
-                    logger.info("Ignoring already visited: %s" % r)
+                    logger.info(f"Ignoring already visited: {r}")
                     if self.stats:
-                        self.stats.inc_value('deltafetch/skipped', spider=spider)
+                        self.stats.inc_value("deltafetch/skipped", spider=spider)
                     continue
-            elif isinstance(r, (BaseItem, dict)):
+            elif isinstance(r, (Item, dict)):
                 key = self._get_key(response.request)
                 self.db[key] = str(time.time())
                 if self.stats:
-                    self.stats.inc_value('deltafetch/stored', spider=spider)
+                    self.stats.inc_value("deltafetch/stored", spider=spider)
             yield r
 
-    def _get_key(self, request):
-        key = request.meta.get('deltafetch_key') or request_fingerprint(request)
+    def _get_key(self, request: Request) -> bytes:
+        key = request.meta.get("deltafetch_key") or request_fingerprint(request)
         # request_fingerprint() returns `hashlib.sha1().hexdigest()`, is a string
         return to_bytes(key)
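
Together, from_crawler() and _get_key() define the middleware's whole configuration surface: three settings plus an optional per-request meta key. A minimal sketch of enabling it in a project's settings.py; the setting names come from the code above, while the middleware order value 100 is an assumption for illustration:

    # settings.py (sketch): setting names are the ones read in from_crawler()
    # above; the order value 100 is an illustrative assumption, not from this PR.
    SPIDER_MIDDLEWARES = {
        "scrapy_deltafetch.DeltaFetch": 100,
    }
    DELTAFETCH_ENABLED = True      # required, else NotConfigured disables the middleware
    DELTAFETCH_DIR = "deltafetch"  # holds the per-spider <spider.name>.db files
    DELTAFETCH_RESET = False       # True opens the DB with DB_TRUNCATE, discarding state

A request can also opt out of fingerprint-based deduplication by setting request.meta["deltafetch_key"], which _get_key() prefers over request_fingerprint().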
2 changes: 0 additions & 2 deletions setup.cfg

This file was deleted.

48 changes: 29 additions & 19 deletions setup.py
@@ -1,24 +1,34 @@
-from setuptools import setup
+from setuptools import find_packages, setup
+
+with open(
+    "README.rst",
+) as fh:
+    long_description = fh.read()
 
 setup(
-    name='scrapy-deltafetch',
-    version='1.2.1',
-    license='BSD',
-    description='Scrapy middleware to ignore previously crawled pages',
-    author='Scrapinghub',
-    author_email='[email protected]',
-    url='http://github.com/scrapy-plugins/scrapy-deltafetch',
-    packages=['scrapy_deltafetch'],
-    platforms=['Any'],
+    name="scrapy-deltafetch",
+    version="1.2.1",
+    description="Scrapy middleware to ignore previously crawled pages",
+    long_description=long_description,
+    long_description_content_type="text/x-rst",
+    author="Scrapinghub",
+    author_email="[email protected]",
+    maintainer="Rabin Adhikari",
+    maintainer_email="[email protected]",
+    url="http://github.com/scrapy-plugins/scrapy-deltafetch",
+    packages=find_packages(exclude=["*.tests", "*.tests.*", "tests.*", "tests"]),
+    license="BSD",
     classifiers=[
-        'Development Status :: 4 - Beta',
-        'License :: OSI Approved :: BSD License',
-        'Operating System :: OS Independent',
-        'Programming Language :: Python',
-        'Programming Language :: Python :: 2',
-        'Programming Language :: Python :: 2.7',
-        'Programming Language :: Python :: 3',
-        'Programming Language :: Python :: 3.5',
+        "Development Status :: 4 - Beta",
+        "License :: OSI Approved :: BSD License",
+        "Operating System :: OS Independent",
+        "Programming Language :: Python",
+        "Programming Language :: Python :: 3",
+        "Programming Language :: Python :: 3.6",
+        "Programming Language :: Python :: 3.7",
+        "Programming Language :: Python :: 3.8",
     ],
-    install_requires=['Scrapy>=1.1.0', 'bsddb3']
+    install_requires=["Scrapy>=2.3.0", "bsddb3"],
+    python_requires=">=3.6",
+    zip_safe=True,
 )