v0.1.7 (#9)
* Added support for setting scraper state shared between jobs

* Added support for downloading multiple URLs at once

* Added scraper context tests

* Added helper functions to download and process files in parallel (see the sketch below)

* Updated version
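
The parallel download/process helpers themselves are not rendered in this excerpt, so here is a minimal sketch of the idea only — fanning a batch of URLs out over asyncio.gather with aiohttp. Every name in it (download_all, fetch) is hypothetical, not the framework's API:

# Hypothetical sketch: helper names are illustrative, not the API added by this commit.
import asyncio

import aiohttp


async def download_all(urls: list[str]) -> list[bytes]:
    """Download all URLs concurrently and return their bodies in order."""
    async with aiohttp.ClientSession() as session:

        async def fetch(url: str) -> bytes:
            async with session.get(url) as response:
                response.raise_for_status()
                return await response.read()

        # gather schedules every fetch on the event loop at once
        # instead of awaiting the downloads one by one
        return await asyncio.gather(*(fetch(url) for url in urls))


if __name__ == "__main__":
    bodies = asyncio.run(download_all(["https://example.com", "https://example.org"]))
    print([len(body) for body in bodies])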
flulemon authored May 14, 2023
1 parent ee2d477 commit e89e0c1
Showing 8 changed files with 706 additions and 88 deletions.
6 changes: 2 additions & 4 deletions .gitignore
@@ -1,11 +1,9 @@
**/__pycache__/*
.venv

*.install.stamp

dist
.dist
**/.pytest_cache/*
.pytest_cache/

.coverage
.coverage
htmlcov
2 changes: 1 addition & 1 deletion Makefile
@@ -34,7 +34,7 @@ test: $(PY_INSTALL_STAMP) ##Run tests

 .PHONY: coverage
 coverage: $(PY_INSTALL_STAMP) ##Run tests
-	$(POETRY) run pytest --cov=sneakpeek tests --cov-fail-under=70
+	$(POETRY) run pytest --cov=sneakpeek tests --cov-fail-under=70 --cov-report term-missing --cov-report html
 
 build-ui: ##Build frontend
 	$(YARN) --cwd $(ROOT_DIR)/front/ quasar build
17 changes: 16 additions & 1 deletion poetry.lock

Some generated files are not rendered by default.

3 changes: 2 additions & 1 deletion pyproject.toml
@@ -1,7 +1,7 @@
 [tool.poetry]
 name = "sneakpeek-py"
 packages = [{ include = "sneakpeek" }]
-version = "0.1.6"
+version = "0.1.7"
 description = "Sneakpeek is a framework that helps to quickly and conveniently develop scrapers. It's the best choice for scrapers that have some specific complex scraping logic that needs to be run on a constant basis."
 authors = ["Dan Yazovsky <[email protected]>"]
 maintainers = ["Dan Yazovsky <[email protected]>"]
@@ -49,6 +49,7 @@ black = "^23.3.0"
 pytest-lazy-fixture = "^0.6.3"
 pytest-asyncio = "^0.21.0"
 pytest-cov = "^4.0.0"
+aioresponses = "^0.7.4"
 
 [build-system]
 requires = ["poetry-core"]
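aioresponses joins the dev dependencies to back the new scraper context tests. A minimal sketch of how it stubs out aiohttp traffic (the test name and URL are illustrative, not taken from the repository's tests):

import aiohttp
import pytest
from aioresponses import aioresponses


@pytest.mark.asyncio
async def test_mocked_download():
    with aioresponses() as mocked:
        # Register a canned response for the URL the code under test will hit
        mocked.get("https://example.com/page", status=200, body="hello")
        async with aiohttp.ClientSession() as session:
            async with session.get("https://example.com/page") as response:
                assert response.status == 200
                assert await response.text() == "hello"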
2 changes: 2 additions & 0 deletions sneakpeek/lib/models.py
@@ -55,6 +55,8 @@ class Scraper(BaseModel):
     config: ScraperConfig #: Scraper configuration that is passed to the handler
     #: Default priority to enqueue scraper jobs with
     schedule_priority: ScraperJobPriority = ScraperJobPriority.NORMAL
+    #: Scraper state (might be useful to optimise scraping, e.g. only process pages that weren't processed in the last jobs)
+    state: str | None = None
 
 
 class ScraperJob(BaseModel):
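The new state field is what lets consecutive jobs of the same scraper share data. As a rough sketch of how a handler might use it — assuming the context exposes the current state and an update coroutine mirroring the scraper_state and update_scraper_state_func constructor arguments visible in the runner.py diff below (the attribute, method, and import names here are assumptions, not confirmed API):

import json

from sneakpeek.scraper_context import ScraperContext  # import path assumed
from sneakpeek.scraper_handler import ScraperHandler  # import path assumed


class IncrementalScraper(ScraperHandler):
    @property
    def name(self) -> str:  # abstract members assumed: name and run
        return "incremental_scraper"

    async def run(self, context: ScraperContext) -> str:
        # State persisted by the previous job, serialized as a string
        state = json.loads(context.scraper_state or "{}")  # attribute name assumed
        last_seen_id = state.get("last_seen_id", 0)

        # ... scrape only items newer than last_seen_id ...
        last_seen_id += 10  # placeholder for real scraping logic

        # Persist state for the next job (method name assumed)
        await context.update_scraper_state(json.dumps({"last_seen_id": last_seen_id}))
        return f"processed up to {last_seen_id}"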
50 changes: 43 additions & 7 deletions sneakpeek/runner.py
@@ -8,7 +8,7 @@
 from prometheus_client import Counter
 
 from sneakpeek.lib.errors import ScraperJobPingFinishedError, UnknownScraperHandlerError
-from sneakpeek.lib.models import ScraperJob, ScraperJobStatus
+from sneakpeek.lib.models import Scraper, ScraperJob, ScraperJobStatus
 from sneakpeek.lib.queue import QueueABC
 from sneakpeek.lib.storage.base import ScraperJobsStorage
 from sneakpeek.logging import configure_logging, scraper_job_context
@@ -124,26 +124,43 @@ async def ping_session():
 class LocalRunner:
     """Scraper runner that is meant to be used for local debugging"""
 
+    @staticmethod
+    async def _ping_session():
+        logging.debug("Pinging session")
+
+    @staticmethod
+    async def _update_scraper_state(state: str) -> Scraper | None:
+        logging.debug(f"Updating scraper state with: {state}")
+        return None
+
     @staticmethod
     async def run_async(
         handler: ScraperHandler,
         config: ScraperConfig,
         plugins: list[Plugin] | None = None,
+        scraper_state: str | None = None,
         logging_level: int = logging.DEBUG,
     ) -> None:
         """
         Execute scraper locally.
         Args:
-            config (ScraperConfig): Scraper config
             handler (ScraperHandler): Scraper handler to execute
+            config (ScraperConfig): Scraper config to pass to the handler
             plugins (list[Plugin] | None, optional): List of plugins that will be used by scraper runner. Defaults to None.
+            scraper_state (str | None, optional): Scraper state to pass to the handler. Defaults to None.
             logging_level (int, optional): Minimum logging level. Defaults to logging.DEBUG.
         """
         configure_logging(logging_level)
         logging.info("Starting scraper")
 
-        async def ping_session():
-            pass
 
-        context = ScraperContext(config, plugins, ping_session)
+        context = ScraperContext(
+            config,
+            plugins,
+            scraper_state=scraper_state,
+            ping_session_func=LocalRunner._ping_session,
+            update_scraper_state_func=LocalRunner._update_scraper_state,
+        )
         try:
             await context.start_session()
             result = await handler.run(context)
@@ -159,6 +176,25 @@ def run(
         handler: ScraperHandler,
         config: ScraperConfig,
         plugins: list[Plugin] | None = None,
+        scraper_state: str | None = None,
         logging_level: int = logging.DEBUG,
     ) -> None:
-        asyncio.run(LocalRunner.run_async(handler, config, plugins, logging_level))
+        """
+        Execute scraper locally.
+        Args:
+            handler (ScraperHandler): Scraper handler to execute
+            config (ScraperConfig): Scraper config to pass to the handler
+            plugins (list[Plugin] | None, optional): List of plugins that will be used by scraper runner. Defaults to None.
+            scraper_state (str | None, optional): Scraper state to pass to the handler. Defaults to None.
+            logging_level (int, optional): Minimum logging level. Defaults to logging.DEBUG.
+        """
+        asyncio.run(
+            LocalRunner.run_async(
+                handler,
+                config,
+                plugins,
+                scraper_state,
+                logging_level,
+            )
+        )
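
With the new parameters in place, a local debugging run that seeds initial state could look like this — a sketch reusing the hypothetical IncrementalScraper from the models.py section above; the ScraperConfig construction and its import path are likewise illustrative:

import logging

from sneakpeek.runner import LocalRunner
from sneakpeek.scraper_context import ScraperConfig  # import path assumed

LocalRunner.run(
    handler=IncrementalScraper(),           # any ScraperHandler implementation
    config=ScraperConfig(),                 # passed through to the handler
    plugins=None,
    scraper_state='{"last_seen_id": 42}',   # initial state handed to the handler
    logging_level=logging.INFO,
)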
