V0.1.8 (#10)
- Merged metrics and API servers
- Added code coverage tracking
- Added tests
- Updated dependencies
flulemon authored May 20, 2023
1 parent e89e0c1 commit b0145c0
Showing 16 changed files with 598 additions and 164 deletions.
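With the metrics and API servers merged, Prometheus metrics are now served by the same FastAPI application as the JSON-RPC API (see `app.add_route("/metrics", metrics)` in `sneakpeek/api.py` below), so a separate metrics port is no longer needed. A minimal sketch of reading the merged endpoint — the host and port are assumptions based on the demo's default of 8080:

```python
# Minimal sketch: fetch Prometheus metrics from the merged API server.
# Assumes a server is already running locally on port 8080 (the demo default);
# adjust the URL for your deployment.
from urllib.request import urlopen

with urlopen("http://localhost:8080/metrics") as response:
    body = response.read().decode("utf-8")

# Print the samples, skipping the "# HELP" / "# TYPE" comment lines.
for line in body.splitlines():
    if not line.startswith("#"):
        print(line)
```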
8 changes: 8 additions & 0 deletions .coveragerc
@@ -5,3 +5,11 @@ exclude_lines =
raise AssertionError
raise NotImplementedError
if __name__ == .__main__.:
@entrypoint.method()
pragma: no cover
def __repr__
if self.debug:
if settings.DEBUG
if 0:
class .*\bProtocol\):
logger.
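The new `exclude_lines` entries tell coverage.py to ignore lines matching these regexes when computing coverage: debug-only branches, `__repr__` methods, logging calls, protocol class definitions, and JSON-RPC `@entrypoint.method()` registrations. A hypothetical module showing which lines the new patterns would skip (all names below are made up for illustration):

```python
# Hypothetical module illustrating the new exclude_lines patterns.
import logging

logger = logging.getLogger(__name__)


class Widget:
    def __init__(self, name: str, debug: bool = False) -> None:
        self.name = name
        self.debug = debug

    def __repr__(self) -> str:  # excluded: matches "def __repr__"
        return f"Widget({self.name!r})"

    def render(self) -> str:
        if self.debug:  # excluded: matches "if self.debug:"
            logger.debug("rendering %r", self)  # excluded: matches "logger."
        return f"<{self.name}/>"


if __name__ == "__main__":  # excluded: matches the __main__ pattern
    print(Widget("demo").render())
```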
7 changes: 7 additions & 0 deletions .github/workflows/ci.yml
@@ -41,6 +41,13 @@ jobs:
- name: Tests coverage
run: make coverage

- name: Upload coverage reports to Codecov
uses: codecov/codecov-action@v3
with:
token: ${{ secrets.CODECOV_TOKEN }}
files: ./coverage.xml
verbose: true

- name: Build package
run: make build

4 changes: 3 additions & 1 deletion .gitignore
@@ -6,4 +6,6 @@ dist
**/.pytest_cache/*
.pytest_cache/
.coverage
htmlcov
htmlcov
demo
coverage.xml
2 changes: 1 addition & 1 deletion .vscode/launch.json
@@ -31,7 +31,7 @@
"name": "Run demo",
"type": "python",
"request": "launch",
"program": "${workspaceFolder}/demo/main.py",
"program": "${workspaceFolder}/demo/app.py",
"console": "integratedTerminal",
"justMyCode": true,
"env": {
4 changes: 2 additions & 2 deletions Makefile
@@ -30,11 +30,11 @@ install: install-py install-js ##Install all dependencies

.PHONY: test
test: $(PY_INSTALL_STAMP) ##Run tests
$(POETRY) run pytest
$(POETRY) run pytest -n 20

.PHONY: coverage
coverage: $(PY_INSTALL_STAMP) ##Run tests
$(POETRY) run pytest --cov=sneakpeek tests --cov-fail-under=70 --cov-report term-missing --cov-report html
$(POETRY) run pytest --cov=sneakpeek tests --cov-fail-under=85 --cov-report term-missing --cov-report html --cov-report xml

build-ui: ##Build frontend
$(YARN) --cwd $(ROOT_DIR)/front/ quasar build
5 changes: 3 additions & 2 deletions README.md
@@ -4,6 +4,7 @@
[![PyPI version](https://badge.fury.io/py/sneakpeek-py.svg)](https://badge.fury.io/py/sneakpeek-py)
![PyPI - Downloads](https://img.shields.io/pypi/dm/sneakpeek-py?color=)
[![Documentation Status](https://readthedocs.org/projects/sneakpeek-py/badge/?version=latest)](https://sneakpeek-py.readthedocs.io/en/latest/?badge=latest)
[![codecov](https://codecov.io/gh/flulemon/sneakpeek/branch/main/graph/badge.svg?token=7h45P8qHRG)](https://codecov.io/gh/flulemon/sneakpeek)

**Sneakpeek** is a framework that helps you quickly and conveniently develop scrapers.
It's the best choice for scrapers that have some specific complex scraping logic that needs
@@ -16,7 +17,7 @@ to be run on a constant basis.
You can also run the demo using Docker:

```bash
docker run -it --rm -p 8080:8080 -p 9090:9090 flulemon/sneakpeek-demo
docker run -it --rm -p 8080:8080 flulemon/sneakpeek-demo
```

Once it has started head over to http://localhost:8080 to play around with it.
@@ -210,7 +211,7 @@ For the argument `LocalRunner.run` takes:
Now you can run your handler as an ordinary Python script. Given it's in the `demo_scraper.py` file, you can use:

```bash
python3 demo_scraper.py
python demo_scraper.py
```

## Documentation
231 changes: 126 additions & 105 deletions poetry.lock

Large diffs are not rendered by default.

3 changes: 2 additions & 1 deletion pyproject.toml
@@ -1,7 +1,7 @@
[tool.poetry]
name = "sneakpeek-py"
packages = [{ include = "sneakpeek" }]
version = "0.1.7"
version = "0.1.8"
description = "Sneakpeek is a framework that helps to quickly and conveniently develop scrapers. It's the best choice for scrapers that have some specific complex scraping logic that needs to be run on a constant basis."
authors = ["Dan Yazovsky <[email protected]>"]
maintainers = ["Dan Yazovsky <[email protected]>"]
@@ -50,6 +50,7 @@ pytest-lazy-fixture = "^0.6.3"
pytest-asyncio = "^0.21.0"
pytest-cov = "^4.0.0"
aioresponses = "^0.7.4"
pytest-xdist = "^3.3.0"

[build-system]
requires = ["poetry-core"]
48 changes: 45 additions & 3 deletions sneakpeek/api.py
@@ -1,9 +1,17 @@
import os
import pathlib

import fastapi_jsonrpc as jsonrpc
from fastapi import Body
from fastapi import Body, Request, Response
from fastapi.middleware.cors import CORSMiddleware
from fastapi.staticfiles import StaticFiles
from prometheus_client import (
CONTENT_TYPE_LATEST,
REGISTRY,
CollectorRegistry,
generate_latest,
)
from prometheus_client.multiprocess import MultiProcessCollector
from pydantic import BaseModel

from sneakpeek.lib.errors import ScraperHasActiveRunError, ScraperNotFoundError
@@ -15,6 +23,7 @@
)
from sneakpeek.lib.queue import Queue, QueueABC
from sneakpeek.lib.storage.base import ScraperJobsStorage, ScrapersStorage
from sneakpeek.metrics import count_invocations, measure_latency
from sneakpeek.scraper_handler import ScraperHandler


@@ -23,12 +32,24 @@ class Priority(BaseModel):
value: int


def metrics(request: Request) -> Response: # pragma: no cover
if "prometheus_multiproc_dir" in os.environ:
registry = CollectorRegistry()
MultiProcessCollector(registry)
else:
registry = REGISTRY

return Response(
generate_latest(registry), headers={"Content-Type": CONTENT_TYPE_LATEST}
)


def get_api_entrypoint(
scrapers_storage: ScrapersStorage,
jobs_storage: ScraperJobsStorage,
queue: Queue,
handlers: list[ScraperHandler],
) -> jsonrpc.Entrypoint:
) -> jsonrpc.Entrypoint: # pragma: no cover
"""
Create public JsonRPC API entrypoint (mostly mimics storage and queue API)
@@ -51,45 +72,65 @@ async def search_scrapers(
return await scrapers_storage.search_scrapers(name_filter, max_items, last_id)

@entrypoint.method()
@count_invocations(subsystem="api")
@measure_latency(subsystem="api")
async def get_scrapers() -> list[Scraper]:
return await scrapers_storage.get_scrapers()

@entrypoint.method(errors=[ScraperNotFoundError])
@count_invocations(subsystem="api")
@measure_latency(subsystem="api")
async def get_scraper(id: int = Body(...)) -> Scraper:
return await scrapers_storage.get_scraper(id)

@entrypoint.method()
@count_invocations(subsystem="api")
@measure_latency(subsystem="api")
async def create_scraper(scraper: Scraper = Body(...)) -> Scraper:
return await scrapers_storage.create_scraper(scraper)

@entrypoint.method(errors=[ScraperNotFoundError, ScraperHasActiveRunError])
@count_invocations(subsystem="api")
@measure_latency(subsystem="api")
async def enqueue_scraper(
scraper_id: int = Body(...),
priority: ScraperJobPriority = Body(...),
) -> ScraperJob:
return await queue.enqueue(scraper_id, priority)

@entrypoint.method(errors=[ScraperNotFoundError])
@count_invocations(subsystem="api")
@measure_latency(subsystem="api")
async def update_scraper(scraper: Scraper = Body(...)) -> Scraper:
return await scrapers_storage.update_scraper(scraper)

@entrypoint.method(errors=[ScraperNotFoundError])
@count_invocations(subsystem="api")
@measure_latency(subsystem="api")
async def delete_scraper(id: int = Body(...)) -> Scraper:
return await scrapers_storage.delete_scraper(id)

@entrypoint.method(errors=[ScraperNotFoundError])
@count_invocations(subsystem="api")
@measure_latency(subsystem="api")
async def get_scraper_jobs(scraper_id: int = Body(...)) -> list[ScraperJob]:
return await jobs_storage.get_scraper_jobs(scraper_id)

@entrypoint.method()
@count_invocations(subsystem="api")
@measure_latency(subsystem="api")
async def get_scraper_handlers() -> list[str]:
return [handler.name for handler in handlers]

@entrypoint.method()
@count_invocations(subsystem="api")
@measure_latency(subsystem="api")
async def get_schedules() -> list[str]:
return [schedule.value for schedule in ScraperSchedule]

@entrypoint.method()
@count_invocations(subsystem="api")
@measure_latency(subsystem="api")
async def get_priorities() -> list[Priority]:
return [
Priority(name=priority.name, value=priority.value)
Expand All @@ -108,7 +149,7 @@ def create_api(
jobs_storage: ScraperJobsStorage,
queue: QueueABC,
handlers: list[ScraperHandler],
) -> jsonrpc.API:
) -> jsonrpc.API: # pragma: no cover
"""
Create JsonRPC API (FastAPI is used under the hood)
@@ -132,6 +173,7 @@ def create_api(
handlers,
)
)
app.add_route("/metrics", metrics)
app.mount(
"/docs/",
StaticFiles(
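The new `metrics` route picks a registry depending on whether Prometheus multiprocess mode is active (the `prometheus_multiproc_dir` environment variable) and renders it with `generate_latest`. A standalone sketch of that rendering path, using only `prometheus_client` and a throwaway counter (the metric name is made up; in sneakpeek the samples come from the `count_invocations`/`measure_latency` decorators):

```python
# Standalone sketch of the rendering path behind the new /metrics route.
from prometheus_client import CONTENT_TYPE_LATEST, REGISTRY, Counter, generate_latest

# Throwaway counter so the output isn't empty; a real deployment would expose
# the counters and histograms registered by sneakpeek.metrics instead.
demo_counter = Counter("demo_requests_total", "Example counter for illustration")
demo_counter.inc()

payload = generate_latest(REGISTRY)  # bytes in the Prometheus text exposition format
print(CONTENT_TYPE_LATEST)  # value the handler sends as the Content-Type header
print(payload.decode("utf-8"))
```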
12 changes: 10 additions & 2 deletions sneakpeek/metrics.py
@@ -1,5 +1,6 @@
import asyncio
from functools import wraps
from typing import Any

from prometheus_client import Counter, Gauge, Histogram

@@ -29,6 +30,13 @@
)


def _get_full_class_name(obj: Any) -> str:
module = obj.__class__.__module__
if module is None or module == str.__class__.__module__:
return obj.__class__.__name__
return module + "." + obj.__class__.__name__


def measure_latency(subsystem: str):
"""
Decorator for measuring latency of the function (works for both sync and async functions).
@@ -121,7 +129,7 @@ def sync_wrapper(*args, **kwargs):
subsystem=subsystem,
method=func.__name__,
type="error",
error=e.__class__,
error=_get_full_class_name(e),
).inc()
raise

@@ -147,7 +155,7 @@ async def async_wrapper(*args, **kwargs):
subsystem=subsystem,
method=func.__name__,
type="error",
error=e.__class__,
error=_get_full_class_name(e),
).inc()
raise

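The error counter now labels failures with the exception's fully qualified class name instead of the raw `e.__class__` object. A quick illustration of what `_get_full_class_name` returns (the helper is re-declared here so the snippet is self-contained, and the custom exception is hypothetical):

```python
# Illustration of the new error label values.
from typing import Any


def _get_full_class_name(obj: Any) -> str:
    module = obj.__class__.__module__
    if module is None or module == str.__class__.__module__:  # i.e. "builtins"
        return obj.__class__.__name__
    return module + "." + obj.__class__.__name__


class ScraperTimeoutError(Exception):
    """Hypothetical error defined outside the standard library."""


print(_get_full_class_name(ValueError("boom")))      # -> "ValueError"
print(_get_full_class_name(ScraperTimeoutError()))   # -> "__main__.ScraperTimeoutError"
```

Built-in exceptions keep their short name, while project-specific errors carry their module path, which makes the `error` label much easier to filter on in Prometheus.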
2 changes: 1 addition & 1 deletion sneakpeek/runner.py
@@ -97,7 +97,7 @@ async def ping_session():
context = ScraperContext(job.scraper.config, self._plugins, ping_session)
try:
await context.start_session()
await self._queue.ping_scraper_job(job.scraper.id, job.id)
await ping_session()
handler = self._get_handler(job)
job.result = await handler.run(context)
job.status = ScraperJobStatus.SUCCEEDED
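The runner change replaces a direct `queue.ping_scraper_job(...)` call with the local `ping_session()` helper that is already handed to `ScraperContext`, keeping the keep-alive logic in one place. A simplified, hypothetical sketch of that pattern (the class and function names below are stand-ins, not sneakpeek's actual API):

```python
# Simplified sketch of the keep-alive pattern: one ping coroutine, defined once,
# reused by both the context and the runner. Names are illustrative only.
import asyncio


class DemoContext:
    def __init__(self, ping) -> None:
        self._ping = ping

    async def start_session(self) -> None:
        await self._ping()  # keep the job marked as alive while the session starts


async def run_job(job_id: int) -> None:
    async def ping_session() -> None:
        print(f"pinging job {job_id}")  # sneakpeek pings the job queue here

    context = DemoContext(ping_session)
    await context.start_session()
    await ping_session()  # the runner now reuses the same helper


asyncio.run(run_job(1))
```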