Parsing utils (#8)
* Adding regex search util

* Increasing integration test speed

* Fix readme

* Updating deps. Bumping version

* Removing demo
flulemon authored Apr 27, 2023
1 parent e0e92eb commit 33184f9
Showing 9 changed files with 155 additions and 285 deletions.
2 changes: 1 addition & 1 deletion README.md
@@ -174,7 +174,7 @@ from sneakpeek.runner import LocalRunner

And add the following lines to the end of the file:

```
```python3
if __name__ == "__main__":
LocalRunner.run(
DemoScraper(),
6 changes: 0 additions & 6 deletions demo/Dockerfile

This file was deleted.

41 changes: 0 additions & 41 deletions demo/demo_scraper.py

This file was deleted.

70 changes: 0 additions & 70 deletions demo/main.py

This file was deleted.

176 changes: 88 additions & 88 deletions poetry.lock

Large diffs are not rendered by default.

4 changes: 2 additions & 2 deletions pyproject.toml
@@ -1,7 +1,7 @@
[tool.poetry]
name = "sneakpeek-py"
packages = [{ include = "sneakpeek" }]
version = "0.1.5"
version = "0.1.6"
description = "Sneakpeek is a framework that helps to quickly and conveniently develop scrapers. It's the best choice for scrapers that have some specific complex scraping logic that needs to be run on a constant basis."
authors = ["Dan Yazovsky <[email protected]>"]
maintainers = ["Dan Yazovsky <[email protected]>"]
@@ -44,7 +44,7 @@ yarl = "^1.9.1"

[tool.poetry.group.dev.dependencies]
pytest = "^7.2.2"
fakeredis = "^2.10.3"
fakeredis = "2.11.0"
black = "^23.3.0"
pytest-lazy-fixture = "^0.6.3"
pytest-asyncio = "^0.21.0"
12 changes: 0 additions & 12 deletions sneakpeek/lib/storage/redis_storage.py
@@ -29,21 +29,9 @@ async def is_read_only(self) -> bool:
async def _generate_id(self) -> int:
return int(await self._redis.incr("internal:id_counter"))

async def _generate_queue_id(self, priority: ScraperJobPriority) -> int:
return int(await self._redis.incr(f"internal:queue:{priority}:last_id"))

async def _get_queue_last_id(self, priority: ScraperJobPriority) -> int:
return int(await self._redis.get(f"internal:queue:{priority}:last_id") or 0)

async def _get_queue_offset(self, priority: ScraperJobPriority) -> int:
return int(await self._redis.get(f"internal:queue:{priority}:offset") or 0)

def _get_scraper_key(self, id: int) -> str:
return f"scraper:{id}"

def _get_scraper_job_key(self, scraper_id: int, run_id: int) -> str:
return f"scraper_job:{scraper_id}:{run_id}"

@count_invocations(subsystem="storage")
@measure_latency(subsystem="storage")
async def search_scrapers(
30 changes: 30 additions & 0 deletions sneakpeek/scraper_context.py
@@ -1,4 +1,5 @@
import logging
import re
from abc import ABC, abstractmethod
from dataclasses import dataclass
from enum import Enum
@@ -37,6 +38,14 @@ class Request:
kwargs: dict[str, Any] | None = None


@dataclass
class RegexMatch:
"""Regex match"""

full_match: str #: Full regular expression match
groups: dict[str, str] #: Regular expression group matches


class BeforeRequestPlugin(ABC):
"""Abstract class for the plugin which is called before each request (like Middleware)"""

Expand Down Expand Up @@ -341,3 +350,24 @@ async def options(
kwargs=kwargs,
)
)

def regex(
self,
text: str,
pattern: str,
flags: re.RegexFlag = re.UNICODE | re.MULTILINE | re.IGNORECASE,
) -> list[RegexMatch]:
"""Find matches in the text using regular expression
Args:
text (str): Text to search in
pattern (str): Regular expression
flags (re.RegexFlag, optional): Regular expression flags. Defaults to re.UNICODE | re.MULTILINE | re.IGNORECASE.
Returns:
list[RegexMatch]: Matches found in the text
"""
return [
RegexMatch(full_match=match.group(0), groups=match.groupdict())
for match in re.finditer(pattern, text, flags)
]
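
For reference, a minimal sketch of how the new `regex` helper and `RegexMatch` dataclass might be used from a scraper handler. The handler function, sample text, and pattern below are illustrative assumptions, not part of this commit:

```python
from sneakpeek.scraper_context import ScraperContext


async def extract_items(context: ScraperContext) -> list[dict[str, str]]:
    # Illustrative input and pattern -- in a real handler the text would
    # typically come from a response fetched via the context.
    html = "<li>Item A: $10</li><li>Item B: $25</li>"
    matches = context.regex(
        text=html,
        pattern=r"Item (?P<name>\w+): \$(?P<price>\d+)",
    )
    # Each RegexMatch carries the full match plus its named-group captures.
    return [match.groups for match in matches]
```

With the default flags (`re.UNICODE | re.MULTILINE | re.IGNORECASE`) this would return `[{"name": "A", "price": "10"}, {"name": "B", "price": "25"}]`.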
99 changes: 34 additions & 65 deletions tests/test_integration.py
@@ -30,7 +30,8 @@
SCRAPER_1_ID = 100000001
SCRAPER_2_ID = 100000002
TEST_URL = "test_url"
MIN_SECONDS_TO_HAVE_1_SUCCESSFUL_RUN = 3
MIN_SECONDS_TO_HAVE_1_SUCCESSFUL_RUN = 2.1
MIN_SECONDS_TO_EXECUTE_RUN = 2.1
HANDLER_NAME = "test_scraper_handler"


@@ -63,76 +64,44 @@ def scrapers() -> list[Scraper]:
]


@pytest.fixture
def in_memory_scrapers_storage(scrapers: list[Scraper]) -> ScrapersStorage:
return InMemoryScrapersStorage(scrapers=scrapers)


@pytest.fixture
def redis_scrapers_storage(scrapers: list[Scraper]) -> ScrapersStorage:
storage = RedisScrapersStorage(FakeRedis())
loop = asyncio.get_event_loop()
for scraper in scrapers:
loop.run_until_complete(storage.create_scraper(scraper))
return storage


@pytest.fixture(
params=[
pytest.lazy_fixture(in_memory_scrapers_storage.__name__),
pytest.lazy_fixture(redis_scrapers_storage.__name__),
]
)
def scrapers_storage(request) -> ScrapersStorage:
return request.param


@pytest.fixture
def in_memory_jobs_storage() -> ScraperJobsStorage:
return InMemoryScraperJobsStorage()
Storages = tuple[ScrapersStorage, ScraperJobsStorage, LeaseStorage]


@pytest.fixture
def redis_jobs_storage(scrapers_storage: ScrapersStorage) -> ScraperJobsStorage:
return RedisScraperJobsStorage(FakeRedis(), scrapers_storage)


@pytest.fixture(
params=[
pytest.lazy_fixture(in_memory_jobs_storage.__name__),
pytest.lazy_fixture(redis_jobs_storage.__name__),
]
)
def jobs_storage(request) -> ScrapersStorage:
return request.param


@pytest.fixture
def in_memory_lease_storage() -> LeaseStorage:
return InMemoryLeaseStorage()
def in_memory_storage(scrapers: list[Scraper]) -> Storages:
return (
InMemoryScrapersStorage(scrapers=scrapers),
InMemoryScraperJobsStorage(),
InMemoryLeaseStorage(),
)


@pytest.fixture
def redis_lease_storage() -> LeaseStorage:
return RedisLeaseStorage(FakeRedis())
def redis_storage(scrapers: list[Scraper]) -> Storages:
scrapers_storage = RedisScrapersStorage(FakeRedis())
loop = asyncio.get_event_loop()
for scraper in scrapers:
loop.run_until_complete(scrapers_storage.create_scraper(scraper))
return (
scrapers_storage,
RedisScraperJobsStorage(FakeRedis(), scrapers_storage),
RedisLeaseStorage(FakeRedis()),
)


@pytest.fixture(
params=[
pytest.lazy_fixture(in_memory_lease_storage.__name__),
pytest.lazy_fixture(redis_lease_storage.__name__),
pytest.lazy_fixture(in_memory_storage.__name__),
pytest.lazy_fixture(redis_storage.__name__),
]
)
def lease_storage(request) -> LeaseStorage:
def storages(request) -> Storages:
return request.param


@pytest.fixture
def server_with_scheduler(
scrapers_storage: ScrapersStorage,
jobs_storage: ScraperJobsStorage,
lease_storage: LeaseStorage,
) -> SneakpeekServer:
def server_with_scheduler(storages: Storages) -> SneakpeekServer:
scrapers_storage, jobs_storage, lease_storage = storages
return SneakpeekServer.create(
handlers=[TestScraper()],
scrapers_storage=scrapers_storage,
@@ -145,11 +114,8 @@ def server_with_scheduler(


@pytest.fixture
def server_with_worker_only(
scrapers_storage: ScrapersStorage,
jobs_storage: ScraperJobsStorage,
lease_storage: LeaseStorage,
) -> SneakpeekServer:
def server_with_worker_only(storages: Storages) -> SneakpeekServer:
scrapers_storage, jobs_storage, lease_storage = storages
return SneakpeekServer.create(
handlers=[TestScraper()],
scrapers_storage=scrapers_storage,
@@ -165,8 +131,9 @@ def server_with_worker_only(
@pytest.mark.asyncio
async def test_scraper_schedules_and_completes(
server_with_scheduler: SneakpeekServer,
jobs_storage: ScraperJobsStorage,
storages: Storages,
):
_, jobs_storage, _ = storages
try:
server_with_scheduler.serve(blocking=False)
with patch("sneakpeek.scraper_context.ScraperContext.get") as mocked_request:
@@ -190,16 +157,17 @@ async def test_scraper_schedules_and_completes(
@pytest.mark.asyncio
async def test_scraper_completes_on_request(
server_with_worker_only: SneakpeekServer,
jobs_storage: ScraperJobsStorage,
storages: Storages,
):
_, jobs_storage, _ = storages
try:
server_with_worker_only.serve(blocking=False)
with patch("sneakpeek.scraper_context.ScraperContext.get") as mocked_request:
await server_with_worker_only.worker._queue.enqueue(
SCRAPER_1_ID,
ScraperJobPriority.HIGH,
)
await asyncio.sleep(2)
await asyncio.sleep(MIN_SECONDS_TO_EXECUTE_RUN)
jobs = await jobs_storage.get_scraper_jobs(SCRAPER_1_ID)
assert len(jobs) == 1, "Expected scraper to be run once"
assert (
@@ -216,8 +184,9 @@ async def test_scraper_completes_on_request(
@pytest.mark.asyncio
async def test_jobs_are_executed_according_to_priority(
server_with_worker_only: SneakpeekServer,
jobs_storage: ScraperJobsStorage,
storages: Storages,
):
_, jobs_storage, _ = storages
try:
high_pri_job = await server_with_worker_only.worker._queue.enqueue(
SCRAPER_1_ID,
Expand All @@ -229,7 +198,7 @@ async def test_jobs_are_executed_according_to_priority(
)
server_with_worker_only.serve(blocking=False)
with patch("sneakpeek.scraper_context.ScraperContext.get") as mocked_request:
await asyncio.sleep(3)
await asyncio.sleep(MIN_SECONDS_TO_EXECUTE_RUN)
high_pri_job = await jobs_storage.get_scraper_job(
SCRAPER_1_ID, high_pri_job.id
)
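
One part of the "Increasing integration test speed" change is the fixture consolidation above: instead of parametrizing `scrapers_storage`, `jobs_storage`, and `lease_storage` independently (which multiplied the test matrix across backend combinations), a single `storages` fixture yields one matched tuple per backend. A standalone sketch of that pattern, with illustrative fixture names that are not taken from the repository:

```python
import pytest


@pytest.fixture
def in_memory_backend() -> tuple[str, str, str]:
    return ("in-memory scrapers", "in-memory jobs", "in-memory leases")


@pytest.fixture
def redis_backend() -> tuple[str, str, str]:
    return ("redis scrapers", "redis jobs", "redis leases")


# Parametrizing over whole tuples keeps the three storages consistent within a
# test and runs each test once per backend rather than once per combination.
@pytest.fixture(
    params=[
        pytest.lazy_fixture("in_memory_backend"),
        pytest.lazy_fixture("redis_backend"),
    ]
)
def backend(request) -> tuple[str, str, str]:
    return request.param


def test_backend_parts_are_consistent(backend: tuple[str, str, str]):
    scrapers, jobs, leases = backend
    assert scrapers.split()[0] == jobs.split()[0] == leases.split()[0]
```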
