diff --git a/harambe/core.py b/harambe/core.py
index 9a3abf1..f0961e8 100644
--- a/harambe/core.py
+++ b/harambe/core.py
@@ -24,7 +24,7 @@
     LoggingObserver,
     OutputObserver,
     DownloadMeta,
-    DeduplicationObserver,
+    DuplicateHandler,
     ObservationTrigger,
 )
 from harambe.tracker import FileDataTracker
@@ -75,7 +75,7 @@ def __init__(
             observer = [observer]
 
         self._observers = observer
-        self._deduper = DeduplicationObserver()
+        self._deduper = DuplicateHandler()
 
     async def save_data(self, *data: ScrapeResult) -> None:
         """
diff --git a/harambe/observer.py b/harambe/observer.py
index 7b13845..1343108 100644
--- a/harambe/observer.py
+++ b/harambe/observer.py
@@ -125,7 +125,7 @@ def files(self) -> List[Tuple[str, bytes]]:
         return self._files
 
 
-class DeduplicationObserver(OutputObserver):
+class DuplicateHandler:
     def __init__(self):
         self._saved_data: set[bytes] = set()
         self.rows_on_page = 0
diff --git a/tests/test_observers.py b/tests/test_observers.py
index c6e6026..5dfd74c 100644
--- a/tests/test_observers.py
+++ b/tests/test_observers.py
@@ -1,6 +1,6 @@
 import pytest
 
-from harambe.observer import InMemoryObserver, DeduplicationObserver
+from harambe.observer import InMemoryObserver, DuplicateHandler
 
 
 @pytest.mark.asyncio
@@ -28,7 +28,7 @@ async def in_memory_on_queue_url():
 
 @pytest.mark.asyncio
 async def test_stop_pagination_observer_duplicate_data_error():
-    observer = DeduplicationObserver()
+    observer = DuplicateHandler()
     unduplicated = await observer.on_save_data({"foo": "bar"})
     await observer.on_paginate("https://example.com/page2")
 
@@ -42,7 +42,7 @@ async def test_stop_pagination_observer_duplicate_url_error():
 
 @pytest.mark.asyncio
 async def test_stop_pagination_observer_duplicate_url_error():
-    observer = DeduplicationObserver()
+    observer = DuplicateHandler()
     unduplicated = await observer.on_queue_url("https://example.com", {"foo": "bar"})
     await observer.on_paginate("https://example.com/page2")
 
@@ -56,7 +56,7 @@ async def test_stop_pagination_observer_duplicate_url_error():
 
 @pytest.mark.asyncio
 async def test_stop_pagination_observer_duplicate_download_error():
-    observer = DeduplicationObserver()
+    observer = DuplicateHandler()
     unduplicated = await observer.on_download("https://example.com", "foo.txt", b"foo")
     await observer.on_paginate("https://example.com/page2")
 
@@ -70,7 +70,7 @@ async def test_stop_pagination_observer_duplicate_download_error():
 
 @pytest.mark.asyncio
 async def test_stop_pagination_observer_no_duplicate_data():
-    observer = DeduplicationObserver()
+    observer = DuplicateHandler()
     unduplicated1 = await observer.on_save_data({"foo": "bar"})
     await observer.on_paginate("https://example.com/page2")
     unduplicated2 = await observer.on_save_data({"baz": "qux"})
@@ -88,7 +88,7 @@ async def test_stop_pagination_observer_no_duplicate_data():
 
 @pytest.mark.asyncio
 async def test_ignore_underscore_attributes():
-    observer = DeduplicationObserver()
+    observer = DuplicateHandler()
     unduplicated1 = await observer.on_save_data({"foo": "bar", "__url": "qux"})
     unduplicated2 = await observer.on_save_data({"qux": "bar", "__url": "qux"})
 
@@ -105,7 +105,7 @@ async def test_ignore_underscore_attributes():
 
 @pytest.mark.asyncio
 async def test_duplicate_data_without_pagination():
-    observer = DeduplicationObserver()
+    observer = DuplicateHandler()
     unduplicated = await observer.on_save_data({"foo": "bar"})
     duplicated = await observer.on_save_data({"foo": "bar"})
     assert not unduplicated and duplicated