🐛 Make enqueue work on pages with no go to url
awtkns committed Jan 4, 2025
1 parent fecb4fd commit 9381591
Showing 4 changed files with 51 additions and 30 deletions.
2 changes: 1 addition & 1 deletion core/harambe_core/normalize_url.py
@@ -19,7 +19,7 @@ def normalize_url(path: str, base_path: str | None) -> str:
     path = _normalize(path)
     escaped_path = path.replace(" ", "%20")

-    if base_path is None:
+    if base_path is None or base_path == "about:blank":
         return escaped_path

     validate_allowed_scheme(base_path, scheme_required=True)
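
When navigation is disabled, the underlying page never leaves `about:blank`, so that URL now short-circuits resolution the same way a missing base path does. A minimal sketch of the intended behavior (not part of this commit; it assumes `_normalize` leaves an already-absolute URL untouched, as the `base_path is None` branch implies):

# Hypothetical illustration of the new branch, mirroring the diff above.
from harambe_core.normalize_url import normalize_url

# Previously only a missing base path returned the escaped path unchanged;
# "about:blank" (the URL of a page that was never navigated) now does too.
assert normalize_url("https://reworkd.ai", None) == "https://reworkd.ai"
assert normalize_url("https://reworkd.ai", "about:blank") == "https://reworkd.ai"
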
6 changes: 3 additions & 3 deletions sdk/harambe/core.py
@@ -132,11 +132,11 @@ async def save_data(self, *data: ScrapeResult) -> None:
                 "`SDK.save_data` should be called with one dict at a time, not a list of dicts."
             )

-        url = self.page.url
+        source_url = self.page.url
         for d in data:
             if self._validator is not None:
-                d = self._validator.validate(d, base_url=self.page.url)
-            d["__url"] = url
+                d = self._validator.validate(d, base_url=source_url)
+            d["__url"] = source_url
             await self._notify_observers("on_save_data", d)

     async def enqueue(
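
The rename also reads the page URL once into `source_url` and reuses it for both validation and the `__url` field, so the two values can no longer diverge. A minimal observer sketch (hypothetical, not from this commit) showing where that field surfaces; the `on_save_data` hook name comes from the `_notify_observers("on_save_data", d)` call above:

# Hypothetical observer; any object with an async on_save_data method works here.
class PrintObserver:
    async def on_save_data(self, data: dict) -> None:
        # Every saved record carries the URL of the page it was scraped from.
        print(data["__url"], data)
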
68 changes: 47 additions & 21 deletions sdk/test/test_e2e.py
@@ -4,12 +4,12 @@
 import pytest
 from aiohttp import web
 from bs4 import BeautifulSoup
-from harambe.observer import InMemoryObserver
-from harambe.types import BrowserType
-from harambe_core.errors import GotoError

 from harambe import SDK
 from harambe.contrib import playwright_harness, soup_harness
+from harambe.observer import InMemoryObserver
+from harambe.types import BrowserType
+from harambe_core.errors import GotoError


 @pytest.fixture(scope="module")
@@ -53,7 +53,6 @@ def observer():
 async def test_save_data(server, observer, harness, browser_type):
     url = f"{server}/table"

-    @SDK.scraper("test", "detail", observer=observer)
     async def scraper(sdk: SDK, *args, **kwargs):
         page = sdk.page

@@ -73,6 +72,7 @@ async def scraper(sdk: SDK, *args, **kwargs):
         headless=True,
         harness=harness,
         browser_type=browser_type,
+        observer=observer,
     )

     assert len(observer.data) == 3
@@ -86,12 +86,13 @@ async def scraper(sdk: SDK, *args, **kwargs):


 async def test_enqueue_data(server, observer):
-    @SDK.scraper("test", "detail", observer=observer)
     async def scraper(sdk: SDK, *args, **kwargs):
         await sdk.enqueue("?page=1")
         await sdk.enqueue("/terms", "https://reworkd.ai")

-    await SDK.run(scraper=scraper, url=server, schema={}, headless=True)
+    await SDK.run(
+        scraper=scraper, url=server, schema={}, headless=True, observer=observer
+    )

     assert not observer.data
     assert len(observer.urls) == 3
@@ -106,18 +107,37 @@ async def scraper(sdk: SDK, *args, **kwargs):


 async def test_enqueue_data_with_context(server, observer):
-    @SDK.scraper("test", "detail", observer=observer)
     async def scraper(sdk: SDK, *args, **kwargs):
         await sdk.enqueue("/adam/?page=55", context={"last": "Watkins"})

-    await SDK.run(scraper=scraper, url=server, schema={}, headless=True)
+    await SDK.run(
+        scraper=scraper, url=server, schema={}, headless=True, observer=observer
+    )

     assert not observer.data
     assert len(observer.urls) == 1
     assert observer.urls[0][0] == f"{server}/adam/?page=55"
     assert observer.urls[0][1] == {"__url": f"{server}/", "last": "Watkins"}


+@pytest.mark.parametrize("harness", [playwright_harness, soup_harness])
+async def test_enqueue_no_goto_url(observer, harness):
+    async def scraper(sdk: SDK, *args, **kwargs):
+        await sdk.enqueue("https://reworkd.ai", context={"last": "Watkins"})
+
+    await SDK.run(
+        scraper=scraper,
+        url="https://example.com",
+        schema={},
+        headless=True,
+        disable_go_to_url=True,
+        harness=harness,
+        observer=observer,
+    )
+    assert len(observer.urls) == 1
+    assert observer.urls[0][0] == "https://reworkd.ai"
+
+
 @pytest.mark.parametrize("harness", [playwright_harness, soup_harness])
 async def test_enqueue_coro(server, observer, harness):
     @SDK.scraper("test", "detail", observer=observer)
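
The new test above exercises the combination this commit fixes: with `disable_go_to_url=True` the harness opens a page but never navigates, so `page.url` stays `about:blank` and any enqueued link must already be absolute. A usage sketch of the same pattern outside the test suite (not part of the diff; the `SDK.run` parameters mirror the new test and nothing beyond them is assumed):

import asyncio

from harambe import SDK
from harambe.contrib import playwright_harness


async def scraper(sdk: SDK, *args, **kwargs):
    # page.url is "about:blank" here, so only absolute URLs can be enqueued.
    await sdk.enqueue("https://reworkd.ai")


async def main() -> None:
    await SDK.run(
        scraper=scraper,
        url="https://example.com",
        schema={},
        headless=True,
        disable_go_to_url=True,
        harness=playwright_harness,
    )


if __name__ == "__main__":
    asyncio.run(main())
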
@@ -138,7 +158,6 @@ async def scraper(sdk: SDK, *args, **kwargs):

 @pytest.mark.parametrize("harness", [playwright_harness, soup_harness])
 async def test_paginate(server, observer, harness):
-    @SDK.scraper("test", "detail", observer=observer)
     async def scraper(sdk: SDK, *args, **kwargs):
         page = sdk.page
         await sdk.save_data({"content": await page.content()})
@@ -155,6 +174,7 @@ async def pager():
         schema={},
         headless=True,
         harness=harness,
+        observer=observer,
     )

     assert len(observer.data) == 2
@@ -167,7 +187,6 @@ async def pager():

 @pytest.mark.parametrize("harness", [playwright_harness, soup_harness])
 async def test_narcotics(server, observer, harness):
-    @SDK.scraper("test", "detail", observer=observer)
     async def scraper(sdk: SDK, *args, **kwargs) -> None:
         page = sdk.page
         await page.wait_for_selector("div#ds-content")
@@ -193,6 +212,7 @@ async def scraper(sdk: SDK, *args, **kwargs) -> None:
         schema={},
         headless=True,
         harness=harness,
+        observer=observer,
     )

     assert len(observer.data) == 1
@@ -205,7 +225,6 @@ async def scraper(sdk: SDK, *args, **kwargs) -> None:

 @pytest.mark.parametrize("harness", [playwright_harness, soup_harness])
 async def test_regulations(server, observer, harness):
-    @SDK.scraper("test", "detail", observer=observer)
     async def scraper(sdk: SDK, *args, **kwargs) -> None:
         page = sdk.page
         await page.wait_for_selector("table.table.mb-0.table-hover.table-striped")
@@ -223,6 +242,7 @@ async def scraper(sdk: SDK, *args, **kwargs) -> None:
         schema={},
         headless=True,
         harness=harness,
+        observer=observer,
     )

     assert not observer.data
@@ -239,7 +259,6 @@ async def scraper(sdk: SDK, *args, **kwargs) -> None:
 async def test_text_content(server, observer, harness):
     url = f"{server}/table"

-    @SDK.scraper("test", "detail", observer=observer)
     async def scraper(sdk: SDK, *args, **kwargs):
         page = sdk.page
         content = await page.text_content("table")
@@ -248,7 +267,14 @@ async def scraper(sdk: SDK, *args, **kwargs):
         table = await page.query_selector("table")
         await sdk.save_data({"table_content": await table.text_content()})

-    await SDK.run(scraper=scraper, url=url, schema={}, headless=True, harness=harness)
+    await SDK.run(
+        scraper=scraper,
+        url=url,
+        schema={},
+        headless=True,
+        harness=harness,
+        observer=observer,
+    )
     assert len(observer.data) == 2

     assert observer.data[0]["page_content"] == observer.data[1]["table_content"]
@@ -330,7 +356,7 @@ async def test_save_local_storage(server, observer, harness):
         "value": "test",
     }

-    @SDK.scraper(local_storage_entry["domain"], "detail", observer=observer)
+    @SDK.scraper(local_storage_entry["domain"], "detail")
     async def scraper(sdk: SDK, *args, **kwargs):
         page = sdk.page
         # Save test local storage key value pair
@@ -346,6 +372,7 @@ async def scraper(sdk: SDK, *args, **kwargs):
         headless=True,
         harness=harness,
         schema={},
+        observer=observer,
     )

     assert len(observer.local_storage) == 1
@@ -388,7 +415,6 @@ async def test_load_local_storage(
         "value": test_value,
     }

-    @SDK.scraper("test", "detail", observer=observer)
     async def scraper(sdk: SDK, *args, **kwargs):
         page = sdk.page
         page_local_storage = await page.evaluate("localStorage")
@@ -401,10 +427,10 @@ async def scraper(sdk: SDK, *args, **kwargs):
         harness=harness,
         schema={},
         local_storage=[local_storage_entry_1, local_storage_entry_2],
+        observer=observer,
     )

     assert len(observer.data) == 1
-    print(observer.data)
     assert observer.data[0]["local_storage"] == {
         local_storage_entry_1["key"]: expected_value,
         local_storage_entry_2["key"]: expected_value,
@@ -420,7 +446,6 @@ async def test_reset_local_storage(server, observer, harness):
         "value": "test_value",
     }

-    @SDK.scraper("test", "detail", observer=observer)
     async def scraper(sdk: SDK, current_url: str, *args, **kwargs):
         page = sdk.page
         await page.evaluate("localStorage.clear();")
@@ -435,6 +460,7 @@ async def scraper(sdk: SDK, current_url: str, *args, **kwargs):
         harness=harness,
         schema={},
         local_storage=[local_storage_entry, local_storage_entry],
+        observer=observer,
     )

     assert len(observer.data) == 1
@@ -447,7 +473,6 @@ async def test_capture_html_with_different_options(server, observer, harness):

     replaced_element = '<div id="reworkd">Replaced Text</div>'

-    @SDK.scraper("test", "detail", observer=observer)
     async def scraper(sdk: SDK, *args, **kwargs):
         full_html_metadata = await sdk.capture_html()
         await sdk.save_data(full_html_metadata)
@@ -469,6 +494,7 @@ async def scraper(sdk: SDK, *args, **kwargs):
         schema={},
         headless=True,
         harness=harness,
+        observer=observer,
     )

     assert len(observer.data) == 3
@@ -507,7 +533,6 @@ async def scraper(sdk: SDK, *args, **kwargs):
 async def test_capture_html_conversion_types(server, observer, harness):
     url = f"{server}/heading"

-    @SDK.scraper("test", "detail", observer=observer)
     async def scraper(sdk: SDK, *args, **kwargs):
         markdown_html_metadata = await sdk.capture_html()
         await sdk.save_data({"text": markdown_html_metadata["text"]})
@@ -521,6 +546,7 @@ async def scraper(sdk: SDK, *args, **kwargs):
         schema={},
         headless=True,
         harness=harness,
+        observer=observer,
     )

     assert len(observer.data) == 2
@@ -535,7 +561,6 @@ async def scraper(sdk: SDK, *args, **kwargs):
 async def test_capture_html_table(server, observer, harness):
     url = f"{server}/table"

-    @SDK.scraper("test", "detail", observer=observer)
     async def scraper(sdk: SDK, *args, **kwargs):
         text_html_metadata = await sdk.capture_html(html_converter_type="text")
         await sdk.save_data({"text": text_html_metadata["text"]})
@@ -546,6 +571,7 @@ async def scraper(sdk: SDK, *args, **kwargs):
         schema={},
         headless=True,
         harness=harness,
+        observer=observer,
     )

     assert len(observer.data) == 1
@@ -565,7 +591,6 @@ async def scraper(sdk: SDK, *args, **kwargs):
 async def test_capture_html_element_not_found(server, observer, harness):
     url = f"{server}/table"

-    @SDK.scraper("test", "detail", observer=observer)
     async def scraper(sdk: SDK, *args, **kwargs):
         with pytest.raises(ValueError):
             await sdk.capture_html("#missing .selector .lies .adam-watkins")
@@ -576,6 +601,7 @@ async def scraper(sdk: SDK, *args, **kwargs):
         schema={},
         headless=True,
         harness=harness,
+        observer=observer,
     )

     assert len(observer.data) == 0
5 changes: 0 additions & 5 deletions sdk/test/test_sdk.py
@@ -33,7 +33,6 @@ def test_sdk_init_assigns_correct_values():
     assert sdk._observers == [observer]


-@pytest.mark.asyncio
 async def test_sdk_save_data_calls_on_save_data_for_each_observer():
     page = AsyncMock(spec=Page)
     observer = AsyncMock(spec=OutputObserver)
@@ -44,7 +43,6 @@ async def test_sdk_save_data_calls_on_save_data_for_each_observer():
     assert observer.on_save_data.call_count == len(data)


-@pytest.mark.asyncio
 async def test_sdk_enqueue_calls_on_enqueue_url_for_each_observer():
     page = AsyncMock(spec=Page)
     page.url = "https://example.net"
@@ -82,7 +80,6 @@ def test_scraper_decorator_adds_observers_to_function(scraper):
     assert len(decorated_scraper.observer) == 2


-@pytest.mark.asyncio
 async def test_scraper_decorator_preserves_functionality_of_decorated_function(scraper):
     decorated_scraper = SDK.scraper("https://example.org", "listing")(scraper)

@@ -99,7 +96,6 @@ async def test_scraper_decorator_preserves_functionality_of_decorated_function(scraper):
     sdk.save_data.assert_awaited_once_with({"baz": "qux"})


-@pytest.mark.asyncio
 async def test_sdk_save_data_saves_valid_data():
     page = AsyncMock(spec=Page)
     page.url = "https://example.net"
@@ -125,7 +121,6 @@ async def test_sdk_save_data_saves_valid_data():
     assert observer.on_save_data.call_count == 2


-@pytest.mark.asyncio
 async def test_sdk_save_data_does_not_save_invalid_data():
     page = AsyncMock(spec=Page)
     observer = AsyncMock(spec=OutputObserver)
