Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

⚠️ Goto Error Handler #109

Merged
merged 5 commits into from
Jan 17, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions core/harambe_core/errors.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,10 @@
from typing import Any


async def default_error_callback(url: str, status: int, *args: Any) -> None:
    """Default goto error handler: always raise ``GotoError``.

    Args:
        url: The URL whose navigation produced the error status.
        status: The HTTP status code of the response.
        *args: Extra positional arguments supplied by the caller; they are
            accepted so the handler can be called with more than two
            arguments, and are otherwise ignored.

    Raises:
        GotoError: Unconditionally, constructed from ``url`` and ``status``.
    """
    raise GotoError(url, status)


class HarambeException(Exception):
"""Base exception for all custom exceptions in Harambe."""

Expand Down
2 changes: 1 addition & 1 deletion core/pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[project]
name = "harambe-core"
version = "0.59.2"
version = "0.59.3"
description = "Core types for harambe SDK 🐒🍌"
authors = [
{ name = "Adam Watkins", email = "[email protected]" }
Expand Down
6 changes: 3 additions & 3 deletions core/test/parser/test_type_date.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,9 +31,9 @@ def assert_is_iso_format(date_string):
)
def test_pydantic_type_date_validate_type_success(date_string):
parsed_date = ParserTypeDate.validate_type(date_string)
assert isinstance(
parsed_date, str
), f"Expected string for '{date_string}', got {parsed_date}"
assert isinstance(parsed_date, str), (
f"Expected string for '{date_string}', got {parsed_date}"
)
assert_is_iso_format(parsed_date)


Expand Down
1 change: 1 addition & 0 deletions sdk/harambe/contrib/soup/impl.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,7 @@ async def goto(self, url: str, **kwargs: Any) -> ResponseWithStatus:

class SoupResponseWithStatus:
status: int = res.status_code
headers: dict[str, str] = res.headers

return SoupResponseWithStatus()

Expand Down
1 change: 1 addition & 0 deletions sdk/harambe/contrib/types.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@ class ResponseWithStatus(Protocol):
"""Protocol for goto responses across all harnesses. Use minimal attributes required for current use cases."""

status: int
headers: dict[str, str]


class AbstractPage(Selectable[T], abc.ABC):
Expand Down
12 changes: 5 additions & 7 deletions sdk/harambe/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@
LocalStorage,
)
from harambe_core import SchemaParser, Schema
from harambe_core.errors import GotoError
from harambe_core.errors import default_error_callback
from harambe_core.normalize_url import normalize_url
from harambe_core.parser.expression import ExpressionEvaluator
from playwright.async_api import (
Expand All @@ -64,10 +64,6 @@
from harambe.contrib import WebHarness, playwright_harness


async def default_callback(url: str, status: int) -> None:
    """Default error callback: always raise ``GotoError``.

    Args:
        url: The URL whose navigation produced the error status.
        status: The HTTP status code of the response.

    Raises:
        GotoError: Unconditionally, constructed from ``url`` and ``status``.
    """
    raise GotoError(url, status)


class AsyncScraper(Protocol):
"""
Protocol that all class-based scrapers should implement.
Expand Down Expand Up @@ -453,7 +449,9 @@ async def run(
harness: WebHarness = playwright_harness,
evaluator: Optional[ExpressionEvaluator] = None,
observer: Optional[OutputObserver | List[OutputObserver]] = None,
callback: Callable[[str, int], Awaitable[None]] = default_callback,
goto_error_handler: Callable[
[str, int, dict[str, str]], Awaitable[None]
] = default_error_callback,
**harness_options: Unpack[HarnessOptions],
) -> "SDK":
"""
Expand Down Expand Up @@ -497,7 +495,7 @@ async def run(
if not harness_options.get("disable_go_to_url", False):
response = await page.goto(url)
if response.status >= 400:
await callback(url, response.status)
await goto_error_handler(url, response.status, response.headers)
Comment on lines -500 to +498
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The main change is passing the response headers back here so that we can check for our custom mitm-error header.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should we be using this for all go_to calls? Calling go_to directly wouldn't go through this error handler.

elif isinstance(page, SoupPage):
page.url = url
await scraper(sdk, url, context)
Expand Down
4 changes: 2 additions & 2 deletions sdk/pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,14 +1,14 @@
[project]
name = "harambe-sdk"
version = "0.59.2"
version = "0.59.3"
description = "Data extraction SDK for Playwright 🐒🍌"
authors = [
{ name = "Adam Watkins", email = "[email protected]" }
]
requires-python = ">=3.11,<4.0"
readme = "README.md"
dependencies = [
"harambe_core==0.59.2",
"harambe_core==0.59.3",
"playwright==1.47.0",
"beautifulsoup4==4.12.3",
"requests==2.32.3",
Expand Down
16 changes: 8 additions & 8 deletions sdk/test/test_e2e.py
Original file line number Diff line number Diff line change
Expand Up @@ -279,12 +279,12 @@ async def scraper(sdk: SDK, *args, **kwargs):

assert observer.data[0]["page_content"] == observer.data[1]["table_content"]
for text in ["Apple", "Orange", "Banana"]:
assert (
text in observer.data[0]["page_content"]
), f"{text} not in {observer.data[0]['page_content']}"
assert (
text in observer.data[1]["table_content"]
), f"{text} not in {observer.data[1]['table_content']}"
assert text in observer.data[0]["page_content"], (
f"{text} not in {observer.data[0]['page_content']}"
)
assert text in observer.data[1]["table_content"], (
f"{text} not in {observer.data[1]['table_content']}"
)


@pytest.mark.parametrize("harness", [soup_harness])
Expand Down Expand Up @@ -687,7 +687,7 @@ async def test_403_status_on_goto_with_custom_callback(
async def scrape(sdk: SDK, current_url, context) -> None:
await sdk.save_data({"key": "this shouldn't be saved if GotoError is raised"})

async def custom_error_handler(url, status_code):
async def custom_error_handler(url, status_code, *args):
print(f"Handled {status_code} for {url} gracefully.")

error_callback = custom_error_handler
Expand All @@ -698,7 +698,7 @@ async def custom_error_handler(url, status_code):
schema={},
context={"status": "Open"},
observer=observer,
callback=error_callback,
goto_error_handler=error_callback,
)

# Ensure data is saved when error is handled (either with custom or no callback)
Expand Down
18 changes: 9 additions & 9 deletions sdk/test/test_stealth.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,9 +36,9 @@ async def test_navigator_webdriver(async_page):
@pytest.mark.asyncio
async def test_user_agent(async_page):
user_agent = await async_page.evaluate("navigator.userAgent")
assert (
"headless" not in user_agent.lower()
), "User agent should not contain 'headless'"
assert "headless" not in user_agent.lower(), (
"User agent should not contain 'headless'"
)


@pytest.mark.asyncio
Expand All @@ -56,9 +56,9 @@ async def test_plugins(async_page):
@pytest.mark.asyncio
async def test_app_version(async_page):
app_version = await async_page.evaluate("navigator.appVersion")
assert (
"headless" not in app_version.lower()
), "App version should not contain 'headless'"
assert "headless" not in app_version.lower(), (
"App version should not contain 'headless'"
)


@pytest.mark.asyncio
Expand Down Expand Up @@ -88,6 +88,6 @@ async def test_connection_rtt(async_page):
}
""")

assert (
connection_rtt is not None and connection_rtt != 0
), "Connection RTT should exist and not be zero in non-headless browsers"
assert connection_rtt is not None and connection_rtt != 0, (
"Connection RTT should exist and not be zero in non-headless browsers"
)
Loading