Update type hint to not break docusaurus
Pijukatel committed Feb 13, 2025
1 parent 3aff36f commit 760f272
Showing 6 changed files with 13 additions and 11 deletions.
4 changes: 3 additions & 1 deletion docs/guides/playwright_crawler_adaptive.mdx
@@ -34,7 +34,9 @@ Request handler for <ApiLink to="class/AdaptivePlaywrightCrawler">`AdaptivePlaywr

<ApiLink to="class/AdaptivePlaywrightCrawlingContext#wait_for_selector">`wait_for_selector`</ApiLink> accepts `css` selector as first argument and timeout as second argument. The function will try to locate this selector a return once it is found(within timeout). In practice this means that if HTTP-based sub crawler was used, the function will find the selector only if it is part of the static content. If not, the adaptive crawler will fall back to the playwright sub crawler and will wait try to locate the selector within the timeout using playwright.

<ApiLink to="class/AdaptivePlaywrightCrawlingContext#query_selector">`query_selector`</ApiLink> accepts `css` selector as first argument and timeout as second argument. This function acts similar to `wait_for_selector`, but it also returns this selector if found. Return value type is determined by used HTTP-based sub crawler. For example, it will be `Selector` for <ApiLink to="class/ParselCrawler">`ParselCrawler`</ApiLink> and `Tag` for <ApiLink to="class/BeautifulSoupCrawler">`BeautifulSoupCrawler`</ApiLink>.
<ApiLink to="class/AdaptivePlaywrightCrawlingContext#query_selector_one">`query_selector_one`</ApiLink> accepts `css` selector as first argument and timeout as second argument. This function acts similar to `wait_for_selector`, but it also returns one selector if any selector is found. Return value type is determined by used HTTP-based sub crawler. For example, it will be `Selector` for <ApiLink to="class/ParselCrawler">`ParselCrawler`</ApiLink> and `Tag` for <ApiLink to="class/BeautifulSoupCrawler">`BeautifulSoupCrawler`</ApiLink>.

<ApiLink to="class/AdaptivePlaywrightCrawlingContext#query_selector_one">`query_selector_all`</ApiLink> same as <ApiLink to="class/AdaptivePlaywrightCrawlingContext#query_selector_one">`query_selector_one`</ApiLink>, but returns all found selectors.

<ApiLink to="class/AdaptivePlaywrightCrawlingContext#parse_with_static_parser">`parse_with_static_parser`</ApiLink> will re-parse the whole page. Return value type is determined by used HTTP-based sub crawler. It has optional arguments: `selector` and `timeout`. If those optional arguments are used then the function first calls <ApiLink to="class/AdaptivePlaywrightCrawlingContext#wait_for_selector">`wait_for_selector`</ApiLink> and then do the parsing. This can be used in scenario where some specific element can signal, that page is already complete.

4 changes: 2 additions & 2 deletions src/crawlee/crawlers/_abstract_http/_abstract_http_parser.py
@@ -10,7 +10,7 @@
from ._http_crawling_context import TParseResult, TSelectResult

if TYPE_CHECKING:
from collections.abc import Iterable
from collections.abc import Iterable, Sequence

from crawlee.http_clients import HttpResponse

@@ -42,7 +42,7 @@ async def parse_text(self, text: str) -> TParseResult:
"""

@abstractmethod
async def select(self, parsed_content: TParseResult, selector: str) -> tuple[TSelectResult, ...]:
async def select(self, parsed_content: TParseResult, selector: str) -> Sequence[TSelectResult]:
"""Use css selector to select page element and return it.
Args:
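The hunk above relaxes the declared return type from `tuple[TSelectResult, ...]` to `Sequence[TSelectResult]`, so concrete parsers can keep returning tuples: `tuple` already satisfies `Sequence`. A toy illustration with a hypothetical class (not part of the codebase):

```python
import asyncio
from collections.abc import Sequence


class ToyParser:
    """Hypothetical stand-in showing that a tuple return satisfies Sequence."""

    async def select(self, parsed_content: str, selector: str) -> Sequence[str]:
        # tuple[str, ...] is a valid Sequence[str], so callers and type
        # checkers are unaffected by the looser annotation.
        return tuple(line for line in parsed_content.splitlines() if selector in line)


async def main() -> None:
    parser = ToyParser()
    print(await parser.select('<p>a</p>\n<div>b</div>', 'div'))  # ('<div>b</div>',)


if __name__ == '__main__':
    asyncio.run(main())
```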
4 changes: 2 additions & 2 deletions src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py
@@ -16,7 +16,7 @@
)

if TYPE_CHECKING:
from collections.abc import Awaitable, Callable
from collections.abc import Awaitable, Callable, Sequence

from playwright.async_api import Page, Response
from typing_extensions import Self
@@ -111,7 +111,7 @@ async def query_selector_one(

async def query_selector_all(
self, selector: str, timeout: timedelta = timedelta(seconds=5)
) -> tuple[TStaticSelectResult, ...]:
) -> Sequence[TStaticSelectResult]:
"""Locate element by css selector and return all elements found.
If element is not found within timeout, `TimeoutError` is raised.
4 changes: 2 additions & 2 deletions src/crawlee/crawlers/_beautifulsoup/_beautifulsoup_parser.py
@@ -8,7 +8,7 @@
from crawlee.crawlers._abstract_http import AbstractHttpParser

if TYPE_CHECKING:
from collections.abc import Iterable
from collections.abc import Iterable, Sequence

from crawlee.http_clients import HttpResponse

@@ -32,7 +32,7 @@ def is_matching_selector(self, parsed_content: Tag, selector: str) -> bool:
return parsed_content.select_one(selector) is not None

@override
async def select(self, parsed_content: Tag, selector: str) -> tuple[Tag, ...]:
async def select(self, parsed_content: Tag, selector: str) -> Sequence[Tag]:
return tuple(match for match in parsed_content.select(selector))

@override
4 changes: 2 additions & 2 deletions src/crawlee/crawlers/_http/_http_parser.py
@@ -8,7 +8,7 @@
from crawlee.crawlers._types import BlockedInfo

if TYPE_CHECKING:
from collections.abc import Iterable
from collections.abc import Iterable, Sequence

from crawlee.http_clients import HttpResponse

@@ -28,7 +28,7 @@ async def parse_text(self, text: str) -> bytes:
raise NotImplementedError

@override
async def select(self, parsed_content: bytes, selector: str) -> tuple[bytes, ...]:
async def select(self, parsed_content: bytes, selector: str) -> Sequence[bytes]:
raise NotImplementedError

@override
4 changes: 2 additions & 2 deletions src/crawlee/crawlers/_parsel/_parsel_parser.py
@@ -9,7 +9,7 @@
from crawlee.crawlers._abstract_http import AbstractHttpParser

if TYPE_CHECKING:
from collections.abc import Iterable
from collections.abc import Iterable, Sequence

from crawlee.http_clients import HttpResponse

@@ -26,7 +26,7 @@ async def parse_text(self, text: str) -> Selector:
return Selector(text=text)

@override
async def select(self, parsed_content: Selector, selector: str) -> tuple[Selector, ...]:
async def select(self, parsed_content: Selector, selector: str) -> Sequence[Selector]:
return tuple(match for match in parsed_content.css(selector))

@override
