diff --git a/docs/guides/playwright_crawler_adaptive.mdx b/docs/guides/playwright_crawler_adaptive.mdx
index 58f1fbdc38..0beb81954f 100644
--- a/docs/guides/playwright_crawler_adaptive.mdx
+++ b/docs/guides/playwright_crawler_adaptive.mdx
@@ -34,7 +34,9 @@ Request handler for `AdaptivePlayw
 
 `wait_for_selector` accepts a `css` selector as its first argument and a timeout as its second argument. The function tries to locate the selector and returns once it is found (within the timeout). In practice this means that if the HTTP-based sub crawler was used, the function finds the selector only if it is part of the static content. If not, the adaptive crawler falls back to the Playwright sub crawler and tries to locate the selector within the timeout using Playwright.
 
-`query_selector` accepts `css` selector as first argument and timeout as second argument. This function acts similar to `wait_for_selector`, but it also returns this selector if found. Return value type is determined by used HTTP-based sub crawler. For example, it will be `Selector` for `ParselCrawler` and `Tag` for `BeautifulSoupCrawler`.
+`query_selector_one` accepts a `css` selector as its first argument and a timeout as its second argument. This function acts similarly to `wait_for_selector`, but it also returns one matching element if any is found. The return value type is determined by the HTTP-based sub crawler used. For example, it will be `Selector` for `ParselCrawler` and `Tag` for `BeautifulSoupCrawler`.
+
+`query_selector_all` is the same as `query_selector_one`, but it returns all matching elements.
 
 `parse_with_static_parser` re-parses the whole page. The return value type is determined by the HTTP-based sub crawler used. It has optional arguments: `selector` and `timeout`. If those optional arguments are used, the function first calls `wait_for_selector` and then does the parsing. This can be used in a scenario where a specific element signals that the page is complete.
diff --git a/src/crawlee/crawlers/_abstract_http/_abstract_http_parser.py b/src/crawlee/crawlers/_abstract_http/_abstract_http_parser.py
index 5281838181..c8c77965ef 100644
--- a/src/crawlee/crawlers/_abstract_http/_abstract_http_parser.py
+++ b/src/crawlee/crawlers/_abstract_http/_abstract_http_parser.py
@@ -10,7 +10,7 @@ from ._http_crawling_context import TParseResult, TSelectResult
 
 if TYPE_CHECKING:
-    from collections.abc import Iterable
+    from collections.abc import Iterable, Sequence
 
     from crawlee.http_clients import HttpResponse
 
@@ -42,7 +42,7 @@ async def parse_text(self, text: str) -> TParseResult:
         """
 
     @abstractmethod
-    async def select(self, parsed_content: TParseResult, selector: str) -> tuple[TSelectResult, ...]:
+    async def select(self, parsed_content: TParseResult, selector: str) -> Sequence[TSelectResult]:
         """Use css selector to select page element and return it.
         Args:
diff --git a/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py b/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py
index c0450410ed..ea39b13402 100644
--- a/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py
+++ b/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py
@@ -16,7 +16,7 @@
 )
 
 if TYPE_CHECKING:
-    from collections.abc import Awaitable, Callable
+    from collections.abc import Awaitable, Callable, Sequence
 
     from playwright.async_api import Page, Response
     from typing_extensions import Self
@@ -111,7 +111,7 @@ async def query_selector_one(
 
     async def query_selector_all(
         self, selector: str, timeout: timedelta = timedelta(seconds=5)
-    ) -> tuple[TStaticSelectResult, ...]:
+    ) -> Sequence[TStaticSelectResult]:
         """Locate element by css selector and return all elements found.
 
         If element is not found within timeout, `TimeoutError` is raised.
diff --git a/src/crawlee/crawlers/_beautifulsoup/_beautifulsoup_parser.py b/src/crawlee/crawlers/_beautifulsoup/_beautifulsoup_parser.py
index 87b056367d..f0543b9a8e 100644
--- a/src/crawlee/crawlers/_beautifulsoup/_beautifulsoup_parser.py
+++ b/src/crawlee/crawlers/_beautifulsoup/_beautifulsoup_parser.py
@@ -8,7 +8,7 @@ from crawlee.crawlers._abstract_http import AbstractHttpParser
 
 if TYPE_CHECKING:
-    from collections.abc import Iterable
+    from collections.abc import Iterable, Sequence
 
     from crawlee.http_clients import HttpResponse
 
@@ -32,7 +32,7 @@ def is_matching_selector(self, parsed_content: Tag, selector: str) -> bool:
         return parsed_content.select_one(selector) is not None
 
     @override
-    async def select(self, parsed_content: Tag, selector: str) -> tuple[Tag, ...]:
+    async def select(self, parsed_content: Tag, selector: str) -> Sequence[Tag]:
         return tuple(match for match in parsed_content.select(selector))
 
     @override
diff --git a/src/crawlee/crawlers/_http/_http_parser.py b/src/crawlee/crawlers/_http/_http_parser.py
index 75fa534af1..0a9af538dc 100644
--- a/src/crawlee/crawlers/_http/_http_parser.py
+++ b/src/crawlee/crawlers/_http/_http_parser.py
@@ -8,7 +8,7 @@ from crawlee.crawlers._types import BlockedInfo
 
 if TYPE_CHECKING:
-    from collections.abc import Iterable
+    from collections.abc import Iterable, Sequence
 
     from crawlee.http_clients import HttpResponse
 
@@ -28,7 +28,7 @@ async def parse_text(self, text: str) -> bytes:
         raise NotImplementedError
 
     @override
-    async def select(self, parsed_content: bytes, selector: str) -> tuple[bytes, ...]:
+    async def select(self, parsed_content: bytes, selector: str) -> Sequence[bytes]:
         raise NotImplementedError
 
     @override
diff --git a/src/crawlee/crawlers/_parsel/_parsel_parser.py b/src/crawlee/crawlers/_parsel/_parsel_parser.py
index 2bb9b1056c..9baa1eba7c 100644
--- a/src/crawlee/crawlers/_parsel/_parsel_parser.py
+++ b/src/crawlee/crawlers/_parsel/_parsel_parser.py
@@ -9,7 +9,7 @@ from crawlee.crawlers._abstract_http import AbstractHttpParser
 
 if TYPE_CHECKING:
-    from collections.abc import Iterable
+    from collections.abc import Iterable, Sequence
 
     from crawlee.http_clients import HttpResponse
 
@@ -26,7 +26,7 @@ async def parse_text(self, text: str) -> Selector:
         return Selector(text=text)
 
     @override
-    async def select(self, parsed_content: Selector, selector: str) -> tuple[Selector, ...]:
+    async def select(self, parsed_content: Selector, selector: str) -> Sequence[Selector]:
         return tuple(match for match in parsed_content.css(selector))
 
     @override
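For reference, a minimal sketch of how the renamed helpers read from a request handler. The `with_beautifulsoup_static_parser` constructor is the existing adaptive-crawler factory; the example URL and the pushed field names are illustrative assumptions, not part of this diff:

```python
import asyncio

from crawlee.crawlers import AdaptivePlaywrightCrawler, AdaptivePlaywrightCrawlingContext


async def main() -> None:
    # BeautifulSoup as the static parser, so the selection helpers return
    # `Tag` objects whenever the HTTP-based sub crawler handled the page.
    crawler = AdaptivePlaywrightCrawler.with_beautifulsoup_static_parser()

    @crawler.router.default_handler
    async def request_handler(context: AdaptivePlaywrightCrawlingContext) -> None:
        # Falls back to the Playwright sub crawler if `h1` is not in the
        # static content, then returns one matching element.
        title = await context.query_selector_one('h1')

        # Same lookup semantics, but returns all matches as a `Sequence`.
        links = await context.query_selector_all('a')

        await context.push_data({
            'url': context.request.url,
            'title': title.get_text() if title else None,
            'link_count': len(links),
        })

    await crawler.run(['https://crawlee.dev'])


if __name__ == '__main__':
    asyncio.run(main())
```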
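One note on the widened return type: `Sequence[TSelectResult]` promises only a read-only, indexable collection, so implementers of `AbstractHttpParser.select` are no longer forced to copy results into a tuple. A hypothetical override showing the loosened contract (the `ListReturningParselParser` name is made up for illustration):

```python
from __future__ import annotations

from collections.abc import Sequence

from parsel import Selector


class ListReturningParselParser:
    """Illustrative only: a plain list now satisfies the `select` contract."""

    async def select(self, parsed_content: Selector, selector: str) -> Sequence[Selector]:
        # No conversion to tuple needed; callers rely only on the read-only
        # `Sequence` interface (iteration, indexing, `len`).
        return list(parsed_content.css(selector))
```

The concrete parsers in this diff keep their existing `tuple(...)` bodies, which remain valid `Sequence` implementations; parsel's `SelectorList` is itself a `list` subclass, so it could even be returned directly.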