refactor!: rename enqueue_links to extract_links
vdusek committed Feb 12, 2025
1 parent 0da6a8e commit 8edd8ee
Showing 65 changed files with 182 additions and 175 deletions.
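
The change is a breaking rename of the crawling-context helper: call sites and examples switch from `context.enqueue_links()` to `context.extract_links()`, and the `EnqueueStrategy` enum becomes `ExtractStrategy`. A minimal sketch of what a migrated caller looks like, based on the README example further down (the `max_requests_per_crawl` value and the pushed field are illustrative):

```python
import asyncio

from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext


async def main() -> None:
    crawler = BeautifulSoupCrawler(max_requests_per_crawl=10)

    @crawler.router.default_handler
    async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
        # Push the extracted data to the default dataset.
        await context.push_data({'url': context.request.url})

        # Before this commit: await context.enqueue_links()
        await context.extract_links()

    # Run the crawler with the initial list of URLs.
    await crawler.run(['https://crawlee.dev'])


if __name__ == '__main__':
    asyncio.run(main())
```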
8 changes: 4 additions & 4 deletions README.md
@@ -114,8 +114,8 @@ async def main() -> None:
        # Push the extracted data to the default dataset.
        await context.push_data(data)

-        # Enqueue all links found on the page.
-        await context.enqueue_links()
+        # Extract and enqueue all links found on the page.
+        await context.extract_links()

    # Run the crawler with the initial list of URLs.
    await crawler.run(['https://crawlee.dev'])
@@ -154,8 +154,8 @@ async def main() -> None:
        # Push the extracted data to the default dataset.
        await context.push_data(data)

-        # Enqueue all links found on the page.
-        await context.enqueue_links()
+        # Extract and enqueue all links found on the page.
+        await context.extract_links()

    # Run the crawler with the initial list of requests.
    await crawler.run(['https://crawlee.dev'])
2 changes: 1 addition & 1 deletion docs/deployment/code/apify/crawler_as_actor_example.py
@@ -16,6 +16,6 @@ async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
                'title': context.soup.title.string if context.soup.title else None,
            }
            await context.push_data(data)
-            await context.enqueue_links()
+            await context.extract_links()

        await crawler.run(['https://crawlee.dev'])
4 changes: 2 additions & 2 deletions docs/examples/code/adaptive_playwright_crawler.py
@@ -30,8 +30,8 @@ async def request_handler(context: AdaptivePlaywrightCrawlingContext) -> None:
        # Do some processing using `parsed_content`
        context.log.info(context.parsed_content.title)

-        # Find more links and enqueue them.
-        await context.enqueue_links()
+        # Extract and enqueue all links found on the page.
+        await context.extract_links()
        await context.push_data({'Top crawler Url': context.request.url})

    @crawler.pre_navigation_hook
4 changes: 2 additions & 2 deletions docs/examples/code/crawl_all_links_on_website_bs.py
@@ -14,8 +14,8 @@ async def main() -> None:
    async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
        context.log.info(f'Processing {context.request.url} ...')

-        # Enqueue all links found on the page.
-        await context.enqueue_links()
+        # Extract and enqueue all links found on the page.
+        await context.extract_links()

    # Run the crawler with the initial list of requests.
    await crawler.run(['https://crawlee.dev'])
4 changes: 2 additions & 2 deletions docs/examples/code/crawl_all_links_on_website_pw.py
@@ -14,8 +14,8 @@ async def main() -> None:
    async def request_handler(context: PlaywrightCrawlingContext) -> None:
        context.log.info(f'Processing {context.request.url} ...')

-        # Enqueue all links found on the page.
-        await context.enqueue_links()
+        # Extract and enqueue all links found on the page.
+        await context.extract_links()

    # Run the crawler with the initial list of requests.
    await crawler.run(['https://crawlee.dev'])
4 changes: 2 additions & 2 deletions docs/examples/code/crawl_specific_links_on_website_bs.py
@@ -15,8 +15,8 @@ async def main() -> None:
    async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
        context.log.info(f'Processing {context.request.url} ...')

-        # Enqueue all the documentation links found on the page, except for the examples.
-        await context.enqueue_links(
+        # Extract and enqueue all links found on the page, except for the examples.
+        await context.extract_links(
            include=[Glob('https://crawlee.dev/docs/**')],
            exclude=[Glob('https://crawlee.dev/docs/examples')],
        )
4 changes: 2 additions & 2 deletions docs/examples/code/crawl_specific_links_on_website_pw.py
@@ -15,8 +15,8 @@ async def main() -> None:
    async def request_handler(context: PlaywrightCrawlingContext) -> None:
        context.log.info(f'Processing {context.request.url} ...')

-        # Enqueue all the documentation links found on the page, except for the examples.
-        await context.enqueue_links(
+        # Extract and enqueue all links found on the page, except for the examples.
+        await context.extract_links(
            include=[Glob('https://crawlee.dev/docs/**')],
            exclude=[Glob('https://crawlee.dev/docs/examples')],
        )
@@ -1,6 +1,6 @@
import asyncio

-from crawlee import EnqueueStrategy
+from crawlee import ExtractStrategy
from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext


@@ -15,9 +15,9 @@ async def main() -> None:
    async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
        context.log.info(f'Processing {context.request.url} ...')

-        # Enqueue all links found on the page. Any URLs found will be matched by
-        # this strategy, even if they go off the site you are currently crawling.
-        await context.enqueue_links(strategy=EnqueueStrategy.ALL)
+        # Extract and enqueue all links found on the page. Any URLs found will be matched
+        # by this strategy, even if they go off the site you are currently crawling.
+        await context.extract_links(strategy=ExtractStrategy.ALL)

    # Run the crawler with the initial list of requests.
    await crawler.run(['https://crawlee.dev'])
@@ -1,6 +1,6 @@
import asyncio

-from crawlee import EnqueueStrategy
+from crawlee import ExtractStrategy
from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext


@@ -15,9 +15,9 @@ async def main() -> None:
    async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
        context.log.info(f'Processing {context.request.url} ...')

-        # Setting the strategy to SAME_DOMAIN will enqueue all links found that
+        # Setting the strategy to SAME_DOMAIN will extract all links found that
        # are on the same hostname as request.loaded_url or request.url.
-        await context.enqueue_links(strategy=EnqueueStrategy.SAME_DOMAIN)
+        await context.extract_links(strategy=ExtractStrategy.SAME_DOMAIN)

    # Run the crawler with the initial list of requests.
    await crawler.run(['https://crawlee.dev'])
@@ -1,6 +1,6 @@
import asyncio

-from crawlee import EnqueueStrategy
+from crawlee import ExtractStrategy
from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext


@@ -15,9 +15,9 @@ async def main() -> None:
    async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
        context.log.info(f'Processing {context.request.url} ...')

-        # Setting the strategy to SAME_HOSTNAME will enqueue all links found that are on
+        # Setting the strategy to SAME_HOSTNAME will extract all links found that are on
        # the same hostname (including subdomains) as request.loaded_url or request.url.
-        await context.enqueue_links(strategy=EnqueueStrategy.SAME_HOSTNAME)
+        await context.extract_links(strategy=ExtractStrategy.SAME_HOSTNAME)

    # Run the crawler with the initial list of requests.
    await crawler.run(['https://crawlee.dev'])
@@ -1,6 +1,6 @@
import asyncio

-from crawlee import EnqueueStrategy
+from crawlee import ExtractStrategy
from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext


@@ -15,9 +15,9 @@ async def main() -> None:
    async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
        context.log.info(f'Processing {context.request.url} ...')

-        # Setting the strategy to SAME_ORIGIN will enqueue all links found that are on
+        # Setting the strategy to SAME_ORIGIN will extract all links found that are on
        # the same origin as request.loaded_url or request.url.
-        await context.enqueue_links(strategy=EnqueueStrategy.SAME_ORIGIN)
+        await context.extract_links(strategy=ExtractStrategy.SAME_ORIGIN)

    # Run the crawler with the initial list of requests.
    await crawler.run(['https://crawlee.dev'])
4 changes: 2 additions & 2 deletions docs/examples/code/export_entire_dataset_to_file_csv.py
@@ -20,8 +20,8 @@ async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
            'title': context.soup.title.string if context.soup.title else None,
        }

-        # Enqueue all links found on the page.
-        await context.enqueue_links()
+        # Extract and enqueue all links found on the page.
+        await context.extract_links()

        # Push the extracted data to the default dataset.
        await context.push_data(data)
4 changes: 2 additions & 2 deletions docs/examples/code/export_entire_dataset_to_file_json.py
@@ -20,8 +20,8 @@ async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
            'title': context.soup.title.string if context.soup.title else None,
        }

-        # Enqueue all links found on the page.
-        await context.enqueue_links()
+        # Extract and enqueue all links found on the page.
+        await context.extract_links()

        # Push the extracted data to the default dataset.
        await context.push_data(data)
4 changes: 2 additions & 2 deletions docs/examples/code/parsel_crawler.py
@@ -27,8 +27,8 @@ async def request_handler(context: ParselCrawlingContext) -> None:
        # Push the extracted data to the default dataset.
        await context.push_data(data)

-        # Enqueue all links found on the page.
-        await context.enqueue_links()
+        # Extract and enqueue all links found on the page.
+        await context.extract_links()

    # Register pre navigation hook which will be called before each request.
    # This hook is optional and does not need to be defined at all.
3 changes: 2 additions & 1 deletion docs/examples/code/playwright_block_requests.py
@@ -18,7 +18,8 @@ async def main() -> None:
    async def request_handler(context: PlaywrightCrawlingContext) -> None:
        context.log.info(f'Processing {context.request.url} ...')

-        await context.enqueue_links()
+        # Extract and enqueue all links found on the page.
+        await context.extract_links()

    # Define the hook, which will be called before every request.
    @crawler.pre_navigation_hook
2 changes: 1 addition & 1 deletion docs/examples/code/playwright_crawler.py
@@ -49,7 +49,7 @@ async def request_handler(context: PlaywrightCrawlingContext) -> None:
        await context.push_data(data)

        # Find a link to the next page and enqueue it if it exists.
-        await context.enqueue_links(selector='.morelink')
+        await context.extract_links(selector='.morelink')

    # Define a hook that will be called each time before navigating to a new URL.
    # The hook receives a context parameter, providing access to the request and
2 changes: 1 addition & 1 deletion docs/examples/code/playwright_crawler_with_camoufox.py
@@ -59,7 +59,7 @@ async def request_handler(context: PlaywrightCrawlingContext) -> None:
        await context.push_data({'title': title})

        # Find a link to the next page and enqueue it if it exists.
-        await context.enqueue_links(selector='.morelink')
+        await context.extract_links(selector='.morelink')

    # Run the crawler with the initial list of URLs.
    await crawler.run(['https://news.ycombinator.com/'])
@@ -34,7 +34,7 @@ async def request_handler(context: PlaywrightCrawlingContext) -> None:
        context.log.info(f'Processing {context.request.url} ...')

        # Find a link to the next page and enqueue it if it exists.
-        await context.enqueue_links(selector='.morelink')
+        await context.extract_links(selector='.morelink')

    # Run the crawler with the initial list of URLs.
    await crawler.run(['https://news.ycombinator.com/'])
4 changes: 2 additions & 2 deletions docs/examples/crawl_all_links_on_website.mdx
@@ -11,11 +11,11 @@ import CodeBlock from '@theme/CodeBlock';
import BeautifulSoupExample from '!!raw-loader!./code/crawl_all_links_on_website_bs.py';
import PlaywrightExample from '!!raw-loader!./code/crawl_all_links_on_website_pw.py';

-This example uses the <ApiLink to="class/EnqueueLinksFunction">`enqueue_links`</ApiLink> helper to add new links to the <ApiLink to="class/RequestQueue">`RequestQueue`</ApiLink> as the crawler navigates from page to page. By automatically discovering and enqueuing all links on a given page, the crawler can systematically scrape an entire website. This approach is ideal for web scraping tasks where you need to collect data from multiple interconnected pages.
+This example uses the <ApiLink to="class/ExtractLinksFunction">`extract_links`</ApiLink> helper function to extract new links from the page and add them to the <ApiLink to="class/RequestQueue">`RequestQueue`</ApiLink> as the crawler navigates from page to page. By automatically discovering and enqueuing all links on a given page, the crawler can systematically scrape an entire website. This approach is ideal for web scraping tasks where you need to collect data from multiple interconnected pages.

:::tip

-If no options are given, by default the method will only add links that are under the same subdomain. This behavior can be controlled with the `strategy` option, which is an instance of the <ApiLink to="enum/EnqueueStrategy">`EnqueueStrategy`</ApiLink> enum. You can find more info about this option in the [Crawl website with relative links](./crawl-website-with-relative-links) example.
+If no options are given, by default the method will only add links that are under the same subdomain. This behavior can be controlled with the `strategy` option, which is an instance of the <ApiLink to="enum/ExtractStrategy">`ExtractStrategy`</ApiLink> enum. You can find more info about this option in the [Crawl website with relative links](./crawl-website-with-relative-links) example.

:::
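
To make the tip in this document concrete, a handler can override the default same-subdomain behavior by passing the renamed strategy enum explicitly. A minimal sketch mirroring the `crawl_website_with_relative_links` example files changed in this commit (the request limit is illustrative):

```python
import asyncio

from crawlee import ExtractStrategy
from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext


async def main() -> None:
    crawler = BeautifulSoupCrawler(max_requests_per_crawl=20)

    @crawler.router.default_handler
    async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
        # Follow every discovered link, not only those under the same subdomain.
        await context.extract_links(strategy=ExtractStrategy.ALL)

    await crawler.run(['https://crawlee.dev'])


if __name__ == '__main__':
    asyncio.run(main())
```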
2 changes: 1 addition & 1 deletion docs/examples/crawl_specific_links_on_website.mdx
@@ -11,7 +11,7 @@ import CodeBlock from '@theme/CodeBlock';
import BeautifulSoupExample from '!!raw-loader!./code/crawl_specific_links_on_website_bs.py';
import PlaywrightExample from '!!raw-loader!./code/crawl_specific_links_on_website_pw.py';

-This example demonstrates how to crawl a website while targeting specific patterns of links. By utilizing the <ApiLink to="class/EnqueueLinksFunction">`enqueue_links`</ApiLink> helper, you can pass `include` or `exclude` parameters to improve your crawling strategy. This approach ensures that only the links matching the specified patterns are added to the <ApiLink to="class/RequestQueue">`RequestQueue`</ApiLink>. Both `include` and `exclude` support lists of globs or regular expressions. This functionality is great for focusing on relevant sections of a website and avoiding scraping unnecessary or irrelevant content.
+This example demonstrates how to crawl a website while targeting specific patterns of links. By utilizing the <ApiLink to="class/ExtractLinksFunction">`extract_links`</ApiLink> helper, you can pass `include` or `exclude` parameters to improve your crawling strategy. This approach ensures that only the links matching the specified patterns are added to the <ApiLink to="class/RequestQueue">`RequestQueue`</ApiLink>. Both `include` and `exclude` support lists of globs or regular expressions. This functionality is great for focusing on relevant sections of a website and avoiding scraping unnecessary or irrelevant content.

<Tabs groupId="main">
    <TabItem value="BeautifulSoupCrawler" label="BeautifulSoupCrawler">
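
A sketch of the `include`/`exclude` parameters described above: the glob comes from the example files changed in this commit, while the compiled-regex alternative in `exclude` is an assumption based on the statement that regular expressions are also accepted.

```python
import asyncio
import re

from crawlee import Glob
from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext


async def main() -> None:
    crawler = BeautifulSoupCrawler(max_requests_per_crawl=20)

    @crawler.router.default_handler
    async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
        # Only follow documentation links, skipping the examples section.
        await context.extract_links(
            include=[Glob('https://crawlee.dev/docs/**')],
            # Assumption: a compiled regular expression is accepted alongside globs.
            exclude=[re.compile(r'https://crawlee\.dev/docs/examples')],
        )

    await crawler.run(['https://crawlee.dev'])


if __name__ == '__main__':
    asyncio.run(main())
```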
12 changes: 6 additions & 6 deletions docs/examples/crawl_website_with_relative_links.mdx
@@ -13,20 +13,20 @@ import SameDomainExample from '!!raw-loader!./code/crawl_website_with_relative_l
import SameHostnameExample from '!!raw-loader!./code/crawl_website_with_relative_links_same_hostname.py';
import SameOriginExample from '!!raw-loader!./code/crawl_website_with_relative_links_same_origin.py';

-When crawling a website, you may encounter various types of links that you wish to include in your crawl. To facilitate this, we provide the <ApiLink to="class/EnqueueLinksFunction">`enqueue_links`</ApiLink> method on the crawler context, which will automatically find and add these links to the crawler's <ApiLink to="class/RequestQueue">`RequestQueue`</ApiLink>. This method simplifies the process of handling different types of links, including relative links, by automatically resolving them based on the page's context.
+When crawling a website, you may encounter various types of links that you wish to include in your crawl. To facilitate this, we provide the <ApiLink to="class/ExtractLinksFunction">`extract_links`</ApiLink> method on the crawler context, which will automatically find and add these links to the crawler's <ApiLink to="class/RequestQueue">`RequestQueue`</ApiLink>. This method simplifies the process of handling different types of links, including relative links, by automatically resolving them based on the page's context.

:::note

For these examples, we are using the <ApiLink to="class/BeautifulSoupCrawler">`BeautifulSoupCrawler`</ApiLink>. However, the same method is available for other crawlers as well. You can use it in exactly the same way.

:::

-<ApiLink to="enum/EnqueueStrategy">`EnqueueStrategy`</ApiLink> enum provides four distinct strategies for crawling relative links:
+<ApiLink to="enum/ExtractStrategy">`ExtractStrategy`</ApiLink> enum provides four distinct strategies for crawling relative links:

-- `EnqueueStrategy.All` - Enqueues all links found, regardless of the domain they point to. This strategy is useful when you want to follow every link, including those that navigate to external websites.
-- `EnqueueStrategy.SAME_DOMAIN` - Enqueues all links found that share the same domain name, including any possible subdomains. This strategy ensures that all links within the same top-level and base domain are included.
-- `EnqueueStrategy.SAME_HOSTNAME` - Enqueues all links found for the exact same hostname. This is the **default** strategy, and it restricts the crawl to links that have the same hostname as the current page, excluding subdomains.
-- `EnqueueStrategy.SAME_ORIGIN` - Enqueues all links found that share the same origin. The same origin refers to URLs that share the same protocol, domain, and port, ensuring a strict scope for the crawl.
+- `ExtractStrategy.All` - Enqueues all links found, regardless of the domain they point to. This strategy is useful when you want to follow every link, including those that navigate to external websites.
+- `ExtractStrategy.SAME_DOMAIN` - Enqueues all links found that share the same domain name, including any possible subdomains. This strategy ensures that all links within the same top-level and base domain are included.
+- `ExtractStrategy.SAME_HOSTNAME` - Enqueues all links found for the exact same hostname. This is the **default** strategy, and it restricts the crawl to links that have the same hostname as the current page, excluding subdomains.
+- `ExtractStrategy.SAME_ORIGIN` - Enqueues all links found that share the same origin. The same origin refers to URLs that share the same protocol, domain, and port, ensuring a strict scope for the crawl.

<Tabs groupId="main">
    <TabItem value="all_links" label="All links">
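
Condensed from the four example files changed earlier in this commit, the strategies differ only in the `strategy` argument passed to `extract_links` — a sketch (normally only one of these calls would appear in a handler):

```python
from crawlee import ExtractStrategy
from crawlee.crawlers import BeautifulSoupCrawlingContext


async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
    # Default behavior: same hostname as the current page, subdomains excluded.
    await context.extract_links()

    # Same registrable domain, subdomains included.
    await context.extract_links(strategy=ExtractStrategy.SAME_DOMAIN)

    # Same protocol, domain, and port only.
    await context.extract_links(strategy=ExtractStrategy.SAME_ORIGIN)

    # Any link, including links that leave the site entirely.
    await context.extract_links(strategy=ExtractStrategy.ALL)
```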
2 changes: 1 addition & 1 deletion docs/examples/parsel_crawler.mdx
@@ -8,7 +8,7 @@ import CodeBlock from '@theme/CodeBlock';

import ParselCrawlerExample from '!!raw-loader!./code/parsel_crawler.py';

-This example shows how to use <ApiLink to="class/ParselCrawler">`ParselCrawler`</ApiLink> to crawl a website or a list of URLs. Each URL is loaded using a plain HTTP request and the response is parsed using [Parsel](https://pypi.org/project/parsel/) library which supports CSS and XPath selectors for HTML responses and JMESPath for JSON responses. We can extract data from all kinds of complex HTML structures using XPath. In this example, we will use Parsel to crawl github.com and extract page title, URL and emails found in the webpage. The default handler will scrape data from the current webpage and enqueue all the links found in the webpage for continuous scraping. It also shows how you can add optional pre-navigation hook to the crawler. Pre-navigation hooks are user defined functions that execute before sending the request.
+This example shows how to use <ApiLink to="class/ParselCrawler">`ParselCrawler`</ApiLink> to crawl a website or a list of URLs. Each URL is loaded using a plain HTTP request and the response is parsed using [Parsel](https://pypi.org/project/parsel/) library which supports CSS and XPath selectors for HTML responses and JMESPath for JSON responses. We can extract data from all kinds of complex HTML structures using XPath. In this example, we will use Parsel to crawl github.com and extract page title, URL and emails found in the webpage. The default handler will scrape data from the current webpage and extract and enqueue all the links found in the webpage for continuous scraping. It also shows how you can add optional pre-navigation hook to the crawler. Pre-navigation hooks are user defined functions that execute before sending the request.

<CodeBlock className="language-python">
    {ParselCrawlerExample}
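
A rough sketch of the handler shape described above, assuming the `ParselCrawlingContext.selector` attribute and the CSS selector shown here; the email extraction from the full example file is omitted:

```python
import asyncio

from crawlee.crawlers import ParselCrawler, ParselCrawlingContext


async def main() -> None:
    crawler = ParselCrawler(max_requests_per_crawl=10)

    @crawler.router.default_handler
    async def request_handler(context: ParselCrawlingContext) -> None:
        # Parsel exposes CSS and XPath selectors over the parsed response.
        await context.push_data({
            'url': context.request.url,
            'title': context.selector.css('title::text').get(),
        })

        # Extract and enqueue all links found on the page for continuous scraping.
        await context.extract_links()

    await crawler.run(['https://github.com'])


if __name__ == '__main__':
    asyncio.run(main())
```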
@@ -16,7 +16,7 @@ async def request_handler(context: PlaywrightCrawlingContext) -> None:
        context.log.info(f'Processing {context.request.url} ...')

        # Find a link to the next page and enqueue it if it exists.
-        await context.enqueue_links(selector='.morelink')
+        await context.extract_links(selector='.morelink')

    # Run the crawler with the initial list of URLs.
    await crawler.run(['https://news.ycombinator.com/'])