From 50f07f9ef5f64292ba6e9f6e7b9cc2b2bafc60c0 Mon Sep 17 00:00:00 2001
From: Pavel
Date: Thu, 12 Oct 2023 12:53:33 +0400
Subject: [PATCH] limit crawler

---
 application/parser/remote/crawler_loader.py | 49 ++++++++++++++-------
 1 file changed, 32 insertions(+), 17 deletions(-)

diff --git a/application/parser/remote/crawler_loader.py b/application/parser/remote/crawler_loader.py
index 208195cb7..9acb8f39f 100644
--- a/application/parser/remote/crawler_loader.py
+++ b/application/parser/remote/crawler_loader.py
@@ -4,33 +4,48 @@ from application.parser.remote.base import BaseRemote
 
 
 class CrawlerLoader(BaseRemote):
-    def __init__(self):
+    def __init__(self, limit=10):
         from langchain.document_loaders import WebBaseLoader
         self.loader = WebBaseLoader
+        self.limit = limit
 
     def load_data(self, url):
-        # Fetch the content of the initial URL
-        response = requests.get(url)
-        if response.status_code != 200:
-            print(f"Failed to fetch initial URL: {url}")
-            return None
-
-        # Parse the HTML content
-        soup = BeautifulSoup(response.text, 'html.parser')
+        # Create a set to store visited URLs to avoid revisiting the same page
+        visited_urls = set()
 
         # Extract the base URL to ensure we only fetch URLs from the same domain
        base_url = urlparse(url).scheme + "://" + urlparse(url).hostname
 
-        # Extract all links from the HTML content
-        all_links = [a['href'] for a in soup.find_all('a', href=True)]
+        # Initialize a list with the initial URL
+        urls_to_visit = [url]
+
+        while urls_to_visit:
+            current_url = urls_to_visit.pop(0)
+            visited_urls.add(current_url)
+
+            # Fetch the content of the current URL
+            response = requests.get(current_url)
+            if response.status_code != 200:
+                print(f"Failed to fetch URL: {current_url}")
+                continue
+
+            # Parse the HTML content
+            soup = BeautifulSoup(response.text, 'html.parser')
+
+            # Extract all links from the HTML content
+            all_links = [urljoin(current_url, a['href']) for a in soup.find_all('a', href=True) if base_url in urljoin(current_url, a['href'])]
+
+            # Add the new links to the urls_to_visit list if they haven't been visited yet
+            urls_to_visit.extend([link for link in all_links if link not in visited_urls])
 
-        # Filter out the links that lead to a different domain
-        same_domain_links = [urljoin(base_url, link) for link in all_links if base_url in urljoin(base_url, link)]
+            # Remove duplicates
+            urls_to_visit = list(set(urls_to_visit))
 
-        # Remove duplicates
-        same_domain_links = list(set(same_domain_links))
+            # Stop if the limit is reached
+            if self.limit is not None and len(visited_urls) >= self.limit:
+                break
 
         #TODO: Optimize this section to parse pages as they are being crawled
-        loaded_content = self.loader(same_domain_links).load()
+        loaded_content = self.loader(list(visited_urls)).load()
 
-        return loaded_content
+        return loaded_content
\ No newline at end of file
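
Note for reviewers: below is a minimal usage sketch of the new `limit` parameter, assuming the DocsGPT package layout shown in the patch and the `langchain`, `requests`, and `beautifulsoup4` dependencies are installed. The start URL and the limit value are illustrative only, not part of the change.

    # Usage sketch (not part of the patch): crawl at most 5 same-domain pages
    # starting from an illustrative URL, then inspect the loaded documents.
    from application.parser.remote.crawler_loader import CrawlerLoader

    loader = CrawlerLoader(limit=5)  # the patch defaults limit to 10
    documents = loader.load_data("https://example.com/docs/")  # hypothetical start URL

    if documents:
        for doc in documents:
            # WebBaseLoader yields langchain Documents with page_content and metadata
            print(doc.metadata.get("source"), len(doc.page_content))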