diff --git a/application/parser/remote/crawler_loader.py b/application/parser/remote/crawler_loader.py
index 380a25bf4..ee037e59e 100644
--- a/application/parser/remote/crawler_loader.py
+++ b/application/parser/remote/crawler_loader.py
@@ -9,7 +9,8 @@ def __init__(self, limit=10):
         self.loader = WebBaseLoader  # Initialize the document loader
         self.limit = limit  # Set the limit for the number of pages to scrape
 
-    def load_data(self, url):
+    def load_data(self, inputs):
+        url = inputs['data']
         # Check if the input is a list and if it is, use the first element
         if isinstance(url, list) and url:
             url = url[0]
diff --git a/application/parser/remote/sitemap_loader.py b/application/parser/remote/sitemap_loader.py
index e2339ab78..0748f104d 100644
--- a/application/parser/remote/sitemap_loader.py
+++ b/application/parser/remote/sitemap_loader.py
@@ -9,11 +9,12 @@ def __init__(self, limit=20):
         self.loader = WebBaseLoader
         self.limit = limit  # Adding limit to control the number of URLs to process
 
-    def load_data(self, sitemap_url):
+    def load_data(self, inputs):
+        sitemap_url = inputs['data']
         # Check if the input is a list and if it is, use the first element
         if isinstance(sitemap_url, list) and sitemap_url:
             url = sitemap_url[0]
-
+
         urls = self._extract_urls(sitemap_url)
         if not urls:
             print(f"No URLs found in the sitemap: {sitemap_url}")
diff --git a/application/parser/remote/web_loader.py b/application/parser/remote/web_loader.py
index 4a55e1c52..e5cd2e2ff 100644
--- a/application/parser/remote/web_loader.py
+++ b/application/parser/remote/web_loader.py
@@ -5,7 +5,9 @@ def __init__(self):
         from langchain.document_loaders import WebBaseLoader
         self.loader = WebBaseLoader
 
-    def load_data(self, urls):
+    def load_data(self, inputs):
+        urls = inputs['data']
+
         if isinstance(urls, str):
             urls = [urls]  # Convert string to list if a single URL is passed
diff --git a/application/worker.py b/application/worker.py
index 444772d5c..90de02869 100644
--- a/application/worker.py
+++ b/application/worker.py
@@ -121,7 +121,7 @@ def remote_worker(self, inputs, name_job, user, directory = 'temp', loader = 'ur
     # inputs {"data": [url]} for url type task just urls
     # Use RemoteCreator to load data from URL
-    remote_loader = RemoteCreator.create_loader(loader, inputs['data'])
+    remote_loader = RemoteCreator.create_loader(loader, inputs)
     raw_docs = remote_loader.load_data()
     raw_docs = group_split(documents=raw_docs, min_tokens=min_tokens, max_tokens=max_tokens, token_check=token_check)
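
For context, here is roughly how the new contract looks from the caller's side: each remote loader now receives the whole `inputs` dict (shape `{"data": [url]}`, per the worker comment) and unwraps the URL itself. A minimal sketch under those assumptions — the `WebLoader` class name and the example URL are illustrative, not taken from this patch:

```python
# Sketch of the new load_data(inputs) convention; illustrative only.
# Assumes the loader class in web_loader.py is named WebLoader and that
# langchain's WebBaseLoader is available, as in the files above.
from application.parser.remote.web_loader import WebLoader

inputs = {"data": ["https://example.com/docs"]}  # placeholder URL, same shape as worker.py's inputs

loader = WebLoader()
docs = loader.load_data(inputs)  # the loader pulls the URL(s) out of inputs['data'] itself
```

That appears to be the motivation for passing the whole `inputs` through `RemoteCreator.create_loader` in worker.py rather than only `inputs['data']`: the loaders, not the worker, decide how to interpret the payload.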