change input

arc53 · Oct 13, 2023 · 381a274 · 381a274
1 parent 8b3b16b
commit 381a274
Show file tree

Hide file tree

Showing 4 changed files with 9 additions and 5 deletions.
diff --git a/application/parser/remote/crawler_loader.py b/application/parser/remote/crawler_loader.py
@@ -9,7 +9,8 @@ def __init__(self, limit=10):
         self.loader = WebBaseLoader  # Initialize the document loader
         self.limit = limit  # Set the limit for the number of pages to scrape
 
-    def load_data(self, url):
+    def load_data(self, inputs):
+        url = inputs['data']
         # Check if the input is a list and if it is, use the first element
         if isinstance(url, list) and url:
             url = url[0]

diff --git a/application/parser/remote/sitemap_loader.py b/application/parser/remote/sitemap_loader.py
@@ -9,11 +9,12 @@ def __init__(self, limit=20):
         self.loader = WebBaseLoader
         self.limit = limit  # Adding limit to control the number of URLs to process
 
-    def load_data(self, sitemap_url):
+    def load_data(self, inputs):
+        sitemap_url= inputs['data']
         # Check if the input is a list and if it is, use the first element
         if isinstance(sitemap_url, list) and sitemap_url:
             url = sitemap_url[0]
-            
+
         urls = self._extract_urls(sitemap_url)
         if not urls:
             print(f"No URLs found in the sitemap: {sitemap_url}")

diff --git a/application/parser/remote/web_loader.py b/application/parser/remote/web_loader.py
@@ -5,7 +5,9 @@ def __init__(self):
         from langchain.document_loaders import WebBaseLoader
         self.loader = WebBaseLoader
 
-    def load_data(self, urls):
+    def load_data(self, inputs):
+        urls = inputs['data']
+
         if isinstance(urls, str):
             urls = [urls] # Convert string to list if a single URL is passed
 

diff --git a/application/worker.py b/application/worker.py
@@ -121,7 +121,7 @@ def remote_worker(self, inputs, name_job, user, directory = 'temp', loader = 'ur
     # inputs {"data": [url]} for url type task just urls
 
     # Use RemoteCreator to load data from URL
-    remote_loader = RemoteCreator.create_loader(loader, inputs['data'])
+    remote_loader = RemoteCreator.create_loader(loader, inputs)
     raw_docs = remote_loader.load_data()
 
     raw_docs = group_split(documents=raw_docs, min_tokens=min_tokens, max_tokens=max_tokens, token_check=token_check)