Skip to content

Commit

Permalink
change input
Browse files Browse the repository at this point in the history
  • Loading branch information
pabik committed Oct 13, 2023
1 parent 8b3b16b commit 381a274
Show file tree
Hide file tree
Showing 4 changed files with 9 additions and 5 deletions.
3 changes: 2 additions & 1 deletion application/parser/remote/crawler_loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,8 @@ def __init__(self, limit=10):
self.loader = WebBaseLoader # Initialize the document loader
self.limit = limit # Set the limit for the number of pages to scrape

- def load_data(self, url):
+ def load_data(self, inputs):
+ url = inputs['data']
# Check if the input is a list and if it is, use the first element
if isinstance(url, list) and url:
url = url[0]
Expand Down
5 changes: 3 additions & 2 deletions application/parser/remote/sitemap_loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,11 +9,12 @@ def __init__(self, limit=20):
self.loader = WebBaseLoader
self.limit = limit # Adding limit to control the number of URLs to process

- def load_data(self, sitemap_url):
+ def load_data(self, inputs):
+ sitemap_url= inputs['data']
# Check if the input is a list and if it is, use the first element
if isinstance(sitemap_url, list) and sitemap_url:
url = sitemap_url[0]

urls = self._extract_urls(sitemap_url)
if not urls:
print(f"No URLs found in the sitemap: {sitemap_url}")
Expand Down
4 changes: 3 additions & 1 deletion application/parser/remote/web_loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,9 @@ def __init__(self):
from langchain.document_loaders import WebBaseLoader
self.loader = WebBaseLoader

- def load_data(self, urls):
+ def load_data(self, inputs):
+ urls = inputs['data']

if isinstance(urls, str):
urls = [urls] # Convert string to list if a single URL is passed

Expand Down
2 changes: 1 addition & 1 deletion application/worker.py
Original file line number Diff line number Diff line change
Expand Up @@ -121,7 +121,7 @@ def remote_worker(self, inputs, name_job, user, directory = 'temp', loader = 'ur
# inputs {"data": [url]} for url type task just urls

# Use RemoteCreator to load data from URL
- remote_loader = RemoteCreator.create_loader(loader, inputs['data'])
+ remote_loader = RemoteCreator.create_loader(loader, inputs)
raw_docs = remote_loader.load_data()

raw_docs = group_split(documents=raw_docs, min_tokens=min_tokens, max_tokens=max_tokens, token_check=token_check)
Expand Down

0 comments on commit 381a274

Please sign in to comment.