diff --git a/App_Function_Libraries/Gradio_UI/Chat_ui.py b/App_Function_Libraries/Gradio_UI/Chat_ui.py index 1ccd7426..91c2dcdb 100644 --- a/App_Function_Libraries/Gradio_UI/Chat_ui.py +++ b/App_Function_Libraries/Gradio_UI/Chat_ui.py @@ -261,7 +261,7 @@ def create_chat_interface(): lines=3, visible=False) with gr.Column(scale=2): - chatbot = gr.Chatbot(height=600, elem_classes="chatbot-container") + chatbot = gr.Chatbot(height=800, elem_classes="chatbot-container") msg = gr.Textbox(label="Enter your message") submit = gr.Button("Submit") regenerate_button = gr.Button("Regenerate Last Message") @@ -465,7 +465,7 @@ def create_chat_interface_stacked(): gr.Markdown("Scroll down for the chat window...") with gr.Row(): with gr.Column(scale=1): - chatbot = gr.Chatbot(height=600, elem_classes="chatbot-container") + chatbot = gr.Chatbot(height=800, elem_classes="chatbot-container") msg = gr.Textbox(label="Enter your message") with gr.Row(): with gr.Column(): diff --git a/App_Function_Libraries/Gradio_UI/Website_scraping_tab.py b/App_Function_Libraries/Gradio_UI/Website_scraping_tab.py index 6640943c..a81706d0 100644 --- a/App_Function_Libraries/Gradio_UI/Website_scraping_tab.py +++ b/App_Function_Libraries/Gradio_UI/Website_scraping_tab.py @@ -19,8 +19,8 @@ # # Local Imports -from App_Function_Libraries.Web_Scraping.Article_Extractor_Lib import scrape_from_sitemap, scrape_by_url_level, scrape_article -from App_Function_Libraries.Web_Scraping.Article_Summarization_Lib import scrape_and_summarize_multiple +from App_Function_Libraries.Web_Scraping.Article_Extractor_Lib import scrape_from_sitemap, scrape_by_url_level, \ + scrape_article, collect_bookmarks, scrape_and_summarize_multiple from App_Function_Libraries.DB.DB_Manager import load_preset_prompts from App_Function_Libraries.Gradio_UI.Chat_ui import update_user_prompt from App_Function_Libraries.Summarization.Summarization_General_Lib import summarize @@ -35,12 +35,12 @@ def get_url_depth(url: str) -> int: return len(urlparse(url).path.strip('/').split('/')) -def sync_recursive_scrape(url_input, max_pages, max_depth, progress_callback, delay=1.0): +def sync_recursive_scrape(url_input, max_pages, max_depth, progress_callback, delay=1.0, custom_cookies=None): def run_async_scrape(): loop = asyncio.new_event_loop() asyncio.set_event_loop(loop) return loop.run_until_complete( - recursive_scrape(url_input, max_pages, max_depth, progress_callback, delay) + recursive_scrape(url_input, max_pages, max_depth, progress_callback, delay, custom_cookies=custom_cookies) ) with ThreadPoolExecutor() as executor: @@ -55,7 +55,8 @@ async def recursive_scrape( progress_callback: callable, delay: float = 1.0, resume_file: str = 'scrape_progress.json', - user_agent: str = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3" + user_agent: str = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3", + custom_cookies: Optional[List[Dict[str, Any]]] = None ) -> List[Dict]: async def save_progress(): temp_file = resume_file + ".tmp" @@ -90,6 +91,10 @@ def is_valid_url(url: str) -> bool: browser = await p.chromium.launch(headless=True) context = await browser.new_context(user_agent=user_agent) + # Set custom cookies if provided + if custom_cookies: + await context.add_cookies(custom_cookies) + try: while to_visit and pages_scraped < max_pages: current_url, current_depth = to_visit.pop(0) @@ -293,7 +298,7 @@ def create_website_scraping_tab(): 
lines=5 ) with gr.Row(): - summarize_checkbox = gr.Checkbox(label="Summarize Articles", value=False) + summarize_checkbox = gr.Checkbox(label="Summarize/Analyze Articles", value=False) custom_prompt_checkbox = gr.Checkbox(label="Use a Custom Prompt", value=False, visible=True) preset_prompt_checkbox = gr.Checkbox(label="Use a pre-set Prompt", value=False, visible=True) with gr.Row(): @@ -348,14 +353,35 @@ def create_website_scraping_tab(): placeholder="Enter your API key here; Ignore if using Local API or Built-in API", type="password" ) + custom_cookies_input = gr.Textbox( + label="Custom Cookies (JSON format)", + placeholder="Enter custom cookies in JSON format", + lines=3, + visible=True + ) keywords_input = gr.Textbox( label="Keywords", placeholder="Enter keywords here (comma-separated)", value="default,no_keyword_set", visible=True ) + # Updated: Added output to display parsed URLs + bookmarks_file_input = gr.File( + label="Upload Bookmarks File", + type="filepath", + file_types=[".json", ".html"], + visible=True + ) + parsed_urls_output = gr.Textbox( + label="Parsed URLs from Bookmarks", + placeholder="URLs will be displayed here after uploading a bookmarks file.", + lines=10, + interactive=False, + visible=False # Initially hidden, shown only when URLs are parsed + ) scrape_button = gr.Button("Scrape and Summarize") + with gr.Column(): progress_output = gr.Textbox(label="Progress", lines=3) result_output = gr.Textbox(label="Result", lines=20) @@ -396,10 +422,61 @@ def update_prompts(preset_name): preset_prompt.change( update_prompts, - inputs=preset_prompt, + inputs=[preset_prompt], outputs=[website_custom_prompt_input, system_prompt_input] ) + def parse_bookmarks(file_path): + """ + Parses the uploaded bookmarks file and extracts URLs. + + Args: + file_path (str): Path to the uploaded bookmarks file. + + Returns: + str: Formatted string of extracted URLs or error message. + """ + try: + bookmarks = collect_bookmarks(file_path) + # Extract URLs + urls = [] + for value in bookmarks.values(): + if isinstance(value, list): + urls.extend(value) + elif isinstance(value, str): + urls.append(value) + if not urls: + return "No URLs found in the bookmarks file." + # Format URLs for display + formatted_urls = "\n".join(urls) + return formatted_urls + except Exception as e: + logging.error(f"Error parsing bookmarks file: {str(e)}") + return f"Error parsing bookmarks file: {str(e)}" + + def show_parsed_urls(bookmarks_file): + """ + Determines whether to show the parsed URLs output. + + Args: + bookmarks_file: Uploaded file object. + + Returns: + Tuple indicating visibility and content of parsed_urls_output. 
+ """ + if bookmarks_file is None: + return gr.update(visible=False), "" + file_path = bookmarks_file.name + parsed_urls = parse_bookmarks(file_path) + return gr.update(visible=True), parsed_urls + + # Connect the parsing function to the file upload event + bookmarks_file_input.change( + fn=show_parsed_urls, + inputs=[bookmarks_file_input], + outputs=[parsed_urls_output, parsed_urls_output] + ) + async def scrape_and_summarize_wrapper( scrape_method: str, url_input: str, @@ -413,15 +490,46 @@ async def scrape_and_summarize_wrapper( keywords: str, custom_titles: Optional[str], system_prompt: Optional[str], - temperature: float = 0.7, + temperature: float, + custom_cookies: Optional[str], + bookmarks_file, progress: gr.Progress = gr.Progress() ) -> str: try: result: List[Dict[str, Any]] = [] + # Handle bookmarks file if provided + if bookmarks_file is not None: + bookmarks = collect_bookmarks(bookmarks_file.name) + # Extract URLs from bookmarks + urls_from_bookmarks = [] + for value in bookmarks.values(): + if isinstance(value, list): + urls_from_bookmarks.extend(value) + elif isinstance(value, str): + urls_from_bookmarks.append(value) + if scrape_method == "Individual URLs": + url_input = "\n".join(urls_from_bookmarks) + else: + if urls_from_bookmarks: + url_input = urls_from_bookmarks[0] + else: + return convert_json_to_markdown(json.dumps({"error": "No URLs found in the bookmarks file."})) + + # Handle custom cookies + custom_cookies_list = None + if custom_cookies: + try: + custom_cookies_list = json.loads(custom_cookies) + if not isinstance(custom_cookies_list, list): + custom_cookies_list = [custom_cookies_list] + except json.JSONDecodeError as e: + return convert_json_to_markdown(json.dumps({"error": f"Invalid JSON format for custom cookies: {e}"})) + if scrape_method == "Individual URLs": + # FIXME modify scrape_and_summarize_multiple to accept custom_cookies result = await scrape_and_summarize_multiple(url_input, custom_prompt, api_name, api_key, keywords, - custom_titles, system_prompt) + custom_titles, system_prompt, summarize_checkbox, custom_cookies=custom_cookies_list) elif scrape_method == "Sitemap": result = await asyncio.to_thread(scrape_from_sitemap, url_input) elif scrape_method == "URL Level": @@ -430,7 +538,8 @@ async def scrape_and_summarize_wrapper( json.dumps({"error": "URL level is required for URL Level scraping."})) result = await asyncio.to_thread(scrape_by_url_level, url_input, url_level) elif scrape_method == "Recursive Scraping": - result = await recursive_scrape(url_input, max_pages, max_depth, progress.update, delay=1.0) + result = await recursive_scrape(url_input, max_pages, max_depth, progress.update, delay=1.0, + custom_cookies=custom_cookies_list) else: return convert_json_to_markdown(json.dumps({"error": f"Unknown scraping method: {scrape_method}"})) @@ -496,7 +605,8 @@ async def scrape_and_summarize_wrapper( fn=lambda *args: asyncio.run(scrape_and_summarize_wrapper(*args)), inputs=[scrape_method, url_input, url_level, max_pages, max_depth, summarize_checkbox, website_custom_prompt_input, api_name_input, api_key_input, keywords_input, - custom_article_title_input, system_prompt_input, temp_slider], + custom_article_title_input, system_prompt_input, temp_slider, + custom_cookies_input, bookmarks_file_input], outputs=[result_output] ) @@ -527,11 +637,15 @@ def convert_json_to_markdown(json_str: str) -> str: markdown += f"- **Scrape Method:** {data['scrape_method']}\n" markdown += f"- **API Used:** {data['api_used']}\n" markdown += f"- **Keywords:** 
{data['keywords']}\n" - if data['url_level'] is not None: + if data.get('url_level') is not None: markdown += f"- **URL Level:** {data['url_level']}\n" + if data.get('max_pages') is not None: + markdown += f"- **Maximum Pages:** {data['max_pages']}\n" + if data.get('max_depth') is not None: + markdown += f"- **Maximum Depth:** {data['max_depth']}\n" markdown += f"- **Total Articles Scraped:** {data['total_articles_scraped']}\n\n" - # Add URLs scraped + # Add URLs Scraped markdown += "## URLs Scraped\n\n" for url in data['urls_scraped']: markdown += f"- {url}\n" @@ -549,6 +663,7 @@ def convert_json_to_markdown(json_str: str) -> str: return f"# Error\n\nMissing key in JSON data: {str(e)}" except Exception as e: return f"# Error\n\nAn unexpected error occurred: {str(e)}" + # # End of File ######################################################################################################################## diff --git a/App_Function_Libraries/Local_LLM/Local_LLM_Inference_Engine_Lib.py b/App_Function_Libraries/Local_LLM/Local_LLM_Inference_Engine_Lib.py index 0546903b..abb263f9 100644 --- a/App_Function_Libraries/Local_LLM/Local_LLM_Inference_Engine_Lib.py +++ b/App_Function_Libraries/Local_LLM/Local_LLM_Inference_Engine_Lib.py @@ -28,7 +28,6 @@ import requests # # Import Local -from App_Function_Libraries.Web_Scraping.Article_Summarization_Lib import * from App_Function_Libraries.Utils.Utils import download_file # ####################################################################################################################### diff --git a/App_Function_Libraries/Web_Scraping/Article_Extractor_Lib.py b/App_Function_Libraries/Web_Scraping/Article_Extractor_Lib.py index b1e1228c..fb29c5e4 100644 --- a/App_Function_Libraries/Web_Scraping/Article_Extractor_Lib.py +++ b/App_Function_Libraries/Web_Scraping/Article_Extractor_Lib.py @@ -13,29 +13,37 @@ #################### # # Import necessary libraries +import hashlib +from datetime import datetime import json import logging -# 3rd-Party Imports -import asyncio import os import tempfile -from datetime import datetime -from typing import List, Dict, Union +from typing import Any, Dict, List, Union, Optional +# +# 3rd-Party Imports +import asyncio from urllib.parse import urljoin, urlparse from xml.dom import minidom -from playwright.async_api import async_playwright -from bs4 import BeautifulSoup +import xml.etree.ElementTree as ET +# +# External Libraries import requests import trafilatura -import xml.etree.ElementTree as ET - - -# Import Local +from playwright.async_api import async_playwright +from bs4 import BeautifulSoup # +# Import Local +from App_Function_Libraries.DB.DB_Manager import ingest_article_to_db +from App_Function_Libraries.Summarization.Summarization_General_Lib import summarize ####################################################################################################################### # Function Definitions # +################################################################# +# +# Scraping-related functions: + def get_page_title(url: str) -> str: try: response = requests.get(url) @@ -48,12 +56,15 @@ def get_page_title(url: str) -> str: return "Untitled" -async def scrape_article(url): +async def scrape_article(url, custom_cookies: Optional[List[Dict[str, Any]]] = None): async def fetch_html(url: str) -> str: async with async_playwright() as p: browser = await p.chromium.launch(headless=True) context = await browser.new_context( - user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like 
Gecko) Chrome/58.0.3029.110 Safari/537.3") + user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3" + ) + if custom_cookies: + await context.add_cookies(custom_cookies) page = await context.new_page() await page.goto(url) await page.wait_for_load_state("networkidle") # Wait for the network to be idle @@ -61,8 +72,8 @@ async def fetch_html(url: str) -> str: await browser.close() return content - # FIXME - Add option for extracting comments/tables/images def extract_article_data(html: str, url: str) -> dict: + # FIXME - Add option for extracting comments/tables/images downloaded = trafilatura.extract(html, include_comments=False, include_tables=False, include_images=False) metadata = trafilatura.extract_metadata(html) @@ -108,96 +119,176 @@ def convert_html_to_markdown(html: str) -> str: return article_data -def collect_internal_links(base_url: str) -> set: - visited = set() - to_visit = {base_url} +async def scrape_and_summarize_multiple( + urls: str, + custom_prompt_arg: Optional[str], + api_name: str, + api_key: Optional[str], + keywords: str, + custom_article_titles: Optional[str], + system_message: Optional[str] = None, + summarize_checkbox: bool = False, + custom_cookies: Optional[List[Dict[str, Any]]] = None, + temperature: float = 0.7 +) -> List[Dict[str, Any]]: + urls_list = [url.strip() for url in urls.split('\n') if url.strip()] + custom_titles = custom_article_titles.split('\n') if custom_article_titles else [] + + results = [] + errors = [] + + # Loop over each URL to scrape and optionally summarize + for i, url in enumerate(urls_list): + custom_title = custom_titles[i] if i < len(custom_titles) else None + try: + # Scrape the article + article = await scrape_article(url, custom_cookies=custom_cookies) + if article and article['extraction_successful']: + if custom_title: + article['title'] = custom_title + + # If summarization is requested + if summarize_checkbox: + content = article.get('content', '') + if content: + # Prepare prompts + system_message_final = system_message or "Act as a professional summarizer and summarize this article." + article_custom_prompt = custom_prompt_arg or "Act as a professional summarizer and summarize this article." + + # Summarize the content using the summarize function + summary = summarize( + input_data=content, + custom_prompt_arg=article_custom_prompt, + api_name=api_name, + api_key=api_key, + temp=temperature, + system_message=system_message_final + ) + article['summary'] = summary + logging.info(f"Summary generated for URL {url}") + else: + article['summary'] = "No content available to summarize." 
+ logging.warning(f"No content to summarize for URL {url}") + else: + article['summary'] = None - while to_visit: - current_url = to_visit.pop() - if current_url in visited: - continue + results.append(article) + else: + error_message = f"Extraction unsuccessful for URL {url}" + errors.append(error_message) + logging.error(error_message) + except Exception as e: + error_message = f"Error processing URL {i + 1} ({url}): {str(e)}" + errors.append(error_message) + logging.error(error_message, exc_info=True) - try: - response = requests.get(current_url) - response.raise_for_status() - soup = BeautifulSoup(response.text, 'html.parser') + if errors: + logging.error("\n".join(errors)) - # Collect internal links - for link in soup.find_all('a', href=True): - full_url = urljoin(base_url, link['href']) - # Only process links within the same domain - if urlparse(full_url).netloc == urlparse(base_url).netloc: - if full_url not in visited: - to_visit.add(full_url) + if not results: + logging.error("No articles were successfully scraped and summarized/analyzed.") + return [] - visited.add(current_url) - except requests.RequestException as e: - logging.error(f"Error visiting {current_url}: {e}") - continue + return results - return visited +def scrape_and_no_summarize_then_ingest(url, keywords, custom_article_title): + try: + # Step 1: Scrape the article + article_data = asyncio.run(scrape_article(url)) + print(f"Scraped Article Data: {article_data}") # Debugging statement + if not article_data: + return "Failed to scrape the article." -def generate_temp_sitemap_from_links(links: set) -> str: + # Use the custom title if provided, otherwise use the scraped title + title = custom_article_title.strip() if custom_article_title else article_data.get('title', 'Untitled') + author = article_data.get('author', 'Unknown') + content = article_data.get('content', '') + ingestion_date = datetime.now().strftime('%Y-%m-%d') + + print(f"Title: {title}, Author: {author}, Content Length: {len(content)}") # Debugging statement + + # Step 2: Ingest the article into the database + ingestion_result = ingest_article_to_db(url, title, author, content, keywords, ingestion_date, None, None) + + return f"Title: {title}\nAuthor: {author}\nIngestion Result: {ingestion_result}\n\nArticle Contents: {content}" + except Exception as e: + logging.error(f"Error processing URL {url}: {str(e)}") + return f"Failed to process URL {url}: {str(e)}" + + +def scrape_from_filtered_sitemap(sitemap_file: str, filter_function) -> list: """ - Generate a temporary sitemap file from collected links and return its path. + Scrape articles from a sitemap file, applying an additional filter function. 
- :param links: A set of URLs to include in the sitemap - :return: Path to the temporary sitemap file + :param sitemap_file: Path to the sitemap file + :param filter_function: A function that takes a URL and returns True if it should be scraped + :return: List of scraped articles """ - # Create the root element - urlset = ET.Element("urlset") - urlset.set("xmlns", "http://www.sitemaps.org/schemas/sitemap/0.9") + try: + tree = ET.parse(sitemap_file) + root = tree.getroot() - # Add each link to the sitemap - for link in links: - url = ET.SubElement(urlset, "url") - loc = ET.SubElement(url, "loc") - loc.text = link - lastmod = ET.SubElement(url, "lastmod") - lastmod.text = datetime.now().strftime("%Y-%m-%d") - changefreq = ET.SubElement(url, "changefreq") - changefreq.text = "daily" - priority = ET.SubElement(url, "priority") - priority.text = "0.5" + articles = [] + for url in root.findall('.//{http://www.sitemaps.org/schemas/sitemap/0.9}loc'): + if filter_function(url.text): + article_data = scrape_article(url.text) + if article_data: + articles.append(article_data) - # Create the tree and get it as a string - xml_string = ET.tostring(urlset, 'utf-8') + return articles + except ET.ParseError as e: + logging.error(f"Error parsing sitemap: {e}") + return [] - # Pretty print the XML - pretty_xml = minidom.parseString(xml_string).toprettyxml(indent=" ") - # Create a temporary file - with tempfile.NamedTemporaryFile(mode="w", suffix=".xml", delete=False) as temp_file: - temp_file.write(pretty_xml) - temp_file_path = temp_file.name +def is_content_page(url: str) -> bool: + """ + Determine if a URL is likely to be a content page. + This is a basic implementation and may need to be adjusted based on the specific website structure. - logging.info(f"Temporary sitemap created at: {temp_file_path}") - return temp_file_path + :param url: The URL to check + :return: True if the URL is likely a content page, False otherwise + """ + #Add more specific checks here based on the website's structure + # Exclude common non-content pages + exclude_patterns = [ + '/tag/', '/category/', '/author/', '/search/', '/page/', + 'wp-content', 'wp-includes', 'wp-json', 'wp-admin', + 'login', 'register', 'cart', 'checkout', 'account', + '.jpg', '.png', '.gif', '.pdf', '.zip' + ] + return not any(pattern in url.lower() for pattern in exclude_patterns) +def scrape_and_convert_with_filter(source: str, output_file: str, filter_function=is_content_page, level: int = None): + """ + Scrape articles from a sitemap or by URL level, apply filtering, and convert to a single markdown file. -def generate_sitemap_for_url(url: str) -> List[Dict[str, str]]: + :param source: URL of the sitemap, base URL for level-based scraping, or path to a local sitemap file + :param output_file: Path to save the output markdown file + :param filter_function: Function to filter URLs (default is is_content_page) + :param level: URL level for scraping (None if using sitemap) """ - Generate a sitemap for the given URL using the create_filtered_sitemap function. 
+ if level is not None: + # Scraping by URL level + articles = scrape_by_url_level(source, level) + articles = [article for article in articles if filter_function(article['url'])] + elif source.startswith('http'): + # Scraping from online sitemap + articles = scrape_from_sitemap(source) + articles = [article for article in articles if filter_function(article['url'])] + else: + # Scraping from local sitemap file + articles = scrape_from_filtered_sitemap(source, filter_function) - Args: - url (str): The base URL to generate the sitemap for + articles = [article for article in articles if filter_function(article['url'])] + markdown_content = convert_to_markdown(articles) - Returns: - List[Dict[str, str]]: A list of dictionaries, each containing 'url' and 'title' keys - """ - with tempfile.NamedTemporaryFile(mode="w+", suffix=".xml", delete=False) as temp_file: - create_filtered_sitemap(url, temp_file.name, is_content_page) - temp_file.seek(0) - tree = ET.parse(temp_file.name) - root = tree.getroot() + with open(output_file, 'w', encoding='utf-8') as f: + f.write(markdown_content) - sitemap = [] - for url_elem in root.findall(".//{http://www.sitemaps.org/schemas/sitemap/0.9}url"): - loc = url_elem.find("{http://www.sitemaps.org/schemas/sitemap/0.9}loc").text - sitemap.append({"url": loc, "title": loc.split("/")[-1] or url}) # Use the last part of the URL as a title + logging.info(f"Scraped and filtered content saved to {output_file}") - return sitemap async def scrape_entire_site(base_url: str) -> List[Dict]: """ @@ -267,37 +358,103 @@ def scrape_from_sitemap(sitemap_url: str) -> list: logging.error(f"Error fetching sitemap: {e}") return [] +# +# End of Scraping Functions +####################################################### +# +# Sitemap/Crawling-related Functions -def convert_to_markdown(articles: list) -> str: - """Convert a list of article data into a single markdown document.""" - markdown = "" - for article in articles: - markdown += f"# {article['title']}\n\n" - markdown += f"Author: {article['author']}\n" - markdown += f"Date: {article['date']}\n\n" - markdown += f"{article['content']}\n\n" - markdown += "---\n\n" # Separator between articles - return markdown +def collect_internal_links(base_url: str) -> set: + visited = set() + to_visit = {base_url} -def is_content_page(url: str) -> bool: + while to_visit: + current_url = to_visit.pop() + if current_url in visited: + continue + + try: + response = requests.get(current_url) + response.raise_for_status() + soup = BeautifulSoup(response.text, 'html.parser') + + # Collect internal links + for link in soup.find_all('a', href=True): + full_url = urljoin(base_url, link['href']) + # Only process links within the same domain + if urlparse(full_url).netloc == urlparse(base_url).netloc: + if full_url not in visited: + to_visit.add(full_url) + + visited.add(current_url) + except requests.RequestException as e: + logging.error(f"Error visiting {current_url}: {e}") + continue + + return visited + + +def generate_temp_sitemap_from_links(links: set) -> str: """ - Determine if a URL is likely to be a content page. - This is a basic implementation and may need to be adjusted based on the specific website structure. + Generate a temporary sitemap file from collected links and return its path. 
- :param url: The URL to check - :return: True if the URL is likely a content page, False otherwise + :param links: A set of URLs to include in the sitemap + :return: Path to the temporary sitemap file """ - #Add more specific checks here based on the website's structure - # Exclude common non-content pages - exclude_patterns = [ - '/tag/', '/category/', '/author/', '/search/', '/page/', - 'wp-content', 'wp-includes', 'wp-json', 'wp-admin', - 'login', 'register', 'cart', 'checkout', 'account', - '.jpg', '.png', '.gif', '.pdf', '.zip' - ] - return not any(pattern in url.lower() for pattern in exclude_patterns) + # Create the root element + urlset = ET.Element("urlset") + urlset.set("xmlns", "http://www.sitemaps.org/schemas/sitemap/0.9") + # Add each link to the sitemap + for link in links: + url = ET.SubElement(urlset, "url") + loc = ET.SubElement(url, "loc") + loc.text = link + lastmod = ET.SubElement(url, "lastmod") + lastmod.text = datetime.now().strftime("%Y-%m-%d") + changefreq = ET.SubElement(url, "changefreq") + changefreq.text = "daily" + priority = ET.SubElement(url, "priority") + priority.text = "0.5" + + # Create the tree and get it as a string + xml_string = ET.tostring(urlset, 'utf-8') + + # Pretty print the XML + pretty_xml = minidom.parseString(xml_string).toprettyxml(indent=" ") + + # Create a temporary file + with tempfile.NamedTemporaryFile(mode="w", suffix=".xml", delete=False) as temp_file: + temp_file.write(pretty_xml) + temp_file_path = temp_file.name + + logging.info(f"Temporary sitemap created at: {temp_file_path}") + return temp_file_path + + +def generate_sitemap_for_url(url: str) -> List[Dict[str, str]]: + """ + Generate a sitemap for the given URL using the create_filtered_sitemap function. + + Args: + url (str): The base URL to generate the sitemap for + + Returns: + List[Dict[str, str]]: A list of dictionaries, each containing 'url' and 'title' keys + """ + with tempfile.NamedTemporaryFile(mode="w+", suffix=".xml", delete=False) as temp_file: + create_filtered_sitemap(url, temp_file.name, is_content_page) + temp_file.seek(0) + tree = ET.parse(temp_file.name) + root = tree.getroot() + + sitemap = [] + for url_elem in root.findall(".//{http://www.sitemaps.org/schemas/sitemap/0.9}url"): + loc = url_elem.find("{http://www.sitemaps.org/schemas/sitemap/0.9}loc").text + sitemap.append({"url": loc, "title": loc.split("/")[-1] or url}) # Use the last part of the URL as a title + + return sitemap def create_filtered_sitemap(base_url: str, output_file: str, filter_function): """ @@ -323,61 +480,44 @@ def create_filtered_sitemap(base_url: str, output_file: str, filter_function): print(f"Filtered sitemap saved to {output_file}") -def scrape_from_filtered_sitemap(sitemap_file: str, filter_function) -> list: - """ - Scrape articles from a sitemap file, applying an additional filter function. 
- - :param sitemap_file: Path to the sitemap file - :param filter_function: A function that takes a URL and returns True if it should be scraped - :return: List of scraped articles - """ - try: - tree = ET.parse(sitemap_file) - root = tree.getroot() - - articles = [] - for url in root.findall('.//{http://www.sitemaps.org/schemas/sitemap/0.9}loc'): - if filter_function(url.text): - article_data = scrape_article(url.text) - if article_data: - articles.append(article_data) - - return articles - except ET.ParseError as e: - logging.error(f"Error parsing sitemap: {e}") - return [] +# +# End of Crawling Functions +################################################################# +# +# Utility Functions +def convert_to_markdown(articles: list) -> str: + """Convert a list of article data into a single markdown document.""" + markdown = "" + for article in articles: + markdown += f"# {article['title']}\n\n" + markdown += f"Author: {article['author']}\n" + markdown += f"Date: {article['date']}\n\n" + markdown += f"{article['content']}\n\n" + markdown += "---\n\n" # Separator between articles + return markdown -def scrape_and_convert_with_filter(source: str, output_file: str, filter_function=is_content_page, level: int = None): - """ - Scrape articles from a sitemap or by URL level, apply filtering, and convert to a single markdown file. +def compute_content_hash(content: str) -> str: + return hashlib.sha256(content.encode('utf-8')).hexdigest() - :param source: URL of the sitemap, base URL for level-based scraping, or path to a local sitemap file - :param output_file: Path to save the output markdown file - :param filter_function: Function to filter URLs (default is is_content_page) - :param level: URL level for scraping (None if using sitemap) - """ - if level is not None: - # Scraping by URL level - articles = scrape_by_url_level(source, level) - articles = [article for article in articles if filter_function(article['url'])] - elif source.startswith('http'): - # Scraping from online sitemap - articles = scrape_from_sitemap(source) - articles = [article for article in articles if filter_function(article['url'])] +def load_hashes(filename: str) -> Dict[str, str]: + if os.path.exists(filename): + with open(filename, 'r') as f: + return json.load(f) else: - # Scraping from local sitemap file - articles = scrape_from_filtered_sitemap(source, filter_function) - - articles = [article for article in articles if filter_function(article['url'])] - markdown_content = convert_to_markdown(articles) + return {} - with open(output_file, 'w', encoding='utf-8') as f: - f.write(markdown_content) +def save_hashes(hashes: Dict[str, str], filename: str): + with open(filename, 'w') as f: + json.dump(hashes, f) - logging.info(f"Scraped and filtered content saved to {output_file}") +def has_page_changed(url: str, new_hash: str, stored_hashes: Dict[str, str]) -> bool: + old_hash = stored_hashes.get(url) + return old_hash != new_hash +# +# ################################################### # # Bookmark Parsing Functions @@ -524,5 +664,5 @@ def collect_bookmarks(file_path: str) -> Dict[str, Union[str, List[str]]]: ##################################################################### # -# +# End of Article_Extractor_Lib.py ####################################################################################################################### diff --git a/App_Function_Libraries/Web_Scraping/Article_Summarization_Lib.py b/App_Function_Libraries/Web_Scraping/Article_Summarization_Lib.py deleted file mode 100644 index 
723ce6eb..00000000 --- a/App_Function_Libraries/Web_Scraping/Article_Summarization_Lib.py +++ /dev/null @@ -1,259 +0,0 @@ -# Article_Summarization_Lib.py -######################################### -# Article Summarization Library -# This library is used to handle summarization of articles. -import asyncio -# FIXME - this library should be refactored into `Article_Extractor_Lib` and then renamed to `Web_Scraping_Lib` - -# -#### -# -#################### -# Function List -# -# 1. -# -#################### -# -# Import necessary libraries -import datetime -from datetime import datetime -import gradio as gr -import json -import os -import logging -import requests -# 3rd-Party Imports -# -# Local Imports -from App_Function_Libraries.Utils.Utils import sanitize_filename, load_comprehensive_config -from App_Function_Libraries.Web_Scraping.Article_Extractor_Lib import scrape_article -from App_Function_Libraries.Summarization.Local_Summarization_Lib import summarize_with_llama, summarize_with_oobabooga, \ - summarize_with_tabbyapi, \ - summarize_with_vllm, summarize_with_kobold, save_summary_to_file, summarize_with_local_llm, summarize_with_ollama, \ - summarize_with_custom_openai -from App_Function_Libraries.Summarization.Summarization_General_Lib import summarize_with_openai, summarize_with_anthropic, summarize_with_cohere, \ - summarize_with_groq, summarize_with_openrouter, summarize_with_deepseek, summarize_with_huggingface, \ - summarize_with_mistral -from App_Function_Libraries.DB.DB_Manager import ingest_article_to_db -# -####################################################################################################################### -# Function Definitions -# - -async def scrape_and_summarize_multiple(urls, custom_prompt_arg, api_name, api_key, keywords, custom_article_titles, system_message=None): - urls = [url.strip() for url in urls.split('\n') if url.strip()] - custom_titles = custom_article_titles.split('\n') if custom_article_titles else [] - - results = [] - errors = [] - - # Create a progress bar - progress = gr.Progress() - - # FIXME - add progress tracking to the gradio UI - for i, url in enumerate(urls): - custom_title = custom_titles[i] if i < len(custom_titles) else None - try: - article = await scrape_article(url) - if article and article['extraction_successful']: - if custom_title: - article['title'] = custom_title - results.append(article) - except Exception as e: - error_message = f"Error processing URL {i + 1} ({url}): {str(e)}" - errors.append(error_message) - - # Update progress - progress((i + 1) / len(urls), desc=f"Processed {i + 1}/{len(urls)} URLs") - - if errors: - logging.error("\n".join(errors)) - - return results - - - -def scrape_and_summarize(url, custom_prompt_arg, api_name, api_key, keywords, custom_article_title, system_message=None): - try: - # Step 1: Scrape the article - article_data = asyncio.run(scrape_article(url)) - print(f"Scraped Article Data: {article_data}") # Debugging statement - if not article_data: - return "Failed to scrape the article." 
- - # Use the custom title if provided, otherwise use the scraped title - title = custom_article_title.strip() if custom_article_title else article_data.get('title', 'Untitled') - author = article_data.get('author', 'Unknown') - content = article_data.get('content', '') - ingestion_date = datetime.now().strftime('%Y-%m-%d') - - print(f"Title: {title}, Author: {author}, Content Length: {len(content)}") # Debugging statement - - # Custom system prompt for the article - system_message = system_message or "Act as a professional summarizer and summarize this article." - # Custom prompt for the article - article_custom_prompt = custom_prompt_arg or "Act as a professional summarizer and summarize this article." - - # Step 2: Summarize the article - summary = None - if api_name: - logging.debug(f"Article_Summarizer: Summarization being performed by {api_name}") - - # Sanitize filename for saving the JSON file - sanitized_title = sanitize_filename(title) - json_file_path = os.path.join("Results", f"{sanitized_title}_segments.json") - - with open(json_file_path, 'w') as json_file: - json.dump([{'text': content}], json_file, indent=2) - config = load_comprehensive_config() - try: - if api_name.lower() == 'openai': - # def summarize_with_openai(api_key, input_data, custom_prompt_arg) - summary = summarize_with_openai(api_key, json_file_path, article_custom_prompt, system_message) - - elif api_name.lower() == "anthropic": - # def summarize_with_anthropic(api_key, input_data, model, custom_prompt_arg, max_retries=3, retry_delay=5): - summary = summarize_with_anthropic(api_key, json_file_path, article_custom_prompt, system_message) - elif api_name.lower() == "cohere": - # def summarize_with_cohere(api_key, input_data, model, custom_prompt_arg) - summary = summarize_with_cohere(api_key, json_file_path, article_custom_prompt, system_message) - - elif api_name.lower() == "groq": - logging.debug(f"MAIN: Trying to summarize with groq") - # def summarize_with_groq(api_key, input_data, model, custom_prompt_arg): - summary = summarize_with_groq(api_key, json_file_path, article_custom_prompt, system_message) - - elif api_name.lower() == "openrouter": - logging.debug(f"MAIN: Trying to summarize with OpenRouter") - # def summarize_with_openrouter(api_key, input_data, custom_prompt_arg): - summary = summarize_with_openrouter(api_key, json_file_path, article_custom_prompt, system_message) - - elif api_name.lower() == "deepseek": - logging.debug(f"MAIN: Trying to summarize with DeepSeek") - # def summarize_with_deepseek(api_key, input_data, custom_prompt_arg): - summary = summarize_with_deepseek(api_key, json_file_path, article_custom_prompt, system_message) - - elif api_name.lower() == "mistral": - summary = summarize_with_mistral(api_key, json_file_path, article_custom_prompt, system_message) - - elif api_name.lower() == "llama.cpp": - logging.debug(f"MAIN: Trying to summarize with Llama.cpp") - # def summarize_with_llama(api_url, file_path, token, custom_prompt) - summary = summarize_with_llama(json_file_path, article_custom_prompt, config['Local-API']['llama_api_key'], None, system_message) - elif api_name.lower() == "kobold": - logging.debug(f"MAIN: Trying to summarize with Kobold.cpp") - # def summarize_with_kobold(input_data, kobold_api_token, custom_prompt_input, api_url): - summary = summarize_with_kobold(json_file_path, api_key, article_custom_prompt, system_message) - - elif api_name.lower() == "ooba": - # def summarize_with_oobabooga(input_data, api_key, custom_prompt, api_url): - summary = 
summarize_with_oobabooga(json_file_path, api_key, article_custom_prompt, system_message) - - elif api_name.lower() == "tabbyapi": - # def summarize_with_tabbyapi(input_data, tabby_model, custom_prompt_input, api_key=None, api_IP): - summary = summarize_with_tabbyapi(json_file_path, article_custom_prompt, system_message) - - elif api_name.lower() == "vllm": - logging.debug(f"MAIN: Trying to summarize with VLLM") - # def summarize_with_vllm(api_key, input_data, custom_prompt_input): - summary = summarize_with_vllm(json_file_path, article_custom_prompt, None, None, system_message) - elif api_name.lower() == "local-llm": - logging.debug(f"MAIN: Trying to summarize with Local LLM") - summary = summarize_with_local_llm(json_file_path, article_custom_prompt, system_message) - - elif api_name.lower() == "ollama": - logging.debug(f"MAIN: Trying to summarize with OLLAMA") - # def summarize_with_ollama(input_data, api_key, custom_prompt, api_url): - summary = summarize_with_ollama(json_file_path, article_custom_prompt, None, api_key, None, system_message, None) - - elif api_name == "custom_openai_api": - logging.debug(f"MAIN: Trying to summarize with Custom_OpenAI API") - summary = summarize_with_custom_openai(json_file_path, article_custom_prompt, api_key, temp=None, system_message=None) - - - elif api_name.lower() == "huggingface": - logging.debug(f"MAIN: Trying to summarize with huggingface") - # def summarize_with_huggingface(api_key, input_data, custom_prompt_arg): - summarize_with_huggingface(api_key, json_file_path, article_custom_prompt, system_message) - # Add additional API handlers here... - - except requests.exceptions.ConnectionError as e: - logging.error(f"Connection error while trying to summarize with {api_name}: {str(e)}") - - if summary: - logging.info(f"Article_Summarizer: Summary generated using {api_name} API") - save_summary_to_file(summary, json_file_path) - else: - summary = "Summary not available" - logging.warning(f"Failed to generate summary using {api_name} API") - - else: - summary = "Article Summarization: No API provided for summarization." - - print(f"Summary: {summary}") # Debugging statement - - # Step 3: Ingest the article into the database - ingestion_result = ingest_article_to_db(url, title, author, content, keywords, summary, ingestion_date, - article_custom_prompt) - - return f"Title: {title}\nAuthor: {author}\nIngestion Result: {ingestion_result}\n\nSummary: {summary}\n\nArticle Contents: {content}" - except Exception as e: - logging.error(f"Error processing URL {url}: {str(e)}") - return f"Failed to process URL {url}: {str(e)}" - - -def scrape_and_no_summarize_then_ingest(url, keywords, custom_article_title): - try: - # Step 1: Scrape the article - article_data = asyncio.run(scrape_article(url)) - print(f"Scraped Article Data: {article_data}") # Debugging statement - if not article_data: - return "Failed to scrape the article." 
- - # Use the custom title if provided, otherwise use the scraped title - title = custom_article_title.strip() if custom_article_title else article_data.get('title', 'Untitled') - author = article_data.get('author', 'Unknown') - content = article_data.get('content', '') - ingestion_date = datetime.now().strftime('%Y-%m-%d') - - print(f"Title: {title}, Author: {author}, Content Length: {len(content)}") # Debugging statement - - # Step 2: Ingest the article into the database - ingestion_result = ingest_article_to_db(url, title, author, content, keywords, ingestion_date, None, None) - - return f"Title: {title}\nAuthor: {author}\nIngestion Result: {ingestion_result}\n\nArticle Contents: {content}" - except Exception as e: - logging.error(f"Error processing URL {url}: {str(e)}") - return f"Failed to process URL {url}: {str(e)}" - - -def ingest_unstructured_text(text, custom_prompt, api_name, api_key, keywords, custom_article_title, system_message=None): - title = custom_article_title.strip() if custom_article_title else "Unstructured Text" - author = "Unknown" - ingestion_date = datetime.now().strftime('%Y-%m-%d') - - # Summarize the unstructured text - if api_name: - json_file_path = f"Results/{title.replace(' ', '_')}_segments.json" - with open(json_file_path, 'w') as json_file: - json.dump([{'text': text}], json_file, indent=2) - - if api_name.lower() == 'openai': - summary = summarize_with_openai(api_key, json_file_path, custom_prompt, system_message) - # Add other APIs as needed - else: - summary = "Unsupported API." - else: - summary = "No API provided for summarization." - - # Ingest the unstructured text into the database - ingestion_result = ingest_article_to_db('Unstructured Text', title, author, text, keywords, summary, ingestion_date, - custom_prompt) - return f"Title: {title}\nSummary: {summary}\nIngestion Result: {ingestion_result}" - - - -# -# -####################################################################################################################### \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 200dcba0..7880c405 100644 --- a/requirements.txt +++ b/requirements.txt @@ -34,6 +34,7 @@ psutil #psycopg2-binary pyannote.audio PyAudio +pycookiecheat pydub pymupdf pypandoc
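
Note on the new custom cookies input: the "Custom Cookies (JSON format)" textbox is json.loads()-ed in scrape_and_summarize_wrapper (a lone object gets wrapped into a list) and handed through to Playwright's BrowserContext.add_cookies() in scrape_article() and recursive_scrape(), so each cookie should carry at least "name", "value", and either "url" or "domain"/"path". Below is a minimal sketch of the expected payload and the resulting Playwright call; the cookie values and target URL are placeholders, not taken from the PR.

import asyncio
import json
from playwright.async_api import async_playwright

# Example of what a user might paste into custom_cookies_input; a single
# JSON object is also accepted, since the wrapper wraps non-list JSON in a list.
custom_cookies_json = '[{"name": "session_id", "value": "abc123", "domain": ".example.com", "path": "/"}]'
custom_cookies = json.loads(custom_cookies_json)

async def demo() -> None:
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        context = await browser.new_context()
        if custom_cookies:
            # Same call the diff adds in scrape_article() and recursive_scrape().
            await context.add_cookies(custom_cookies)
        page = await context.new_page()
        await page.goto("https://example.com")  # placeholder URL
        await browser.close()

asyncio.run(demo())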