From 865eb23fe64913b27e5dc0607490dda4cf8180f5 Mon Sep 17 00:00:00 2001 From: Richard Lora Date: Tue, 10 Dec 2024 17:20:10 -0500 Subject: [PATCH] feat: improve API gateway lifecycle management and update documentation This commit introduces the following changes: - Added initialization and shutdown methods for the API gateway lifecycle. - Improved the README with better examples and instructions for initializing and shutting down the API gateway. - Introduced user agent rotation in search requests for better reliability. - Updated search logic to provide more structured and maintainable results handling. - Enhanced the novexity_check script with better error handling and field validation. Motivation: Opening and closing API gateways within the search can lead to performance overhead during frequent searches. This change allows the user to manage the lifecycle of the API gateway explicitly, improving scalability. Additional Changes: - Added `venv` to .gitignore. - Bumped package version from 1.0.5 to 1.0.6. - Updated dependencies in `setup.cfg` to include `python-dotenv`. BREAKING CHANGE: Users must now explicitly call `NovexitySearch.init_gateway()` before initiating searches and `NovexitySearch.shutdown_gateway()` to clean up resources. `NovexitySearch.get_dict()` also now returns a plain dict instead of a `(JSON string, gateway)` tuple, so callers must stop unpacking the result and serialize it themselves (e.g. with `json.dumps`). 
--- .gitignore | 1 + README.md | 34 ++--- novexity/search.py | 286 ++++++++++++++++++---------------------- novexity/user_agents.py | 16 +++ novexity_check.py | 30 +++-- requirements.txt | Bin 114 -> 236 bytes setup.cfg | 3 +- 7 files changed, 180 insertions(+), 190 deletions(-) create mode 100644 novexity/user_agents.py diff --git a/.gitignore b/.gitignore index 2a1d7c7..ba49529 100644 --- a/.gitignore +++ b/.gitignore @@ -6,6 +6,7 @@ Novexity.egg-info # Cache __pycache__ +venv # Environment variables .env diff --git a/README.md b/README.md index 14bd13f..530aa94 100644 --- a/README.md +++ b/README.md @@ -40,33 +40,33 @@ Example usage: ```python import os +import json from dotenv import load_dotenv from novexity import NovexitySearch, configure + load_dotenv() -AWS_ACCESS_KEY_ID = os.getenv('GOOGLE_SEARCH_AWS_ACCESS_KEY_ID') -AWS_SECRET_ACCESS_KEY = os.getenv('GOOGLE_SEARCH_AWS_SECRET_ACCESS_KEY') -configure(aws_access_key_id=AWS_ACCESS_KEY_ID, - aws_secret_access_key=AWS_SECRET_ACCESS_KEY) +AWS_ACCESS_KEY_ID = os.getenv("GOOGLE_SEARCH_AWS_ACCESS_KEY_ID") +AWS_SECRET_ACCESS_KEY = os.getenv("GOOGLE_SEARCH_AWS_SECRET_ACCESS_KEY") +configure( + aws_access_key_id=AWS_ACCESS_KEY_ID, aws_secret_access_key=AWS_SECRET_ACCESS_KEY +) -params = { - "q": "Minecraft", - "country": "fr", - "lang": "fr" -} +# Initialize API Gateway +NovexitySearch.init_gateway() -# Initialize NovexitySearch with the parameters -novexity_search = NovexitySearch(params) +params = {"q": "Minecraft", "country": "fr", "lang": "fr"} -# Get the search results -novexity, returned_gateway = novexity_search.get_dict() +# Perform search +novexity_search = NovexitySearch(params) +search_results = novexity_search.get_dict() # Save the results to search.json with open("google-search.json", "w", encoding="utf-8") as file: - file.write(novexity) + file.write(json.dumps(search_results, indent=4)) -# Shut down the gateways -returned_gateway.shutdown() +# Shut down the API Gateway 
+NovexitySearch.shutdown_gateway() ``` ⚠️ Remember: If gateways are not shut down via the `shutdown()` method, you may incur charges. @@ -167,7 +167,7 @@ To make your search more tailored, Novexity supports various parameters that you - `lang`: Two-letter language code for Google search (e.g., en for English, es for Spanish, or fr for French). Refer to [Google languages](./static/json/google-languages.json) for all available languages. Defaults to English ("en") if not specified. -- `lang_restrict`: Restricts search results to specific languages. Refer to [Google languages](./static/json/google-lr-languages.json) for all available languages. +- `lang_restrict`: Restricts search results to specific languages. Refer to [Google languages](./static/json/google-lr-languages.json) for all available languages. - `location`: Google encoded location you want to use for the search. Note: Using `uule` requires a special value format (Google's encrypted location format). diff --git a/novexity/search.py b/novexity/search.py index 7727912..66be641 100644 --- a/novexity/search.py +++ b/novexity/search.py @@ -5,6 +5,7 @@ from bs4 import BeautifulSoup from collections import OrderedDict from .requests_ip_rotator.ip_rotator import ApiGateway +from .user_agents import get_useragent class Configuration: @@ -20,7 +21,15 @@ def __init__(self): config = Configuration() # Initialize the configuration # Valid keys for search result fields -VALID_KEYS = {"title", "link", "displayed_link", "favicon", "snippet", "source"} +VALID_KEYS = { + "position", + "title", + "link", + "displayed_link", + "favicon", + "snippet", + "source", +} # Titles to be excluded from search results invalid_titles = ["Description"] @@ -28,9 +37,39 @@ def __init__(self): class NovexitySearch: """ - Class for performing Novexity search with specified parameters. + Class for managing the Novexity search with gateway lifecycle. 
""" + gateway = None # Shared gateway instance across all searches + + @classmethod + def init_gateway(cls, regions=None): + """ + Initialize the API gateway. + """ + if cls.gateway is None: + cls.gateway = ApiGateway( + "https://www.google.com", + access_key_id=config.AWS_ACCESS_KEY_ID, + access_key_secret=config.AWS_ACCESS_KEY_SECRET, + regions=regions or ["us-east-1", "us-west-1"], + verbose=True, + ) + cls.gateway.start() + else: + print("Gateway is already initialized.") + + @classmethod + def shutdown_gateway(cls): + """ + Shutdown the API gateway. + """ + if cls.gateway: + cls.gateway.shutdown() + cls.gateway = None + else: + print("Gateway is not initialized.") + def __init__(self, params): self.query = params.get("q") self.country = params.get("country") @@ -41,9 +80,11 @@ def __init__(self, params): def get_dict(self): """ - Method to initiate the search and return results. + Perform a search and return results. """ - return search( + if not NovexitySearch.gateway: + raise RuntimeError("Gateway is not initialized. Call `init_gateway` first.") + return self._search( self.query, *self.fields, country=self.country, @@ -51,100 +92,29 @@ def get_dict(self): location=self.location, ) + def _search( + self, query, *fields, country=None, lang="en", location=None, lang_restrict=None + ): + """ + Perform the search and fetch results using the API gateway. + """ + session = requests.Session() + session.mount("https://www.google.com", NovexitySearch.gateway) + + url = f"https://www.google.com/search?q={query}" + if country: + url += f"&gl={country}" + if lang: + url += f"&hl={lang}" + if lang_restrict: + url += f"&lr={lang_restrict}" + if location: + url += f"&uule={location}" -def configure(aws_access_key_id=None, aws_secret_access_key=None): - """ - Configure AWS keys for the search function. 
- """ - if aws_access_key_id: - config.AWS_ACCESS_KEY_ID = aws_access_key_id - if aws_secret_access_key: - config.AWS_ACCESS_KEY_SECRET = aws_secret_access_key - - -def format_json_output(data): - """ - Format data into a JSON string. - """ - return json.dumps(data, indent=4, ensure_ascii=False) - - -def clean_url(url): - """ - Clean and extract the actual URL from the query string. - """ - parsed = urlparse(url) - url_qs = parse_qs(parsed.query) - if "q" in url_qs: - return unquote(url_qs["q"][0]) - elif "url" in url_qs: - return unquote(url_qs["url"][0]) - return url - - -def extract_snippet(result_item): - """ - Extract snippet text from a search result item. - """ - snippet_parts = result_item.select( - ".VwiC3b.yXK7lf.lyLwlc.yDYNvb.W8l4ac.lEBKkf span" - ) - if snippet_parts: - return " ".join(part.get_text() for part in snippet_parts) - return "" - - -def search( - query: str, *fields, country=None, lang="en", location=None, lang_restrict=None -): - """ - Perform a search query on Google and return the organic search results. 
- """ - - # Validate field inputs - if "position" in fields: - return ( - format_json_output({"error": "The 'position' key is not a valid choice."}), - None, - ) - invalid_keys = set(fields) - VALID_KEYS - if invalid_keys: - return ( - format_json_output({"error": f"Invalid keys: {', '.join(invalid_keys)}"}), - None, - ) + search_results = [] + headers = {"User-Agent": get_useragent()} + position = 0 - # Prepare headers and gateway for the request - headers = { - "User-Agent": "Mozilla/5.0 (X11; Linux x86_64)" - " AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4454.0 Safari/537.36" - } - - gateway = ApiGateway( - "https://www.google.com", - access_key_id=config.AWS_ACCESS_KEY_ID, - access_key_secret=config.AWS_ACCESS_KEY_SECRET, - ) - gateway.start() - - session = requests.Session() - session.mount("https://www.google.com", gateway) - - # Construct search URL - url = f"https://www.google.com/search?q={query}" - if country: - url += f"&gl={country}" - if lang: - url += f"&hl={lang}" - if lang_restrict: - url += f"&lr={lang_restrict}" - if location: - url += f"&uule={location}" - - response = None - - # Attempt to get a valid response from Google - try: while True: try: response = session.get(url, headers=headers) @@ -152,96 +122,90 @@ def search( print("Received a 200 OK response!") break print("Status Code:", response.status_code, "Switching IP...") + headers["User-Agent"] = get_useragent() except requests.exceptions.Timeout: print("Request timed out. Switching IP...") except requests.ConnectionError: print("Connection error. Switching IP...") except Exception as error: - print("An unexpected error occurred:", error, ". Switching IP...") + print("Unexpected error:", error, ". 
Switching IP...") + headers["User-Agent"] = get_useragent() if response.status_code != 200: - return ( - format_json_output({"error": "Failed to retrieve web page."}), - gateway, - ) + return {"error": f"Failed with status code {response.status_code}"} soup = BeautifulSoup(response.text, "html.parser") - - # Check if search results are loaded if not soup.select_one("div#search"): - return {"error": "Search results not loaded."}, gateway - - search_results = [] - - # Determine which fields to fetch based on input - fetch_all = not bool(fields) - fetch_position = "position" in fields or fetch_all - fetch_title = "title" in fields or fetch_all - fetch_link = "link" in fields or fetch_all - fetch_displayed_link = "displayed_link" in fields or fetch_all - fetch_favicon = "favicon" in fields or fetch_all - fetch_snippet = "snippet" in fields or fetch_all - fetch_source = "source" in fields or fetch_all - - if ( - any( - [ - fetch_title, - fetch_link, - fetch_displayed_link, - fetch_favicon, - fetch_snippet, - fetch_source, - ] - ) - and "position" not in fields - ): - fetch_position = True + return {"error": "Search results not loaded."} - position = 0 for result in soup.select(".tF2Cxc"): result_dict = OrderedDict() - if fetch_position: + if "position" in fields or not fields: position += 1 result_dict["position"] = position - # Extract information based on fields to fetch - if fetch_title: - title_element = result.select_one("h3") - if title_element: - result_dict["title"] = title_element.get_text() + title_element = result.select_one("h3") + if title_element and ("title" in fields or not fields): + result_dict["title"] = title_element.get_text() - if fetch_link: - link_element = result.select_one("a") - if link_element and link_element.has_attr("href"): - result_dict["link"] = clean_url(link_element["href"]) + link_element = result.select_one("a") + if ( + link_element + and link_element.has_attr("href") + and ("link" in fields or not fields) + ): + 
result_dict["link"] = clean_url(link_element["href"]) - if fetch_displayed_link: - displayed_link_element = result.select_one(".TbwUpd.NJjxre") - if displayed_link_element: - result_dict["displayed_link"] = displayed_link_element.get_text() + displayed_link_element = result.select_one(".TbwUpd.NJjxre") + if displayed_link_element and ("displayed_link" in fields or not fields): + result_dict["displayed_link"] = displayed_link_element.get_text() - if fetch_favicon: - favicon_element = result.select_one(".TbwUpd.NJjxre img") - if favicon_element: - result_dict["favicon"] = favicon_element["src"] + favicon_element = result.select_one(".TbwUpd.NJjxre img") + if favicon_element and ("favicon" in fields or not fields): + result_dict["favicon"] = favicon_element["src"] - if fetch_snippet: - result_dict["snippet"] = extract_snippet(result) + snippet_parts = result.select(".VwiC3b.yXK7lf span") + if snippet_parts and ("snippet" in fields or not fields): + result_dict["snippet"] = " ".join( + part.get_text() for part in snippet_parts + ) - if fetch_source: - source_element = result.select_one("cite") - if source_element: - result_dict["source"] = source_element.get_text() + source_element = result.select_one("cite") + if source_element and ("source" in fields or not fields): + result_dict["source"] = source_element.get_text() if result_dict.get("title") not in invalid_titles: search_results.append(result_dict) - json_result = {"organic_results": search_results} + return {"organic_results": search_results} - return format_json_output(json_result), gateway - except (ConnectionError, ValueError) as exc: - error = {"error": str(exc)} - return error, gateway +def configure(aws_access_key_id=None, aws_secret_access_key=None): + """ + Configure AWS keys for the search function. 
+ """ + if aws_access_key_id: + config.AWS_ACCESS_KEY_ID = aws_access_key_id + if aws_secret_access_key: + config.AWS_ACCESS_KEY_SECRET = aws_secret_access_key + + +def format_json_output(data): + """ + Format data into a JSON string. + """ + return json.dumps(data, indent=4, ensure_ascii=False) + + +def clean_url(url): + """ + Clean and extract the actual URL from the query string. + """ + parsed = urlparse(url) + url_qs = parse_qs(parsed.query) + if "q" in url_qs: + return unquote(url_qs["q"][0]) + elif "url" in url_qs: + return unquote(url_qs["url"][0]) + return url diff --git a/novexity/user_agents.py b/novexity/user_agents.py new file mode 100644 index 0000000..d10514a --- /dev/null +++ b/novexity/user_agents.py @@ -0,0 +1,16 @@ +import random + + +def get_useragent(): + return random.choice(_useragent_list) + + +_useragent_list = [ + "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:66.0) Gecko/20100101 Firefox/66.0", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36", + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36", + "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36 Edg/111.0.1661.62", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/111.0", +] diff --git a/novexity_check.py b/novexity_check.py index e822e40..792a102 100644 --- a/novexity_check.py +++ b/novexity_check.py @@ -3,37 +3,39 @@ from dotenv import load_dotenv from novexity import NovexitySearch, configure +# Load environment variables load_dotenv() +# Configure AWS keys AWS_ACCESS_KEY_ID = os.getenv("GOOGLE_SEARCH_AWS_ACCESS_KEY_ID") AWS_SECRET_ACCESS_KEY 
= os.getenv("GOOGLE_SEARCH_AWS_SECRET_ACCESS_KEY") configure( aws_access_key_id=AWS_ACCESS_KEY_ID, aws_secret_access_key=AWS_SECRET_ACCESS_KEY ) +# Initialize API Gateway +NovexitySearch.init_gateway() + +# Search parameters params = {"q": "Minecraft", "country": "fr", "lang": "fr"} -# Initialize NovexitySearch with the parameters +# Perform search novexity_search = NovexitySearch(params) - -# Get the search results -novexity, returned_gateway = novexity_search.get_dict() +search_results = novexity_search.get_dict() # Save the results to search.json with open("google-search.json", "w", encoding="utf-8") as file: - file.write(novexity) + file.write(json.dumps(search_results, indent=4)) -# Shut down the gateways -returned_gateway.shutdown() - -# Check if the required and optional fields are present in the output +# Required and optional fields required_fields = ["title", "link", "snippet"] optional_fields = ["displayed_link", "favicon", "source"] -output_json = json.loads(novexity) -for result in output_json.get("organic_results", []): +# Check and print field presence +for result in search_results.get("organic_results", []): position = result.get("position", "Unknown position") + # Check for missing fields missing_required_fields = [ field for field in required_fields if field not in result ] @@ -41,15 +43,21 @@ field for field in optional_fields if field not in result ] + # Print missing required fields if missing_required_fields: print( f"Position {position}: Missing required fields: {missing_required_fields}" ) + NovexitySearch.shutdown_gateway() exit(1) + # Print missing optional fields if missing_optional_fields: print( f"Position {position}: Warning: Missing optional fields: {missing_optional_fields}" ) print("All required fields are present.") + +# Shut down the API Gateway +NovexitySearch.shutdown_gateway() diff --git a/requirements.txt b/requirements.txt index 684ee52af9bb10a7611705a6c9535de077039e3e..8adeb0778f65ba73826e2b3774ebe96d1a74d856 100644 GIT 
binary patch literal 236 zcmXwzQ4WGY3`F0zi8oQ=vgj9j7eI_A22d9hFRx6yF`KrV_D!e1Z%a?ZOyb5x_0Ee? zb)w>^S5Xisl}pyFc#3~$-iu2LV{95l73`d8wIL}YwRa;2pKUmaHZHPP4(i*^M&2WT oRj0qX)V|jk+ncrK6kMCHnfS=*D#gH-vucuZ;O%$ii%elI3!dR3y8r+H delta 24 ecmaFESTrF)$d