From f186bfda3af71eeab7ba6d168e52428600964a8e Mon Sep 17 00:00:00 2001
From: Jan Polonsky
Date: Fri, 1 Dec 2023 12:53:19 +0100
Subject: [PATCH]
- small code beautification
- rss collector now imports only items with a published date newer than the last crawl (based on atom collector logic)
- better temp file names in pdf presenter
---
 src/bots/managers/auth_manager.py           |  2 +-
 src/collectors/collectors/atom_collector.py | 24 +++--
 src/collectors/collectors/rss_collector.py  | 98 ++++++++++-----------
 src/collectors/collectors/web_collector.py  | 11 +--
 src/collectors/managers/auth_manager.py     |  2 +-
 src/presenters/managers/auth_manager.py     |  2 +-
 src/presenters/presenters/pdf_presenter.py  | 25 +-----
 7 files changed, 78 insertions(+), 86 deletions(-)

diff --git a/src/bots/managers/auth_manager.py b/src/bots/managers/auth_manager.py
index 24b59d938..1ce891a05 100644
--- a/src/bots/managers/auth_manager.py
+++ b/src/bots/managers/auth_manager.py
@@ -30,7 +30,7 @@ def api_key_required(fn):
     @wraps(fn)
     def wrapper(*args, **kwargs):
-        if "Authorization" not in request.headers.keys() or request.headers["Authorization"] != ("Bearer " + api_key):
+        if "Authorization" not in request.headers or request.headers["Authorization"] != ("Bearer " + api_key):
             return {"error": "not authorized"}, 401
         else:
             return fn(*args, **kwargs)
diff --git a/src/collectors/collectors/atom_collector.py b/src/collectors/collectors/atom_collector.py
index 3ed9df57a..7afe13ce2 100644
--- a/src/collectors/collectors/atom_collector.py
+++ b/src/collectors/collectors/atom_collector.py
@@ -50,14 +50,16 @@ def collect(self, source):
             else:
                 feed = feedparser.parse(feed_url)

+            log_manager.log_collector_activity("atom", source.name, "ATOM returned feed with {} entries".format(len(feed["entries"])))
+
             news_items = []

             limit = BaseCollector.history(interval)
             for feed_entry in feed['entries']:
                 published = feed_entry['updated']
                 published = parse(published, tzinfos=BaseCollector.timezone_info())

+                # comment this at the beginning of the testing to get some initial data
                 if str(published) > str(limit):
-
                     link_for_article = feed_entry['link']
                     log_manager.log_collector_activity("atom", source.name, "Processing entry [{}]".format(link_for_article))

                     if proxies:
@@ -76,10 +78,20 @@ def collect(self, source):

                     for_hash = feed_entry['author'] + feed_entry['title'] + feed_entry['link']

-                    news_item = NewsItemData(uuid.uuid4(), hashlib.sha256(for_hash.encode()).hexdigest(),
-                                             feed_entry['title'], description, feed_url,
-                                             feed_entry['link'], feed_entry['updated'], feed_entry['author'],
-                                             datetime.datetime.now(), content, source.id, [])
+                    news_item = NewsItemData(
+                        uuid.uuid4(),
+                        hashlib.sha256(for_hash.encode()).hexdigest(),
+                        feed_entry['title'],
+                        description,
+                        feed_url,
+                        feed_entry['link'],
+                        feed_entry['updated'],
+                        feed_entry['author'],
+                        datetime.datetime.now(),
+                        content,
+                        source.id,
+                        []
+                    )

                     news_items.append(news_item)
@@ -88,5 +100,5 @@ def collect(self, source):
             log_manager.log_collector_activity("atom", source.name, "ATOM collection exceptionally failed")
             BaseCollector.print_exception(source, error)
             log_manager.log_debug(traceback.format_exc())
-            
+
         log_manager.log_debug("{} collection finished.".format(self.type))
diff --git a/src/collectors/collectors/rss_collector.py b/src/collectors/collectors/rss_collector.py
index 7c2fa62be..57619f428 100644
--- a/src/collectors/collectors/rss_collector.py
+++ b/src/collectors/collectors/rss_collector.py
@@ -44,7 +44,7 @@ def collect(self, source):
             source -- Source object.
         """
         feed_url = source.parameter_values["FEED_URL"]
-        # interval = source.parameter_values["REFRESH_INTERVAL"]
+        interval = source.parameter_values["REFRESH_INTERVAL"]

         log_manager.log_collector_activity("rss", source.name, "Starting collector for url: {}".format(feed_url))
@@ -67,11 +67,11 @@ def collect(self, source):
             if proxy_server == "none":
                 proxy_handler = urllib.request.ProxyHandler({})
             else:
-                proxy = re.search(r"^(http|https|socks4|socks5)://([a-zA-Z0-9\-\.\_]+):(\d+)/?$", proxy_server)
+                proxy = re.search(r"^(http|https|socks4|socks5|ftp)://([a-zA-Z0-9\-\.\_]+):(\d+)/?$", proxy_server)
                 if proxy:
                     scheme, host, port = proxy.groups()
                     # classic HTTP/HTTPS proxy
-                    if scheme in ["http", "https"]:
+                    if scheme in ["http", "https", "ftp"]:
                         proxy_handler = urllib.request.ProxyHandler(
                             {
                                 "http": "{}://{}:{}".format(scheme, host, port),
@@ -100,59 +100,59 @@ def collect(self, source):

         news_items = []

+        limit = BaseCollector.history(interval)
         for feed_entry in feed["entries"]:
             for key in ["author", "published", "title", "description", "link"]:
                 if key not in feed_entry.keys():
                     feed_entry[key] = ""

-            # limit = BaseCollector.history(interval)
             published = feed_entry["published"]
             published = dateparser.parse(published, settings={"DATE_ORDER": "DMY"})
-
-            # if published > limit: TODO: uncomment after testing, we need some initial data now
-            link_for_article = feed_entry["link"]
-            if not link_for_article:
-                log_manager.log_collector_activity("rss", source.name, "Skipping (empty link)")
-                continue
-
-            log_manager.log_collector_activity("rss", source.name, "Processing entry [{}]".format(link_for_article))
-
-            html_content = ""
-            request = urllib.request.Request(link_for_article)
-            request.add_header("User-Agent", user_agent)
-
-            with opener(request) as response:
-                html_content = response.read()
-
-            soup = BeautifulSoup(html_content, features="html.parser")
-
-            content = ""
-
-            if html_content:
-                content_text = [p.text.strip() for p in soup.findAll("p")]
-                replaced_str = "\xa0"
-                if replaced_str:
-                    content = [w.replace(replaced_str, " ") for w in content_text]
-                    content = " ".join(content)
-
-            for_hash = feed_entry["author"] + feed_entry["title"] + feed_entry["link"]
-
-            news_item = NewsItemData(
-                uuid.uuid4(),
-                hashlib.sha256(for_hash.encode()).hexdigest(),
-                feed_entry["title"],
-                feed_entry["description"],
-                feed_url,
-                feed_entry["link"],
-                feed_entry["published"],
-                feed_entry["author"],
-                datetime.datetime.now(),
-                content,
-                source.id,
-                [],
-            )
-
-            news_items.append(news_item)
+            # comment this at the beginning of the testing to get some initial data
+            if str(published) > str(limit):
+                link_for_article = feed_entry["link"]
+                if not link_for_article:
+                    log_manager.log_collector_activity("rss", source.name, "Skipping (empty link)")
+                    continue
+
+                log_manager.log_collector_activity("rss", source.name, "Processing entry [{}]".format(link_for_article))
+
+                html_content = ""
+                request = urllib.request.Request(link_for_article)
+                request.add_header("User-Agent", user_agent)
+
+                with opener(request) as response:
+                    html_content = response.read()
+
+                soup = BeautifulSoup(html_content, features="html.parser")
+
+                content = ""
+
+                if html_content:
+                    content_text = [p.text.strip() for p in soup.findAll("p")]
+                    replaced_str = "\xa0"
+                    if replaced_str:
+                        content = [w.replace(replaced_str, " ") for w in content_text]
+                        content = " ".join(content)
+
+                for_hash = feed_entry["author"] + feed_entry["title"] + feed_entry["link"]
+
+                news_item = NewsItemData(
+                    uuid.uuid4(),
+                    hashlib.sha256(for_hash.encode()).hexdigest(),
+                    feed_entry["title"],
+                    feed_entry["description"],
+                    feed_url,
+                    feed_entry["link"],
+                    feed_entry["published"],
+                    feed_entry["author"],
+                    datetime.datetime.now(),
+                    content,
+                    source.id,
+                    [],
+                )
+
+                news_items.append(news_item)

         BaseCollector.publish(news_items, source)
diff --git a/src/collectors/collectors/web_collector.py b/src/collectors/collectors/web_collector.py
index 7c4a7a33e..208ac905c 100644
--- a/src/collectors/collectors/web_collector.py
+++ b/src/collectors/collectors/web_collector.py
@@ -383,7 +383,7 @@ def __get_headless_driver_chrome(self):
                 "ftpProxy": self.proxy,
                 "sslProxy": self.proxy
             }
-            
+
         driver = webdriver.Chrome(service=chrome_service, options=chrome_options)
         log_manager.log_debug('Chrome driver initialized.')
         return driver
@@ -509,15 +509,16 @@ def __browse_title_page(self, index_url):

         # if there is a popup selector, click on it!
         if self.selectors['popup_close']:
+            popup = None
             try:
                 popup = WebDriverWait(browser, 10).until(EC.presence_of_element_located(self.__get_element_locator(self.selectors['popup_close'])))
             except Exception as ex:
                 log_manager.log_collector_activity('web', self.source.name, 'Popup find error: ' + traceback.format_exc())
-            try:
-                if popup:
+            if popup is not None:
+                try:
                     popup.click()
-            except Exception as ex:
-                log_manager.log_collector_activity('web', self.source.name, 'Popup click error: ' + traceback.format_exc())
+                except Exception as ex:
+                    log_manager.log_collector_activity('web', self.source.name, 'Popup click error: ' + traceback.format_exc())

         # if there is a "load more" selector, click on it!
         page = 1
diff --git a/src/collectors/managers/auth_manager.py b/src/collectors/managers/auth_manager.py
index 28d4ce145..56bbdb59e 100644
--- a/src/collectors/managers/auth_manager.py
+++ b/src/collectors/managers/auth_manager.py
@@ -31,7 +31,7 @@ def api_key_required(fn):
     @wraps(fn)
     def wrapper(*args, **kwargs):
-        if "Authorization" not in request.headers.keys() or request.headers["Authorization"] != ("Bearer " + api_key):
+        if "Authorization" not in request.headers or request.headers["Authorization"] != ("Bearer " + api_key):
             return {"error": "not authorized"}, 401
         else:
             return fn(*args, **kwargs)
diff --git a/src/presenters/managers/auth_manager.py b/src/presenters/managers/auth_manager.py
index 24b59d938..1ce891a05 100644
--- a/src/presenters/managers/auth_manager.py
+++ b/src/presenters/managers/auth_manager.py
@@ -30,7 +30,7 @@ def api_key_required(fn):
     @wraps(fn)
     def wrapper(*args, **kwargs):
-        if "Authorization" not in request.headers.keys() or request.headers["Authorization"] != ("Bearer " + api_key):
+        if "Authorization" not in request.headers or request.headers["Authorization"] != ("Bearer " + api_key):
             return {"error": "not authorized"}, 401
         else:
             return fn(*args, **kwargs)
diff --git a/src/presenters/presenters/pdf_presenter.py b/src/presenters/presenters/pdf_presenter.py
index fdad204a1..eb30e1530 100644
--- a/src/presenters/presenters/pdf_presenter.py
+++ b/src/presenters/presenters/pdf_presenter.py
@@ -39,10 +39,8 @@ def generate(self, presenter_input):
             dict: mime type and base64 encoded data of the generated PDF document
        """
         try:
-            temporary_directory = tempfile.gettempdir() + "/"
-            output_html = temporary_directory + "pdf_body.html"
-            output_pdf = temporary_directory + "pdf_report__" + datetime.datetime.now().strftime("%d-%m-%Y_%H:%M") + ".pdf"
-
+            output_html = tempfile.NamedTemporaryFile(prefix="pdf_report_", suffix='.html', delete_on_close=False).name
+            output_pdf = tempfile.NamedTemporaryFile(prefix="pdf_report_", suffix='.pdf', delete_on_close=False).name
             head, tail = os.path.split(presenter_input.parameter_values_map["PDF_TEMPLATE_PATH"])

             input_data = BasePresenter.generate_input_data(presenter_input)
@@ -54,25 +52,6 @@ def generate(self, presenter_input):
             with open(output_html, "w") as output_file:
                 output_file.write(output_text)

-            if not os.path.exists(temporary_directory):
-                os.mkdir(temporary_directory)
-
-            # options = {
-            #     'dpi': 500,
-            #     'page-size': 'A4',
-            #     'margin-top': '1.55in',
-            #     'margin-right': '0.75in',
-            #     'margin-bottom': '1.55in',
-            #     'margin-left': '0.75in',
-            #     'encoding': "UTF-8",
-            #     'header-html': pdf_header_template,
-            #     'footer-html': pdf_footer_template,
-            #     'custom-header': [
-            #         ('Accept-Encoding', 'gzip')
-            #     ],
-            #     'no-outline': None,
-            #     'enable-local-file-access': None
-            # }
             HTML(output_html).write_pdf(output_pdf)

             encoding = "UTF-8"
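A minimal sketch (not taken from the patched codebase) of the cutoff logic the rss collector now shares with the atom collector: compute a history limit from the refresh interval and keep only entries published after it. `history_limit`, `filter_new_entries`, and the sample entries are hypothetical stand-ins; the real collectors use `BaseCollector.history(interval)`, feedparser entries, dateparser/dateutil parsing, and compare the stringified datetimes (`str(published) > str(limit)`).

```python
import datetime
from email.utils import parsedate_to_datetime


def history_limit(interval_minutes):
    # hypothetical stand-in for BaseCollector.history(interval): anything older
    # than one refresh interval ago counts as already collected
    return datetime.datetime.now(datetime.timezone.utc) - datetime.timedelta(minutes=interval_minutes)


def filter_new_entries(entries, interval_minutes):
    """Keep only feed entries whose published date is newer than the last crawl."""
    limit = history_limit(interval_minutes)
    fresh = []
    for entry in entries:
        # RSS feeds commonly carry RFC 2822 dates; the real collector uses dateparser
        published = parsedate_to_datetime(entry["published"])
        # the patch compares str(published) > str(limit); a plain datetime comparison is used here
        if published > limit:
            fresh.append(entry)
    return fresh


if __name__ == "__main__":
    now = datetime.datetime.now(datetime.timezone.utc)
    entries = [
        {"title": "fresh item", "published": now.strftime("%a, %d %b %Y %H:%M:%S +0000")},
        {"title": "old item", "published": "Mon, 02 Oct 2023 09:00:00 +0200"},
    ]
    print([e["title"] for e in filter_new_entries(entries, interval_minutes=60)])  # ['fresh item']
```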
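On the temp file naming in the pdf presenter: the new code relies on `tempfile.NamedTemporaryFile(..., delete_on_close=False)`, a parameter available only in Python 3.12 and later. Below is a rough stdlib sketch of the same naming idea using `tempfile.mkstemp`, which works on older versions and leaves the files in place until they are removed explicitly; the helper name and the printed paths are illustrative only.

```python
import os
import tempfile


def make_report_paths():
    # create uniquely named, recognisable temp files for the rendered HTML body
    # and the resulting PDF report; mkstemp returns an open fd plus the path
    html_fd, output_html = tempfile.mkstemp(prefix="pdf_report_", suffix=".html")
    pdf_fd, output_pdf = tempfile.mkstemp(prefix="pdf_report_", suffix=".pdf")
    os.close(html_fd)
    os.close(pdf_fd)
    return output_html, output_pdf


output_html, output_pdf = make_report_paths()
print(output_html)  # e.g. /tmp/pdf_report_k3j2x1.html
print(output_pdf)   # e.g. /tmp/pdf_report_9f0aqz.pdf
```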