Merge pull request #198 from Progress1/PR_fix
Fix some small code issues from last PR
milankowww authored Dec 4, 2023
2 parents c7019cb + f186bfd commit 268614b
Showing 7 changed files with 78 additions and 86 deletions.
2 changes: 1 addition & 1 deletion src/bots/managers/auth_manager.py
@@ -30,7 +30,7 @@ def api_key_required(fn):

@wraps(fn)
def wrapper(*args, **kwargs):
if "Authorization" not in request.headers.keys() or request.headers["Authorization"] != ("Bearer " + api_key):
if "Authorization" not in request.headers or request.headers["Authorization"] != ("Bearer " + api_key):
return {"error": "not authorized"}, 401
else:
return fn(*args, **kwargs)
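All three `auth_manager.py` modules get the same one-line fix: Flask's `request.headers` already supports membership tests, so the `.keys()` call was redundant. A minimal sketch of the decorator under that change, assuming `api_key` is loaded from configuration (the real modules read it elsewhere):

```python
from functools import wraps

from flask import request

api_key = "change-me"  # assumption: loaded from configuration in the real code


def api_key_required(fn):
    @wraps(fn)
    def wrapper(*args, **kwargs):
        # Headers behave like a mapping, so "in" works without .keys()
        if "Authorization" not in request.headers or request.headers["Authorization"] != ("Bearer " + api_key):
            return {"error": "not authorized"}, 401
        else:
            return fn(*args, **kwargs)

    return wrapper
```

The same change is repeated in src/collectors/managers/auth_manager.py and src/presenters/managers/auth_manager.py below.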
24 changes: 18 additions & 6 deletions src/collectors/collectors/atom_collector.py
@@ -50,14 +50,16 @@ def collect(self, source):
else:
feed = feedparser.parse(feed_url)

log_manager.log_collector_activity("atom", source.name, "ATOM returned feed with {} entries".format(len(feed["entries"])))

news_items = []

limit = BaseCollector.history(interval)
for feed_entry in feed['entries']:
published = feed_entry['updated']
published = parse(published, tzinfos=BaseCollector.timezone_info())
# comment this at the beginning of the testing to get some initial data
if str(published) > str(limit):

link_for_article = feed_entry['link']
log_manager.log_collector_activity("atom", source.name, "Processing entry [{}]".format(link_for_article))
if proxies:
@@ -76,10 +78,20 @@ def collect(self, source):

for_hash = feed_entry['author'] + feed_entry['title'] + feed_entry['link']

news_item = NewsItemData(uuid.uuid4(), hashlib.sha256(for_hash.encode()).hexdigest(),
feed_entry['title'], description, feed_url,
feed_entry['link'], feed_entry['updated'], feed_entry['author'],
datetime.datetime.now(), content, source.id, [])
news_item = NewsItemData(
uuid.uuid4(),
hashlib.sha256(for_hash.encode()).hexdigest(),
feed_entry['title'],
description,
feed_url,
feed_entry['link'],
feed_entry['updated'],
feed_entry['author'],
datetime.datetime.now(),
content,
source.id,
[]
)

news_items.append(news_item)

@@ -88,5 +100,5 @@ def collect(self, source):
log_manager.log_collector_activity("atom", source.name, "ATOM collection exceptionally failed")
BaseCollector.print_exception(source, error)
log_manager.log_debug(traceback.format_exc())

log_manager.log_debug("{} collection finished.".format(self.type))
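The atom collector now logs how many entries the feed returned, re-enables the `limit = BaseCollector.history(interval)` cutoff so only entries newer than the cutoff are processed, and reflows the `NewsItemData` call onto one argument per line. A stand-alone sketch of the cutoff filtering, with a placeholder feed URL and a one-day cutoff standing in for `BaseCollector.history(interval)` (both assumptions):

```python
import datetime

import feedparser
from dateutil.parser import parse

feed = feedparser.parse("https://example.org/atom.xml")  # placeholder URL

# Stand-in for BaseCollector.history(interval): keep entries from the last day only.
limit = datetime.datetime.now() - datetime.timedelta(days=1)

fresh_entries = []
for feed_entry in feed["entries"]:
    published = parse(feed_entry["updated"], ignoretz=True)
    # The collector compares str(published) > str(limit); a plain datetime
    # comparison is shown here for clarity.
    if published > limit:
        fresh_entries.append(feed_entry)
```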
98 changes: 49 additions & 49 deletions src/collectors/collectors/rss_collector.py
@@ -44,7 +44,7 @@ def collect(self, source):
source -- Source object.
"""
feed_url = source.parameter_values["FEED_URL"]
# interval = source.parameter_values["REFRESH_INTERVAL"]
interval = source.parameter_values["REFRESH_INTERVAL"]

log_manager.log_collector_activity("rss", source.name, "Starting collector for url: {}".format(feed_url))

@@ -67,11 +67,11 @@ def collect(self, source):
if proxy_server == "none":
proxy_handler = urllib.request.ProxyHandler({})
else:
proxy = re.search(r"^(http|https|socks4|socks5)://([a-zA-Z0-9\-\.\_]+):(\d+)/?$", proxy_server)
proxy = re.search(r"^(http|https|socks4|socks5|ftp)://([a-zA-Z0-9\-\.\_]+):(\d+)/?$", proxy_server)
if proxy:
scheme, host, port = proxy.groups()
# classic HTTP/HTTPS proxy
if scheme in ["http", "https"]:
if scheme in ["http", "https", "ftp"]:
proxy_handler = urllib.request.ProxyHandler(
{
"http": "{}://{}:{}".format(scheme, host, port),
@@ -100,59 +100,59 @@ def collect(self, source):

news_items = []

limit = BaseCollector.history(interval)
for feed_entry in feed["entries"]:
for key in ["author", "published", "title", "description", "link"]:
if key not in feed_entry.keys():
feed_entry[key] = ""

# limit = BaseCollector.history(interval)
published = feed_entry["published"]
published = dateparser.parse(published, settings={"DATE_ORDER": "DMY"})

# if published > limit: TODO: uncomment after testing, we need some initial data now
link_for_article = feed_entry["link"]
if not link_for_article:
log_manager.log_collector_activity("rss", source.name, "Skipping (empty link)")
continue

log_manager.log_collector_activity("rss", source.name, "Processing entry [{}]".format(link_for_article))

html_content = ""
request = urllib.request.Request(link_for_article)
request.add_header("User-Agent", user_agent)

with opener(request) as response:
html_content = response.read()

soup = BeautifulSoup(html_content, features="html.parser")

content = ""

if html_content:
content_text = [p.text.strip() for p in soup.findAll("p")]
replaced_str = "\xa0"
if replaced_str:
content = [w.replace(replaced_str, " ") for w in content_text]
content = " ".join(content)

for_hash = feed_entry["author"] + feed_entry["title"] + feed_entry["link"]

news_item = NewsItemData(
uuid.uuid4(),
hashlib.sha256(for_hash.encode()).hexdigest(),
feed_entry["title"],
feed_entry["description"],
feed_url,
feed_entry["link"],
feed_entry["published"],
feed_entry["author"],
datetime.datetime.now(),
content,
source.id,
[],
)

news_items.append(news_item)
# comment this at the beginning of the testing to get some initial data
if str(published) > str(limit):
link_for_article = feed_entry["link"]
if not link_for_article:
log_manager.log_collector_activity("rss", source.name, "Skipping (empty link)")
continue

log_manager.log_collector_activity("rss", source.name, "Processing entry [{}]".format(link_for_article))

html_content = ""
request = urllib.request.Request(link_for_article)
request.add_header("User-Agent", user_agent)

with opener(request) as response:
html_content = response.read()

soup = BeautifulSoup(html_content, features="html.parser")

content = ""

if html_content:
content_text = [p.text.strip() for p in soup.findAll("p")]
replaced_str = "\xa0"
if replaced_str:
content = [w.replace(replaced_str, " ") for w in content_text]
content = " ".join(content)

for_hash = feed_entry["author"] + feed_entry["title"] + feed_entry["link"]

news_item = NewsItemData(
uuid.uuid4(),
hashlib.sha256(for_hash.encode()).hexdigest(),
feed_entry["title"],
feed_entry["description"],
feed_url,
feed_entry["link"],
feed_entry["published"],
feed_entry["author"],
datetime.datetime.now(),
content,
source.id,
[],
)

news_items.append(news_item)

BaseCollector.publish(news_items, source)

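The RSS collector re-enables `REFRESH_INTERVAL`, restores the same history cutoff as the atom collector above, and widens the proxy regex so `ftp` proxies are recognized and handed to `urllib`. A sketch of that proxy handling in isolation, with a placeholder proxy URL; the SOCKS branch of the real collector is omitted, and the resulting opener is what the collector later calls for each article request:

```python
import re
import urllib.request

proxy_server = "http://proxy.example.org:3128"  # placeholder

proxy_handler = urllib.request.ProxyHandler({})  # default: no proxy
match = re.search(r"^(http|https|socks4|socks5|ftp)://([a-zA-Z0-9\-\._]+):(\d+)/?$", proxy_server)
if match:
    scheme, host, port = match.groups()
    # classic HTTP/HTTPS/FTP proxy; SOCKS would need a different handler
    if scheme in ["http", "https", "ftp"]:
        proxy_url = "{}://{}:{}".format(scheme, host, port)
        proxy_handler = urllib.request.ProxyHandler(
            {"http": proxy_url, "https": proxy_url, "ftp": proxy_url}
        )

opener = urllib.request.build_opener(proxy_handler).open
```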
11 changes: 6 additions & 5 deletions src/collectors/collectors/web_collector.py
@@ -383,7 +383,7 @@ def __get_headless_driver_chrome(self):
"ftpProxy": self.proxy,
"sslProxy": self.proxy
}

driver = webdriver.Chrome(service=chrome_service, options=chrome_options)
log_manager.log_debug('Chrome driver initialized.')
return driver
@@ -509,15 +509,16 @@ def __browse_title_page(self, index_url):

# if there is a popup selector, click on it!
if self.selectors['popup_close']:
popup = None
try:
popup = WebDriverWait(browser, 10).until(EC.presence_of_element_located(self.__get_element_locator(self.selectors['popup_close'])))
except Exception as ex:
log_manager.log_collector_activity('web', self.source.name, 'Popup find error: ' + traceback.format_exc())
try:
if popup:
if popup is not None:
try:
popup.click()
except Exception as ex:
log_manager.log_collector_activity('web', self.source.name, 'Popup click error: ' + traceback.format_exc())
except Exception as ex:
log_manager.log_collector_activity('web', self.source.name, 'Popup click error: ' + traceback.format_exc())

# if there is a "load more" selector, click on it!
page = 1
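In the web collector, the popup handling now initializes `popup = None` before the wait and only attempts the click when an element was actually found, with the click wrapped in its own `try`/`except`. A minimal sketch of that pattern; the browser setup and CSS selector are placeholders, not the collector's real configuration:

```python
import traceback

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

browser = webdriver.Chrome()
browser.get("https://example.org")  # placeholder page

popup = None
try:
    popup = WebDriverWait(browser, 10).until(
        EC.presence_of_element_located((By.CSS_SELECTOR, ".cookie-banner .close"))
    )
except Exception:
    print("Popup find error: " + traceback.format_exc())

if popup is not None:
    try:
        popup.click()
    except Exception:
        print("Popup click error: " + traceback.format_exc())

browser.quit()
```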
2 changes: 1 addition & 1 deletion src/collectors/managers/auth_manager.py
@@ -31,7 +31,7 @@ def api_key_required(fn):

@wraps(fn)
def wrapper(*args, **kwargs):
if "Authorization" not in request.headers.keys() or request.headers["Authorization"] != ("Bearer " + api_key):
if "Authorization" not in request.headers or request.headers["Authorization"] != ("Bearer " + api_key):
return {"error": "not authorized"}, 401
else:
return fn(*args, **kwargs)
2 changes: 1 addition & 1 deletion src/presenters/managers/auth_manager.py
@@ -30,7 +30,7 @@ def api_key_required(fn):

@wraps(fn)
def wrapper(*args, **kwargs):
if "Authorization" not in request.headers.keys() or request.headers["Authorization"] != ("Bearer " + api_key):
if "Authorization" not in request.headers or request.headers["Authorization"] != ("Bearer " + api_key):
return {"error": "not authorized"}, 401
else:
return fn(*args, **kwargs)
25 changes: 2 additions & 23 deletions src/presenters/presenters/pdf_presenter.py
@@ -39,10 +39,8 @@ def generate(self, presenter_input):
dict: mime type and base64 encoded data of the generated PDF document
"""
try:
temporary_directory = tempfile.gettempdir() + "/"
output_html = temporary_directory + "pdf_body.html"
output_pdf = temporary_directory + "pdf_report__" + datetime.datetime.now().strftime("%d-%m-%Y_%H:%M") + ".pdf"

output_html = tempfile.NamedTemporaryFile(prefix="pdf_report_", suffix='.html', delete_on_close=False).name
output_pdf = tempfile.NamedTemporaryFile(prefix="pdf_report_", suffix='.pdf', delete_on_close=False).name
head, tail = os.path.split(presenter_input.parameter_values_map["PDF_TEMPLATE_PATH"])

input_data = BasePresenter.generate_input_data(presenter_input)
@@ -54,25 +52,6 @@ def generate(self, presenter_input):
with open(output_html, "w") as output_file:
output_file.write(output_text)

if not os.path.exists(temporary_directory):
os.mkdir(temporary_directory)

# options = {
# 'dpi': 500,
# 'page-size': 'A4',
# 'margin-top': '1.55in',
# 'margin-right': '0.75in',
# 'margin-bottom': '1.55in',
# 'margin-left': '0.75in',
# 'encoding': "UTF-8",
# 'header-html': pdf_header_template,
# 'footer-html': pdf_footer_template,
# 'custom-header': [
# ('Accept-Encoding', 'gzip')
# ],
# 'no-outline': None,
# 'enable-local-file-access': None
# }
HTML(output_html).write_pdf(output_pdf)

encoding = "UTF-8"
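The PDF presenter drops the hand-built paths in the system temp directory (and the commented-out options block) in favour of `tempfile.NamedTemporaryFile`, which avoids filename collisions between runs. Note that the `delete_on_close` argument used in the commit only exists from Python 3.12 onward. A rough sketch of the same flow, using `delete=False` instead so it also runs on older interpreters, and assuming `HTML` here is WeasyPrint's (as the `write_pdf` call suggests):

```python
import tempfile

from weasyprint import HTML

# Temporary files with unique names instead of fixed paths in the temp directory.
output_html = tempfile.NamedTemporaryFile(prefix="pdf_report_", suffix=".html", delete=False).name
output_pdf = tempfile.NamedTemporaryFile(prefix="pdf_report_", suffix=".pdf", delete=False).name

with open(output_html, "w") as output_file:
    output_file.write("<html><body><h1>Report</h1></body></html>")  # placeholder content

HTML(output_html).write_pdf(output_pdf)
print("PDF written to", output_pdf)
```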
