diff --git a/app/app.py b/app/app.py index 8a209d5..1980449 100644 --- a/app/app.py +++ b/app/app.py @@ -1,5 +1,5 @@ import base64 -from datetime import datetime +from datetime import datetime, timedelta import streamlit as st @@ -13,13 +13,13 @@ PAGE_ICON = "assets/parthenon.png" TITLE = "assets/waybacktweets.png" -PREVIEW_IMAGE = "assets/preview_image.jpg" DOWNLOAD = "assets/download.svg" collapse = None matchtype = None -start_date = datetime(2006, 1, 1) +start_date = datetime.now() - timedelta(days=365 * 2) end_date = datetime.now() +min_date = datetime(2006, 1, 1) # ------ Verbose Mode Configuration ------ # @@ -81,7 +81,7 @@ # ------ Requestings ------ # -@st.cache_data(ttl=600, show_spinner=True) +@st.cache_data(ttl=600, show_spinner=False) def wayback_tweets( username, collapse, @@ -105,7 +105,7 @@ def wayback_tweets( return archived_tweets -@st.cache_data(ttl=600, show_spinner=True) +@st.cache_data(ttl=600, show_spinner=False) def tweets_parser(archived_tweets, username, field_options): parser = TweetsParser(archived_tweets, username, field_options) parsed_tweets = parser.parse() @@ -113,7 +113,7 @@ def tweets_parser(archived_tweets, username, field_options): return parsed_tweets -@st.cache_data(ttl=600, show_spinner=True) +@st.cache_data(ttl=600, show_spinner=False) def tweets_exporter(parsed_tweets, username, field_options): exporter = TweetsExporter(parsed_tweets, username, field_options) @@ -135,11 +135,11 @@ def tweets_exporter(parsed_tweets, username, field_options): ) st.write("Retrieve archived tweets CDX data in CSV, JSON, and HTML formats.") -st.caption( +st.write( "This application uses the Wayback Tweets Python package, which can be used as a module or as a standalone command line tool. [Read the documentation](https://claromes.github.io/waybacktweets)." # noqa: E501 ) -st.caption( +st.write( "To access the legacy version of Wayback Tweets [click here](https://waybacktweets-legacy.streamlit.app)." # noqa: E501 ) @@ -150,13 +150,14 @@ def tweets_exporter(parsed_tweets, username, field_options): username = st.text_input("Username *", key="username", placeholder="Without @") with st.expander("Filtering"): - start_date = datetime(2006, 1, 1) - end_date = datetime.now() + st.caption( + ":orange[A large date range takes a long time to process, and the app's resources may not be sufficient. Try to perform searches with smaller ranges to get faster results.]" # noqa: E501 + ) st.session_state.archived_timestamp_filter = st.date_input( "Tweets saved between", (start_date, end_date), - start_date, + min_date, end_date, format="YYYY/MM/DD", help="Using the `from` and `to` filters. Format: YYYY/MM/DD", @@ -178,21 +179,11 @@ def tweets_exporter(parsed_tweets, username, field_options): help="Allows for a simple way to scroll through the results", ) - col3, col4 = st.columns(2) - - with col3: - not_available = st.checkbox( - "Only tweets not available", - key="not_available", - help="Checks if the archived URL still exists on Twitter", - ) - - with col4: - unique = st.checkbox( - "Only unique Wayback Machine URLs", - key="unique", - help="Filtering by the collapse option using the `urlkey` field and the URL Match Scope `prefix`", # noqa: E501 - ) + unique = st.checkbox( + "Only unique Wayback Machine URLs", + key="unique", + help="Filtering by the collapse option using the `urlkey` field and the URL Match Scope `prefix`", # noqa: E501 + ) query = st.button("Query", type="primary", use_container_width=True) @@ -208,102 +199,111 @@ def tweets_exporter(parsed_tweets, username, field_options): matchtype = "prefix" try: - wayback_tweets = wayback_tweets( - st.session_state.current_username, - collapse, - st.session_state.archived_timestamp_filter[0], - st.session_state.archived_timestamp_filter[1], - limit, - offset, - matchtype, - ) + with st.spinner( + f"Waybacking @{st.session_state.current_username}'s archived tweets" + ): + wayback_tweets = wayback_tweets( + st.session_state.current_username, + collapse, + st.session_state.archived_timestamp_filter[0], + st.session_state.archived_timestamp_filter[1], + limit, + offset, + matchtype, + ) if not wayback_tweets: st.error("No data was saved due to an empty response.") st.stop() - parsed_tweets = tweets_parser( - wayback_tweets, st.session_state.current_username, FIELD_OPTIONS - ) + with st.spinner( + f"Parsing @{st.session_state.current_username}'s archived tweets" + ): + parsed_tweets = tweets_parser( + wayback_tweets, st.session_state.current_username, FIELD_OPTIONS + ) - df, file_name = tweets_exporter( - parsed_tweets, st.session_state.current_username, FIELD_OPTIONS - ) + df, file_name = tweets_exporter( + parsed_tweets, st.session_state.current_username, FIELD_OPTIONS + ) csv_data = df.to_csv(index=False) json_data = df.to_json(orient="records", lines=False) html = HTMLTweetsVisualizer(username, json_data) html_content = html.generate() - st.session_state.count = len(df) - st.write(f"**{st.session_state.count} URLs have been captured**") + # -- Rendering -- # - # -- HTML -- # + if csv_data and json_data and html_content: + st.session_state.count = len(df) + st.write(f"**{st.session_state.count} URLs have been captured**") - st.header("HTML", divider="gray") - st.write( - f"Visualize tweets more efficiently through iframes. Download the @{st.session_state.current_username}'s archived tweets in HTML." # noqa: E501 - ) + # -- HTML -- # - col5, col6 = st.columns([1, 18]) + st.header("HTML", divider="gray") + st.write( + f"Visualize tweets more efficiently through iframes. Download the @{st.session_state.current_username}'s archived tweets in HTML." # noqa: E501 + ) - with col5: - st.image(DOWNLOAD, width=22) + col5, col6 = st.columns([1, 18]) - with col6: - b64_html = base64.b64encode(html_content.encode()).decode() - href_html = f"data:text/html;base64,{b64_html}" + with col5: + st.image(DOWNLOAD, width=22) - st.markdown( - f'{file_name}.html', # noqa: E501 - unsafe_allow_html=True, - ) + with col6: + b64_html = base64.b64encode(html_content.encode()).decode() + href_html = f"data:text/html;base64,{b64_html}" - st.image(PREVIEW_IMAGE, "Preview image") + st.markdown( + f'{file_name}.html', # noqa: E501 + unsafe_allow_html=True, + ) - # -- CSV -- # + # -- CSV -- # - st.header("CSV", divider="gray") - st.write( - "Check the data returned in the dataframe below and download the file." - ) + st.header("CSV", divider="gray") + st.write( + "Check the data returned in the dataframe below and download the file." + ) - col7, col8 = st.columns([1, 18]) + col7, col8 = st.columns([1, 18]) - with col7: - st.image(DOWNLOAD, width=22) + with col7: + st.image(DOWNLOAD, width=22) - with col8: - b64_csv = base64.b64encode(csv_data.encode()).decode() - href_csv = f"data:file/csv;base64,{b64_csv}" + with col8: + b64_csv = base64.b64encode(csv_data.encode()).decode() + href_csv = f"data:file/csv;base64,{b64_csv}" - st.markdown( - f'{file_name}.csv', # noqa: E501 - unsafe_allow_html=True, - ) + st.markdown( + f'{file_name}.csv', # noqa: E501 + unsafe_allow_html=True, + ) - st.dataframe(df, use_container_width=True) + st.dataframe(df, use_container_width=True) - # -- JSON -- # + # -- JSON -- # - st.header("JSON", divider="gray") - st.write("Check the data returned in JSON format below and download the file.") + st.header("JSON", divider="gray") + st.write( + "Check the data returned in JSON format below and download the file." + ) - col9, col10 = st.columns([1, 18]) + col9, col10 = st.columns([1, 18]) - with col9: - st.image(DOWNLOAD, width=22) + with col9: + st.image(DOWNLOAD, width=22) - with col10: - b64_json = base64.b64encode(json_data.encode()).decode() - href_json = f"data:file/json;base64,{b64_json}" + with col10: + b64_json = base64.b64encode(json_data.encode()).decode() + href_json = f"data:file/json;base64,{b64_json}" - st.markdown( - f'{file_name}.json', # noqa: E501 - unsafe_allow_html=True, - ) + st.markdown( + f'{file_name}.json', # noqa: E501 + unsafe_allow_html=True, + ) - st.json(json_data, expanded=False) + st.json(json_data, expanded=False) except TypeError as e: st.error( f""" diff --git a/assets/preview_image.jpg b/assets/preview_image.jpg deleted file mode 100644 index cf4633d..0000000 Binary files a/assets/preview_image.jpg and /dev/null differ diff --git a/waybacktweets/_cli.py b/waybacktweets/_cli.py index d115c09..4048fc7 100644 --- a/waybacktweets/_cli.py +++ b/waybacktweets/_cli.py @@ -121,7 +121,7 @@ def main( username, collapse, timestamp_from, timestamp_to, limit, offset, matchtype ) - print("Making a request to the Internet Archive...") + print(f"Waybacking @{username}'s archived tweets...") archived_tweets = api.get() if archived_tweets: diff --git a/waybacktweets/api/export.py b/waybacktweets/api/export.py index af8e804..6524bfe 100644 --- a/waybacktweets/api/export.py +++ b/waybacktweets/api/export.py @@ -97,23 +97,23 @@ def save_to_json(self) -> None: """ Saves the DataFrame to a JSON file. """ - json_file_path = f"{self.filename}.json" - self.dataframe.to_json(json_file_path, orient="records", lines=False) + json_path = f"{self.filename}.json" + self.dataframe.to_json(json_path, orient="records", lines=False) - print(f"Saved to {json_file_path}") + print(f"Saved to {json_path}") def save_to_html(self) -> None: """ Saves the DataFrame to an HTML file. """ - json_file_path = f"{self.filename}.json" + json_path = f"{self.filename}.json" - if not os.path.exists(json_file_path): + if not os.path.exists(json_path): self.save_to_json() html_file_path = f"{self.filename}.html" - html = HTMLTweetsVisualizer(self.username, json_file_path, html_file_path) + html = HTMLTweetsVisualizer(self.username, json_path, html_file_path) html_content = html.generate() html.save(html_content) diff --git a/waybacktweets/api/parse.py b/waybacktweets/api/parse.py index 65ad041..31e0e34 100644 --- a/waybacktweets/api/parse.py +++ b/waybacktweets/api/parse.py @@ -279,7 +279,8 @@ def parse(self, print_progress=False) -> Dict[str, List[Any]]: task = None if print_progress: task = progress.add_task( - f"Waybacking @{self.username} tweets\n", total=len(futures) + f"Parsing @{self.username}'s archived tweets\n", + total=len(futures), ) for future in as_completed(futures): diff --git a/waybacktweets/api/visualize.py b/waybacktweets/api/visualize.py index 3f5bfcc..e1e9e8e 100644 --- a/waybacktweets/api/visualize.py +++ b/waybacktweets/api/visualize.py @@ -16,36 +16,36 @@ class HTMLTweetsVisualizer: Args: username (str): The username associated with the tweets. - json_file_path (Union[str, List[str]]): The path of the JSON file or the JSON data itself. + json_path (Union[str, List[str]]): The path of the JSON file or the JSON data itself. html_file_path (str, optional): The path where the HTML file will be saved. """ def __init__( self, username: str, - json_file_path: Union[str, List[str]], + json_path: Union[str, List[str]], html_file_path: str = None, ): self.username = username - self.json_file_path = self._json_loader(json_file_path) + self.json_path = self._json_loader(json_path) self.html_file_path = html_file_path @staticmethod - def _json_loader(json_file_path: Union[str, List[str]]) -> List[Dict[str, Any]]: + def _json_loader(json_path: Union[str, List[str]]) -> List[Dict[str, Any]]: """ Reads and loads JSON data from a specified file path or JSON string. Args: - json_file_path (Union[str, List[str]]): The path of the JSON file or the JSON data itself. + json_path (Union[str, List[str]]): The path of the JSON file or the JSON data itself. Returns: The content of the JSON file or data. """ - if os.path.isfile(json_file_path): - with open(json_file_path, "r", encoding="utf-8") as f: + if os.path.isfile(json_path): + with open(json_path, "r", encoding="utf-8") as f: return json.load(f) - return json.loads(json_file_path) + return json.loads(json_path) def generate(self) -> str: """ @@ -104,7 +104,7 @@ def generate(self) -> str: html += f"

@{self.username}'s archived tweets

\n" html += '
\n' - for index, tweet in enumerate(self.json_file_path): + for index, tweet in enumerate(self.json_path): html += '
\n' if not tweet["available_tweet_text"]: @@ -115,10 +115,6 @@ def generate(self) -> str: "Parsed Tweet": tweet["parsed_tweet_url"], } - html += f'

{tweet["original_tweet_url"]}

\n' - html += f'

{tweet["archived_mimetype"]}

\n' - html += "
\n" - for key, value in iframe_src.items(): key_cleaned = key.replace(" ", "_") @@ -155,6 +151,12 @@ def generate(self) -> str: html += f'

Available Tweet Username: {tweet["available_tweet_info"]}

\n' html += "
\n" + html += f'

Archived Tweet: {tweet["archived_tweet_url"]}

\n' + html += f'

Parsed Archived Tweet: {tweet["parsed_archived_tweet_url"]}

\n' + html += f'

Original Tweet: {tweet["original_tweet_url"]}

\n' + html += ( + f'

Parsed Tweet: {tweet["parsed_tweet_url"]}

\n' + ) html += f'

Archived URL Key: {tweet["archived_urlkey"]}

\n' html += f'

Archived Timestamp: {timestamp_parser(tweet["archived_timestamp"])} ({tweet["archived_timestamp"]})

\n' html += f'

Archived mimetype: {tweet["archived_mimetype"]}

\n'