update HTML, update docs app and spinner, update print msg, delete image preview
claromes committed Jun 24, 2024
1 parent ac094de commit ccb53fe
Showing 6 changed files with 113 additions and 110 deletions.
178 changes: 89 additions & 89 deletions app/app.py
@@ -1,5 +1,5 @@
 import base64
-from datetime import datetime
+from datetime import datetime, timedelta

 import streamlit as st

@@ -13,13 +13,13 @@
 PAGE_ICON = "assets/parthenon.png"
 TITLE = "assets/waybacktweets.png"
-PREVIEW_IMAGE = "assets/preview_image.jpg"
 DOWNLOAD = "assets/download.svg"

 collapse = None
 matchtype = None
-start_date = datetime(2006, 1, 1)
+start_date = datetime.now() - timedelta(days=365 * 2)
 end_date = datetime.now()
+min_date = datetime(2006, 1, 1)

 # ------ Verbose Mode Configuration ------ #
@@ -81,7 +81,7 @@
 # ------ Requestings ------ #


-@st.cache_data(ttl=600, show_spinner=True)
+@st.cache_data(ttl=600, show_spinner=False)
 def wayback_tweets(
     username,
     collapse,
@@ -105,15 +105,15 @@ def wayback_tweets(
     return archived_tweets


-@st.cache_data(ttl=600, show_spinner=True)
+@st.cache_data(ttl=600, show_spinner=False)
 def tweets_parser(archived_tweets, username, field_options):
     parser = TweetsParser(archived_tweets, username, field_options)
     parsed_tweets = parser.parse()

     return parsed_tweets


-@st.cache_data(ttl=600, show_spinner=True)
+@st.cache_data(ttl=600, show_spinner=False)
 def tweets_exporter(parsed_tweets, username, field_options):
     exporter = TweetsExporter(parsed_tweets, username, field_options)

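These hunks turn off the automatic `show_spinner` on the cached functions; further down, the commit wraps the call sites in explicit `st.spinner` blocks instead. A minimal sketch of that caching-plus-spinner split, with a hypothetical `fetch_data` standing in for the app's real request functions:

```python
import time

import streamlit as st


@st.cache_data(ttl=600, show_spinner=False)  # cache for 10 minutes, no built-in spinner
def fetch_data(query: str) -> list:
    time.sleep(2)  # stand-in for a slow network request
    return [f"result for {query}"]


# The caller owns the spinner, so its text can be tailored per step,
# and cache hits render instantly with no spinner flash.
with st.spinner("Fetching data"):
    results = fetch_data("someuser")

st.write(results)
```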
@@ -135,11 +135,11 @@ def tweets_exporter(parsed_tweets, username, field_options):
 )
 st.write("Retrieve archived tweets CDX data in CSV, JSON, and HTML formats.")

-st.caption(
+st.write(
     "This application uses the Wayback Tweets Python package, which can be used as a module or as a standalone command line tool. [Read the documentation](https://claromes.github.io/waybacktweets)."  # noqa: E501
 )

-st.caption(
+st.write(
     "To access the legacy version of Wayback Tweets [click here](https://waybacktweets-legacy.streamlit.app)."  # noqa: E501
 )
@@ -150,13 +150,14 @@ def tweets_exporter(parsed_tweets, username, field_options):
 username = st.text_input("Username *", key="username", placeholder="Without @")

 with st.expander("Filtering"):
-    start_date = datetime(2006, 1, 1)
-    end_date = datetime.now()

+    st.caption(
+        ":orange[A large date range takes a long time to process, and the app's resources may not be sufficient. Try to perform searches with smaller ranges to get faster results.]"  # noqa: E501
+    )
     st.session_state.archived_timestamp_filter = st.date_input(
         "Tweets saved between",
         (start_date, end_date),
-        start_date,
+        min_date,
         end_date,
         format="YYYY/MM/DD",
         help="Using the `from` and `to` filters. Format: YYYY/MM/DD",
@@ -178,21 +179,11 @@ def tweets_exporter(parsed_tweets, username, field_options):
         help="Allows for a simple way to scroll through the results",
     )

-    col3, col4 = st.columns(2)
-
-    with col3:
-        not_available = st.checkbox(
-            "Only tweets not available",
-            key="not_available",
-            help="Checks if the archived URL still exists on Twitter",
-        )
-
-    with col4:
-        unique = st.checkbox(
-            "Only unique Wayback Machine URLs",
-            key="unique",
-            help="Filtering by the collapse option using the `urlkey` field and the URL Match Scope `prefix`",  # noqa: E501
-        )
+    unique = st.checkbox(
+        "Only unique Wayback Machine URLs",
+        key="unique",
+        help="Filtering by the collapse option using the `urlkey` field and the URL Match Scope `prefix`",  # noqa: E501
+    )


 query = st.button("Query", type="primary", use_container_width=True)
@@ -208,102 +199,111 @@
         matchtype = "prefix"

     try:
-        wayback_tweets = wayback_tweets(
-            st.session_state.current_username,
-            collapse,
-            st.session_state.archived_timestamp_filter[0],
-            st.session_state.archived_timestamp_filter[1],
-            limit,
-            offset,
-            matchtype,
-        )
+        with st.spinner(
+            f"Waybacking @{st.session_state.current_username}'s archived tweets"
+        ):
+            wayback_tweets = wayback_tweets(
+                st.session_state.current_username,
+                collapse,
+                st.session_state.archived_timestamp_filter[0],
+                st.session_state.archived_timestamp_filter[1],
+                limit,
+                offset,
+                matchtype,
+            )

         if not wayback_tweets:
             st.error("No data was saved due to an empty response.")
             st.stop()

-        parsed_tweets = tweets_parser(
-            wayback_tweets, st.session_state.current_username, FIELD_OPTIONS
-        )
+        with st.spinner(
+            f"Parsing @{st.session_state.current_username}'s archived tweets"
+        ):
+            parsed_tweets = tweets_parser(
+                wayback_tweets, st.session_state.current_username, FIELD_OPTIONS
+            )

-        df, file_name = tweets_exporter(
-            parsed_tweets, st.session_state.current_username, FIELD_OPTIONS
-        )
+        df, file_name = tweets_exporter(
+            parsed_tweets, st.session_state.current_username, FIELD_OPTIONS
+        )

         csv_data = df.to_csv(index=False)
         json_data = df.to_json(orient="records", lines=False)
         html = HTMLTweetsVisualizer(username, json_data)
         html_content = html.generate()

-        st.session_state.count = len(df)
-        st.write(f"**{st.session_state.count} URLs have been captured**")
+        # -- Rendering -- #

-        # -- HTML -- #
+        if csv_data and json_data and html_content:
+            st.session_state.count = len(df)
+            st.write(f"**{st.session_state.count} URLs have been captured**")

-        st.header("HTML", divider="gray")
-        st.write(
-            f"Visualize tweets more efficiently through iframes. Download the @{st.session_state.current_username}'s archived tweets in HTML."  # noqa: E501
-        )
+            # -- HTML -- #

-        col5, col6 = st.columns([1, 18])
+            st.header("HTML", divider="gray")
+            st.write(
+                f"Visualize tweets more efficiently through iframes. Download the @{st.session_state.current_username}'s archived tweets in HTML."  # noqa: E501
+            )

-        with col5:
-            st.image(DOWNLOAD, width=22)
+            col5, col6 = st.columns([1, 18])

-        with col6:
-            b64_html = base64.b64encode(html_content.encode()).decode()
-            href_html = f"data:text/html;base64,{b64_html}"
+            with col5:
+                st.image(DOWNLOAD, width=22)

-            st.markdown(
-                f'<a href="{href_html}" download="{file_name}.html" title="Download {file_name}.html">{file_name}.html</a>',  # noqa: E501
-                unsafe_allow_html=True,
-            )
+            with col6:
+                b64_html = base64.b64encode(html_content.encode()).decode()
+                href_html = f"data:text/html;base64,{b64_html}"

-        st.image(PREVIEW_IMAGE, "Preview image")
+                st.markdown(
+                    f'<a href="{href_html}" download="{file_name}.html" title="Download {file_name}.html">{file_name}.html</a>',  # noqa: E501
+                    unsafe_allow_html=True,
+                )

-        # -- CSV -- #
+            # -- CSV -- #

-        st.header("CSV", divider="gray")
-        st.write(
-            "Check the data returned in the dataframe below and download the file."
-        )
+            st.header("CSV", divider="gray")
+            st.write(
+                "Check the data returned in the dataframe below and download the file."
+            )

-        col7, col8 = st.columns([1, 18])
+            col7, col8 = st.columns([1, 18])

-        with col7:
-            st.image(DOWNLOAD, width=22)
+            with col7:
+                st.image(DOWNLOAD, width=22)

-        with col8:
-            b64_csv = base64.b64encode(csv_data.encode()).decode()
-            href_csv = f"data:file/csv;base64,{b64_csv}"
+            with col8:
+                b64_csv = base64.b64encode(csv_data.encode()).decode()
+                href_csv = f"data:file/csv;base64,{b64_csv}"

-            st.markdown(
-                f'<a href="{href_csv}" download="{file_name}.csv" title="Download {file_name}.csv">{file_name}.csv</a>',  # noqa: E501
-                unsafe_allow_html=True,
-            )
+                st.markdown(
+                    f'<a href="{href_csv}" download="{file_name}.csv" title="Download {file_name}.csv">{file_name}.csv</a>',  # noqa: E501
+                    unsafe_allow_html=True,
+                )

-        st.dataframe(df, use_container_width=True)
+            st.dataframe(df, use_container_width=True)

-        # -- JSON -- #
+            # -- JSON -- #

-        st.header("JSON", divider="gray")
-        st.write("Check the data returned in JSON format below and download the file.")
+            st.header("JSON", divider="gray")
+            st.write(
+                "Check the data returned in JSON format below and download the file."
+            )

-        col9, col10 = st.columns([1, 18])
+            col9, col10 = st.columns([1, 18])

-        with col9:
-            st.image(DOWNLOAD, width=22)
+            with col9:
+                st.image(DOWNLOAD, width=22)

-        with col10:
-            b64_json = base64.b64encode(json_data.encode()).decode()
-            href_json = f"data:file/json;base64,{b64_json}"
+            with col10:
+                b64_json = base64.b64encode(json_data.encode()).decode()
+                href_json = f"data:file/json;base64,{b64_json}"

-            st.markdown(
-                f'<a href="{href_json}" download="{file_name}.json" title="Download {file_name}.json">{file_name}.json</a>',  # noqa: E501
-                unsafe_allow_html=True,
-            )
+                st.markdown(
+                    f'<a href="{href_json}" download="{file_name}.json" title="Download {file_name}.json">{file_name}.json</a>',  # noqa: E501
+                    unsafe_allow_html=True,
+                )

-        st.json(json_data, expanded=False)
+            st.json(json_data, expanded=False)
     except TypeError as e:
         st.error(
             f"""
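The rendering block repeated above for HTML, CSV, and JSON builds its download links by hand rather than with `st.download_button`: each file body is base64-encoded into a `data:` URI and rendered as an `<a download>` tag via `st.markdown`. A stripped-down sketch of the technique, with made-up file contents and name:

```python
import base64

import streamlit as st

# Hypothetical file body and name; the app derives these from the query results.
file_name = "example"
csv_data = "timestamp,original\n20240624000000,https://example.com/\n"

# Base64-encode the text and embed it in a data: URI, so the <a download>
# tag serves the file without a round trip to the server.
b64_csv = base64.b64encode(csv_data.encode()).decode()
href_csv = f"data:file/csv;base64,{b64_csv}"

st.markdown(
    f'<a href="{href_csv}" download="{file_name}.csv">{file_name}.csv</a>',
    unsafe_allow_html=True,
)
```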
Binary file removed assets/preview_image.jpg
2 changes: 1 addition & 1 deletion waybacktweets/_cli.py
@@ -121,7 +121,7 @@ def main(
         username, collapse, timestamp_from, timestamp_to, limit, offset, matchtype
     )

-    print("Making a request to the Internet Archive...")
+    print(f"Waybacking @{username}'s archived tweets...")
     archived_tweets = api.get()

     if archived_tweets:
12 changes: 6 additions & 6 deletions waybacktweets/api/export.py
@@ -97,23 +97,23 @@ def save_to_json(self) -> None:
         """
         Saves the DataFrame to a JSON file.
         """
-        json_file_path = f"{self.filename}.json"
-        self.dataframe.to_json(json_file_path, orient="records", lines=False)
+        json_path = f"{self.filename}.json"
+        self.dataframe.to_json(json_path, orient="records", lines=False)

-        print(f"Saved to {json_file_path}")
+        print(f"Saved to {json_path}")

     def save_to_html(self) -> None:
         """
         Saves the DataFrame to an HTML file.
         """
-        json_file_path = f"{self.filename}.json"
+        json_path = f"{self.filename}.json"

-        if not os.path.exists(json_file_path):
+        if not os.path.exists(json_path):
             self.save_to_json()

         html_file_path = f"{self.filename}.html"

-        html = HTMLTweetsVisualizer(self.username, json_file_path, html_file_path)
+        html = HTMLTweetsVisualizer(self.username, json_path, html_file_path)

         html_content = html.generate()
         html.save(html_content)
3 changes: 2 additions & 1 deletion waybacktweets/api/parse.py
@@ -279,7 +279,8 @@ def parse(self, print_progress=False) -> Dict[str, List[Any]]:
             task = None
             if print_progress:
                 task = progress.add_task(
-                    f"Waybacking @{self.username} tweets\n", total=len(futures)
+                    f"Parsing @{self.username}'s archived tweets\n",
+                    total=len(futures),
                 )

             for future in as_completed(futures):
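The `progress.add_task` call in this hunk appears to follow rich's `Progress` API, ticking a task as thread-pool futures complete. A self-contained sketch of that pattern, with a dummy `parse_one` in place of the real tweet parsing:

```python
from concurrent.futures import ThreadPoolExecutor, as_completed

from rich.progress import Progress


def parse_one(item: int) -> int:
    return item * 2  # dummy workload standing in for parsing one archived tweet


with Progress() as progress:
    with ThreadPoolExecutor(max_workers=4) as executor:
        futures = [executor.submit(parse_one, i) for i in range(20)]
        task = progress.add_task("Parsing archived tweets", total=len(futures))

        for future in as_completed(futures):
            future.result()  # surface any worker exception
            progress.advance(task)  # tick the bar once per completed future
```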