From 81a6bca560e1535897c53ce0bc6f56dc49f617d6 Mon Sep 17 00:00:00 2001
From: Claromes
Date: Mon, 17 Jun 2024 20:15:41 -0300
Subject: [PATCH] review tweet url parser, update docs

---
 README.md                             |  2 +-
 docs/api.rst                          |  6 ++++++
 docs/field_options.rst                |  9 +++------
 docs/index.rst                        |  2 +-
 docs/todo.rst                         | 13 ++++++------
 waybacktweets/api/parse.py            |  4 ++++
 waybacktweets/config/__init__.py      |  1 +
 waybacktweets/config/field_options.py | 19 ++++++++++++++++++
 waybacktweets/utils/utils.py          | 28 ++++++++++++++++++---------
 9 files changed, 60 insertions(+), 24 deletions(-)
 create mode 100644 waybacktweets/config/field_options.py

diff --git a/README.md b/README.md
index 1ddd1b3..12672c6 100644
--- a/README.md
+++ b/README.md
@@ -2,7 +2,7 @@
 [![PyPI](https://img.shields.io/pypi/v/waybacktweets)](https://pypi.org/project/waybacktweets)
 [![Streamlit App](https://static.streamlit.io/badges/streamlit_badge_black_white.svg)](https://waybacktweets.streamlit.app)
 
-Retrieves archived tweets CDX data from the Wayback Machine, performs necessary parsing, and saves the data in CSV, JSON, and HTML formats.
+Retrieves archived tweets CDX data from the Wayback Machine, parses it in several ways to facilitate the analysis of archived tweets and of the different tweet types (see [Field Options](https://claromes.github.io/waybacktweets/field_options.html)), and saves the data in CSV, JSON, and HTML formats.
 
 ## Installation
 
diff --git a/docs/api.rst b/docs/api.rst
index 2cfa7af..d0eb615 100644
--- a/docs/api.rst
+++ b/docs/api.rst
@@ -9,6 +9,8 @@ Request
 .. autoclass:: WaybackTweets
     :members:
 
+.. _parser:
+
 Parse
 ---------
 
@@ -24,6 +26,8 @@ Parse
 .. autoclass:: JsonParser
     :members:
 
+.. _exporter:
+
 Export
 ---------
 
@@ -42,6 +46,8 @@ Visualize
     :members:
     :private-members:
 
+.. _utils:
+
 Utils
 -------
 
diff --git a/docs/field_options.rst b/docs/field_options.rst
index 02f4228..358e7fa 100644
--- a/docs/field_options.rst
+++ b/docs/field_options.rst
@@ -3,7 +3,7 @@
 Field Options
 ================
 
-The package saves in three formats: CSV, JSON, and HTML. The files have the following fields:
+The package parses the archived data in several ways to facilitate the analysis of archived tweets and of the different tweet types. The fields below are available and can be passed to the :ref:`parser` and the :ref:`exporter`; the command line tool returns all of these fields.
 
 - ``archived_urlkey``: (`str`) A canonical transformation of the URL you supplied, for example, ``org,eserver,tc)/``. Such keys are useful for indexing.
 
@@ -13,12 +13,9 @@
 
 - ``archived_tweet_url``: (`str`) The original archived URL.
 
-- ``parsed_tweet_url``: (`str`) The original tweet URL after parsing. `Check the utility functions `_.
+- ``parsed_tweet_url``: (`str`) The original tweet URL after parsing. Old URLs were archived in a nested manner; the parsing applied here unnests these URLs when necessary. Check the :ref:`utils`.
 
-- ``parsed_archived_tweet_url``: (`str`) The original archived URL after parsing. `Check the utility functions `_.
-
-.. TODO: JSON Issue
-.. - ``parsed_tweet_text_mimetype_json``: (`str`) The tweet text extracted from the archived URL that has mimetype ``application/json``.
+- ``parsed_archived_tweet_url``: (`str`) The original archived URL after parsing. It is not guaranteed that this URL was archived; it is provided as a convenience, since the originally archived URL does not always exist due to changes in Twitter's URLs and web services. Check the :ref:`utils`.
 
 - ``available_tweet_text``: (`str`) The tweet text extracted from the URL that is still available on the Twitter account.
 
diff --git a/docs/index.rst b/docs/index.rst
index ec945d2..4912f14 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -6,7 +6,7 @@ Wayback Tweets
 .. image:: ../assets/waybacktweets.png
     :align: center
 
-Retrieves archived tweets CDX data from the Wayback Machine, performs necessary parsing, and saves the data in CSV, JSON, and HTML formats.
+Retrieves archived tweets CDX data from the Wayback Machine, parses it in several ways to facilitate the analysis of archived tweets and of the different tweet types (see :ref:`field_options`), and saves the data in CSV, JSON, and HTML formats.
 
 .. note::
     Intensive queries can lead to rate limiting, resulting in a temporary ban of a few minutes from web.archive.org.
diff --git a/docs/todo.rst b/docs/todo.rst
index b684cae..20566e6 100644
--- a/docs/todo.rst
+++ b/docs/todo.rst
@@ -5,15 +5,16 @@ TODO
 
-|uncheck| JSON Parser: Create a separate function to handle JSON return, apply JsonParser (``waybacktweets/api/parse.py:111``), and avoid rate limiting (`Planned for v1.2`)
+|uncheck| Review and publish the new version of the Streamlit Web App
 
-|uncheck| Download images when tweet URL has extensions like JPG or PNG (`Planned for v1.2`)
+|uncheck| Unit Tests
 
-|uncheck| Develop a scraper to download snapshots from https://archive.today (`Not planned`)
+|uncheck| JSON Parser: Create a separate function to handle JSON return, apply JsonParser (``waybacktweets/api/parse.py:111``), and avoid rate limiting
 
-|uncheck| Unit Tests (`Planned for v1.1`)
+|uncheck| Download images when tweet URL has extensions like JPG or PNG
 
-|uncheck| Mapping and parsing of other Twitter-related URLs (`Planned`)
+|uncheck| Implement logging system (remove print statements)
 
-|uncheck| Review and publish the new version of the Streamlit Web App (`Planned for v1.0.1`)
+|uncheck| Mapping and parsing of other Twitter-related URLs
 
+|uncheck| Develop a scraper to download snapshots from https://archive.today
 
 
diff --git a/waybacktweets/api/parse.py b/waybacktweets/api/parse.py
index b6f604a..19228f0 100644
--- a/waybacktweets/api/parse.py
+++ b/waybacktweets/api/parse.py
@@ -12,6 +12,7 @@
 from rich.progress import Progress
 
 from waybacktweets.config.config import config
+from waybacktweets.config.field_options import FIELD_OPTIONS
 from waybacktweets.exceptions.exceptions import (
     ConnectionError,
     GetResponseError,
@@ -167,6 +168,9 @@ def __init__(
         username: str,
         field_options: List[str],
     ):
+        if not all(option in FIELD_OPTIONS for option in field_options):
+            raise ValueError("Some field options are not valid.")
+
         self.archived_tweets_response = archived_tweets_response
         self.username = username
         self.field_options = field_options
diff --git a/waybacktweets/config/__init__.py b/waybacktweets/config/__init__.py
index 457fa4f..8237133 100644
--- a/waybacktweets/config/__init__.py
+++ b/waybacktweets/config/__init__.py
@@ -1,3 +1,4 @@
 # flake8: noqa: F401
 
 from waybacktweets.config.config import config
+from waybacktweets.config.field_options import FIELD_OPTIONS
diff --git a/waybacktweets/config/field_options.py b/waybacktweets/config/field_options.py
new file mode 100644
index 0000000..9c1fcb6
--- /dev/null
+++ b/waybacktweets/config/field_options.py
@@ -0,0 +1,19 @@
+"""
+List of valid field options that can be used for parsing tweets.
+"""
+
+FIELD_OPTIONS = [
+    "archived_urlkey",
+    "archived_timestamp",
+    "original_tweet_url",
+    "archived_tweet_url",
+    "parsed_tweet_url",
+    "parsed_archived_tweet_url",
+    "available_tweet_text",
+    "available_tweet_is_RT",
+    "available_tweet_info",
+    "archived_mimetype",
+    "archived_statuscode",
+    "archived_digest",
+    "archived_length",
+]
diff --git a/waybacktweets/utils/utils.py b/waybacktweets/utils/utils.py
index ada66b8..89be2b6 100644
--- a/waybacktweets/utils/utils.py
+++ b/waybacktweets/utils/utils.py
@@ -114,21 +114,29 @@ def clean_wayback_machine_url(
 
 def check_pattern_tweet(tweet_url: str) -> str:
     """
-    Extracts the tweet ID from a tweet URL.
+    Extracts the nested URL from a tweet URL, for patterns such as:
+
+    - Reply: /status//
+    - Link: /status///
+    - Twimg: /status/https://pbs
 
     Args:
-        tweet_url (str): The tweet URL to extract the ID from.
+        tweet_url (str): The tweet URL to extract the nested URL from.
 
     Returns:
-        The extracted tweet ID.
+        The extracted URL, or the original tweet URL if no pattern matches.
     """
-    pattern = re.compile(r'/status/"([^"]+)"')
-
-    match = pattern.search(tweet_url)
-    if match:
-        return match.group(1).lstrip("/")
-    else:
-        return tweet_url
+    patterns = [
+        re.compile(r'/status/"([^"]+)"'),
+        re.compile(r'/status/%3B([^"]+)%3B'),
+    ]
+
+    for pattern in patterns:
+        match = pattern.search(tweet_url)
+        if match:
+            return match.group(1).lstrip("/")
+
+    return tweet_url
 
 
 def delete_tweet_pathnames(tweet_url: str) -> str:
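
Note (not part of the patch): a minimal sketch of how the new field-option validation is expected to behave. It assumes the ``__init__`` changed at ``waybacktweets/api/parse.py:167`` belongs to the ``TweetsParser`` class referenced in the docs, that it is importable as shown, and that an empty CDX response list is acceptable for construction; the username and option values are hypothetical.

```python
# Sketch only: exercises the FIELD_OPTIONS check added in this patch.
from waybacktweets.api.parse import TweetsParser  # assumed class name
from waybacktweets.config import FIELD_OPTIONS

print(FIELD_OPTIONS)  # the 13 field names defined in config/field_options.py

try:
    # "tweet_id" is not in FIELD_OPTIONS, so the constructor should raise.
    TweetsParser([], "jack", ["archived_timestamp", "tweet_id"])
except ValueError as error:
    print(error)  # "Some field options are not valid."
```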
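Note (not part of the patch): a sketch of the expected behaviour of the reworked ``check_pattern_tweet``, where the original URL is returned only after every pattern has been tried. The nested URL below is a made-up example of the quoted ``/status/"..."`` form; the real archived variants are only partially listed in the docstring.

```python
# Sketch only: the nested URL is hypothetical, built to match the quoted
# /status/"..." pattern compiled in check_pattern_tweet.
from waybacktweets.utils.utils import check_pattern_tweet

nested = 'https://twitter.com/claromes/status/"/claromes/status/1234567890"'
print(check_pattern_tweet(nested))  # -> claromes/status/1234567890

plain = "https://twitter.com/claromes/status/1234567890"
print(check_pattern_tweet(plain))  # no pattern matches, URL returned unchanged
```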