add matchtype option, review tweet parser, review docs
claromes committed Jun 15, 2024
1 parent 88ef5e2 commit a91c460
Showing 14 changed files with 322 additions and 209 deletions.
13 changes: 13 additions & 0 deletions .streamlit/config.toml
@@ -0,0 +1,13 @@
[theme]
base = "light"
primaryColor = "black"
secondaryBackgroundColor = "gainsboro"
textColor = "black"
backgroundColor = "whitesmoke"
font = "sans serif"

[client]
toolbarMode = "minimal"

[server]
port = 8501
5 changes: 3 additions & 2 deletions README.md
@@ -1,6 +1,6 @@
# Wayback Tweets

[![PyPI](https://img.shields.io/pypi/v/waybacktweets)](https://pypi.org/project/waybacktweets)
[![PyPI](https://img.shields.io/pypi/v/waybacktweets)](https://pypi.org/project/waybacktweets) [![Streamlit App](https://static.streamlit.io/badges/streamlit_badge_black_white.svg)](https://waybacktweets.streamlit.app)

Retrieves archived tweets' CDX data from the Wayback Machine, performs the necessary parsing, and saves the data in CSV, JSON, and HTML formats.

@@ -32,8 +32,9 @@ timestamp_from = parse_date("20150101")
timestamp_to = parse_date("20191231")
limit = 250
offset = 0
matchtype = "exact"

api = WaybackTweets(username, collapse, timestamp_from, timestamp_to, limit, offset)
api = WaybackTweets(username, collapse, timestamp_from, timestamp_to, limit, offset, matchtype)

archived_tweets = api.get()
```
11 changes: 0 additions & 11 deletions app/.streamlit/config.toml

This file was deleted.

320 changes: 172 additions & 148 deletions app/app.py

Large diffs are not rendered by default.

48 changes: 24 additions & 24 deletions app/assets/parthenon.svg
6 changes: 3 additions & 3 deletions docs/api.rst
@@ -22,9 +22,8 @@ Parse
.. autoclass:: TwitterEmbed
:members:

.. TODO: JSON Issue
.. .. autoclass:: JsonParser
.. :members:
.. autoclass:: JsonParser
:members:


Export
@@ -55,6 +54,7 @@ Utils
.. autofunction:: clean_tweet_url
.. autofunction:: clean_wayback_machine_url
.. autofunction:: delete_tweet_pathnames
.. autofunction:: is_tweet_url
.. autofunction:: get_response
.. autofunction:: parse_date
.. autofunction:: semicolon_parser
35 changes: 35 additions & 0 deletions docs/cli.rst
@@ -40,3 +40,38 @@ However, it is possible to use it with other options. Read the text below, extracted
- Ex: Only show unique urls in a prefix query (filtering out captures except first capture of a given url). This is similar to the old prefix query in wayback (note: this query may be slow at the moment):

http://web.archive.org/cdx/search/cdx?url=archive.org&collapse=urlkey&matchType=prefix
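The example query above can also be assembled programmatically. The sketch below is hypothetical helper code (not part of waybacktweets) that builds the same kind of CDX request URL:

```python
from urllib.parse import urlencode

# Endpoint taken from the example queries in these docs.
CDX_ENDPOINT = "http://web.archive.org/cdx/search/cdx"

def build_cdx_query(url: str, **params: str) -> str:
    """Return a Wayback CDX Server query URL for `url` with extra params."""
    return f"{CDX_ENDPOINT}?{urlencode({'url': url, **params})}"

# Unique URLs in a prefix query, mirroring the example above:
print(build_cdx_query("archive.org", collapse="urlkey", matchType="prefix"))
```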


URL Match Scope
-----------------

The CDX Server can return results matching a certain prefix, a certain host or all subdomains by using the ``matchType`` param.

For example, with the value ``prefix``, it is possible to retrieve URLs beyond ``/status/``.

Read the text below, extracted from the official Wayback CDX Server API (Beta) documentation.

.. note::

For example, if given the url: archive.org/about/ and:

- ``matchType=exact`` (default if omitted) will return results matching exactly archive.org/about/

- ``matchType=prefix`` will return results for all results under the path archive.org/about/

http://web.archive.org/cdx/search/cdx?url=archive.org/about/&matchType=prefix&limit=1000

- ``matchType=host`` will return results from host archive.org

http://web.archive.org/cdx/search/cdx?url=archive.org/about/&matchType=host&limit=1000

- ``matchType=domain`` will return results from host archive.org and all subhosts \*.archive.org

http://web.archive.org/cdx/search/cdx?url=archive.org/about/&matchType=domain&limit=1000

The ``matchType`` may also be set implicitly by using a wildcard '*' at the end or beginning of the url:

- If the url ends in '/*', e.g. url=archive.org/*, the query is equivalent to url=archive.org/&matchType=prefix
- If the url starts with '*.', e.g. url=*.archive.org/, the query is equivalent to url=archive.org/&matchType=domain

(Note: The domain mode is only available if the CDX is in `SURT <http://crawler.archive.org/articles/user_manual/glossary.html#surt>`_-order format.)
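The wildcard equivalences above can be expressed as a small normalizer. `normalize_match` is a hypothetical sketch based only on the two rules stated in the note; the real CDX server applies these rules server-side:

```python
def normalize_match(url: str) -> tuple:
    """Map wildcard URL forms to (url, matchType), per the rules above."""
    if url.endswith("/*"):
        return url[:-1], "prefix"   # archive.org/* -> archive.org/, prefix
    if url.startswith("*."):
        return url[2:], "domain"    # *.archive.org/ -> archive.org/, domain
    return url, "exact"             # default when no wildcard is present
```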
6 changes: 3 additions & 3 deletions docs/errors.rst
@@ -17,10 +17,10 @@ This error is raised when the package fails to establish a new connection with w

The output message from the package would be: ``Failed to establish a new connection with web.archive.org. Max retries exceeded.``

.. TODO: JSON Issue
.. This is the error often returned when performing experimental parsing of URLs with the mimetype ``application/json``.

.. The warning output message from the package would be: ``Connection error with https://web.archive.org/web/<TIMESTAMP>/https://twitter.com/<USERNAME>/status/<TWEET_ID>. Max retries exceeded. Error parsing the JSON, but the CDX data was saved.``
This is the error often returned when performing experimental parsing of URLs with the mimetype ``application/json``.

The warning output message from the package would be: ``Connection error with https://web.archive.org/web/<TIMESTAMP>/https://twitter.com/<USERNAME>/status/<TWEET_ID>. Max retries exceeded. Error parsing the JSON, but the CDX data was saved.``

HTTPError
----------------
3 changes: 2 additions & 1 deletion docs/quickstart.rst
@@ -29,8 +29,9 @@ Using Wayback Tweets as a Python Module
timestamp_to = parse_date("20191231")
limit = 250
offset = 0
matchtype = "exact"
api = WaybackTweets(username, collapse, timestamp_from, timestamp_to, limit, offset)
api = WaybackTweets(username, collapse, timestamp_from, timestamp_to, limit, offset, matchtype)
archived_tweets = api.get()
9 changes: 4 additions & 5 deletions docs/streamlit.rst
@@ -6,18 +6,17 @@ Application that displays multiple archived tweets on Wayback Machine to avoid op

Filters
----------
- Filtering by date range: Using the `from` and `to` filters

- Calendar: Filtering by date range
- Only unavailable tweets: Checks if the archived URL still exists on Twitter (see the `flowchart <workflow.html>`_)

- Checkbox: Only tweets not available

- Checkbox: Only unique URLs (filtering by the collapse option using the urlkey field)
- Only unique URLs: Filtering by the collapse option using the ``urlkey`` field and the URL Match Scope ``prefix``


Pagination
------------

Pagination is automatic and allows viewing up to 25 tweets per page. This is a fixed value due to the API rate limit.
Pagination allows viewing up to 25 tweets per page. This helps avoid API rate limiting when parsing returns with the mimetype ``application/json``.
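Given the fixed page size, the offset into the results can be computed as below. `page_offset` is a hypothetical helper for illustration, not the app's actual implementation:

```python
TWEETS_PER_PAGE = 25  # fixed page size, per the note above

def page_offset(page: int) -> int:
    """Offset into the parsed results for a 1-indexed page number."""
    if page < 1:
        raise ValueError("pages are 1-indexed")
    return (page - 1) * TWEETS_PER_PAGE
```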


Community Comments
28 changes: 21 additions & 7 deletions waybacktweets/api/parse_tweets.py
Expand Up @@ -14,6 +14,7 @@
clean_tweet_url,
delete_tweet_pathnames,
get_response,
is_tweet_url,
semicolon_parser,
)

@@ -95,7 +96,9 @@ def embed(self) -> Optional[Tuple[List[str], List[bool], List[str]]]:
# TODO: JSON Issue - Create separate function to handle JSON return without hitting rate limiting # noqa: E501
class JsonParser:
"""
Class responsible for parsing tweets when the mimetype is application/json.
Class responsible for parsing tweets when the mimetype is application/json.\n
Note: This class is in an experimental phase, but it is currently being
used by the Streamlit Web App.
:param archived_tweet_url: The URL of the archived tweet to be parsed.
"""
@@ -201,13 +204,24 @@ def _process_response(self, response: List[str]) -> None:
encoded_parsed_tweet = semicolon_parser(original_tweet)
encoded_parsed_archived_tweet = semicolon_parser(parsed_wayback_machine_url)

embed_parser = TwitterEmbed(encoded_tweet)
content = embed_parser.embed()
available_tweet_text = None
available_tweet_is_RT = None
available_tweet_info = None

if content:
self._add_field("available_tweet_text", semicolon_parser(content[0][0]))
self._add_field("available_tweet_is_RT", content[1][0])
self._add_field("available_tweet_info", semicolon_parser(content[2][0]))
is_tweet = is_tweet_url(encoded_tweet)

if is_tweet:
embed_parser = TwitterEmbed(encoded_tweet)
content = embed_parser.embed()

if content:
available_tweet_text = semicolon_parser(content[0][0])
available_tweet_is_RT = content[1][0]
available_tweet_info = semicolon_parser(content[2][0])

self._add_field("available_tweet_text", available_tweet_text)
self._add_field("available_tweet_is_RT", available_tweet_is_RT)
self._add_field("available_tweet_info", available_tweet_info)

# TODO: JSON Issue
# parsed_text_json = ""
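The diff above imports an `is_tweet_url` helper whose body is not shown in this excerpt. A plausible sketch is below (hypothetical; the real function in waybacktweets' utils may behave differently):

```python
def is_tweet_url(url: str) -> bool:
    """Heuristic check that a URL points at an individual tweet.

    Hypothetical sketch: accepts URLs whose /status/ segment is followed
    by a numeric tweet ID.
    """
    if "/status/" not in url:
        return False
    # The path segment right after /status/ should be a numeric tweet ID.
    tweet_id = url.rstrip("/").split("/status/")[-1].split("/")[0]
    return tweet_id.isdigit()
```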