Skip to content

Commit

Permalink
Provide a ScrapingClient that doesn't need API access
Browse files Browse the repository at this point in the history
Also adds the ability to list activities using web scraping instead of
the API. The activities are returned as `ScrapedActivity` objects that
are mostly compatible with the normal `Activity` objects that are
returned by the list activities function that uses the API.
  • Loading branch information
pR0Ps committed Oct 1, 2020
1 parent b81071a commit b2f0204
Showing 1 changed file with 165 additions and 13 deletions.
178 changes: 165 additions & 13 deletions stravaweblib/webclient.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,21 +4,83 @@
import functools
import enum
import re
import uuid

import requests
import stravalib
from stravalib.attributes import Attribute, TimestampAttribute, TimeIntervalAttribute
from stravalib.model import Activity, BaseEntity
from bs4 import BeautifulSoup


__all__ = ["WebClient", "FrameType", "DataFormat", "ActivityFile"]
__all__ = ["WebClient", "ScrapingClient", "FrameType", "DataFormat", "ActivityFile", "ScrapedActivity"]


# Root URL for all scraping requests
BASE_URL = "https://www.strava.com"

# Used for filtering when scraping the activity list
# Maps activity type -> {workout type name -> Strava's numeric filter code}.
# NOTE(review): the None key presumably represents the default/untagged
# workout type for that activity type - confirm against Strava's filter UI.
ACTIVITY_WORKOUT_TYPES = {
    "Ride": {None: 10, "Race": 11, "Workout": 12},
    "Run": {None: 0, "Race": 1, "Long Run": 2, "Workout": 3}
}

# (filename, content) pair returned by the activity data download methods
ActivityFile = namedtuple("ActivityFile", ("filename", "content"))


class ScrapedActivity(BaseEntity):
    """
    Represents an Activity (ride, run, etc.) that was scraped from the website

    The attributes are compatible with stravalib.model.Activity where possible
    (some are missing)
    """

    id = Attribute(int)
    name = Attribute(str)
    description = Attribute(str)
    type = Attribute(str)
    workout_type = Attribute(str)

    start_date = TimestampAttribute()
    distance = Attribute(float)
    moving_time = TimeIntervalAttribute()
    elapsed_time = TimeIntervalAttribute()
    total_elevation_gain = Attribute(float)
    suffer_score = Attribute(int)
    calories = Attribute(float)
    gear_id = Attribute(str)

    has_latlng = Attribute(bool)

    trainer = Attribute(bool)
    commute = Attribute(bool)
    private = Attribute(bool)
    flagged = Attribute(bool)

    def from_dict(self, d):
        """
        Normalize the scraped JSON into stravalib-compatible attribute names
        before deferring to BaseEntity.from_dict.

        All key accesses are tolerant of missing keys so a change in Strava's
        scraped payload degrades to unset attributes instead of a KeyError.
        """
        # Strava reports gear as either a bike id or a shoe id; stravalib's
        # gear_id convention prefixes them with "b"/"g" respectively
        bike_id = d.get("bike_id")
        shoes_id = d.get("athlete_gear_id")
        if bike_id:
            d["gear_id"] = "b{}".format(bike_id)
        elif shoes_id:
            d["gear_id"] = "g{}".format(shoes_id)

        # Rename the raw scraped fields to their stravalib equivalents
        # (skip any that are absent rather than crashing on d.pop())
        for dst, src in (
            ("start_date", "start_time"),
            ("distance", "distance_raw"),
            ("moving_time", "moving_time_raw"),
            ("elapsed_time", "elapsed_time_raw"),
            ("total_elevation_gain", "elevation_gain_raw"),
        ):
            if src in d:
                d[dst] = d.pop(src)

        # Map the numeric workout type code back to its name (eg. 1 -> "Race")
        wt = d.pop("workout_type", None)
        for name, code in ACTIVITY_WORKOUT_TYPES.get(d.get("type"), {}).items():
            if wt == code:
                d["workout_type"] = name
                break

        return super().from_dict(d)


class DataFormat(enum.Enum):
ORIGINAL = "original"
GPX = "gpx"
Expand All @@ -45,29 +107,24 @@ def __str__(self):
return str(self.name).replace("_", " ").title()


class WebClient(stravalib.Client):
"""
An extension to the stravalib Client that fills in some of the gaps in
the official API using web scraping.
class ScrapingClient:
"""
A client that uses web scraping to interface with Strava.
def __init__(self, *args, **kwargs):
# Docstring set manually after class definition
Can be used as a mixin to add the extra methods to the main stravalib.Client
"""

email = kwargs.pop("email", None)
password = kwargs.pop("password", None)
def __init__(self, *args, email, password, **kwargs):
    """
    Set up the scraping session and log in to the Strava website

    :param email: email address to log in with (required, keyword-only)
    :param password: password to log in with (required, keyword-only)

    Any other arguments are passed through to the next class in the MRO
    (eg. stravalib.Client when this is used as a mixin).

    :raises ValueError: if email or password is missing/empty
    """
    if not (email and password):
        raise ValueError("'email' and 'password' kwargs are required")

    # Per-page CSRF tokens and cached page component data
    self._csrf = {}
    self._component_data = {}

    # A single session is shared by all scraping requests so the login
    # cookies persist across calls
    session = requests.Session()
    session.headers.update({
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"
    })
    self._session = session

    self._login(email, password)

    # Continue the cooperative init chain with the remaining args
    super().__init__(*args, **kwargs)

def _login(self, email, password):
Expand Down Expand Up @@ -101,6 +158,89 @@ def _login(self, email, password):
if not resp.is_redirect or resp.next.url == login_url:
raise stravalib.exc.LoginFailed("Couldn't log in to website, check creds")

def scrape_activites(self, keywords=None, activity_type=None, workout_type=None,
                     commute=False, is_private=False, indoor=False, gear_id=None):
    """A scraping-based alternative to stravalib.Client.get_activities()

    Note that when using multiple parameters they are treated as AND, not OR

    :param keywords: Text to search for
    :param activity_type: The type of the activity. See stravalib.model:Activity.TYPES
    :param workout_type: The type of workout ("Race", "Workout", etc)
    :param commute: Only return activities marked as commutes
    :param is_private: Only return private activities
    :param indoor: Only return indoor/trainer activities
    :param gear_id: Only return activities using this gear

    :yield: ScrapedActivity objects

    :raises ValueError: if the filter parameters are invalid or inconsistent
    :raises stravalib.exc.Fault: if Strava returns an error or malformed data
    """
    # NOTE(review): the typo in the method name ("activites") is kept so
    # existing callers don't break

    if activity_type is not None and activity_type not in Activity.TYPES:
        raise ValueError(
            "Invalid activity type. Must be one of: {}".format(",".join(Activity.TYPES))
        )

    if activity_type in ACTIVITY_WORKOUT_TYPES:
        # Translate the workout type name to Strava's numeric filter code.
        # NOTE(review): a workout_type of None maps to the table's default
        # code (10 for rides, 0 for runs) - confirm this is intended rather
        # than "no workout type filter"
        workout_type = ACTIVITY_WORKOUT_TYPES[activity_type].get(workout_type)
        if workout_type is None:
            # str() the keys - the table contains a None key which would
            # make a plain str.join raise TypeError
            raise ValueError(
                "Invalid workout type for a {}. Must be one of: {}".format(
                    activity_type,
                    ", ".join(str(k) for k in ACTIVITY_WORKOUT_TYPES[activity_type])
                )
            )
    elif workout_type is not None or gear_id is not None:
        raise ValueError(
            "Can only filter using workout type or gear when activity type is one of: {}".format(
                ", ".join(ACTIVITY_WORKOUT_TYPES.keys())
            )
        )

    page = 1
    per_page = 20
    search_session_id = uuid.uuid4()

    def conv_bool(x):
        # Strava's filter params expect "true" or an empty string (not "false")
        return "true" if x else ""

    while True:
        resp = self._session.get(
            "{}/athlete/training_activities".format(BASE_URL),
            headers={
                "Accept": "text/javascript, application/javascript, application/ecmascript, application/x-ecmascript",
                #"X-CSRF-Token": next(iter(self._csrf.values())),
                "X-Requested-With": "XMLHttpRequest",
            },
            params={
                "search_session_id": search_session_id,
                "page": page,
                "per_page": per_page,
                "keywords": keywords,
                "new_activity_only": "false",
                "activity_type": activity_type or "",
                # Previously computed but never sent - include the filter
                "workout_type": workout_type if workout_type is not None else "",
                "commute": conv_bool(commute),
                "private_activities": conv_bool(is_private),
                "trainer": conv_bool(indoor),
                "gear": gear_id or "",
            }
        )
        if resp.status_code != 200:
            raise stravalib.exc.Fault(
                "Failed to list activities (status code {})".format(resp.status_code)
            )
        try:
            data = resp.json()["models"]
        except (ValueError, TypeError, KeyError) as e:
            raise stravalib.exc.Fault(
                "Invalid JSON response from Strava"
            ) from e

        for activity in data:
            yield ScrapedActivity(**activity)

        # No results = stop requesting pages
        # (was `if not models:` - an undefined name that raised NameError
        # after the first page; the page counter was also never incremented)
        if not data:
            break
        page += 1

def delete_activity(self, activity_id):
"""
Deletes the specified activity.
Expand All @@ -117,8 +257,7 @@ def delete_activity(self, activity_id):
"Failed to delete activity (status code: {})".format(resp.status_code),
)

def get_activity_data(self, activity_id, fmt=DataFormat.ORIGINAL,
json_fmt=None):
def get_activity_data(self, activity_id, fmt=DataFormat.ORIGINAL, json_fmt=None):
"""
Get a file containing the provided activity's data
Expand Down Expand Up @@ -270,6 +409,19 @@ def get_bike_components(self, bike_id, on_date=None):
else:
return components


# Mix in the ScrapingClient to inherit all its methods
class WebClient(ScrapingClient, stravalib.Client):
    """
    An extension to the stravalib Client that fills in some of the gaps in
    the official API using web scraping.

    Requires a username and password
    """
    # ScrapingClient comes first in the MRO so its __init__ consumes the
    # email/password kwargs before stravalib.Client's __init__ runs
    def __init__(self, *args, **kwargs):
        # Docstring set manually after class definition
        # (combined with stravalib.Client's __init__ docstring below)
        super().__init__(*args, **kwargs)

# Inherit parent documentation for WebClient.__init__
WebClient.__init__.__doc__ = stravalib.Client.__init__.__doc__ + \
"""
Expand Down

0 comments on commit b2f0204

Please sign in to comment.