Skip to content

Commit

Permalink
Provide a ScrapingClient that doesn't need API access
Browse files Browse the repository at this point in the history
Also adds the ability to list activities using web scraping instead of
the API. The activities are returned as `ScrapedActivity` objects that
are mostly compatible with the normal `Activity` objects that are
returned by the list activities function that uses the API.
  • Loading branch information
pR0Ps committed Oct 1, 2020
1 parent b81071a commit b2f0204
Showing 1 changed file with 165 additions and 13 deletions.
178 changes: 165 additions & 13 deletions stravaweblib/webclient.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,21 +4,83 @@
import functools
import enum
import re
import uuid

import requests
import stravalib
from stravalib.attributes import Attribute, TimestampAttribute, TimeIntervalAttribute
from stravalib.model import Activity, BaseEntity
from bs4 import BeautifulSoup


__all__ = ["WebClient", "FrameType", "DataFormat", "ActivityFile"]
__all__ = ["WebClient", "ScrapingClient", "FrameType", "DataFormat", "ActivityFile", "ScrapedActivity"]


# Root URL for all scraping requests
BASE_URL = "https://www.strava.com"

# Used for filtering when scraping the activity list
# Maps activity type -> {workout type name -> Strava's numeric filter code}.
# NOTE(review): the None key presumably represents the default/untagged
# workout type for that activity type - confirm against Strava's filter UI.
ACTIVITY_WORKOUT_TYPES = {
    "Ride": {None: 10, "Race": 11, "Workout": 12},
    "Run": {None: 0, "Race": 1, "Long Run": 2, "Workout": 3}
}

# (filename, content) pair returned by the activity data download methods
ActivityFile = namedtuple("ActivityFile", ("filename", "content"))


class ScrapedActivity(BaseEntity):
    """
    Represents an Activity (ride, run, etc.) that was scraped from the website

    The attributes are compatible with stravalib.model.Activity where possible
    (some are missing)
    """

    id = Attribute(int)
    name = Attribute(str)
    description = Attribute(str)
    type = Attribute(str)
    workout_type = Attribute(str)

    start_date = TimestampAttribute()
    distance = Attribute(float)
    moving_time = TimeIntervalAttribute()
    elapsed_time = TimeIntervalAttribute()
    total_elevation_gain = Attribute(float)
    suffer_score = Attribute(int)
    calories = Attribute(float)
    gear_id = Attribute(str)

    has_latlng = Attribute(bool)

    trainer = Attribute(bool)
    commute = Attribute(bool)
    private = Attribute(bool)
    flagged = Attribute(bool)

    def from_dict(self, d):
        """
        Normalize the scraped JSON into stravalib-compatible attribute names
        before deferring to BaseEntity.from_dict.

        All key accesses are tolerant of missing keys so a change in Strava's
        scraped payload degrades to unset attributes instead of a KeyError.
        """
        # Strava reports gear as either a bike id or a shoe id; stravalib's
        # gear_id convention prefixes them with "b"/"g" respectively
        bike_id = d.get("bike_id")
        shoes_id = d.get("athlete_gear_id")
        if bike_id:
            d["gear_id"] = "b{}".format(bike_id)
        elif shoes_id:
            d["gear_id"] = "g{}".format(shoes_id)

        # Rename the raw scraped fields to their stravalib equivalents
        # (skip any that are absent rather than crashing on d.pop())
        for dst, src in (
            ("start_date", "start_time"),
            ("distance", "distance_raw"),
            ("moving_time", "moving_time_raw"),
            ("elapsed_time", "elapsed_time_raw"),
            ("total_elevation_gain", "elevation_gain_raw"),
        ):
            if src in d:
                d[dst] = d.pop(src)

        # Map the numeric workout type code back to its name (eg. 1 -> "Race")
        wt = d.pop("workout_type", None)
        for name, code in ACTIVITY_WORKOUT_TYPES.get(d.get("type"), {}).items():
            if wt == code:
                d["workout_type"] = name
                break

        return super().from_dict(d)


class DataFormat(enum.Enum):
ORIGINAL = "original"
GPX = "gpx"
Expand All @@ -45,29 +107,24 @@ def __str__(self):
return str(self.name).replace("_", " ").title()


class WebClient(stravalib.Client):
"""
An extension to the stravalib Client that fills in some of the gaps in
the official API using web scraping.
class ScrapingClient:
"""
A client that uses web scraping to interface with Strava.
def __init__(self, *args, **kwargs):
# Docstring set manually after class definition
Can be used as a mixin to add the extra methods to the main stravalib.Client
"""

email = kwargs.pop("email", None)
password = kwargs.pop("password", None)
def __init__(self, *args, email, password, **kwargs):
    """
    Set up the scraping session and log in to the Strava website

    :param email: email address to log in with (required, keyword-only)
    :param password: password to log in with (required, keyword-only)

    Any other arguments are passed through to the next class in the MRO
    (eg. stravalib.Client when this is used as a mixin).

    :raises ValueError: if email or password is missing/empty
    """
    if not (email and password):
        raise ValueError("'email' and 'password' kwargs are required")

    # Per-page CSRF tokens and cached page component data
    self._csrf = {}
    self._component_data = {}

    # A single session is shared by all scraping requests so the login
    # cookies persist across calls
    session = requests.Session()
    session.headers.update({
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"
    })
    self._session = session

    self._login(email, password)

    # Continue the cooperative init chain with the remaining args
    super().__init__(*args, **kwargs)

def _login(self, email, password):
Expand Down Expand Up @@ -101,6 +158,89 @@ def _login(self, email, password):
if not resp.is_redirect or resp.next.url == login_url:
raise stravalib.exc.LoginFailed("Couldn't log in to website, check creds")

def scrape_activites(self, keywords=None, activity_type=None, workout_type=None,
                     commute=False, is_private=False, indoor=False, gear_id=None):
    """A scraping-based alternative to stravalib.Client.get_activities()

    Note that when using multiple parameters they are treated as AND, not OR

    :param keywords: Text to search for
    :param activity_type: The type of the activity. See stravalib.model:Activity.TYPES
    :param workout_type: The type of workout ("Race", "Workout", etc)
    :param commute: Only return activities marked as commutes
    :param is_private: Only return private activities
    :param indoor: Only return indoor/trainer activities
    :param gear_id: Only return activities using this gear

    :yield: ScrapedActivity objects

    :raises ValueError: if the filter parameters are invalid or inconsistent
    :raises stravalib.exc.Fault: if Strava returns an error or malformed data
    """
    # NOTE(review): the typo in the method name ("activites") is kept so
    # existing callers don't break

    if activity_type is not None and activity_type not in Activity.TYPES:
        raise ValueError(
            "Invalid activity type. Must be one of: {}".format(",".join(Activity.TYPES))
        )

    if activity_type in ACTIVITY_WORKOUT_TYPES:
        # Translate the workout type name to Strava's numeric filter code.
        # NOTE(review): a workout_type of None maps to the table's default
        # code (10 for rides, 0 for runs) - confirm this is intended rather
        # than "no workout type filter"
        workout_type = ACTIVITY_WORKOUT_TYPES[activity_type].get(workout_type)
        if workout_type is None:
            # str() the keys - the table contains a None key which would
            # make a plain str.join raise TypeError
            raise ValueError(
                "Invalid workout type for a {}. Must be one of: {}".format(
                    activity_type,
                    ", ".join(str(k) for k in ACTIVITY_WORKOUT_TYPES[activity_type])
                )
            )
    elif workout_type is not None or gear_id is not None:
        raise ValueError(
            "Can only filter using workout type or gear when activity type is one of: {}".format(
                ", ".join(ACTIVITY_WORKOUT_TYPES.keys())
            )
        )

    page = 1
    per_page = 20
    search_session_id = uuid.uuid4()

    def conv_bool(x):
        # Strava's filter params expect "true" or an empty string (not "false")
        return "true" if x else ""

    while True:
        resp = self._session.get(
            "{}/athlete/training_activities".format(BASE_URL),
            headers={
                "Accept": "text/javascript, application/javascript, application/ecmascript, application/x-ecmascript",
                #"X-CSRF-Token": next(iter(self._csrf.values())),
                "X-Requested-With": "XMLHttpRequest",
            },
            params={
                "search_session_id": search_session_id,
                "page": page,
                "per_page": per_page,
                "keywords": keywords,
                "new_activity_only": "false",
                "activity_type": activity_type or "",
                # Previously computed but never sent - include the filter
                "workout_type": workout_type if workout_type is not None else "",
                "commute": conv_bool(commute),
                "private_activities": conv_bool(is_private),
                "trainer": conv_bool(indoor),
                "gear": gear_id or "",
            }
        )
        if resp.status_code != 200:
            raise stravalib.exc.Fault(
                "Failed to list activities (status code {})".format(resp.status_code)
            )
        try:
            data = resp.json()["models"]
        except (ValueError, TypeError, KeyError) as e:
            raise stravalib.exc.Fault(
                "Invalid JSON response from Strava"
            ) from e

        for activity in data:
            yield ScrapedActivity(**activity)

        # No results = stop requesting pages
        # (was `if not models:` - an undefined name that raised NameError
        # after the first page; the page counter was also never incremented)
        if not data:
            break
        page += 1

def delete_activity(self, activity_id):
"""
Deletes the specified activity.
Expand All @@ -117,8 +257,7 @@ def delete_activity(self, activity_id):
"Failed to delete activity (status code: {})".format(resp.status_code),
)

def get_activity_data(self, activity_id, fmt=DataFormat.ORIGINAL,
json_fmt=None):
def get_activity_data(self, activity_id, fmt=DataFormat.ORIGINAL, json_fmt=None):
"""
Get a file containing the provided activity's data
Expand Down Expand Up @@ -270,6 +409,19 @@ def get_bike_components(self, bike_id, on_date=None):
else:
return components


# Mix in the ScrapingClient to inherit all its methods
class WebClient(ScrapingClient, stravalib.Client):
    """
    An extension to the stravalib Client that fills in some of the gaps in
    the official API using web scraping.

    Requires a username and password
    """
    # ScrapingClient comes first in the MRO so its __init__ consumes the
    # email/password kwargs before stravalib.Client's __init__ runs
    def __init__(self, *args, **kwargs):
        # Docstring set manually after class definition
        # (combined with stravalib.Client's __init__ docstring below)
        super().__init__(*args, **kwargs)

# Inherit parent documentation for WebClient.__init__
WebClient.__init__.__doc__ = stravalib.Client.__init__.__doc__ + \
"""
Expand Down

0 comments on commit b2f0204

Please sign in to comment.