Commit

log requests for coles on error

Javex committed Jan 30, 2025
1 parent 5d2b79c commit 0e1cf88
Showing 3 changed files with 30 additions and 7 deletions.
5 changes: 5 additions & 0 deletions .github/workflows/scrape-groceries.yml
@@ -57,6 +57,11 @@ jobs:
           mkdir -p ./output/
           aws s3 cp "s3://grocery-scrape-au/${save_path}" "./output/${save_path}" || true
       - run: python3 main.py sync coles --skip-existing
+      - uses: actions/upload-artifact@v3
+        with:
+          name: coles_error
+          path: ./output/coles/
+        if: ${{ failure() }}
       - uses: actions/upload-artifact@v3
         with:
           name: coles_snapshot
7 changes: 7 additions & 0 deletions hotprices_au/output.py
@@ -1,6 +1,8 @@
 import json
 import gzip
 from datetime import datetime
+import requests
+import pathlib
 
 
 def get_save_path(store, output_dir, compression="gzip", day=None):
@@ -39,3 +41,8 @@ def load_data(store, output_dir, compression="gzip", day=None):

     decoded_data = json.loads(raw_data)
     return decoded_data
+
+
+def save_response(response: requests.Response, save_path_dir: pathlib.Path):
+    fpath = save_path_dir.joinpath("response.txt")
+    fpath.write_text(response.text)
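
A minimal usage sketch of the new save_response helper (not part of the commit; the request URL and output directory here are illustrative):

    import pathlib
    import requests
    from hotprices_au import output

    # Directory the CI workflow uploads as an artifact on failure.
    save_dir = pathlib.Path("./output/coles")
    save_dir.mkdir(parents=True, exist_ok=True)

    response = requests.get("https://www.coles.com.au")
    output.save_response(response, save_dir)  # writes ./output/coles/response.txt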
25 changes: 18 additions & 7 deletions hotprices_au/sites/coles.py
@@ -15,9 +15,10 @@


 class ColesScraper:
-    def __init__(self, store_id, quick=False):
+    def __init__(self, store_id, save_path_dir: pathlib.Path, quick=False):
         self.quick = quick
         self.store_id = store_id
+        self.save_path_dir = save_path_dir
 
         self.session = request.get_base_session()
         self.session.headers = {
@@ -31,9 +32,13 @@ def start(self):
         # Need to get the subscription key
         response = self.session.get("https://www.coles.com.au")
         response.raise_for_status()
-        html = BeautifulSoup(response.text, features="html.parser")
-        next_data_script = html.find("script", id="__NEXT_DATA__")
-        next_data_json = json.loads(next_data_script.string)
+        try:
+            html = BeautifulSoup(response.text, features="html.parser")
+            next_data_script = html.find("script", id="__NEXT_DATA__")
+            next_data_json = json.loads(next_data_script.string)
+        except:
+            output.save_response(response, self.save_path_dir)
+            raise
         self.api_key = next_data_json["runtimeConfig"]["BFF_API_SUBSCRIPTION_KEY"]
         self.session.headers["ocp-apim-subscription-key"] = self.api_key
         self.version = next_data_json["buildId"]
@@ -69,7 +74,12 @@ def get_category(self, cat_slug, page_filter: int):
             else:
                 params["page"] += 1
                 continue
-            response_data = response.json()
+            try:
+                response_data = response.json()
+            except:
+                output.save_response(response, self.save_path_dir)
+                raise
+
             search_results = response_data["pageProps"]["searchResults"]
             for result in search_results["results"]:
                 yield result
@@ -247,12 +257,13 @@ def parse_str_unit(size):
     return units.parse_str_unit(size)
 
 
-def main(quick, save_path, category: str, page: int):
+def main(quick, save_path: pathlib.Path, category: str, page: int):
     """
     category: Slug or name of category to fetch; will fetch only that one.
     page: Page number to fetch.
     """
-    coles = ColesScraper(store_id="0584", quick=quick)
+    save_path_dir = save_path.parent
+    coles = ColesScraper(store_id="0584", save_path_dir=save_path_dir, quick=quick)
     categories = coles.get_categories()
     # Rename to avoid the overwrite below
     category_filter = category.lower() if category is not None else None
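
The pattern across the three files: wrap each parse step in try/except, dump the raw response body on failure, and re-raise so the workflow's failure() step uploads ./output/coles/ as the coles_error artifact. A condensed sketch of that pattern, with illustrative names (not the committed code):

    import json
    import pathlib

    def parse_or_dump(response, save_path_dir: pathlib.Path):
        try:
            return json.loads(response.text)
        except Exception:
            # Persist the raw body so the failed run's artifact captures it,
            # then re-raise so the job still fails and triggers the upload.
            save_path_dir.joinpath("response.txt").write_text(response.text)
            raise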
