Skip to content

Commit

Permalink
Create broken link checker cron jobs
Browse files Browse the repository at this point in the history
  • Loading branch information
zhannaklimanova committed Aug 30, 2023
1 parent 410ce4f commit cc3ef0d
Show file tree
Hide file tree
Showing 2 changed files with 100 additions and 0 deletions.
45 changes: 45 additions & 0 deletions .github/workflows/broken-link-checker.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
name: new link check

on:
  push:
  pull_request: # temporarily execute on all branches
  schedule:
    # Fields are "minute hour day-of-month month day-of-week"; hour must be
    # 0-23. The original "21 40 * * *" was invalid (hour 40) and would never
    # fire — assuming a daily 21:40 UTC run was intended, the two fields are
    # swapped here.
    - cron: "40 21 * * *"

jobs:
  # Builds a JSON matrix of site URLs scraped from the live Cantus Database,
  # consumed by the link-Checker job below.
  get-links:
    runs-on: ubuntu-latest
    outputs:
      matrix: ${{ steps.set-matrix.outputs.matrix }}
    steps:
      - id: set-matrix
        run: |
          flatpages=$(curl https://cantusdatabase.org/flatpages-list/ | awk '{ gsub (" ", "\",\"", $0); print}')
          articles=$(curl https://cantusdatabase.org/articles-list/ | awk '{ gsub (" ", "\",\"", $0); print}')
          list="{\"links\": [\"${flatpages}\",\"${articles}\"]}"
          echo $list
          echo "matrix=$list" >> $GITHUB_OUTPUT

  link-Checker:
    runs-on: ubuntu-latest
    needs: get-links
    strategy:
      fail-fast: false
      max-parallel: 4
      matrix: ${{ fromJson(needs.get-links.outputs.matrix) }}
    steps:
      - uses: actions/checkout@v3
      - name: Link Checker
        id: lychee
        # NOTE(review): the scraped source showed "lycheeverse/[email protected]"
        # (GitHub email obfuscation of "lychee-action@<version>"). Restored to
        # the v1 major tag — pin to the exact release originally intended.
        uses: lycheeverse/lychee-action@v1
        with:
          args: --exclude http:\/\/cantus\.sk.* ${{ matrix.links }}
          format: json
          output: /tmp/link-checker.txt
      - name: parsing output
        run: |
          echo "***Python Version***"
          python --version
          echo "***Invoking parsing script***"
          python "$GITHUB_WORKSPACE/scripts/parse_broken_link_checker_output.py" >> $GITHUB_STEP_SUMMARY
          echo "***Printing step summary***"
          cat $GITHUB_STEP_SUMMARY
55 changes: 55 additions & 0 deletions scripts/parse_broken_link_checker_output.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
"""Parse lychee broken-link-checker JSON output into a GitHub step summary.

Reads the lychee result file from FILE_LOCATION, classifies each failed
link as a "real" (HTTP 4xx) error or a skippable one, prints a Markdown
summary to stdout (captured into $GITHUB_STEP_SUMMARY by the workflow),
and exits non-zero only when real errors were found.
"""
import json
import sys  # original imported sys twice; one import suffices
from pathlib import Path

# Path the lychee action writes its JSON report to (see the workflow's
# `output:` setting). Absent file means no broken links were found.
FILE_LOCATION = "/tmp/link-checker.txt"


def classify_failures(fail_map):
    """Split lychee failures into real errors and skippable ones.

    Args:
        fail_map: mapping of tested site URL -> list of failure dicts,
            each with a 'status' dict that may contain an HTTP 'code'.

    Returns:
        (real_errors, skip_errors) tuple of failure-dict lists. Failures
        with an HTTP 4xx code are "real"; failures with no code at all
        (timeouts — a client-side issue) or a non-4xx code are skippable.
    """
    real_errors = []
    skip_errors = []
    for site in fail_map:  # looping through tested websites
        for failure in fail_map[site]:  # looping through broken links
            error_code = failure['status'].get('code')
            if not error_code:
                # Timeout / no HTTP code: client-side issue, so we do not
                # fail the build — just report it as an additional problem.
                skip_errors.append(failure)
                continue
            # Only 4xx responses count as genuinely broken links.
            if 400 <= error_code < 500:
                real_errors.append(failure)
            else:
                skip_errors.append(failure)
    return real_errors, skip_errors


def main():
    """Entry point: load the report, print the summary, set the exit code."""
    print(f"Running: {sys.argv[0]}", file=sys.stderr)

    # If link checker doesn't have any errors exit gracefully
    if not Path(FILE_LOCATION).exists():
        print("# ✅ No Broken Link")
        sys.exit(0)
    print("# Broken Link found, parsing needed", file=sys.stderr)

    # Loading link checker output result
    with open(FILE_LOCATION) as f:
        print(f"Parsing the json data for {FILE_LOCATION}", file=sys.stderr)
        link_checker_result = json.load(f)

    fail_map = link_checker_result['fail_map']
    if not fail_map:
        print("# ✅ No Broken Link")
        sys.exit(0)

    real_errors, skip_errors = classify_failures(fail_map)

    if real_errors:
        print("# Broken Link")
        for error in real_errors:
            print(f"* {error['url']}: {error['status']['code']}")

    if skip_errors:
        print("# Skippable error Link")
        for error in skip_errors:
            print(f"* {error['url']}: {error['status']['text']}")

    if real_errors:
        sys.exit(1)


if __name__ == "__main__":
    main()

0 comments on commit cc3ef0d

Please sign in to comment.