Skip to content

Commit

Permalink
Updated error handling for missing files and no response hyperlinks
Browse files Browse the repository at this point in the history
Signed-off-by: Charan-Sharan <[email protected]>
  • Loading branch information
Charan-Sharan committed Jun 24, 2024
1 parent e035112 commit 4aba7a0
Showing 1 changed file with 123 additions and 94 deletions.
217 changes: 123 additions & 94 deletions .github/workflows/test-hyperlinks.yml
Original file line number Diff line number Diff line change
Expand Up @@ -16,12 +16,13 @@ on:
# - '**.rst'
# - '**.xml'
# Manual trigger for the workflow.
# NOTE(review): two `inputs:` mappings follow. This looks like a rendered diff
# that shows the removed `full-scan` input followed by the added `Debug-Mode`
# input without +/- markers; confirm against the real file, since duplicate
# keys in one mapping are invalid YAML.
workflow_dispatch:
inputs:
full-scan:
description: 'Scan all files'
required: false
inputs:
# Optional boolean switch, off by default.
Debug-Mode:
type: boolean
description: Run in Debug mode to upload all created files
default: false
required: false


# Single job `main`; the line after it is a collapsed-diff marker from the
# page render, so some job-level keys are not visible here.
jobs:
main:
Expand All @@ -30,162 +31,190 @@ jobs:
# Fetch the repository contents so later steps can scan the documentation.
- name: Checkout repository
uses: actions/checkout@v4
# Step: write one list of documentation files per extension
# (xmlFilesList.txt, mdFilesList.txt, rstFilesList.txt).
# NOTE(review): old and new versions of renamed lines appear back to back
# below (e.g. changed_files.txt vs ModifiedFilesList.txt).
- name: List Docs
- name: List Documentation files
run: |
# if [[ ${{ github.event_name }} == "workflow_dispatch" ]]; then
# NOTE(review): `[[ true ]]` tests a non-empty string, so it always succeeds;
# with the event-name check commented out above, the else branch never runs.
if [[ true ]]; then
# Full scan: every regular file under $PWD with the given extension.
find $PWD -name '*.xml' -type f > xmlFilesList.txt
find $PWD -name '*.md' -type f > mdFilesList.txt
find $PWD -name '*.rst' -type f > rstFilesList.txt
else
# Changed-files mode: names touched between HEAD^1 and HEAD, filtered per extension.
# NOTE(review): in `grep -E "*.xml"` the leading `*` has no defined meaning in
# POSIX ERE and the `.` is unescaped and unanchored; something like '\.xml$'
# looks intended -- confirm before enabling this branch.
git diff --name-only HEAD^1 HEAD > changed_files.txt
cat changed_files.txt | grep -E "*.xml" > xmlFilesList.txt
cat changed_files.txt | grep -E "*.md" > mdFilesList.txt
cat changed_files.txt | grep -E "*.rst" > rstFilesList.txt
git diff --name-only HEAD^1 HEAD > ModifiedFilesList.txt
cat ModifiedFilesList.txt | grep -E "*.xml" > xmlFilesList.txt
cat ModifiedFilesList.txt | grep -E "*.md" > mdFilesList.txt
cat ModifiedFilesList.txt | grep -E "*.rst" > rstFilesList.txt
fi
# Step: extract candidate links from every listed file into one combined list
# (`file:line:match` records produced by `grep -onH`), then split that list
# into external (http/https) and internal links.
# NOTE(review): removed and added versions of each line appear interleaved
# below (lower-case variable names look like the old version, upper-case the new).
- name: List links from Docs
- name: List links from Documentation files
run: |
# Split command substitutions on newlines only, so each path / grep record is one loop item.
IFS=$'\n'
# --- XML files: keep only url="http..." matches seen between <ulink and </ulink>.
for file in $( cat xmlFilesList.txt )
for FILE in $( cat xmlFilesList.txt )
do
grep -onHE -e "<ulink" -e 'url="http[^\"\]+' -e "</ulink>" ${file} | sed 's/url="//' > links.tmp
flag=0
for line in $( cat links.tmp )
#check if the file is missing
# A listed path that is not a regular file is logged to missingFiles.txt and skipped.
if [[ ! -f $FILE ]]; then
echo -e "$FILE -\u001b[31m file missing"
echo $FILE >> missingFiles.txt
continue
fi
grep -onHE -e "<ulink" -e 'url="http[^\"\]+' -e "</ulink>" ${FILE} | sed 's/url="//' > links.tmp
# FLAG is 1 while inside a <ulink> element, 0 otherwise.
FLAG=0
for LINE in $( cat links.tmp )
do
link=$( echo $line | cut -d ':' -f3- )
if [[ ${link:0:6} == '<ulink' ]]; then
flag=1
# Field 3 onward of file:line:match is the matched text itself.
LINK=$( echo $LINE | cut -d ':' -f3- )
if [[ ${LINK:0:6} == '<ulink' ]]; then
FLAG=1
continue
elif [[ ${link:0:8} == '</ulink>' ]]; then
flag=0
elif [[ ${LINK:0:8} == '</ulink>' ]]; then
FLAG=0
continue
fi
if [[ $flag -eq 1 ]]; then
echo $line >> links_list.txt
else
echo $line >> rejected_links.txt
if [[ $FLAG -eq 1 ]]; then
echo $LINE >> linksList.txt
fi
done
done
# --- Markdown files: a ``` match toggles FLAG, so matches inside fenced code blocks are dropped.
for file in $( cat mdFilesList.txt )
for FILE in $( cat mdFilesList.txt )
do
grep -onHE -e "\]\([^\)]+" -e "\`\`\`[^\`]*" -e "http://[^\ \;\"\'\<\>\]\[\,\`\)]+" -e "https://[^\ \;\"\'\<\>\]\[\,\`\)]+" ${file} | sed 's/](//' > links.tmp
flag=0
for line in $( cat links.tmp )
#check if the file is missing
if [[ ! -f $FILE ]]; then
echo -e "$FILE -\u001b[31m file missing"
echo $FILE >> missingFiles.txt
continue
fi
grep -onHE -e "\]\([^\)]+" -e "\`\`\`[^\`]*" -e "http://[^\ \;\"\'\<\>\]\[\,\`\)]+" -e "https://[^\ \;\"\'\<\>\]\[\,\`\)]+" ${FILE} | sed 's/](//' > links.tmp
# FLAG is 1 while inside a fenced code block, 0 otherwise.
FLAG=0
for LINE in $( cat links.tmp )
do
link=$( echo $line | cut -d ':' -f3- )
if [[ ${link:0:3} == '```' ]]; then
flag=$(( 1 - flag ))
LINK=$( echo $LINE | cut -d ':' -f3- )
if [[ ${LINK:0:3} == '```' ]]; then
FLAG=$(( 1 - FLAG ))
continue
fi
if [[ $flag -eq 0 ]]; then
echo $line >> links_list.txt
if [[ $FLAG -eq 0 ]]; then
echo $LINE >> linksList.txt
fi
done
done
# --- reST files: matches are appended directly; sed strips a `.. _label: ` prefix.
for file in $( cat rstFilesList.txt )
for FILE in $( cat rstFilesList.txt )
do
grep -onHE -e ".. _[^\]+" -e "http://[^\ \;\"\'\<\>\,\`\)]+" -e "https://[^\ \;\"\'\<\>\,\`\)]+" ${file} | sed 's/.. _[^\:]*: //' >> links_list.txt
#check if the file is missing
if [[ ! -f $FILE ]]; then
echo -e "$FILE -\u001b[31m file missing"
echo $FILE >> missingFiles.txt
continue
fi
grep -onHE -e ".. _[^\]+" -e "http://[^\ \;\"\'\<\>\,\`\)]+" -e "https://[^\ \;\"\'\<\>\,\`\)]+" ${FILE} | sed 's/.. _[^\:]*: //' >> linksList.txt
done
# Drop records mentioning 127.0.0.1/localhost or containing `$` or `[`, then
# split on whether the record contains http:// or https://.
cat links_list.txt | grep -vE '127.0.0.1|localhost|\$|\[' | grep -E 'https://|http://' > ExternalLinks.txt
cat links_list.txt | grep -vE '127.0.0.1|localhost|\$|\[' | grep -vE 'https://|http://' > InternalLinks.txt
cat linksList.txt | grep -vE '127.0.0.1|localhost|\$|\[' | grep -E 'https://|http://' > externalLinks.txt
cat linksList.txt | grep -vE '127.0.0.1|localhost|\$|\[' | grep -vE 'https://|http://' > internalLinks.txt
# Step: request each external link with curl, cache `link~status` pairs in
# checkedLinksCache.txt, and append failing records to error-report.log.
# NOTE(review): removed and added versions of each line appear interleaved below.
- name: Test External links
run: |
# Make sure the cache file exists before it is first read.
touch checkedLinksCache.txt
IFS=$'\n'
for line in $(cat ExternalLinks.txt )
for LINE in $(cat externalLinks.txt )
do
link=$( echo $line | cut -d ':' -f3- )
link=${link%.} #removing trailing .
link=${link% } #removing trailing space
checkCache=$( cat checkedLinksCache.txt | grep "$link~" | wc -w )
try=3
if [[ $checkCache -eq 0 ]]; then
while [[ $try -ne 0 ]]
# Field 3 onward of file:line:match is the URL.
LINK=$( echo $LINE | cut -d ':' -f3- )
LINK=${LINK%.} #removing trailing .
LINK=${LINK% } #removing trailing space
# Non-zero when the cache already holds an entry for this link.
# NOTE(review): the URL is used as an unanchored grep regex, so `.`/`?` act as
# metacharacters and more than one cache line could match -- confirm this is acceptable.
CHECK_CACHE=$( cat checkedLinksCache.txt | grep "$LINK~" | wc -w )
TRY=3 #Max attempts to check status code of hyperlinks
if [[ $CHECK_CACHE -eq 0 ]]; then
while [[ $TRY -ne 0 ]]
do
status_code=$(curl -LI -m 60 -s $link | grep "HTTP" | tail -1 | cut -d' ' -f2 )
if [[ -n $status_code ]]; then
echo "$link~$status_code" >> checkedLinksCache.txt
# HEAD request following redirects, 60 s limit; the last HTTP status line's
# second field is taken as the final status code.
STATUS_CODE=$(curl -LI -m 60 -s $LINK | grep "HTTP" | tail -1 | cut -d' ' -f2 )
if [[ -n $STATUS_CODE ]]; then
echo "$LINK~$STATUS_CODE" >> checkedLinksCache.txt
break
else
echo $line
echo $LINE
echo "retrying..."
try=$(( try - 1))
TRY=$(( TRY - 1))
fi
done
else
status_code=$( cat checkedLinksCache.txt | grep "$link~" | cut -d '~' -f2 )
# Cache hit: reuse the stored status code.
STATUS_CODE=$( cat checkedLinksCache.txt | grep "$LINK~" | cut -d '~' -f2 )
fi
if [[ $status_code -eq 404 ]]; then
echo -e "${link} - \033[0;31m404 Error\033[0m"
echo "${line}" >> error-report.log
# Only 404 and "no status at all" are reported; every other status is just printed.
if [[ $STATUS_CODE -eq 404 ]]; then
echo -e "${LINK} - \033[0;31m404 Error\033[0m"
echo "${LINE}" >> error-report.log
elif [[ ! -n $STATUS_CODE ]]; then
echo -e "${LINK} - \033[0;31mNo Response\033[0m"
echo "${LINE}(No-Response)" >> error-report.log
else
echo "${link} - ${status_code}"
echo "${LINK} - ${STATUS_CODE}"
fi
done
# Step: validate non-http references. `#anchor` references are checked against
# headings in the same file; everything else is resolved to a path and must be
# an existing regular file. Failures are appended to error-report.log.
# NOTE(review): removed and added versions of each line appear interleaved below.
# NOTE(review): unlike the other steps, no IFS=$'\n' is set here, so records
# containing spaces would presumably be split into several loop items -- confirm.
- name: Test Internal Links
run: |
for line in $( cat InternalLinks.txt )
for LINE in $( cat internalLinks.txt )
do
reference=$( echo $line | cut -d ':' -f3- )
file=$( echo $line | cut -d ':' -f1 )
if [[ ${reference:0:1} == '#' ]]; then
Link_text=$( cat $file | grep -oE "\[.*\]\(${reference}\)" | sed 's/\[//' | cut -d ']' -f1 )
isPresent=$(cat $file | grep -oE "# ${Link_text}" | wc -w)
if [[ $isPresent -eq 0 ]]; then
echo -e "${line} -\u001b[31m invalid reference"
echo "${line}" >> error-report.log
# Record layout is file:line:match -- field 1 is the source file, field 3+ the reference.
REFERENCE=$( echo $LINE | cut -d ':' -f3- )
FILE=$( echo $LINE | cut -d ':' -f1 )
if [[ ${REFERENCE:0:1} == '#' ]]; then
# Recover the link's display text from `[text](#anchor)`, then look for a
# heading line containing `# text` in the same file.
LINK_TEXT=$( cat $FILE | grep -oE "\[.*\]\(${REFERENCE}\)" | sed 's/\[//' | cut -d ']' -f1 )
IS_PRESENT=$(cat $FILE | grep -oE "# ${LINK_TEXT}" | wc -w)
if [[ $IS_PRESENT -eq 0 ]]; then
echo -e "${LINE} -\u001b[31m invalid reference"
echo "${LINE}" >> error-report.log
fi
else
if [[ ${reference:0:1} == '/' ]]; then
baseDir=$PWD
# References starting with `/` are resolved from $PWD; others from the
# directory of the file that contains them.
if [[ ${REFERENCE:0:1} == '/' ]]; then
BASE_DIR=$PWD
else
baseDir=${file/$( basename $file )}
# NOTE(review): this removes the FIRST occurrence of the basename from the
# path, which differs from `dirname` if a directory name contains it.
BASE_DIR=${FILE/$( basename $FILE )}
fi
searchFile="$baseDir/${reference}"
searchFile=$( realpath $searchFile )
if [[ ! -f $searchFile ]]; then
echo -e "${line} -\u001b[31m invalid reference"
echo ${line/$reference/$searchFile} >> error-report.log
SEARCH_FILE="$BASE_DIR/${REFERENCE}"
SEARCH_FILE=$( realpath $SEARCH_FILE )
if [[ ! -f $SEARCH_FILE ]]; then
echo -e "${LINE} -\u001b[31m invalid reference"
# The logged record carries the resolved path in place of the raw reference.
echo ${LINE/$REFERENCE/$SEARCH_FILE} >> error-report.log
fi
fi
done
# Step: print scan totals and fail the job when error-report.log has any lines.
# NOTE(review): removed and added versions of each line appear interleaved below.
- name: report Error links
- name: Report Error links
run: |
Number_of_404_links=$( cat error-report.log | wc -l )
# Counts every line of error-report.log, which holds 404s, no-response links
# and invalid internal references -- not only 404s, despite the variable name.
NUMBER_OF_404_LINKS=$( cat error-report.log | wc -l )
echo -e "\u001b[32mNo. of files scanned : $( cat *FilesList.txt | wc -l )"
if [[ $Number_of_404_links -ne 0 ]]; then
if [[ $NUMBER_OF_404_LINKS -ne 0 ]]; then
echo -e "\u001b[31mNo. of unique broken links : $( cat error-report.log | cut -d: -f3- | sort | uniq | wc -l )"
# NOTE(review): the second message below reads "REFERENCE" where the first reads
# "reference"; this looks like an unintended side effect of renaming variables
# to upper case -- confirm the user-facing text.
echo -e "\u001b[31mTotal No. of reference to broken links : $( cat error-report.log | cut -d: -f3- | sort | wc -l )"
echo -e "\u001b[31mTotal No. of REFERENCE to broken links : $( cat error-report.log | cut -d: -f3- | sort | wc -l )"
# Non-zero exit marks the step (and job) as failed.
exit -1
else
echo -e "\u001b[32mNo Broken-links found"
fi
# Step (runs only when an earlier step failed or the run was cancelled):
# rewrite error-report.log into a numbered, per-file listing and prepend the
# missing-files list when there is one.
# NOTE(review): removed and added versions of each line appear interleaved below.
# NOTE(review): missing files are written only to missingFiles.txt, while the
# report step counts only error-report.log; so missing files alone do not seem
# to fail the job or trigger this step -- confirm that is intended.
- name: Modify Log file
- name: Modify log file
if: ${{ failure() || cancelled() }}
run: |
baseDir=${PWD%$(basename $PWD)}
baseDir=$(echo $baseDir | sed 's/\//\\\//g')
sed -i "s/${baseDir}//g" error-report.log
fileNames=$(cat error-report.log | cut -d ':' -f1 | sort | uniq )
fileCount=1
for line in $fileNames
# Parent-directory prefix of $PWD, with `/` escaped so it can be used inside a
# sed expression; it is then stripped from every path in error-report.log.
BASE_DIR=${PWD%$(basename $PWD)}
BASE_DIR=$(echo $BASE_DIR | sed 's/\//\\\//g')
sed -i "s/${BASE_DIR}//g" error-report.log
# Unique source files that have at least one reported record.
FILE_NAMES_LIST=$(cat error-report.log | cut -d ':' -f1 | sort | uniq )
FILECOUNT=1
for LINE in $FILE_NAMES_LIST
do
rawLines=$( cat error-report.log | grep $line | cut -d ':' -f2- )
echo "$fileCount. $line" >> error-reportTmp.log
fileCount=$(( fileCount + 1))
for rawLine in $rawLines
# All `line:link` records belonging to this file.
LINKS_LIST=$( cat error-report.log | grep $LINE | cut -d ':' -f2- )
echo "$FILECOUNT. $LINE" >> error-reportTmp.log
FILECOUNT=$(( FILECOUNT + 1))
for LINK in $LINKS_LIST
do
echo -e "\t Line $rawLine" | sed 's/:/ : /' >> error-reportTmp.log
# Only the first `:` (between line number and link) is padded with spaces.
echo -e "\t Line $LINK" | sed 's/:/ : /' >> error-reportTmp.log
done
done
cat error-reportTmp.log > error-report.log
# Final report: optional "Missing Files" section followed by "Broken links".
# NOTE(review): paths in missingFiles.txt are not passed through the prefix-stripping sed above.
if [[ $(cat missingFiles.txt | wc -w ) -eq 0 ]]; then
echo -e "Broken links: \n" > error-report.log
cat error-reportTmp.log >> error-report.log
else
echo -e "Missing Files: \n" > error-report.log
cat missingFiles.txt >> error-report.log
echo -e "Broken links: \n" >> error-report.log
cat error-reportTmp.log >> error-report.log
fi
# Step: upload the error report and other generated files as the
# `Hyperlinks-testing-log` artifact; a missing file is not an error
# (`if-no-files-found: ignore`). The "Expand All" line is a collapsed-diff
# marker from the page render, so some keys of this step are not visible.
# NOTE(review): old and new path lists appear back to back below.
# NOTE(review): the `if [[ ... ]]; then` / `fi` lines appear to sit inside the
# `path: |` literal. upload-artifact's `path` input is a list of file/glob
# patterns, not a shell script, so -- if that is the case -- those two lines
# would be read as patterns and the debug files would be uploaded regardless
# of Debug-Mode. Gating them likely needs a separate upload step with a
# step-level `if:`; confirm against the full file.
- name: Upload logs
uses: actions/upload-artifact@v4
Expand All @@ -194,10 +223,10 @@ jobs:
name: Hyperlinks-testing-log
path: |
/home/runner/work/HPCC-Platform/HPCC-Platform/error-report.log
/home/runner/work/HPCC-Platform/HPCC-Platform/*FilesList.txt
/home/runner/work/HPCC-Platform/HPCC-Platform/checkedLinksCache.txt
/home/runner/work/HPCC-Platform/HPCC-Platform/*Links.txt
/home/runner/work/HPCC-Platform/HPCC-Platform/links_list.txt
/home/runner/work/HPCC-Platform/HPCC-Platform/changed_files.txt
/home/runner/work/HPCC-Platform/HPCC-Platform/rejected_links.txt
if [[ ${{ github.event_name }} == "workflow_dispatch" && ${{ inputs.Debug-Mode }} == true ]]; then
/home/runner/work/HPCC-Platform/HPCC-Platform/*FilesList.txt
/home/runner/work/HPCC-Platform/HPCC-Platform/checkedLinksCache.txt
/home/runner/work/HPCC-Platform/HPCC-Platform/*Links.txt
/home/runner/work/HPCC-Platform/HPCC-Platform/linksList.txt
fi
if-no-files-found: ignore

0 comments on commit 4aba7a0

Please sign in to comment.