From 85440f13c4a393814bf1b26c1350f56f82dbd45c Mon Sep 17 00:00:00 2001 From: Joshua Chen Date: Thu, 8 Aug 2024 13:28:53 -0400 Subject: [PATCH] Generically handle some inaccessible links --- config/inaccessible-links.txt | 30 ++++-------------------------- src/server/create-graph.ts | 2 +- src/server/process-warnings.ts | 16 ++++++++++++++-- 3 files changed, 19 insertions(+), 29 deletions(-) diff --git a/config/inaccessible-links.txt b/config/inaccessible-links.txt index 5f3e565..ef8ec9d 100644 --- a/config/inaccessible-links.txt +++ b/config/inaccessible-links.txt @@ -1,33 +1,11 @@ Links in this file will be ignored by the link checker, usually because they are behind auth or crawler checks. + Note that known firewall pages and redirections to login are handled generically in the link checker. Use two-space indent for comments. Use {...} to embed regex. - Cloudflare protection (response status 403 and returned HTML contains "Just a moment..."): -https://codepen.io{/?} -https://gitlab.com/projects/new -https://help.glitch.com/hc/{.*} -https://journals.sagepub.com/doi/{.*} -https://linux.die.net/man/{.*} -https://live.browserstack.com/dashboard -https://onlinelibrary.wiley.com/doi/{.*} -https://pixabay.com/ -https://www.browserstack.com/{(users|accounts)/.*} -https://www.cloudflare.com/{.*} -https://www.researchgate.net/publication/{.*} -https://www.udemy.com/{(topic|course)/.*} - - Other kinds of firewall: + Custom firewalls: https://www.canva.com/colors/color-wheel/ https://www.openwebanalytics.com{/?} -https://www.techopedia.com/definition/{.*} -https://www.webpagetest.org{/?} https://www.reddit.com/r/{.*} - Goes to login: -https://cloud.mongodb.com/v2 -https://console.cloud.google.com/{.*} -https://docs.google.com/drawings -https://shell.cloud.google.com/{.*} -https://sites.google.com/{.*} -https://github.com/new -https://github.com/{.*}/issues/new{.*} -https://github.com/orgs/mdn/teams{.*} + 404 on purpose: +https://konmari.com/404 diff --git a/src/server/create-graph.ts b/src/server/create-graph.ts index 7525e8f..ae73b5c 100644 --- a/src/server/create-graph.ts +++ b/src/server/create-graph.ts @@ -27,7 +27,7 @@ const allowedSpacedCodeLink = [ // HTTP status /^\d+ [\w '-]+$/, // HTTP header - /^(Cache-Control|Clear-Site-Data|Connection|Content-Length|Content-Security-Policy|Cross-Origin-Opener-Policy|Cross-Origin-Resource-Policy|Feature-Policy|Permissions-Policy|Sec-Purpose|Transfer-Encoding): ([\w-]+|"[\w-]+")$/, + /^(Cache-Control|Clear-Site-Data|Connection|Content-Length|Content-Security-Policy|Cross-Origin-Opener-Policy|Cross-Origin-Resource-Policy|Expect|Feature-Policy|Permissions-Policy|Sec-Purpose|Transfer-Encoding): ([\w-]+|"[\w-]+")$/, // MIME /^[a-z]+\/[\w+-]+; [a-z]+=("[\w ,.-]+"|\w+);?$/, // Macro calls diff --git a/src/server/process-warnings.ts b/src/server/process-warnings.ts index f5856d6..e680834 100644 --- a/src/server/process-warnings.ts +++ b/src/server/process-warnings.ts @@ -133,6 +133,17 @@ async function checkLink(href: string) { }; } } + } else if (res.status === 403) { + const text = await res.text(); + // Cloudflare firewall & similar + if ( + text.includes("Just a moment...") || + text.includes("Verify you are human") + ) { + return { + type: "ok", + }; + } } return { type: "error status", @@ -140,13 +151,14 @@ async function checkLink(href: string) { }; } if (res.url !== href) { - const resURL = new URL(res.url); const hrefURL = new URL(href); if ( // Allow root URLs even if the root URL goes elsewhere (hrefURL.pathname === "/" && res.url.startsWith(href)) || // Allow if the only change is addition of queries - resURL.href === hrefURL.href && hrefURL.search === "" + hrefURL.href === res.url.split("?")[0] || + // Allow redirection to login + /\/(login|signin)\b/.test(res.url) ) { return { type: "ok",