Skip to content

Commit

Permalink
External link checker
Browse files Browse the repository at this point in the history
  • Loading branch information
Josh-Cena committed Jul 28, 2024
1 parent d3953a7 commit 4239d66
Show file tree
Hide file tree
Showing 5 changed files with 195 additions and 54 deletions.
8 changes: 6 additions & 2 deletions .github/workflows/build-and-deploy.yml
Original file line number Diff line number Diff line change
Expand Up @@ -44,8 +44,12 @@ jobs:
run: bun install
working-directory: ./mdn-graph

- name: Build this repo
run: bun run build
- name: Build MDN graph
run: bun run build-graph
working-directory: ./mdn-graph

- name: Generate warning report
run: bun run build-warnings
working-directory: ./mdn-graph

- name: Determine commit message
Expand Down
23 changes: 0 additions & 23 deletions config/http-sites.txt

This file was deleted.

7 changes: 7 additions & 0 deletions docs/warnings.css
Original file line number Diff line number Diff line change
@@ -1,5 +1,12 @@
/* Page shell: a flex container whose height is the viewport height minus
   2em, which matches the 1em margin applied on the top and bottom. */
body {
display: flex;
margin: 1em;
height: calc(100vh - 2em);
}

/* Selection list: at least 300px wide and as tall as its container. */
select {
min-width: 300px;
height: 100%;
}

ul {
Expand Down
28 changes: 20 additions & 8 deletions src/server/create-graph.ts
Original file line number Diff line number Diff line change
Expand Up @@ -150,12 +150,19 @@ graph.forEachNode((node) => {
});
$("ul li").each((i, li) => {
const children = $(li).contents();
if (children.length === 0 || children[0].type === "text" && children[0].data.startsWith(":")) {
if (
children.length === 0 ||
(children[0].type === "text" && children[0].data.startsWith(":"))
) {
report(node, "Bad DL", $(li).text().slice(0, 50));
}
});
if (part.value.content.includes("-: "))
report(node, "Bad DL", part.value.content.match(/-: .*$/m)?.[0].slice(0, 50));
report(
node,
"Bad DL",
part.value.content.match(/-: .*$/m)?.[0].slice(0, 50)
);
$("a:not(svg a)").each((i, a) => {
const href = $(a).attr("href");
if (!href) {
Expand Down Expand Up @@ -245,16 +252,16 @@ graph.forEachNode((node) => {
report(node, "Broken anchor", linkTarget);
}
}
} else if (!linkTarget.startsWith("https:")) {
} else if (
!linkTarget.startsWith("http") ||
(linkTarget.includes("//localhost") &&
!linkTarget.includes("_sample_."))
) {
if (
linkTarget.startsWith("mailto:") ||
(linkTarget.startsWith("http://localhost:5042") &&
linkTarget.includes("_sample_.")) ||
["/", "/discord"].includes(linkTarget)
) {
continue;
} else if (linkTarget.startsWith("http:")) {
report(node, "HTTP link", linkTarget);
} else {
report(node, "Bad href", linkTarget);
}
Expand Down Expand Up @@ -363,7 +370,12 @@ for (const node of nodes) {
"short_title",
].map((key) => [key, node.data.metadata[key]])
);
node.data.links = node.data.links.filter((link) => !link.startsWith("/en-US/"));
node.data.links = node.data.links.filter(
(link) =>
!link.startsWith("/en-US/") &&
!link.startsWith("#") &&
!link.includes("//localhost")
);
}

for (const [text, used] of allowedCodeLinkTextRec) {
Expand Down
183 changes: 162 additions & 21 deletions src/server/process-warnings.ts
Original file line number Diff line number Diff line change
Expand Up @@ -58,17 +58,6 @@ const noPageRec = new Map(
.map((x) => [x, false])
);

const allowedHTTPSitesRec = new Map(
(
await Bun.file(
Bun.fileURLToPath(import.meta.resolve("../../config/http-sites.txt"))
).text()
)
.split("\n")
.filter((x) => x && !x.startsWith(" "))
.map((x) => [x, false])
);

for (const node of nodes) {
if (Object.keys(node.data.metadata.flaws).length === 0) continue;
const nodeWarnings = (warnings[node.data.metadata.source.folder] ??= []);
Expand Down Expand Up @@ -113,6 +102,167 @@ for (const node of nodes) {
});
}

/** Outcome of probing one external URL with {@link checkLink}. */
type LinkCheckResult =
  | { type: "ok" }
  | { type: "HTTP link"; data: string }
  | { type: "error status"; data: number }
  | { type: "redirected"; data: string }
  | { type: "request error"; data: string };

/**
 * Cancels a response body that is never going to be read.
 *
 * Only the status and final URL of each response are inspected, so the body
 * stream would otherwise stay open until it is garbage collected. Cancelling
 * is best effort: a failure here must not change the verdict for the link.
 */
function discardBody(res: Response): void {
  try {
    void res.body?.cancel().catch(() => {});
  } catch {
    // Deliberately ignored: the body is irrelevant to the link check.
  }
}

/**
 * Fetches `href` and classifies the outcome.
 *
 * For `http:` URLs the `https:` equivalent is tried first (3s timeout); if it
 * answers with an OK status the link is reported as having an HTTPS
 * alternative and the plain HTTP URL is not requested at all. Otherwise the
 * URL itself is fetched (10s timeout) and classified, in this order:
 * non-OK status, final URL differs from `href`, plain HTTP link, ok.
 *
 * @param href Absolute URL to probe; compared verbatim with the final
 *   response URL to detect redirects.
 * @returns The classification. This function never rejects: network errors
 *   and timeouts are returned as `"request error"`.
 */
async function checkLink(href: string): Promise<LinkCheckResult> {
  const isPlainHttp = href.startsWith("http:");
  if (isPlainHttp) {
    try {
      const secureRes = await fetch(href.replace("http:", "https:"), {
        signal: AbortSignal.timeout(3000),
      });
      discardBody(secureRes);
      if (secureRes.ok) {
        return { type: "HTTP link", data: "has HTTPS alternative" };
      }
    } catch {
      // A failing HTTPS probe only means there is no secure alternative;
      // fall through and check the original URL.
    }
  }
  try {
    const res = await fetch(href, {
      signal: AbortSignal.timeout(10000),
    });
    discardBody(res);
    if (!res.ok) {
      return { type: "error status", data: res.status };
    }
    if (res.url !== href) {
      return { type: "redirected", data: res.url };
    }
    if (isPlainHttp) {
      return { type: "HTTP link", data: "" };
    }
    return { type: "ok" };
  } catch (e) {
    // Anything can be thrown; only Error instances carry a `message`.
    return {
      type: "request error",
      data: e instanceof Error ? e.message : String(e),
    };
  }
}

// Deferred link checks: each entry is a thunk that starts one check when it
// is called, so nothing is requested until a thunk is invoked.
const linkRequests: (() => Promise<void>)[] = [];
// Check result per URL. An `undefined` value is a valid entry, so `has()` and
// a truthy `get()` are not equivalent for this map.
const checkedLinks = new Map<string, { type: string; data?: any } | undefined>();

/**
 * Appends one warning to the list kept for the node's source folder,
 * creating that list on first use.
 *
 * The first variadic argument becomes the warning's `message`; all remaining
 * arguments are stored, in order, as its `data` array.
 */
function report(node, ...data) {
  const [message, ...details] = data;
  const folder = node.data.metadata.source.folder;
  const bucket = warnings[folder] ?? (warnings[folder] = []);
  bucket.push({ message, data: details });
}

// Links into third-party code playgrounds. The trailing `\/.` requires at
// least one character after the host's slash, so bare homepages do not match.
const EXTERNAL_SANDBOX_PATTERN =
  /https:\/\/(jsfiddle\.net|codepen\.io|jsbin\.com)\/./;

// URL prefixes that are never fetched. Built once here instead of once per
// link inside the loop below.
const UNCHECKED_LINK_PREFIXES = [
  // Sites that don't do redirects or break links, should save us some time
  "https://stackoverflow.com",
  "https://tc39.es",
  "https://drafts.csswg.org",
  "https://unicode.org",
  "https://www.unicode.org",
  "https://datatracker.ietf.org",
  "https://github.com/tc39",
  "https://github.com/w3c",
  "https://github.com/whatwg",
  "https://bugzilla.mozilla.org",
  "https://bugzil.la",
  "https://webkit.org/b/",
  "https://caniuse.com",
  "https://chromestatus.com",
  "https://chromium.googlesource.com",
  // Youtube uses queries, so there's no real 404
  "https://www.youtube.com",
  "https://youtu.be",
  "https://www.wolframalpha.com/input",
  // Is this safe?
  "https://www.w3.org",
  "https://www.npmjs.com",
];

// Walk every link of every node: report sandbox links immediately, skip
// allowlisted sites, and queue one deferred check per distinct URL.
for (const node of nodes) {
  for (const link of node.data.links) {
    if (EXTERNAL_SANDBOX_PATTERN.test(link)) {
      report(node, "External sandbox link", link);
      continue;
    }
    if (
      UNCHECKED_LINK_PREFIXES.some((prefix) => link.startsWith(prefix)) ||
      link.includes(".spec.whatwg.org")
    ) {
      continue;
    }
    if (!link.startsWith("http")) continue;

    // NOTE(review): `new URL` throws on a malformed link, which would abort
    // the whole run. Presumably links are validated upstream; confirm.
    const url = new URL(link);
    // Links that differ only by fragment share one request.
    url.hash = "";
    const href = url.href;
    if (checkedLinks.has(href)) continue;

    // Reserve the key now so later occurrences of the same URL are not
    // queued again; the real result replaces the placeholder once known.
    checkedLinks.set(href, undefined);
    linkRequests.push(() =>
      checkLink(href).then((res) => {
        checkedLinks.set(href, res);
      })
    );
  }
}

/**
 * Runs every thunk in `linkRequests`, keeping at most 25 in flight.
 *
 * With 25 or fewer thunks they are all started at once. Otherwise a pool of
 * 25 slots is filled; whenever any slot's request finishes, the next queued
 * thunk is started in that same slot. Progress is logged each time the index
 * of the next thunk to start is a multiple of 100, and once more after the
 * whole pool has drained.
 */
async function depleteQueue() {
  const total = linkRequests.length;
  if (total <= 25) {
    await Promise.all(linkRequests.map((run) => run()));
    return;
  }

  // Start request `requestIndex` and resolve to the slot it occupies, so the
  // winner of a race identifies which slot has become free.
  const launch = (requestIndex: number, slot: number): Promise<number> =>
    linkRequests[requestIndex]().then(() => slot);

  const slots: Promise<number>[] = Array.from({ length: 25 }, (_, slot) =>
    launch(slot, slot)
  );

  let next = 25;
  for (; next < total; next++) {
    if (next % 100 === 0) {
      console.log(`Processed ${next}/${total} links`);
    }
    const freedSlot = await Promise.race(slots);
    slots[freedSlot] = launch(next, freedSlot);
  }

  await Promise.all(slots);
  console.log(`Processed ${next}/${total} links`);
}

// Run all queued link checks before reading any result.
await depleteQueue();

// Warning message to emit for each non-"ok" check result type.
const warningMessageByCheckType = new Map<string, string>([
  ["HTTP link", "HTTP link"],
  ["error status", "Broken external link"],
  ["redirected", "Redirected external link"],
  ["request error", "Broken external link"],
]);

// Attach each link's check result to every node that references it.
for (const node of nodes) {
  for (const link of node.data.links) {
    if (!link.startsWith("http")) continue;

    // NOTE(review): `new URL` throws on a malformed link; presumably links
    // are validated upstream. Confirm.
    const url = new URL(link);
    // Results are looked up by the URL without its fragment.
    url.hash = "";

    const checked = checkedLinks.get(url.href);
    // No stored result for this URL, or the link is fine: nothing to report.
    if (!checked || checked.type === "ok") continue;

    const message = warningMessageByCheckType.get(checked.type);
    if (message === undefined) {
      console.error("Unexpected checked link type:", checked);
      continue;
    }
    report(node, message, url.href, checked.data);
  }
}

const warningList = Object.entries(warnings);
warningList.sort(([a], [b]) =>
a.replaceAll("/", "").localeCompare(b.replaceAll("/", ""))
Expand Down Expand Up @@ -149,10 +299,7 @@ for (const [nodeId, baseMessages] of warningList) {
(x.message === "Broken link" &&
(missingFeatures.has(x.data[0]) ||
(noPageRec.has(x.data[0]) &&
(noPageRec.set(x.data[0], true), true)))) ||
(x.message === "HTTP link" &&
allowedHTTPSitesRec.has(new URL(x.data[0]).origin) &&
(allowedHTTPSitesRec.set(new URL(x.data[0]).origin, true), true))
(noPageRec.set(x.data[0], true), true))))
)
)
);
Expand All @@ -175,9 +322,3 @@ for (const [url, used] of noPageRec) {
console.error(`${url} is no longer referenced`);
}
}

for (const [site, used] of allowedHTTPSitesRec) {
if (!used) {
console.error(`${site} is no longer referenced in content`);
}
}

0 comments on commit 4239d66

Please sign in to comment.