-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcrawl.js
106 lines (90 loc) · 2.74 KB
/
crawl.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
import jsdom from "jsdom";
const { JSDOM } = jsdom;
// Site root that crawlPage falls back to when the caller supplies no baseUrl.
const BASE_URL = "https://wagslane.dev";
/**
 * Recursively crawls `currentURL` and every same-host page reachable from it,
 * counting how many times each normalized URL is encountered.
 *
 * @param {object} [options]
 * @param {string} [options.baseUrl=BASE_URL] - Root URL of the site; only URLs
 *   whose hostname matches this one are crawled.
 * @param {string} [options.currentURL=baseUrl] - URL to crawl on this call.
 * @param {Object<string, number>} [options.pages={}] - Accumulator mapping a
 *   normalized URL to the number of times it was seen. Mutated in place.
 * @returns {Promise<Object<string, number>>} The same `pages` accumulator.
 * @throws {TypeError} If `baseUrl` or `currentURL` is not a valid absolute URL.
 */
export const crawlPage = async ({
  baseUrl = BASE_URL,
  currentURL = baseUrl,
  pages = {},
} = {}) => {
  const baseUrlObject = new URL(baseUrl);
  const currentUrlObject = new URL(currentURL);

  if (baseUrlObject.hostname !== currentUrlObject.hostname) {
    // Template literal (not a quoted string) so the URL is actually interpolated.
    console.log(`>>> Not crawling external URL: ${currentURL}`);
    return pages;
  }

  const normalizedUrl = normalizeURL({ url: currentURL });
  if (pages[normalizedUrl] > 0) {
    console.log(`>>> Already crawled ${currentURL}`);
    pages[normalizedUrl] += 1;
    return pages;
  }

  // First visit: register the page before fetching so that links pointing
  // back to it are counted instead of re-crawled.
  pages[normalizedUrl] = 1;

  console.log(`>>> Crawling ${currentURL}`);
  let html;
  try {
    html = await fetchHTML({ currentURL });
  } catch (err) {
    console.log(`${err.message}`);
    return pages;
  }

  // fetchHTML logs its own failures and resolves to undefined rather than
  // throwing, so there is nothing to parse in that case.
  if (typeof html !== "string") {
    return pages;
  }

  // Resolve links against the page they were found on: document-relative
  // hrefs ("post/1", "../x") are relative to this page, not to the site root.
  const nextURLs = getURLsFromHTML(html, currentURL);
  for (const nextURL of nextURLs) {
    // Sequential on purpose: every recursive call updates the shared `pages` map.
    await crawlPage({ baseUrl, currentURL: nextURL, pages });
  }
  return pages;
};
/**
 * Downloads the page at `currentURL` and resolves to its body text.
 *
 * Failures are logged rather than thrown: the promise resolves to `undefined`
 * when the response status is not OK, when the content-type is missing or is
 * not `text/html`, or when the request or body read rejects.
 *
 * @param {object} options
 * @param {string} options.currentURL - URL to request.
 * @returns {Promise<string|undefined>} The HTML text, or undefined on failure.
 */
const fetchHTML = async ({ currentURL }) => {
  try {
    const res = await fetch(currentURL);
    const mimeType = res.headers.get("content-type");

    if (!res.ok) {
      console.log(
        `${res.status} Error fetching page: ${currentURL}\n${res.statusText}`,
      );
      return undefined;
    }

    const looksLikeHtml = Boolean(mimeType) && mimeType.includes("text/html");
    if (!looksLikeHtml) {
      console.log(
        `Error: received non-HTML response when fetching: ${currentURL}\nReceived content-type: ${mimeType}`,
      );
      return undefined;
    }

    const body = await res.text();
    console.log(`>>> HTML of ${currentURL}:\n${body}`);
    return body;
  } catch (failure) {
    // Network and body-read errors are reported here; the caller gets undefined.
    console.log(`${failure.message} crawling page: ${currentURL}`);
    return undefined;
  }
};
/**
 * Collects the targets of every `<a href>` in an HTML document as absolute URLs.
 *
 * Anchors without an `href` attribute are ignored. An href that cannot be
 * resolved against `baseURL` is logged and skipped.
 *
 * @param {string} html - Markup to parse.
 * @param {string} baseURL - URL that relative hrefs are resolved against.
 * @returns {string[]} Absolute URLs in document order (duplicates are kept).
 */
export const getURLsFromHTML = (html, baseURL) => {
  const { document } = new JSDOM(html).window;
  const absoluteLinks = [];

  for (const element of document.querySelectorAll("a")) {
    if (!element.hasAttribute("href")) {
      continue;
    }
    const rawHref = element.getAttribute("href");
    try {
      // The URL constructor turns relative hrefs into absolute ones.
      absoluteLinks.push(new URL(rawHref, baseURL).href);
    } catch (err) {
      console.log(`${err.message}: ${rawHref}`);
    }
  }

  return absoluteLinks;
};
/**
 * Drops a single trailing "/" from `url` when there is one; otherwise returns
 * `url` unchanged.
 *
 * @param {string} url
 * @returns {string}
 */
const removeTrailingSlash = (url) => {
  if (!url.endsWith("/")) {
    return url;
  }
  return url.substring(0, url.length - 1);
};
/**
 * Reduces a URL to a canonical `host/path` key so that the same page reached
 * via a different scheme, host letter-case, or trailing slash maps to one entry.
 *
 * The key is built from `host` rather than `hostname` so an explicit
 * non-default port is kept: `http://h:8080/a` and `http://h/a` are different
 * pages and must not share a key. Scheme, query string and fragment are
 * discarded.
 *
 * @param {object} options
 * @param {string} [options.url=""] - Absolute URL to normalize.
 * @returns {string} `host` + `pathname` with one trailing slash removed.
 * @throws {TypeError} If `url` is not a valid absolute URL (including the
 *   empty-string default).
 */
export const normalizeURL = ({ url = "" }) => {
  const { host, pathname } = new URL(url);
  return removeTrailingSlash(`${host}${pathname}`);
};