-
Notifications
You must be signed in to change notification settings - Fork 9
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
add ability to specify a specific file to write results to too
- Loading branch information
Showing
8 changed files
with
540 additions
and
9 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,4 +1,3 @@ | ||
built/ | ||
node_modules/ | ||
out/ | ||
.DS_Store | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,183 @@ | ||
'use strict'; | ||
// Helper that drives a generator function as an asynchronous routine: every
// yielded value is wrapped in a promise, and its settlement is fed back into
// the generator through next()/throw(). An already-present `this.__awaiter`
// is reused when available.
//
// thisArg / _arguments: receiver and argument list used to start the generator.
// P: optional promise constructor; the global Promise is used when omitted.
// generator: the generator function holding the routine's body.
// Returns a promise settled with the generator's return value or thrown error.
var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
    const PromiseImpl = P || Promise;
    // Pass promises of the chosen implementation through untouched; wrap anything else.
    const adopt = (value) => (value instanceof PromiseImpl
        ? value
        : new PromiseImpl((resolve) => { resolve(value); }));
    return new PromiseImpl((resolve, reject) => {
        const iterator = generator.apply(thisArg, _arguments || []);
        const settleOrChain = (result) => {
            if (result.done) {
                resolve(result.value);
            }
            else {
                adopt(result.value).then(onFulfilled, onRejected);
            }
        };
        const onFulfilled = (value) => {
            try {
                settleOrChain(iterator.next(value));
            }
            catch (e) {
                reject(e);
            }
        };
        const onRejected = (reason) => {
            try {
                settleOrChain(iterator['throw'](reason));
            }
            catch (e) {
                reject(e);
            }
        };
        // Kick off the generator; an exception here rejects via the executor.
        settleOrChain(iterator.next());
    });
};
import * as osLib from 'os'; | ||
import fsExtraLib from 'fs-extra'; | ||
import pathLib from 'path'; | ||
import Xvbf from 'xvfb'; | ||
import { getLogger } from './debug.js'; | ||
import { puppeteerConfigForArgs, launchWithRetry } from './puppeteer.js'; | ||
import { isDir } from './validate.js'; | ||
// Platforms (as reported by os.platform()) on which an Xvfb display is started.
const xvfbPlatforms = new Set(['linux', 'openbsd']);
/**
 * Prepares the display environment for a crawl.
 *
 * Xvfb is started only when the crawl is not interactive and the current
 * platform is listed in `xvfbPlatforms`; otherwise nothing is started.
 *
 * @param {object} args - crawl arguments; `args.interactive` is read here and
 *   the whole object is handed to `getLogger`.
 * @returns {{close: Function}} handle whose `close()` stops Xvfb if it was
 *   started, and is a no-op otherwise.
 */
const setupEnv = (args) => {
    const logger = getLogger(args);
    const platformName = osLib.platform();
    const nothingToClose = () => { };
    if (args.interactive) {
        logger.debug('Interactive mode, skipping Xvfb');
        return { close: nothingToClose };
    }
    if (!xvfbPlatforms.has(platformName)) {
        logger.debug(`Running on ${platformName}, Xvfb not supported`);
        return { close: nothingToClose };
    }
    logger.debug(`Running on ${platformName}, starting Xvfb`);
    const xvfbHandle = new Xvbf({
        // ensure 24-bit color depth or rendering might choke
        xvfb_args: ['-screen', '0', '1024x768x24']
    });
    xvfbHandle.startSync();
    const stopXvfb = () => {
        logger.debug('Tearing down Xvfb');
        xvfbHandle.stopSync();
    };
    return { close: stopXvfb };
};
/**
 * Waits for the requested settle time, then asks the browser for the page's
 * PageGraph data over the given CDP session.
 *
 * The wait uses a plain timer rather than the deprecated `page.waitFor(ms)`,
 * so it does not depend on which Puppeteer release is installed.
 *
 * @param {number} seconds - how long to wait before requesting the graph.
 * @param {object} page - the page being crawled; kept for signature
 *   compatibility, no longer used for the wait.
 * @param {object} client - object exposing `send(method)`; called with
 *   'Page.generatePageGraph'.
 * @param {object} logger - object exposing `debug(...)`.
 * @returns {Promise<object>} the response returned by `client.send`; its
 *   `data.length` is read for logging, so `data` must be present.
 */
async function generatePageGraph(seconds, page, client, logger) {
    const waitTimeMs = seconds * 1000;
    logger.debug(`Waiting for ${waitTimeMs}ms`);
    await new Promise((resolve) => setTimeout(resolve, waitTimeMs));
    logger.debug('calling generatePageGraph');
    const response = await client.send('Page.generatePageGraph');
    logger.debug(`generatePageGraph { size: ${response.data.length} }`);
    return response;
}
/**
 * Builds the output file name for a crawled URL.
 *
 * Every character of the URL outside [A-Za-z0-9_] becomes '_', and the
 * current Unix time in whole seconds is appended before the `.graphml`
 * extension. A null/undefined URL is rendered as the text "undefined".
 *
 * @param {string} [url] - URL the graph was generated for.
 * @returns {string} file name of the form `page_graph_<url>_<seconds>.graphml`.
 */
function createFilename(url) {
    const sanitizedUrl = (url === null || url === undefined)
        ? undefined
        : url.replace(/[^\w]/g, '_');
    const epochSeconds = Math.floor(Date.now() / 1000);
    return `page_graph_${sanitizedUrl}_${epochSeconds}.graphml`;
}
/**
 * Saves the data of a Page.generatePageGraph response to disk.
 *
 * When `args.outputPath` is a directory (according to `isDir`), a file name
 * derived from `url` is created inside it; otherwise `args.outputPath` is
 * used as the target file itself.
 *
 * The write is awaited, so the returned promise settles only once the file
 * has been written or the failure has been logged. Write errors are logged
 * through `logger.debug` and are not rethrown.
 *
 * @param {object} args - crawl arguments; only `outputPath` is read.
 * @param {string} url - URL the graph belongs to (used for the file name).
 * @param {object} response - response object whose `data` is written.
 * @param {object} logger - object exposing `debug(...)`.
 * @returns {Promise<void>}
 */
async function writeToFile(args, url, response, logger) {
    const outputFilename = isDir(args.outputPath)
        ? pathLib.join(args.outputPath, createFilename(url))
        : args.outputPath;
    try {
        await fsExtraLib.writeFile(outputFilename, response.data);
    }
    catch (err) {
        logger.debug('ERROR saving Page.generatePageGraph output:', err);
    }
}
/**
 * Crawls the first URL in `args.urls` and writes its PageGraph output.
 *
 * Steps: build the Puppeteer configuration, set up the display environment,
 * launch the browser, load the page, request the page graph and write it
 * out. Errors from the browser/page and from the launch infrastructure are
 * logged and swallowed; the browser, the display environment and (when
 * `shouldClean` is set) the profile directory are always released.
 *
 * Follow-up crawls are then run sequentially:
 *  - if a top-frame navigation was intercepted after the first request, the
 *    redirected URL is crawled with otherwise identical arguments;
 *  - if `args.recursiveDepth` is greater than 1, a random link from the page
 *    is crawled with the depth reduced by one.
 *
 * @param {object} args - crawl arguments. Read here: `urls`, `recursiveDepth`,
 *   `userAgent`, `seconds`; the object is also passed to the helper functions.
 * @returns {Promise<void>} resolves once this crawl and any follow-up crawls
 *   have finished.
 */
export const doCrawl = async (args) => {
    const logger = getLogger(args);
    const url = args.urls[0];
    const depth = args.recursiveDepth || 1;
    let randomChildUrl = null;
    let redirectedUrl = null;
    const { puppeteerArgs, pathForProfile, shouldClean } = puppeteerConfigForArgs(args);
    const envHandle = setupEnv(args);
    try {
        logger.debug('Launching puppeteer with args: ', puppeteerArgs);
        const browser = await launchWithRetry(puppeteerArgs, logger);
        try {
            // create new page, update UA if needed, navigate to target URL, and wait for idle time
            const page = await browser.newPage();
            const client = await page.target().createCDPSession();
            client.on('Target.targetCrashed', (event) => {
                logger.debug(`ERROR Target.targetCrashed { targetId: ${event.targetId}, status: "${event.status}", errorCode: ${event.errorCode} }`);
                // NOTE(review): this throws from inside an event listener, so it is
                // presumably not caught by the surrounding try/catch - confirm intent.
                throw new Error(event.status);
            });
            if (args.userAgent) {
                await page.setUserAgent(args.userAgent);
            }
            await page.setRequestInterception(true);
            // First load is not a navigation redirect, so we need to skip it.
            let firstLoad = true;
            page.on('request', async (request) => {
                // Only capture parent frame navigation requests.
                logger.debug(`Request intercepted: ${request.url()}, first load: ${firstLoad}`);
                try {
                    const isTopFrameNavigation = request.isNavigationRequest() &&
                        request.frame() !== null &&
                        request.frame().parentFrame() === null;
                    if (!firstLoad && isTopFrameNavigation) {
                        logger.debug('Page is redirecting...');
                        redirectedUrl = request.url();
                        // Stop page load
                        logger.debug(`Stopping page load of ${url}`);
                        await page._client.send('Page.stopLoading');
                    }
                    firstLoad = false;
                    // Awaited inside the try so that a failure while resuming the
                    // request is logged instead of becoming an unhandled rejection.
                    await request.continue();
                }
                catch (err) {
                    logger.debug('ERROR handling intercepted request:', err);
                }
            });
            logger.debug(`Navigating to ${url}`);
            await page.goto(url, { waitUntil: 'load' });
            logger.debug(`Loaded ${url}`);
            const response = await generatePageGraph(args.seconds, page, client, logger);
            // Wait for the write so its outcome is known before the page and
            // browser are torn down and before any follow-up crawl starts.
            await writeToFile(args, url, response, logger);
            if (depth > 1) {
                randomChildUrl = await getRandomLinkFromPage(page, logger);
            }
            logger.debug('Closing page');
            await page.close();
        }
        catch (err) {
            logger.debug('ERROR runtime fiasco from browser/page:', err);
        }
        finally {
            logger.debug('Closing the browser');
            await browser.close();
        }
    }
    catch (err) {
        logger.debug('ERROR runtime fiasco from infrastructure:', err);
    }
    finally {
        envHandle.close();
        if (shouldClean) {
            // Cleanup is best effort: wait for it, but only log a failure.
            try {
                await fsExtraLib.remove(pathForProfile);
            }
            catch (err) {
                logger.debug('ERROR removing profile directory:', err);
            }
        }
    }
    if (redirectedUrl) {
        const redirectArgs = Object.assign({}, args, { urls: [redirectedUrl] });
        logger.debug(`Doing new crawl with redirected URL: ${redirectedUrl}`);
        await doCrawl(redirectArgs);
    }
    if (randomChildUrl) {
        const childArgs = Object.assign({}, args, {
            urls: [randomChildUrl],
            recursiveDepth: depth - 1
        });
        await doCrawl(childArgs);
    }
};
/**
 * Picks one random http(s) link target from the anchors on a page.
 *
 * Each `a[href]` element's `href` is trimmed and parsed as a URL; values
 * that do not parse, or whose protocol is neither http nor https, are
 * skipped. The query string and fragment are removed from the candidates.
 *
 * @param {object} page - object exposing `$$(selector)`, which resolves to
 *   element handles offering `getProperty('href')`.
 * @param {object} logger - object exposing `debug(...)`.
 * @returns {Promise<string|null>} a randomly chosen link, or `null` when the
 *   anchors could not be queried or no usable link was found.
 */
const getRandomLinkFromPage = async (page, logger) => {
    let rawLinks;
    try {
        rawLinks = await page.$$('a[href]');
    }
    catch (e) {
        logger.debug(`Unable to look for child links, page closed: ${e.toString()}`);
        return null;
    }
    const links = [];
    for (const link of rawLinks) {
        const hrefHandle = await link.getProperty('href');
        const hrefValue = await hrefHandle.jsonValue();
        let hrefUrl;
        try {
            hrefUrl = new URL(hrefValue.trim());
        }
        catch (_) {
            // Not a string, or not an absolute URL: ignore this anchor.
            continue;
        }
        if (hrefUrl.protocol !== 'http:' && hrefUrl.protocol !== 'https:') {
            continue;
        }
        hrefUrl.hash = '';
        hrefUrl.search = '';
        links.push(hrefUrl.toString());
    }
    // Return null (not undefined) so "nothing found" matches the error path.
    if (links.length === 0) {
        return null;
    }
    // https://stackoverflow.com/a/4550514
    return links[Math.floor(Math.random() * links.length)];
};
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,25 @@ | ||
// Logging sinks: one that discards its arguments and one that prints via console.log.
const nullLogFunc = () => { };
const actualLogFunc = console.log;
// Logger used for level 'none': both channels are silent.
const nullLogger = Object.freeze({
    debug: nullLogFunc,
    verbose: nullLogFunc
});
// Logger used for level 'debug': only the debug channel prints.
const debugLogger = Object.freeze({
    debug: actualLogFunc,
    verbose: nullLogFunc
});
// Logger used for level 'verbose': both channels print.
const verboseLogger = Object.freeze({
    debug: actualLogFunc,
    verbose: actualLogFunc
});
const logLevelToLoggerMap = Object.freeze({
    none: nullLogger,
    debug: debugLogger,
    verbose: verboseLogger
});
/**
 * Returns the logger for a named level.
 *
 * @param {string} level - 'none', 'debug' or 'verbose'.
 * @returns {{debug: Function, verbose: Function}} the matching logger. Any
 *   other value (including a missing level or an inherited property name
 *   such as 'toString') yields the silent logger, so callers can always
 *   invoke `.debug()` / `.verbose()` on the result.
 */
export const getLoggerForLevel = (level) => {
    const isKnownLevel = Object.prototype.hasOwnProperty.call(logLevelToLoggerMap, level);
    return isKnownLevel ? logLevelToLoggerMap[level] : nullLogger;
};
/**
 * Returns the logger selected by `args.debugLevel`.
 *
 * @param {object} args - crawl arguments; only `debugLevel` is read.
 * @returns {{debug: Function, verbose: Function}} see `getLoggerForLevel`.
 */
export const getLogger = (args) => {
    return getLoggerForLevel(args.debugLevel);
};
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,98 @@ | ||
// Helper that drives a generator function as an asynchronous routine: every
// yielded value is wrapped in a promise, and its settlement is fed back into
// the generator through next()/throw(). An already-present `this.__awaiter`
// is reused when available.
//
// thisArg / _arguments: receiver and argument list used to start the generator.
// P: optional promise constructor; the global Promise is used when omitted.
// generator: the generator function holding the routine's body.
// Returns a promise settled with the generator's return value or thrown error.
var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
    const PromiseImpl = P || Promise;
    // Pass promises of the chosen implementation through untouched; wrap anything else.
    const adopt = (value) => (value instanceof PromiseImpl
        ? value
        : new PromiseImpl((resolve) => { resolve(value); }));
    return new PromiseImpl((resolve, reject) => {
        const iterator = generator.apply(thisArg, _arguments || []);
        const settleOrChain = (result) => {
            if (result.done) {
                resolve(result.value);
            }
            else {
                adopt(result.value).then(onFulfilled, onRejected);
            }
        };
        const onFulfilled = (value) => {
            try {
                settleOrChain(iterator.next(value));
            }
            catch (e) {
                reject(e);
            }
        };
        const onRejected = (reason) => {
            try {
                settleOrChain(iterator['throw'](reason));
            }
            catch (e) {
                reject(e);
            }
        };
        // Kick off the generator; an exception here rejects via the executor.
        settleOrChain(iterator.next());
    });
};
import * as pathLib from 'path'; | ||
import fsExtraLib from 'fs-extra'; | ||
import tmpLib from 'tmp'; | ||
import puppeteerLib from 'puppeteer-core'; | ||
import { getLogger } from './debug.js'; | ||
/**
 * Decides which browser profile directory a crawl uses.
 *
 * - With `args.existingProfilePath`, that path is used as is and is never
 *   flagged for cleanup.
 * - Otherwise a template profile from `<cwd>/resources` (shields-up or
 *   shields-down, depending on `args.withShieldsUp`) is copied either to
 *   `args.persistProfilePath` or to a freshly created temporary directory.
 *
 * @param {object} args - crawl arguments.
 * @returns {{path: string, shouldClean: boolean}} the profile path, and
 *   whether it should be cleaned up afterwards (true only when no
 *   `persistProfilePath` was given and a temporary directory was created).
 */
const profilePathForArgs = (args) => {
    const logger = getLogger(args);
    // Simplest case: reuse the profile we were pointed at.
    if (args.existingProfilePath) {
        logger.debug(`Crawling with profile at ${args.existingProfilePath}.`);
        return { path: args.existingProfilePath, shouldClean: false };
    }
    // Choose the starter profile that gets copied for this crawl.
    const templateName = args.withShieldsUp
        ? 'shields-up-profile'
        : 'shields-down-profile';
    const templateProfile = pathLib.join(process.cwd(), 'resources', templateName);
    // Copy into the requested location, or into a throwaway temp directory.
    let destProfilePath = args.persistProfilePath;
    const shouldClean = !destProfilePath;
    if (shouldClean) {
        destProfilePath = tmpLib.dirSync({ prefix: 'pagegraph-profile-' }).name;
    }
    fsExtraLib.copySync(templateProfile, destProfilePath);
    logger.debug(`Crawling with profile at ${destProfilePath}.`);
    return { path: destProfilePath, shouldClean };
};
/**
 * Builds the Puppeteer launch options for a crawl.
 *
 * Side effect: sets `process.env.PAGEGRAPH_OUT_DIR` to `args.outputPath`.
 *
 * @param {object} args - crawl arguments. Read here: `outputPath`,
 *   `executablePath`, `debugLevel`, `proxyServer`, `extraArgs`; the object
 *   is also passed to `profilePathForArgs`.
 * @returns {{puppeteerArgs: object, pathForProfile: string, shouldClean: boolean}}
 *   the launch options plus the profile path and cleanup flag reported by
 *   `profilePathForArgs`.
 */
export const puppeteerConfigForArgs = (args) => {
    const { path: pathForProfile, shouldClean } = profilePathForArgs(args);
    process.env.PAGEGRAPH_OUT_DIR = args.outputPath;
    const puppeteerArgs = {
        defaultViewport: null,
        args: [
            '--disable-brave-update',
            '--user-data-dir=' + pathForProfile,
            '--disable-site-isolation-trials',
            '--enable-features=PageGraph'
        ],
        executablePath: args.executablePath,
        ignoreDefaultArgs: [
            '--disable-sync'
        ],
        dumpio: args.debugLevel !== 'none',
        headless: false
    };
    // 'debug' and 'verbose' differ only in the page_graph vmodule level.
    let pageGraphLogLevel = 0;
    if (args.debugLevel === 'debug') {
        pageGraphLogLevel = 1;
    }
    else if (args.debugLevel === 'verbose') {
        pageGraphLogLevel = 2;
    }
    if (pageGraphLogLevel > 0) {
        puppeteerArgs.args.push('--enable-logging=stderr');
        puppeteerArgs.args.push(`--vmodule=page_graph*=${pageGraphLogLevel}`);
    }
    if (args.proxyServer) {
        puppeteerArgs.args.push(`--proxy-server=${args.proxyServer.toString()}`);
        // URL objects report the protocol with a trailing colon ('socks5:'),
        // so a bare comparison with 'socks5' would never match. Accept both
        // spellings. (proxyServer is presumably a URL instance - TODO confirm.)
        const proxyScheme = String(args.proxyServer.protocol || '')
            .replace(/:$/, '')
            .toLowerCase();
        if (proxyScheme === 'socks5') {
            puppeteerArgs.args.push(`--host-resolver-rules=MAP * ~NOTFOUND , EXCLUDE ${args.proxyServer.hostname}`);
        }
    }
    if (args.extraArgs) {
        puppeteerArgs.args.push(...args.extraArgs);
    }
    return { puppeteerArgs, pathForProfile, shouldClean };
};
/**
 * Promise-based delay.
 *
 * @param {number} millis - delay handed to setTimeout.
 * @returns {Promise<undefined>} resolves (with no value) once the timer fires.
 */
const asyncSleep = (millis) => {
    const scheduleWakeUp = (wake) => {
        setTimeout(wake, millis);
    };
    return new Promise(scheduleWakeUp);
};
/**
 * Launches the browser, retrying when the launch fails.
 *
 * One initial attempt is always made; up to `retries` further attempts
 * follow, each preceded by a delay of `computeTimeout(tryIndex)` ms where
 * `tryIndex` starts at 1. Every failure is reported via `logger.debug`.
 *
 * @param {object} puppeteerArgs - options passed to `puppeteerLib.launch`.
 * @param {object} logger - object exposing `debug(...)`.
 * @param {object} [options]
 * @param {number} [options.retries=3] - number of retries after the first attempt.
 * @param {Function} [options.computeTimeout] - maps the retry index to a delay
 *   in ms; defaults to base-2 exponential backoff (1s, 2s, 4s, ...).
 * @returns {Promise<object>} whatever `puppeteerLib.launch` resolves to.
 * @throws {Error} when every attempt failed; the error's `cause` property
 *   holds the error of the last failed attempt.
 */
export const launchWithRetry = async (puppeteerArgs, logger, options) => {
    const { retries = 3, computeTimeout = (tryIndex) => Math.pow(2, tryIndex - 1) * 1000 } = options || {};
    let lastError;
    let attempt = 0;
    // do/while guarantees the initial attempt even when `retries` is 0.
    do {
        if (attempt > 0) {
            await asyncSleep(computeTimeout(attempt));
        }
        try {
            return await puppeteerLib.launch(puppeteerArgs);
        }
        catch (err) {
            lastError = err;
            logger.debug(`Failed to launch browser (${err}): ${retries - attempt} left...`);
        }
        attempt += 1;
    } while (attempt <= retries);
    const failure = new Error(`Unable to launch browser after ${retries} retries!`);
    // Keep the underlying reason available to callers and in logs.
    failure.cause = lastError;
    throw failure;
};
Oops, something went wrong.