add ability to specify a specific file to write results to too
pes10k committed Apr 5, 2024
1 parent 0758ba9 commit 15c4403
Showing 8 changed files with 540 additions and 9 deletions.
1 change: 0 additions & 1 deletion .gitignore
@@ -1,4 +1,3 @@
built/
node_modules/
out/
.DS_Store
183 changes: 183 additions & 0 deletions built/brave/crawl.js
@@ -0,0 +1,183 @@
'use strict';
var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
return new (P || (P = Promise))(function (resolve, reject) {
function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
step((generator = generator.apply(thisArg, _arguments || [])).next());
});
};
import * as osLib from 'os';
import fsExtraLib from 'fs-extra';
import pathLib from 'path';
import Xvbf from 'xvfb';
import { getLogger } from './debug.js';
import { puppeteerConfigForArgs, launchWithRetry } from './puppeteer.js';
import { isDir } from './validate.js';
const xvfbPlatforms = new Set(['linux', 'openbsd']);
const setupEnv = (args) => {
const logger = getLogger(args);
const platformName = osLib.platform();
let closeFunc;
if (args.interactive) {
logger.debug('Interactive mode, skipping Xvfb');
closeFunc = () => { };
}
else if (xvfbPlatforms.has(platformName)) {
logger.debug(`Running on ${platformName}, starting Xvfb`);
const xvfbHandle = new Xvbf({
// ensure 24-bit color depth or rendering might choke
xvfb_args: ['-screen', '0', '1024x768x24']
});
xvfbHandle.startSync();
closeFunc = () => {
logger.debug('Tearing down Xvfb');
xvfbHandle.stopSync();
};
}
else {
logger.debug(`Running on ${platformName}, Xvfb not supported`);
closeFunc = () => { };
}
return {
close: closeFunc
};
};
function generatePageGraph(seconds, page, client, logger) {
return __awaiter(this, void 0, void 0, function* () {
const waitTimeMs = seconds * 1000;
logger.debug(`Waiting for ${waitTimeMs}ms`);
yield page.waitFor(waitTimeMs);
logger.debug('calling generatePageGraph');
const response = yield client.send('Page.generatePageGraph');
logger.debug(`generatePageGraph { size: ${response.data.length} }`);
return response;
});
}
function createFilename(url) {
return `page_graph_${url === null || url === void 0 ? void 0 : url.replace(/[^\w]/g, '_')}_${Math.floor(Date.now() / 1000)}.graphml`;
}
function writeToFile(args, url, response, logger) {
return __awaiter(this, void 0, void 0, function* () {
const outputFilename = isDir(args.outputPath)
? pathLib.join(args.outputPath, createFilename(url))
: args.outputPath;
fsExtraLib.writeFile(outputFilename, response.data).catch((err) => {
logger.debug('ERROR saving Page.generatePageGraph output:', err);
});
});
}
export const doCrawl = (args) => __awaiter(void 0, void 0, void 0, function* () {
const logger = getLogger(args);
const url = args.urls[0];
const depth = args.recursiveDepth || 1;
let randomChildUrl = null;
let redirectedUrl = null;
const { puppeteerArgs, pathForProfile, shouldClean } = puppeteerConfigForArgs(args);
const envHandle = setupEnv(args);
try {
logger.debug('Launching puppeteer with args: ', puppeteerArgs);
const browser = yield launchWithRetry(puppeteerArgs, logger);
try {
// create new page, update UA if needed, navigate to target URL, and wait for idle time
const page = yield browser.newPage();
const client = yield page.target().createCDPSession();
client.on('Target.targetCrashed', (event) => {
logger.debug(`ERROR Target.targetCrashed { targetId: ${event.targetId}, status: "${event.status}", errorCode: ${event.errorCode} }`);
throw new Error(event.status);
});
if (args.userAgent) {
yield page.setUserAgent(args.userAgent);
}
yield page.setRequestInterception(true);
// First load is not a navigation redirect, so we need to skip it.
let firstLoad = true;
page.on('request', (request) => __awaiter(void 0, void 0, void 0, function* () {
// Only capture parent frame navigation requests.
logger.debug(`Request intercepted: ${request.url()}, first load: ${firstLoad}`);
if (!firstLoad && request.isNavigationRequest() && request.frame() !== null && request.frame().parentFrame() === null) {
logger.debug('Page is redirecting...');
redirectedUrl = request.url();
// Stop page load
logger.debug(`Stopping page load of ${url}`);
yield page._client.send('Page.stopLoading');
}
firstLoad = false;
request.continue();
}));
logger.debug(`Navigating to ${url}`);
yield page.goto(url, { waitUntil: 'load' });
logger.debug(`Loaded ${url}`);
const response = yield generatePageGraph(args.seconds, page, client, logger);
writeToFile(args, url, response, logger);
if (depth > 1) {
randomChildUrl = yield getRandomLinkFromPage(page, logger);
}
logger.debug('Closing page');
yield page.close();
}
catch (err) {
logger.debug('ERROR runtime fiasco from browser/page:', err);
}
finally {
logger.debug('Closing the browser');
yield browser.close();
}
}
catch (err) {
logger.debug('ERROR runtime fiasco from infrastructure:', err);
}
finally {
envHandle.close();
if (shouldClean) {
fsExtraLib.remove(pathForProfile);
}
}
if (redirectedUrl) {
const newArgs = Object.assign({}, args);
newArgs.urls = [redirectedUrl];
logger.debug(`Doing new crawl with redirected URL: ${redirectedUrl}`);
yield doCrawl(newArgs);
}
if (randomChildUrl) {
const newArgs = Object.assign({}, args);
newArgs.urls = [randomChildUrl];
newArgs.recursiveDepth = depth - 1;
yield doCrawl(newArgs);
}
});
const getRandomLinkFromPage = (page, logger) => __awaiter(void 0, void 0, void 0, function* () {
let rawLinks;
try {
rawLinks = yield page.$$('a[href]');
}
catch (e) {
logger.debug(`Unable to look for child links, page closed: ${e.toString()}`);
return null;
}
const links = [];
for (const link of rawLinks) {
const hrefHandle = yield link.getProperty('href');
const hrefValue = yield hrefHandle.jsonValue();
try {
const hrefUrl = new URL(hrefValue.trim());
hrefUrl.hash = '';
hrefUrl.search = '';
if (hrefUrl.protocol !== 'http:' && hrefUrl.protocol !== 'https:') {
continue;
}
const childUrlString = hrefUrl.toString();
if (!childUrlString || childUrlString.length === 0) {
continue;
}
links.push(childUrlString);
}
catch (_) {
continue;
}
}
// https://stackoverflow.com/a/4550514
const randomLink = links[Math.floor(Math.random() * links.length)];
return randomLink;
});
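For reference, a minimal sketch of invoking the crawl entry point above with the new output behavior. The args field names (urls, outputPath, seconds, recursiveDepth, debugLevel, executablePath, interactive, withShieldsUp) are taken from what the code reads; the concrete values and file paths are assumptions, not part of this commit. Per writeToFile(), an outputPath that is an existing directory gets a generated page_graph_<url>_<timestamp>.graphml filename inside it, while any other path is used verbatim as the output file.

import { doCrawl } from './built/brave/crawl.js';

// Hypothetical invocation; field names mirror what doCrawl reads above.
const args = {
  urls: ['https://example.com'],
  outputPath: '/tmp/example.graphml', // a specific file, not a directory
  seconds: 30,
  recursiveDepth: 1,
  debugLevel: 'debug',
  executablePath: '/usr/bin/brave-browser', // assumed Brave binary location
  interactive: false,
  withShieldsUp: true
};

await doCrawl(args);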
25 changes: 25 additions & 0 deletions built/brave/debug.js
@@ -0,0 +1,25 @@
const nullLogFunc = () => { };
const actualLogFunc = console.log;
const nullLogger = Object.freeze({
debug: nullLogFunc,
verbose: nullLogFunc
});
const debugLogger = Object.freeze({
debug: actualLogFunc,
verbose: nullLogFunc
});
const verboseLogger = Object.freeze({
debug: actualLogFunc,
verbose: actualLogFunc
});
const logLevelToLoggerMap = {
none: nullLogger,
debug: debugLogger,
verbose: verboseLogger
};
export const getLoggerForLevel = (level) => {
return logLevelToLoggerMap[level];
};
export const getLogger = (args) => {
return getLoggerForLevel(args.debugLevel);
};
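A small usage sketch of the logger helpers above, assuming only the three debugLevel values shown ('none', 'debug', 'verbose'); the import path mirrors the built file added in this commit.

import { getLogger, getLoggerForLevel } from './built/brave/debug.js';

// 'debug' routes debug() to console.log but keeps verbose() a no-op;
// 'verbose' enables both, and 'none' silences everything.
const logger = getLoggerForLevel('debug');
logger.debug('this prints');
logger.verbose('this does not');

// getLogger() simply reads the level off a crawl args object.
const sameLogger = getLogger({ debugLevel: 'debug' });
sameLogger.debug('same frozen logger object as above');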
98 changes: 98 additions & 0 deletions built/brave/puppeteer.js
@@ -0,0 +1,98 @@
var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
return new (P || (P = Promise))(function (resolve, reject) {
function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
step((generator = generator.apply(thisArg, _arguments || [])).next());
});
};
import * as pathLib from 'path';
import fsExtraLib from 'fs-extra';
import tmpLib from 'tmp';
import puppeteerLib from 'puppeteer-core';
import { getLogger } from './debug.js';
const profilePathForArgs = (args) => {
const logger = getLogger(args);
// The easiest case is if we've been told to use an existing profile.
// In this case, just return the given path.
if (args.existingProfilePath) {
logger.debug(`Crawling with profile at ${args.existingProfilePath}.`);
return { path: args.existingProfilePath, shouldClean: false };
}
// Next, figure out which existing profile we're going to use as the
// template / starter profile for the new crawl.
const resourcesDirPath = pathLib.join(process.cwd(), 'resources');
const templateProfile = args.withShieldsUp
? pathLib.join(resourcesDirPath, 'shields-up-profile')
: pathLib.join(resourcesDirPath, 'shields-down-profile');
// Finally, either copy the above profile to the destination path
// that was specified, or figure out a temporary location for it.
const destProfilePath = args.persistProfilePath
? args.persistProfilePath
: tmpLib.dirSync({ prefix: 'pagegraph-profile-' }).name;
const shouldClean = !args.persistProfilePath;
fsExtraLib.copySync(templateProfile, destProfilePath);
logger.debug(`Crawling with profile at ${destProfilePath}.`);
return { path: destProfilePath, shouldClean };
};
export const puppeteerConfigForArgs = (args) => {
const { path: pathForProfile, shouldClean } = profilePathForArgs(args);
process.env.PAGEGRAPH_OUT_DIR = args.outputPath;
const puppeteerArgs = {
defaultViewport: null,
args: [
'--disable-brave-update',
'--user-data-dir=' + pathForProfile,
'--disable-site-isolation-trials',
'--enable-features=PageGraph'
],
executablePath: args.executablePath,
ignoreDefaultArgs: [
'--disable-sync'
],
dumpio: args.debugLevel !== 'none',
headless: false
};
if (args.debugLevel === 'debug') {
puppeteerArgs.args.push('--enable-logging=stderr');
puppeteerArgs.args.push('--vmodule=page_graph*=1');
}
else if (args.debugLevel === 'verbose') {
puppeteerArgs.args.push('--enable-logging=stderr');
puppeteerArgs.args.push('--vmodule=page_graph*=2');
}
if (args.proxyServer) {
puppeteerArgs.args.push(`--proxy-server=${args.proxyServer.toString()}`);
if (args.proxyServer.protocol === 'socks5') {
puppeteerArgs.args.push(`--host-resolver-rules=MAP * ~NOTFOUND , EXCLUDE ${args.proxyServer.hostname}`);
}
}
if (args.extraArgs) {
puppeteerArgs.args.push(...args.extraArgs);
}
return { puppeteerArgs, pathForProfile, shouldClean };
};
const asyncSleep = (millis) => {
return new Promise(resolve => setTimeout(resolve, millis));
};
export const launchWithRetry = (puppeteerArgs, logger, options) => __awaiter(void 0, void 0, void 0, function* () {
// default to 3 retries with a base-2 exponential-backoff delay between each retry (1s, 2s, 4s, ...)
const { retries = 3, computeTimeout = (tryIndex) => Math.pow(2, tryIndex - 1) * 1000 } = options || {};
try {
return yield puppeteerLib.launch(puppeteerArgs);
}
catch (err) {
logger.debug(`Failed to launch browser (${err}): ${retries} left...`);
}
for (let i = 1; i <= retries; ++i) {
yield asyncSleep(computeTimeout(i));
try {
return yield puppeteerLib.launch(puppeteerArgs);
}
catch (err) {
logger.debug(`Failed to launch browser (${err}): ${retries - i} left...`);
}
}
throw new Error(`Unable to launch browser after ${retries} retries!`);
});
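A sketch of how the two exports above compose, assuming an args object with the fields puppeteerConfigForArgs reads (executablePath, outputPath, debugLevel, withShieldsUp, plus the optional profile/proxy options); the values below are placeholders, and the retry options shown are just the defaults made explicit.

import fsExtraLib from 'fs-extra';
import { puppeteerConfigForArgs, launchWithRetry } from './built/brave/puppeteer.js';
import { getLogger } from './built/brave/debug.js';

const args = {
  executablePath: '/usr/bin/brave-browser', // assumed Brave binary location
  outputPath: '/tmp/graphs',                // exported as PAGEGRAPH_OUT_DIR
  debugLevel: 'debug',
  withShieldsUp: true
};

const logger = getLogger(args);
const { puppeteerArgs, pathForProfile, shouldClean } = puppeteerConfigForArgs(args);

// Up to 3 retries with 1s/2s/4s backoff -- the same defaults used when no
// options object is passed.
const browser = await launchWithRetry(puppeteerArgs, logger, {
  retries: 3,
  computeTimeout: (tryIndex) => Math.pow(2, tryIndex - 1) * 1000
});

// ... crawl ...
await browser.close();
if (shouldClean) {
  await fsExtraLib.remove(pathForProfile); // discard the temporary copied profile
}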