Skip to content

Commit

Permalink
Profiles: Support for running with existing profiles + saving profile…
Browse files Browse the repository at this point in the history
… after a login (#34)

Support for profiles via a mounted .tar.gz and --profile option + improved docs #18

* Support creating profiles via the 'create-login-profile' command, with options for where to save the profile, the username/password, and debug screenshot output. Supports entering the username and password (hidden) on the command line if they are omitted.

* use patched pywb for fix

* bump browsertrix-behaviors to 0.1.0

* README: updates to include better getting started, behaviors and profile reference/examples

* bump version to 0.3.0!
  • Loading branch information
ikreymer authored Apr 10, 2021
1 parent c9f8fe0 commit b59788e
Show file tree
Hide file tree
Showing 8 changed files with 477 additions and 82 deletions.
1 change: 1 addition & 0 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ ADD uwsgi.ini /app/
ADD *.js /app/

RUN ln -s /app/main.js /usr/bin/crawl
RUN ln -s /app/create-login-profile.js /usr/bin/create-login-profile

WORKDIR /crawls

Expand Down
208 changes: 154 additions & 54 deletions README.md

Large diffs are not rendered by default.

51 changes: 32 additions & 19 deletions crawler.js
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ const fetch = require("node-fetch");
const AbortController = require("abort-controller");
const path = require("path");
const fs = require("fs");
const os = require("os");
const Sitemapper = require("sitemapper");
const { v4: uuidv4 } = require("uuid");
const warcio = require("warcio");
Expand Down Expand Up @@ -44,6 +45,7 @@ class Crawler {

this.userAgent = "";
this.behaviorsLogDebug = false;
this.profileDir = fs.mkdtempSync(path.join(os.tmpdir(), "profile-"));

const params = require("yargs")
.usage("browsertrix-crawler [options]")
Expand Down Expand Up @@ -279,6 +281,11 @@ class Crawler {
default: "autoplay,autofetch,siteSpecific",
type: "string",
},

"profile": {
describe: "Path to tar.gz file which will be extracted and used as the browser profile",
type: "string",
},
};
}

Expand Down Expand Up @@ -399,6 +406,10 @@ class Crawler {
argv.statsFilename = path.resolve(argv.cwd, argv.statsFilename);
}

if (argv.profile) {
child_process.execSync("tar xvfz " + argv.profile, {cwd: this.profileDir});
}

return true;
}

Expand All @@ -411,6 +422,7 @@ class Crawler {
"--disable-background-media-suspend",
"--autoplay-policy=no-user-gesture-required",
"--disable-features=IsolateOrigins,site-per-process",
"--disable-popup-blocking"
];
}

Expand All @@ -420,7 +432,9 @@ class Crawler {
headless: this.params.headless,
executablePath: CHROME_PATH,
ignoreHTTPSErrors: true,
args: this.chromeArgs
args: this.chromeArgs,
userDataDir: this.profileDir,
defaultViewport: null,
};
}

Expand All @@ -436,31 +450,30 @@ class Crawler {
process.exit(1);
}
}


// Handle a log message forwarded from the in-page behaviors script.
// "info" messages are always printed as JSON; "debug" (and any
// unrecognized type) is printed only when behaviors debug logging
// was enabled via this.behaviorsLogDebug.
_behaviorLog({data, type}) {
switch (type) {
case "info":
console.log(JSON.stringify(data));
break;

case "debug":
default:
// Suppressed unless debug logging was requested on the command line.
if (this.behaviorsLogDebug) {
console.log("behavior debug: " + JSON.stringify(data));
}
}
}

async crawlPage({page, data}) {
try {
if (this.emulateDevice) {
await page.emulate(this.emulateDevice);
}

if (this.behaviorOpts) {
await page.exposeFunction(BEHAVIOR_LOG_FUNC, ({data, type}) => {
switch (type) {
case "info":
console.log(JSON.stringify(data));
break;

case "debug":
default:
if (this.behaviorsLogDebug) {
console.log("behavior debug: " + JSON.stringify(data));
}
}
});

await page.evaluateOnNewDocument(behaviors + `
self.__bx_behaviors.init(${this.behaviorOpts});
`);
await page.exposeFunction(BEHAVIOR_LOG_FUNC, (logdata) => this._behaviorLog(logdata));
await page.evaluateOnNewDocument(behaviors + `;\nself.__bx_behaviors.init(${this.behaviorOpts});`);
}

// run custom driver here
Expand Down
178 changes: 178 additions & 0 deletions create-login-profile.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,178 @@
#!/usr/bin/env node

const readline = require("readline");
const child_process = require("child_process");

const puppeteer = require("puppeteer-core");
const yargs = require("yargs");

/**
 * Build the yargs option definitions for the create-login-profile command.
 * @returns {Object} map of option name -> yargs option config
 */
function cliOpts() {
  const options = {};

  options.url = {
    describe: "The URL of the login page",
    type: "string",
    demandOption: true,
  };

  options.user = {
    describe: "The username for the login. If not specified, will be prompted",
  };

  options.password = {
    describe: "The password for the login. If not specified, will be prompted (recommended)",
  };

  options.filename = {
    describe: "The filename for the profile tarball",
    default: "/output/profile.tar.gz",
  };

  options.debugScreenshot = {
    describe: "If specified, take a screenshot after login and save as this filename",
  };

  options.headless = {
    describe: "Run in headless mode, otherwise start xvfb",
    type: "boolean",
    default: false,
  };

  return options;
}



/**
 * Interactively log in to a site and save the resulting Chrome profile
 * as a tarball for later reuse by the crawler (via --profile).
 *
 * Flow: optionally start Xvfb, prompt for any missing credentials,
 * launch Chrome, locate and fill the login form, clear the cache, then
 * tar up the profile directory.
 *
 * Exits the process directly: 0 on success, 1 if no login form is found.
 */
async function main() {
  const params = yargs
    .usage("browsertrix-crawler profile [options]")
    .option(cliOpts())
    .argv;

  if (!params.headless) {
    console.log("Launching XVFB");
    // NOTE(review): assumes DISPLAY and GEOMETRY are set in the environment
    // (as in the Docker image); Xvfb receives undefined args otherwise —
    // TODO confirm and fail fast if unset.
    child_process.spawn("Xvfb", [
      process.env.DISPLAY,
      "-listen",
      "tcp",
      "-screen",
      "0",
      process.env.GEOMETRY,
      "-ac",
      "+extension",
      "RANDR"
    ]);
  }

  //await new Promise(resolve => setTimeout(resolve, 2000));

  const args = {
    headless: !!params.headless,
    executablePath: "google-chrome",
    ignoreHTTPSErrors: true,
    args: [
      "--no-xshm",
      "--no-sandbox",
      "--disable-background-media-suspend",
      "--autoplay-policy=no-user-gesture-required",
      "--disable-features=IsolateOrigins,site-per-process",
      // profile is written here, then tarred up at the end
      "--user-data-dir=/tmp/profile"
    ]
  };

  // Prompt interactively for any credentials not given on the command line.
  if (!params.user) {
    params.user = await promptInput("Enter username: ");
  }

  if (!params.password) {
    params.password = await promptInput("Enter password: ", true);
  }

  const browser = await puppeteer.launch(args);

  const page = await browser.newPage();

  const waitUntil = ["load", "networkidle2"];

  // Disable the cache so login-page resources are not baked into the profile.
  await page.setCacheEnabled(false);

  console.log("loading");

  await page.goto(params.url, {waitUntil});

  console.log("loaded");

  let u, p;

  try {
    // Heuristic: locate username/password inputs by common "name" attributes.
    u = await page.waitForXPath("//input[contains(@name, 'user')]");

    p = await page.waitForXPath("//input[contains(@name, 'pass') and @type='password']");

  } catch (e) {
    if (params.debugScreenshot) {
      await page.screenshot({path: params.debugScreenshot});
    }
    console.log("Login form could not be found");
    await page.close();
    // Fix: also shut down the browser so an orphaned Chrome process is not
    // left running after a failed login attempt.
    await browser.close();
    process.exit(1);
    return;
  }

  await u.type(params.user);

  await p.type(params.password);

  // Submit and wait for the post-login navigation; allSettled so a site
  // that logs in without navigating does not hang or reject here.
  await Promise.allSettled([
    p.press("Enter"),
    page.waitForNavigation({waitUntil})
  ]);

  // NOTE(review): page._client is a private puppeteer API and may break on
  // upgrade; used here to purge the browser cache from the saved profile.
  await page._client.send("Network.clearBrowserCache");

  if (params.debugScreenshot) {
    await page.screenshot({path: params.debugScreenshot});
  }

  await browser.close();

  console.log("creating profile");

  // yargs already applies the default, but keep the fallback for safety.
  const profileFilename = params.filename || "/output/profile.tar.gz";

  child_process.execFileSync("tar", ["cvfz", profileFilename, "./"], {cwd: "/tmp/profile"});
  console.log("done");

  process.exit(0);
}

/**
 * Prompt the user for a line of input on the terminal.
 * @param {string} msg - prompt text to display
 * @param {boolean} [hidden=false] - if true, echo asterisks instead of the
 *   typed characters (used for passwords)
 * @returns {Promise<string>} the line the user entered
 */
function promptInput(msg, hidden = false) {
  const rl = readline.createInterface({
    input: process.stdin,
    output: process.stdout
  });

  if (hidden) {
    // from https://stackoverflow.com/a/59727173
    rl.input.on("keypress", () => {
      const typed = rl.line.length;
      // rewind to the start of the input and wipe what was echoed
      readline.moveCursor(rl.output, -typed, 0);
      readline.clearLine(rl.output, 1);
      // re-echo one asterisk per typed character
      for (let i = 0; i < typed; i++) {
        rl.output.write("*");
      }
    });
  }

  return new Promise((resolve) => {
    rl.question(msg, (answer) => {
      rl.close();
      resolve(answer);
    });
  });
}

// Entry point: catch any otherwise-unhandled rejection from main() so the
// process reports the error and exits non-zero instead of emitting an
// unhandled-promise-rejection warning.
main().catch((err) => {
  console.error(err);
  process.exit(1);
});

2 changes: 1 addition & 1 deletion docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ version: '3.5'

services:
crawler:
image: webrecorder/browsertrix-crawler:0.3.0-beta.0
image: webrecorder/browsertrix-crawler:0.3.0
build:
context: ./

Expand Down
6 changes: 3 additions & 3 deletions package.json
Original file line number Diff line number Diff line change
@@ -1,13 +1,13 @@
{
"name": "browsertrix-crawler",
"version": "0.3.0-beta.0",
"version": "0.3.0",
"main": "browsertrix-crawler",
"repository": "https://github.com/webrecorder/browsertrix-crawler",
"author": "Ilya Kreymer <[email protected]>, Webrecorder Software",
"license": "MIT",
"dependencies": {
"abort-controller": "^3.0.0",
"browsertrix-behaviors": "github:webrecorder/browsertrix-behaviors",
"browsertrix-behaviors": "^0.1.0",
"node-fetch": "^2.6.1",
"puppeteer-cluster": "^0.22.0",
"puppeteer-core": "^5.3.1",
Expand All @@ -20,6 +20,6 @@
"eslint-plugin-react": "^7.22.0",
"jest": "^26.6.3",
"md5": "^2.3.0",
"warcio": "^1.4.2"
"warcio": "^1.4.3"
}
}
3 changes: 2 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
pywb>=2.5.0
#pywb>=2.5.0
git+https://github.com/webrecorder/pywb@yt-rules-improve
uwsgi
wacz>=0.2.1
Loading

0 comments on commit b59788e

Please sign in to comment.