Skip to content

Commit

Permalink
BlockRules Fixes (0.4.3) (#75)
Browse files Browse the repository at this point in the history
- blockrules fix: when checking an iframe nav request, match inFrameUrl against the parent iframe, not current one
- blockrules: cleanup, always allow 'pywb.proxy' static files
- logging: when 'debug' logging enabled, log urls blocked and conditional iframe checks from blockrules
- tests: add more complex test for blockrules
- update CHANGES and support info in README
- bump to 0.4.3
  • Loading branch information
ikreymer authored Jul 27, 2021
1 parent f0c5ca1 commit be1ee53
Show file tree
Hide file tree
Showing 6 changed files with 85 additions and 26 deletions.
5 changes: 5 additions & 0 deletions CHANGES.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,10 @@
## CHANGES

v0.4.3
- BlockRules Fixes: When considering the 'inFrameUrl' for a navigation request for an iframe, use URL of parent frame.
- BlockRules Fixes: Always allow pywb proxy scripts.
- Logging: Improved debug logging for block rules (log blocked requests and conditional iframe requests) when 'debug' is set in 'logging'

v0.4.2
- Compose/docs: Build latest image by default, update README to refer to latest image
- Fix typo in `crawler.capturePrefix` that resulted in `directFetchCapture()` always failing
Expand Down
5 changes: 2 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -484,10 +484,9 @@ Then, loading the `http://localhost:8080/wr-net/https://webrecorder.net/` should
Support
-------

Initial support for development of Browsertrix Crawler, was provided by [Kiwix](https://kiwix.org/)
Initial support for development of Browsertrix Crawler was provided by [Kiwix](https://kiwix.org/). The initial functionality for Browsertrix Crawler was developed to support the [zimit](https://github.com/openzim/zimit) project in a collaboration between Webrecorder and Kiwix, and this project has been split off from Zimit into a core component of Webrecorder.

Initial functionality for Browsertrix Crawler was developed to support the [zimit](https://github.com/openzim/zimit) project in a collaboration between
Webrecorder and Kiwix, and this project has been split off from Zimit into a core component of Webrecorder.
Additional support for Browsertrix Crawler, including for the development of the 0.4.x version, has been provided by [Portico](https://www.portico.org/).


License
Expand Down
2 changes: 1 addition & 1 deletion crawler.js
Original file line number Diff line number Diff line change
Expand Up @@ -329,7 +329,7 @@ class Crawler {
await this.initPages();

if (this.params.blockRules && this.params.blockRules.length) {
this.blockRules = new BlockRules(this.params.blockRules, this.captureBasePrefix, this.params.blockMessage);
this.blockRules = new BlockRules(this.params.blockRules, this.captureBasePrefix, this.params.blockMessage, (text) => this.debugLog(text));
}

if (this.params.screencastPort) {
Expand Down
2 changes: 1 addition & 1 deletion package.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"name": "browsertrix-crawler",
"version": "0.4.2",
"version": "0.4.3",
"main": "browsertrix-crawler",
"repository": "https://github.com/webrecorder/browsertrix-crawler",
"author": "Ilya Kreymer <[email protected]>, Webrecorder Software",
Expand Down
32 changes: 32 additions & 0 deletions tests/blockrules.test.js
Original file line number Diff line number Diff line change
Expand Up @@ -130,4 +130,36 @@ test("test block url in frame url", () => {
});


// End-to-end check of several block rules working together: crawl one seed page
// with three rules configured, then inspect the "block-7" CDX for what was and
// was not captured.
// NOTE(review): runCrawl and doesCDXContain are defined outside this hunk;
// presumably they run the crawler for the named collection and search its CDX
// index for a substring -- confirm against the top of this test file.
test("test block rules complex example, block external urls on main frame, but not on youtube", () => {
const config = {
"seeds": [
"https://archiveweb.page/guide/troubleshooting/errors.html",
],
"depth": "0",
"blockRules": [{
// Rule 1 (allowOnly): within frames whose URL matches archiveweb.page,
// only requests matching archiveweb.page or www.youtube.com are allowed.
"url": "(archiveweb.page|www.youtube.com)",
"type": "allowOnly",
"inFrameUrl": "archiveweb.page"
}, {
// Rule 2 (no "type" given, so it is a plain block rule): block this one
// script when requested from a frame whose URL matches archiveweb.page.
"url": "https://archiveweb.page/assets/js/vendor/lunr.min.js",
"inFrameUrl": "archiveweb.page"
}, {
// Rule 3 (allowOnly + frameTextMatch): navigation requests to YouTube embed
// URLs are allowed only when the text checked by the rule matches this
// channelId pattern; otherwise they are blocked.
"url": "https://www.youtube.com/embed/",
"type": "allowOnly",
"frameTextMatch": "(\\\\\"channelId\\\\\":\\\\\"UCOHO8gYUWpDYFWHXmIwE02g\\\\\")"
}],

"combineWARC": true,

// "debug" is included so that block-rule decisions get logged during the crawl
"logging": "stats,debug"
};


runCrawl("block-7", config);

// the explicitly blocked script must not appear in the index
expect(doesCDXContain("block-7", "\"https://archiveweb.page/assets/js/vendor/lunr.min.js\"")).toBe(false);
// a video/mp4 entry must appear -- presumably the embedded YouTube video that
// rule 3 let through; verify against the seed page content
expect(doesCDXContain("block-7", "\"video/mp4\"")).toBe(true);
});



65 changes: 44 additions & 21 deletions util/blockrules.js
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@ const fetch = require("node-fetch");

const RULE_TYPES = ["block", "allowOnly"];

const ALWAYS_ALLOW = ["https://pywb.proxy/", "http://pywb.proxy/"];


// ===========================================================================
class BlockRule
Expand Down Expand Up @@ -37,20 +39,21 @@ ${this.frameTextMatch ? "Frame Text Regex: " + this.frameTextMatch : ""}
// ===========================================================================
class BlockRules
{
constructor(blockRules, blockPutUrl, blockErrMsg) {
constructor(blockRules, blockPutUrl, blockErrMsg, debugLog) {
this.rules = [];
this.blockPutUrl = blockPutUrl;
this.blockErrMsg = blockErrMsg;
this.debugLog = debugLog;
this.putUrlSet = new Set();

for (const ruleData of blockRules) {
this.rules.push(new BlockRule(ruleData));
}

if (this.rules.length) {
console.log("URL Block Rules:\n");
this.debugLog("URL Block Rules:\n");
for (const rule of this.rules) {
console.log(rule.toString());
this.debugLog(rule.toString());
}
}
}
Expand Down Expand Up @@ -79,15 +82,20 @@ class BlockRules
return;
}

// always allow special pywb proxy script
for (const allowUrl of ALWAYS_ALLOW) {
if (url.startsWith(allowUrl)) {
request.continue();
return;
}
}

for (const rule of this.rules) {
const {done, block} = await this.shouldBlock(rule, request);
const {done, block, frameUrl} = await this.shouldBlock(rule, request, url);

if (block) {
//const frameUrl = request.frame().url();
//console.log("Blocking/Aborting Request for: " + request.url());
// not allowed, abort loading this response
request.abort();
await this.recordBlockMsg(request.url());
await this.recordBlockMsg(url, frameUrl);
return;
}
if (done) {
Expand All @@ -98,42 +106,56 @@ class BlockRules
request.continue();
}

async shouldBlock(rule, request) {
const reqUrl = request.url();

async shouldBlock(rule, request, reqUrl) {
const {url, inFrameUrl, frameTextMatch} = rule;

const type = rule.type || "block";
const allowOnly = (type === "allowOnly");

const frameUrl = request.frame().url();
const isNavReq = request.isNavigationRequest();

const frame = request.frame();

let frameUrl = null;

if (isNavReq) {
const parentFrame = frame.parentFrame();
if (parentFrame) {
frameUrl = parentFrame.url();
} else {
frameUrl = frame.url();
}
} else {
frameUrl = frame.url();
}

// ignore initial page
if (frameUrl === "about:blank") {
return {block: false, done: true};
return {block: false, done: true, frameUrl};
}

// not a frame match, skip rule
if (inFrameUrl && !frameUrl.match(inFrameUrl)) {
return {block: false, done: false};
return {block: false, done: false, frameUrl};
}

const urlMatched = (url && reqUrl.match(url));

// if frame text-based rule: if url matched and a frame request
// frame text-based match: only applies to nav requests, never block otherwise
if (frameTextMatch) {
if (!urlMatched || !request.isNavigationRequest()) {
return {block: false, done: false};
if (!urlMatched || !isNavReq) {
return {block: false, done: false, frameUrl};
}

const block = await this.isTextMatch(request, reqUrl, frameTextMatch) ? !allowOnly : allowOnly;
return {block, done: true};
this.debugLog(`iframe ${url} conditionally ${block ? "BLOCKED" : "ALLOWED"}, parent frame ${frameUrl}`);
return {block, done: true, frameUrl};
}

// for non frame text rule, simply match by URL
const block = urlMatched ? !allowOnly : allowOnly;
return {block, done: false};
return {block, done: false, frameUrl};
}

async isTextMatch(request, reqUrl, frameTextMatch) {
Expand All @@ -144,11 +166,13 @@ class BlockRules
return !!text.match(frameTextMatch);

} catch (e) {
console.log(e);
this.debugLog(e);
}
}

async recordBlockMsg(url) {
async recordBlockMsg(url, frameUrl) {
this.debugLog(`URL Blocked/Aborted: ${url} in frame ${frameUrl}`);

if (!this.blockErrMsg || !this.blockPutUrl) {
return;
}
Expand All @@ -162,7 +186,6 @@ class BlockRules
const body = this.blockErrMsg;
const putUrl = new URL(this.blockPutUrl);
putUrl.searchParams.set("url", url);
//console.log("put url", putUrl.href);
await fetch(putUrl.href, {method: "PUT", headers: {"Content-Type": "text/html"}, body});
}
}
Expand Down

0 comments on commit be1ee53

Please sign in to comment.