diff --git a/CHANGES.md b/CHANGES.md index 6a287c15e..539902f1c 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -1,5 +1,10 @@ ## CHANGES +v0.4.3 +- BlockRules Fixes: When considering the 'inFrameUrl' for a navigation request for an iframe, use URL of parent frame. +- BlockRules Fixes: Always allow pywb proxy scripts. +- Logging: Improved debug logging for block rules (log blocked requests and conditional iframe requests) when 'debug' is set in 'logging' + v0.4.2 - Compose/docs: Build latest image by default, update README to refer to latest image - Fix typo in `crawler.capturePrefix` that resulted in `directFetchCapture()` always failing diff --git a/README.md b/README.md index 201b5dda5..272c0b205 100644 --- a/README.md +++ b/README.md @@ -484,10 +484,9 @@ Then, loading the `http://localhost:8080/wr-net/https://webrecorder.net/` should Support ------- -Initial support for development of Browsertrix Crawler, was provided by [Kiwix](https://kiwix.org/) +Initial support for development of Browsertrix Crawler was provided by [Kiwix](https://kiwix.org/). The initial functionality for Browsertrix Crawler was developed to support the [zimit](https://github.com/openzim/zimit) project in a collaboration between Webrecorder and Kiwix, and this project has been split off from Zimit into a core component of Webrecorder. -Initial functionality for Browsertrix Crawler was developed to support the [zimit](https://github.com/openzim/zimit) project in a collaboration between -Webrecorder and Kiwix, and this project has been split off from Zimit into a core component of Webrecorder. +Additional support for Browsertrix Crawler, including for the development of the 0.4.x version, has been provided by [Portico](https://www.portico.org/). 
License diff --git a/crawler.js b/crawler.js index 7064d7634..e0eba5113 100644 --- a/crawler.js +++ b/crawler.js @@ -329,7 +329,7 @@ class Crawler { await this.initPages(); if (this.params.blockRules && this.params.blockRules.length) { - this.blockRules = new BlockRules(this.params.blockRules, this.captureBasePrefix, this.params.blockMessage); + this.blockRules = new BlockRules(this.params.blockRules, this.captureBasePrefix, this.params.blockMessage, (text) => this.debugLog(text)); } if (this.params.screencastPort) { diff --git a/package.json b/package.json index a993c4985..72784308c 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "browsertrix-crawler", - "version": "0.4.2", + "version": "0.4.3", "main": "browsertrix-crawler", "repository": "https://github.com/webrecorder/browsertrix-crawler", "author": "Ilya Kreymer , Webrecorder Software", diff --git a/tests/blockrules.test.js b/tests/blockrules.test.js index a1ba19f39..91eac846d 100644 --- a/tests/blockrules.test.js +++ b/tests/blockrules.test.js @@ -130,4 +130,36 @@ test("test block url in frame url", () => { }); +test("test block rules complex example, block external urls on main frame, but not on youtube", () => { + const config = { + "seeds": [ + "https://archiveweb.page/guide/troubleshooting/errors.html", + ], + "depth": "0", + "blockRules": [{ + "url": "(archiveweb.page|www.youtube.com)", + "type": "allowOnly", + "inFrameUrl": "archiveweb.page" + }, { + "url": "https://archiveweb.page/assets/js/vendor/lunr.min.js", + "inFrameUrl": "archiveweb.page" + }, { + "url": "https://www.youtube.com/embed/", + "type": "allowOnly", + "frameTextMatch": "(\\\\\"channelId\\\\\":\\\\\"UCOHO8gYUWpDYFWHXmIwE02g\\\\\")" + }], + + "combineWARC": true, + + "logging": "stats,debug" + }; + + + runCrawl("block-7", config); + + expect(doesCDXContain("block-7", "\"https://archiveweb.page/assets/js/vendor/lunr.min.js\"")).toBe(false); + expect(doesCDXContain("block-7", "\"video/mp4\"")).toBe(true); +}); + + 
diff --git a/util/blockrules.js b/util/blockrules.js index 77f356c9f..ee5007090 100644 --- a/util/blockrules.js +++ b/util/blockrules.js @@ -2,6 +2,8 @@ const fetch = require("node-fetch"); const RULE_TYPES = ["block", "allowOnly"]; +const ALWAYS_ALLOW = ["https://pywb.proxy/", "http://pywb.proxy/"]; + // =========================================================================== class BlockRule @@ -37,10 +39,11 @@ ${this.frameTextMatch ? "Frame Text Regex: " + this.frameTextMatch : ""} // =========================================================================== class BlockRules { - constructor(blockRules, blockPutUrl, blockErrMsg) { + constructor(blockRules, blockPutUrl, blockErrMsg, debugLog) { this.rules = []; this.blockPutUrl = blockPutUrl; this.blockErrMsg = blockErrMsg; + this.debugLog = debugLog; this.putUrlSet = new Set(); for (const ruleData of blockRules) { @@ -48,9 +51,9 @@ class BlockRules } if (this.rules.length) { - console.log("URL Block Rules:\n"); + this.debugLog("URL Block Rules:\n"); for (const rule of this.rules) { - console.log(rule.toString()); + this.debugLog(rule.toString()); } } } @@ -79,15 +82,20 @@ class BlockRules return; } + // always allow special pywb proxy script + for (const allowUrl of ALWAYS_ALLOW) { + if (url.startsWith(allowUrl)) { + request.continue(); + return; + } + } + for (const rule of this.rules) { - const {done, block} = await this.shouldBlock(rule, request); + const {done, block, frameUrl} = await this.shouldBlock(rule, request, url); if (block) { - //const frameUrl = request.frame().url(); - //console.log("Blocking/Aborting Request for: " + request.url()); - // not allowed, abort loading this response request.abort(); - await this.recordBlockMsg(request.url()); + await this.recordBlockMsg(url, frameUrl); return; } if (done) { @@ -98,24 +106,37 @@ class BlockRules request.continue(); } - async shouldBlock(rule, request) { - const reqUrl = request.url(); - + async shouldBlock(rule, request, reqUrl) { const {url, 
inFrameUrl, frameTextMatch} = rule; const type = rule.type || "block"; const allowOnly = (type === "allowOnly"); - const frameUrl = request.frame().url(); + const isNavReq = request.isNavigationRequest(); + + const frame = request.frame(); + + let frameUrl = null; + + if (isNavReq) { + const parentFrame = frame.parentFrame(); + if (parentFrame) { + frameUrl = parentFrame.url(); + } else { + frameUrl = frame.url(); + } + } else { + frameUrl = frame.url(); + } // ignore initial page if (frameUrl === "about:blank") { - return {block: false, done: true}; + return {block: false, done: true, frameUrl}; } // not a frame match, skip rule if (inFrameUrl && !frameUrl.match(inFrameUrl)) { - return {block: false, done: false}; + return {block: false, done: false, frameUrl}; } const urlMatched = (url && reqUrl.match(url)); @@ -123,17 +144,18 @@ class BlockRules // if frame text-based rule: if url matched and a frame request // frame text-based match: only applies to nav requests, never block otherwise if (frameTextMatch) { - if (!urlMatched || !request.isNavigationRequest()) { - return {block: false, done: false}; + if (!urlMatched || !isNavReq) { + return {block: false, done: false, frameUrl}; } const block = await this.isTextMatch(request, reqUrl, frameTextMatch) ? !allowOnly : allowOnly; - return {block, done: true}; + this.debugLog(`iframe ${url} conditionally ${block ? "BLOCKED" : "ALLOWED"}, parent frame ${frameUrl}`); + return {block, done: true, frameUrl}; } // for non frame text rule, simply match by URL const block = urlMatched ? 
!allowOnly : allowOnly; - return {block, done: false}; + return {block, done: false, frameUrl}; } async isTextMatch(request, reqUrl, frameTextMatch) { @@ -144,11 +166,13 @@ class BlockRules return !!text.match(frameTextMatch); } catch (e) { - console.log(e); + this.debugLog(e); } } - async recordBlockMsg(url) { + async recordBlockMsg(url, frameUrl) { + this.debugLog(`URL Blocked/Aborted: ${url} in frame ${frameUrl}`); + if (!this.blockErrMsg || !this.blockPutUrl) { return; } @@ -162,7 +186,6 @@ class BlockRules const body = this.blockErrMsg; const putUrl = new URL(this.blockPutUrl); putUrl.searchParams.set("url", url); - //console.log("put url", putUrl.href); await fetch(putUrl.href, {method: "PUT", headers: {"Content-Type": "text/html"}, body}); } }