From c5494be653566c4352cea298a9b2d9bac9bb2a4e Mon Sep 17 00:00:00 2001
From: Ilya Kreymer
Date: Tue, 17 Aug 2021 20:54:18 -0700
Subject: [PATCH] Page Resource Block Rules Avoid Duplicate Handlers + Ignore top-level pages + README update (0.4.4) (#81)

* blockrules improvements:
- add await to continue/abort to catch errors, each called only in one place.
- avoid adding multiple interception handlers for the same page to avoid 'request already handled' errors
- disallow blocking full pages via blockRules (should be handled via scope exclusion) and print a warning

* setup: ensure the 'cwd' for the crawl output exists on startup, in case a custom cwd was set.

* scopeType rename:
- rename 'page' -> 'page-spa' to indicate support for hashtag / single-page-app intended usage
- rename 'none' -> 'page' to indicate default single-page-only crawl
- messaging: adjust error message displaying valid scopeTypes

* README: Add additional examples for scope rules, update scopeType param, explain difference between scope rules vs block rules, to better address confusion as per #80

bump to 0.4.4
---
 README.md            |  94 ++++++++++++++++++++++++++++-----
 crawler.js           |   2 +
 package.json         |   2 +-
 requirements.txt     |   2 +-
 tests/scopes.test.js |  12 ++---
 util/blockrules.js   | 121 ++++++++++++++++++++++++++++---------------
 util/seeds.js        |  10 ++--
 yarn.lock            |   7 ++-
 8 files changed, 179 insertions(+), 71 deletions(-)

diff --git a/README.md b/README.md
index 5cd6e12e7..bbe275fcf 100644
--- a/README.md
+++ b/README.md
@@ -218,38 +218,84 @@ Certain settings such scope type, scope includes and excludes, and depth can als
 seeds:
   - url: https://webrecorder.net/
     depth: 1
-    type: "prefix"
+    scopeType: "prefix"
 ```

-### Scope Types
+### Crawl Scope -- Configuring Pages Included or Excluded from a Crawl

 The crawl scope can be configured globally for all seeds, or customized per seed, by specifying the `--scopeType` command-line option or setting the `type` property for each seed.

-The scope controls which linked pages are also included in the crawl.
+There is also a `depth` setting that limits how many pages will be crawled for that seed, while the `limit` option sets the total number of pages crawled from any seed.

-The available types are:
+The scope controls which linked pages are included and which pages are excluded from the crawl.

-- `page` - crawl only this page, but load any links that include different hashtags. Useful for single-page apps that may load different content based on hashtag.
+To make this configuration as simple as possible, there are several predefined scope types. The available types are:
+
+- `page` - crawl only this page and no additional links.
+
+- `page-spa` - crawl only this page, but load any links that include different hashtags. Useful for single-page apps that may load different content based on hashtag.

 - `prefix` - crawl any pages in the same directory, eg. starting from `https://example.com/path/page.html`, crawl anything under `https://example.com/path/` (default)

 - `host` - crawl pages that share the same host.

-- `any` - crawl any and all pages.
+- `any` - crawl any and all pages linked from this page.
+
+- `custom` - crawl based on the `--include` regular expression rules.
+
+
+#### Custom Scope Inclusion Rules
+
+Instead of setting a scope type, it is possible to configure a custom scope by setting the `--include` config to one or more regular expressions.
+If using the YAML config, the `include` field can contain a list of regexes.
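+
+For example, a custom inclusion scope might be configured in YAML as follows (an illustrative sketch; the URLs and regexes are hypothetical, not taken from the test suite):
+
+```
+seeds:
+  - url: https://example.com/docs/index.html
+    include:
+      - example\.com/docs/.*
+      - example\.com/static/.*
+```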
+
+Extracted links that match the regular expression will be considered 'in scope' and included.
+
+#### Custom Scope Exclusion Rules
+
+In addition to the inclusion rules, Browsertrix Crawler supports a separate list of exclusion regexes that, if matched, override and exclude a URL from the crawl.

-- `none` - don't crawl any additional pages besides the seed.
+The exclusion regexes are often used with a custom scope, but could be used with a predefined scopeType as well.

-The `depth` setting also limits how many pages will be crawled for that seed, while the `limit` option sets the total
-number of pages crawled from any seed.
+#### Scope Rule Examples

-### Block Rules
+For example, the following seed will start on `https://example.com/startpage.html` and crawl all pages on the `https://example.com/` domain, except pages that match the regexes `example.com/skip.*` or `example.com/search.*`:

-While scope rules define which pages are to be crawled, it is also possible to block certain URLs in certain pages or frames from being recorded.
+```
+seeds:
+  - url: https://example.com/startpage.html
+    scopeType: "host"
+    exclude:
+      - example.com/skip.*
+      - example.com/search.*
+```
+
+In the following example, the scope include regex will crawl all page URLs that match `example.com/(crawl-this|crawl-that)`,
+but skip URLs that end with 'skip'. For example, `https://example.com/crawl-this/page.html` would be crawled, but `https://example.com/crawl-this/pages/skip` would not be.
+
+```
+seeds:
+  - url: https://example.com/startpage.html
+    include: example.com/(crawl-this|crawl-that)
+    exclude:
+      - skip$
+```

-This is useful for blocking ads or other content that should not be included.
+The `include`, `exclude`, `scopeType` and `depth` settings can be configured per seed, or globally for the entire crawl.

-The block rules can be specified as a list in the `blockRules` field. Each rule can contain one of the following fields:
+The per-seed settings override the per-crawl settings, if any.
+
+See the test suite [tests/scopes.test.js](tests/scopes.test.js) for additional examples of configuring scope inclusion and exclusion rules.
+
+### Page Resource Block Rules
+
+While scope rules define which pages are to be crawled, it is also possible to block page resources, ie. URLs loaded within a page or within an iframe on a page.
+
+For example, this is useful for blocking ads or other content that is loaded within multiple pages but should not be included.
+
+The page resource block rules can be specified as a list in the `blockRules` field. Each rule can contain one of the following fields:

 - `url`: regex for URL to match (required)

@@ -259,17 +305,37 @@ The block rules can be specified as a list in the `blockRules` field. Each rule

 - `frameTextMatch`: if specified, the text of the specified URL is checked for the regex, and the rule applies only if there is an additional match. When specified, this field makes the block rule apply only to frame-level resource, eg. URLs loaded directly in an iframe or top-level frame.
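+
+As a sketch, a rule combining `url` and `frameTextMatch` to conditionally block an embedded iframe might look like the following (the URL and text regexes here are hypothetical):
+
+```
+blockRules:
+  - url: example.com/embed/.*
+    frameTextMatch: '(tracking|advertisement)'
+```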
-For example, a very simple block rule that blocks all URLs from 'googleanalytics.com' can be added with:
+For example, a very simple block rule that blocks all URLs from 'googleanalytics.com' on any page can be added with:

 ```
 blockRules:
   - url: googleanalytics.com
 ```

+To instead block 'googleanalytics.com' only if loaded within pages or iframes that match the regex 'example.com/no-analytics', add:
+
+```
+blockRules:
+  - url: googleanalytics.com
+    inFrameUrl: example.com/no-analytics
+```
+
 For additional examples of block rules, see the [tests/blockrules.test.js](tests/blockrules.test.js) file in the test suite.

 If the `--blockMessage` is also specified, a blocked URL is replaced with the specified message (added as a WARC resource record).

+#### Page Resource Block Rules vs Scope Rules
+
+If it seems confusing which rules should be used, here is a quick way to decide:
+
+- If you'd like to restrict *the pages that are being crawled*, use the crawl scope rules (defined above).
+
+- If you'd like to restrict *parts of a page* that are being loaded, use the page resource block rules described in this section.
+
+The blockRules add a filter to each URL loaded on a page and incur extra overhead. They should only be used in advanced use cases where part of a page needs to be blocked.
+
+These rules cannot be used to prevent entire pages from loading -- use the scope exclusion rules for that. (A warning will be printed if a page resource block rule matches a top-level page.)
+
 ### Custom Warcinfo Fields

diff --git a/crawler.js b/crawler.js
index e0eba5113..32abcbbef 100644
--- a/crawler.js
+++ b/crawler.js
@@ -206,6 +206,8 @@ class Crawler {
   }

   async run() {
+    await fsp.mkdir(this.params.cwd, {recursive: true});
+
     this.bootstrap();

     try {
diff --git a/package.json b/package.json
index 72784308c..584622fc0 100644
--- a/package.json
+++ b/package.json
@@ -1,6 +1,6 @@
 {
   "name": "browsertrix-crawler",
-  "version": "0.4.3",
+  "version": "0.4.4",
   "main": "browsertrix-crawler",
   "repository": "https://github.com/webrecorder/browsertrix-crawler",
   "author": "Ilya Kreymer , Webrecorder Software",
diff --git a/requirements.txt b/requirements.txt
index d5ec473f7..275dd0c52 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,3 +1,3 @@
-pywb>=2.6.0b4
+pywb>=2.6.0
 uwsgi
 wacz>=0.3.1
diff --git a/tests/scopes.test.js b/tests/scopes.test.js
index 7a52acbf6..533958536 100644
--- a/tests/scopes.test.js
+++ b/tests/scopes.test.js
@@ -113,7 +113,7 @@ test("override scope with exclude", async () => {
 seeds:
   - url: https://example.com/1
-    scopeType: page
+    scopeType: page-spa

   - url: https://example.com/subpath/file.html
     scopeType: prefix
@@ -122,10 +122,10 @@ seeds:
     scopeType: any

   - url: https://example.com/3
-    scopeType: none
+    scopeType: page

   - url: https://example.com/4
-    scopeType: none
+    scopeType: page
     exclude: ''

 exclude:
@@ -137,7 +137,7 @@ exclude:
   expect(seeds.length).toEqual(5);
   const excludeRxs = [/\/search\?/, /q\?/];

-  expect(seeds[0].scopeType).toEqual("page");
+  expect(seeds[0].scopeType).toEqual("page-spa");
   expect(seeds[0].url).toEqual("https://example.com/1");
   expect(seeds[0].include).toEqual([/^https?:\/\/example\.com\/1#.+/]);
   expect(seeds[0].exclude).toEqual(excludeRxs);
@@ -152,12 +152,12 @@ exclude:
   expect(seeds[2].include).toEqual([/.*/]);
   expect(seeds[2].exclude).toEqual(excludeRxs);

-  expect(seeds[3].scopeType).toEqual("none");
+  expect(seeds[3].scopeType).toEqual("page");
   expect(seeds[3].url).toEqual("https://example.com/3");
   expect(seeds[3].include).toEqual([]);
   expect(seeds[3].exclude).toEqual(excludeRxs);

-  expect(seeds[4].scopeType).toEqual("none");
+  expect(seeds[4].scopeType).toEqual("page");
   expect(seeds[4].url).toEqual("https://example.com/4");
   expect(seeds[4].include).toEqual([]);
   expect(seeds[4].exclude).toEqual([]);
diff --git a/util/blockrules.js b/util/blockrules.js
index ee5007090..1d17210b7 100644
--- a/util/blockrules.js
+++ b/util/blockrules.js
@@ -4,6 +4,13 @@ const RULE_TYPES = ["block", "allowOnly"];

 const ALWAYS_ALLOW = ["https://pywb.proxy/", "http://pywb.proxy/"];

+const BlockState = {
+  ALLOW: null,
+  BLOCK_PAGE_NAV: "page",
+  BLOCK_IFRAME_NAV: "iframe",
+  BLOCK_OTHER: "resource"
+};
+

 // ===========================================================================
 class BlockRule
@@ -44,7 +51,8 @@ class BlockRules
     this.blockPutUrl = blockPutUrl;
     this.blockErrMsg = blockErrMsg;
     this.debugLog = debugLog;
-    this.putUrlSet = new Set();
+
+    this.blockedUrlSet = new Set();

     for (const ruleData of blockRules) {
       this.rules.push(new BlockRule(ruleData));
@@ -63,6 +71,12 @@ class BlockRules
       return;
     }

+    if (page._btrix_interceptionAdded) {
+      return true;
+    }
+
+    page._btrix_interceptionAdded = true;
+
     await page.setRequestInterception(true);

     page.on("request", async (request) => {
@@ -77,66 +91,89 @@ class BlockRules
   async handleRequest(request) {
     const url = request.url();

-    if (!url.startsWith("http:") && !url.startsWith("https:")) {
-      request.continue();
-      return;
-    }
+    let blockState;

-    // always allow special pywb proxy script
-    for (const allowUrl of ALWAYS_ALLOW) {
-      if (url.startsWith(allowUrl)) {
-        request.continue();
-        return;
-      }
-    }
-
-    for (const rule of this.rules) {
-      const {done, block, frameUrl} = await this.shouldBlock(rule, request, url);
+    try {
+      blockState = await this.shouldBlock(request, url);

-      if (block) {
-        request.abort();
-        await this.recordBlockMsg(url, frameUrl);
-        return;
-      }
-      if (done) {
-        break;
+      if (blockState === BlockState.ALLOW) {
+        await request.continue();
+      } else {
+        await request.abort("blockedbyclient");
       }
-    }

-    request.continue();
+    } catch (e) {
+      this.debugLog(`Block: (${blockState}) Failed On: ${url} Reason: ${e}`);
+    }
   }

-  async shouldBlock(rule, request, reqUrl) {
-    const {url, inFrameUrl, frameTextMatch} = rule;
-
-    const type = rule.type || "block";
-    const allowOnly = (type === "allowOnly");
+  async shouldBlock(request, url) {
+    if (!url.startsWith("http:") && !url.startsWith("https:")) {
+      return BlockState.ALLOW;
+    }

     const isNavReq = request.isNavigationRequest();

     const frame = request.frame();

-    let frameUrl = null;
+    let frameUrl = "";
+    let blockState;

     if (isNavReq) {
       const parentFrame = frame.parentFrame();
       if (parentFrame) {
         frameUrl = parentFrame.url();
+        blockState = BlockState.BLOCK_IFRAME_NAV;
       } else {
         frameUrl = frame.url();
+        blockState = BlockState.BLOCK_PAGE_NAV;
       }
     } else {
-      frameUrl = frame.url();
+      frameUrl = frame ? frame.url() : "";
+      blockState = BlockState.BLOCK_OTHER;
     }

     // ignore initial page
     if (frameUrl === "about:blank") {
-      return {block: false, done: true, frameUrl};
+      return BlockState.ALLOW;
+    }
+
+    // always allow special pywb proxy script
+    for (const allowUrl of ALWAYS_ALLOW) {
+      if (url.startsWith(allowUrl)) {
+        return BlockState.ALLOW;
+      }
+    }
+
+    for (const rule of this.rules) {
+      const {done, block} = await this.ruleCheck(rule, request, url, frameUrl, isNavReq);
+
+      if (block) {
+        if (blockState === BlockState.BLOCK_PAGE_NAV) {
+          console.warn(`Warning: Block rule match for page request "${url}" ignored, set --exclude to block full pages`);
+          return BlockState.ALLOW;
+        }
+        this.debugLog(`URL Blocked/Aborted: ${url} in frame ${frameUrl}`);
+        await this.recordBlockMsg(url);
+        return blockState;
+      }
+      if (done) {
+        break;
+      }
     }

+    return BlockState.ALLOW;
+  }
+
+  async ruleCheck(rule, request, reqUrl, frameUrl, isNavReq) {
+    const {url, inFrameUrl, frameTextMatch} = rule;
+
+    const type = rule.type || "block";
+    const allowOnly = (type === "allowOnly");
+
     // not a frame match, skip rule
     if (inFrameUrl && !frameUrl.match(inFrameUrl)) {
-      return {block: false, done: false, frameUrl};
+      return {block: false, done: false};
     }

     const urlMatched = (url && reqUrl.match(url));
@@ -145,17 +182,17 @@ class BlockRules
     // frame text-based match: only applies to nav requests, never block otherwise
     if (frameTextMatch) {
       if (!urlMatched || !isNavReq) {
-        return {block: false, done: false, frameUrl};
+        return {block: false, done: false};
       }

       const block = await this.isTextMatch(request, reqUrl, frameTextMatch) ? !allowOnly : allowOnly;
       this.debugLog(`iframe ${url} conditionally ${block ? "BLOCKED" : "ALLOWED"}, parent frame ${frameUrl}`);
-      return {block, done: true, frameUrl};
+      return {block, done: true};
     }

     // for non frame text rule, simply match by URL
     const block = urlMatched ? !allowOnly : allowOnly;
-    return {block, done: false, frameUrl};
+    return {block, done: false};
   }

   async isTextMatch(request, reqUrl, frameTextMatch) {
@@ -170,19 +207,17 @@ class BlockRules
     }
   }

-  async recordBlockMsg(url, frameUrl) {
-    this.debugLog(`URL Blocked/Aborted: ${url} in frame ${frameUrl}`);
-
-    if (!this.blockErrMsg || !this.blockPutUrl) {
+  async recordBlockMsg(url) {
+    if (this.blockedUrlSet.has(url)) {
       return;
     }

-    if (this.putUrlSet.has(url)) {
+    this.blockedUrlSet.add(url);
+
+    if (!this.blockErrMsg || !this.blockPutUrl) {
       return;
     }

-    this.putUrlSet.add(url);
-
     const body = this.blockErrMsg;
     const putUrl = new URL(this.blockPutUrl);
     putUrl.searchParams.set("url", url);
diff --git a/util/seeds.js b/util/seeds.js
index c48346975..606def1af 100644
--- a/util/seeds.js
+++ b/util/seeds.js
@@ -61,6 +61,10 @@ class ScopedSeed

     switch (scopeType) {
     case "page":
+      include = [];
+      break;
+
+    case "page-spa":
       // allow scheme-agnostic URLS as likely redirects
       include = [new RegExp("^" + rxEscape(parsedUrl.href).replace(parsedUrl.protocol, "https?:") + "#.+")];
       allowHash = true;
@@ -78,12 +82,8 @@ class ScopedSeed
       include = [/.*/];
       break;

-    case "none":
-      include = [];
-      break;
-
     default:
-      throw new Error(`Invalid scope type "${scopeType}" specified, valid types are: page, prefix, host`);
+      throw new Error(`Invalid scope type "${scopeType}" specified, valid types are: page, page-spa, prefix, host, any`);
     }

     return [include, allowHash];
diff --git a/yarn.lock b/yarn.lock
index 862ff4ee2..ddfd789ae 100644
--- a/yarn.lock
+++ b/yarn.lock
@@ -4858,7 +4858,12 @@ write-file-atomic@^3.0.0:
     signal-exit "^3.0.2"
     typedarray-to-buffer "^3.1.5"

-ws@^7.2.3, ws@^7.4.4:
+ws@^7.2.3:
+  version "7.5.3"
+  resolved "https://registry.yarnpkg.com/ws/-/ws-7.5.3.tgz#160835b63c7d97bfab418fc1b8a9fced2ac01a74"
+  integrity sha512-kQ/dHIzuLrS6Je9+uv81ueZomEwH0qVYstcAQ4/Z93K8zeko9gtAbttJWzoC5ukqXY1PpoouV3+VSOqEAFt5wg==
+
+ws@^7.4.4:
   version "7.4.5"
   resolved "https://registry.yarnpkg.com/ws/-/ws-7.4.5.tgz#a484dd851e9beb6fdb420027e3885e8ce48986c1"
   integrity sha512-xzyu3hFvomRfXKH8vOFMU3OguG6oOvhXMo3xsGy3xWExqaM2dxBbVxuD99O7m3ZUFMvvscsZDqxfgMaRr/Nr1g==