From 7025c312864417d9ef8591ab239c5c675769fa80 Mon Sep 17 00:00:00 2001 From: Chris Kirk Date: Tue, 6 Feb 2018 17:01:44 -0800 Subject: [PATCH 1/3] Implementing proxy. --- index.js | 1 + src/puppeteer_utils.js | 32 ++++++++++++++++++++++---------- 2 files changed, 23 insertions(+), 10 deletions(-) diff --git a/index.js b/index.js index fabc2513..1b1dfc69 100644 --- a/index.js +++ b/index.js @@ -29,6 +29,7 @@ const defaultOptions = { puppeteerExecutablePath: undefined, puppeteerIgnoreHTTPSErrors: false, publicPath: "/", + proxy: {}, minifyCss: {}, minifyHtml: { collapseBooleanAttributes: true, diff --git a/src/puppeteer_utils.js b/src/puppeteer_utils.js index 753e3081..100ae8a7 100644 --- a/src/puppeteer_utils.js +++ b/src/puppeteer_utils.js @@ -7,19 +7,31 @@ const path = require("path"); const fs = require("fs"); /** - * @param {{page: Page, options: {skipThirdPartyRequests: true}, basePath: string }} opt + * @param {{page: Page, options: {skipThirdPartyRequests: true, proxy: {}}, basePath: string }} opt * @return {Promise} */ -const skipThirdPartyRequests = async opt => { +const handleThirdPartyRequests = async opt => { const { page, options, basePath } = opt; - if (!options.skipThirdPartyRequests) return; + if (!options.skipThirdPartyRequests || !options.proxy) return; await page.setRequestInterception(true); page.on("request", request => { - if (request.url().startsWith(basePath)) { - request.continue(); - } else { - request.abort(); + if (options.proxy) { + for (proxyUrl in options.proxy) { + if (request.url().startsWith(proxyUrl)) { + const requestChanges = {}; + if (typeof options.proxy[proxyUrl] === 'string') { + requestChanges.url = request.url().replace(proxyUrl, options.proxy[proxyUrl]); + } + request.continue(requestChanges); + return; + } + } } + + if (options.skipThirdPartyRequests && !request.url().startsWith(basePath)) + request.abort(); + + request.continue(); }); }; @@ -164,8 +176,8 @@ const crawl = async opt => { try { const page = await browser.newPage(); if (options.viewport) await page.setViewport(options.viewport); - if (options.skipThirdPartyRequests) - await skipThirdPartyRequests({ page, options, basePath }); + if (options.skipThirdPartyRequests || options.proxy) + await handleThirdPartyRequests({ page, options, basePath }); enableLogging({ page, options, @@ -217,7 +229,7 @@ const crawl = async opt => { }); }; -exports.skipThirdPartyRequests = skipThirdPartyRequests; +exports.handleThirdPartyRequests = handleThirdPartyRequests; exports.enableLogging = enableLogging; exports.getLinks = getLinks; exports.crawl = crawl; From 100a6d7141f55e6efe40f5f13ca5fb711c573fb4 Mon Sep 17 00:00:00 2001 From: Chris Kirk Date: Tue, 6 Feb 2018 17:15:31 -0800 Subject: [PATCH 2/3] adding logging and fixing proxy errors. --- src/puppeteer_utils.js | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/puppeteer_utils.js b/src/puppeteer_utils.js index 100ae8a7..0f4bd686 100644 --- a/src/puppeteer_utils.js +++ b/src/puppeteer_utils.js @@ -17,7 +17,9 @@ const handleThirdPartyRequests = async opt => { page.on("request", request => { if (options.proxy) { for (proxyUrl in options.proxy) { + console.log('proxyUrl', proxyUrl); if (request.url().startsWith(proxyUrl)) { + console.log('proxy match!', request.url()); const requestChanges = {}; if (typeof options.proxy[proxyUrl] === 'string') { requestChanges.url = request.url().replace(proxyUrl, options.proxy[proxyUrl]); @@ -28,8 +30,10 @@ const handleThirdPartyRequests = async opt => { } } - if (options.skipThirdPartyRequests && !request.url().startsWith(basePath)) + if (options.skipThirdPartyRequests && !request.url().startsWith(basePath)) { request.abort(); + return; + } request.continue(); }); From 5821dbf8f395c6e42a846ee4ff7570c5f247184c Mon Sep 17 00:00:00 2001 From: Chris Kirk Date: Tue, 6 Feb 2018 17:20:45 -0800 Subject: [PATCH 3/3] removing console logs. --- src/puppeteer_utils.js | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/puppeteer_utils.js b/src/puppeteer_utils.js index 0f4bd686..84dc3ced 100644 --- a/src/puppeteer_utils.js +++ b/src/puppeteer_utils.js @@ -17,9 +17,7 @@ const handleThirdPartyRequests = async opt => { page.on("request", request => { if (options.proxy) { for (proxyUrl in options.proxy) { - console.log('proxyUrl', proxyUrl); if (request.url().startsWith(proxyUrl)) { - console.log('proxy match!', request.url()); const requestChanges = {}; if (typeof options.proxy[proxyUrl] === 'string') { requestChanges.url = request.url().replace(proxyUrl, options.proxy[proxyUrl]);