-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathrenderer.js
630 lines (551 loc) · 22.4 KB
/
renderer.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
/* eslint-disable max-len */
/* eslint-disable no-console */
// 'use strict' not required for modules?;
const puppeteer = require('puppeteer');
const fs = require('fs');
const fsp = require("fs/promises");
const temp = require('temp')
const { finished } = require('stream/promises');
const { promisify } = require('util');
const PuppeteerHar = require('./puppeteer-har');
//const WARCWriter = require('./warcwriter');
const WARCPoster = require('./warcPoster.js');
// Load package.json (path is relative to the current working directory) to find our version:
const packageFileJSON = JSON.parse(fs.readFileSync("package.json"));
const { version: packageVersion } = packageFileJSON;
// Identifier recorded in the extended HAR output as the generating software:
const softwareId = `ukwa/webrender-puppeteer:${packageVersion}`;

// WARC output is handled by WARCPoster. The previous local WARCWriter setup is kept for reference:
//const WARC_OUTPUT_PATH = process.env.WARC_OUTPUT_PATH || '.';
//const WARC_PREFIX = process.env.WARC_PREFIX || 'WEBRENDERED';
//const WARC_INFO = { 'software': softwareId };
//const ww = new WARCWriter(WARC_OUTPUT_PATH, WARC_PREFIX, WARC_INFO);
const ww = new WARCPoster();

// Device descriptors bundled with Puppeteer (used for device emulation):
const devices = puppeteer.devices;
// Log any unhandled promise rejection and then exit with a failure code, so the
// problem is recorded before the process stops.
process.on('unhandledRejection', (error, p) => {
  // A promise can be rejected with any value (including undefined/null), so
  // only read `.message` when the reason actually carries one.
  const hasMessage = error !== null && error !== undefined && error.message !== undefined;
  const detail = hasMessage ? error.message : error;
  console.log('Caught unhandledRejection: ', detail, p);
  process.exit(1);
});
/**
 * Convert a header map into a list of `{ name, value }` pairs.
 * Entries whose value is `undefined` are dropped; every other value is stringified.
 * @param {Object} headers Map of header name to header value.
 * @returns {Array<{name: string, value: string}>} One entry per defined header, in key order.
 */
function headersArray(headers) {
  return Object.entries(headers)
    .filter(([, headerValue]) => headerValue !== undefined)
    .map(([headerName, headerValue]) => ({ name: headerName, value: `${headerValue}` }));
}
/**
 * Intercepts every request made by a target (page, tab, worker, ...) via the
 * DevTools Protocol `Fetch` domain, adds the supplied headers, and lets the
 * request continue.
 *
 * Failures to enable interception or to continue a request are logged rather
 * than thrown.
 *
 * @param target The page/tab/worker target to capture from. Nothing is done if this is falsy.
 * @param {Object} [extraHeaders] Header name/value pairs to add to each request.
 *   A missing value is treated as "no extra headers".
 */
const interceptAllTrafficForPageUsingFetch = async (target, extraHeaders) => {
  if (!target) {
    return;
  }
  // Tolerate a missing header map: otherwise the handler below would throw
  // before continuing the request, leaving that request paused.
  const headersToAdd = extraHeaders || {};
  const client = await target.createCDPSession();

  // see: https://chromedevtools.github.io/devtools-protocol/tot/Fetch#method-enable
  // In rare cases ( https://covid19ukmap.com/ ) this can crash out, so protect against exceptions:
  try {
    await client.send('Fetch.enable');
    console.log(`Sent Fetch.enable, extraHeaders = ${JSON.stringify(extraHeaders)}`);
  } catch (error) {
    console.log('Exception when sending Fetch.enable: ', error.message);
  }

  // see: https://chromedevtools.github.io/devtools-protocol/tot/Fetch#event-requestPaused
  // (the event also carries frameId, resourceType, responseErrorReason,
  // responseStatusCode, responseHeaders and networkId, which are not needed here)
  client.on('Fetch.requestPaused', async ({ requestId, request }) => {
    try {
      // Extra headers override any request header with the same name:
      const headers = { ...request.headers, ...headersToAdd };
      // Continuing the request with the modified headers:
      await client.send('Fetch.continueRequest', {
        requestId,
        headers: headersArray(headers),
      });
    } catch (error) {
      console.log('Exception when sending Fetch.continueRequest: ', error.message);
    }
  });
};
/**
 * Render a URL in the given Puppeteer page and collect the results.
 *
 * Steps: install request interception (to add `extraHeaders` in every target),
 * configure viewport / cache / user agent from environment variables, navigate,
 * optionally run behaviours (modal clicking, scrolling), take a full-page PNG,
 * gather links and clickable regions, optionally cycle through device settings,
 * and, when the WARC poster is enabled, store the HAR, HTML, PNG, PDF and an
 * image-map page.
 *
 * Environment variables read here: SWITCH_DEVICES, RUN_BEHAVIOURS,
 * VIEWPORT_WIDTH, VIEWPORT_HEIGHT, DEVICE_SCALE_FACTOR, USER_AGENT,
 * USER_AGENT_ADDITIONAL.
 *
 * @param page Puppeteer page to render in.
 * @param {string} url The URL to navigate to.
 * @param {Object} extraHeaders Headers to add to every intercepted request.
 * @param {?string} [warcPrefix=null] Passed through to the WARC poster calls.
 * @returns {Promise<Object>} Extended HAR: `{ software, har, urls, cookies, finalPage, renderedViewport }`.
 */
async function render_page(page, url, extraHeaders, warcPrefix=null) {
  console.log("render_page got warcPrefix: " + warcPrefix );

  // Add hook to track activity and modify headers in all contexts (pages, workers, etc.):
  // Note that extraHTTPHeaders means the browser sends headers like:
  // Access-Control-Request-Headers: warcprox-meta
  // which warcprox doesn't block, and confuses the heck out of e.g. Twitter.
  const interceptor = async (target) => {
    await interceptAllTrafficForPageUsingFetch(target, extraHeaders);
  };
  await interceptor(await page.target());
  await page.browser().on('targetcreated', interceptor );
  console.log(`Set up interception for ${url} and extraHeaders ${JSON.stringify(extraHeaders)}.`);

  let harExtended;
  // The try/finally guarantees the browser-level 'targetcreated' listener is
  // removed even if rendering throws part-way through.
  try {
    // Set up some logging of any errors:
    page.on('error', err=> {
      console.log('error happen at the page: ', err);
    });
    page.on('pageerror', pageerr=> {
      console.log('pageerror occurred: ', pageerr);
    });

    // Options for the render process:
    let switchDevices = false;
    if ('SWITCH_DEVICES' in process.env) {
      switchDevices = (process.env.SWITCH_DEVICES.toLowerCase().trim() === 'true');
    }
    console.log(`switchDevices = ${switchDevices}`);
    let runBehaviours = true;
    if ('RUN_BEHAVIOURS' in process.env) {
      runBehaviours = (process.env.RUN_BEHAVIOURS.toLowerCase().trim() === 'true');
    }
    console.log(`runBehaviours = ${runBehaviours}`);

    // Main image width (height defaults to the width scaled by 1.6180):
    const viewportWidth = parseInt(process.env.VIEWPORT_WIDTH, 10) || 1366;
    const viewportHeight = parseInt(process.env.VIEWPORT_HEIGHT, 10) || Math.round(viewportWidth * 1.6180);
    const deviceScaleFactor = parseFloat(process.env.DEVICE_SCALE_FACTOR) || 1.0;

    // Set the page size:
    await page.setViewport({ width: viewportWidth, height: viewportHeight, deviceScaleFactor: deviceScaleFactor });

    // Avoid caching:
    await page.setCacheEnabled(false);

    // Set the default timeout:
    await page.setDefaultNavigationTimeout(60000); // 60 seconds instead of 30

    // Set the user agent up:
    const browserUserAgent = await page.browser().userAgent();
    console.log("Default User-Agent: " + browserUserAgent );

    // Add optional userAgent override. setUserAgent returns a promise, so it is
    // awaited to make sure the override is in place (and any failure surfaces
    // here) before navigation starts.
    if ('USER_AGENT' in process.env) {
      // e.g. 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) \
      // Chrome/37.0.2062.120 Safari/537.36';
      const userAgent = process.env.USER_AGENT.replace('@VERSION@', packageVersion);
      console.log("Setting User-Agent: " + userAgent);
      await page.setUserAgent(userAgent);
    } else if ('USER_AGENT_ADDITIONAL' in process.env) {
      const userAgent = `${browserUserAgent} ${process.env.USER_AGENT_ADDITIONAL}`.replace('@VERSION@', packageVersion);
      console.log("Setting User-Agent: " + userAgent);
      await page.setUserAgent(userAgent);
    }

    // Record requests/responses in a standard format:
    const har = new PuppeteerHar(page);
    await har.start();

    // Go to the page to capture:
    // See https://github.com/GoogleChrome/puppeteer/blob/master/docs/api.md#pagegotourl-options for definitions of networkidle0/2
    console.log(`Navigating to ${url}...`);
    try {
      // Main navigation
      await page.goto(url, { waitUntil: 'networkidle2' }); // Longer timeout set above

      // Run behaviour scripts like scrollers:
      if ( runBehaviours ) {
        console.log(`${url} - Waiting for delayed popups...`);
        await page.waitForTimeout(2*1000);

        // Look for any "I Accept" buttons
        console.log(`${url} - Looking for any modal buttons...`);
        await clickKnownModals(page);

        // Await for any more elements scrolling down prompted:
        console.log(`${url} - Waiting for any activity to die down...`);
        // Using networkidle0 will usually hang as this event has already passed.
        // NOTE(review): the returned promise is not awaited, so this does not
        // actually pause here; awaiting it could block indefinitely on pages
        // that never go idle. Left unchanged - confirm the intended behaviour.
        waitForNetworkIdle(page,4000);
        await page.waitForTimeout(2*1000);

        // Now scroll down:
        console.log(`${url} - Scrolling down...`);
        await autoScroll(page);
        console.log(`${url} - Scrolling back to the top...`);
        await page.evaluate('window.scrollTo(0,0)');
      }

      // Set viewport to cover whole body. If there is no body (or no bounding
      // box) this throws and is handled by the catch below.
      const bodyHandle = await page.$('body');
      let { height } = await bodyHandle.boundingBox();
      height = Math.floor(height);
      // Don't set it very large (things get buggy >16k):
      if( height > 10000 ) {
        height = 10000;
      }
      if( height > viewportHeight) {
        console.log("Setting viewport: "+ viewportWidth + "x" + height);
        await page.setViewport({width: viewportWidth, height: height});
      }

      // Await for any more elements scrolling down prompted:
      console.log(`${url} - Waiting for any activity to die down...`);
      // NOTE(review): not awaited, see the note on the earlier call.
      waitForNetworkIdle(page,4000);
    } catch (e) {
      console.error('We got an error, but lets continue and render what we get.\n', e);
    }

    // Render the result:
    console.log(`${url} - Rendering web page as PNG...`);
    // Full page:
    const image = await page.screenshot({ fullPage: true });

    // A place to record URLs of different kinds:
    const urls = {};
    // Get the main frame URL:
    urls.url = await page.url();
    // Get hold of the navigation links (de-duplicated):
    urls.L = await page.$$eval('a', as => as.map(a => a.href));
    urls.L = [...new Set(urls.L)];
    // Get the location of clickable elements (anything visible with an onclick or href):
    urls.map = await page.evaluate(() => {
      const clickables = [];
      const elements = Array.prototype.slice.call(document.getElementsByTagName('*'));
      elements.forEach((element) => {
        if (element.offsetParent != null) {
          if (element.onclick != null || element.href !== undefined) {
            const c = {};
            const {
              x, y, width, height,
            } = element.getBoundingClientRect();
            c.location = {
              left: x, top: y, width, height,
            };
            if (element.attributes.href !== undefined) {
              // Get absolute URL:
              c.href = element.href;
            }
            if (element.onclick != null) {
              c.onclick = element.onclick.toString();
            }
            clickables.push(c);
          }
        }
      });
      return clickables;
    });

    // And the HTML:
    const html = await page.content();

    // Additional items to generate, if writing to WARC:
    let imageJpeg;
    let pdf;
    if ( ww.isEnabled() ) {
      // Also get a JPEG for the imagemap:
      console.log(`${url} - Rendering screenshot as JPEG...`);
      imageJpeg = await page.screenshot({ type: 'jpeg', quality: 100, fullPage: true });
      // Print to PDF but use the screen CSS:
      console.log(`${url} - Rendering PDF...`);
      await page.emulateMediaType('screen');
      // Uses streaming mode to reduce RAM usage:
      pdf = await page.createPDFStream({
        format: 'A4',
        scale: 0.75,
        printBackground: true,
        timeout: 20*1000, // Use a shortish timeout as this can be flaky.
      });
    }

    // After rendering main view, attempt to switch between devices to grab alternative media
    if (switchDevices) {
      try {
        // Switch to different user agent settings to attempt to ensure additional media downloaded:
        console.log(`${url} - Switching device settings...`);
        await page.emulate(devices['iPhone 6']);
        await page.emulate(devices['iPhone X landscape']);
        await page.emulate(devices['Nexus 6']);

        // Switch through a few widths to encourage JS-based responsive image loading:
        await page.setViewport({
          width: 480, height: 1024, deviceScaleFactor: 1, isMobile: false, hasTouch: false, isLandscape: false,
        });
        await page.setViewport({
          width: 640, height: 1024, deviceScaleFactor: 1, isMobile: false, hasTouch: false, isLandscape: false,
        });
        await page.setViewport({
          width: 800, height: 1024, deviceScaleFactor: 1, isMobile: false, hasTouch: false, isLandscape: false,
        });
        await page.setViewport({
          width: 1024, height: 1024, deviceScaleFactor: 1, isMobile: false, hasTouch: false, isLandscape: false,
        });

        // Switch back to the standard device view:
        await page.setViewport({
          width: viewportWidth, height: 1024, deviceScaleFactor: 1, isMobile: false, hasTouch: false, isLandscape: false,
        });

        // Await for any more elements the device switching prompted:
        console.log(`${url} - Waiting for any activity to die down...`);
        await page.waitForTimeout(2000);
      } catch (e) {
        console.error(`${url} - We got an error, but lets continue and render what we get.\n`, e);
      }
    }

    // Get all the transcluded resources that make up the page:
    // (this works like capturing page.on('response') events but excludes the URL of the page itself.)
    urls.E = await page.evaluate(() => (
      performance.getEntries()
        .filter(e => e.entryType === 'resource')
        .map(e => e.name)
    ));

    // Assemble the results:
    const harStandard = await har.stop();
    // Override creator info:
    harStandard['log']['creator'] = {
      'name': 'webrender-puppeteer',
      'version': packageVersion,
      'comment': 'https://github.com/ukwa/webrender-puppeteer'
    };

    // And write to WARC (even if there was no 'page' -- see below):
    if ( ww.isEnabled() ) {
      const finalUrl = urls.url;
      await ww.writeRenderedImageFromBuffer(warcPrefix, `har:${url}`, finalUrl, 'application/json', new TextEncoder().encode(JSON.stringify(harStandard)));
      // Check if there were any pages (there are none for e.g. PDFs):
      if( harStandard['log']['pages'].length > 0 ) {
        // Store HTML and PNG in WARCs:
        await ww.writeRenderedImageFromBuffer(warcPrefix, `onreadydom:${url}`, finalUrl, 'text/html', new TextEncoder().encode(html));
        await ww.writeRenderedImageFromBuffer(warcPrefix, `screenshot:${url}`, finalUrl, 'image/png', image);

        // Store the PDF: spool the stream to a temp file first so its length
        // is known, then hand a read stream plus that length to the poster.
        const fsStream = temp.createWriteStream();
        console.log("GOT temp path: " + fsStream.path);
        pdf.pipe(fsStream);
        await finished(fsStream);
        console.log("PIPED to temp path: " + fsStream.path);
        const pdfContentLength = fs.statSync(fsStream.path).size;
        const pdfReadStream = fs.createReadStream(fsStream.path);
        console.log("Opening Readable Stream " + fsStream.path);
        await ww.writeRenderedImage(warcPrefix, `pdf:${url}`, finalUrl, 'application/pdf', pdfReadStream, pdfContentLength);
        console.log("Deleting temp file: " + fsStream.path);
        await finished(pdfReadStream);
        fs.rmSync(fsStream.path);

        // Store the full page with image map:
        const title = harStandard['log']['pages'][0]['title'];
        const imageMapHtml = _toImageMap(url, title, imageJpeg, urls.map);
        await ww.writeRenderedImageFromBuffer(warcPrefix, `imagemap:${url}`, finalUrl, 'text/html', new TextEncoder().encode(imageMapHtml));
      }
    } else {
      console.log("WARC writing skipped.");
    }

    // Build extended/wrapper HAR:
    harExtended = {
      'software': softwareId,
      'har': harStandard,
      'urls': urls,
      'cookies': await page.cookies(),
    };

    // And store the final/rendered forms:
    const b64Content = Buffer.from(html).toString('base64');
    harExtended.finalPage = {
      content: b64Content,
      encoding: 'base64',
      contentType: 'text/html',
    };
    const b64Viewport = Buffer.from(image).toString('base64');
    harExtended.renderedViewport = {
      content: b64Viewport,
      encoding: 'base64',
      contentType: 'image/png',
    };
  } finally {
    // Clean out the event listeners:
    page.browser().removeListener('targetcreated', interceptor);
  }

  console.log(`${url} - Complete.`);
  return harExtended;
}
// HTML5: https://dev.w3.org/html5/spec-preview/image-maps.html
// <img src="shapes.png" usemap="#shapes"
// alt="Four shapes are available: a red hollow box, a green circle, a blue triangle, and a yellow four-pointed star.">
// <map name="shapes">
// <area shape=rect coords="50,50,100,100"> <!-- the hole in the red box -->
// <area shape=rect coords="25,25,125,125" href="red.html" alt="Red box.">
// <area shape=circle coords="200,75,50" href="green.html" alt="Green circle.">
// <area shape=poly coords="325,25,262,125,388,125" href="blue.html" alt="Blue triangle.">
// <area shape=poly coords="450,25,435,60,400,75,435,90,450,125,465,90,500,75,465,60"
// href="yellow.html" alt="Yellow star.">
// </map>
// <img alt="Embedded Image" src="data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAADIA..." />
/**
 * Build a static HTML page that shows a JPEG screenshot as an image map, with
 * one rectangular `<area>` per clickable box that has an `href`.
 *
 * The title, URL and link targets come from the rendered page, so they are
 * HTML-escaped before being placed in the markup.
 *
 * @param {string} url The URL that was rendered (shown in the page title).
 * @param {string} title Title of the rendered page (used for <title> and alt text).
 * @param {Buffer|Uint8Array} imageJpeg JPEG screenshot bytes, embedded as a data: URI.
 * @param {Array<Object>} map Boxes of the form `{ location: { left, top, width, height }, href? }`.
 * @returns {string} The complete HTML document.
 */
function _toImageMap(url, title, imageJpeg, map) {
  // Escape the characters that are significant in HTML text and double-quoted attributes.
  const escapeHtml = (value) => String(value)
    .replace(/&/g, '&amp;')
    .replace(/</g, '&lt;')
    .replace(/>/g, '&gt;')
    .replace(/"/g, '&quot;');

  const safeTitle = escapeHtml(title);
  const imageBase64 = Buffer.from(imageJpeg).toString('base64');
  const parts = [
    `<html><head><title>${safeTitle} [Static version of ${escapeHtml(url)}]</title>\n</head>\n<body style="margin: 0;">\n`,
    `<img src="data:image/jpeg;base64,${imageBase64}" usemap="#shapes" alt="${safeTitle}">\n`,
    '<map name="shapes">\n',
  ];

  for (const box of map) {
    if ('href' in box) {
      const x1 = box['location']['left'];
      const y1 = box['location']['top'];
      const x2 = x1 + box['location']['width'];
      const y2 = y1 + box['location']['height'];
      parts.push(`<area shape=rect coords="${x1},${y1},${x2},${y2}" href="${escapeHtml(box['href'])}">\n`);
    } else {
      console.log("_toImageMap: Skipping box with no 'href': %s", JSON.stringify(box));
    }
  }

  parts.push('</map>\n');
  parts.push("</body>\n</html>\n");
  return parts.join('');
}
/**
 * Launch a browser, render a single URL in a fresh incognito context, write the
 * extended HAR to `${OUTPUT_PREFIX}rendered.har` (prefix defaults to
 * '/output/'), and shut the browser down again.
 *
 * Environment variables read here: HTTP_PROXY, OUTPUT_PREFIX.
 *
 * @param {string} url The URL to render.
 * @param {Object} [extraHeaders={}] Headers to add to every request made while rendering.
 * @returns {Promise<Object>} The extended HAR produced by render_page.
 */
async function render(url, extraHeaders = {}) {
  // Set up the browser in the required configuration:
  const browserArgs = {
    ignoreHTTPSErrors: true,
    args: [
      '--disk-cache-size=0',
      '--no-sandbox',
      '--ignore-certificate-errors',
      '--disable-gpu',
      '--disable-dev-shm-usage',
      "--no-xshm", // needed for Chrome >80 (check if puppeteer adds automatically)
      "--disable-background-media-suspend",
      "--autoplay-policy=no-user-gesture-required",
      "--disable-features=IsolateOrigins,site-per-process",
      "--disable-popup-blocking",
      "--disable-backgrounding-occluded-windows"
    ],
  };

  // Add proxy configuration if supplied:
  if (process.env.HTTP_PROXY) {
    // Remove any trailing slash:
    const proxyUrl = process.env.HTTP_PROXY.replace(/\/$/,'');
    browserArgs.args.unshift(`--proxy-server=${proxyUrl}`);
  }
  console.log('Browser arguments: ', browserArgs);

  const browser = await puppeteer.launch(browserArgs);
  // The finally block makes sure the browser process is closed even when
  // rendering or writing the output fails.
  try {
    // Set up a clean 'incognito' context and page:
    const context = await browser.createIncognitoBrowserContext();
    const page = await context.newPage();

    // Run the page-level rendering process. This must be awaited: otherwise a
    // pending Promise would be serialised below and the browser closed while
    // rendering was still in progress.
    const harExtended = await render_page(page, url, extraHeaders, null);

    // Output prefix:
    let outPrefix = '/output/';
    if ('OUTPUT_PREFIX' in process.env) {
      outPrefix = process.env.OUTPUT_PREFIX;
    }
    console.log(`outPrefix = ${outPrefix}`);

    // Write out the extended HAR:
    await promisify(fs.writeFile)(`${outPrefix}rendered.har`, JSON.stringify(harExtended));

    // And return the result too:
    return harExtended;
  } finally {
    // Shut down:
    console.log('Shutting down...');
    await browser.close();
  }
}
/**
 * Scroll the page down in steps of half the window height, once per second,
 * until the accumulated distance reaches the body's scroll height or exceeds
 * 8000 pixels.
 * @param {*} page Puppeteer page; the scrolling code runs inside it via page.evaluate.
 */
async function autoScroll(page) {
  // This function is run in the page context, so it must be self-contained.
  const scrollInSteps = () => new Promise((done) => {
    const step = Math.floor(window.innerHeight / 2);
    let scrolled = 0;
    const ticker = setInterval(() => {
      // Re-read the height on every tick, before scrolling, as it may have changed:
      const pageHeight = document.body.scrollHeight;
      window.scrollBy(0, step);
      scrolled += step;
      const reachedBottom = scrolled >= pageHeight;
      const hitLimit = scrolled > 8000;
      if (reachedBottom || hitLimit) {
        clearInterval(ticker);
        done();
      }
    }, 1000);
  });
  await page.evaluate(scrollInSteps);
}
/**
 * Click every <button> whose text contains `buttonText` (case-insensitive), in
 * every frame of the page.
 *
 * Resolves once all frames have been processed; a failure in any frame rejects
 * the returned promise so the caller can handle it.
 *
 * @param {*} page Puppeteer page whose frames are scanned.
 * @param {string} buttonText Text to look for inside button labels.
 */
async function clickButton2(page, buttonText) {
  // Matcher logic. This runs inside each frame, so it must be self-contained.
  const clickMatchingButtons = (query) => {
    const needle = query.toLowerCase();
    const elements = [...document.querySelectorAll('button')];
    elements
      .filter((e) => e.innerText.toLowerCase().includes(needle))
      .forEach((targetElement) => {
        targetElement.click();
      });
  };

  // Find all buttons in all frames. Array.forEach ignores the promises returned
  // by an async callback, so collect them and wait for all of them instead.
  await Promise.all(
    page.frames().map((frame) => frame.evaluate(clickMatchingButtons, buttonText)),
  );
}
/**
 * Click the first <button> in the page's main document whose text contains
 * `buttonText`, compared case-insensitively. Does nothing if no button matches.
 *
 * This seems to reliably deal with the HuffPo UK cookie banner, where
 * clickButton2 (which should be the same) does not.
 * @param {*} page Puppeteer page to search.
 * @param {*} buttonText Text to look for inside button labels.
 */
async function clickButton(page, buttonText) {
  // Runs inside the page, so it must be self-contained. `query` arrives already lower-cased.
  // To do? Also check aria-label="Close" style buttons?
  const clickFirstMatch = (query) => {
    const firstMatch = Array.from(document.querySelectorAll('button'))
      .find((button) => button.innerText.toLowerCase().includes(query));
    // Only click if a matching button actually exists:
    if (firstMatch) {
      firstMatch.click();
    }
  };
  await page.evaluate(clickFirstMatch, buttonText.toLowerCase());

  // Scanning frames as well (via clickButton2) was created for the Guardian,
  // but only seems to half work: the page is visible, but the images are grey
  // boxes. Not clear why, so it is left disabled:
  //await clickButton2(page, buttonText);
}
/**
 * Try to dismiss common popups and consent dialogs: press Escape, click the
 * first `a.ppsPopupClose` link if present, then click buttons matching a list
 * of known labels. Any error along the way is logged and otherwise ignored.
 * @param {*} page Puppeteer page to operate on.
 */
async function clickKnownModals(page) {
  // Known common modal button labels, tried in this order:
  const knownLabels = [
    'Yes, I’m happy', // Guardian UK
    'I Accept',
    'I Understand',
    'Accept Recommended Settings',
    'OK',
    'I Agree',
    'AGREE & EXIT',
    'Allow all',
    'Close',
  ];

  try {
    // Press escape for transient popups:
    await page.keyboard.press('Escape');

    // Click close on a class of popup observed at https://www.britishdeafnews.co.uk/
    // Doesn't seem to work!
    await page.evaluate(() => {
      const [closeLink] = document.querySelectorAll('a.ppsPopupClose');
      // Only click if the element exists:
      if (closeLink) {
        closeLink.click();
      }
    });

    // Click the known modals last, one at a time, as some lead to navigation events:
    for (const label of knownLabels) {
      await clickButton(page, label);
    }
  } catch (e) {
    console.error('A page.evaluate failed, perhaps due to a navigation event.\n', e);
  }
}
// From https://stackoverflow.com/questions/54377650/how-can-i-wait-for-network-idle-after-click-on-an-element-in-puppeteer
// Hack to cope with the lack of this function in this version of Puppeteer (<10.4.0).
/**
 * Resolve once the page has had no more than `maxInflightRequests` requests in
 * flight for `timeout` milliseconds. Only requests that start after this call
 * are counted. The page listeners are removed when the promise resolves.
 * @param {*} page Event emitter providing 'request', 'requestfinished' and 'requestfailed' events.
 * @param {number} timeout Quiet period, in milliseconds, required before resolving.
 * @param {number} [maxInflightRequests=0] Number of in-flight requests still considered idle.
 * @returns {Promise<void>} Resolves (with no value) when the quiet period has elapsed.
 */
function waitForNetworkIdle(page, timeout, maxInflightRequests = 0) {
  let pending = 0;
  let idleTimer;
  let settle;
  const idle = new Promise((resolve) => {
    settle = resolve;
  });

  // Quiet period elapsed: detach from the page and resolve.
  const handleIdle = () => {
    page.removeListener('request', handleStart);
    page.removeListener('requestfinished', handleEnd);
    page.removeListener('requestfailed', handleEnd);
    settle();
  };

  // A request began: too many in flight cancels the running quiet-period timer.
  const handleStart = () => {
    pending += 1;
    if (pending > maxInflightRequests) {
      clearTimeout(idleTimer);
    }
  };

  // A request ended (success or failure): restart the timer once back at the threshold.
  const handleEnd = () => {
    if (pending === 0) {
      // Belongs to a request that started before we were listening.
      return;
    }
    pending -= 1;
    if (pending === maxInflightRequests) {
      idleTimer = setTimeout(handleIdle, timeout);
    }
  };

  page.on('request', handleStart);
  page.on('requestfinished', handleEnd);
  page.on('requestfailed', handleEnd);
  idleTimer = setTimeout(handleIdle, timeout);
  return idle;
}
module.exports = {
render_page,
render
}