forked from get-set-fetch/scraper
-
Notifications
You must be signed in to change notification settings - Fork 0
/
table-scraping.ts
62 lines (55 loc) · 1.71 KB
/
table-scraping.ts
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
/* eslint-disable object-curly-newline */
import { encode, KnexStorage, PuppeteerClient, Scraper, setLogger, ScrapeEvent } from '../src/index';
// Configure the library logger with level 'info'; this is the first statement the script runs.
setLogger({ level: 'info' });

// Path of the SQLite database file referenced by the knex connection settings.
const sqliteFilePath = './examples/data/languages.sqlite';

/*
 * Knex configuration handed to KnexStorage: sqlite3 client pointing at the file above.
 * `useNullAsDefault` is a Knex option; per the Knex documentation it makes
 * undefined values fall back to NULL on insert, which the sqlite3 dialect needs.
 */
const knexConfig = {
  client: 'sqlite3',
  useNullAsDefault: true,
  connection: { filename: sqliteFilePath },
};

// Storage instance shared by the scraper and by the export/close handler further down.
const storage = new KnexStorage(knexConfig);
/*
 * Command-line switches supplied to PuppeteerClient through its `args` option.
 * NOTE(review): presumably forwarded unchanged to the browser launch call — confirm
 * against the PuppeteerClient implementation.
 */
const browserArgs = [
  '--disable-gpu',
  '--disable-dev-shm-usage',
  '--disable-setuid-sandbox',
  '--no-first-run',
  '--no-sandbox',
  '--no-zygote',
  '--single-process',
];
const client = new PuppeteerClient({ args: browserArgs });

// The scraper is built from the storage declared earlier and the browser client.
const scraper = new Scraper(storage, client);
/*
 * Opaque project definition string; this is the value passed to scraper.scrape()
 * at the end of the script.
 * NOTE(review): presumably the `encode`d form of scrapeConfig below — confirm.
 */
const scrapeHash = 'ePnXQdMJrZNNDoMgEIWvQrqSNNWm3bnoCXoHMpYRiYBGsE1v30FDf+JCF12QwFu8N5n38TYz4NQICq/ahy2EruH4Q8Kn0OOSmW3gLmml7gzmFgNICMD2rKcziw/d6unGgixdaA63RhuZnTi7MChr8gyzRFHpR6QF7OKE/0g78y933yO0hA7LLAFHXfK4/pWu0E3ePUoNeTeoIr6K2JDoapEG9qJ6CjfaCocoO2rpjiIFxpgXe3Oswg==';

// Options for ExtractUrlsPlugin (maxDepth: 0 — presumably disables following links; confirm).
const extractUrlsOpts = {
  name: 'ExtractUrlsPlugin',
  maxDepth: 0,
};

// Selector/label pair targeting the first link inside the second table column.
const languageSelector = {
  contentSelector: 'table.metadata + p + table.wikitable td:nth-child(2) > a:first-child',
  label: 'language',
};

// Selector/label pair targeting the third table column.
// NOTE(review): 'milions' looks like a typo for 'millions'; left untouched because the
// label is a runtime string and may also be baked into scrapeHash — confirm before fixing.
const speakersSelector = {
  contentSelector: 'table.metadata + p + table.wikitable td:nth-child(3)',
  label: 'speakers (milions)',
};

// Options for ExtractHtmlContentPlugin: one selector pair per extracted column.
const extractHtmlContentOpts = {
  name: 'ExtractHtmlContentPlugin',
  selectorPairs: [languageSelector, speakersSelector],
};

// The single starting resource of the project.
const startResource = {
  url: 'https://en.wikipedia.org/wiki/List_of_languages_by_number_of_native_speakers',
};

/*
 * Readable project definition: name, pipeline, per-plugin options and start resources.
 * It is not referenced anywhere else in the visible script (scrapeHash is what gets scraped).
 */
const scrapeConfig = {
  name: 'languageList',
  pipeline: 'browser-static-content',
  pluginOpts: [extractUrlsOpts, extractHtmlContentOpts],
  resources: [startResource],
};
/*
 * When the scraper emits ProjectScraped, export the results as csv to
 * ./examples/data/languages.csv and then close the storage.
 */
scraper.on(ScrapeEvent.ProjectScraped, async () => {
  try {
    await scraper.export('./examples/data/languages.csv', { type: 'csv' });
  } finally {
    // Close the storage even when the export rejects, so the connection is not leaked.
    // The export error, if any, still propagates exactly as before.
    await storage.close();
  }
});

// Start scraping from the encoded project definition.
scraper.scrape(scrapeHash);