diff --git a/package.json b/package.json
index 9aa20bff2..105513105 100644
--- a/package.json
+++ b/package.json
@@ -23,6 +23,7 @@
     "qa": "pta ./src/qa/*.qa.js -r log | node src/qa/utils/qa-reporter.js",
     "test": "tape tests/**/*-test.js | tap-spec",
     "test:unit": "tape tests/unit/**/*-test.js | tap-spec",
+    "test:integration": "tape tests/integration/**/*-test.js | tap-spec",
     "test:watch": "tape-watch tests/**/*-test.js -p tap-spec",
     "test:tz": "tape scripts/test-timezones.js | tap-spec",
     "timeseries": "node src/shared/timeseries/index.js",
diff --git a/scripts/change-fetch-api/add-cacheKey.rb b/scripts/change-fetch-api/add-cacheKey.rb
new file mode 100644
index 000000000..826d71573
--- /dev/null
+++ b/scripts/change-fetch-api/add-cacheKey.rb
@@ -0,0 +1,181 @@
+# Hack all of the scraper files.
+#
+# USAGE:
+#
+# - cd into this directory,
+# - run `ruby add-cacheKey.rb WRITE [FILENAME]`
+#
+# WRITE selects what to do with the rewritten source:
+# - 'save' to overwrite source files
+# - 'print' to dump to console
+# - 'mute' to neither print nor save (the per-change log still
+#   shows the changes that would happen)
+#
+# FILENAME: name of the file to work with. If missing,
+# do all files.
+#
+# e.g.,
+# ruby add-cacheKey.rb print DEU/_shared.js
+
+if (ARGV.size < 1) then
+  puts "usage: ruby add-cacheKey.rb WRITE [FILENAME]"
+  puts "where WRITE = save/print/mute"
+  return
+end
+
+WRITE = ARGV[0].to_s.downcase
+FILENAME = (ARGV.size > 1) ? ARGV[1] : nil
+
+# Skip some files.
+# Not bothering to try to determine these programmatically.
+# IGNORE_FILES = %w(
+#   AUS/_shared/get-data-with-tested-negative-applied.js
+#   AUS/_shared/get-key.js
+# )
+
+METHODS = 'page|fetch|raw|json|jsonAndCookies|csv|tsv|pdf|headless'
+
+
+# Split a fetch call into (the call up through its first argument and
+# comma, the URL argument, the rest of the line), so the cache key can
+# be inserted right after the URL:
+FETCH_RE = /(await\s+.*?\.(?:#{METHODS})\s*\(.*?, )([^,)]*)(.*)/
+
+
+# Manual tests to verify
+# lin = "const $ = await fetch.page(this, healthUrl);
+# const $ = await fetch.page(this, healthUrl, some_other_stuff);
+# const $ = await fetch.csv(this, 'something.com', some_other_stuff);
+# "
+# lin.scan(FETCH_RE).each do |m|
+#   puts '----'
+#   puts m.inspect
+#   puts "#{m[0]}#{m[1]}, 'default'#{m[2]}"
+# end
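+
+# One more check, for a call that already passes extra arguments after
+# the URL: the inserted cache key should land between the URL and the
+# rest (hypothetical input):
+# lin2 = "const d = await fetch.json(this, someUrl, { disableSSL: true });"
+# lin2.scan(FETCH_RE).each do |m|
+#   puts "#{m[0]}#{m[1]}, 'default'#{m[2]}"
+# end
+# # => await fetch.json(this, someUrl, 'default', { disableSSL: true });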
#{m}" if m.size != 3 +# wholeline, before, after = m +# if (after !~ /this, /) then +# puts " ??? Missing 'this' in fetch call in \"#{wholeline}\" ???" +# end +# end +# end + +######################################## + +scraper_dir = File.join(__dir__, '..', '..', 'src', 'shared', 'scrapers') + +files = [] +Dir.chdir(scraper_dir) do + files = Dir.glob(File.join('**', '*.js')) +end +# puts "Pre remove count: #{files.count}" +# files -= IGNORE_FILES +# puts "Post remove count: #{files.count}" +puts "#{files.size} scraper files." + + +if (!FILENAME.nil?) then + if (!files.include?(FILENAME)) then + puts "#{FILENAME} is not in the list of scraper files:" + puts files.sort.map { |s| " #{s}" } + return + else + files = [FILENAME] + end +end + +files.sort! + +puts "VALIDATION ========================================" +files.each do |f| + validate(scraper_dir, f) +end +puts "END VALIDATION ====================================" + + +# During dev, just do one file. +# add_filename_to_scraper_this(scraper_dir, files[0]) +# files = [files[0]] +# files = ['DEU/_shared.js'] + +puts "MUTATION ========================================" +files.each do |f| + puts + puts '=' * 50 + puts f + puts '-' * 50 + fpath = File.join(scraper_dir, f) + src = File.read(fpath) + + src = add_cacheKey_to_fetch_calls(src) + + # post_migration_check(src) + puts + + case(WRITE) + when 'save' then + File.open(fpath, 'w') { |p| p.puts(src) } + when 'print' then + puts + puts "Result:" + puts "-" * 50 + puts src + puts "-" * 50 + puts + when 'mute' then + puts '' + end + +end +puts "END MUTATION ====================================" +puts diff --git a/scripts/change-fetch-api/migrate-scrapers-add-this-to-fetch.rb b/scripts/change-fetch-api/migrate-scrapers-add-this-to-fetch.rb new file mode 100644 index 000000000..be7776eb8 --- /dev/null +++ b/scripts/change-fetch-api/migrate-scrapers-add-this-to-fetch.rb @@ -0,0 +1,205 @@ +# Hack all of the scraper files. +# +# USAGE: +# +# - cd into this directory, +# - run `ruby this-script.rb WRITE [FILENAME]` +# +# WRITE is either true or false. +# - 'save' to overwrite source files +# - 'print' to dump to console +# - 'mute' to not print or save (useful to see the changes that would happen) +# +# FILENAME: name of the file to work with. If missing, +# do all files. +# +# eg., +# ruby print DEU/_shared.js + +if (ARGV.size < 1) then + puts "usage: ruby WRITE [FILENAME]" + puts "where WRITE = save/print/mute" + return +end + +WRITE = ARGV[0].to_s.downcase +FILENAME = (ARGV.size > 1) ? ARGV[1] : nil + +# Skip some files. +# Not bothering to try to determine these programmatically. +IGNORE_FILES = %w( +AUS/_shared/get-data-with-tested-negative-applied.js +AUS/_shared/get-key.js +) + + +LOCATION_RE = /(\s*)(city|county|state|country):/ +METHODS = 'page|fetch|raw|json|jsonAndCookies|csv|tsv|pdf|headless|getArcGISCSVURLFromOrgId|getArcGISCSVURL' + +# The fancy RE below splits a line like "await fetch.csv(this.url)" +# into ["await fetch.csv(this.url)", "await fetch.csv(", "this.url)"] +# It can screw up in some cases, so we add a hack. +FETCH_RE = /((await\s+.*?\.(?:#{METHODS})\s*\()(.*))/ + +# Print warnings only for each file f in scraper_dir. +def validate(scraper_dir, f) + fpath = File.join(scraper_dir, f) + src = File.read(fpath) + [ LOCATION_RE, FETCH_RE ].each do |re| + puts "WARN: No match for #{re} in #{f}" if (src !~ re) + end +end + + +# DISABLING THIS -- scrapers should already have _path. 
+# def add_filename_to_scraper_this(src) +# m = src.match(LOCATION_RE) +# # puts "add filename: #{m.inspect}" +# if (m.nil?) then +# puts " - skipping adding filepath (no match for RE)" +# return src +# end +# +# if (src =~ /filepath: __filename/) +# puts " - skipping adding _filepath, already added" +# return src +# end +# +# spaces = m[1].gsub("\n", '') +# loctype = m[2] +# puts " + adding filepath above #{loctype}" +# add_code = " +# #{spaces}_filepath: __filename, +# #{spaces}#{loctype}:" +# src = src.sub(LOCATION_RE, add_code) +# src +# end + + +def add_this_to_fetch_calls(src) + original_src = "CLONE: #{src}" + + matches = src.scan(FETCH_RE) + puts "add this: #{matches.inspect}" + matches.uniq.each do |m| + raise "bad re? #{m}" if m.size != 3 + wholeline, before, after = m + if (after =~ /this, /) then + puts " - 'this, ' already in \"#{wholeline}\", skipping" + else + newline = "#{before}this, #{after}" + puts " + \"#{wholeline}\" => \"#{newline}\"" + src = src.gsub(wholeline, newline) + end + end + + if (original_src !~ /this,\s*this,/ && src =~ /this,\s*this,/) then + src = src.gsub(/this,\s*this,/, 'this, ') + end + raise "still have 'this, this'" if (src =~ /this,\s*this,/) + + src +end + +# Specific hack +def postmigration_AU_QLD_stuff(src) + old = "async function getCurrentArticlePage(obj) { + const $ = await fetch.page(this, obj.url); + const anchors = $('#content h3:first-of-type > a'); + const currentArticleUrl = anchors[0].attribs.href; + return fetch.page(currentArticleUrl); +}" + new = "async function getCurrentArticlePage(obj) { + const $ = await fetch.page(obj, obj.url); + const anchors = $('#content h3:first-of-type > a'); + const currentArticleUrl = anchors[0].attribs.href; + return fetch.page(obj, currentArticleUrl); +}" + src = src.gsub(old, new) + src +end + + +def post_migration_check(src) + matches = src.scan(FETCH_RE) + # puts "add this: #{matches.inspect}" + matches.each do |m| + raise "bad re? #{m}" if m.size != 3 + wholeline, before, after = m + if (after !~ /this, /) then + puts " ??? Missing 'this' in fetch call in \"#{wholeline}\" ???" + end + end +end + +######################################## + +scraper_dir = File.join(__dir__, '..', '..', 'src', 'shared', 'scrapers') + +files = [] +Dir.chdir(scraper_dir) do + files = Dir.glob(File.join('**', '*.js')) +end +# puts "Pre remove count: #{files.count}" +files -= IGNORE_FILES +# puts "Post remove count: #{files.count}" +puts "#{files.size} scraper files." + + +if (!FILENAME.nil?) then + if (!files.include?(FILENAME)) then + puts "#{FILENAME} is not in the list of scraper files:" + puts files.sort.map { |s| " #{s}" } + return + else + files = [FILENAME] + end +end + +files.sort! + +puts "VALIDATION ========================================" +files.each do |f| + validate(scraper_dir, f) +end +puts "END VALIDATION ====================================" + + +# During dev, just do one file. 
+# add_filename_to_scraper_this(scraper_dir, files[0])
+# files = [files[0]]
+# files = ['DEU/_shared.js']
+
+puts "MUTATION ========================================"
+files.each do |f|
+  puts
+  puts '=' * 50
+  puts f
+  puts '-' * 50
+  fpath = File.join(scraper_dir, f)
+  src = File.read(fpath)
+  # src = add_filename_to_scraper_this(src)
+  src = add_this_to_fetch_calls(src)
+  raise "BAD this, this" if (src =~ /this, this,/)
+  src = postmigration_AU_QLD_stuff(src)
+
+  post_migration_check(src)
+  puts
+
+  case(WRITE)
+  when 'save' then
+    File.open(fpath, 'w') { |p| p.puts(src) }
+  when 'print' then
+    puts
+    puts "Result:"
+    puts "-" * 50
+    puts src
+    puts "-" * 50
+    puts
+  when 'mute' then
+    puts ''
+  end
+
+end
+puts "END MUTATION ===================================="
+puts
diff --git a/scripts/change-fetch-api/run-migration-for-all-dates.sh b/scripts/change-fetch-api/run-migration-for-all-dates.sh
new file mode 100755
index 000000000..3d649379b
--- /dev/null
+++ b/scripts/change-fetch-api/run-migration-for-all-dates.sh
@@ -0,0 +1,10 @@
+#!/bin/bash
+
+rm -f log.txt
+
+for d in $(ls -1 coronadatascraper-cache); do
+  echo "Running $d ..."
+  echo '------------------------------------------------------------' >> log.txt
+  echo "$d" >> log.txt
+  MIGRATE_CACHE_DIR=zztest yarn start --onlyUseCache -d "$d" >> log.txt 2>&1
+done
diff --git a/src/events/crawler/get-sources/load-sources.js b/src/events/crawler/get-sources/load-sources.js
index 23ec37e2c..41c2d1f1f 100644
--- a/src/events/crawler/get-sources/load-sources.js
+++ b/src/events/crawler/get-sources/load-sources.js
@@ -11,7 +11,11 @@ function includeLocation(location, opts) {
 
   if (opts.skip && relpath.startsWith(opts.skip)) return false;
 
-  if (opts.location && !relpath.startsWith(opts.location)) return false;
+  if (opts.location) {
+    const locs = opts.location.split(',').map(s => s.trim());
+    const matches = locs.filter(loc => relpath.startsWith(loc));
+    return matches.length > 0;
+  }
 
   return true;
 }
diff --git a/src/shared/cli/cli-args.js b/src/shared/cli/cli-args.js
index 2945ce4bb..6c6818804 100644
--- a/src/shared/cli/cli-args.js
+++ b/src/shared/cli/cli-args.js
@@ -13,7 +13,7 @@ const { argv } = yargs
   })
   .option('location', {
     alias: 'l',
-    description: 'Scrape only the location provided by src/shared/scraper path name',
+    description: 'Scrape only the location(s) matching src/shared/scraper path name (e.g., "--location US/PA,US/DE")',
     type: 'string'
   })
   .option('skip', {
diff --git a/src/shared/lib/fetch/cache-migration.js b/src/shared/lib/fetch/cache-migration.js
new file mode 100644
index 000000000..b91fd654b
--- /dev/null
+++ b/src/shared/lib/fetch/cache-migration.js
@@ -0,0 +1,108 @@
+import crypto from 'crypto';
+import fsBuiltIn from 'fs';
+import path from 'path';
+import zlib from 'zlib';
+
+import * as datetimeFormatting from '../datetime/iso/format.js';
+
+/** Cache migration helpers. *******************************
+ *
+ * A set of functions to copy files in the cache to the new v1.0 cache
+ * file format.
+ *
+ * These functions are only used if process.env.MIGRATE_CACHE_DIR is set.
+ *
+ * When there's a hit for an existing file in the cache, that file is
+ * written under the process.env.MIGRATE_CACHE_DIR directory in the
+ * project root, in the appropriate subdir with the appropriate filename.
+ *
+ * If a file with the same cache key already exists in the folder,
+ * that's an error: we need each file in the directory to have
+ * different cache keys.
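+ *
+ * As an illustration (hypothetical scraper; hash values made up), a
+ * cache hit on the old-format file
+ *
+ *   coronadatascraper-cache/2020-4-12/<url-hash>.html
+ *
+ * for a US/CA county scraper would be copied to something like
+ *
+ *   <MIGRATE_CACHE_DIR>/us-ca-xx-county/2020-04-12/2020-04-12t21_00_00.000z-default-344b7.html.gz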
+ * + * e.g., + * MIGRATE_CACHE_DIR=zz_test yarn start + */ + +export function logCacheCall(scraper, date, url, filePath, cacheExists, type) { + const cacheCheck = { + scraperPath: scraper._path, + date, + requestedUrl: url, + cacheFilePath: filePath, + cacheFileExists: cacheExists, + type + }; + + // Write data to aid in cache migration. + const newData = `${JSON.stringify(cacheCheck, null, 2)},\n`; + fsBuiltIn.appendFile(path.join(process.cwd(), 'log_cacheCalls.txt'), newData, err => { + if (err) throw err; + }); +} + +function newTopFolder(scraperPath) { + const ret = scraperPath + .replace(/^.*?src.shared.scrapers./, '') + .toLowerCase() + .replace(/[/\\]/g, '-') + .replace(/\.js$/, '') + .replace('-index', ''); + return ret; +} + +function hashContent(thing, len = 64) { + return crypto + .createHash('sha256') + .update(thing) + .digest('hex') + .substr(0, len); +} + +/** Every file in the directory should have a distinct cache key. + * + * This is b/c the current system (in this project) should really only + * have one "file type" (which the cache key represents) per date. + * If not, throw an error. + */ +function checkCacheKeyCollision(cacheKey, destdir) { + const d = destdir.replace(process.cwd(), ''); + console.log(` Checking collision of key '${cacheKey}' in ${d}`); + const files = fsBuiltIn.readdirSync(destdir); + // console.log(` All files: ${files}`); + if (files.length === 0) return; + const matches = files.filter(f => f.includes(cacheKey)); + if (matches.length === 0) { + console.log(' No collision.'); + return; + } + const msg = ` KEY COLLISION, already have key '${cacheKey}' in ${d} (${matches})`; + throw new Error(msg); +} + +// Migrate the file to a temp folder. +// New format: +// crawler-cache/us-ca-xx-county/2020-04-12/2020-04-12t00_47_14.145z-default-344b7.html +export function migrateFile(url, filePath, encoding, scraper, date, cacheKey, type) { + console.log(`MIGRATING ${filePath}`); + const topdir = newTopFolder(scraper._path); + const dt = datetimeFormatting.getYYYYMMDD(date); + const destdir = path.join(process.cwd(), process.env.MIGRATE_CACHE_DIR, topdir, dt); + fsBuiltIn.mkdirSync(destdir, { recursive: true }); + + checkCacheKeyCollision(cacheKey, destdir); + + const tm = `${dt}t21_00_00.000z`; // Default all migrated files to 9 pm. + const content = fsBuiltIn.readFileSync(filePath, encoding); + const sha = hashContent(content, 5); + const fname = `${tm}-${cacheKey}-${sha}.${type}.gz`; + const destfile = path.join(destdir, fname); + if (fsBuiltIn.existsSync(destfile)) { + const msg = `${topdir}/${dt}/${fname} ALREADY EXISTS (called for ${url})`; + throw new Error(msg); + } + + const compressed = zlib.gzipSync(content); + fsBuiltIn.writeFileSync(destfile, compressed); + console.log(` Migrated to: ${destfile.replace(process.cwd(), '')}`); +} diff --git a/src/shared/lib/fetch/caching.js b/src/shared/lib/fetch/caching.js index 753696904..462c95035 100644 --- a/src/shared/lib/fetch/caching.js +++ b/src/shared/lib/fetch/caching.js @@ -1,3 +1,5 @@ +/* eslint-disable no-unused-vars */ + /** * This file contains the caching implementation. We provide caching to reduce strain on official data sources * and to store changes to each source on a day to day basis. 
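The cacheKey threaded through the changes below exists so that a scraper
hitting several resources on the same day gets a distinct cache file per
resource. A minimal sketch of the intended call pattern, with hypothetical
URLs and keys that mirror the BE and CH scrapers later in this diff:

    // Two same-day resources, two distinct cache keys:
    const casesData = await fetch.csv(this, casesUrl, 'cases', false);
    const deathsData = await fetch.csv(this, deathsUrl, 'deaths', false);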
@@ -5,10 +7,13 @@ import path from 'path'; import crypto from 'crypto'; +import fsBuiltIn from 'fs'; import join from '../join.js'; import datetime from '../datetime/index.js'; +import * as datetimeFormatting from '../datetime/iso/format.js'; import * as fs from '../fs.js'; +import * as cacheMigration from './cache-migration.js'; import log from '../log.js'; const DEFAULT_CACHE_PATH = 'coronadatascraper-cache'; @@ -44,7 +49,7 @@ export const getCachedFileName = (url, type) => { * @param {string} type type of the cached resource * @param {*} date the date associated with this resource, or false if a timeseries data */ -export const getCachedFilePath = (url, type, date = false) => { +export const getCachedFilePath = (scraper, url, type, date = false) => { // FIXME when we roll out new TZ support! if (date) date = datetime.old.getYYYYMD(date); let cachePath = date === false ? TIMESERIES_CACHE_PATH : join(DEFAULT_CACHE_PATH, date); @@ -62,23 +67,41 @@ export const getCachedFilePath = (url, type, date = false) => { If we are able to fetch this URL (because it is a timeseries or we are requesting today's data), the function returns `CACHE_MISS`. + * @param {*} scraper the scraper requesting the file * @param {string} url URL of the cached resource * @param {string} type type of the cached resource * @param {*} date the date associated with this resource, or false if a timeseries data * @param {string} encoding for the resource to access, default to utf-8 */ -export const getCachedFile = async (url, type, date, encoding = 'utf8') => { - const filePath = getCachedFilePath(url, type, date); +export const getCachedFile = async (scraper, url, cacheKey, type, date, encoding = 'utf8') => { + if (scraper === undefined || scraper === null) throw new Error(`Undefined scraper, trying to hit ${url}`); + + const filePath = getCachedFilePath(scraper, url, type, date); - if (await fs.exists(filePath)) { + const cacheExists = await fs.exists(filePath); + if (cacheExists) { log(' ⚡️ Cache hit for %s from %s', url, filePath); + } + + if (process.env.LOG_CACHE_CALLS) { + cacheMigration.logCacheCall(scraper, date, url, filePath, cacheExists, type); + } + + if (process.env.MIGRATE_CACHE_DIR && cacheExists && date !== false) { + // Write file with new v1.0 filename to other location. + // NOTE: We're not migrating the timeseries cache! 
+ cacheMigration.migrateFile(url, filePath, encoding, scraper, date, cacheKey, type); + } + + if (cacheExists) { return fs.readFile(filePath, encoding); } if (date && datetime.dateIsBefore(date, datetime.old.getDate())) { log(' ⚠️ Cannot go back in time to get %s, no cache present', url, filePath); return RESOURCE_UNAVAILABLE; } - log(' 🐢 Cache miss for %s at %s', url, filePath); + const shortName = (scraper._path || 'unknown').replace(/^.*scrapers/, ''); + log(' 🐢 Cache miss for scraper: %s; url: %s; filepath: %s', shortName, url, filePath); return CACHE_MISS; }; @@ -90,7 +113,7 @@ export const getCachedFile = async (url, type, date, encoding = 'utf8') => { * @param {*} date the date associated with this resource, or false if a timeseries data * @param {*} data file data to be saved */ -export const saveFileToCache = async (url, type, date, data) => { - const filePath = getCachedFilePath(url, type, date); +export const saveFileToCache = async (scraper, url, type, date, data) => { + const filePath = getCachedFilePath(scraper, url, type, date); return fs.writeFile(filePath, data, { silent: true }); }; diff --git a/src/shared/lib/fetch/get.js b/src/shared/lib/fetch/get.js index 86cb1ae4b..360615f97 100644 --- a/src/shared/lib/fetch/get.js +++ b/src/shared/lib/fetch/get.js @@ -24,6 +24,7 @@ needle.defaults({ /** * Fetch whatever is at the provided URL. Use cached version if available. + * @param {*} scraper the scraper object * @param {string} url URL of the resource * @param {*} type type of the resource * @param {*} date the date associated with this resource, or false if a timeseries data @@ -38,7 +39,14 @@ needle.defaults({ * Returns: { body: body, cookies: cookies }. If the request failed, * both body and cookies are null. */ -export const get = async (url, type, date = datetime.old.scrapeDate() || datetime.old.getYYYYMD(), options = {}) => { +export const get = async ( + scraper, + url, + cacheKey, + type, + date = datetime.old.scrapeDate() || datetime.old.getYYYYMD(), + options = {} +) => { const { alwaysRun, disableSSL, toString, encoding, cookies, headers, method, args } = { alwaysRun: false, disableSSL: false, @@ -51,7 +59,9 @@ export const get = async (url, type, date = datetime.old.scrapeDate() || datetim ...options }; - const cachedBody = await caching.getCachedFile(url, type, date, encoding); + if (scraper === null || typeof scraper !== 'object') throw new Error(`null or invalid scraper, getting ${url}`); + + const cachedBody = await caching.getCachedFile(scraper, url, cacheKey, type, date, encoding); if (process.env.ONLY_USE_CACHE) return { body: cachedBody, cookies: null }; if (cachedBody === caching.CACHE_MISS || alwaysRun) { @@ -106,7 +116,7 @@ export const get = async (url, type, date = datetime.old.scrapeDate() || datetim // any sort of success code -- return good data if (response.statusCode < 400) { const fetchedBody = toString ? 
response.body.toString() : response.body; - await caching.saveFileToCache(url, type, date, fetchedBody); + await caching.saveFileToCache(scraper, url, type, date, fetchedBody); return { body: fetchedBody, cookies: response.cookies }; } diff --git a/src/shared/lib/fetch/index.js b/src/shared/lib/fetch/index.js index 7977645ad..4d0d5bfd2 100644 --- a/src/shared/lib/fetch/index.js +++ b/src/shared/lib/fetch/index.js @@ -21,15 +21,15 @@ const READ_TIMEOUT = 60000; /** * Load the webpage at the given URL and return a Cheerio object + * @param {*} scraper the scraper object * @param {string} url URL of the resource * @param {*} date the date associated with this resource, or false if a timeseries data * @param {object=} options customizable options: * - alwaysRun: fetches from URL even if resource is in cache, defaults to false * - disableSSL: disables SSL verification for this resource, should be avoided */ -export const page = async (url, date, options = {}) => { - const resp = await get(url, 'html', date, options); - +export const page = async (scraper, url, cacheKey = 'default', date, options = {}) => { + const resp = await get(scraper, url, cacheKey, 'html', date, options); if (!resp.body) { return null; } @@ -38,16 +38,16 @@ export const page = async (url, date, options = {}) => { /** * Load and parse JSON from the given URL + * @param {*} scraper the scraper object * @param {string} url URL of the resource * @param {*} date the date associated with this resource, or false if a timeseries data * @param {object} options customizable options: * - alwaysRun: fetches from URL even if resource is in cache, defaults to false * - disableSSL: disables SSL verification for this resource, should be avoided */ -export const json = async (url, date, options = {}) => { +export const json = async (scraper, url, cacheKey = 'default', date, options = {}) => { log(url); - const resp = await get(url, 'json', date, options); - + const resp = await get(scraper, url, cacheKey, 'json', date, options); if (!resp.body) { return null; } @@ -62,10 +62,9 @@ export const json = async (url, date, options = {}) => { * - alwaysRun: fetches from URL even if resource is in cache, defaults to false * - disableSSL: disables SSL verification for this resource, should be avoided */ -export const jsonAndCookies = async (url, date, options = {}) => { +export const jsonAndCookies = async (scraper, url, cacheKey = 'default', date, options = {}) => { log(url); - const resp = await get(url, 'json', date, options); - + const resp = await get(scraper, url, cacheKey, 'json', date, options); if (!resp.body) { return null; } @@ -77,6 +76,7 @@ export const jsonAndCookies = async (url, date, options = {}) => { /** * Load and parse CSV from the given URL + * @param {*} scraper the scraper object * @param {string} url URL of the resource * @param {*} date the date associated with this resource, or false if a timeseries data * @param {object} options customizable options: @@ -84,9 +84,9 @@ export const jsonAndCookies = async (url, date, options = {}) => { * - disableSSL: disables SSL verification for this resource, should be avoided * - delimiter: the delimiter to use (default is ,) */ -export const csv = async (url, date, options = {}) => { +export const csv = async (scraper, url, cacheKey = 'default', date, options = {}) => { return new Promise(async (resolve, reject) => { - const resp = await get(url, 'csv', date, options); + const resp = await get(scraper, url, cacheKey, 'csv', date, options); if (!resp.body) { resolve(null); @@ -114,41 
+114,43 @@ export const csv = async (url, date, options = {}) => { /** * Load and parse TSV from the given URL + * @param {*} scraper the scraper object * @param {string} url URL of the resource * @param {*} date the date associated with this resource, or false if a timeseries data * @param {object} options customizable options: * - alwaysRun: fetches from URL even if resource is in cache, defaults to false * - disableSSL: disables SSL verification for this resource, should be avoided */ -export const tsv = async (url, date, options = {}) => { +export const tsv = async (scraper, url, cacheKey = 'default', date, options = {}) => { options.delimiter = '\t'; - return csv(url, date, options); + return csv(scraper, url, cacheKey, date, options); }; /** * Load the given URL and return a raw response + * @param {*} scraper the scraper object * @param {string} url URL of the resource * @param {*} date the date associated with this resource, or false if a timeseries data * @param {object} options customizable options: * - alwaysRun: fetches from URL even if resource is in cache, defaults to false * - disableSSL: disables SSL verification for this resource, should be avoided */ -export const raw = async (url, date, options = {}) => { - const resp = await get(url, 'raw', date, options); +export const raw = async (scraper, url, cacheKey = 'default', date, options = {}) => { + const resp = await get(scraper, url, cacheKey, 'raw', date, options); return resp.body; }; /** * Load and parse PDF from the given URL - * + * @param {*} scraper the scraper object * @param {string} url URL of the resource * @param {*} date the date associated with this resource, or false if a timeseries data * @param {object} options customizable options: * - alwaysRun: fetches from URL even if resource is in cache, defaults to false * - disableSSL: disables SSL verification for this resource, should be avoided */ -export const pdf = async (url, date, options) => { - const resp = await get(url, 'pdf', date, { ...options, toString: false, encoding: null }); +export const pdf = async (scraper, url, cacheKey = 'default', date, options) => { + const resp = await get(scraper, url, cacheKey, 'pdf', date, { ...options, toString: false, encoding: null }); if (!resp.body) { return null; @@ -222,14 +224,22 @@ const fetchHeadless = async url => { /** * Fetch whatever is at the provided URL in headless mode with Pupeteer. Use cached version if available. 
+ * @param {*} scraper the scraper object * @param {string} url URL of the resource * @param {*} date the date associated with this resource, or false if a timeseries data * @param {*} alwaysRun fetches from URL even if resource is in cache, defaults to false */ -export const headless = async (url, date = datetime.old.scrapeDate() || datetime.old.getYYYYMD(), options = {}) => { +export const headless = async ( + scraper, + url, + cacheKey = 'default', + date = datetime.old.scrapeDate() || datetime.old.getYYYYMD(), + options = {} +) => { const { alwaysRun } = { alwaysRun: false, disableSSL: false, ...options }; - const cachedBody = await caching.getCachedFile(url, 'html', date); + const cachedBody = await caching.getCachedFile(scraper, url, cacheKey, 'html', date); + if (process.env.ONLY_USE_CACHE) { const $ = await cheerio.load(cachedBody); return $; @@ -237,7 +247,7 @@ export const headless = async (url, date = datetime.old.scrapeDate() || datetime if (cachedBody === caching.CACHE_MISS || alwaysRun) { const fetchedBody = await fetchHeadless(url); - await caching.saveFileToCache(url, 'html', date, fetchedBody); + await caching.saveFileToCache(scraper, url, 'html', date, fetchedBody); const $ = await cheerio.load(fetchedBody); return $; @@ -256,9 +266,11 @@ export const headless = async (url, date = datetime.old.scrapeDate() || datetime * orgId is 4RQmZZ0yaZkGR1zy * layerName is COVID19_testsites_READ_ONLY */ -export const getArcGISCSVURLFromOrgId = async function(serverNumber, orgId, layerName) { +export const getArcGISCSVURLFromOrgId = async function(scraper, serverNumber, orgId, layerName) { const layerMetadata = await json( - `https://services${serverNumber}.arcgis.com/${orgId}/arcgis/rest/services/${layerName}/FeatureServer/0?f=json` + scraper, + `https://services${serverNumber}.arcgis.com/${orgId}/arcgis/rest/services/${layerName}/FeatureServer/0?f=json`, + 'ArcOrgID' ); const { serviceItemId } = layerMetadata; return `https://opendata.arcgis.com/datasets/${serviceItemId}_0.csv`; @@ -266,12 +278,17 @@ export const getArcGISCSVURLFromOrgId = async function(serverNumber, orgId, laye /** * Get the URL for the CSV data from an ArcGIS dashboard + * @param {*} scraper the scraper object * @param {*} serverNumber the servern number, find this by looking at requests (i.e. https://services1.arcgis.com/ is serverNumber = 1) * @param {*} dashboardId the ID of the dashboard, as passed to the iframe that renders it (i.e. 
https://maps.arcgis.com/apps/opsdashboard/index.html#/ec4bffd48f7e495182226eee7962b422 is dashboardId = ec4bffd48f7e495182226eee7962b422) * @param {*} layerName the name of the layer to fetch data for, find this by examining requests */ -export const getArcGISCSVURL = async function(serverNumber, dashboardId, layerName) { - const dashboardManifest = await json(`https://maps.arcgis.com/sharing/rest/content/items/${dashboardId}?f=json`); +export const getArcGISCSVURL = async function(scraper, serverNumber, dashboardId, layerName) { + const dashboardManifest = await json( + scraper, + `https://maps.arcgis.com/sharing/rest/content/items/${dashboardId}?f=json`, + 'ArcGIS' + ); const { orgId } = dashboardManifest; - return getArcGISCSVURLFromOrgId(serverNumber, orgId, layerName); + return getArcGISCSVURLFromOrgId(scraper, serverNumber, orgId, layerName); }; diff --git a/src/shared/scrapers/AT/index.js b/src/shared/scrapers/AT/index.js index 8a3a8c79b..e080af340 100644 --- a/src/shared/scrapers/AT/index.js +++ b/src/shared/scrapers/AT/index.js @@ -27,7 +27,7 @@ const scraper = { } ]; - const casesRaw = await fetch.csv(this.url, false); + const casesRaw = await fetch.csv(this, this.url, 'default', false); const casesData = casesRaw.filter(item => datetime.scrapeDateIs(item.datetime)); if (casesData.length > 0) { @@ -84,11 +84,11 @@ const scraper = { const data = []; const recoveredDeathsUrl = 'https://info.gesundheitsministerium.at/data/GenesenTodesFaelleBL.js'; - const recoveredDeathsRaw = await fetch.fetch(recoveredDeathsUrl, 'txt'); + const recoveredDeathsRaw = await fetch.fetch(this, recoveredDeathsUrl, 'recovereddeaths', 'txt'); const recoveredDeathsData = JSON.parse(recoveredDeathsRaw.body.match(/\[.*\]/g)); const casesUrl = 'https://info.gesundheitsministerium.at/data/Bezirke.js'; - const casesRaw = await fetch.fetch(casesUrl, 'txt'); + const casesRaw = await fetch.fetch(this, casesUrl, 'cases', 'txt'); const casesRegionData = JSON.parse(casesRaw.body.match(/\[.*\]/g)); const casesByRegion = {}; diff --git a/src/shared/scrapers/AU/ACT/index.js b/src/shared/scrapers/AU/ACT/index.js index 093dedc79..3dba0c9a9 100644 --- a/src/shared/scrapers/AU/ACT/index.js +++ b/src/shared/scrapers/AU/ACT/index.js @@ -29,7 +29,8 @@ const scraper = { url: 'https://www.covid19.act.gov.au', scraper: { '0': async function() { - const $ = await fetch.page('https://www.health.act.gov.au/about-our-health-system/novel-coronavirus-covid-19'); + const healthUrl = 'https://www.health.act.gov.au/about-our-health-system/novel-coronavirus-covid-19'; + const $ = await fetch.page(this, healthUrl, 'default'); const $table = $('.statuscontent'); const $trs = $table.find('div'); const data = { @@ -47,7 +48,7 @@ const scraper = { return getDataWithTestedNegativeApplied(data); }, '2020-03-29': async function() { - const $ = await fetch.page('https://www.covid19.act.gov.au/updates/confirmed-case-information'); + const $ = await fetch.page(this, 'https://www.covid19.act.gov.au/updates/confirmed-case-information', 'default'); const $table = $('h2:contains("Cases") + table'); const $trs = $table.find('tr'); @@ -62,7 +63,7 @@ const scraper = { return getDataWithTestedNegativeApplied(data); }, '2020-04-09': async function() { - const $ = await fetch.page(this.url); + const $ = await fetch.page(this, this.url, 'default'); const $tables = $('.spf-article-card--tabular table'); const data = {}; diff --git a/src/shared/scrapers/AU/NSW/index.js b/src/shared/scrapers/AU/NSW/index.js index 3f535322e..02812a95f 100644 --- 
a/src/shared/scrapers/AU/NSW/index.js +++ b/src/shared/scrapers/AU/NSW/index.js @@ -37,10 +37,10 @@ const scraper = { url: 'https://www.health.nsw.gov.au/_layouts/feed.aspx?xsl=1&web=/news&page=4ac47e14-04a9-4016-b501-65a23280e841&wp=baabf81e-a904-44f1-8d59-5f6d56519965&pageurl=/news/Pages/rss-nsw-health.aspx', async scraper() { - const $ = await fetch.page(this.url); + const $ = await fetch.page(this, this.url, 'tmpindex'); const $anchors = $('channel > item:contains("statistics") > link'); const currentArticleUrl = $anchors[0].next.data; - const $currentArticlePage = await fetch.page(currentArticleUrl); + const $currentArticlePage = await fetch.page(this, currentArticleUrl, 'default'); const $table = $currentArticlePage('.maincontent table:first-of-type'); const $trs = $table.find('tbody > tr:not(:first-child):not(:last-child)'); diff --git a/src/shared/scrapers/AU/NT/index.js b/src/shared/scrapers/AU/NT/index.js index 3511421d3..8fe6401cb 100644 --- a/src/shared/scrapers/AU/NT/index.js +++ b/src/shared/scrapers/AU/NT/index.js @@ -17,7 +17,7 @@ const scraper = { type: 'table', url: 'https://coronavirus.nt.gov.au/', async scraper() { - const $ = await fetch.page(this.url); + const $ = await fetch.page(this, this.url, 'default'); const $rowWithCases = $('.header-widget p:first-of-type'); assert($rowWithCases.text().includes('confirmed cases')); const data = { diff --git a/src/shared/scrapers/AU/QLD/index.js b/src/shared/scrapers/AU/QLD/index.js index 3d60b38df..f5bdce459 100644 --- a/src/shared/scrapers/AU/QLD/index.js +++ b/src/shared/scrapers/AU/QLD/index.js @@ -14,10 +14,10 @@ const labelFragmentsByKey = [ ]; async function getCurrentArticlePage(obj) { - const $ = await fetch.page(obj.url); + const $ = await fetch.page(obj, obj.url, 'tempindex'); const anchors = $('#content h3:first-of-type > a'); const currentArticleUrl = anchors[0].attribs.href; - return fetch.page(currentArticleUrl); + return fetch.page(obj, currentArticleUrl, 'default'); } const scraper = { diff --git a/src/shared/scrapers/AU/SA/index.js b/src/shared/scrapers/AU/SA/index.js index c41253acc..30f442838 100644 --- a/src/shared/scrapers/AU/SA/index.js +++ b/src/shared/scrapers/AU/SA/index.js @@ -28,7 +28,7 @@ const scraper = { 'https://www.sahealth.sa.gov.au/wps/wcm/connect/public+content/sa+health+internet/health+topics/health+topics+a+-+z/covid+2019/latest+updates/confirmed+and+suspected+cases+of+covid-19+in+south+australia', scraper: { '0': async function() { - const $ = await fetch.page(this.url); + const $ = await fetch.page(this, this.url, 'default'); const paragraph = $('.middle-column p:first-of-type').text(); const { casesString } = paragraph.match(/been (?\d+) confirmed cases/).groups; this.type = 'paragraph'; @@ -38,7 +38,7 @@ const scraper = { }, '2020-03-27': async function() { this.type = 'table'; - const $ = await fetch.page(this.url); + const $ = await fetch.page(this, this.url, 'default'); const $table = $('table:first-of-type'); const $trs = $table.find('tbody > tr'); const data = {}; diff --git a/src/shared/scrapers/AU/VIC/index.js b/src/shared/scrapers/AU/VIC/index.js index 5badeb515..ceb968bcf 100644 --- a/src/shared/scrapers/AU/VIC/index.js +++ b/src/shared/scrapers/AU/VIC/index.js @@ -24,10 +24,10 @@ const scraper = { type: 'paragraph', url: 'https://www.dhhs.vic.gov.au/media-hub-coronavirus-disease-covid-19', async scraper() { - const $ = await fetch.page(this.url); + const $ = await fetch.page(this, this.url, 'tmpindex'); const $anchor = $('.content ul li a:contains("Department of Health and 
Human Services media release - ")'); const currentArticleUrl = $anchor.attr('href'); - const $currentArticlePage = await fetch.page(`https://www.dhhs.vic.gov.au${currentArticleUrl}`); + const $currentArticlePage = await fetch.page(this, `https://www.dhhs.vic.gov.au${currentArticleUrl}`, 'default'); const paragraph = $currentArticlePage('.page-content p:first-of-type').text(); const matches = paragraph.match(/cases in Victoria \w* (?[\d,]+)/) || {}; const { casesString } = matches.groups || {}; diff --git a/src/shared/scrapers/AU/WA/index.js b/src/shared/scrapers/AU/WA/index.js index c84d357c9..ba2172d3d 100644 --- a/src/shared/scrapers/AU/WA/index.js +++ b/src/shared/scrapers/AU/WA/index.js @@ -28,7 +28,7 @@ const scraper = { type: 'table', url: 'https://ww2.health.wa.gov.au/Articles/A_E/Coronavirus/COVID19-statistics', async scraper() { - const $ = await fetch.page(this.url); + const $ = await fetch.page(this, this.url, 'default'); const $table = $('table:first-of-type'); const $trs = $table.find('tbody > tr:not(:first-child)'); const data = {}; diff --git a/src/shared/scrapers/AU/aus-from-wa-health/index.js b/src/shared/scrapers/AU/aus-from-wa-health/index.js index 853b19d1c..76773a62a 100644 --- a/src/shared/scrapers/AU/aus-from-wa-health/index.js +++ b/src/shared/scrapers/AU/aus-from-wa-health/index.js @@ -37,7 +37,7 @@ const scraper = { url: 'https://ww2.health.wa.gov.au/Articles/A_E/Coronavirus/COVID19-statistics', async scraper() { const states = []; - const $ = await fetch.page(this.url); + const $ = await fetch.page(this, this.url, 'default'); const $table = $('h2:contains("in Australia") + table'); const $ths = $table.find('th'); diff --git a/src/shared/scrapers/AU/index.js b/src/shared/scrapers/AU/index.js index 33a5dffa7..4e4b6d878 100644 --- a/src/shared/scrapers/AU/index.js +++ b/src/shared/scrapers/AU/index.js @@ -33,7 +33,7 @@ const scraper = { 'https://www.health.gov.au/news/health-alerts/novel-coronavirus-2019-ncov-health-alert/coronavirus-covid-19-current-situation-and-case-numbers', async scraper() { const states = []; - const $ = await fetch.page(this.url); + const $ = await fetch.page(this, this.url, 'default'); const $table = $('.health-table__responsive > table'); const $trs = $table.find('tbody > tr:not(:first-child):not(:last-child)'); $trs.each((index, tr) => { diff --git a/src/shared/scrapers/BE/index.js b/src/shared/scrapers/BE/index.js index 70121951c..47d7219a2 100644 --- a/src/shared/scrapers/BE/index.js +++ b/src/shared/scrapers/BE/index.js @@ -27,10 +27,14 @@ const scraper = { async scraper() { const date = datetime.getYYYYMMDD(process.env.SCRAPE_DATE); - const casesData = await fetch.csv('https://epistat.sciensano.be/Data/COVID19BE_CASES_AGESEX.csv', false); - const deathsData = await fetch.csv('https://epistat.sciensano.be/Data/COVID19BE_MORT.csv', false); - const hospitalizedData = await fetch.csv('https://epistat.sciensano.be/Data/COVID19BE_HOSP.csv', false); - const testsData = await fetch.csv('https://epistat.sciensano.be/Data/COVID19BE_tests.csv', false); + const casesUrl = 'https://epistat.sciensano.be/Data/COVID19BE_CASES_AGESEX.csv'; + const casesData = await fetch.csv(this, casesUrl, 'cases', false); + const deathsUrl = 'https://epistat.sciensano.be/Data/COVID19BE_MORT.csv'; + const deathsData = await fetch.csv(this, deathsUrl, 'deaths', false); + const hospUrl = 'https://epistat.sciensano.be/Data/COVID19BE_HOSP.csv'; + const hospitalizedData = await fetch.csv(this, hospUrl, 'hospitalized', false); + const testsUrl = 
'https://epistat.sciensano.be/Data/COVID19BE_tests.csv'; + const testsData = await fetch.csv(this, testsUrl, 'tests', false); const dataByRegion = {}; const dataByProvince = {}; diff --git a/src/shared/scrapers/BR/index.js b/src/shared/scrapers/BR/index.js index 7eed380bd..4f67bad55 100644 --- a/src/shared/scrapers/BR/index.js +++ b/src/shared/scrapers/BR/index.js @@ -60,7 +60,7 @@ const scraper = { async scraper() { const response = []; const ufs = this._ufs; - const $ = await fetch.headless(this.url); + const $ = await fetch.headless(this, this.url, 'default'); $.root() .find('.list-itens .teste') diff --git a/src/shared/scrapers/CA/NS/index.js b/src/shared/scrapers/CA/NS/index.js index d7bdb20e5..89e032786 100644 --- a/src/shared/scrapers/CA/NS/index.js +++ b/src/shared/scrapers/CA/NS/index.js @@ -20,7 +20,7 @@ const scraper = { ], scraper: { '0': async function() { - const data = await fetch.csv(this.url); + const data = await fetch.csv(this, this.url, 'default'); const headers = Object.keys(data[0]); if (headers[0] !== 'Date' || headers[1] !== 'Positive' || headers[2] !== 'Negative') { @@ -65,7 +65,7 @@ const scraper = { }, '2020-04-12': async function() { this.url = 'https://novascotia.ca/coronavirus/data/COVID-19-data.csv'; - await fetch.csv(this.url); + await fetch.csv(this, this.url, 'default'); throw new Error('Someone needs to scrape this new data properly'); } } diff --git a/src/shared/scrapers/CA/index.js b/src/shared/scrapers/CA/index.js index 5a59b6b46..6d9daf9b4 100644 --- a/src/shared/scrapers/CA/index.js +++ b/src/shared/scrapers/CA/index.js @@ -43,7 +43,7 @@ const scraper = { /* '0': async function() { this.url = 'https://www.canada.ca/en/public-health/services/diseases/2019-novel-coronavirus-infection.html'; - const $ = await fetch.page(this.url); + const $ = await fetch.page(this, this.url, 'default'); const $table = $('h2:contains("Current situation")') .nextAll('table') .first(); @@ -64,7 +64,7 @@ const scraper = { }, */ '0': async function() { - const data = await fetch.csv(this.url, false); + const data = await fetch.csv(this, this.url, 'default', false); // FIXME when we roll out new TZ support! const fallback = process.env.USE_ISO_DATETIME ? 
new Date(datetime.now.at('America/Toronto')) : datetime.getDate(); diff --git a/src/shared/scrapers/CH/index.js b/src/shared/scrapers/CH/index.js index 60b7c2828..b5eaeff61 100644 --- a/src/shared/scrapers/CH/index.js +++ b/src/shared/scrapers/CH/index.js @@ -54,10 +54,10 @@ const scraper = { const releasedURL = 'https://raw.githubusercontent.com/daenuprobst/covid19-cases-switzerland/master/covid19_released_switzerland_openzh.csv'; - const casesData = await fetch.csv(casesURL, false); - const deathsData = await fetch.csv(deathsURL, false); - const hospitalizedData = await fetch.csv(hospitalizedURL, false); - const releasedData = await fetch.csv(releasedURL, false); + const casesData = await fetch.csv(this, casesURL, 'cases', false); + const deathsData = await fetch.csv(this, deathsURL, 'deaths', false); + const hospitalizedData = await fetch.csv(this, hospitalizedURL, 'hospitalized', false); + const releasedData = await fetch.csv(this, releasedURL, 'released', false); const dataByCanton = {}; diff --git a/src/shared/scrapers/CY/index.js b/src/shared/scrapers/CY/index.js index 115bc04e8..3f5751def 100644 --- a/src/shared/scrapers/CY/index.js +++ b/src/shared/scrapers/CY/index.js @@ -22,10 +22,10 @@ const scraper = { async scraper() { const date = datetime.getYYYYMMDD(process.env.SCRAPE_DATE); - const datasetRaw = await fetch.json(this.url, false); + const datasetRaw = await fetch.json(this, this.url, 'tmpindex', false); const dataset = datasetRaw.result[0].resources.find(item => item.format === 'csv'); - const casesRaw = await fetch.csv(dataset.url, false); + const casesRaw = await fetch.csv(this, dataset.url, 'default', false); const casesData = casesRaw.filter(item => datetime.scrapeDateIs(reformatDate(item.date))); if (casesData.length > 0) { diff --git a/src/shared/scrapers/CZ/index.js b/src/shared/scrapers/CZ/index.js index b2076e2bf..88443c1b5 100644 --- a/src/shared/scrapers/CZ/index.js +++ b/src/shared/scrapers/CZ/index.js @@ -23,8 +23,8 @@ const scraper = { const casesURL = 'https://onemocneni-aktualne.mzcr.cz/api/v1/covid-19/osoby.csv'; const testedURL = 'https://onemocneni-aktualne.mzcr.cz/api/v1/covid-19/testy.csv'; - const casesData = await fetch.csv(casesURL, false); - const testedData = await fetch.csv(testedURL, false); + const casesData = await fetch.csv(this, casesURL, 'cases', false); + const testedData = await fetch.csv(this, testedURL, 'tested', false); const casesByRegion = {}; diff --git a/src/shared/scrapers/DE/_shared.js b/src/shared/scrapers/DE/_shared.js index 274f8e6fd..a3208b9a3 100644 --- a/src/shared/scrapers/DE/_shared.js +++ b/src/shared/scrapers/DE/_shared.js @@ -33,7 +33,7 @@ export const sharedSchema = { }; async function defaultScraperDEU() { - const data = await fetch.csv(this.url, false); + const data = await fetch.csv(this, this.url, 'default', false); // Rely on dataset to be sorted by time, in direction past -> future. 
const [lastRow] = data.slice(-1); diff --git a/src/shared/scrapers/EE/index.js b/src/shared/scrapers/EE/index.js index f9992a67b..eba9456e9 100644 --- a/src/shared/scrapers/EE/index.js +++ b/src/shared/scrapers/EE/index.js @@ -21,7 +21,7 @@ const scraper = { async scraper() { const date = datetime.getYYYYMMDD(process.env.SCRAPE_DATE); - const testedData = await fetch.csv(this.url, false); + const testedData = await fetch.csv(this, this.url, 'default', false); const testedByCounty = {}; const casesByCounty = {}; diff --git a/src/shared/scrapers/ES/index.js b/src/shared/scrapers/ES/index.js index 432275b1b..91f4f5a6d 100644 --- a/src/shared/scrapers/ES/index.js +++ b/src/shared/scrapers/ES/index.js @@ -83,7 +83,7 @@ const scraper = { async scraper() { const rawData = {}; for (const { name, url } of this._endpoints) { - rawData[name] = await fetch.csv(url, false); + rawData[name] = await fetch.csv(this, url, 'default', false); } // `rawData` looks like this: diff --git a/src/shared/scrapers/FR/index.js b/src/shared/scrapers/FR/index.js index 66262dc9f..ace5ec35e 100644 --- a/src/shared/scrapers/FR/index.js +++ b/src/shared/scrapers/FR/index.js @@ -33,13 +33,13 @@ const scraper = { const date = datetime.getYYYYMMDD(process.env.SCRAPE_DATE); // The latest datasets are posted in this CSV, which is updated daily - const datasets = await fetch.csv(this.url, false, { delimiter: ';' }); + const datasets = await fetch.csv(this, this.url, 'index', false, { delimiter: ';' }); // We grab the latest relevant dataset URLs const hopitalizedDataset = datasets.find(entry => entry.title.match(/donnees-hospitalieres-covid19-.*.csv/)); const testedDataset = datasets.find(entry => entry.title.match(/donnees-tests-covid19-labo-quotidien-.*.csv/)); - let hopitalizedData = await fetch.csv(hopitalizedDataset.url, false, { delimiter: ';' }); + let hopitalizedData = await fetch.csv(this, hopitalizedDataset.url, 'hospitalized', false, { delimiter: ';' }); // Hospitalized data is broken down by gender, we are only interested in all genders hopitalizedData = hopitalizedData.filter(item => item.sexe === '0'); @@ -47,7 +47,7 @@ const scraper = { // Sort by date to ensure accurate cummulative count hopitalizedData = hopitalizedData.sort((a, b) => a.jour - b.jour); - let testedData = await fetch.csv(testedDataset.url, false, { delimiter: ';' }); + let testedData = await fetch.csv(this, testedDataset.url, 'tested', false, { delimiter: ';' }); // Testing data is broken down by age group, we are only interested in all age groups testedData = testedData.filter(item => item.clage_covid === '0'); diff --git a/src/shared/scrapers/GB/SCT/index.js b/src/shared/scrapers/GB/SCT/index.js index 6ee880fa5..ce9539475 100644 --- a/src/shared/scrapers/GB/SCT/index.js +++ b/src/shared/scrapers/GB/SCT/index.js @@ -39,7 +39,7 @@ const scraper = { } const counties = []; - const $ = await fetch.page(this.url); + const $ = await fetch.page(this, this.url, 'default'); const $table = $('tbody').first(); $table.children('tr').each((i, item) => { const columns = $(item).children('td'); @@ -64,7 +64,7 @@ const scraper = { }, '2020-03-29': async function() { const counties = []; - const $ = await fetch.page(this.url); + const $ = await fetch.page(this, this.url, 'default'); const $table = $('td:contains("Positive cases")').closest('table'); $table.find('tr:not(:first-child)').each((i, tr) => { const $tr = $(tr); diff --git a/src/shared/scrapers/GB/index.js b/src/shared/scrapers/GB/index.js index 8a64d7b64..c4904b29c 100644 --- 
a/src/shared/scrapers/GB/index.js +++ b/src/shared/scrapers/GB/index.js @@ -28,7 +28,7 @@ const scraper = { // The UK coronavirus website provides an XML description file that outlines the available // timeseries files. We get this file to get the latest source we can use. - const $ = await fetch.page(this.url, false); + const $ = await fetch.page(this, this.url, 'tmpindex', false); const $blobs = $('Blob'); @@ -53,7 +53,7 @@ const scraper = { }); // Grab the json timeseries at the URL we found earlier - const casesData = await fetch.json(`https://c19pub.azureedge.net/${url}`, false); + const casesData = await fetch.json(this, `https://c19pub.azureedge.net/${url}`, 'cases', false); // Countries contains data for the four GB countries (Scotland, England, etc.) // and utlas contains data for the counties of England diff --git a/src/shared/scrapers/ID/index.js b/src/shared/scrapers/ID/index.js index 41bb00d4a..d0dde36f7 100644 --- a/src/shared/scrapers/ID/index.js +++ b/src/shared/scrapers/ID/index.js @@ -21,7 +21,7 @@ const scraper = { url: 'https://www.kemkes.go.id/', async scraper() { const data = {}; - const $ = await fetch.page(this.url); + const $ = await fetch.page(this, this.url, 'default'); const $table = $('.covid-case-container table'); assert.equal($table.length, 1, 'The table can not be found, the page may not have loaded correctly'); diff --git a/src/shared/scrapers/IE/index.js b/src/shared/scrapers/IE/index.js index eefc16053..4d4104ff4 100644 --- a/src/shared/scrapers/IE/index.js +++ b/src/shared/scrapers/IE/index.js @@ -22,7 +22,7 @@ const scraper = { async scraper() { const date = datetime.getYYYYMMDD(process.env.SCRAPE_DATE); - const casesData = await fetch.csv(this.url, false); + const casesData = await fetch.csv(this, this.url, 'default', false); const casesByRegion = {}; diff --git a/src/shared/scrapers/IN/index.js b/src/shared/scrapers/IN/index.js index b84db2eae..c21bf61c0 100644 --- a/src/shared/scrapers/IN/index.js +++ b/src/shared/scrapers/IN/index.js @@ -72,7 +72,7 @@ const scraper = { type: 'table', aggregate: 'state', async scraper() { - const $ = await fetch.page(this.url); + const $ = await fetch.page(this, this.url, 'default'); const $table = $('#state-data'); assert.equal($table.length, 1, 'The table can not be found'); diff --git a/src/shared/scrapers/IT/index.js b/src/shared/scrapers/IT/index.js index ea3535a05..8a64e50f8 100644 --- a/src/shared/scrapers/IT/index.js +++ b/src/shared/scrapers/IT/index.js @@ -39,7 +39,7 @@ const scraper = { aggregate: 'state', priority: 1, async scraper() { - const data = await fetch.csv(this.url, false); + const data = await fetch.csv(this, this.url, 'default', false); // FIXME when we roll out new TZ support! const fallback = process.env.USE_ISO_DATETIME ? datetime.now.at('Europe/Rome') : datetime.getDate(); const scrapeDate = process.env.SCRAPE_DATE ? 
new Date(process.env.SCRAPE_DATE) : fallback; diff --git a/src/shared/scrapers/JP/index.js b/src/shared/scrapers/JP/index.js index 9f3021194..dc4c4377d 100644 --- a/src/shared/scrapers/JP/index.js +++ b/src/shared/scrapers/JP/index.js @@ -48,7 +48,7 @@ const scraper = { url: 'https://services8.arcgis.com/JdxivnCyd1rvJTrY/arcgis/rest/services/v2_covid19_list_csv/FeatureServer/0/query?where=0%3D0&objectIds=&time=&geometry=&geometryType=esriGeometryEnvelope&inSR=&spatialRel=esriSpatialRelIntersects&resultType=none&distance=0.0&units=esriSRUnit_Meter&returnGeodetic=false&outFields=*&returnGeometry=false&featureEncoding=esriDefault&multipatchOption=xyFootprint&maxAllowableOffset=&geometryPrecision=&outSR=&datumTransformation=&applyVCSProjection=false&returnIdsOnly=false&returnUniqueIdsOnly=false&returnCountOnly=false&returnExtentOnly=false&returnQueryGeometry=false&returnDistinctValues=false&cacheHint=false&orderByFields=&groupByFieldsForStatistics=&outStatistics=&having=&resultOffset=&resultRecordCount=&returnZ=false&returnM=false&returnExceededLimitFeatures=true&quantizationParameters=&sqlFormat=none&f=pjson', async scraper() { - const data = await fetch.json(this.url); + const data = await fetch.json(this, this.url, 'default'); assert(data, 'No data fetched'); assert(data.features.length > 1, 'features are unreasonable'); const attributes = data.features.map(({ attributes }) => attributes); diff --git a/src/shared/scrapers/KR/index.js b/src/shared/scrapers/KR/index.js index 142b03e7f..1f9c33207 100644 --- a/src/shared/scrapers/KR/index.js +++ b/src/shared/scrapers/KR/index.js @@ -59,7 +59,7 @@ const scraper = { maintainers.camjc ], async scraper() { - const $ = await fetch.page(this.url); + const $ = await fetch.page(this, this.url, 'default'); const $table = $('table.num'); let states = []; diff --git a/src/shared/scrapers/LT/index.js b/src/shared/scrapers/LT/index.js index b9fe7928b..27b73c799 100644 --- a/src/shared/scrapers/LT/index.js +++ b/src/shared/scrapers/LT/index.js @@ -18,7 +18,7 @@ const scraper = { ], maintainers: [maintainers.qgolsteyn], async scraper() { - const casesRaw = await fetch.json(this.url); + const casesRaw = await fetch.json(this, this.url, 'default'); const casesData = casesRaw.features.map(({ attributes }) => attributes); const casesByRegion = {}; diff --git a/src/shared/scrapers/LV/index.js b/src/shared/scrapers/LV/index.js index 5705c7f6f..1c0b96386 100644 --- a/src/shared/scrapers/LV/index.js +++ b/src/shared/scrapers/LV/index.js @@ -17,7 +17,7 @@ const scraper = { } ], async scraper() { - const casesRaw = await fetch.json(this.url); + const casesRaw = await fetch.json(this, this.url, 'default'); const casesData = casesRaw.features.map(({ attributes }) => attributes); const data = []; diff --git a/src/shared/scrapers/NL/index.js b/src/shared/scrapers/NL/index.js index d72ddc39f..0a4f045c8 100644 --- a/src/shared/scrapers/NL/index.js +++ b/src/shared/scrapers/NL/index.js @@ -25,12 +25,12 @@ const scraper = { const casesUrl = 'https://raw.githubusercontent.com/J535D165/CoronaWatchNL/master/data/rivm_NL_covid19_province.csv'; - const casesRaw = await fetch.csv(casesUrl, false); + const casesRaw = await fetch.csv(this, casesUrl, 'cases', false); const casesData = casesRaw.filter(item => datetime.scrapeDateIs(item.Datum)); const nationalUrl = 'https://raw.githubusercontent.com/J535D165/CoronaWatchNL/master/data/rivm_NL_covid19_national.csv'; - const nationalData = await fetch.csv(nationalUrl, false); + const nationalData = await fetch.csv(this, nationalUrl, 
'national', false); const hospitalized = nationalData.find( item => datetime.scrapeDateIs(item.Datum) && item.Type === 'Ziekenhuisopname' diff --git a/src/shared/scrapers/NZ/index.js b/src/shared/scrapers/NZ/index.js index 040933318..ef8fb9a45 100644 --- a/src/shared/scrapers/NZ/index.js +++ b/src/shared/scrapers/NZ/index.js @@ -29,7 +29,7 @@ const scraper = { 'https://www.health.govt.nz/our-work/diseases-and-conditions/covid-19-novel-coronavirus/covid-19-current-situation/covid-19-current-cases', async scraper() { const data = {}; - const $ = await fetch.page(this.url); + const $ = await fetch.page(this, this.url, 'default'); const $table = $('h2:contains("Summary") + table'); const $trs = $table.find('tbody tr'); $trs.each((index, tr) => { diff --git a/src/shared/scrapers/PA/index.js b/src/shared/scrapers/PA/index.js index 402b3dd72..33f0a9783 100644 --- a/src/shared/scrapers/PA/index.js +++ b/src/shared/scrapers/PA/index.js @@ -185,7 +185,7 @@ const scraper = { async scraper() { // We probably don't need to cache this every day; fetch could be commented out. - /* const corregimientos = */ await fetch.csv(this._corregimientosListUrl); + /* const corregimientos = */ await fetch.csv(this, this._corregimientosListUrl, 'corregimientos'); // Note: The first field names have funny characters in the name, and lint rules will prevent // me from including them in the comments (and they may be invisible). Use e.g. // log(corregimientos[0]); @@ -225,7 +225,7 @@ const scraper = { // } // Cache this. - /* const tests = */ await fetch.csv(this._testsUrl); + /* const tests = */ await fetch.csv(this, this._testsUrl, 'tests'); // Simple test counts (country level): // [ // { @@ -241,7 +241,7 @@ const scraper = { // TODO: How do we deal with multiple source for the same country? // i.e If I wanted to make a Panama nation-level timeseries scraper, how would I do it // and still keep this one which has greater granularity? - /* const timeseries = */ await fetch.csv(this._timeSeriesUrl); + /* const timeseries = */ await fetch.csv(this, this._timeSeriesUrl, 'timeseries'); // Array of: // { // 'Fecha': '2020-03-10T00:00:00.000Z', - date @@ -257,7 +257,7 @@ const scraper = { // } // This is the one we actually get the data from. 
- const caseList = await fetch.csv(this._caseListUrl); + const caseList = await fetch.csv(this, this._caseListUrl, 'cases'); // Array of: // { // 'numero_CASO': '201', - case number diff --git a/src/shared/scrapers/PL/index.js b/src/shared/scrapers/PL/index.js index e7966d9bd..3c1ff10ae 100644 --- a/src/shared/scrapers/PL/index.js +++ b/src/shared/scrapers/PL/index.js @@ -25,7 +25,7 @@ const scraper = { const date = datetime.getYYYYMMDD(process.env.SCRAPE_DATE); - const casesData = await fetch.csv(this.url, false); + const casesData = await fetch.csv(this, this.url, 'default', false); const casesByRegion = {}; const deathsByRegion = {}; @@ -59,7 +59,7 @@ const scraper = { } ]; - const $ = await fetch.page(this.url); + const $ = await fetch.page(this, this.url, 'default'); // The website stores all data as a string in an element with id #registerData const $pre = $('#registerData'); diff --git a/src/shared/scrapers/PR/index.js b/src/shared/scrapers/PR/index.js index 72d484c90..6c2aaa17f 100644 --- a/src/shared/scrapers/PR/index.js +++ b/src/shared/scrapers/PR/index.js @@ -32,7 +32,7 @@ const scraper = { maintainers.camjc ], async scraper() { - const $ = await fetch.page(this.url); + const $ = await fetch.page(this, this.url, 'default'); const $table = $('th:contains("CONFIRMADOS")').closest('table'); const $trs = $table.find('tbody > tr'); const dataPairs = pivotTheTable($trs, $); diff --git a/src/shared/scrapers/RU/index.js b/src/shared/scrapers/RU/index.js index eb6ccbde5..691477b21 100644 --- a/src/shared/scrapers/RU/index.js +++ b/src/shared/scrapers/RU/index.js @@ -31,11 +31,13 @@ const scraper = { throw new DeprecatedError('RUS scraper did not exist for this date'); }, '2020-03-26': async function() { - const csrfRequestResponse = await fetch.jsonAndCookies(this.url); + const csrfRequestResponse = await fetch.jsonAndCookies(this, this.url, 'tmpcsrf'); const csrfCookies = csrfRequestResponse.cookies; const { csrfToken } = csrfRequestResponse.body; - const { data } = await fetch.json(`${this.url}${csrfToken}`, undefined, { cookies: csrfCookies }); + const finalUrl = `${this.url}${csrfToken}`; + const opts = { cookies: csrfCookies }; + const { data } = await fetch.json(this, finalUrl, 'default', undefined, opts); const ruEntries = data.items.filter(({ ru }) => ru); diff --git a/src/shared/scrapers/SA/index.js b/src/shared/scrapers/SA/index.js index 8461c6b99..b0a3d7052 100644 --- a/src/shared/scrapers/SA/index.js +++ b/src/shared/scrapers/SA/index.js @@ -17,7 +17,7 @@ const scraper = { async scraper() { const date = datetime.getYYYYMMDD(process.env.SCRAPE_DATE); - const raw = await fetch.csv(this.url, false); + const raw = await fetch.csv(this, this.url, 'default', false); const dataset = raw .filter(item => item.region !== 'Total') .map(item => ({ ...item, region: mapping[item.region] })); diff --git a/src/shared/scrapers/SE/index.js b/src/shared/scrapers/SE/index.js index bba8d6699..13a9fc10f 100644 --- a/src/shared/scrapers/SE/index.js +++ b/src/shared/scrapers/SE/index.js @@ -16,7 +16,7 @@ const scraper = { async scraper() { const date = datetime.getYYYYMMDD(process.env.SCRAPE_DATE); - const casesRaw = await fetch.json(this.url, false); + const casesRaw = await fetch.json(this, this.url, 'default', false); const casesData = casesRaw.features.map(({ attributes }) => attributes); const casesByRegion = {}; diff --git a/src/shared/scrapers/SI/index.js b/src/shared/scrapers/SI/index.js index 941d40c5b..8c9f7424a 100644 --- a/src/shared/scrapers/SI/index.js +++ 
b/src/shared/scrapers/SI/index.js @@ -20,9 +20,9 @@ const scraper = { async scraper() { const date = datetime.getYYYYMMDD(process.env.SCRAPE_DATE); - const casesData = await fetch.csv(this.url, false); + const casesData = await fetch.csv(this, this.url, 'cases', false); const regionUrl = 'https://raw.githubusercontent.com/slo-covid-19/data/master/csv/regions.csv'; - const regionData = await fetch.csv(regionUrl, false); + const regionData = await fetch.csv(this, regionUrl, 'region', false); let nationalData = {}; diff --git a/src/shared/scrapers/TH/index.js b/src/shared/scrapers/TH/index.js index aaff42b43..007b43702 100644 --- a/src/shared/scrapers/TH/index.js +++ b/src/shared/scrapers/TH/index.js @@ -17,7 +17,7 @@ const scraper = { url: 'https://ddcportal.ddc.moph.go.th/arcgis/rest/services/iT_Neillgis/thai_cities/FeatureServer/0/query?where=1%3D1&objectIds=&time=&geometry=&geometryType=esriGeometryEnvelope&inSR=&spatialRel=esriSpatialRelIntersects&distance=&units=esriSRUnit_Foot&relationParam=&outFields=*&returnGeometry=false&maxAllowableOffset=&geometryPrecision=&outSR=&having=&gdbVersion=&historicMoment=&returnDistinctValues=false&returnIdsOnly=false&returnCountOnly=false&returnExtentOnly=false&orderByFields=&groupByFieldsForStatistics=&outStatistics=&returnZ=false&returnM=false&multipatchOption=xyFootprint&resultOffset=&resultRecordCount=&returnTrueCurves=false&returnExceededLimitFeatures=false&quantizationParameters=&returnCentroid=false&sqlFormat=none&resultType=&featureEncoding=esriDefault&f=pjson', async scraper() { - const data = await fetch.json(this.url); + const data = await fetch.json(this, this.url, 'default'); assert(data, 'No data fetched'); assert.equal(data.features.length, 1, 'more features added, we may be scraping the wrong thing'); const { attributes } = data.features[0]; diff --git a/src/shared/scrapers/UA/index.js b/src/shared/scrapers/UA/index.js index cdbfe4e88..ddd36f8d1 100644 --- a/src/shared/scrapers/UA/index.js +++ b/src/shared/scrapers/UA/index.js @@ -54,7 +54,7 @@ const scraper = { date = datetime.getYYYYMMDD(date); this.url = this._baseURL + date; - const data = await fetch.json(this.url, false, { disableSSL: true }); + const data = await fetch.json(this, this.url, 'default', false, { disableSSL: true }); if (data === null) { throw new Error(`UA: failed to fetch data from ${this.url}.`); diff --git a/src/shared/scrapers/US/AK/index.js b/src/shared/scrapers/US/AK/index.js index 0fb1c451c..79d24819d 100644 --- a/src/shared/scrapers/US/AK/index.js +++ b/src/shared/scrapers/US/AK/index.js @@ -52,7 +52,7 @@ const scraper = { async scraper() { const counties = []; - const $ = await fetch.page(this.url); + const $ = await fetch.page(this, this.url, 'default'); const $table = $('td:contains("Seward")').closest('table'); const $trs = $table.find('tbody > tr'); $trs.each((index, tr) => { diff --git a/src/shared/scrapers/US/AL/index.js b/src/shared/scrapers/US/AL/index.js index e72781ed5..00e850cca 100644 --- a/src/shared/scrapers/US/AL/index.js +++ b/src/shared/scrapers/US/AL/index.js @@ -90,7 +90,7 @@ const scraper = { scraper: { '0': async function() { let counties = []; - const $ = await fetch.page(this.url); + const $ = await fetch.page(this, this.url, 'default'); const $table = $('td:contains("(COVID-19) in Alabama")').closest('table'); const $trs = $table.find('tbody > tr:not(:last-child)'); $trs.each((index, tr) => { @@ -115,9 +115,9 @@ const scraper = { }, '2020-03-26': async function() { let counties = []; - this.url = await 
fetch.getArcGISCSVURLFromOrgId(7, '4RQmZZ0yaZkGR1zy', 'COV19_Public_Dashboard_ReadOnly'); + this.url = await fetch.getArcGISCSVURLFromOrgId(this, 7, '4RQmZZ0yaZkGR1zy', 'COV19_Public_Dashboard_ReadOnly'); this.type = 'csv'; - const data = await fetch.csv(this.url); + const data = await fetch.csv(this, this.url, 'default'); for (const row of data) { const county = geography.addCounty(row.CNTYNAME); const cases = parse.number(row.CONFIRMED); diff --git a/src/shared/scrapers/US/AR/index.js b/src/shared/scrapers/US/AR/index.js index 4d3de70d8..abca277fc 100644 --- a/src/shared/scrapers/US/AR/index.js +++ b/src/shared/scrapers/US/AR/index.js @@ -98,7 +98,7 @@ const scraper = { 'Yell County' ], async scraper() { - const data = await fetch.json(this.url); + const data = await fetch.json(this, this.url, 'default'); let counties = []; for (const countyData of data.features) { diff --git a/src/shared/scrapers/US/AZ/index.js b/src/shared/scrapers/US/AZ/index.js index 6eae08e30..6e020b996 100644 --- a/src/shared/scrapers/US/AZ/index.js +++ b/src/shared/scrapers/US/AZ/index.js @@ -21,7 +21,7 @@ const scraper = { scraper: { '0': async function() { // Get the Tableau chart - const $ = await fetch.headless(this.url); + const $ = await fetch.headless(this, this.url, 'tmpindex'); // Pull out our session id from the json stuffed inside the textarea const textArea = $('textarea#tsConfigContainer').text(); @@ -32,7 +32,7 @@ const scraper = { const url = `https://tableau.azdhs.gov/vizql/w/COVID-19Dashboard/v/COVID-19table/vud/sessions/${sessionId}/views/8275719771277684273_9753144220671897612?csv=true&summary=true`; // Parse the tab separated values file that comes back - const data = await fetch.tsv(url); + const data = await fetch.tsv(this, url, 'default'); const counties = []; for (const row of data) { @@ -50,7 +50,7 @@ const scraper = { }, '2020-03-30': async function() { this.url = 'https://opendata.arcgis.com/datasets/5b34cf1637434c7bb6793580c40d1685_0.csv'; - const data = await fetch.csv(this.url); + const data = await fetch.csv(this, this.url, 'default'); const counties = []; for (const county of data) { diff --git a/src/shared/scrapers/US/CA/alameda-county.js b/src/shared/scrapers/US/CA/alameda-county.js index 4fad06d88..1acf012fd 100644 --- a/src/shared/scrapers/US/CA/alameda-county.js +++ b/src/shared/scrapers/US/CA/alameda-county.js @@ -22,13 +22,13 @@ const scraper = { maintainers: [maintainers.jbencina], scraper: { '0': async function() { - const $ = await fetch.headless(this.url); + const $ = await fetch.headless(this, this.url, 'default'); const $el = $('p:contains("Positive Cases")'); const matches = $el.html().match(/Positive Cases:.*?(\d+).*/); return { cases: parse.number(matches[1]) }; }, '2020-04-15': async function() { - await fetch.headless(this.url); + await fetch.headless(this, this.url, 'default'); throw new DeprecatedError('Sunsetting county level scrapers'); } } diff --git a/src/shared/scrapers/US/CA/butte-county.js b/src/shared/scrapers/US/CA/butte-county.js index d1c07022c..d32e701dd 100644 --- a/src/shared/scrapers/US/CA/butte-county.js +++ b/src/shared/scrapers/US/CA/butte-county.js @@ -19,7 +19,7 @@ const scraper = { url: 'https://www.buttecounty.net/publichealth', maintainers: [maintainers.jbencina], async scraper() { - const $ = await fetch.page(this.url); + const $ = await fetch.page(this, this.url, 'default'); const cases = parse.number( $('td:contains("Positive COVID-19 Tests")') .next() diff --git a/src/shared/scrapers/US/CA/calaveras-county.js 
b/src/shared/scrapers/US/CA/calaveras-county.js index b8a410bde..f3dbdc1d3 100644 --- a/src/shared/scrapers/US/CA/calaveras-county.js +++ b/src/shared/scrapers/US/CA/calaveras-county.js @@ -14,7 +14,7 @@ const scraper = { maintainers: [maintainers.jbencina], scraper: { '0': async function() { - const $ = await fetch.page(this.url); + const $ = await fetch.page(this, this.url, 'default'); const cases = parse.number( $('h2:contains("in Calaveras County:")') .first() @@ -24,7 +24,7 @@ const scraper = { return { cases }; }, '2020-04-15': async function() { - await fetch.page(this.url); + await fetch.page(this, this.url, 'default'); throw new DeprecatedError('Sunsetting county level scrapers'); } } diff --git a/src/shared/scrapers/US/CA/colusa-county.js b/src/shared/scrapers/US/CA/colusa-county.js index f4b995dc6..e939269c3 100644 --- a/src/shared/scrapers/US/CA/colusa-county.js +++ b/src/shared/scrapers/US/CA/colusa-county.js @@ -12,7 +12,7 @@ const scraper = { url: 'http://www.countyofcolusa.org/99/Public-Health', maintainers: [maintainers.jbencina], async scraper() { - const $ = await fetch.page(this.url); + const $ = await fetch.page(this, this.url, 'default'); const cases = parse.number( $('strong:contains("Confirmed Cases:")') .first() diff --git a/src/shared/scrapers/US/CA/contra-costa-county.js b/src/shared/scrapers/US/CA/contra-costa-county.js index b0f2fada2..c76628aab 100644 --- a/src/shared/scrapers/US/CA/contra-costa-county.js +++ b/src/shared/scrapers/US/CA/contra-costa-county.js @@ -12,7 +12,7 @@ const scraper = { url: 'https://www.coronavirus.cchealth.org/', maintainers: [maintainers.jbencina], async scraper() { - const $ = await fetch.headless(this.url); + const $ = await fetch.headless(this, this.url, 'default'); const cases = parse.number( $('h1:contains("TOTAL")') .parent() diff --git a/src/shared/scrapers/US/CA/del-norte-county.js b/src/shared/scrapers/US/CA/del-norte-county.js index 0d4d53881..bc769bce6 100644 --- a/src/shared/scrapers/US/CA/del-norte-county.js +++ b/src/shared/scrapers/US/CA/del-norte-county.js @@ -14,7 +14,7 @@ const scraper = { maintainers: [maintainers.jbencina], scraper: { '0': async function() { - const $ = await fetch.page(this.url); + const $ = await fetch.page(this, this.url, 'default'); const cases = parse.number( $('font:contains("Number of Confirmed Cases")') .first() @@ -46,7 +46,7 @@ const scraper = { }; }, '2020-03-18': async function() { - const $ = await fetch.page(this.url); + const $ = await fetch.page(this, this.url, 'default'); const cases = parse.number( $('font:contains("Number of Positive")') .first() @@ -65,7 +65,7 @@ const scraper = { }; }, '2020-04-15': async function() { - await fetch.page(this.url); + await fetch.page(this, this.url, 'default'); throw new DeprecatedError('Sunsetting county level scraper'); } } diff --git a/src/shared/scrapers/US/CA/fresno-county.js b/src/shared/scrapers/US/CA/fresno-county.js index 53f587841..ad2bdf8f1 100644 --- a/src/shared/scrapers/US/CA/fresno-county.js +++ b/src/shared/scrapers/US/CA/fresno-county.js @@ -12,7 +12,7 @@ const scraper = { url: 'https://www.co.fresno.ca.us/departments/public-health/covid-19', maintainers: [maintainers.jbencina], async scraper() { - const $ = await fetch.page(this.url); + const $ = await fetch.page(this, this.url, 'default'); return { cases: parse.number( $('li:contains("Total cases")') diff --git a/src/shared/scrapers/US/CA/glenn-county.js b/src/shared/scrapers/US/CA/glenn-county.js index b1d790097..726f03680 100644 --- 
a/src/shared/scrapers/US/CA/glenn-county.js +++ b/src/shared/scrapers/US/CA/glenn-county.js @@ -16,7 +16,7 @@ const scraper = { ? 'https://www.countyofglenn.net/dept/health-human-services/public-health/welcome' : 'https://www.countyofglenn.net/dept/health-human-services/public-health/covid-19'; - const $ = await fetch.page(this.url); + const $ = await fetch.page(this, this.url, 'default'); if (datetime.scrapeDateIsBefore('2020-03-17')) { const cases = parse.number( $('font:contains("Glenn County COVID-19 Cases")') diff --git a/src/shared/scrapers/US/CA/index.js b/src/shared/scrapers/US/CA/index.js index 8192fa111..860cdacfa 100644 --- a/src/shared/scrapers/US/CA/index.js +++ b/src/shared/scrapers/US/CA/index.js @@ -44,7 +44,7 @@ const scraper = { async _fetchLatest() { this.url = 'https://docs.google.com/spreadsheets/d/1CwZA4RPNf_hUrwzNLyGGNHRlh1cwl8vDHwIoae51Hac/gviz/tq?tqx=out:csv&sheet=master'; - const data = await fetch.csv(this.url); + const data = await fetch.csv(this, this.url, 'default'); return this._processData(data); }, scraper: { @@ -54,7 +54,7 @@ const scraper = { '2020-04-06': async function() { this.url = 'https://docs.google.com/spreadsheets/d/1CwZA4RPNf_hUrwzNLyGGNHRlh1cwl8vDHwIoae51Hac/gviz/tq?tqx=out:csv&sheet=2020-04-06'; - const data = await fetch.csv(this.url, false); + const data = await fetch.csv(this, this.url, 'default', false); return this._processData(data); }, '2020-04-07': async function() { diff --git a/src/shared/scrapers/US/CA/kern-county.js b/src/shared/scrapers/US/CA/kern-county.js index b4c4a1ecd..c410dcc7d 100644 --- a/src/shared/scrapers/US/CA/kern-county.js +++ b/src/shared/scrapers/US/CA/kern-county.js @@ -13,14 +13,14 @@ const scraper = { type: 'table', scraper: { '0': async function() { - let $ = await fetch.headless(this.url); + let $ = await fetch.headless(this, this.url, 'default'); let cases = 0; let tested = 0; // Pull out and fetch the embedded iframe const frameURL = $('iframe').attr('src'); - $ = await fetch.headless(frameURL); + $ = await fetch.headless(this, frameURL, 'default'); const getVal = function(title) { const val = parse.number( diff --git a/src/shared/scrapers/US/CA/kings-county.js b/src/shared/scrapers/US/CA/kings-county.js index 2374b42e8..99c2354db 100644 --- a/src/shared/scrapers/US/CA/kings-county.js +++ b/src/shared/scrapers/US/CA/kings-county.js @@ -13,7 +13,7 @@ const scraper = { url: 'https://www.countyofkings.com/departments/health-welfare/public-health/coronavirus-disease-2019-covid-19/-fsiteid-1', async scraper() { - const $ = await fetch.page(this.url); + const $ = await fetch.page(this, this.url, 'default'); const cases = parse.number( $('h3:contains("Confirmed Cases")') .text() diff --git a/src/shared/scrapers/US/CA/los-angeles-county.js b/src/shared/scrapers/US/CA/los-angeles-county.js index 384f6d18b..0fc04c25d 100644 --- a/src/shared/scrapers/US/CA/los-angeles-county.js +++ b/src/shared/scrapers/US/CA/los-angeles-county.js @@ -18,7 +18,7 @@ const scraper = { url: 'http://www.publichealth.lacounty.gov/media/Coronavirus/js/casecounter.js', scraper: { '0': async function() { - const $ = await fetch.page('http://www.publichealth.lacounty.gov/media/Coronavirus/'); + const $ = await fetch.page(this, 'http://www.publichealth.lacounty.gov/media/Coronavirus/', 'default'); return { cases: parse.number( $('.counter') @@ -33,7 +33,7 @@ const scraper = { }; }, '2020-03-27': async function() { - const $ = await fetch.page(this.url); + const $ = await fetch.page(this, this.url, 'default'); const { content } = 
JSON.parse($.text().match(/data = (?<json>[\S\s]+?);/).groups.json); return { cases: parse.number(content.count), diff --git a/src/shared/scrapers/US/CA/madera-county.js b/src/shared/scrapers/US/CA/madera-county.js index 5b7f7dca0..26130c443 100644 --- a/src/shared/scrapers/US/CA/madera-county.js +++ b/src/shared/scrapers/US/CA/madera-county.js @@ -14,7 +14,7 @@ const scraper = { url: 'https://www.maderacounty.com/government/public-health/health-updates/corona-virus', scraper: { '0': async function() { - const $ = await fetch.page(this.url); + const $ = await fetch.page(this, this.url, 'default'); const $el = $('*:contains("Confirmed cases")').first(); const matches = $el.text().match(/Confirmed cases:.*?(\d+)/); return { cases: parse.number(matches[1]) }; diff --git a/src/shared/scrapers/US/CA/marin-county.js b/src/shared/scrapers/US/CA/marin-county.js index 24d356910..8fccdbf10 100644 --- a/src/shared/scrapers/US/CA/marin-county.js +++ b/src/shared/scrapers/US/CA/marin-county.js @@ -14,13 +14,13 @@ const scraper = { url: 'https://coronavirus.marinhhs.org/surveillance', scraper: { '0': async function() { - const $ = await fetch.page(this.url); + const $ = await fetch.page(this, this.url, 'default'); const text = $('td:contains("confirmed cases of COVID-19")').text(); const cases = parse.number(text.match(/there have been (\d+) confirmed cases of/)[1]); return { cases }; }, '2020-04-15': async function() { - await fetch.page(this.url); + await fetch.page(this, this.url, 'default'); throw new DeprecatedError('Sunsetting county level scrapers'); } } diff --git a/src/shared/scrapers/US/CA/mendocino-county.js b/src/shared/scrapers/US/CA/mendocino-county.js index f2611e732..c355f521f 100644 --- a/src/shared/scrapers/US/CA/mendocino-county.js +++ b/src/shared/scrapers/US/CA/mendocino-county.js @@ -14,7 +14,7 @@ const scraper = { url: 'https://www.mendocinocounty.org/community/novel-coronavirus', scraper: { '0': async function() { - const $ = await fetch.page(this.url); + const $ = await fetch.page(this, this.url, 'default'); const cases = parse.number( $('strong:contains("current cases of COVID-19")') .text() @@ -23,20 +23,20 @@ return { cases }; }, '2020-03-18': async function() { - const $ = await fetch.page(this.url); + const $ = await fetch.page(this, this.url, 'default'); const $strong = $('strong:contains("current case")'); const cases = parse.number($strong.text().match(/(\d+) current/)[1]); return { cases }; }, '2020-03-23': async function() { - const $ = await fetch.page(this.url); + const $ = await fetch.page(this, this.url, 'default'); const $outerLI = $('li:contains("Testing Numbers")'); const $li = $outerLI.find('li:contains("Total Positives")'); const cases = parse.number($li.text().split(':')[1]); return { cases }; }, '2020-04-15': async function() { - await fetch.page(this.url); + await fetch.page(this, this.url, 'default'); throw new DeprecatedError('Sunsetting county level scrapers'); } } diff --git a/src/shared/scrapers/US/CA/merced-county.js b/src/shared/scrapers/US/CA/merced-county.js index 0fe8c5c96..7ec8eacf9 100644 --- a/src/shared/scrapers/US/CA/merced-county.js +++ b/src/shared/scrapers/US/CA/merced-county.js @@ -15,7 +15,7 @@ const scraper = { url: 'https://www.co.merced.ca.us/3350/Coronavirus-Disease-2019', scraper: { '0': async function() { - const $ = await fetch.page(this.url); + const $ = await fetch.page(this, this.url, 'default'); const $table = $('h3:contains("Merced County COVID-19 Statistics")') .parent() .next('table'); @@ -46,7 +46,7 @@ const
scraper = { '2020-03-16': async function() { this.type = 'table'; - const $ = await fetch.page(this.url); + const $ = await fetch.page(this, this.url, 'default'); cheerioTableparser($); const $table = $('td:contains("Cases")').closest('table'); @@ -72,7 +72,7 @@ const scraper = { }; }, '2020-04-15': async function() { - await fetch.page(this.url); + await fetch.page(this, this.url, 'default'); throw new DeprecatedError('Sunsetting county level scrapers'); } } diff --git a/src/shared/scrapers/US/CA/mono-county.js b/src/shared/scrapers/US/CA/mono-county.js index 389f53397..78487f50b 100644 --- a/src/shared/scrapers/US/CA/mono-county.js +++ b/src/shared/scrapers/US/CA/mono-county.js @@ -13,7 +13,7 @@ const scraper = { url: 'https://monocovid19-monomammoth.hub.arcgis.com/', scraper: { '0': async function() { - const $ = await fetch.headless(this.url); + const $ = await fetch.headless(this, this.url, 'default'); const cases = parse.number( $('h4:contains("POSITIVE")') .first() @@ -24,7 +24,7 @@ const scraper = { return { cases }; }, '2020-03-19': async function() { - const $ = await fetch.headless(this.url); + const $ = await fetch.headless(this, this.url, 'default'); const cases = parse.number( $('h4:contains("POSITIVECASES")') .first() diff --git a/src/shared/scrapers/US/CA/monterey-county.js b/src/shared/scrapers/US/CA/monterey-county.js index 1555f594e..7cf3912b9 100644 --- a/src/shared/scrapers/US/CA/monterey-county.js +++ b/src/shared/scrapers/US/CA/monterey-county.js @@ -12,7 +12,7 @@ const scraper = { 'https://www.co.monterey.ca.us/government/departments-a-h/administrative-office/office-of-emergency-services/response/covid-19', type: 'table', async scraper() { - const $ = await fetch.page(this.url); + const $ = await fetch.page(this, this.url, 'default'); let cases = 0; cases += parse.number( diff --git a/src/shared/scrapers/US/CA/orange-county.js b/src/shared/scrapers/US/CA/orange-county.js index d5e762a14..7ada9c544 100644 --- a/src/shared/scrapers/US/CA/orange-county.js +++ b/src/shared/scrapers/US/CA/orange-county.js @@ -14,7 +14,7 @@ const scraper = { url: 'http://www.ochealthinfo.com/phs/about/epidasmt/epi/dip/prevention/novel_coronavirus', scraper: { '0': async function scraper() { - const $ = await fetch.page(this.url); + const $ = await fetch.page(this, this.url, 'default'); return { cases: parse.number( $('td:contains("Cases")') @@ -30,7 +30,7 @@ const scraper = { }, '2020-03-18': async function scraper() { this.url = 'https://occovid19.ochealthinfo.com/coronavirus-in-oc'; - await fetch.page(this.url); + await fetch.page(this, this.url, 'default'); throw new DeprecatedError('Need to scrape new page'); } } diff --git a/src/shared/scrapers/US/CA/placer-county.js b/src/shared/scrapers/US/CA/placer-county.js index 9a1671bf1..ee7e3cf20 100644 --- a/src/shared/scrapers/US/CA/placer-county.js +++ b/src/shared/scrapers/US/CA/placer-county.js @@ -13,7 +13,7 @@ const scraper = { url: 'https://www.placer.ca.gov/6448/Cases-in-Placer', scraper: { '0': async function() { - const $ = await fetch.page(this.url); + const $ = await fetch.page(this, this.url, 'default'); const $table = $('p:contains("Confirmed COVID-19 Cases in Placer County")') .nextAll('table') .first(); @@ -35,7 +35,7 @@ const scraper = { }; }, '3/28/2020': async function() { - const $ = await fetch.page(this.url); + const $ = await fetch.page(this, this.url, 'default'); return { cases: parse.number( $('td:contains("Cases")') diff --git a/src/shared/scrapers/US/CA/riverside-county.js 
b/src/shared/scrapers/US/CA/riverside-county.js index 883c08942..faf18560d 100644 --- a/src/shared/scrapers/US/CA/riverside-county.js +++ b/src/shared/scrapers/US/CA/riverside-county.js @@ -14,13 +14,13 @@ const scraper = { url: 'https://www.rivcoph.org/coronavirus', scraper: { '0': async function() { - const $ = await fetch.page(this.url); + const $ = await fetch.page(this, this.url, 'default'); const $el = $('p:contains("Confirmed cases:")').first(); const matches = $el.text().match(/Confirmed cases:.*?(\d+)/); return { cases: parse.number(matches[1]) }; }, '2020-04-15': async function() { - await fetch.page(this.url); + await fetch.page(this, this.url, 'default'); throw new DeprecatedError('Sunsetting county scraper'); } } diff --git a/src/shared/scrapers/US/CA/sacramento-county.js b/src/shared/scrapers/US/CA/sacramento-county.js index 9ff3c70c3..0b4a144d6 100644 --- a/src/shared/scrapers/US/CA/sacramento-county.js +++ b/src/shared/scrapers/US/CA/sacramento-county.js @@ -14,7 +14,7 @@ const scraper = { url: 'https://www.saccounty.net/COVID-19/Pages/default.aspx', scraper: { '0': async function() { - const $ = await fetch.page(this.url); + const $ = await fetch.page(this, this.url, 'default'); const $table = $('th:contains("Confirmed")').closest('table'); const $tds = $table.find('tr:nth-child(2) > td'); return { @@ -23,7 +23,7 @@ const scraper = { }; }, '2020-04-15': async function() { - await fetch.page(this.url); + await fetch.page(this, this.url, 'default'); throw new DeprecatedError('Sunsetting county scraper'); } } diff --git a/src/shared/scrapers/US/CA/san-benito-county.js b/src/shared/scrapers/US/CA/san-benito-county.js index 027eff973..5f4b4a843 100644 --- a/src/shared/scrapers/US/CA/san-benito-county.js +++ b/src/shared/scrapers/US/CA/san-benito-county.js @@ -14,7 +14,7 @@ const scraper = { url: 'https://hhsa.cosb.us/publichealth/communicable-disease/coronavirus/', scraper: { '0': async function scraper() { - const $ = await fetch.page(this.url); + const $ = await fetch.page(this, this.url, 'default'); const $table = $('h1:contains("San Benito County COVID-19 Case Count")') .nextAll('table') .first(); diff --git a/src/shared/scrapers/US/CA/san-bernardino-county.js b/src/shared/scrapers/US/CA/san-bernardino-county.js index a10f4a246..6a2c030c8 100644 --- a/src/shared/scrapers/US/CA/san-bernardino-county.js +++ b/src/shared/scrapers/US/CA/san-bernardino-county.js @@ -14,7 +14,7 @@ const scraper = { url: 'http://wp.sbcounty.gov/dph/coronavirus/', scraper: { '0': async function() { - const $ = await fetch.page(this.url); + const $ = await fetch.page(this, this.url, 'default'); const cases = parse.number( $('h3:contains("COVID-19 CASES")') .parent() @@ -23,7 +23,7 @@ const scraper = { return { cases }; }, '2020-04-15': async function() { - await fetch.page(this.url); + await fetch.page(this, this.url, 'default'); throw new DeprecatedError('Sunsetting county scraper'); } } diff --git a/src/shared/scrapers/US/CA/san-diego-county.js b/src/shared/scrapers/US/CA/san-diego-county.js index 1b69208bb..e3d8ce266 100644 --- a/src/shared/scrapers/US/CA/san-diego-county.js +++ b/src/shared/scrapers/US/CA/san-diego-county.js @@ -13,7 +13,7 @@ const scraper = { url: 'https://www.sandiegocounty.gov/content/sdc/hhsa/programs/phs/community_epidemiology/dc/2019-nCoV/status.html', scraper: { '0': async function scraper() { - const $ = await fetch.page(this.url); + const $ = await fetch.page(this, this.url, 'default'); let cases = 0; $('td:contains("Positive (confirmed cases)")') .nextAll('td') @@ 
-35,7 +35,7 @@ const scraper = { }; }, '2020-03-15': async function scraper() { - const $ = await fetch.page(this.url); + const $ = await fetch.page(this, this.url, 'default'); const cases = parse.number( $('td:contains("Total Positives")') .next() diff --git a/src/shared/scrapers/US/CA/san-francisco-county.js b/src/shared/scrapers/US/CA/san-francisco-county.js index 376d78802..8fc83770c 100644 --- a/src/shared/scrapers/US/CA/san-francisco-county.js +++ b/src/shared/scrapers/US/CA/san-francisco-county.js @@ -15,7 +15,7 @@ const scraper = { async scraper() { let deaths; let cases; - const $ = await fetch.page(this.url); + const $ = await fetch.page(this, this.url, 'default'); const $h2 = $('h2:contains("Cases in San Francisco")'); { const $p = $h2.nextAll('*:contains("Cases:")'); diff --git a/src/shared/scrapers/US/CA/san-joaquin-county.js b/src/shared/scrapers/US/CA/san-joaquin-county.js index 53dfa7086..a988d3e23 100644 --- a/src/shared/scrapers/US/CA/san-joaquin-county.js +++ b/src/shared/scrapers/US/CA/san-joaquin-county.js @@ -13,7 +13,7 @@ const scraper = { url: 'http://www.sjcphs.org/coronavirus.aspx#res', scraper: { '0': async function() { - const $ = await fetch.page(this.url); + const $ = await fetch.page(this, this.url, 'default'); this.type = 'paragraph'; const h3 = $('h6:contains("confirmed cases of COVID-19")') .first() @@ -22,7 +22,7 @@ const scraper = { return { cases }; }, '2020-03-17': async function() { - const $ = await fetch.page(this.url); + const $ = await fetch.page(this, this.url, 'default'); this.type = 'table'; const $table = $('h3:contains("San Joaquin County COVID-19 Numbers at a Glance")').closest('table'); const $headers = $table.find('tbody > tr:nth-child(2) > td'); diff --git a/src/shared/scrapers/US/CA/san-luis-obispo-county.js b/src/shared/scrapers/US/CA/san-luis-obispo-county.js index f4283238d..c7f025228 100644 --- a/src/shared/scrapers/US/CA/san-luis-obispo-county.js +++ b/src/shared/scrapers/US/CA/san-luis-obispo-county.js @@ -11,7 +11,7 @@ const scraper = { url: 'https://www.emergencyslo.org/en/covid19.aspx', type: 'paragraph', async scraper() { - const $ = await fetch.page(this.url); + const $ = await fetch.page(this, this.url, 'default'); let cases = $('td:contains("San Luis Obispo County")') .next() diff --git a/src/shared/scrapers/US/CA/san-mateo-county.js b/src/shared/scrapers/US/CA/san-mateo-county.js index 91db28ac9..4cf7ef67d 100644 --- a/src/shared/scrapers/US/CA/san-mateo-county.js +++ b/src/shared/scrapers/US/CA/san-mateo-county.js @@ -14,7 +14,7 @@ const scraper = { async scraper() { let deaths; let cases; - const $ = await fetch.page(this.url); + const $ = await fetch.page(this, this.url, 'default'); const $th = $('th:contains("COVID-19 Case Count")'); const $table = $th.closest('table'); { diff --git a/src/shared/scrapers/US/CA/santa-barbara-county.js b/src/shared/scrapers/US/CA/santa-barbara-county.js index 6a2fd047a..5fed385df 100644 --- a/src/shared/scrapers/US/CA/santa-barbara-county.js +++ b/src/shared/scrapers/US/CA/santa-barbara-county.js @@ -16,7 +16,7 @@ const scraper = { type: 'paragraph', scraper: { '0': async function() { - const $ = await fetch.page(this.url); + const $ = await fetch.page(this, this.url, 'default'); let cases = 0; cases += parse.number( @@ -31,7 +31,7 @@ const scraper = { '2020-03-26': async function() { this.type = 'table'; - const $ = await fetch.page(this.url); + const $ = await fetch.page(this, this.url, 'default'); cheerioTableparser($); let $table = $('td:contains("City or Area")').closest('table'); @@ 
-59,7 +59,7 @@ const scraper = { return { cases, tested }; }, '2020-04-15': async function() { - await fetch.page(this.url); + await fetch.page(this, this.url, 'default'); throw new DeprecatedError('Sunsetting county scraper'); } } diff --git a/src/shared/scrapers/US/CA/santa-clara-county.js b/src/shared/scrapers/US/CA/santa-clara-county.js index 890181ec7..0d7480a0c 100644 --- a/src/shared/scrapers/US/CA/santa-clara-county.js +++ b/src/shared/scrapers/US/CA/santa-clara-county.js @@ -14,7 +14,7 @@ const scraper = { url: 'https://www.sccgov.org/sites/phd/DiseaseInformation/novel-coronavirus/Pages/home.aspx', scraper: { '0': async function() { - const $ = await fetch.page(this.url); + const $ = await fetch.page(this, this.url, 'default'); const scriptData = $('script:contains("Total_Confirmed_Cases")')[0].children[0].data; const regExp = /\[.*\]/; const data = JSON.parse(regExp.exec(scriptData))[0]; @@ -26,7 +26,7 @@ const scraper = { }; }, '2020-05-15': async function() { - await fetch.page(this.url); + await fetch.page(this, this.url, 'default'); throw new DeprecatedError('Sunsetting county scraper'); } } diff --git a/src/shared/scrapers/US/CA/santa-cruz-county.js b/src/shared/scrapers/US/CA/santa-cruz-county.js index 108c6bee0..ab5fc84da 100644 --- a/src/shared/scrapers/US/CA/santa-cruz-county.js +++ b/src/shared/scrapers/US/CA/santa-cruz-county.js @@ -14,7 +14,7 @@ const scraper = { url: 'http://www.santacruzhealth.org/HSAHome/HSADivisions/PublicHealth/CommunicableDiseaseControl/Coronavirus.aspx', scraper: { '0': async function() { - const $ = await fetch.page(this.url); + const $ = await fetch.page(this, this.url, 'default'); const $h2 = $('p:contains("Total Confirmed Cases")').nextAll('h2'); if ($h2.html() === null) { throw new Error('H2 not found'); @@ -23,7 +23,7 @@ const scraper = { return { cases }; }, '2020-04-15': async function() { - await fetch.page(this.url); + await fetch.page(this, this.url, 'default'); throw new DeprecatedError('Moved to Tableau'); } } diff --git a/src/shared/scrapers/US/CA/shasta-county.js b/src/shared/scrapers/US/CA/shasta-county.js index d581168df..dbc125eb1 100644 --- a/src/shared/scrapers/US/CA/shasta-county.js +++ b/src/shared/scrapers/US/CA/shasta-county.js @@ -14,7 +14,7 @@ const scraper = { '0': async function() { this.url = 'https://www.co.shasta.ca.us/index/hhsa/health-safety/current-heath-concerns/coronavirus'; this.type = 'paragraph'; - const $ = await fetch.page(this.url); + const $ = await fetch.page(this, this.url, 'default'); const $el = $('h3:contains("Positive cases:")').first(); const matches = $el.text().match(/Positive cases:.*?(\d+)/); return { cases: parse.number(matches[1]) }; @@ -22,7 +22,7 @@ const scraper = { '2020-03-20': async function() { this.url = 'https://www.co.shasta.ca.us/covid-19/overview'; this.type = 'table'; - const $ = await fetch.page(this.url); + const $ = await fetch.page(this, this.url, 'default'); const $el = $('td:contains("Total Confirmed Cases")').next('td'); return { cases: parse.number($el.text()) }; } diff --git a/src/shared/scrapers/US/CA/solano-county.js b/src/shared/scrapers/US/CA/solano-county.js index d3aa8fe58..3e28ce0ef 100644 --- a/src/shared/scrapers/US/CA/solano-county.js +++ b/src/shared/scrapers/US/CA/solano-county.js @@ -14,13 +14,13 @@ const scraper = { url: 'http://www.solanocounty.com/depts/ph/coronavirus.asp', scraper: { '0': async function() { - const $ = await fetch.page(this.url); + const $ = await fetch.page(this, this.url, 'default'); const $el = $('*:contains("Number of Positive 
Cases")').first(); const matches = $el.text().match(/Number of Positive Cases in Solano County: (\d+)/); return { cases: parse.number(matches[1]) }; }, '2020-03-23': async function() { - const $ = await fetch.page(this.url); + const $ = await fetch.page(this, this.url, 'default'); const lines = $('font:contains("Confirmed COVID-19")') .html() diff --git a/src/shared/scrapers/US/CA/sonoma-county-argcgis.js b/src/shared/scrapers/US/CA/sonoma-county-argcgis.js index ae43846c1..96c1d15a3 100644 --- a/src/shared/scrapers/US/CA/sonoma-county-argcgis.js +++ b/src/shared/scrapers/US/CA/sonoma-county-argcgis.js @@ -18,7 +18,7 @@ const scraper = { } ], async scraper() { - const data = await fetch.csv(this.url); + const data = await fetch.csv(this, this.url, 'default'); const counties = []; for (const county of data) { counties.push({ diff --git a/src/shared/scrapers/US/CA/sonoma-county.js b/src/shared/scrapers/US/CA/sonoma-county.js index 458bfcb7f..93c1c8825 100644 --- a/src/shared/scrapers/US/CA/sonoma-county.js +++ b/src/shared/scrapers/US/CA/sonoma-county.js @@ -14,7 +14,7 @@ const scraper = { url: 'https://socoemergency.org/emergency/novel-coronavirus/novel-coronavirus-in-sonoma-county/', scraper: { '0': async function() { - const $ = await fetch.page(this.url); + const $ = await fetch.page(this, this.url, 'default'); const $th = $('th:contains("Total in Sonoma County")'); const $table = $th.closest('table'); const $td = $table.find('td:last-child'); @@ -22,7 +22,7 @@ const scraper = { return { cases }; }, '2020-03-28': async function() { - await fetch.page(this.url); + await fetch.page(this, this.url, 'default'); throw new DeprecatedError('Sonoma switched to ArcGIS, which is handled by another scraper'); } } diff --git a/src/shared/scrapers/US/CA/stanislaus-county.js b/src/shared/scrapers/US/CA/stanislaus-county.js index 8af555436..5f584de1c 100644 --- a/src/shared/scrapers/US/CA/stanislaus-county.js +++ b/src/shared/scrapers/US/CA/stanislaus-county.js @@ -13,7 +13,7 @@ const scraper = { url: 'http://www.schsa.org/PublicHealth/pages/corona-virus/', scraper: { '0': async function() { - const $ = await fetch.page(this.url); + const $ = await fetch.page(this, this.url, 'default'); return { cases: parse.number( $('.counter') @@ -23,7 +23,7 @@ const scraper = { }; }, '2020-03-25': async function() { - const $ = await fetch.page(this.url); + const $ = await fetch.page(this, this.url, 'default'); return { cases: parse.number( $('p:contains("Positive Cases")') diff --git a/src/shared/scrapers/US/CA/ventura-county.js b/src/shared/scrapers/US/CA/ventura-county.js index 40caef698..f6deb3093 100644 --- a/src/shared/scrapers/US/CA/ventura-county.js +++ b/src/shared/scrapers/US/CA/ventura-county.js @@ -14,7 +14,7 @@ const scraper = { scraper: { '0': async function() { this.url = 'https://www.ventura.org/covid19/'; - const $ = await fetch.headless(this.url); + const $ = await fetch.headless(this, this.url, 'default'); let cases = 0; let tested = 0; @@ -41,7 +41,7 @@ const scraper = { '2020-03-16': async function() { this.url = 'https://www.ventura.org/covid19/'; - const $ = await fetch.headless(this.url); + const $ = await fetch.headless(this, this.url, 'default'); let cases = 0; let tested = 0; @@ -72,7 +72,7 @@ const scraper = { '2020-03-18': async function() { this.url = 'https://www.ventura.org/covid19/'; - const $ = await fetch.page(this.url); + const $ = await fetch.page(this, this.url, 'default'); const cases = parse.number( $('td:contains("COVID-19 Cases")') @@ -87,7 +87,7 @@ const scraper = { 
'2020-03-19': async function() { this.url = 'https://www.vcemergency.com'; - const $ = await fetch.page(this.url); + const $ = await fetch.page(this, this.url, 'default'); const cases = parse.number( $('td:contains("COVID-19 Cases")') @@ -109,7 +109,7 @@ const scraper = { '2020-03-25': async function() { this.url = 'https://www.vcemergency.com'; - const $ = await fetch.page(this.url); + const $ = await fetch.page(this, this.url, 'default'); const cases = parse.number( $('td:contains("Positive Cases")') @@ -131,7 +131,7 @@ const scraper = { '2020-03-26': async function() { this.url = 'https://www.vcemergency.com'; - const $ = await fetch.page(this.url); + const $ = await fetch.page(this, this.url, 'default'); const positiveCases = $('td:contains("Positive Cases")').closest('tr'); if (positiveCases.text() !== 'Positive Cases') { @@ -154,7 +154,7 @@ const scraper = { '2020-03-30': async function() { this.url = 'https://www.vcemergency.com'; - const $ = await fetch.page(this.url); + const $ = await fetch.page(this, this.url, 'default'); const cases = parse.number( $('td:contains("TOTAL CASES")') .first() diff --git a/src/shared/scrapers/US/CA/yolo-county.js b/src/shared/scrapers/US/CA/yolo-county.js index 7bc6d3822..706a30e88 100644 --- a/src/shared/scrapers/US/CA/yolo-county.js +++ b/src/shared/scrapers/US/CA/yolo-county.js @@ -16,7 +16,7 @@ const scraper = { 'https://www.yolocounty.org/health-human-services/adults/communicable-disease-investigation-and-control/novel-coronavirus-2019', scraper: { '0': async function() { - const $ = await fetch.page(this.url); + const $ = await fetch.page(this, this.url, 'default'); if (datetime.scrapeDateIsBefore('2020-03-17')) { const $h3 = $('h3:contains("confirmed case")'); const matches = $h3.text().match(/there are (\d+) confirmed cases? 
in Yolo/); @@ -27,7 +27,7 @@ const scraper = { return { cases: parse.number(matches[1]) }; }, '2020-04-15': async function() { - await fetch.page(this.url); + await fetch.page(this, this.url, 'default'); throw new DeprecatedError('Sunsetting county scraper'); } } diff --git a/src/shared/scrapers/US/CO/index.js b/src/shared/scrapers/US/CO/index.js index 2035b3ccb..809f90648 100644 --- a/src/shared/scrapers/US/CO/index.js +++ b/src/shared/scrapers/US/CO/index.js @@ -17,7 +17,7 @@ const scraper = { this.url = 'https://docs.google.com/document/d/e/2PACX-1vRSxDeeJEaDxir0cCd9Sfji8ZPKzNaCPZnvRCbG63Oa1ztz4B4r7xG_wsoC9ucd_ei3--Pz7UD50yQD/pub'; this.type = 'list'; - const $ = await fetch.page(this.url); + const $ = await fetch.page(this, this.url, 'default'); const counties = []; const $lis = $('p:contains("Positive cases by county of residence")') .nextAll('ul') @@ -79,7 +79,7 @@ const scraper = { this.url = 'https://docs.google.com/document/d/e/2PACX-1vRSxDeeJEaDxir0cCd9Sfji8ZPKzNaCPZnvRCbG63Oa1ztz4B4r7xG_wsoC9ucd_ei3--Pz7UD50yQD/pub'; this.type = 'paragraph'; - const $ = await fetch.page(this.url); + const $ = await fetch.page(this, this.url, 'default'); return { cases: parse.number( $('span:contains("Positive")') @@ -96,7 +96,7 @@ const scraper = { '2020-03-18': async function() { this.url = 'https://opendata.arcgis.com/datasets/46c727cc29424b1fb9db67554c7df04e_0.csv'; this.type = 'csv'; - const data = await fetch.csv(this.url); + const data = await fetch.csv(this, this.url, 'default'); const counties = []; for (const county of data) { counties.push({ @@ -111,7 +111,7 @@ const scraper = { '2020-03-19': async function() { this.url = 'https://opendata.arcgis.com/datasets/dec84f18254341419c514af8f9e784ba_0.csv'; this.type = 'csv'; - const data = await fetch.csv(this.url); + const data = await fetch.csv(this, this.url, 'default'); const counties = []; for (const county of data) { counties.push({ @@ -126,7 +126,7 @@ const scraper = { '2020-03-20': async function() { this.url = 'https://opendata.arcgis.com/datasets/fbae539746324ca69ff34f086286845b_0.csv'; this.type = 'csv'; - const data = await fetch.csv(this.url); + const data = await fetch.csv(this, this.url, 'default'); const counties = []; for (const county of data) { counties.push({ diff --git a/src/shared/scrapers/US/CT/index.js b/src/shared/scrapers/US/CT/index.js index 73bae2ab4..166e55261 100644 --- a/src/shared/scrapers/US/CT/index.js +++ b/src/shared/scrapers/US/CT/index.js @@ -27,7 +27,7 @@ const scraper = { '0': async function() { this.type = 'list'; const counties = []; - const $ = await fetch.page(this.url); + const $ = await fetch.page(this, this.url, 'default'); const $lis = $('span:contains("Latest COVID-19 Testing Data in Connecticut")') .nextAll('ul') .first() @@ -49,7 +49,7 @@ const scraper = { '2020-03-18': async function() { this.type = 'paragraph'; const counties = []; - const $ = await fetch.page(this.url); + const $ = await fetch.page(this, this.url, 'default'); const p = $(':contains("Fairfield County:")') .last() .text(); @@ -68,7 +68,7 @@ const scraper = { '2020-03-19': async function() { this.type = 'table'; const counties = []; - const $ = await fetch.page(this.url); + const $ = await fetch.page(this, this.url, 'default'); const $table = $('td:contains("Fairfield County")').closest('table'); const $trs = $table.find('tbody > tr:not(:last-child)'); @@ -94,7 +94,7 @@ const scraper = { let body; try { - body = await fetch.pdf(this.url); + body = await fetch.pdf(this, this.url, 'default'); } catch (err) { // The CT website 
does a 302 to a 404.html page if the PDF isn't yet available // This manifests as a PDF parsing error @@ -147,7 +147,7 @@ const scraper = { 'https://maps.ct.gov/arcgis/rest/services/CT_DPH_COVID_19_PROD_Layers/FeatureServer/1/query?f=json&where=1%3D1&returnGeometry=false&outFields=*'; this.type = 'json'; - const data = await fetch.json(this.url); + const data = await fetch.json(this, this.url, 'default'); const counties = []; data.features.forEach(item => { diff --git a/src/shared/scrapers/US/DC/index.js b/src/shared/scrapers/US/DC/index.js index 8abb07bb4..2e51b4d03 100644 --- a/src/shared/scrapers/US/DC/index.js +++ b/src/shared/scrapers/US/DC/index.js @@ -35,12 +35,12 @@ const scraper = { const formUrl = 'https://microstrategy.dc.gov/MicroStrategy/servlet/mstrWeb?evt=3067&src=mstrWeb.3067&reportID=DA2251A711EA6FB482660080EFA55B20&reportViewMode=1'; - const form = await fetch.page(formUrl, false, getOptions); + const form = await fetch.page(this, formUrl, 'form', false, getOptions); const rb = form('form input[name="rb"]').val(); const rawUrl = `https://microstrategy.dc.gov/MicroStrategy/export/Health_Statistics.csv?evt=3012&src=mstrWeb.3012&exportSection=1&exportFormatGrids=csvIServer&exportPlaintextDelimiter=1&exportMetricValuesAsText=0&exportHeadersAsText=0&exportFilterDetails=0&exportOverlapGridTitles=3&SaveReportProperties=*-1.*-1.0.0.0&rb=${rb}`; - const data = await fetch.raw(rawUrl, false, getOptions); + const data = await fetch.raw(this, rawUrl, 'default', false, getOptions); const json = await new Promise((resolve, reject) => { csvParse(data.slice(data.indexOf('"')).replace(/\0|\*/g, ''), (err, output) => { diff --git a/src/shared/scrapers/US/DE/index.js b/src/shared/scrapers/US/DE/index.js index 47863b089..8dde87e90 100644 --- a/src/shared/scrapers/US/DE/index.js +++ b/src/shared/scrapers/US/DE/index.js @@ -38,7 +38,7 @@ const scraper = { if (datetime.scrapeDateIsBefore('2020-03-16')) { this.url = 'https://www.dhss.delaware.gov/dhss/dph/epi/2019novelcoronavirus.html'; this.type = 'table'; - const $ = await fetch.page(this.url); + const $ = await fetch.page(this, this.url, 'default'); const $td = $('*:contains("County breakdown")') .closest('tr') .find('td:last-child'); @@ -57,7 +57,7 @@ const scraper = { } this.url = 'http://opendata.arcgis.com/datasets/c8d4efa2a6bd48a1a7ae074a8166c6fa_0.csv'; this.type = 'csv'; - const data = await fetch.csv(this.url); + const data = await fetch.csv(this, this.url, 'default'); const counties = []; for (const county of data) { const countyObj = { @@ -76,7 +76,7 @@ const scraper = { this.url = 'https://services1.arcgis.com/PlCPCPzGOwulHUHo/arcgis/rest/services/DEMA_COVID_County_Boundary_Time_VIEW/FeatureServer/0/query?f=json&where=1%3D1&returnGeometry=false&spatialRel=esriSpatialRelIntersects&outFields=*&resultOffset=0&resultRecordCount=50&cacheHint=true'; this.type = 'json'; - const data = await fetch.json(this.url); + const data = await fetch.json(this, this.url, 'default'); const counties = []; for (const countyData of data.features) { diff --git a/src/shared/scrapers/US/FL/index.js b/src/shared/scrapers/US/FL/index.js index 801d9656d..a62ae00a0 100644 --- a/src/shared/scrapers/US/FL/index.js +++ b/src/shared/scrapers/US/FL/index.js @@ -104,7 +104,7 @@ const scraper = { this.type = 'table'; this.url = 'http://www.floridahealth.gov/diseases-and-conditions/COVID-19/index.html'; const countiesMap = {}; - const $ = await fetch.page(this.url); + const $ = await fetch.page(this, this.url, 'default'); const $table = $('*:contains("Diagnosed in 
Florida")').closest('table'); const $trs = $table.find('tr'); $trs.each((index, tr) => { @@ -137,7 +137,7 @@ const scraper = { '2020-03-16': async function() { this.type = 'csv'; this.url = 'https://opendata.arcgis.com/datasets/b4930af3f43a48138c70bca409b5c452_0.csv'; - const data = await fetch.csv(this.url); + const data = await fetch.csv(this, this.url, 'default'); let counties = []; for (const county of data) { counties.push({ @@ -156,7 +156,7 @@ const scraper = { this.type = 'json'; this.url = 'https://services1.arcgis.com/CY1LXxl9zlJeBuRZ/arcgis/rest/services/Florida_Testing/FeatureServer/0/query?f=json&where=1%3D1&returnGeometry=true&spatialRel=esriSpatialRelIntersects&maxAllowableOffset=4891&geometry=%7B%22xmin%22%3A-10018754.1713954%2C%22ymin%22%3A2504688.542850271%2C%22xmax%22%3A-7514065.628547024%2C%22ymax%22%3A5009377.085698649%2C%22spatialReference%22%3A%7B%22wkid%22%3A102100%2C%22latestWkid%22%3A3857%7D%7D&geometryType=esriGeometryEnvelope&inSR=102100&outFields=*&outSR=102100&resultType=tile'; - const data = await fetch.json(this.url); + const data = await fetch.json(this, this.url, 'default'); let counties = []; for (const county of data.features) { @@ -184,8 +184,8 @@ const scraper = { }, '2020-03-25': async function() { this.type = 'csv'; - this.url = await fetch.getArcGISCSVURL(1, '74c7375b03894e68920c2d0131eef1e6', 'Florida_Testing'); - const data = await fetch.csv(this.url); + this.url = await fetch.getArcGISCSVURL(this, 1, '74c7375b03894e68920c2d0131eef1e6', 'Florida_Testing'); + const data = await fetch.csv(this, this.url, 'default'); let counties = []; for (const county of data) { @@ -212,7 +212,7 @@ const scraper = { '2020-03-30': async function() { this.type = 'csv'; this.url = 'https://opendata.arcgis.com/datasets/a7887f1940b34bf5a02c6f7f27a5cb2c_0.csv'; - const data = await fetch.csv(this.url); + const data = await fetch.csv(this, this.url, 'default'); let counties = []; const unassigned = { diff --git a/src/shared/scrapers/US/GA/index.js b/src/shared/scrapers/US/GA/index.js index 47f97d892..c785476c5 100755 --- a/src/shared/scrapers/US/GA/index.js +++ b/src/shared/scrapers/US/GA/index.js @@ -186,7 +186,7 @@ const scraper = { }, scraper: { '0': async function() { - const $ = await fetch.page(this.url); + const $ = await fetch.page(this, this.url, 'default'); let counties = []; const $trs = $('table:contains(County):contains(Cases) tbody > tr'); $trs.each((index, tr) => { @@ -210,11 +210,11 @@ const scraper = { return counties; }, '2020-03-27': async function() { - const tmp = await fetch.page(this.url); + const tmp = await fetch.page(this, this.url, 'tmpindex'); const pageHTML = tmp.html(); [this.url] = pageHTML.match(/https:\/\/(.*)\.cloudfront\.net/); - const $ = await fetch.page(this.url); + const $ = await fetch.page(this, this.url, 'default'); let counties = []; const $trs = $('.tcell:contains("COVID-19 Confirmed Cases By County")') .closest('tbody') diff --git a/src/shared/scrapers/US/GU/index.js b/src/shared/scrapers/US/GU/index.js index 3a182f6e7..910ab62cc 100644 --- a/src/shared/scrapers/US/GU/index.js +++ b/src/shared/scrapers/US/GU/index.js @@ -19,7 +19,7 @@ const scraper = { type: 'table', scraper: { '0': async function() { - const $ = await fetch.page(this.url); + const $ = await fetch.page(this, this.url, 'default'); const $divCases = $('.et_pb_cta_2') .find('div:contains("Confirmed Positives")') .find('p:nth-child(3)'); @@ -34,7 +34,7 @@ const scraper = { }; }, '2020-03-27': async function() { - await fetch.page(this.url); + await fetch.page(this, 
this.url, 'default'); throw new DeprecatedError(`Guam stopped offering totals on their website, but there's a sweet coloring book`); } } diff --git a/src/shared/scrapers/US/HI/index.js b/src/shared/scrapers/US/HI/index.js index c878e169f..a25b309dd 100644 --- a/src/shared/scrapers/US/HI/index.js +++ b/src/shared/scrapers/US/HI/index.js @@ -29,7 +29,7 @@ const scraper = { this.url = 'https://health.hawaii.gov/docd/advisories/novel-coronavirus-2019/'; this.type = 'table'; let counties = []; - const $ = await fetch.page(this.url); + const $ = await fetch.page(this, this.url, 'default'); const $table = $('*:contains("Novel Coronavirus in Hawaii")').closest('table'); const $trs = $table.find('tr'); @@ -80,7 +80,7 @@ const scraper = { this.type = 'list'; let counties = []; - const $ = await fetch.page(this.url); + const $ = await fetch.page(this, this.url, 'default'); const $list = $('dd:contains("Honolulu County")') .parent() .find('dd'); diff --git a/src/shared/scrapers/US/IA/index.js b/src/shared/scrapers/US/IA/index.js index 67b1542d1..ed7c50020 100644 --- a/src/shared/scrapers/US/IA/index.js +++ b/src/shared/scrapers/US/IA/index.js @@ -23,7 +23,7 @@ const scraper = { this.url = 'https://idph.iowa.gov/emerging-health-issues/novel-coronavirus'; this.type = 'table'; const counties = []; - const $ = await fetch.headless(this.url); + const $ = await fetch.headless(this, this.url, 'default'); const $table = $('caption:contains("Reported Cases in Iowa by County")').closest('table'); const $trs = $table.find('tbody > tr:not(:last-child)'); $trs.each((index, tr) => { @@ -49,7 +49,7 @@ const scraper = { '2020-03-20': async function() { this.url = 'https://opendata.arcgis.com/datasets/6a84756c2e444a87828bb7ce699fdac6_0.csv'; this.type = 'csv'; - const data = await fetch.csv(this.url); + const data = await fetch.csv(this, this.url, 'default'); const counties = []; for (const county of data) { let countyName = county.Name; diff --git a/src/shared/scrapers/US/ID/index.js b/src/shared/scrapers/US/ID/index.js index 8875e4ee9..64206cd16 100644 --- a/src/shared/scrapers/US/ID/index.js +++ b/src/shared/scrapers/US/ID/index.js @@ -68,7 +68,7 @@ const scraper = { scraper: { '0': async function() { this.url = 'https://coronavirus.idaho.gov'; - const $ = await fetch.page(this.url); + const $ = await fetch.page(this, this.url, 'default'); const $th = $('th:contains("Public Health District")'); const $table = $th.closest('table'); @@ -116,7 +116,7 @@ const scraper = { 'https://public.tableau.com/views/DPHIdahoCOVID-19Dashboard_V2/DPHCOVID19Dashboard2?%3Aembed=y&%3AshowVizHome=no&%3Adisplay_count=y&%3Adisplay_static_image=y&%3AbootstrapWhenNotified=true'; // Get the Tableau chart - const $ = await fetch.page(this.url); + const $ = await fetch.page(this, this.url, 'tmpindex'); // Pull out our session id from the json stuffed inside the textarea const textArea = $('textarea#tsConfigContainer').text(); @@ -130,7 +130,7 @@ const scraper = { // POST const options = { method: 'post', args: { sheet_id: sheet } }; - let data = await fetch.raw(url, undefined, options); + let data = await fetch.raw(this, url, 'default', undefined, options); // Now regex out the chunk of json we need const re = /^\d+;({.*})\d+;({.*})$/; @@ -191,7 +191,7 @@ const scraper = { 'https://services1.arcgis.com/CNPdEkvnGl65jCX8/arcgis/rest/services/iyptX/FeatureServer/0/query?f=json&where=1=1&returnGeometry=false&outFields=*'; this.type = 'json'; - const data = await fetch.json(this.url); + const data = await fetch.json(this, this.url, 'default'); 
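// Cache keys other than 'default' appear wherever one scraper fetches more than
// one resource, so cached responses cannot collide: throwaway intermediate pages
// get keys like 'tmpindex' (the Tableau bootstrap pages above), and the KS
// scraper below names its two PDFs separately. A hedged sketch of the pattern,
// illustrative only:
//
//   const pdfScrape = await fetch.pdf(this, this.url, 'cases');  // county counts PDF
//   const deathData = await fetch.pdf(this, pdfUrl, 'deaths');   // mortality PDF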
const counties = data.features.map(item => { return { diff --git a/src/shared/scrapers/US/IL/index.js b/src/shared/scrapers/US/IL/index.js index 8f75fb716..4f8d1d521 100755 --- a/src/shared/scrapers/US/IL/index.js +++ b/src/shared/scrapers/US/IL/index.js @@ -38,7 +38,7 @@ const scraper = { this.url = 'http://www.dph.illinois.gov/sitefiles/COVIDTestResults.json'; } - const data = await fetch.json(this.url); + const data = await fetch.json(this, this.url, 'default'); const counties = []; const cookCounty = { county: 'Cook County', cases: 0, deaths: 0, tested: 0 }; for (const county of data.characteristics_by_county.values) { diff --git a/src/shared/scrapers/US/IN/index.js b/src/shared/scrapers/US/IN/index.js index 3d3943dc5..803c95b3c 100644 --- a/src/shared/scrapers/US/IN/index.js +++ b/src/shared/scrapers/US/IN/index.js @@ -18,7 +18,7 @@ const scraper = { 'St Joseph': 'St. Joseph' }, async scraper() { - const data = await fetch.csv(this.url); + const data = await fetch.csv(this, this.url, 'default'); const counties = []; for (const county of data) { let countyName = parse.string(county.COUNTYNAME); diff --git a/src/shared/scrapers/US/KS/index.js b/src/shared/scrapers/US/KS/index.js index fc818641d..525eff6d6 100644 --- a/src/shared/scrapers/US/KS/index.js +++ b/src/shared/scrapers/US/KS/index.js @@ -142,7 +142,7 @@ const scraper = { this.url = `${this._baseUrl}COVID-19_${datePart}_.pdf`; this.type = 'pdf'; - const body = await fetch.pdf(this.url); + const body = await fetch.pdf(this, this.url, 'default'); if (body === null) { throw new Error(`No data for ${date}`); @@ -191,7 +191,7 @@ const scraper = { this.type = 'json'; this.url = 'https://services9.arcgis.com/Q6wTdPdCh608iNrJ/arcgis/rest/services/COVID19_CountyStatus_KDHE/FeatureServer/0/query?f=json&where=Covid_Case%3D%27Yes%27&returnGeometry=false&spatialRel=esriSpatialRelIntersects&outFields=*&orderByFields=COUNTY%20asc&resultOffset=0&resultRecordCount=105&cacheHint=true'; - const data = await fetch.json(this.url); + const data = await fetch.json(this, this.url, 'default'); const counties = []; data.features.forEach(item => { @@ -259,7 +259,7 @@ const scraper = { '2020-04-02': async function() { this.type = 'pdf'; this.url = 'https://public.tableau.com/views/COVID-19Data_15851817634470/CountyCounts.pdf?:showVizHome=no'; - const pdfScrape = await fetch.pdf(this.url); + const pdfScrape = await fetch.pdf(this, this.url, 'cases'); const data = pdfScrape .filter(item => item && item.y > 6 && item.y < 46) @@ -297,7 +297,7 @@ const scraper = { }); const pdfUrl = 'https://public.tableau.com/views/COVID-19Data_15851817634470/Mortality.pdf?:showVizHome=no'; - const deathData = await fetch.pdf(pdfUrl); + const deathData = await fetch.pdf(this, pdfUrl, 'deaths'); let totalDeaths = ''; deathData.forEach(item => { if (item && item.text.match(/[0-9]/)) { diff --git a/src/shared/scrapers/US/KY/index.js b/src/shared/scrapers/US/KY/index.js index 7523085fd..12004248e 100644 --- a/src/shared/scrapers/US/KY/index.js +++ b/src/shared/scrapers/US/KY/index.js @@ -26,7 +26,7 @@ const scraper = { } ], async scraper() { - const $ = await fetch.headless(this.url); + const $ = await fetch.headless(this, this.url, 'default'); const counties = []; diff --git a/src/shared/scrapers/US/LA/index.js b/src/shared/scrapers/US/LA/index.js index fe8895f27..c81c3170a 100644 --- a/src/shared/scrapers/US/LA/index.js +++ b/src/shared/scrapers/US/LA/index.js @@ -21,7 +21,7 @@ const scraper = { this.url = 'http://ldh.la.gov/Coronavirus/'; this.type = 'table'; const counties = 
[]; - const $ = await fetch.page(this.url); + const $ = await fetch.page(this, this.url, 'default'); const $table = $('p:contains("Louisiana Cases")').nextAll('table'); const $trs = $table.find('tbody > tr:not(:last-child)'); $trs.each((index, tr) => { @@ -47,7 +47,7 @@ const scraper = { this.url = 'https://opendata.arcgis.com/datasets/cba425c2e5b8421c88827dc0ec8c663b_0.csv'; this.type = 'csv'; const counties = []; - const data = await fetch.csv(this.url); + const data = await fetch.csv(this, this.url, 'default'); const unassigned = { county: UNASSIGNED, cases: 0, @@ -79,7 +79,7 @@ const scraper = { this.url = 'https://opendata.arcgis.com/datasets/79e1165ecb95496589d39faa25a83ad4_0.csv'; this.type = 'csv'; const counties = []; - const data = await fetch.csv(this.url); + const data = await fetch.csv(this, this.url, 'default'); const unassigned = { county: UNASSIGNED, cases: 0, @@ -112,7 +112,7 @@ const scraper = { 'https://services5.arcgis.com/O5K6bb5dZVZcTo5M/arcgis/rest/services/Cases_by_Parish_2/FeatureServer/0/query?f=json&where=PFIPS%20%3C%3E%2099999&returnGeometry=false&spatialRel=esriSpatialRelIntersects&outFields=*&orderByFields=Deaths%20desc%2CCases%20desc%2CParish%20asc&resultOffset=0&resultRecordCount=65&cacheHint=true'; this.type = 'json'; - const data = await fetch.json(this.url); + const data = await fetch.json(this, this.url, 'default'); const unassigned = { county: UNASSIGNED, cases: 0, diff --git a/src/shared/scrapers/US/MA/index.js b/src/shared/scrapers/US/MA/index.js index 67f2c02a6..65903995e 100644 --- a/src/shared/scrapers/US/MA/index.js +++ b/src/shared/scrapers/US/MA/index.js @@ -51,7 +51,7 @@ const scraper = { date ).getUTCDate()}-2020/download`; - const body = await fetch.pdf(this.url); + const body = await fetch.pdf(this, this.url, 'default'); if (body === null) { throw new Error(`No data for ${date}`); @@ -126,7 +126,7 @@ const scraper = { this.url = 'https://services1.arcgis.com/TXaY625xGc0yvAuQ/arcgis/rest/services/COVID_CASES_MA/FeatureServer/0/query?f=json&where=1%3D1&returnGeometry=false&outFields=*'; const counties = []; - const data = await fetch.json(this.url); + const data = await fetch.json(this, this.url, 'default'); let onlySumDeaths = true; let onlySumTested = true; diff --git a/src/shared/scrapers/US/MD/index.js b/src/shared/scrapers/US/MD/index.js index b8e87970d..6456194b7 100644 --- a/src/shared/scrapers/US/MD/index.js +++ b/src/shared/scrapers/US/MD/index.js @@ -22,7 +22,7 @@ const scraper = { this.url = 'https://coronavirus.maryland.gov/'; this.type = 'paragraph'; const counties = []; - const $ = await fetch.headless(this.url); + const $ = await fetch.headless(this, this.url, 'default'); const paragraph = $('p:contains("Number of Confirmed Cases:")') .next('p') .text(); @@ -44,7 +44,7 @@ const scraper = { '2020-03-17': async function() { this.type = 'csv'; this.url = 'https://opendata.arcgis.com/datasets/3d9ca88970dd4689a701354d7fa6830b_0.csv'; - const data = await fetch.csv(this.url); + const data = await fetch.csv(this, this.url, 'default'); const counties = []; for (const county of data) { let countyName; @@ -69,9 +69,9 @@ const scraper = { const serverNumber = ''; const dashboardId = 'c34e541dd8b742d993159dbebb094d8b'; const layerName = 'MD_COVID19_Case_Counts_by_County'; - this.url = await fetch.getArcGISCSVURL(serverNumber, dashboardId, layerName); + this.url = await fetch.getArcGISCSVURL(this, serverNumber, dashboardId, layerName); - const data = await fetch.csv(this.url); + const data = await fetch.csv(this, this.url, 'default'); const 
counties = []; for (const county of data) { let countyName; diff --git a/src/shared/scrapers/US/ME/index.js b/src/shared/scrapers/US/ME/index.js index 83209fb7c..5a339a482 100644 --- a/src/shared/scrapers/US/ME/index.js +++ b/src/shared/scrapers/US/ME/index.js @@ -42,7 +42,7 @@ const scraper = { async scraper() { let counties = []; - const $ = await fetch.page(this.url); + const $ = await fetch.page(this, this.url, 'default'); const $th = $('th:contains("Case Counts by County")'); const $table = $th.closest('table'); diff --git a/src/shared/scrapers/US/MI/index.js b/src/shared/scrapers/US/MI/index.js index e5976c0bf..44c4043ff 100644 --- a/src/shared/scrapers/US/MI/index.js +++ b/src/shared/scrapers/US/MI/index.js @@ -111,7 +111,7 @@ const scraper = { let detroitCases = 0; let detroitDeaths = 0; - const $ = await fetch.page(this.url); + const $ = await fetch.page(this, this.url, 'default'); const $cap = $('caption:contains("Overall Confirmed COVID-19 Cases by County")'); const $table = $cap.closest('table'); diff --git a/src/shared/scrapers/US/MN/index.js b/src/shared/scrapers/US/MN/index.js index 53f23815b..6f4a35d0e 100644 --- a/src/shared/scrapers/US/MN/index.js +++ b/src/shared/scrapers/US/MN/index.js @@ -115,7 +115,7 @@ const scraper = { '0': async function() { this.url = 'https://www.health.state.mn.us/diseases/coronavirus/situation.html'; this.type = 'table'; - const $ = await fetch.page(this.url); + const $ = await fetch.page(this, this.url, 'default'); const $th = $('th:contains("County")'); const $table = $th.closest('table'); @@ -149,7 +149,7 @@ const scraper = { this.url = 'https://services1.arcgis.com/RQG3sksSXcoDoIfj/arcgis/rest/services/MN_COVID19_County_Tracking_Public_View/FeatureServer/0/query?f=json&where=1%3D1&returnGeometry=false&outFields=*'; this.type = 'json'; - const data = await fetch.json(this.url); + const data = await fetch.json(this, this.url, 'default'); data.features.forEach(item => { const cases = item.attributes.COVID19POS || 0; diff --git a/src/shared/scrapers/US/MO/index.js b/src/shared/scrapers/US/MO/index.js index a096cb9fe..857cf1d4b 100644 --- a/src/shared/scrapers/US/MO/index.js +++ b/src/shared/scrapers/US/MO/index.js @@ -171,7 +171,7 @@ const scraper = { scraper: { '0': async function() { let counties = {}; - const $ = await fetch.page(this.url); + const $ = await fetch.page(this, this.url, 'default'); const $table = $('table').first(); const $trs = $table.find('tr'); @@ -207,7 +207,7 @@ const scraper = { }, '2020-02-22': async function() { let counties = {}; - const $ = await fetch.page(this.url); + const $ = await fetch.page(this, this.url, 'default'); const $table = $('table').first(); const $trs = $table.find('tr'); @@ -263,8 +263,8 @@ const scraper = { '2020-03-30': async function() { const counties = {}; - this.url = await fetch.getArcGISCSVURL(6, '6f2a47a25872470a815bcd95f52c2872', 'lpha_boundry'); - const data = await fetch.csv(this.url); + this.url = await fetch.getArcGISCSVURL(this, 6, '6f2a47a25872470a815bcd95f52c2872', 'lpha_boundry'); + const data = await fetch.csv(this, this.url, 'default'); const unassigned = { county: UNASSIGNED, diff --git a/src/shared/scrapers/US/MO/st-louis-county.js b/src/shared/scrapers/US/MO/st-louis-county.js index d5bfccaa6..cbb3a6ec0 100644 --- a/src/shared/scrapers/US/MO/st-louis-county.js +++ b/src/shared/scrapers/US/MO/st-louis-county.js @@ -20,8 +20,8 @@ const scraper = { type: 'table', maintainers: [maintainers.slezakbs], async scraper() { - this.url = await fetch.getArcGISCSVURLFromOrgId(2, 
'w657bnjzrjguNyOy', 'StLouisCounty_Bdy_Geo'); - const rows = await fetch.csv(this.url); + this.url = await fetch.getArcGISCSVURLFromOrgId(this, 2, 'w657bnjzrjguNyOy', 'StLouisCounty_Bdy_Geo'); + const rows = await fetch.csv(this, this.url, 'default'); const data = rows[0]; return { county: geography.addCounty(this.county), diff --git a/src/shared/scrapers/US/MS/index.js b/src/shared/scrapers/US/MS/index.js index c9730c1a5..c358f03b1 100644 --- a/src/shared/scrapers/US/MS/index.js +++ b/src/shared/scrapers/US/MS/index.js @@ -104,7 +104,7 @@ const scraper = { ], scraper: { '0': async function() { - const $ = await fetch.page(this.url); + const $ = await fetch.page(this, this.url, 'default'); const $table = $('h3:contains("Mississippi Cases")') .nextAll('table') .first(); @@ -128,7 +128,7 @@ const scraper = { return counties; }, '2020-03-15': async function() { - const $ = await fetch.page(this.url); + const $ = await fetch.page(this, this.url, 'default'); const $table = $('h4:contains("All Mississippi cases to date")') .nextAll('table') .first(); @@ -150,7 +150,7 @@ const scraper = { return counties; }, '2020-03-20': async function() { - const $ = await fetch.page(this.url); + const $ = await fetch.page(this, this.url, 'default'); // Pick the last one, because older pages had a table of "new cases" // before the table of "total cases" diff --git a/src/shared/scrapers/US/MT/index.js b/src/shared/scrapers/US/MT/index.js index 20d11db54..e318dce3b 100644 --- a/src/shared/scrapers/US/MT/index.js +++ b/src/shared/scrapers/US/MT/index.js @@ -73,7 +73,7 @@ const scraper = { ], async scraper() { - const data = await fetch.json(this.url); + const data = await fetch.json(this, this.url, 'default'); let counties = []; for (const record of data.features) { diff --git a/src/shared/scrapers/US/NC/index.js b/src/shared/scrapers/US/NC/index.js index fb25c1ed4..76f6733af 100644 --- a/src/shared/scrapers/US/NC/index.js +++ b/src/shared/scrapers/US/NC/index.js @@ -12,7 +12,7 @@ const scraper = { state: 'iso2:US-NC', aggregate: 'county', async scraper() { - const data = await fetch.csv(this.url); + const data = await fetch.csv(this, this.url, 'default'); const counties = []; for (const county of data) { counties.push({ diff --git a/src/shared/scrapers/US/ND/index.js b/src/shared/scrapers/US/ND/index.js index 3074de74e..745c59a58 100644 --- a/src/shared/scrapers/US/ND/index.js +++ b/src/shared/scrapers/US/ND/index.js @@ -90,7 +90,7 @@ const scraper = { scraper: { '0': async function() { let counties = []; - const $ = await fetch.page(this.url); + const $ = await fetch.page(this, this.url, 'default'); cheerioTableparser($); let $table = $('td:contains("Positive")').closest('table'); if ($table.length === 0) { @@ -134,7 +134,7 @@ const scraper = { this.url = 'https://static.dwcdn.net/data/yuhr0.csv'; this.type = 'csv'; - const data = await fetch.csv(this.url); + const data = await fetch.csv(this, this.url, 'default'); const counties = data.map(location => { return { diff --git a/src/shared/scrapers/US/NE/index.js b/src/shared/scrapers/US/NE/index.js index 4e0d805af..393da32f5 100644 --- a/src/shared/scrapers/US/NE/index.js +++ b/src/shared/scrapers/US/NE/index.js @@ -115,7 +115,7 @@ const scraper = { ], async scraper() { let counties = []; - const $ = await fetch.page(this.url); + const $ = await fetch.page(this, this.url, 'default'); const listItems = $('ul:contains("Lab-confirmed cases in Nebraska")').find('li'); diff --git a/src/shared/scrapers/US/NH/index.js b/src/shared/scrapers/US/NH/index.js index 
043897fdb..280f351d5 100644 --- a/src/shared/scrapers/US/NH/index.js +++ b/src/shared/scrapers/US/NH/index.js @@ -37,7 +37,7 @@ const scraper = { ], scraper: { '0': async function() { - const body = await fetch.pdf(this.url); + const body = await fetch.pdf(this, this.url, 'default'); const rows = pdfUtils.asWords(body, 0, 1000).map(row => row[0]); const counties = []; @@ -78,14 +78,14 @@ const scraper = { return counties; }, '2020-3-31': async function() { - await fetch.pdf(this.url); + await fetch.pdf(this, this.url, 'default'); throw new DeprecatedError('New Hampshire stopped reporting county-level data as of 2020/3/31'); }, '2020-4-12': async function() { this.url = 'https://www.nh.gov/covid19/'; this.type = 'table'; - const $ = await fetch.page(this.url); + const $ = await fetch.page(this, this.url, 'default'); cheerioTableparser($); const $countyTable = $('.county-table'); diff --git a/src/shared/scrapers/US/NJ/index.js b/src/shared/scrapers/US/NJ/index.js index 433c597d3..37a684c6c 100644 --- a/src/shared/scrapers/US/NJ/index.js +++ b/src/shared/scrapers/US/NJ/index.js @@ -17,7 +17,7 @@ const scraper = { scraper: { '0': async function() { this.url = 'https://opendata.arcgis.com/datasets/8840fd8ac1314f5188e6cf98b525321c_0.csv'; - const data = await fetch.csv(this.url); + const data = await fetch.csv(this, this.url, 'default'); const counties = []; for (const county of data) { counties.push({ @@ -31,7 +31,7 @@ const scraper = { }, '2020-03-19': async function() { this.url = 'https://opendata.arcgis.com/datasets/84737ef7f760486293b6afa536f028e0_0.csv'; - const data = await fetch.csv(this.url); + const data = await fetch.csv(this, this.url, 'default'); const counties = []; for (const county of data) { counties.push({ @@ -43,8 +43,8 @@ const scraper = { return counties; }, '2020-03-25': async function() { - this.url = await fetch.getArcGISCSVURL(7, 'ec4bffd48f7e495182226eee7962b422', 'DailyCaseCounts'); - const data = await fetch.csv(this.url); + this.url = await fetch.getArcGISCSVURL(this, 7, 'ec4bffd48f7e495182226eee7962b422', 'DailyCaseCounts'); + const data = await fetch.csv(this, this.url, 'default'); const counties = []; for (const county of data) { counties.push({ @@ -56,8 +56,8 @@ const scraper = { return counties; }, '2020-03-31': async function() { - this.url = await fetch.getArcGISCSVURL(7, 'ec4bffd48f7e495182226eee7962b422', 'DailyCaseCounts'); - const data = await fetch.csv(this.url); + this.url = await fetch.getArcGISCSVURL(this, 7, 'ec4bffd48f7e495182226eee7962b422', 'DailyCaseCounts'); + const data = await fetch.csv(this, this.url, 'default'); const counties = []; for (const county of data) { counties.push({ @@ -70,8 +70,8 @@ const scraper = { return counties; }, '2020-04-01': async function() { - this.url = await fetch.getArcGISCSVURL(7, 'ec4bffd48f7e495182226eee7962b422', 'DailyCaseCounts'); - const data = await fetch.csv(this.url); + this.url = await fetch.getArcGISCSVURL(this, 7, 'ec4bffd48f7e495182226eee7962b422', 'DailyCaseCounts'); + const data = await fetch.csv(this, this.url, 'default'); const counties = []; for (const county of data) { counties.push({ diff --git a/src/shared/scrapers/US/NM/index.js b/src/shared/scrapers/US/NM/index.js index c39ea67b9..5677d39ac 100644 --- a/src/shared/scrapers/US/NM/index.js +++ b/src/shared/scrapers/US/NM/index.js @@ -22,7 +22,7 @@ const scraper = { async scraper() { const counties = []; - const $ = await fetch.page(this.url); + const $ = await fetch.page(this, this.url, 'default'); const $table = 
$('td:contains("County")').closest('table'); const $trs = $table.find('tbody > tr'); $trs.each((index, tr) => { diff --git a/src/shared/scrapers/US/NV/carson-city.js b/src/shared/scrapers/US/NV/carson-city.js index 88ff61416..72326d76b 100644 --- a/src/shared/scrapers/US/NV/carson-city.js +++ b/src/shared/scrapers/US/NV/carson-city.js @@ -20,7 +20,7 @@ const scraper = { type: 'table', async scraper() { const counties = []; - const $ = await fetch.page(this.url); + const $ = await fetch.page(this, this.url, 'default'); const $table = $('table'); const $trs = $table.find('tbody > tr:not(:first-child)'); diff --git a/src/shared/scrapers/US/NV/clark-county.js b/src/shared/scrapers/US/NV/clark-county.js index 6a21ebeb6..37930db1b 100644 --- a/src/shared/scrapers/US/NV/clark-county.js +++ b/src/shared/scrapers/US/NV/clark-county.js @@ -19,7 +19,7 @@ const scraper = { type: 'table', scraper: { '0': async function() { - const $ = await fetch.page(this.url); + const $ = await fetch.page(this, this.url, 'default'); const $h1 = $('h1:contains("Total Cases:")'); const regexCases = /Total Cases: (\d+)/; const cases = parse.number(regexCases.exec($h1[0].children[0].data)[1]); @@ -31,7 +31,7 @@ const scraper = { }; }, '2020-03-25': async function() { - const $ = await fetch.page(this.url); + const $ = await fetch.page(this, this.url, 'default'); const casesText = $('*:contains("Total Cases:")').text(); const regexCases = /Total Cases: (\d+)/; diff --git a/src/shared/scrapers/US/NV/washoe-county.js b/src/shared/scrapers/US/NV/washoe-county.js index fa0f3eabf..8dbd9a901 100644 --- a/src/shared/scrapers/US/NV/washoe-county.js +++ b/src/shared/scrapers/US/NV/washoe-county.js @@ -20,7 +20,7 @@ const scraper = { type: 'table', scraper: { '0': async function() { - const $ = await fetch.page(this.url); + const $ = await fetch.page(this, this.url, 'default'); const $span = $('span:contains("COVID-19 Case Count in Washoe County")'); const regexCases = /COVID-19 Case Count in Washoe County: (\d+)/; const regexRecovered = /COVID-19 Cases Who Fully Recovered: (\d+)/; @@ -39,10 +39,10 @@ const scraper = { }; }, '2020-03-27': async function() { - this.url = await fetch.getArcGISCSVURL('', 'a54a945cac82424fa4928139ee83f911', 'Cases_current'); + this.url = await fetch.getArcGISCSVURL(this, '', 'a54a945cac82424fa4928139ee83f911', 'Cases_current'); this.type = 'csv'; - const data = await fetch.csv(this.url); + const data = await fetch.csv(this, this.url, 'default'); for (const row of data) { return { cases: parse.number(row.confirmed), @@ -58,7 +58,7 @@ const scraper = { 'https://services.arcgis.com/iCGWaR7ZHc5saRIl/arcgis/rest/services/Cases_wdemographic_current/FeatureServer/0/query?f=json&where=1%3D1&returnGeometry=false&spatialRel=esriSpatialRelIntersects&outFields=*&resultOffset=0&resultRecordCount=50&cacheHint=true'; this.type = 'json'; - const response = await fetch.json(this.url); + const response = await fetch.json(this, this.url, 'default'); const data = response.features[0].attributes; return { cases: parse.number(data.confirmed), diff --git a/src/shared/scrapers/US/NY/index.js b/src/shared/scrapers/US/NY/index.js index 917d8d3ae..2c8a14f5a 100644 --- a/src/shared/scrapers/US/NY/index.js +++ b/src/shared/scrapers/US/NY/index.js @@ -84,7 +84,7 @@ const scraper = { async scraper() { this.url = 'https://health.data.ny.gov/api/views/xdss-u53e/rows.csv?accessType=DOWNLOAD'; this.type = 'csv'; - const data = await fetch.csv(this.url, false); + const data = await fetch.csv(this, this.url, 'default', false); const 
dateField = 'Test Date'; @@ -151,7 +151,7 @@ export default scraper; ? 'https://www.health.ny.gov/diseases/communicable/coronavirus/' : 'https://coronavirus.health.ny.gov/county-county-breakdown-positive-cases'; let counties = []; - const $ = await fetch.page(this.url); + const $ = await fetch.page(this, this.url, 'default'); let $table; if (datetime.scrapeDateIsBefore('2020-03-17')) { $table = $('#case_count_table'); @@ -191,7 +191,7 @@ export default scraper; counties.push(transform.sumData(counties)); try { - const pdfScrape = await fetch.pdf(this._boroughURL); + const pdfScrape = await fetch.pdf(this, this._boroughURL, 'default'); Object.keys(this._boroughs).forEach(name => { const valIndex = pdfScrape.findIndex(ele => ele.text === name); diff --git a/src/shared/scrapers/US/OH/index.js b/src/shared/scrapers/US/OH/index.js index 1f1c2264e..92d472cf0 100644 --- a/src/shared/scrapers/US/OH/index.js +++ b/src/shared/scrapers/US/OH/index.js @@ -112,7 +112,7 @@ const scraper = { let arrayOfCounties = []; this.url = 'https://odh.ohio.gov/wps/portal/gov/odh/know-our-programs/Novel-Coronavirus/welcome/'; this.type = 'paragraph'; - const $ = await fetch.page(this.url); + const $ = await fetch.page(this, this.url, 'default'); const $paragraph = $('p:contains("Number of counties with cases:")').text(); const regExp = /\(([^)]+)\)/; const parsed = regExp.exec($paragraph); @@ -138,7 +138,7 @@ const scraper = { this.url = 'https://coronavirus.ohio.gov/wps/portal/gov/covid-19/'; this.type = 'paragraph'; - const $ = await fetch.page(this.url); + const $ = await fetch.page(this, this.url, 'default'); const $paragraph = $('p:contains("Number of counties with cases:")').text(); const parsed = $paragraph.replace(/([()])/g, '').replace('* Number of counties with cases: ', ''); arrayOfCounties = parsed.split(','); @@ -162,7 +162,7 @@ const scraper = { this.url = 'https://coronavirus.ohio.gov/wps/portal/gov/covid-19/'; this.type = 'paragraph'; - const $ = await fetch.page(this.url); + const $ = await fetch.page(this, this.url, 'default'); const $paragraph = $('p:contains("Number of counties with cases:")').text(); const parsed = $paragraph .replace(/([()])/g, '') @@ -186,13 +186,13 @@ const scraper = { '2020-03-26': async function() { this.url = 'https://public.tableau.com/views/OverviewDashboard_15852499073250/DashboardOverview_1?:embed=y&:showVizHome=no&:host_url=https%3A%2F%2Fpublic.tableau.com%2F&:embed_code_version=3&:tabs=no&:toolbar=no&:showAppBanner=false&iframeSizedToWindow=true&:loadOrderID=0'; - await fetch.page(this.url); + await fetch.page(this, this.url, 'default'); throw new Error('Ohio has an impossible to access tablaeu dashboard'); }, '2020-03-28': async function() { this.url = 'https://coronavirus.ohio.gov/static/COVIDSummaryData.csv'; this.type = 'csv'; - const rows = await fetch.csv(this.url); + const rows = await fetch.csv(this, this.url, 'default'); // The CSV is coming in with the BOM bytes mangled onto the front. 
// So the header of the first column is 'County' diff --git a/src/shared/scrapers/US/OK/index.js b/src/shared/scrapers/US/OK/index.js index 6427f2650..872a39cc2 100644 --- a/src/shared/scrapers/US/OK/index.js +++ b/src/shared/scrapers/US/OK/index.js @@ -117,7 +117,7 @@ const scraper = { ], async scraper() { let counties = []; - const $ = await fetch.page(this.url); + const $ = await fetch.page(this, this.url, 'default'); const $table = $("table[summary='COVID-19 Cases by County']").first(); const $trs = $table.find('tbody').find('tr'); diff --git a/src/shared/scrapers/US/OR/index.js b/src/shared/scrapers/US/OR/index.js index ec6d97d92..8ca8aa754 100644 --- a/src/shared/scrapers/US/OR/index.js +++ b/src/shared/scrapers/US/OR/index.js @@ -60,7 +60,7 @@ const scraper = { scraper: { '0': async function() { let counties = []; - const $ = await fetch.headless(this.url); + const $ = await fetch.headless(this, this.url, 'default'); const $table = $('table[summary="Cases by County in Oregon for COVID-19"]'); const $trs = $table.find('tbody > tr:not(:first-child):not(:last-child)'); $trs.each((index, tr) => { @@ -81,7 +81,7 @@ const scraper = { }, '2020-03-18': async function() { let counties = []; - const $ = await fetch.headless(this.url); + const $ = await fetch.headless(this, this.url, 'default'); const $table = $('th:contains("County")').closest('table'); const $trs = $table.find('tbody > tr:not(:last-child)'); $trs.each((index, tr) => { diff --git a/src/shared/scrapers/US/PA/index.js b/src/shared/scrapers/US/PA/index.js index 23a70d936..ef6f64d06 100644 --- a/src/shared/scrapers/US/PA/index.js +++ b/src/shared/scrapers/US/PA/index.js @@ -90,7 +90,7 @@ const scraper = { '0': async function scraper() { this.url = 'https://www.health.pa.gov/topics/disease/Pages/Coronavirus.aspx'; this.type = 'list'; - const $ = await fetch.page(this.url); + const $ = await fetch.page(this, this.url, 'default'); let counties = []; const $lis = $('li:contains("Counties impacted to date include")') .nextAll('ul') @@ -116,7 +116,7 @@ const scraper = { '2020-03-16': async function scraper() { this.url = 'https://www.health.pa.gov/topics/disease/Pages/Coronavirus.aspx'; this.type = 'table'; - const $ = await fetch.page(this.url); + const $ = await fetch.page(this, this.url, 'default'); const $table = $('table.ms-rteTable-default').first(); const $trs = $table.find('tbody > tr'); let counties = []; @@ -138,7 +138,7 @@ const scraper = { '2020-03-17': async function scraper() { this.url = 'https://www.health.pa.gov/topics/disease/Pages/Coronavirus.aspx'; this.type = 'table'; - const $ = await fetch.page(this.url); + const $ = await fetch.page(this, this.url, 'default'); const $table = $('table.ms-rteTable-default').eq(1); const $trs = $table.find('tbody > tr'); let counties = []; @@ -160,7 +160,7 @@ const scraper = { '2020-03-18': async function scraper() { this.url = 'https://www.health.pa.gov/topics/disease/coronavirus/Pages/Cases.aspx'; this.type = 'table'; - const $ = await fetch.page(this.url); + const $ = await fetch.page(this, this.url, 'default'); const $countyTable = $('th:contains("County")').closest('table'); const $trs = $countyTable.find('tbody > tr:not(:first-child)'); let counties = []; @@ -187,7 +187,7 @@ const scraper = { '2020-03-26': async function scraper() { this.url = 'https://www.health.pa.gov/topics/disease/coronavirus/Pages/Cases.aspx'; this.type = 'table'; - const $ = await fetch.page(this.url); + const $ = await fetch.page(this, this.url, 'default'); const $countyTable = 
$('td:contains("County")').closest('table'); const rules = { diff --git a/src/shared/scrapers/US/RI/index.js b/src/shared/scrapers/US/RI/index.js index 8fa1f9f43..2ec04adc3 100644 --- a/src/shared/scrapers/US/RI/index.js +++ b/src/shared/scrapers/US/RI/index.js @@ -78,7 +78,7 @@ const scraper = { scraper: { '0': async function() { - const data = await fetch.csv(this.url); + const data = await fetch.csv(this, this.url, 'default'); const counties = []; for (const row of data) { const caseHdr = 'Number of COVID-19 positive (including presumptive positive) cases'; @@ -106,7 +106,7 @@ const scraper = { this.headless = true; this.url = 'https://health.ri.gov/data/covid-19/'; - const $ = await fetch.headless(this.url); + const $ = await fetch.headless(this, this.url, 'default'); cheerioTableparser($); // Need to pull this out explicitly because their html table includes diff --git a/src/shared/scrapers/US/SC/index.js b/src/shared/scrapers/US/SC/index.js index 02f865232..096cff245 100644 --- a/src/shared/scrapers/US/SC/index.js +++ b/src/shared/scrapers/US/SC/index.js @@ -71,7 +71,7 @@ const scraper = { scraper: { '0': async function() { - const data = await fetch.json(this.url); + const data = await fetch.json(this, this.url, 'default'); let counties = []; for (const record of data.features) { @@ -99,8 +99,8 @@ const scraper = { const serverNumber = 2; const dashboardId = '3732035614af4246877e20c3a496e397'; const layerName = 'Covid19_Cases_Centroid_SharingView'; - this.url = await fetch.getArcGISCSVURL(serverNumber, dashboardId, layerName); - const data = await fetch.csv(this.url); + this.url = await fetch.getArcGISCSVURL(this, serverNumber, dashboardId, layerName); + const data = await fetch.csv(this, this.url, 'default'); let counties = []; for (const county of data) { counties.push({ @@ -120,8 +120,8 @@ const scraper = { const serverNumber = 2; const dashboardId = '3732035614af4246877e20c3a496e397'; const layerName = 'COVID19_County_Polygon_SharingView2'; // they started updating this view - this.url = await fetch.getArcGISCSVURL(serverNumber, dashboardId, layerName); - const data = await fetch.csv(this.url); + this.url = await fetch.getArcGISCSVURL(this, serverNumber, dashboardId, layerName); + const data = await fetch.csv(this, this.url, 'default'); let counties = []; for (const county of data) { if (datetime.scrapeDateIsBefore(county.Date_)) { diff --git a/src/shared/scrapers/US/SD/index.js b/src/shared/scrapers/US/SD/index.js index 43bc26700..981bca6bc 100644 --- a/src/shared/scrapers/US/SD/index.js +++ b/src/shared/scrapers/US/SD/index.js @@ -89,7 +89,7 @@ const scraper = { scraper: { '0': async function() { let counties = []; - const $ = await fetch.page(this.url); + const $ = await fetch.page(this, this.url, 'default'); const $th = $('h2:contains("South Dakota Counties with COVID-19 Cases")'); const $table = $th.next('table'); const $trs = $table.find('tbody > tr'); @@ -106,7 +106,7 @@ const scraper = { }, '2020-03-19': async function() { let counties = []; - const $ = await fetch.page(this.url); + const $ = await fetch.page(this, this.url, 'default'); const $table = $('caption:contains("SOUTH DAKOTA COUNTIES WITH COVID-19 CASES")').closest('table'); const $trs = $table.find('tbody > tr'); $trs.each((index, tr) => { @@ -125,7 +125,7 @@ const scraper = { }, '2020-03-23': async function() { let counties = []; - const $ = await fetch.page(this.url); + const $ = await fetch.page(this, this.url, 'default'); const $table = $('caption:contains("SD COUNTY OF RESIDENCE")').closest('table'); 
const $trs = $table.find('tbody > tr'); $trs.each((index, tr) => { diff --git a/src/shared/scrapers/US/TN/index.js b/src/shared/scrapers/US/TN/index.js index 0c8af45a9..278a9fa83 100755 --- a/src/shared/scrapers/US/TN/index.js +++ b/src/shared/scrapers/US/TN/index.js @@ -144,7 +144,7 @@ const scraper = { scraper: { '0': async function() { let counties = []; - const $ = await fetch.page(this.url); + const $ = await fetch.page(this, this.url, 'default'); const $table = $('th:contains("Case Count")').closest('table'); const $trs = $table.find('tbody > tr:not(:last-child)'); @@ -193,7 +193,7 @@ const scraper = { }, '2020-03-21': async function() { let counties = []; - const $ = await fetch.page(this.url); + const $ = await fetch.page(this, this.url, 'default'); const $table = $('th:contains("Count")').closest('table'); const $trs = $table.find('tbody > tr:not(:last-child)'); // skip grand total @@ -235,7 +235,7 @@ const scraper = { }, '2020-3-31': async function() { let counties = []; - const $ = await fetch.page(this.url); + const $ = await fetch.page(this, this.url, 'default'); cheerioTableparser($); const $table = $('td:contains("Blount")').closest('table'); const data = $table.parsetable(false, false, true); @@ -289,7 +289,7 @@ const scraper = { 'https://services1.arcgis.com/YuVBSS7Y1of2Qud1/arcgis/rest/services/TN_Covid_Counties/FeatureServer/0/query?f=json&where=1%3D1&returnGeometry=false&spatialRel=esriSpatialRelIntersects&outFields=*&orderByFields=NAME%20asc&resultOffset=0&resultRecordCount=96&cacheHint=true'; this.type = 'json'; - const data = await fetch.json(this.url); + const data = await fetch.json(this, this.url, 'default'); const counties = []; data.features.forEach(item => { @@ -310,7 +310,7 @@ const scraper = { const totalsUrl = 'https://services1.arcgis.com/YuVBSS7Y1of2Qud1/ArcGIS/rest/services/TN_Covid_Total/FeatureServer/0/query?where=1%3D1&outFields=*&returnGeometry=false&f=pjson'; - const tmp = await fetch.json(totalsUrl); + const tmp = await fetch.json(this, totalsUrl, 'totals'); const totalsData = tmp.features.pop().attributes; const totals = transform.sumData(counties); diff --git a/src/shared/scrapers/US/TX/index.js b/src/shared/scrapers/US/TX/index.js index 3f9df44ac..5b12a8ce9 100644 --- a/src/shared/scrapers/US/TX/index.js +++ b/src/shared/scrapers/US/TX/index.js @@ -283,7 +283,7 @@ const scraper = { let counties = []; this.url = 'https://www.dshs.state.tx.us/news/updates.shtm'; this.type = 'table'; - const $ = await fetch.page(this.url); + const $ = await fetch.page(this, this.url, 'default'); let $table; if (datetime.scrapeDateIsBefore('2020-03-16')) { $table = $('table[summary="Texas COVID-19 Cases"]'); @@ -313,7 +313,7 @@ const scraper = { let counties = []; this.url = 'https://www.dshs.state.tx.us/news/updates.shtm'; this.type = 'table'; - const $ = await fetch.page(this.url); + const $ = await fetch.page(this, this.url, 'default'); const $table = $('table[summary="COVID-19 Cases in Texas Counties"]'); const $trs = $table.find('tbody > tr:not(:last-child)'); $trs.each((index, tr) => { @@ -345,7 +345,7 @@ const scraper = { let counties = []; this.url = 'https://opendata.arcgis.com/datasets/bc83058386d2434ca8cf90b26dc6b580_0.csv'; this.type = 'csv'; - const data = await fetch.csv(this.url); + const data = await fetch.csv(this, this.url, 'default'); for (const row of data) { let county = geography.addCounty(parse.string(row.County)); county = this._countyMap[county] || county; diff --git a/src/shared/scrapers/US/UT/index.js b/src/shared/scrapers/US/UT/index.js 
index 152ca759f..e0bbff7ee 100644 --- a/src/shared/scrapers/US/UT/index.js +++ b/src/shared/scrapers/US/UT/index.js @@ -103,7 +103,7 @@ const scraper = { '0': async function() { this.url = 'https://coronavirus.utah.gov/latest/'; this.type = 'table'; - const $ = await fetch.page(this.url); + const $ = await fetch.page(this, this.url, 'default'); let counties = []; const $table = $('th:contains("District")').closest('table'); const $trs = $table.find('tbody > tr'); @@ -125,7 +125,7 @@ const scraper = { '2020-03-19': async function() { this.url = 'https://coronavirus-dashboard.utah.gov/'; this.type = 'table'; - const $ = await fetch.page(this.url); + const $ = await fetch.page(this, this.url, 'default'); let counties = []; const script = $('script[type="application/json"]').html(); @@ -153,7 +153,7 @@ const scraper = { '2020-04-08': async function() { this.url = 'https://coronavirus-dashboard.utah.gov/'; this.type = 'table'; - const $ = await fetch.page(this.url); + const $ = await fetch.page(this, this.url, 'default'); let counties = []; const script = $('script[type="application/json"]').html(); diff --git a/src/shared/scrapers/US/VA/index.js b/src/shared/scrapers/US/VA/index.js index 94a4028b3..412c0b383 100644 --- a/src/shared/scrapers/US/VA/index.js +++ b/src/shared/scrapers/US/VA/index.js @@ -227,7 +227,7 @@ const scraper = { endURL = endURL.slice(0, name.lastIndexOf(' ')); } const pdfUrl = pdfBaseURL + endURL; - const pdfScrape = await fetch.pdf(pdfUrl); + const pdfScrape = await fetch.pdf(this, pdfUrl, endURL); if (pdfScrape) { let pdfText = ''; @@ -251,7 +251,7 @@ const scraper = { } } } else { - const data = await fetch.csv(this.url); + const data = await fetch.csv(this, this.url, 'default'); this.type = 'csv'; data.forEach(location => { diff --git a/src/shared/scrapers/US/VT/index.js b/src/shared/scrapers/US/VT/index.js index 4cb516e70..8e46f4f45 100644 --- a/src/shared/scrapers/US/VT/index.js +++ b/src/shared/scrapers/US/VT/index.js @@ -38,7 +38,7 @@ const scraper = { ], scraper: { '0': async function() { - const data = await fetch.json(this.url); + const data = await fetch.json(this, this.url, 'default'); const counties = []; data.features.forEach(item => { @@ -59,7 +59,7 @@ const scraper = { const totalsurl = 'https://services1.arcgis.com/BkFxaEFNwHqX3tAw/arcgis/rest/services/county_summary/FeatureServer/0/query?where=1%3D1&outFields=*&f=pjson'; - const totalsData = await fetch.json(totalsurl); + const totalsData = await fetch.json(this, totalsurl, 'totals'); const totals = transform.sumData(counties); totals.tested = totalsData.features.pop().attributes.total_tests; diff --git a/src/shared/scrapers/US/WA/index.js b/src/shared/scrapers/US/WA/index.js index b15486d1b..6750bee79 100644 --- a/src/shared/scrapers/US/WA/index.js +++ b/src/shared/scrapers/US/WA/index.js @@ -66,7 +66,7 @@ const scraper = { this.url = 'https://www.doh.wa.gov/Emergencies/Coronavirus'; this.type = 'table'; this.headless = true; - const $ = await fetch.headless(this.url); + const $ = await fetch.headless(this, this.url, 'default'); const $th = $('th:contains("(COVID-19) in Washington")'); const $table = $th.closest('table'); const $trs = $table.find('tbody > tr'); @@ -96,7 +96,7 @@ const scraper = { this.url = 'https://www.doh.wa.gov/Emergencies/Coronavirus'; this.type = 'table'; this.headless = true; - const $ = await fetch.headless(this.url); + const $ = await fetch.headless(this, this.url, 'default'); const $table = $('caption:contains("Number of Individuals Tested")') .first() .closest('table'); @@ 
-128,7 +128,7 @@ const scraper = { this.url = 'https://www.doh.wa.gov/Emergencies/Coronavirus'; this.type = 'table'; this.headless = true; - const $ = await fetch.headless(this.url); + const $ = await fetch.headless(this, this.url, 'default'); const $table = $('caption:contains("Confirmed Cases")') .first() .closest('table'); @@ -162,7 +162,7 @@ const scraper = { 'https://services8.arcgis.com/rGGrs6HCnw87OFOT/arcgis/rest/services/CountyCases/FeatureServer/0/query?f=json&where=(CV_State_Cases%3E0)&returnGeometry=false&outFields=*&orderByFields=CNTY_NAME%20asc'; this.type = 'json'; this.headless = false; - const data = await fetch.json(this.url); + const data = await fetch.json(this, this.url, 'default'); data.features.forEach(item => { const cases = item.attributes.CV_PositiveCases; diff --git a/src/shared/scrapers/US/WI/index.js b/src/shared/scrapers/US/WI/index.js index 11113e4dd..d6b89a3b3 100644 --- a/src/shared/scrapers/US/WI/index.js +++ b/src/shared/scrapers/US/WI/index.js @@ -96,7 +96,7 @@ const scraper = { this.url = 'https://www.dhs.wisconsin.gov/outbreaks/index.htm'; this.type = 'table'; let regions = []; - const $ = await fetch.page(this.url); + const $ = await fetch.page(this, this.url, 'default'); const $table = $('caption:contains("Number of Positive Results by County")').closest('table'); const $trs = $table.find('tbody > tr:not(:last-child)'); $trs.each((index, tr) => { @@ -114,7 +114,7 @@ const scraper = { this.url = 'https://www.dhs.wisconsin.gov/outbreaks/index.htm'; this.type = 'table'; let regions = []; - const $ = await fetch.page(this.url); + const $ = await fetch.page(this, this.url, 'default'); const $table = $('h5:contains("Number of Positive Results by County")') .nextAll('table') .first(); @@ -153,7 +153,7 @@ const scraper = { 'https://services1.arcgis.com/ISZ89Z51ft1G16OK/arcgis/rest/services/COVID19_WI/FeatureServer/0//query?where=1%3D1&objectIds=&time=&geometry=&geometryType=esriGeometryEnvelope&inSR=&spatialRel=esriSpatialRelIntersects&resultType=none&distance=0.0&units=esriSRUnit_Meter&returnGeodetic=false&outFields=NAME%2CPOSITIVE%2CDATE%2CCMNTY_SPRD&returnGeometry=true&featureEncoding=esriDefault&multipatchOption=xyFootprint&maxAllowableOffset=&geometryPrecision=&outSR=&datumTransformation=&applyVCSProjection=false&returnIdsOnly=false&returnUniqueIdsOnly=false&returnCountOnly=false&returnExtentOnly=false&returnQueryGeometry=false&returnDistinctValues=false&cacheHint=false&orderByFields=NAME+ASC&groupByFieldsForStatistics=&outStatistics=&having=&resultOffset=&resultRecordCount=&returnZ=false&returnM=false&returnExceededLimitFeatures=true&quantizationParameters=&sqlFormat=none&f=pjson&token='; this.type = 'json'; let regions = []; - const data = await fetch.json(this.url); + const data = await fetch.json(this, this.url, 'cases'); for (const field of data.features) { regions.push({ county: geography.addCounty(field.attributes.NAME), @@ -162,7 +162,7 @@ const scraper = { } const stateURL = 
'https://services1.arcgis.com/ISZ89Z51ft1G16OK/arcgis/rest/services/COVID19_WI/FeatureServer/2/query?where=1%3D1&objectIds=&time=&geometry=&geometryType=esriGeometryEnvelope&inSR=&spatialRel=esriSpatialRelIntersects&resultType=none&distance=0.0&units=esriSRUnit_Meter&returnGeodetic=false&outFields=NEGATIVE%2CPOSITIVE%2CDATE&returnHiddenFields=false&returnGeometry=true&featureEncoding=esriDefault&multipatchOption=xyFootprint&maxAllowableOffset=&geometryPrecision=&outSR=&datumTransformation=&applyVCSProjection=false&returnIdsOnly=false&returnUniqueIdsOnly=false&returnCountOnly=false&returnExtentOnly=false&returnQueryGeometry=false&returnDistinctValues=false&cacheHint=false&orderByFields=&groupByFieldsForStatistics=&outStatistics=&having=&resultOffset=&resultRecordCount=&returnZ=false&returnM=false&returnExceededLimitFeatures=true&quantizationParameters=&sqlFormat=none&f=pjson&token='; - const stateData = await fetch.json(stateURL); + const stateData = await fetch.json(this, stateURL, 'tested'); regions.push({ tested: stateData.features[0].attributes.NEGATIVE + stateData.features[0].attributes.POSITIVE, cases: stateData.features[0].attributes.POSITIVE @@ -179,7 +179,7 @@ const scraper = { this.type = 'table'; let regions = []; - const $ = await fetch.headless(this.url); + const $ = await fetch.headless(this, this.url, 'default'); const $table = $('#covid-county-table').find('table'); const $trs = $table.find('tbody > tr:not(:last-child)'); $trs.each((index, tr) => { @@ -205,7 +205,7 @@ const scraper = { this.type = 'table'; let regions = []; - const $ = await fetch.headless(this.url); + const $ = await fetch.headless(this, this.url, 'default'); const $table = $('#covid-county-table').find('table'); const $trs = $table.find('tbody > tr:not(:last-child)'); $trs.each((index, tr) => { diff --git a/src/shared/scrapers/US/WV/index.js b/src/shared/scrapers/US/WV/index.js index c5d24b6d7..8f9f7a6de 100644 --- a/src/shared/scrapers/US/WV/index.js +++ b/src/shared/scrapers/US/WV/index.js @@ -79,7 +79,7 @@ const scraper = { scraper: { '0': async function() { - const $ = await fetch.page(this.url); + const $ = await fetch.page(this, this.url, 'default'); const $p = $('p:contains("Counties with positive cases")'); @@ -122,7 +122,7 @@ const scraper = { const options = { headers: { 'X-PowerBI-ResourceKey': '187b4de8-78ef-40be-9510-7fa9a1ef89f2' } }; - const data = await fetch.json(this.url, undefined, options); + const data = await fetch.json(this, this.url, 'default', undefined, options); const { sections } = data.exploration; let counties = []; diff --git a/src/shared/scrapers/US/WY/index.js b/src/shared/scrapers/US/WY/index.js index 3db4a6109..7890171b9 100644 --- a/src/shared/scrapers/US/WY/index.js +++ b/src/shared/scrapers/US/WY/index.js @@ -24,7 +24,7 @@ const scraper = { scraper: { '0': async function() { const counties = []; - const $ = await fetch.page(this.url); + const $ = await fetch.page(this, this.url, 'default'); const $p = $('strong:contains("Cases by County")').parent(); const items = $p.html().split('
<br>'); @@ -56,7 +56,7 @@ const scraper = { }, '2020-04-8': async function() { const counties = []; - const $ = await fetch.page(this.url); + const $ = await fetch.page(this, this.url, 'default'); const $p = $('strong:contains("Albany")').parent(); const items = $p.html().split('<br>
'); diff --git a/src/shared/scrapers/US/covidtracking.js b/src/shared/scrapers/US/covidtracking.js index 8b743003c..c5123c981 100644 --- a/src/shared/scrapers/US/covidtracking.js +++ b/src/shared/scrapers/US/covidtracking.js @@ -21,7 +21,7 @@ const scraper = { aggregate: 'state', priority: -0.5, async scraper() { - const data = await fetch.json(this.url); + const data = await fetch.json(this, this.url, 'default'); const regions = []; diff --git a/src/shared/scrapers/US/nyt-counties.js b/src/shared/scrapers/US/nyt-counties.js index 04243c00f..9fa862cd9 100644 --- a/src/shared/scrapers/US/nyt-counties.js +++ b/src/shared/scrapers/US/nyt-counties.js @@ -22,7 +22,7 @@ const scraper = { ], async scraper() { this.url = 'https://raw.githubusercontent.com/nytimes/covid-19-data/master/us-counties.csv'; - const data = await fetch.csv(this.url, false); + const data = await fetch.csv(this, this.url, 'default', false); // FIXME when we roll out new TZ support! let scrapeDate = process.env.SCRAPE_DATE diff --git a/src/shared/scrapers/VI/index.js b/src/shared/scrapers/VI/index.js index bd0b8482f..ca6b9b7e7 100644 --- a/src/shared/scrapers/VI/index.js +++ b/src/shared/scrapers/VI/index.js @@ -32,7 +32,7 @@ const scraper = { maintainers.camjc ], async scraper() { - const $ = await fetch.page(this.url); + const $ = await fetch.page(this, this.url, 'default'); const $paragraphs = $('.block-content p'); const data = {}; $paragraphs diff --git a/src/shared/scrapers/ZA/index.js b/src/shared/scrapers/ZA/index.js index fb04ba15a..1feec3965 100644 --- a/src/shared/scrapers/ZA/index.js +++ b/src/shared/scrapers/ZA/index.js @@ -28,13 +28,13 @@ const scraper = { const casesUrl = 'https://raw.githubusercontent.com/dsfsi/covid19za/master/data/covid19za_provincial_cumulative_timeline_confirmed.csv'; - const casesData = await fetch.csv(casesUrl, false); + const casesData = await fetch.csv(this, casesUrl, 'cases', false); const deathsUrl = 'https://raw.githubusercontent.com/dsfsi/covid19za/master/data/covid19za_timeline_deaths.csv'; - const deathsData = await fetch.csv(deathsUrl, false); + const deathsData = await fetch.csv(this, deathsUrl, 'deaths', false); const testedUrl = 'https://raw.githubusercontent.com/dsfsi/covid19za/master/data/covid19za_timeline_testing.csv'; - const testedData = await fetch.csv(testedUrl, false); + const testedData = await fetch.csv(this, testedUrl, 'tested', false); const dataByProvince = {}; const nationalData = { tested: 0, deaths: 0, cases: 0 }; diff --git a/src/shared/scrapers/jhu-usa.js b/src/shared/scrapers/jhu-usa.js index 42bb0f3b8..6be888ddd 100644 --- a/src/shared/scrapers/jhu-usa.js +++ b/src/shared/scrapers/jhu-usa.js @@ -31,8 +31,8 @@ const scraper = { scraper: { '0': async function() { const urls = this._urls; - const cases = await fetch.csv(urls.cases, false); - const deaths = await fetch.csv(urls.deaths, false); + const cases = await fetch.csv(this, urls.cases, 'cases', false); + const deaths = await fetch.csv(this, urls.deaths, 'deaths', false); let regions = []; diff --git a/src/shared/scrapers/jhu.js b/src/shared/scrapers/jhu.js index 54e4daf3d..6f81210da 100644 --- a/src/shared/scrapers/jhu.js +++ b/src/shared/scrapers/jhu.js @@ -108,10 +108,10 @@ const scraper = { scraper: { '0': async function() { const urls = this._urls; - const cases = await fetch.csv(urls.cases, false); - const deaths = await fetch.csv(urls.deaths, false); - const recovered = await fetch.csv(urls.recovered, false); - const isoMapCsv = await fetch.csv(urls.isoMap, false); + const cases = await 
fetch.csv(this, urls.cases, 'cases', false); + const deaths = await fetch.csv(this, urls.deaths, 'deaths', false); + const recovered = await fetch.csv(this, urls.recovered, 'recovered', false); + const isoMapCsv = await fetch.csv(this, urls.isoMap, 'isomap', false); const isoMap = this._createIsoMap(isoMapCsv); diff --git a/tests/integration/scrapers/scrapers-new-test.js b/tests/integration/scrapers/scrapers-new-test.js index 16aacb17d..751c6864c 100644 --- a/tests/integration/scrapers/scrapers-new-test.js +++ b/tests/integration/scrapers/scrapers-new-test.js @@ -2,11 +2,11 @@ const imports = require('esm')(module); const { join } = require('path'); const test = require('tape'); const exec = require('child_process').execSync; +const fs = require('fs'); const shared = join(process.cwd(), 'src', 'shared'); const lib = join(shared, 'lib'); -const fs = imports(join(lib, 'fs.js')); const schema = imports(join(lib, 'schema.js')); const runScraper = imports('./run-scraper.js').default; @@ -21,26 +21,23 @@ if (files) { // Ignore any files or subdirectory in scrapers that starts with _ filePath.match(/scrapers(?![^/])(?!.*\/_).*\.js$/gi) ) - .filter(filePath => !filePath.startsWith('tests/')); + .filter(filePath => !filePath.startsWith('tests/')) + .filter(filePath => fs.existsSync(join(process.cwd(), filePath))); if (scrapers.length > 0) { - test('Test updated scrapers', async t => { - // We run up to two tests per scraper - t.plan(scrapers.length); - for (const scraperPath of scrapers) { - if (await fs.exists(scraperPath)) { + for (const scraperPath of scrapers) { + test(`Updated scraper ${scraperPath}`, async t => { + try { const scraper = imports(join(process.cwd(), scraperPath)); - try { - await runScraper(scraper); - } catch (err) { - t.fail(`${scraperPath} failed with error: ${err}`); - } + await runScraper(scraper); + t.pass('Scraper ran'); const hasErrors = schema.schemaHasErrors(scraper.default, schema.schemas.scraperSchema); t.notOk(hasErrors, 'Scraper had no errors'); - } else { - t.pass(`${scraperPath} was deleted`); + } catch (err) { + t.fail(`Scraper failed with error: ${err}`); } - } - }); + t.end(); + }); + } } } diff --git a/tests/unit/shared/scrapers/scrapers-all-test.js b/tests/unit/shared/scrapers/scrapers-all-test.js index 42140e968..5eebc4960 100644 --- a/tests/unit/shared/scrapers/scrapers-all-test.js +++ b/tests/unit/shared/scrapers/scrapers-all-test.js @@ -58,35 +58,38 @@ function validateCodingConventions(t, lin) { t.fail(`Doesn't follow convention`); } - /* - // DISABLING THESE CHECKS FOR NOW, WILL RE-ENABLE LATER. - // Each call should have the scraper object, the URL, and the // "cache key". // console.log(lin); const fetchArgs = lin - .trim() - .replace(/.*\(/, '') - .replace(/\);$/, '') - .split(',') - .map(s => s.trim()); - const lenMsg = `Expected >=3 args to fetch (scraper, url, cacheKey), got ${fetchArgs.length}`; - t.ok(fetchArgs.length >= 3, lenMsg); - - if (fetchArgs.length >= 3) { - const first = fetchArgs[0]; - const third = fetchArgs[2]; - - // First arg: Most scrapers can pass 'this', but some scrapers - // use helper functions, and so must pass 'obj'. - t.ok(first == 'this' || first == 'obj', 'first arg is this or obj'); - - // Third arg: Must be cache key. 
- const apos = "'"; - const ckmsg = `third arg (${third}) is cache key, must be string`; - t.ok(third.startsWith(apos) && third.endsWith(apos), ckmsg); + .trim() + .replace(/.*\(/, '') + .replace(/\);$/, '') + .split(',') + .map(s => s.trim()); + const n = fetchArgs.length; + if (n < 3) { + t.fail(`Expected >=3 args to fetch (scraper, url, cacheKey), got ${n}`); + return; } - */ + + const first = fetchArgs[0]; + + // First arg: Most scrapers can pass 'this', but some scrapers + // use helper functions, and so must pass 'obj'. + t.ok(first === 'this' || first === 'obj', 'first arg is this or obj'); + + // Third arg: If not ArcGIS, third arg must be the cache key. + // ArcGIS calls don't have a "cache key", as they're intermediate steps only. + // const third = fetchArgs[2]; + // + // DISABLED THIS CHECK. + // NOTE: can't check this, b/c some scrapers use variables as the cache key + // (e.g. when fetching multiple times for counties, see US/VA/index.js). + // const apos = "'"; + // const ckmsg = `third arg (${third}) is cache key, must be string`; + // if (!lin.match(/getArcGIS/)) { + // t.ok(third.startsWith(apos) && third.endsWith(apos), ckmsg); } const checkFiles = scraperCodeFiles.filter(scf => scf.importsFetch); diff --git a/tools/compare-report-dirs.js b/tools/compare-report-dirs.js index 226398454..d102e5e80 100644 --- a/tools/compare-report-dirs.js +++ b/tools/compare-report-dirs.js @@ -49,19 +49,31 @@ const stringDiff = imports(path.join(lib, 'diffing', 'string-diff.js')).default; // Utilities ///////////////////////////////////////// +function printTitleAndErrors(f, errs) { + const b = path.basename(f); + if (errs.length === 0) { + console.log(`${b}: equal`); + return; + } + console.log(`\n${b}\n${'-'.repeat(b.length)}`); + errs.forEach(e => { + console.log(`* ${e}`); + }); +} + /** Compare two json files. */ function compareJson(leftFname, rightFname, formatters) { - const loadJson = f => { - return JSON.parse(fs.readFileSync(f, 'utf8')); - }; - const left = loadJson(leftFname); - const right = loadJson(rightFname); + const leftcontent = fs.readFileSync(leftFname, 'utf-8'); + const rightcontent = fs.readFileSync(rightFname, 'utf-8'); + if (leftcontent === rightcontent) { + printTitleAndErrors(leftFname, []); + return; + } + + const left = JSON.parse(leftcontent); + const right = JSON.parse(rightcontent); const errs = jsonDiff.jsonDiff(left, right, 10, formatters); - if (errs.length === 0) console.log(' equal'); - else - errs.forEach(e => { - console.log(`* ${e}`); - }); + printTitleAndErrors(leftFname, errs); } /** Compare two CSV files.
*/ @@ -84,11 +96,7 @@ function compareCsv(leftFname, rightFname) { if (errs.length >= 10) break; } - if (errs.length === 0) console.log(' equal'); - else - errs.forEach(e => { - console.log(`* ${e}`); - }); + printTitleAndErrors(leftFname, errs); } /** Find _one_ file in leftPaths and rightPaths that matches the @@ -99,7 +107,7 @@ function findLeftRightFiles(regex, leftPaths, rightPaths) { return regex.test(f); }); if (drs.length === 0) { - console.log(`Missing ${regex} file.`); + // console.log(`Missing ${regex} file.`); return null; } if (drs.length > 1) { @@ -121,11 +129,6 @@ function compareReportFolders(left, right) { const leftPaths = fpaths(left); const rightPaths = fpaths(right); - const printTitle = s => { - const b = path.basename(s); - console.log(`\n${b}\n${'-'.repeat(b.length)}`); - }; - const jsonReports = [ { regex: /data(.*).json/, @@ -154,13 +157,18 @@ function compareReportFolders(left, right) { { regex: /features(.*).json/, formatters: {} - } + }, + + { regex: /timeseries-byLocation.json/, formatters: {} }, + { regex: /timeseries-jhu.csv/, formatters: {} }, + { regex: /timeseries-tidy.csv/, formatters: {} }, + { regex: /timeseries.csv/, formatters: {} }, + { regex: /timeseries.json/, formatters: {} } ]; jsonReports.forEach(hsh => { const [left, right] = findLeftRightFiles(hsh.regex, leftPaths, rightPaths); if (left && right) { - printTitle(left); compareJson(left, right, hsh.formatters); } }); @@ -169,7 +177,6 @@ function compareReportFolders(left, right) { csvReports.forEach(regex => { const [left, right] = findLeftRightFiles(regex, leftPaths, rightPaths); if (left && right) { - printTitle(left); compareCsv(left, right); } });
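Note on the calling convention these diffs apply: every fetch helper (page, csv, json, pdf, headless, raw, and the ArcGIS URL helpers) now takes the scraper object first, then the URL, then a cache key — 'default' when a scraper fetches a single resource, or a distinct key ('cases', 'deaths', 'tested', 'totals', ...) when one scraper fetches several. The sketch below is illustrative only: the stub fetch object and the scraper module are hypothetical stand-ins, not the implementation in src/shared/lib/fetch.

// Illustrative sketch of the migrated call convention.
// The stub below is an assumption for demonstration, not the project's
// real fetch library.
const fetch = {
  async csv(scraperObj, url, cacheKey, date, options) {
    // A real implementation would consult the cache before hitting the
    // network, keyed on the scraper and cacheKey; this stub only logs.
    console.log(`GET ${url} [${scraperObj.state}, cache key: ${cacheKey}]`);
    return [];
  }
};

// Hypothetical scraper module showing the before/after shape of a call site.
const scraper = {
  state: 'iso2:US-XX',
  url: 'https://example.com/counties.csv',
  async scraper() {
    // Old API:  const data = await fetch.csv(this.url, false);
    // New API:  pass `this` (or `obj` inside shared helper functions) plus a
    // cache key; trailing arguments, such as the `false` seen in some calls
    // above, keep their position after the cache key.
    const data = await fetch.csv(this, this.url, 'default', false);
    return data;
  }
};

scraper.scraper();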