Change fetch api (covidatlas#822)
* Add scraper as first arg to all fetch methods.
* Script to add 'this' as first arg to all fetch calls.
* Clarify tape test messaging.
* Throw on invalid scraper.
* Add test:integration yarn script.
* Add cacheKey to fetch API public methods.
* Script to add 'default' cache key to all fetch calls.
* Add 'default' cache key to all scraper fetch calls.
* Add scraper fetch call format check unit test.
* Add option to log cache calls if LOG_CACHE_CALLS is set.
* First draft, migration of cache hits to temp dir.
* Fix cache key collisions.
* Zip migrated file.
* Update GB to use latest fetch API.
* Differentiating cache keys for timeseries, even though they're ignored.
* Add timeseries to report compare.
* Move cache migration to separate module to simplify code review.
* Allow comma-delimited list for --location
* Update new/changed scrapers to use new fetch API.
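
The net effect on each scraper: the fetch helpers now take the scraper as their first argument and a cache key as a trailing argument. A minimal before/after sketch based on the migration scripts below (fetch.page and this.url are illustrative, not from a specific scraper):

    // before
    const $ = await fetch.page(this.url);

    // after: scraper instance first, cache key last
    const $ = await fetch.page(this, this.url, 'default');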
jzohrab authored Apr 16, 2020
1 parent 0fc9b11 commit 27c32c0
Showing 154 changed files with 984 additions and 411 deletions.
1 change: 1 addition & 0 deletions package.json
@@ -23,6 +23,7 @@
"qa": "pta ./src/qa/*.qa.js -r log | node src/qa/utils/qa-reporter.js",
"test": "tape tests/**/*-test.js | tap-spec",
"test:unit": "tape tests/unit/**/*-test.js | tap-spec",
"test:integration": "tape tests/integration/**/*-test.js | tap-spec",
"test:watch": "tape-watch tests/**/*-test.js -p tap-spec",
"test:tz": "tape scripts/test-timezones.js | tap-spec",
"timeseries": "node src/shared/timeseries/index.js",
181 changes: 181 additions & 0 deletions scripts/change-fetch-api/add-cacheKey.rb
@@ -0,0 +1,181 @@
# Hack all of the scraper files.
#
# USAGE:
#
# - cd into this directory,
# - run `ruby this-script.rb WRITE [FILENAME]`
#
# WRITE is one of:
# - 'save' to overwrite source files
# - 'print' to dump to console
# - 'mute' to neither print nor save (useful to preview the changes that would be made)
#
# FILENAME: name of the file to work with. If missing,
# do all files.
#
# e.g.,
# ruby <this-script.rb> print DEU/_shared.js

if (ARGV.size < 1) then
puts "usage: ruby <this_script.rb> WRITE [FILENAME]"
puts "where WRITE = save/print/mute"
return
end

WRITE = ARGV[0].to_s.downcase
FILENAME = (ARGV.size > 1) ? ARGV[1] : nil

# Skip some files.
# Not bothering to try to determine these programmatically.
# IGNORE_FILES = %w(
# AUS/_shared/get-data-with-tested-negative-applied.js
# AUS/_shared/get-key.js
# )

METHODS = 'page|fetch|raw|json|jsonAndCookies|csv|tsv|pdf|headless'


# Split a fetch call into [prefix, second argument, rest] so the
# 'default' cache key can be spliced in after the second argument.
FETCH_RE = /(await\s+.*?\.(?:#{METHODS})\s*\(.*?, )([^,)]*)(.*)/
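# e.g. "const $ = await fetch.page(this, healthUrl)"
#      becomes "const $ = await fetch.page(this, healthUrl, 'default')"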


# Manual tests to verify
# lin = "const $ = await fetch.page(this, healthUrl);
# const $ = await fetch.page(this, healthUrl, some_other_stuff);
# const $ = await fetch.csv(this, 'something.com', some_other_stuff);
# "
# lin.scan(FETCH_RE).each do |m|
# puts '----'
# puts m.inspect
# puts "#{m[0]}#{m[1]}, 'default'#{m[2]}"
# end


# Validation pass: warn if file f in scraper_dir has no match for
# the expected regexes. Nothing is modified.
def validate(scraper_dir, f)
fpath = File.join(scraper_dir, f)
src = File.read(fpath)
[ FETCH_RE ].each do |re|
puts "WARN: No match for #{re} in #{f}" if (src !~ re)
end
end


def add_cacheKey_to_fetch_calls(src)
original_src = "CLONE: #{src}"

matches = src.scan(FETCH_RE)
# puts "add cacheKey: #{matches.inspect}"
matches.uniq.each do |m|
raise "bad re? #{m}" if m.size != 3
before, url, after = m
wholeline = [before, url, after].join('')
newline = "#{before}#{url}, 'default'#{after}"
puts " + \"#{wholeline}\" => \"#{newline}\""
src = src.gsub(wholeline, newline)
end

src
end


# # Specific hack
# def postmigration_AU_QLD_stuff(src)
# old = "async function getCurrentArticlePage(obj) {
# const $ = await fetch.page(this, obj.url);
# const anchors = $('#content h3:first-of-type > a');
# const currentArticleUrl = anchors[0].attribs.href;
# return fetch.page(currentArticleUrl);
# }"
# new = "async function getCurrentArticlePage(obj) {
# const $ = await fetch.page(obj, obj.url);
# const anchors = $('#content h3:first-of-type > a');
# const currentArticleUrl = anchors[0].attribs.href;
# return fetch.page(obj, currentArticleUrl);
# }"
# src = src.gsub(old, new)
# src
# end


# def post_migration_check(src)
# matches = src.scan(FETCH_RE)
# # puts "add this: #{matches.inspect}"
# matches.each do |m|
# raise "bad re? #{m}" if m.size != 3
# wholeline, before, after = m
# if (after !~ /this, /) then
# puts " ??? Missing 'this' in fetch call in \"#{wholeline}\" ???"
# end
# end
# end

########################################

scraper_dir = File.join(__dir__, '..', '..', 'src', 'shared', 'scrapers')

files = []
Dir.chdir(scraper_dir) do
files = Dir.glob(File.join('**', '*.js'))
end
# puts "Pre remove count: #{files.count}"
# files -= IGNORE_FILES
# puts "Post remove count: #{files.count}"
puts "#{files.size} scraper files."


if (!FILENAME.nil?) then
if (!files.include?(FILENAME)) then
puts "#{FILENAME} is not in the list of scraper files:"
puts files.sort.map { |s| " #{s}" }
return
else
files = [FILENAME]
end
end

files.sort!

puts "VALIDATION ========================================"
files.each do |f|
validate(scraper_dir, f)
end
puts "END VALIDATION ===================================="


# During dev, just do one file.
# add_filename_to_scraper_this(scraper_dir, files[0])
# files = [files[0]]
# files = ['DEU/_shared.js']

puts "MUTATION ========================================"
files.each do |f|
puts
puts '=' * 50
puts f
puts '-' * 50
fpath = File.join(scraper_dir, f)
src = File.read(fpath)

src = add_cacheKey_to_fetch_calls(src)

# post_migration_check(src)
puts

case(WRITE)
when 'save' then
File.open(fpath, 'w') { |p| p.puts(src) }
when 'print' then
puts
puts "Result:"
puts "-" * 50
puts src
puts "-" * 50
puts
when 'mute' then
puts ''
end

end
puts "END MUTATION ===================================="
puts
205 changes: 205 additions & 0 deletions scripts/change-fetch-api/migrate-scrapers-add-this-to-fetch.rb
@@ -0,0 +1,205 @@
# Hack all of the scraper files.
#
# USAGE:
#
# - cd into this directory,
# - run `ruby this-script.rb WRITE [FILENAME]`
#
# WRITE is one of:
# - 'save' to overwrite source files
# - 'print' to dump to console
# - 'mute' to neither print nor save (useful to preview the changes that would be made)
#
# FILENAME: name of the file to work with. If missing,
# do all files.
#
# e.g.,
# ruby <this-script.rb> print DEU/_shared.js

if (ARGV.size < 1) then
puts "usage: ruby <this_script.rb> WRITE [FILENAME]"
puts "where WRITE = save/print/mute"
return
end

WRITE = ARGV[0].to_s.downcase
FILENAME = (ARGV.size > 1) ? ARGV[1] : nil

# Skip some files.
# Not bothering to try to determine these programmatically.
IGNORE_FILES = %w(
AUS/_shared/get-data-with-tested-negative-applied.js
AUS/_shared/get-key.js
)


LOCATION_RE = /(\s*)(city|county|state|country):/
METHODS = 'page|fetch|raw|json|jsonAndCookies|csv|tsv|pdf|headless|getArcGISCSVURLFromOrgId|getArcGISCSVURL'

# The fancy RE below splits a line like "await fetch.csv(this.url)"
# into ["await fetch.csv(this.url)", "await fetch.csv(", "this.url)"]
# It can screw up in some cases, so we add a hack.
FETCH_RE = /((await\s+.*?\.(?:#{METHODS})\s*\()(.*))/
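# e.g. after add_this_to_fetch_calls runs, "await fetch.csv(this.url)"
# becomes "await fetch.csv(this, this.url)".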

# Validation pass: warn if file f in scraper_dir has no match for
# the expected regexes. Nothing is modified.
def validate(scraper_dir, f)
fpath = File.join(scraper_dir, f)
src = File.read(fpath)
[ LOCATION_RE, FETCH_RE ].each do |re|
puts "WARN: No match for #{re} in #{f}" if (src !~ re)
end
end


# DISABLING THIS -- scrapers should already have _path.
# def add_filename_to_scraper_this(src)
# m = src.match(LOCATION_RE)
# # puts "add filename: #{m.inspect}"
# if (m.nil?) then
# puts " - skipping adding filepath (no match for RE)"
# return src
# end
#
# if (src =~ /filepath: __filename/)
# puts " - skipping adding _filepath, already added"
# return src
# end
#
# spaces = m[1].gsub("\n", '')
# loctype = m[2]
# puts " + adding filepath above #{loctype}"
# add_code = "
# #{spaces}_filepath: __filename,
# #{spaces}#{loctype}:"
# src = src.sub(LOCATION_RE, add_code)
# src
# end


def add_this_to_fetch_calls(src)
original_src = "CLONE: #{src}"

matches = src.scan(FETCH_RE)
puts "add this: #{matches.inspect}"
matches.uniq.each do |m|
raise "bad re? #{m}" if m.size != 3
wholeline, before, after = m
if (after =~ /this, /) then
puts " - 'this, ' already in \"#{wholeline}\", skipping"
else
newline = "#{before}this, #{after}"
puts " + \"#{wholeline}\" => \"#{newline}\""
src = src.gsub(wholeline, newline)
end
end

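  # The RE can insert 'this, ' twice on tricky lines; if this run
  # introduced a doubled 'this, this,', collapse it back to one.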
if (original_src !~ /this,\s*this,/ && src =~ /this,\s*this,/) then
src = src.gsub(/this,\s*this,/, 'this, ')
end
raise "still have 'this, this'" if (src =~ /this,\s*this,/)

src
end

# Specific hack for the AU/QLD scraper: getCurrentArticlePage receives
# the scraper as obj, so its fetch calls need obj rather than this.
def postmigration_AU_QLD_stuff(src)
old = "async function getCurrentArticlePage(obj) {
const $ = await fetch.page(this, obj.url);
const anchors = $('#content h3:first-of-type > a');
const currentArticleUrl = anchors[0].attribs.href;
return fetch.page(currentArticleUrl);
}"
new = "async function getCurrentArticlePage(obj) {
const $ = await fetch.page(obj, obj.url);
const anchors = $('#content h3:first-of-type > a');
const currentArticleUrl = anchors[0].attribs.href;
return fetch.page(obj, currentArticleUrl);
}"
src = src.gsub(old, new)
src
end


def post_migration_check(src)
matches = src.scan(FETCH_RE)
# puts "add this: #{matches.inspect}"
matches.each do |m|
raise "bad re? #{m}" if m.size != 3
wholeline, before, after = m
if (after !~ /this, /) then
puts " ??? Missing 'this' in fetch call in \"#{wholeline}\" ???"
end
end
end

########################################

scraper_dir = File.join(__dir__, '..', '..', 'src', 'shared', 'scrapers')

files = []
Dir.chdir(scraper_dir) do
files = Dir.glob(File.join('**', '*.js'))
end
# puts "Pre remove count: #{files.count}"
files -= IGNORE_FILES
# puts "Post remove count: #{files.count}"
puts "#{files.size} scraper files."


if (!FILENAME.nil?) then
if (!files.include?(FILENAME)) then
puts "#{FILENAME} is not in the list of scraper files:"
puts files.sort.map { |s| " #{s}" }
return
else
files = [FILENAME]
end
end

files.sort!

puts "VALIDATION ========================================"
files.each do |f|
validate(scraper_dir, f)
end
puts "END VALIDATION ===================================="


# During dev, just do one file.
# add_filename_to_scraper_this(scraper_dir, files[0])
# files = [files[0]]
# files = ['DEU/_shared.js']

puts "MUTATION ========================================"
files.each do |f|
puts
puts '=' * 50
puts f
puts '-' * 50
fpath = File.join(scraper_dir, f)
src = File.read(fpath)
# src = add_filename_to_scraper_this(src)
src = add_this_to_fetch_calls(src)
raise "BAD this, this" if (src =~ /this, this,/)
src = postmigration_AU_QLD_stuff(src)

post_migration_check(src)
puts

case(WRITE)
when 'save' then
File.open(fpath, 'w') { |p| p.puts(src) }
when 'print' then
puts
puts "Result:"
puts "-" * 50
puts src
puts "-" * 50
puts
when 'mute' then
puts ''
end

end
puts "END MUTATION ===================================="
puts